diff --git a/.asf.yaml b/.asf.yaml index 8ebab2d68463f..534f99613dfa2 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -29,3 +29,13 @@ github: - data-integration - apachespark - apacheflink + features: + wiki: true + issues: true + projects: true + discussions: true +notifications: + commits: commits@hudi.apache.org + issues: commits@hudi.apache.org + pullrequests: commits@hudi.apache.org + jira_options: link label diff --git a/.codecov.yml b/.codecov.yml index c63c6853bdec2..8f18bb51bc06e 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -23,6 +23,13 @@ coverage: precision: 2 round: down range: "50...100" + status: + project: # settings affecting project coverage + enabled: no + + # do not run coverage on patch nor changes + patch: no + changes: no # Ignoring Paths # -------------- @@ -47,14 +54,7 @@ ignore: - "hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java" - "hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeInputFormat.java" -comment: - layout: "reach, diff, flags, files" - behavior: default - require_changes: false # if true: only post the comment if coverage changes - require_base: no # [yes :: must have a base report to post] - require_head: no # [yes :: must have a head report to post] - branches: # https://docs.codecov.io/docs/pull-request-comments#branches - - "master" +comment: false flags: hudicli: diff --git a/.github/ISSUE_TEMPLATE/SUPPORT_REQUEST.md b/.github/ISSUE_TEMPLATE/SUPPORT_REQUEST.md index adf7273993558..9ce26e9d2bd3c 100644 --- a/.github/ISSUE_TEMPLATE/SUPPORT_REQUEST.md +++ b/.github/ISSUE_TEMPLATE/SUPPORT_REQUEST.md @@ -8,7 +8,7 @@ labels: question **_Tips before filing an issue_** -- Have you gone through our [FAQs](https://cwiki.apache.org/confluence/display/HUDI/FAQ)? +- Have you gone through our [FAQs](https://hudi.apache.org/learn/faq/)? - Join the mailing list to engage in conversations and get faster support at dev-subscribe@hudi.apache.org. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e6da7e3aa350c..17ad995a97a7a 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,44 +1,27 @@ -## *Tips* -- *Thank you very much for contributing to Apache Hudi.* -- *Please review https://hudi.apache.org/contributing.html before opening a pull request.* +### Change Logs -## What is the purpose of the pull request +_Describe context and summary for this change. Highlight if any code was copied._ -*(For example: This pull request adds quick-start document.)* +### Impact -## Brief change log +_Describe any public API or user-facing feature change or any performance impact._ -*(for example:)* - - *Modify AnnotationLocation checkstyle rule in checkstyle.xml* +**Risk level: none | low | medium | high** -## Verify this pull request +_Choose one. If medium or high, explain what verification was done to mitigate the risks._ -*(Please pick either of the following options)* +### Documentation Update -This pull request is a trivial rework / code cleanup without any test coverage. +_Describe any necessary documentation update if there is any new feature, config, or user-facing change_ -*(or)* +- _The config description must be updated if new configs are added or the default value of the configs are changed_ +- _Any new feature or user-facing change requires updating the Hudi website. 
Please create a Jira ticket, attach the + ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make + changes to the website._ -This pull request is already covered by existing tests, such as *(please describe tests)*. +### Contributor's checklist -(or) - -This change added tests and can be verified as follows: - -*(example:)* - - - *Added integration tests for end-to-end.* - - *Added HoodieClientWriteTest to verify the change.* - - *Manually verified the change by running a job locally.* - -## Committer checklist - - - [ ] Has a corresponding JIRA in PR title & commit - - - [ ] Commit message is descriptive of the change - - - [ ] CI is green - - - [ ] Necessary doc changes done or have another open PR - - - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. \ No newline at end of file +- [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) +- [ ] Change Logs and Impact were stated clearly +- [ ] Adequate tests were added if applicable +- [ ] CI passed diff --git a/.github/README.md b/.github/README.md new file mode 100644 index 0000000000000..a621d35e814b7 --- /dev/null +++ b/.github/README.md @@ -0,0 +1,10 @@ +## How to update the Pull Request Template + +When updating the pr template, you must consider if updates need to be made to scripts/pr_compliance.py + +## What are the files in workflows? +- bot.yml: runs the hudi unit tests with various versions of scala, spark, and flink +- pr_compliance.yml: checks pr titles and main comment to make sure that everything is filled out and formatted properly +- update_pr_compliance: runs the pr_compliance tests when scripts/pr_compliance.py is updated + + diff --git a/.github/actions/bot/package.json b/.github/actions/bot/package.json deleted file mode 100644 index 0d6e9d1e959b2..0000000000000 --- a/.github/actions/bot/package.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "name": "github_action_ci_bot", - "version": "1.1.1", - "description": "CI Bot for GitHub Actions", - "main": "dist/index.js", - "scripts": { - "lint": "eslint 'src/**.js' 'tests/**.js' --fix", - "test": "eslint 'src/**.js' 'tests/**.js' && jest --coverage", - "build": "ncc build src/action.js" - }, - "author": "lamber-ken", - "license": "Apache LICENSE 2.0", - "homepage": "https://github.com/apache/hudi", - "bugs": { - "url": "https://github.com/apache/hudi/issues" - }, - "dependencies": { - "@actions/core": "^1.2.4", - "@actions/github": "^2.2.0", - "@actions/io": "^1.0.2" - }, - "devDependencies": { - "@types/jest": "^25.1.4", - "@typescript-eslint/eslint-plugin": "^2.33.0", - "@typescript-eslint/parser": "^2.33.0", - "@zeit/ncc": "^0.22.0", - "eslint": "^7.0.0", - "eslint-config-prettier": "^6.11.0", - "husky": "^4.2.5", - "jest": "^25.1.0", - "npm-run-all": "^4.1.5", - "prettier": "^2.0.5", - "ts-jest": "^25.2.1", - "typescript": "^3.8.3" - } -} diff --git a/.github/actions/bot/src/action.js b/.github/actions/bot/src/action.js deleted file mode 100644 index cb4f6916db5e5..0000000000000 --- a/.github/actions/bot/src/action.js +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -async function check(core, context, github) { - - try { - const provider = process.env.PROVIDER; - const repository = process.env.REPOSITORY; - const command = context.payload.comment.body; - - if (command !== 'rerun tests') { - console.log("Invalid command:" + command); - return; - } - - const { - data: { - head: { - sha: ref, - } - } - } = await github.pulls.get({ - owner: provider, - repo: repository, - pull_number: context.issue.number, - }); - - const checks = await github.checks.listForRef({ - owner: provider, - repo: repository, - ref: ref - }); - - checks.data.check_runs.forEach(run => { - - if (run.app.owner.login === 'travis-ci') { - console.log("rerun travis ci check: " + run.external_id); - rebuild(run.external_id) - } else { - console.log("ignore github action check: " + run.id); - } - - }); - - } catch (e) { - console.log("check bot run failed: " + e); - } - -} - -function rebuild(buildId) { - const https = require('https'); - const token = process.env.HUDI_TRAVIS_ORG_TOKEN - - const options = { - hostname: 'api.travis-ci.org', - port: 443, - path: `/build/${buildId}/restart`, - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'Travis-API-Version': 3, - 'Authorization': `token ${token}`, - } - }; - - const req = https.request(options, function (res) { - res.setEncoding('utf8'); - res.on('data', function (data) { - console.log('data: ' + data); - }); - res.on('error', function (error) { - console.log('error: ' + error); - }); - }); - req.on('error', function (e) { - console.log('problem with request: ' + e.message); - }); - - req.end(); -} - -module.exports = ({core}, {context}, {github}) => { - return check(core, context, github); -} diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml index ca34cc37e4dba..e05ac87bc3fae 100644 --- a/.github/workflows/bot.yml +++ b/.github/workflows/bot.yml @@ -1,43 +1,97 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -name: CI BOT +name: Java CI on: - issue_comment: - types: [created] + push: + branches: + - master + - 'release-*' + pull_request: + branches: + - master + - 'release-*' +env: + MVN_ARGS: -ntp -B -V -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn jobs: - bot: + test-spark: runs-on: ubuntu-latest - steps: - - name: clone repository - uses: actions/checkout@v2 + strategy: + matrix: + include: + - scalaProfile: "scala-2.11" + sparkProfile: "spark2.4" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark2.4" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.1" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.2" - - name: bot actions - uses: actions/github-script@v1 + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.3" + + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 8 + uses: actions/setup-java@v2 + with: + java-version: '8' + distribution: 'adopt' + architecture: x64 + - name: Build Project + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + run: + mvn clean install -Pintegration-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS + - name: Quickstart Test env: - PROVIDER : 'apache' - REPOSITORY: 'hudi' - HUDI_TRAVIS_ORG_TOKEN: ${{ secrets.HUDI_TRAVIS_ORG_TOKEN }} + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + run: + mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark $MVN_ARGS + - name: IT - Bundle Validation + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI + run: | + HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) + ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION + - name: UT - Common & Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI + run: + mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl hudi-common,hudi-spark-datasource/hudi-spark $MVN_ARGS + + test-flink: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - flinkProfile: "flink1.13" + - flinkProfile: "flink1.14" + - flinkProfile: "flink1.15" + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 8 + uses: actions/setup-java@v2 with: - script: | - const path = require('path') - const scriptPath = path.resolve('.github/actions/bot/src/action.js') - require(scriptPath)({core}, {context}, {github}) \ No newline at end of file + java-version: '8' + distribution: 'adopt' + architecture: x64 + - name: Build Project + env: + FLINK_PROFILE: ${{ matrix.flinkProfile }} + run: + mvn clean install -Pintegration-tests -Dscala-2.12 -D"$FLINK_PROFILE" -Davro.version=1.10.0 -DskipTests=true $MVN_ARGS + - name: Quickstart Test + env: + FLINK_PROFILE: ${{ matrix.flinkProfile }} + run: + mvn test -Punit-tests -Dscala-2.12 -D"$FLINK_PROFILE" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-flink $MVN_ARGS diff --git a/.github/workflows/pr_compliance.yml b/.github/workflows/pr_compliance.yml new file mode 100644 index 0000000000000..67affbb7b749f --- /dev/null +++ b/.github/workflows/pr_compliance.yml @@ -0,0 +1,21 @@ +name: validate pr +on: + 
pull_request: + types: [opened, edited, reopened, synchronize] + branches: + - master + +jobs: + validate-pr: + runs-on: ubuntu-latest + env: + REQUEST_BODY: ${{ github.event.pull_request.body }} + REQUEST_TITLE: ${{ github.event.pull_request.title }} + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - name: run script + run: python3 scripts/pr_compliance.py > test.log || { echo "::error::pr_compliance.py $(cat test.log)" && exit 1; } + + + diff --git a/.github/workflows/update_pr_compliance.yml b/.github/workflows/update_pr_compliance.yml new file mode 100644 index 0000000000000..2989617f4cc01 --- /dev/null +++ b/.github/workflows/update_pr_compliance.yml @@ -0,0 +1,18 @@ +name: Update Pr Compliance + +on: + pull_request: + types: [opened, edited, reopened, synchronize] + branches: + - master + paths: + - scripts/pr_compliance.py + +jobs: + run-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - name: run script + run: python3 scripts/pr_compliance.py run-tests > test.log || { echo "::error::pr_compliance.py $(cat test.log)" && exit 1; } diff --git a/.gitignore b/.gitignore index fcd673b34aa0c..2983889cc6e17 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Directories # /build/ target/ +.mvn/ # OS Files # .DS_Store @@ -61,7 +62,8 @@ local.properties # IntelliJ specific files/directories # ####################################### .out -.idea +.idea/* +!.idea/vcs.xml *.ipr *.iws *.iml @@ -78,4 +80,4 @@ dependency-reduced-pom.xml ####################################### hudi-integ-test/compose_env node_modules -package-lock.json \ No newline at end of file +package-lock.json diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000..6310577ae1b5e --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,21 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d36c0cb709983..0000000000000 --- a/.travis.yml +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -os: linux -dist: trusty -language: java -jdk: - - openjdk8 -jobs: - include: - - name: "Unit tests except hudi-spark-client" - env: MODE=unit MODULES='!hudi-client/hudi-spark-client' HUDI_QUIETER_LOGGING=1 - - name: "Unit tests for hudi-spark-client" - env: MODE=unit MODULES=hudi-client/hudi-spark-client HUDI_QUIETER_LOGGING=1 - - name: "Functional tests" - env: MODE=functional HUDI_QUIETER_LOGGING=1 - - name: "Integration tests" - env: MODE=integration HUDI_QUIETER_LOGGING=1 -install: true -services: - - docker -cache: - directories: - - "$HOME/.m2" -notifications: - slack: - rooms: - - secure: WNIZPBY//xf/xTJL1YUPzvPUDwjawaMM4IJ6IqxjRGcZCmuhNVu2XTJ3aL1g6X7ZcJKxJuwoU/TbSO8Dl6rgWSo/2OfyzBd4ks+hgeCsdycccTcvO8giQO1DOUGUSRdvUzOvKjWVK7iARYzQhoZawAYwI09UJLlwhYRCJ1IKc1ZksrEt964GeEmPyJbwMoZOJVUU84jJIAZPIpOFGTKM652FMermg9yaY2W5oSjDXaV98z0/mJV4Ry++J2v0fvoDs5HxkXYhZJP+dpWR82KDr6Q6LGL5/IlJ+b+IH3pF8LyKR4nCH6l1EZ8KpoFZapyYWYQpXMfQoF2K/JEQkpz1EqBCeEDSJ2+j1PPLhOWXd7ok4DsS26S8BP2ImvyXwua51THN1/r1fCGSIdxiQ5C8aeYmPCSr+oLChCVivEG2eeU34Z1nQJ5aDymNGeFE9qUUpjS0ETfFcjI/WQaA+FiYiPkDfeAoT1+6ySdY7l9gJhMygupILjq57IHbqx4nEr/8AB3Rqb8iIDTWDXgUBI9xKmty36zjIGcVOsCT/SGPccxvEJBXQk8uQqs/rDhaA/ErJPMLX/2b7ElSSObKFdjpMaxVvZIE6wvMLJpIYfChDoXwgfhN6zlAFZrEib7PFI4dGkS8u4wkkHkBS7C+uz2e92EhsAB+BIhUR1M3NQ33+Is= - on_pull_requests: false -script: - # ping stdout every 9 minutes or Travis kills build - # https://docs.travis-ci.com/user/common-build-problems/#Build-times-out-because-no-output-was-received - - while sleep 9m; do echo "=====[ $SECONDS seconds still running ]====="; done & - - scripts/run_travis_tests.sh $MODE $MODULES -after_success: - - scripts/report_coverage.sh diff --git a/LICENSE b/LICENSE index 385191d1b9efa..28222a717e693 100644 --- a/LICENSE +++ b/LICENSE @@ -333,3 +333,15 @@ Copyright (c) 2005, European Commission project OneLab under contract 034819 (ht Home page: https://commons.apache.org/proper/commons-lang/ License: http://www.apache.org/licenses/LICENSE-2.0 + + ------------------------------------------------------------------------------- + + This product includes code from StreamSets Data Collector + + * com.streamsets.pipeline.lib.util.avroorc.AvroToOrcRecordConverter copied and modified to org.apache.hudi.common.util.AvroOrcUtils + * com.streamsets.pipeline.lib.util.avroorc.AvroToOrcSchemaConverter copied and modified to org.apache.hudi.common.util.AvroOrcUtils + + Copyright 2018 StreamSets Inc. + + Home page: https://github.com/streamsets/datacollector-oss + License: http://www.apache.org/licenses/LICENSE-2.0 diff --git a/NOTICE b/NOTICE index 2f1aee6738972..437b974ac217b 100644 --- a/NOTICE +++ b/NOTICE @@ -147,3 +147,21 @@ its NOTICE file: This product includes software developed at The Apache Software Foundation (http://www.apache.org/). + +-------------------------------------------------------------------------------- + +This product includes code from StreamSets Data Collector, which includes the following in +its NOTICE file: + + StreamSets datacollector-oss + Copyright 2018 StreamSets Inc. + + This product includes software developed at + StreamSets (http://www.streamsets.com/). 
+ +-------------------------------------------------------------------------------- + +This product includes code from hilbert-curve project + * Copyright https://github.com/davidmoten/hilbert-curve + * Licensed under the Apache-2.0 License + diff --git a/README.md b/README.md index 427d8595f4365..d389754ca2166 100644 --- a/README.md +++ b/README.md @@ -16,24 +16,32 @@ --> # Apache Hudi -Apache Hudi (pronounced Hoodie) stands for `Hadoop Upserts Deletes and Incrementals`. -Hudi manages the storage of large analytical datasets on DFS (Cloud stores, HDFS or any Hadoop FileSystem compatible storage). + +Apache Hudi (pronounced Hoodie) stands for `Hadoop Upserts Deletes and Incrementals`. Hudi manages the storage of large +analytical datasets on DFS (Cloud stores, HDFS or any Hadoop FileSystem compatible storage). + +Hudi logo -[![Build Status](https://travis-ci.org/apache/hudi.svg?branch=master)](https://travis-ci.org/apache/hudi) +[![Build](https://github.com/apache/hudi/actions/workflows/bot.yml/badge.svg)](https://github.com/apache/hudi/actions/workflows/bot.yml) +[![Test](https://dev.azure.com/apache-hudi-ci-org/apache-hudi-ci/_apis/build/status/apachehudi-ci.hudi-mirror?branchName=master)](https://dev.azure.com/apache-hudi-ci-org/apache-hudi-ci/_build/latest?definitionId=3&branchName=master) [![License](https://img.shields.io/badge/license-Apache%202-4EB1BA.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.hudi/hudi/badge.svg)](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.hudi%22) -[![Join on Slack](https://img.shields.io/badge/slack-%23hudi-72eff8?logo=slack&color=48c628&label=Join%20on%20Slack)](https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE) +![GitHub commit activity](https://img.shields.io/github/commit-activity/m/apache/hudi) +[![Join on Slack](https://img.shields.io/badge/slack-%23hudi-72eff8?logo=slack&color=48c628&label=Join%20on%20Slack)](https://join.slack.com/t/apache-hudi/shared_invite/zt-1e94d3xro-JvlNO1kSeIHJBTVfLPlI5w) +![Twitter Follow](https://img.shields.io/twitter/follow/ApacheHudi) ## Features + * Upsert support with fast, pluggable indexing * Atomically publish data with rollback support -* Snapshot isolation between writer & queries +* Snapshot isolation between writer & queries * Savepoints for data recovery * Manages file sizes, layout using statistics * Async compaction of row & columnar data * Timeline metadata to track lineage +* Optimize data lake layout with clustering Hudi supports three types of queries: * **Snapshot Query** - Provides snapshot queries on real-time data, using a combination of columnar & row-based storage (e.g [Parquet](https://parquet.apache.org/) + [Avro](https://avro.apache.org/docs/current/mr.html)). @@ -49,7 +57,7 @@ Prerequisites for building Apache Hudi: * Unix-like system (like Linux, Mac OS X) * Java 8 (Java 9 or 10 may work) * Git -* Maven +* Maven (>=3.3.1) ``` # Checkout code and build @@ -62,43 +70,57 @@ spark-2.4.4-bin-hadoop2.7/bin/spark-shell \ --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' ``` +To build for integration tests that include `hudi-integ-test-bundle`, use `-Dintegration-tests`. 
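For illustration only, a minimal sketch of such a build invocation, assuming the `-Dintegration-tests` flag is simply combined with the standard package command shown earlier in this README:

```
# Minimal sketch: full build including hudi-integ-test-bundle, tests skipped
# (assumes -Dintegration-tests is passed alongside the usual options)
mvn clean package -DskipTests -Dintegration-tests
```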
+ To build the Javadoc for all Java and Scala classes: ``` # Javadoc generated under target/site/apidocs mvn clean javadoc:aggregate -Pjavadocs ``` -### Build with Scala 2.12 +### Build with different Spark versions -The default Scala version supported is 2.11. To build for Scala 2.12 version, build using `scala-2.12` profile +The default Spark 2.x version supported is 2.4.4. The default Spark 3.x version, corresponding to `spark3` profile is 3.3.1. +Refer to the table below for building with different Spark and Scala versions. -``` -mvn clean package -DskipTests -Dscala-2.12 -``` +| Maven build options | Expected Spark bundle jar name | Notes | +|:--------------------------|:---------------------------------------------|:-------------------------------------------------| +| (empty) | hudi-spark-bundle_2.11 (legacy bundle name) | For Spark 2.4.4 and Scala 2.11 (default options) | +| `-Dspark2.4` | hudi-spark2.4-bundle_2.11 | For Spark 2.4.4 and Scala 2.11 (same as default) | +| `-Dspark2.4 -Dscala-2.12` | hudi-spark2.4-bundle_2.12 | For Spark 2.4.4 and Scala 2.12 | +| `-Dspark3.1 -Dscala-2.12` | hudi-spark3.1-bundle_2.12 | For Spark 3.1.x and Scala 2.12 | +| `-Dspark3.2 -Dscala-2.12` | hudi-spark3.2-bundle_2.12 | For Spark 3.2.x and Scala 2.12 | +| `-Dspark3.3 -Dscala-2.12` | hudi-spark3.3-bundle_2.12 | For Spark 3.3.x and Scala 2.12 | +| `-Dspark3` | hudi-spark3-bundle_2.12 (legacy bundle name) | For Spark 3.3.x and Scala 2.12 | +| `-Dscala-2.12` | hudi-spark-bundle_2.12 (legacy bundle name) | For Spark 2.4.4 and Scala 2.12 | -### Build with Spark 3.0.0 +For example, +``` +# Build against Spark 3.2.x +mvn clean package -DskipTests -Dspark3.2 -Dscala-2.12 -The default Spark version supported is 2.4.4. To build for Spark 3.0.0 version, build using `spark3` profile +# Build against Spark 3.1.x +mvn clean package -DskipTests -Dspark3.1 -Dscala-2.12 -``` -mvn clean package -DskipTests -Dspark3 +# Build against Spark 2.4.4 and Scala 2.12 +mvn clean package -DskipTests -Dspark2.4 -Dscala-2.12 ``` -### Build without spark-avro module +#### What about "spark-avro" module? -The default hudi-jar bundles spark-avro module. To build without spark-avro module, build using `spark-shade-unbundle-avro` profile +Starting from versions 0.11, Hudi no longer requires `spark-avro` to be specified using `--packages` -``` -# Checkout code and build -git clone https://github.com/apache/hudi.git && cd hudi -mvn clean package -DskipTests -Pspark-shade-unbundle-avro +### Build with different Flink versions -# Start command -spark-2.4.4-bin-hadoop2.7/bin/spark-shell \ - --packages org.apache.spark:spark-avro_2.11:2.4.4 \ - --jars `ls packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-*.*.*-SNAPSHOT.jar` \ - --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' -``` +The default Flink version supported is 1.14. Refer to the table below for building with different Flink and Scala versions. 
+ +| Maven build options | Expected Flink bundle jar name | Notes | +|:---------------------------|:-------------------------------|:------------------------------------------------| +| (empty) | hudi-flink1.14-bundle_2.11 | For Flink 1.14 and Scala 2.11 (default options) | +| `-Dflink1.14` | hudi-flink1.14-bundle_2.11 | For Flink 1.14 and Scala 2.11 (same as default) | +| `-Dflink1.14 -Dscala-2.12` | hudi-flink1.14-bundle_2.12 | For Flink 1.14 and Scala 2.12 | +| `-Dflink1.13` | hudi-flink1.13-bundle_2.11 | For Flink 1.13 and Scala 2.11 | +| `-Dflink1.13 -Dscala-2.12` | hudi-flink1.13-bundle_2.12 | For Flink 1.13 and Scala 2.12 | ## Running Tests @@ -120,3 +142,8 @@ mvn -Punit-tests test -DSPARK_EVLOG_DIR=/path/for/spark/event/log ## Quickstart Please visit [https://hudi.apache.org/docs/quick-start-guide.html](https://hudi.apache.org/docs/quick-start-guide.html) to quickly explore Hudi's capabilities using spark-shell. + +## Contributing + +Please check out our [contribution guide](https://hudi.apache.org/contribute/how-to-contribute) to learn more about how to contribute. +For code contributions, please refer to the [developer setup](https://hudi.apache.org/contribute/developer-setup). diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 0000000000000..d450bfb9ab2b0 --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,241 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +trigger: + branches: + include: + - '*' # must quote since "*" is a YAML reserved character; we want a string + +pool: + vmImage: 'ubuntu-18.04' + +parameters: + - name: job1Modules + type: object + default: + - 'hudi-common' + - 'hudi-flink-datasource' + - 'hudi-flink-datasource/hudi-flink' + - 'hudi-flink-datasource/hudi-flink1.13.x' + - 'hudi-flink-datasource/hudi-flink1.14.x' + - 'hudi-flink-datasource/hudi-flink1.15.x' + - name: job2Modules + type: object + default: + - 'hudi-client/hudi-spark-client' + - name: job3Modules + type: object + default: + - 'hudi-spark-datasource' + - 'hudi-spark-datasource/hudi-spark' + - 'hudi-spark-datasource/hudi-spark2' + - 'hudi-spark-datasource/hudi-spark2-common' + - 'hudi-spark-datasource/hudi-spark-common' + - name: job4Modules + type: object + default: + - '!hudi-client/hudi-spark-client' + - '!hudi-common' + - '!hudi-examples' + - '!hudi-examples/hudi-examples-common' + - '!hudi-examples/hudi-examples-flink' + - '!hudi-examples/hudi-examples-java' + - '!hudi-examples/hudi-examples-spark' + - '!hudi-flink-datasource' + - '!hudi-flink-datasource/hudi-flink' + - '!hudi-flink-datasource/hudi-flink1.13.x' + - '!hudi-flink-datasource/hudi-flink1.14.x' + - '!hudi-flink-datasource/hudi-flink1.15.x' + - '!hudi-spark-datasource' + - '!hudi-spark-datasource/hudi-spark' + - '!hudi-spark-datasource/hudi-spark2' + - '!hudi-spark-datasource/hudi-spark2-common' + - '!hudi-spark-datasource/hudi-spark-common' + +variables: + BUILD_PROFILES: '-Dscala-2.11 -Dspark2.4 -Dflink1.14' + PLUGIN_OPTS: '-Dcheckstyle.skip=true -Drat.skip=true -Djacoco.skip=true -ntp -B -V -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn' + MVN_OPTS_INSTALL: '-DskipTests $(BUILD_PROFILES) $(PLUGIN_OPTS)' + MVN_OPTS_TEST: '-fae -Pwarn-log $(BUILD_PROFILES) $(PLUGIN_OPTS)' + SPARK_VERSION: '2.4.4' + HADOOP_VERSION: '2.7' + SPARK_ARCHIVE: spark-$(SPARK_VERSION)-bin-hadoop$(HADOOP_VERSION) + JOB1_MODULES: ${{ join(',',parameters.job1Modules) }} + JOB2_MODULES: ${{ join(',',parameters.job2Modules) }} + JOB3_MODULES: ${{ join(',',parameters.job3Modules) }} + JOB4_MODULES: ${{ join(',',parameters.job4Modules) }} + +stages: + - stage: test + jobs: + - job: UT_FT_1 + displayName: UT FT common & flink & UT client/spark-client + timeoutInMinutes: '150' + steps: + - task: Maven@4 + displayName: maven install + inputs: + mavenPomFile: 'pom.xml' + goals: 'clean install' + options: $(MVN_OPTS_INSTALL) + publishJUnitResults: false + jdkVersionOption: '1.8' + - task: Maven@4 + displayName: UT common flink client/spark-client + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB1_MODULES),hudi-client/hudi-spark-client + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - task: Maven@4 + displayName: FT common flink + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB1_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - script: | + grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 + displayName: Top 100 long-running testcases + - job: UT_FT_2 + displayName: FT client/spark-client + timeoutInMinutes: '150' + steps: + - task: Maven@4 + displayName: maven install + inputs: + mavenPomFile: 'pom.xml' + goals: 'clean install' + options: 
$(MVN_OPTS_INSTALL) + publishJUnitResults: false + jdkVersionOption: '1.8' + - task: Maven@4 + displayName: FT client/spark-client + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB2_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - script: | + grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 + displayName: Top 100 long-running testcases + - job: UT_FT_3 + displayName: UT FT spark-datasource + timeoutInMinutes: '150' + steps: + - task: Maven@4 + displayName: maven install + inputs: + mavenPomFile: 'pom.xml' + goals: 'clean install' + options: $(MVN_OPTS_INSTALL) + publishJUnitResults: false + jdkVersionOption: '1.8' + - task: Maven@4 + displayName: UT spark-datasource + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB3_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - task: Maven@4 + displayName: FT spark-datasource + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB3_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - script: | + grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 + displayName: Top 100 long-running testcases + - job: UT_FT_4 + displayName: UT FT other modules + timeoutInMinutes: '150' + steps: + - task: Maven@4 + displayName: maven install + inputs: + mavenPomFile: 'pom.xml' + goals: 'clean install' + options: $(MVN_OPTS_INSTALL) + publishJUnitResults: false + jdkVersionOption: '1.8' + - task: Maven@4 + displayName: UT other modules + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB4_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - task: Maven@4 + displayName: FT other modules + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB4_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - script: | + grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 + displayName: Top 100 long-running testcases + - job: IT + displayName: IT modules + timeoutInMinutes: '150' + steps: + - task: Maven@4 + displayName: maven install + inputs: + mavenPomFile: 'pom.xml' + goals: 'clean install' + options: $(MVN_OPTS_INSTALL) -Pintegration-tests + publishJUnitResults: false + jdkVersionOption: '1.8' + - task: Maven@4 + displayName: UT integ-test + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Pintegration-tests -DskipUTs=false -DskipITs=true -pl hudi-integ-test + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - task: AzureCLI@2 + displayName: Prepare for IT + inputs: + azureSubscription: apachehudici-service-connection + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + echo 'Downloading $(SPARK_ARCHIVE)' + az storage blob download -c ci-caches -n $(SPARK_ARCHIVE).tgz -f $(Pipeline.Workspace)/$(SPARK_ARCHIVE).tgz --account-name apachehudici + tar -xvf $(Pipeline.Workspace)/$(SPARK_ARCHIVE).tgz -C $(Pipeline.Workspace)/ + mkdir /tmp/spark-events/ + - script: | + export 
SPARK_HOME=$(Pipeline.Workspace)/$(SPARK_ARCHIVE) + mvn $(MVN_OPTS_TEST) -Pintegration-tests verify + displayName: IT diff --git a/conf/hudi-defaults.conf.template b/conf/hudi-defaults.conf.template new file mode 100644 index 0000000000000..175dbaf23d739 --- /dev/null +++ b/conf/hudi-defaults.conf.template @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running Hudi jobs. +# This is useful for setting default environmental settings. + +# Example: +# hoodie.datasource.hive_sync.jdbcurl jdbc:hive2://localhost:10000 +# hoodie.datasource.hive_sync.use_jdbc true +# hoodie.datasource.hive_sync.support_timestamp false +# hoodie.index.type BLOOM +# hoodie.metadata.enable false diff --git a/dependencies/hudi-flink-bundle_2.11.txt b/dependencies/hudi-flink-bundle_2.11.txt new file mode 100644 index 0000000000000..a38c9114946e8 --- /dev/null +++ b/dependencies/hudi-flink-bundle_2.11.txt @@ -0,0 +1,290 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
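The manifest entries that follow appear to use the layout `artifactId/groupId/version/classifier/fileName`, with the classifier slot left empty for most artifacts. As a minimal sketch (assuming that layout holds for every non-comment line), the bundled groups and versions can be listed with:

```
# Minimal sketch: print groupId and version for each bundled artifact,
# assuming the artifactId/groupId/version/classifier/fileName layout
grep -v '^#' dependencies/hudi-flink-bundle_2.11.txt | awk -F'/' 'NF {print $2, $3}'
```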
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +akka-actor_2.11/com.typesafe.akka/2.5.21//akka-actor_2.11-2.5.21.jar +akka-protobuf_2.11/com.typesafe.akka/2.5.21//akka-protobuf_2.11-2.5.21.jar +akka-slf4j_2.11/com.typesafe.akka/2.5.21//akka-slf4j_2.11-2.5.21.jar +akka-stream_2.11/com.typesafe.akka/2.5.21//akka-stream_2.11-2.5.21.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +audience-annotations/org.apache.yetus/0.11.0//audience-annotations-0.11.0.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.10.0//avro-1.10.0.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.7.6//chill-java-0.7.6.jar +chill_2.11/com.twitter/0.7.6//chill_2.11-0.7.6.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.20//commons-compress-1.20.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.0.1//commons-httpclient-3.0.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.5//commons-math3-3.5.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +config/com.typesafe/1.3.3//config-1.3.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/5.0.1//datanucleus-api-jdo-5.0.1.jar 
+datanucleus-core/org.datanucleus/5.0.1//datanucleus-core-5.0.1.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/6.5.6//fastutil-6.5.6.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flink-annotations/org.apache.flink/1.13.1//flink-annotations-1.13.1.jar +flink-avro/org.apache.flink/1.13.1//flink-avro-1.13.1.jar +flink-clients_2.11/org.apache.flink/1.13.1//flink-clients_2.11-1.13.1.jar +flink-connector-base/org.apache.flink/1.13.1//flink-connector-base-1.13.1.jar +flink-connector-kafka_2.11/org.apache.flink/1.13.1//flink-connector-kafka_2.11-1.13.1.jar +flink-core/org.apache.flink/1.13.1//flink-core-1.13.1.jar +flink-file-sink-common/org.apache.flink/1.13.1//flink-file-sink-common-1.13.1.jar +flink-hadoop-compatibility_2.11/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.11-1.13.1.jar +flink-hadoop-fs/org.apache.flink/1.13.1//flink-hadoop-fs-1.13.1.jar +flink-java/org.apache.flink/1.13.1//flink-java-1.13.1.jar +flink-json/org.apache.flink/1.13.1//flink-json-1.13.1.jar +flink-metrics-core/org.apache.flink/1.13.1//flink-metrics-core-1.13.1.jar +flink-optimizer_2.11/org.apache.flink/1.13.1//flink-optimizer_2.11-1.13.1.jar +flink-parquet_2.11/org.apache.flink/1.13.1//flink-parquet_2.11-1.13.1.jar +flink-queryable-state-client-java/org.apache.flink/1.13.1//flink-queryable-state-client-java-1.13.1.jar +flink-runtime_2.11/org.apache.flink/1.13.1//flink-runtime_2.11-1.13.1.jar +flink-shaded-asm-7/org.apache.flink/7.1-13.0//flink-shaded-asm-7-7.1-13.0.jar +flink-shaded-guava/org.apache.flink/18.0-13.0//flink-shaded-guava-18.0-13.0.jar +flink-shaded-jackson/org.apache.flink/2.12.1-13.0//flink-shaded-jackson-2.12.1-13.0.jar +flink-shaded-netty/org.apache.flink/4.1.49.Final-13.0//flink-shaded-netty-4.1.49.Final-13.0.jar +flink-shaded-zookeeper-3/org.apache.flink/3.4.14-13.0//flink-shaded-zookeeper-3-3.4.14-13.0.jar +flink-streaming-java_2.11/org.apache.flink/1.13.1//flink-streaming-java_2.11-1.13.1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +force-shading/org.apache.flink/1.13.1//force-shading-1.13.1.jar +grizzled-slf4j_2.11/org.clapper/1.3.2//grizzled-slf4j_2.11-1.3.2.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar 
+hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar 
+javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.24.0-GA//javassist-3.24.0-GA.jar +javax.annotation-api/javax.annotation/1.3.2//javax.annotation-api-1.3.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/1.3.9//jsr305-1.3.9.jar +junit/junit/4.12//junit-4.12.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +kryo/com.esotericsoftware.kryo/2.24.0//kryo-2.24.0.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar 
+metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware.minlog/1.2//minlog-1.2.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +parquet-avro/org.apache.parquet/1.11.1//parquet-avro-1.11.1.jar +parquet-column/org.apache.parquet/1.11.1//parquet-column-1.11.1.jar +parquet-common/org.apache.parquet/1.11.1//parquet-common-1.11.1.jar +parquet-encoding/org.apache.parquet/1.11.1//parquet-encoding-1.11.1.jar +parquet-format-structures/org.apache.parquet/1.11.1//parquet-format-structures-1.11.1.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.11.1//parquet-hadoop-1.11.1.jar +parquet-jackson/org.apache.parquet/1.11.1//parquet-jackson-1.11.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +reactive-streams/org.reactivestreams/1.0.2//reactive-streams-1.0.2.jar +scala-java8-compat_2.11/org.scala-lang.modules/0.7.0//scala-java8-compat_2.11-0.7.0.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-parser-combinators_2.11/org.scala-lang.modules/1.1.1//scala-parser-combinators_2.11-1.1.1.jar +scopt_2.11/com.github.scopt/3.5.0//scopt_2.11-3.5.0.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.22//slf4j-api-1.7.22.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar +ssl-config-core_2.11/com.typesafe/0.3.7//ssl-config-core_2.11-0.3.7.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar 
+websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar diff --git a/dependencies/hudi-flink-bundle_2.12.txt b/dependencies/hudi-flink-bundle_2.12.txt new file mode 100644 index 0000000000000..37f957aeebd1f --- /dev/null +++ b/dependencies/hudi-flink-bundle_2.12.txt @@ -0,0 +1,291 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +akka-actor_2.11/com.typesafe.akka/2.5.21//akka-actor_2.11-2.5.21.jar +akka-protobuf_2.11/com.typesafe.akka/2.5.21//akka-protobuf_2.11-2.5.21.jar +akka-slf4j_2.11/com.typesafe.akka/2.5.21//akka-slf4j_2.11-2.5.21.jar +akka-stream_2.11/com.typesafe.akka/2.5.21//akka-stream_2.11-2.5.21.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +audience-annotations/org.apache.yetus/0.11.0//audience-annotations-0.11.0.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.10.0//avro-1.10.0.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.7.6//chill-java-0.7.6.jar 
+chill_2.11/com.twitter/0.7.6//chill_2.11-0.7.6.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.20//commons-compress-1.20.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.0.1//commons-httpclient-3.0.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.5//commons-math3-3.5.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +config/com.typesafe/1.3.3//config-1.3.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/5.0.1//datanucleus-api-jdo-5.0.1.jar +datanucleus-core/org.datanucleus/5.0.1//datanucleus-core-5.0.1.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/6.5.6//fastutil-6.5.6.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flink-annotations/org.apache.flink/1.13.1//flink-annotations-1.13.1.jar +flink-avro/org.apache.flink/1.13.1//flink-avro-1.13.1.jar +flink-clients_2.11/org.apache.flink/1.13.1//flink-clients_2.11-1.13.1.jar +flink-connector-base/org.apache.flink/1.13.1//flink-connector-base-1.13.1.jar +flink-connector-kafka_2.11/org.apache.flink/1.13.1//flink-connector-kafka_2.11-1.13.1.jar +flink-core/org.apache.flink/1.13.1//flink-core-1.13.1.jar +flink-file-sink-common/org.apache.flink/1.13.1//flink-file-sink-common-1.13.1.jar +flink-hadoop-compatibility_2.11/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.11-1.13.1.jar +flink-hadoop-compatibility_2.12/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.12-1.13.1.jar +flink-hadoop-fs/org.apache.flink/1.13.1//flink-hadoop-fs-1.13.1.jar +flink-java/org.apache.flink/1.13.1//flink-java-1.13.1.jar +flink-json/org.apache.flink/1.13.1//flink-json-1.13.1.jar +flink-metrics-core/org.apache.flink/1.13.1//flink-metrics-core-1.13.1.jar +flink-optimizer_2.11/org.apache.flink/1.13.1//flink-optimizer_2.11-1.13.1.jar +flink-parquet_2.12/org.apache.flink/1.13.1//flink-parquet_2.12-1.13.1.jar +flink-queryable-state-client-java/org.apache.flink/1.13.1//flink-queryable-state-client-java-1.13.1.jar 
+flink-runtime_2.11/org.apache.flink/1.13.1//flink-runtime_2.11-1.13.1.jar +flink-shaded-asm-7/org.apache.flink/7.1-13.0//flink-shaded-asm-7-7.1-13.0.jar +flink-shaded-guava/org.apache.flink/18.0-13.0//flink-shaded-guava-18.0-13.0.jar +flink-shaded-jackson/org.apache.flink/2.12.1-13.0//flink-shaded-jackson-2.12.1-13.0.jar +flink-shaded-netty/org.apache.flink/4.1.49.Final-13.0//flink-shaded-netty-4.1.49.Final-13.0.jar +flink-shaded-zookeeper-3/org.apache.flink/3.4.14-13.0//flink-shaded-zookeeper-3-3.4.14-13.0.jar +flink-streaming-java_2.11/org.apache.flink/1.13.1//flink-streaming-java_2.11-1.13.1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +force-shading/org.apache.flink/1.13.1//force-shading-1.13.1.jar +grizzled-slf4j_2.11/org.clapper/1.3.2//grizzled-slf4j_2.11-1.3.2.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar 
+hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.10.0//jackson-annotations-2.10.0.jar +jackson-core/com.fasterxml.jackson.core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/com.fasterxml.jackson.core/2.10.0//jackson-databind-2.10.0.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.24.0-GA//javassist-3.24.0-GA.jar +javax.annotation-api/javax.annotation/1.3.2//javax.annotation-api-1.3.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar 
+jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/1.3.9//jsr305-1.3.9.jar +junit/junit/4.12//junit-4.12.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +kryo/com.esotericsoftware.kryo/2.24.0//kryo-2.24.0.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware.minlog/1.2//minlog-1.2.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +parquet-avro/org.apache.parquet/1.11.1//parquet-avro-1.11.1.jar +parquet-column/org.apache.parquet/1.11.1//parquet-column-1.11.1.jar +parquet-common/org.apache.parquet/1.11.1//parquet-common-1.11.1.jar +parquet-encoding/org.apache.parquet/1.11.1//parquet-encoding-1.11.1.jar +parquet-format-structures/org.apache.parquet/1.11.1//parquet-format-structures-1.11.1.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.11.1//parquet-hadoop-1.11.1.jar +parquet-jackson/org.apache.parquet/1.11.1//parquet-jackson-1.11.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +reactive-streams/org.reactivestreams/1.0.2//reactive-streams-1.0.2.jar +scala-java8-compat_2.11/org.scala-lang.modules/0.7.0//scala-java8-compat_2.11-0.7.0.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar 
+scala-parser-combinators_2.11/org.scala-lang.modules/1.1.1//scala-parser-combinators_2.11-1.1.1.jar +scopt_2.11/com.github.scopt/3.5.0//scopt_2.11-3.5.0.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.22//slf4j-api-1.7.22.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar +ssl-config-core_2.11/com.typesafe/0.3.7//ssl-config-core_2.11-0.3.7.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar diff --git a/dependencies/hudi-hadoop-mr-bundle.txt b/dependencies/hudi-hadoop-mr-bundle.txt new file mode 100644 index 0000000000000..8d9a6ce2f4255 --- /dev/null +++ b/dependencies/hudi-hadoop-mr-bundle.txt @@ -0,0 +1,131 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.9//commons-codec-1.9.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar 
+hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javax.inject/javax.inject/1//javax.inject-1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +junit/junit/4.12//junit-4.12.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +servlet-api/javax.servlet/2.5//servlet-api-2.5.jar +slf4j-api/org.slf4j/1.7.7//slf4j-api-1.7.7.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar 
+xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar diff --git a/dependencies/hudi-hive-sync-bundle.txt b/dependencies/hudi-hive-sync-bundle.txt new file mode 100644 index 0000000000000..5b5f4b73c9e94 --- /dev/null +++ b/dependencies/hudi-hive-sync-bundle.txt @@ -0,0 +1,130 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.9//commons-codec-1.9.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar 
+hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javax.inject/javax.inject/1//javax.inject-1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +junit/junit/4.12//junit-4.12.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar 
+netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +slf4j-api/org.slf4j/1.7.7//slf4j-api-1.7.7.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar diff --git a/dependencies/hudi-integ-test-bundle.txt b/dependencies/hudi-integ-test-bundle.txt new file mode 100644 index 0000000000000..ec0f14e4a443c --- /dev/null +++ b/dependencies/hudi-integ-test-bundle.txt @@ -0,0 +1,341 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +antlr4-runtime/org.antlr/4.7//antlr4-runtime-4.7.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b10//aopalliance-repackaged-2.4.0-b10.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +arrow-format/org.apache.arrow/0.10.0//arrow-format-0.10.0.jar +arrow-memory/org.apache.arrow/0.10.0//arrow-memory-0.10.0.jar +arrow-vector/org.apache.arrow/0.10.0//arrow-vector-0.10.0.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro-ipc/org.apache.avro/1.7.7//avro-ipc-1.7.7.jar +avro-ipc/org.apache.avro/1.7.7/tests/avro-ipc-1.7.7-tests.jar +avro-mapred/org.apache.avro/1.7.7//avro-mapred-1.7.7.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.3//bijection-avro_2.11-0.9.3.jar +bijection-core_2.11/com.twitter/0.9.3//bijection-core_2.11-0.9.3.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.4.1//commons-compress-1.4.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar 
+commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.1.3//commons-logging-1.1.3.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.4//commons-pool-1.4.jar +compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flatbuffers/com.vlkan/1.2.0-3f79e055//flatbuffers-1.2.0-3f79e055.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/11.0.2//guava-11.0.2.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3/tests/hadoop-common-2.7.3-tests.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3/tests/hadoop-hdfs-2.7.3-tests.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar 
+hbase-annotations/org.apache.hbase/1.1.1//hbase-annotations-1.1.1.jar +hbase-client/org.apache.hbase/1.1.1//hbase-client-1.1.1.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1/standalone/hive-jdbc-2.3.1-standalone.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +hk2-api/org.glassfish.hk2/2.4.0-b10//hk2-api-2.4.0-b10.jar +hk2-locator/org.glassfish.hk2/2.4.0-b10//hk2-locator-2.4.0-b10.jar +hk2-utils/org.glassfish.hk2/2.4.0-b10//hk2-utils-2.4.0-b10.jar +hppc/com.carrotsearch/0.7.2//hppc-0.7.2.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-dataformat-yaml/com.fasterxml.jackson.dataformat/2.7.4//jackson-dataformat-yaml-2.7.4.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar 
+javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.18.1-GA//javassist-3.18.1-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b10//javax.inject-2.4.0-b10.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-apache-connector/org.glassfish.jersey.connectors/2.17//jersey-apache-connector-2.17.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar +json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar 
+jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.0//lz4-java-1.4.0.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +mockito-all/org.mockito/1.10.19//mockito-all-1.10.19.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-mapreduce/org.apache.orc/1.5.5/nohive/orc-mapreduce-1.5.5-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.8//paranamer-2.8.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.8//scala-library-2.11.8.jar +scala-parser-combinators_2.11/org.scala-lang.modules/1.1.0//scala-parser-combinators_2.11-1.1.0.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar 
+scalac-scoverage-runtime_2.11/org.scoverage/1.3.0//scalac-scoverage-runtime_2.11-1.3.0.jar +servlet-api/javax.servlet/2.5//servlet-api-2.5.jar +servlet-api/org.mortbay.jetty/2.5-20081211//servlet-api-2.5-20081211.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snakeyaml/org.yaml/1.15//snakeyaml-1.15.jar +snappy-java/org.xerial.snappy/1.1.7.3//snappy-java-1.1.7.3.jar +spark-catalyst_2.11/org.apache.spark/2.4.4//spark-catalyst_2.11-2.4.4.jar +spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar +spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-sketch_2.11/org.apache.spark/2.4.4//spark-sketch_2.11-2.4.4.jar +spark-sql_2.11/org.apache.spark/2.4.4//spark-sql_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +univocity-parsers/com.univocity/2.7.3//univocity-parsers-2.7.3.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar 
+websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/dependencies/hudi-kafka-connect-bundle.txt b/dependencies/hudi-kafka-connect-bundle.txt new file mode 100644 index 0000000000000..aeb0f4fc55f13 --- /dev/null +++ b/dependencies/hudi-kafka-connect-bundle.txt @@ -0,0 +1,269 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +akka-actor_2.11/com.typesafe.akka/2.5.21//akka-actor_2.11-2.5.21.jar +akka-protobuf_2.11/com.typesafe.akka/2.5.21//akka-protobuf_2.11-2.5.21.jar +akka-slf4j_2.11/com.typesafe.akka/2.5.21//akka-slf4j_2.11-2.5.21.jar +akka-stream_2.11/com.typesafe.akka/2.5.21//akka-stream_2.11-2.5.21.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +antlr-runtime/org.antlr/3.3//antlr-runtime-3.3.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b34//aopalliance-repackaged-2.4.0-b34.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro-ipc/org.apache.avro/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar 
+commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.3.2//commons-lang3-3.3.2.jar +commons-logging/commons-logging/1.1.3//commons-logging-1.1.3.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +config/com.typesafe/1.3.3//config-1.3.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +flink-annotations/org.apache.flink/1.12.1//flink-annotations-1.12.1.jar +flink-clients_2.11/org.apache.flink/1.13.1//flink-clients_2.11-1.13.1.jar +flink-connector-base/org.apache.flink/1.13.1//flink-connector-base-1.13.1.jar +flink-connector-kafka_2.11/org.apache.flink/1.13.1//flink-connector-kafka_2.11-1.13.1.jar +flink-core/org.apache.flink/1.12.1//flink-core-1.12.1.jar +flink-file-sink-common/org.apache.flink/1.13.1//flink-file-sink-common-1.13.1.jar +flink-hadoop-compatibility_2.11/org.apache.flink/1.13.1//flink-hadoop-compatibility_2.11-1.13.1.jar +flink-hadoop-fs/org.apache.flink/1.13.1//flink-hadoop-fs-1.13.1.jar +flink-java/org.apache.flink/1.13.1//flink-java-1.13.1.jar +flink-metrics-core/org.apache.flink/1.12.1//flink-metrics-core-1.12.1.jar +flink-optimizer_2.11/org.apache.flink/1.13.1//flink-optimizer_2.11-1.13.1.jar +flink-queryable-state-client-java/org.apache.flink/1.13.1//flink-queryable-state-client-java-1.13.1.jar +flink-runtime_2.11/org.apache.flink/1.13.1//flink-runtime_2.11-1.13.1.jar +flink-shaded-asm-7/org.apache.flink/7.1-12.0//flink-shaded-asm-7-7.1-12.0.jar +flink-shaded-guava/org.apache.flink/18.0-12.0//flink-shaded-guava-18.0-12.0.jar +flink-shaded-jackson/org.apache.flink/2.12.1-13.0//flink-shaded-jackson-2.12.1-13.0.jar +flink-shaded-netty/org.apache.flink/4.1.49.Final-13.0//flink-shaded-netty-4.1.49.Final-13.0.jar +flink-shaded-zookeeper-3/org.apache.flink/3.4.14-13.0//flink-shaded-zookeeper-3-3.4.14-13.0.jar +flink-streaming-java_2.11/org.apache.flink/1.13.1//flink-streaming-java_2.11-1.13.1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +force-shading/org.apache.flink/1.12.1//force-shading-1.12.1.jar +grizzled-slf4j_2.11/org.clapper/1.3.2//grizzled-slf4j_2.11-1.3.2.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/11.0.2//guava-11.0.2.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar 
+hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +hk2-api/org.glassfish.hk2/2.4.0-b34//hk2-api-2.4.0-b34.jar +hk2-locator/org.glassfish.hk2/2.4.0-b34//hk2-locator-2.4.0-b34.jar +hk2-utils/org.glassfish.hk2/2.4.0-b34//hk2-utils-2.4.0-b34.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.24.0-GA//javassist-3.24.0-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b34//javax.inject-2.4.0-b34.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar 
+javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar +json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar 
+metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.5//metrics-json-3.1.5.jar +metrics-jvm/io.dropwizard.metrics/3.1.5//metrics-jvm-3.1.5.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.7.0.Final//netty-3.7.0.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/3.17.3//protobuf-java-3.17.3.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +reactive-streams/org.reactivestreams/1.0.2//reactive-streams-1.0.2.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-java8-compat_2.11/org.scala-lang.modules/0.7.0//scala-java8-compat_2.11-0.7.0.jar +scala-library/org.scala-lang/2.11.8//scala-library-2.11.8.jar +scala-parser-combinators_2.11/org.scala-lang.modules/1.1.1//scala-parser-combinators_2.11-1.1.1.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar +scopt_2.11/com.github.scopt/3.5.0//scopt_2.11-3.5.0.jar +servlet-api/javax.servlet/2.5//servlet-api-2.5.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar +spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar 
+ssl-config-core_2.11/com.typesafe/0.3.7//ssl-config-core_2.11-0.3.7.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/dependencies/hudi-presto-bundle.txt b/dependencies/hudi-presto-bundle.txt new file mode 100644 index 0000000000000..4f8ffc4c77421 --- /dev/null +++ b/dependencies/hudi-presto-bundle.txt @@ -0,0 +1,130 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.9//commons-codec-1.9.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.8.1//commons-compress-1.8.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.6//commons-pool-1.6.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/12.0.1//guava-12.0.1.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar 
+hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javax.inject/javax.inject/1//javax.inject-1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +jsr305/com.google.code.findbugs/1.3.9//jsr305-1.3.9.jar +junit/junit/4.12//junit-4.12.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +slf4j-api/org.slf4j/1.7.7//slf4j-api-1.7.7.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar 
+xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar diff --git a/dependencies/hudi-spark-bundle_2.11.txt b/dependencies/hudi-spark-bundle_2.11.txt new file mode 100644 index 0000000000000..39d183520a45d --- /dev/null +++ b/dependencies/hudi-spark-bundle_2.11.txt @@ -0,0 +1,255 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar 
+commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar 
+hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar 
+jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar 
+protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.10//slf4j-api-1.7.10.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.2.6//snappy-java-1.1.2.6.jar +spark-avro_2.11/org.apache.spark/2.4.4//spark-avro_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar diff --git a/dependencies/hudi-spark-bundle_2.12.txt b/dependencies/hudi-spark-bundle_2.12.txt new file mode 100644 index 0000000000000..207778449642e --- /dev/null +++ b/dependencies/hudi-spark-bundle_2.12.txt @@ -0,0 +1,255 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar 
+dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar 
+hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar 
+joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.10//slf4j-api-1.7.10.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar 
+slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.2.6//snappy-java-1.1.2.6.jar +spark-avro_2.12/org.apache.spark/2.4.4//spark-avro_2.12-2.4.4.jar +spark-tags_2.12/org.apache.spark/2.4.4//spark-tags_2.12-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar diff --git a/dependencies/hudi-spark3-bundle_2.12.txt b/dependencies/hudi-spark3-bundle_2.12.txt new file mode 100644 index 0000000000000..25f174777443b --- /dev/null +++ b/dependencies/hudi-spark3-bundle_2.12.txt @@ -0,0 +1,255 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar 
+groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar 
+hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.10.0//jackson-annotations-2.10.0.jar +jackson-core/com.fasterxml.jackson.core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/com.fasterxml.jackson.core/2.10.0//jackson-databind-2.10.0.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar 
+kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar +log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.10//slf4j-api-1.7.10.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.2.6//snappy-java-1.1.2.6.jar +spark-avro_2.12/org.apache.spark/3.0.0//spark-avro_2.12-3.0.0.jar +spark-tags_2.12/org.apache.spark/3.0.0//spark-tags_2.12-3.0.0.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar 
+tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar diff --git a/dependencies/hudi-timeline-server-bundle.txt b/dependencies/hudi-timeline-server-bundle.txt new file mode 100644 index 0000000000000..3042a1af5ad50 --- /dev/null +++ b/dependencies/hudi-timeline-server-bundle.txt @@ -0,0 +1,139 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compress/org.apache.commons/1.4.1//commons-compress-1.4.1.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/11.0.2//guava-11.0.2.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.3//hadoop-yarn-server-common-2.7.3.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3/tests/hbase-common-1.2.3-tests.jar 
+hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar +hive-storage-api/org.apache.hive/2.6.0//hive-storage-api-2.6.0.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jamon-runtime/org.jamon/2.4.1//jamon-runtime-2.4.1.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +junit/junit/4.12//junit-4.12.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar 
+orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +slf4j-api/org.slf4j/1.7.26//slf4j-api-1.7.26.jar +slf4j-log4j12/org.slf4j/1.7.10//slf4j-log4j12-1.7.10.jar +snappy-java/org.xerial.snappy/1.1.1.3//snappy-java-1.1.1.3.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar diff --git a/dependencies/hudi-utilities-bundle_2.11.txt b/dependencies/hudi-utilities-bundle_2.11.txt new file mode 100644 index 0000000000000..d884e59098a85 --- /dev/null +++ b/dependencies/hudi-utilities-bundle_2.11.txt @@ -0,0 +1,317 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b34//aopalliance-repackaged-2.4.0-b34.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro-ipc/org.apache.avro/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar 
+compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar 
+hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +hk2-api/org.glassfish.hk2/2.4.0-b34//hk2-api-2.4.0-b34.jar +hk2-locator/org.glassfish.hk2/2.4.0-b34//hk2-locator-2.4.0-b34.jar +hk2-utils/org.glassfish.hk2/2.4.0-b34//hk2-utils-2.4.0-b34.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.6.7//jackson-annotations-2.6.7.jar +jackson-core/com.fasterxml.jackson.core/2.6.7//jackson-core-2.6.7.jar +jackson-databind/com.fasterxml.jackson.core/2.6.7.3//jackson-databind-2.6.7.3.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.18.1-GA//javassist-3.18.1-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b34//javax.inject-2.4.0-b34.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar 
+jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar +json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar 
+log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar +spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar +spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar 
+spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/dependencies/hudi-utilities-bundle_2.12.txt b/dependencies/hudi-utilities-bundle_2.12.txt new file mode 100644 index 0000000000000..468492edfe3e2 --- /dev/null +++ b/dependencies/hudi-utilities-bundle_2.12.txt @@ -0,0 +1,317 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +HikariCP/com.zaxxer/2.5.1//HikariCP-2.5.1.jar +RoaringBitmap/org.roaringbitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/org.antlr/4.0.4//ST4-4.0.4.jar +activation/javax.activation/1.1.1//activation-1.1.1.jar +aircompressor/io.airlift/0.15//aircompressor-0.15.jar +annotations/org.jetbrains/17.0.0//annotations-17.0.0.jar +ant-launcher/org.apache.ant/1.9.1//ant-launcher-1.9.1.jar +ant/ant/1.6.5//ant-1.6.5.jar +ant/org.apache.ant/1.9.1//ant-1.9.1.jar +antlr-runtime/org.antlr/3.5.2//antlr-runtime-3.5.2.jar +aopalliance-repackaged/org.glassfish.hk2.external/2.4.0-b34//aopalliance-repackaged-2.4.0-b34.jar +aopalliance/aopalliance/1.0//aopalliance-1.0.jar +apache-curator/org.apache.curator/2.7.1//apache-curator-2.7.1.pom +apacheds-i18n/org.apache.directory.server/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/org.apache.directory.server/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/org.apache.directory.api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/org.apache.directory.api/1.0.0-M20//api-util-1.0.0-M20.jar +asm/asm/3.1//asm-3.1.jar +avatica-metrics/org.apache.calcite.avatica/1.8.0//avatica-metrics-1.8.0.jar +avatica/org.apache.calcite.avatica/1.8.0//avatica-1.8.0.jar +avro-ipc/org.apache.avro/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/org.apache.avro/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/org.apache.avro/1.8.2//avro-1.8.2.jar +aws-java-sdk-core/com.amazonaws/1.12.22//aws-java-sdk-core-1.12.22.jar +aws-java-sdk-sqs/com.amazonaws/1.12.22//aws-java-sdk-sqs-1.12.22.jar +bijection-avro_2.11/com.twitter/0.9.7//bijection-avro_2.11-0.9.7.jar +bijection-core_2.11/com.twitter/0.9.7//bijection-core_2.11-0.9.7.jar +bonecp/com.jolbox/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +calcite-core/org.apache.calcite/1.10.0//calcite-core-1.10.0.jar +calcite-druid/org.apache.calcite/1.10.0//calcite-druid-1.10.0.jar +calcite-linq4j/org.apache.calcite/1.10.0//calcite-linq4j-1.10.0.jar +chill-java/com.twitter/0.9.3//chill-java-0.9.3.jar +chill_2.11/com.twitter/0.9.3//chill_2.11-0.9.3.jar +common-config/io.confluent/5.3.4//common-config-5.3.4.jar +common-utils/io.confluent/5.3.4//common-utils-5.3.4.jar +commons-beanutils-core/commons-beanutils/1.8.0//commons-beanutils-core-1.8.0.jar +commons-beanutils/commons-beanutils/1.7.0//commons-beanutils-1.7.0.jar +commons-cli/commons-cli/1.2//commons-cli-1.2.jar +commons-codec/commons-codec/1.4//commons-codec-1.4.jar +commons-collections/commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/org.codehaus.janino/2.7.6//commons-compiler-2.7.6.jar +commons-compress/org.apache.commons/1.9//commons-compress-1.9.jar +commons-configuration/commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/org.apache.commons/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/commons-digester/1.8//commons-digester-1.8.jar +commons-el/commons-el/1.0//commons-el-1.0.jar +commons-httpclient/commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/commons-io/2.4//commons-io-2.4.jar +commons-lang/commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/org.apache.commons/3.1//commons-lang3-3.1.jar +commons-logging/commons-logging/1.2//commons-logging-1.2.jar +commons-math/org.apache.commons/2.2//commons-math-2.2.jar +commons-math3/org.apache.commons/3.1.1//commons-math3-3.1.1.jar +commons-net/commons-net/3.1//commons-net-3.1.jar +commons-pool/commons-pool/1.5.4//commons-pool-1.5.4.jar 
+compress-lzf/com.ning/1.0.3//compress-lzf-1.0.3.jar +curator-client/org.apache.curator/2.7.1//curator-client-2.7.1.jar +curator-framework/org.apache.curator/2.7.1//curator-framework-2.7.1.jar +curator-recipes/org.apache.curator/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/org.datanucleus/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/org.datanucleus/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/org.datanucleus/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/org.apache.derby/10.10.2.0//derby-10.10.2.0.jar +disruptor/com.lmax/3.3.0//disruptor-3.3.0.jar +dropwizard-metrics-hadoop-metrics2-reporter/com.github.joshelser/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +eigenbase-properties/net.hydromatic/1.1.5//eigenbase-properties-1.1.5.jar +fastutil/it.unimi.dsi/7.0.13//fastutil-7.0.13.jar +findbugs-annotations/com.github.stephenc.findbugs/1.3.9-1//findbugs-annotations-1.3.9-1.jar +fluent-hc/org.apache.httpcomponents/4.4.1//fluent-hc-4.4.1.jar +groovy-all/org.codehaus.groovy/2.4.4//groovy-all-2.4.4.jar +gson/com.google.code.gson/2.3.1//gson-2.3.1.jar +guava/com.google.guava/14.0.1//guava-14.0.1.jar +guice-assistedinject/com.google.inject.extensions/3.0//guice-assistedinject-3.0.jar +guice-servlet/com.google.inject.extensions/3.0//guice-servlet-3.0.jar +guice/com.google.inject/3.0//guice-3.0.jar +hadoop-annotations/org.apache.hadoop/2.7.3//hadoop-annotations-2.7.3.jar +hadoop-auth/org.apache.hadoop/2.7.3//hadoop-auth-2.7.3.jar +hadoop-client/org.apache.hadoop/2.7.3//hadoop-client-2.7.3.jar +hadoop-common/org.apache.hadoop/2.7.3//hadoop-common-2.7.3.jar +hadoop-hdfs/org.apache.hadoop/2.7.3//hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-jobclient-2.7.3.jar +hadoop-mapreduce-client-shuffle/org.apache.hadoop/2.7.3//hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api/org.apache.hadoop/2.7.3//hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client/org.apache.hadoop/2.7.3//hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common/org.apache.hadoop/2.7.3//hadoop-yarn-common-2.7.3.jar +hadoop-yarn-registry/org.apache.hadoop/2.7.1//hadoop-yarn-registry-2.7.1.jar +hadoop-yarn-server-applicationhistoryservice/org.apache.hadoop/2.7.2//hadoop-yarn-server-applicationhistoryservice-2.7.2.jar +hadoop-yarn-server-common/org.apache.hadoop/2.7.2//hadoop-yarn-server-common-2.7.2.jar +hadoop-yarn-server-resourcemanager/org.apache.hadoop/2.7.2//hadoop-yarn-server-resourcemanager-2.7.2.jar +hadoop-yarn-server-web-proxy/org.apache.hadoop/2.7.2//hadoop-yarn-server-web-proxy-2.7.2.jar +hamcrest-core/org.hamcrest/1.3//hamcrest-core-1.3.jar +hbase-annotations/org.apache.hbase/1.2.3//hbase-annotations-1.2.3.jar +hbase-client/org.apache.hbase/1.2.3//hbase-client-1.2.3.jar +hbase-common/org.apache.hbase/1.2.3//hbase-common-1.2.3.jar +hbase-hadoop-compat/org.apache.hbase/1.2.3//hbase-hadoop-compat-1.2.3.jar +hbase-hadoop2-compat/org.apache.hbase/1.2.3//hbase-hadoop2-compat-1.2.3.jar +hbase-prefix-tree/org.apache.hbase/1.2.3//hbase-prefix-tree-1.2.3.jar +hbase-procedure/org.apache.hbase/1.2.3//hbase-procedure-1.2.3.jar +hbase-protocol/org.apache.hbase/1.2.3//hbase-protocol-1.2.3.jar +hbase-server/org.apache.hbase/1.2.3//hbase-server-1.2.3.jar 
+hive-common/org.apache.hive/2.3.1//hive-common-2.3.1.jar +hive-exec/org.apache.hive/2.3.1//hive-exec-2.3.1.jar +hive-jdbc/org.apache.hive/2.3.1//hive-jdbc-2.3.1.jar +hive-llap-client/org.apache.hive/2.3.1//hive-llap-client-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1//hive-llap-common-2.3.1.jar +hive-llap-common/org.apache.hive/2.3.1/tests/hive-llap-common-2.3.1-tests.jar +hive-llap-server/org.apache.hive/2.3.1//hive-llap-server-2.3.1.jar +hive-llap-tez/org.apache.hive/2.3.1//hive-llap-tez-2.3.1.jar +hive-metastore/org.apache.hive/2.3.1//hive-metastore-2.3.1.jar +hive-serde/org.apache.hive/2.3.1//hive-serde-2.3.1.jar +hive-service-rpc/org.apache.hive/2.3.1//hive-service-rpc-2.3.1.jar +hive-service/org.apache.hive/2.3.1//hive-service-2.3.1.jar +hive-shims-0.23/org.apache.hive.shims/2.3.1//hive-shims-0.23-2.3.1.jar +hive-shims-common/org.apache.hive.shims/2.3.1//hive-shims-common-2.3.1.jar +hive-shims-scheduler/org.apache.hive.shims/2.3.1//hive-shims-scheduler-2.3.1.jar +hive-shims/org.apache.hive/2.3.1//hive-shims-2.3.1.jar +hive-storage-api/org.apache.hive/2.3.1//hive-storage-api-2.3.1.jar +hive-vector-code-gen/org.apache.hive/2.3.1//hive-vector-code-gen-2.3.1.jar +hk2-api/org.glassfish.hk2/2.4.0-b34//hk2-api-2.4.0-b34.jar +hk2-locator/org.glassfish.hk2/2.4.0-b34//hk2-locator-2.4.0-b34.jar +hk2-utils/org.glassfish.hk2/2.4.0-b34//hk2-utils-2.4.0-b34.jar +htrace-core/org.apache.htrace/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/org.apache.httpcomponents/4.4.1//httpclient-4.4.1.jar +httpcore/org.apache.httpcomponents/4.4.1//httpcore-4.4.1.jar +ion-java/software.amazon.ion/1.0.2//ion-java-1.0.2.jar +ivy/org.apache.ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/com.fasterxml.jackson.core/2.10.0//jackson-annotations-2.10.0.jar +jackson-core/com.fasterxml.jackson.core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/com.fasterxml.jackson.core/2.10.0//jackson-databind-2.10.0.jar +jackson-dataformat-cbor/com.fasterxml.jackson.dataformat/2.12.3//jackson-dataformat-cbor-2.12.3.jar +jackson-dataformat-csv/com.fasterxml.jackson.dataformat/2.6.7//jackson-dataformat-csv-2.6.7.jar +jackson-module-paranamer/com.fasterxml.jackson.module/2.7.9//jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11/com.fasterxml.jackson.module/2.6.7.1//jackson-module-scala_2.11-2.6.7.1.jar +jamon-runtime/org.jamon/2.3.1//jamon-runtime-2.3.1.jar +janino/org.codehaus.janino/2.7.6//janino-2.7.6.jar +jasper-compiler/tomcat/5.5.23//jasper-compiler-5.5.23.jar +jasper-runtime/tomcat/5.5.23//jasper-runtime-5.5.23.jar +java-xmlbuilder/com.jamesmurty.utils/0.4//java-xmlbuilder-0.4.jar +javalin/io.javalin/2.8.0//javalin-2.8.0.jar +javassist/org.javassist/3.18.1-GA//javassist-3.18.1-GA.jar +javax.annotation-api/javax.annotation/1.2//javax.annotation-api-1.2.jar +javax.inject/javax.inject/1//javax.inject-1.jar +javax.inject/org.glassfish.hk2.external/2.4.0-b34//javax.inject-2.4.0-b34.jar +javax.jdo/org.datanucleus/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/javax.servlet/3.1.0//javax.servlet-api-3.1.0.jar +javax.servlet/org.eclipse.jetty.orbit/3.0.0.v201112011016//javax.servlet-3.0.0.v201112011016.jar +javax.ws.rs-api/javax.ws.rs/2.0.1//javax.ws.rs-api-2.0.1.jar +javolution/javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/javax.xml.bind/2.2.11//jaxb-api-2.2.11.jar +jaxb-impl/com.sun.xml.bind/2.2.3-1//jaxb-impl-2.2.3-1.jar +jcl-over-slf4j/org.slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcodings/org.jruby.jcodings/1.0.8//jcodings-1.0.8.jar +jcommander/com.beust/1.72//jcommander-1.72.jar 
+jdo-api/javax.jdo/3.0.1//jdo-api-3.0.1.jar +jersey-client/com.sun.jersey/1.9//jersey-client-1.9.jar +jersey-client/org.glassfish.jersey.core/2.22.2//jersey-client-2.22.2.jar +jersey-common/org.glassfish.jersey.core/2.22.2//jersey-common-2.22.2.jar +jersey-container-servlet-core/org.glassfish.jersey.containers/2.17//jersey-container-servlet-core-2.17.jar +jersey-container-servlet/org.glassfish.jersey.containers/2.22.2//jersey-container-servlet-2.22.2.jar +jersey-core/com.sun.jersey/1.9//jersey-core-1.9.jar +jersey-guava/org.glassfish.jersey.bundles.repackaged/2.22.2//jersey-guava-2.22.2.jar +jersey-guice/com.sun.jersey.contribs/1.9//jersey-guice-1.9.jar +jersey-json/com.sun.jersey/1.9//jersey-json-1.9.jar +jersey-media-jaxb/org.glassfish.jersey.media/2.17//jersey-media-jaxb-2.17.jar +jersey-server/com.sun.jersey/1.9//jersey-server-1.9.jar +jersey-server/org.glassfish.jersey.core/2.17//jersey-server-2.17.jar +jets3t/net.java.dev.jets3t/0.9.0//jets3t-0.9.0.jar +jettison/org.codehaus.jettison/1.1//jettison-1.1.jar +jetty-client/org.eclipse.jetty/9.4.15.v20190215//jetty-client-9.4.15.v20190215.jar +jetty-http/org.eclipse.jetty/9.4.15.v20190215//jetty-http-9.4.15.v20190215.jar +jetty-io/org.eclipse.jetty/9.4.15.v20190215//jetty-io-9.4.15.v20190215.jar +jetty-security/org.eclipse.jetty/9.4.15.v20190215//jetty-security-9.4.15.v20190215.jar +jetty-server/org.eclipse.jetty/9.4.15.v20190215//jetty-server-9.4.15.v20190215.jar +jetty-servlet/org.eclipse.jetty/9.4.15.v20190215//jetty-servlet-9.4.15.v20190215.jar +jetty-util/org.eclipse.jetty/9.4.15.v20190215//jetty-util-9.4.15.v20190215.jar +jetty-util/org.mortbay.jetty/6.1.26//jetty-util-6.1.26.jar +jetty-webapp/org.eclipse.jetty/9.4.15.v20190215//jetty-webapp-9.4.15.v20190215.jar +jetty-xml/org.eclipse.jetty/9.4.15.v20190215//jetty-xml-9.4.15.v20190215.jar +jetty/org.mortbay.jetty/6.1.26//jetty-6.1.26.jar +jline/jline/2.12//jline-2.12.jar +jmespath-java/com.amazonaws/1.12.22//jmespath-java-1.12.22.jar +joda-time/joda-time/2.9.9//joda-time-2.9.9.jar +joni/org.jruby.joni/2.1.2//joni-2.1.2.jar +jpam/net.sf.jpam/1.1//jpam-1.1.jar +jsch/com.jcraft/0.1.42//jsch-0.1.42.jar +json/com.tdunning/1.8//json-1.8.jar +json4s-ast_2.11/org.json4s/3.5.3//json4s-ast_2.11-3.5.3.jar +json4s-core_2.11/org.json4s/3.5.3//json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11/org.json4s/3.5.3//json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11/org.json4s/3.5.3//json4s-scalap_2.11-3.5.3.jar +jsp-api/javax.servlet.jsp/2.1//jsp-api-2.1.jar +jsp-api/javax.servlet/2.0//jsp-api-2.0.jar +jsr305/com.google.code.findbugs/3.0.0//jsr305-3.0.0.jar +jul-to-slf4j/org.slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +junit/junit/4.12//junit-4.12.jar +kafka-avro-serializer/io.confluent/5.3.4//kafka-avro-serializer-5.3.4.jar +kafka-clients/org.apache.kafka/2.0.0//kafka-clients-2.0.0.jar +kafka-schema-registry-client/io.confluent/5.3.4//kafka-schema-registry-client-5.3.4.jar +kotlin-stdlib-common/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-common-1.2.71.jar +kotlin-stdlib-jdk7/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk7-1.2.71.jar +kotlin-stdlib-jdk8/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-jdk8-1.2.71.jar +kotlin-stdlib/org.jetbrains.kotlin/1.2.71//kotlin-stdlib-1.2.71.jar +kryo-shaded/com.esotericsoftware/4.0.2//kryo-shaded-4.0.2.jar +leveldbjni-all/org.fusesource.leveldbjni/1.8//leveldbjni-all-1.8.jar +libfb303/org.apache.thrift/0.9.3//libfb303-0.9.3.jar +libthrift/org.apache.thrift/0.9.3//libthrift-0.9.3.jar +log4j-1.2-api/org.apache.logging.log4j/2.6.2//log4j-1.2-api-2.6.2.jar 
+log4j-api/org.apache.logging.log4j/2.6.2//log4j-api-2.6.2.jar +log4j-core/org.apache.logging.log4j/2.6.2//log4j-core-2.6.2.jar +log4j-slf4j-impl/org.apache.logging.log4j/2.6.2//log4j-slf4j-impl-2.6.2.jar +log4j-web/org.apache.logging.log4j/2.6.2//log4j-web-2.6.2.jar +log4j/log4j/1.2.17//log4j-1.2.17.jar +lz4-java/org.lz4/1.4.1//lz4-java-1.4.1.jar +metrics-core/com.yammer.metrics/2.2.0//metrics-core-2.2.0.jar +metrics-core/io.dropwizard.metrics/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/io.dropwizard.metrics/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/io.dropwizard.metrics/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/io.dropwizard.metrics/3.1.0//metrics-json-3.1.0.jar +metrics-jvm/io.dropwizard.metrics/3.1.0//metrics-jvm-3.1.0.jar +minlog/com.esotericsoftware/1.3.0//minlog-1.3.0.jar +netty-all/io.netty/4.0.23.Final//netty-all-4.0.23.Final.jar +netty/io.netty/3.6.2.Final//netty-3.6.2.Final.jar +objenesis/org.objenesis/2.5.1//objenesis-2.5.1.jar +opencsv/net.sf.opencsv/2.3//opencsv-2.3.jar +orc-core/org.apache.orc/1.3.3//orc-core-1.3.3.jar +orc-core/org.apache.orc/1.6.0/nohive/orc-core-1.6.0-nohive.jar +orc-shims/org.apache.orc/1.6.0//orc-shims-1.6.0.jar +oro/oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/org.glassfish.hk2/1.0.1//osgi-resource-locator-1.0.1.jar +paranamer/com.thoughtworks.paranamer/2.7//paranamer-2.7.jar +parquet-avro/org.apache.parquet/1.10.1//parquet-avro-1.10.1.jar +parquet-column/org.apache.parquet/1.10.1//parquet-column-1.10.1.jar +parquet-common/org.apache.parquet/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/org.apache.parquet/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/org.apache.parquet/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/org.apache.parquet/1.8.1//parquet-hadoop-bundle-1.8.1.jar +parquet-hadoop/org.apache.parquet/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/org.apache.parquet/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/com.google.protobuf/2.5.0//protobuf-java-2.5.0.jar +py4j/net.sf.py4j/0.10.7//py4j-0.10.7.jar +pyrolite/net.razorvine/4.13//pyrolite-4.13.jar +rocksdbjni/org.rocksdb/5.17.2//rocksdbjni-5.17.2.jar +scala-library/org.scala-lang/2.11.12//scala-library-2.11.12.jar +scala-reflect/org.scala-lang/2.11.8//scala-reflect-2.11.8.jar +scala-xml_2.11/org.scala-lang.modules/1.0.6//scala-xml_2.11-1.0.6.jar +servlet-api/javax.servlet/2.4//servlet-api-2.4.jar +shims/org.roaringbitmap/0.7.45//shims-0.7.45.jar +simpleclient/io.prometheus/0.8.0//simpleclient-0.8.0.jar +simpleclient_common/io.prometheus/0.8.0//simpleclient_common-0.8.0.jar +simpleclient_dropwizard/io.prometheus/0.8.0//simpleclient_dropwizard-0.8.0.jar +simpleclient_httpserver/io.prometheus/0.8.0//simpleclient_httpserver-0.8.0.jar +simpleclient_pushgateway/io.prometheus/0.8.0//simpleclient_pushgateway-0.8.0.jar +slf4j-api/org.slf4j/1.7.15//slf4j-api-1.7.15.jar +slf4j-log4j12/org.slf4j/1.6.1//slf4j-log4j12-1.6.1.jar +slider-core/org.apache.slider/0.90.2-incubating//slider-core-0.90.2-incubating.jar +snappy-java/org.xerial.snappy/1.1.7.1//snappy-java-1.1.7.1.jar +spark-core_2.11/org.apache.spark/2.4.4//spark-core_2.11-2.4.4.jar +spark-kvstore_2.11/org.apache.spark/2.4.4//spark-kvstore_2.11-2.4.4.jar +spark-launcher_2.11/org.apache.spark/2.4.4//spark-launcher_2.11-2.4.4.jar +spark-network-common_2.11/org.apache.spark/2.4.4//spark-network-common_2.11-2.4.4.jar +spark-network-shuffle_2.11/org.apache.spark/2.4.4//spark-network-shuffle_2.11-2.4.4.jar +spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4//spark-streaming-kafka-0-10_2.11-2.4.4.jar 
+spark-streaming-kafka-0-10_2.11/org.apache.spark/2.4.4/tests/spark-streaming-kafka-0-10_2.11-2.4.4-tests.jar +spark-streaming_2.11/org.apache.spark/2.4.4//spark-streaming_2.11-2.4.4.jar +spark-tags_2.11/org.apache.spark/2.4.4//spark-tags_2.11-2.4.4.jar +spark-unsafe_2.11/org.apache.spark/2.4.4//spark-unsafe_2.11-2.4.4.jar +stax-api/stax/1.0.1//stax-api-1.0.1.jar +stream/com.clearspring.analytics/2.7.0//stream-2.7.0.jar +stringtemplate/org.antlr/4.0.2//stringtemplate-4.0.2.jar +tephra-api/co.cask.tephra/0.6.0//tephra-api-0.6.0.jar +tephra-core/co.cask.tephra/0.6.0//tephra-core-0.6.0.jar +tephra-hbase-compat-1.0/co.cask.tephra/0.6.0//tephra-hbase-compat-1.0-0.6.0.jar +twill-api/org.apache.twill/0.6.0-incubating//twill-api-0.6.0-incubating.jar +twill-common/org.apache.twill/0.6.0-incubating//twill-common-0.6.0-incubating.jar +twill-core/org.apache.twill/0.6.0-incubating//twill-core-0.6.0-incubating.jar +twill-discovery-api/org.apache.twill/0.6.0-incubating//twill-discovery-api-0.6.0-incubating.jar +twill-discovery-core/org.apache.twill/0.6.0-incubating//twill-discovery-core-0.6.0-incubating.jar +twill-zookeeper/org.apache.twill/0.6.0-incubating//twill-zookeeper-0.6.0-incubating.jar +unused/org.spark-project.spark/1.0.0//unused-1.0.0.jar +validation-api/javax.validation/1.1.0.Final//validation-api-1.1.0.Final.jar +velocity/org.apache.velocity/1.5//velocity-1.5.jar +websocket-api/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-api-9.4.15.v20190215.jar +websocket-client/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-client-9.4.15.v20190215.jar +websocket-common/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-common-9.4.15.v20190215.jar +websocket-server/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-server-9.4.15.v20190215.jar +websocket-servlet/org.eclipse.jetty.websocket/9.4.15.v20190215//websocket-servlet-9.4.15.v20190215.jar +xbean-asm6-shaded/org.apache.xbean/4.8//xbean-asm6-shaded-4.8.jar +xercesImpl/xerces/2.9.1//xercesImpl-2.9.1.jar +xml-apis/xml-apis/1.3.04//xml-apis-1.3.04.jar +xmlenc/xmlenc/0.52//xmlenc-0.52.jar +xz/org.tukaani/1.5//xz-1.5.jar +zkclient/com.101tec/0.10//zkclient-0.10.jar +zookeeper/org.apache.zookeeper/3.4.6//zookeeper-3.4.6.jar +zookeeper/org.apache.zookeeper/3.4.6/tests/zookeeper-3.4.6-tests.jar +zstd-jni/com.github.luben/1.3.2-2//zstd-jni-1.3.2-2.jar diff --git a/doap_HUDI.rdf b/doap_HUDI.rdf index 77db135e124ec..e153fb3d4c5fc 100644 --- a/doap_HUDI.rdf +++ b/doap_HUDI.rdf @@ -61,6 +61,46 @@ 2020-08-22 0.6.0 + + Apache Hudi 0.7.0 + 2021-01-25 + 0.7.0 + + + Apache Hudi 0.8.0 + 2021-04-06 + 0.8.0 + + + Apache Hudi 0.9.0 + 2021-08-26 + 0.9.0 + + + Apache Hudi 0.10.0 + 2021-12-08 + 0.10.0 + + + Apache Hudi 0.10.1 + 2022-01-26 + 0.10.1 + + + Apache Hudi 0.11.0 + 2022-04-30 + 0.11.0 + + + Apache Hudi 0.11.1 + 2022-06-18 + 0.11.1 + + + Apache Hudi 0.12.0 + 2022-08-16 + 0.12.0 + diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000000000..0851e9b5b7858 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,197 @@ + + +# Docker Demo for Hudi + +This repo contains the docker demo resources for building docker demo images, setting up the demo, and running Hudi in the +docker demo environment. + +## Repo Organization + +### Configs for assembling docker images - `/hoodie` + +The `/hoodie` folder contains all the configs for assembling the necessary docker images. The name and repository of each +docker image, e.g., `apachehudi/hudi-hadoop_2.8.4-trinobase_368`, are defined in the maven configuration file `pom.xml`.
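+ +As a quick sanity check (a hypothetical helper, not part of the repo's tooling; the grep pattern and paths below are assumptions for illustration), you can list the image names referenced under the docker folders and compare them against the images already present locally: + +```shell +# List the apachehudi image names referenced anywhere under docker/hoodie +grep -rhoE 'apachehudi/hudi-hadoop[^<">]+' docker/hoodie | sort -u +# Compare against the images already built or pulled locally +docker images "apachehudi/*" +```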
+ +### Docker compose config for the Demo - `/compose` + +The `/compose` folder contains the yaml file to compose the Docker environment for running the Hudi Demo. + +### Resources and Sample Data for the Demo - `/demo` + +The `/demo` folder contains useful resources and sample data used for the Demo. + +## Build and Test Image locally + +To build all docker images locally, you can run the script: + +```shell +./build_local_docker_images.sh +``` + +To build a single image target, you can run + +```shell +mvn clean pre-integration-test -DskipTests -Ddocker.compose.skip=true -Ddocker.build.skip=false -pl :<docker-image-module> -am +# For example, to build hudi-hadoop-trinobase-docker +mvn clean pre-integration-test -DskipTests -Ddocker.compose.skip=true -Ddocker.build.skip=false -pl :hudi-hadoop-trinobase-docker -am +``` + +Alternatively, you can use the `docker` cli directly under `hoodie/hadoop` to build images in a faster way. If you use this +approach, make sure you first build the Hudi modules with the `integration-tests` profile as below so that the latest Hudi jars +built are copied to the corresponding Hudi docker folder, e.g., `$HUDI_DIR/docker/hoodie/hadoop/hive_base/target`, which +is required to build each docker image. Otherwise, the `target/` folder can be missing and the `docker` cli complains about +that: `failed to compute cache key: "/target" not found: not found`. + +```shell +mvn -Pintegration-tests clean package -DskipTests +``` + +Note that, to build the image with the `docker` cli, you need to manually name your local image by using the `-t` option to +match the naming in the `pom.xml`, so that you can update the corresponding image repository in Docker Hub (detailed +steps in the next section). + +```shell +# Run under hoodie/hadoop, the <tag> is optional, "latest" by default +docker build <image-name> -t <repo>/<image-name>[:<tag>] +# For example, to build trinobase +docker build trinobase -t apachehudi/hudi-hadoop_2.8.4-trinobase_368 +``` + +After the new images are built, you can run the following script to bring up the docker demo with your local images: + +```shell +./setup_demo.sh dev +``` + +## Upload Updated Image to Repository on Docker Hub + +Once you have built the updated image locally, you can push the image to its corresponding repository on the Docker +Hub registry, designated by its name or tag: + +```shell +docker push <repo>/<image-name>:<tag> +# For example +docker push apachehudi/hudi-hadoop_2.8.4-trinobase_368 +``` + +You can also easily push the image to Docker Hub using the Docker Desktop app: go to `Images`, search for the image by +name, and then click on the three dots and `Push to Hub`. + +![Push to Docker Hub](images/push_to_docker_hub.png) + +Note that you need to ask for permission to upload the Hudi Docker Demo images to the repositories. + +You can find more information in the [Docker Hub Repositories Manual](https://docs.docker.com/docker-hub/repos/). + +## Docker Demo Setup + +Please refer to the [Docker Demo Docs page](https://hudi.apache.org/docs/docker_demo). + +## Building Multi-Arch Images + +NOTE: The steps below require some code changes. Support for multi-arch builds in a fully automated manner is being +tracked by [HUDI-3601](https://issues.apache.org/jira/browse/HUDI-3601). + +By default, the docker images are built for the x86_64 (amd64) architecture. Docker `buildx` allows you to build multi-arch +images, link them together with a manifest file, and push them all to a registry – with a single command. Let's say we +want to build for the arm64 architecture. First, we need to ensure that the `buildx` setup is done locally.
Please follow the +steps below (adapted from https://www.docker.com/blog/multi-arch-images): + +``` +# List builders +~ ❯❯❯ docker buildx ls +NAME/NODE DRIVER/ENDPOINT STATUS PLATFORMS +default * docker + default default running linux/amd64, linux/arm64, linux/arm/v7, linux/arm/v6 + +# If you are using the default builder, which is basically the old builder, then do following +~ ❯❯❯ docker buildx create --name mybuilder +mybuilder +~ ❯❯❯ docker buildx use mybuilder +~ ❯❯❯ docker buildx inspect --bootstrap +[+] Building 2.5s (1/1) FINISHED + => [internal] booting buildkit 2.5s + => => pulling image moby/buildkit:master 1.3s + => => creating container buildx_buildkit_mybuilder0 1.2s +Name: mybuilder +Driver: docker-container + +Nodes: +Name: mybuilder0 +Endpoint: unix:///var/run/docker.sock +Status: running + +Platforms: linux/amd64, linux/arm64, linux/arm/v7, linux/arm/v6 +``` + +Now go to `/docker/hoodie/hadoop` and change the `Dockerfile` to pull dependent images corresponding to +arm64. For example, in [base/Dockerfile](./hoodie/hadoop/base/Dockerfile) (which pulls the jdk8 image), change the +line `FROM openjdk:8u212-jdk-slim-stretch` to `FROM arm64v8/openjdk:8u212-jdk-slim-stretch`. + +Then, from under the `/docker/hoodie/hadoop` directory, execute the following command to build as well as +push the image to the Docker Hub repo: + +``` +# Run under hoodie/hadoop, the <tag> is optional, "latest" by default +docker buildx build <image-name> --platform <platform> -t <repo>/<image-name>[:<tag>] --push + +# For example, to build the base image +docker buildx build base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-base:linux-arm64-0.10.1 --push +``` + +Once the base image is pushed, you can do something similar for the other images. +Change the [hive](./hoodie/hadoop/hive_base/Dockerfile) Dockerfile to pull the base image with the tag corresponding to +the linux/arm64 platform. + +``` +# Change below line in the Dockerfile +FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-base:latest +# as shown below +FROM --platform=linux/arm64 apachehudi/hudi-hadoop_${HADOOP_VERSION}-base:linux-arm64-0.10.1 + +# and then build & push from under hoodie/hadoop dir +docker buildx build hive_base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:linux-arm64-0.10.1 --push +``` + +Similarly, for images that are dependent on hive (e.g. [base spark](./hoodie/hadoop/spark_base/Dockerfile) +, [sparkmaster](./hoodie/hadoop/sparkmaster/Dockerfile), [sparkworker](./hoodie/hadoop/sparkworker/Dockerfile) +and [sparkadhoc](./hoodie/hadoop/sparkadhoc/Dockerfile)), change the corresponding Dockerfile to pull the base hive +image with the tag corresponding to arm64. Then build and push using the `docker buildx` command. + +For the sake of completeness, here is a [patch](https://gist.github.com/xushiyan/cec16585e884cf0693250631a1d10ec2) which +shows what changes to make in the Dockerfiles (assuming the tag is named `linux-arm64-0.10.1`), and below is the list +of `docker buildx` commands.
+ +``` +docker buildx build base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-base:linux-arm64-0.10.1 --push +docker buildx build datanode --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-datanode:linux-arm64-0.10.1 --push +docker buildx build historyserver --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-history:linux-arm64-0.10.1 --push +docker buildx build hive_base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:linux-arm64-0.10.1 --push +docker buildx build namenode --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-namenode:linux-arm64-0.10.1 --push +docker buildx build prestobase --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-prestobase_0.217:linux-arm64-0.10.1 --push +docker buildx build spark_base --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkbase_2.4.4:linux-arm64-0.10.1 --push +docker buildx build sparkadhoc --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1 --push +docker buildx build sparkmaster --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:linux-arm64-0.10.1 --push +docker buildx build sparkworker --platform linux/arm64 -t apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:linux-arm64-0.10.1 --push +``` + +Once all the required images are pushed to the Docker Hub repos, we need to make one additional change +in the [docker compose](./compose/docker-compose_hadoop284_hive233_spark244.yml) file. +Apply [this patch](https://gist.github.com/codope/3dd986de5e54f0650dd74b6032e4456c) to the docker compose file so +that [setup_demo](./setup_demo.sh) pulls images with the correct tag for arm64. Now we should be ready to run the +setup script and follow the docker demo. diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml index 3e42d532bd2c3..b8217fc0d0401 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml @@ -26,6 +26,8 @@ services: ports: - "50070:50070" - "8020:8020" + # JVM debugging port (will be mapped to a random port on host) + - "5005" env_file: - ./hadoop.env healthcheck: @@ -33,7 +35,7 @@ interval: 30s timeout: 10s retries: 3 - + datanode1: image: apachehudi/hudi-hadoop_2.8.4-datanode:latest container_name: datanode1 @@ -45,6 +47,8 @@ ports: - "50075:50075" - "50010:50010" + # JVM debugging port (will be mapped to a random port on host) + - "5005" links: - "namenode" - "historyserver" @@ -84,7 +88,7 @@ - hive-metastore-postgresql:/var/lib/postgresql hostname: hive-metastore-postgresql container_name: hive-metastore-postgresql - + hivemetastore: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest hostname: hivemetastore @@ -99,6 +103,8 @@ SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432" ports: - "9083:9083" + # JVM debugging port (will be mapped to a random port on host) + - "5005" healthcheck: test: ["CMD", "nc", "-z", "hivemetastore", "9083"] interval: 30s @@ -118,6 +124,8 @@ SERVICE_PRECONDITION: "hivemetastore:9083" ports: - "10000:10000" + # JVM debugging port (will be mapped to a random port on host) + - "5005" depends_on: - "hivemetastore" links: @@ -136,6 +144,8 @@ ports: - "8080:8080" - "7077:7077" + # JVM debugging port (will be mapped to a random port on host) + - "5005" environment: - INIT_DAEMON_STEP=setup_spark links: @@ -154,6 +164,8 @@
services: - sparkmaster ports: - "8081:8081" + # JVM debugging port (will be mapped to a random port on host) + - "5005" environment: - "SPARK_MASTER=spark://sparkmaster:7077" links: @@ -167,7 +179,7 @@ services: hostname: zookeeper container_name: zookeeper ports: - - '2181:2181' + - "2181:2181" environment: - ALLOW_ANONYMOUS_LOGIN=yes @@ -176,7 +188,7 @@ services: hostname: kafkabroker container_name: kafkabroker ports: - - '9092:9092' + - "9092:9092" environment: - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 - ALLOW_PLAINTEXT_LISTENER=yes @@ -184,9 +196,11 @@ services: presto-coordinator-1: container_name: presto-coordinator-1 hostname: presto-coordinator-1 - image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.217:latest + image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest ports: - - '8090:8090' + - "8090:8090" + # JVM debugging port (will be mapped to a random port on host) + - "5005" environment: - PRESTO_JVM_MAX_HEAP=512M - PRESTO_QUERY_MAX_MEMORY=1GB @@ -201,25 +215,66 @@ services: command: coordinator presto-worker-1: - container_name: presto-worker-1 - hostname: presto-worker-1 - image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.217:latest - depends_on: ["presto-coordinator-1"] - environment: - - PRESTO_JVM_MAX_HEAP=512M - - PRESTO_QUERY_MAX_MEMORY=1GB - - PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB - - PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB - - PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB - - TERM=xterm - links: - - "hivemetastore" - - "hiveserver" - - "hive-metastore-postgresql" - - "namenode" - volumes: - - ${HUDI_WS}:/var/hoodie/ws - command: worker + container_name: presto-worker-1 + hostname: presto-worker-1 + image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest + depends_on: [ "presto-coordinator-1" ] + environment: + - PRESTO_JVM_MAX_HEAP=512M + - PRESTO_QUERY_MAX_MEMORY=1GB + - PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB + - PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB + - PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB + - TERM=xterm + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + command: worker + + trino-coordinator-1: + container_name: trino-coordinator-1 + hostname: trino-coordinator-1 + image: apachehudi/hudi-hadoop_2.8.4-trinocoordinator_368:latest + ports: + - "8091:8091" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + links: + - "hivemetastore" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + command: http://trino-coordinator-1:8091 trino-coordinator-1 + + trino-worker-1: + container_name: trino-worker-1 + hostname: trino-worker-1 + image: apachehudi/hudi-hadoop_2.8.4-trinoworker_368:latest + depends_on: [ "trino-coordinator-1" ] + ports: + - "8092:8092" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + command: http://trino-coordinator-1:8091 trino-worker-1 + + graphite: + container_name: graphite + hostname: graphite + image: graphiteapp/graphite-statsd + ports: + - 80:80 + - 2003-2004:2003-2004 + - 8126:8126 adhoc-1: image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest @@ -231,6 +286,8 @@ services: - sparkmaster ports: - '4040:4040' + # JVM debugging port (mapped to 5006 on the host) + - "5006:5005" environment: - "SPARK_MASTER=spark://sparkmaster:7077" links: @@ -239,6 +296,7 @@ services: - "hive-metastore-postgresql" - "namenode" - "presto-coordinator-1" + - 
"trino-coordinator-1" volumes: - ${HUDI_WS}:/var/hoodie/ws @@ -248,6 +306,9 @@ services: container_name: adhoc-2 env_file: - ./hadoop.env + ports: + # JVM debugging port (mapped to 5005 on the host) + - "5005:5005" depends_on: - sparkmaster environment: @@ -258,6 +319,7 @@ services: - "hive-metastore-postgresql" - "namenode" - "presto-coordinator-1" + - "trino-coordinator-1" volumes: - ${HUDI_WS}:/var/hoodie/ws diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml new file mode 100644 index 0000000000000..857180cfbee20 --- /dev/null +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml @@ -0,0 +1,259 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: "3.3" + +services: + + namenode: + image: apachehudi/hudi-hadoop_2.8.4-namenode:linux-arm64-0.10.1 + platform: linux/arm64 + hostname: namenode + container_name: namenode + environment: + - CLUSTER_NAME=hudi_hadoop284_hive232_spark244 + ports: + - "50070:50070" + - "8020:8020" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + env_file: + - ./hadoop.env + healthcheck: + test: [ "CMD", "curl", "-f", "http://namenode:50070" ] + interval: 30s + timeout: 10s + retries: 3 + + datanode1: + image: apachehudi/hudi-hadoop_2.8.4-datanode:linux-arm64-0.10.1 + platform: linux/arm64 + container_name: datanode1 + hostname: datanode1 + environment: + - CLUSTER_NAME=hudi_hadoop284_hive232_spark244 + env_file: + - ./hadoop.env + ports: + - "50075:50075" + - "50010:50010" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + links: + - "namenode" + - "historyserver" + healthcheck: + test: [ "CMD", "curl", "-f", "http://datanode1:50075" ] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + - namenode + + historyserver: + image: apachehudi/hudi-hadoop_2.8.4-history:latest + hostname: historyserver + container_name: historyserver + environment: + - CLUSTER_NAME=hudi_hadoop284_hive232_spark244 + depends_on: + - "namenode" + links: + - "namenode" + ports: + - "58188:8188" + healthcheck: + test: [ "CMD", "curl", "-f", "http://historyserver:8188" ] + interval: 30s + timeout: 10s + retries: 3 + env_file: + - ./hadoop.env + volumes: + - historyserver:/hadoop/yarn/timeline + + hive-metastore-postgresql: + image: menorah84/hive-metastore-postgresql:2.3.0 + platform: linux/arm64 + environment: + - POSTGRES_HOST_AUTH_METHOD=trust + volumes: + - hive-metastore-postgresql:/var/lib/postgresql + hostname: hive-metastore-postgresql + container_name: hive-metastore-postgresql + + hivemetastore: + image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:linux-arm64-0.10.1 + platform: linux/arm64 + hostname: hivemetastore + container_name: hivemetastore + links: + - 
"hive-metastore-postgresql" + - "namenode" + env_file: + - ./hadoop.env + command: /opt/hive/bin/hive --service metastore + environment: + SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432" + ports: + - "9083:9083" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + healthcheck: + test: [ "CMD", "nc", "-z", "hivemetastore", "9083" ] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + - "hive-metastore-postgresql" + - "namenode" + + hiveserver: + image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:linux-arm64-0.10.1 + platform: linux/arm64 + hostname: hiveserver + container_name: hiveserver + env_file: + - ./hadoop.env + environment: + SERVICE_PRECONDITION: "hivemetastore:9083" + ports: + - "10000:10000" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + depends_on: + - "hivemetastore" + links: + - "hivemetastore" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + + sparkmaster: + image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:linux-arm64-0.10.1 + platform: linux/arm64 + hostname: sparkmaster + container_name: sparkmaster + env_file: + - ./hadoop.env + ports: + - "8080:8080" + - "7077:7077" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + environment: + - INIT_DAEMON_STEP=setup_spark + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + + spark-worker-1: + image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:linux-arm64-0.10.1 + platform: linux/arm64 + hostname: spark-worker-1 + container_name: spark-worker-1 + env_file: + - ./hadoop.env + depends_on: + - sparkmaster + ports: + - "8081:8081" + # JVM debugging port (will be mapped to a random port on host) + - "5005" + environment: + - "SPARK_MASTER=spark://sparkmaster:7077" + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + + zookeeper: + image: 'arm64v8/zookeeper:3.4.12' + platform: linux/arm64 + hostname: zookeeper + container_name: zookeeper + ports: + - "2181:2181" + environment: + - ALLOW_ANONYMOUS_LOGIN=yes + + kafka: + image: 'wurstmeister/kafka:2.12-2.0.1' + platform: linux/arm64 + hostname: kafkabroker + container_name: kafkabroker + ports: + - "9092:9092" + environment: + - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 + - ALLOW_PLAINTEXT_LISTENER=yes + - KAFKA_ADVERTISED_HOST_NAME=kafkabroker + + adhoc-1: + image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1 + platform: linux/arm64 + hostname: adhoc-1 + container_name: adhoc-1 + env_file: + - ./hadoop.env + depends_on: + - sparkmaster + ports: + - '4040:4040' + # JVM debugging port (mapped to 5006 on the host) + - "5006:5005" + environment: + - "SPARK_MASTER=spark://sparkmaster:7077" + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + + adhoc-2: + image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1 + platform: linux/arm64 + hostname: adhoc-2 + container_name: adhoc-2 + env_file: + - ./hadoop.env + ports: + # JVM debugging port (mapped to 5005 on the host) + - "5005:5005" + depends_on: + - sparkmaster + environment: + - "SPARK_MASTER=spark://sparkmaster:7077" + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + +volumes: + namenode: + historyserver: + hive-metastore-postgresql: + +networks: + default: diff 
--git a/docker/demo/compaction-bootstrap.commands b/docker/demo/compaction-bootstrap.commands index 6c246be747124..a44a26ff35e14 100644 --- a/docker/demo/compaction-bootstrap.commands +++ b/docker/demo/compaction-bootstrap.commands @@ -1,19 +1,19 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. connect --path /user/hive/warehouse/stock_ticks_mor_bs compactions show all diff --git a/docker/demo/compaction.commands b/docker/demo/compaction.commands index a8baaff3ed33d..e8d7f39e6b4b4 100644 --- a/docker/demo/compaction.commands +++ b/docker/demo/compaction.commands @@ -1,19 +1,19 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. connect --path /user/hive/warehouse/stock_ticks_mor compactions show all diff --git a/docker/demo/config/dfs-source.properties b/docker/demo/config/dfs-source.properties index ac7080e1412bc..a90629ef8e67e 100644 --- a/docker/demo/config/dfs-source.properties +++ b/docker/demo/config/dfs-source.properties @@ -19,6 +19,10 @@ include=base.properties # Key fields, for kafka example hoodie.datasource.write.recordkey.field=key hoodie.datasource.write.partitionpath.field=date +# NOTE: We have to duplicate configuration since this is being used +# w/ both Spark and DeltaStreamer +hoodie.table.recordkey.fields=key +hoodie.table.partition.fields=date # Schema provider props (change to absolute path based on your installation) hoodie.deltastreamer.schemaprovider.source.schema.file=/var/demo/config/schema.avsc hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.avsc diff --git a/docker/demo/config/hoodie-incr.properties b/docker/demo/config/hoodie-incr.properties index 80f474b1e7716..c46ec48a40184 100644 --- a/docker/demo/config/hoodie-incr.properties +++ b/docker/demo/config/hoodie-incr.properties @@ -28,5 +28,6 @@ hoodie.deltastreamer.source.hoodieincr.path=/docker_hoodie_sync_valid_test hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=true # hive sync hoodie.datasource.hive_sync.table=docker_hoodie_sync_valid_test_2 -hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 -hoodie.datasource.hive_sync.partition_fields=partition \ No newline at end of file +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.partition_fields=partition +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor diff --git a/docker/demo/config/log4j.properties b/docker/demo/config/log4j.properties deleted file mode 100644 index 1618bff9c2233..0000000000000 --- a/docker/demo/config/log4j.properties +++ /dev/null @@ -1,41 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Set everything to be logged to the console -log4j.rootCategory=WARN, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. -log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.spark_project.jetty=WARN -log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR -log4j.logger.org.apache.spark=WARN - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR \ No newline at end of file diff --git a/docker/demo/config/log4j2.properties b/docker/demo/config/log4j2.properties new file mode 100644 index 0000000000000..dd3c1ff9ef7a2 --- /dev/null +++ b/docker/demo/config/log4j2.properties @@ -0,0 +1,60 @@ +### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### +status = warn +name = HudiConsoleLog + +# Set everything to be logged to the console +appender.console.type = Console +appender.console.name = CONSOLE +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Root logger level +rootLogger.level = warn +# Root logger referring to console appender +rootLogger.appenderRef.stdout.ref = CONSOLE + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
+logger.apache_spark_repl.name = org.apache.spark.repl.Main +logger.apache_spark_repl.level = warn +# Set logging of integration testsuite to INFO level +logger.hudi_integ.name = org.apache.hudi.integ.testsuite +logger.hudi_integ.level = info +# Settings to quiet third party logs that are too verbose +logger.apache_spark_jetty.name = org.spark_project.jetty +logger.apache_spark_jetty.level = warn +logger.apache_spark_jett_lifecycle.name = org.spark_project.jetty.util.component.AbstractLifeCycle +logger.apache_spark_jett_lifecycle.level = error +logger.apache_spark_repl_imain.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.apache_spark_repl_imain.level = info +logger.apache_spark_repl_iloop.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.apache_spark_repl_iloop.level = info +logger.parquet.name = org.apache.parquet +logger.parquet.level = error +logger.spark.name = org.apache.spark +logger.spark.level = warn +# Disabling Jetty logs +logger.jetty.name = org.apache.hudi.org.eclipse.jetty +logger.jetty.level = error +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.hive_handler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.hive_handler.level = fatal +logger.hive_func_registry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.hive_func_registry.level = error \ No newline at end of file diff --git a/docker/demo/config/test-suite/compact-test.properties b/docker/demo/config/test-suite/compact-test.properties new file mode 100644 index 0000000000000..2eca88de3a426 --- /dev/null +++ b/docker/demo/config/test-suite/compact-test.properties @@ -0,0 +1,50 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.compact.inline.max.delta.commits=2 + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.mode=jdbc +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/complex-dag-cow.yaml b/docker/demo/config/test-suite/complex-dag-cow.yaml deleted file mode 100644 index a10026c0b948b..0000000000000 --- a/docker/demo/config/test-suite/complex-dag-cow.yaml +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-first_insert: - config: - record_size: 70000 - num_insert_partitions: 1 - repeat_count: 1 - num_records_insert: 1000 - type: InsertNode - deps: none -second_insert: - config: - record_size: 70000 - num_insert_partitions: 1 - repeat_count: 1 - num_records_insert: 10000 - deps: first_insert - type: InsertNode -third_insert: - config: - record_size: 70000 - num_insert_partitions: 1 - repeat_count: 1 - num_records_insert: 300 - deps: second_insert - type: InsertNode -first_rollback: - config: - deps: third_insert - type: RollbackNode -first_upsert: - config: - record_size: 70000 - num_insert_partitions: 1 - num_records_insert: 300 - repeat_count: 1 - num_records_upsert: 100 - num_upsert_partitions: 10 - type: UpsertNode - deps: first_rollback -first_hive_sync: - config: - queue_name: "adhoc" - engine: "mr" - type: HiveSyncNode - deps: first_upsert -first_hive_query: - config: - queue_name: "adhoc" - engine: "mr" - hive_queries: - query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" - result1: 0 - query2: "select count(*) from testdb.table1" - result2: 11300 - type: HiveQueryNode - deps: first_hive_sync -second_upsert: - config: - record_size: 70000 - num_insert_partitions: 1 - num_records_insert: 300 - repeat_count: 1 - num_records_upsert: 100 - num_upsert_partitions: 10 - type: UpsertNode - deps: first_hive_query -second_hive_query: - config: - queue_name: "adhoc" - engine: "mr" - hive_queries: - query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" - result1: 0 - query2: "select count(*) from testdb.table1" - result2: 11600 - type: HiveQueryNode - deps: second_upsert -fourth_insert: - config: - record_size: 70000 - num_insert_partitions: 1 - repeat_count: 1 - num_records_insert: 1000 - deps: second_hive_query - type: InsertNode -third_hive_query: - config: - queue_name: "adhoc" - engine: "mr" - hive_queries: - query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" - result1: 0 - query2: "select count(*) from testdb.table1" - result2: 12600 - type: HiveQueryNode - deps: fourth_insert -first_delete: - config: - record_size: 70000 - num_partitions_delete: 1 - num_records_delete: 200 - deps: third_hive_query - type: DeleteNode -fourth_hive_sync: - config: - queue_name: "adhoc" - engine: "mr" - type: HiveSyncNode - deps: first_delete -fourth_hive_query: - config: - queue_name: "adhoc" - engine: "mr" - hive_queries: - query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" - result1: 0 - query2: "select count(*) from testdb.table1" - result2: 12400 - type: HiveQueryNode - deps: fourth_hive_sync \ No newline at end of file diff --git a/docker/demo/config/test-suite/complex-dag-mor.yaml b/docker/demo/config/test-suite/complex-dag-mor.yaml index 2652b03070a7e..24f3a9c3b62c7 100644 --- a/docker/demo/config/test-suite/complex-dag-mor.yaml +++ b/docker/demo/config/test-suite/complex-dag-mor.yaml @@ -13,103 +13,72 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-first_insert: - config: - record_size: 70000 - num_insert_partitions: 1 - repeat_count: 1 - num_records_insert: 100 - type: InsertNode - deps: none -second_insert: - config: - record_size: 70000 - num_insert_partitions: 1 - repeat_count: 1 - num_records_insert: 100 - deps: first_insert - type: InsertNode -third_insert: - config: - record_size: 70000 - num_insert_partitions: 1 - repeat_count: 1 - num_records_insert: 300 - deps: second_insert - type: InsertNode -first_rollback: - config: - deps: third_insert - type: RollbackNode -first_upsert: - config: - record_size: 70000 - num_insert_partitions: 1 - num_records_insert: 300 - repeat_count: 1 - num_records_upsert: 100 - num_upsert_partitions: 10 - type: UpsertNode - deps: first_rollback -first_hive_sync: - config: - queue_name: "adhoc" - engine: "mr" - type: HiveSyncNode - deps: first_upsert -first_hive_query: - config: - queue_name: "adhoc" - engine: "mr" - type: HiveQueryNode - deps: first_hive_sync -second_upsert: - config: - record_size: 70000 - num_insert_partitions: 1 - num_records_insert: 300 - repeat_count: 1 - num_records_upsert: 100 - num_upsert_partitions: 10 - type: UpsertNode - deps: first_hive_query -second_hive_query: - config: - queue_name: "adhoc" - engine: "mr" - hive_queries: - query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" - result1: 0 - query2: "select count(*) from testdb.table1" - result2: 1100 - type: HiveQueryNode - deps: second_upsert -first_schedule_compact: - config: - type: ScheduleCompactNode - deps: second_hive_query -third_upsert: - config: - record_size: 70000 - num_insert_partitions: 1 - num_records_insert: 300 - repeat_count: 1 - num_records_upsert: 100 - num_upsert_partitions: 10 - type: UpsertNode - deps: first_schedule_compact -first_compact: - config: - type: CompactNode - deps: first_schedule_compact -third_hive_query: - config: - queue_name: "adhoc" - engine: "mr" - hive_queries: - query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1" - result1: 0 - query2: "select count(*) from testdb.table1" - result2: 1400 - type: HiveQueryNode - deps: first_compact +dag_name: complex-dag-mor.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_insert + first_validate: + config: + type: ValidateDatasetNode + deps: first_hive_sync + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_validate + first_schedule_compact: + config: + type: ScheduleCompactNode + deps: first_upsert + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 500 + type: DeleteNode + deps: first_schedule_compact + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + delete_input_data: true + type: ValidateDatasetNode + deps: second_hive_sync diff --git 
a/docker/demo/config/test-suite/deltastreamer-hive-sync-presto.yaml b/docker/demo/config/test-suite/deltastreamer-hive-sync-presto.yaml new file mode 100644 index 0000000000000..61ea13c18e566 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-hive-sync-presto.yaml @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dag_name: unit-test-cow-dag +dag_rounds: 1 +dag_intermittent_delay_mins: 10 +dag_content: + first_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 2 + num_records_insert: 100 + type: InsertNode + deps: none + second_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: InsertNode + deps: first_insert + third_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: InsertNode + deps: second_insert + first_upsert: + config: + record_size: 70000 + num_partitions_upsert: 1 + repeat_count: 1 + num_records_upsert: 100 + type: UpsertNode + deps: third_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_upsert + first_presto_query: + config: + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 400 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: first_hive_sync +# first_trino_query: +# config: +# trino_queries: +# query1: "select count(*) from testdb1.table1" +# result1: 300 +# query2: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" +# result2: 0 +# type: TrinoQueryNode +# deps: first_presto_query \ No newline at end of file diff --git a/docker/demo/config/test-suite/deltastreamer-immutable-dataset.yaml b/docker/demo/config/test-suite/deltastreamer-immutable-dataset.yaml new file mode 100644 index 0000000000000..a19617ef135c5 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-immutable-dataset.yaml @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: deltastreamer-immutable-dataset.yaml +dag_rounds: 5 +dag_intermittent_delay_mins: 0 +dag_content: + first_bulk_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 3 + num_records_insert: 5000 + type: BulkInsertNode + deps: none + first_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_bulk_insert + first_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 3 + num_records_insert: 5000 + type: InsertNode + deps: first_validate + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_insert + first_presto_query: + config: + execute_itr_count: 5 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 30000 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 5 + delete_input_data: true + type: ValidateAsyncOperations + deps: first_presto_query \ No newline at end of file diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-hive.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-hive.yaml new file mode 100644 index 0000000000000..6e94b05a698ae --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-hive.yaml @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 20 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 2 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_insert + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_hive_sync + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 4000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + execute_itr_count: 20 + validate_hive: true + delete_input_data: true + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: second_hive_sync + last_validate: + config: + execute_itr_count: 20 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml new file mode 100644 index 0000000000000..9ba6993e1d500 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
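+
+# Metadata-table variant of the long-running multi-partition DAG. The DAG content is the
+# same as deltastreamer-long-running-multi-partitions.yaml; the "-metadata" suffix suggests
+# it is intended to be paired with a properties file that enables the metadata table
+# (hoodie.metadata.enable=true).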
+dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 20 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 10 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete: 10 + num_records_delete: 4000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_once_every_itr : 5 + validate_hive: false + delete_input_data: true + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 20 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 7600 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 20 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml new file mode 100644 index 0000000000000..9ba6993e1d500 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 20 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 10 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete: 10 + num_records_delete: 4000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_once_every_itr : 5 + validate_hive: false + delete_input_data: true + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 20 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 7600 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 20 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml b/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml new file mode 100644 index 0000000000000..b0207920320c9 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# to be used with test-aggressive-clean-archival.properties + +dag_name: deltastreamer-medium-clustering.yaml +dag_rounds: 15 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 50 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 50 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_once_every_itr: 3 + validate_hive: false + delete_input_data: true + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 15 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 3600 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 15 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml new file mode 100644 index 0000000000000..563640299144e --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
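+
+# A close cousin of deltastreamer-medium-clustering.yaml. The key difference is that
+# ValidateDatasetNode keeps the generated input around (delete_input_data: false) and
+# validates once every 5 iterations (validate_once_every_itr: 5), so each run can compare
+# the table against the full dataset ingested so far rather than only the latest batches.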
+ +# to be used with test-aggressive-clean-archival.properties + +dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 15 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 50 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 50 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_once_every_itr : 5 + validate_hive: false + delete_input_data: false + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 15 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 3600 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 15 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml b/docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml new file mode 100644 index 0000000000000..8d42eea877b85 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 6 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 1 + type: UpsertNode + deps: second_insert + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 1000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_once_every_itr : 3 + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 6 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 11000 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 6 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/deltastreamer-pure-bulk-inserts.yaml b/docker/demo/config/test-suite/deltastreamer-pure-bulk-inserts.yaml new file mode 100644 index 0000000000000..d5342e22b1282 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-pure-bulk-inserts.yaml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: deltastreamer-pure-bulk-inserts.yaml +dag_rounds: 10 +dag_intermittent_delay_mins: 0 +dag_content: + first_bulk_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 3 + num_records_insert: 5000 + type: BulkInsertNode + deps: none + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_bulk_insert + last_validate: + config: + execute_itr_count: 10 + type: ValidateAsyncOperations + deps: second_validate \ No newline at end of file diff --git a/docker/demo/config/test-suite/deltastreamer-pure-inserts.yaml b/docker/demo/config/test-suite/deltastreamer-pure-inserts.yaml new file mode 100644 index 0000000000000..3b209fe5fe016 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-pure-inserts.yaml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: deltastreamer-pure-inserts.yaml +dag_rounds: 10 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 3 + num_records_insert: 5000 + type: InsertNode + deps: none + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_insert + last_validate: + config: + execute_itr_count: 10 + type: ValidateAsyncOperations + deps: second_validate \ No newline at end of file diff --git a/docker/demo/config/test-suite/detlastreamer-long-running-example.yaml b/docker/demo/config/test-suite/detlastreamer-long-running-example.yaml new file mode 100644 index 0000000000000..4fefcc497d32b --- /dev/null +++ b/docker/demo/config/test-suite/detlastreamer-long-running-example.yaml @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: detlastreamer-long-running-example.yaml +dag_rounds: 20 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_once_every_itr : 5 + validate_hive: false + delete_input_data: true + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 20 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 3600 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 20 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/insert-overwrite-table.yaml b/docker/demo/config/test-suite/insert-overwrite-table.yaml new file mode 100644 index 0000000000000..2251660b7028c --- /dev/null +++ b/docker/demo/config/test-suite/insert-overwrite-table.yaml @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
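+
+# Exercises Spark datasource writes followed by a whole-table overwrite:
+# SparkInsertOverwriteTableNode replaces the table contents with a small batch of 10
+# records, DeleteInputDatasetNode then drops every input batch except the latest, and the
+# final ValidateDatasetNode (validate_full_data: true) compares the table against the
+# surviving input plus the writes issued after the overwrite.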
+dag_name: simple-deltastreamer.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: first_insert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: first_upsert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: second_insert + first_insert_overwrite_table: + config: + record_size: 1000 + repeat_count: 1 + num_records_insert: 10 + type: SparkInsertOverwriteTableNode + deps: second_upsert + delete_all_input_except_last: + config: + delete_input_data_except_latest: true + type: DeleteInputDatasetNode + deps: first_insert_overwrite_table + third_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: delete_all_input_except_last + third_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: third_insert + second_validate: + config: + validate_full_data : true + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: third_upsert diff --git a/docker/demo/config/test-suite/insert-overwrite.yaml b/docker/demo/config/test-suite/insert-overwrite.yaml new file mode 100644 index 0000000000000..7e54cea6a910d --- /dev/null +++ b/docker/demo/config/test-suite/insert-overwrite.yaml @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
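+
+# Partition-level counterpart of insert-overwrite-table.yaml: the same write/validate
+# sequence, but SparkInsertOverwriteNode overwrites only the partitions touched by its
+# batch rather than the whole table.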
+dag_name: simple-deltastreamer.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: first_insert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: first_upsert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: second_insert + first_insert_overwrite: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10 + type: SparkInsertOverwriteNode + deps: second_upsert + delete_all_input_except_last: + config: + delete_input_data_except_latest: true + type: DeleteInputDatasetNode + deps: first_insert_overwrite + third_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: delete_all_input_except_last + third_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: third_insert + second_validate: + config: + validate_full_data : true + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: third_upsert \ No newline at end of file diff --git a/docker/demo/config/test-suite/large-scale/cow-large-scale-long-running.yaml b/docker/demo/config/test-suite/large-scale/cow-large-scale-long-running.yaml new file mode 100644 index 0000000000000..9231407c98922 --- /dev/null +++ b/docker/demo/config/test-suite/large-scale/cow-large-scale-long-running.yaml @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Sanity yaml to test simple operations. 
+dag_name: cow-large-scale-long-running.yaml +dag_rounds: 50 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 1000000 # this will generate about 1.5 GB data + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 100000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 300000 + deps: second_insert + type: InsertNode + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: third_insert + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 3000 + repeat_count: 1 + num_records_upsert: 100000 + num_partitions_upsert: 10 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 5 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_delete diff --git a/docker/demo/config/test-suite/large-scale/cow-large-scale-sanity.yaml b/docker/demo/config/test-suite/large-scale/cow-large-scale-sanity.yaml new file mode 100644 index 0000000000000..813c7671a7a27 --- /dev/null +++ b/docker/demo/config/test-suite/large-scale/cow-large-scale-sanity.yaml @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Sanity yaml to test simple operations. 
+dag_name: cow-large-scale-sanity.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 100 + repeat_count: 1 + num_records_insert: 3000000 # this will generate about 60GB data + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 1000000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 300000 + deps: second_insert + type: InsertNode + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: third_insert + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 3000 + repeat_count: 1 + num_records_upsert: 100000 + num_partitions_upsert: 20 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 5 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_delete diff --git a/docker/demo/config/test-suite/large-scale/mor-large-scale-long-running.yaml b/docker/demo/config/test-suite/large-scale/mor-large-scale-long-running.yaml new file mode 100644 index 0000000000000..f37b206eb773e --- /dev/null +++ b/docker/demo/config/test-suite/large-scale/mor-large-scale-long-running.yaml @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Sanity yaml to test simple operations. 
+dag_name: mor-large-scale-long-running.yaml +dag_rounds: 50 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 1000000 # this will generate about 1.5 GB data + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 100000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 300000 + deps: second_insert + type: InsertNode + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: third_insert + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 3000 + repeat_count: 1 + num_records_upsert: 100000 + num_partitions_upsert: 10 + type: UpsertNode + deps: first_validate + first_schedule_compact: + config: + type: ScheduleCompactNode + deps: first_upsert + first_delete: + config: + num_partitions_delete: 5 + num_records_delete: 8000 + type: DeleteNode + deps: first_schedule_compact + second_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_delete diff --git a/docker/demo/config/test-suite/large-scale/mor-large-scale-sanity.yaml b/docker/demo/config/test-suite/large-scale/mor-large-scale-sanity.yaml new file mode 100644 index 0000000000000..1137b1388ec20 --- /dev/null +++ b/docker/demo/config/test-suite/large-scale/mor-large-scale-sanity.yaml @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Sanity yaml to test simple operations. 
+dag_name: mor-large-scale-sanity.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 100 + repeat_count: 1 + num_records_insert: 3000000 # this will generate about 60GB data + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 1000000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 300000 + deps: second_insert + type: InsertNode + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: third_insert + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 3000 + repeat_count: 1 + num_records_upsert: 100000 + num_partitions_upsert: 20 + type: UpsertNode + deps: first_validate + first_schedule_compact: + config: + type: ScheduleCompactNode + deps: first_upsert + first_delete: + config: + num_partitions_delete: 5 + num_records_delete: 8000 + type: DeleteNode + deps: first_schedule_compact + second_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_delete diff --git a/docker/demo/config/test-suite/mor-async-compact.yaml b/docker/demo/config/test-suite/mor-async-compact.yaml new file mode 100644 index 0000000000000..4ee9c535ebce3 --- /dev/null +++ b/docker/demo/config/test-suite/mor-async-compact.yaml @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Use compact-test.properties for this yaml file. 
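+#
+# The DAG drives repeated upserts so that log files build up on what is expected to be a
+# MERGE_ON_READ table, schedules a compaction part-way through (ScheduleCompactNode),
+# keeps writing, and only later executes the compaction (CompactNode) before a final
+# delete and validation. Validations keep the input data (delete_input_data: false) so the
+# dataset can be checked both before and after compaction.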
+dag_name: mor-async-compact.yaml +dag_rounds: 4 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + type: InsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_insert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_upsert + third_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: second_upsert + first_validate: + config: + delete_input_data: false + type: ValidateDatasetNode + deps: third_upsert + first_schedule_compact: + config: + type: ScheduleCompactNode + deps: first_validate + fourth_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_schedule_compact + fifth_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: fourth_upsert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + type: InsertNode + deps: fifth_upsert + sixth_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 2000 + num_partitions_upsert: 1 + type: UpsertNode + deps: second_insert + third_validate: + config: + delete_input_data: false + type: ValidateDatasetNode + deps: sixth_upsert + first_compact: + config: + type: CompactNode + deps: third_validate + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 500 + type: DeleteNode + deps: first_compact + fifth_validate: + config: + delete_input_data: false + type: ValidateDatasetNode + deps: first_delete \ No newline at end of file diff --git a/docker/demo/config/test-suite/multi-writer-1-ds.yaml b/docker/demo/config/test-suite/multi-writer-1-ds.yaml new file mode 100644 index 0000000000000..3476d8075a6ed --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-1-ds.yaml @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
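+
+# One writer of the multi-writer scenario: this DAG is presumably paired with
+# multi-writer-1.properties (input1) and ingests through the deltastreamer, while the
+# companion multi-writer-*-sds.yaml DAGs issue Spark datasource writes into disjoint
+# partition ranges (start_partition 1/10/20/30). The properties files enable
+# optimistic_concurrency_control with a ZooKeeper lock provider (or InProcessLockProvider
+# for the -local variants) so all writers can target the same table concurrently.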
+dag_name: simple-deltastreamer.yaml +dag_rounds: 6 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 5000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 30000 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 5000 + repeat_count: 1 + num_records_upsert: 50000 + num_partitions_upsert: 1 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete : 0 + num_records_delete: 100000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete diff --git a/docker/demo/config/test-suite/multi-writer-1-sds.yaml b/docker/demo/config/test-suite/multi-writer-1-sds.yaml new file mode 100644 index 0000000000000..d60a8ba6d78a6 --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-1-sds.yaml @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: cow-spark-simple.yaml +dag_rounds: 6 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100000 + start_partition: 1 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 50000 + repeat_count: 1 + num_records_upsert: 50000 + num_partitions_upsert: 1 + start_partition: 1 + type: SparkUpsertNode + deps: first_insert + first_delete: + config: + num_partitions_delete: 0 + num_records_delete: 10000 + start_partition: 1 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete \ No newline at end of file diff --git a/docker/demo/config/test-suite/multi-writer-1.properties b/docker/demo/config/test-suite/multi-writer-1.properties new file mode 100644 index 0000000000000..502a1b771e8cd --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-1.properties @@ -0,0 +1,58 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +hoodie.insert.shuffle.parallelism=2 +hoodie.upsert.shuffle.parallelism=2 +hoodie.bulkinsert.shuffle.parallelism=2 +hoodie.delete.shuffle.parallelism=2 + +hoodie.metadata.enable=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.cleaner.policy.failed.writes=LAZY +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider +hoodie.write.lock.zookeeper.url=zookeeper:2181 +hoodie.write.lock.zookeeper.port=2181 +hoodie.write.lock.zookeeper.lock_key=locks +hoodie.write.lock.zookeeper.base_path=/tmp/.locks + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input1 +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/multi-writer-2-sds.yaml b/docker/demo/config/test-suite/multi-writer-2-sds.yaml new file mode 100644 index 0000000000000..702065c672112 --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-2-sds.yaml @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: cow-spark-simple.yaml +dag_rounds: 5 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100000 + start_partition: 10 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 50000 + repeat_count: 1 + num_records_upsert: 50000 + num_partitions_upsert: 1 + start_partition: 10 + type: SparkUpsertNode + deps: first_insert + first_delete: + config: + num_partitions_delete: 0 + num_records_delete: 10000 + start_partition: 10 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete \ No newline at end of file diff --git a/docker/demo/config/test-suite/multi-writer-2.properties b/docker/demo/config/test-suite/multi-writer-2.properties new file mode 100644 index 0000000000000..80db8912b5406 --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-2.properties @@ -0,0 +1,58 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +hoodie.insert.shuffle.parallelism=2 +hoodie.upsert.shuffle.parallelism=2 +hoodie.bulkinsert.shuffle.parallelism=2 +hoodie.delete.shuffle.parallelism=2 + +hoodie.metadata.enable=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.cleaner.policy.failed.writes=LAZY +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider +hoodie.write.lock.zookeeper.url=zookeeper:2181 +hoodie.write.lock.zookeeper.port=2181 +hoodie.write.lock.zookeeper.lock_key=locks +hoodie.write.lock.zookeeper.base_path=/tmp/.locks + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input2 +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/multi-writer-3-sds.yaml b/docker/demo/config/test-suite/multi-writer-3-sds.yaml new file mode 100644 index 0000000000000..9ad21f467d50b --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-3-sds.yaml @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: cow-spark-simple.yaml +dag_rounds: 4 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100000 + start_partition: 20 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 50000 + repeat_count: 1 + num_records_upsert: 50000 + num_partitions_upsert: 1 + start_partition: 20 + type: SparkUpsertNode + deps: first_insert + first_delete: + config: + num_partitions_delete: 0 + num_records_delete: 10000 + start_partition: 20 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete \ No newline at end of file diff --git a/docker/demo/config/test-suite/multi-writer-4-sds.yaml b/docker/demo/config/test-suite/multi-writer-4-sds.yaml new file mode 100644 index 0000000000000..74dfa1cb4ba6a --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-4-sds.yaml @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: cow-spark-simple.yaml +dag_rounds: 4 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100000 + start_partition: 30 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 50000 + repeat_count: 1 + num_records_upsert: 50000 + num_partitions_upsert: 1 + start_partition: 30 + type: SparkUpsertNode + deps: first_insert + first_delete: + config: + num_partitions_delete: 0 + num_records_delete: 10000 + start_partition: 30 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete \ No newline at end of file diff --git a/docker/demo/config/test-suite/multi-writer-local-1.properties b/docker/demo/config/test-suite/multi-writer-local-1.properties new file mode 100644 index 0000000000000..be16f91c17459 --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-local-1.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=2 +hoodie.upsert.shuffle.parallelism=2 +hoodie.bulkinsert.shuffle.parallelism=2 +hoodie.delete.shuffle.parallelism=2 + +hoodie.metadata.enable=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.cleaner.policy.failed.writes=LAZY +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider + +hoodie.deltastreamer.source.dfs.root=/tmp/hudi/input1 +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/multi-writer-local-2.properties b/docker/demo/config/test-suite/multi-writer-local-2.properties new file mode 100644 index 0000000000000..08f294ce1461e --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-local-2.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=2 +hoodie.upsert.shuffle.parallelism=2 +hoodie.bulkinsert.shuffle.parallelism=2 +hoodie.delete.shuffle.parallelism=2 + +hoodie.metadata.enable=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.cleaner.policy.failed.writes=LAZY +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider + +hoodie.deltastreamer.source.dfs.root=/tmp/hudi/input2 +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/multi-writer-local-3.properties b/docker/demo/config/test-suite/multi-writer-local-3.properties new file mode 100644 index 0000000000000..48f0f0b1ace8b --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-local-3.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=2 +hoodie.upsert.shuffle.parallelism=2 +hoodie.bulkinsert.shuffle.parallelism=2 +hoodie.delete.shuffle.parallelism=2 + +hoodie.metadata.enable=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.cleaner.policy.failed.writes=LAZY +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider + +hoodie.deltastreamer.source.dfs.root=/tmp/hudi/input3 +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/multi-writer-local-4.properties b/docker/demo/config/test-suite/multi-writer-local-4.properties new file mode 100644 index 0000000000000..4b5120928ccb1 --- /dev/null +++ b/docker/demo/config/test-suite/multi-writer-local-4.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=2 +hoodie.upsert.shuffle.parallelism=2 +hoodie.bulkinsert.shuffle.parallelism=2 +hoodie.delete.shuffle.parallelism=2 + +hoodie.metadata.enable=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.cleaner.policy.failed.writes=LAZY +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider + +hoodie.deltastreamer.source.dfs.root=/tmp/hudi/input4 +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/tmp/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/simple-clustering-hive.yaml b/docker/demo/config/test-suite/simple-clustering-hive.yaml new file mode 100644 index 0000000000000..1127bd02b93e8 --- /dev/null +++ b/docker/demo/config/test-suite/simple-clustering-hive.yaml @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
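The four multi-writer-local-*.properties files above differ only in their DFS input root (/tmp/hudi/input1 through /tmp/hudi/input4); the settings they share (hoodie.write.concurrency.mode=optimistic_concurrency_control, hoodie.cleaner.policy.failed.writes=LAZY, and the InProcessLockProvider) are what allow several writers to work against the same table. As a rough sketch only, not part of the test suite itself, the same settings applied to a plain Spark datasource upsert would look like the following; the DataFrame, table name, and base path are assumptions:

// Sketch: the multi-writer lock settings from the properties files above, applied to a
// hypothetical Spark datasource upsert. Table name and base path are invented for illustration.
import org.apache.spark.sql.{DataFrame, SaveMode}

def concurrentUpsert(df: DataFrame, basePath: String): Unit = {
  df.write.format("hudi")
    .option("hoodie.table.name", "table1")
    .option("hoodie.datasource.write.recordkey.field", "_row_key")
    .option("hoodie.datasource.write.partitionpath.field", "timestamp")
    .option("hoodie.datasource.write.operation", "upsert")
    // Same multi-writer settings as in the properties files above
    .option("hoodie.write.concurrency.mode", "optimistic_concurrency_control")
    .option("hoodie.cleaner.policy.failed.writes", "LAZY")
    .option("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider")
    .mode(SaveMode.Append)
    .save(basePath)
}

Each concurrent writer would run a call like this against the same basePath while reading from its own input root.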
+dag_name: simple-clustering-hive.yaml +dag_rounds: 30 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 3000 + type: DeleteNode + deps: third_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_hive_sync + first_cluster: + config: + execute_itr_count: 20 + type: ClusteringNode + deps: first_validate + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_cluster + second_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/simple-clustering.yaml b/docker/demo/config/test-suite/simple-clustering.yaml new file mode 100644 index 0000000000000..96f741ecc56a5 --- /dev/null +++ b/docker/demo/config/test-suite/simple-clustering.yaml @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
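Each of these test-suite YAMLs describes a DAG: dag_content maps a node name to its config, a node type (InsertNode, DeleteNode, HiveSyncNode, ClusteringNode, ValidateDatasetNode, and so on), and a deps entry naming the upstream node, while dag_rounds and dag_intermittent_delay_mins control how many times the DAG repeats and the delay between rounds. Purely to illustrate that shape (this is not the test suite's own loader, and the file path is an assumption), a SnakeYAML walk over one of these files could look like:

// Illustrative only: print node -> (type, deps) for a test-suite DAG yaml.
// The real integration test suite builds its DAG with its own reader.
import java.io.FileInputStream
import org.yaml.snakeyaml.Yaml

val root: Any = new Yaml().load(new FileInputStream("simple-clustering-hive.yaml"))
val dag = root.asInstanceOf[java.util.Map[String, Object]]
val nodes = dag.get("dag_content").asInstanceOf[java.util.Map[String, java.util.Map[String, Object]]]
nodes.forEach((name, node) => println(s"$name: type=${node.get("type")}, deps=${node.get("deps")}"))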
+dag_name: simple-clustering.yaml +dag_rounds: 15 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 3000 + type: DeleteNode + deps: third_insert + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_delete + first_cluster: + config: + execute_itr_count: 10 + type: ClusteringNode + deps: first_validate + second_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_cluster + first_presto_query: + config: + validate_once_every_itr: 5 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 8300 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate diff --git a/docker/demo/config/test-suite/simple-deltastreamer-hive.yaml b/docker/demo/config/test-suite/simple-deltastreamer-hive.yaml new file mode 100644 index 0000000000000..e6738b6942b35 --- /dev/null +++ b/docker/demo/config/test-suite/simple-deltastreamer-hive.yaml @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: simple-deltastreamer.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_insert + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_hive_sync + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 2000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: true + delete_input_data: true + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/simple-deltastreamer.yaml b/docker/demo/config/test-suite/simple-deltastreamer.yaml new file mode 100644 index 0000000000000..1215b337c83a0 --- /dev/null +++ b/docker/demo/config/test-suite/simple-deltastreamer.yaml @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: simple-deltastreamer.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: third_insert + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 2000 + type: DeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + validate_once_every_itr: 3 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 9600 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate diff --git a/docker/demo/config/test-suite/spark-clustering.yaml b/docker/demo/config/test-suite/spark-clustering.yaml new file mode 100644 index 0000000000000..8da4f953983b8 --- /dev/null +++ b/docker/demo/config/test-suite/spark-clustering.yaml @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
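For the single-round simple-deltastreamer.yaml above, the expected Presto result appears to fall straight out of the node configs: 1000 + 10000 + 300 records from the three inserts, plus the 300 fresh records written alongside first_upsert, minus the 2000 deleted records, gives the 9600 asserted by result1, while result2 checks that no _row_key occurs more than once. This is a reading of the configs rather than something stated in the diff, but the same arithmetic also matches spark-simple.yaml later in this change (100 + 50 - 30 = 120).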
+dag_name: cow-spark-simple.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: first_insert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: first_upsert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: second_insert + first_delete: + config: + num_partitions_delete: 10 + num_records_delete: 16000 + type: SparkDeleteNode + deps: second_upsert + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_delete \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-delete-partition.yaml b/docker/demo/config/test-suite/spark-delete-partition.yaml new file mode 100644 index 0000000000000..1d23fa7b0851c --- /dev/null +++ b/docker/demo/config/test-suite/spark-delete-partition.yaml @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-delete-partition.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 10 + type: SparkInsertNode + deps: none + first_delete_partition: + config: + partitions_to_delete: "1970/01/01" + type: SparkDeletePartitionNode + deps: first_insert + second_validate: + config: + validate_full_data : true + input_partitions_to_skip_validate : "0" + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_delete_partition + second_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 10 + start_partition: 2 + type: SparkInsertNode + deps: second_validate + third_validate: + config: + validate_full_data : true + input_partitions_to_skip_validate : "0" + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: second_insert \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-immutable-dataset.yaml b/docker/demo/config/test-suite/spark-immutable-dataset.yaml new file mode 100644 index 0000000000000..b609f3dc0886d --- /dev/null +++ b/docker/demo/config/test-suite/spark-immutable-dataset.yaml @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
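A note on the literal partition value in spark-delete-partition.yaml: the properties files in this change configure TimestampBasedKeyGenerator with UNIX_TIMESTAMP input and an output format of yyyy/MM/dd, under which a timestamp of 0 renders as the partition path 1970/01/01, which is presumably why that exact path is the one handed to SparkDeletePartitionNode. A minimal sketch of that mapping, assuming epoch seconds and UTC:

// Sketch: how epoch-second 0 becomes the 1970/01/01 partition path under the
// yyyy/MM/dd output format configured in the test-suite properties files.
import java.time.{Instant, ZoneOffset}
import java.time.format.DateTimeFormatter

val partitionFormat = DateTimeFormatter.ofPattern("yyyy/MM/dd").withZone(ZoneOffset.UTC)
println(partitionFormat.format(Instant.ofEpochSecond(0L))) // prints 1970/01/01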
+dag_name: spark-immutable-dataset.yaml +dag_rounds: 5 +dag_intermittent_delay_mins: 0 +dag_content: + first_bulk_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 5 + num_records_insert: 5000 + type: SparkBulkInsertNode + deps: none + first_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_bulk_insert + first_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 5 + num_records_insert: 5000 + type: SparkInsertNode + deps: first_validate + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_insert + first_presto_query: + config: + execute_itr_count: 5 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 48000 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 5 + delete_input_data: true + type: ValidateAsyncOperations + deps: first_presto_query \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-immutable-to-mutable.yaml b/docker/demo/config/test-suite/spark-immutable-to-mutable.yaml new file mode 100644 index 0000000000000..4b974c54d21af --- /dev/null +++ b/docker/demo/config/test-suite/spark-immutable-to-mutable.yaml @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-immutable-dataset.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 0 +dag_content: + first_bulk_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 100 + type: SparkBulkInsertNode + deps: none + first_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_bulk_insert + first_update: + config: + record_size: 200 + num_partitions_upsert: 10 + repeat_count: 1 + num_records_upsert: 50 + type: SparkUpsertNode + deps: first_validate + second_validate: + config: + validate_hive: false + delete_input_data: false + validate_full_data: true + type: ValidateDatasetNode + deps: first_update + last_validate: + config: + execute_itr_count: 1 + delete_input_data: true + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml b/docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml new file mode 100644 index 0000000000000..693d7bf22710a --- /dev/null +++ b/docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: cow-spark-deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 10 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 200 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 200 + num_partitions_insert: 1 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 1000 + num_partitions_upsert: 1 + type: SparkUpsertNode + deps: first_insert + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 1000 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 6 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 6000 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 6 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/spark-long-running.yaml b/docker/demo/config/test-suite/spark-long-running.yaml new file mode 100644 index 0000000000000..52aeb92a7f3e7 --- /dev/null +++ b/docker/demo/config/test-suite/spark-long-running.yaml @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: cow-spark-deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 20 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 200 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 200 + num_partitions_insert: 50 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 3000 + num_partitions_upsert: 50 + type: SparkUpsertNode + deps: first_insert + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 4000 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_once_every_itr : 5 + validate_hive: false + delete_input_data: true + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 30 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 189000 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 30 + max_wait_time_for_deltastreamer_catch_up_ms: 600000 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/spark-medium-clustering.yaml b/docker/demo/config/test-suite/spark-medium-clustering.yaml new file mode 100644 index 0000000000000..3045f7c4b9542 --- /dev/null +++ b/docker/demo/config/test-suite/spark-medium-clustering.yaml @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-medium-clustering.yaml +dag_rounds: 15 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 200 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_insert + first_upsert: + config: + record_size: 200 + num_partitions_insert: 50 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 3000 + num_partitions_upsert: 50 + type: SparkUpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 20 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 146000 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate + last_validate: + config: + execute_itr_count: 20 + type: ValidateAsyncOperations + deps: first_presto_query diff --git a/docker/demo/config/test-suite/spark-non-core-operations.yaml b/docker/demo/config/test-suite/spark-non-core-operations.yaml new file mode 100644 index 0000000000000..f7189ce4587c8 --- /dev/null +++ b/docker/demo/config/test-suite/spark-non-core-operations.yaml @@ -0,0 +1,204 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-non-core-operations.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: first_insert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: first_upsert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: second_insert + first_insert_overwrite: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10 + type: SparkInsertOverwriteNode + deps: second_upsert + delete_all_input_except_last: + config: + delete_input_data_except_latest: true + type: DeleteInputDatasetNode + deps: first_insert_overwrite + third_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: delete_all_input_except_last + third_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: third_insert + second_validate: + config: + validate_full_data : true + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: third_upsert + fourth_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: second_validate + fourth_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: fourth_insert + fifth_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: fourth_upsert + fifth_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: fifth_insert + first_insert_overwrite_table: + config: + record_size: 1000 + repeat_count: 1 + num_records_insert: 10 + type: SparkInsertOverwriteTableNode + deps: fifth_upsert + second_delete_all_input_except_last: + config: + delete_input_data_except_latest: true + type: DeleteInputDatasetNode + deps: first_insert_overwrite_table + sixth_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: second_delete_all_input_except_last + sixth_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: sixth_insert + third_validate: + config: + validate_full_data : true + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: sixth_upsert + seventh_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 10 + type: SparkInsertNode + 
deps: third_validate + first_delete_partition: + config: + partitions_to_delete: "1970/01/01" + type: SparkDeletePartitionNode + deps: seventh_insert + fourth_validate: + config: + validate_full_data : true + input_partitions_to_skip_validate : "0" + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_delete_partition + eighth_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 10 + start_partition: 2 + type: SparkInsertNode + deps: fourth_validate + fifth_validate: + config: + validate_full_data : true + input_partitions_to_skip_validate : "0" + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: eighth_insert \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-pure-bulk-inserts.yaml b/docker/demo/config/test-suite/spark-pure-bulk-inserts.yaml new file mode 100644 index 0000000000000..f82705cea3cec --- /dev/null +++ b/docker/demo/config/test-suite/spark-pure-bulk-inserts.yaml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-pure-bulk-inserts.yaml +dag_rounds: 5 +dag_intermittent_delay_mins: 0 +dag_content: + first_bulk_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 4 + num_records_insert: 5000 + type: SparkBulkInsertNode + deps: none + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_bulk_insert + last_validate: + config: + execute_itr_count: 5 + type: ValidateAsyncOperations + deps: second_validate \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-pure-inserts.yaml b/docker/demo/config/test-suite/spark-pure-inserts.yaml new file mode 100644 index 0000000000000..13482f988c70c --- /dev/null +++ b/docker/demo/config/test-suite/spark-pure-inserts.yaml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+dag_name: spark-pure-inserts.yaml +dag_rounds: 5 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 200 + num_partitions_insert: 10 + repeat_count: 3 + num_records_insert: 5000 + type: SparkInsertNode + deps: none + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_insert + last_validate: + config: + execute_itr_count: 10 + type: ValidateAsyncOperations + deps: second_validate \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-simple.yaml b/docker/demo/config/test-suite/spark-simple.yaml new file mode 100644 index 0000000000000..ebd1cd2d4d3ca --- /dev/null +++ b/docker/demo/config/test-suite/spark-simple.yaml @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: cow-spark-simple.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: SparkInsertNode + deps: none + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_insert + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 50 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: SparkUpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 30 + type: SparkDeleteNode + deps: first_upsert + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: first_delete + first_presto_query: + config: + execute_itr_count: 1 + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb.table1" + result1: 120 + query2: "select count(*) from testdb.table1 group by _row_key having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: second_validate \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-cow-ctas.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-cow-ctas.yaml new file mode 100644 index 0000000000000..376d2a540b3f7 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-cow-ctas.yaml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-nonpartitioned-managed-cow-ctas.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + is_external: true + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + use_ctas: true + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-mor.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-mor.yaml new file mode 100644 index 0000000000000..1899830c6fc16 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-external-mor.yaml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-nonpartitioned-external-mor.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: mor + is_external: true + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow-ctas.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow-ctas.yaml new file mode 100644 index 0000000000000..8659a90470188 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow-ctas.yaml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-nonpartitioned-managed-cow-ctas.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + use_ctas: true + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow.yaml b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow.yaml new file mode 100644 index 0000000000000..79ea448d0433a --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-nonpartitioned-managed-cow.yaml @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-sql-nonpartitioned-managed-cow.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + #merge_records: + # config: + # merge_condition: target._row_key = source._row_key + # matched_action: update set * + # not_matched_action: insert * + # record_size: 1000 + # num_partitions_insert: 10 + # repeat_count: 1 + # num_records_upsert: 100 + # num_records_insert: 1000 + # type: spark.sql.SparkSqlMergeNode + # deps: insert_records + delete_records: + config: + condition_column: begin_lat + record_size: 1000 + repeat_count: 1 + ratio_records_change: 0.2 + type: spark.sql.SparkSqlDeleteNode + deps: insert_records + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: delete_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-partition-cow-updates.yaml b/docker/demo/config/test-suite/spark-sql-partition-cow-updates.yaml new file mode 100644 index 0000000000000..a4b52559a3375 --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-partition-cow-updates.yaml @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: spark-sql-partitioned-managed-cow.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + partition_field: rider + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + first_validate: + config: + delete_input_data: false + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records + update_records: + config: + type: spark.sql.SparkSqlUpdateNode + deps: first_validate + delete_records: + config: + condition_column: begin_lat + record_size: 1000 + repeat_count: 1 + ratio_records_change: 0.2 + type: spark.sql.SparkSqlDeleteNode + deps: update_records + second_validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: delete_records diff --git a/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow-ctas.yaml b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow-ctas.yaml new file mode 100644 index 0000000000000..da0f512315c3b --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow-ctas.yaml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-partitioned-managed-cow-ctas.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + partition_field: rider + use_ctas: true + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: insert_records \ No newline at end of file diff --git a/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow.yaml b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow.yaml new file mode 100644 index 0000000000000..cb75949552d6e --- /dev/null +++ b/docker/demo/config/test-suite/spark-sql-partitioned-managed-cow.yaml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: spark-sql-partitioned-managed-cow.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + partition_field: rider + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + #merge_records: + # config: + # merge_condition: target._row_key = source._row_key + # matched_action: update set * + # not_matched_action: insert * + # record_size: 1000 + # num_partitions_insert: 10 + # repeat_count: 1 + # num_records_upsert: 100 + # num_records_insert: 1000 + # type: spark.sql.SparkSqlMergeNode + # deps: insert_records + delete_records: + config: + condition_column: begin_lat + record_size: 1000 + repeat_count: 1 + ratio_records_change: 0.2 + type: spark.sql.SparkSqlDeleteNode + deps: insert_records + validate: + config: + delete_input_data: true + type: spark.sql.SparkSqlValidateDatasetNode + deps: delete_records diff --git a/docker/demo/config/test-suite/templates/clustering.yaml.template b/docker/demo/config/test-suite/templates/clustering.yaml.template new file mode 100644 index 0000000000000..fab10ecf4cab9 --- /dev/null +++ b/docker/demo/config/test-suite/templates/clustering.yaml.template @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
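The spark.sql.SparkSqlCreateTableNode configs in the DAGs above (table_type, primary_key, pre_combine_field, optional partition_field, plus the is_external and use_ctas flags) line up with Hudi's Spark SQL DDL. As a hedged sketch only, a partitioned, managed copy-on-write table along the lines of spark-sql-partitioned-managed-cow.yaml might be created roughly like this; the column types are assumptions and the SparkSession (spark) is assumed to be in scope:

// Rough Spark SQL counterpart of a SparkSqlCreateTableNode config such as
// table_type=cow, primary_key=_row_key, pre_combine_field=test_suite_source_ordering_field,
// partition_field=rider. Column types are guesses made for the sketch.
spark.sql(
  """
    |CREATE TABLE IF NOT EXISTS table1 (
    |  _row_key STRING,
    |  test_suite_source_ordering_field BIGINT,
    |  begin_lat DOUBLE,
    |  rider STRING
    |) USING hudi
    |PARTITIONED BY (rider)
    |TBLPROPERTIES (
    |  type = 'cow',
    |  primaryKey = '_row_key',
    |  preCombineField = 'test_suite_source_ordering_field'
    |)
  """.stripMargin)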
+# +# yaml to test clustering +dag_name: NAME-clustering.yaml +dag_rounds: clustering_num_iterations +dag_intermittent_delay_mins: clustering_delay_in_mins +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 9000 + type: DeleteNode + deps: third_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + first_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: first_hive_sync + first_cluster: + config: + execute_itr_count: clustering_itr_count + type: ClusteringNode + deps: first_validate + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_cluster + second_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/templates/long_test_suite.yaml.template b/docker/demo/config/test-suite/templates/long_test_suite.yaml.template new file mode 100644 index 0000000000000..0715eb27e10e8 --- /dev/null +++ b/docker/demo/config/test-suite/templates/long_test_suite.yaml.template @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Long running test suite which cleans up input after every round of a dag. 
Which means, validation +# happens only for 1 round of dag everytime (as input is cleaned up) +dag_name: NAME-long-running-multi-partitions.yaml +dag_rounds: long_num_iterations +dag_intermittent_delay_mins: delay_in_mins +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 2 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_insert + first_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: first_hive_sync + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: true + delete_input_data: true + type: ValidateDatasetNode + deps: second_hive_sync + last_validate: + config: + execute_itr_count: long_num_iterations + validate_clean: true + validate_archival: true + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/templates/medium_test_suite.yaml.template b/docker/demo/config/test-suite/templates/medium_test_suite.yaml.template new file mode 100644 index 0000000000000..b499a92fa692d --- /dev/null +++ b/docker/demo/config/test-suite/templates/medium_test_suite.yaml.template @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Long running test suite which validates entire input after every dag. Input accumulates and so validation +# happens for entire dataset. 
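+#
+# Unlike the long-running variant, second_validate here keeps the input
+# (delete_input_data: false), so generated data accumulates across rounds and every
+# validation runs against the full dataset; last_validate then checks cleaning and
+# archival after medium_num_iterations rounds. medium_num_iterations and delay_in_mins
+# are placeholders, presumably filled in by the harness before the run.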
+dag_name: NAME-long-running-multi-partitions.yaml +dag_rounds: medium_num_iterations +dag_intermittent_delay_mins: delay_in_mins +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 2 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_insert + first_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: first_hive_sync + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: true + delete_input_data: false + type: ValidateDatasetNode + deps: second_hive_sync + last_validate: + config: + execute_itr_count: medium_num_iterations + validate_clean: true + validate_archival: true + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/templates/sanity.yaml.template b/docker/demo/config/test-suite/templates/sanity.yaml.template new file mode 100644 index 0000000000000..eae83b6af38ad --- /dev/null +++ b/docker/demo/config/test-suite/templates/sanity.yaml.template @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Sanity yaml to test simple operations. 
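+#
+# Flow: three inserts (into 5, 50 and 2 partitions) -> Hive sync -> validate against Hive
+# -> upsert -> delete -> Hive sync -> validate again, executed for a single round
+# (dag_rounds: 1). NAME and delay_in_mins are placeholders, presumably substituted by the
+# harness before submission.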
+dag_name: NAME-sanity.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: delay_in_mins +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 2 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_insert + first_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: first_hive_sync + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/templates/spark_command.txt.template b/docker/demo/config/test-suite/templates/spark_command.txt.template new file mode 100644 index 0000000000000..bf19631b0f427 --- /dev/null +++ b/docker/demo/config/test-suite/templates/spark_command.txt.template @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
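+#
+# Template for the spark-submit invocation that launches HoodieTestSuiteJob. The
+# UPPER_CASE tokens (JAR_NAME, INPUT_PATH, OUTPUT_PATH, TABLE_TYPE) and input_yaml are
+# placeholders, presumably substituted by the calling script before submission.
+# Hypothetical example values: JAR_NAME=hudi-integ-test-bundle.jar,
+# INPUT_PATH=/user/hive/warehouse/hudi-integ-test-suite/input,
+# OUTPUT_PATH=/user/hive/warehouse/hudi-integ-test-suite/output,
+# TABLE_TYPE=COPY_ON_WRITE, input_yaml=sanity.yaml.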
+ +spark-submit \ +--conf spark.task.cpus=1 \ +--conf spark.executor.cores=1 \ +--conf spark.task.maxFailures=100 \ +--conf spark.memory.fraction=0.4 \ +--conf spark.rdd.compress=true \ +--conf spark.kryoserializer.buffer.max=2000m \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.memory.storageFraction=0.1 \ +--conf spark.shuffle.service.enabled=true \ +--conf spark.sql.hive.convertMetastoreParquet=false \ +--conf spark.driver.maxResultSize=12g \ +--conf spark.executor.heartbeatInterval=120s \ +--conf spark.network.timeout=600s \ +--conf spark.yarn.max.executor.failures=10 \ +--conf spark.sql.catalogImplementation=hive \ +--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ +/opt/JAR_NAME \ +--source-ordering-field test_suite_source_ordering_field \ +--use-deltastreamer \ +--target-base-path OUTPUT_PATH \ +--input-base-path INPUT_PATH \ +--target-table table1 \ +--props file:/opt/staging/test.properties \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ +--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ +--input-file-size 125829120 \ +--workload-yaml-path file:/opt/staging/input_yaml \ +--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ +--table-type TABLE_TYPE \ +--compact-scheduling-minshare 1 \ +--clean-input \ +--clean-output \ No newline at end of file diff --git a/docker/demo/config/test-suite/templates/test.properties.template b/docker/demo/config/test-suite/templates/test.properties.template new file mode 100644 index 0000000000000..e1b65fb730a18 --- /dev/null +++ b/docker/demo/config/test-suite/templates/test.properties.template @@ -0,0 +1,50 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
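+#
+# Base properties template for the deltastreamer-driven test suite. INPUT_PATH is a
+# placeholder for the DFS input root, presumably substituted before the run. The
+# properties below cover write shuffle parallelism, the synthetic test data source,
+# key generation (TimestampBasedKeyGenerator turns the UNIX timestamp field into
+# yyyy/MM/dd partition paths), clustering sort settings, and Hive sync into
+# testdb.table1 via hiveserver.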
+ +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.plan.strategy.sort.columns=_row_key +hoodie.clustering.plan.strategy.daybased.lookback.partitions=0 +hoodie.clustering.inline.max.commits=1 + +hoodie.deltastreamer.source.dfs.root=INPUT_PATH +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..14427f323cead --- /dev/null +++ b/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
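+#
+# "Aggressive" clean/archival profile: the cleaner retains only 8 commits and the
+# timeline is archived down to 12 commits once it exceeds 14, with compaction running
+# inline, presumably so that cleaning, archival and compaction all trigger frequently
+# within a short test run.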
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..f0d9de251b869 --- /dev/null +++ b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties @@ -0,0 +1,55 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..748972861851d --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,64 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..b94ccabb55e09 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties @@ -0,0 +1,62 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-inline-compact.properties new file mode 100644 index 0000000000000..5e86790c723a9 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-inline-compact.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.embed.timeline.server=false + +hoodie.compact.inline=true +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..dd3089d190184 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,65 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.compact.inline=true +hoodie.metadata.enable=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..c10d6ecc48007 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties @@ -0,0 +1,64 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.embed.timeline.server=false +hoodie.metadata.enable=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering.properties b/docker/demo/config/test-suite/test-clustering.properties new file mode 100644 index 0000000000000..677cf96751d77 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering.properties @@ -0,0 +1,59 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
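+#
+# Inline clustering profile: clustering is scheduled every 4 commits, sorting records by
+# _hoodie_partition_path and _row_key via SparkSortAndSizeExecutionStrategy, targeting
+# files of at most 1073741824 bytes (1 GiB) and treating files under 629145600 bytes
+# (600 MiB) as small-file candidates.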
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-inline-compact.properties b/docker/demo/config/test-suite/test-inline-compact.properties new file mode 100644 index 0000000000000..76de6bd2678e1 --- /dev/null +++ b/docker/demo/config/test-suite/test-inline-compact.properties @@ -0,0 +1,54 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..7921162356e2d --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,58 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.embed.timeline.server=false +hoodie.metadata.enable=true +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..5bad7fc4ef100 --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.metadata.enable=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-inline-compact.properties b/docker/demo/config/test-suite/test-metadata-inline-compact.properties new file mode 100644 index 0000000000000..5230a1488ca67 --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-inline-compact.properties @@ -0,0 +1,58 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.plan.strategy.sort.columns=_row_key +hoodie.clustering.plan.strategy.daybased.lookback.partitions=0 +hoodie.clustering.inline.max.commits=1 + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata.properties b/docker/demo/config/test-suite/test-metadata.properties new file mode 100644 index 0000000000000..0edcd3c63d2ef --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.plan.strategy.sort.columns=_row_key +hoodie.clustering.plan.strategy.daybased.lookback.partitions=0 +hoodie.clustering.inline.max.commits=1 + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties b/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties new file mode 100644 index 0000000000000..97f2bfa4978d3 --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties @@ -0,0 +1,61 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
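+#
+# Non-partitioned variant: NonpartitionedKeyGenerator with an empty
+# hoodie.datasource.write.partitionpath.field writes the table without partitions,
+# with inline compaction and inline clustering enabled and the metadata table disabled.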
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties b/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties new file mode 100644 index 0000000000000..2298be18fe91d --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties @@ -0,0 +1,61 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties b/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties new file mode 100644 index 0000000000000..520534f3b3e92 --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned.properties b/docker/demo/config/test-suite/test-nonpartitioned.properties new file mode 100644 index 0000000000000..d51c4e5f843d3 --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-source.properties b/docker/demo/config/test-suite/test-source.properties deleted file mode 100644 index cc18a39d57086..0000000000000 --- a/docker/demo/config/test-suite/test-source.properties +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# write configs -hoodie.datasource.write.recordkey.field=_row_key -hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator -hoodie.datasource.write.partitionpath.field=timestamp - - -# deltastreamer configs -hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd -hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP -hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-bench/input -hoodie.deltastreamer.schemaprovider.source.schema.file=/var/hoodie/ws/docker/demo/config/test-suite/source.avsc -hoodie.deltastreamer.schemaprovider.target.schema.file=/var/hoodie/ws/docker/demo/config/bench/source.avsc - -#hive sync -hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ -hoodie.datasource.hive_sync.database=testdb -hoodie.datasource.hive_sync.table=table1 -hoodie.datasource.hive_sync.use_jdbc=false -hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor -hoodie.datasource.hive_sync.assume_date_partitioning=true -hoodie.datasource.hive_sync.use_pre_apache_input_format=true diff --git a/docker/demo/config/test-suite/test.properties b/docker/demo/config/test-suite/test.properties index a7fd3986a730b..3b20d3286251a 100644 --- a/docker/demo/config/test-suite/test.properties +++ b/docker/demo/config/test-suite/test.properties @@ -1,13 +1,36 @@ -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.embed.timeline.server=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + hoodie.datasource.write.recordkey.field=_row_key hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator hoodie.datasource.write.partitionpath.field=timestamp diff --git a/docker/demo/hive-batch1.commands b/docker/demo/hive-batch1.commands index 021c6d55b800d..ed2eaca8aca94 100644 --- a/docker/demo/hive-batch1.commands +++ b/docker/demo/hive-batch1.commands @@ -21,16 +21,16 @@ select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOO select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' order by ts; +select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG' order by ts; +select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG' order by ts; select symbol, max(ts) from stock_ticks_cow_bs group by symbol HAVING symbol = 'GOOG'; select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'; select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG' order by ts; +select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG' order by ts; +select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG' order by ts; !quit diff --git a/docker/demo/hive-batch2-after-compaction.commands b/docker/demo/hive-batch2-after-compaction.commands index 06582a309ae00..9d5a0b27d67d0 100644 --- a/docker/demo/hive-batch2-after-compaction.commands +++ b/docker/demo/hive-batch2-after-compaction.commands @@ -20,13 +20,13 @@ add jar ${hudi.hadoop.bundle}; select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG' 
order by ts; +select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG' order by ts; select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'; select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'; -select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG' order by ts; +select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG' order by ts; !quit diff --git a/docker/demo/hive-table-check.commands b/docker/demo/hive-table-check.commands index 8cdf033a79565..1102ca53bbbd0 100644 --- a/docker/demo/hive-table-check.commands +++ b/docker/demo/hive-table-check.commands @@ -22,6 +22,13 @@ show partitions stock_ticks_cow; show partitions stock_ticks_mor_ro; show partitions stock_ticks_mor_rt; +show create table stock_ticks_cow; +show create table stock_ticks_mor_ro; +show create table stock_ticks_mor_rt; +show create table stock_ticks_cow_bs; +show create table stock_ticks_mor_bs_ro; +show create table stock_ticks_mor_bs_rt; + !quit diff --git a/docker/demo/setup_demo_container.sh b/docker/demo/setup_demo_container.sh index e3ba231771b08..ecec2e99f713c 100755 --- a/docker/demo/setup_demo_container.sh +++ b/docker/demo/setup_demo_container.sh @@ -17,7 +17,7 @@ echo "Copying spark default config and setting up configs" cp /var/hoodie/ws/docker/demo/config/spark-defaults.conf $SPARK_CONF_DIR/. -cp /var/hoodie/ws/docker/demo/config/log4j.properties $SPARK_CONF_DIR/. +cp /var/hoodie/ws/docker/demo/config/log4j2.properties $SPARK_CONF_DIR/. hadoop fs -mkdir -p /var/demo/ hadoop fs -mkdir -p /tmp/spark-events hadoop fs -copyFromLocal -f /var/hoodie/ws/docker/demo/config /var/demo/. diff --git a/docker/demo/sparksql-bootstrap-prep-source.commands b/docker/demo/sparksql-bootstrap-prep-source.commands index 23db3e4d38c4b..ca54b4bab31c5 100644 --- a/docker/demo/sparksql-bootstrap-prep-source.commands +++ b/docker/demo/sparksql-bootstrap-prep-source.commands @@ -18,5 +18,7 @@ import org.apache.spark.sql.functions.col val df = spark.read.format("org.apache.hudi").load("/user/hive/warehouse/stock_ticks_cow/*/*/*").drop("_hoodie_commit_time", "_hoodie_record_key", "_hoodie_file_name", "_hoodie_commit_seqno", "_hoodie_partition_path") +// TODO(HUDI-4944): fix the test to use a partition column with slashes (`/`) included +// in the value. Currently it fails the tests due to slash encoding. 
df.write.format("parquet").save("/user/hive/warehouse/stock_ticks_cow_bs_src/2018/08/31/") System.exit(0) diff --git a/docker/demo/sparksql-incremental.commands b/docker/demo/sparksql-incremental.commands index febfcd28a1116..9ec586e49d854 100644 --- a/docker/demo/sparksql-incremental.commands +++ b/docker/demo/sparksql-incremental.commands @@ -21,13 +21,16 @@ import org.apache.hudi.DataSourceWriteOptions; import org.apache.spark.sql.SaveMode; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.HoodieDataSourceHelpers; +import org.apache.hudi.hive.HiveSyncConfigHolder; +import org.apache.hudi.sync.common.HoodieSyncConfig; +import org.apache.hudi.hive.MultiPartKeysValueExtractor; import org.apache.hadoop.fs.FileSystem; val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val beginInstantTime = HoodieDataSourceHelpers.listCommitsSince(fs, "/user/hive/warehouse/stock_ticks_cow", "00000").get(0) val hoodieIncQueryDF = spark.read.format("org.apache.hudi"). - option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL). - option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, beginInstantTime). + option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL). + option(DataSourceReadOptions.BEGIN_INSTANTTIME.key(), beginInstantTime). load("/user/hive/warehouse/stock_ticks_cow"); hoodieIncQueryDF.registerTempTable("stock_ticks_cow_incr") spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_incr where symbol = 'GOOG'").show(100, false); @@ -36,19 +39,21 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl write.format("org.apache.hudi"). option("hoodie.insert.shuffle.parallelism", "2"). option("hoodie.upsert.shuffle.parallelism","2"). - option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL). - option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL). - option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "key"). - option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "datestr"). - option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(HoodieWriteConfig.TABLE_NAME, "stock_ticks_derived_mor"). - option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "stock_ticks_derived_mor"). - option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default"). - option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hiveserver:10000"). - option(DataSourceWriteOptions.HIVE_USER_OPT_KEY, "hive"). - option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY, "hive"). - option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true"). - option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "datestr"). + option(DataSourceWriteOptions.TABLE_TYPE.key(), DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL). + option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL). + option(DataSourceWriteOptions.RECORDKEY_FIELD.key(), "key"). + option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), "datestr"). + option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "ts"). + option(HoodieWriteConfig.TBL_NAME.key(), "stock_ticks_derived_mor"). + option(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks_derived_mor"). + option(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default"). + option(HiveSyncConfigHolder.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000"). + option(HiveSyncConfigHolder.HIVE_USER.key(), "hive"). 
+ option(HiveSyncConfigHolder.HIVE_PASS.key(), "hive"). + option(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key(), "true"). + option(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"). + option(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName). + option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key(), "true"). mode(SaveMode.Overwrite). save("/user/hive/warehouse/stock_ticks_derived_mor"); @@ -56,8 +61,8 @@ spark.sql("select count(*) from stock_ticks_derived_mor_ro").show(20, false) spark.sql("select count(*) from stock_ticks_derived_mor_rt").show(20, false) val hoodieIncQueryBsDF = spark.read.format("org.apache.hudi"). - option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL). - option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "00000000000001"). + option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL). + option(DataSourceReadOptions.BEGIN_INSTANTTIME.key(), "00000000000001"). load("/user/hive/warehouse/stock_ticks_cow_bs"); hoodieIncQueryBsDF.registerTempTable("stock_ticks_cow_bs_incr") spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_bs_incr where symbol = 'GOOG'").show(100, false); @@ -66,19 +71,21 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl write.format("org.apache.hudi"). option("hoodie.insert.shuffle.parallelism", "2"). option("hoodie.upsert.shuffle.parallelism","2"). - option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL). - option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL). - option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "key"). - option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "datestr"). - option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(HoodieWriteConfig.TABLE_NAME, "stock_ticks_derived_mor_bs"). - option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "stock_ticks_derived_mor_bs"). - option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default"). - option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hiveserver:10000"). - option(DataSourceWriteOptions.HIVE_USER_OPT_KEY, "hive"). - option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY, "hive"). - option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true"). - option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "datestr"). + option(DataSourceWriteOptions.TABLE_TYPE.key(), DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL). + option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL). + option(DataSourceWriteOptions.RECORDKEY_FIELD.key(), "key"). + option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), "datestr"). + option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "ts"). + option(HoodieWriteConfig.TBL_NAME.key(), "stock_ticks_derived_mor_bs"). + option(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "stock_ticks_derived_mor_bs"). + option(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default"). + option(HiveSyncConfigHolder.HIVE_URL.key(), "jdbc:hive2://hiveserver:10000"). + option(HiveSyncConfigHolder.HIVE_USER.key(), "hive"). + option(HiveSyncConfigHolder.HIVE_PASS.key(), "hive"). + option(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key(), "true"). + option(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"). 
+ option(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), classOf[MultiPartKeysValueExtractor].getCanonicalName). + option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key(), "true"). mode(SaveMode.Overwrite). save("/user/hive/warehouse/stock_ticks_derived_mor_bs"); diff --git a/docker/demo/sync-validate.commands b/docker/demo/sync-validate.commands index 32c334eee01ad..e629a049a346f 100644 --- a/docker/demo/sync-validate.commands +++ b/docker/demo/sync-validate.commands @@ -1,18 +1,18 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. connect --path /docker_hoodie_sync_valid_test commits sync --path /docker_hoodie_sync_valid_test_2 diff --git a/docker/demo/trino-batch1.commands b/docker/demo/trino-batch1.commands new file mode 100644 index 0000000000000..d89c19b0bf0bf --- /dev/null +++ b/docker/demo/trino-batch1.commands @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; diff --git a/docker/demo/trino-batch2-after-compaction.commands b/docker/demo/trino-batch2-after-compaction.commands new file mode 100644 index 0000000000000..da42b4728252d --- /dev/null +++ b/docker/demo/trino-batch2-after-compaction.commands @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; diff --git a/docker/demo/trino-table-check.commands b/docker/demo/trino-table-check.commands new file mode 100644 index 0000000000000..4362d79fe770c --- /dev/null +++ b/docker/demo/trino-table-check.commands @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +show tables; diff --git a/docker/generate_test_suite.sh b/docker/generate_test_suite.sh new file mode 100755 index 0000000000000..48c876fa6184c --- /dev/null +++ b/docker/generate_test_suite.sh @@ -0,0 +1,301 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+usage="
+USAGE:
+$(basename "$0") [--help] [--all boolean] -- Script to generate test suite yamls from the arguments provided and run the generated test suites.
+
+where:
+    --help                              show this help text
+    --all                               generate and run all test suites (true/false)
+    --execute_test_suite                whether to execute the generated test suites (DEFAULT- true)
+    --medium_num_iterations             number of medium iterations (DEFAULT- 20)
+    --long_num_iterations               number of long iterations (DEFAULT- 50)
+    --intermittent_delay_mins           delay after every test run (DEFAULT- 1)
+    --table_type                        hoodie table type to test (DEFAULT COPY_ON_WRITE)
+    --include_long_test_suite_yaml      include long infra test suite (DEFAULT false)
+    --include_medium_test_suite_yaml    include medium infra test suite (DEFAULT false)
+    --cluster_num_itr                   number of cluster iterations (DEFAULT 30)
+    --include_cluster_yaml              include cluster infra test suite (DEFAULT false)
+    --input_path                        input path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/input/)
+    --output_path                       output path for test in docker image (DEFAULT /user/hive/warehouse/hudi-integ-test-suite/output/)
+
+Example:
+Note - Execute the command from within the docker folder
+
+   1. To generate and run all test suites
+      ./generate_test_suite.sh --all true
+   2. To only generate test suites, without running them
+      ./generate_test_suite.sh --all true --execute_test_suite false
+   3. To run only a specific test suite yaml
+      ./generate_test_suite.sh --execute_test_suite true --include_medium_test_suite_yaml true
+   "
+
+MEDIUM_NUM_ITR=20
+LONG_NUM_ITR=50
+DELAY_MINS=1
+TABLE_TYPE=COPY_ON_WRITE
+INCLUDE_LONG_TEST_SUITE=false
+INCLUDE_MEDIUM_TEST_SUITE=false
+INCLUDE_CLUSTER_YAML=false
+CLUSTER_NUM_ITR=30
+CLUSTER_DELAY_MINS=1
+CLUSTER_ITR_COUNT=15
+EXECUTE_TEST_SUITE=true
+JAR_NAME=hudi-integ-test-bundle-0.9.0-SNAPSHOT.jar
+INPUT_PATH="/user/hive/warehouse/hudi-integ-test-suite/input/"
+OUTPUT_PATH="/user/hive/warehouse/hudi-integ-test-suite/output/"
+
+CUR_DIR=$(pwd)
+
+POSITIONAL=()
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+    --help)
+    echo "$usage"
+    exit
+    ;;
+    --all)
+    INCLUDE_LONG_TEST_SUITE="$2"
+    INCLUDE_MEDIUM_TEST_SUITE="$2"
+    INCLUDE_CLUSTER_YAML="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --execute_test_suite)
+    EXECUTE_TEST_SUITE="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --medium_num_iterations)
+    MEDIUM_NUM_ITR="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --long_num_iterations)
+    LONG_NUM_ITR="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --intermittent_delay_mins)
+    DELAY_MINS="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --table_type)
+    TABLE_TYPE="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --include_long_test_suite_yaml)
+    INCLUDE_LONG_TEST_SUITE="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --include_medium_test_suite_yaml)
+    INCLUDE_MEDIUM_TEST_SUITE="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --include_cluster_yaml)
+    INCLUDE_CLUSTER_YAML="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --cluster_num_itr)
+    CLUSTER_NUM_ITR="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --cluster_delay_mins)
+    CLUSTER_DELAY_MINS="$2"
+    shift #
past argument
+    shift # past value
+    ;;
+    --cluster_exec_itr_count)
+    CLUSTER_ITR_COUNT="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --integ_test_jar_name)
+    JAR_NAME="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --input_path)
+    INPUT_PATH="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --output_path)
+    OUTPUT_PATH="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --default)
+    DEFAULT=YES
+    shift # past argument
+    ;;
+    *) # unknown option
+    POSITIONAL+=("$1") # save it in an array for later
+    echo "Unknown argument provided - '$1'"
+    echo "$usage"
+    exit 1
+    ;;
+esac
+done
+set -- "${POSITIONAL[@]}" # restore positional parameters
+echo "$POSITIONAL"
+echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE"
+if $INCLUDE_MEDIUM_TEST_SUITE ; then
+  echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}"
+fi
+echo "Include Long test suite $INCLUDE_LONG_TEST_SUITE"
+if $INCLUDE_LONG_TEST_SUITE ; then
+  echo "Long test suite iterations = ${LONG_NUM_ITR}"
+fi
+echo "Intermittent delay in mins = ${DELAY_MINS}"
+echo "Table type = ${TABLE_TYPE}"
+
+echo "Include cluster yaml $INCLUDE_CLUSTER_YAML"
+if $INCLUDE_CLUSTER_YAML ; then
+  echo "Cluster total itr count $CLUSTER_NUM_ITR"
+  echo "Cluster delay mins $CLUSTER_DELAY_MINS"
+  echo "Cluster exec itr count $CLUSTER_ITR_COUNT"
+fi
+echo "Jar name $JAR_NAME"
+INPUT_PATH=$(echo "$INPUT_PATH" | sed "s|\/|\\\/|g")
+echo "Input path $INPUT_PATH"
+OUTPUT_PATH=$(echo "$OUTPUT_PATH" | sed "s|\/|\\\/|g")
+echo "Output path $OUTPUT_PATH"
+
+if [ ! -f $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME ]; then
+  echo "Integ test bundle not found at $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME"
+  exit 1
+fi
+
+if [ -d "demo/config/test-suite/staging" ]; then
+  echo "Cleaning up staging dir"
+  rm -rf demo/config/test-suite/staging*
+fi
+
+if [ !
-d "demo/config/test-suite/staging" ]; then + echo "Creating staging dir" + mkdir demo/config/test-suite/staging +fi + +cp demo/config/test-suite/templates/sanity.yaml.template demo/config/test-suite/staging/sanity.yaml + +sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/sanity.yaml + +cp demo/config/test-suite/templates/test.properties.template demo/config/test-suite/staging/test.properties +sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/test.properties + +cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/sanity_spark_command.sh + +sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/sanity_spark_command.sh +sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/sanity_spark_command.sh +sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/sanity_spark_command.sh +sed -i '' "s/input_yaml/sanity.yaml/" demo/config/test-suite/staging/sanity_spark_command.sh +sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/sanity_spark_command.sh + +if $INCLUDE_MEDIUM_TEST_SUITE ; then + + cp demo/config/test-suite/templates/medium_test_suite.yaml.template demo/config/test-suite/staging/medium_test_suite.yaml + + sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/medium_test_suite.yaml + sed -i '' "s/medium_num_iterations/$MEDIUM_NUM_ITR/" demo/config/test-suite/staging/medium_test_suite.yaml + sed -i '' "s/delay_in_mins/$DELAY_MINS/" demo/config/test-suite/staging/medium_test_suite.yaml + + cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/medium_test_suite_spark_command.sh + + sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh + sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh + sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh + sed -i '' "s/input_yaml/medium_test_suite.yaml/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh + sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh + +fi + +if $INCLUDE_LONG_TEST_SUITE ; then + + cp demo/config/test-suite/templates/long_test_suite.yaml.template demo/config/test-suite/staging/long_test_suite.yaml + + sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/long_test_suite.yaml + sed -i '' "s/long_num_iterations/$LONG_NUM_ITR/" demo/config/test-suite/staging/long_test_suite.yaml + sed -i '' "s/delay_in_mins/$DELAY_MINS/" demo/config/test-suite/staging/long_test_suite.yaml + + cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/long_test_suite_spark_command.sh + + sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/long_test_suite_spark_command.sh + sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/long_test_suite_spark_command.sh + sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/long_test_suite_spark_command.sh + sed -i '' "s/input_yaml/long_test_suite.yaml/" demo/config/test-suite/staging/long_test_suite_spark_command.sh + sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/long_test_suite_spark_command.sh + +fi + +if $INCLUDE_CLUSTER_YAML ; then + + cp demo/config/test-suite/templates/clustering.yaml.template demo/config/test-suite/staging/clustering.yaml + + sed -i '' "s/NAME/$TABLE_TYPE/" 
demo/config/test-suite/staging/clustering.yaml + sed -i '' "s/clustering_num_iterations/$CLUSTER_NUM_ITR/" demo/config/test-suite/staging/clustering.yaml + sed -i '' "s/delay_in_mins/$CLUSTER_DELAY_MINS/" demo/config/test-suite/staging/clustering.yaml + sed -i '' "s/clustering_itr_count/$CLUSTER_ITR_COUNT/" demo/config/test-suite/staging/clustering.yaml + + cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/clustering_spark_command.sh + + sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/clustering_spark_command.sh + sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/clustering_spark_command.sh + sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/clustering_spark_command.sh + sed -i '' "s/input_yaml/clustering.yaml/" demo/config/test-suite/staging/clustering_spark_command.sh + sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/clustering_spark_command.sh + sed -i '' "/use-deltastreamer/d" demo/config/test-suite/staging/clustering_spark_command.sh + +fi + +if $EXECUTE_TEST_SUITE ; then + + docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/"$JAR_NAME" adhoc-2:/opt/ + docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging* + docker cp demo/config/test-suite/staging/ adhoc-2:/opt/ + docker exec -it adhoc-2 /bin/bash echo "\n============================== Executing sanity test suite ============================== " + docker exec -it adhoc-2 /bin/bash /opt/staging/sanity_spark_command.sh + + if [ -f demo/config/test-suite/staging/medium_test_suite_spark_command.sh ]; then + docker exec -it adhoc-2 /bin/bash echo "\n\n\n============================== Executing medium test suite ============================== " + docker exec -it adhoc-2 /bin/bash /opt/staging/medium_test_suite_spark_command.sh + fi + + if [ -f demo/config/test-suite/staging/long_test_suite_spark_command.sh ]; then + docker exec -it adhoc-2 /bin/bash echo "\n\n\n============================== Executing long test suite ============================== " + docker exec -it adhoc-2 /bin/bash /opt/staging/long_test_suite_spark_command.sh + fi + + if [ -f demo/config/test-suite/staging/clustering_spark_command.sh ]; then + docker exec -it adhoc-2 /bin/bash echo "\n\n\n============================== Executing clustering test suite ============================== " + docker exec -it adhoc-2 /bin/bash /opt/staging/clustering_spark_command.sh + fi + +fi diff --git a/docker/hoodie/hadoop/base/pom.xml b/docker/hoodie/hadoop/base/pom.xml index 459379da4c14e..b296efe57c01a 100644 --- a/docker/hoodie/hadoop/base/pom.xml +++ b/docker/hoodie/hadoop/base/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom diff --git a/docker/hoodie/hadoop/base_java11/Dockerfile b/docker/hoodie/hadoop/base_java11/Dockerfile new file mode 100644 index 0000000000000..8052eae6add84 --- /dev/null +++ b/docker/hoodie/hadoop/base_java11/Dockerfile @@ -0,0 +1,60 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM openjdk:11-jdk-slim-bullseye +MAINTAINER Hoodie +USER root + +# Default to UTF-8 file.encoding +ENV LANG C.UTF-8 + +ARG HADOOP_VERSION=2.8.4 +ARG HADOOP_URL=https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz +ENV HADOOP_VERSION ${HADOOP_VERSION} +ENV HADOOP_URL ${HADOOP_URL} + +RUN set -x \ + && DEBIAN_FRONTEND=noninteractive apt-get -yq update && apt-get -yq install curl wget netcat procps \ + && echo "Fetch URL2 is : ${HADOOP_URL}" \ + && curl -fSL "${HADOOP_URL}" -o /tmp/hadoop.tar.gz \ + && curl -fSL "${HADOOP_URL}.asc" -o /tmp/hadoop.tar.gz.asc \ + && mkdir -p /opt/hadoop-$HADOOP_VERSION/logs \ + && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \ + && rm /tmp/hadoop.tar.gz* \ + && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \ + && cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml \ + && mkdir /hadoop-data + +ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION +ENV HADOOP_CONF_DIR=/etc/hadoop +ENV MULTIHOMED_NETWORK=1 +ENV HADOOP_HOME=${HADOOP_PREFIX} +ENV HADOOP_INSTALL=${HADOOP_HOME} +ENV USER=root +ENV PATH /usr/bin:/bin:$HADOOP_PREFIX/bin/:$PATH + +# Exposing a union of ports across hadoop versions +# Well known ports including ssh +EXPOSE 0-1024 4040 7000-10100 5000-5100 50000-50200 58188 58088 58042 + +ADD entrypoint.sh /entrypoint.sh +ADD export_container_ip.sh /usr/bin/ +RUN chmod a+x /usr/bin/export_container_ip.sh \ + && chmod a+x /entrypoint.sh + +ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] + diff --git a/docker/hoodie/hadoop/base_java11/entrypoint.sh b/docker/hoodie/hadoop/base_java11/entrypoint.sh new file mode 100644 index 0000000000000..7c26f29f66886 --- /dev/null +++ b/docker/hoodie/hadoop/base_java11/entrypoint.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+#######################################################################################
+##                                   COPIED FROM                                     ##
+##  https://github.com/big-data-europe/docker-hadoop/blob/master/base/entrypoint.sh  ##
+#                                                                                    ##
+#######################################################################################
+
+# Set some sensible defaults
+export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
+
+function addProperty() {
+    local path=$1
+    local name=$2
+    local value=$3
+
+    local entry="<property><name>$name</name><value>${value}</value></property>"
+    local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
+    sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
+}
+
+function configure() {
+    local path=$1
+    local module=$2
+    local envPrefix=$3
+
+    local var
+    local value
+
+    echo "Configuring $module"
+    for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
+        name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
+        var="${envPrefix}_${c}"
+        value=${!var}
+        echo " - Setting $name=$value"
+        addProperty /etc/hadoop/$module-site.xml $name "$value"
+    done
+}
+
+configure /etc/hadoop/core-site.xml core CORE_CONF
+configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
+configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
+configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
+configure /etc/hadoop/kms-site.xml kms KMS_CONF
+
+if [ "$MULTIHOMED_NETWORK" = "1" ]; then
+    echo "Configuring for multihomed network"
+
+    # HDFS
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
+    addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
+
+    # YARN
+    addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
+
+    # MAPRED
+    addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
+fi
+
+if [ -n "$GANGLIA_HOST" ]; then
+    mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
+    mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
+
+    for module in mapred jvm rpc ugi; do
+        echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
+        echo "$module.period=10"
+        echo "$module.servers=$GANGLIA_HOST:8649"
+    done > /etc/hadoop/hadoop-metrics.properties
+
+    for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
+        echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
+        echo "$module.sink.ganglia.period=10"
+        echo "$module.sink.ganglia.supportsparse=true"
+        echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
+        echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
+        echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
+    done > /etc/hadoop/hadoop-metrics2.properties
+fi
+
+# Save Container IP in ENV variable
+/usr/bin/export_container_ip.sh
+
+exec "$@"
diff --git a/docker/hoodie/hadoop/base_java11/export_container_ip.sh
b/docker/hoodie/hadoop/base_java11/export_container_ip.sh new file mode 100755 index 0000000000000..b427f92ccf7c3 --- /dev/null +++ b/docker/hoodie/hadoop/base_java11/export_container_ip.sh @@ -0,0 +1,30 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +interfaces=( "en0" "eth0" ) + +ipAddr="" +for interface in "${interfaces[@]}" +do + ipAddr=`ifconfig $interface | grep -Eo 'inet (addr:)?([0-9]+\.){3}[0-9]+' | grep -Eo '([0-9]+\.){3}[0-9]+' | grep -v '127.0.0.1' | head` + if [ -n "$ipAddr" ]; then + break + fi +done + +echo "Container IP is set to : $ipAddr" +export MY_CONTAINER_IP=$ipAddr diff --git a/docker/hoodie/hadoop/base_java11/pom.xml b/docker/hoodie/hadoop/base_java11/pom.xml new file mode 100644 index 0000000000000..5531c64aa8725 --- /dev/null +++ b/docker/hoodie/hadoop/base_java11/pom.xml @@ -0,0 +1,96 @@ + + + + + hudi-hadoop-docker + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + pom + hudi-hadoop-base-java11-docker + + Base Docker Image with Hoodie + + + UTF-8 + true + ${project.parent.parent.basedir} + + + + + + + org.apache.hudi + hudi-hadoop-docker + ${project.version} + pom + import + + + + + + + hudi + + + + com.spotify + dockerfile-maven-plugin + ${dockerfile.maven.version} + + + tag-latest + pre-integration-test + + build + tag + + + + ${docker.build.skip} + false + apachehudi/hudi-hadoop_${docker.hadoop.version}-base-java11 + true + latest + + + + tag-version + pre-integration-test + + build + tag + + + + ${docker.build.skip} + false + apachehudi/hudi-hadoop_${docker.hadoop.version}-base-java11 + true + ${project.version} + + + + + + + diff --git a/docker/hoodie/hadoop/datanode/pom.xml b/docker/hoodie/hadoop/datanode/pom.xml index f7406a18fbfb1..29aeadcb9b24d 100644 --- a/docker/hoodie/hadoop/datanode/pom.xml +++ b/docker/hoodie/hadoop/datanode/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom diff --git a/docker/hoodie/hadoop/historyserver/pom.xml b/docker/hoodie/hadoop/historyserver/pom.xml index da90fa07ecc40..4f255b4081cc5 100644 --- a/docker/hoodie/hadoop/historyserver/pom.xml +++ b/docker/hoodie/hadoop/historyserver/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom diff --git a/docker/hoodie/hadoop/hive_base/Dockerfile b/docker/hoodie/hadoop/hive_base/Dockerfile index 8d85fd5b52969..7d04d94fc60cc 100644 --- a/docker/hoodie/hadoop/hive_base/Dockerfile +++ b/docker/hoodie/hadoop/hive_base/Dockerfile @@ -64,6 +64,8 @@ COPY entrypoint.sh /usr/local/bin/ RUN chmod +x /usr/local/bin/entrypoint.sh ENV PATH $HIVE_HOME/bin/:$PATH +# NOTE: This is the only battle-proven method to inject jars into Hive CLI +ENV AUX_CLASSPATH=file://${HUDI_HADOOP_BUNDLE} ENTRYPOINT ["entrypoint.sh"] CMD startup.sh diff --git 
a/docker/hoodie/hadoop/hive_base/pom.xml b/docker/hoodie/hadoop/hive_base/pom.xml index 220483e7297e7..0d5f8412d1117 100644 --- a/docker/hoodie/hadoop/hive_base/pom.xml +++ b/docker/hoodie/hadoop/hive_base/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom @@ -55,12 +55,12 @@ package - + - - - + + + run diff --git a/docker/hoodie/hadoop/namenode/pom.xml b/docker/hoodie/hadoop/namenode/pom.xml index 6e1dfd23d72c4..c506d9e48a2d7 100644 --- a/docker/hoodie/hadoop/namenode/pom.xml +++ b/docker/hoodie/hadoop/namenode/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom diff --git a/docker/hoodie/hadoop/pom.xml b/docker/hoodie/hadoop/pom.xml index efb1153a8b6a3..44eb722a33e9f 100644 --- a/docker/hoodie/hadoop/pom.xml +++ b/docker/hoodie/hadoop/pom.xml @@ -19,7 +19,7 @@ hudi org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT ../../../pom.xml 4.0.0 @@ -28,6 +28,7 @@ pom base + base_java11 namenode datanode historyserver @@ -37,12 +38,15 @@ sparkworker sparkadhoc prestobase + trinobase + trinocoordinator + trinoworker org.apache.hudi - hudi-spark-bundle_${scala.binary.version} + hudi-spark${sparkbundle.version}-bundle_${scala.binary.version} ${project.version} @@ -53,8 +57,9 @@ 2.4.4 2.3.3 2.8.4 - 0.217 - 1.4.3 + 0.271 + 368 + 1.4.13 true ${project.parent.basedir} diff --git a/docker/hoodie/hadoop/prestobase/Dockerfile b/docker/hoodie/hadoop/prestobase/Dockerfile index 43b989e6b60e6..accedb94db3dc 100644 --- a/docker/hoodie/hadoop/prestobase/Dockerfile +++ b/docker/hoodie/hadoop/prestobase/Dockerfile @@ -22,7 +22,7 @@ ARG HADOOP_VERSION=2.8.4 ARG HIVE_VERSION=2.3.3 FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-base:latest as hadoop-base -ARG PRESTO_VERSION=0.217 +ARG PRESTO_VERSION=0.271 ENV PRESTO_VERSION ${PRESTO_VERSION} ENV PRESTO_HOME /opt/presto-server-${PRESTO_VERSION} @@ -79,6 +79,15 @@ RUN chmod +x /usr/local/bin/entrypoint.sh ADD target/ /var/hoodie/ws/docker/hoodie/hadoop/prestobase/target/ ENV HUDI_PRESTO_BUNDLE /var/hoodie/ws/docker/hoodie/hadoop/prestobase/target/hudi-presto-bundle.jar RUN cp ${HUDI_PRESTO_BUNDLE} ${PRESTO_HOME}/plugin/hive-hadoop2/ +# TODO: the latest master of Presto relies on hudi-presto-bundle, while current Presto releases +# rely on hudi-common and hudi-hadoop-mr 0.9.0, which are pulled in plugin/hive-hadoop2/ in the +# docker setup, making it hard to test the latest changes in Hudi due to class conflict. +# To get around the conflicts due to older Hudi jars below, they are removed for integration tests, +# so the hudi-presto-bundle build can be used solely for testing. This temporary logic must be +# removed once Presto has a new release depending on hudi-presto-bundle and we upgrade docker setup +# to that release version. +RUN rm ${PRESTO_HOME}/plugin/hive-hadoop2/hudi-common-* +RUN rm ${PRESTO_HOME}/plugin/hive-hadoop2/hudi-hadoop-mr-* VOLUME ["${PRESTO_LOG_DIR}"] diff --git a/docker/hoodie/hadoop/prestobase/pom.xml b/docker/hoodie/hadoop/prestobase/pom.xml index 5f3cd4cc00544..c318a498395ee 100644 --- a/docker/hoodie/hadoop/prestobase/pom.xml +++ b/docker/hoodie/hadoop/prestobase/pom.xml @@ -16,13 +16,11 @@ See the License for the specific language governing permissions and limitations under the License. 
--> - + hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom @@ -57,9 +55,9 @@ package - + - + run diff --git a/docker/hoodie/hadoop/spark_base/pom.xml b/docker/hoodie/hadoop/spark_base/pom.xml index 98ad8c972a886..3dbb4f8f2f778 100644 --- a/docker/hoodie/hadoop/spark_base/pom.xml +++ b/docker/hoodie/hadoop/spark_base/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom diff --git a/docker/hoodie/hadoop/sparkadhoc/Dockerfile b/docker/hoodie/hadoop/sparkadhoc/Dockerfile index a114cf0fbe92c..9e5a4cb68332b 100644 --- a/docker/hoodie/hadoop/sparkadhoc/Dockerfile +++ b/docker/hoodie/hadoop/sparkadhoc/Dockerfile @@ -20,16 +20,37 @@ ARG HIVE_VERSION=2.3.3 ARG SPARK_VERSION=2.4.4 FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION} -ARG PRESTO_VERSION=0.217 +ARG PRESTO_VERSION=0.268 +ARG TRINO_VERSION=368 COPY adhoc.sh /opt/spark ENV SPARK_WORKER_WEBUI_PORT 8081 ENV SPARK_WORKER_LOG /spark/logs ENV SPARK_MASTER "spark://spark-master:7077" ENV PRESTO_VERSION ${PRESTO_VERSION} +ENV TRINO_VERSION ${TRINO_VERSION} +ENV BASE_URL=https://repo1.maven.org/maven2 + +RUN apt-get update +RUN apt-get install -y \ + curl \ + tar \ + sudo \ + rsync \ + python \ + wget \ + python3-pip \ + python-dev \ + build-essential \ + uuid-runtime \ + less RUN set -x \ ## presto-client - && wget -q -O /usr/local/bin/presto https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar \ + && wget -q -O /usr/local/bin/presto ${BASE_URL}/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar \ && chmod +x /usr/local/bin/presto +RUN set -x \ + ## trino-cli + && wget -q -O /usr/local/bin/trino ${BASE_URL}/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar \ + && chmod +x /usr/local/bin/trino CMD ["/bin/bash", "/opt/spark/adhoc.sh"] diff --git a/docker/hoodie/hadoop/sparkadhoc/adhoc.sh b/docker/hoodie/hadoop/sparkadhoc/adhoc.sh index b20e8cb5fe06a..fd2ef651765f1 100644 --- a/docker/hoodie/hadoop/sparkadhoc/adhoc.sh +++ b/docker/hoodie/hadoop/sparkadhoc/adhoc.sh @@ -22,10 +22,12 @@ export SPARK_HOME=/opt/spark -export PRESTO_CLI_CMD="/usr/local/bin/presto --server presto-coordinator-1" +export PRESTO_CLI_CMD="/usr/local/bin/presto --server presto-coordinator-1:8090" +export TRINO_CLI_CMD="/usr/local/bin/trino --server trino-coordinator-1:8091" date echo "SPARK HOME is : $SPARK_HOME" echo "PRESTO CLI CMD is : $PRESTO_CLI_CMD" +echo "TRINO CLI CMD is : $TRINO_CLI_CMD" tail -f /dev/null diff --git a/docker/hoodie/hadoop/sparkadhoc/pom.xml b/docker/hoodie/hadoop/sparkadhoc/pom.xml index 0ec0c1c9a4803..dc8f9ff0bfe51 100644 --- a/docker/hoodie/hadoop/sparkadhoc/pom.xml +++ b/docker/hoodie/hadoop/sparkadhoc/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom @@ -64,7 +64,9 @@ ${docker.build.skip} false - apachehudi/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version} + + apachehudi/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version} + true latest @@ -80,7 +82,9 @@ ${docker.build.skip} false - apachehudi/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version} + + apachehudi/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version} + true ${project.version} 
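With the adhoc image now shipping both query CLIs as self-executing jars, they can be exercised directly against the demo coordinators. A minimal usage sketch, assuming the demo cluster is up and a demo table such as stock_ticks_cow has been created; the container and coordinator host names are the ones used by the demo compose file and adhoc.sh:

  docker exec -it adhoc-1 /usr/local/bin/presto --server presto-coordinator-1:8090 --catalog hive --schema default --execute "show tables"
  docker exec -it adhoc-1 /usr/local/bin/trino --server trino-coordinator-1:8091 --catalog hive --schema default --execute "select count(*) from stock_ticks_cow"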
diff --git a/docker/hoodie/hadoop/sparkmaster/pom.xml b/docker/hoodie/hadoop/sparkmaster/pom.xml index 78758fc040e85..aa739268739c5 100644 --- a/docker/hoodie/hadoop/sparkmaster/pom.xml +++ b/docker/hoodie/hadoop/sparkmaster/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom diff --git a/docker/hoodie/hadoop/sparkworker/pom.xml b/docker/hoodie/hadoop/sparkworker/pom.xml index 486baad829988..89fbcc56fe020 100644 --- a/docker/hoodie/hadoop/sparkworker/pom.xml +++ b/docker/hoodie/hadoop/sparkworker/pom.xml @@ -19,7 +19,7 @@ hudi-hadoop-docker org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 pom diff --git a/docker/hoodie/hadoop/trinobase/Dockerfile b/docker/hoodie/hadoop/trinobase/Dockerfile new file mode 100644 index 0000000000000..9d7c23010fbb8 --- /dev/null +++ b/docker/hoodie/hadoop/trinobase/Dockerfile @@ -0,0 +1,66 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Trino docker setup is adapted from https://github.com/Lewuathe/docker-trino-cluster + +ARG HADOOP_VERSION=2.8.4 +ARG HIVE_VERSION=2.3.3 +FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-base-java11:latest as hadoop-base + +ENV TRINO_VERSION=368 +ENV TRINO_HOME=/usr/local/trino +ENV BASE_URL=https://repo1.maven.org/maven2 + +RUN apt-get update +RUN apt-get install -y \ + curl \ + tar \ + sudo \ + rsync \ + python \ + wget \ + python3-pip \ + python-dev \ + build-essential \ + uuid-runtime \ + less + +ENV JAVA_HOME /usr/java/default +ENV PATH $PATH:$JAVA_HOME/bin + +WORKDIR /usr/local/bin +RUN wget -q ${BASE_URL}/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar +RUN chmod +x trino-cli-${TRINO_VERSION}-executable.jar +RUN mv trino-cli-${TRINO_VERSION}-executable.jar trino-cli + +WORKDIR /usr/local +RUN wget -q ${BASE_URL}/io/trino/trino-server/${TRINO_VERSION}/trino-server-${TRINO_VERSION}.tar.gz +RUN tar xvzf trino-server-${TRINO_VERSION}.tar.gz -C /usr/local/ +RUN ln -s /usr/local/trino-server-${TRINO_VERSION} $TRINO_HOME + +ENV TRINO_BASE_WS /var/hoodie/ws/docker/hoodie/hadoop/trinobase +RUN mkdir -p ${TRINO_BASE_WS}/target/ +ADD target/ ${TRINO_BASE_WS}/target/ +ENV HUDI_TRINO_BUNDLE ${TRINO_BASE_WS}/target/hudi-trino-bundle.jar +RUN cp ${HUDI_TRINO_BUNDLE} ${TRINO_HOME}/plugin/hive/ + +ADD scripts ${TRINO_HOME}/scripts +RUN chmod +x ${TRINO_HOME}/scripts/trino.sh + +RUN mkdir -p $TRINO_HOME/data +VOLUME ["$TRINO_HOME/data"] diff --git a/docker/hoodie/hadoop/trinobase/pom.xml b/docker/hoodie/hadoop/trinobase/pom.xml new file mode 100644 index 0000000000000..792123e505d4f --- /dev/null +++ b/docker/hoodie/hadoop/trinobase/pom.xml @@ -0,0 +1,116 @@ + + + + + hudi-hadoop-docker + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + pom + hudi-hadoop-trinobase-docker + Trino Base Docker Image with 
Hudi + + + UTF-8 + true + ${project.parent.parent.basedir} + + + + + + org.apache.hudi + hudi-hadoop-base-java11-docker + ${project.version} + pom + import + + + + + + + + org.apache.maven.plugins + maven-antrun-plugin + 1.7 + + + package + + + + + + + run + + + + + + + com.spotify + dockerfile-maven-plugin + ${dockerfile.maven.version} + + + tag-latest + pre-integration-test + + build + tag + + + ${docker.build.skip} + false + + apachehudi/hudi-hadoop_${docker.hadoop.version}-trinobase_${docker.trino.version} + + true + latest + + + + tag-version + pre-integration-test + + build + tag + + + + ${docker.build.skip} + false + + apachehudi/hudi-hadoop_${docker.hadoop.version}-trinobase_${docker.trino.version} + + true + ${project.version} + + + + + + + diff --git a/docker/hoodie/hadoop/trinobase/scripts/trino.sh b/docker/hoodie/hadoop/trinobase/scripts/trino.sh new file mode 100644 index 0000000000000..9aacd842c3dec --- /dev/null +++ b/docker/hoodie/hadoop/trinobase/scripts/trino.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +/usr/local/trino/bin/launcher run diff --git a/docker/hoodie/hadoop/trinocoordinator/Dockerfile b/docker/hoodie/hadoop/trinocoordinator/Dockerfile new file mode 100644 index 0000000000000..67a31448d7a65 --- /dev/null +++ b/docker/hoodie/hadoop/trinocoordinator/Dockerfile @@ -0,0 +1,29 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Trino docker setup is adapted from https://github.com/Lewuathe/docker-trino-cluster + +ARG HADOOP_VERSION=2.8.4 +ARG TRINO_VERSION=368 +FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-trinobase_${TRINO_VERSION}:latest as trino-base + +ADD etc /usr/local/trino/etc +EXPOSE 8091 + +WORKDIR /usr/local/trino +ENTRYPOINT [ "./scripts/trino.sh" ] diff --git a/docker/hoodie/hadoop/trinocoordinator/etc/catalog/hive.properties b/docker/hoodie/hadoop/trinocoordinator/etc/catalog/hive.properties new file mode 100644 index 0000000000000..ed7fce1b3e640 --- /dev/null +++ b/docker/hoodie/hadoop/trinocoordinator/etc/catalog/hive.properties @@ -0,0 +1,22 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +connector.name=hive +hive.metastore.uri=thrift://hivemetastore:9083 +hive.config.resources=/etc/hadoop/core-site.xml,/etc/hadoop/hdfs-site.xml +hive.hdfs.authentication.type=NONE diff --git a/docker/hoodie/hadoop/trinocoordinator/etc/config.properties b/docker/hoodie/hadoop/trinocoordinator/etc/config.properties new file mode 100644 index 0000000000000..9876a0fe0f008 --- /dev/null +++ b/docker/hoodie/hadoop/trinocoordinator/etc/config.properties @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +coordinator=true +node-scheduler.include-coordinator=false +http-server.http.port=8091 +query.max-memory=50GB +query.max-memory-per-node=1GB +query.max-total-memory-per-node=2GB +discovery-server.enabled=true +discovery.uri=http://trino-coordinator-1:8091 diff --git a/docker/hoodie/hadoop/trinocoordinator/etc/jvm.config b/docker/hoodie/hadoop/trinocoordinator/etc/jvm.config new file mode 100644 index 0000000000000..fb17203ca211b --- /dev/null +++ b/docker/hoodie/hadoop/trinocoordinator/etc/jvm.config @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +-server +-Xmx16G +-XX:+UseG1GC +-XX:G1HeapRegionSize=32M +-XX:+UseGCOverheadLimit +-XX:+ExplicitGCInvokesConcurrent +-XX:+HeapDumpOnOutOfMemoryError +-XX:OnOutOfMemoryError=kill -9 %p +-Djdk.attach.allowAttachSelf=true diff --git a/docker/hoodie/hadoop/trinocoordinator/etc/log.properties b/docker/hoodie/hadoop/trinocoordinator/etc/log.properties new file mode 100644 index 0000000000000..23b063080b4fe --- /dev/null +++ b/docker/hoodie/hadoop/trinocoordinator/etc/log.properties @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +io.trinosql=INFO diff --git a/docker/hoodie/hadoop/trinocoordinator/etc/node.properties b/docker/hoodie/hadoop/trinocoordinator/etc/node.properties new file mode 100644 index 0000000000000..d97d547485998 --- /dev/null +++ b/docker/hoodie/hadoop/trinocoordinator/etc/node.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
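+# node.environment must be identical across all nodes of one cluster, node.id must be unique per node
+# (which is why the coordinator and worker images carry different UUIDs), and node.data-dir is where the
+# server keeps its logs and local state.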
+# +node.environment=development +node.id=3044b958-f077-4fce-87ed-ca8308f800b6 +node.data-dir=/usr/local/trino/data diff --git a/docker/hoodie/hadoop/trinocoordinator/pom.xml b/docker/hoodie/hadoop/trinocoordinator/pom.xml new file mode 100644 index 0000000000000..69f6a525962c0 --- /dev/null +++ b/docker/hoodie/hadoop/trinocoordinator/pom.xml @@ -0,0 +1,96 @@ + + + + + hudi-hadoop-docker + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + pom + hudi-hadoop-trinocoordinator-docker + Trino Coordinator Docker Image with Hudi + + + UTF-8 + true + ${project.parent.parent.basedir} + + + + + + org.apache.hudi + hudi-hadoop-trinobase-docker + ${project.version} + pom + + + + + + + + + com.spotify + dockerfile-maven-plugin + ${dockerfile.maven.version} + + + tag-latest + pre-integration-test + + build + tag + + + ${docker.build.skip} + false + + apachehudi/hudi-hadoop_${docker.hadoop.version}-trinocoordinator_${docker.trino.version} + + true + latest + + + + tag-version + pre-integration-test + + build + tag + + + + ${docker.build.skip} + false + + apachehudi/hudi-hadoop_${docker.hadoop.version}-trinocoordinator_${docker.trino.version} + + true + ${project.version} + + + + + + + diff --git a/docker/hoodie/hadoop/trinoworker/Dockerfile b/docker/hoodie/hadoop/trinoworker/Dockerfile new file mode 100644 index 0000000000000..ae5b2766dc9d9 --- /dev/null +++ b/docker/hoodie/hadoop/trinoworker/Dockerfile @@ -0,0 +1,29 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Trino docker setup is adapted from https://github.com/Lewuathe/docker-trino-cluster + +ARG HADOOP_VERSION=2.8.4 +ARG TRINO_VERSION=368 +FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-trinobase_${TRINO_VERSION}:latest as trino-base + +ADD etc /usr/local/trino/etc +EXPOSE 8092 + +WORKDIR /usr/local/trino +ENTRYPOINT [ "./scripts/trino.sh" ] diff --git a/docker/hoodie/hadoop/trinoworker/etc/catalog/hive.properties b/docker/hoodie/hadoop/trinoworker/etc/catalog/hive.properties new file mode 100644 index 0000000000000..ed7fce1b3e640 --- /dev/null +++ b/docker/hoodie/hadoop/trinoworker/etc/catalog/hive.properties @@ -0,0 +1,22 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +connector.name=hive +hive.metastore.uri=thrift://hivemetastore:9083 +hive.config.resources=/etc/hadoop/core-site.xml,/etc/hadoop/hdfs-site.xml +hive.hdfs.authentication.type=NONE diff --git a/docker/hoodie/hadoop/trinoworker/etc/config.properties b/docker/hoodie/hadoop/trinoworker/etc/config.properties new file mode 100644 index 0000000000000..0e15d3d7c1e9c --- /dev/null +++ b/docker/hoodie/hadoop/trinoworker/etc/config.properties @@ -0,0 +1,24 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +coordinator=false +http-server.http.port=8091 +query.max-memory=50GB +query.max-memory-per-node=1GB +query.max-total-memory-per-node=2GB +discovery.uri=http://trino-coordinator-1:8091 diff --git a/docker/hoodie/hadoop/trinoworker/etc/jvm.config b/docker/hoodie/hadoop/trinoworker/etc/jvm.config new file mode 100644 index 0000000000000..fb17203ca211b --- /dev/null +++ b/docker/hoodie/hadoop/trinoworker/etc/jvm.config @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +-server +-Xmx16G +-XX:+UseG1GC +-XX:G1HeapRegionSize=32M +-XX:+UseGCOverheadLimit +-XX:+ExplicitGCInvokesConcurrent +-XX:+HeapDumpOnOutOfMemoryError +-XX:OnOutOfMemoryError=kill -9 %p +-Djdk.attach.allowAttachSelf=true diff --git a/docker/hoodie/hadoop/trinoworker/etc/log.properties b/docker/hoodie/hadoop/trinoworker/etc/log.properties new file mode 100644 index 0000000000000..23b063080b4fe --- /dev/null +++ b/docker/hoodie/hadoop/trinoworker/etc/log.properties @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +io.trinosql=INFO diff --git a/docker/hoodie/hadoop/trinoworker/etc/node.properties b/docker/hoodie/hadoop/trinoworker/etc/node.properties new file mode 100644 index 0000000000000..6cfebf995602e --- /dev/null +++ b/docker/hoodie/hadoop/trinoworker/etc/node.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +node.environment=development +node.id=6606f0b3-6ae7-4152-a4b1-ddadb6345fe6 +node.data-dir=/var/trino/data diff --git a/docker/hoodie/hadoop/trinoworker/pom.xml b/docker/hoodie/hadoop/trinoworker/pom.xml new file mode 100644 index 0000000000000..c1346272b3a4b --- /dev/null +++ b/docker/hoodie/hadoop/trinoworker/pom.xml @@ -0,0 +1,96 @@ + + + + + hudi-hadoop-docker + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + pom + hudi-hadoop-trinoworker-docker + Trino Worker Docker Image with Hudi + + + UTF-8 + true + ${project.parent.parent.basedir} + + + + + + org.apache.hudi + hudi-hadoop-trinobase-docker + ${project.version} + pom + + + + + + + + + com.spotify + dockerfile-maven-plugin + ${dockerfile.maven.version} + + + tag-latest + pre-integration-test + + build + tag + + + ${docker.build.skip} + false + + apachehudi/hudi-hadoop_${docker.hadoop.version}-trinoworker_${docker.trino.version} + + true + latest + + + + tag-version + pre-integration-test + + build + tag + + + + ${docker.build.skip} + false + + apachehudi/hudi-hadoop_${docker.hadoop.version}-trinoworker_${docker.trino.version} + + true + ${project.version} + + + + + + + diff --git a/docker/images/push_to_docker_hub.png b/docker/images/push_to_docker_hub.png new file mode 100644 index 0000000000000..faa431bd16784 Binary files /dev/null and b/docker/images/push_to_docker_hub.png differ diff --git a/docker/setup_demo.sh b/docker/setup_demo.sh index 634fe9e509bdd..81270bba75ffe 100755 --- a/docker/setup_demo.sh +++ b/docker/setup_demo.sh @@ -17,12 +17,20 @@ # limitations under the License. 
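+# Usage:
+#   ./setup_demo.sh                  bring up the default hadoop 2.8.4 / hive 2.3.3 / spark 2.4.4 demo cluster
+#   ./setup_demo.sh dev              same, but skip pulling images and reuse locally built ones
+#   ./setup_demo.sh --mac-aarch64    use the aarch64 compose file for Apple-silicon hosts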
SCRIPT_PATH=$(cd `dirname $0`; pwd) +HUDI_DEMO_ENV=$1 WS_ROOT=`dirname $SCRIPT_PATH` +COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244.yml" +if [ "$HUDI_DEMO_ENV" = "--mac-aarch64" ]; then + COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml" +fi # restart cluster -HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/compose/docker-compose_hadoop284_hive233_spark244.yml down -HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/compose/docker-compose_hadoop284_hive233_spark244.yml pull +HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/compose/${COMPOSE_FILE_NAME} down +if [ "$HUDI_DEMO_ENV" != "dev" ]; then + echo "Pulling docker demo images ..." + HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/compose/${COMPOSE_FILE_NAME} pull +fi sleep 5 -HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/compose/docker-compose_hadoop284_hive233_spark244.yml up -d +HUDI_WS=${WS_ROOT} docker-compose --verbose -f ${SCRIPT_PATH}/compose/${COMPOSE_FILE_NAME} up -d sleep 15 docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh diff --git a/docker/stop_demo.sh b/docker/stop_demo.sh index 83b8a2c1ef5c0..32a0e70c37919 100755 --- a/docker/stop_demo.sh +++ b/docker/stop_demo.sh @@ -17,10 +17,15 @@ # limitations under the License. SCRIPT_PATH=$(cd `dirname $0`; pwd) +HUDI_DEMO_ENV=$1 # set up root directory WS_ROOT=`dirname $SCRIPT_PATH` +COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244.yml" +if [ "$HUDI_DEMO_ENV" = "--mac-aarch64" ]; then + COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml" +fi # shut down cluster -HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/compose/docker-compose_hadoop284_hive233_spark244.yml down +HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/compose/${COMPOSE_FILE_NAME} down # remove houst mount directory rm -rf /tmp/hadoop_data diff --git a/hudi-aws/pom.xml b/hudi-aws/pom.xml new file mode 100644 index 0000000000000..3bd3f49c3efdd --- /dev/null +++ b/hudi-aws/pom.xml @@ -0,0 +1,278 @@ + + + + + hudi + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-aws + 0.12.2-dt-SNAPSHOT + + hudi-aws + jar + + + 1.15.0 + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-hive-sync + ${project.version} + + + + + org.apache.hadoop + hadoop-common + tests + test + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + + com.amazonaws + dynamodb-lock-client + ${dynamodb.lockclient.version} + + + + + ${hive.groupid} + hive-service + ${hive.version} + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + + org.apache.parquet + parquet-avro + + + + + com.amazonaws + aws-java-sdk-cloudwatch + ${aws.sdk.version} + + + + com.amazonaws + aws-java-sdk-dynamodb + ${aws.sdk.version} + + + io.netty + * + + + + + com.amazonaws + aws-java-sdk-core + ${aws.sdk.version} + + + + io.dropwizard.metrics + metrics-core + + + + com.amazonaws + aws-java-sdk-glue + ${aws.sdk.version} + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.junit.jupiter + junit-jupiter-api + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.junit.vintage + junit-vintage-engine + test + + + + org.junit.jupiter + junit-jupiter-params + test + + + + org.mockito + mockito-junit-jupiter + test + + + + org.junit.platform + junit-platform-runner + test + + + + org.junit.platform + junit-platform-suite-api + test + + + + + + + org.jacoco 
+ jacoco-maven-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + + compile + + compile + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + io.fabric8 + docker-maven-plugin + + + prepare-it-database + pre-integration-test + + start + + + + + amazon/dynamodb-local:${dynamodb-local.version} + it-database + + + ${dynamodb-local.port}:${dynamodb-local.port} + + + + ${dynamodb-local.endpoint}/shell/ + + + + + + + + + + remove-it-database + post-integration-test + + stop + + + + + + org.apache.rat + apache-rat-plugin + + + + + + src/main/resources + + + src/test/resources + + + + diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/cloudwatch/CloudWatchReporter.java b/hudi-aws/src/main/java/org/apache/hudi/aws/cloudwatch/CloudWatchReporter.java new file mode 100644 index 0000000000000..b208ec92260fd --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/cloudwatch/CloudWatchReporter.java @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws.cloudwatch; + +import org.apache.hudi.aws.credentials.HoodieAWSCredentialsProviderFactory; +import org.apache.hudi.common.util.Option; + +import com.amazonaws.services.cloudwatch.AmazonCloudWatchAsync; +import com.amazonaws.services.cloudwatch.AmazonCloudWatchAsyncClientBuilder; +import com.amazonaws.services.cloudwatch.model.Dimension; +import com.amazonaws.services.cloudwatch.model.MetricDatum; +import com.amazonaws.services.cloudwatch.model.PutMetricDataRequest; +import com.amazonaws.services.cloudwatch.model.PutMetricDataResult; +import com.amazonaws.services.cloudwatch.model.StandardUnit; +import com.codahale.metrics.Clock; +import com.codahale.metrics.Counter; +import com.codahale.metrics.Counting; +import com.codahale.metrics.Gauge; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Meter; +import com.codahale.metrics.MetricFilter; +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.ScheduledReporter; +import com.codahale.metrics.Timer; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.SortedMap; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +/** + * A reporter for publishing metrics to Amazon CloudWatch. It is responsible for collecting, converting DropWizard + * metrics to CloudWatch metrics and composing metrics payload. 
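+ * Each datum is tagged with a Table dimension (derived from the metric name prefix) and a Metric Type dimension
+ * (gauge or count), and datums are shipped in batches of at most maxDatumsPerRequest (default 20) per
+ * PutMetricData call.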
+ */ +public class CloudWatchReporter extends ScheduledReporter { + + static final String DIMENSION_TABLE_NAME_KEY = "Table"; + static final String DIMENSION_METRIC_TYPE_KEY = "Metric Type"; + static final String DIMENSION_GAUGE_TYPE_VALUE = "gauge"; + static final String DIMENSION_COUNT_TYPE_VALUE = "count"; + + private static final Logger LOG = LogManager.getLogger(CloudWatchReporter.class); + + private final AmazonCloudWatchAsync cloudWatchClientAsync; + private final Clock clock; + private final String prefix; + private final String namespace; + private final int maxDatumsPerRequest; + + public static Builder forRegistry(MetricRegistry registry) { + return new Builder(registry); + } + + public static class Builder { + private final MetricRegistry registry; + private Clock clock; + private String prefix; + private TimeUnit rateUnit; + private TimeUnit durationUnit; + private MetricFilter filter; + private String namespace; + private int maxDatumsPerRequest; + + private Builder(MetricRegistry registry) { + this.registry = registry; + this.clock = Clock.defaultClock(); + this.rateUnit = TimeUnit.SECONDS; + this.durationUnit = TimeUnit.MILLISECONDS; + this.filter = MetricFilter.ALL; + this.maxDatumsPerRequest = 20; + } + + public Builder withClock(Clock clock) { + this.clock = clock; + return this; + } + + public Builder prefixedWith(String prefix) { + this.prefix = prefix; + return this; + } + + public Builder convertRatesTo(TimeUnit rateUnit) { + this.rateUnit = rateUnit; + return this; + } + + public Builder convertDurationsTo(TimeUnit durationUnit) { + this.durationUnit = durationUnit; + return this; + } + + public Builder filter(MetricFilter filter) { + this.filter = filter; + return this; + } + + public Builder namespace(String namespace) { + this.namespace = namespace; + return this; + } + + public Builder maxDatumsPerRequest(int maxDatumsPerRequest) { + this.maxDatumsPerRequest = maxDatumsPerRequest; + return this; + } + + public CloudWatchReporter build(Properties props) { + return new CloudWatchReporter(registry, + getAmazonCloudWatchClient(props), + clock, + prefix, + namespace, + maxDatumsPerRequest, + filter, + rateUnit, + durationUnit); + } + + CloudWatchReporter build(AmazonCloudWatchAsync amazonCloudWatchAsync) { + return new CloudWatchReporter(registry, + amazonCloudWatchAsync, + clock, + prefix, + namespace, + maxDatumsPerRequest, + filter, + rateUnit, + durationUnit); + } + } + + protected CloudWatchReporter(MetricRegistry registry, + AmazonCloudWatchAsync cloudWatchClientAsync, + Clock clock, + String prefix, + String namespace, + int maxDatumsPerRequest, + MetricFilter filter, + TimeUnit rateUnit, + TimeUnit durationUnit) { + super(registry, "hudi-cloudWatch-reporter", filter, rateUnit, durationUnit); + this.cloudWatchClientAsync = cloudWatchClientAsync; + this.clock = clock; + this.prefix = prefix; + this.namespace = namespace; + this.maxDatumsPerRequest = maxDatumsPerRequest; + } + + private static AmazonCloudWatchAsync getAmazonCloudWatchClient(Properties props) { + return AmazonCloudWatchAsyncClientBuilder.standard() + .withCredentials(HoodieAWSCredentialsProviderFactory.getAwsCredentialsProvider(props)) + .build(); + } + + @Override + public void report(SortedMap gauges, + SortedMap counters, + SortedMap histograms, + SortedMap meters, + SortedMap timers) { + LOG.info("Reporting Metrics to CloudWatch."); + + final long timestampMilliSec = clock.getTime(); + List metricsData = new ArrayList<>(); + + for (Map.Entry entry : gauges.entrySet()) { + 
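+      // Only numeric gauge values are published; non-numeric gauges are silently skipped (see processGauge).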
processGauge(entry.getKey(), entry.getValue(), timestampMilliSec, metricsData); + } + + for (Map.Entry entry : counters.entrySet()) { + processCounter(entry.getKey(), entry.getValue(), timestampMilliSec, metricsData); + } + + for (Map.Entry entry : histograms.entrySet()) { + processCounter(entry.getKey(), entry.getValue(), timestampMilliSec, metricsData); + //TODO: Publish other Histogram metrics to cloud watch + } + + for (Map.Entry entry : meters.entrySet()) { + processCounter(entry.getKey(), entry.getValue(), timestampMilliSec, metricsData); + //TODO: Publish other Meter metrics to cloud watch + } + + for (Map.Entry entry : timers.entrySet()) { + processCounter(entry.getKey(), entry.getValue(), timestampMilliSec, metricsData); + //TODO: Publish other Timer metrics to cloud watch + } + + report(metricsData); + } + + private void report(List metricsData) { + List> cloudWatchFutures = new ArrayList<>(metricsData.size()); + List> partitions = new ArrayList<>(); + + for (int i = 0; i < metricsData.size(); i += maxDatumsPerRequest) { + int end = Math.min(metricsData.size(), i + maxDatumsPerRequest); + partitions.add(metricsData.subList(i, end)); + } + + for (List partition : partitions) { + PutMetricDataRequest request = new PutMetricDataRequest() + .withNamespace(namespace) + .withMetricData(partition); + + cloudWatchFutures.add(cloudWatchClientAsync.putMetricDataAsync(request)); + } + + for (final Future cloudWatchFuture : cloudWatchFutures) { + try { + cloudWatchFuture.get(30, TimeUnit.SECONDS); + } catch (final Exception ex) { + LOG.error("Error reporting metrics to CloudWatch. The data in this CloudWatch request " + + "may have been discarded, and not made it to CloudWatch.", ex); + } + } + } + + private void processGauge(final String metricName, + final Gauge gauge, + final long timestampMilliSec, + final List metricData) { + Option.ofNullable(gauge.getValue()) + .toJavaOptional() + .filter(value -> value instanceof Number) + .map(value -> (Number) value) + .ifPresent(value -> stageMetricDatum(metricName, + value.doubleValue(), + DIMENSION_GAUGE_TYPE_VALUE, + StandardUnit.None, + timestampMilliSec, + metricData)); + } + + private void processCounter(final String metricName, + final Counting counter, + final long timestampMilliSec, + final List metricData) { + stageMetricDatum(metricName, + counter.getCount(), + DIMENSION_COUNT_TYPE_VALUE, + StandardUnit.Count, + timestampMilliSec, + metricData); + } + + private void stageMetricDatum(String metricName, + double metricValue, + String metricType, + StandardUnit standardUnit, + long timestampMilliSec, + List metricData) { + String[] metricNameParts = metricName.split("\\.", 2); + String tableName = metricNameParts[0]; + + + metricData.add(new MetricDatum() + .withTimestamp(new Date(timestampMilliSec)) + .withMetricName(prefix(metricNameParts[1])) + .withValue(metricValue) + .withDimensions(getDimensions(tableName, metricType)) + .withUnit(standardUnit)); + } + + private List getDimensions(String tableName, String metricType) { + List dimensions = new ArrayList<>(); + dimensions.add(new Dimension() + .withName(DIMENSION_TABLE_NAME_KEY) + .withValue(tableName)); + dimensions.add(new Dimension() + .withName(DIMENSION_METRIC_TYPE_KEY) + .withValue(metricType)); + return dimensions; + } + + private String prefix(String... 
components) { + return MetricRegistry.name(prefix, components); + } + + @Override + public void stop() { + try { + super.stop(); + } finally { + try { + cloudWatchClientAsync.shutdown(); + } catch (Exception ex) { + LOG.warn("Exception while shutting down CloudWatch client.", ex); + } + } + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/credentials/HoodieAWSCredentialsProviderFactory.java b/hudi-aws/src/main/java/org/apache/hudi/aws/credentials/HoodieAWSCredentialsProviderFactory.java new file mode 100644 index 0000000000000..631b0fa8d5349 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/credentials/HoodieAWSCredentialsProviderFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws.credentials; + +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSCredentialsProviderChain; +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +/** + * Factory class for Hoodie AWSCredentialsProvider. + */ +public class HoodieAWSCredentialsProviderFactory { + public static AWSCredentialsProvider getAwsCredentialsProvider(Properties props) { + return getAwsCredentialsProviderChain(props); + } + + private static AWSCredentialsProvider getAwsCredentialsProviderChain(Properties props) { + List providers = new ArrayList<>(); + providers.add(new HoodieConfigAWSCredentialsProvider(props)); + providers.add(new DefaultAWSCredentialsProviderChain()); + AWSCredentialsProviderChain providerChain = new AWSCredentialsProviderChain(providers); + providerChain.setReuseLastProvider(true); + return providerChain; + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/credentials/HoodieConfigAWSCredentialsProvider.java b/hudi-aws/src/main/java/org/apache/hudi/aws/credentials/HoodieConfigAWSCredentialsProvider.java new file mode 100644 index 0000000000000..4e9cf383906a4 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/credentials/HoodieConfigAWSCredentialsProvider.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws.credentials; + +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.auth.BasicSessionCredentials; +import org.apache.hudi.config.HoodieAWSConfig; +import org.apache.hudi.common.util.StringUtils; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Properties; + +/** + * Credentials provider which fetches AWS access key from Hoodie config. + */ +public class HoodieConfigAWSCredentialsProvider implements AWSCredentialsProvider { + + private static final Logger LOG = LogManager.getLogger(HoodieConfigAWSCredentialsProvider.class); + + private AWSCredentials awsCredentials; + + public HoodieConfigAWSCredentialsProvider(Properties props) { + String accessKey = props.getProperty(HoodieAWSConfig.AWS_ACCESS_KEY.key()); + String secretKey = props.getProperty(HoodieAWSConfig.AWS_SECRET_KEY.key()); + String sessionToken = props.getProperty(HoodieAWSConfig.AWS_SESSION_TOKEN.key()); + + if (StringUtils.isNullOrEmpty(accessKey) || StringUtils.isNullOrEmpty(secretKey)) { + LOG.debug("AWS access key or secret key not found in the Hudi configuration. " + + "Use default AWS credentials"); + } else { + this.awsCredentials = createCredentials(accessKey, secretKey, sessionToken); + } + } + + private static AWSCredentials createCredentials(String accessKey, String secretKey, + String sessionToken) { + return (sessionToken == null) + ? new BasicAWSCredentials(accessKey, secretKey) + : new BasicSessionCredentials(accessKey, secretKey, sessionToken); + } + + @Override + public AWSCredentials getCredentials() { + return this.awsCredentials; + } + + @Override + public void refresh() { + + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java new file mode 100644 index 0000000000000..d9c4ecd96af03 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java @@ -0,0 +1,472 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.aws.sync; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.sync.common.HoodieSyncClient; +import org.apache.hudi.sync.common.model.Partition; + +import com.amazonaws.services.glue.AWSGlue; +import com.amazonaws.services.glue.AWSGlueClientBuilder; +import com.amazonaws.services.glue.model.AlreadyExistsException; +import com.amazonaws.services.glue.model.BatchCreatePartitionRequest; +import com.amazonaws.services.glue.model.BatchCreatePartitionResult; +import com.amazonaws.services.glue.model.BatchUpdatePartitionRequest; +import com.amazonaws.services.glue.model.BatchUpdatePartitionRequestEntry; +import com.amazonaws.services.glue.model.BatchUpdatePartitionResult; +import com.amazonaws.services.glue.model.Column; +import com.amazonaws.services.glue.model.CreateDatabaseRequest; +import com.amazonaws.services.glue.model.CreateDatabaseResult; +import com.amazonaws.services.glue.model.CreateTableRequest; +import com.amazonaws.services.glue.model.CreateTableResult; +import com.amazonaws.services.glue.model.DatabaseInput; +import com.amazonaws.services.glue.model.EntityNotFoundException; +import com.amazonaws.services.glue.model.GetDatabaseRequest; +import com.amazonaws.services.glue.model.GetPartitionsRequest; +import com.amazonaws.services.glue.model.GetPartitionsResult; +import com.amazonaws.services.glue.model.GetTableRequest; +import com.amazonaws.services.glue.model.PartitionInput; +import com.amazonaws.services.glue.model.SerDeInfo; +import com.amazonaws.services.glue.model.StorageDescriptor; +import com.amazonaws.services.glue.model.Table; +import com.amazonaws.services.glue.model.TableInput; +import com.amazonaws.services.glue.model.UpdateTableRequest; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.schema.MessageType; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +import static org.apache.hudi.aws.utils.S3Utils.s3aToS3; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE; +import static org.apache.hudi.common.util.MapUtils.isNullOrEmpty; +import static org.apache.hudi.hive.util.HiveSchemaUtil.getPartitionKeyType; +import static org.apache.hudi.hive.util.HiveSchemaUtil.parquetSchemaToMapSchema; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS; +import static org.apache.hudi.sync.common.util.TableUtils.tableId; + +/** + * This class implements all the AWS APIs to enable syncing of a Hudi Table with the + * AWS Glue Data Catalog (https://docs.aws.amazon.com/glue/latest/dg/populate-data-catalog.html). 
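+ * Partition additions and updates are issued in batches of at most MAX_PARTITIONS_PER_REQUEST (100) entries,
+ * with a short pause (BATCH_REQUEST_SLEEP_MILLIS) between successive batch calls.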
+ * + * @Experimental + */ +public class AWSGlueCatalogSyncClient extends HoodieSyncClient { + + private static final Logger LOG = LogManager.getLogger(AWSGlueCatalogSyncClient.class); + private static final int MAX_PARTITIONS_PER_REQUEST = 100; + private static final long BATCH_REQUEST_SLEEP_MILLIS = 1000L; + private final AWSGlue awsGlue; + private final String databaseName; + + public AWSGlueCatalogSyncClient(HiveSyncConfig config) { + super(config); + this.awsGlue = AWSGlueClientBuilder.standard().build(); + this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME); + } + + @Override + public List getAllPartitions(String tableName) { + try { + List partitions = new ArrayList<>(); + String nextToken = null; + do { + GetPartitionsResult result = awsGlue.getPartitions(new GetPartitionsRequest() + .withDatabaseName(databaseName) + .withTableName(tableName) + .withNextToken(nextToken)); + partitions.addAll(result.getPartitions().stream() + .map(p -> new Partition(p.getValues(), p.getStorageDescriptor().getLocation())) + .collect(Collectors.toList())); + nextToken = result.getNextToken(); + } while (nextToken != null); + return partitions; + } catch (Exception e) { + throw new HoodieGlueSyncException("Failed to get all partitions for table " + tableId(databaseName, tableName), e); + } + } + + @Override + public void addPartitionsToTable(String tableName, List partitionsToAdd) { + if (partitionsToAdd.isEmpty()) { + LOG.info("No partitions to add for " + tableId(databaseName, tableName)); + return; + } + LOG.info("Adding " + partitionsToAdd.size() + " partition(s) in table " + tableId(databaseName, tableName)); + try { + Table table = getTable(awsGlue, databaseName, tableName); + StorageDescriptor sd = table.getStorageDescriptor(); + List partitionInputs = partitionsToAdd.stream().map(partition -> { + StorageDescriptor partitionSd = sd.clone(); + String fullPartitionPath = FSUtils.getPartitionPath(getBasePath(), partition).toString(); + List partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); + partitionSd.setLocation(fullPartitionPath); + return new PartitionInput().withValues(partitionValues).withStorageDescriptor(partitionSd); + }).collect(Collectors.toList()); + + for (List batch : CollectionUtils.batches(partitionInputs, MAX_PARTITIONS_PER_REQUEST)) { + BatchCreatePartitionRequest request = new BatchCreatePartitionRequest(); + request.withDatabaseName(databaseName).withTableName(tableName).withPartitionInputList(batch); + + BatchCreatePartitionResult result = awsGlue.batchCreatePartition(request); + if (CollectionUtils.nonEmpty(result.getErrors())) { + throw new HoodieGlueSyncException("Fail to add partitions to " + tableId(databaseName, tableName) + + " with error(s): " + result.getErrors()); + } + Thread.sleep(BATCH_REQUEST_SLEEP_MILLIS); + } + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to add partitions to " + tableId(databaseName, tableName), e); + } + } + + @Override + public void updatePartitionsToTable(String tableName, List changedPartitions) { + if (changedPartitions.isEmpty()) { + LOG.info("No partitions to change for " + tableName); + return; + } + LOG.info("Updating " + changedPartitions.size() + "partition(s) in table " + tableId(databaseName, tableName)); + try { + Table table = getTable(awsGlue, databaseName, tableName); + StorageDescriptor sd = table.getStorageDescriptor(); + List updatePartitionEntries = changedPartitions.stream().map(partition -> { + StorageDescriptor partitionSd = sd.clone(); + String 
fullPartitionPath = FSUtils.getPartitionPath(getBasePath(), partition).toString(); + List partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); + partitionSd.setLocation(fullPartitionPath); + PartitionInput partitionInput = new PartitionInput().withValues(partitionValues).withStorageDescriptor(partitionSd); + return new BatchUpdatePartitionRequestEntry().withPartitionInput(partitionInput).withPartitionValueList(partitionValues); + }).collect(Collectors.toList()); + + for (List batch : CollectionUtils.batches(updatePartitionEntries, MAX_PARTITIONS_PER_REQUEST)) { + BatchUpdatePartitionRequest request = new BatchUpdatePartitionRequest(); + request.withDatabaseName(databaseName).withTableName(tableName).withEntries(batch); + + BatchUpdatePartitionResult result = awsGlue.batchUpdatePartition(request); + if (CollectionUtils.nonEmpty(result.getErrors())) { + throw new HoodieGlueSyncException("Fail to update partitions to " + tableId(databaseName, tableName) + + " with error(s): " + result.getErrors()); + } + Thread.sleep(BATCH_REQUEST_SLEEP_MILLIS); + } + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to update partitions to " + tableId(databaseName, tableName), e); + } + } + + @Override + public void dropPartitions(String tableName, List partitionsToDrop) { + throw new UnsupportedOperationException("Not support dropPartitionsToTable yet."); + } + + /** + * Update the table properties to the table. + */ + @Override + public void updateTableProperties(String tableName, Map tableProperties) { + if (isNullOrEmpty(tableProperties)) { + return; + } + try { + updateTableParameters(awsGlue, databaseName, tableName, tableProperties, false); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to update properties for table " + tableId(databaseName, tableName), e); + } + } + + @Override + public void updateTableSchema(String tableName, MessageType newSchema) { + // ToDo Cascade is set in Hive meta sync, but need to investigate how to configure it for Glue meta + boolean cascade = config.getSplitStrings(META_SYNC_PARTITION_FIELDS).size() > 0; + try { + Table table = getTable(awsGlue, databaseName, tableName); + Map newSchemaMap = parquetSchemaToMapSchema(newSchema, config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false); + List newColumns = getColumnsFromSchema(newSchemaMap); + StorageDescriptor sd = table.getStorageDescriptor(); + sd.setColumns(newColumns); + + final Date now = new Date(); + TableInput updatedTableInput = new TableInput() + .withName(tableName) + .withTableType(table.getTableType()) + .withParameters(table.getParameters()) + .withPartitionKeys(table.getPartitionKeys()) + .withStorageDescriptor(sd) + .withLastAccessTime(now) + .withLastAnalyzedTime(now); + + UpdateTableRequest request = new UpdateTableRequest() + .withDatabaseName(databaseName) + .withTableInput(updatedTableInput); + + awsGlue.updateTable(request); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to update definition for table " + tableId(databaseName, tableName), e); + } + } + + @Override + public void createTable(String tableName, + MessageType storageSchema, + String inputFormatClass, + String outputFormatClass, + String serdeClass, + Map serdeProperties, + Map tableProperties) { + if (tableExists(tableName)) { + return; + } + CreateTableRequest request = new CreateTableRequest(); + Map params = new HashMap<>(); + if (!config.getBoolean(HIVE_CREATE_MANAGED_TABLE)) { + params.put("EXTERNAL", "TRUE"); + } + params.putAll(tableProperties); + 
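+    // Build the Glue TableInput below: unless Hive-managed table creation is enabled above, the table is
+    // registered as EXTERNAL, and the user-supplied table properties are carried over as Glue table
+    // parameters alongside the serde, storage descriptor and partition keys.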
+ try { + Map mapSchema = parquetSchemaToMapSchema(storageSchema, config.getBoolean(HIVE_SUPPORT_TIMESTAMP_TYPE), false); + + List schemaWithoutPartitionKeys = getColumnsFromSchema(mapSchema); + + // now create the schema partition + List schemaPartitionKeys = config.getSplitStrings(META_SYNC_PARTITION_FIELDS).stream().map(partitionKey -> { + String keyType = getPartitionKeyType(mapSchema, partitionKey); + return new Column().withName(partitionKey).withType(keyType.toLowerCase()).withComment(""); + }).collect(Collectors.toList()); + + StorageDescriptor storageDescriptor = new StorageDescriptor(); + serdeProperties.put("serialization.format", "1"); + storageDescriptor + .withSerdeInfo(new SerDeInfo().withSerializationLibrary(serdeClass).withParameters(serdeProperties)) + .withLocation(s3aToS3(getBasePath())) + .withInputFormat(inputFormatClass) + .withOutputFormat(outputFormatClass) + .withColumns(schemaWithoutPartitionKeys); + + final Date now = new Date(); + TableInput tableInput = new TableInput() + .withName(tableName) + .withTableType(TableType.EXTERNAL_TABLE.toString()) + .withParameters(params) + .withPartitionKeys(schemaPartitionKeys) + .withStorageDescriptor(storageDescriptor) + .withLastAccessTime(now) + .withLastAnalyzedTime(now); + request.withDatabaseName(databaseName) + .withTableInput(tableInput); + + CreateTableResult result = awsGlue.createTable(request); + LOG.info("Created table " + tableId(databaseName, tableName) + " : " + result); + } catch (AlreadyExistsException e) { + LOG.warn("Table " + tableId(databaseName, tableName) + " already exists.", e); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to create " + tableId(databaseName, tableName), e); + } + } + + @Override + public Map getMetastoreSchema(String tableName) { + try { + // GlueMetastoreClient returns partition keys separate from Columns, hence get both and merge to + // get the Schema of the table. 
+ Table table = getTable(awsGlue, databaseName, tableName); + Map partitionKeysMap = + table.getPartitionKeys().stream().collect(Collectors.toMap(Column::getName, f -> f.getType().toUpperCase())); + + Map columnsMap = + table.getStorageDescriptor().getColumns().stream().collect(Collectors.toMap(Column::getName, f -> f.getType().toUpperCase())); + + Map schema = new HashMap<>(); + schema.putAll(columnsMap); + schema.putAll(partitionKeysMap); + return schema; + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to get schema for table " + tableId(databaseName, tableName), e); + } + } + + @Override + public boolean tableExists(String tableName) { + GetTableRequest request = new GetTableRequest() + .withDatabaseName(databaseName) + .withName(tableName); + try { + return Objects.nonNull(awsGlue.getTable(request).getTable()); + } catch (EntityNotFoundException e) { + LOG.info("Table not found: " + tableId(databaseName, tableName), e); + return false; + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to get table: " + tableId(databaseName, tableName), e); + } + } + + @Override + public boolean databaseExists(String databaseName) { + GetDatabaseRequest request = new GetDatabaseRequest(); + request.setName(databaseName); + try { + return Objects.nonNull(awsGlue.getDatabase(request).getDatabase()); + } catch (EntityNotFoundException e) { + LOG.info("Database not found: " + databaseName, e); + return false; + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to check if database exists " + databaseName, e); + } + } + + @Override + public void createDatabase(String databaseName) { + if (databaseExists(databaseName)) { + return; + } + CreateDatabaseRequest request = new CreateDatabaseRequest(); + request.setDatabaseInput(new DatabaseInput() + .withName(databaseName) + .withDescription("Automatically created by " + this.getClass().getName()) + .withParameters(null) + .withLocationUri(null)); + try { + CreateDatabaseResult result = awsGlue.createDatabase(request); + LOG.info("Successfully created database in AWS Glue: " + result.toString()); + } catch (AlreadyExistsException e) { + LOG.warn("AWS Glue Database " + databaseName + " already exists", e); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to create database " + databaseName, e); + } + } + + @Override + public Option getLastCommitTimeSynced(String tableName) { + try { + Table table = getTable(awsGlue, databaseName, tableName); + return Option.ofNullable(table.getParameters().get(HOODIE_LAST_COMMIT_TIME_SYNC)); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to get last sync commit time for " + tableId(databaseName, tableName), e); + } + } + + @Override + public void close() { + awsGlue.shutdown(); + } + + @Override + public void updateLastCommitTimeSynced(String tableName) { + if (!getActiveTimeline().lastInstant().isPresent()) { + LOG.warn("No commit in active timeline."); + return; + } + final String lastCommitTimestamp = getActiveTimeline().lastInstant().get().getTimestamp(); + try { + updateTableParameters(awsGlue, databaseName, tableName, Collections.singletonMap(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitTimestamp), false); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to update last sync commit time for " + tableId(databaseName, tableName), e); + } + } + + @Override + public Option getLastReplicatedTime(String tableName) { + throw new UnsupportedOperationException("Not supported: `getLastReplicatedTime`"); + } + + @Override + public 
void updateLastReplicatedTimeStamp(String tableName, String timeStamp) { + throw new UnsupportedOperationException("Not supported: `updateLastReplicatedTimeStamp`"); + } + + @Override + public void deleteLastReplicatedTimeStamp(String tableName) { + throw new UnsupportedOperationException("Not supported: `deleteLastReplicatedTimeStamp`"); + } + + private List getColumnsFromSchema(Map mapSchema) { + List cols = new ArrayList<>(); + for (String key : mapSchema.keySet()) { + // In Glue, the full schema should exclude the partition keys + if (!config.getSplitStrings(META_SYNC_PARTITION_FIELDS).contains(key)) { + String keyType = getPartitionKeyType(mapSchema, key); + Column column = new Column().withName(key).withType(keyType.toLowerCase()).withComment(""); + cols.add(column); + } + } + return cols; + } + + private enum TableType { + MANAGED_TABLE, + EXTERNAL_TABLE, + VIRTUAL_VIEW, + INDEX_TABLE, + MATERIALIZED_VIEW + } + + private static Table getTable(AWSGlue awsGlue, String databaseName, String tableName) throws HoodieGlueSyncException { + GetTableRequest request = new GetTableRequest() + .withDatabaseName(databaseName) + .withName(tableName); + try { + return awsGlue.getTable(request).getTable(); + } catch (EntityNotFoundException e) { + throw new HoodieGlueSyncException("Table not found: " + tableId(databaseName, tableName), e); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to get table " + tableId(databaseName, tableName), e); + } + } + + private static void updateTableParameters(AWSGlue awsGlue, String databaseName, String tableName, Map updatingParams, boolean shouldReplace) { + final Map newParams = new HashMap<>(); + try { + Table table = getTable(awsGlue, databaseName, tableName); + if (!shouldReplace) { + newParams.putAll(table.getParameters()); + } + newParams.putAll(updatingParams); + + final Date now = new Date(); + TableInput updatedTableInput = new TableInput() + .withName(tableName) + .withTableType(table.getTableType()) + .withParameters(newParams) + .withPartitionKeys(table.getPartitionKeys()) + .withStorageDescriptor(table.getStorageDescriptor()) + .withLastAccessTime(now) + .withLastAnalyzedTime(now); + + UpdateTableRequest request = new UpdateTableRequest(); + request.withDatabaseName(databaseName) + .withTableInput(updatedTableInput); + awsGlue.updateTable(request); + } catch (Exception e) { + throw new HoodieGlueSyncException("Fail to update params for table " + tableId(databaseName, tableName) + ": " + newParams, e); + } + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AwsGlueCatalogSyncTool.java b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AwsGlueCatalogSyncTool.java new file mode 100644 index 0000000000000..b8f0d565df7f7 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AwsGlueCatalogSyncTool.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws.sync; + +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.hive.HiveSyncTool; + +import com.beust.jcommander.JCommander; +import org.apache.hadoop.conf.Configuration; + +import java.util.Properties; + +/** + * Currently Experimental. Utility class that implements syncing a Hudi Table with the + * AWS Glue Data Catalog (https://docs.aws.amazon.com/glue/latest/dg/populate-data-catalog.html) + * to enable querying via Glue ETLs, Athena etc. + *

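+ * <p>A minimal invocation sketch. {@code META_SYNC_DATABASE_NAME} comes from {@code HoodieSyncConfig};
+ * the table-name and base-path constants below are assumptions and should be checked against the actual
+ * config names:
+ * <pre>
+ *   Properties props = new Properties();
+ *   props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default");
+ *   props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "my_hudi_table");            // assumed constant
+ *   props.setProperty(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), "s3a://bucket/path/to/table"); // assumed constant
+ *   new AwsGlueCatalogSyncTool(props, new Configuration()).syncHoodieTable();
+ * </pre>
+ *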
+ * Extends HiveSyncTool since most logic is similar to Hive syncing, + * expect using a different client {@link AWSGlueCatalogSyncClient} that implements + * the necessary functionality using Glue APIs. + * + * @Experimental + */ +public class AwsGlueCatalogSyncTool extends HiveSyncTool { + + public AwsGlueCatalogSyncTool(Properties props, Configuration hadoopConf) { + super(props, hadoopConf); + } + + @Override + protected void initSyncClient(HiveSyncConfig hiveSyncConfig) { + syncClient = new AWSGlueCatalogSyncClient(hiveSyncConfig); + } + + public static void main(String[] args) { + final HiveSyncConfig.HiveSyncConfigParams params = new HiveSyncConfig.HiveSyncConfigParams(); + JCommander cmd = JCommander.newBuilder().addObject(params).build(); + cmd.parse(args); + if (params.isHelp()) { + cmd.usage(); + System.exit(0); + } + new AwsGlueCatalogSyncTool(params.toProps(), new Configuration()).syncHoodieTable(); + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/HoodieGlueSyncException.java b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/HoodieGlueSyncException.java new file mode 100644 index 0000000000000..5b788ebf317ee --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/HoodieGlueSyncException.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.aws.sync; + +import org.apache.hudi.hive.HoodieHiveSyncException; + +public class HoodieGlueSyncException extends HoodieHiveSyncException { + + public HoodieGlueSyncException(String message) { + super(message); + } + + public HoodieGlueSyncException(String message, Throwable t) { + super(message, t); + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/transaction/lock/DynamoDBBasedLockProvider.java b/hudi-aws/src/main/java/org/apache/hudi/aws/transaction/lock/DynamoDBBasedLockProvider.java new file mode 100644 index 0000000000000..1d72f71844a49 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/transaction/lock/DynamoDBBasedLockProvider.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws.transaction.lock; + +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.regions.RegionUtils; +import com.amazonaws.services.dynamodbv2.AcquireLockOptions; +import com.amazonaws.services.dynamodbv2.AmazonDynamoDB; +import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder; +import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClient; +import com.amazonaws.services.dynamodbv2.AmazonDynamoDBLockClientOptions; +import com.amazonaws.services.dynamodbv2.LockItem; +import com.amazonaws.services.dynamodbv2.model.AttributeDefinition; +import com.amazonaws.services.dynamodbv2.model.BillingMode; +import com.amazonaws.services.dynamodbv2.model.CreateTableRequest; +import com.amazonaws.services.dynamodbv2.model.KeySchemaElement; +import com.amazonaws.services.dynamodbv2.model.KeyType; +import com.amazonaws.services.dynamodbv2.model.LockNotGrantedException; +import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughput; +import com.amazonaws.services.dynamodbv2.model.ScalarAttributeType; +import com.amazonaws.services.dynamodbv2.util.TableUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.aws.credentials.HoodieAWSCredentialsProviderFactory; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.lock.LockProvider; +import org.apache.hudi.common.lock.LockState; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.DynamoDbBasedLockConfig; +import org.apache.hudi.exception.HoodieLockException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.TimeUnit; +import javax.annotation.concurrent.NotThreadSafe; + +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY; + +/** + * A DynamoDB based lock. This {@link LockProvider} implementation allows to lock table operations + * using DynamoDB. Users need to have access to AWS DynamoDB to be able to use this lock. 
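+ *
+ * <p>An illustrative setup, mirroring how the integration test wires the provider together
+ * (table name, partition key and region values are placeholders):
+ * <pre>
+ *   Properties props = new Properties();
+ *   props.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME.key(), "hudi_locks");
+ *   props.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY.key(), "my_hudi_table");
+ *   props.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_REGION.key(), "us-east-1");
+ *   props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "60000");
+ *   DynamoDBBasedLockProvider provider = new DynamoDBBasedLockProvider(new LockConfiguration(props), new Configuration());
+ *   if (provider.tryLock(60000, TimeUnit.MILLISECONDS)) {
+ *     try {
+ *       // perform the table operation under the lock
+ *     } finally {
+ *       provider.unlock();
+ *     }
+ *   }
+ * </pre>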
+ */ +@NotThreadSafe +public class DynamoDBBasedLockProvider implements LockProvider { + + private static final Logger LOG = LogManager.getLogger(DynamoDBBasedLockProvider.class); + + private static final String DYNAMODB_ATTRIBUTE_NAME = "key"; + + private final AmazonDynamoDBLockClient client; + private final String tableName; + private final String dynamoDBPartitionKey; + protected LockConfiguration lockConfiguration; + private volatile LockItem lock; + + public DynamoDBBasedLockProvider(final LockConfiguration lockConfiguration, final Configuration conf) { + this(lockConfiguration, conf, null); + } + + public DynamoDBBasedLockProvider(final LockConfiguration lockConfiguration, final Configuration conf, AmazonDynamoDB dynamoDB) { + checkRequiredProps(lockConfiguration); + this.lockConfiguration = lockConfiguration; + this.tableName = lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME.key()); + this.dynamoDBPartitionKey = lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY.key()); + long leaseDuration = Long.parseLong(lockConfiguration.getConfig().getString(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY)); + if (dynamoDB == null) { + dynamoDB = getDynamoDBClient(); + } + // build the dynamoDb lock client + this.client = new AmazonDynamoDBLockClient( + AmazonDynamoDBLockClientOptions.builder(dynamoDB, tableName) + .withTimeUnit(TimeUnit.MILLISECONDS) + .withLeaseDuration(leaseDuration) + .withHeartbeatPeriod(leaseDuration / 3) + .withCreateHeartbeatBackgroundThread(true) + .build()); + + if (!this.client.lockTableExists()) { + createLockTableInDynamoDB(dynamoDB, tableName); + } + } + + @Override + public boolean tryLock(long time, TimeUnit unit) { + LOG.info(generateLogStatement(LockState.ACQUIRING, generateLogSuffixString())); + try { + lock = client.acquireLock(AcquireLockOptions.builder(dynamoDBPartitionKey) + .withAdditionalTimeToWaitForLock(time) + .withTimeUnit(TimeUnit.MILLISECONDS) + .build()); + LOG.info(generateLogStatement(LockState.ACQUIRED, generateLogSuffixString())); + } catch (InterruptedException e) { + throw new HoodieLockException(generateLogStatement(LockState.FAILED_TO_ACQUIRE, generateLogSuffixString()), e); + } catch (LockNotGrantedException e) { + return false; + } + return lock != null && !lock.isExpired(); + } + + @Override + public void unlock() { + try { + LOG.info(generateLogStatement(LockState.RELEASING, generateLogSuffixString())); + if (lock == null) { + return; + } + if (!client.releaseLock(lock)) { + LOG.warn("The lock has already been stolen"); + } + lock = null; + LOG.info(generateLogStatement(LockState.RELEASED, generateLogSuffixString())); + } catch (Exception e) { + throw new HoodieLockException(generateLogStatement(LockState.FAILED_TO_RELEASE, generateLogSuffixString()), e); + } + } + + @Override + public void close() { + try { + if (lock != null) { + if (!client.releaseLock(lock)) { + LOG.warn("The lock has already been stolen"); + } + lock = null; + } + this.client.close(); + } catch (Exception e) { + LOG.error(generateLogStatement(LockState.FAILED_TO_RELEASE, generateLogSuffixString())); + } + } + + @Override + public LockItem getLock() { + return lock; + } + + private AmazonDynamoDB getDynamoDBClient() { + String region = this.lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_REGION.key()); + String endpointURL = this.lockConfiguration.getConfig().containsKey(DynamoDbBasedLockConfig.DYNAMODB_ENDPOINT_URL.key()) + ? 
this.lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_ENDPOINT_URL.key()) + : RegionUtils.getRegion(region).getServiceEndpoint(AmazonDynamoDB.ENDPOINT_PREFIX); + AwsClientBuilder.EndpointConfiguration dynamodbEndpoint = + new AwsClientBuilder.EndpointConfiguration(endpointURL, region); + return AmazonDynamoDBClientBuilder.standard() + .withEndpointConfiguration(dynamodbEndpoint) + .withCredentials(HoodieAWSCredentialsProviderFactory.getAwsCredentialsProvider(lockConfiguration.getConfig())) + .build(); + } + + private void createLockTableInDynamoDB(AmazonDynamoDB dynamoDB, String tableName) { + String billingMode = lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_BILLING_MODE.key()); + KeySchemaElement partitionKeyElement = new KeySchemaElement(); + partitionKeyElement.setAttributeName(DYNAMODB_ATTRIBUTE_NAME); + partitionKeyElement.setKeyType(KeyType.HASH); + + List keySchema = new ArrayList<>(); + keySchema.add(partitionKeyElement); + + Collection attributeDefinitions = new ArrayList<>(); + attributeDefinitions.add(new AttributeDefinition().withAttributeName(DYNAMODB_ATTRIBUTE_NAME).withAttributeType(ScalarAttributeType.S)); + + CreateTableRequest createTableRequest = new CreateTableRequest(tableName, keySchema); + createTableRequest.setAttributeDefinitions(attributeDefinitions); + createTableRequest.setBillingMode(billingMode); + if (billingMode.equals(BillingMode.PROVISIONED.name())) { + createTableRequest.setProvisionedThroughput(new ProvisionedThroughput() + .withReadCapacityUnits(Long.parseLong(lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_READ_CAPACITY.key()))) + .withWriteCapacityUnits(Long.parseLong(lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_WRITE_CAPACITY.key())))); + } + dynamoDB.createTable(createTableRequest); + + LOG.info("Creating dynamoDB table " + tableName + ", waiting for table to be active"); + try { + TableUtils.waitUntilActive(dynamoDB, tableName, Integer.parseInt(lockConfiguration.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT.key())), 20 * 1000); + } catch (TableUtils.TableNeverTransitionedToStateException e) { + throw new HoodieLockException("Created dynamoDB table never transits to active", e); + } catch (InterruptedException e) { + throw new HoodieLockException("Thread interrupted while waiting for dynamoDB table to turn active", e); + } + LOG.info("Created dynamoDB table " + tableName); + } + + private void checkRequiredProps(final LockConfiguration config) { + ValidationUtils.checkArgument(config.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME.key()) != null); + ValidationUtils.checkArgument(config.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_REGION.key()) != null); + ValidationUtils.checkArgument(config.getConfig().getString(DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY.key()) != null); + config.getConfig().putIfAbsent(DynamoDbBasedLockConfig.DYNAMODB_LOCK_BILLING_MODE.key(), BillingMode.PAY_PER_REQUEST.name()); + config.getConfig().putIfAbsent(DynamoDbBasedLockConfig.DYNAMODB_LOCK_READ_CAPACITY.key(), "20"); + config.getConfig().putIfAbsent(DynamoDbBasedLockConfig.DYNAMODB_LOCK_WRITE_CAPACITY.key(), "10"); + config.getConfig().putIfAbsent(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT.key(), "600000"); + } + + private String generateLogSuffixString() { + return StringUtils.join("DynamoDb table = ", tableName, ", partition key = ", 
dynamoDBPartitionKey); + } + + protected String generateLogStatement(LockState state, String suffix) { + return StringUtils.join(state.name(), " lock at ", suffix); + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/utils/S3Utils.java b/hudi-aws/src/main/java/org/apache/hudi/aws/utils/S3Utils.java new file mode 100644 index 0000000000000..bfb208ee15058 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/utils/S3Utils.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.aws.utils; + +public final class S3Utils { + + public static String s3aToS3(String s3aUrl) { + return s3aUrl.replaceFirst("(?i)^s3a://", "s3://"); + } +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/config/DynamoDbBasedLockConfig.java b/hudi-aws/src/main/java/org/apache/hudi/config/DynamoDbBasedLockConfig.java new file mode 100644 index 0000000000000..1894b8641c1be --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/config/DynamoDbBasedLockConfig.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.Option; + +import com.amazonaws.regions.RegionUtils; +import com.amazonaws.services.dynamodbv2.model.BillingMode; + +import static org.apache.hudi.common.config.LockConfiguration.LOCK_PREFIX; + +/** + * Hoodie Configs for Locks. + */ +@ConfigClassProperty(name = "DynamoDB based Locks Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configs that control DynamoDB based locking mechanisms required for concurrency control " + + " between writers to a Hudi table. 
Concurrency between Hudi's own table services "
+        + " is auto-managed internally.")
+public class DynamoDbBasedLockConfig extends HoodieConfig {
+
+  // configs for DynamoDb based locks
+  public static final String DYNAMODB_BASED_LOCK_PROPERTY_PREFIX = LOCK_PREFIX + "dynamodb.";
+
+  public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_NAME = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table")
+      .noDefaultValue()
+      .sinceVersion("0.10.0")
+      .withDocumentation("For DynamoDB based lock provider, the name of the DynamoDB table acting as lock table");
+
+  public static final ConfigProperty<String> DYNAMODB_LOCK_PARTITION_KEY = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "partition_key")
+      .noDefaultValue()
+      .sinceVersion("0.10.0")
+      .withInferFunction(cfg -> {
+        if (cfg.contains(HoodieTableConfig.NAME)) {
+          return Option.of(cfg.getString(HoodieTableConfig.NAME));
+        }
+        return Option.empty();
+      })
+      .withDocumentation("For DynamoDB based lock provider, the partition key for the DynamoDB lock table. "
+          + "Each Hudi dataset should have its own unique key so concurrent writers can refer to the same partition key."
+          + " By default the Hudi table name is used as the partition key.");
+
+  public static final ConfigProperty<String> DYNAMODB_LOCK_REGION = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "region")
+      .defaultValue("us-east-1")
+      .sinceVersion("0.10.0")
+      .withInferFunction(cfg -> {
+        String regionFromEnv = System.getenv("AWS_REGION");
+        if (regionFromEnv != null) {
+          return Option.of(RegionUtils.getRegion(regionFromEnv).getName());
+        }
+        return Option.empty();
+      })
+      .withDocumentation("For DynamoDB based lock provider, the region used in endpoint for Amazon DynamoDB service."
+          + " The region is read from the AWS_REGION environment variable first; if it is not set, us-east-1 is used by default.");
+
+  public static final ConfigProperty<String> DYNAMODB_LOCK_BILLING_MODE = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "billing_mode")
+      .defaultValue(BillingMode.PAY_PER_REQUEST.name())
+      .sinceVersion("0.10.0")
+      .withDocumentation("For DynamoDB based lock provider, the billing mode used for the lock table; defaults to PAY_PER_REQUEST.");
+
+  public static final ConfigProperty<String> DYNAMODB_LOCK_READ_CAPACITY = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "read_capacity")
+      .defaultValue("20")
+      .sinceVersion("0.10.0")
+      .withDocumentation("For DynamoDB based lock provider, read capacity units when using PROVISIONED billing mode");
+
+  public static final ConfigProperty<String> DYNAMODB_LOCK_WRITE_CAPACITY = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "write_capacity")
+      .defaultValue("10")
+      .sinceVersion("0.10.0")
+      .withDocumentation("For DynamoDB based lock provider, write capacity units when using PROVISIONED billing mode");
+
+  public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table_creation_timeout")
+      .defaultValue(String.valueOf(10 * 60 * 1000))
+      .sinceVersion("0.10.0")
+      .withDocumentation("For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating DynamoDB table");
+
+  public static final ConfigProperty<String> DYNAMODB_ENDPOINT_URL = ConfigProperty
+      .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "endpoint_url")
+      .noDefaultValue()
+      .sinceVersion("0.10.1")
+      .withDocumentation("For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service."
+ + " Useful for development with a local dynamodb instance."); +} diff --git a/hudi-aws/src/main/java/org/apache/hudi/config/HoodieAWSConfig.java b/hudi-aws/src/main/java/org/apache/hudi/config/HoodieAWSConfig.java new file mode 100644 index 0000000000000..623704232e419 --- /dev/null +++ b/hudi-aws/src/main/java/org/apache/hudi/config/HoodieAWSConfig.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; +import javax.annotation.concurrent.Immutable; + +import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_BILLING_MODE; +import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY; +import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_READ_CAPACITY; +import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_REGION; +import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME; +import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_WRITE_CAPACITY; + +/** + * Configurations used by the AWS credentials and AWS DynamoDB based lock. 
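+ *
+ * <p>For example, using the builder (credential values are placeholders; in practice the default AWS
+ * credential chain is usually preferable to hard-coding keys):
+ * <pre>
+ *   HoodieAWSConfig awsConfig = HoodieAWSConfig.newBuilder()
+ *       .withAccessKey("access-key")
+ *       .withSecretKey("secret-key")
+ *       .withDynamoDBTable("hudi_locks")
+ *       .withDynamoDBRegion("us-east-1")
+ *       .build();
+ * </pre>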
+ */ +@Immutable +@ConfigClassProperty(name = "Amazon Web Services Configs", + groupName = ConfigGroups.Names.AWS, + description = "Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks)," + + " Amazon CloudWatch (metrics).") +public class HoodieAWSConfig extends HoodieConfig { + public static final ConfigProperty AWS_ACCESS_KEY = ConfigProperty + .key("hoodie.aws.access.key") + .noDefaultValue() + .sinceVersion("0.10.0") + .withDocumentation("AWS access key id"); + + public static final ConfigProperty AWS_SECRET_KEY = ConfigProperty + .key("hoodie.aws.secret.key") + .noDefaultValue() + .sinceVersion("0.10.0") + .withDocumentation("AWS secret key"); + + public static final ConfigProperty AWS_SESSION_TOKEN = ConfigProperty + .key("hoodie.aws.session.token") + .noDefaultValue() + .sinceVersion("0.10.0") + .withDocumentation("AWS session token"); + + private HoodieAWSConfig() { + super(); + } + + public static HoodieAWSConfig.Builder newBuilder() { + return new HoodieAWSConfig.Builder(); + } + + public String getAWSAccessKey() { + return getString(AWS_ACCESS_KEY); + } + + public String getAWSSecretKey() { + return getString(AWS_SECRET_KEY); + } + + public String getAWSSessionToken() { + return getString(AWS_SESSION_TOKEN); + } + + public static class Builder { + + private final HoodieAWSConfig awsConfig = new HoodieAWSConfig(); + + public HoodieAWSConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.awsConfig.getProps().load(reader); + return this; + } + } + + public HoodieAWSConfig.Builder fromProperties(Properties props) { + this.awsConfig.getProps().putAll(props); + return this; + } + + public HoodieAWSConfig.Builder withAccessKey(String accessKey) { + awsConfig.setValue(AWS_ACCESS_KEY, accessKey); + return this; + } + + public HoodieAWSConfig.Builder withSecretKey(String secretKey) { + awsConfig.setValue(AWS_SECRET_KEY, secretKey); + return this; + } + + public HoodieAWSConfig.Builder withSessionToken(String sessionToken) { + awsConfig.setValue(AWS_SESSION_TOKEN, sessionToken); + return this; + } + + public Builder withDynamoDBTable(String dynamoDbTableName) { + awsConfig.setValue(DYNAMODB_LOCK_TABLE_NAME, dynamoDbTableName); + return this; + } + + public Builder withDynamoDBPartitionKey(String partitionKey) { + awsConfig.setValue(DYNAMODB_LOCK_PARTITION_KEY, partitionKey); + return this; + } + + public Builder withDynamoDBRegion(String region) { + awsConfig.setValue(DYNAMODB_LOCK_REGION, region); + return this; + } + + public Builder withDynamoDBBillingMode(String mode) { + awsConfig.setValue(DYNAMODB_LOCK_BILLING_MODE, mode); + return this; + } + + public Builder withDynamoDBReadCapacity(String capacity) { + awsConfig.setValue(DYNAMODB_LOCK_READ_CAPACITY, capacity); + return this; + } + + public Builder withDynamoDBWriteCapacity(String capacity) { + awsConfig.setValue(DYNAMODB_LOCK_WRITE_CAPACITY, capacity); + return this; + } + + public HoodieAWSConfig build() { + awsConfig.setDefaults(HoodieAWSConfig.class.getName()); + return awsConfig; + } + } +} diff --git a/hudi-aws/src/test/java/org/apache/hudi/aws/TestHoodieAWSCredentialsProviderFactory.java b/hudi-aws/src/test/java/org/apache/hudi/aws/TestHoodieAWSCredentialsProviderFactory.java new file mode 100644 index 0000000000000..051fe81e8b0ff --- /dev/null +++ b/hudi-aws/src/test/java/org/apache/hudi/aws/TestHoodieAWSCredentialsProviderFactory.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws; + +import com.amazonaws.auth.BasicSessionCredentials; +import org.apache.hudi.config.HoodieAWSConfig; +import org.apache.hudi.common.config.HoodieConfig; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestHoodieAWSCredentialsProviderFactory { + + @Test + public void testGetAWSCredentials() { + HoodieConfig cfg = new HoodieConfig(); + cfg.setValue(HoodieAWSConfig.AWS_ACCESS_KEY, "random-access-key"); + cfg.setValue(HoodieAWSConfig.AWS_SECRET_KEY, "random-secret-key"); + cfg.setValue(HoodieAWSConfig.AWS_SESSION_TOKEN, "random-session-token"); + BasicSessionCredentials credentials = (BasicSessionCredentials) org.apache.hudi.aws.credentials.HoodieAWSCredentialsProviderFactory.getAwsCredentialsProvider(cfg.getProps()).getCredentials(); + assertEquals("random-access-key", credentials.getAWSAccessKeyId()); + assertEquals("random-secret-key", credentials.getAWSSecretKey()); + assertEquals("random-session-token", credentials.getSessionToken()); + } +} diff --git a/hudi-aws/src/test/java/org/apache/hudi/aws/cloudwatch/TestCloudWatchReporter.java b/hudi-aws/src/test/java/org/apache/hudi/aws/cloudwatch/TestCloudWatchReporter.java new file mode 100644 index 0000000000000..85f551e6fda82 --- /dev/null +++ b/hudi-aws/src/test/java/org/apache/hudi/aws/cloudwatch/TestCloudWatchReporter.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.aws.cloudwatch; + +import com.amazonaws.services.cloudwatch.AmazonCloudWatchAsync; +import com.amazonaws.services.cloudwatch.model.Dimension; +import com.amazonaws.services.cloudwatch.model.MetricDatum; +import com.amazonaws.services.cloudwatch.model.PutMetricDataRequest; +import com.amazonaws.services.cloudwatch.model.PutMetricDataResult; +import com.codahale.metrics.Clock; +import com.codahale.metrics.Counter; +import com.codahale.metrics.ExponentiallyDecayingReservoir; +import com.codahale.metrics.Gauge; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Meter; +import com.codahale.metrics.MetricFilter; +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.Timer; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.ArgumentCaptor; +import org.mockito.ArgumentMatchers; +import org.mockito.Captor; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.List; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.aws.cloudwatch.CloudWatchReporter.DIMENSION_COUNT_TYPE_VALUE; +import static org.apache.hudi.aws.cloudwatch.CloudWatchReporter.DIMENSION_GAUGE_TYPE_VALUE; +import static org.apache.hudi.aws.cloudwatch.CloudWatchReporter.DIMENSION_METRIC_TYPE_KEY; +import static org.apache.hudi.aws.cloudwatch.CloudWatchReporter.DIMENSION_TABLE_NAME_KEY; +import static org.junit.jupiter.api.Assertions.assertEquals; + +@ExtendWith(MockitoExtension.class) +public class TestCloudWatchReporter { + + private static final String NAMESPACE = "Hudi Test"; + private static final String PREFIX = "testPrefix"; + private static final String TABLE_NAME = "testTable"; + private static final int MAX_DATUMS_PER_REQUEST = 2; + + @Mock + MetricRegistry metricRegistry; + + @Mock + AmazonCloudWatchAsync cloudWatchAsync; + + @Mock + CompletableFuture cloudWatchFuture; + + @Captor + ArgumentCaptor putMetricDataRequestCaptor; + + CloudWatchReporter reporter; + + @BeforeEach + public void setup() { + reporter = CloudWatchReporter.forRegistry(metricRegistry) + .namespace(NAMESPACE) + .prefixedWith(PREFIX) + .maxDatumsPerRequest(MAX_DATUMS_PER_REQUEST) + .withClock(Clock.defaultClock()) + .filter(MetricFilter.ALL) + .convertRatesTo(TimeUnit.SECONDS) + .convertDurationsTo(TimeUnit.MILLISECONDS) + .build(cloudWatchAsync); + + Mockito.when(cloudWatchAsync.putMetricDataAsync(ArgumentMatchers.any())).thenReturn(cloudWatchFuture); + } + + @Test + public void testReporter() { + SortedMap gauges = new TreeMap<>(); + Gauge gauge1 = () -> 100L; + Gauge gauge2 = () -> 100.1; + gauges.put(TABLE_NAME + ".gauge1", gauge1); + gauges.put(TABLE_NAME + ".gauge2", gauge2); + + SortedMap counters = new TreeMap<>(); + Counter counter1 = new Counter(); + counter1.inc(200); + counters.put(TABLE_NAME + ".counter1", counter1); + + SortedMap histograms = new TreeMap<>(); + Histogram histogram1 = new Histogram(new ExponentiallyDecayingReservoir()); + histogram1.update(300); + histograms.put(TABLE_NAME + ".histogram1", histogram1); + + SortedMap meters = new TreeMap<>(); + Meter meter1 = new Meter(); + meter1.mark(400); + meters.put(TABLE_NAME + ".meter1", meter1); + + SortedMap timers = new TreeMap<>(); + Timer timer1 = new Timer(); + timer1.update(100, 
TimeUnit.SECONDS); + timers.put(TABLE_NAME + ".timer1", timer1); + + Mockito.when(metricRegistry.getGauges(MetricFilter.ALL)).thenReturn(gauges); + Mockito.when(metricRegistry.getCounters(MetricFilter.ALL)).thenReturn(counters); + Mockito.when(metricRegistry.getHistograms(MetricFilter.ALL)).thenReturn(histograms); + Mockito.when(metricRegistry.getMeters(MetricFilter.ALL)).thenReturn(meters); + Mockito.when(metricRegistry.getTimers(MetricFilter.ALL)).thenReturn(timers); + + reporter.report(); + + // Since there are 6 metrics in total, and max datums per request is 2 we would expect 3 calls to CloudWatch + // with 2 datums in each + Mockito.verify(cloudWatchAsync, Mockito.times(3)).putMetricDataAsync(putMetricDataRequestCaptor.capture()); + Assertions.assertEquals(NAMESPACE, putMetricDataRequestCaptor.getValue().getNamespace()); + + List putMetricDataRequests = putMetricDataRequestCaptor.getAllValues(); + putMetricDataRequests.forEach(request -> assertEquals(2, request.getMetricData().size())); + + List metricDataBatch1 = putMetricDataRequests.get(0).getMetricData(); + assertEquals(PREFIX + ".gauge1", metricDataBatch1.get(0).getMetricName()); + assertEquals(Double.valueOf(gauge1.getValue()), metricDataBatch1.get(0).getValue()); + assertDimensions(metricDataBatch1.get(0).getDimensions(), DIMENSION_GAUGE_TYPE_VALUE); + + assertEquals(PREFIX + ".gauge2", metricDataBatch1.get(1).getMetricName()); + assertEquals(gauge2.getValue(), metricDataBatch1.get(1).getValue()); + assertDimensions(metricDataBatch1.get(1).getDimensions(), DIMENSION_GAUGE_TYPE_VALUE); + + List metricDataBatch2 = putMetricDataRequests.get(1).getMetricData(); + assertEquals(PREFIX + ".counter1", metricDataBatch2.get(0).getMetricName()); + assertEquals(counter1.getCount(), metricDataBatch2.get(0).getValue().longValue()); + assertDimensions(metricDataBatch2.get(0).getDimensions(), DIMENSION_COUNT_TYPE_VALUE); + + assertEquals(PREFIX + ".histogram1", metricDataBatch2.get(1).getMetricName()); + assertEquals(histogram1.getCount(), metricDataBatch2.get(1).getValue().longValue()); + assertDimensions(metricDataBatch2.get(1).getDimensions(), DIMENSION_COUNT_TYPE_VALUE); + + List metricDataBatch3 = putMetricDataRequests.get(2).getMetricData(); + assertEquals(PREFIX + ".meter1", metricDataBatch3.get(0).getMetricName()); + assertEquals(meter1.getCount(), metricDataBatch3.get(0).getValue().longValue()); + assertDimensions(metricDataBatch3.get(0).getDimensions(), DIMENSION_COUNT_TYPE_VALUE); + + assertEquals(PREFIX + ".timer1", metricDataBatch3.get(1).getMetricName()); + assertEquals(timer1.getCount(), metricDataBatch3.get(1).getValue().longValue()); + assertDimensions(metricDataBatch3.get(1).getDimensions(), DIMENSION_COUNT_TYPE_VALUE); + + reporter.stop(); + Mockito.verify(cloudWatchAsync).shutdown(); + } + + private void assertDimensions(List actualDimensions, String metricTypeDimensionVal) { + assertEquals(2, actualDimensions.size()); + + Dimension expectedTableNameDimension = new Dimension() + .withName(DIMENSION_TABLE_NAME_KEY) + .withValue(TABLE_NAME); + Dimension expectedMetricTypeDimension = new Dimension() + .withName(DIMENSION_METRIC_TYPE_KEY) + .withValue(metricTypeDimensionVal); + + assertEquals(expectedTableNameDimension, actualDimensions.get(0)); + assertEquals(expectedMetricTypeDimension, actualDimensions.get(1)); + } +} \ No newline at end of file diff --git a/hudi-aws/src/test/java/org/apache/hudi/aws/transaction/integ/ITTestDynamoDBBasedLockProvider.java 
b/hudi-aws/src/test/java/org/apache/hudi/aws/transaction/integ/ITTestDynamoDBBasedLockProvider.java new file mode 100644 index 0000000000000..d2ab0375e050c --- /dev/null +++ b/hudi-aws/src/test/java/org/apache/hudi/aws/transaction/integ/ITTestDynamoDBBasedLockProvider.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.aws.transaction.integ; + +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.services.dynamodbv2.AmazonDynamoDB; +import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder; +import com.amazonaws.services.dynamodbv2.model.BillingMode; +import org.apache.hudi.aws.transaction.lock.DynamoDBBasedLockProvider; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.config.DynamoDbBasedLockConfig; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.UUID; +import java.util.Properties; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY; + +/** + * Test for {@link DynamoDBBasedLockProvider}. + * Set it as integration test because it requires setting up docker environment. 
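+ *
+ * <p>One way to provide that environment (image, port and invocation are illustrative, not taken from
+ * this module's build configuration):
+ * <pre>
+ *   docker run -p 8000:8000 amazon/dynamodb-local
+ *   # then run the test with the endpoint system property set, e.g.
+ *   #   -Ddynamodb-local.endpoint=http://localhost:8000
+ * </pre>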
+ */ +public class ITTestDynamoDBBasedLockProvider { + + private static LockConfiguration lockConfiguration; + private static AmazonDynamoDB dynamoDb; + + private static final String TABLE_NAME_PREFIX = "testDDBTable-"; + private static final String REGION = "us-east-2"; + + @BeforeAll + public static void setup() throws InterruptedException { + Properties properties = new Properties(); + properties.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_BILLING_MODE.key(), BillingMode.PAY_PER_REQUEST.name()); + // properties.setProperty(AWSLockConfig.DYNAMODB_LOCK_TABLE_NAME.key(), TABLE_NAME_PREFIX); + properties.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY.key(), "testKey"); + properties.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_REGION.key(), REGION); + properties.setProperty(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "1000"); + properties.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_READ_CAPACITY.key(), "0"); + properties.setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_WRITE_CAPACITY.key(), "0"); + lockConfiguration = new LockConfiguration(properties); + dynamoDb = getDynamoClientWithLocalEndpoint(); + } + + @Test + public void testAcquireLock() { + lockConfiguration.getConfig().setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME.key(), TABLE_NAME_PREFIX + UUID.randomUUID()); + DynamoDBBasedLockProvider dynamoDbBasedLockProvider = new DynamoDBBasedLockProvider(lockConfiguration, null, dynamoDb); + Assertions.assertTrue(dynamoDbBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + dynamoDbBasedLockProvider.unlock(); + } + + @Test + public void testUnlock() { + lockConfiguration.getConfig().setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME.key(), TABLE_NAME_PREFIX + UUID.randomUUID()); + DynamoDBBasedLockProvider dynamoDbBasedLockProvider = new DynamoDBBasedLockProvider(lockConfiguration, null, dynamoDb); + Assertions.assertTrue(dynamoDbBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + dynamoDbBasedLockProvider.unlock(); + Assertions.assertTrue(dynamoDbBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + } + + @Test + public void testReentrantLock() { + lockConfiguration.getConfig().setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME.key(), TABLE_NAME_PREFIX + UUID.randomUUID()); + DynamoDBBasedLockProvider dynamoDbBasedLockProvider = new DynamoDBBasedLockProvider(lockConfiguration, null, dynamoDb); + Assertions.assertTrue(dynamoDbBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + Assertions.assertFalse(dynamoDbBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + dynamoDbBasedLockProvider.unlock(); + } + + @Test + public void testUnlockWithoutLock() { + lockConfiguration.getConfig().setProperty(DynamoDbBasedLockConfig.DYNAMODB_LOCK_TABLE_NAME.key(), TABLE_NAME_PREFIX + UUID.randomUUID()); + DynamoDBBasedLockProvider dynamoDbBasedLockProvider = new DynamoDBBasedLockProvider(lockConfiguration, null, dynamoDb); + dynamoDbBasedLockProvider.unlock(); + } + + private static AmazonDynamoDB getDynamoClientWithLocalEndpoint() { + String endpoint = System.getProperty("dynamodb-local.endpoint"); + if (endpoint == null || endpoint.isEmpty()) { + 
throw new IllegalStateException("dynamodb-local.endpoint system property not set"); + } + return AmazonDynamoDBClientBuilder.standard() + .withEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(endpoint, REGION)) + .withCredentials(getCredentials()) + .build(); + } + + private static AWSCredentialsProvider getCredentials() { + return new AWSStaticCredentialsProvider(new BasicAWSCredentials("random-access-key", "random-secret-key")); + } +} diff --git a/hudi-cli/hudi-cli.sh b/hudi-cli/hudi-cli.sh index bbfba85a8010e..df309ca0b8327 100755 --- a/hudi-cli/hudi-cli.sh +++ b/hudi-cli/hudi-cli.sh @@ -27,5 +27,5 @@ fi OTHER_JARS=`ls ${DIR}/target/lib/* | grep -v 'hudi-[^/]*jar' | tr '\n' ':'` -echo "Running : java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@" -java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@ +echo "Running : java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.apache.hudi.cli.Main $@" +java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.apache.hudi.cli.Main $@ diff --git a/hudi-cli/pom.xml b/hudi-cli/pom.xml index dda07f9a111dd..ce799ff3f1bb2 100644 --- a/hudi-cli/pom.xml +++ b/hudi-cli/pom.xml @@ -19,7 +19,7 @@ hudi org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 @@ -27,8 +27,7 @@ jar - 1.2.0.RELEASE - org.springframework.shell.Bootstrap + org.apache.hudi.cli.Main ${project.parent.basedir} @@ -48,6 +47,7 @@ -nobootcp + false @@ -129,8 +129,26 @@ - + + + org.springframework.boot + spring-boot-starter-test + test + + + org.springframework.boot + spring-boot-starter-logging + + + + + org.junit.platform + junit-platform-launcher + ${junit.platform.version} + test + + org.scala-lang @@ -138,6 +156,17 @@ ${scala.version} + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + + + + + org.apache.logging.log4j + log4j-1.2-api + + org.apache.hudi @@ -200,8 +229,12 @@ - log4j - log4j + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-api @@ -224,15 +257,16 @@ org.apache.spark spark-sql_${scala.binary.version} - - org.apache.spark - spark-avro_${scala.binary.version} - org.springframework.shell - spring-shell - ${spring.shell.version} + spring-shell-starter + + + com.google.guava + guava + + @@ -250,41 +284,114 @@ org.apache.hadoop hadoop-common + + + com.google.code.gson + gson + + + + + org.apache.hadoop + hadoop-hdfs + + + + com.google.code.gson + gson + 2.6.2 + + + + + + org.apache.hadoop + hadoop-common + tests + test + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.apache.hadoop hadoop-hdfs + tests + test + + + javax.servlet + * + + + netty + io.netty + + + netty-all + io.netty + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + org.junit.jupiter junit-jupiter-api test - org.junit.jupiter junit-jupiter-engine test - org.junit.vintage junit-vintage-engine test - org.junit.jupiter junit-jupiter-params test - org.mockito mockito-junit-jupiter test + + org.junit.platform + junit-platform-runner + test + + + org.junit.platform + 
junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java index a4059e16be220..7b54760cddcea 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java @@ -44,7 +44,7 @@ public class HoodieCLI { protected static HoodieTableMetaClient tableMetadata; public static HoodieTableMetaClient syncTableMetadata; public static TimelineLayoutVersion layoutVersion; - private static TempViewProvider tempViewProvider; + public static TempViewProvider tempViewProvider; /** * Enum for CLI state. @@ -85,8 +85,8 @@ public static void initFS(boolean force) throws IOException { } public static void refreshTableMetadata() { - setTableMetaClient(new HoodieTableMetaClient(HoodieCLI.conf, basePath, false, HoodieCLI.consistencyGuardConfig, - Option.of(layoutVersion))); + setTableMetaClient(HoodieTableMetaClient.builder().setConf(HoodieCLI.conf).setBasePath(basePath).setLoadActiveTimelineOnLoad(false).setConsistencyGuardConfig(HoodieCLI.consistencyGuardConfig) + .setLayoutVersion(Option.of(layoutVersion)).build()); } public static void connectTo(String basePath, Integer layoutVersion) { @@ -114,17 +114,4 @@ public static synchronized TempViewProvider getTempViewProvider() { return tempViewProvider; } - - /** - * Close tempViewProvider. - *
- * For test, avoid multiple SparkContexts. - */ - public static synchronized void closeTempViewProvider() { - if (tempViewProvider != null) { - tempViewProvider.close(); - tempViewProvider = null; - } - } - } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieHistoryFileNameProvider.java b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieHistoryFileNameProvider.java deleted file mode 100644 index 95f983416a50d..0000000000000 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieHistoryFileNameProvider.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.cli; - -import org.springframework.core.Ordered; -import org.springframework.core.annotation.Order; -import org.springframework.shell.plugin.support.DefaultHistoryFileNameProvider; -import org.springframework.stereotype.Component; - -/** - * CLI history file provider. - */ -@Component -@Order(Ordered.HIGHEST_PRECEDENCE) -public class HoodieHistoryFileNameProvider extends DefaultHistoryFileNameProvider { - - @Override - public String getHistoryFileName() { - return "hoodie-cmd.log"; - } - - @Override - public String getProviderName() { - return "Hoodie file name provider"; - } - -} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java index be640376eeef2..0ffec2cac08e5 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodiePrintHelper.java @@ -58,30 +58,76 @@ public static String print(String[] header, String[][] rows) { * @param rows List of rows * @return Serialized form for printing */ - public static String print(TableHeader rowHeader, Map> fieldNameToConverterMap, + public static String print( + TableHeader rowHeader, Map> fieldNameToConverterMap, String sortByField, boolean isDescending, Integer limit, boolean headerOnly, List rows) { - return print(rowHeader, fieldNameToConverterMap, sortByField, isDescending, limit, headerOnly, rows, ""); + return print(rowHeader, fieldNameToConverterMap, false, sortByField, isDescending, limit, headerOnly, rows); + } + + /** + * Serialize Table to printable string. 
+ * + * @param rowHeader Row Header + * @param fieldNameToConverterMap Field Specific Converters + * @param withRowNo Whether to add row number + * @param sortByField Sorting field + * @param isDescending Order + * @param limit Limit + * @param headerOnly Headers only + * @param rows List of rows + * @return Serialized form for printing + */ + public static String print( + TableHeader rowHeader, Map> fieldNameToConverterMap, boolean withRowNo, + String sortByField, boolean isDescending, Integer limit, boolean headerOnly, List rows) { + return print(rowHeader, fieldNameToConverterMap, withRowNo, sortByField, isDescending, limit, headerOnly, rows, ""); } /** * Serialize Table to printable string and also export a temporary view to easily write sql queries. + *
+ * Ideally, exporting view needs to be outside PrintHelper, but all commands use this. So this is easy + * way to add support for all commands * + * @param rowHeader Row Header + * @param fieldNameToConverterMap Field Specific Converters + * @param sortByField Sorting field + * @param isDescending Order + * @param limit Limit + * @param headerOnly Headers only + * @param rows List of rows + * @param tempTableName table name to export + * @return Serialized form for printing + */ + public static String print( + TableHeader rowHeader, Map> fieldNameToConverterMap, + String sortByField, boolean isDescending, Integer limit, boolean headerOnly, + List rows, String tempTableName) { + return print(rowHeader, fieldNameToConverterMap, false, sortByField, isDescending, limit, + headerOnly, rows, tempTableName); + } + + /** + * Serialize Table to printable string and also export a temporary view to easily write sql queries. + *
* Ideally, exporting view needs to be outside PrintHelper, but all commands use this. So this is easy * way to add support for all commands * - * @param rowHeader Row Header + * @param rowHeader Row Header * @param fieldNameToConverterMap Field Specific Converters - * @param sortByField Sorting field - * @param isDescending Order - * @param limit Limit - * @param headerOnly Headers only - * @param rows List of rows - * @param tempTableName table name to export + * @param withRowNo Whether to add row number + * @param sortByField Sorting field + * @param isDescending Order + * @param limit Limit + * @param headerOnly Headers only + * @param rows List of rows + * @param tempTableName table name to export * @return Serialized form for printing */ - public static String print(TableHeader rowHeader, Map> fieldNameToConverterMap, - String sortByField, boolean isDescending, Integer limit, boolean headerOnly, List rows, - String tempTableName) { + public static String print( + TableHeader rowHeader, Map> fieldNameToConverterMap, + boolean withRowNo, String sortByField, boolean isDescending, Integer limit, boolean headerOnly, + List rows, String tempTableName) { if (headerOnly) { return HoodiePrintHelper.print(rowHeader); @@ -97,7 +143,8 @@ public static String print(TableHeader rowHeader, Map"; + return new AttributedString("hudi->"); case TABLE: - return "hudi:" + tableName + "->"; + return new AttributedString("hudi:" + tableName + "->"); case SYNC: - return "hudi:" + tableName + " <==> " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->"; + return new AttributedString("hudi:" + tableName + " <==> " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName() + "->"); default: - return "hudi:" + tableName + "->"; + return new AttributedString("hudi:" + tableName + "->"); } } - return "hudi->"; + return new AttributedString("hudi->"); } - - @Override - public String getProviderName() { - return "Hoodie provider"; - } - } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieSplashScreen.java b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieSplashScreen.java deleted file mode 100644 index f2a458c196c94..0000000000000 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieSplashScreen.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.cli; - -import org.springframework.core.Ordered; -import org.springframework.core.annotation.Order; -import org.springframework.shell.plugin.support.DefaultBannerProvider; -import org.springframework.shell.support.util.OsUtils; -import org.springframework.stereotype.Component; - -/** - * This class is responsible to print the splash screen at the start of the application. 
- */ -@Component -@Order(Ordered.HIGHEST_PRECEDENCE) -public class HoodieSplashScreen extends DefaultBannerProvider { - - static { - System.out.println("HoodieSplashScreen loaded"); - } - - private static String screen = "===================================================================" + OsUtils.LINE_SEPARATOR - + "* ___ ___ *" + OsUtils.LINE_SEPARATOR - + "* /\\__\\ ___ /\\ \\ ___ *" + OsUtils.LINE_SEPARATOR - + "* / / / /\\__\\ / \\ \\ /\\ \\ *" + OsUtils.LINE_SEPARATOR - + "* / /__/ / / / / /\\ \\ \\ \\ \\ \\ *" + OsUtils.LINE_SEPARATOR - + "* / \\ \\ ___ / / / / / \\ \\__\\ / \\__\\ *" + OsUtils.LINE_SEPARATOR - + "* / /\\ \\ /\\__\\ / /__/ ___ / /__/ \\ |__| / /\\/__/ *" + OsUtils.LINE_SEPARATOR - + "* \\/ \\ \\/ / / \\ \\ \\ /\\__\\ \\ \\ \\ / / / /\\/ / / *" + OsUtils.LINE_SEPARATOR - + "* \\ / / \\ \\ / / / \\ \\ / / / \\ /__/ *" + OsUtils.LINE_SEPARATOR - + "* / / / \\ \\/ / / \\ \\/ / / \\ \\__\\ *" + OsUtils.LINE_SEPARATOR - + "* / / / \\ / / \\ / / \\/__/ *" + OsUtils.LINE_SEPARATOR - + "* \\/__/ \\/__/ \\/__/ Apache Hudi CLI *" + OsUtils.LINE_SEPARATOR - + "* *" + OsUtils.LINE_SEPARATOR - + "===================================================================" + OsUtils.LINE_SEPARATOR; - - @Override - public String getBanner() { - return screen; - } - - @Override - public String getVersion() { - return "1.0"; - } - - @Override - public String getWelcomeMessage() { - return "Welcome to Apache Hudi CLI. Please type help if you are looking for help. "; - } - - @Override - public String getProviderName() { - return "Hoodie Banner"; - } -} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieTableHeaderFields.java b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieTableHeaderFields.java index f32b7bc36c9bd..e6016e4cc1cb7 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieTableHeaderFields.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieTableHeaderFields.java @@ -22,6 +22,7 @@ * Fields of print table header. */ public class HoodieTableHeaderFields { + public static final String HEADER_ROW_NO = "No."; public static final String HEADER_PARTITION = "Partition"; public static final String HEADER_INSTANT = "Instant"; public static final String HEADER_PARTITION_PATH = HEADER_PARTITION + " Path"; @@ -83,6 +84,8 @@ public class HoodieTableHeaderFields { public static final String HEADER_HOODIE_PROPERTY = "Property"; public static final String HEADER_OLD_VALUE = "Old Value"; public static final String HEADER_NEW_VALUE = "New Value"; + public static final String HEADER_TEXT_METAFILE_PRESENT = "Text Metafile present ?"; + public static final String HEADER_BASE_METAFILE_PRESENT = "Base Metafile present ?"; /** * Fields of Savepoints. @@ -126,6 +129,9 @@ public class HoodieTableHeaderFields { public static final String HEADER_TOTAL_RECORDS_INSERTED = "Total Records Inserted"; public static final String HEADER_TOTAL_RECORDS_UPDATED = "Total Records Updated"; public static final String HEADER_TOTAL_ERRORS = "Total Errors"; + public static final String HEADER_TOTAL_RECORDS_WRITTEN_COMMIT = "Total Records Written for entire commit"; + public static final String HEADER_TOTAL_BYTES_WRITTEN_COMMIT = "Total Bytes Written for entire commit"; + public static final String HEADER_AVG_REC_SIZE_COMMIT = "Avg record size for entire commit"; /** * Fields of commit metadata. 
@@ -140,4 +146,71 @@ public class HoodieTableHeaderFields { public static final String HEADER_TOTAL_ROLLBACK_BLOCKS = "Total Rollback Blocks"; public static final String HEADER_TOTAL_LOG_RECORDS = "Total Log Records"; public static final String HEADER_TOTAL_UPDATED_RECORDS_COMPACTED = "Total Updated Records Compacted"; + + /** + * Fields of Compaction. + */ + public static final String HEADER_INSTANT_BLANK_TIME = "Instant Time"; + public static final String HEADER_FILE_PATH = "File Path"; + public static final String HEADER_COMPACTION_INSTANT_TIME = "Compaction " + HEADER_INSTANT_BLANK_TIME; + public static final String HEADER_STATE = "State"; + public static final String HEADER_TOTAL_FILES_TO_BE_COMPACTED = "Total FileIds to be Compacted"; + public static final String HEADER_EXTRA_METADATA = "Extra Metadata"; + public static final String HEADER_DATA_FILE_PATH = "Data " + HEADER_FILE_PATH; + public static final String HEADER_TOTAL_DELTA_FILES = "Total " + HEADER_DELTA_FILES; + public static final String HEADER_METRICS = "getMetrics"; + public static final String HEADER_BASE_INSTANT_TIME = "Base " + HEADER_INSTANT_BLANK_TIME; + public static final String HEADER_BASE_DATA_FILE = "Base Data File"; + public static final String HEADER_VALID = "Valid"; + public static final String HEADER_ERROR = "Error"; + public static final String HEADER_SOURCE_FILE_PATH = "Source " + HEADER_FILE_PATH; + public static final String HEADER_DESTINATION_FILE_PATH = "Destination " + HEADER_FILE_PATH; + public static final String HEADER_RENAME_EXECUTED = "Rename Executed?"; + public static final String HEADER_RENAME_SUCCEEDED = "Rename Succeeded?"; + + /** + * Fields of timeline command output + */ + public static final String HEADER_REQUESTED_TIME = "Requested\nTime"; + public static final String HEADER_INFLIGHT_TIME = "Inflight\nTime"; + public static final String HEADER_COMPLETED_TIME = "Completed\nTime"; + public static final String HEADER_ROLLBACK_INFO = "Rollback Info"; + public static final String HEADER_MT_PREFIX = "MT\n"; + public static final String HEADER_MT_ACTION = HEADER_MT_PREFIX + HEADER_ACTION; + public static final String HEADER_MT_STATE = HEADER_MT_PREFIX + HEADER_STATE; + public static final String HEADER_MT_REQUESTED_TIME = HEADER_MT_PREFIX + HEADER_REQUESTED_TIME; + public static final String HEADER_MT_INFLIGHT_TIME = HEADER_MT_PREFIX + HEADER_INFLIGHT_TIME; + public static final String HEADER_MT_COMPLETED_TIME = HEADER_MT_PREFIX + HEADER_COMPLETED_TIME; + + public static TableHeader getTableHeader() { + return new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMMIT_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_ADDED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_UPDATED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_PARTITIONS_WRITTEN) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_WRITTEN) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPDATE_RECORDS_WRITTEN) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ERRORS); + } + + public static TableHeader getTableHeaderWithExtraMetadata() { + return new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) + 
.addTableHeaderField(HoodieTableHeaderFields.HEADER_PREVIOUS_COMMIT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_WRITES) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_INSERTS) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELETES) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_UPDATE_WRITES) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ERRORS) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_LOG_BLOCKS) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_CORRUPT_LOG_BLOCKS) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ROLLBACK_BLOCKS) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_LOG_RECORDS) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPDATED_RECORDS_COMPACTED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN); + } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/Main.java b/hudi-cli/src/main/java/org/apache/hudi/cli/Main.java index e924be9e50f52..e98707800196e 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/Main.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/Main.java @@ -18,18 +18,19 @@ package org.apache.hudi.cli; -import org.springframework.shell.Bootstrap; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; import java.io.IOException; /** * Main class that delegates to Spring Shell's Bootstrap class in order to simplify debugging inside an IDE. */ +@SpringBootApplication public class Main { public static void main(String[] args) throws IOException { System.out.println("Main called"); - new HoodieSplashScreen(); - Bootstrap.main(args); + SpringApplication.run(Main.class, args); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java b/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java index 8158eef8d5f84..70e8a9740301c 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/Table.java @@ -37,6 +37,8 @@ public class Table implements Iterable> { // Header for this table private final TableHeader rowHeader; + // Whether to print row number + private final boolean addRowNo; // User-specified conversions before rendering private final Map> fieldNameToConverterMap; // Option attribute to track sorting field @@ -49,12 +51,17 @@ public class Table implements Iterable> { private final List> rawRows; // Flag to determine if all the rows have been added private boolean finishedAdding = false; - // Rows ready for Rendering + // Headers ready for rendering + private TableHeader renderHeaders; + // Rows ready for rendering private List> renderRows; - public Table(TableHeader rowHeader, Map> fieldNameToConverterMap, - Option orderingFieldNameOptional, Option isDescendingOptional, Option limitOptional) { + public Table( + TableHeader rowHeader, Map> fieldNameToConverterMap, + boolean addRowNo, Option orderingFieldNameOptional, + Option isDescendingOptional, Option limitOptional) { this.rowHeader = rowHeader; + this.addRowNo = addRowNo; this.fieldNameToConverterMap = fieldNameToConverterMap; this.orderingFieldNameOptional = orderingFieldNameOptional; this.isDescendingOptional = isDescendingOptional; @@ -64,7 +71,7 @@ public Table(TableHeader rowHeader, Map> fieldN /** * Main API to add row to the table. 
- * + * * @param row Row */ public Table add(List row) { @@ -134,15 +141,34 @@ private List> orderRows() { private void sortAndLimit() { this.renderRows = new ArrayList<>(); final int limit = this.limitOptional.orElse(rawRows.size()); - final List> orderedRows = orderRows(); - renderRows = orderedRows.stream().limit(limit).map(row -> IntStream.range(0, rowHeader.getNumFields()).mapToObj(idx -> { - String fieldName = rowHeader.get(idx); - if (fieldNameToConverterMap.containsKey(fieldName)) { - return fieldNameToConverterMap.get(fieldName).apply(row.get(idx)); + // Row number is added here if enabled + final List> rawOrderedRows = orderRows(); + final List> orderedRows; + if (addRowNo) { + orderedRows = new ArrayList<>(); + int rowNo = 0; + for (List row : rawOrderedRows) { + List newRow = new ArrayList<>(); + newRow.add(rowNo++); + newRow.addAll(row); + orderedRows.add(newRow); } - Object v = row.get(idx); - return v == null ? "null" : v.toString(); - }).collect(Collectors.toList())).collect(Collectors.toList()); + } else { + orderedRows = rawOrderedRows; + } + renderHeaders = addRowNo + ? new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ROW_NO) + .addTableHeaderFields(rowHeader) + : rowHeader; + renderRows = orderedRows.stream().limit(limit) + .map(row -> IntStream.range(0, renderHeaders.getNumFields()).mapToObj(idx -> { + String fieldName = renderHeaders.get(idx); + if (fieldNameToConverterMap.containsKey(fieldName)) { + return fieldNameToConverterMap.get(fieldName).apply(row.get(idx)); + } + Object v = row.get(idx); + return v == null ? "null" : v.toString(); + }).collect(Collectors.toList())).collect(Collectors.toList()); } @Override @@ -162,6 +188,9 @@ public void forEach(Consumer> action) { } public List getFieldNames() { + if (renderHeaders != null) { + return renderHeaders.getFieldNames(); + } return rowHeader.getFieldNames(); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/TableHeader.java b/hudi-cli/src/main/java/org/apache/hudi/cli/TableHeader.java index 8ec392d1abfe3..ee17480a30da2 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/TableHeader.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/TableHeader.java @@ -39,6 +39,16 @@ public TableHeader addTableHeaderField(String fieldName) { return this; } + /** + * Add fields from another {@link TableHeader} instance. + * + * @param tableHeader {@link TableHeader} instance. + */ + public TableHeader addTableHeaderFields(TableHeader tableHeader) { + fieldNames.addAll(tableHeader.getFieldNames()); + return this; + } + /** * Get all field names. 
*/ diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java index 102fcc2ae7a63..dcd6a2cf3c8e9 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java @@ -18,6 +18,11 @@ package org.apache.hudi.cli.commands; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.specific.SpecificData; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; import org.apache.hudi.avro.model.HoodieCommitMetadata; import org.apache.hudi.cli.HoodieCLI; @@ -30,17 +35,11 @@ import org.apache.hudi.common.table.log.HoodieLogFormat.Reader; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; - -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.avro.specific.SpecificData; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.io.IOException; import java.util.ArrayList; @@ -51,17 +50,17 @@ /** * CLI command to display archived commits and stats if available. 
*/ -@Component -public class ArchivedCommitsCommand implements CommandMarker { +@ShellComponent +public class ArchivedCommitsCommand { - @CliCommand(value = "show archived commit stats", help = "Read commits from archived files and show details") + @ShellMethod(key = "show archived commit stats", value = "Read commits from archived files and show details") public String showArchivedCommits( - @CliOption(key = {"archiveFolderPattern"}, help = "Archive Folder", unspecifiedDefaultValue = "") String folder, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--archiveFolderPattern"}, help = "Archive Folder", defaultValue = "") String folder, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { System.out.println("===============> Showing only " + limit + " archived commits <==============="); String basePath = HoodieCLI.getTableMetaClient().getBasePath(); @@ -80,8 +79,7 @@ public String showArchivedCommits( // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - List records = blk.getRecords(); - readRecords.addAll(records); + blk.getRecordIterator().forEachRemaining(readRecords::add); } List readCommits = readRecords.stream().map(r -> (GenericRecord) r) .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION) @@ -128,15 +126,15 @@ public String showArchivedCommits( return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats); } - @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details") + @ShellMethod(key = "show archived commits", value = "Read commits from archived files and show details") public String showCommits( - @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", - unspecifiedDefaultValue = "true") boolean skipMetadata, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--skipMetadata"}, help = "Skip displaying commit metadata", + defaultValue = "true") boolean skipMetadata, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "10") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = 
"false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { System.out.println("===============> Showing only " + limit + " archived commits <==============="); @@ -155,8 +153,9 @@ public String showCommits( // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - List records = blk.getRecords(); - readRecords.addAll(records); + try (ClosableIterator recordItr = blk.getRecordIterator()) { + recordItr.forEachRemaining(readRecords::add); + } } List readCommits = readRecords.stream().map(r -> (GenericRecord) r) .map(r -> readCommit(r, skipMetadata)).collect(Collectors.toList()); @@ -199,12 +198,12 @@ private Comparable[] readCommit(GenericRecord record, boolean skipMetadata) { case HoodieTimeline.COMPACTION_ACTION: return commitDetail(record, "hoodieCompactionMetadata", skipMetadata); default: { - return new Comparable[]{}; + return new Comparable[] {}; } } } catch (Exception e) { e.printStackTrace(); - return new Comparable[]{}; + return new Comparable[] {}; } } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java index 015743d2f299f..98cf9fc0d9067 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java @@ -30,60 +30,56 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.utilities.UtilHelpers; - import org.apache.spark.launcher.SparkLauncher; import org.apache.spark.util.Utils; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; +import scala.collection.JavaConverters; import java.io.IOException; import java.net.URISyntaxException; -import java.util.Arrays; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.stream.Collectors; -import scala.collection.JavaConverters; - /** * CLI command to perform bootstrap action & display bootstrap index. 
*/ -@Component -public class BootstrapCommand implements CommandMarker { +@ShellComponent +public class BootstrapCommand { - @CliCommand(value = "bootstrap run", help = "Run a bootstrap action for current Hudi table") + @ShellMethod(key = "bootstrap run", value = "Run a bootstrap action for current Hudi table") public String bootstrap( - @CliOption(key = {"srcPath"}, mandatory = true, help = "Bootstrap source data path of the table") final String srcPath, - @CliOption(key = {"targetPath"}, mandatory = true, - help = "Base path for the target hoodie table") final String targetPath, - @CliOption(key = {"tableName"}, mandatory = true, help = "Hoodie table name") final String tableName, - @CliOption(key = {"tableType"}, mandatory = true, help = "Hoodie table type") final String tableType, - @CliOption(key = {"rowKeyField"}, mandatory = true, help = "Record key columns for bootstrap data") final String rowKeyField, - @CliOption(key = {"partitionPathField"}, unspecifiedDefaultValue = "", + @ShellOption(value = {"--srcPath"}, help = "Bootstrap source data path of the table") final String srcPath, + @ShellOption(value = {"--targetPath"}, help = "Base path for the target hoodie table") final String targetPath, + @ShellOption(value = {"--tableName"}, help = "Hoodie table name") final String tableName, + @ShellOption(value = {"--tableType"}, help = "Hoodie table type") final String tableType, + @ShellOption(value = {"--rowKeyField"}, help = "Record key columns for bootstrap data") final String rowKeyField, + @ShellOption(value = {"--partitionPathField"}, defaultValue = "", help = "Partition fields for bootstrap source data") final String partitionPathField, - @CliOption(key = {"bootstrapIndexClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex", + @ShellOption(value = {"--bootstrapIndexClass"}, defaultValue = "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex", help = "Bootstrap Index Class") final String bootstrapIndexClass, - @CliOption(key = {"selectorClass"}, unspecifiedDefaultValue = "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector", + @ShellOption(value = {"--selectorClass"}, defaultValue = "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector", help = "Selector class for bootstrap") final String selectorClass, - @CliOption(key = {"keyGeneratorClass"}, unspecifiedDefaultValue = "org.apache.hudi.keygen.SimpleKeyGenerator", + @ShellOption(value = {"--keyGeneratorClass"}, defaultValue = "org.apache.hudi.keygen.SimpleKeyGenerator", help = "Key generator class for bootstrap") final String keyGeneratorClass, - @CliOption(key = {"fullBootstrapInputProvider"}, unspecifiedDefaultValue = "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider", + @ShellOption(value = {"--fullBootstrapInputProvider"}, defaultValue = "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider", help = "Class for Full bootstrap input provider") final String fullBootstrapInputProvider, - @CliOption(key = {"schemaProviderClass"}, unspecifiedDefaultValue = "", + @ShellOption(value = {"--schemaProviderClass"}, defaultValue = "", help = "SchemaProvider to attach schemas to bootstrap source data") final String schemaProviderClass, - @CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload", + @ShellOption(value = {"--payloadClass"}, defaultValue = "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload", help = "Payload Class") final String payloadClass, 
- @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "1500", help = "Bootstrap writer parallelism") final int parallelism, - @CliOption(key = {"sparkMaster"}, unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = {"sparkMemory"}, unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory, - @CliOption(key = {"enableHiveSync"}, unspecifiedDefaultValue = "false", help = "Enable Hive sync") final Boolean enableHiveSync, - @CliOption(key = {"propsFilePath"}, help = "path to properties file on localfs or dfs with configurations for hoodie client for importing", - unspecifiedDefaultValue = "") final String propsFilePath, - @CliOption(key = {"hoodieConfigs"}, help = "Any configuration that can be set in the properties file can be passed here in the form of an array", - unspecifiedDefaultValue = "") final String[] configs) + @ShellOption(value = {"--parallelism"}, defaultValue = "1500", help = "Bootstrap writer parallelism") final int parallelism, + @ShellOption(value = {"--sparkMaster"}, defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = {"--sparkMemory"}, defaultValue = "4G", help = "Spark executor memory") final String sparkMemory, + @ShellOption(value = {"--enableHiveSync"}, defaultValue = "false", help = "Enable Hive sync") final Boolean enableHiveSync, + @ShellOption(value = {"--propsFilePath"}, help = "path to properties file on localfs or dfs with configurations for hoodie client for importing", + defaultValue = "") final String propsFilePath, + @ShellOption(value = {"--hoodieConfigs"}, help = "Any configuration that can be set in the properties file can be passed here in the form of an array", + defaultValue = "") final String[] configs) throws IOException, InterruptedException, URISyntaxException { String sparkPropertiesPath = @@ -106,15 +102,14 @@ public String bootstrap( return "Bootstrapped source data as Hudi dataset"; } - @CliCommand(value = "bootstrap index showmapping", help = "Show bootstrap index mapping") + @ShellMethod(key = "bootstrap index showmapping", value = "Show bootstrap index mapping") public String showBootstrapIndexMapping( - @CliOption(key = {"partitionPath"}, unspecifiedDefaultValue = "", help = "A valid partition path") String partitionPath, - @CliOption(key = {"fileIds"}, unspecifiedDefaultValue = "", help = "Valid fileIds split by comma") String fileIds, - @CliOption(key = {"limit"}, unspecifiedDefaultValue = "-1", help = "Limit rows to be displayed") Integer limit, - @CliOption(key = {"sortBy"}, unspecifiedDefaultValue = "", help = "Sorting Field") final String sortByField, - @CliOption(key = {"desc"}, unspecifiedDefaultValue = "false", help = "Ordering") final boolean descending, - @CliOption(key = {"headeronly"}, unspecifiedDefaultValue = "false", help = "Print Header Only") - final boolean headerOnly) { + @ShellOption(value = {"--partitionPath"}, defaultValue = "", help = "A valid partition path") String partitionPath, + @ShellOption(value = {"--fileIds"}, defaultValue = "", help = "Valid fileIds split by comma") String fileIds, + @ShellOption(value = {"--limit"}, defaultValue = "-1", help = "Limit rows to be displayed") Integer limit, + @ShellOption(value = {"--sortBy"}, defaultValue = "", help = "Sorting Field") final String sortByField, + @ShellOption(value = {"--desc"}, defaultValue = "false", help = "Ordering") final boolean descending, + @ShellOption(value = {"--headeronly"}, defaultValue = "false", help = "Print Header Only") final boolean headerOnly) 
{ if (partitionPath.isEmpty() && !fileIds.isEmpty()) { throw new IllegalStateException("PartitionPath is mandatory when passing fileIds."); @@ -152,7 +147,7 @@ public String showBootstrapIndexMapping( limit, headerOnly, rows); } - @CliCommand(value = "bootstrap index showpartitions", help = "Show bootstrap indexed partitions") + @ShellMethod(key = "bootstrap index showpartitions", value = "Show bootstrap indexed partitions") public String showBootstrapIndexPartitions() { BootstrapIndex.IndexReader indexReader = createBootstrapIndexReader(); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java index 4924eaacbf46b..de0e4aa109894 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java @@ -32,13 +32,12 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.utilities.UtilHelpers; - import org.apache.spark.launcher.SparkLauncher; import org.apache.spark.util.Utils; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; +import scala.collection.JavaConverters; import java.io.IOException; import java.net.URISyntaxException; @@ -48,21 +47,19 @@ import java.util.Map; import java.util.stream.Collectors; -import scala.collection.JavaConverters; - /** * CLI command to show cleans options. 
*/ -@Component -public class CleansCommand implements CommandMarker { +@ShellComponent +public class CleansCommand { - @CliCommand(value = "cleans show", help = "Show the cleans") + @ShellMethod(key = "cleans show", value = "Show the cleans") public String showCleans( - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); @@ -71,9 +68,9 @@ public String showCleans( List rows = new ArrayList<>(); for (HoodieInstant clean : cleans) { HoodieCleanMetadata cleanMetadata = - TimelineMetadataUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get()); - rows.add(new Comparable[]{clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(), - cleanMetadata.getTotalFilesDeleted(), cleanMetadata.getTimeTakenInMillis()}); + TimelineMetadataUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(clean).get()); + rows.add(new Comparable[] {clean.getTimestamp(), cleanMetadata.getEarliestCommitToRetain(), + cleanMetadata.getTotalFilesDeleted(), cleanMetadata.getTimeTakenInMillis()}); } TableHeader header = @@ -84,13 +81,14 @@ public String showCleans( return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } - @CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean") - public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to show") final String instantTime, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellMethod(key = "clean showpartitions", value = "Show partition level details of a clean") + public String showCleanPartitions( + @ShellOption(value = {"--clean"}, help = "clean to show") final String instantTime, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws Exception { HoodieActiveTimeline activeTimeline = 
HoodieCLI.getTableMetaClient().getActiveTimeline(); @@ -121,14 +119,15 @@ public String showCleanPartitions(@CliOption(key = {"clean"}, help = "clean to s } - @CliCommand(value = "cleans run", help = "run clean") - public String runClean(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", - help = "Spark executor memory") final String sparkMemory, - @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for cleaning", - unspecifiedDefaultValue = "") final String propsFilePath, - @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", - unspecifiedDefaultValue = "") final String[] configs, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master) throws IOException, InterruptedException, URISyntaxException { + @ShellMethod(key = "cleans run", value = "run clean") + public String runClean( + @ShellOption(value = "--sparkMemory", defaultValue = "4G", + help = "Spark executor memory") final String sparkMemory, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for cleaning", + defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", + defaultValue = "") final String[] configs, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master ") String master) throws IOException, InterruptedException, URISyntaxException { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java new file mode 100644 index 0000000000000..963411bf98a1e --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.commands.SparkMain.SparkCommand; +import org.apache.hudi.cli.utils.InputStreamConsumer; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.utilities.UtilHelpers; +import org.apache.spark.launcher.SparkLauncher; +import org.apache.spark.util.Utils; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; +import scala.collection.JavaConverters; + +@ShellComponent +public class ClusteringCommand { + + /** + * Schedule clustering table service. + *
+ * Example: + * > connect --path {path to hudi table} + * > clustering schedule --sparkMaster local --sparkMemory 2g + */ + @ShellMethod(key = "clustering schedule", value = "Schedule Clustering") + public String scheduleClustering( + @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master, + @ShellOption(value = "--sparkMemory", defaultValue = "1g", help = "Spark executor memory") final String sparkMemory, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations " + + "for hoodie client for clustering", defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can " + + "be passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception { + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + + // First get a clustering instant time and pass it to spark launcher for scheduling clustering + String clusteringInstantTime = HoodieActiveTimeline.createNewInstantTime(); + + sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE.toString(), master, sparkMemory, + client.getBasePath(), client.getTableConfig().getTableName(), clusteringInstantTime, propsFilePath); + UtilHelpers.validateAndAddProperties(configs, sparkLauncher); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to schedule clustering for " + clusteringInstantTime; + } + return "Succeeded to schedule clustering for " + clusteringInstantTime; + } + + /** + * Run clustering table service. + *
+ * Example: + * > connect --path {path to hudi table} + * > clustering schedule --sparkMaster local --sparkMemory 2g + * > clustering run --sparkMaster local --sparkMemory 2g --clusteringInstant 20211124005208 + */ + @ShellMethod(key = "clustering run", value = "Run Clustering") + public String runClustering( + @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master, + @ShellOption(value = "--sparkMemory", help = "Spark executor memory", defaultValue = "4g") final String sparkMemory, + @ShellOption(value = "--parallelism", help = "Parallelism for hoodie clustering", defaultValue = "1") final String parallelism, + @ShellOption(value = "--retry", help = "Number of retries", defaultValue = "1") final String retry, + @ShellOption(value = "--clusteringInstant", help = "Clustering instant time", + defaultValue = ShellOption.NULL) final String clusteringInstantTime, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for " + + "hoodie client for compacting", defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be " + + "passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception { + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_RUN.toString(), master, sparkMemory, + client.getBasePath(), client.getTableConfig().getTableName(), clusteringInstantTime, + parallelism, retry, propsFilePath); + UtilHelpers.validateAndAddProperties(configs, sparkLauncher); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to run clustering for " + clusteringInstantTime; + } + return "Succeeded to run clustering for " + clusteringInstantTime; + } + + /** + * Run clustering table service. + *
+ * Example: + * > connect --path {path to hudi table} + * > clustering scheduleAndExecute --sparkMaster local --sparkMemory 2g + */ + @ShellMethod(key = "clustering scheduleAndExecute", value = "Run Clustering. Make a cluster plan first and execute that plan immediately") + public String runClustering( + @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master, + @ShellOption(value = "--sparkMemory", help = "Spark executor memory", defaultValue = "4g") final String sparkMemory, + @ShellOption(value = "--parallelism", help = "Parallelism for hoodie clustering", defaultValue = "1") final String parallelism, + @ShellOption(value = "--retry", help = "Number of retries", defaultValue = "1") final String retry, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for " + + "hoodie client for compacting", defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be " + + "passed here in the form of an array", defaultValue = "") final String[] configs) throws Exception { + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE_AND_EXECUTE.toString(), master, sparkMemory, + client.getBasePath(), client.getTableConfig().getTableName(), parallelism, retry, propsFilePath); + UtilHelpers.validateAndAddProperties(configs, sparkLauncher); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to run clustering for scheduleAndExecute."; + } + return "Succeeded to run clustering for scheduleAndExecute"; + } +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java index 852a413b014ac..e269f8da0cba8 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java @@ -22,200 +22,176 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; -import org.apache.hudi.cli.utils.CommitUtil; -import org.apache.hudi.cli.utils.InputStreamConsumer; -import org.apache.hudi.cli.utils.SparkUtil; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.NumericUtils; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; -import org.apache.spark.launcher.SparkLauncher; -import 
org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.Function; import java.util.stream.Collectors; +import static org.apache.hudi.cli.utils.CommitUtil.getTimeDaysAgo; +import static org.apache.hudi.common.table.timeline.TimelineUtils.getTimeline; + /** * CLI command to display commits options. */ -@Component -public class CommitsCommand implements CommandMarker { +@ShellComponent +public class CommitsCommand { private String printCommits(HoodieDefaultTimeline timeline, - final Integer limit, final String sortByField, + final Integer limit, + final String sortByField, final boolean descending, final boolean headerOnly, final String tempTableName) throws IOException { final List rows = new ArrayList<>(); final List commits = timeline.getCommitsTimeline().filterCompletedInstants() - .getInstants().collect(Collectors.toList()); - // timeline can be read from multiple files. So sort is needed instead of reversing the collection - Collections.sort(commits, HoodieInstant.COMPARATOR.reversed()); - - for (int i = 0; i < commits.size(); i++) { - final HoodieInstant commit = commits.get(i); - final HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - timeline.getInstantDetails(commit).get(), - HoodieCommitMetadata.class); - rows.add(new Comparable[]{commit.getTimestamp(), - commitMetadata.fetchTotalBytesWritten(), - commitMetadata.fetchTotalFilesInsert(), - commitMetadata.fetchTotalFilesUpdated(), - commitMetadata.fetchTotalPartitionsWritten(), - commitMetadata.fetchTotalRecordsWritten(), - commitMetadata.fetchTotalUpdateRecordsWritten(), - commitMetadata.fetchTotalWriteErrors()}); + .getInstants().sorted(HoodieInstant.COMPARATOR.reversed()).collect(Collectors.toList()); + + for (final HoodieInstant commit : commits) { + if (timeline.getInstantDetails(commit).isPresent()) { + final HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + timeline.getInstantDetails(commit).get(), + HoodieCommitMetadata.class); + rows.add(new Comparable[] {commit.getTimestamp(), + commitMetadata.fetchTotalBytesWritten(), + commitMetadata.fetchTotalFilesInsert(), + commitMetadata.fetchTotalFilesUpdated(), + commitMetadata.fetchTotalPartitionsWritten(), + commitMetadata.fetchTotalRecordsWritten(), + commitMetadata.fetchTotalUpdateRecordsWritten(), + commitMetadata.fetchTotalWriteErrors()}); + } } final Map> fieldNameToConverterMap = new HashMap<>(); - fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> { - return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString()))); - }); - - final TableHeader header = new TableHeader() - .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMMIT_TIME) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_ADDED) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_UPDATED) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_PARTITIONS_WRITTEN) - 
.addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_WRITTEN) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPDATE_RECORDS_WRITTEN) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ERRORS); + fieldNameToConverterMap.put( + HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, + entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())))); + + final TableHeader header = HoodieTableHeaderFields.getTableHeader(); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, - limit, headerOnly, rows, tempTableName); + limit, headerOnly, rows, tempTableName); } private String printCommitsWithMetadata(HoodieDefaultTimeline timeline, - final Integer limit, final String sortByField, - final boolean descending, - final boolean headerOnly, - final String tempTableName) throws IOException { + final Integer limit, final String sortByField, + final boolean descending, + final boolean headerOnly, + final String tempTableName, + final String partition) throws IOException { final List rows = new ArrayList<>(); final List commits = timeline.getCommitsTimeline().filterCompletedInstants() - .getInstants().collect(Collectors.toList()); - // timeline can be read from multiple files. So sort is needed instead of reversing the collection - Collections.sort(commits, HoodieInstant.COMPARATOR.reversed()); - - for (int i = 0; i < commits.size(); i++) { - final HoodieInstant commit = commits.get(i); - final HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - timeline.getInstantDetails(commit).get(), - HoodieCommitMetadata.class); - - for (Map.Entry> partitionWriteStat : - commitMetadata.getPartitionToWriteStats().entrySet()) { - for (HoodieWriteStat hoodieWriteStat : partitionWriteStat.getValue()) { - rows.add(new Comparable[]{ commit.getAction(), commit.getTimestamp(), hoodieWriteStat.getPartitionPath(), + .getInstants().sorted(HoodieInstant.COMPARATOR.reversed()).collect(Collectors.toList()); + + for (final HoodieInstant commit : commits) { + if (timeline.getInstantDetails(commit).isPresent()) { + final HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + timeline.getInstantDetails(commit).get(), + HoodieCommitMetadata.class); + + for (Map.Entry> partitionWriteStat : + commitMetadata.getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat hoodieWriteStat : partitionWriteStat.getValue()) { + if (StringUtils.isNullOrEmpty(partition) || partition.equals(hoodieWriteStat.getPartitionPath())) { + rows.add(new Comparable[] {commit.getAction(), commit.getTimestamp(), hoodieWriteStat.getPartitionPath(), hoodieWriteStat.getFileId(), hoodieWriteStat.getPrevCommit(), hoodieWriteStat.getNumWrites(), hoodieWriteStat.getNumInserts(), hoodieWriteStat.getNumDeletes(), hoodieWriteStat.getNumUpdateWrites(), hoodieWriteStat.getTotalWriteErrors(), hoodieWriteStat.getTotalLogBlocks(), hoodieWriteStat.getTotalCorruptLogBlock(), hoodieWriteStat.getTotalRollbackBlocks(), hoodieWriteStat.getTotalLogRecords(), hoodieWriteStat.getTotalUpdatedRecordsCompacted(), hoodieWriteStat.getTotalWriteBytes() - }); + }); + } + } } } } final Map> fieldNameToConverterMap = new HashMap<>(); - fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> { - return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString()))); - }); - - TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) - 
.addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_PREVIOUS_COMMIT) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_WRITES) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_INSERTS) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELETES) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_UPDATE_WRITES) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ERRORS) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_LOG_BLOCKS) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_CORRUPT_LOG_BLOCKS) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ROLLBACK_BLOCKS) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_LOG_RECORDS) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPDATED_RECORDS_COMPACTED) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN); + fieldNameToConverterMap.put( + HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, + entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())))); - return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, - limit, headerOnly, rows, tempTableName); + return HoodiePrintHelper.print(HoodieTableHeaderFields.getTableHeaderWithExtraMetadata(), + fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows, tempTableName); } - @CliCommand(value = "commits show", help = "Show the commits") + @ShellMethod(key = "commits show", value = "Show the commits") public String showCommits( - @CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata", - unspecifiedDefaultValue = "false") final boolean includeExtraMetadata, - @CliOption(key = {"createView"}, mandatory = false, help = "view name to store output table", - unspecifiedDefaultValue = "") final String exportTableName, - @CliOption(key = {"limit"}, help = "Limit commits", - unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--includeExtraMetadata"}, help = "Include extra metadata", + defaultValue = "false") final boolean includeExtraMetadata, + @ShellOption(value = {"--createView"}, help = "view name to store output table", + defaultValue = "") final String exportTableName, + @ShellOption(value = {"--limit"}, help = "Limit commits", + defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--partition"}, help = "Partition value", defaultValue = ShellOption.NULL) final String partition, + @ShellOption(value = {"--includeArchivedTimeline"}, help = "Include archived commits as well", + defaultValue = "false") final boolean includeArchivedTimeline) throws IOException { - HoodieActiveTimeline activeTimeline = 
HoodieCLI.getTableMetaClient().getActiveTimeline(); + HoodieDefaultTimeline timeline = getTimeline(HoodieCLI.getTableMetaClient(), includeArchivedTimeline); if (includeExtraMetadata) { - return printCommitsWithMetadata(activeTimeline, limit, sortByField, descending, headerOnly, exportTableName); - } else { - return printCommits(activeTimeline, limit, sortByField, descending, headerOnly, exportTableName); + return printCommitsWithMetadata(timeline, limit, sortByField, descending, headerOnly, exportTableName, partition); + } else { + return printCommits(timeline, limit, sortByField, descending, headerOnly, exportTableName); } } - @CliCommand(value = "commits showarchived", help = "Show the archived commits") + @ShellMethod(key = "commits showarchived", value = "Show the archived commits") public String showArchivedCommits( - @CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata", - unspecifiedDefaultValue = "false") final boolean includeExtraMetadata, - @CliOption(key = {"createView"}, mandatory = false, help = "view name to store output table", - unspecifiedDefaultValue = "") final String exportTableName, - @CliOption(key = {"startTs"}, mandatory = false, help = "start time for commits, default: now - 10 days") + @ShellOption(value = {"--includeExtraMetadata"}, help = "Include extra metadata", + defaultValue = "false") final boolean includeExtraMetadata, + @ShellOption(value = {"--createView"}, help = "view name to store output table", + defaultValue = "") final String exportTableName, + @ShellOption(value = {"--startTs"}, defaultValue = ShellOption.NULL, help = "start time for commits, default: now - 10 days") String startTs, - @CliOption(key = {"endTs"}, mandatory = false, help = "end time for commits, default: now - 1 day") + @ShellOption(value = {"--endTs"}, defaultValue = ShellOption.NULL, help = "end time for commits, default: now - 1 day") String endTs, - @CliOption(key = {"limit"}, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "-1") - final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") - final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") - final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") - final boolean headerOnly) - throws IOException { + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--partition"}, help = "Partition value", defaultValue = ShellOption.NULL) final String partition) + throws IOException { if (StringUtils.isNullOrEmpty(startTs)) { - startTs = CommitUtil.getTimeDaysAgo(10); + startTs = getTimeDaysAgo(10); } if (StringUtils.isNullOrEmpty(endTs)) { - endTs = CommitUtil.getTimeDaysAgo(1); + endTs = getTimeDaysAgo(1); } HoodieArchivedTimeline archivedTimeline = HoodieCLI.getTableMetaClient().getArchivedTimeline(); try { archivedTimeline.loadInstantDetailsInMemory(startTs, endTs); HoodieDefaultTimeline timelineRange = archivedTimeline.findInstantsInRange(startTs, endTs); if (includeExtraMetadata) { - return printCommitsWithMetadata(timelineRange, 
limit, sortByField, descending, headerOnly, exportTableName); - } else { + return printCommitsWithMetadata(timelineRange, limit, sortByField, descending, headerOnly, exportTableName, partition); + } else { return printCommits(timelineRange, limit, sortByField, descending, headerOnly, exportTableName); } } finally { @@ -224,57 +200,34 @@ public String showArchivedCommits( } } - @CliCommand(value = "commit rollback", help = "Rollback a commit") - public String rollbackCommit(@CliOption(key = {"commit"}, help = "Commit to rollback") final String instantTime, - @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", - help = "Spark executor memory") final String sparkMemory) - throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); - HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); - HoodieTimeline filteredTimeline = completedTimeline.filter(instant -> instant.getTimestamp().equals(instantTime)); - if (filteredTimeline.empty()) { - return "Commit " + instantTime + " not found in Commits " + completedTimeline; - } - - SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), master, sparkMemory, instantTime, - HoodieCLI.getTableMetaClient().getBasePath()); - Process process = sparkLauncher.launch(); - InputStreamConsumer.captureOutput(process); - int exitCode = process.waitFor(); - // Refresh the current - HoodieCLI.refreshTableMetadata(); - if (exitCode != 0) { - return "Commit " + instantTime + " failed to roll back"; - } - return "Commit " + instantTime + " rolled back"; - } - - @CliCommand(value = "commit showpartitions", help = "Show partition level details of a commit") + @ShellMethod(key = "commit showpartitions", value = "Show partition level details of a commit") public String showCommitPartitions( - @CliOption(key = {"createView"}, mandatory = false, help = "view name to store output table", - unspecifiedDefaultValue = "") final String exportTableName, - @CliOption(key = {"commit"}, help = "Commit to show") final String instantTime, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--createView"}, help = "view name to store output table", + defaultValue = "") final String exportTableName, + @ShellOption(value = {"--commit"}, help = "Commit to show") final String instantTime, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"includeArchivedTimeline"}, help = 
"Include archived commits as well", + defaultValue = "false") final boolean includeArchivedTimeline) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime); + HoodieDefaultTimeline defaultTimeline = getTimeline(HoodieCLI.getTableMetaClient(), includeArchivedTimeline); + HoodieTimeline timeline = defaultTimeline.getCommitsTimeline().filterCompletedInstants(); + + Option hoodieInstantOption = getCommitForInstant(timeline, instantTime); + Option commitMetadataOptional = getHoodieCommitMetadata(timeline, hoodieInstantOption); - if (!timeline.containsInstant(commitInstant)) { + if (!commitMetadataOptional.isPresent()) { return "Commit " + instantTime + " not found in Commits " + timeline; } - HoodieCommitMetadata meta = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get(), - HoodieCommitMetadata.class); + + HoodieCommitMetadata meta = commitMetadataOptional.get(); List rows = new ArrayList<>(); for (Map.Entry> entry : meta.getPartitionToWriteStats().entrySet()) { + String action = hoodieInstantOption.get().getAction(); String path = entry.getKey(); List stats = entry.getValue(); long totalFilesAdded = 0; @@ -294,7 +247,7 @@ public String showCommitPartitions( totalBytesWritten += stat.getTotalWriteBytes(); totalWriteErrors += stat.getTotalWriteErrors(); } - rows.add(new Comparable[] {path, totalFilesAdded, totalFilesUpdated, totalRecordsInserted, totalRecordsUpdated, + rows.add(new Comparable[] {action, path, totalFilesAdded, totalFilesUpdated, totalRecordsInserted, totalRecordsUpdated, totalBytesWritten, totalWriteErrors}); } @@ -302,7 +255,8 @@ public String showCommitPartitions( fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> NumericUtils.humanReadableByteCount((Long.parseLong(entry.toString())))); - TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) + TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_ADDED) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_UPDATED) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_INSERTED) @@ -314,38 +268,90 @@ public String showCommitPartitions( limit, headerOnly, rows, exportTableName); } - @CliCommand(value = "commit showfiles", help = "Show file level details of a commit") + @ShellMethod(key = "commit show_write_stats", value = "Show write stats of a commit") + public String showWriteStats( + @ShellOption(value = {"--createView"}, help = "view name to store output table", + defaultValue = "") final String exportTableName, + @ShellOption(value = {"--commit"}, help = "Commit to show") final String instantTime, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = 
{"includeArchivedTimeline"}, help = "Include archived commits as well", + defaultValue = "false") final boolean includeArchivedTimeline) + throws Exception { + + HoodieDefaultTimeline defaultTimeline = getTimeline(HoodieCLI.getTableMetaClient(), includeArchivedTimeline); + HoodieTimeline timeline = defaultTimeline.getCommitsTimeline().filterCompletedInstants(); + + Option hoodieInstantOption = getCommitForInstant(timeline, instantTime); + Option commitMetadataOptional = getHoodieCommitMetadata(timeline, hoodieInstantOption); + + if (!commitMetadataOptional.isPresent()) { + return "Commit " + instantTime + " not found in Commits " + timeline; + } + + HoodieCommitMetadata meta = commitMetadataOptional.get(); + + String action = hoodieInstantOption.get().getAction(); + long recordsWritten = meta.fetchTotalRecordsWritten(); + long bytesWritten = meta.fetchTotalBytesWritten(); + long avgRecSize = (long) Math.ceil((1.0 * bytesWritten) / recordsWritten); + List rows = new ArrayList<>(); + rows.add(new Comparable[] {action, bytesWritten, recordsWritten, avgRecSize}); + + Map> fieldNameToConverterMap = new HashMap<>(); + fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> + NumericUtils.humanReadableByteCount((Long.parseLong(entry.toString())))); + + TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN_COMMIT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_WRITTEN_COMMIT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_AVG_REC_SIZE_COMMIT); + + return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, + limit, headerOnly, rows, exportTableName); + } + + @ShellMethod(key = "commit showfiles", value = "Show file level details of a commit") public String showCommitFiles( - @CliOption(key = {"createView"}, mandatory = false, help = "view name to store output table", - unspecifiedDefaultValue = "") final String exportTableName, - @CliOption(key = {"commit"}, help = "Commit to show") final String instantTime, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--createView"}, help = "view name to store output table", + defaultValue = "") final String exportTableName, + @ShellOption(value = {"--commit"}, help = "Commit to show") final String instantTime, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"includeArchivedTimeline"}, help = "Include archived commits as well", + defaultValue = "false") final boolean includeArchivedTimeline) throws Exception { - HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); - HoodieTimeline timeline = 
activeTimeline.getCommitsTimeline().filterCompletedInstants(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime); + HoodieDefaultTimeline defaultTimeline = getTimeline(HoodieCLI.getTableMetaClient(), includeArchivedTimeline); + HoodieTimeline timeline = defaultTimeline.getCommitsTimeline().filterCompletedInstants(); - if (!timeline.containsInstant(commitInstant)) { + Option hoodieInstantOption = getCommitForInstant(timeline, instantTime); + Option commitMetadataOptional = getHoodieCommitMetadata(timeline, hoodieInstantOption); + + if (!commitMetadataOptional.isPresent()) { return "Commit " + instantTime + " not found in Commits " + timeline; } - HoodieCommitMetadata meta = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(commitInstant).get(), - HoodieCommitMetadata.class); + + HoodieCommitMetadata meta = commitMetadataOptional.get(); List rows = new ArrayList<>(); for (Map.Entry> entry : meta.getPartitionToWriteStats().entrySet()) { + String action = hoodieInstantOption.get().getAction(); String path = entry.getKey(); List stats = entry.getValue(); for (HoodieWriteStat stat : stats) { - rows.add(new Comparable[] {path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(), + rows.add(new Comparable[] {action, path, stat.getFileId(), stat.getPrevCommit(), stat.getNumUpdateWrites(), stat.getNumWrites(), stat.getTotalWriteBytes(), stat.getTotalWriteErrors(), stat.getFileSizeInBytes()}); } } - TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) + TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) .addTableHeaderField(HoodieTableHeaderFields.HEADER_PREVIOUS_COMMIT) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_UPDATED) @@ -358,11 +364,11 @@ public String showCommitFiles( limit, headerOnly, rows, exportTableName); } - @CliCommand(value = "commits compare", help = "Compare commits with another Hoodie table") - public String compareCommits(@CliOption(key = {"path"}, help = "Path of the table to compare to") final String path) { + @ShellMethod(key = "commits compare", value = "Compare commits with another Hoodie table") + public String compareCommits(@ShellOption(value = {"--path"}, help = "Path of the table to compare to") final String path) { HoodieTableMetaClient source = HoodieCLI.getTableMetaClient(); - HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.conf, path); + HoodieTableMetaClient target = HoodieTableMetaClient.builder().setConf(HoodieCLI.conf).setBasePath(path).build(); HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); String targetLatestCommit = @@ -385,11 +391,36 @@ public String compareCommits(@CliOption(key = {"path"}, help = "Path of the tabl } } - @CliCommand(value = "commits sync", help = "Compare commits with another Hoodie table") - public String syncCommits(@CliOption(key = {"path"}, help = "Path of the table to compare to") final String path) { - HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.conf, path); + @ShellMethod(key = "commits sync", value = "Sync commits with another Hoodie table") + public String syncCommits(@ShellOption(value = {"--path"}, help 
= "Path of the table to sync to") final String path) { + HoodieCLI.syncTableMetadata = HoodieTableMetaClient.builder().setConf(HoodieCLI.conf).setBasePath(path).build(); HoodieCLI.state = HoodieCLI.CLIState.SYNC; return "Load sync state between " + HoodieCLI.getTableMetaClient().getTableConfig().getTableName() + " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName(); } + + /* + Checks whether a commit or replacecommit action exists in the timeline. + * */ + private Option getCommitForInstant(HoodieTimeline timeline, String instantTime) { + List instants = Arrays.asList( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.REPLACE_COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantTime)); + + return Option.fromJavaOptional(instants.stream().filter(timeline::containsInstant).findAny()); + } + + private Option getHoodieCommitMetadata(HoodieTimeline timeline, Option hoodieInstant) throws IOException { + if (hoodieInstant.isPresent()) { + if (hoodieInstant.get().getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + return Option.of(HoodieReplaceCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get()).get(), + HoodieReplaceCommitMetadata.class)); + } + return Option.of(HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get()).get(), + HoodieCommitMetadata.class)); + } + + return Option.empty(); + } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java index ffbf70e12a9bc..cb24f56236cf3 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java @@ -22,9 +22,9 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; import org.apache.hudi.cli.commands.SparkMain.SparkCommand; -import org.apache.hudi.cli.utils.CommitUtil; import org.apache.hudi.cli.utils.InputStreamConsumer; import org.apache.hudi.cli.utils.SparkUtil; import org.apache.hudi.client.CompactionAdminClient.RenameOpResult; @@ -48,14 +48,13 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.spark.launcher.SparkLauncher; import org.apache.spark.util.Utils; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.io.IOException; import java.io.ObjectInputStream; @@ -70,11 +69,13 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.cli.utils.CommitUtil.getTimeDaysAgo; + /** * CLI command to display compaction related options. 
*/ -@Component -public class CompactionCommand implements CommandMarker { +@ShellComponent +public class CompactionCommand { private static final Logger LOG = LogManager.getLogger(CompactionCommand.class); @@ -88,34 +89,33 @@ private HoodieTableMetaClient checkAndGetMetaClient() { return client; } - @CliCommand(value = "compactions show all", help = "Shows all compactions that are in active timeline") + @ShellMethod(key = "compactions show all", value = "Shows all compactions that are in active timeline") public String compactionsAll( - @CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata", - unspecifiedDefaultValue = "false") final boolean includeExtraMetadata, - @CliOption(key = {"limit"}, help = "Limit commits", - unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) - throws IOException { + @ShellOption(value = {"--includeExtraMetadata"}, help = "Include extra metadata", + defaultValue = "false") final boolean includeExtraMetadata, + @ShellOption(value = {"--limit"}, help = "Limit commits", + defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) { HoodieTableMetaClient client = checkAndGetMetaClient(); HoodieActiveTimeline activeTimeline = client.getActiveTimeline(); return printAllCompactions(activeTimeline, - compactionPlanReader(this::readCompactionPlanForActiveTimeline, activeTimeline), - includeExtraMetadata, sortByField, descending, limit, headerOnly); + compactionPlanReader(this::readCompactionPlanForActiveTimeline, activeTimeline), + includeExtraMetadata, sortByField, descending, limit, headerOnly); } - @CliCommand(value = "compaction show", help = "Shows compaction details for a specific compaction instant") + @ShellMethod(key = "compaction show", value = "Shows compaction details for a specific compaction instant") public String compactionShow( - @CliOption(key = "instant", mandatory = true, - help = "Base path for the target hoodie table") final String compactionInstantTime, - @CliOption(key = {"limit"}, help = "Limit commits", - unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = "--instant", + help = "Base path for the target hoodie table") final String compactionInstantTime, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print 
Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--partition"}, help = "Partition value", defaultValue = ShellOption.NULL) final String partition) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); HoodieActiveTimeline activeTimeline = client.getActiveTimeline(); @@ -123,77 +123,75 @@ public String compactionShow( activeTimeline.readCompactionPlanAsBytes( HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get()); - return printCompaction(compactionPlan, sortByField, descending, limit, headerOnly); + return printCompaction(compactionPlan, sortByField, descending, limit, headerOnly, partition); } - @CliCommand(value = "compactions showarchived", help = "Shows compaction details for specified time window") + @ShellMethod(key = "compactions showarchived", value = "Shows compaction details for specified time window") public String compactionsShowArchived( - @CliOption(key = {"includeExtraMetadata"}, help = "Include extra metadata", - unspecifiedDefaultValue = "false") final boolean includeExtraMetadata, - @CliOption(key = {"startTs"}, mandatory = false, help = "start time for compactions, default: now - 10 days") - String startTs, - @CliOption(key = {"endTs"}, mandatory = false, help = "end time for compactions, default: now - 1 day") - String endTs, - @CliOption(key = {"limit"}, help = "Limit compactions", - unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) - throws Exception { + @ShellOption(value = {"--includeExtraMetadata"}, help = "Include extra metadata", + defaultValue = "false") final boolean includeExtraMetadata, + @ShellOption(value = {"--startTs"}, defaultValue = ShellOption.NULL, + help = "start time for compactions, default: now - 10 days") String startTs, + @ShellOption(value = {"--endTs"}, defaultValue = ShellOption.NULL, + help = "end time for compactions, default: now - 1 day") String endTs, + @ShellOption(value = {"--limit"}, help = "Limit compactions", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) { if (StringUtils.isNullOrEmpty(startTs)) { - startTs = CommitUtil.getTimeDaysAgo(10); + startTs = getTimeDaysAgo(10); } if (StringUtils.isNullOrEmpty(endTs)) { - endTs = CommitUtil.getTimeDaysAgo(1); + endTs = getTimeDaysAgo(1); } HoodieTableMetaClient client = checkAndGetMetaClient(); HoodieArchivedTimeline archivedTimeline = client.getArchivedTimeline(); - archivedTimeline.loadInstantDetailsInMemory(startTs, endTs); + archivedTimeline.loadCompactionDetailsInMemory(startTs, endTs); try { return printAllCompactions(archivedTimeline, - compactionPlanReader(this::readCompactionPlanForArchivedTimeline, archivedTimeline), - includeExtraMetadata, sortByField, descending, limit, headerOnly); + compactionPlanReader(this::readCompactionPlanForArchivedTimeline, archivedTimeline), + includeExtraMetadata, sortByField, descending, limit, 
headerOnly); } finally { archivedTimeline.clearInstantDetailsFromMemory(startTs, endTs); } } - @CliCommand(value = "compaction showarchived", help = "Shows compaction details for a specific compaction instant") + @ShellMethod(key = "compaction showarchived", value = "Shows compaction details for a specific compaction instant") public String compactionShowArchived( - @CliOption(key = "instant", mandatory = true, - help = "instant time") final String compactionInstantTime, - @CliOption(key = {"limit"}, help = "Limit commits", - unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) - throws Exception { + @ShellOption(value = "--instant", help = "instant time") final String compactionInstantTime, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--partition"}, help = "Partition value", defaultValue = ShellOption.NULL) final String partition) + throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); HoodieArchivedTimeline archivedTimeline = client.getArchivedTimeline(); HoodieInstant instant = new HoodieInstant(HoodieInstant.State.COMPLETED, - HoodieTimeline.COMPACTION_ACTION, compactionInstantTime); - String startTs = CommitUtil.addHours(compactionInstantTime, -1); - String endTs = CommitUtil.addHours(compactionInstantTime, 1); + HoodieTimeline.COMPACTION_ACTION, compactionInstantTime); try { - archivedTimeline.loadInstantDetailsInMemory(startTs, endTs); - HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan( - archivedTimeline.getInstantDetails(instant).get()); - return printCompaction(compactionPlan, sortByField, descending, limit, headerOnly); + archivedTimeline.loadCompactionDetailsInMemory(compactionInstantTime); + HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeAvroRecordMetadata( + archivedTimeline.getInstantDetails(instant).get(), HoodieCompactionPlan.getClassSchema()); + return printCompaction(compactionPlan, sortByField, descending, limit, headerOnly, partition); } finally { - archivedTimeline.clearInstantDetailsFromMemory(startTs, endTs); + archivedTimeline.clearInstantDetailsFromMemory(compactionInstantTime); } } - @CliCommand(value = "compaction schedule", help = "Schedule Compaction") - public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", - help = "Spark executor memory") final String sparkMemory, - @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", - unspecifiedDefaultValue = "") final String propsFilePath, - @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", - unspecifiedDefaultValue = "") final String[] configs) throws Exception { + 
@ShellMethod(key = "compaction schedule", value = "Schedule Compaction") + public String scheduleCompact( + @ShellOption(value = "--sparkMemory", defaultValue = "1G", + help = "Spark executor memory") final String sparkMemory, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", + defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", + defaultValue = "") final String[] configs, + @ShellOption(value = "--sparkMaster", defaultValue = "local", help = "Spark Master") String master) + throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); @@ -204,8 +202,9 @@ public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefault String sparkPropertiesPath = Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), client.getBasePath(), - client.getTableConfig().getTableName(), compactionInstantTime, sparkMemory, propsFilePath); + String cmd = SparkCommand.COMPACT_SCHEDULE.toString(); + sparkLauncher.addAppArgs(cmd, master, sparkMemory, client.getBasePath(), + client.getTableConfig().getTableName(), compactionInstantTime, propsFilePath); UtilHelpers.validateAndAddProperties(configs, sparkLauncher); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); @@ -216,20 +215,23 @@ public String scheduleCompact(@CliOption(key = "sparkMemory", unspecifiedDefault return "Attempted to schedule compaction for " + compactionInstantTime; } - @CliCommand(value = "compaction run", help = "Run Compaction for given instant time") + @ShellMethod(key = "compaction run", value = "Run Compaction for given instant time") public String compact( - @CliOption(key = {"parallelism"}, mandatory = true, + @ShellOption(value = {"--parallelism"}, defaultValue = "3", help = "Parallelism for hoodie compaction") final String parallelism, - @CliOption(key = "schemaFilePath", mandatory = true, - help = "Path for Avro schema file") final String schemaFilePath, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + @ShellOption(value = "--schemaFilePath", + help = "Path for Avro schema file", defaultValue = "") final String schemaFilePath, + @ShellOption(value = "--sparkMaster", defaultValue = "local", + help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", help = "Spark executor memory") final String sparkMemory, - @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry, - @CliOption(key = "compactionInstant", help = "Base path for the target hoodie table") String compactionInstantTime, - @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", - unspecifiedDefaultValue = "") final String propsFilePath, - @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", - unspecifiedDefaultValue = "") final String[] configs) + @ShellOption(value = "--retry", defaultValue = "1", help = "Number of retries") final 
String retry, + @ShellOption(value = "--compactionInstant", help = "Instant of compaction.request", + defaultValue = ShellOption.NULL) String compactionInstantTime, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", + defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", + defaultValue = "") final String[] configs) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); @@ -249,9 +251,9 @@ public String compact( String sparkPropertiesPath = Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), client.getBasePath(), + sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), master, sparkMemory, client.getBasePath(), client.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath, - sparkMemory, retry, propsFilePath); + retry, propsFilePath); UtilHelpers.validateAndAddProperties(configs, sparkLauncher); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); @@ -262,32 +264,67 @@ public String compact( return "Compaction successfully completed for " + compactionInstantTime; } + @ShellMethod(key = "compaction scheduleAndExecute", value = "Schedule compaction plan and execute this plan") + public String compact( + @ShellOption(value = {"--parallelism"}, defaultValue = "3", + help = "Parallelism for hoodie compaction") final String parallelism, + @ShellOption(value = "--schemaFilePath", + help = "Path for Avro schema file", defaultValue = ShellOption.NULL) final String schemaFilePath, + @ShellOption(value = "--sparkMaster", defaultValue = "local", + help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", + help = "Spark executor memory") final String sparkMemory, + @ShellOption(value = "--retry", defaultValue = "1", help = "Number of retries") final String retry, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", + defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", + defaultValue = "") final String[] configs) + throws Exception { + HoodieTableMetaClient client = checkAndGetMetaClient(); + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE_AND_EXECUTE.toString(), master, sparkMemory, client.getBasePath(), + client.getTableConfig().getTableName(), parallelism, schemaFilePath, + retry, propsFilePath); + UtilHelpers.validateAndAddProperties(configs, sparkLauncher); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to schedule and execute compaction "; + 
} + return "Schedule and execute compaction successfully completed"; + } + /** * Prints all compaction details. */ - private String printAllCompactions(HoodieDefaultTimeline timeline, - Function compactionPlanReader, - boolean includeExtraMetadata, - String sortByField, - boolean descending, - int limit, - boolean headerOnly) { - - Stream instantsStream = timeline.getCommitsAndCompactionTimeline().getReverseOrderedInstants(); + private static String printAllCompactions(HoodieDefaultTimeline timeline, + Function compactionPlanReader, + boolean includeExtraMetadata, + String sortByField, + boolean descending, + int limit, + boolean headerOnly) { + + Stream instantsStream = timeline.getWriteTimeline().getReverseOrderedInstants(); List> compactionPlans = instantsStream - .map(instant -> Pair.of(instant, compactionPlanReader.apply(instant))) - .filter(pair -> pair.getRight() != null) - .collect(Collectors.toList()); + .map(instant -> Pair.of(instant, compactionPlanReader.apply(instant))) + .filter(pair -> pair.getRight() != null) + .collect(Collectors.toList()); - Set committedInstants = timeline.getCommitTimeline().filterCompletedInstants() - .getInstants().collect(Collectors.toSet()); + Set committedInstants = timeline.getCommitTimeline().filterCompletedInstants() + .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); List rows = new ArrayList<>(); for (Pair compactionPlan : compactionPlans) { HoodieCompactionPlan plan = compactionPlan.getRight(); HoodieInstant instant = compactionPlan.getLeft(); final HoodieInstant.State state; - if (committedInstants.contains(instant)) { + if (committedInstants.contains(instant.getTimestamp())) { state = HoodieInstant.State.COMPLETED; } else { state = instant.getState(); @@ -295,19 +332,21 @@ private String printAllCompactions(HoodieDefaultTimeline timeline, if (includeExtraMetadata) { rows.add(new Comparable[] {instant.getTimestamp(), state.toString(), - plan.getOperations() == null ? 0 : plan.getOperations().size(), - plan.getExtraMetadata().toString()}); + plan.getOperations() == null ? 0 : plan.getOperations().size(), + plan.getExtraMetadata().toString()}); } else { rows.add(new Comparable[] {instant.getTimestamp(), state.toString(), - plan.getOperations() == null ? 0 : plan.getOperations().size()}); + plan.getOperations() == null ? 
0 : plan.getOperations().size()}); } } Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader().addTableHeaderField("Compaction Instant Time").addTableHeaderField("State") - .addTableHeaderField("Total FileIds to be Compacted"); + TableHeader header = new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMPACTION_INSTANT_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_STATE) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_TO_BE_COMPACTED); if (includeExtraMetadata) { - header = header.addTableHeaderField("Extra Metadata"); + header = header.addTableHeaderField(HoodieTableHeaderFields.HEADER_EXTRA_METADATA); } return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } @@ -319,21 +358,24 @@ private String printAllCompactions(HoodieDefaultTimeline timeline, */ private Function compactionPlanReader( - BiFunction f, T timeline) { + BiFunction f, T timeline) { return (y) -> f.apply(timeline, y); } private HoodieCompactionPlan readCompactionPlanForArchivedTimeline(HoodieArchivedTimeline archivedTimeline, HoodieInstant instant) { - if (!HoodieTimeline.COMPACTION_ACTION.equals(instant.getAction())) { - return null; - } else { + // filter inflight compaction + if (HoodieTimeline.COMPACTION_ACTION.equals(instant.getAction()) + && HoodieInstant.State.INFLIGHT.equals(instant.getState())) { try { - return TimelineMetadataUtils.deserializeCompactionPlan(archivedTimeline.getInstantDetails(instant).get()); - } catch (IOException e) { - throw new HoodieIOException(e.getMessage(), e); + return TimelineMetadataUtils.deserializeAvroRecordMetadata(archivedTimeline.getInstantDetails(instant).get(), + HoodieCompactionPlan.getClassSchema()); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); } + } else { + return null; } } @@ -347,38 +389,45 @@ private HoodieCompactionPlan readCompactionPlanForActiveTimeline(HoodieActiveTim try { // This could be a completed compaction. Assume a compaction request file is present but skip if fails return TimelineMetadataUtils.deserializeCompactionPlan( - activeTimeline.readCompactionPlanAsBytes( - HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); + activeTimeline.readCompactionPlanAsBytes( + HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); } catch (HoodieIOException ioe) { // SKIP return null; } } else { return TimelineMetadataUtils.deserializeCompactionPlan(activeTimeline.readCompactionPlanAsBytes( - HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); + HoodieTimeline.getCompactionRequestedInstant(instant.getTimestamp())).get()); } } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } } - private String printCompaction(HoodieCompactionPlan compactionPlan, - String sortByField, - boolean descending, - int limit, - boolean headerOnly) { + protected static String printCompaction(HoodieCompactionPlan compactionPlan, + String sortByField, + boolean descending, + int limit, + boolean headerOnly, + final String partition) { List rows = new ArrayList<>(); if ((null != compactionPlan) && (null != compactionPlan.getOperations())) { for (HoodieCompactionOperation op : compactionPlan.getOperations()) { - rows.add(new Comparable[]{op.getPartitionPath(), op.getFileId(), op.getBaseInstantTime(), op.getDataFilePath(), - op.getDeltaFilePaths().size(), op.getMetrics() == null ? 
"" : op.getMetrics().toString()}); + if (StringUtils.isNullOrEmpty(partition) || partition.equals(op.getPartitionPath())) { + rows.add(new Comparable[] {op.getPartitionPath(), op.getFileId(), op.getBaseInstantTime(), op.getDataFilePath(), + op.getDeltaFilePaths().size(), op.getMetrics() == null ? "" : op.getMetrics().toString()}); + } } } Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader().addTableHeaderField("Partition Path").addTableHeaderField("File Id") - .addTableHeaderField("Base Instant").addTableHeaderField("Data File Path") - .addTableHeaderField("Total Delta Files").addTableHeaderField("getMetrics"); + TableHeader header = new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_INSTANT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE_PATH) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_FILES) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_METRICS); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } @@ -400,17 +449,17 @@ private T deSerializeOperationResult(String inputP, FileSystem fs) throws Ex } } - @CliCommand(value = "compaction validate", help = "Validate Compaction") + @ShellMethod(key = "compaction validate", value = "Validate Compaction") public String validateCompaction( - @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, - @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") boolean headerOnly) + @ShellOption(value = "--instant", help = "Compaction Instant") String compactionInstant, + @ShellOption(value = {"--parallelism"}, defaultValue = "3", help = "Parallelism") String parallelism, + @ShellOption(value = "--sparkMaster", defaultValue = "local", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "2G", help = "executor memory") String sparkMemory, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); @@ -444,9 +493,13 @@ public String validateCompaction( }); Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Base Instant Time") - .addTableHeaderField("Base Data 
File").addTableHeaderField("Num Delta Files").addTableHeaderField("Valid") - .addTableHeaderField("Error"); + TableHeader header = new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_INSTANT_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_DATA_FILE) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELTA_FILES) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_VALID) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_ERROR); output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); @@ -459,19 +512,19 @@ public String validateCompaction( return output; } - @CliCommand(value = "compaction unschedule", help = "Unschedule Compaction") + @ShellMethod(key = "compaction unschedule", value = "Unschedule Compaction") public String unscheduleCompaction( - @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, - @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, - @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV, - @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") boolean headerOnly) + @ShellOption(value = "--instant", help = "Compaction Instant") String compactionInstant, + @ShellOption(value = {"--parallelism"}, defaultValue = "3", help = "Parallelism") String parallelism, + @ShellOption(value = "--sparkMaster", defaultValue = "local", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "2G", help = "executor memory") String sparkMemory, + @ShellOption(value = {"--skipValidation"}, help = "skip validation", defaultValue = "false") boolean skipV, + @ShellOption(value = {"--dryRun"}, help = "Dry Run Mode", defaultValue = "false") boolean dryRun, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); @@ -505,17 +558,18 @@ public String unscheduleCompaction( return output; } - @CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId") + @ShellMethod(key = "compaction unscheduleFileId", value = "UnSchedule Compaction for a fileId") public String unscheduleCompactFile( - @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId, 
- @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, - @CliOption(key = {"skipValidation"}, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV, - @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, - @CliOption(key = {"headeronly"}, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) + @ShellOption(value = "--fileId", help = "File Id") final String fileId, + @ShellOption(value = "--partitionPath", defaultValue = "", help = "partition path") final String partitionPath, + @ShellOption(value = "--sparkMaster", defaultValue = "local", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "2G", help = "executor memory") String sparkMemory, + @ShellOption(value = {"--skipValidation"}, help = "skip validation", defaultValue = "false") boolean skipV, + @ShellOption(value = {"--dryRun"}, help = "Dry Run Mode", defaultValue = "false") boolean dryRun, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Header Only", defaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); @@ -529,7 +583,7 @@ public String unscheduleCompactFile( .getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), master, sparkMemory, client.getBasePath(), - fileId, outputPathStr, "1", Boolean.valueOf(skipV).toString(), + fileId, partitionPath, outputPathStr, "1", Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); @@ -549,19 +603,19 @@ public String unscheduleCompactFile( return output; } - @CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as " + @ShellMethod(key = "compaction repair", value = "Renames the files to make them consistent with the timeline as " + "dictated by Hoodie metadata. 
Use when compaction unschedule fails partially.") public String repairCompaction( - @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, - @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, - @CliOption(key = {"dryRun"}, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") boolean headerOnly) + @ShellOption(value = "--instant", help = "Compaction Instant") String compactionInstant, + @ShellOption(value = {"--parallelism"}, defaultValue = "3", help = "Parallelism") String parallelism, + @ShellOption(value = "--sparkMaster", defaultValue = "local", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "2G", help = "executor memory") String sparkMemory, + @ShellOption(value = {"--dryRun"}, help = "Dry Run Mode", defaultValue = "false") boolean dryRun, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") boolean headerOnly) throws Exception { HoodieTableMetaClient client = checkAndGetMetaClient(); boolean initialized = HoodieCLI.initConf(); @@ -594,7 +648,7 @@ public String repairCompaction( } private String getRenamesToBePrinted(List res, Integer limit, String sortByField, boolean descending, - boolean headerOnly, String operation) { + boolean headerOnly, String operation) { Option result = Option.fromJavaOptional(res.stream().map(r -> r.isExecuted() && r.isSuccess()).reduce(Boolean::logicalAnd)); @@ -616,9 +670,13 @@ private String getRenamesToBePrinted(List res, Integer limit, St }); Map> fieldNameToConverterMap = new HashMap<>(); - TableHeader header = new TableHeader().addTableHeaderField("File Id").addTableHeaderField("Source File Path") - .addTableHeaderField("Destination File Path").addTableHeaderField("Rename Executed?") - .addTableHeaderField("Rename Succeeded?").addTableHeaderField("Error"); + TableHeader header = new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_SOURCE_FILE_PATH) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_DESTINATION_FILE_PATH) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_RENAME_EXECUTED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_RENAME_SUCCEEDED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_ERROR); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } else { diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/DiffCommand.java 
b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/DiffCommand.java new file mode 100644 index 0000000000000..07d21fe022668 --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/DiffCommand.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.HoodieTableHeaderFields; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.NumericUtils; +import org.apache.hudi.common.util.Option; + +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.apache.hudi.cli.utils.CommitUtil.getTimeDaysAgo; +import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; +import static org.apache.hudi.common.util.StringUtils.nonEmpty; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; + +/** + * Given a file id or partition value, this command line utility tracks the changes to the file group or partition across range of commits. 
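// Editor's note: a minimal, JDK-only sketch of the matcher pattern the new DiffCommand uses,
// where a BiFunction decides whether a write stat belongs to the entity being diffed (file id
// or partition path). "SimpleWriteStat" is a stand-in introduced here purely for illustration;
// it is not a Hudi class, and the real command works on HoodieWriteStat from commit metadata.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;

class DiffMatcherSketch {
  // Stand-in for HoodieWriteStat with just the two fields the checkers look at.
  static final class SimpleWriteStat {
    final String fileId;
    final String partitionPath;
    SimpleWriteStat(String fileId, String partitionPath) {
      this.fileId = fileId;
      this.partitionPath = partitionPath;
    }
  }

  static final BiFunction<SimpleWriteStat, String, Boolean> FILE_ID_CHECKER =
      (stat, fileId) -> fileId.equals(stat.fileId);
  static final BiFunction<SimpleWriteStat, String, Boolean> PARTITION_CHECKER =
      (stat, partitionPath) -> partitionPath.equals(stat.partitionPath);

  // Keep only the stats that the supplied checker accepts for the given entity value.
  static List<SimpleWriteStat> filter(List<SimpleWriteStat> stats, String entity,
                                      BiFunction<SimpleWriteStat, String, Boolean> checker) {
    List<SimpleWriteStat> out = new ArrayList<>();
    for (SimpleWriteStat stat : stats) {
      if (checker.apply(stat, entity)) {
        out.add(stat);
      }
    }
    return out;
  }

  public static void main(String[] args) {
    List<SimpleWriteStat> stats = Arrays.asList(
        new SimpleWriteStat("f1", "2016/03/15"),
        new SimpleWriteStat("f2", "2016/03/16"));
    // Selecting by file id keeps one row; selecting by partition path keeps the other.
    System.out.println(filter(stats, "f1", FILE_ID_CHECKER).size());           // 1
    System.out.println(filter(stats, "2016/03/16", PARTITION_CHECKER).size()); // 1
  }
}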
+ * Usage: diff file --fileId + */ +@ShellComponent +public class DiffCommand { + + private static final BiFunction FILE_ID_CHECKER = (writeStat, fileId) -> fileId.equals(writeStat.getFileId()); + private static final BiFunction PARTITION_CHECKER = (writeStat, partitionPath) -> partitionPath.equals(writeStat.getPartitionPath()); + + @ShellMethod(key = "diff file", value = "Check how file differs across range of commits") + public String diffFile( + @ShellOption(value = {"--fileId"}, help = "File ID to diff across range of commits") String fileId, + @ShellOption(value = {"--startTs"}, help = "start time for compactions, default: now - 10 days", + defaultValue = ShellOption.NULL) String startTs, + @ShellOption(value = {"--endTs"}, help = "end time for compactions, default: now - 1 day", + defaultValue = ShellOption.NULL) String endTs, + @ShellOption(value = {"--limit"}, help = "Limit compactions", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--includeArchivedTimeline"}, help = "Include archived commits as well", + defaultValue = "false") final boolean includeArchivedTimeline) throws IOException { + HoodieDefaultTimeline timeline = getTimelineInRange(startTs, endTs, includeArchivedTimeline); + return printCommitsWithMetadataForFileId(timeline, limit, sortByField, descending, headerOnly, "", fileId); + } + + @ShellMethod(key = "diff partition", value = "Check how file differs across range of commits. It is meant to be used only for partitioned tables.") + public String diffPartition( + @ShellOption(value = {"--partitionPath"}, help = "Relative partition path to diff across range of commits") String partitionPath, + @ShellOption(value = {"--startTs"}, help = "start time for compactions, default: now - 10 days", + defaultValue = ShellOption.NULL) String startTs, + @ShellOption(value = {"--endTs"}, help = "end time for compactions, default: now - 1 day", + defaultValue = ShellOption.NULL) String endTs, + @ShellOption(value = {"--limit"}, help = "Limit compactions", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--includeArchivedTimeline"}, help = "Include archived commits as well", + defaultValue = "false") final boolean includeArchivedTimeline) throws IOException { + HoodieDefaultTimeline timeline = getTimelineInRange(startTs, endTs, includeArchivedTimeline); + return printCommitsWithMetadataForPartition(timeline, limit, sortByField, descending, headerOnly, "", partitionPath); + } + + private HoodieDefaultTimeline getTimelineInRange(String startTs, String endTs, boolean includeArchivedTimeline) { + if (isNullOrEmpty(startTs)) { + startTs = getTimeDaysAgo(10); + } + if (isNullOrEmpty(endTs)) { + endTs = getTimeDaysAgo(1); + } + checkArgument(nonEmpty(startTs), "startTs is null or empty"); + checkArgument(nonEmpty(endTs), "endTs is null or empty"); + HoodieTableMetaClient metaClient = 
HoodieCLI.getTableMetaClient(); + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + if (includeArchivedTimeline) { + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); + archivedTimeline.loadInstantDetailsInMemory(startTs, endTs); + return archivedTimeline.findInstantsInRange(startTs, endTs).mergeTimeline(activeTimeline); + } + return activeTimeline; + } + + private String printCommitsWithMetadataForFileId(HoodieDefaultTimeline timeline, + final Integer limit, + final String sortByField, + final boolean descending, + final boolean headerOnly, + final String tempTableName, + final String fileId) throws IOException { + return printDiffWithMetadata(timeline, limit, sortByField, descending, headerOnly, tempTableName, fileId, FILE_ID_CHECKER); + } + + private String printCommitsWithMetadataForPartition(HoodieDefaultTimeline timeline, + final Integer limit, + final String sortByField, + final boolean descending, + final boolean headerOnly, + final String tempTableName, + final String partition) throws IOException { + return printDiffWithMetadata(timeline, limit, sortByField, descending, headerOnly, tempTableName, partition, PARTITION_CHECKER); + } + + private String printDiffWithMetadata(HoodieDefaultTimeline timeline, Integer limit, String sortByField, boolean descending, boolean headerOnly, String tempTableName, String diffEntity, + BiFunction diffEntityChecker) throws IOException { + List rows = new ArrayList<>(); + List commits = timeline.getCommitsTimeline().filterCompletedInstants() + .getInstants().sorted(HoodieInstant.COMPARATOR.reversed()).collect(Collectors.toList()); + + for (final HoodieInstant commit : commits) { + Option instantDetails = timeline.getInstantDetails(commit); + if (instantDetails.isPresent()) { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(instantDetails.get(), HoodieCommitMetadata.class); + for (Map.Entry> partitionWriteStat : + commitMetadata.getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat hoodieWriteStat : partitionWriteStat.getValue()) { + populateRows(rows, commit, hoodieWriteStat, diffEntity, diffEntityChecker); + } + } + } + } + + Map> fieldNameToConverterMap = new HashMap<>(); + fieldNameToConverterMap.put( + HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, + entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())))); + + return HoodiePrintHelper.print(HoodieTableHeaderFields.getTableHeaderWithExtraMetadata(), + fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows, tempTableName); + } + + private void populateRows(List rows, HoodieInstant commit, HoodieWriteStat hoodieWriteStat, + String value, BiFunction checker) { + if (checker.apply(hoodieWriteStat, value)) { + rows.add(new Comparable[] { + commit.getAction(), + commit.getTimestamp(), + hoodieWriteStat.getPartitionPath(), + hoodieWriteStat.getFileId(), + hoodieWriteStat.getPrevCommit(), + hoodieWriteStat.getNumWrites(), + hoodieWriteStat.getNumInserts(), + hoodieWriteStat.getNumDeletes(), + hoodieWriteStat.getNumUpdateWrites(), + hoodieWriteStat.getTotalWriteErrors(), + hoodieWriteStat.getTotalLogBlocks(), + hoodieWriteStat.getTotalCorruptLogBlock(), + hoodieWriteStat.getTotalRollbackBlocks(), + hoodieWriteStat.getTotalLogRecords(), + hoodieWriteStat.getTotalUpdatedRecordsCompacted(), + hoodieWriteStat.getTotalWriteBytes() + }); + } + } +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java 
b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java index 8bd842c825659..2406eddacf320 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java @@ -18,6 +18,12 @@ package org.apache.hudi.cli.commands; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.specific.SpecificData; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; import org.apache.hudi.avro.model.HoodieCleanMetadata; @@ -34,18 +40,11 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.exception.HoodieException; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.avro.specific.SpecificData; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.io.File; import java.io.FileOutputStream; @@ -58,21 +57,21 @@ /** * CLI commands to export various information from a HUDI dataset. - * + *

* "export instants": Export Instants and their metadata from the Timeline to a local - * directory specified by the parameter --localFolder - * The instants are exported in the json format. + * directory specified by the parameter --localFolder + * The instants are exported in the json format. */ -@Component -public class ExportCommand implements CommandMarker { +@ShellComponent +public class ExportCommand { - @CliCommand(value = "export instants", help = "Export Instants and their metadata from the Timeline") + @ShellMethod(key = "export instants", value = "Export Instants and their metadata from the Timeline") public String exportInstants( - @CliOption(key = {"limit"}, help = "Limit Instants", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"actions"}, help = "Comma seperated list of Instant actions to export", - unspecifiedDefaultValue = "clean,commit,deltacommit,rollback,savepoint,restore") final String filter, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"localFolder"}, help = "Local Folder to export to", mandatory = true) String localFolder) + @ShellOption(value = {"--limit"}, help = "Limit Instants", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--actions"}, help = "Comma separated list of Instant actions to export", + defaultValue = "clean,commit,deltacommit,rollback,savepoint,restore") final String filter, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--localFolder"}, help = "Local Folder to export to") String localFolder) throws Exception { final String basePath = HoodieCLI.getTableMetaClient().getBasePath(); @@ -81,7 +80,7 @@ public String exportInstants( int numExports = limit == -1 ? Integer.MAX_VALUE : limit; int numCopied = 0; - if (! 
new File(localFolder).isDirectory()) { + if (!new File(localFolder).isDirectory()) { throw new HoodieException(localFolder + " is not a valid local directory"); } @@ -92,7 +91,7 @@ public String exportInstants( // Archived instants are in the commit archive files FileStatus[] statuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath); - List archivedStatuses = Arrays.stream(statuses).sorted((f1, f2) -> (int)(f1.getModificationTime() - f2.getModificationTime())).collect(Collectors.toList()); + List archivedStatuses = Arrays.stream(statuses).sorted((f1, f2) -> (int) (f1.getModificationTime() - f2.getModificationTime())).collect(Collectors.toList()); if (descending) { Collections.reverse(nonArchivedInstants); @@ -113,53 +112,55 @@ public String exportInstants( private int copyArchivedInstants(List statuses, Set actionSet, int limit, String localFolder) throws Exception { int copyCount = 0; + FileSystem fileSystem = FSUtils.getFs(HoodieCLI.getTableMetaClient().getBasePath(), HoodieCLI.conf); for (FileStatus fs : statuses) { // read the archived file - Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(HoodieCLI.getTableMetaClient().getBasePath(), HoodieCLI.conf), - new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); + Reader reader = HoodieLogFormat.newReader(fileSystem, new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); // read the avro blocks while (reader.hasNext() && copyCount < limit) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - for (IndexedRecord ir : blk.getRecords()) { - // Archived instants are saved as arvo encoded HoodieArchivedMetaEntry records. We need to get the - // metadata record from the entry and convert it to json. - HoodieArchivedMetaEntry archiveEntryRecord = (HoodieArchivedMetaEntry) SpecificData.get() - .deepCopy(HoodieArchivedMetaEntry.SCHEMA$, ir); - - final String action = archiveEntryRecord.get("actionType").toString(); - if (!actionSet.contains(action)) { - continue; - } - - GenericRecord metadata = null; - switch (action) { - case HoodieTimeline.CLEAN_ACTION: - metadata = archiveEntryRecord.getHoodieCleanMetadata(); - break; - case HoodieTimeline.COMMIT_ACTION: - case HoodieTimeline.DELTA_COMMIT_ACTION: - metadata = archiveEntryRecord.getHoodieCommitMetadata(); - break; - case HoodieTimeline.ROLLBACK_ACTION: - metadata = archiveEntryRecord.getHoodieRollbackMetadata(); + try (ClosableIterator recordItr = blk.getRecordIterator()) { + while (recordItr.hasNext()) { + IndexedRecord ir = recordItr.next(); + // Archived instants are saved as arvo encoded HoodieArchivedMetaEntry records. We need to get the + // metadata record from the entry and convert it to json. 
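// Editor's note: a JDK-only sketch of the pattern this hunk switches to. Instead of
// materializing blk.getRecords() as a List, records are pulled from a closeable iterator
// inside try-with-resources so underlying readers are released. "CloseableIter" below is a
// stand-in interface for illustration, not Hudi's ClosableIterator.
import java.util.Arrays;
import java.util.Iterator;

class IteratorConsumptionSketch {
  interface CloseableIter<T> extends Iterator<T>, AutoCloseable {
    @Override
    void close(); // narrowed so no checked exception leaks out of try-with-resources
  }

  // Wrap a plain iterator; a real implementation would close file handles in close().
  static <T> CloseableIter<T> wrap(Iterator<T> it) {
    return new CloseableIter<T>() {
      @Override public boolean hasNext() { return it.hasNext(); }
      @Override public T next() { return it.next(); }
      @Override public void close() { /* release readers/streams in a real implementation */ }
    };
  }

  public static void main(String[] args) {
    int copied = 0;
    int limit = 2;
    try (CloseableIter<String> records = wrap(Arrays.asList("r1", "r2", "r3").iterator())) {
      while (records.hasNext() && copied < limit) {
        records.next(); // in the command above, this is where the record is converted and written
        copied++;
      }
    }
    System.out.println("copied " + copied); // copied 2
  }
}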
+ HoodieArchivedMetaEntry archiveEntryRecord = (HoodieArchivedMetaEntry) SpecificData.get() + .deepCopy(HoodieArchivedMetaEntry.SCHEMA$, ir); + final String action = archiveEntryRecord.get("actionType").toString(); + if (!actionSet.contains(action)) { + continue; + } + + GenericRecord metadata = null; + switch (action) { + case HoodieTimeline.CLEAN_ACTION: + metadata = archiveEntryRecord.getHoodieCleanMetadata(); + break; + case HoodieTimeline.COMMIT_ACTION: + case HoodieTimeline.DELTA_COMMIT_ACTION: + metadata = archiveEntryRecord.getHoodieCommitMetadata(); + break; + case HoodieTimeline.ROLLBACK_ACTION: + metadata = archiveEntryRecord.getHoodieRollbackMetadata(); + break; + case HoodieTimeline.SAVEPOINT_ACTION: + metadata = archiveEntryRecord.getHoodieSavePointMetadata(); + break; + case HoodieTimeline.COMPACTION_ACTION: + metadata = archiveEntryRecord.getHoodieCompactionMetadata(); + break; + default: + throw new HoodieException("Unknown type of action " + action); + } + + final String instantTime = archiveEntryRecord.get("commitTime").toString(); + final String outPath = localFolder + Path.SEPARATOR + instantTime + "." + action; + writeToFile(outPath, HoodieAvroUtils.avroToJson(metadata, true)); + if (++copyCount == limit) { break; - case HoodieTimeline.SAVEPOINT_ACTION: - metadata = archiveEntryRecord.getHoodieSavePointMetadata(); - break; - case HoodieTimeline.COMPACTION_ACTION: - metadata = archiveEntryRecord.getHoodieCompactionMetadata(); - break; - default: - throw new HoodieException("Unknown type of action " + action); - } - - final String instantTime = archiveEntryRecord.get("commitTime").toString(); - final String outPath = localFolder + Path.SEPARATOR + instantTime + "." + action; - writeToFile(outPath, HoodieAvroUtils.avroToJson(metadata, true)); - if (++copyCount == limit) { - break; + } } } } @@ -174,14 +175,13 @@ private int copyNonArchivedInstants(List instants, int limit, Str int copyCount = 0; if (instants.isEmpty()) { - return limit; + return copyCount; } - final Logger LOG = LogManager.getLogger(ExportCommand.class); final HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); final HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); for (HoodieInstant instant : instants) { - String localPath = localFolder + File.separator + instant.getFileName(); + String localPath = localFolder + Path.SEPARATOR + instant.getFileName(); byte[] data = null; switch (instant.getAction()) { @@ -217,6 +217,7 @@ private int copyNonArchivedInstants(List instants, int limit, Str if (data != null) { writeToFile(localPath, data); + copyCount = copyCount + 1; } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java index ef76ee4e2f1ad..78e7d90195d58 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/FileSystemViewCommand.java @@ -18,6 +18,9 @@ package org.apache.hudi.cli.commands; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; @@ -32,14 +35,9 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.NumericUtils; import org.apache.hudi.common.util.Option; - -import org.apache.hadoop.fs.FileStatus; -import 
org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.io.IOException; import java.io.Serializable; @@ -55,30 +53,32 @@ /** * CLI command to display file system options. */ -@Component -public class FileSystemViewCommand implements CommandMarker { +@ShellComponent +public class FileSystemViewCommand { - @CliCommand(value = "show fsview all", help = "Show entire file-system view") + @ShellMethod(key = "show fsview all", value = "Show entire file-system view") public String showAllFileSlices( - @CliOption(key = {"pathRegex"}, help = "regex to select files, eg: 2016/08/02", - unspecifiedDefaultValue = "*/*/*") String globRegex, - @CliOption(key = {"baseFileOnly"}, help = "Only display base files view", - unspecifiedDefaultValue = "false") boolean baseFileOnly, - @CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed", - unspecifiedDefaultValue = "") String maxInstant, - @CliOption(key = {"includeMax"}, help = "Include Max Instant", - unspecifiedDefaultValue = "false") boolean includeMaxInstant, - @CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", - unspecifiedDefaultValue = "false") boolean includeInflight, - @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", - unspecifiedDefaultValue = "false") boolean excludeCompaction, - @CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--pathRegex"}, help = "regex to select files, eg: par1", + defaultValue = "*") String globRegex, + @ShellOption(value = {"--baseFileOnly"}, help = "Only display base files view", + defaultValue = "false") boolean baseFileOnly, + @ShellOption(value = {"--maxInstant"}, help = "File-Slices upto this instant are displayed", + defaultValue = "") String maxInstant, + @ShellOption(value = {"--includeMax"}, help = "Include Max Instant", + defaultValue = "false") boolean includeMaxInstant, + @ShellOption(value = {"--includeInflight"}, help = "Include Inflight Instants", + defaultValue = "false") boolean includeInflight, + @ShellOption(value = {"--excludeCompaction"}, help = "Exclude compaction Instants", + defaultValue = "false") boolean excludeCompaction, + @ShellOption(value = {"--limit"}, help = "Limit rows to be displayed", defaultValue = "-1") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { + globRegex = globRegex == null ? 
"" : globRegex; + HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, baseFileOnly, includeMaxInstant, includeInflight, excludeCompaction); List rows = new ArrayList<>(); @@ -117,26 +117,26 @@ public String showAllFileSlices( return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } - @CliCommand(value = "show fsview latest", help = "Show latest file-system view") + @ShellMethod(key = "show fsview latest", value = "Show latest file-system view") public String showLatestFileSlices( - @CliOption(key = {"partitionPath"}, help = "A valid paritition path", mandatory = true) String partition, - @CliOption(key = {"baseFileOnly"}, help = "Only display base file view", - unspecifiedDefaultValue = "false") boolean baseFileOnly, - @CliOption(key = {"maxInstant"}, help = "File-Slices upto this instant are displayed", - unspecifiedDefaultValue = "") String maxInstant, - @CliOption(key = {"merge"}, help = "Merge File Slices due to pending compaction", - unspecifiedDefaultValue = "true") final boolean merge, - @CliOption(key = {"includeMax"}, help = "Include Max Instant", - unspecifiedDefaultValue = "false") boolean includeMaxInstant, - @CliOption(key = {"includeInflight"}, help = "Include Inflight Instants", - unspecifiedDefaultValue = "false") boolean includeInflight, - @CliOption(key = {"excludeCompaction"}, help = "Exclude compaction Instants", - unspecifiedDefaultValue = "false") boolean excludeCompaction, - @CliOption(key = {"limit"}, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--partitionPath"}, help = "A valid partition path", defaultValue = "") String partition, + @ShellOption(value = {"--baseFileOnly"}, help = "Only display base file view", + defaultValue = "false") boolean baseFileOnly, + @ShellOption(value = {"--maxInstant"}, help = "File-Slices upto this instant are displayed", + defaultValue = "") String maxInstant, + @ShellOption(value = {"--merge"}, help = "Merge File Slices due to pending compaction", + defaultValue = "true") final boolean merge, + @ShellOption(value = {"--includeMax"}, help = "Include Max Instant", + defaultValue = "false") boolean includeMaxInstant, + @ShellOption(value = {"--includeInflight"}, help = "Include Inflight Instants", + defaultValue = "false") boolean includeInflight, + @ShellOption(value = {"--excludeCompaction"}, help = "Exclude compaction Instants", + defaultValue = "false") boolean excludeCompaction, + @ShellOption(value = {"--limit"}, help = "Limit rows to be displayed", defaultValue = "-1") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, baseFileOnly, includeMaxInstant, @@ -223,21 +223,21 @@ public String showLatestFileSlices( /** * Build File System View. 
- * - * @param globRegex Path Regex - * @param maxInstant Max Instants to be used for displaying file-instants - * @param basefileOnly Include only base file view + * + * @param globRegex Path Regex + * @param maxInstant Max Instants to be used for displaying file-instants + * @param basefileOnly Include only base file view * @param includeMaxInstant Include Max instant - * @param includeInflight Include inflight instants + * @param includeInflight Include inflight instants * @param excludeCompaction Exclude Compaction instants * @return * @throws IOException */ private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean basefileOnly, - boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException { + boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException { HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(client.getHadoopConf(), client.getBasePath(), true); + HoodieTableMetaClient.builder().setConf(client.getHadoopConf()).setBasePath(client.getBasePath()).setLoadActiveTimelineOnLoad(true).build(); FileSystem fs = HoodieCLI.fs; String globPath = String.format("%s/%s/*", client.getBasePath(), globRegex); List statuses = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(globPath)); @@ -249,7 +249,7 @@ private HoodieTableFileSystemView buildFileSystemView(String globRegex, String m } else if (excludeCompaction) { timeline = metaClient.getActiveTimeline().getCommitsTimeline(); } else { - timeline = metaClient.getActiveTimeline().getCommitsAndCompactionTimeline(); + timeline = metaClient.getActiveTimeline().getWriteTimeline(); } if (!includeInflight) { diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java index a31f31012830d..9ea5bbab04bda 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HDFSParquetImportCommand.java @@ -23,46 +23,47 @@ import org.apache.hudi.cli.utils.SparkUtil; import org.apache.hudi.utilities.HDFSParquetImporter.FormatValidator; import org.apache.hudi.utilities.UtilHelpers; - +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; import org.apache.spark.launcher.SparkLauncher; import org.apache.spark.util.Utils; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; - +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import scala.collection.JavaConverters; /** * CLI command for importing parquet table to hudi table. + * + * @see HoodieDeltaStreamer + * @deprecated This utility is deprecated in 0.10.0 and will be removed in 0.11.0. Use {@link HoodieDeltaStreamer.Config#runBootstrap} instead. 
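// Editor's note: a hedged sketch of the constructor-to-builder change made in
// buildFileSystemView above. Older code created the meta client with
// "new HoodieTableMetaClient(conf, basePath, true)"; newer code goes through the fluent
// builder. Assumes hudi-common is on the classpath and basePath points at an existing Hudi
// table; the helper name "loadMetaClient" is illustrative, not a Hudi API.
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;

class MetaClientBuilderSketch {
  static HoodieTableMetaClient loadMetaClient(Configuration hadoopConf, String basePath) {
    // setLoadActiveTimelineOnLoad(true) mirrors the third constructor argument used before.
    return HoodieTableMetaClient.builder()
        .setConf(hadoopConf)
        .setBasePath(basePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
  }
}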
*/ -@Component -public class HDFSParquetImportCommand implements CommandMarker { +@ShellComponent +public class HDFSParquetImportCommand { - @CliCommand(value = "hdfsparquetimport", help = "Imports Parquet table to a hoodie table") + @ShellMethod(key = "hdfsparquetimport", value = "Imports Parquet table to a hoodie table") public String convert( - @CliOption(key = "upsert", unspecifiedDefaultValue = "false", + @ShellOption(value = "--upsert", defaultValue = "false", help = "Uses upsert API instead of the default insert API of WriteClient") boolean useUpsert, - @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input table") final String srcPath, - @CliOption(key = "targetPath", mandatory = true, + @ShellOption(value = "--srcPath", help = "Base path for the input table") final String srcPath, + @ShellOption(value = "--targetPath", help = "Base path for the target hoodie table") final String targetPath, - @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName, - @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType, - @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField, - @CliOption(key = "partitionPathField", mandatory = true, + @ShellOption(value = "--tableName", help = "Table name") final String tableName, + @ShellOption(value = "--tableType", help = "Table type") final String tableType, + @ShellOption(value = "--rowKeyField", help = "Row key field name") final String rowKeyField, + @ShellOption(value = "--partitionPathField", defaultValue = "", help = "Partition path field name") final String partitionPathField, - @CliOption(key = {"parallelism"}, mandatory = true, + @ShellOption(value = {"--parallelism"}, help = "Parallelism for hoodie insert") final String parallelism, - @CliOption(key = "schemaFilePath", mandatory = true, + @ShellOption(value = "--schemaFilePath", help = "Path for Avro schema file") final String schemaFilePath, - @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory, - @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry, - @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for importing", - unspecifiedDefaultValue = "") final String propsFilePath, - @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", - unspecifiedDefaultValue = "") final String[] configs) throws Exception { + @ShellOption(value = "--format", help = "Format for the input data") final String format, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", help = "Spark executor memory") final String sparkMemory, + @ShellOption(value = "--retry", help = "Number of retries") final String retry, + @ShellOption(value = "--propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for importing", + defaultValue = "") final String propsFilePath, + @ShellOption(value = "--hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an 
array", + defaultValue = "") final String[] configs) throws Exception { (new FormatValidator()).validate("format", format); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java index e53dd38891604..56e00aa24cd7c 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java @@ -18,10 +18,17 @@ package org.apache.hudi.cli.commands; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; @@ -36,21 +43,16 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieMemoryConfig; - -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroSchemaConverter; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; +import scala.Tuple2; +import scala.Tuple3; import java.io.IOException; import java.util.ArrayList; @@ -59,26 +61,26 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; -import scala.Tuple2; -import scala.Tuple3; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; /** * CLI command to display log file options. 
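// Editor's note: a condensed sketch of the annotation migration this patch applies across
// hudi-cli. @Component/CommandMarker/@CliCommand/@CliOption (Spring Shell 1.x) become
// @ShellComponent/@ShellMethod/@ShellOption, option keys gain an explicit "--" prefix, and
// unspecifiedDefaultValue becomes defaultValue. With @ShellOption, an option with no
// defaultValue is required (the old mandatory = true), a concrete defaultValue makes it
// optional, and ShellOption.NULL yields a null default the command can resolve itself.
// Toy command for illustration only; it assumes spring-shell-starter is on the classpath
// and that a Spring Shell application picks the class up via component scanning.
import org.springframework.shell.standard.ShellComponent;
import org.springframework.shell.standard.ShellMethod;
import org.springframework.shell.standard.ShellOption;

@ShellComponent
public class OptionMigrationSketch {

  // Invoked as: describe table --tableName trips --limit 10
  @ShellMethod(key = "describe table", value = "Show how required and optional options behave")
  public String describeTable(
      // Required: no defaultValue, so the shell rejects the command if --tableName is missing.
      @ShellOption(value = "--tableName", help = "Table name") final String tableName,
      // Optional with a concrete default, mirroring the old unspecifiedDefaultValue = "-1".
      @ShellOption(value = "--limit", defaultValue = "-1", help = "Row limit") final Integer limit,
      // Optional with a null default, resolved inside the command body.
      @ShellOption(value = "--asOf", defaultValue = ShellOption.NULL, help = "Instant time") final String asOf) {
    String effectiveAsOf = (asOf == null) ? "latest" : asOf;
    return String.format("table=%s, limit=%d, asOf=%s", tableName, limit, effectiveAsOf);
  }
}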
*/ -@Component -public class HoodieLogFileCommand implements CommandMarker { +@ShellComponent +public class HoodieLogFileCommand { - @CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files") + @ShellMethod(key = "show logfile metadata", value = "Read commit metadata from log files") public String showLogFileCommits( - @CliOption(key = "logFilePathPattern", mandatory = true, + @ShellOption(value = "--logFilePathPattern", help = "Fully qualified path for the log file") final String logFilePathPattern, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { FileSystem fs = HoodieCLI.getTableMetaClient().getFs(); @@ -99,7 +101,7 @@ public String showLogFileCommits( while (reader.hasNext()) { HoodieLogBlock n = reader.next(); String instantTime; - int recordCount = 0; + AtomicInteger recordCount = new AtomicInteger(0); if (n instanceof HoodieCorruptBlock) { try { instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME); @@ -119,17 +121,19 @@ public String showLogFileCommits( instantTime = "dummy_instant_time_" + dummyInstantTimeCount; } if (n instanceof HoodieDataBlock) { - recordCount = ((HoodieDataBlock) n).getRecords().size(); + try (ClosableIterator recordItr = ((HoodieDataBlock) n).getRecordIterator()) { + recordItr.forEachRemaining(r -> recordCount.incrementAndGet()); + } } } if (commitCountAndMetadata.containsKey(instantTime)) { commitCountAndMetadata.get(instantTime).add( - new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); + new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get())); } else { List, Map>, Integer>> list = new ArrayList<>(); list.add( - new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); + new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get())); commitCountAndMetadata.put(instantTime, list); } } @@ -161,14 +165,14 @@ public String showLogFileCommits( return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } - @CliCommand(value = "show logfile records", help = "Read records from log files") + @ShellMethod(key = "show logfile records", value = "Read records from log files") public String showLogFileRecords( - @CliOption(key = {"limit"}, help = "Limit commits", - unspecifiedDefaultValue = "10") final Integer limit, - @CliOption(key = "logFilePathPattern", mandatory = true, + @ShellOption(value = {"--limit"}, help = "Limit commits", + defaultValue = "10") final Integer limit, + @ShellOption(value = "--logFilePathPattern", help = 
"Fully qualified paths for the log files") final String logFilePathPattern, - @CliOption(key = "mergeRecords", help = "If the records in the log files should be merged", - unspecifiedDefaultValue = "false") final Boolean shouldMerge) + @ShellOption(value = "--mergeRecords", help = "If the records in the log files should be merged", + defaultValue = "false") final Boolean shouldMerge) throws IOException { System.out.println("===============> Showing only " + limit + " records <==============="); @@ -180,7 +184,7 @@ public String showLogFileRecords( .collect(Collectors.toList()); // logFilePaths size must > 1 - assert logFilePaths.size() > 0 : "There is no log file"; + checkArgument(logFilePaths.size() > 0, "There is no log file"); // TODO : readerSchema can change across blocks/log files, fix this inside Scanner AvroSchemaConverter converter = new AvroSchemaConverter(); @@ -203,14 +207,16 @@ public String showLogFileRecords( .getCommitTimeline().lastInstant().get().getTimestamp()) .withReadBlocksLazily( Boolean.parseBoolean( - HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)) + HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue())) .withReverseReader( Boolean.parseBoolean( - HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED)) - .withBufferSize(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE) + HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue())) + .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue()) .withMaxMemorySizeInBytes( HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES) - .withSpillableMapBasePath(HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH) + .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) + .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) .build(); for (HoodieRecord hoodieRecord : scanner) { Option record = hoodieRecord.getData().getInsertValue(readerSchema); @@ -229,11 +235,12 @@ public String showLogFileRecords( HoodieLogBlock n = reader.next(); if (n instanceof HoodieDataBlock) { HoodieDataBlock blk = (HoodieDataBlock) n; - List records = blk.getRecords(); - for (IndexedRecord record : records) { - if (allRecords.size() < limit) { - allRecords.add(record); - } + try (ClosableIterator recordItr = blk.getRecordIterator()) { + recordItr.forEachRemaining(record -> { + if (allRecords.size() < limit) { + allRecords.add(record); + } + }); } } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java deleted file mode 100644 index 66c2eb02159e4..0000000000000 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncCommand.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.cli.commands; - -import org.apache.hudi.cli.HoodieCLI; -import org.apache.hudi.cli.utils.CommitUtil; -import org.apache.hudi.cli.utils.HiveUtil; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.exception.HoodieException; - -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; - -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; - -/** - * CLI command to display sync options. - */ -@Component -public class HoodieSyncCommand implements CommandMarker { - - @CliCommand(value = "sync validate", help = "Validate the sync by counting the number of records") - public String validateSync( - @CliOption(key = {"mode"}, unspecifiedDefaultValue = "complete", help = "Check mode") final String mode, - @CliOption(key = {"sourceDb"}, unspecifiedDefaultValue = "rawdata", help = "source database") final String srcDb, - @CliOption(key = {"targetDb"}, unspecifiedDefaultValue = "dwh_hoodie", - help = "target database") final String tgtDb, - @CliOption(key = {"partitionCount"}, unspecifiedDefaultValue = "5", - help = "total number of recent partitions to validate") final int partitionCount, - @CliOption(key = {"hiveServerUrl"}, mandatory = true, - help = "hiveServerURL to connect to") final String hiveServerUrl, - @CliOption(key = {"hiveUser"}, unspecifiedDefaultValue = "", - help = "hive username to connect to") final String hiveUser, - @CliOption(key = {"hivePass"}, mandatory = true, unspecifiedDefaultValue = "", - help = "hive password to connect to") final String hivePass) - throws Exception { - if (HoodieCLI.syncTableMetadata == null) { - throw new HoodieException("Sync validate request target table not null."); - } - HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; - HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline(); - HoodieTableMetaClient source = HoodieCLI.getTableMetaClient(); - HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline(); - long sourceCount = 0; - long targetCount = 0; - if ("complete".equals(mode)) { - sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass); - targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass); - } else if ("latestPartitions".equals(mode)) { - sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass); - targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass); - } - - String targetLatestCommit = - targetTimeline.getInstants().iterator().hasNext() ? targetTimeline.lastInstant().get().getTimestamp() : "0"; - String sourceLatestCommit = - sourceTimeline.getInstants().iterator().hasNext() ? 
sourceTimeline.lastInstant().get().getTimestamp() : "0"; - - if (sourceLatestCommit != null - && HoodieTimeline.compareTimestamps(targetLatestCommit, HoodieTimeline.GREATER_THAN, sourceLatestCommit)) { - // source is behind the target - return getString(target, targetTimeline, source, sourceCount, targetCount, sourceLatestCommit); - } else { - return getString(source, sourceTimeline, target, targetCount, sourceCount, targetLatestCommit); - - } - } - - private String getString(HoodieTableMetaClient target, HoodieTimeline targetTimeline, HoodieTableMetaClient source, long sourceCount, long targetCount, String sourceLatestCommit) - throws IOException { - List commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) - .getInstants().collect(Collectors.toList()); - if (commitsToCatchup.isEmpty()) { - return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" - + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount); - } else { - long newInserts = CommitUtil.countNewRecords(target, - commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList())); - return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" - + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount) + ". Catch up count is " - + newInserts; - } - } - -} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncValidateCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncValidateCommand.java new file mode 100644 index 0000000000000..0fc26a55b8990 --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieSyncValidateCommand.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.utils.HiveUtil; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.exception.HoodieException; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.hudi.cli.utils.CommitUtil.countNewRecords; + +/** + * CLI command to display sync options. 
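// Editor's note: a JDK-only sketch of the counting change made in showLogFileCommits earlier
// in this patch. Because records now come from an iterator consumed inside a lambda, the
// running count is kept in an AtomicInteger (a plain local int cannot be mutated from a
// lambda). Illustration only; the iterator here stands in for the data block's record iterator.
import java.util.Arrays;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;

class RecordCountSketch {
  public static void main(String[] args) {
    Iterator<String> records = Arrays.asList("r1", "r2", "r3").iterator();
    AtomicInteger recordCount = new AtomicInteger(0);
    // forEachRemaining takes a Consumer, so the counter must be effectively final yet mutable.
    records.forEachRemaining(r -> recordCount.incrementAndGet());
    System.out.println("records in block: " + recordCount.get()); // 3
  }
}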
+ */ +@ShellComponent +public class HoodieSyncValidateCommand { + + @ShellMethod(key = "sync validate", value = "Validate the sync by counting the number of records") + public String validateSync( + @ShellOption(value = {"--mode"}, defaultValue = "complete", help = "Check mode") final String mode, + @ShellOption(value = {"--sourceDb"}, defaultValue = "rawdata", help = "source database") final String srcDb, + @ShellOption(value = {"--targetDb"}, defaultValue = "dwh_hoodie", + help = "target database") final String tgtDb, + @ShellOption(value = {"--partitionCount"}, defaultValue = "5", + help = "total number of recent partitions to validate") final int partitionCount, + @ShellOption(value = {"--hiveServerUrl"}, + help = "hiveServerURL to connect to") final String hiveServerUrl, + @ShellOption(value = {"--hiveUser"}, defaultValue = "", + help = "hive username to connect to") final String hiveUser, + @ShellOption(value = {"--hivePass"}, defaultValue = "", + help = "hive password to connect to") final String hivePass) + throws Exception { + if (HoodieCLI.syncTableMetadata == null) { + throw new HoodieException("Sync validate request target table not null."); + } + HoodieTableMetaClient target = HoodieCLI.syncTableMetadata; + HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline(); + HoodieTableMetaClient source = HoodieCLI.getTableMetaClient(); + HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline(); + long sourceCount = 0; + long targetCount = 0; + if ("complete".equals(mode)) { + sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, hiveUser, hivePass); + targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, hiveUser, hivePass); + } else if ("latestPartitions".equals(mode)) { + sourceCount = HiveUtil.countRecords(hiveServerUrl, source, srcDb, partitionCount, hiveUser, hivePass); + targetCount = HiveUtil.countRecords(hiveServerUrl, target, tgtDb, partitionCount, hiveUser, hivePass); + } + + String targetLatestCommit = + targetTimeline.getInstants().iterator().hasNext() ? targetTimeline.lastInstant().get().getTimestamp() : "0"; + String sourceLatestCommit = + sourceTimeline.getInstants().iterator().hasNext() ? 
sourceTimeline.lastInstant().get().getTimestamp() : "0"; + + if (sourceLatestCommit != null + && HoodieTimeline.compareTimestamps(targetLatestCommit, HoodieTimeline.GREATER_THAN, sourceLatestCommit)) { + // source is behind the target + return getString(target, targetTimeline, source, sourceCount, targetCount, sourceLatestCommit); + } else { + return getString(source, sourceTimeline, target, targetCount, sourceCount, targetLatestCommit); + + } + } + + private String getString(HoodieTableMetaClient target, HoodieTimeline targetTimeline, HoodieTableMetaClient source, long sourceCount, long targetCount, String sourceLatestCommit) + throws IOException { + List commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE) + .getInstants().collect(Collectors.toList()); + if (commitsToCatchup.isEmpty()) { + return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" + + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount); + } else { + long newInserts = countNewRecords(target, + commitsToCatchup.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList())); + return "Count difference now is (count(" + target.getTableConfig().getTableName() + ") - count(" + + source.getTableConfig().getTableName() + ") == " + (targetCount - sourceCount) + ". Catch up count is " + + newInserts; + } + } + +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/KerberosAuthenticationCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/KerberosAuthenticationCommand.java new file mode 100644 index 0000000000000..d79279a315809 --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/KerberosAuthenticationCommand.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.commands; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.security.UserGroupInformation; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; + +import java.io.IOException; + +/** + * CLI command to perform Kerberos authentication. 
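+ *
+ * Illustrative usage from the hudi-cli shell (a sketch only; the principal and keytab path below are placeholder values):
+ * > kerberos kinit --krb5conf /etc/krb5.conf --principal hudi@EXAMPLE.COM --keytab /etc/security/keytabs/hudi.keytab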
+ */ +@ShellComponent +public class KerberosAuthenticationCommand { + + @ShellMethod(key = "kerberos kinit", value = "Perform Kerberos authentication") + public String performKerberosAuthentication( + @ShellOption(value = "--krb5conf", help = "Path to krb5.conf", defaultValue = "/etc/krb5.conf") String krb5ConfPath, + @ShellOption(value = "--principal", help = "Kerberos principal") String principal, + @ShellOption(value = "--keytab", help = "Path to keytab") String keytabPath) throws IOException { + + System.out.println("Perform Kerberos authentication"); + System.out.println("Parameters:"); + System.out.println("--krb5conf: " + krb5ConfPath); + System.out.println("--principal: " + principal); + System.out.println("--keytab: " + keytabPath); + + System.setProperty("java.security.krb5.conf", krb5ConfPath); + Configuration conf = new Configuration(); + conf.set("hadoop.security.authentication", "kerberos"); + conf.set("keytab.file", keytabPath); + conf.set("kerberos.principal", principal); + UserGroupInformation.setConfiguration(conf); + UserGroupInformation.loginUserFromKeytab(principal, keytabPath); + + System.out.println("Kerberos current user: " + UserGroupInformation.getCurrentUser()); + System.out.println("Kerberos login user: " + UserGroupInformation.getLoginUser()); + + return "Kerberos authentication success"; + } +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MarkersCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MarkersCommand.java new file mode 100644 index 0000000000000..008c61aa9a84b --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MarkersCommand.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.utils.InputStreamConsumer; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.spark.launcher.SparkLauncher; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; + +/** + * CLI command for marker options. 
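+ *
+ * Illustrative usage from the hudi-cli shell (a sketch only; the instant time below is a placeholder value):
+ * > marker delete --commit 20230305093045123 --sparkMaster local[2]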
+ */ +@ShellComponent +public class MarkersCommand { + + @ShellMethod(key = "marker delete", value = "Delete the marker") + public String deleteMarker( + @ShellOption(value = {"--commit"}, help = "Delete a marker") final String instantTime, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") final String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "1G", + help = "Spark executor memory") final String sparkMemory) + throws Exception { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.DELETE_MARKER.toString(), master, sparkMemory, instantTime, + metaClient.getBasePath()); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + // Refresh the current + HoodieCLI.refreshTableMetadata(); + if (exitCode != 0) { + return String.format("Failed: Could not delete marker \"%s\".", instantTime); + } + return String.format("Marker \"%s\" deleted.", instantTime); + } +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java new file mode 100644 index 0000000000000..65b01bb2545e4 --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.cli.commands; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieBackedTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * CLI commands to operate on the Metadata Table. + *
+ *
+ * Example: + * The default spark.master conf is set to yarn. If you are running on a local deployment, + * you can set the spark master to local using the set conf command. + * > set --conf SPARK_MASTER=local[2] + *
+ * Connect to the table + * > connect --path {path to hudi table} + *
+ * Run metadata commands + * > metadata list-partitions + */ +@ShellComponent +public class MetadataCommand { + + private static final Logger LOG = LogManager.getLogger(MetadataCommand.class); + private static String metadataBaseDirectory; + private JavaSparkContext jsc; + + /** + * Sets the directory to store/read Metadata Table. + *
+ * This can be used to store the metadata table away from the dataset directory. + * - Useful for testing as well as for using via the HUDI CLI so that the actual dataset is not written to. + * - Useful for testing Metadata Table performance and operations on existing datasets before enabling. + */ + public static void setMetadataBaseDirectory(String metadataDir) { + ValidationUtils.checkState(metadataBaseDirectory == null, + "metadataBaseDirectory is already set to " + metadataBaseDirectory); + metadataBaseDirectory = metadataDir; + } + + public static String getMetadataTableBasePath(String tableBasePath) { + if (metadataBaseDirectory != null) { + return metadataBaseDirectory; + } + return HoodieTableMetadata.getMetadataTableBasePath(tableBasePath); + } + + @ShellMethod(key = "metadata set", value = "Set options for Metadata Table") + public String set(@ShellOption(value = {"--metadataDir"}, + help = "Directory to read/write metadata table (can be different from dataset)", defaultValue = "") final String metadataDir) { + if (!metadataDir.isEmpty()) { + setMetadataBaseDirectory(metadataDir); + } + + return "Ok"; + } + + @ShellMethod(key = "metadata create", value = "Create the Metadata Table if it does not exist") + public String create( + @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master + ) throws IOException { + HoodieCLI.getTableMetaClient(); + Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath)); + try { + FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath); + if (statuses.length > 0) { + throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") not empty."); + } + } catch (FileNotFoundException e) { + // Metadata directory does not exist yet + HoodieCLI.fs.mkdirs(metadataPath); + } + + HoodieTimer timer = new HoodieTimer().startTimer(); + HoodieWriteConfig writeConfig = getWriteConfig(); + initJavaSparkContext(Option.of(master)); + SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc)); + return String.format("Created Metadata Table in %s (duration=%.2f secs)", metadataPath, timer.endTimer() / 1000.0); + } + + @ShellMethod(key = "metadata delete", value = "Remove the Metadata Table") + public String delete() throws Exception { + HoodieCLI.getTableMetaClient(); + Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath)); + try { + FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath); + if (statuses.length > 0) { + HoodieCLI.fs.delete(metadataPath, true); + } + } catch (FileNotFoundException e) { + // Metadata directory does not exist + } + + return String.format("Removed Metadata Table from %s", metadataPath); + } + + @ShellMethod(key = "metadata init", value = "Update the metadata table from commits since the creation") + public String init(@ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master, + @ShellOption(value = {"--readonly"}, defaultValue = "false", + help = "Open in read-only mode") final boolean readOnly) throws Exception { + HoodieCLI.getTableMetaClient(); + Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath)); + try { + HoodieCLI.fs.listStatus(metadataPath); + } catch (FileNotFoundException e) { + // Metadata directory does not exist + throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") does not exist."); + } + + HoodieTimer timer = new 
HoodieTimer().startTimer(); + if (!readOnly) { + HoodieWriteConfig writeConfig = getWriteConfig(); + initJavaSparkContext(Option.of(master)); + SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc)); + } + + String action = readOnly ? "Opened" : "Initialized"; + return String.format(action + " Metadata Table in %s (duration=%.2fsec)", metadataPath, (timer.endTimer()) / 1000.0); + } + + @ShellMethod(key = "metadata stats", value = "Print stats about the metadata") + public String stats() throws IOException { + HoodieCLI.getTableMetaClient(); + HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build(); + HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(HoodieCLI.conf), + config, HoodieCLI.basePath, "/tmp"); + Map stats = metadata.stats(); + + final List rows = new ArrayList<>(); + for (Map.Entry entry : stats.entrySet()) { + Comparable[] row = new Comparable[2]; + row[0] = entry.getKey(); + row[1] = entry.getValue(); + rows.add(row); + } + + TableHeader header = new TableHeader() + .addTableHeaderField("stat key") + .addTableHeaderField("stat value"); + return HoodiePrintHelper.print(header, new HashMap<>(), "", + false, Integer.MAX_VALUE, false, rows); + } + + @ShellMethod(key = "metadata list-partitions", value = "List all partitions from metadata") + public String listPartitions( + @ShellOption(value = "--sparkMaster", defaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master + ) throws IOException { + HoodieCLI.getTableMetaClient(); + initJavaSparkContext(Option.of(master)); + HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build(); + HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc), config, + HoodieCLI.basePath, "/tmp"); + + if (!metadata.enabled()) { + return "[ERROR] Metadata Table not enabled/initialized\n\n"; + } + + HoodieTimer timer = new HoodieTimer().startTimer(); + List partitions = metadata.getAllPartitionPaths(); + LOG.debug("Took " + timer.endTimer() + " ms"); + + final List rows = new ArrayList<>(); + partitions.stream().sorted(Comparator.reverseOrder()).forEach(p -> { + Comparable[] row = new Comparable[1]; + row[0] = p; + rows.add(row); + }); + + TableHeader header = new TableHeader().addTableHeaderField("partition"); + return HoodiePrintHelper.print(header, new HashMap<>(), "", + false, Integer.MAX_VALUE, false, rows); + } + + @ShellMethod(key = "metadata list-files", value = "Print a list of all files in a partition from the metadata") + public String listFiles( + @ShellOption(value = {"--partition"}, help = "Name of the partition to list files", defaultValue = "") final String partition) throws IOException { + HoodieCLI.getTableMetaClient(); + HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build(); + HoodieBackedTableMetadata metaReader = new HoodieBackedTableMetadata( + new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp"); + + if (!metaReader.enabled()) { + return "[ERROR] Metadata Table not enabled/initialized\n\n"; + } + + Path partitionPath = new Path(HoodieCLI.basePath); + if (!StringUtils.isNullOrEmpty(partition)) { + partitionPath = new Path(HoodieCLI.basePath, partition); + } + + HoodieTimer timer = new HoodieTimer().startTimer(); + FileStatus[] statuses = metaReader.getAllFilesInPartition(partitionPath); + LOG.debug("Took " + timer.endTimer() + " ms"); + + final List 
rows = new ArrayList<>(); + Arrays.stream(statuses).sorted((p1, p2) -> p2.getPath().getName().compareTo(p1.getPath().getName())).forEach(f -> { + Comparable[] row = new Comparable[1]; + row[0] = f; + rows.add(row); + }); + + TableHeader header = new TableHeader().addTableHeaderField("file path"); + return HoodiePrintHelper.print(header, new HashMap<>(), "", + false, Integer.MAX_VALUE, false, rows); + } + + @ShellMethod(key = "metadata validate-files", value = "Validate all files in all partitions from the metadata") + public String validateFiles( + @ShellOption(value = {"--verbose"}, help = "Print all file details", defaultValue = "false") final boolean verbose) throws IOException { + HoodieCLI.getTableMetaClient(); + HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build(); + HoodieBackedTableMetadata metadataReader = new HoodieBackedTableMetadata( + new HoodieLocalEngineContext(HoodieCLI.conf), config, HoodieCLI.basePath, "/tmp"); + + if (!metadataReader.enabled()) { + return "[ERROR] Metadata Table not enabled/initialized\n\n"; + } + + HoodieMetadataConfig fsConfig = HoodieMetadataConfig.newBuilder().enable(false).build(); + HoodieBackedTableMetadata fsMetaReader = new HoodieBackedTableMetadata( + new HoodieLocalEngineContext(HoodieCLI.conf), fsConfig, HoodieCLI.basePath, "/tmp"); + + HoodieTimer timer = new HoodieTimer().startTimer(); + List metadataPartitions = metadataReader.getAllPartitionPaths(); + LOG.debug("Listing partitions Took " + timer.endTimer() + " ms"); + List fsPartitions = fsMetaReader.getAllPartitionPaths(); + Collections.sort(fsPartitions); + Collections.sort(metadataPartitions); + + Set allPartitions = new HashSet<>(); + allPartitions.addAll(fsPartitions); + allPartitions.addAll(metadataPartitions); + + if (!fsPartitions.equals(metadataPartitions)) { + LOG.error("FS partition listing is not matching with metadata partition listing!"); + LOG.error("All FS partitions: " + Arrays.toString(fsPartitions.toArray())); + LOG.error("All Metadata partitions: " + Arrays.toString(metadataPartitions.toArray())); + } + + final List rows = new ArrayList<>(); + for (String partition : allPartitions) { + Map fileStatusMap = new HashMap<>(); + Map metadataFileStatusMap = new HashMap<>(); + FileStatus[] metadataStatuses = metadataReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition)); + Arrays.stream(metadataStatuses).forEach(entry -> metadataFileStatusMap.put(entry.getPath().getName(), entry)); + FileStatus[] fsStatuses = fsMetaReader.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition)); + Arrays.stream(fsStatuses).forEach(entry -> fileStatusMap.put(entry.getPath().getName(), entry)); + + Set allFiles = new HashSet<>(); + allFiles.addAll(fileStatusMap.keySet()); + allFiles.addAll(metadataFileStatusMap.keySet()); + + for (String file : allFiles) { + Comparable[] row = new Comparable[6]; + row[0] = partition; + FileStatus fsFileStatus = fileStatusMap.get(file); + FileStatus metaFileStatus = metadataFileStatusMap.get(file); + boolean doesFsFileExists = fsFileStatus != null; + boolean doesMetadataFileExists = metaFileStatus != null; + long fsFileLength = doesFsFileExists ? fsFileStatus.getLen() : 0; + long metadataFileLength = doesMetadataFileExists ? 
metaFileStatus.getLen() : 0; + row[1] = file; + row[2] = doesFsFileExists; + row[3] = doesMetadataFileExists; + row[4] = fsFileLength; + row[5] = metadataFileLength; + if (verbose) { // if verbose print all files + rows.add(row); + } else if ((doesFsFileExists != doesMetadataFileExists) || (fsFileLength != metadataFileLength)) { // if non verbose, print only non matching files + rows.add(row); + } + } + + if (metadataStatuses.length != fsStatuses.length) { + LOG.error(" FS and metadata files count not matching for " + partition + ". FS files count " + fsStatuses.length + ", metadata base files count " + + metadataStatuses.length); + } + + for (Map.Entry entry : fileStatusMap.entrySet()) { + if (!metadataFileStatusMap.containsKey(entry.getKey())) { + LOG.error("FS file not found in metadata " + entry.getKey()); + } else { + if (entry.getValue().getLen() != metadataFileStatusMap.get(entry.getKey()).getLen()) { + LOG.error(" FS file size mismatch " + entry.getKey() + ", size equality " + + (entry.getValue().getLen() == metadataFileStatusMap.get(entry.getKey()).getLen()) + + ". FS size " + entry.getValue().getLen() + ", metadata size " + + metadataFileStatusMap.get(entry.getKey()).getLen()); + } + } + } + for (Map.Entry entry : metadataFileStatusMap.entrySet()) { + if (!fileStatusMap.containsKey(entry.getKey())) { + LOG.error("Metadata file not found in FS " + entry.getKey()); + } else { + if (entry.getValue().getLen() != fileStatusMap.get(entry.getKey()).getLen()) { + LOG.error(" Metadata file size mismatch " + entry.getKey() + ", size equality " + + (entry.getValue().getLen() == fileStatusMap.get(entry.getKey()).getLen()) + + ". Metadata size " + entry.getValue().getLen() + ", FS size " + + metadataFileStatusMap.get(entry.getKey()).getLen()); + } + } + } + } + TableHeader header = new TableHeader().addTableHeaderField("Partition") + .addTableHeaderField("File Name") + .addTableHeaderField(" Is Present in FS ") + .addTableHeaderField(" Is Present in Metadata") + .addTableHeaderField(" FS size") + .addTableHeaderField(" Metadata size"); + return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows); + } + + private HoodieWriteConfig getWriteConfig() { + return HoodieWriteConfig.newBuilder().withPath(HoodieCLI.basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); + } + + private void initJavaSparkContext(Option userDefinedMaster) { + if (jsc == null) { + jsc = SparkUtil.initJavaSparkContext(SparkUtil.getDefaultConf("HoodieCLI", userDefinedMaster)); + } + } +} \ No newline at end of file diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java index 40dddfc725488..2b11e20a10d42 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java @@ -18,33 +18,35 @@ package org.apache.hudi.cli.commands; +import org.apache.avro.AvroRuntimeException; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.DeDupeType; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.utils.InputStreamConsumer; import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodiePartitionMetadata; import 
org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.CleanerUtils; -import org.apache.hudi.exception.HoodieIOException; - -import org.apache.avro.AvroRuntimeException; -import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.PartitionPathEncodeUtils; import org.apache.hudi.common.util.StringUtils; -import org.apache.log4j.Logger; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.spark.launcher.SparkLauncher; import org.apache.spark.util.Utils; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import scala.collection.JavaConverters; -import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.List; @@ -58,29 +60,29 @@ /** * CLI command to display and trigger repair options. */ -@Component -public class RepairsCommand implements CommandMarker { +@ShellComponent +public class RepairsCommand { - private static final Logger LOG = Logger.getLogger(RepairsCommand.class); + private static final Logger LOG = LogManager.getLogger(RepairsCommand.class); public static final String DEDUPLICATE_RETURN_PREFIX = "Deduplicated files placed in: "; - @CliCommand(value = "repair deduplicate", - help = "De-duplicate a partition path contains duplicates & produce repaired files to replace with") + @ShellMethod(key = "repair deduplicate", + value = "De-duplicate a partition path contains duplicates & produce repaired files to replace with") public String deduplicate( - @CliOption(key = {"duplicatedPartitionPath"}, help = "Partition Path containing the duplicates", - mandatory = true) final String duplicatedPartitionPath, - @CliOption(key = {"repairedOutputPath"}, help = "Location to place the repaired files", - mandatory = true) final String repairedOutputPath, - @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path", - unspecifiedDefaultValue = "") String sparkPropertiesPath, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + @ShellOption(value = {"--duplicatedPartitionPath"}, defaultValue = "", help = "Partition Path containing the duplicates") + final String duplicatedPartitionPath, + @ShellOption(value = {"--repairedOutputPath"}, help = "Location to place the repaired files") + final String repairedOutputPath, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", help = "Spark executor memory") final String sparkMemory, - @CliOption(key = {"dryrun"}, + @ShellOption(value = {"--dryrun"}, help = "Should we actually remove duplicates or just run and store result to repairedOutputPath", - 
unspecifiedDefaultValue = "true") final boolean dryRun, - @CliOption(key = {"dedupeType"}, help = "Valid values are - insert_type, update_type and upsert_type", - unspecifiedDefaultValue = "insert_type") final String dedupeType) + defaultValue = "true") final boolean dryRun, + @ShellOption(value = {"--dedupeType"}, help = "Valid values are - insert_type, update_type and upsert_type", + defaultValue = "insert_type") final String dedupeType) throws Exception { if (!DeDupeType.values().contains(DeDupeType.withName(dedupeType))) { throw new IllegalArgumentException("Please provide valid dedupe type!"); @@ -108,10 +110,10 @@ public String deduplicate( } } - @CliCommand(value = "repair addpartitionmeta", help = "Add partition metadata to a table, if not present") + @ShellMethod(key = "repair addpartitionmeta", value = "Add partition metadata to a table, if not present") public String addPartitionMeta( - @CliOption(key = {"dryrun"}, help = "Should we actually add or just print what would be done", - unspecifiedDefaultValue = "true") final boolean dryRun) + @ShellOption(value = {"--dryrun"}, help = "Should we actually add or just print what would be done", + defaultValue = "true") final boolean dryRun) throws IOException { HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); @@ -133,7 +135,8 @@ public String addPartitionMeta( row[1] = "No"; if (!dryRun) { HoodiePartitionMetadata partitionMetadata = - new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, partitionPath); + new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, partitionPath, + client.getTableConfig().getPartitionMetafileFormat()); partitionMetadata.trySave(0); row[2] = "Repaired"; } @@ -145,17 +148,21 @@ public String addPartitionMeta( HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows); } - @CliCommand(value = "repair overwrite-hoodie-props", help = "Overwrite hoodie.properties with provided file. Risky operation. Proceed with caution!") + @ShellMethod(key = "repair overwrite-hoodie-props", + value = "Overwrite hoodie.properties with provided file. Risky operation. 
Proceed with caution!") public String overwriteHoodieProperties( - @CliOption(key = {"new-props-file"}, help = "Path to a properties file on local filesystem to overwrite the table's hoodie.properties with") + @ShellOption(value = {"--new-props-file"}, + help = "Path to a properties file on local filesystem to overwrite the table's hoodie.properties with") final String overwriteFilePath) throws IOException { HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); Properties newProps = new Properties(); - newProps.load(new FileInputStream(new File(overwriteFilePath))); - Map oldProps = client.getTableConfig().getProps(); + newProps.load(new FileInputStream(overwriteFilePath)); + Map oldProps = client.getTableConfig().propsMap(); Path metaPathDir = new Path(client.getBasePath(), METAFOLDER_NAME); - HoodieTableConfig.createHoodieProperties(client.getFs(), metaPathDir, newProps); + HoodieTableConfig.create(client.getFs(), metaPathDir, newProps); + // reload new props as checksum would have been added + newProps = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()).getTableConfig().getProps(); TreeSet allPropKeys = new TreeSet<>(); allPropKeys.addAll(newProps.keySet().stream().map(Object::toString).collect(Collectors.toSet())); @@ -164,7 +171,7 @@ public String overwriteHoodieProperties( String[][] rows = new String[allPropKeys.size()][]; int ind = 0; for (String propKey : allPropKeys) { - String[] row = new String[]{ + String[] row = new String[] { propKey, oldProps.getOrDefault(propKey, "null"), newProps.getOrDefault(propKey, "null").toString() @@ -175,7 +182,7 @@ public String overwriteHoodieProperties( HoodieTableHeaderFields.HEADER_OLD_VALUE, HoodieTableHeaderFields.HEADER_NEW_VALUE}, rows); } - @CliCommand(value = "repair corrupted clean files", help = "repair corrupted clean files") + @ShellMethod(key = "repair corrupted clean files", value = "repair corrupted clean files") public void removeCorruptedPendingCleanAction() { HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); @@ -186,15 +193,130 @@ public void removeCorruptedPendingCleanAction() { CleanerUtils.getCleanerPlan(client, instant); } catch (AvroRuntimeException e) { LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant); - FSUtils.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); + HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); } catch (IOException ioe) { if (ioe.getMessage().contains("Not an Avro data file")) { LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant); - FSUtils.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); + HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant); } else { throw new HoodieIOException(ioe.getMessage(), ioe); } } }); } + + @ShellMethod(key = "repair migrate-partition-meta", value = "Migrate all partition meta file currently stored in text format " + + "to be stored in base file format. 
See HoodieTableConfig#PARTITION_METAFILE_USE_DATA_FORMAT.") + public String migratePartitionMeta( + @ShellOption(value = {"--dryrun"}, help = "dry run without modifying anything.", defaultValue = "true") + final boolean dryRun) + throws IOException { + + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(HoodieCLI.conf); + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + List partitionPaths = FSUtils.getAllPartitionPaths(engineContext, client.getBasePath(), false, false); + Path basePath = new Path(client.getBasePath()); + + String[][] rows = new String[partitionPaths.size()][]; + int ind = 0; + for (String partitionPath : partitionPaths) { + Path partition = FSUtils.getPartitionPath(client.getBasePath(), partitionPath); + Option textFormatFile = HoodiePartitionMetadata.textFormatMetaPathIfExists(HoodieCLI.fs, partition); + Option baseFormatFile = HoodiePartitionMetadata.baseFormatMetaPathIfExists(HoodieCLI.fs, partition); + String latestCommit = client.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp(); + + String[] row = new String[] { + partitionPath, + String.valueOf(textFormatFile.isPresent()), + String.valueOf(baseFormatFile.isPresent()), + textFormatFile.isPresent() ? "MIGRATE" : "NONE" + }; + + if (!dryRun) { + if (!baseFormatFile.isPresent()) { + HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(HoodieCLI.fs, latestCommit, basePath, partition, + Option.of(client.getTableConfig().getBaseFileFormat())); + partitionMetadata.trySave(0); + } + + // delete it, in case we failed midway last time. + textFormatFile.ifPresent(path -> { + try { + HoodieCLI.fs.delete(path, false); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + }); + + row[3] = "MIGRATED"; + } + + rows[ind++] = row; + } + + Properties props = new Properties(); + props.setProperty(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(), "true"); + HoodieTableConfig.update(HoodieCLI.fs, new Path(client.getMetaPath()), props); + + return HoodiePrintHelper.print(new String[] { + HoodieTableHeaderFields.HEADER_PARTITION_PATH, + HoodieTableHeaderFields.HEADER_TEXT_METAFILE_PRESENT, + HoodieTableHeaderFields.HEADER_BASE_METAFILE_PRESENT, + HoodieTableHeaderFields.HEADER_ACTION + }, rows); + } + + @ShellMethod(key = "repair deprecated partition", + value = "Repair deprecated partition (\"default\"). 
Re-writes data from the deprecated partition into " + PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH) + public String repairDeprecatePartition( + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", + help = "Spark executor memory") final String sparkMemory) throws Exception { + if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) { + sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + } + + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.REPAIR_DEPRECATED_PARTITION.toString(), master, sparkMemory, + HoodieCLI.getTableMetaClient().getBasePathV2().toString()); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + + if (exitCode != 0) { + return "Deduplication failed!"; + } + return "Repair succeeded"; + } + + @ShellMethod(key = "rename partition", + value = "Rename partition. Usage: rename partition --oldPartition --newPartition ") + public String renamePartition( + @ShellOption(value = {"--oldPartition"}, help = "Partition value to be renamed") String oldPartition, + @ShellOption(value = {"--newPartition"}, help = "New partition value after rename") String newPartition, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", + help = "Spark executor memory") final String sparkMemory) throws Exception { + if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) { + sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + } + + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.RENAME_PARTITION.toString(), master, sparkMemory, + HoodieCLI.getTableMetaClient().getBasePathV2().toString(), oldPartition, newPartition); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + + if (exitCode != 0) { + return "rename partition failed!"; + } + return "rename partition succeeded"; + } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java index faa778943c9b0..e0fad70d99b3e 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RollbacksCommand.java @@ -23,6 +23,8 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.cli.utils.InputStreamConsumer; +import org.apache.hudi.cli.utils.SparkUtil; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -31,11 +33,10 @@ import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.CollectionUtils; 
import org.apache.hudi.common.util.collection.Pair; - -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.apache.spark.launcher.SparkLauncher; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.io.IOException; import java.util.ArrayList; @@ -48,16 +49,16 @@ /** * CLI command to display rollback options. */ -@Component -public class RollbacksCommand implements CommandMarker { +@ShellComponent +public class RollbacksCommand { - @CliCommand(value = "show rollbacks", help = "List all rollback instants") + @ShellMethod(key = "show rollbacks", value = "List all rollback instants") public String showRollbacks( - @CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) { + @ShellOption(value = {"--limit"}, help = "Limit #rows to be displayed", defaultValue = "10") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) { HoodieActiveTimeline activeTimeline = new RollbackTimeline(HoodieCLI.getTableMetaClient()); HoodieTimeline rollback = activeTimeline.getRollbackTimeline().filterCompletedInstants(); @@ -87,14 +88,14 @@ public String showRollbacks( return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } - @CliCommand(value = "show rollback", help = "Show details of a rollback instant") + @ShellMethod(key = "show rollback", value = "Show details of a rollback instant") public String showRollback( - @CliOption(key = {"instant"}, help = "Rollback instant", mandatory = true) String rollbackInstant, - @CliOption(key = {"limit"}, help = "Limit #rows to be displayed", unspecifiedDefaultValue = "10") Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--instant"}, help = "Rollback instant") String rollbackInstant, + @ShellOption(value = {"--limit"}, help = "Limit #rows to be displayed", defaultValue = "10") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { HoodieActiveTimeline activeTimeline = new 
RollbackTimeline(HoodieCLI.getTableMetaClient()); final List rows = new ArrayList<>(); @@ -102,17 +103,17 @@ public String showRollback( activeTimeline.getInstantDetails(new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, rollbackInstant)).get(), HoodieRollbackMetadata.class); metadata.getPartitionMetadata().forEach((key, value) -> Stream - .concat(value.getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)), - value.getFailedDeleteFiles().stream().map(f -> Pair.of(f, false))) - .forEach(fileWithDeleteStatus -> { - Comparable[] row = new Comparable[5]; - row[0] = metadata.getStartRollbackTime(); - row[1] = metadata.getCommitsRollback().toString(); - row[2] = key; - row[3] = fileWithDeleteStatus.getLeft(); - row[4] = fileWithDeleteStatus.getRight(); - rows.add(row); - })); + .concat(value.getSuccessDeleteFiles().stream().map(f -> Pair.of(f, true)), + value.getFailedDeleteFiles().stream().map(f -> Pair.of(f, false))) + .forEach(fileWithDeleteStatus -> { + Comparable[] row = new Comparable[5]; + row[0] = metadata.getStartRollbackTime(); + row[1] = metadata.getCommitsRollback().toString(); + row[2] = key; + row[3] = fileWithDeleteStatus.getLeft(); + row[4] = fileWithDeleteStatus.getRight(); + rows.add(row); + })); TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT) .addTableHeaderField(HoodieTableHeaderFields.HEADER_ROLLBACK_INSTANT) @@ -122,6 +123,38 @@ public String showRollback( return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); } + @ShellMethod(key = "commit rollback", value = "Rollback a commit") + public String rollbackCommit( + @ShellOption(value = {"--commit"}, help = "Commit to rollback") final String instantTime, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") final String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", + help = "Spark executor memory") final String sparkMemory, + @ShellOption(value = "--rollbackUsingMarkers", defaultValue = "false", + help = "Enabling marker based rollback") final String rollbackUsingMarkers) + throws Exception { + HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); + HoodieTimeline filteredTimeline = completedTimeline.filter(instant -> instant.getTimestamp().equals(instantTime)); + if (filteredTimeline.empty()) { + return "Commit " + instantTime + " not found in Commits " + completedTimeline; + } + + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), master, sparkMemory, instantTime, + HoodieCLI.getTableMetaClient().getBasePath(), rollbackUsingMarkers); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + // Refresh the current + HoodieCLI.refreshTableMetadata(); + if (exitCode != 0) { + return "Commit " + instantTime + " failed to roll back"; + } + return "Commit " + instantTime + " rolled back"; + } + /** * An Active timeline containing only rollbacks. 
*/ diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java index e4d7cc69e9eb1..73f94acda8787 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SavepointsCommand.java @@ -23,23 +23,15 @@ import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.utils.InputStreamConsumer; import org.apache.hudi.cli.utils.SparkUtil; -import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.index.HoodieIndex; - -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.launcher.SparkLauncher; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.util.List; import java.util.stream.Collectors; @@ -47,10 +39,10 @@ /** * CLI command to display savepoint options. */ -@Component -public class SavepointsCommand implements CommandMarker { +@ShellComponent +public class SavepointsCommand { - @CliCommand(value = "savepoints show", help = "Show the savepoints") + @ShellMethod(key = "savepoints show", value = "Show the savepoints") public String showSavepoints() { HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); HoodieTimeline timeline = activeTimeline.getSavePointTimeline().filterCompletedInstants(); @@ -63,24 +55,24 @@ public String showSavepoints() { return HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_SAVEPOINT_TIME}, rows); } - @CliCommand(value = "savepoint create", help = "Savepoint a commit") - public String savepoint(@CliOption(key = {"commit"}, help = "Commit to savepoint") final String commitTime, - @CliOption(key = {"user"}, unspecifiedDefaultValue = "default", + @ShellMethod(key = "savepoint create", value = "Savepoint a commit") + public String savepoint( + @ShellOption(value = {"--commit"}, help = "Commit to savepoint") final String commitTime, + @ShellOption(value = {"--user"}, defaultValue = "default", help = "User who is creating the savepoint") final String user, - @CliOption(key = {"comments"}, unspecifiedDefaultValue = "default", + @ShellOption(value = {"--comments"}, defaultValue = "default", help = "Comments for creating the savepoint") final String comments, - @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") final String sparkPropertiesPath, + 
@ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", help = "Spark executor memory") final String sparkMemory) throws Exception { HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); - if (!timeline.containsInstant(commitInstant)) { - return "Commit " + commitTime + " not found in Commits " + timeline; + if (!activeTimeline.getCommitsTimeline().filterCompletedInstants().containsInstant(commitTime)) { + return "Commit " + commitTime + " not found in Commits " + activeTimeline; } SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); @@ -97,12 +89,15 @@ public String savepoint(@CliOption(key = {"commit"}, help = "Commit to savepoint return String.format("The commit \"%s\" has been savepointed.", commitTime); } - @CliCommand(value = "savepoint rollback", help = "Savepoint a commit") + @ShellMethod(key = "savepoint rollback", value = "Savepoint a commit") public String rollbackToSavepoint( - @CliOption(key = {"savepoint"}, help = "Savepoint to rollback") final String instantTime, - @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + @ShellOption(value = {"--savepoint"}, help = "Savepoint to rollback") final String instantTime, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") final String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = {"--lazyFailedWritesCleanPolicy"}, help = "True if FailedWriteCleanPolicy is lazy", + defaultValue = "false") final String lazyFailedWritesCleanPolicy, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", help = "Spark executor memory") final String sparkMemory) throws Exception { HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); @@ -110,16 +105,16 @@ public String rollbackToSavepoint( throw new HoodieException("There are no completed instants to run rollback"); } HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); - HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime); + HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); + List instants = timeline.getInstants().filter(instant -> instant.getTimestamp().equals(instantTime)).collect(Collectors.toList()); - if (!timeline.containsInstant(commitInstant)) { + if (instants.isEmpty()) { return "Commit " + instantTime + " not found in Commits " + timeline; } SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), master, sparkMemory, - instantTime, metaClient.getBasePath()); + instantTime, metaClient.getBasePath(), lazyFailedWritesCleanPolicy); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = 
process.waitFor(); @@ -131,11 +126,13 @@ public String rollbackToSavepoint( return String.format("Savepoint \"%s\" rolled back", instantTime); } - @CliCommand(value = "savepoint delete", help = "Delete the savepoint") - public String deleteSavepoint(@CliOption(key = {"commit"}, help = "Delete a savepoint") final String instantTime, - @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + @ShellMethod(key = "savepoint delete", value = "Delete the savepoint") + public String deleteSavepoint( + @ShellOption(value = {"--commit"}, help = "Delete a savepoint") final String instantTime, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") final String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", help = "Spark executor memory") final String sparkMemory) throws Exception { HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); @@ -162,11 +159,4 @@ public String deleteSavepoint(@CliOption(key = {"commit"}, help = "Delete a save } return String.format("Savepoint \"%s\" deleted.", instantTime); } - - private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); - return new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config, false); - } - } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkEnvCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkEnvCommand.java index 7969808e29831..02778ac2cff60 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkEnvCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkEnvCommand.java @@ -19,11 +19,9 @@ package org.apache.hudi.cli.commands; import org.apache.hudi.cli.HoodiePrintHelper; - -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.util.HashMap; import java.util.Map; @@ -31,37 +29,38 @@ /** * CLI command to set and show spark launcher init env. 
*/ -@Component -public class SparkEnvCommand implements CommandMarker { +@ShellComponent +public class SparkEnvCommand { public static Map env = new HashMap<>(); - @CliCommand(value = "set", help = "Set spark launcher env to cli") - public void setEnv(@CliOption(key = {"conf"}, help = "Env config to be set") final String confMap) { + @ShellMethod(key = "set", value = "Set spark launcher env to cli") + public void setEnv(@ShellOption(value = {"--conf"}, help = "Env config to be set") final String confMap) { String[] map = confMap.split("="); if (map.length != 2) { throw new IllegalArgumentException("Illegal set parameter, please use like [set --conf SPARK_HOME=/usr/etc/spark]"); } env.put(map[0].trim(), map[1].trim()); + System.setProperty(map[0].trim(), map[1].trim()); } - @CliCommand(value = "show envs all", help = "Show spark launcher envs") + @ShellMethod(key = "show envs all", value = "Show spark launcher envs") public String showAllEnv() { String[][] rows = new String[env.size()][2]; int i = 0; - for (Map.Entry entry: env.entrySet()) { - rows[i] = new String[]{entry.getKey(), entry.getValue()}; + for (Map.Entry entry : env.entrySet()) { + rows[i] = new String[] {entry.getKey(), entry.getValue()}; i++; } return HoodiePrintHelper.print(new String[] {"key", "value"}, rows); } - @CliCommand(value = "show env", help = "Show spark launcher env by key") - public String showEnvByKey(@CliOption(key = {"key"}, help = "Which env conf want to show") final String key) { + @ShellMethod(key = "show env", value = "Show spark launcher env by key") + public String showEnvByKey(@ShellOption(value = {"--key"}, help = "Which env conf want to show") final String key) { if (key == null || key.isEmpty()) { return showAllEnv(); } else { - return HoodiePrintHelper.print(new String[] {"key", "value"}, new String[][]{new String[]{key, env.get(key)}}); + return HoodiePrintHelper.print(new String[] {"key", "value"}, new String[][] {new String[] {key, env.getOrDefault(key, "")}}); } } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java index f715b16e012c3..4abfe48e1119e 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java @@ -18,200 +18,292 @@ package org.apache.hudi.cli.commands; -import org.apache.hudi.cli.DeDupeType; import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.cli.DeDupeType; import org.apache.hudi.cli.DedupeSparkJob; import org.apache.hudi.cli.utils.SparkUtil; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.PartitionPathEncodeUtils; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieBootstrapConfig; +import 
org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieSavepointException; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorType; +import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.action.compact.strategy.UnBoundedCompactionStrategy; -import org.apache.hudi.table.upgrade.SparkUpgradeDowngrade; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; import org.apache.hudi.utilities.HDFSParquetImporter; import org.apache.hudi.utilities.HDFSParquetImporter.Config; import org.apache.hudi.utilities.HoodieCleaner; +import org.apache.hudi.utilities.HoodieClusteringJob; import org.apache.hudi.utilities.HoodieCompactionAdminTool; import org.apache.hudi.utilities.HoodieCompactionAdminTool.Operation; import org.apache.hudi.utilities.HoodieCompactor; -import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.deltastreamer.BootstrapExecutor; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.log4j.Logger; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.functions; +import org.apache.spark.sql.types.StructType; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Locale; +import java.util.Map; + +import static org.apache.hudi.utilities.UtilHelpers.EXECUTE; +import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE; +import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE; +import static org.apache.hudi.utilities.UtilHelpers.buildProperties; +import static org.apache.hudi.utilities.UtilHelpers.readConfig; /** * This class deals with initializing spark context based on command entered to hudi-cli. */ public class SparkMain { - private static final Logger LOG = Logger.getLogger(SparkMain.class); + private static final Logger LOG = LogManager.getLogger(SparkMain.class); /** * Commands. 
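 * Each constant corresponds to the command string passed as args[0] to {@code main}; the switch below dispatches on it and validates the remaining positional arguments.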
*/ enum SparkCommand { - BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, - COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLEAN, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE + BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_SCHEDULE_AND_EXECUTE, + COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLUSTERING_SCHEDULE, + CLUSTERING_RUN, CLUSTERING_SCHEDULE_AND_EXECUTE, CLEAN, DELETE_MARKER, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE, + REPAIR_DEPRECATED_PARTITION, RENAME_PARTITION } public static void main(String[] args) throws Exception { - String command = args[0]; - LOG.info("Invoking SparkMain:" + command); + ValidationUtils.checkArgument(args.length >= 4); + final String commandString = args[0]; + LOG.info("Invoking SparkMain: " + commandString); + final SparkCommand cmd = SparkCommand.valueOf(commandString); - SparkCommand cmd = SparkCommand.valueOf(command); + JavaSparkContext jsc = SparkUtil.initJavaSparkContext("hoodie-cli-" + commandString, + Option.of(args[1]), Option.of(args[2])); - JavaSparkContext jsc = sparkMasterContained(cmd) - ? SparkUtil.initJavaSparkConf("hoodie-cli-" + command, Option.of(args[1]), Option.of(args[2])) - : SparkUtil.initJavaSparkConf("hoodie-cli-" + command); int returnCode = 0; - switch (cmd) { - case ROLLBACK: - assert (args.length == 5); - returnCode = rollback(jsc, args[3], args[4]); - break; - case DEDUPLICATE: - assert (args.length == 8); - returnCode = deduplicatePartitionPath(jsc, args[3], args[4], args[5], Boolean.parseBoolean(args[6]), args[7]); - break; - case ROLLBACK_TO_SAVEPOINT: - assert (args.length == 5); - returnCode = rollbackToSavepoint(jsc, args[3], args[4]); - break; - case IMPORT: - case UPSERT: - assert (args.length >= 13); - String propsFilePath = null; - if (!StringUtils.isNullOrEmpty(args[12])) { - propsFilePath = args[12]; - } - List configs = new ArrayList<>(); - if (args.length > 13) { - configs.addAll(Arrays.asList(args).subList(13, args.length)); - } - returnCode = dataLoad(jsc, command, args[3], args[4], args[5], args[6], args[7], args[8], - Integer.parseInt(args[9]), args[10], Integer.parseInt(args[11]), propsFilePath, configs); - break; - case COMPACT_RUN: - assert (args.length >= 9); - propsFilePath = null; - if (!StringUtils.isNullOrEmpty(args[8])) { - propsFilePath = args[8]; - } - configs = new ArrayList<>(); - if (args.length > 9) { - configs.addAll(Arrays.asList(args).subList(9, args.length)); - } - returnCode = compact(jsc, args[1], args[2], args[3], Integer.parseInt(args[4]), args[5], args[6], - Integer.parseInt(args[7]), false, propsFilePath, configs); - break; - case COMPACT_SCHEDULE: - assert (args.length >= 6); - propsFilePath = null; - if (!StringUtils.isNullOrEmpty(args[5])) { - propsFilePath = args[5]; - } - configs = new ArrayList<>(); - if (args.length > 6) { - configs.addAll(Arrays.asList(args).subList(6, args.length)); - } - returnCode = compact(jsc, args[1], args[2], args[3], 1, "", args[4], 0, true, propsFilePath, configs); - break; - case COMPACT_VALIDATE: - assert (args.length == 7); - doCompactValidate(jsc, args[3], args[4], args[5], Integer.parseInt(args[6])); - returnCode = 0; - break; - case COMPACT_REPAIR: - assert (args.length == 8); - doCompactRepair(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), - Boolean.parseBoolean(args[7])); - returnCode = 0; - break; - case 
COMPACT_UNSCHEDULE_FILE: - assert (args.length == 9); - doCompactUnscheduleFile(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), - Boolean.parseBoolean(args[7]), Boolean.parseBoolean(args[8])); - returnCode = 0; - break; - case COMPACT_UNSCHEDULE_PLAN: - assert (args.length == 9); - doCompactUnschedule(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), - Boolean.parseBoolean(args[7]), Boolean.parseBoolean(args[8])); - returnCode = 0; - break; - case CLEAN: - assert (args.length >= 5); - propsFilePath = null; - if (!StringUtils.isNullOrEmpty(args[4])) { - propsFilePath = args[4]; - } - configs = new ArrayList<>(); - if (args.length > 5) { - configs.addAll(Arrays.asList(args).subList(5, args.length)); - } - clean(jsc, args[3], propsFilePath, configs); - break; - case SAVEPOINT: - assert (args.length == 7); - returnCode = createSavepoint(jsc, args[3], args[4], args[5], args[6]); - break; - case DELETE_SAVEPOINT: - assert (args.length == 5); - returnCode = deleteSavepoint(jsc, args[3], args[4]); - break; - case BOOTSTRAP: - assert (args.length >= 18); - propsFilePath = null; - if (!StringUtils.isNullOrEmpty(args[17])) { - propsFilePath = args[17]; - } - configs = new ArrayList<>(); - if (args.length > 18) { - configs.addAll(Arrays.asList(args).subList(18, args.length)); - } - returnCode = doBootstrap(jsc, args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], - args[11], args[12], args[13], args[14], args[15], args[16], propsFilePath, configs); - break; - case UPGRADE: - case DOWNGRADE: - assert (args.length == 5); - returnCode = upgradeOrDowngradeTable(jsc, args[3], args[4]); - break; - default: - break; + try { + switch (cmd) { + case ROLLBACK: + assert (args.length == 6); + returnCode = rollback(jsc, args[3], args[4], Boolean.parseBoolean(args[5])); + break; + case DEDUPLICATE: + assert (args.length == 8); + returnCode = deduplicatePartitionPath(jsc, args[3], args[4], args[5], Boolean.parseBoolean(args[6]), args[7]); + break; + case ROLLBACK_TO_SAVEPOINT: + assert (args.length == 6); + returnCode = rollbackToSavepoint(jsc, args[3], args[4], Boolean.parseBoolean(args[5])); + break; + case IMPORT: + case UPSERT: + assert (args.length >= 13); + String propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[12])) { + propsFilePath = args[12]; + } + List configs = new ArrayList<>(); + if (args.length > 13) { + configs.addAll(Arrays.asList(args).subList(13, args.length)); + } + returnCode = dataLoad(jsc, commandString, args[3], args[4], args[5], args[6], args[7], args[8], + Integer.parseInt(args[9]), args[10], Integer.parseInt(args[11]), propsFilePath, configs); + break; + case COMPACT_RUN: + assert (args.length >= 10); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[9])) { + propsFilePath = args[9]; + } + configs = new ArrayList<>(); + if (args.length > 10) { + configs.addAll(Arrays.asList(args).subList(10, args.length)); + } + returnCode = compact(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[7], + Integer.parseInt(args[8]), HoodieCompactor.EXECUTE, propsFilePath, configs); + break; + case COMPACT_SCHEDULE_AND_EXECUTE: + assert (args.length >= 9); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[8])) { + propsFilePath = args[8]; + } + configs = new ArrayList<>(); + if (args.length > 9) { + configs.addAll(Arrays.asList(args).subList(9, args.length)); + } + + returnCode = compact(jsc, args[3], args[4], null, Integer.parseInt(args[5]), args[6], + Integer.parseInt(args[7]), HoodieCompactor.SCHEDULE_AND_EXECUTE, 
propsFilePath, configs); + break; + case COMPACT_SCHEDULE: + assert (args.length >= 7); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[6])) { + propsFilePath = args[6]; + } + configs = new ArrayList<>(); + if (args.length > 7) { + configs.addAll(Arrays.asList(args).subList(7, args.length)); + } + returnCode = compact(jsc, args[3], args[4], args[5], 1, "", 0, HoodieCompactor.SCHEDULE, propsFilePath, configs); + break; + case COMPACT_VALIDATE: + assert (args.length == 7); + doCompactValidate(jsc, args[3], args[4], args[5], Integer.parseInt(args[6])); + returnCode = 0; + break; + case COMPACT_REPAIR: + assert (args.length == 8); + doCompactRepair(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), + Boolean.parseBoolean(args[7])); + returnCode = 0; + break; + case COMPACT_UNSCHEDULE_FILE: + assert (args.length == 10); + doCompactUnscheduleFile(jsc, args[3], args[4], args[5], args[6], Integer.parseInt(args[7]), + Boolean.parseBoolean(args[8]), Boolean.parseBoolean(args[9])); + returnCode = 0; + break; + case COMPACT_UNSCHEDULE_PLAN: + assert (args.length == 9); + doCompactUnschedule(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), + Boolean.parseBoolean(args[7]), Boolean.parseBoolean(args[8])); + returnCode = 0; + break; + case CLUSTERING_RUN: + assert (args.length >= 9); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[8])) { + propsFilePath = args[8]; + } + configs = new ArrayList<>(); + if (args.length > 9) { + configs.addAll(Arrays.asList(args).subList(9, args.length)); + } + returnCode = cluster(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[2], + Integer.parseInt(args[7]), EXECUTE, propsFilePath, configs); + break; + case CLUSTERING_SCHEDULE_AND_EXECUTE: + assert (args.length >= 8); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[7])) { + propsFilePath = args[7]; + } + configs = new ArrayList<>(); + if (args.length > 8) { + configs.addAll(Arrays.asList(args).subList(8, args.length)); + } + returnCode = cluster(jsc, args[3], args[4], null, Integer.parseInt(args[5]), args[2], + Integer.parseInt(args[6]), SCHEDULE_AND_EXECUTE, propsFilePath, configs); + break; + case CLUSTERING_SCHEDULE: + assert (args.length >= 7); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[6])) { + propsFilePath = args[6]; + } + configs = new ArrayList<>(); + if (args.length > 7) { + configs.addAll(Arrays.asList(args).subList(7, args.length)); + } + returnCode = cluster(jsc, args[3], args[4], args[5], 1, args[2], + 0, SCHEDULE, propsFilePath, configs); + break; + case CLEAN: + assert (args.length >= 5); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[4])) { + propsFilePath = args[4]; + } + configs = new ArrayList<>(); + if (args.length > 5) { + configs.addAll(Arrays.asList(args).subList(5, args.length)); + } + clean(jsc, args[3], propsFilePath, configs); + break; + case SAVEPOINT: + assert (args.length == 7); + returnCode = createSavepoint(jsc, args[3], args[4], args[5], args[6]); + break; + case DELETE_MARKER: + assert (args.length == 5); + returnCode = deleteMarker(jsc, args[3], args[4]); + break; + case DELETE_SAVEPOINT: + assert (args.length == 5); + returnCode = deleteSavepoint(jsc, args[3], args[4]); + break; + case BOOTSTRAP: + assert (args.length >= 18); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[17])) { + propsFilePath = args[17]; + } + configs = new ArrayList<>(); + if (args.length > 18) { + configs.addAll(Arrays.asList(args).subList(18, args.length)); + } + returnCode = 
doBootstrap(jsc, args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], + args[11], args[12], args[13], args[14], args[15], args[16], propsFilePath, configs); + break; + case UPGRADE: + case DOWNGRADE: + assert (args.length == 5); + returnCode = upgradeOrDowngradeTable(jsc, args[3], args[4]); + break; + case REPAIR_DEPRECATED_PARTITION: + assert (args.length == 4); + returnCode = repairDeprecatedPartition(jsc, args[3]); + break; + case RENAME_PARTITION: + assert (args.length == 6); + returnCode = renamePartition(jsc, args[3], args[4], args[5]); + break; + default: + break; + } + } catch (Throwable throwable) { + LOG.error("Fail to execute commandString", throwable); + returnCode = -1; + } finally { + jsc.stop(); } System.exit(returnCode); } - private static boolean sparkMasterContained(SparkCommand command) { - List masterContained = Arrays.asList(SparkCommand.COMPACT_VALIDATE, SparkCommand.COMPACT_REPAIR, - SparkCommand.COMPACT_UNSCHEDULE_PLAN, SparkCommand.COMPACT_UNSCHEDULE_FILE, SparkCommand.CLEAN, - SparkCommand.IMPORT, SparkCommand.UPSERT, SparkCommand.DEDUPLICATE, SparkCommand.SAVEPOINT, - SparkCommand.DELETE_SAVEPOINT, SparkCommand.ROLLBACK_TO_SAVEPOINT, SparkCommand.ROLLBACK, SparkCommand.BOOTSTRAP); - return masterContained.contains(command); - } - protected static void clean(JavaSparkContext jsc, String basePath, String propsFilePath, - List configs) { + List configs) { HoodieCleaner.Config cfg = new HoodieCleaner.Config(); cfg.basePath = basePath; cfg.propsFilePath = propsFilePath; @@ -219,9 +311,24 @@ protected static void clean(JavaSparkContext jsc, String basePath, String propsF new HoodieCleaner(cfg, jsc).run(); } + protected static int deleteMarker(JavaSparkContext jsc, String instantTime, String basePath) { + try { + SparkRDDWriteClient client = createHoodieClient(jsc, basePath, false); + HoodieWriteConfig config = client.getConfig(); + HoodieEngineContext context = client.getEngineContext(); + HoodieSparkTable table = HoodieSparkTable.create(config, context); + WriteMarkersFactory.get(config.getMarkersType(), table, instantTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + return 0; + } catch (Exception e) { + LOG.warn(String.format("Failed: Could not clean marker instantTime: \"%s\".", instantTime), e); + return -1; + } + } + private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath, String tableName, - String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, - int retry, String propsFilePath, List configs) { + String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, + int retry, String propsFilePath, List configs) { Config cfg = new Config(); cfg.command = command; cfg.srcPath = srcPath; @@ -238,7 +345,7 @@ private static int dataLoad(JavaSparkContext jsc, String command, String srcPath } private static void doCompactValidate(JavaSparkContext jsc, String basePath, String compactionInstant, - String outputPath, int parallelism) throws Exception { + String outputPath, int parallelism) throws Exception { HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config(); cfg.basePath = basePath; cfg.operation = Operation.VALIDATE; @@ -249,7 +356,7 @@ private static void doCompactValidate(JavaSparkContext jsc, String basePath, Str } private static void doCompactRepair(JavaSparkContext jsc, String basePath, String compactionInstant, - String outputPath, int parallelism, boolean dryRun) throws Exception { + 
String outputPath, int parallelism, boolean dryRun) throws Exception { HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config(); cfg.basePath = basePath; cfg.operation = Operation.REPAIR; @@ -261,7 +368,7 @@ private static void doCompactRepair(JavaSparkContext jsc, String basePath, Strin } private static void doCompactUnschedule(JavaSparkContext jsc, String basePath, String compactionInstant, - String outputPath, int parallelism, boolean skipValidation, boolean dryRun) throws Exception { + String outputPath, int parallelism, boolean skipValidation, boolean dryRun) throws Exception { HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config(); cfg.basePath = basePath; cfg.operation = Operation.UNSCHEDULE_PLAN; @@ -273,13 +380,14 @@ private static void doCompactUnschedule(JavaSparkContext jsc, String basePath, S new HoodieCompactionAdminTool(cfg).run(jsc); } - private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId, String outputPath, - int parallelism, boolean skipValidation, boolean dryRun) + private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId, String partitionPath, + String outputPath, int parallelism, boolean skipValidation, boolean dryRun) throws Exception { HoodieCompactionAdminTool.Config cfg = new HoodieCompactionAdminTool.Config(); cfg.basePath = basePath; cfg.operation = Operation.UNSCHEDULE_FILE; cfg.outputPath = outputPath; + cfg.partitionPath = partitionPath; cfg.fileId = fileId; cfg.parallelism = parallelism; cfg.dryRun = dryRun; @@ -288,8 +396,8 @@ private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePat } private static int compact(JavaSparkContext jsc, String basePath, String tableName, String compactionInstant, - int parallelism, String schemaFile, String sparkMemory, int retry, boolean schedule, String propsFilePath, - List configs) { + int parallelism, String schemaFile, int retry, String mode, String propsFilePath, + List configs) { HoodieCompactor.Config cfg = new HoodieCompactor.Config(); cfg.basePath = basePath; cfg.tableName = tableName; @@ -298,36 +406,137 @@ private static int compact(JavaSparkContext jsc, String basePath, String tableNa cfg.strategyClassName = UnBoundedCompactionStrategy.class.getCanonicalName(); cfg.parallelism = parallelism; cfg.schemaFile = schemaFile; - cfg.runSchedule = schedule; + cfg.runningMode = mode; cfg.propsFilePath = propsFilePath; cfg.configs = configs; - jsc.getConf().set("spark.executor.memory", sparkMemory); return new HoodieCompactor(jsc, cfg).compact(retry); } + private static int cluster(JavaSparkContext jsc, String basePath, String tableName, String clusteringInstant, + int parallelism, String sparkMemory, int retry, String runningMode, String propsFilePath, List configs) { + HoodieClusteringJob.Config cfg = new HoodieClusteringJob.Config(); + cfg.basePath = basePath; + cfg.tableName = tableName; + cfg.clusteringInstantTime = clusteringInstant; + cfg.parallelism = parallelism; + cfg.runningMode = runningMode; + cfg.propsFilePath = propsFilePath; + cfg.configs = configs; + jsc.getConf().set("spark.executor.memory", sparkMemory); + return new HoodieClusteringJob(jsc, cfg).cluster(retry); + } + private static int deduplicatePartitionPath(JavaSparkContext jsc, String duplicatedPartitionPath, - String repairedOutputPath, String basePath, boolean dryRun, String dedupeType) { + String repairedOutputPath, String basePath, boolean dryRun, String dedupeType) { DedupeSparkJob job = new 
DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs(basePath, jsc.hadoopConfiguration()), DeDupeType.withName(dedupeType)); job.fixDuplicates(dryRun); return 0; } + public static int repairDeprecatedPartition(JavaSparkContext jsc, String basePath) { + SQLContext sqlContext = new SQLContext(jsc); + Dataset recordsToRewrite = getRecordsToRewrite(basePath, PartitionPathEncodeUtils.DEPRECATED_DEFAULT_PARTITION_PATH, sqlContext); + + if (!recordsToRewrite.isEmpty()) { + recordsToRewrite.cache(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build(); + Map propsMap = getPropsForRewrite(metaClient); + rewriteRecordsToNewPartition(basePath, PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH, recordsToRewrite, metaClient, propsMap); + // after re-writing, we can safely delete older data. + deleteOlderPartition(basePath, PartitionPathEncodeUtils.DEPRECATED_DEFAULT_PARTITION_PATH, recordsToRewrite, propsMap); + } + return 0; + } + + public static int renamePartition(JavaSparkContext jsc, String basePath, String oldPartition, String newPartition) { + SQLContext sqlContext = new SQLContext(jsc); + Dataset recordsToRewrite = getRecordsToRewrite(basePath, oldPartition, sqlContext); + + if (!recordsToRewrite.isEmpty()) { + recordsToRewrite.cache(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build(); + Map propsMap = getPropsForRewrite(metaClient); + rewriteRecordsToNewPartition(basePath, newPartition, recordsToRewrite, metaClient, propsMap); + // after re-writing, we can safely delete older partition. + deleteOlderPartition(basePath, oldPartition, recordsToRewrite, propsMap); + // also, we can physically delete the old partition. 
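+      // Note: directory deletion is best-effort; an IOException is caught and logged as a warning without failing the command.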
+ FileSystem fs = FSUtils.getFs(new Path(basePath), metaClient.getHadoopConf()); + try { + fs.delete(new Path(basePath, oldPartition), true); + } catch (IOException e) { + LOG.warn("Failed to delete older partition " + basePath); + } + } + return 0; + } + + private static void deleteOlderPartition(String basePath, String oldPartition, Dataset recordsToRewrite, Map propsMap) { + propsMap.put("hoodie.datasource.write.partitions.to.delete", oldPartition); + recordsToRewrite.write() + .options(propsMap) + .option("hoodie.datasource.write.operation", WriteOperationType.DELETE_PARTITION.value()) + .format("hudi") + .mode("Append") + .save(basePath); + } + + private static void rewriteRecordsToNewPartition(String basePath, String newPartition, Dataset recordsToRewrite, HoodieTableMetaClient metaClient, Map propsMap) { + String partitionFieldProp = metaClient.getTableConfig().getPartitionFieldProp(); + StructType structType = recordsToRewrite.schema(); + int partitionIndex = structType.fieldIndex(partitionFieldProp); + + recordsToRewrite.withColumn(metaClient.getTableConfig().getPartitionFieldProp(), functions.lit(null).cast(structType.apply(partitionIndex).dataType())) + .write() + .options(propsMap) + .option("hoodie.datasource.write.operation", WriteOperationType.BULK_INSERT.value()) + .format("hudi") + .mode("Append") + .save(basePath); + } + + private static Dataset getRecordsToRewrite(String basePath, String oldPartition, SQLContext sqlContext) { + return sqlContext.read() + .format("hudi") + .load(basePath + "/" + oldPartition) + .drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) + .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + .drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD) + .drop(HoodieRecord.FILENAME_METADATA_FIELD) + .drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD); + } + + private static Map getPropsForRewrite(HoodieTableMetaClient metaClient) { + Map propsMap = new HashMap<>(); + metaClient.getTableConfig().getProps().forEach((k, v) -> propsMap.put(k.toString(), v.toString())); + propsMap.put(HoodieWriteConfig.SKIP_DEFAULT_PARTITION_VALIDATION.key(), "true"); + propsMap.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), metaClient.getTableConfig().getRecordKeyFieldProp()); + propsMap.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), metaClient.getTableConfig().getPartitionFieldProp()); + propsMap.put(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), metaClient.getTableConfig().getKeyGeneratorClassName()); + return propsMap; + } + private static int doBootstrap(JavaSparkContext jsc, String tableName, String tableType, String basePath, - String sourcePath, String recordKeyCols, String partitionFields, String parallelism, String schemaProviderClass, - String bootstrapIndexClass, String selectorClass, String keyGeneratorClass, String fullBootstrapInputProvider, - String payloadClassName, String enableHiveSync, String propsFilePath, List configs) throws IOException { + String sourcePath, String recordKeyCols, String partitionFields, String parallelism, String schemaProviderClass, + String bootstrapIndexClass, String selectorClass, String keyGenerator, String fullBootstrapInputProvider, + String payloadClassName, String enableHiveSync, String propsFilePath, List configs) throws IOException { - TypedProperties properties = propsFilePath == null ? UtilHelpers.buildProperties(configs) - : UtilHelpers.readConfig(FSUtils.getFs(propsFilePath, jsc.hadoopConfiguration()), new Path(propsFilePath), configs).getConfig(); + TypedProperties properties = propsFilePath == null ? 
buildProperties(configs) + : readConfig(jsc.hadoopConfiguration(), new Path(propsFilePath), configs).getProps(true); - properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, sourcePath); - properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, keyGeneratorClass); - properties.setProperty(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER, fullBootstrapInputProvider); - properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM, parallelism); - properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR, selectorClass); - properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), recordKeyCols); - properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), partitionFields); + properties.setProperty(HoodieBootstrapConfig.BASE_PATH.key(), sourcePath); + + if (!StringUtils.isNullOrEmpty(keyGenerator) && KeyGeneratorType.getNames().contains(keyGenerator.toUpperCase(Locale.ROOT))) { + properties.setProperty(HoodieBootstrapConfig.KEYGEN_TYPE.key(), keyGenerator.toUpperCase(Locale.ROOT)); + } else { + properties.setProperty(HoodieBootstrapConfig.KEYGEN_CLASS_NAME.key(), keyGenerator); + } + + properties.setProperty(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME.key(), fullBootstrapInputProvider); + properties.setProperty(HoodieBootstrapConfig.PARALLELISM_VALUE.key(), parallelism); + properties.setProperty(HoodieBootstrapConfig.MODE_SELECTOR_CLASS_NAME.key(), selectorClass); + properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKeyCols); + properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionFields); HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); cfg.targetTableName = tableName; @@ -343,8 +552,8 @@ private static int doBootstrap(JavaSparkContext jsc, String tableName, String ta return 0; } - private static int rollback(JavaSparkContext jsc, String instantTime, String basePath) throws Exception { - SparkRDDWriteClient client = createHoodieClient(jsc, basePath); + private static int rollback(JavaSparkContext jsc, String instantTime, String basePath, Boolean rollbackUsingMarkers) throws Exception { + SparkRDDWriteClient client = createHoodieClient(jsc, basePath, rollbackUsingMarkers, false); if (client.rollback(instantTime)) { LOG.info(String.format("The commit \"%s\" rolled back.", instantTime)); return 0; @@ -355,8 +564,8 @@ private static int rollback(JavaSparkContext jsc, String instantTime, String bas } private static int createSavepoint(JavaSparkContext jsc, String commitTime, String user, - String comments, String basePath) throws Exception { - SparkRDDWriteClient client = createHoodieClient(jsc, basePath); + String comments, String basePath) throws Exception { + SparkRDDWriteClient client = createHoodieClient(jsc, basePath, false); try { client.savepoint(commitTime, user, comments); LOG.info(String.format("The commit \"%s\" has been savepointed.", commitTime)); @@ -367,26 +576,26 @@ private static int createSavepoint(JavaSparkContext jsc, String commitTime, Stri } } - private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath) throws Exception { - SparkRDDWriteClient client = createHoodieClient(jsc, basePath); + private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTime, String basePath, boolean lazyCleanPolicy) throws Exception { + SparkRDDWriteClient client = createHoodieClient(jsc, basePath, lazyCleanPolicy); try { client.restoreToSavepoint(savepointTime); 
LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime)); return 0; } catch (Exception e) { - LOG.warn(String.format("The commit \"%s\" failed to roll back.", savepointTime)); + LOG.warn(String.format("The commit \"%s\" failed to roll back.", savepointTime), e); return -1; } } private static int deleteSavepoint(JavaSparkContext jsc, String savepointTime, String basePath) throws Exception { - SparkRDDWriteClient client = createHoodieClient(jsc, basePath); + SparkRDDWriteClient client = createHoodieClient(jsc, basePath, false); try { client.deleteSavepoint(savepointTime); LOG.info(String.format("Savepoint \"%s\" deleted.", savepointTime)); return 0; } catch (Exception e) { - LOG.warn(String.format("Failed: Could not delete savepoint \"%s\".", savepointTime)); + LOG.warn(String.format("Failed: Could not delete savepoint \"%s\".", savepointTime), e); return -1; } } @@ -394,18 +603,25 @@ private static int deleteSavepoint(JavaSparkContext jsc, String savepointTime, S /** * Upgrade or downgrade table. * - * @param jsc instance of {@link JavaSparkContext} to use. - * @param basePath base path of the dataset. + * @param jsc instance of {@link JavaSparkContext} to use. + * @param basePath base path of the dataset. * @param toVersion version to which upgrade/downgrade to be done. * @return 0 if success, else -1. * @throws Exception */ protected static int upgradeOrDowngradeTable(JavaSparkContext jsc, String basePath, String toVersion) { - HoodieWriteConfig config = getWriteConfig(basePath); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), false, - config.getConsistencyGuardConfig(), Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))); + HoodieWriteConfig config = getWriteConfig(basePath, Boolean.parseBoolean(HoodieWriteConfig.ROLLBACK_USING_MARKERS_ENABLE.defaultValue()), + false); + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(config.getBasePath()) + .setLoadActiveTimelineOnLoad(false).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()).build(); + HoodieWriteConfig updatedConfig = HoodieWriteConfig.newBuilder().withProps(config.getProps()) + .forTable(metaClient.getTableConfig().getTableName()).build(); try { - new SparkUpgradeDowngrade(metaClient, config, new HoodieSparkEngineContext(jsc)).run(metaClient, HoodieTableVersion.valueOf(toVersion), config, new HoodieSparkEngineContext(jsc), null); + new UpgradeDowngrade(metaClient, updatedConfig, new HoodieSparkEngineContext(jsc), SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.valueOf(toVersion), null); LOG.info(String.format("Table at \"%s\" upgraded / downgraded to version \"%s\".", basePath, toVersion)); return 0; } catch (Exception e) { @@ -414,13 +630,20 @@ protected static int upgradeOrDowngradeTable(JavaSparkContext jsc, String basePa } } - private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception { - HoodieWriteConfig config = getWriteConfig(basePath); + private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, Boolean rollbackUsingMarkers, boolean lazyCleanPolicy) throws Exception { + HoodieWriteConfig config = getWriteConfig(basePath, rollbackUsingMarkers, lazyCleanPolicy); return new 
SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config); } - private static HoodieWriteConfig getWriteConfig(String basePath) { + private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, boolean lazyCleanPolicy) throws Exception { + return createHoodieClient(jsc, basePath, Boolean.parseBoolean(HoodieWriteConfig.ROLLBACK_USING_MARKERS_ENABLE.defaultValue()), lazyCleanPolicy); + } + + private static HoodieWriteConfig getWriteConfig(String basePath, Boolean rollbackUsingMarkers, boolean lazyCleanPolicy) { return HoodieWriteConfig.newBuilder().withPath(basePath) + .withRollbackUsingMarkers(rollbackUsingMarkers) + .withCleanConfig(HoodieCleanConfig.newBuilder().withFailedWritesCleaningPolicy(lazyCleanPolicy ? HoodieFailedWritesCleaningPolicy.LAZY : + HoodieFailedWritesCleaningPolicy.EAGER).build()) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java index 66c5563102848..c9034d03d5fcd 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/StatsCommand.java @@ -18,6 +18,12 @@ package org.apache.hudi.cli.commands; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Snapshot; +import com.codahale.metrics.UniformReservoir; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; @@ -28,17 +34,9 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.NumericUtils; - -import com.codahale.metrics.Histogram; -import com.codahale.metrics.Snapshot; -import com.codahale.metrics.UniformReservoir; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; import java.io.IOException; import java.text.DecimalFormat; @@ -52,19 +50,19 @@ /** * CLI command to displays stats options. */ -@Component -public class StatsCommand implements CommandMarker { +@ShellComponent +public class StatsCommand { public static final int MAX_FILES = 1000000; - @CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many " + @ShellMethod(key = "stats wa", value = "Write Amplification. 
Ratio of how many records were upserted to how many " + "records were actually written") public String writeAmplificationStats( - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { long totalRecordsUpserted = 0; @@ -105,15 +103,15 @@ public Comparable[] printFileSizeHistogram(String instantTime, Snapshot s) { s.getMax(), s.size(), s.getStdDev()}; } - @CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files") + @ShellMethod(key = "stats filesizes", value = "File Sizes. Display summary stats on sizes of files") public String fileSizeStats( - @CliOption(key = {"partitionPath"}, help = "regex to select files, eg: 2016/08/02", - unspecifiedDefaultValue = "*/*/*") final String globRegex, - @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, - @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, - @CliOption(key = {"headeronly"}, help = "Print Header Only", - unspecifiedDefaultValue = "false") final boolean headerOnly) + @ShellOption(value = {"--partitionPath"}, help = "regex to select files, eg: 2016/08/02", + defaultValue = "*/*/*") final String globRegex, + @ShellOption(value = {"--limit"}, help = "Limit commits", defaultValue = "-1") final Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly) throws IOException { FileSystem fs = HoodieCLI.fs; diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java index 9c947e4d407e3..b3dfaf5ab73bc 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java @@ -18,45 +18,63 @@ package org.apache.hudi.cli.commands; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; import org.apache.hudi.common.fs.ConsistencyGuardConfig; -import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import 
org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.exception.TableNotFoundException; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; - +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME; /** * CLI command to display hudi table options. */ -@Component -public class TableCommand implements CommandMarker { +@ShellComponent +public class TableCommand { + + private static final Logger LOG = LogManager.getLogger(TableCommand.class); static { System.out.println("Table command getting loaded"); } - @CliCommand(value = "connect", help = "Connect to a hoodie table") + @ShellMethod(key = "connect", value = "Connect to a hoodie table") public String connect( - @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the table") final String path, - @CliOption(key = {"layoutVersion"}, help = "Timeline Layout version") Integer layoutVersion, - @CliOption(key = {"eventuallyConsistent"}, unspecifiedDefaultValue = "false", + @ShellOption(value = {"--path"}, help = "Base Path of the table") final String path, + @ShellOption(value = {"--layoutVersion"}, help = "Timeline Layout version", defaultValue = ShellOption.NULL) Integer layoutVersion, + @ShellOption(value = {"--eventuallyConsistent"}, defaultValue = "false", help = "Enable eventual consistency") final boolean eventuallyConsistent, - @CliOption(key = {"initialCheckIntervalMs"}, unspecifiedDefaultValue = "2000", + @ShellOption(value = {"--initialCheckIntervalMs"}, defaultValue = "2000", help = "Initial wait time for eventual consistency") final Integer initialConsistencyIntervalMs, - @CliOption(key = {"maxWaitIntervalMs"}, unspecifiedDefaultValue = "300000", + @ShellOption(value = {"--maxWaitIntervalMs"}, defaultValue = "300000", help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs, - @CliOption(key = {"maxCheckIntervalMs"}, unspecifiedDefaultValue = "7", + @ShellOption(value = {"--maxCheckIntervalMs"}, defaultValue = "7", help = "Max checks for eventual consistency") final Integer maxConsistencyChecks) throws IOException { HoodieCLI @@ -74,20 +92,22 @@ public String connect( /** * Create a Hoodie Table if it does not exist. 
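   * Throws an IllegalStateException if a table already exists at the given base path.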
* - * @param path Base Path - * @param name Hoodie Table Name + * @param path Base Path + * @param name Hoodie Table Name * @param tableTypeStr Hoodie Table Type * @param payloadClass Payload Class */ - @CliCommand(value = "create", help = "Create a hoodie table if not present") + @ShellMethod(key = "create", value = "Create a hoodie table if not present") public String createTable( - @CliOption(key = {"path"}, mandatory = true, help = "Base Path of the table") final String path, - @CliOption(key = {"tableName"}, mandatory = true, help = "Hoodie Table Name") final String name, - @CliOption(key = {"tableType"}, unspecifiedDefaultValue = "COPY_ON_WRITE", + @ShellOption(value = {"--path"}, help = "Base Path of the table") final String path, + @ShellOption(value = {"--tableName"}, help = "Hoodie Table Name") final String name, + @ShellOption(value = {"--tableType"}, defaultValue = "COPY_ON_WRITE", help = "Hoodie Table Type. Must be one of : COPY_ON_WRITE or MERGE_ON_READ") final String tableTypeStr, - @CliOption(key = {"archiveLogFolder"}, help = "Folder Name for storing archived timeline") String archiveFolder, - @CliOption(key = {"layoutVersion"}, help = "Specific Layout Version to use") Integer layoutVersion, - @CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload", + @ShellOption(value = {"--archiveLogFolder"}, help = "Folder Name for storing archived timeline", + defaultValue = ShellOption.NULL) String archiveFolder, + @ShellOption(value = {"--layoutVersion"}, help = "Specific Layout Version to use", + defaultValue = ShellOption.NULL) Integer layoutVersion, + @ShellOption(value = {"--payloadClass"}, defaultValue = "org.apache.hudi.common.model.HoodieAvroPayload", help = "Payload Class") final String payloadClass) throws IOException { boolean initialized = HoodieCLI.initConf(); @@ -95,7 +115,7 @@ public String createTable( boolean existing = false; try { - new HoodieTableMetaClient(HoodieCLI.conf, path); + HoodieTableMetaClient.builder().setConf(HoodieCLI.conf).setBasePath(path).build(); existing = true; } catch (TableNotFoundException dfe) { // expected @@ -106,10 +126,13 @@ public String createTable( throw new IllegalStateException("Table already existing in path : " + path); } - final HoodieTableType tableType = HoodieTableType.valueOf(tableTypeStr); - HoodieTableMetaClient.initTableType(HoodieCLI.conf, path, tableType, name, archiveFolder, - payloadClass, layoutVersion); - + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableTypeStr) + .setTableName(name) + .setArchiveLogFolder(archiveFolder) + .setPayloadClassName(payloadClass) + .setTimelineLayoutVersion(layoutVersion) + .initTable(HoodieCLI.conf, path); // Now connect to ensure loading works return connect(path, layoutVersion, false, 0, 0, 0); } @@ -117,7 +140,7 @@ public String createTable( /** * Describes table properties. 
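   * Output includes basePath, metaPath, the filesystem scheme, and all table config properties.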
*/ - @CliCommand(value = "desc", help = "Describe Hoodie Table properties") + @ShellMethod(key = "desc", value = "Describe Hoodie Table properties") public String descTable() { HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); TableHeader header = new TableHeader().addTableHeaderField("Property").addTableHeaderField("Value"); @@ -125,7 +148,7 @@ public String descTable() { rows.add(new Comparable[] {"basePath", client.getBasePath()}); rows.add(new Comparable[] {"metaPath", client.getMetaPath()}); rows.add(new Comparable[] {"fileSystem", client.getFs().getScheme()}); - client.getTableConfig().getProps().entrySet().forEach(e -> { + client.getTableConfig().propsMap().entrySet().forEach(e -> { rows.add(new Comparable[] {e.getKey(), e.getValue()}); }); return HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); @@ -134,10 +157,110 @@ public String descTable() { /** * Refresh table metadata. */ - @CliCommand(value = {"refresh", "metadata refresh", "commits refresh", "cleans refresh", "savepoints refresh"}, - help = "Refresh table metadata") + @ShellMethod(key = {"refresh", "metadata refresh", "commits refresh", "cleans refresh", "savepoints refresh"}, + value = "Refresh table metadata") public String refreshMetadata() { HoodieCLI.refreshTableMetadata(); return "Metadata for table " + HoodieCLI.getTableMetaClient().getTableConfig().getTableName() + " refreshed."; } + + /** + * Fetches table schema in avro format. + */ + @ShellMethod(key = "fetch table schema", value = "Fetches latest table schema") + public String fetchTableSchema( + @ShellOption(value = {"--outputFilePath"}, defaultValue = ShellOption.NULL, + help = "File path to write schema") final String outputFilePath) throws Exception { + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(client); + Schema schema = tableSchemaResolver.getTableAvroSchema(); + if (outputFilePath != null) { + LOG.info("Latest table schema : " + schema.toString(true)); + writeToFile(outputFilePath, schema.toString(true)); + return String.format("Latest table schema written to %s", outputFilePath); + } else { + return String.format("Latest table schema %s", schema.toString(true)); + } + } + + @ShellMethod(key = "table recover-configs", value = "Recover table configs, from update/delete that failed midway.") + public String recoverTableConfig() throws IOException { + HoodieCLI.refreshTableMetadata(); + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + Path metaPathDir = new Path(client.getBasePath(), METAFOLDER_NAME); + HoodieTableConfig.recover(client.getFs(), metaPathDir); + return descTable(); + } + + @ShellMethod(key = "table update-configs", value = "Update the table configs with configs with provided file.") + public String updateTableConfig( + @ShellOption(value = {"--props-file"}, help = "Path to a properties file on local filesystem") + final String updatePropsFilePath) throws IOException { + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + Map oldProps = client.getTableConfig().propsMap(); + + Properties updatedProps = new Properties(); + updatedProps.load(new FileInputStream(updatePropsFilePath)); + Path metaPathDir = new Path(client.getBasePath(), METAFOLDER_NAME); + HoodieTableConfig.update(client.getFs(), metaPathDir, updatedProps); + + HoodieCLI.refreshTableMetadata(); + Map newProps = HoodieCLI.getTableMetaClient().getTableConfig().propsMap(); + return renderOldNewProps(newProps, oldProps); + } + + 
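+  // Removes the supplied comma-separated config keys from the table config and prints a before/after comparison of the properties.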
@ShellMethod(key = "table delete-configs", value = "Delete the supplied table configs from the table.") + public String deleteTableConfig( + @ShellOption(value = {"--comma-separated-configs"}, + help = "Comma separated list of configs to delete.") final String csConfigs) { + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + Map oldProps = client.getTableConfig().propsMap(); + + Set deleteConfigs = Arrays.stream(csConfigs.split(",")).collect(Collectors.toSet()); + Path metaPathDir = new Path(client.getBasePath(), METAFOLDER_NAME); + HoodieTableConfig.delete(client.getFs(), metaPathDir, deleteConfigs); + + HoodieCLI.refreshTableMetadata(); + Map newProps = HoodieCLI.getTableMetaClient().getTableConfig().propsMap(); + return renderOldNewProps(newProps, oldProps); + } + + private static String renderOldNewProps(Map newProps, Map oldProps) { + TreeSet allPropKeys = new TreeSet<>(); + allPropKeys.addAll(newProps.keySet().stream().map(Object::toString).collect(Collectors.toSet())); + allPropKeys.addAll(oldProps.keySet()); + + String[][] rows = new String[allPropKeys.size()][]; + int ind = 0; + for (String propKey : allPropKeys) { + String[] row = new String[] { + propKey, + oldProps.getOrDefault(propKey, "null"), + newProps.getOrDefault(propKey, "null") + }; + rows[ind++] = row; + } + return HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_HOODIE_PROPERTY, + HoodieTableHeaderFields.HEADER_OLD_VALUE, HoodieTableHeaderFields.HEADER_NEW_VALUE}, rows); + } + + /** + * Use Streams when you are dealing with raw data. + * + * @param filePath output file path. + * @param data to be written to file. + */ + private static void writeToFile(String filePath, String data) throws IOException { + File outFile = new File(filePath); + if (outFile.exists()) { + outFile.delete(); + } + OutputStream os = null; + try { + os = new FileOutputStream(outFile); + os.write(data.getBytes(), 0, data.length()); + } finally { + os.close(); + } + } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TempViewCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TempViewCommand.java index 975e89fc772a1..3f88532b568ec 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TempViewCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TempViewCommand.java @@ -19,26 +19,24 @@ package org.apache.hudi.cli.commands; import org.apache.hudi.cli.HoodieCLI; - import org.apache.hudi.exception.HoodieException; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; /** * CLI command to query/delete temp views. 
*/ -@Component -public class TempViewCommand implements CommandMarker { +@ShellComponent +public class TempViewCommand { public static final String QUERY_SUCCESS = "Query ran successfully!"; public static final String QUERY_FAIL = "Query ran failed!"; public static final String SHOW_SUCCESS = "Show all views name successfully!"; - @CliCommand(value = {"temp_query", "temp query"}, help = "query against created temp view") + @ShellMethod(key = {"temp_query", "temp query"}, value = "query against created temp view") public String query( - @CliOption(key = {"sql"}, mandatory = true, help = "select query to run against view") final String sql) { + @ShellOption(value = {"--sql"}, help = "select query to run against view") final String sql) { try { HoodieCLI.getTempViewProvider().runQuery(sql); @@ -49,7 +47,7 @@ public String query( } - @CliCommand(value = {"temps_show", "temps show"}, help = "Show all views name") + @ShellMethod(key = {"temps_show", "temps show"}, value = "Show all views name") public String showAll() { try { @@ -60,9 +58,9 @@ public String showAll() { } } - @CliCommand(value = {"temp_delete", "temp delete"}, help = "Delete view name") + @ShellMethod(key = {"temp_delete", "temp delete"}, value = "Delete view name") public String delete( - @CliOption(key = {"view"}, mandatory = true, help = "view name") final String tableName) { + @ShellOption(value = {"--view"}, help = "view name") final String tableName) { try { HoodieCLI.getTempViewProvider().deleteTable(tableName); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TimelineCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TimelineCommand.java new file mode 100644 index 0000000000000..bf7e5397cab93 --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TimelineCommand.java @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.cli.commands; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.HoodieTableHeaderFields; +import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.LogManager; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; + +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * CLI command to display timeline options. + */ +@ShellComponent +public class TimelineCommand { + + private static final Logger LOG = LogManager.getLogger(TimelineCommand.class); + private static final SimpleDateFormat DATE_FORMAT_DEFAULT = new SimpleDateFormat("MM-dd HH:mm"); + private static final SimpleDateFormat DATE_FORMAT_SECONDS = new SimpleDateFormat("MM-dd HH:mm:ss"); + + @ShellMethod(key = "timeline show active", value = "List all instants in active timeline") + public String showActive( + @ShellOption(value = {"--limit"}, help = "Limit #rows to be displayed", defaultValue = "10") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--with-metadata-table"}, help = "Show metadata table timeline together with data table", + defaultValue = "false") final boolean withMetadataTable, + @ShellOption(value = {"--show-rollback-info"}, help = "Show instant to rollback for rollbacks", + defaultValue = "false") final boolean showRollbackInfo, + @ShellOption(value = {"--show-time-seconds"}, help = "Show seconds in instant file modification time", + defaultValue = "false") final boolean showTimeSeconds) { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + try { + if (withMetadataTable) { + HoodieTableMetaClient mtMetaClient = getMetadataTableMetaClient(metaClient); + return printTimelineInfoWithMetadataTable( + metaClient.getActiveTimeline(), mtMetaClient.getActiveTimeline(), + getInstantInfoFromTimeline(metaClient.getFs(), metaClient.getMetaPath()), + getInstantInfoFromTimeline(mtMetaClient.getFs(), mtMetaClient.getMetaPath()), + limit, sortByField, descending, headerOnly, true, showTimeSeconds, showRollbackInfo); + } + return printTimelineInfo( + 
metaClient.getActiveTimeline(), + getInstantInfoFromTimeline(metaClient.getFs(), metaClient.getMetaPath()), + limit, sortByField, descending, headerOnly, true, showTimeSeconds, showRollbackInfo); + } catch (IOException e) { + e.printStackTrace(); + return e.getMessage(); + } + } + + @ShellMethod(key = "timeline show incomplete", value = "List all incomplete instants in active timeline") + public String showIncomplete( + @ShellOption(value = {"--limit"}, help = "Limit #rows to be displayed", defaultValue = "10") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--show-rollback-info"}, help = "Show instant to rollback for rollbacks", + defaultValue = "false") final boolean showRollbackInfo, + @ShellOption(value = {"--show-time-seconds"}, help = "Show seconds in instant file modification time", + defaultValue = "false") final boolean showTimeSeconds) { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + try { + return printTimelineInfo( + metaClient.getActiveTimeline().filterInflightsAndRequested(), + getInstantInfoFromTimeline(metaClient.getFs(), metaClient.getMetaPath()), + limit, sortByField, descending, headerOnly, true, showTimeSeconds, showRollbackInfo); + } catch (IOException e) { + e.printStackTrace(); + return e.getMessage(); + } + } + + @ShellMethod(key = "metadata timeline show active", + value = "List all instants in active timeline of metadata table") + public String metadataShowActive( + @ShellOption(value = {"--limit"}, help = "Limit #rows to be displayed", defaultValue = "10") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--show-time-seconds"}, help = "Show seconds in instant file modification time", + defaultValue = "false") final boolean showTimeSeconds) { + HoodieTableMetaClient metaClient = getMetadataTableMetaClient(HoodieCLI.getTableMetaClient()); + try { + return printTimelineInfo( + metaClient.getActiveTimeline(), + getInstantInfoFromTimeline(metaClient.getFs(), metaClient.getMetaPath()), + limit, sortByField, descending, headerOnly, true, showTimeSeconds, false); + } catch (IOException e) { + e.printStackTrace(); + return e.getMessage(); + } + } + + @ShellMethod(key = "metadata timeline show incomplete", + value = "List all incomplete instants in active timeline of metadata table") + public String metadataShowIncomplete( + @ShellOption(value = {"--limit"}, help = "Limit #rows to be displayed", defaultValue = "10") Integer limit, + @ShellOption(value = {"--sortBy"}, help = "Sorting Field", defaultValue = "") final String sortByField, + @ShellOption(value = {"--desc"}, help = "Ordering", defaultValue = "false") final boolean descending, + @ShellOption(value = {"--headeronly"}, help = "Print Header Only", + defaultValue = "false") final boolean headerOnly, + @ShellOption(value = {"--show-time-seconds"}, help = "Show seconds in instant file modification time", + defaultValue = "false") final boolean showTimeSeconds) { + 
HoodieTableMetaClient metaClient = getMetadataTableMetaClient(HoodieCLI.getTableMetaClient()); + try { + return printTimelineInfo( + metaClient.getActiveTimeline().filterInflightsAndRequested(), + getInstantInfoFromTimeline(metaClient.getFs(), metaClient.getMetaPath()), + limit, sortByField, descending, headerOnly, true, showTimeSeconds, false); + } catch (IOException e) { + e.printStackTrace(); + return e.getMessage(); + } + } + + private HoodieTableMetaClient getMetadataTableMetaClient(HoodieTableMetaClient metaClient) { + return HoodieTableMetaClient.builder().setConf(HoodieCLI.conf) + .setBasePath(HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath())) + .setLoadActiveTimelineOnLoad(false) + .setConsistencyGuardConfig(HoodieCLI.consistencyGuardConfig) + .build(); + } + + private Map> getInstantInfoFromTimeline( + FileSystem fs, String metaPath) throws IOException { + Map> instantMap = new HashMap<>(); + Stream instantStream = Arrays.stream( + HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), path -> { + // Include only the meta files with extensions that needs to be included + String extension = HoodieInstant.getTimelineFileExtension(path.getName()); + return HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE.contains(extension); + })).map(HoodieInstantWithModTime::new); + instantStream.forEach(instant -> { + instantMap.computeIfAbsent(instant.getTimestamp(), t -> new HashMap<>()) + .put(instant.getState(), instant); + }); + return instantMap; + } + + private String getFormattedDate( + String instantTimestamp, HoodieInstant.State state, + Map> instantInfoMap, + boolean showTimeSeconds) { + Long timeMs = null; + Map mapping = instantInfoMap.get(instantTimestamp); + if (mapping != null && mapping.containsKey(state)) { + timeMs = mapping.get(state).getModificationTime(); + } + SimpleDateFormat sdf = showTimeSeconds ? DATE_FORMAT_SECONDS : DATE_FORMAT_DEFAULT; + return timeMs != null ? sdf.format(new Date(timeMs)) : "-"; + } + + private String printTimelineInfo( + HoodieTimeline timeline, + Map> instantInfoMap, + Integer limit, String sortByField, boolean descending, boolean headerOnly, boolean withRowNo, + boolean showTimeSeconds, boolean showRollbackInfo) { + Map> rollbackInfo = getRolledBackInstantInfo(timeline); + final List rows = timeline.getInstants().map(instant -> { + int numColumns = showRollbackInfo ? 
7 : 6; + Comparable[] row = new Comparable[numColumns]; + String instantTimestamp = instant.getTimestamp(); + row[0] = instantTimestamp; + row[1] = instant.getAction(); + row[2] = instant.getState(); + if (showRollbackInfo) { + if (HoodieTimeline.ROLLBACK_ACTION.equalsIgnoreCase(instant.getAction())) { + row[3] = "Rolls back\n" + getInstantToRollback(timeline, instant); + } else { + if (rollbackInfo.containsKey(instantTimestamp)) { + row[3] = "Rolled back by\n" + String.join(",\n", rollbackInfo.get(instantTimestamp)); + } else { + row[3] = "-"; + } + } + } + row[numColumns - 3] = getFormattedDate( + instantTimestamp, HoodieInstant.State.REQUESTED, instantInfoMap, showTimeSeconds); + row[numColumns - 2] = getFormattedDate( + instantTimestamp, HoodieInstant.State.INFLIGHT, instantInfoMap, showTimeSeconds); + row[numColumns - 1] = getFormattedDate( + instantTimestamp, HoodieInstant.State.COMPLETED, instantInfoMap, showTimeSeconds); + return row; + }).collect(Collectors.toList()); + TableHeader header = new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_STATE); + if (showRollbackInfo) { + header.addTableHeaderField(HoodieTableHeaderFields.HEADER_ROLLBACK_INFO); + } + header.addTableHeaderField(HoodieTableHeaderFields.HEADER_REQUESTED_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_INFLIGHT_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMPLETED_TIME); + return HoodiePrintHelper.print( + header, new HashMap<>(), withRowNo, sortByField, descending, limit, headerOnly, rows); + } + + private String printTimelineInfoWithMetadataTable( + HoodieTimeline dtTimeline, HoodieTimeline mtTimeline, + Map> dtInstantInfoMap, + Map> mtInstantInfoMap, + Integer limit, String sortByField, boolean descending, boolean headerOnly, boolean withRowNo, + boolean showTimeSeconds, boolean showRollbackInfo) { + Set instantTimeSet = new HashSet(dtInstantInfoMap.keySet()); + instantTimeSet.addAll(mtInstantInfoMap.keySet()); + List instantTimeList = instantTimeSet.stream() + .sorted(new HoodieInstantTimeComparator()).collect(Collectors.toList()); + Map> dtRollbackInfo = getRolledBackInstantInfo(dtTimeline); + + final List rows = instantTimeList.stream().map(instantTimestamp -> { + int numColumns = showRollbackInfo ? 12 : 11; + Option dtInstant = getInstant(dtTimeline, instantTimestamp); + Option mtInstant = getInstant(mtTimeline, instantTimestamp); + Comparable[] row = new Comparable[numColumns]; + row[0] = instantTimestamp; + row[1] = dtInstant.isPresent() ? dtInstant.get().getAction() : "-"; + row[2] = dtInstant.isPresent() ? 
dtInstant.get().getState() : "-"; + if (showRollbackInfo) { + if (dtInstant.isPresent() + && HoodieTimeline.ROLLBACK_ACTION.equalsIgnoreCase(dtInstant.get().getAction())) { + row[3] = "Rolls back\n" + getInstantToRollback(dtTimeline, dtInstant.get()); + } else { + if (dtRollbackInfo.containsKey(instantTimestamp)) { + row[3] = "Rolled back by\n" + String.join(",\n", dtRollbackInfo.get(instantTimestamp)); + } else { + row[3] = "-"; + } + } + } + row[numColumns - 8] = getFormattedDate( + instantTimestamp, HoodieInstant.State.REQUESTED, dtInstantInfoMap, showTimeSeconds); + row[numColumns - 7] = getFormattedDate( + instantTimestamp, HoodieInstant.State.INFLIGHT, dtInstantInfoMap, showTimeSeconds); + row[numColumns - 6] = getFormattedDate( + instantTimestamp, HoodieInstant.State.COMPLETED, dtInstantInfoMap, showTimeSeconds); + row[numColumns - 5] = mtInstant.isPresent() ? mtInstant.get().getAction() : "-"; + row[numColumns - 4] = mtInstant.isPresent() ? mtInstant.get().getState() : "-"; + row[numColumns - 3] = getFormattedDate( + instantTimestamp, HoodieInstant.State.REQUESTED, mtInstantInfoMap, showTimeSeconds); + row[numColumns - 2] = getFormattedDate( + instantTimestamp, HoodieInstant.State.INFLIGHT, mtInstantInfoMap, showTimeSeconds); + row[numColumns - 1] = getFormattedDate( + instantTimestamp, HoodieInstant.State.COMPLETED, mtInstantInfoMap, showTimeSeconds); + return row; + }).collect(Collectors.toList()); + TableHeader header = new TableHeader() + .addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_STATE); + if (showRollbackInfo) { + header.addTableHeaderField(HoodieTableHeaderFields.HEADER_ROLLBACK_INFO); + } + header.addTableHeaderField(HoodieTableHeaderFields.HEADER_REQUESTED_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_INFLIGHT_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMPLETED_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_MT_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_MT_STATE) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_MT_REQUESTED_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_MT_INFLIGHT_TIME) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_MT_COMPLETED_TIME); + return HoodiePrintHelper.print( + header, new HashMap<>(), withRowNo, sortByField, descending, limit, headerOnly, rows); + } + + private Option getInstant(HoodieTimeline timeline, String instantTimestamp) { + return timeline.filter(instant -> instant.getTimestamp().equals(instantTimestamp)).firstInstant(); + } + + private String getInstantToRollback(HoodieTimeline timeline, HoodieInstant instant) { + try { + if (instant.isInflight()) { + HoodieInstant instantToUse = new HoodieInstant( + HoodieInstant.State.REQUESTED, instant.getAction(), instant.getTimestamp()); + HoodieRollbackPlan metadata = TimelineMetadataUtils + .deserializeAvroMetadata(timeline.getInstantDetails(instantToUse).get(), HoodieRollbackPlan.class); + return metadata.getInstantToRollback().getCommitTime(); + } else { + HoodieRollbackMetadata metadata = TimelineMetadataUtils + .deserializeAvroMetadata(timeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class); + return String.join(",", metadata.getCommitsRollback()); + } + } catch (IOException e) { + LOG.error(String.format("Error reading rollback info of %s", instant)); + e.printStackTrace(); + return "-"; + } + } + + private Map> getRolledBackInstantInfo(HoodieTimeline 
timeline) { + // Instant rolled back or to roll back -> rollback instants + Map<String, List<String>> rollbackInfoMap = new HashMap<>(); + List<HoodieInstant> rollbackInstants = timeline.filter(instant -> + HoodieTimeline.ROLLBACK_ACTION.equalsIgnoreCase(instant.getAction())) + .getInstants().collect(Collectors.toList()); + rollbackInstants.forEach(rollbackInstant -> { + try { + if (rollbackInstant.isInflight()) { + HoodieInstant instantToUse = new HoodieInstant( + HoodieInstant.State.REQUESTED, rollbackInstant.getAction(), rollbackInstant.getTimestamp()); + HoodieRollbackPlan metadata = TimelineMetadataUtils + .deserializeAvroMetadata(timeline.getInstantDetails(instantToUse).get(), HoodieRollbackPlan.class); + rollbackInfoMap.computeIfAbsent(metadata.getInstantToRollback().getCommitTime(), k -> new ArrayList<>()) + .add(rollbackInstant.getTimestamp()); + } else { + HoodieRollbackMetadata metadata = TimelineMetadataUtils + .deserializeAvroMetadata(timeline.getInstantDetails(rollbackInstant).get(), HoodieRollbackMetadata.class); + metadata.getCommitsRollback().forEach(instant -> { + rollbackInfoMap.computeIfAbsent(instant, k -> new ArrayList<>()) + .add(rollbackInstant.getTimestamp()); + }); + } + } catch (IOException e) { + LOG.error(String.format("Error reading rollback info of %s", rollbackInstant)); + e.printStackTrace(); + } + }); + return rollbackInfoMap; + } + + static class HoodieInstantWithModTime extends HoodieInstant { + + private final long modificationTimeMs; + + public HoodieInstantWithModTime(FileStatus fileStatus) { + super(fileStatus); + this.modificationTimeMs = fileStatus.getModificationTime(); + } + + public long getModificationTime() { + return modificationTimeMs; + } + } + + static class HoodieInstantTimeComparator implements Comparator<String> { + @Override + public int compare(String o1, String o2) { + // For metadata table, the compaction instant time is "012345001" while the delta commit + // later is "012345", i.e., the compaction instant time has trailing "001". In the + // actual event sequence, metadata table compaction happens before the corresponding + // delta commit. For better visualization, we put "012345001" before "012345" + // when sorting in ascending order. 
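// NOTE (illustrative aside, not part of the patch): a worked example of the ordering
// described above and implemented just below. Sorting the instant times "012346",
// "012345" and "012345001" in ascending order with this comparator yields
// ["012345001", "012345", "012346"], because "012345001" is exactly "012345" + "001"
// and is therefore placed immediately before its corresponding delta commit:
//
//   List<String> times = new ArrayList<>(Arrays.asList("012346", "012345", "012345001"));
//   times.sort(new HoodieInstantTimeComparator());
//   // times => ["012345001", "012345", "012346"]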
+ if (o1.length() != o2.length()) { + // o1 is longer than o2 + if (o1.length() - o2.length() == 3 && o1.endsWith("001") && o1.startsWith(o2)) { + return -1; + } + // o1 is shorter than o2 + if (o2.length() - o1.length() == 3 && o2.endsWith("001") && o2.startsWith(o1)) { + return 1; + } + } + return o1.compareTo(o2); + } + } +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UpgradeOrDowngradeCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UpgradeOrDowngradeCommand.java index deb9e0727171b..5561723d7a57a 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UpgradeOrDowngradeCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UpgradeOrDowngradeCommand.java @@ -23,59 +23,79 @@ import org.apache.hudi.cli.utils.InputStreamConsumer; import org.apache.hudi.cli.utils.SparkUtil; import org.apache.hudi.common.table.HoodieTableMetaClient; - +import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.common.util.StringUtils; import org.apache.spark.launcher.SparkLauncher; -import org.springframework.shell.core.CommandMarker; -import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; /** - * CLI command to assist in upgrading/downgrading Hoodie dataset to a different version. + * CLI command to assist in upgrading/downgrading Hoodie table to a different version. */ -public class UpgradeOrDowngradeCommand implements CommandMarker { +@ShellComponent +public class UpgradeOrDowngradeCommand { - @CliCommand(value = "upgrade hoodie dataset ", help = "Upgrades hoodie dataset") - public String upgradeHoodieDataset( - @CliOption(key = {"toVersion"}, help = "To version of Hoodie dataset to be upgraded/downgraded to", unspecifiedDefaultValue = "") final String toVersion, - @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + @ShellMethod(key = "upgrade table", value = "Upgrades a table") + public String upgradeHoodieTable( + @ShellOption(value = {"--toVersion"}, help = "To version of Hoodie table to be upgraded/downgraded to", defaultValue = "") final String toVersion, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") final String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", help = "Spark executor memory") final String sparkMemory) throws Exception { HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.UPGRADE.toString(), master, sparkMemory, metaClient.getBasePath(), toVersion); + String toVersionName = getHoodieTableVersionName(toVersion, true); + sparkLauncher.addAppArgs(SparkCommand.UPGRADE.toString(), master, sparkMemory, metaClient.getBasePath(), toVersionName); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); HoodieCLI.refreshTableMetadata(); if (exitCode != 0) { - return 
String.format("Failed: Could not Upgrade/Downgrade Hoodie dataset to \"%s\".", toVersion); + return String.format("Failed: Could not Upgrade/Downgrade Hoodie table to \"%s\".", toVersionName); } - return String.format("Hoodie dataset upgraded/downgraded to ", toVersion); + return String.format("Hoodie table upgraded/downgraded to %s", toVersionName); } - @CliCommand(value = "downgrade hoodie dataset ", help = "Upgrades hoodie dataset") - public String downgradeHoodieDataset( - @CliOption(key = {"toVersion"}, help = "To version of Hoodie dataset to be upgraded/downgraded to", unspecifiedDefaultValue = "") final String toVersion, - @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, - @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, - @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + @ShellMethod(key = "downgrade table", value = "Downgrades a table") + public String downgradeHoodieTable( + @ShellOption(value = {"--toVersion"}, help = "To version of Hoodie table to be upgraded/downgraded to", defaultValue = "") final String toVersion, + @ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path", + defaultValue = "") final String sparkPropertiesPath, + @ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master, + @ShellOption(value = "--sparkMemory", defaultValue = "4G", help = "Spark executor memory") final String sparkMemory) throws Exception { HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); - sparkLauncher.addAppArgs(SparkCommand.DOWNGRADE.toString(), master, sparkMemory, metaClient.getBasePath(), toVersion); + String toVersionName = getHoodieTableVersionName(toVersion, false); + sparkLauncher.addAppArgs(SparkCommand.DOWNGRADE.toString(), master, sparkMemory, metaClient.getBasePath(), toVersionName); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); HoodieCLI.refreshTableMetadata(); if (exitCode != 0) { - return String.format("Failed: Could not Upgrade/Downgrade Hoodie dataset to \"%s\".", toVersion); + return String.format("Failed: Could not Upgrade/Downgrade Hoodie table to \"%s\".", toVersionName); + } + return String.format("Hoodie table upgraded/downgraded to %s", toVersionName); + } + + static String getHoodieTableVersionName(String versionOption, boolean overrideWithDefault) { + if (StringUtils.isNullOrEmpty(versionOption) && overrideWithDefault) { + return HoodieTableVersion.current().name(); + } + + try { + int versionCode = Integer.parseInt(versionOption); + return HoodieTableVersion.versionFromCode(versionCode).name(); + } catch (NumberFormatException e) { + // The version option from the CLI is not a number, returns the original String + return versionOption; } - return String.format("Hoodie dataset upgraded/downgraded to ", toVersion); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UtilsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UtilsCommand.java index 677cb7ffce2bd..2861b05b44d09 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UtilsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/UtilsCommand.java @@ -19,19 +19,18 @@ package org.apache.hudi.cli.commands; import org.apache.hudi.common.util.StringUtils; -import org.springframework.shell.core.CommandMarker; 
-import org.springframework.shell.core.annotation.CliCommand; -import org.springframework.shell.core.annotation.CliOption; -import org.springframework.stereotype.Component; +import org.springframework.shell.standard.ShellComponent; +import org.springframework.shell.standard.ShellMethod; +import org.springframework.shell.standard.ShellOption; /** * CLI command to display utils. */ -@Component -public class UtilsCommand implements CommandMarker { +@ShellComponent +public class UtilsCommand { - @CliCommand(value = "utils loadClass", help = "Load a class") - public String loadClass(@CliOption(key = {"class"}, help = "Check mode") final String clazz) { + @ShellMethod(key = "utils loadClass", value = "Load a class") + public String loadClass(@ShellOption(value = {"--class"}, help = "Check mode") final String clazz) { if (StringUtils.isNullOrEmpty(clazz)) { return "Class to be loaded can not be null!"; } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java index 5a1c457b10ef1..21910fd956dfe 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java @@ -25,9 +25,6 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import java.io.IOException; -import java.text.ParseException; -import java.time.Instant; -import java.time.ZoneId; import java.time.ZonedDateTime; import java.util.Date; import java.util.List; @@ -37,9 +34,9 @@ */ public class CommitUtil { - public static long countNewRecords(HoodieTableMetaClient target, List commitsToCatchup) throws IOException { + public static long countNewRecords(HoodieTableMetaClient metaClient, List commitsToCatchup) throws IOException { long totalNew = 0; - HoodieTimeline timeline = target.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants(); + HoodieTimeline timeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants(); for (String commit : commitsToCatchup) { HoodieCommitMetadata c = HoodieCommitMetadata.fromBytes( timeline.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)).get(), @@ -51,18 +48,6 @@ public static long countNewRecords(HoodieTableMetaClient target, List co public static String getTimeDaysAgo(int numberOfDays) { Date date = Date.from(ZonedDateTime.now().minusDays(numberOfDays).toInstant()); - return HoodieActiveTimeline.COMMIT_FORMATTER.format(date); - } - - /** - * Add hours to specified time. If hours <0, this acts as remove hours. 
- * example, say compactionCommitTime: "20200202020000" - * a) hours: +1, returns 20200202030000 - * b) hours: -1, returns 20200202010000 - */ - public static String addHours(String compactionCommitTime, int hours) throws ParseException { - Instant instant = HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).toInstant(); - ZonedDateTime commitDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault()); - return HoodieActiveTimeline.COMMIT_FORMATTER.format(Date.from(commitDateTime.plusHours(hours).toInstant())); + return HoodieActiveTimeline.formatDate(date); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/InputStreamConsumer.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/InputStreamConsumer.java index 73aa45c500d09..a2ebe5769d488 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/InputStreamConsumer.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/InputStreamConsumer.java @@ -18,18 +18,19 @@ package org.apache.hudi.cli.utils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + import java.io.BufferedReader; -import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.util.logging.Logger; /** * This class is responsible to read a Process output. */ public class InputStreamConsumer extends Thread { - private static final Logger LOG = Logger.getLogger(InputStreamConsumer.class.getName()); + private static final Logger LOG = LogManager.getLogger(InputStreamConsumer.class); private InputStream is; public InputStreamConsumer(InputStream is) { @@ -41,13 +42,10 @@ public void run() { try { InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr); - String line; - while ((line = br.readLine()) != null) { - LOG.info(line); - } - } catch (IOException ioe) { - LOG.severe(ioe.toString()); - ioe.printStackTrace(); + br.lines().forEach(LOG::info); + } catch (Exception e) { + LOG.fatal(e.toString()); + e.printStackTrace(); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java index 5e029cd050b34..4f9e4b0d9a9c0 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java @@ -19,9 +19,8 @@ package org.apache.hudi.cli.utils; import org.apache.hudi.exception.HoodieException; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; @@ -36,6 +35,7 @@ import java.util.stream.Collectors; public class SparkTempViewProvider implements TempViewProvider { + private static final Logger LOG = LogManager.getLogger(SparkTempViewProvider.class); private JavaSparkContext jsc; @@ -46,16 +46,19 @@ public SparkTempViewProvider(String appName) { SparkConf sparkConf = new SparkConf().setAppName(appName) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[8]"); jsc = new JavaSparkContext(sparkConf); - jsc.setLogLevel("ERROR"); - sqlContext = new SQLContext(jsc); } catch (Throwable ex) { // log full stack trace and rethrow. 
Without this its difficult to debug failures, if any - LOG.error("unable to initialize spark context ", ex); + LOG.warn("unable to initialize spark context ", ex); throw new HoodieException(ex); } } + public SparkTempViewProvider(JavaSparkContext jsc, SQLContext sqlContext) { + this.jsc = jsc; + this.sqlContext = sqlContext; + } + @Override public void createOrReplace(String tableName, List headers, List> rows) { try { @@ -85,7 +88,7 @@ public void createOrReplace(String tableName, List headers, List + sparkLauncher.addJar(new File(libDirectory, library).getAbsolutePath())); } return sparkLauncher; } - public static JavaSparkContext initJavaSparkConf(String name) { - return initJavaSparkConf(name, Option.empty(), Option.empty()); - } - - public static JavaSparkContext initJavaSparkConf(String name, Option master, - Option executorMemory) { - SparkConf sparkConf = new SparkConf().setAppName(name); + /** + * Get the default spark configuration. + * + * @param appName - Spark application name + * @param sparkMaster - Spark master node name + * @return Spark configuration + */ + public static SparkConf getDefaultConf(final String appName, final Option sparkMaster) { + final Properties properties = System.getProperties(); + SparkConf sparkConf = new SparkConf().setAppName(appName); - String defMaster = master.orElse(sparkConf.getenv(HoodieCliSparkConfig.CLI_SPARK_MASTER)); - if ((null == defMaster) || (defMaster.isEmpty())) { - sparkConf.setMaster(DEFAULT_SPARK_MASTER); - } else { - sparkConf.setMaster(defMaster); + // Configure the sparkMaster + String sparkMasterNode = DEFAULT_SPARK_MASTER; + if (properties.getProperty(HoodieCliSparkConfig.CLI_SPARK_MASTER) != null) { + sparkMasterNode = properties.getProperty(HoodieCliSparkConfig.CLI_SPARK_MASTER); + } + if (sparkMaster.isPresent() && !sparkMaster.get().trim().isEmpty()) { + sparkMasterNode = sparkMaster.orElse(sparkMasterNode); } + sparkConf.setMaster(sparkMasterNode); - sparkConf.set(HoodieCliSparkConfig.CLI_SERIALIZER, "org.apache.spark.serializer.KryoSerializer"); + // Configure driver sparkConf.set(HoodieCliSparkConfig.CLI_DRIVER_MAX_RESULT_SIZE, "2g"); sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_OVERWRITE, "true"); - sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_ENABLED, "true"); - if (executorMemory.isPresent()) { - sparkConf.set(HoodieCliSparkConfig.CLI_EXECUTOR_MEMORY, executorMemory.get()); - } + sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_ENABLED, "false"); + sparkConf.set(HoodieCliSparkConfig.CLI_SERIALIZER, "org.apache.spark.serializer.KryoSerializer"); // Configure hadoop conf sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESS, "true"); @@ -91,10 +99,28 @@ public static JavaSparkContext initJavaSparkConf(String name, Option mas sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_CODEC, "org.apache.hadoop.io.compress.GzipCodec"); sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_TYPE, "BLOCK"); + return sparkConf; + } + + public static JavaSparkContext initJavaSparkContext(String name) { + return initJavaSparkContext(name, Option.empty(), Option.empty()); + } + + public static JavaSparkContext initJavaSparkContext(String name, Option master, Option executorMemory) { + SparkConf sparkConf = getDefaultConf(name, master); + if (executorMemory.isPresent()) { + sparkConf.set(HoodieCliSparkConfig.CLI_EXECUTOR_MEMORY, executorMemory.get()); + } + + return initJavaSparkContext(sparkConf); + } + + public static JavaSparkContext initJavaSparkContext(SparkConf sparkConf) { 
SparkRDDWriteClient.registerClasses(sparkConf); JavaSparkContext jsc = new JavaSparkContext(sparkConf); jsc.hadoopConfiguration().setBoolean(HoodieCliSparkConfig.CLI_PARQUET_ENABLE_SUMMARY_METADATA, false); FSUtils.prepareHadoopConf(jsc.hadoopConfiguration()); return jsc; } + } diff --git a/hudi-cli/src/main/resources/META-INF/spring/spring-shell-plugin.xml b/hudi-cli/src/main/resources/META-INF/spring/spring-shell-plugin.xml deleted file mode 100644 index e28b4f9e887eb..0000000000000 --- a/hudi-cli/src/main/resources/META-INF/spring/spring-shell-plugin.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - - - diff --git a/hudi-cli/src/main/resources/application.yml b/hudi-cli/src/main/resources/application.yml new file mode 100644 index 0000000000000..036524c58d5ed --- /dev/null +++ b/hudi-cli/src/main/resources/application.yml @@ -0,0 +1,23 @@ +### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### + +spring: + shell: + history: + enabled: true + name: hoodie-cmd.log \ No newline at end of file diff --git a/hudi-cli/src/main/resources/banner.txt b/hudi-cli/src/main/resources/banner.txt new file mode 100644 index 0000000000000..be572b83eb277 --- /dev/null +++ b/hudi-cli/src/main/resources/banner.txt @@ -0,0 +1,14 @@ +=================================================================== +* ___ ___ * +* /\__\ ___ /\ \ ___ * +* / / / /\__\ / \ \ /\ \ * +* / /__/ / / / / /\ \ \ \ \ \ * +* / \ \ ___ / / / / / \ \__\ / \__\ * +* / /\ \ /\__\ / /__/ ___ / /__/ \ |__| / /\/__/ * +* \/ \ \/ / / \ \ \ /\__\ \ \ \ / / / /\/ / / * +* \ / / \ \ / / / \ \ / / / \ /__/ * +* / / / \ \/ / / \ \/ / / \ \__\ * +* / / / \ / / \ / / \/__/ * +* \/__/ \/__/ \/__/ Apache Hudi CLI * +* * +=================================================================== \ No newline at end of file diff --git a/hudi-cli/src/main/resources/log4j2.properties b/hudi-cli/src/main/resources/log4j2.properties new file mode 100644 index 0000000000000..bc8e5ad56c7d6 --- /dev/null +++ b/hudi-cli/src/main/resources/log4j2.properties @@ -0,0 +1,38 @@ +### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +### + +status = INFO +name = HudiCliLog4j2 + +appender.console.type = Console +appender.console.name = CONSOLE +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %-4r [%t] %-5p %c %x - %m%n + +# Root logger level +rootLogger.level = warn +# Root logger referring to console appender +rootLogger.appenderRef.stdout.ref = CONSOLE + +logger.hudi_cli.name = org.apache.hudi.cli +logger.hudi_cli.level = info +logger.hudi_common.name = org.apache.hudi.common +logger.hudi_common.level = info + +logger.spark.name = org.apache.spark +logger.spark.level = info diff --git a/hudi-cli/src/main/scala/org/apache/hudi/cli/DedupeSparkJob.scala b/hudi-cli/src/main/scala/org/apache/hudi/cli/DedupeSparkJob.scala index 96944c5c032cd..00e96a3487504 100644 --- a/hudi-cli/src/main/scala/org/apache/hudi/cli/DedupeSparkJob.scala +++ b/hudi-cli/src/main/scala/org/apache/hudi/cli/DedupeSparkJob.scala @@ -18,14 +18,14 @@ package org.apache.hudi.cli import java.util.stream.Collectors - import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.{HoodieBaseFile, HoodieRecord} import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.view.HoodieTableFileSystemView import org.apache.hudi.exception.HoodieException -import org.apache.log4j.Logger +import org.apache.logging.log4j.LogManager +import org.apache.logging.log4j.Logger import org.apache.spark.sql.{DataFrame, Row, SQLContext} import scala.collection.JavaConversions._ @@ -42,7 +42,7 @@ class DedupeSparkJob(basePath: String, dedupeType: DeDupeType.Value) { val sparkHelper = new SparkHelper(sqlContext, fs) - val LOG = Logger.getLogger(this.getClass) + val LOG = LogManager.getLogger(this.getClass) /** @@ -55,7 +55,7 @@ class DedupeSparkJob(basePath: String, s""" select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key, count(*) as dupe_cnt - from ${tblName} + from $tblName group by `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` having dupe_cnt > 1 """ @@ -75,10 +75,10 @@ class DedupeSparkJob(basePath: String, val tmpTableName = s"htbl_${System.currentTimeMillis()}" val dedupeTblName = s"${tmpTableName}_dupeKeys" - val metadata = new HoodieTableMetaClient(fs.getConf, basePath) + val metadata = HoodieTableMetaClient.builder().setConf(fs.getConf).setBasePath(basePath).build() val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"$basePath/$duplicatedPartitionPath")) - val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) + val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitsTimeline.filterCompletedInstants(), allFiles) val latestFiles: java.util.List[HoodieBaseFile] = fsView.getLatestBaseFiles().collect(Collectors.toList[HoodieBaseFile]()) val filteredStatuses = latestFiles.map(f => f.getPath) LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}") @@ -184,10 +184,10 @@ class DedupeSparkJob(basePath: String, } def fixDuplicates(dryRun: Boolean = true) = { - val metadata = new HoodieTableMetaClient(fs.getConf, basePath) + val metadata = HoodieTableMetaClient.builder().setConf(fs.getConf).setBasePath(basePath).build() val allFiles = fs.listStatus(new Path(s"$basePath/$duplicatedPartitionPath")) - val fsView = new HoodieTableFileSystemView(metadata, 
metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) + val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitsTimeline.filterCompletedInstants(), allFiles) val latestFiles: java.util.List[HoodieBaseFile] = fsView.getLatestBaseFiles().collect(Collectors.toList[HoodieBaseFile]()) @@ -233,7 +233,7 @@ class DedupeSparkJob(basePath: String, } println("No duplicates found & counts are in check!!!! ") - // 4. Prepare to copy the fixed files back. + // 5. Prepare to copy the fixed files back. fileNameToPathMap.foreach { case (_, filePath) => val srcPath = new Path(s"$repairOutputPath/${filePath.getName}") val dstPath = new Path(s"$basePath/$duplicatedPartitionPath/${filePath.getName}") diff --git a/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala b/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala index 6859f7038c731..826fbcb4788e7 100644 --- a/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala +++ b/hudi-cli/src/main/scala/org/apache/hudi/cli/SparkHelpers.scala @@ -23,12 +23,11 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hudi.avro.HoodieAvroWriteSupport import org.apache.hudi.client.SparkTaskContextSupplier -import org.apache.hudi.common.HoodieJsonPayload import org.apache.hudi.common.bloom.{BloomFilter, BloomFilterFactory} -import org.apache.hudi.common.model.HoodieRecord -import org.apache.hudi.common.util.ParquetUtils +import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord} +import org.apache.hudi.common.util.BaseFileUtils import org.apache.hudi.config.{HoodieIndexConfig, HoodieStorageConfig} -import org.apache.hudi.io.storage.{HoodieAvroParquetConfig, HoodieParquetWriter} +import org.apache.hudi.io.storage.{HoodieAvroParquetWriter, HoodieParquetConfig} import org.apache.parquet.avro.AvroSchemaConverter import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.{DataFrame, SQLContext} @@ -40,17 +39,17 @@ import scala.collection.mutable._ object SparkHelpers { @throws[Exception] def skipKeysAndWriteNewFile(instantTime: String, fs: FileSystem, sourceFile: Path, destinationFile: Path, keysToSkip: Set[String]) { - val sourceRecords = ParquetUtils.readAvroRecords(fs.getConf, sourceFile) + val sourceRecords = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET).readAvroRecords(fs.getConf, sourceFile) val schema: Schema = sourceRecords.get(0).getSchema - val filter: BloomFilter = BloomFilterFactory.createBloomFilter(HoodieIndexConfig.DEFAULT_BLOOM_FILTER_NUM_ENTRIES.toInt, HoodieIndexConfig.DEFAULT_BLOOM_FILTER_FPP.toDouble, - HoodieIndexConfig.DEFAULT_HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.toInt, HoodieIndexConfig.DEFAULT_BLOOM_INDEX_FILTER_TYPE); - val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter) - val parquetConfig: HoodieAvroParquetConfig = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieStorageConfig.DEFAULT_PARQUET_BLOCK_SIZE_BYTES.toInt, HoodieStorageConfig.DEFAULT_PARQUET_PAGE_SIZE_BYTES.toInt, HoodieStorageConfig.DEFAULT_PARQUET_FILE_MAX_BYTES.toInt, fs.getConf, HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO.toDouble) + val filter: BloomFilter = BloomFilterFactory.createBloomFilter(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES_VALUE.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_FPP_VALUE.defaultValue.toDouble, + 
HoodieIndexConfig.BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_TYPE.defaultValue); + val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(fs.getConf).convert(schema), schema, org.apache.hudi.common.util.Option.of(filter)) + val parquetConfig: HoodieParquetConfig[HoodieAvroWriteSupport] = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieStorageConfig.PARQUET_BLOCK_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_PAGE_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.defaultValue.toInt, fs.getConf, HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue.toDouble) // Add current classLoad for config, if not will throw classNotFound of 'HoodieWrapperFileSystem'. parquetConfig.getHadoopConf().setClassLoader(Thread.currentThread.getContextClassLoader) - val writer = new HoodieParquetWriter[HoodieJsonPayload, IndexedRecord](instantTime, destinationFile, parquetConfig, schema, new SparkTaskContextSupplier()) + val writer = new HoodieAvroParquetWriter(destinationFile, parquetConfig, instantTime, new SparkTaskContextSupplier(), true) for (rec <- sourceRecords) { val key: String = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString if (!keysToSkip.contains(key)) { @@ -125,7 +124,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) { * @return */ def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = { - val bf = ParquetUtils.readBloomFilterFromParquetMetadata(conf, new Path(file)) + val bf = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET).readBloomFilterFromMetadata(conf, new Path(file)) val foundCount = sqlContext.parquetFile(file) .select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`") .collect().count(r => !bf.mightContain(r.getString(0))) diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/TestSparkUtil.java b/hudi-cli/src/test/java/org/apache/hudi/cli/TestSparkUtil.java new file mode 100644 index 0000000000000..4dcd15156baf1 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/TestSparkUtil.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
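Note (illustrative, not part of the patch): the SparkHelpers.scala hunk above replaces the removed ParquetUtils entry points with the format-dispatched BaseFileUtils API. A rough Java sketch of that lookup, with a hypothetical file path:

import java.util.List;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.BaseFileUtils;

public class BaseFileUtilsExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    Path baseFile = new Path("/tmp/example-base-file.parquet"); // hypothetical path
    // Look up the utils for the base file format, as the rewritten Scala helper does.
    BaseFileUtils utils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
    List<GenericRecord> records = utils.readAvroRecords(conf, baseFile);
    BloomFilter bloomFilter = utils.readBloomFilterFromMetadata(conf, baseFile);
    System.out.println(records.size() + " records read; bloom filter loaded: " + (bloomFilter != null));
  }
}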
+ */ + +package org.apache.hudi.cli; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.spark.SparkConf; + +import org.apache.spark.launcher.SparkLauncher; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class TestSparkUtil { + + @Test + public void testInitSparkLauncher() throws URISyntaxException { + SparkLauncher sparkLauncher = SparkUtil.initLauncher(null); + assertNotNull(sparkLauncher); + } + + @Test + public void testGetDefaultSparkConf() { + SparkConf sparkConf = SparkUtil.getDefaultConf("test-spark-app", Option.of("")); + assertEquals(SparkUtil.DEFAULT_SPARK_MASTER, sparkConf.get("spark.master")); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java index 5c27636da18c8..b642c1b3f8ebb 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java @@ -21,25 +21,29 @@ import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.TableHeader; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; import org.apache.hudi.cli.testutils.HoodieTestCommitUtilities; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.client.HoodieTimelineArchiver; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; -import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; -import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -50,30 +54,36 @@ /** * Test Cases for {@link ArchivedCommitsCommand}. 
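Note (illustrative, not part of the patch): the new TestSparkUtil above pins down the master resolution in SparkUtil.getDefaultConf introduced earlier in this change: an explicit, non-empty sparkMaster option wins, otherwise the HoodieCliSparkConfig.CLI_SPARK_MASTER system property is used, and with neither set the configuration falls back to SparkUtil.DEFAULT_SPARK_MASTER. A small sketch with arbitrary application and master names:

import org.apache.hudi.cli.utils.SparkUtil;
import org.apache.hudi.common.util.Option;
import org.apache.spark.SparkConf;

public class SparkMasterResolutionExample {
  public static void main(String[] args) {
    // Empty option and no CLI_SPARK_MASTER system property:
    // spark.master falls back to SparkUtil.DEFAULT_SPARK_MASTER, as the test asserts.
    SparkConf defaults = SparkUtil.getDefaultConf("hudi-cli-example", Option.of(""));
    System.out.println(defaults.get("spark.master"));

    // A non-empty option overrides both the system property and the default.
    SparkConf explicit = SparkUtil.getDefaultConf("hudi-cli-example", Option.of("local[2]"));
    System.out.println(explicit.get("spark.master")); // expected: local[2]
  }
}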
*/ -public class TestArchivedCommitsCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestArchivedCommitsCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; private String tablePath; @BeforeEach public void init() throws Exception { - initDFS(); - jsc.hadoopConfiguration().addResource(dfs.getConf()); - HoodieCLI.conf = dfs.getConf(); + HoodieCLI.conf = hadoopConf(); // Create table and connect - String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + String tableName = tableName(); + tablePath = tablePath(tableName); new TableCommand().createTable( tablePath, tableName, "COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient = HoodieCLI.getTableMetaClient(); + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); // Generate archive HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) .withSchema(HoodieTestCommitMetadataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 3).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) .forTable("test-trip-table").build(); // Create six commits @@ -81,26 +91,26 @@ public void init() throws Exception { String timestamp = String.valueOf(i); // Requested Compaction HoodieTestCommitMetadataGenerator.createCompactionAuxiliaryMetadata(tablePath, - new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, timestamp), dfs.getConf()); + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, timestamp), hadoopConf()); // Inflight Compaction HoodieTestCommitMetadataGenerator.createCompactionAuxiliaryMetadata(tablePath, - new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, timestamp), dfs.getConf()); - HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, timestamp, dfs.getConf()); + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, timestamp), hadoopConf()); + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, timestamp, hadoopConf()); } + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + HoodieTestUtils.createCompactionCommitInMetadataTable( + hadoopConf(), metaClient.getFs(), tablePath, "105"); + metaClient = HoodieTableMetaClient.reload(metaClient); // reload the timeline and get all the commits before archive metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); // archive - HoodieSparkTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - archiveLog.archiveIfRequired(context); - } - - @AfterEach - public void clean() throws IOException { - cleanupDFS(); + HoodieSparkTable table = HoodieSparkTable.create(cfg, context(), metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + archiver.archiveIfRequired(context()); } /** @@ -108,8 +118,8 @@ public void clean() throws 
IOException { */ @Test public void testShowArchivedCommits() { - CommandResult cr = getShell().executeCommand("show archived commit stats"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show archived commit stats"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); TableHeader header = new TableHeader().addTableHeaderField("action").addTableHeaderField("instant") .addTableHeaderField("partition").addTableHeaderField("file_id").addTableHeaderField("prev_instant") @@ -150,7 +160,7 @@ public void testShowArchivedCommits() { String expectedResult = HoodiePrintHelper.print( header, new HashMap<>(), "", false, -1, false, rows); expectedResult = removeNonWordAndStripSpace(expectedResult); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expectedResult, got); } @@ -159,8 +169,8 @@ public void testShowArchivedCommits() { */ @Test public void testShowCommits() throws Exception { - CommandResult cr = getShell().executeCommand("show archived commits"); - assertTrue(cr.isSuccess()); + Object cmdResult = shell.evaluate(() -> "show archived commits"); + assertTrue(ShellEvaluationResultUtil.isSuccess(cmdResult)); final List rows = new ArrayList<>(); // Test default skipMetadata and limit 10 @@ -175,12 +185,12 @@ public void testShowCommits() throws Exception { rows.add(new Comparable[] {"103", "commit"}); String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, 10, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(cmdResult.toString()); assertEquals(expected, got); // Test with Metadata and no limit - cr = getShell().executeCommand("show archived commits --skipMetadata false --limit -1"); - assertTrue(cr.isSuccess()); + cmdResult = shell.evaluate(() -> "show archived commits --skipMetadata false --limit 0"); + assertTrue(ShellEvaluationResultUtil.isSuccess(cmdResult)); rows.clear(); @@ -195,9 +205,9 @@ public void testShowCommits() throws Exception { rows.add(result); } header = header.addTableHeaderField("CommitDetails"); - expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); + expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, 0, false, rows); expected = removeNonWordAndStripSpace(expected); - got = removeNonWordAndStripSpace(cr.getResult().toString()); + got = removeNonWordAndStripSpace(cmdResult.toString()); assertEquals(expected, got); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java index 2311aaa22f3fa..f0ed1787e21f8 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCleansCommand.java @@ -23,9 +23,12 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCleaningPolicy; +import 
org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -34,19 +37,25 @@ import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.util.Option; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; import java.net.URL; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.UUID; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -56,17 +65,22 @@ /** * Test Cases for {@link CleansCommand}. */ -public class TestCleansCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestCleansCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; - private String tablePath; private URL propsFilePath; + private HoodieTableMetaClient metaClient; @BeforeEach public void init() throws Exception { - HoodieCLI.conf = jsc.hadoopConfiguration(); + HoodieCLI.conf = hadoopConf(); - String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + String tableName = tableName(); + String tablePath = tablePath(tableName); propsFilePath = TestCleansCommand.class.getClassLoader().getResource("clean.properties"); // Create table and connect @@ -79,7 +93,8 @@ public void init() throws Exception { metaClient = HoodieCLI.getTableMetaClient(); String fileId1 = UUID.randomUUID().toString(); String fileId2 = UUID.randomUUID().toString(); - HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, tablePath); + FileSystem fs = FSUtils.getFs(basePath(), hadoopConf()); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, tablePath); // Create four commits for (int i = 100; i < 104; i++) { @@ -90,8 +105,11 @@ public void init() throws Exception { // Inflight Compaction HoodieTestCommitMetadataGenerator.createCompactionAuxiliaryMetadata(tablePath, new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, timestamp), conf); + + Map extraCommitMetadata = + Collections.singletonMap(HoodieCommitMetadata.SCHEMA_KEY, HoodieTestTable.PHONY_TABLE_SCHEMA); HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, timestamp, conf, fileId1, fileId2, - Option.empty(), Option.empty()); + Option.empty(), Option.empty(), extraCommitMetadata); } metaClient = HoodieTableMetaClient.reload(metaClient); @@ -108,12 +126,12 @@ public void testShowCleans() throws Exception { assertNotNull(propsFilePath, "Not found properties file"); // First, run clean - SparkMain.clean(jsc, HoodieCLI.basePath, 
propsFilePath.getPath(), new ArrayList<>()); + SparkMain.clean(jsc(), HoodieCLI.basePath, propsFilePath.getPath(), new ArrayList<>()); assertEquals(1, metaClient.getActiveTimeline().reload().getCleanerTimeline().getInstants().count(), "Loaded 1 clean and the count should match"); - CommandResult cr = getShell().executeCommand("cleans show"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "cleans show"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); HoodieInstant clean = metaClient.getActiveTimeline().reload().getCleanerTimeline().getInstants().findFirst().orElse(null); assertNotNull(clean); @@ -131,7 +149,7 @@ public void testShowCleans() throws Exception { String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -139,19 +157,19 @@ public void testShowCleans() throws Exception { * Test case for show partitions of a clean instant. */ @Test - public void testShowCleanPartitions() throws IOException { + public void testShowCleanPartitions() { // Check properties file exists. assertNotNull(propsFilePath, "Not found properties file"); // First, run clean with two partition - SparkMain.clean(jsc, HoodieCLI.basePath, propsFilePath.toString(), new ArrayList<>()); + SparkMain.clean(jsc(), HoodieCLI.basePath, propsFilePath.toString(), new ArrayList<>()); assertEquals(1, metaClient.getActiveTimeline().reload().getCleanerTimeline().getInstants().count(), "Loaded 1 clean and the count should match"); HoodieInstant clean = metaClient.getActiveTimeline().reload().getCleanerTimeline().getInstants().findFirst().get(); - CommandResult cr = getShell().executeCommand("clean showpartitions --clean " + clean.getTimestamp()); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "clean showpartitions --clean " + clean.getTimestamp()); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) .addTableHeaderField(HoodieTableHeaderFields.HEADER_CLEANING_POLICY) @@ -169,7 +187,7 @@ public void testShowCleanPartitions() throws IOException { String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java index 84b357622948e..7e504488a2dfd 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java @@ -22,28 +22,39 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.HoodieTestReplaceCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; 
+import org.apache.hudi.client.HoodieTimelineArchiver; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.NumericUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; import org.apache.hadoop.fs.FileSystem; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.ValueSource; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -54,27 +65,38 @@ import java.util.function.Function; import java.util.stream.Collectors; +import static org.apache.hudi.common.testutils.HoodieTestUtils.createCompactionCommitInMetadataTable; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test class for {@link org.apache.hudi.cli.commands.CommitsCommand}. 
*/ -public class TestCommitsCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestCommitsCommand extends CLIFunctionalTestHarness { - private String tableName; - private String tablePath; + @Autowired + private Shell shell; + + private String tableName1; + private String tableName2; + private String tablePath1; + private String tablePath2; + private HoodieTableMetaClient metaClient; @BeforeEach public void init() throws IOException { - tableName = "test_table"; - tablePath = basePath + File.separator + tableName; - - HoodieCLI.conf = jsc.hadoopConfiguration(); + tableName1 = tableName("_1"); + tableName2 = tableName("_2"); + tablePath1 = tablePath(tableName1); + tablePath2 = tablePath(tableName2); + HoodieCLI.conf = hadoopConf(); // Create table and connect new TableCommand().createTable( - tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + tablePath1, tableName1, HoodieTableType.COPY_ON_WRITE.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + metaClient = HoodieCLI.getTableMetaClient(); } private LinkedHashMap generateData() throws Exception { @@ -87,7 +109,7 @@ private LinkedHashMap generateData() throws Exception { for (Map.Entry entry : data.entrySet()) { String key = entry.getKey(); Integer[] value = entry.getValue(); - HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, key, jsc.hadoopConfiguration(), + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath1, key, hadoopConf(), Option.of(value[0]), Option.of(value[1])); } @@ -97,10 +119,46 @@ private LinkedHashMap generateData() throws Exception { return data; } + /* + * generates both replace commit and commit data + * */ + private LinkedHashMap generateMixedData() throws Exception { + // generate data and metadata + LinkedHashMap replaceCommitData = new LinkedHashMap<>(); + replaceCommitData.put(new HoodieInstant(false, HoodieTimeline.REPLACE_COMMIT_ACTION, "103"), new Integer[] {15, 10}); + + LinkedHashMap commitData = new LinkedHashMap<>(); + commitData.put(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102"), new Integer[] {15, 10}); + commitData.put(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101"), new Integer[] {20, 10}); + + for (Map.Entry entry : commitData.entrySet()) { + String key = entry.getKey().getTimestamp(); + Integer[] value = entry.getValue(); + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath1, key, hadoopConf(), + Option.of(value[0]), Option.of(value[1])); + } + + for (Map.Entry entry : replaceCommitData.entrySet()) { + String key = entry.getKey().getTimestamp(); + Integer[] value = entry.getValue(); + HoodieTestReplaceCommitMetadataGenerator.createReplaceCommitFileWithMetadata(tablePath1, key, + Option.of(value[0]), Option.of(value[1]), metaClient); + } + + metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + assertEquals(3, metaClient.reloadActiveTimeline().getCommitsTimeline().countInstants(), + "There should be 3 commits"); + + LinkedHashMap data = replaceCommitData; + data.putAll(commitData); + + return data; + } + private String generateExpectData(int records, Map data) throws IOException { - FileSystem fs = FileSystem.get(jsc.hadoopConfiguration()); + FileSystem fs = FileSystem.get(hadoopConf()); List partitionPaths = - FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, tablePath); + 
FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, tablePath1); int partitions = partitionPaths.size(); // default pre-commit is not null, file add always be 0 and update always be partition nums @@ -113,27 +171,15 @@ private String generateExpectData(int records, Map data) thro data.forEach((key, value) -> { for (int i = 0; i < records; i++) { // there are more than 1 partitions, so need to * partitions - rows.add(new Comparable[]{key, partitions * HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES, + rows.add(new Comparable[] {key, partitions * HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES, fileAdded, fileUpdated, partitions, partitions * value[0], partitions * value[1], errors}); } }); final Map> fieldNameToConverterMap = new HashMap<>(); - fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> { - return NumericUtils.humanReadableByteCount((Double.valueOf(entry.toString()))); - }); - - final TableHeader header = new TableHeader() - .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMMIT_TIME) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_ADDED) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_UPDATED) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_PARTITIONS_WRITTEN) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_WRITTEN) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPDATE_RECORDS_WRITTEN) - .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ERRORS); + fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())))); - return HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, + return HoodiePrintHelper.print(HoodieTableHeaderFields.getTableHeader(), fieldNameToConverterMap, "", false, -1, false, rows); } @@ -144,24 +190,84 @@ private String generateExpectData(int records, Map data) thro public void testShowCommits() throws Exception { Map data = generateData(); - CommandResult cr = getShell().executeCommand("commits show"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "commits show"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); String expected = generateExpectData(1, data); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } + @Test + public void testShowCommitsIncludingArchivedTimeline() throws Exception { + Map data = generateDataAndArchive(true); + data.remove("101"); + data.remove("102"); + + Object result = shell.evaluate(() -> "commits show --includeExtraMetadata true --includeArchivedTimeline true --partition 2015/03/16"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + String expected = generateExpectDataWithExtraMetadata(1, data); + expected = removeNonWordAndStripSpace(expected); + String got = removeNonWordAndStripSpace(result.toString()); + assertEquals(expected, got); + } + + private String generateExpectDataWithExtraMetadata(int records, Map data) throws IOException { + List rows = new ArrayList<>(); + data.forEach((key, value) -> { + for (int i = 0; i < records; i++) { + // there are more than 1 partitions, so need to * partitions + rows.add(new Comparable[] {HoodieTimeline.COMMIT_ACTION, key, "2015/03/16", 
HoodieTestCommitMetadataGenerator.DEFAULT_FILEID, + HoodieTestCommitMetadataGenerator.DEFAULT_PRE_COMMIT, key.equals("104") ? "20" : "15", "0", "0", key.equals("104") ? "10" : "15", + "0", HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_LOG_BLOCKS, "0", "0", HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_LOG_RECORDS, + "0", HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES}); + } + }); + + final Map> fieldNameToConverterMap = new HashMap<>(); + fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())))); + + final TableHeader header = HoodieTableHeaderFields.getTableHeaderWithExtraMetadata(); + + return HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, + -1, false, rows); + } + /** * Test case of 'commits showarchived' command. */ - @Test - public void testShowArchivedCommits() throws Exception { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testShowArchivedCommits(boolean enableMetadataTable) throws Exception { + Map data = generateDataAndArchive(enableMetadataTable); + + Object result = shell.evaluate(() -> String.format("commits showarchived --startTs %s --endTs %s", "100", "104")); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + // archived 101 and 102 instant, generate expect data + assertEquals(2, metaClient.reloadActiveTimeline().getCommitsTimeline().countInstants(), + "There should 2 instants not be archived!"); + + // archived 101 and 102 instants, remove 103 and 104 instant + data.remove("103"); + data.remove("104"); + String expected = generateExpectData(1, data); + expected = removeNonWordAndStripSpace(expected); + String got = removeNonWordAndStripSpace(result.toString()); + assertEquals(expected, got); + } + + private Map generateDataAndArchive(boolean enableMetadataTable) throws Exception { // Generate archive - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath1) .withSchema(HoodieTestCommitMetadataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 3).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) .forTable("test-trip-table").build(); // generate data and metadata @@ -174,29 +280,76 @@ public void testShowArchivedCommits() throws Exception { for (Map.Entry entry : data.entrySet()) { String key = entry.getKey(); Integer[] value = entry.getValue(); - HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, key, jsc.hadoopConfiguration(), + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath1, key, hadoopConf(), Option.of(value[0]), Option.of(value[1])); } + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf(), metaClient.getFs(), tablePath1, "104"); + } + // archive metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); - HoodieSparkTable table = HoodieSparkTable.create(cfg, 
context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - archiveLog.archiveIfRequired(context); + HoodieSparkTable table = HoodieSparkTable.create(cfg, context(), metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + archiver.archiveIfRequired(context()); + return data; + } - CommandResult cr = getShell().executeCommand(String.format("commits showarchived --startTs %s --endTs %s", "100", "104")); - assertTrue(cr.isSuccess()); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testShowArchivedCommitsWithMultiCommitsFile(boolean enableMetadataTable) throws Exception { + // Generate archive + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath1) + .withSchema(HoodieTestCommitMetadataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 3).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .forTable("test-trip-table").build(); - // archived 101 and 102 instant, generate expect data - assertEquals(2, metaClient.reloadActiveTimeline().getCommitsTimeline().countInstants(), - "There should 2 instants not be archived!"); + // generate data and metadata + Map data = new LinkedHashMap<>(); - // archived 101 and 102 instants, remove 103 and 104 instant - data.remove("103"); - data.remove("104"); - String expected = generateExpectData(3, data); + for (int i = 194; i >= 154; i--) { + data.put(String.valueOf(i), new Integer[] {i, i}); + } + + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf(), metaClient.getFs(), tablePath1, "194"); + } + + for (Map.Entry entry : data.entrySet()) { + String key = entry.getKey(); + Integer[] value = entry.getValue(); + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath1, key, hadoopConf(), + Option.of(value[0]), Option.of(value[1])); + // archive + metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + HoodieSparkTable table = HoodieSparkTable.create(cfg, context(), metaClient); + + // need to create multi archive files + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + archiver.archiveIfRequired(context()); + } + + Object result = shell.evaluate(() -> String.format("commits showarchived --startTs %s --endTs %s", "160", "174")); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + assertEquals(3, metaClient.reloadActiveTimeline().getCommitsTimeline().countInstants(), + "There should 3 instants not be archived!"); + + Map data2 = new LinkedHashMap<>(); + for (int i = 174; i >= 161; i--) { + data2.put(String.valueOf(i), new Integer[] {i, i}); + } + String expected = generateExpectData(1, data2); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -208,22 +361,23 @@ public void testShowCommitPartitions() throws Exception { Map data = generateData(); String commitInstant = "101"; - CommandResult cr = getShell().executeCommand(String.format("commit 
showpartitions --commit %s", commitInstant)); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> String.format("commit showpartitions --commit %s", commitInstant)); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); Integer[] value = data.get(commitInstant); List rows = new ArrayList<>(); // prevCommit not null, so add 0, update 1 Arrays.asList(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).stream().forEach(partition -> - rows.add(new Comparable[] {partition, 0, 1, 0, value[1], HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES, 0}) + rows.add(new Comparable[] {HoodieTimeline.COMMIT_ACTION, partition, 0, 1, 0, value[1], HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES, 0}) ); Map> fieldNameToConverterMap = new HashMap<>(); fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> NumericUtils.humanReadableByteCount((Long.parseLong(entry.toString())))); - TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) + TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_ADDED) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_UPDATED) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_INSERTED) @@ -233,10 +387,48 @@ public void testShowCommitPartitions() throws Exception { String expected = HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } + @Test + public void testShowCommitPartitionsWithReplaceCommits() throws Exception { + Map data = generateMixedData(); + + for (HoodieInstant commitInstant : data.keySet()) { + Object result = shell.evaluate(() -> + String.format("commit showpartitions --commit %s", commitInstant.getTimestamp())); + + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + Integer[] value = data.get(commitInstant); + List rows = new ArrayList<>(); + // prevCommit not null, so add 0, update 1 + Arrays.asList(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).stream().forEach(partition -> + rows.add(new Comparable[] {commitInstant.getAction(), partition, 0, 1, 0, value[1], HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES, 0}) + ); + + Map> fieldNameToConverterMap = new HashMap<>(); + fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, + entry -> NumericUtils.humanReadableByteCount((Long.parseLong(entry.toString())))); + + TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_ADDED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_UPDATED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_INSERTED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_UPDATED) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ERRORS); + + String expected 
= HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, rows); + expected = removeNonWordAndStripSpace(expected); + String got = removeNonWordAndStripSpace(result.toString()); + assertEquals(expected, got); + } + } + /** * Test case of 'commit showfiles' command. */ @@ -245,19 +437,20 @@ public void testShowCommitFiles() throws Exception { Map data = generateData(); String commitInstant = "101"; - CommandResult cr = getShell().executeCommand(String.format("commit showfiles --commit %s", commitInstant)); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> String.format("commit showfiles --commit %s", commitInstant)); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); Integer[] value = data.get(commitInstant); List rows = new ArrayList<>(); Arrays.asList(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).stream().forEach(partition -> - rows.add(new Comparable[] {partition, HoodieTestCommitMetadataGenerator.DEFAULT_FILEID, + rows.add(new Comparable[] {HoodieTimeline.COMMIT_ACTION, partition, HoodieTestCommitMetadataGenerator.DEFAULT_FILEID, HoodieTestCommitMetadataGenerator.DEFAULT_PRE_COMMIT, value[1], value[0], HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES, // default 0 errors and blank file with 0 size 0, 0})); - TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) + TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) .addTableHeaderField(HoodieTableHeaderFields.HEADER_PREVIOUS_COMMIT) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_UPDATED) @@ -268,63 +461,94 @@ public void testShowCommitFiles() throws Exception { String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } + @Test + public void testShowCommitFilesWithReplaceCommits() throws Exception { + Map data = generateMixedData(); + + for (HoodieInstant commitInstant : data.keySet()) { + Object result = shell.evaluate(() -> String.format("commit showfiles --commit %s", commitInstant.getTimestamp())); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + Integer[] value = data.get(commitInstant); + List rows = new ArrayList<>(); + Arrays.asList(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, + HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).stream().forEach(partition -> + rows.add(new Comparable[] {commitInstant.getAction(), partition, HoodieTestCommitMetadataGenerator.DEFAULT_FILEID, + HoodieTestCommitMetadataGenerator.DEFAULT_PRE_COMMIT, + value[1], value[0], HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES, + // default 0 errors and blank file with 0 size + 0, 0})); + TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_ACTION) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_PREVIOUS_COMMIT) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_UPDATED) + 
.addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_RECORDS_WRITTEN) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_ERRORS) + .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_SIZE); + + String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); + expected = removeNonWordAndStripSpace(expected); + String got = removeNonWordAndStripSpace(result.toString()); + assertEquals(expected, got); + } + } + /** * Test case of 'commits compare' command. */ - @Test - public void testCompareCommits() throws Exception { + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testCompareCommits(HoodieTableType tableType) throws Exception { Map data = generateData(); - - String tableName2 = "test_table2"; - String tablePath2 = basePath + File.separator + tableName2; - HoodieTestUtils.init(jsc.hadoopConfiguration(), tablePath2, getTableType()); + HoodieTestUtils.init(hadoopConf(), tablePath2, tableType); data.remove("102"); for (Map.Entry entry : data.entrySet()) { String key = entry.getKey(); Integer[] value = entry.getValue(); - HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath2, key, jsc.hadoopConfiguration(), + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath2, key, hadoopConf(), Option.of(value[0]), Option.of(value[1])); } - CommandResult cr = getShell().executeCommand(String.format("commits compare --path %s", tablePath2)); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> String.format("commits compare --path %s", tablePath2)); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // the latest instant of test_table2 is 101 List commitsToCatchup = metaClient.getActiveTimeline().findInstantsAfter("101", Integer.MAX_VALUE) .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); String expected = String.format("Source %s is ahead by %d commits. Commits to catch up - %s", - tableName, commitsToCatchup.size(), commitsToCatchup); - assertEquals(expected, cr.getResult().toString()); + tableName1, commitsToCatchup.size(), commitsToCatchup); + assertEquals(expected, result.toString()); } /** * Test case of 'commits sync' command. 
*/ - @Test - public void testSyncCommits() throws Exception { + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testSyncCommits(HoodieTableType tableType) throws Exception { Map data = generateData(); - String tableName2 = "test_table2"; - String tablePath2 = basePath + File.separator + tableName2; - HoodieTestUtils.init(jsc.hadoopConfiguration(), tablePath2, getTableType(), tableName2); + HoodieTestUtils.init(hadoopConf(), tablePath2, tableType, tableName2); data.remove("102"); for (Map.Entry entry : data.entrySet()) { String key = entry.getKey(); Integer[] value = entry.getValue(); - HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath2, key, jsc.hadoopConfiguration(), + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath2, key, hadoopConf(), Option.of(value[0]), Option.of(value[1])); } - CommandResult cr = getShell().executeCommand(String.format("commits sync --path %s", tablePath2)); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> String.format("commits sync --path %s", tablePath2)); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); - String expected = String.format("Load sync state between %s and %s", tableName, tableName2); - assertEquals(expected, cr.getResult().toString()); + String expected = String.format("Load sync state between %s and %s", tableName1, tableName2); + assertEquals(expected, result.toString()); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java new file mode 100644 index 0000000000000..f1ea09470d35c --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.client.HoodieTimelineArchiver; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.fs.NoOpConsistencyGuard; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.CompactionTestUtils; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieSparkTable; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * Test Cases for {@link CompactionCommand}. + */ +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestCompactionCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; + + private String tableName; + private String tablePath; + + @BeforeEach + public void init() { + tableName = tableName(); + tablePath = tablePath(tableName); + } + + @Test + public void testVerifyTableType() throws IOException { + // create COW table. + new TableCommand().createTable( + tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, HoodieAvroPayload.class.getName()); + + // expect HoodieException for COPY_ON_WRITE table. + assertThrows(HoodieException.class, + () -> new CompactionCommand().compactionsAll(false, -1, "", false, false)); + } + + /** + * Test case for command 'compactions show all'. + */ + @Test + public void testCompactionsAll() throws IOException { + // create MOR table. 
+    new TableCommand().createTable(
+        tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(),
+        "", TimelineLayoutVersion.VERSION_1, HoodieAvroPayload.class.getName());
+
+    CompactionTestUtils.setupAndValidateCompactionOperations(HoodieCLI.getTableMetaClient(), false, 3, 4, 3, 3);
+
+    HoodieCLI.getTableMetaClient().reloadActiveTimeline();
+
+    Object result = shell.evaluate(() -> "compactions show all");
+    System.out.println(result.toString());
+
+    TableHeader header = new TableHeader().addTableHeaderField("Compaction Instant Time").addTableHeaderField("State")
+        .addTableHeaderField("Total FileIds to be Compacted");
+    Map fileIds = new HashMap();
+    fileIds.put("001", 3);
+    fileIds.put("003", 4);
+    fileIds.put("005", 3);
+    fileIds.put("007", 3);
+    List rows = new ArrayList<>();
+    Arrays.asList("001", "003", "005", "007").stream().sorted(Comparator.reverseOrder()).forEach(instant -> {
+      rows.add(new Comparable[] {instant, "REQUESTED", fileIds.get(instant)});
+    });
+    String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows);
+    assertEquals(expected, result.toString());
+  }
+
+  /**
+   * Test case for command 'compaction show'.
+   */
+  @Test
+  public void testCompactionShow() throws IOException {
+    // create MOR table.
+    new TableCommand().createTable(
+        tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(),
+        "", TimelineLayoutVersion.VERSION_1, HoodieAvroPayload.class.getName());
+
+    CompactionTestUtils.setupAndValidateCompactionOperations(HoodieCLI.getTableMetaClient(), false, 3, 4, 3, 3);
+
+    HoodieCLI.getTableMetaClient().reloadActiveTimeline();
+
+    Object result = shell.evaluate(() -> "compaction show --instant 001");
+    System.out.println(result.toString());
+  }
+
+  private void generateCompactionInstances() throws IOException {
+    // create MOR table.
+    new TableCommand().createTable(
+        tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(),
+        "", TimelineLayoutVersion.VERSION_1, HoodieAvroPayload.class.getName());
+
+    CompactionTestUtils.setupAndValidateCompactionOperations(HoodieCLI.getTableMetaClient(), true, 1, 2, 3, 4);
+
+    HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().reloadActiveTimeline();
+    // Complete the four compactions
+    Arrays.asList("001", "003", "005", "007").forEach(timestamp -> {
+      activeTimeline.transitionCompactionInflightToComplete(
+          new HoodieInstant(HoodieInstant.State.INFLIGHT, COMPACTION_ACTION, timestamp), Option.empty());
+    });
+    // Simulate a compaction commit in metadata table timeline
+    // so the archival in data table can happen
+    HoodieTestUtils.createCompactionCommitInMetadataTable(hadoopConf(),
+        new HoodieWrapperFileSystem(
+            FSUtils.getFs(tablePath, hadoopConf()), new NoOpConsistencyGuard()), tablePath, "007");
+  }
+
+  private void generateArchive() throws IOException {
+    // Generate archive
+    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
+        .withSchema(HoodieTestCommitMetadataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
+        .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build())
+        .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 3).build())
+        .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
+            .withRemoteServerPort(timelineServicePort).build())
+        .forTable("test-trip-table").build();
+    // archive
+    HoodieTableMetaClient metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient());
+    HoodieSparkTable table = HoodieSparkTable.create(cfg, context(), metaClient);
+    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table);
+    archiver.archiveIfRequired(context());
+  }
+
+  /**
+   * Test case for command 'compactions showarchived'.
+   */
+  @Test
+  public void testCompactionsShowArchived() throws IOException {
+    generateCompactionInstances();
+
+    generateArchive();
+
+    Object result = shell.evaluate(() -> "compactions showarchived --startTs 001 --endTs 005");
+
+    // generate result
+    Map fileMap = new HashMap<>();
+    fileMap.put("001", 1);
+    fileMap.put("003", 2);
+    fileMap.put("005", 3);
+    List rows = Arrays.asList("005", "003", "001").stream().map(i ->
+        new Comparable[] {i, HoodieInstant.State.COMPLETED, fileMap.get(i)}).collect(Collectors.toList());
+    Map> fieldNameToConverterMap = new HashMap<>();
+    TableHeader header = new TableHeader().addTableHeaderField("Compaction Instant Time").addTableHeaderField("State")
+        .addTableHeaderField("Total FileIds to be Compacted");
+    String expected = HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, rows);
+
+    expected = removeNonWordAndStripSpace(expected);
+    String got = removeNonWordAndStripSpace(result.toString());
+    assertEquals(expected, got);
+  }
+
+  /**
+   * Test case for command 'compaction showarchived'.
+ */ + @Test + public void testCompactionShowArchived() throws IOException { + generateCompactionInstances(); + + String instance = "001"; + // get compaction plan before compaction + HoodieCompactionPlan plan = TimelineMetadataUtils.deserializeCompactionPlan( + HoodieCLI.getTableMetaClient().reloadActiveTimeline().readCompactionPlanAsBytes( + HoodieTimeline.getCompactionRequestedInstant(instance)).get()); + + generateArchive(); + + Object result = shell.evaluate(() -> "compaction showarchived --instant " + instance); + + // generate expected + String expected = CompactionCommand.printCompaction(plan, "", false, -1, false, null); + + expected = removeNonWordAndStripSpace(expected); + String got = removeNonWordAndStripSpace(result.toString()); + assertEquals(expected, got); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestDiffCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestDiffCommand.java new file mode 100644 index 0000000000000..c12ad676d41c7 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestDiffCommand.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.HoodiePrintHelper; +import org.apache.hudi.cli.HoodieTableHeaderFields; +import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.NumericUtils; +import org.apache.hudi.common.util.Option; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.function.Function; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test Cases for {@link DiffCommand}. + */ +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestDiffCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; + private String tableName; + private String tablePath; + + @BeforeEach + public void init() { + tableName = tableName(); + tablePath = tablePath(tableName); + } + + @Test + public void testDiffFile() throws Exception { + // create COW table. 
+ new TableCommand().createTable( + tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, HoodieAvroPayload.class.getName()); + + Configuration conf = HoodieCLI.conf; + + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + FileSystem fs = FSUtils.getFs(basePath(), hadoopConf()); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, tablePath); + + // Create four commits + Set commits = new HashSet<>(); + for (int i = 100; i < 104; i++) { + String timestamp = String.valueOf(i); + commits.add(timestamp); + // Requested Compaction + HoodieTestCommitMetadataGenerator.createCompactionAuxiliaryMetadata(tablePath, + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, timestamp), conf); + // Inflight Compaction + HoodieTestCommitMetadataGenerator.createCompactionAuxiliaryMetadata(tablePath, + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, timestamp), conf); + + Map extraCommitMetadata = + Collections.singletonMap(HoodieCommitMetadata.SCHEMA_KEY, HoodieTestTable.PHONY_TABLE_SCHEMA); + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, timestamp, conf, fileId1, fileId2, + Option.empty(), Option.empty(), extraCommitMetadata, false); + } + + HoodieTableMetaClient.reload(metaClient); + + Object result = shell.evaluate(() -> String.format("diff file --fileId %s", fileId1)); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + String expected = generateExpectDataWithExtraMetadata(commits, fileId1, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + expected = removeNonWordAndStripSpace(expected); + String got = removeNonWordAndStripSpace(result.toString()); + assertEquals(expected, got); + } + + private String generateExpectDataWithExtraMetadata(Set commits, String fileId, String partition) { + List rows = new ArrayList<>(); + commits.stream().sorted(Comparator.reverseOrder()).forEach(commit -> rows.add(new Comparable[] { + HoodieTimeline.COMMIT_ACTION, + commit, + partition, + fileId, + HoodieTestCommitMetadataGenerator.DEFAULT_PRE_COMMIT, + HoodieTestCommitMetadataGenerator.DEFAULT_NUM_WRITES, + "0", + "0", + HoodieTestCommitMetadataGenerator.DEFAULT_NUM_UPDATE_WRITES, + "0", + HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_LOG_BLOCKS, + "0", + "0", + HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_LOG_RECORDS, + "0", + HoodieTestCommitMetadataGenerator.DEFAULT_TOTAL_WRITE_BYTES})); + + final Map> fieldNameToConverterMap = new HashMap<>(); + fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_BYTES_WRITTEN, entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())))); + + final TableHeader header = HoodieTableHeaderFields.getTableHeaderWithExtraMetadata(); + + return HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, rows); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java index f92caea0ca930..ddc420a087633 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java @@ -22,8 +22,9 @@ import org.apache.hudi.cli.HoodiePrintHelper; import 
org.apache.hudi.cli.HoodieTableHeaderFields;
 import org.apache.hudi.cli.TableHeader;
-import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest;
+import org.apache.hudi.cli.functional.CLIFunctionalTestHarness;
 import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator;
+import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieFileGroup;
@@ -34,8 +35,11 @@
 import org.apache.hudi.common.util.NumericUtils;
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
-import org.springframework.shell.core.CommandResult;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.shell.Shell;
 import java.io.IOException;
 import java.nio.file.Files;
@@ -55,26 +59,80 @@
 /**
  * Test class for {@link FileSystemViewCommand}.
  */
-public class TestFileSystemViewCommand extends AbstractShellIntegrationTest {
+@Tag("functional")
+@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"})
+public class TestFileSystemViewCommand extends CLIFunctionalTestHarness {
+  @Autowired
+  private Shell shell;
+
+  private String nonpartitionedTablePath;
+  private String partitionedTablePath;
   private String partitionPath;
-  private SyncableFileSystemView fsView;
+  private SyncableFileSystemView nonpartitionedFsView;
+  private SyncableFileSystemView partitionedFsView;
   @BeforeEach
   public void init() throws IOException {
-    HoodieCLI.conf = jsc.hadoopConfiguration();
+    createNonpartitionedTable();
+    createPartitionedTable();
+  }
+
+  private void createNonpartitionedTable() throws IOException {
+    HoodieCLI.conf = hadoopConf();
+
+    // Create table and connect
+    String nonpartitionedTableName = "nonpartitioned_" + tableName();
+    nonpartitionedTablePath = tablePath(nonpartitionedTableName);
+    new TableCommand().createTable(
+        nonpartitionedTablePath, nonpartitionedTableName,
+        "COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload");
+
+    HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
+
+    Files.createDirectories(Paths.get(nonpartitionedTablePath));
+
+    // Generate 2 commits
+    String commitTime1 = "3";
+    String commitTime2 = "4";
+
+    String fileId1 = UUID.randomUUID().toString();
+
+    // Write data files and log file
+    String testWriteToken = "2-0-2";
+    Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils
+        .makeBaseFileName(commitTime1, testWriteToken, fileId1)));
+    Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils
+        .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, testWriteToken)));
+    Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils
+        .makeBaseFileName(commitTime2, testWriteToken, fileId1)));
+    Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils
+        .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, testWriteToken)));
+
+    // Write commit files
+    Files.createFile(Paths.get(nonpartitionedTablePath, ".hoodie", commitTime1 + ".commit"));
+    Files.createFile(Paths.get(nonpartitionedTablePath, ".hoodie", commitTime2 + ".commit"));
+
+    // Reload meta client and create fsView
+    metaClient = HoodieTableMetaClient.reload(metaClient);
+
+    nonpartitionedFsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline(), true);
+  }
+
+  private void
createPartitionedTable() throws IOException { + HoodieCLI.conf = hadoopConf(); // Create table and connect - String tableName = "test_table"; - String tablePath = Paths.get(basePath, tableName).toString(); + String partitionedTableName = "partitioned_" + tableName(); + partitionedTablePath = tablePath(partitionedTableName); new TableCommand().createTable( - tablePath, tableName, + partitionedTablePath, partitionedTableName, "COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient = HoodieCLI.getTableMetaClient(); + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); partitionPath = HoodieTestCommitMetadataGenerator.DEFAULT_FIRST_PARTITION_PATH; - String fullPartitionPath = Paths.get(tablePath, partitionPath).toString(); + String fullPartitionPath = Paths.get(partitionedTablePath, partitionPath).toString(); Files.createDirectories(Paths.get(fullPartitionPath)); // Generate 2 commits @@ -86,22 +144,22 @@ public void init() throws IOException { // Write date files and log file String testWriteToken = "1-0-1"; Files.createFile(Paths.get(fullPartitionPath, FSUtils - .makeDataFileName(commitTime1, testWriteToken, fileId1))); + .makeBaseFileName(commitTime1, testWriteToken, fileId1))); Files.createFile(Paths.get(fullPartitionPath, FSUtils .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, testWriteToken))); Files.createFile(Paths.get(fullPartitionPath, FSUtils - .makeDataFileName(commitTime2, testWriteToken, fileId1))); + .makeBaseFileName(commitTime2, testWriteToken, fileId1))); Files.createFile(Paths.get(fullPartitionPath, FSUtils .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, testWriteToken))); // Write commit files - Files.createFile(Paths.get(tablePath, ".hoodie", commitTime1 + ".commit")); - Files.createFile(Paths.get(tablePath, ".hoodie", commitTime2 + ".commit")); + Files.createFile(Paths.get(partitionedTablePath, ".hoodie", commitTime1 + ".commit")); + Files.createFile(Paths.get(partitionedTablePath, ".hoodie", commitTime2 + ".commit")); // Reload meta client and create fsView metaClient = HoodieTableMetaClient.reload(metaClient); - fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline(), true); + partitionedFsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline(), true); } /** @@ -110,11 +168,11 @@ public void init() throws IOException { @Test public void testShowCommits() { // Test default show fsview all - CommandResult cr = getShell().executeCommand("show fsview all"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show fsview all --pathRegex */*/*"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // Get all file groups - Stream fileGroups = fsView.getAllFileGroups(partitionPath); + Stream fileGroups = partitionedFsView.getAllFileGroups(partitionPath); List rows = new ArrayList<>(); fileGroups.forEach(fg -> fg.getAllFileSlices().forEach(fs -> { @@ -148,7 +206,7 @@ public void testShowCommits() { .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES); String expected = HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -158,11 +216,11 @@ public void testShowCommits() { @Test public void testShowCommitsWithSpecifiedValues() { // Test command with 
options, baseFileOnly and maxInstant is 2 - CommandResult cr = getShell().executeCommand("show fsview all --baseFileOnly true --maxInstant 2"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show fsview all --pathRegex */*/* --baseFileOnly true --maxInstant 2"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); List rows = new ArrayList<>(); - Stream fileGroups = fsView.getAllFileGroups(partitionPath); + Stream fileGroups = partitionedFsView.getAllFileGroups(partitionPath); // Only get instant 1, since maxInstant was specified 2 fileGroups.forEach(fg -> fg.getAllFileSlices().filter(fs -> fs.getBaseInstantTime().equals("1")).forEach(fs -> { @@ -191,21 +249,11 @@ public void testShowCommitsWithSpecifiedValues() { String expected = HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } - /** - * Test case for command 'show fsview latest'. - */ - @Test - public void testShowLatestFileSlices() { - // Test show with partition path '2016/03/15' - CommandResult cr = getShell().executeCommand("show fsview latest --partitionPath " + partitionPath); - assertTrue(cr.isSuccess()); - - Stream fileSlice = fsView.getLatestFileSlices(partitionPath); - + private List fileSlicesToCRList(Stream fileSlice, String partitionPath) { List rows = new ArrayList<>(); fileSlice.forEach(fs -> { int idx = 0; @@ -243,7 +291,15 @@ public void testShowLatestFileSlices() { .collect(Collectors.toList()).toString(); rows.add(row); }); + return rows; + } + /** + * ( + * Test case for command 'show fsview latest'. + */ + @Test + public void testShowLatestFileSlices() throws IOException { Function converterFunction = entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString()))); Map> fieldNameToConverterMap = new HashMap<>(); @@ -265,9 +321,32 @@ public void testShowLatestFileSlices() { .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_BASE_UNSCHEDULED) .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_SCHEDULED) .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_UNSCHEDULED); - String expected = HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, rows); - expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); - assertEquals(expected, got); + + // Test show with partition path '2016/03/15' + new TableCommand().connect(partitionedTablePath, null, false, 0, 0, 0); + Object partitionedTable = shell.evaluate(() -> "show fsview latest --partitionPath " + partitionPath); + assertTrue(ShellEvaluationResultUtil.isSuccess(partitionedTable)); + + Stream partitionedFileSlice = partitionedFsView.getLatestFileSlices(partitionPath); + + List partitionedRows = fileSlicesToCRList(partitionedFileSlice, partitionPath); + String partitionedExpected = HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, partitionedRows); + partitionedExpected = removeNonWordAndStripSpace(partitionedExpected); + String partitionedResults = removeNonWordAndStripSpace(partitionedTable.toString()); + assertEquals(partitionedExpected, partitionedResults); + + // Test show for non-partitioned table + new TableCommand().connect(nonpartitionedTablePath, null, false, 0, 0, 0); + Object nonpartitionedTable = shell.evaluate(() -> 
"show fsview latest"); + assertTrue(ShellEvaluationResultUtil.isSuccess(nonpartitionedTable)); + + Stream nonpartitionedFileSlice = nonpartitionedFsView.getLatestFileSlices(""); + + List nonpartitionedRows = fileSlicesToCRList(nonpartitionedFileSlice, ""); + + String nonpartitionedExpected = HoodiePrintHelper.print(header, fieldNameToConverterMap, "", false, -1, false, nonpartitionedRows); + nonpartitionedExpected = removeNonWordAndStripSpace(nonpartitionedExpected); + String nonpartitionedResults = removeNonWordAndStripSpace(nonpartitionedTable.toString()); + assertEquals(nonpartitionedExpected, nonpartitionedResults); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java index fbd2b92c2bb00..e93ad0c8cad4e 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java @@ -23,8 +23,11 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -43,12 +46,16 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Files; @@ -69,56 +76,62 @@ /** * Test Cases for {@link HoodieLogFileCommand}. 
*/ -public class TestHoodieLogFileCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestHoodieLogFileCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; private String partitionPath; private HoodieAvroDataBlock dataBlock; private String tablePath; + private FileSystem fs; private static final String INSTANT_TIME = "100"; @BeforeEach public void init() throws IOException, InterruptedException, URISyntaxException { - HoodieCLI.conf = jsc.hadoopConfiguration(); + HoodieCLI.conf = hadoopConf(); // Create table and connect - String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; - partitionPath = tablePath + File.separator + HoodieTestCommitMetadataGenerator.DEFAULT_FIRST_PARTITION_PATH; + String tableName = tableName(); + tablePath = tablePath(tableName); + partitionPath = Paths.get(tablePath, HoodieTestCommitMetadataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString(); new TableCommand().createTable( tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); Files.createDirectories(Paths.get(partitionPath)); + fs = FSUtils.getFs(tablePath, hadoopConf()); - HoodieLogFormat.Writer writer = null; - try { - writer = - HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionPath)) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-log-fileid1").overBaseCommit("100").withFs(fs).build(); + try (HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(new Path(partitionPath)) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-log-fileid1").overBaseCommit("100").withFs(fs).build()) { // write data to file List records = SchemaTestUtil.generateTestRecords(0, 100); Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = new HoodieAvroDataBlock(records, header); - writer = writer.appendBlock(dataBlock); - } finally { - if (writer != null) { - writer.close(); - } + dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); + writer.appendBlock(dataBlock); } } + @AfterEach + public void cleanUp() throws IOException { + fs.close(); + } + /** * Test case for 'show logfile metadata'. 
*/ @Test public void testShowLogFileCommits() throws JsonProcessingException { - CommandResult cr = getShell().executeCommand("show logfile metadata --logFilePathPattern " + partitionPath + "/*"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show logfile metadata --logFilePathPattern " + partitionPath + "/*"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT_TIME) .addTableHeaderField(HoodieTableHeaderFields.HEADER_RECORD_COUNT) @@ -131,12 +144,12 @@ public void testShowLogFileCommits() throws JsonProcessingException { ObjectMapper objectMapper = new ObjectMapper(); String headerStr = objectMapper.writeValueAsString(dataBlock.getLogBlockHeader()); String footerStr = objectMapper.writeValueAsString(dataBlock.getLogBlockFooter()); - Comparable[] output = new Comparable[]{INSTANT_TIME, 100, dataBlock.getBlockType(), headerStr, footerStr}; + Comparable[] output = new Comparable[] {INSTANT_TIME, 100, dataBlock.getBlockType(), headerStr, footerStr}; rows.add(output); String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -145,15 +158,15 @@ public void testShowLogFileCommits() throws JsonProcessingException { */ @Test public void testShowLogFileRecords() throws IOException, URISyntaxException { - CommandResult cr = getShell().executeCommand("show logfile records --logFilePathPattern " + partitionPath + "/*"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show logfile records --logFilePathPattern " + partitionPath + "/*"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // construct expect result, get 10 records. List records = SchemaTestUtil.generateTestRecords(0, 10); - String[][] rows = records.stream().map(r -> new String[]{r.toString()}).toArray(String[][]::new); + String[][] rows = records.stream().map(r -> new String[] {r.toString()}).toArray(String[][]::new); String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_RECORDS}, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -167,7 +180,7 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc // write to path '2015/03/16'. 
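The init() hunk above swaps the manual null-check/finally close for try-with-resources and passes a record-key field to the new HoodieAvroDataBlock constructor. Below is a hedged sketch of that same shape applied to the writer in this merge test; the builder arguments are copied from init() purely for illustration, and the surrounding test's imports and fields (records1, schema, fs) are assumed:

    // Sketch: the writer is AutoCloseable, so try-with-resources closes it even if appendBlock throws.
    try (HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(new Path(partitionPath))
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId("test-log-fileid1").overBaseCommit("100").withFs(fs).build()) {
      Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
      header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
      // The third argument names the record key field, matching the new HoodieAvroDataBlock signature in this diff.
      writer.appendBlock(new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
    }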
Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); - partitionPath = tablePath + File.separator + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH; + partitionPath = tablePath + Path.SEPARATOR + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH; Files.createDirectories(Paths.get(partitionPath)); HoodieLogFormat.Writer writer = null; @@ -182,17 +195,17 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); + writer.appendBlock(dataBlock); } finally { if (writer != null) { writer.close(); } } - CommandResult cr = getShell().executeCommand("show logfile records --logFilePathPattern " - + partitionPath + "/* --mergeRecords true"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show logfile records --logFilePathPattern " + + partitionPath + "/* --mergeRecords true"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // get expected result of 10 records. List logFilePaths = Arrays.stream(fs.globStatus(new Path(partitionPath + "/*"))) @@ -207,12 +220,14 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES) .withReadBlocksLazily( Boolean.parseBoolean( - HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)) + HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue())) .withReverseReader( Boolean.parseBoolean( - HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED)) - .withBufferSize(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE) - .withSpillableMapBasePath(HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH) + HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue())) + .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue()) + .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) + .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) .build(); Iterator> records = scanner.iterator(); @@ -224,12 +239,12 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc indexRecords.add(hoodieRecord.get()); num++; } - String[][] rows = indexRecords.stream().map(r -> new String[]{r.toString()}).toArray(String[][]::new); + String[][] rows = indexRecords.stream().map(r -> new String[] {r.toString()}).toArray(String[][]::new); assertNotNull(rows); String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_RECORDS}, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java index ed14a64fc443a..29377c21ea880 100644 --- 
a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java @@ -21,32 +21,63 @@ import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.PartitionPathEncodeUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.testutils.Assertions; +import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.SQLContext; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.net.URL; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.stream.Collectors; +import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.common.table.HoodieTableConfig.DROP_PARTITION_COLUMNS; +import static org.apache.hudi.common.table.HoodieTableConfig.NAME; +import static org.apache.hudi.common.table.HoodieTableConfig.TABLE_CHECKSUM; +import static org.apache.hudi.common.table.HoodieTableConfig.TIMELINE_LAYOUT_VERSION; +import static org.apache.hudi.common.table.HoodieTableConfig.TYPE; +import static org.apache.hudi.common.table.HoodieTableConfig.VERSION; +import static org.apache.hudi.common.table.HoodieTableConfig.generateChecksum; +import static org.apache.hudi.common.table.HoodieTableConfig.validateChecksum; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -54,19 +85,31 @@ /** * Test class for {@link RepairsCommand}. 
*/ -public class TestRepairsCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestRepairsCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; private String tablePath; + private FileSystem fs; @BeforeEach public void init() throws IOException { - String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + String tableName = tableName(); + tablePath = tablePath(tableName); + fs = FSUtils.getFs(tablePath, hadoopConf()); // Create table and connect new TableCommand().createTable( tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), - "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + HoodieTableConfig.ARCHIVELOG_FOLDER.defaultValue(), TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + } + + @AfterEach + public void cleanUp() throws IOException { + fs.close(); } /** @@ -75,29 +118,29 @@ public void init() throws IOException { @Test public void testAddPartitionMetaWithDryRun() throws IOException { // create commit instant - Files.createFile(Paths.get(tablePath + "/.hoodie/100.commit")); + Files.createFile(Paths.get(tablePath, ".hoodie", "100.commit")); // create partition path - String partition1 = tablePath + File.separator + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; - String partition2 = tablePath + File.separator + HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; - String partition3 = tablePath + File.separator + HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; + String partition1 = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString(); + String partition2 = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).toString(); + String partition3 = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).toString(); assertTrue(fs.mkdirs(new Path(partition1))); assertTrue(fs.mkdirs(new Path(partition2))); assertTrue(fs.mkdirs(new Path(partition3))); // default is dry run. - CommandResult cr = getShell().executeCommand("repair addpartitionmeta"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "repair addpartitionmeta"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // expected all 'No'. 
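The class header above is the boilerplate that replaces AbstractShellIntegrationTest throughout this diff: a functional-tagged Spring Boot test with the interactive shell disabled, an autowired Shell, and plain Object results checked through ShellEvaluationResultUtil. Stripped to its essentials, the shape is roughly the following; the class name and assertions are illustrative only, and the helper methods are assumed to come from the shared CLI test utilities:

    @Tag("functional")
    @SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"})
    public class ExampleCliCommandTest extends CLIFunctionalTestHarness {

      @Autowired
      private Shell shell;

      @Test
      public void runsACliCommand() {
        // Spring Shell 2 returns a plain Object; CommandResult and getShell() are gone.
        Object result = shell.evaluate(() -> "repair addpartitionmeta");
        assertTrue(ShellEvaluationResultUtil.isSuccess(result));
        // Rendered tables are compared after stripping formatting noise, as the surrounding tests do.
        String got = removeNonWordAndStripSpace(result.toString());
        assertNotNull(got);
      }
    }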
String[][] rows = FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, tablePath) .stream() - .map(partition -> new String[]{partition, "No", "None"}) + .map(partition -> new String[] {partition, "No", "None"}) .toArray(String[][]::new); String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_PARTITION_PATH, HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -107,40 +150,40 @@ public void testAddPartitionMetaWithDryRun() throws IOException { @Test public void testAddPartitionMetaWithRealRun() throws IOException { // create commit instant - Files.createFile(Paths.get(tablePath + "/.hoodie/100.commit")); + Files.createFile(Paths.get(tablePath, ".hoodie", "100.commit")); // create partition path - String partition1 = tablePath + File.separator + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; - String partition2 = tablePath + File.separator + HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; - String partition3 = tablePath + File.separator + HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; + String partition1 = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString(); + String partition2 = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).toString(); + String partition3 = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).toString(); assertTrue(fs.mkdirs(new Path(partition1))); assertTrue(fs.mkdirs(new Path(partition2))); assertTrue(fs.mkdirs(new Path(partition3))); - CommandResult cr = getShell().executeCommand("repair addpartitionmeta --dryrun false"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "repair addpartitionmeta --dryrun false"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); List paths = FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, tablePath); // after dry run, the action will be 'Repaired' String[][] rows = paths.stream() - .map(partition -> new String[]{partition, "No", "Repaired"}) + .map(partition -> new String[] {partition, "No", "Repaired"}) .toArray(String[][]::new); String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_PARTITION_PATH, HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); - cr = getShell().executeCommand("repair addpartitionmeta"); + result = shell.evaluate(() -> "repair addpartitionmeta"); // after real run, Metadata is present now. 
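A smaller change that repeats through these hunks is path handling: string concatenation with File.separator gives way to java.nio Paths, which keeps the tests platform independent. Two lines lifted from the hunks above show the pattern:

    // Platform-independent equivalents of the previously concatenated paths.
    String partition1 = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString();
    Files.createFile(Paths.get(tablePath, ".hoodie", "100.commit"));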
rows = paths.stream() - .map(partition -> new String[]{partition, "Yes", "None"}) + .map(partition -> new String[] {partition, "Yes", "None"}) .toArray(String[][]::new); expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_PARTITION_PATH, HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows); expected = removeNonWordAndStripSpace(expected); - got = removeNonWordAndStripSpace(cr.getResult().toString()); + got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -152,30 +195,36 @@ public void testOverwriteHoodieProperties() throws IOException { URL newProps = this.getClass().getClassLoader().getResource("table-config.properties"); assertNotNull(newProps, "New property file must exist"); - CommandResult cr = getShell().executeCommand("repair overwrite-hoodie-props --new-props-file " + newProps.getPath()); - assertTrue(cr.isSuccess()); + Object cmdResult = shell.evaluate(() -> "repair overwrite-hoodie-props --new-props-file " + newProps.getPath()); + assertTrue(ShellEvaluationResultUtil.isSuccess(cmdResult)); - Map oldProps = HoodieCLI.getTableMetaClient().getTableConfig().getProps(); + Map oldProps = HoodieCLI.getTableMetaClient().getTableConfig().propsMap(); // after overwrite, the stored value in .hoodie is equals to which read from properties. - Map result = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()).getTableConfig().getProps(); + HoodieTableConfig tableConfig = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()).getTableConfig(); + Map result = tableConfig.propsMap(); + // validate table checksum + assertTrue(result.containsKey(TABLE_CHECKSUM.key())); + assertTrue(validateChecksum(tableConfig.getProps())); Properties expectProps = new Properties(); - expectProps.load(new FileInputStream(new File(newProps.getPath()))); + expectProps.load(new FileInputStream(newProps.getPath())); Map expected = expectProps.entrySet().stream() .collect(Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue()))); + expected.putIfAbsent(TABLE_CHECKSUM.key(), String.valueOf(generateChecksum(tableConfig.getProps()))); + expected.putIfAbsent(DROP_PARTITION_COLUMNS.key(), String.valueOf(DROP_PARTITION_COLUMNS.defaultValue())); assertEquals(expected, result); // check result - List allPropsStr = Arrays.asList("hoodie.table.name", "hoodie.table.type", "hoodie.table.version", - "hoodie.archivelog.folder", "hoodie.timeline.layout.version"); - String[][] rows = allPropsStr.stream().sorted().map(key -> new String[]{key, - oldProps.getOrDefault(key, "null"), result.getOrDefault(key, "null")}) + List allPropsStr = Arrays.asList(NAME.key(), TYPE.key(), VERSION.key(), + ARCHIVELOG_FOLDER.key(), TIMELINE_LAYOUT_VERSION.key(), TABLE_CHECKSUM.key(), DROP_PARTITION_COLUMNS.key()); + String[][] rows = allPropsStr.stream().sorted().map(key -> new String[] {key, + oldProps.getOrDefault(key, "null"), result.getOrDefault(key, "null")}) .toArray(String[][]::new); String expect = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_HOODIE_PROPERTY, HoodieTableHeaderFields.HEADER_OLD_VALUE, HoodieTableHeaderFields.HEADER_NEW_VALUE}, rows); expect = removeNonWordAndStripSpace(expect); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(cmdResult.toString()); assertEquals(expect, got); } @@ -184,11 +233,11 @@ public void testOverwriteHoodieProperties() throws IOException { */ @Test public void 
testRemoveCorruptedPendingCleanAction() throws IOException { - HoodieCLI.conf = jsc.hadoopConfiguration(); + HoodieCLI.conf = hadoopConf(); Configuration conf = HoodieCLI.conf; - metaClient = HoodieCLI.getTableMetaClient(); + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); // Create four requested files for (int i = 100; i < 104; i++) { @@ -202,11 +251,127 @@ public void testRemoveCorruptedPendingCleanAction() throws IOException { // first, there are four instants assertEquals(4, metaClient.getActiveTimeline().filterInflightsAndRequested().getInstants().count()); - CommandResult cr = getShell().executeCommand("repair corrupted clean files"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "repair corrupted clean files"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // reload meta client metaClient = HoodieTableMetaClient.reload(metaClient); assertEquals(0, metaClient.getActiveTimeline().filterInflightsAndRequested().getInstants().count()); } + + @Test + public void testRepairDeprecatedPartition() throws IOException { + tablePath = tablePath + "/repair_test/"; + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(HoodieTableType.COPY_ON_WRITE.name()) + .setTableName(tableName()) + .setArchiveLogFolder(HoodieTableConfig.ARCHIVELOG_FOLDER.defaultValue()) + .setPayloadClassName("org.apache.hudi.common.model.HoodieAvroPayload") + .setTimelineLayoutVersion(TimelineLayoutVersion.VERSION_1) + .setPartitionFields("partition_path") + .setRecordKeyFields("_row_key") + .setKeyGeneratorClassProp(SimpleKeyGenerator.class.getCanonicalName()) + .initTable(HoodieCLI.conf, tablePath); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(tablePath).withSchema(TRIP_EXAMPLE_SCHEMA).build(); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(context(), config)) { + String newCommitTime = "001"; + int numRecords = 10; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, numRecords); + JavaRDD writeRecords = context().getJavaSparkContext().parallelize(records, 1); + List result = client.upsert(writeRecords, newCommitTime).collect(); + Assertions.assertNoWriteErrors(result); + + newCommitTime = "002"; + // Generate HoodieRecords w/ null values for partition path field. + List records1 = dataGen.generateInserts(newCommitTime, numRecords); + List records2 = new ArrayList<>(); + records1.forEach(entry -> { + HoodieKey hoodieKey = new HoodieKey(entry.getRecordKey(), PartitionPathEncodeUtils.DEPRECATED_DEFAULT_PARTITION_PATH); + RawTripTestPayload testPayload = (RawTripTestPayload) entry.getData(); + try { + GenericRecord genericRecord = (GenericRecord) testPayload.getRecordToInsert(HoodieTestDataGenerator.AVRO_SCHEMA); + genericRecord.put("partition_path", null); + records2.add(new HoodieAvroRecord(hoodieKey, new RawTripTestPayload(genericRecord.toString(), hoodieKey.getRecordKey(), hoodieKey.getPartitionPath(), TRIP_EXAMPLE_SCHEMA))); + } catch (IOException e) { + e.printStackTrace(); + } + }); + + client.startCommitWithTime(newCommitTime); + // ingest records2 which has null for partition path fields, but goes into "default" partition. 
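The assertions later in this test read the table back through Spark SQL and filter on the partition-path meta column; condensed, and reusing the identifiers from this test, the before/after pair looks roughly like this:

    // Records written with a null partition field land in the deprecated "default" partition.
    long recsInDeprecatedPartition = sqlContext.read().format("hudi").load(tablePath)
        .filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " == '"
            + PartitionPathEncodeUtils.DEPRECATED_DEFAULT_PARTITION_PATH + "'").count();

    // After SparkMain.repairDeprecatedPartition(jsc(), tablePath) runs, the same records are
    // expected under __HIVE_DEFAULT_PARTITION__ (PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH).
    long recsInNewDefaultPartition = sqlContext.read().format("hudi").load(tablePath)
        .filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " == '"
            + PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH + "'").count();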
+ JavaRDD writeRecords2 = context().getJavaSparkContext().parallelize(records2, 1); + List result2 = client.bulkInsert(writeRecords2, newCommitTime).collect(); + Assertions.assertNoWriteErrors(result2); + + SQLContext sqlContext = context().getSqlContext(); + long totalRecs = sqlContext.read().format("hudi").load(tablePath).count(); + assertEquals(totalRecs, 20); + + // Execute repair deprecated partition command + assertEquals(0, SparkMain.repairDeprecatedPartition(jsc(), tablePath)); + + // there should not be any records w/ default partition + totalRecs = sqlContext.read().format("hudi").load(tablePath) + .filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " == '" + PartitionPathEncodeUtils.DEPRECATED_DEFAULT_PARTITION_PATH + "'").count(); + assertEquals(totalRecs, 0); + + // all records from default partition should have been migrated to __HIVE_DEFAULT_PARTITION__ + totalRecs = sqlContext.read().format("hudi").load(tablePath) + .filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " == '" + PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH + "'").count(); + assertEquals(totalRecs, 10); + } + } + + @Test + public void testRenamePartition() throws IOException { + tablePath = tablePath + "/rename_partition_test/"; + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(HoodieTableType.COPY_ON_WRITE.name()) + .setTableName(tableName()) + .setArchiveLogFolder(HoodieTableConfig.ARCHIVELOG_FOLDER.defaultValue()) + .setPayloadClassName("org.apache.hudi.common.model.HoodieAvroPayload") + .setTimelineLayoutVersion(TimelineLayoutVersion.VERSION_1) + .setPartitionFields("partition_path") + .setRecordKeyFields("_row_key") + .setKeyGeneratorClassProp(SimpleKeyGenerator.class.getCanonicalName()) + .initTable(HoodieCLI.conf, tablePath); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(tablePath).withSchema(TRIP_EXAMPLE_SCHEMA).build(); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(context(), config)) { + String newCommitTime = "001"; + int numRecords = 20; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, numRecords); + JavaRDD writeRecords = context().getJavaSparkContext().parallelize(records, 1); + List result = client.upsert(writeRecords, newCommitTime).collect(); + Assertions.assertNoWriteErrors(result); + + SQLContext sqlContext = context().getSqlContext(); + long totalRecs = sqlContext.read().format("hudi").load(tablePath).count(); + assertEquals(totalRecs, 20); + long totalRecsInOldPartition = sqlContext.read().format("hudi").load(tablePath) + .filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " == '" + DEFAULT_FIRST_PARTITION_PATH + "'").count(); + + // Execute rename partition command + assertEquals(0, SparkMain.renamePartition(jsc(), tablePath, DEFAULT_FIRST_PARTITION_PATH, "2016/03/18")); + + // there should not be any records in old partition + totalRecs = sqlContext.read().format("hudi").load(tablePath) + .filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " == '" + DEFAULT_FIRST_PARTITION_PATH + "'").count(); + assertEquals(totalRecs, 0); + + // all records from old partition should have been migrated to new partition + totalRecs = sqlContext.read().format("hudi").load(tablePath) + .filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " == '" + "2016/03/18" + "'").count(); + assertEquals(totalRecs, totalRecsInOldPartition); + } + } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java 
b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java index 8cf2be9580638..a4144937621a6 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java @@ -23,26 +23,32 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; -import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; import java.io.IOException; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -60,17 +66,22 @@ /** * Test class for {@link org.apache.hudi.cli.commands.RollbacksCommand}. 
*/ -public class TestRollbacksCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestRollbacksCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; @BeforeEach public void init() throws Exception { - String tableName = "test_table"; - String tablePath = Paths.get(basePath, tableName).toString(); + String tableName = tableName(); + String tablePath = tablePath(tableName); new TableCommand().createTable( tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); - //Create some commits files and parquet files + HoodieTableMetaClient metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + //Create some commits files and base files Map partitionAndFileId = new HashMap() { { put(DEFAULT_FIRST_PARTITION_PATH, "file-1"); @@ -78,7 +89,19 @@ public void init() throws Exception { put(DEFAULT_THIRD_PARTITION_PATH, "file-3"); } }; - HoodieTestTable.of(metaClient) + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(tablePath) + .withMetadataConfig( + // Column Stats Index is disabled, since these tests construct tables which are + // not valid (empty commit metadata, etc) + HoodieMetadataConfig.newBuilder() + .withMetadataIndexColumnStats(false) + .build() + ) + .withRollbackUsingMarkers(false) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + HoodieMetadataTestTable.of(metaClient, SparkHoodieBackedTableMetadataWriter.create( + metaClient.getHadoopConf(), config, context)) .withPartitionMetaFiles(DEFAULT_PARTITION_PATHS) .addCommit("100") .withBaseFilesInPartitions(partitionAndFileId) @@ -86,11 +109,9 @@ public void init() throws Exception { .withBaseFilesInPartitions(partitionAndFileId) .addInflightCommit("102") .withBaseFilesInPartitions(partitionAndFileId); - // generate two rollback - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(tablePath) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); - try (AbstractHoodieWriteClient client = getHoodieWriteClient(config)) { + // generate two rollback + try (BaseHoodieWriteClient client = new SparkRDDWriteClient(context(), config)) { // Rollback inflight commit3 and commit2 client.rollback("102"); client.rollback("101"); @@ -102,8 +123,8 @@ public void init() throws Exception { */ @Test public void testShowRollbacks() { - CommandResult cr = getShell().executeCommand("show rollbacks"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show rollbacks"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // get rollback instants HoodieActiveTimeline activeTimeline = new RollbacksCommand.RollbackTimeline(HoodieCLI.getTableMetaClient()); @@ -137,7 +158,7 @@ public void testShowRollbacks() { .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_PARTITIONS); String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -152,8 +173,8 @@ public void testShowRollback() 
throws IOException { HoodieInstant instant = rollback.findFirst().orElse(null); assertNotNull(instant, "The instant can not be null."); - CommandResult cr = getShell().executeCommand("show rollback --instant " + instant.getTimestamp()); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "show rollback --instant " + instant.getTimestamp()); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); List rows = new ArrayList<>(); // get metadata of instant @@ -180,7 +201,7 @@ public void testShowRollback() throws IOException { .addTableHeaderField(HoodieTableHeaderFields.HEADER_SUCCEEDED); String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java index 949a764038cbf..e4c8a4b1a41a4 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSavepointsCommand.java @@ -21,20 +21,23 @@ import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; -import java.util.Arrays; import java.util.Comparator; +import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -42,14 +45,19 @@ /** * Test class for {@link org.apache.hudi.cli.commands.SavepointsCommand}. 
*/ -public class TestSavepointsCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestSavepointsCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; private String tablePath; @BeforeEach public void init() throws IOException { - String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + String tableName = tableName(); + tablePath = tablePath(tableName); // Create table and connect new TableCommand().createTable( @@ -65,18 +73,18 @@ public void testShowSavepoints() throws IOException { // generate four savepoints for (int i = 100; i < 104; i++) { String instantTime = String.valueOf(i); - HoodieTestDataGenerator.createSavepointFile(tablePath, instantTime, jsc.hadoopConfiguration()); + HoodieTestDataGenerator.createSavepointFile(tablePath, instantTime, hadoopConf()); } - CommandResult cr = getShell().executeCommand("savepoints show"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "savepoints show"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // generate expect result - String[][] rows = Arrays.asList("100", "101", "102", "103").stream().sorted(Comparator.reverseOrder()) - .map(instant -> new String[]{instant}).toArray(String[][]::new); + String[][] rows = Stream.of("100", "101", "102", "103").sorted(Comparator.reverseOrder()) + .map(instant -> new String[] {instant}).toArray(String[][]::new); String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_SAVEPOINT_TIME}, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -92,7 +100,7 @@ public void testRefreshMetaClient() throws IOException { // generate four savepoints for (int i = 100; i < 104; i++) { String instantTime = String.valueOf(i); - HoodieTestDataGenerator.createSavepointFile(tablePath, instantTime, jsc.hadoopConfiguration()); + HoodieTestDataGenerator.createSavepointFile(tablePath, instantTime, hadoopConf()); } // Before refresh, no instant @@ -100,8 +108,8 @@ public void testRefreshMetaClient() throws IOException { HoodieCLI.getTableMetaClient().getActiveTimeline().getSavePointTimeline().filterCompletedInstants(); assertEquals(0, timeline.countInstants(), "there should have no instant"); - CommandResult cr = getShell().executeCommand("savepoints refresh"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "savepoints refresh"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); timeline = HoodieCLI.getTableMetaClient().getActiveTimeline().getSavePointTimeline().filterCompletedInstants(); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSparkEnvCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSparkEnvCommand.java index 19fcf2f42a2d2..09f5bd0576a68 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSparkEnvCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestSparkEnvCommand.java @@ -19,10 +19,14 @@ package org.apache.hudi.cli.commands; import org.apache.hudi.cli.HoodiePrintHelper; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import 
org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -30,7 +34,12 @@ /** * Test Cases for {@link SparkEnvCommand}. */ -public class TestSparkEnvCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestSparkEnvCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; /** * Test Cases for set and get spark env. @@ -38,21 +47,21 @@ public class TestSparkEnvCommand extends AbstractShellIntegrationTest { @Test public void testSetAndGetSparkEnv() { // First, be empty - CommandResult cr = getShell().executeCommand("show envs all"); + Object cmdResult = shell.evaluate(() -> "show envs all"); String nullResult = HoodiePrintHelper.print(new String[] {"key", "value"}, new String[0][2]); nullResult = removeNonWordAndStripSpace(nullResult); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(cmdResult.toString()); assertEquals(nullResult, got); // Set SPARK_HOME - cr = getShell().executeCommand("set --conf SPARK_HOME=/usr/etc/spark"); - assertTrue(cr.isSuccess()); + cmdResult = shell.evaluate(() -> "set --conf SPARK_HOME=/usr/etc/spark"); + assertTrue(ShellEvaluationResultUtil.isSuccess(cmdResult)); //Get - cr = getShell().executeCommand("show env --key SPARK_HOME"); + cmdResult = shell.evaluate(() -> "show env --key SPARK_HOME"); String result = HoodiePrintHelper.print(new String[] {"key", "value"}, new String[][] {new String[] {"SPARK_HOME", "/usr/etc/spark"}}); result = removeNonWordAndStripSpace(result); - got = removeNonWordAndStripSpace(cr.getResult().toString()); + got = removeNonWordAndStripSpace(cmdResult.toString()); assertEquals(result, got); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java index cd4f1960232b0..dfdb37b3bb00a 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestStatsCommand.java @@ -22,8 +22,9 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -34,10 +35,12 @@ import com.codahale.metrics.Snapshot; import com.codahale.metrics.UniformReservoir; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import 
org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; import java.text.DecimalFormat; import java.util.ArrayList; @@ -52,21 +55,25 @@ /** * Test class of {@link org.apache.hudi.cli.commands.StatsCommand}. */ -public class TestStatsCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestStatsCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; private String tablePath; @BeforeEach public void init() throws IOException { - String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + String tableName = tableName(); + tablePath = tablePath(tableName); - HoodieCLI.conf = jsc.hadoopConfiguration(); + HoodieCLI.conf = hadoopConf(); // Create table and connect new TableCommand().createTable( tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient = HoodieCLI.getTableMetaClient(); } /** @@ -83,12 +90,12 @@ public void testWriteAmplificationStats() throws Exception { for (Map.Entry entry : data.entrySet()) { String k = entry.getKey(); Integer[] v = entry.getValue(); - HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, k, jsc.hadoopConfiguration(), + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, k, hadoopConf(), Option.of(v[0]), Option.of(v[1])); } - CommandResult cr = getShell().executeCommand("stats wa"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "stats wa"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // generate expect List rows = new ArrayList<>(); @@ -99,7 +106,7 @@ public void testWriteAmplificationStats() throws Exception { }); int totalWrite = data.values().stream().map(integers -> integers[0] * 2).mapToInt(s -> s).sum(); int totalUpdate = data.values().stream().map(integers -> integers[1] * 2).mapToInt(s -> s).sum(); - rows.add(new Comparable[]{"Total", totalUpdate, totalWrite, df.format((float) totalWrite / totalUpdate)}); + rows.add(new Comparable[] {"Total", totalUpdate, totalWrite, df.format((float) totalWrite / totalUpdate)}); TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_COMMIT_TIME) .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPSERTED) @@ -107,7 +114,7 @@ public void testWriteAmplificationStats() throws Exception { .addTableHeaderField(HoodieTableHeaderFields.HEADER_WRITE_AMPLIFICATION_FACTOR); String expected = HoodiePrintHelper.print(header, new HashMap<>(), "", false, -1, false, rows); expected = removeNonWordAndStripSpace(expected); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expected, got); } @@ -127,7 +134,7 @@ public void testFileSizeStats() throws Exception { String partition2 = HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; String partition3 = HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; - HoodieTestTable testTable = HoodieTestTable.of(metaClient); + HoodieTestTable testTable = HoodieTestTable.of(HoodieCLI.getTableMetaClient()); Integer[] data1 = data.get(commit1); assertTrue(3 <= data1.length); testTable.addCommit(commit1) @@ -142,8 +149,8 @@ public void testFileSizeStats() throws Exception { .withBaseFilesInPartition(partition2, data2[1], data2[2]) 
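The fixture above lays out commits and base files directly through HoodieTestTable instead of running real writes; since testFileSizeStats() is declared to throw Exception, the fluent calls can be used as-is. A hedged sketch with illustrative partition names and file sizes:

    // Each withBaseFilesInPartition(...) call creates base files of the given (illustrative) sizes,
    // which the 'stats filesizes' command later aggregates into its histogram.
    HoodieTestTable testTable = HoodieTestTable.of(HoodieCLI.getTableMetaClient());
    testTable.addCommit("100")
        .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, 100, 120)
        .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, 200);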
.withBaseFilesInPartition(partition3, data2[3]); - CommandResult cr = getShell().executeCommand("stats filesizes"); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "stats filesizes"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); Histogram globalHistogram = new Histogram(new UniformReservoir(StatsCommand.MAX_FILES)); HashMap commitHistoMap = new HashMap<>(); @@ -177,7 +184,7 @@ public void testFileSizeStats() throws Exception { String expect = HoodiePrintHelper.print(header, new StatsCommand().getFieldNameToConverterMap(), "", false, -1, false, rows); expect = removeNonWordAndStripSpace(expect); - String got = removeNonWordAndStripSpace(cr.getResult().toString()); + String got = removeNonWordAndStripSpace(result.toString()); assertEquals(expect, got); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java index cdf9db3443fe6..c1c44f6251889 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java @@ -18,24 +18,41 @@ package org.apache.hudi.cli.commands; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.cli.HoodieCLI; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Paths; import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import static org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -45,29 +62,36 @@ /** * Test Cases for {@link TableCommand}. 
*/ -public class TestTableCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestTableCommand extends CLIFunctionalTestHarness { - private final String tableName = "test_table"; + @Autowired + private Shell shell; + + private String tableName; private String tablePath; private String metaPath; + private String archivePath; /** * Init path after Mini hdfs init. */ @BeforeEach public void init() { - HoodieCLI.conf = jsc.hadoopConfiguration(); - tablePath = basePath + File.separator + tableName; - metaPath = tablePath + File.separator + METAFOLDER_NAME; + HoodieCLI.conf = hadoopConf(); + tableName = tableName(); + tablePath = tablePath(tableName); + metaPath = Paths.get(tablePath, METAFOLDER_NAME).toString(); + archivePath = Paths.get(metaPath, HoodieTableConfig.ARCHIVELOG_FOLDER.defaultValue()).toString(); } /** * Method to create a table for connect or desc. */ private boolean prepareTable() { - CommandResult cr = getShell().executeCommand( - "create --path " + tablePath + " --tableName " + tableName); - return cr.isSuccess(); + Object result = shell.evaluate(() -> "create --path " + tablePath + " --tableName " + tableName); + return ShellEvaluationResultUtil.isSuccess(result); } /** @@ -79,10 +103,9 @@ public void testConnectTable() { assertTrue(prepareTable()); // Test connect with specified values - CommandResult cr = getShell().executeCommand( - "connect --path " + tablePath + " --initialCheckIntervalMs 3000 " + Object result = shell.evaluate(() -> "connect --path " + tablePath + " --initialCheckIntervalMs 3000 " + "--maxWaitIntervalMs 40000 --maxCheckIntervalMs 8"); - assertTrue(cr.isSuccess()); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // Check specified values ConsistencyGuardConfig conf = HoodieCLI.consistencyGuardConfig; @@ -105,7 +128,7 @@ public void testDefaultCreate() { // Test meta HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); - assertEquals(metaPath, client.getArchivePath()); + assertEquals(archivePath, client.getArchivePath()); assertEquals(tablePath, client.getBasePath()); assertEquals(metaPath, client.getMetaPath()); assertEquals(HoodieTableType.COPY_ON_WRITE, client.getTableType()); @@ -118,13 +141,12 @@ public void testDefaultCreate() { @Test public void testCreateWithSpecifiedValues() { // Test create with specified values - CommandResult cr = getShell().executeCommand( - "create --path " + tablePath + " --tableName " + tableName + Object result = shell.evaluate(() -> "create --path " + tablePath + " --tableName " + tableName + " --tableType MERGE_ON_READ --archiveLogFolder archive"); - assertTrue(cr.isSuccess()); - assertEquals("Metadata for table " + tableName + " loaded", cr.getResult().toString()); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + assertEquals("Metadata for table " + tableName + " loaded", result.toString()); HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); - assertEquals(metaPath + File.separator + "archive", client.getArchivePath()); + assertEquals(metaPath + Path.SEPARATOR + "archive", client.getArchivePath()); assertEquals(tablePath, client.getBasePath()); assertEquals(metaPath, client.getMetaPath()); assertEquals(HoodieTableType.MERGE_ON_READ, client.getTableType()); @@ -139,13 +161,13 @@ public void testDescTable() { assertTrue(prepareTable()); // Test desc table - CommandResult cr = getShell().executeCommand("desc"); - 
assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> "desc"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // check table's basePath metaPath and type - assertTrue(cr.getResult().toString().contains(tablePath)); - assertTrue(cr.getResult().toString().contains(metaPath)); - assertTrue(cr.getResult().toString().contains("COPY_ON_WRITE")); + assertTrue(result.toString().contains(tablePath)); + assertTrue(result.toString().contains(metaPath)); + assertTrue(result.toString().contains("COPY_ON_WRITE")); } /** @@ -155,15 +177,15 @@ public void testDescTable() { public void testRefresh() throws IOException { List refreshCommands = Arrays.asList("refresh", "metadata refresh", "commits refresh", "cleans refresh", "savepoints refresh"); - for (String command: refreshCommands) { + for (String command : refreshCommands) { testRefreshCommand(command); } } private void testRefreshCommand(String command) throws IOException { // clean table matedata - FileSystem fs = FileSystem.get(jsc.hadoopConfiguration()); - fs.delete(new Path(tablePath + File.separator + HoodieTableMetaClient.METAFOLDER_NAME), true); + FileSystem fs = FileSystem.get(hadoopConf()); + fs.delete(new Path(tablePath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME), true); // Create table assertTrue(prepareTable()); @@ -175,7 +197,7 @@ private void testRefreshCommand(String command) throws IOException { // generate four savepoints for (int i = 100; i < 104; i++) { String instantTime = String.valueOf(i); - HoodieTestDataGenerator.createCommitFile(tablePath, instantTime, jsc.hadoopConfiguration()); + HoodieTestDataGenerator.createCommitFile(tablePath, instantTime, hadoopConf()); } // Before refresh, no instant @@ -183,8 +205,8 @@ private void testRefreshCommand(String command) throws IOException { HoodieCLI.getTableMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(); assertEquals(0, timeline.countInstants(), "there should have no instant"); - CommandResult cr = getShell().executeCommand(command); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> command); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); timeline = HoodieCLI.getTableMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(); @@ -192,4 +214,77 @@ private void testRefreshCommand(String command) throws IOException { // After refresh, there are 4 instants assertEquals(4, timeline.countInstants(), "there should have 4 instants"); } + + @Test + public void testFetchTableSchema() throws Exception { + // Create table and connect + HoodieCLI.conf = hadoopConf(); + new TableCommand().createTable( + tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + + String schemaStr = "{\n" + + " \"type\" : \"record\",\n" + + " \"name\" : \"SchemaName\",\n" + + " \"namespace\" : \"SchemaNS\",\n" + + " \"fields\" : [ {\n" + + " \"name\" : \"key\",\n" + + " \"type\" : \"int\"\n" + + " }, {\n" + + " \"name\" : \"val\",\n" + + " \"type\" : [ \"null\", \"string\" ],\n" + + " \"default\" : null\n" + + " }]};"; + + generateData(schemaStr); + + Object result = shell.evaluate(() -> "fetch table schema"); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + String actualSchemaStr = result.toString().substring(result.toString().indexOf("{")); + Schema actualSchema = new Schema.Parser().parse(actualSchemaStr); + + Schema expectedSchema = new Schema.Parser().parse(schemaStr); + expectedSchema = 
HoodieAvroUtils.addMetadataFields(expectedSchema); + assertEquals(actualSchema, expectedSchema); + + File file = File.createTempFile("temp", null); + result = shell.evaluate(() -> "fetch table schema --outputFilePath " + file.getAbsolutePath()); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + actualSchemaStr = getFileContent(file.getAbsolutePath()); + actualSchema = new Schema.Parser().parse(actualSchemaStr); + assertEquals(actualSchema, expectedSchema); + } + + private LinkedHashMap generateData(String schemaStr) throws Exception { + // generate data and metadata + LinkedHashMap data = new LinkedHashMap<>(); + data.put("102", new Integer[] {15, 10}); + data.put("101", new Integer[] {20, 10}); + data.put("100", new Integer[] {15, 15}); + for (Map.Entry entry : data.entrySet()) { + String key = entry.getKey(); + Integer[] value = entry.getValue(); + HoodieTestCommitMetadataGenerator.createCommitFileWithMetadata(tablePath, key, HoodieCLI.conf, + Option.of(value[0]), Option.of(value[1]), Collections.singletonMap(HoodieCommitMetadata.SCHEMA_KEY, schemaStr)); + } + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + assertEquals(3, metaClient.reloadActiveTimeline().getCommitsTimeline().countInstants(), + "There should have 3 commits"); + return data; + } + + private String getFileContent(String fileToReadStr) throws IOException { + File fileToRead = new File(fileToReadStr); + if (!fileToRead.exists()) { + throw new IllegalStateException("Outfile " + fileToReadStr + "not found "); + } + FileInputStream fis = new FileInputStream(fileToRead); + byte[] data = new byte[(int) fileToRead.length()]; + fis.read(data); + fis.close(); + return new String(data, StandardCharsets.UTF_8); + } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTempViewCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTempViewCommand.java index 504f0d7ba5c7e..b6f17fa3364e7 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTempViewCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTempViewCommand.java @@ -19,13 +19,19 @@ package org.apache.hudi.cli.commands; import org.apache.hudi.cli.HoodieCLI; -import org.apache.hudi.cli.testutils.AbstractShellBaseIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.MockCommandLineInput; +import org.apache.hudi.cli.utils.SparkTempViewProvider; +import org.apache.hudi.cli.utils.TempViewProvider; import org.apache.hudi.exception.HoodieException; -import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; import java.util.ArrayList; import java.util.Arrays; @@ -35,9 +41,14 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestTempViewCommand extends AbstractShellBaseIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestTempViewCommand extends CLIFunctionalTestHarness { - private String tableName = "test_table"; + @Autowired + private 
Shell shell; + private TempViewProvider tempViewProvider; + private final String tableName = tableName(); @BeforeEach public void init() { @@ -45,38 +56,41 @@ public void init() { for (int i = 0; i < 3; i++) { rows.add(Arrays.asList(new Comparable[] {"c1", "c2", "c3"})); } - HoodieCLI.getTempViewProvider().createOrReplace(tableName, Arrays.asList("t1", "t2", "t3"), rows); + tempViewProvider = new SparkTempViewProvider(jsc(), sqlContext()); + tempViewProvider.createOrReplace(tableName, Arrays.asList("t1", "t2", "t3"), rows); + HoodieCLI.tempViewProvider = tempViewProvider; } - @AfterAll - public static void shutdown() { - if (HoodieCLI.getTempViewProvider() != null) { - HoodieCLI.closeTempViewProvider(); - } + @AfterEach + public void cleanUpTempView() { + tempViewProvider.close(); + HoodieCLI.tempViewProvider = null; } @Test public void testQueryWithException() { - CommandResult cr = getShell().executeCommand(String.format("temp query --sql 'select * from %s'", "table_1")); - assertEquals(TempViewCommand.QUERY_FAIL, cr.getResult().toString()); + Object result = shell.evaluate((MockCommandLineInput) () -> + String.format("temp query --sql 'select * from %s'", "table_non_exist")); + assertEquals(TempViewCommand.QUERY_FAIL, result.toString()); } @Test public void testQuery() { - CommandResult cr = getShell().executeCommand(String.format("temp query --sql 'select * from %s'", tableName)); - assertEquals(TempViewCommand.QUERY_SUCCESS, cr.getResult().toString()); + Object result = shell.evaluate((MockCommandLineInput) () -> + String.format("temp query --sql 'select * from %s'", tableName)); + assertEquals(TempViewCommand.QUERY_SUCCESS, result.toString()); } @Test public void testShowAll() { - CommandResult cr = getShell().executeCommand("temps show"); - assertEquals(TempViewCommand.SHOW_SUCCESS, cr.getResult().toString()); + Object result = shell.evaluate(() -> "temps show"); + assertEquals(TempViewCommand.SHOW_SUCCESS, result.toString()); } @Test public void testDelete() { - CommandResult cr = getShell().executeCommand(String.format("temp delete --view %s", tableName)); - assertTrue(cr.getResult().toString().endsWith("successfully!")); + Object result = shell.evaluate(() -> String.format("temp delete --view %s", tableName)); + assertTrue(result.toString().endsWith("successfully!")); // after delete, we can not access table yet. assertThrows(HoodieException.class, () -> HoodieCLI.getTempViewProvider().runQuery("select * from " + tableName)); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java index b9aa3f7310c3a..ed4c9528243ce 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java @@ -7,37 +7,46 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ package org.apache.hudi.cli.commands; import org.apache.hudi.cli.HoodieCLI; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.IOType; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.testutils.FileCreateUtils; import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; import java.io.IOException; -import java.nio.file.Paths; -import java.util.Properties; +import java.util.Arrays; +import java.util.stream.Stream; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS; @@ -48,19 +57,23 @@ /** * Tests {@link UpgradeOrDowngradeCommand}. 
*/ -public class TestUpgradeDowngradeCommand extends AbstractShellIntegrationTest { +@Tag("functional") +public class TestUpgradeDowngradeCommand extends CLIFunctionalTestHarness { private String tablePath; + private HoodieTableMetaClient metaClient; @BeforeEach public void init() throws Exception { - String tableName = "test_table"; - tablePath = Paths.get(basePath, tableName).toString(); + String tableName = tableName(); + tablePath = tablePath(tableName); new TableCommand().createTable( tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + timelineService = HoodieClientTestUtils.initTimelineService( + context, basePath(), FileSystemViewStorageConfig.REMOTE_PORT_NUM.defaultValue()); metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); - //Create some commits files and parquet files + //Create some commits files and base files HoodieTestTable.of(metaClient) .withPartitionMetaFiles(DEFAULT_PARTITION_PATHS) .addCommit("100") @@ -76,12 +89,34 @@ public void init() throws Exception { .withMarkerFile(DEFAULT_THIRD_PARTITION_PATH, "file-3", IOType.MERGE); } - @Test - public void testDowngradeCommand() throws Exception { - // update hoodie.table.version to 1 - metaClient.getTableConfig().setTableVersion(HoodieTableVersion.ONE); + @AfterEach + public void cleanup() { + if (timelineService != null) { + timelineService.close(); + } + } + + private static Stream testArgsForUpgradeDowngradeCommand() { + return Arrays.stream(new HoodieTableVersion[][] { + {HoodieTableVersion.FIVE, HoodieTableVersion.ZERO}, + {HoodieTableVersion.ZERO, HoodieTableVersion.ONE}, + // Table upgrade from version ONE to TWO requires key generator related configs + // such as "hoodie.datasource.write.recordkey.field" which is only available + // when user configures the write job. So the table upgrade from version ONE to TWO + // through CLI is not supported, and user should rely on the automatic upgrade + // in the write client instead. 
+ // {HoodieTableVersion.ONE, HoodieTableVersion.TWO}, + {HoodieTableVersion.TWO, HoodieTableVersion.FIVE} + }).map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("testArgsForUpgradeDowngradeCommand") + public void testUpgradeDowngradeCommand(HoodieTableVersion fromVersion, HoodieTableVersion toVersion) throws Exception { + // Start with hoodie.table.version to 5 + metaClient.getTableConfig().setTableVersion(HoodieTableVersion.FIVE); try (FSDataOutputStream os = metaClient.getFs().create(new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE), true)) { - metaClient.getTableConfig().getProperties().store(os, ""); + metaClient.getTableConfig().getProps().store(os, ""); } metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); @@ -90,28 +125,48 @@ public void testDowngradeCommand() throws Exception { assertEquals(1, FileCreateUtils.getTotalMarkerFileCount(tablePath, partitionPath, "101", IOType.MERGE)); } - SparkMain.upgradeOrDowngradeTable(jsc, tablePath, HoodieTableVersion.ZERO.name()); - - // verify hoodie.table.version got downgraded - metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + if (fromVersion != HoodieTableVersion.FIVE) { + SparkMain.upgradeOrDowngradeTable(jsc(), tablePath, fromVersion.name()); + } + verifyTableVersion(fromVersion); - // verify hoodie.table.version - assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.ZERO.versionCode()); - assertTableVersionFromPropertyFile(); + SparkMain.upgradeOrDowngradeTable(jsc(), tablePath, toVersion.name()); + verifyTableVersion(toVersion); - // verify marker files are non existant - for (String partitionPath : DEFAULT_PARTITION_PATHS) { - assertEquals(0, FileCreateUtils.getTotalMarkerFileCount(tablePath, partitionPath, "101", IOType.MERGE)); + if (toVersion == HoodieTableVersion.ZERO) { + // verify marker files are non existent + for (String partitionPath : DEFAULT_PARTITION_PATHS) { + assertEquals(0, FileCreateUtils.getTotalMarkerFileCount(tablePath, partitionPath, "101", IOType.MERGE)); + } } } - private void assertTableVersionFromPropertyFile() throws IOException { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testGetHoodieTableVersionName(boolean overrideWithDefault) { + assertEquals(overrideWithDefault ? HoodieTableVersion.current().name() : null, + UpgradeOrDowngradeCommand.getHoodieTableVersionName(null, overrideWithDefault)); + assertEquals(overrideWithDefault ? 
HoodieTableVersion.current().name() : "", + UpgradeOrDowngradeCommand.getHoodieTableVersionName("", overrideWithDefault)); + assertEquals("FIVE", + UpgradeOrDowngradeCommand.getHoodieTableVersionName("FIVE", overrideWithDefault)); + assertEquals("FIVE", + UpgradeOrDowngradeCommand.getHoodieTableVersionName("5", overrideWithDefault)); + } + + private void verifyTableVersion(HoodieTableVersion expectedVersion) throws IOException { + metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + assertEquals(expectedVersion.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode()); + assertTableVersionFromPropertyFile(expectedVersion); + } + + private void assertTableVersionFromPropertyFile(HoodieTableVersion expectedVersion) throws IOException { Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); // Load the properties and verify FSDataInputStream fsDataInputStream = metaClient.getFs().open(propertyFile); - Properties prop = new Properties(); - prop.load(fsDataInputStream); + HoodieConfig hoodieConfig = HoodieConfig.create(fsDataInputStream); fsDataInputStream.close(); - assertEquals(Integer.toString(HoodieTableVersion.ZERO.versionCode()), prop.getProperty(HoodieTableConfig.HOODIE_TABLE_VERSION_PROP_NAME)); + assertEquals(Integer.toString(expectedVersion.versionCode()), hoodieConfig + .getString(HoodieTableConfig.VERSION)); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUtilsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUtilsCommand.java index 63c4bcddd9d39..f7b82d7a3dc6d 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUtilsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUtilsCommand.java @@ -18,11 +18,15 @@ package org.apache.hudi.cli.commands; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.table.HoodieTable; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; import static org.junit.jupiter.api.Assertions.assertAll; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -32,7 +36,12 @@ /** * Test class for {@link org.apache.hudi.cli.commands.UtilsCommand}. */ -public class TestUtilsCommand extends AbstractShellIntegrationTest { +@Tag("functional") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class TestUtilsCommand extends CLIFunctionalTestHarness { + + @Autowired + private Shell shell; /** * Test case for success load class. 
@@ -40,11 +49,11 @@ public class TestUtilsCommand extends AbstractShellIntegrationTest { @Test public void testLoadClass() { String name = HoodieTable.class.getName(); - CommandResult cr = getShell().executeCommand(String.format("utils loadClass --class %s", name)); + Object result = shell.evaluate(() -> String.format("utils loadClass --class %s", name)); assertAll("Command runs success", - () -> assertTrue(cr.isSuccess()), - () -> assertNotNull(cr.getResult().toString()), - () -> assertTrue(cr.getResult().toString().startsWith("file:"))); + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertNotNull(result.toString()), + () -> assertTrue(result.toString().startsWith("file:"))); } /** @@ -53,12 +62,12 @@ public void testLoadClass() { @Test public void testLoadClassNotFound() { String name = "test.class.NotFound"; - CommandResult cr = getShell().executeCommand(String.format("utils loadClass --class %s", name)); + Object result = shell.evaluate(() -> String.format("utils loadClass --class %s", name)); assertAll("Command runs success", - () -> assertTrue(cr.isSuccess()), - () -> assertNotNull(cr.getResult().toString()), - () -> assertEquals(cr.getResult().toString(), String.format("Class %s not found!", name))); + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertNotNull(result.toString()), + () -> assertEquals(result.toString(), String.format("Class %s not found!", name))); } /** @@ -67,11 +76,11 @@ public void testLoadClassNotFound() { @Test public void testLoadClassNull() { String name = ""; - CommandResult cr = getShell().executeCommand(String.format("utils loadClass --class %s", name)); + Object result = shell.evaluate(() -> String.format("utils loadClass --class %s", name)); assertAll("Command runs success", - () -> assertTrue(cr.isSuccess()), - () -> assertNotNull(cr.getResult().toString()), - () -> assertEquals("Class to be loaded can not be null!", cr.getResult().toString())); + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertNotNull(result.toString()), + () -> assertEquals("Class to be loaded can not be null!", result.toString())); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java b/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java new file mode 100644 index 0000000000000..04f77df549606 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.cli.functional; + +import org.apache.hudi.client.SparkRDDReadClient; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.testutils.HoodieClientTestUtils; +import org.apache.hudi.testutils.providers.SparkProvider; +import org.apache.hudi.timeline.service.TimelineService; + +import org.apache.hadoop.conf.Configuration; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Paths; + +public class CLIFunctionalTestHarness implements SparkProvider { + + protected static int timelineServicePort = + FileSystemViewStorageConfig.REMOTE_PORT_NUM.defaultValue(); + protected static transient TimelineService timelineService; + protected static transient HoodieSparkEngineContext context; + private static transient SparkSession spark; + private static transient SQLContext sqlContext; + private static transient JavaSparkContext jsc; + + /** + * An indicator of the initialization status. + */ + protected boolean initialized = false; + @TempDir + protected java.nio.file.Path tempDir; + + public String basePath() { + return tempDir.toAbsolutePath().toString(); + } + + @Override + public SparkSession spark() { + return spark; + } + + @Override + public SQLContext sqlContext() { + return sqlContext; + } + + @Override + public JavaSparkContext jsc() { + return jsc; + } + + @Override + public HoodieSparkEngineContext context() { + return context; + } + + public String tableName() { + return tableName("_test_table"); + } + + public String tableName(String suffix) { + return getClass().getSimpleName() + suffix; + } + + public String tablePath(String tableName) { + return Paths.get(basePath(), tableName).toString(); + } + + public Configuration hadoopConf() { + return jsc().hadoopConfiguration(); + } + + @BeforeEach + public synchronized void runBeforeEach() { + initialized = spark != null; + if (!initialized) { + SparkConf sparkConf = conf(); + SparkRDDWriteClient.registerClasses(sparkConf); + SparkRDDReadClient.addHoodieSupport(sparkConf); + spark = SparkSession.builder().config(sparkConf).getOrCreate(); + sqlContext = spark.sqlContext(); + jsc = new JavaSparkContext(spark.sparkContext()); + context = new HoodieSparkEngineContext(jsc); + timelineService = HoodieClientTestUtils.initTimelineService( + context, basePath(), incrementTimelineServicePortToUse()); + timelineServicePort = timelineService.getServerPort(); + } + } + + @AfterAll + public static synchronized void cleanUpAfterAll() { + if (spark != null) { + spark.close(); + spark = null; + } + if (timelineService != null) { + timelineService.close(); + } + } + + /** + * Helper to prepare string for matching. + * + * @param str Input string. + * @return pruned string with non word characters removed. 
+ */ + protected static String removeNonWordAndStripSpace(String str) { + return str.replaceAll("[\\s]+", ",").replaceAll("[\\W]+", ","); + } + + protected int incrementTimelineServicePortToUse() { + // Increment the timeline service port for each individual test + // to avoid port reuse causing failures + timelineServicePort = (timelineServicePort + 1 - 1024) % (65536 - 1024) + 1024; + return timelineServicePort; + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java index 0cb278e634619..f22ce1bbaf523 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java @@ -18,21 +18,23 @@ package org.apache.hudi.cli.integ; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.commands.TableCommand; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; -import org.apache.hudi.client.TestBootstrap; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; - +import org.apache.hudi.functional.TestBootstrap; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; import java.time.Instant; import java.util.Arrays; @@ -44,8 +46,11 @@ /** * Test class of {@link org.apache.hudi.cli.commands.BootstrapCommand}. 
*/ -public class ITTestBootstrapCommand extends AbstractShellIntegrationTest { +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestBootstrapCommand extends HoodieCLIIntegrationTestBase { + @Autowired + private Shell shell; private static final int NUM_OF_RECORDS = 100; private static final String PARTITION_FIELD = "datestr"; private static final String RECORD_KEY_FIELD = "_row_key"; @@ -59,8 +64,8 @@ public class ITTestBootstrapCommand extends AbstractShellIntegrationTest { public void init() { String srcName = "source"; tableName = "test-table"; - sourcePath = basePath + File.separator + srcName; - tablePath = basePath + File.separator + tableName; + sourcePath = basePath + Path.SEPARATOR + srcName; + tablePath = basePath + Path.SEPARATOR + tableName; // generate test data partitions = Arrays.asList("2018", "2019", "2020"); @@ -68,7 +73,7 @@ public void init() { for (int i = 0; i < partitions.size(); i++) { Dataset df = TestBootstrap.generateTestRawTripDataset(timestamp, i * NUM_OF_RECORDS, i * NUM_OF_RECORDS + NUM_OF_RECORDS, null, jsc, sqlContext); - df.write().parquet(sourcePath + File.separator + PARTITION_FIELD + "=" + partitions.get(i)); + df.write().parquet(sourcePath + Path.SEPARATOR + PARTITION_FIELD + "=" + partitions.get(i)); } } @@ -81,8 +86,8 @@ public void testBootstrapRunCommand() throws IOException { String cmdStr = String.format( "bootstrap run --targetPath %s --tableName %s --tableType %s --srcPath %s --rowKeyField %s --partitionPathField %s --sparkMaster %s", tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), sourcePath, RECORD_KEY_FIELD, PARTITION_FIELD, "local"); - CommandResult cr = getShell().executeCommand(cmdStr); - assertTrue(cr.isSuccess()); + Object resultForBootstrapRun = shell.evaluate(() -> cmdStr); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForBootstrapRun)); // Connect & check Hudi table exist new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7); @@ -90,8 +95,8 @@ public void testBootstrapRunCommand() throws IOException { assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should have 1 commit."); // test "bootstrap index showpartitions" - CommandResult crForIndexedPartitions = getShell().executeCommand("bootstrap index showpartitions"); - assertTrue(crForIndexedPartitions.isSuccess()); + Object resultForIndexedPartitions = shell.evaluate(() -> "bootstrap index showpartitions"); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForIndexedPartitions)); String[] header = new String[] {"Indexed partitions"}; String[][] rows = new String[partitions.size()][1]; @@ -100,15 +105,15 @@ public void testBootstrapRunCommand() throws IOException { } String expect = HoodiePrintHelper.print(header, rows); expect = removeNonWordAndStripSpace(expect); - String got = removeNonWordAndStripSpace(crForIndexedPartitions.getResult().toString()); + String got = removeNonWordAndStripSpace(resultForIndexedPartitions.toString()); assertEquals(expect, got); // test "bootstrap index showMapping" - CommandResult crForIndexedMapping = getShell().executeCommand("bootstrap index showmapping"); - assertTrue(crForIndexedMapping.isSuccess()); + Object resultForIndexedMapping = shell.evaluate(() -> "bootstrap index showmapping"); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForIndexedMapping)); - CommandResult crForIndexedMappingWithPartition = getShell().executeCommand(String.format( - 
"bootstrap index showmapping --partitionPath %s=%s", PARTITION_FIELD, partitions.get(0))); - assertTrue(crForIndexedMappingWithPartition.isSuccess()); + Object resultForIndexedMappingWithPartition = shell.evaluate(() -> String.format( + "bootstrap index showmapping --partitionPath %s=%s", PARTITION_FIELD, partitions.get(0))); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForIndexedMappingWithPartition)); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java new file mode 100644 index 0000000000000..f81133aca0066 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.integ; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.commands.TableCommand; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; + +import java.io.IOException; +import java.nio.file.Paths; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration test class for {@link org.apache.hudi.cli.commands.ClusteringCommand}. + *

+ * A command use SparkLauncher need load jars under lib which generate during mvn package. + * Use integration test instead of unit test. + */ +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestClusteringCommand extends HoodieCLIIntegrationTestBase { + + @Autowired + private Shell shell; + + @BeforeEach + public void init() throws IOException { + tableName = "test_table_" + ITTestClusteringCommand.class.getName(); + basePath = Paths.get(basePath, tableName).toString(); + + HoodieCLI.conf = jsc.hadoopConfiguration(); + // Create table and connect + new TableCommand().createTable( + basePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + + initMetaClient(); + } + + /** + * Test case for command 'clustering schedule'. + */ + @Test + public void testScheduleClustering() throws IOException { + // generate commits + generateCommits(); + + Object result = scheduleClustering(); + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertTrue( + result.toString().startsWith("Succeeded to schedule clustering for"))); + + // there is 1 requested clustering + HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + assertEquals(1, timeline.filterPendingReplaceTimeline().countInstants()); + } + + /** + * Test case for command 'clustering run'. + */ + @Test + public void testClustering() throws IOException { + // generate commits + generateCommits(); + + Object result1 = scheduleClustering(); + assertTrue(ShellEvaluationResultUtil.isSuccess(result1)); + + // get clustering instance + HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + Option instanceOpt = + timeline.filterPendingReplaceTimeline().firstInstant().map(HoodieInstant::getTimestamp); + assertTrue(instanceOpt.isPresent(), "Must have pending clustering."); + final String instance = instanceOpt.get(); + + Object result2 = shell.evaluate(() -> + String.format("clustering run --parallelism %s --clusteringInstant %s --sparkMaster %s", + 2, instance, "local")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result2)), + () -> assertTrue( + result2.toString().startsWith("Succeeded to run clustering for "))); + + // assert clustering complete + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()).contains(instance), + "Pending clustering must be completed"); + + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .getCompletedReplaceTimeline().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()).contains(instance), + "Pending clustering must be completed"); + } + + /** + * Test case for command 'clustering scheduleAndExecute'. 
+ */ + @Test + public void testClusteringScheduleAndExecute() throws IOException { + // generate commits + generateCommits(); + + Object result = shell.evaluate(() -> + String.format("clustering scheduleAndExecute --parallelism %s --sparkMaster %s", 2, "local")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertTrue( + result.toString().startsWith("Succeeded to run clustering for scheduleAndExecute"))); + + // assert clustering complete + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .getCompletedReplaceTimeline().getInstants() + .map(HoodieInstant::getTimestamp).count() > 0, + "Completed clustering couldn't be 0"); + } + + private Object scheduleClustering() { + // generate requested clustering + return shell.evaluate(() -> + String.format("clustering schedule --hoodieConfigs hoodie.clustering.inline.max.commits=1 --sparkMaster %s", "local")); + } + + private void generateCommits() throws IOException { + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + // Create the write client to write some records in + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withDeleteParallelism(2).forTable(tableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + + SparkRDDWriteClient client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg); + + insert(jsc, client, dataGen, "001"); + insert(jsc, client, dataGen, "002"); + } + + private List insert(JavaSparkContext jsc, SparkRDDWriteClient client, + HoodieTestDataGenerator dataGen, String newCommitTime) throws IOException { + // inserts + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 10); + JavaRDD writeRecords = jsc.parallelize(records, 1); + operateFunc(SparkRDDWriteClient::insert, client, writeRecords, newCommitTime); + return records; + } + + private JavaRDD operateFunc( + HoodieClientTestBase.Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + SparkRDDWriteClient client, JavaRDD writeRecords, String commitTime) + throws IOException { + return writeFn.apply(client, writeRecords, commitTime); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java index adc61457a2d30..3f32081e5e4a9 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java @@ -21,16 +21,19 @@ import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.commands.RollbacksCommand; import org.apache.hudi.cli.commands.TableCommand; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestTable; - import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; 
+import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; import java.io.IOException; import java.nio.file.Paths; @@ -41,6 +44,7 @@ import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; +import static org.junit.jupiter.api.Assertions.assertAll; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -50,20 +54,30 @@ * A command use SparkLauncher need load jars under lib which generate during mvn package. * Use integration test instead of unit test. */ -public class ITTestCommitsCommand extends AbstractShellIntegrationTest { +@Disabled("HUDI-4226") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestCommitsCommand extends HoodieCLIIntegrationTestBase { + + @Autowired + private Shell shell; + + @Override + protected HoodieTableType getTableType() { + return HoodieTableType.COPY_ON_WRITE; + } @BeforeEach public void init() throws IOException { - String tableName = "test_table_" + ITTestCommitsCommand.class.getName(); - String tablePath = Paths.get(basePath, tableName).toString(); + tableName = "test_table_" + ITTestCommitsCommand.class.getName(); + basePath = Paths.get(basePath, tableName).toString(); HoodieCLI.conf = jsc.hadoopConfiguration(); // Create table and connect new TableCommand().createTable( - tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + basePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient.setBasePath(tablePath); - metaClient = HoodieTableMetaClient.reload(metaClient); + + initMetaClient(); } /** @@ -71,7 +85,7 @@ public void init() throws IOException { */ @Test public void testRollbackCommit() throws Exception { - //Create some commits files and parquet files + //Create some commits files and base files Map partitionAndFileId = new HashMap() { { put(DEFAULT_FIRST_PARTITION_PATH, "file-1"); @@ -79,19 +93,21 @@ public void testRollbackCommit() throws Exception { put(DEFAULT_THIRD_PARTITION_PATH, "file-3"); } }; - final String rollbackCommit = "102"; HoodieTestTable.of(metaClient) .withPartitionMetaFiles(DEFAULT_PARTITION_PATHS) .addCommit("100") .withBaseFilesInPartitions(partitionAndFileId) .addCommit("101") .withBaseFilesInPartitions(partitionAndFileId) - .addCommit(rollbackCommit) + .addCommit("102") .withBaseFilesInPartitions(partitionAndFileId); - CommandResult cr = getShell().executeCommand(String.format("commit rollback --commit %s --sparkMaster %s --sparkMemory %s", - rollbackCommit, "local", "4G")); - assertTrue(cr.isSuccess()); + Object result = shell.evaluate(() -> String.format("commit rollback --commit %s --sparkMaster %s --sparkMemory %s", + "102", "local", "4G")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertEquals("Commit 102 rolled back", result.toString())); metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); @@ -100,5 +116,37 @@ public void testRollbackCommit() throws Exception { HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline(); 
assertEquals(2, timeline.getCommitsTimeline().countInstants(), "There should have 2 instants."); + + // rollback complete commit + Object result2 = shell.evaluate(() -> String.format("commit rollback --commit %s --sparkMaster %s --sparkMemory %s", + "101", "local", "4G")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result2)), + () -> assertEquals("Commit 101 rolled back", result2.toString())); + + metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + + HoodieActiveTimeline rollbackTimeline2 = new RollbacksCommand.RollbackTimeline(metaClient); + assertEquals(2, rollbackTimeline2.getRollbackTimeline().countInstants(), "There should have 2 rollback instant."); + + HoodieActiveTimeline timeline2 = metaClient.reloadActiveTimeline(); + assertEquals(1, timeline2.getCommitsTimeline().countInstants(), "There should have 1 instants."); + + // rollback with rollbackUsingMarkers==false + Object result3 = shell.evaluate(() -> + String.format("commit rollback --commit %s --rollbackUsingMarkers false --sparkMaster %s --sparkMemory %s", + "100", "local", "4G")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result3)), + () -> assertEquals("Commit 100 rolled back", result3.toString())); + metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + + HoodieActiveTimeline rollbackTimeline3 = new RollbacksCommand.RollbackTimeline(metaClient); + assertEquals(3, rollbackTimeline3.getRollbackTimeline().countInstants(), "There should have 3 rollback instant."); + + HoodieActiveTimeline timeline3 = metaClient.reloadActiveTimeline(); + assertEquals(0, timeline3.getCommitsTimeline().countInstants(), "There should have 0 instants."); } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java new file mode 100644 index 0000000000000..b8294f9e334cf --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.cli.integ; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.commands.TableCommand; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.client.CompactionAdminClient; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.TestCompactionAdminClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.CompactionOperation; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.CompactionTestUtils; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration test class for {@link org.apache.hudi.cli.commands.CompactionCommand}. + *

+ * A command use SparkLauncher need load jars under lib which generate during mvn package. + * Use integration test instead of unit test. + */ +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestCompactionCommand extends HoodieCLIIntegrationTestBase { + + @Autowired + private Shell shell; + @BeforeEach + public void init() throws IOException { + tableName = "test_table_" + ITTestCompactionCommand.class.getName(); + basePath = Paths.get(basePath, tableName).toString(); + + HoodieCLI.conf = jsc.hadoopConfiguration(); + // Create table and connect + new TableCommand().createTable( + basePath, tableName, HoodieTableType.MERGE_ON_READ.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + + initMetaClient(); + } + + /** + * Test case for command 'compaction schedule'. + */ + @Test + public void testScheduleCompact() throws IOException { + // generate commits + generateCommits(); + + Object result = shell.evaluate(() -> + String.format("compaction schedule --hoodieConfigs hoodie.compact.inline.max.delta.commits=1 --sparkMaster %s", + "local")); + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertTrue( + result.toString().startsWith("Attempted to schedule compaction for"))); + + // there is 1 requested compaction + HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + assertEquals(1, timeline.filterPendingCompactionTimeline().countInstants()); + } + + /** + * Test case for command 'compaction run'. + */ + @Test + public void testCompact() throws IOException { + // generate commits + generateCommits(); + + String instance = prepareScheduleCompaction(); + + String schemaPath = Paths.get(basePath, "compaction.schema").toString(); + writeSchemaToTmpFile(schemaPath); + + Object result2 = shell.evaluate(() -> + String.format("compaction run --parallelism %s --schemaFilePath %s --sparkMaster %s --hoodieConfigs hoodie.embed.timeline.server=false", + 2, schemaPath, "local")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result2)), + () -> assertTrue( + result2.toString().startsWith("Compaction successfully completed for"))); + + // assert compaction complete + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()).contains(instance), + "Pending compaction must be completed"); + } + + /** + * Test case for command 'compaction scheduleAndExecute'. 
+ */ + @Test + public void testCompactScheduleAndExecute() throws IOException { + // generate commits + generateCommits(); + + String schemaPath = Paths.get(basePath, "compaction.schema").toString(); + writeSchemaToTmpFile(schemaPath); + + Object result = shell.evaluate(() -> + String.format("compaction scheduleAndExecute --parallelism %s --schemaFilePath %s --sparkMaster %s " + + "--hoodieConfigs hoodie.compact.inline.max.delta.commits=1", + 2, schemaPath, "local")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertTrue( + result.toString().startsWith("Schedule and execute compaction successfully completed"))); + + // assert compaction complete + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).count() > 0, + "Completed compaction couldn't be 0"); + } + + /** + * Test case for command 'compaction validate'. + */ + @Test + public void testValidateCompaction() throws IOException { + // generate commits + generateCommits(); + + String instance = prepareScheduleCompaction(); + + Object result = shell.evaluate(() -> + String.format("compaction validate --instant %s --sparkMaster %s", instance, "local")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertTrue( + // compaction requested should be valid + result.toString().contains("COMPACTION PLAN VALID"))); + } + + /** + * This function mainly tests the workflow of 'compaction unschedule' command. + * The real test of {@link org.apache.hudi.client.CompactionAdminClient#unscheduleCompactionPlan} + * is {@link TestCompactionAdminClient#testUnscheduleCompactionPlan()}. + */ + @Test + public void testUnscheduleCompaction() throws Exception { + // generate commits + generateCommits(); + + String instance = prepareScheduleCompaction(); + + Object result = shell.evaluate(() -> + String.format("compaction unschedule --instant %s --sparkMaster %s", instance, "local")); + + // Always has no file + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertEquals("No File renames needed to unschedule pending compaction. Operation successful.", + result.toString())); + } + + /** + * This function mainly tests the workflow of 'compaction unscheduleFileId' command. + * The real test of {@link org.apache.hudi.client.CompactionAdminClient#unscheduleCompactionFileId} + * is {@link TestCompactionAdminClient#testUnscheduleCompactionFileId}. 
+ */ + @Test + public void testUnscheduleCompactFile() throws IOException { + int numEntriesPerInstant = 10; + CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, + numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant); + + CompactionOperation op = CompactionOperation.convertFromAvroRecordInstance( + CompactionUtils.getCompactionPlan(metaClient, "001").getOperations().stream().findFirst().get()); + + Object result = shell.evaluate(() -> + String.format("compaction unscheduleFileId --fileId %s --partitionPath %s --sparkMaster %s", + op.getFileGroupId().getFileId(), op.getFileGroupId().getPartitionPath(), "local")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertTrue(removeNonWordAndStripSpace(result.toString()).contains("true")), + () -> assertFalse(removeNonWordAndStripSpace(result.toString()).contains("false"))); + } + + /** + * This function mainly tests the workflow of 'compaction repair' command. + * The real test of {@link org.apache.hudi.client.CompactionAdminClient#repairCompaction} + * is {@link TestCompactionAdminClient#testRepairCompactionPlan}. + */ + @Test + public void testRepairCompaction() throws Exception { + int numEntriesPerInstant = 10; + String compactionInstant = "001"; + CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, + numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant); + + metaClient.reloadActiveTimeline(); + CompactionAdminClient client = new CompactionAdminClient(new HoodieSparkEngineContext(jsc), metaClient.getBasePath()); + List> renameFiles = + client.getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, 1, Option.empty(), false); + + renameFiles.forEach(lfPair -> { + try { + metaClient.getFs().rename(lfPair.getLeft().getPath(), lfPair.getRight().getPath()); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + }); + + client.unscheduleCompactionPlan(compactionInstant, false, 1, false); + + Object result = shell.evaluate(() -> + String.format("compaction repair --instant %s --sparkMaster %s", compactionInstant, "local")); + + // All Executes is succeeded, result contains true and has no false + // Expected: + // ║ File Id │ Source File Path │ Destination File Path │ Rename Executed? │ Rename Succeeded? 
│ Error ║ + // ║ * │ * │ * │ true │ true │ ║ + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertTrue(removeNonWordAndStripSpace(result.toString()).contains("true")), + () -> assertFalse(removeNonWordAndStripSpace(result.toString()).contains("false"))); + } + + private String prepareScheduleCompaction() { + // generate requested compaction + Object result = shell.evaluate(() -> + String.format("compaction schedule --hoodieConfigs hoodie.compact.inline.max.delta.commits=1 --sparkMaster %s", + "local")); + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + // get compaction instance + HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + Option instance = + timeline.filterPendingCompactionTimeline().firstInstant().map(HoodieInstant::getTimestamp); + assertTrue(instance.isPresent(), "Must have pending compaction."); + return instance.get(); + } + + private void writeSchemaToTmpFile(String schemaPath) throws IOException { + try (BufferedWriter out = new BufferedWriter(new FileWriter(schemaPath))) { + out.write(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA); + } + } + + private void generateCommits() throws IOException { + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + // Create the write client to write some records in + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withDeleteParallelism(2).forTable(tableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + + SparkRDDWriteClient client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg); + + List records = insert(jsc, client, dataGen); + upsert(jsc, client, dataGen, records); + delete(jsc, client, records); + } + + private List insert(JavaSparkContext jsc, SparkRDDWriteClient client, + HoodieTestDataGenerator dataGen) throws IOException { + // inserts + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 10); + JavaRDD writeRecords = jsc.parallelize(records, 1); + operateFunc(SparkRDDWriteClient::insert, client, writeRecords, newCommitTime); + return records; + } + + private void upsert(JavaSparkContext jsc, SparkRDDWriteClient client, + HoodieTestDataGenerator dataGen, List records) + throws IOException { + // updates + String newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + List toBeUpdated = dataGen.generateUpdates(newCommitTime, 2); + records.addAll(toBeUpdated); + JavaRDD writeRecords = jsc.parallelize(records, 1); + operateFunc(SparkRDDWriteClient::upsert, client, writeRecords, newCommitTime); + } + + private void delete(JavaSparkContext jsc, SparkRDDWriteClient client, + List records) { + // Delete + String newCommitTime = "003"; + client.startCommitWithTime(newCommitTime); + + // just delete half of the records + int numToDelete = records.size() / 2; + List toBeDeleted = records.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList()); + JavaRDD deleteRecords = jsc.parallelize(toBeDeleted, 1); + client.delete(deleteRecords, newCommitTime); + } + + private JavaRDD operateFunc( + HoodieClientTestBase.Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + SparkRDDWriteClient client, JavaRDD writeRecords, String commitTime) + throws IOException { + return writeFn.apply(client, writeRecords, 
commitTime); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java index 17b997ac26f23..a71697657a0d7 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java @@ -18,9 +18,13 @@ package org.apache.hudi.cli.integ; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.commands.TableCommand; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; @@ -29,17 +33,15 @@ import org.apache.hudi.utilities.HDFSParquetImporter; import org.apache.hudi.utilities.functional.TestHDFSParquetImporter; import org.apache.hudi.utilities.functional.TestHDFSParquetImporter.HoodieTripModel; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.Path; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; @@ -54,8 +56,12 @@ /** * Test class for {@link org.apache.hudi.cli.commands.HDFSParquetImportCommand}. 
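A minimal before/after sketch of the shell-invocation change these CLI tests adopt; the command string and expected message are placeholders, not taken from the patch:

    // Before (old Spring Shell API removed in this patch):
    //   CommandResult cr = getShell().executeCommand(command);
    //   assertTrue(cr.isSuccess());
    //   assertEquals(expectedMessage, cr.getResult().toString());
    //
    // After (Spring Shell 2 style added in this patch): the Shell bean is @Autowired under
    // @SpringBootTest with interactive and script modes disabled, and the evaluation result
    // is treated as a failure when it is a Throwable (see ShellEvaluationResultUtil below).
    Object result = shell.evaluate(() -> command);
    assertTrue(ShellEvaluationResultUtil.isSuccess(result));
    assertEquals(expectedMessage, result.toString());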
*/ -public class ITTestHDFSParquetImportCommand extends AbstractShellIntegrationTest { +@Disabled("Disable due to flakiness and feature deprecation.") +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestHDFSParquetImportCommand extends HoodieCLIIntegrationTestBase { + @Autowired + private Shell shell; private Path sourcePath; private Path targetPath; private String tableName; @@ -68,7 +74,7 @@ public class ITTestHDFSParquetImportCommand extends AbstractShellIntegrationTest @BeforeEach public void init() throws IOException, ParseException { tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + tablePath = basePath + Path.SEPARATOR + tableName; sourcePath = new Path(basePath, "source"); targetPath = new Path(tablePath); schemaFile = new Path(basePath, "file.schema").toString(); @@ -92,14 +98,15 @@ public void testConvertWithInsert() throws IOException { + "--schemaFilePath %s --format %s --sparkMemory %s --retry %s --sparkMaster %s", sourcePath.toString(), targetPath.toString(), tableName, HoodieTableType.COPY_ON_WRITE.name(), "_row_key", "timestamp", "1", schemaFile, "parquet", "2G", "1", "local"); - CommandResult cr = getShell().executeCommand(command); + + Object result = shell.evaluate(() -> command); assertAll("Command run success", - () -> assertTrue(cr.isSuccess()), - () -> assertEquals("Table imported to hoodie format", cr.getResult().toString())); + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertEquals("Table imported to hoodie format", result.toString())); // Check hudi table exist - String metaPath = targetPath + File.separator + HoodieTableMetaClient.METAFOLDER_NAME; + String metaPath = targetPath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME; assertTrue(Files.exists(Paths.get(metaPath)), "Hoodie table not exist."); // Load meta data @@ -138,11 +145,11 @@ public void testConvertWithUpsert() throws IOException, ParseException { + "--schemaFilePath %s --format %s --sparkMemory %s --retry %s --sparkMaster %s --upsert %s", upsertFolder.toString(), targetPath.toString(), tableName, HoodieTableType.COPY_ON_WRITE.name(), "_row_key", "timestamp", "1", schemaFile, "parquet", "2G", "1", "local", "true"); - CommandResult cr = getShell().executeCommand(command); + Object result = shell.evaluate(() -> command); assertAll("Command run success", - () -> assertTrue(cr.isSuccess()), - () -> assertEquals("Table imported to hoodie format", cr.getResult().toString())); + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertEquals("Table imported to hoodie format", result.toString())); // reload meta client metaClient = HoodieTableMetaClient.reload(metaClient); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestMarkersCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestMarkersCommand.java new file mode 100644 index 0000000000000..5aacfd82de044 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestMarkersCommand.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.integ; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.cli.commands.TableCommand; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration test class for {@link org.apache.hudi.cli.commands.MarkersCommand}. + *
+ * A command use SparkLauncher need load jars under lib which generate during mvn package. + * Use integration test instead of unit test. + */ +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestMarkersCommand extends HoodieCLIIntegrationTestBase { + + @Autowired + private Shell shell; + private String tablePath; + + @BeforeEach + public void init() throws IOException { + String tableName = "test_table"; + tablePath = basePath + Path.SEPARATOR + tableName; + + // Create table and connect + new TableCommand().createTable( + tablePath, "test_table", HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + } + + /** + * Test case of command 'marker delete'. + */ + @Test + public void testDeleteMarker() throws IOException { + // generate markers + String instantTime1 = "101"; + + FileCreateUtils.createMarkerFile(tablePath, "partA", instantTime1, "f0", IOType.APPEND); + FileCreateUtils.createMarkerFile(tablePath, "partA", instantTime1, "f1", IOType.APPEND); + + assertEquals(2, FileCreateUtils.getTotalMarkerFileCount(tablePath, "partA", instantTime1, IOType.APPEND)); + + Object result = shell.evaluate(() -> + String.format("marker delete --commit %s --sparkMaster %s", instantTime1, "local")); + + assertTrue(ShellEvaluationResultUtil.isSuccess(result)); + + assertEquals(0, FileCreateUtils.getTotalMarkerFileCount(tablePath, "partA", instantTime1, IOType.APPEND)); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java index 133dcb0577bab..69db47136e918 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java @@ -18,12 +18,17 @@ package org.apache.hudi.cli.integ; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.commands.RepairsCommand; import org.apache.hudi.cli.commands.TableCommand; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -31,15 +36,14 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.SchemaTestUtil; -import org.apache.hudi.testutils.HoodieWriteableTestTable; - -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.spark.sql.Dataset; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; +import 
org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; import java.io.IOException; import java.nio.file.Paths; @@ -56,166 +60,275 @@ * A command use SparkLauncher need load jars under lib which generate during mvn package. * Use integration test instead of unit test. */ -public class ITTestRepairsCommand extends AbstractShellIntegrationTest { +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestRepairsCommand extends HoodieCLIIntegrationTestBase { + + @Autowired + private Shell shell; private String duplicatedPartitionPath; private String duplicatedPartitionPathWithUpdates; private String duplicatedPartitionPathWithUpserts; + private String duplicatedNoPartitionPath; private String repairedOutputPath; + private HoodieFileFormat fileFormat; + @BeforeEach public void init() throws Exception { - final String tablePath = Paths.get(basePath, "test_table").toString(); - duplicatedPartitionPath = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString(); - duplicatedPartitionPathWithUpdates = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).toString(); - duplicatedPartitionPathWithUpserts = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).toString(); + duplicatedPartitionPath = HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; + duplicatedPartitionPathWithUpdates = HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; + duplicatedPartitionPathWithUpserts = HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; + duplicatedNoPartitionPath = HoodieTestDataGenerator.NO_PARTITION_PATH; repairedOutputPath = Paths.get(basePath, "tmp").toString(); HoodieCLI.conf = jsc.hadoopConfiguration(); + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); + + // generate 200 records + HoodieRecord[] hoodieRecords1 = SchemaTestUtil.generateHoodieTestRecords(0, 100, schema).toArray(new HoodieRecord[100]); + HoodieRecord[] hoodieRecords2 = SchemaTestUtil.generateHoodieTestRecords(100, 100, schema).toArray(new HoodieRecord[100]); + + // generate duplicates + HoodieRecord[] dupRecords = Arrays.copyOf(hoodieRecords1, 10); + + // init cow table + String cowTablePath = Paths.get(basePath, HoodieTableType.COPY_ON_WRITE.name()).toString(); - // Create table and connect + // Create cow table and connect new TableCommand().createTable( - tablePath, "test_table", HoodieTableType.COPY_ON_WRITE.name(), + cowTablePath, "cow_table", HoodieTableType.COPY_ON_WRITE.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); - // generate 200 records - Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(HoodieCLI.getTableMetaClient(), schema); + HoodieSparkWriteableTestTable cowTable = HoodieSparkWriteableTestTable.of(HoodieCLI.getTableMetaClient(), schema); - HoodieRecord[] hoodieRecords1 = SchemaTestUtil.generateHoodieTestRecords(0, 100, schema).toArray(new HoodieRecord[100]); - HoodieRecord[] hoodieRecords2 = SchemaTestUtil.generateHoodieTestRecords(100, 100, schema).toArray(new HoodieRecord[100]); - testTable.addCommit("20160401010101") + cowTable.addCommit("20160401010101") .withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "1", hoodieRecords1) .withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "2", hoodieRecords2) 
.getFileIdWithLogFile(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); - testTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "4", hoodieRecords1) + cowTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "4", hoodieRecords1) .withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "6", hoodieRecords1); - // read records and get 10 to generate duplicates - HoodieRecord[] dupRecords = Arrays.copyOf(hoodieRecords1, 10); - testTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "5", dupRecords); - testTable.addCommit("20160401010202") + cowTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "5", dupRecords); + cowTable.addCommit("20160401010202") .withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "3", dupRecords); - testTable.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "7", dupRecords) + cowTable.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "7", dupRecords) .withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "8", dupRecords); - metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + // init mor table + String morTablePath = Paths.get(basePath, HoodieTableType.MERGE_ON_READ.name()).toString(); + // Create mor table and connect + new TableCommand().createTable( + morTablePath, "mor_table", HoodieTableType.MERGE_ON_READ.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + HoodieSparkWriteableTestTable morTable = HoodieSparkWriteableTestTable.of(HoodieCLI.getTableMetaClient(), schema); + + morTable.addDeltaCommit("20160401010101"); + morTable.withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "1", hoodieRecords1) + .withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "2", hoodieRecords2) + .getFileIdWithLogFile(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + + morTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "4", hoodieRecords1) + .withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "6", hoodieRecords1) + .withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "5", dupRecords); + morTable.addDeltaCommit("20160401010202"); + morTable.withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "3", dupRecords) + .withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "7", dupRecords) + .withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "8", dupRecords); + + // init cow table for non-partitioned table tests + String cowNonPartitionedTablePath = Paths.get(basePath, "cow_table_non_partitioned").toString(); + + // Create cow table and connect + new TableCommand().createTable( + cowNonPartitionedTablePath, "cow_table_non_partitioned", HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + + HoodieSparkWriteableTestTable cowNonPartitionedTable = HoodieSparkWriteableTestTable.of(HoodieCLI.getTableMetaClient(), schema); + + cowNonPartitionedTable.addCommit("20160401010101") + .withInserts(HoodieTestDataGenerator.NO_PARTITION_PATH, "1", hoodieRecords1) + .getFileIdWithLogFile(HoodieTestDataGenerator.NO_PARTITION_PATH); + + cowNonPartitionedTable.addCommit("20160401010202") + .withInserts(HoodieTestDataGenerator.NO_PARTITION_PATH, "2", dupRecords); + + fileFormat = metaClient.getTableConfig().getBaseFileFormat(); } /** * Test case for dry run deduplicate. 
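For reference, the record counts asserted by the deduplicate tests below follow directly from the table setup above (hoodieRecords1 = 100, hoodieRecords2 = 100, dupRecords = the first 10 records of hoodieRecords1):

    // FIRST partition : 100 + 100 + 10 duplicates = 210 before dedup -> 200 after
    // SECOND partition: 100 + 10 duplicates       = 110 before dedup -> 100 after
    // THIRD partition : 100 + 10 + 10 duplicates  = 120 before dedup -> 100 after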
*/ - @Test - public void testDeduplicateWithInserts() throws IOException { + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + public void testDeduplicateWithInserts(HoodieTableType tableType) throws IOException { + String tablePath = Paths.get(basePath, tableType.name()).toString(); + connectTableAndReloadMetaClient(tablePath); // get fs and check number of latest files HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), - fs.listStatus(new Path(duplicatedPartitionPath))); + metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), + fs.listStatus(new Path(Paths.get(tablePath, duplicatedPartitionPath).toString()))); List filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); assertEquals(3, filteredStatuses.size(), "There should be 3 files."); // Before deduplicate, all files contain 210 records String[] files = filteredStatuses.toArray(new String[0]); - Dataset df = sqlContext.read().parquet(files); + Dataset df = readFiles(files); assertEquals(210, df.count()); String partitionPath = HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s", partitionPath, repairedOutputPath, "local"); - CommandResult cr = getShell().executeCommand(cmdStr); - assertTrue(cr.isSuccess()); - assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString()); + Object resultForCmd = shell.evaluate(() -> cmdStr); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForCmd)); + assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, resultForCmd.toString()); // After deduplicate, there are 200 records FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath)); files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new); - Dataset result = sqlContext.read().parquet(files); + Dataset result = readFiles(files); assertEquals(200, result.count()); } - @Test - public void testDeduplicateWithUpdates() throws IOException { + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + public void testDeduplicateWithUpdates(HoodieTableType tableType) throws IOException { + String tablePath = Paths.get(basePath, tableType.name()).toString(); + connectTableAndReloadMetaClient(tablePath); HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), - fs.listStatus(new Path(duplicatedPartitionPathWithUpdates))); + metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), + fs.listStatus(new Path(Paths.get(tablePath, duplicatedPartitionPathWithUpdates).toString()))); List filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); assertEquals(2, filteredStatuses.size(), "There should be 2 files."); // Before deduplicate, all files contain 110 records String[] files = filteredStatuses.toArray(new String[0]); - Dataset df = sqlContext.read().parquet(files); + Dataset df = readFiles(files); assertEquals(110, df.count()); String partitionPath = HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s --dedupeType %s", partitionPath, 
repairedOutputPath, "local", "update_type"); - CommandResult cr = getShell().executeCommand(cmdStr); - assertTrue(cr.isSuccess()); - assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString()); + Object resultForCmd = shell.evaluate(() -> cmdStr); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForCmd)); + assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, resultForCmd.toString()); // After deduplicate, there are 100 records FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath)); files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new); - Dataset result = sqlContext.read().parquet(files); + Dataset result = readFiles(files); assertEquals(100, result.count()); } - @Test - public void testDeduplicateWithUpserts() throws IOException { + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + public void testDeduplicateWithUpserts(HoodieTableType tableType) throws IOException { + String tablePath = Paths.get(basePath, tableType.name()).toString(); + connectTableAndReloadMetaClient(tablePath); HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), - fs.listStatus(new Path(duplicatedPartitionPathWithUpserts))); + metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), + fs.listStatus(new Path(Paths.get(tablePath, duplicatedPartitionPathWithUpserts).toString()))); List filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); assertEquals(3, filteredStatuses.size(), "There should be 3 files."); // Before deduplicate, all files contain 120 records String[] files = filteredStatuses.toArray(new String[0]); - Dataset df = sqlContext.read().parquet(files); + Dataset df = readFiles(files); assertEquals(120, df.count()); String partitionPath = HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s --dedupeType %s", partitionPath, repairedOutputPath, "local", "upsert_type"); - CommandResult cr = getShell().executeCommand(cmdStr); - assertTrue(cr.isSuccess()); - assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString()); + Object resultForCmd = shell.evaluate(() -> cmdStr); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForCmd)); + assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, resultForCmd.toString()); // After deduplicate, there are 100 records FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath)); files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new); - Dataset result = sqlContext.read().parquet(files); + Dataset result = readFiles(files); + assertEquals(100, result.count()); + } + + /** + * Test case dry run deduplicate for non-partitioned dataset. 
+ */ + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + public void testDeduplicateNoPartitionWithInserts(HoodieTableType tableType) throws IOException { + String tablePath = Paths.get(basePath, "cow_table_non_partitioned").toString(); + connectTableAndReloadMetaClient(tablePath); + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, + metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), + fs.listStatus(new Path(Paths.get(tablePath, duplicatedNoPartitionPath).toString()))); + List filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); + assertEquals(2, filteredStatuses.size(), "There should be 2 files."); + + // Before deduplicate, all files contain 110 records + String[] files = filteredStatuses.toArray(new String[0]); + Dataset df = readFiles(files); + assertEquals(110, df.count()); + + // use default value without specifying duplicatedPartitionPath + String cmdStr = String.format("repair deduplicate --repairedOutputPath %s --sparkMaster %s", + repairedOutputPath, "local"); + Object resultForCmd = shell.evaluate(() -> cmdStr); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForCmd)); + assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, resultForCmd.toString()); + + // After deduplicate, there are 100 records + FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath)); + files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new); + Dataset result = readFiles(files); assertEquals(100, result.count()); } /** * Test case for real run deduplicate. */ - @Test - public void testDeduplicateWithReal() throws IOException { + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + public void testDeduplicateWithReal(HoodieTableType tableType) throws IOException { + String tablePath = Paths.get(basePath, tableType.name()).toString(); + connectTableAndReloadMetaClient(tablePath); // get fs and check number of latest files HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), - fs.listStatus(new Path(duplicatedPartitionPath))); + metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), + fs.listStatus(new Path(Paths.get(tablePath, duplicatedPartitionPath).toString()))); List filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); assertEquals(3, filteredStatuses.size(), "There should be 3 files."); // Before deduplicate, all files contain 210 records String[] files = filteredStatuses.toArray(new String[0]); - Dataset df = sqlContext.read().parquet(files); + Dataset df = readFiles(files); assertEquals(210, df.count()); String partitionPath = HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s" + " --sparkMaster %s --dryrun %s", partitionPath, repairedOutputPath, "local", false); - CommandResult cr = getShell().executeCommand(cmdStr); - assertTrue(cr.isSuccess()); - assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + partitionPath, cr.getResult().toString()); + Object resultForCmd = shell.evaluate(() -> cmdStr); + assertTrue(ShellEvaluationResultUtil.isSuccess(resultForCmd)); + assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + partitionPath, resultForCmd.toString()); // After deduplicate, there are 200 
records under partition path - FileStatus[] fileStatus = fs.listStatus(new Path(duplicatedPartitionPath)); + FileStatus[] fileStatus = fs.listStatus(new Path(Paths.get(tablePath, duplicatedPartitionPath).toString())); files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new); - Dataset result = sqlContext.read().parquet(files); + Dataset result = readFiles(files); assertEquals(200, result.count()); } + + private void connectTableAndReloadMetaClient(String tablePath) throws IOException { + new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 0, 0, 0); + metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + } + + private Dataset readFiles(String[] files) { + if (HoodieFileFormat.PARQUET.equals(fileFormat)) { + return sqlContext.read().parquet(files); + } else if (HoodieFileFormat.ORC.equals(fileFormat)) { + return sqlContext.read().orc(files); + } + throw new UnsupportedOperationException(fileFormat.name() + " format not supported yet."); + } } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java index e93323942b0d8..9bc368e952248 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestSavepointsCommand.java @@ -18,9 +18,13 @@ package org.apache.hudi.cli.integ; +import org.apache.hadoop.fs.Path; import org.apache.hudi.cli.HoodieCLI; import org.apache.hudi.cli.commands.TableCommand; -import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.cli.testutils.HoodieCLIIntegrationTestBase; +import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -28,11 +32,15 @@ import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.springframework.shell.core.CommandResult; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.shell.Shell; -import java.io.File; import java.io.IOException; import static org.junit.jupiter.api.Assertions.assertAll; @@ -46,14 +54,17 @@ * A command use SparkLauncher need load jars under lib which generate during mvn package. * Use integration test instead of unit test. 
*/ -public class ITTestSavepointsCommand extends AbstractShellIntegrationTest { +@SpringBootTest(properties = {"spring.shell.interactive.enabled=false", "spring.shell.command.script.enabled=false"}) +public class ITTestSavepointsCommand extends HoodieCLIIntegrationTestBase { + @Autowired + private Shell shell; private String tablePath; @BeforeEach public void init() throws IOException { String tableName = "test_table"; - tablePath = basePath + File.separator + tableName; + tablePath = basePath + Path.SEPARATOR + tableName; // Create table and connect new TableCommand().createTable( @@ -73,13 +84,13 @@ public void testSavepoint() { } String savepoint = "102"; - CommandResult cr = getShell().executeCommand( - String.format("savepoint create --commit %s --sparkMaster %s", savepoint, "local")); + Object result = shell.evaluate(() -> + String.format("savepoint create --commit %s --sparkMaster %s", savepoint, "local")); assertAll("Command run failed", - () -> assertTrue(cr.isSuccess()), + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), () -> assertEquals( - String.format("The commit \"%s\" has been savepointed.", savepoint), cr.getResult().toString())); + String.format("The commit \"%s\" has been savepointed.", savepoint), result.toString())); // there is 1 savepoint instant HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); @@ -101,13 +112,13 @@ public void testRollbackToSavepoint() throws IOException { String savepoint = "102"; HoodieTestDataGenerator.createSavepointFile(tablePath, savepoint, jsc.hadoopConfiguration()); - CommandResult cr = getShell().executeCommand( - String.format("savepoint rollback --savepoint %s --sparkMaster %s", savepoint, "local")); + Object result = shell.evaluate(() -> + String.format("savepoint rollback --savepoint %s --sparkMaster %s", savepoint, "local")); assertAll("Command run failed", - () -> assertTrue(cr.isSuccess()), + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), () -> assertEquals( - String.format("Savepoint \"%s\" rolled back", savepoint), cr.getResult().toString())); + String.format("Savepoint \"%s\" rolled back", savepoint), result.toString())); // there is 1 restore instant HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); @@ -118,6 +129,50 @@ public void testRollbackToSavepoint() throws IOException { new HoodieInstant(HoodieInstant.State.COMPLETED, "commit", "103"))); } + /** + * Test case of command 'savepoint rollback' with metadata table bootstrap. 
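    // Context for the test below: the metadata table is bootstrapped only at instant 104, after the
    // savepoint at 102, so 'savepoint rollback' must also handle a metadata table created later than
    // the savepoint; the assertions verify one restore instant and that commits 103 and 104 are gone.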
+ */ + @Test + public void testRollbackToSavepointWithMetadataTableEnable() throws IOException { + // generate for savepoints + for (int i = 101; i < 105; i++) { + String instantTime = String.valueOf(i); + HoodieTestDataGenerator.createCommitFile(tablePath, instantTime, jsc.hadoopConfiguration()); + } + + // generate one savepoint at 102 + String savepoint = "102"; + HoodieTestDataGenerator.createSavepointFile(tablePath, savepoint, jsc.hadoopConfiguration()); + + // re-bootstrap metadata table + Path metadataTableBasePath = new Path(HoodieTableMetadata.getMetadataTableBasePath(HoodieCLI.basePath)); + // then bootstrap metadata table at instant 104 + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(HoodieCLI.basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); + SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc)); + + assertTrue(HoodieCLI.fs.exists(metadataTableBasePath)); + + // roll back to savepoint + Object result = shell.evaluate(() -> + String.format("savepoint rollback --savepoint %s --sparkMaster %s", savepoint, "local")); + + assertAll("Command run failed", + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), + () -> assertEquals( + String.format("Savepoint \"%s\" rolled back", savepoint), result.toString())); + + // there is 1 restore instant + HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + assertEquals(1, timeline.getRestoreTimeline().countInstants()); + + // 103 and 104 instant had rollback + assertFalse(timeline.getCommitTimeline().containsInstant( + new HoodieInstant(HoodieInstant.State.COMPLETED, "commit", "103"))); + assertFalse(timeline.getCommitTimeline().containsInstant( + new HoodieInstant(HoodieInstant.State.COMPLETED, "commit", "104"))); + } + /** * Test case of command 'savepoint delete'. */ @@ -138,13 +193,13 @@ public void testDeleteSavepoint() throws IOException { HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); assertEquals(2, timeline.getSavePointTimeline().countInstants(), "There should 2 instants."); - CommandResult cr = getShell().executeCommand( - String.format("savepoint delete --commit %s --sparkMaster %s", savepoint1, "local")); + Object result = shell.evaluate(() -> + String.format("savepoint delete --commit %s --sparkMaster %s", savepoint1, "local")); assertAll("Command run failed", - () -> assertTrue(cr.isSuccess()), + () -> assertTrue(ShellEvaluationResultUtil.isSuccess(result)), () -> assertEquals( - String.format("Savepoint \"%s\" deleted.", savepoint1), cr.getResult().toString())); + String.format("Savepoint \"%s\" deleted.", savepoint1),result.toString())); // reload timeline timeline = timeline.reload(); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/AbstractShellBaseIntegrationTest.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/AbstractShellBaseIntegrationTest.java deleted file mode 100644 index e016564439696..0000000000000 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/AbstractShellBaseIntegrationTest.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.cli.testutils; - -import org.apache.hudi.testutils.HoodieClientTestHarness; - -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.springframework.shell.Bootstrap; -import org.springframework.shell.core.JLineShellComponent; - -/** - * Class to start Bootstrap and JLineShellComponent. - */ -public class AbstractShellBaseIntegrationTest extends HoodieClientTestHarness { - - private static JLineShellComponent shell; - - @BeforeAll - public static void startup() { - Bootstrap bootstrap = new Bootstrap(); - shell = bootstrap.getJLineShellComponent(); - } - - @AfterAll - public static void shutdown() { - shell.stop(); - } - - @BeforeEach - public void setup() throws Exception { - initPath(); - } - - @AfterEach - public void teardown() throws Exception { - System.gc(); - } - - protected static JLineShellComponent getShell() { - return shell; - } - - /** - * Helper to prepare string for matching. - * @param str Input string. - * @return pruned string with non word characters removed. - */ - protected String removeNonWordAndStripSpace(String str) { - return str.replaceAll("[\\s]+", ",").replaceAll("[\\W]+", ","); - } -} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/AbstractShellIntegrationTest.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/AbstractShellIntegrationTest.java deleted file mode 100644 index a7cf85cf9235b..0000000000000 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/AbstractShellIntegrationTest.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.cli.testutils; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; - -/** - * Class to initial resources for shell. 
- */ -public abstract class AbstractShellIntegrationTest extends AbstractShellBaseIntegrationTest { - - @Override - @BeforeEach - public void setup() throws Exception { - initResources(); - } - - @Override - @AfterEach - public void teardown() throws Exception { - cleanupResources(); - } -} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieCLIIntegrationTestBase.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieCLIIntegrationTestBase.java new file mode 100644 index 0000000000000..86b618d502297 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieCLIIntegrationTestBase.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.testutils; + +import org.apache.hudi.common.model.HoodieTableType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +/** + * Class to initial resources for shell. + */ +public class HoodieCLIIntegrationTestBase extends HoodieCLIIntegrationTestHarness { + + @Override + @BeforeEach + public void setup() throws Exception { + initResources(); + } + + @Override + @AfterEach + public void teardown() throws Exception { + cleanupResources(); + } + + protected HoodieTableType getTableType() { + return HoodieTableType.MERGE_ON_READ; + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieCLIIntegrationTestHarness.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieCLIIntegrationTestHarness.java new file mode 100644 index 0000000000000..d49ac6b3289a6 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieCLIIntegrationTestHarness.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.testutils; + +import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +/** + * Class to start Bootstrap and JLineShellComponent. 
+ */ +public class HoodieCLIIntegrationTestHarness extends HoodieClientTestHarness { + + @BeforeEach + public void setup() throws Exception { + initPath(); + } + + @AfterEach + public void teardown() throws Exception { + System.gc(); + } + + /** + * Helper to prepare string for matching. + * @param str Input string. + * @return pruned string with non word characters removed. + */ + protected String removeNonWordAndStripSpace(String str) { + return str.replaceAll("[\\s]+", ",").replaceAll("[\\W]+", ","); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java index f4d8019be0599..67592be1adcf3 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java @@ -31,8 +31,10 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -67,23 +69,52 @@ public static void createCommitFileWithMetadata(String basePath, String commitTi public static void createCommitFileWithMetadata(String basePath, String commitTime, Configuration configuration, Option writes, Option updates) throws Exception { + createCommitFileWithMetadata(basePath, commitTime, configuration, writes, updates, Collections.emptyMap()); + } + + public static void createCommitFileWithMetadata(String basePath, String commitTime, Configuration configuration, + Option writes, Option updates, Map extraMetadata) throws Exception { createCommitFileWithMetadata(basePath, commitTime, configuration, UUID.randomUUID().toString(), - UUID.randomUUID().toString(), writes, updates); + UUID.randomUUID().toString(), writes, updates, extraMetadata); } public static void createCommitFileWithMetadata(String basePath, String commitTime, Configuration configuration, String fileId1, String fileId2, Option writes, Option updates) throws Exception { + createCommitFileWithMetadata(basePath, commitTime, configuration, fileId1, fileId2, writes, updates, Collections.emptyMap()); + } + + public static void createCommitFileWithMetadata(String basePath, String commitTime, Configuration configuration, + String fileId1, String fileId2, Option writes, Option updates, Map extraMetadata) throws Exception { List commitFileNames = Arrays.asList(HoodieTimeline.makeCommitFileName(commitTime), HoodieTimeline.makeInflightCommitFileName(commitTime), HoodieTimeline.makeRequestedCommitFileName(commitTime)); for (String name : commitFileNames) { - Path commitFilePath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + name); - try (FSDataOutputStream os = FSUtils.getFs(basePath, configuration).create(commitFilePath, true)) { - // Generate commitMetadata - HoodieCommitMetadata commitMetadata = - generateCommitMetadata(basePath, commitTime, fileId1, fileId2, writes, updates); - // Write empty commit metadata - os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - } + HoodieCommitMetadata commitMetadata = + generateCommitMetadata(basePath, commitTime, fileId1, fileId2, writes, updates, extraMetadata, true); + String content = commitMetadata.toJsonString(); + createFileWithMetadata(basePath, configuration, name, 
content); + } + } + + public static void createCommitFileWithMetadata(String basePath, String commitTime, Configuration configuration, + String fileId1, String fileId2, Option writes, + Option updates, Map extraMetadata, + boolean setDefaultFileId) throws Exception { + List commitFileNames = Arrays.asList( + HoodieTimeline.makeCommitFileName(commitTime), + HoodieTimeline.makeInflightCommitFileName(commitTime), + HoodieTimeline.makeRequestedCommitFileName(commitTime)); + for (String name : commitFileNames) { + HoodieCommitMetadata commitMetadata = + generateCommitMetadata(basePath, commitTime, fileId1, fileId2, writes, updates, extraMetadata, setDefaultFileId); + String content = commitMetadata.toJsonString(); + createFileWithMetadata(basePath, configuration, name, content); + } + } + + static void createFileWithMetadata(String basePath, Configuration configuration, String name, String content) throws IOException { + Path commitFilePath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + name); + try (FSDataOutputStream os = FSUtils.getFs(basePath, configuration).create(commitFilePath, true)) { + os.writeBytes(new String(content.getBytes(StandardCharsets.UTF_8))); } } @@ -102,6 +133,13 @@ public static HoodieCommitMetadata generateCommitMetadata(String basePath, Strin public static HoodieCommitMetadata generateCommitMetadata(String basePath, String commitTime, String fileId1, String fileId2, Option writes, Option updates) throws Exception { + return generateCommitMetadata(basePath, commitTime, fileId1, fileId2, writes, updates, Collections.emptyMap(), true); + } + + public static HoodieCommitMetadata generateCommitMetadata(String basePath, String commitTime, String fileId1, + String fileId2, Option writes, + Option updates, Map extraMetadata, + boolean setDefaultFileId) throws Exception { FileCreateUtils.createBaseFile(basePath, DEFAULT_FIRST_PARTITION_PATH, commitTime, fileId1); FileCreateUtils.createBaseFile(basePath, DEFAULT_SECOND_PARTITION_PATH, commitTime, fileId2); return generateCommitMetadata(new HashMap>() { @@ -109,20 +147,28 @@ public static HoodieCommitMetadata generateCommitMetadata(String basePath, Strin put(DEFAULT_FIRST_PARTITION_PATH, createImmutableList(baseFileName(DEFAULT_FIRST_PARTITION_PATH, fileId1))); put(DEFAULT_SECOND_PARTITION_PATH, createImmutableList(baseFileName(DEFAULT_SECOND_PARTITION_PATH, fileId2))); } - }, writes, updates); + }, writes, updates, extraMetadata, setDefaultFileId); + } + + private static HoodieCommitMetadata generateCommitMetadata(Map> partitionToFilePaths, + Option writes, Option updates) { + return generateCommitMetadata(partitionToFilePaths, writes, updates, Collections.emptyMap(), true); } /** * Method to generate commit metadata. */ private static HoodieCommitMetadata generateCommitMetadata(Map> partitionToFilePaths, - Option writes, Option updates) { + Option writes, Option updates, Map extraMetadata, boolean setDefaultFileId) { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); + for (Map.Entry entry: extraMetadata.entrySet()) { + metadata.addMetadata(entry.getKey(), entry.getValue()); + } partitionToFilePaths.forEach((key, value) -> value.forEach(f -> { HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setPartitionPath(key); writeStat.setPath(DEFAULT_PATH); - writeStat.setFileId(DEFAULT_FILEID); + writeStat.setFileId(setDefaultFileId ? 
DEFAULT_FILEID : FSUtils.getFileId(f)); writeStat.setTotalWriteBytes(DEFAULT_TOTAL_WRITE_BYTES); writeStat.setPrevCommit(DEFAULT_PRE_COMMIT); writeStat.setNumWrites(writes.orElse(DEFAULT_NUM_WRITES)); @@ -133,4 +179,5 @@ private static HoodieCommitMetadata generateCommitMetadata(Map writes, Option updates, + HoodieTableMetaClient metaclient) throws Exception { + + HoodieReplaceCommitMetadata replaceMetadata = generateReplaceCommitMetadata(basePath, commitTime, UUID.randomUUID().toString(), + UUID.randomUUID().toString(), writes, updates); + HoodieRequestedReplaceMetadata requestedReplaceMetadata = getHoodieRequestedReplaceMetadata(); + + HoodieTestTable.of(metaclient).addReplaceCommit(commitTime, Option.ofNullable(requestedReplaceMetadata), Option.empty(), replaceMetadata); + } + + private static HoodieRequestedReplaceMetadata getHoodieRequestedReplaceMetadata() { + return HoodieRequestedReplaceMetadata.newBuilder() + .setOperationType(WriteOperationType.INSERT_OVERWRITE.toString()) + .setVersion(1) + .setExtraMetadata(Collections.emptyMap()) + .build(); + } + + private static HoodieReplaceCommitMetadata generateReplaceCommitMetadata(String basePath, String commitTime, String fileId1, String fileId2, Option writes, Option updates) + throws Exception { + FileCreateUtils.createBaseFile(basePath, DEFAULT_FIRST_PARTITION_PATH, commitTime, fileId1); + FileCreateUtils.createBaseFile(basePath, DEFAULT_SECOND_PARTITION_PATH, commitTime, fileId2); + return generateReplaceCommitMetadata(new HashMap>() { + { + put(DEFAULT_FIRST_PARTITION_PATH, createImmutableList(baseFileName(DEFAULT_FIRST_PARTITION_PATH, fileId1))); + put(DEFAULT_SECOND_PARTITION_PATH, createImmutableList(baseFileName(DEFAULT_SECOND_PARTITION_PATH, fileId2))); + } + }, writes, updates); + } + + private static HoodieReplaceCommitMetadata generateReplaceCommitMetadata(HashMap> partitionToFilePaths, Option writes, Option updates) { + HoodieReplaceCommitMetadata metadata = new HoodieReplaceCommitMetadata(); + partitionToFilePaths.forEach((key, value) -> value.forEach(f -> { + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setPartitionPath(key); + writeStat.setPath(DEFAULT_PATH); + writeStat.setFileId(DEFAULT_FILEID); + writeStat.setTotalWriteBytes(DEFAULT_TOTAL_WRITE_BYTES); + writeStat.setPrevCommit(DEFAULT_PRE_COMMIT); + writeStat.setNumWrites(writes.orElse(DEFAULT_NUM_WRITES)); + writeStat.setNumUpdateWrites(updates.orElse(DEFAULT_NUM_UPDATE_WRITES)); + writeStat.setTotalLogBlocks(DEFAULT_TOTAL_LOG_BLOCKS); + writeStat.setTotalLogRecords(DEFAULT_TOTAL_LOG_RECORDS); + metadata.addWriteStat(key, writeStat); + })); + metadata.setPartitionToReplaceFileIds(new HashMap>() { + { + //TODO fix + put(DEFAULT_FIRST_PARTITION_PATH, createImmutableList(baseFileName(DEFAULT_FIRST_PARTITION_PATH, "1"))); + } + }); + return metadata; + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/MockCommandLineInput.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/MockCommandLineInput.java new file mode 100644 index 0000000000000..1d803fc8103bd --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/MockCommandLineInput.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.testutils; + +import org.springframework.shell.Input; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public interface MockCommandLineInput extends Input { + @Override + default List words() { + if (null == rawText() || rawText().isEmpty()) { + return Collections.emptyList(); + } + boolean isInQuote = false; + List result = new ArrayList<>(); + StringBuilder stringBuilder = new StringBuilder(); + for (int i = 0; i < rawText().length(); i++) { + char c = rawText().charAt(i); + if (' ' == c && !isInQuote) { + if (stringBuilder.length() != 0) { + result.add(stringBuilder.toString()); + stringBuilder.delete(0, stringBuilder.length()); + } + } else if ('\'' == c || '"' == c) { + if (isInQuote) { + isInQuote = false; + result.add(stringBuilder.toString()); + stringBuilder.delete(0, stringBuilder.length()); + } else { + isInQuote = true; + } + } else { + stringBuilder.append(c); + } + } + return result; + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/ShellEvaluationResultUtil.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/ShellEvaluationResultUtil.java new file mode 100644 index 0000000000000..d1832a82691cb --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/ShellEvaluationResultUtil.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
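A worked example of the MockCommandLineInput#words() default method defined above, using an illustrative command line:

    // rawText(): repair deduplicate --duplicatedPartitionPath '2016/03/15' --sparkMaster local
    // words()  : [repair, deduplicate, --duplicatedPartitionPath, 2016/03/15, --sparkMaster]
    // Quoted arguments come back without their quotes, and a token is only flushed on an unquoted
    // space or a closing quote, so a trailing bare token ("local" here) is not added to the list.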
+ */ + +package org.apache.hudi.cli.testutils; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +public class ShellEvaluationResultUtil { + private static final Logger LOGGER = LogManager.getLogger(ShellEvaluationResultUtil.class); + private ShellEvaluationResultUtil() {} + + public static boolean isSuccess(Object shellEvaluationResult) { + boolean hasError = shellEvaluationResult instanceof Throwable; + if (hasError) { + Throwable throwable = (Throwable) shellEvaluationResult; + LOGGER.error(throwable.toString()); + } + return !hasError; + } +} diff --git a/hudi-cli/src/test/resources/log4j-surefire-quiet.properties b/hudi-cli/src/test/resources/log4j-surefire-quiet.properties deleted file mode 100644 index b21b5d4070c41..0000000000000 --- a/hudi-cli/src/test/resources/log4j-surefire-quiet.properties +++ /dev/null @@ -1,29 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache.hudi=DEBUG - -# CONSOLE is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# CONSOLE uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-cli/src/test/resources/log4j-surefire.properties b/hudi-cli/src/test/resources/log4j-surefire.properties deleted file mode 100644 index a59d4ebe2b194..0000000000000 --- a/hudi-cli/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,25 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, A1 -log4j.category.org.apache=INFO -log4j.category.org.apache.parquet.hadoop=WARN -# A1 is set to be a ConsoleAppender. -log4j.appender.A1=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. 
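# Note: the removal of these Log4j 1.x surefire configs appears to accompany the dependency changes
# in hudi-client-common/pom.xml below, which drop log4j:log4j and add the
# org.apache.logging.log4j:log4j-1.2-api bridge.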
-log4j.appender.A1.layout=org.apache.log4j.PatternLayout -log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hudi-client/hudi-client-common/pom.xml b/hudi-client/hudi-client-common/pom.xml index 487a2e2b3ea8f..3e8755fa3fa4e 100644 --- a/hudi-client/hudi-client-common/pom.xml +++ b/hudi-client/hudi-client-common/pom.xml @@ -15,27 +15,25 @@ See the License for the specific language governing permissions and limitations under the License. --> - + hudi-client org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 hudi-client-common - ${parent.version} + 0.12.2-dt-SNAPSHOT hudi-client-common jar - + - org.scala-lang - scala-library - ${scala.version} + org.apache.logging.log4j + log4j-1.2-api @@ -44,6 +42,12 @@ hudi-common ${project.version} + + org.apache.hudi + hudi-aws + ${project.version} + provided + org.apache.hudi hudi-timeline-service @@ -55,18 +59,19 @@ joda-time - - - log4j - log4j - - org.apache.parquet parquet-avro + + + com.github.davidmoten + hilbert-curve + 0.2.2 + + io.dropwizard.metrics @@ -102,7 +107,6 @@ io.prometheus simpleclient_pushgateway - org.apache.hudi @@ -118,6 +122,7 @@ org.apache.hadoop hadoop-hdfs tests + test @@ -138,6 +143,7 @@ org.apache.hadoop hadoop-common tests + test org.mortbay.jetty @@ -154,7 +160,37 @@ + + + org.awaitility + awaitility + test + + + + + org.apache.curator + curator-framework + ${zk-curator.version} + + + org.apache.curator + curator-client + ${zk-curator.version} + + + org.apache.curator + curator-recipes + ${zk-curator.version} + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + org.junit.jupiter junit-jupiter-api @@ -195,6 +231,13 @@ junit-platform-commons test + + org.apache.curator + curator-test + ${zk-curator.version} + test + + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncArchiveService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncArchiveService.java new file mode 100644 index 0000000000000..3fdc21dd21683 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncArchiveService.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Async archive service to run concurrently with write operation. 
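As a hedged usage sketch of the AsyncArchiveService introduced below, the service is driven entirely through its static helpers; writeClient stands for an already constructed engine-specific write client.

    static void archiveInBackground(BaseHoodieWriteClient writeClient) {
      // Returns null unless both auto-archive and async-archive are enabled on the write config,
      // so "not configured" and "already finished" can be handled uniformly by the caller.
      AsyncArchiveService archiveService = AsyncArchiveService.startAsyncArchiveIfEnabled(writeClient);

      // ... perform the write / commit with writeClient here ...

      // Null-safe: a no-op when the service was never started.
      AsyncArchiveService.waitForCompletion(archiveService);
    }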
+ */ +public class AsyncArchiveService extends HoodieAsyncTableService { + + private static final Logger LOG = LogManager.getLogger(AsyncArchiveService.class); + + private final BaseHoodieWriteClient writeClient; + private final transient ExecutorService executor = Executors.newSingleThreadExecutor(); + + protected AsyncArchiveService(BaseHoodieWriteClient writeClient) { + super(writeClient.getConfig()); + this.writeClient = writeClient; + } + + @Override + protected Pair startService() { + LOG.info("Starting async archive service..."); + return Pair.of(CompletableFuture.supplyAsync(() -> { + writeClient.archive(); + return true; + }, executor), executor); + } + + public static AsyncArchiveService startAsyncArchiveIfEnabled(BaseHoodieWriteClient writeClient) { + HoodieWriteConfig config = writeClient.getConfig(); + if (!config.isAutoArchive() || !config.isAsyncArchive()) { + LOG.info("The HoodieWriteClient is not configured to auto & async archive. Async archive service will not start."); + return null; + } + AsyncArchiveService asyncArchiveService = new AsyncArchiveService(writeClient); + asyncArchiveService.start(null); + return asyncArchiveService; + } + + public static void waitForCompletion(AsyncArchiveService asyncArchiveService) { + if (asyncArchiveService != null) { + LOG.info("Waiting for async archive service to finish"); + try { + asyncArchiveService.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException("Error waiting for async archive service to finish", e); + } + } + } + + public static void forceShutdown(AsyncArchiveService asyncArchiveService) { + if (asyncArchiveService != null) { + LOG.info("Shutting down async archive service..."); + asyncArchiveService.shutdown(true); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCleanerService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCleanerService.java new file mode 100644 index 0000000000000..72907e6d3fbcd --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCleanerService.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Async clean service to run concurrently with write operation. 
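AsyncCleanerService below exposes the same start/wait contract; a brief sketch, again assuming an existing write client:

    static void cleanInBackground(BaseHoodieWriteClient writeClient) {
      // Null is returned unless isAutoClean() and isAsyncClean() are both enabled on the write config.
      AsyncCleanerService cleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(writeClient);
      // waitForCompletion/forceShutdown are null-safe, mirroring the archive service above.
      AsyncCleanerService.waitForCompletion(cleanerService);
    }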
+ */ +public class AsyncCleanerService extends HoodieAsyncTableService { + + private static final Logger LOG = LogManager.getLogger(AsyncCleanerService.class); + + private final BaseHoodieWriteClient writeClient; + private final transient ExecutorService executor = Executors.newSingleThreadExecutor(); + + protected AsyncCleanerService(BaseHoodieWriteClient writeClient) { + super(writeClient.getConfig()); + this.writeClient = writeClient; + } + + @Override + protected Pair startService() { + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + LOG.info(String.format("Starting async clean service with instant time %s...", instantTime)); + return Pair.of(CompletableFuture.supplyAsync(() -> { + writeClient.clean(instantTime); + return true; + }, executor), executor); + } + + public static AsyncCleanerService startAsyncCleaningIfEnabled(BaseHoodieWriteClient writeClient) { + HoodieWriteConfig config = writeClient.getConfig(); + if (!config.isAutoClean() || !config.isAsyncClean()) { + LOG.info("The HoodieWriteClient is not configured to auto & async clean. Async clean service will not start."); + return null; + } + AsyncCleanerService asyncCleanerService = new AsyncCleanerService(writeClient); + asyncCleanerService.start(null); + return asyncCleanerService; + } + + public static void waitForCompletion(AsyncCleanerService asyncCleanerService) { + if (asyncCleanerService != null) { + LOG.info("Waiting for async clean service to finish"); + try { + asyncCleanerService.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException("Error waiting for async clean service to finish", e); + } + } + } + + public static void forceShutdown(AsyncCleanerService asyncCleanerService) { + if (asyncCleanerService != null) { + LOG.info("Shutting down async clean service..."); + asyncCleanerService.shutdown(true); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncClusteringService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncClusteringService.java new file mode 100644 index 0000000000000..1e4d4d1f593af --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncClusteringService.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseClusterer; +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.CustomizedThreadFactory; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.stream.IntStream; + +/** + * Async clustering service that runs in a separate thread. + * Currently, only one clustering thread is allowed to run at any time. + */ +public abstract class AsyncClusteringService extends HoodieAsyncTableService { + + public static final String CLUSTERING_POOL_NAME = "hoodiecluster"; + private static final long serialVersionUID = 1L; + private static final Logger LOG = LogManager.getLogger(AsyncClusteringService.class); + private final int maxConcurrentClustering; + protected transient HoodieEngineContext context; + private transient BaseClusterer clusteringClient; + + public AsyncClusteringService(HoodieEngineContext context, BaseHoodieWriteClient writeClient) { + this(context, writeClient, false); + } + + public AsyncClusteringService(HoodieEngineContext context, BaseHoodieWriteClient writeClient, boolean runInDaemonMode) { + super(writeClient.getConfig(), runInDaemonMode); + this.clusteringClient = createClusteringClient(writeClient); + this.maxConcurrentClustering = 1; + this.context = context; + } + + protected abstract BaseClusterer createClusteringClient(BaseHoodieWriteClient client); + + /** + * Start clustering service. + */ + @Override + protected Pair startService() { + ExecutorService executor = Executors.newFixedThreadPool(maxConcurrentClustering, + new CustomizedThreadFactory("async_clustering_thread", isRunInDaemonMode())); + return Pair.of(CompletableFuture.allOf(IntStream.range(0, maxConcurrentClustering).mapToObj(i -> CompletableFuture.supplyAsync(() -> { + try { + // Set Compactor Pool Name for allowing users to prioritize compaction + LOG.info("Setting pool name for clustering to " + CLUSTERING_POOL_NAME); + context.setProperty(EngineProperty.CLUSTERING_POOL_NAME, CLUSTERING_POOL_NAME); + while (!isShutdownRequested()) { + final HoodieInstant instant = fetchNextAsyncServiceInstant(); + if (null != instant) { + LOG.info("Starting clustering for instant " + instant); + clusteringClient.cluster(instant); + LOG.info("Finished clustering for instant " + instant); + } + } + LOG.info("Clustering executor shutting down properly"); + } catch (InterruptedException ie) { + hasError = true; + LOG.warn("Clustering executor got interrupted exception! Stopping", ie); + } catch (IOException e) { + hasError = true; + LOG.error("Clustering executor failed due to IOException", e); + throw new HoodieIOException(e.getMessage(), e); + } catch (Exception e) { + hasError = true; + LOG.error("Clustering executor failed", e); + throw e; + } + return true; + }, executor)).toArray(CompletableFuture[]::new)), executor); + } + + /** + * Update the write client to be used for clustering. 
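The AsyncClusteringService above is abstract; engine-specific subclasses only supply createClusteringClient(...). A rough sketch of the driving pattern, assuming clusteringService is an instance of such a subclass and pendingClusteringInstant is a pending clustering instant taken from the timeline:

    static void driveClustering(AsyncClusteringService clusteringService,
                                HoodieInstant pendingClusteringInstant) throws InterruptedException {
      // start(null): no shutdown callback; the background thread polls the pending-instant queue.
      clusteringService.start(null);
      // Hand a pending clustering instant to the background thread.
      clusteringService.enqueuePendingAsyncServiceInstant(pendingClusteringInstant);
      // Optional back-pressure: block until the queue has been drained by the clustering thread.
      clusteringService.waitTillPendingAsyncServiceInstantsReducesTo(0);
    }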
+ */ + public synchronized void updateWriteClient(BaseHoodieWriteClient writeClient) { + this.clusteringClient.updateWriteClient(writeClient); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java index 47f883284adb8..a62beae02bbdb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java @@ -17,104 +17,51 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractCompactor; -import org.apache.hudi.client.AbstractHoodieWriteClient; -import org.apache.hudi.client.common.EngineProperty; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.client.BaseCompactor; +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.CustomizedThreadFactory; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieIOException; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; -import java.util.concurrent.BlockingQueue; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; import java.util.stream.IntStream; /** * Async Compactor Service that runs in separate thread. Currently, only one compactor is allowed to run at any time. */ -public abstract class AsyncCompactService extends HoodieAsyncService { - - private static final long serialVersionUID = 1L; - private static final Logger LOG = LogManager.getLogger(AsyncCompactService.class); +public abstract class AsyncCompactService extends HoodieAsyncTableService { /** * This is the job pool used by async compaction. 
*/ public static final String COMPACT_POOL_NAME = "hoodiecompact"; - + private static final long serialVersionUID = 1L; + private static final Logger LOG = LogManager.getLogger(AsyncCompactService.class); private final int maxConcurrentCompaction; - private transient AbstractCompactor compactor; - private transient HoodieEngineContext context; - private transient BlockingQueue pendingCompactions = new LinkedBlockingQueue<>(); - private transient ReentrantLock queueLock = new ReentrantLock(); - private transient Condition consumed = queueLock.newCondition(); + protected transient HoodieEngineContext context; + private transient BaseCompactor compactor; - public AsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client) { + public AsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client) { this(context, client, false); } - public AsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client, boolean runInDaemonMode) { - super(runInDaemonMode); + public AsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client, boolean runInDaemonMode) { + super(client.getConfig(), runInDaemonMode); this.context = context; this.compactor = createCompactor(client); this.maxConcurrentCompaction = 1; } - protected abstract AbstractCompactor createCompactor(AbstractHoodieWriteClient client); - - /** - * Enqueues new Pending compaction. - */ - public void enqueuePendingCompaction(HoodieInstant instant) { - pendingCompactions.add(instant); - } - - /** - * Wait till outstanding pending compactions reduces to the passed in value. - * - * @param numPendingCompactions Maximum pending compactions allowed - * @throws InterruptedException - */ - public void waitTillPendingCompactionsReducesTo(int numPendingCompactions) throws InterruptedException { - try { - queueLock.lock(); - while (!isShutdown() && (pendingCompactions.size() > numPendingCompactions)) { - consumed.await(); - } - } finally { - queueLock.unlock(); - } - } - - /** - * Fetch Next pending compaction if available. - * - * @return - * @throws InterruptedException - */ - private HoodieInstant fetchNextCompactionInstant() throws InterruptedException { - LOG.info("Compactor waiting for next instant for compaction upto 60 seconds"); - HoodieInstant instant = pendingCompactions.poll(10, TimeUnit.SECONDS); - if (instant != null) { - try { - queueLock.lock(); - // Signal waiting thread - consumed.signal(); - } finally { - queueLock.unlock(); - } - } - return instant; - } + protected abstract BaseCompactor createCompactor(BaseHoodieWriteClient client); /** * Start Compaction Service. 
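Because the pending-instant queue and its locking now live in the shared base class, existing call sites of the compaction service migrate roughly as follows; maxPending is an arbitrary bound used here for illustration only:

    static void applyBackPressure(AsyncCompactService asyncCompactService, HoodieInstant instant,
                                  int maxPending) throws InterruptedException {
      // Previously: asyncCompactService.enqueuePendingCompaction(instant);
      //             asyncCompactService.waitTillPendingCompactionsReducesTo(maxPending);
      asyncCompactService.enqueuePendingAsyncServiceInstant(instant);
      asyncCompactService.waitTillPendingAsyncServiceInstantsReducesTo(maxPending);
    }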
@@ -122,11 +69,7 @@ private HoodieInstant fetchNextCompactionInstant() throws InterruptedException { @Override protected Pair startService() { ExecutorService executor = Executors.newFixedThreadPool(maxConcurrentCompaction, - r -> { - Thread t = new Thread(r, "async_compact_thread"); - t.setDaemon(isRunInDaemonMode()); - return t; - }); + new CustomizedThreadFactory("async_compact_thread", isRunInDaemonMode())); return Pair.of(CompletableFuture.allOf(IntStream.range(0, maxConcurrentCompaction).mapToObj(i -> CompletableFuture.supplyAsync(() -> { try { // Set Compactor Pool Name for allowing users to prioritize compaction @@ -134,7 +77,7 @@ protected Pair startService() { context.setProperty(EngineProperty.COMPACTION_POOL_NAME, COMPACT_POOL_NAME); while (!isShutdownRequested()) { - final HoodieInstant instant = fetchNextCompactionInstant(); + final HoodieInstant instant = fetchNextAsyncServiceInstant(); if (null != instant) { LOG.info("Starting Compaction for instant " + instant); @@ -144,25 +87,31 @@ protected Pair startService() { } LOG.info("Compactor shutting down properly!!"); } catch (InterruptedException ie) { + hasError = true; LOG.warn("Compactor executor thread got interrupted exception. Stopping", ie); } catch (IOException e) { - LOG.error("Compactor executor failed", e); + hasError = true; + LOG.error("Compactor executor failed due to IOException", e); throw new HoodieIOException(e.getMessage(), e); + } catch (Exception e) { + hasError = true; + LOG.error("Compactor executor failed", e); + throw e; } return true; }, executor)).toArray(CompletableFuture[]::new)), executor); } - /** * Check whether compactor thread needs to be stopped. + * * @return */ protected boolean shouldStopCompactor() { return false; } - public synchronized void updateWriteClient(AbstractHoodieWriteClient writeClient) { + public synchronized void updateWriteClient(BaseHoodieWriteClient writeClient) { this.compactor.updateWriteClient(writeClient); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java index 32dd0427255f0..889d7945b00c3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java @@ -18,26 +18,33 @@ package org.apache.hudi.async; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.collection.Pair; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.Serializable; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; import java.util.function.Function; /** - * Base Class for running clean/delta-sync/compaction in separate thread and controlling their life-cycle. + * Base Class for running archive/clean/delta-sync/compaction/clustering in separate thread and controlling their life-cycles. 
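HoodieAsyncService, shown below, now reports completion through a callback attached to the service future instead of a separate monitor thread. A hedged sketch of how a caller might start a service and observe failures; the error-handling body is illustrative:

    static void startAndAwait(HoodieAsyncService service) throws Exception {
      service.start(error -> {
        // Invoked when the service future completes; "error" is true if it completed exceptionally.
        if (Boolean.TRUE.equals(error)) {
          // e.g. surface the failure to the job driver
        }
        return true;
      });
      // Propagates the failure if the service ended with an error; returns immediately
      // when the service was never started (future == null).
      service.waitForShutdown();
    }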
*/ public abstract class HoodieAsyncService implements Serializable { private static final Logger LOG = LogManager.getLogger(HoodieAsyncService.class); + private static final long POLLING_SECONDS = 10; + // Flag indicating whether an error is incurred in the service + protected boolean hasError; // Flag to track if the service is started. private boolean started; // Flag indicating shutdown is externally requested @@ -50,6 +57,12 @@ public abstract class HoodieAsyncService implements Serializable { private transient CompletableFuture future; // Run in daemon mode private final boolean runInDaemonMode; + // Queue to hold pending compaction/clustering instants + private transient BlockingQueue pendingInstants = new LinkedBlockingQueue<>(); + // Mutex lock for synchronized access to pendingInstants queue + private transient ReentrantLock queueLock = new ReentrantLock(); + // Condition instance to use with the queueLock + private transient Condition consumed = queueLock.newCondition(); protected HoodieAsyncService() { this(false); @@ -60,21 +73,32 @@ protected HoodieAsyncService(boolean runInDaemonMode) { this.runInDaemonMode = runInDaemonMode; } - protected boolean isShutdownRequested() { + public boolean isStarted() { + return started; + } + + public boolean isShutdownRequested() { return shutdownRequested; } - protected boolean isShutdown() { + public boolean isShutdown() { return shutdown; } + public boolean hasError() { + return hasError; + } + /** * Wait till the service shutdown. If the service shutdown with exception, it will be thrown - * + * * @throws ExecutionException * @throws InterruptedException */ public void waitForShutdown() throws ExecutionException, InterruptedException { + if (future == null) { + return; + } try { future.get(); } catch (ExecutionException ex) { @@ -92,6 +116,7 @@ public void waitForShutdown() throws ExecutionException, InterruptedException { public void shutdown(boolean force) { if (!shutdownRequested || force) { shutdownRequested = true; + shutdown = true; if (executor != null) { if (force) { executor.shutdownNow(); @@ -115,54 +140,88 @@ public void shutdown(boolean force) { * @param onShutdownCallback */ public void start(Function onShutdownCallback) { + if (started) { + LOG.warn("The async service already started."); + return; + } Pair res = startService(); future = res.getKey(); executor = res.getValue(); started = true; - monitorThreads(onShutdownCallback); + shutdownCallback(onShutdownCallback); } /** * Service implementation. - * - * @return */ protected abstract Pair startService(); /** - * A monitor thread is started which would trigger a callback if the service is shutdown. + * Add shutdown callback for the completable future. * - * @param onShutdownCallback + * @param callback The callback */ - private void monitorThreads(Function onShutdownCallback) { - LOG.info("Submitting monitor thread !!"); - Executors.newSingleThreadExecutor(r -> { - Thread t = new Thread(r, "Monitor Thread"); - t.setDaemon(isRunInDaemonMode()); - return t; - }).submit(() -> { - boolean error = false; - try { - LOG.info("Monitoring thread(s) !!"); - future.get(); - } catch (ExecutionException ex) { - LOG.error("Monitor noticed one or more threads failed. 
Requesting graceful shutdown of other threads", ex); - error = true; - } catch (InterruptedException ie) { - LOG.error("Got interrupted Monitoring threads", ie); - error = true; - } finally { - // Mark as shutdown - shutdown = true; - if (null != onShutdownCallback) { - onShutdownCallback.apply(error); - } - shutdown(false); + @SuppressWarnings("unchecked") + private void shutdownCallback(Function callback) { + if (future == null) { + return; + } + future.whenComplete((resp, error) -> { + if (null != callback) { + callback.apply(null != error); } + this.started = false; }); } public boolean isRunInDaemonMode() { return runInDaemonMode; } + + /** + * Wait till outstanding pending compaction/clustering reduces to the passed in value. + * + * @param numPending Maximum pending compactions/clustering allowed + * @throws InterruptedException + */ + public void waitTillPendingAsyncServiceInstantsReducesTo(int numPending) throws InterruptedException { + try { + queueLock.lock(); + while (!isShutdown() && !hasError() && (pendingInstants.size() > numPending)) { + consumed.await(POLLING_SECONDS, TimeUnit.SECONDS); + } + } finally { + queueLock.unlock(); + } + } + + /** + * Enqueues new pending clustering instant. + * @param instant {@link HoodieInstant} to enqueue. + */ + public void enqueuePendingAsyncServiceInstant(HoodieInstant instant) { + LOG.info("Enqueuing new pending clustering instant: " + instant.getTimestamp()); + pendingInstants.add(instant); + } + + /** + * Fetch next pending compaction/clustering instant if available. + * + * @return {@link HoodieInstant} corresponding to the next pending compaction/clustering. + * @throws InterruptedException + */ + HoodieInstant fetchNextAsyncServiceInstant() throws InterruptedException { + LOG.info(String.format("Waiting for next instant up to %d seconds", POLLING_SECONDS)); + HoodieInstant instant = pendingInstants.poll(POLLING_SECONDS, TimeUnit.SECONDS); + if (instant != null) { + try { + queueLock.lock(); + // Signal waiting thread + consumed.signal(); + } finally { + queueLock.unlock(); + } + } + return instant; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncTableService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncTableService.java new file mode 100644 index 0000000000000..6a53d30063c1d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncTableService.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.RunsTableService; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.util.function.Function; + +public abstract class HoodieAsyncTableService extends HoodieAsyncService implements RunsTableService { + + protected HoodieWriteConfig writeConfig; + + protected HoodieAsyncTableService() { + } + + protected HoodieAsyncTableService(HoodieWriteConfig writeConfig) { + this.writeConfig = writeConfig; + } + + protected HoodieAsyncTableService(HoodieWriteConfig writeConfig, boolean runInDaemonMode) { + super(runInDaemonMode); + this.writeConfig = writeConfig; + } + + @Override + public void start(Function onShutdownCallback) { + if (!tableServicesEnabled(writeConfig)) { + return; + } + super.start(onShutdownCallback); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/HoodieWriteCommitCallback.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/HoodieWriteCommitCallback.java index 2f5a4eff4191b..6f287123cf609 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/HoodieWriteCommitCallback.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/HoodieWriteCommitCallback.java @@ -17,11 +17,15 @@ package org.apache.hudi.callback; +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; +import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.callback.common.HoodieWriteCommitCallbackMessage; /** * A callback interface help to call back when a write commit completes successfully. */ +@PublicAPIClass(maturity = ApiMaturityLevel.STABLE) public interface HoodieWriteCommitCallback { /** @@ -30,6 +34,7 @@ public interface HoodieWriteCommitCallback { * * @param callbackMessage Callback msg, which will be sent to external system. */ + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) void call(HoodieWriteCommitCallbackMessage callbackMessage); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/client/http/HoodieWriteCommitHttpCallbackClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/client/http/HoodieWriteCommitHttpCallbackClient.java index 6c41e2f5ead70..6d1059cb98da8 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/client/http/HoodieWriteCommitHttpCallbackClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/client/http/HoodieWriteCommitHttpCallbackClient.java @@ -33,7 +33,6 @@ import java.io.Closeable; import java.io.IOException; -import java.util.Properties; /** * Write commit callback http client. @@ -47,10 +46,10 @@ public class HoodieWriteCommitHttpCallbackClient implements Closeable { private final String apiKey; private final String url; private final CloseableHttpClient client; - private Properties props; + private HoodieWriteConfig writeConfig; public HoodieWriteCommitHttpCallbackClient(HoodieWriteConfig config) { - this.props = config.getProps(); + this.writeConfig = config; this.apiKey = getApiKey(); this.url = getUrl(); this.client = getClient(); @@ -72,7 +71,7 @@ public void send(String callbackMsg) { if (statusCode >= 300) { LOG.warn(String.format("Failed to send callback message. 
Response was %s", response)); } else { - LOG.info(String.format("Sent Callback data %s to %s successfully !", callbackMsg, url)); + LOG.info(String.format("Sent Callback data to %s successfully !", url)); } } catch (IOException e) { LOG.warn("Failed to send callback.", e); @@ -80,11 +79,11 @@ public void send(String callbackMsg) { } private String getApiKey() { - return props.getProperty(HoodieWriteCommitCallbackConfig.CALLBACK_HTTP_API_KEY); + return writeConfig.getString(HoodieWriteCommitCallbackConfig.CALLBACK_HTTP_API_KEY_VALUE); } private String getUrl() { - return props.getProperty(HoodieWriteCommitCallbackConfig.CALLBACK_HTTP_URL_PROP); + return writeConfig.getString(HoodieWriteCommitCallbackConfig.CALLBACK_HTTP_URL); } private CloseableHttpClient getClient() { @@ -98,7 +97,7 @@ private CloseableHttpClient getClient() { } private Integer getHttpTimeoutSeconds() { - return Integer.parseInt(props.getProperty(HoodieWriteCommitCallbackConfig.CALLBACK_HTTP_TIMEOUT_SECONDS)); + return writeConfig.getInt(HoodieWriteCommitCallbackConfig.CALLBACK_HTTP_TIMEOUT_IN_SECONDS); } @Override diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/common/HoodieWriteCommitCallbackMessage.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/common/HoodieWriteCommitCallbackMessage.java index 0233feeaeec7a..8210693a75657 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/common/HoodieWriteCommitCallbackMessage.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/common/HoodieWriteCommitCallbackMessage.java @@ -17,11 +17,17 @@ package org.apache.hudi.callback.common; +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; +import org.apache.hudi.common.model.HoodieWriteStat; + import java.io.Serializable; +import java.util.List; /** * Base callback message, which contains commitTime and tableName only for now. */ +@PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) public class HoodieWriteCommitCallbackMessage implements Serializable { private static final long serialVersionUID = -3033643980627719561L; @@ -29,48 +35,43 @@ public class HoodieWriteCommitCallbackMessage implements Serializable { /** * CommitTime for one batch write, this is required. */ - private String commitTime; + private final String commitTime; /** * Table name this batch commit to. */ - private String tableName; + private final String tableName; /** * BathPath the table located. */ - private String basePath; + private final String basePath; - public HoodieWriteCommitCallbackMessage() { - } + /** + * Statistics about Hoodie write operation. 
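Tying the callback changes together, a sketch of a custom callback that consumes the newly added write-stat list; the class name and logging are illustrative, and registration is shown against the renamed config constant used elsewhere in this change:

    import java.util.List;
    import org.apache.hudi.callback.HoodieWriteCommitCallback;
    import org.apache.hudi.callback.common.HoodieWriteCommitCallbackMessage;
    import org.apache.hudi.common.model.HoodieWriteStat;

    public class WriteStatLoggingCallback implements HoodieWriteCommitCallback {
      @Override
      public void call(HoodieWriteCommitCallbackMessage callbackMessage) {
        // The message is immutable now and additionally carries per-file write statistics.
        List<HoodieWriteStat> stats = callbackMessage.getHoodieWriteStat();
        System.out.println("Commit " + callbackMessage.getCommitTime()
            + " on table " + callbackMessage.getTableName()
            + " produced " + stats.size() + " write stats");
      }
    }

    // Registered through the writer properties, e.g.:
    //   props.setProperty(HoodieWriteCommitCallbackConfig.CALLBACK_CLASS_NAME.key(),
    //       WriteStatLoggingCallback.class.getName());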
+ */ + private final List hoodieWriteStat; - public HoodieWriteCommitCallbackMessage(String commitTime, String tableName, String basePath) { + public HoodieWriteCommitCallbackMessage(String commitTime, String tableName, String basePath, List hoodieWriteStat) { this.commitTime = commitTime; this.tableName = tableName; this.basePath = basePath; + this.hoodieWriteStat = hoodieWriteStat; } public String getCommitTime() { return commitTime; } - public void setCommitTime(String commitTime) { - this.commitTime = commitTime; - } - public String getTableName() { return tableName; } - public void setTableName(String tableName) { - this.tableName = tableName; - } - public String getBasePath() { return basePath; } - public void setBasePath(String basePath) { - this.basePath = basePath; + public List getHoodieWriteStat() { + return hoodieWriteStat; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/impl/HoodieWriteCommitHttpCallback.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/impl/HoodieWriteCommitHttpCallback.java index bb60879efbbad..1f30c7cd57031 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/impl/HoodieWriteCommitHttpCallback.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/impl/HoodieWriteCommitHttpCallback.java @@ -43,8 +43,6 @@ public HoodieWriteCommitHttpCallback(HoodieWriteConfig config) { public void call(HoodieWriteCommitCallbackMessage callbackMessage) { // convert to json String callbackMsg = HoodieWriteCommitCallbackUtil.convertToJsonString(callbackMessage); - LOG.info("Try to send callbackMsg, msg = " + callbackMsg); client.send(callbackMsg); } - } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieCommitCallbackFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieCommitCallbackFactory.java index 74eb8b6dadbdd..aaa5eed28a4b6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieCommitCallbackFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieCommitCallbackFactory.java @@ -39,7 +39,7 @@ public static HoodieWriteCommitCallback create(HoodieWriteConfig config) { return (HoodieWriteCommitCallback) instance; } else { throw new HoodieCommitCallbackException(String.format("The value of the config option %s can not be null or " - + "empty", HoodieWriteCommitCallbackConfig.CALLBACK_CLASS_PROP)); + + "empty", HoodieWriteCommitCallbackConfig.CALLBACK_CLASS_NAME.key())); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieWriteCommitCallbackUtil.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieWriteCommitCallbackUtil.java index c160819f11157..fff0b713528be 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieWriteCommitCallbackUtil.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/callback/util/HoodieWriteCommitCallbackUtil.java @@ -19,7 +19,7 @@ import org.apache.hudi.exception.HoodieCommitCallbackException; -import org.codehaus.jackson.map.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractCompactor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractCompactor.java deleted file mode 
100644 index c80b34a3ef656..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractCompactor.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.client; - -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; - -import java.io.IOException; -import java.io.Serializable; - -/** - * Run one round of compaction. - */ -public abstract class AbstractCompactor implements Serializable { - - private static final long serialVersionUID = 1L; - - protected transient AbstractHoodieWriteClient compactionClient; - - public AbstractCompactor(AbstractHoodieWriteClient compactionClient) { - this.compactionClient = compactionClient; - } - - public abstract void compact(HoodieInstant instant) throws IOException; - - public void updateWriteClient(AbstractHoodieWriteClient writeClient) { - this.compactionClient = writeClient; - } - -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieClient.java deleted file mode 100644 index e502281322fe7..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieClient.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.client; - -import org.apache.hudi.client.embedded.EmbeddedTimelineServerHelper; -import org.apache.hudi.client.embedded.EmbeddedTimelineService; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.io.Serializable; - -/** - * Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs) Also, manages - * embedded timeline-server if enabled. - */ -public abstract class AbstractHoodieClient implements Serializable, AutoCloseable { - - private static final Logger LOG = LogManager.getLogger(AbstractHoodieClient.class); - - protected final transient FileSystem fs; - protected final transient HoodieEngineContext context; - protected final transient Configuration hadoopConf; - protected final HoodieWriteConfig config; - protected final String basePath; - - /** - * Timeline Server has the same lifetime as that of Client. Any operations done on the same timeline service will be - * able to take advantage of the cached file-system view. New completed actions will be synced automatically in an - * incremental fashion. - */ - private transient Option timelineServer; - private final boolean shouldStopTimelineServer; - - protected AbstractHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { - this(context, clientConfig, Option.empty()); - } - - protected AbstractHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig, - Option timelineServer) { - this.hadoopConf = context.getHadoopConf().get(); - this.fs = FSUtils.getFs(clientConfig.getBasePath(), hadoopConf); - this.context = context; - this.basePath = clientConfig.getBasePath(); - this.config = clientConfig; - this.timelineServer = timelineServer; - shouldStopTimelineServer = !timelineServer.isPresent(); - startEmbeddedServerView(); - } - - /** - * Releases any resources used by the client. - */ - @Override - public void close() { - stopEmbeddedServerView(true); - } - - private synchronized void stopEmbeddedServerView(boolean resetViewStorageConfig) { - if (timelineServer.isPresent() && shouldStopTimelineServer) { - // Stop only if owner - LOG.info("Stopping Timeline service !!"); - timelineServer.get().stop(); - } - - timelineServer = Option.empty(); - // Reset Storage Config to Client specified config - if (resetViewStorageConfig) { - config.resetViewStorageConfig(); - } - } - - private synchronized void startEmbeddedServerView() { - if (config.isEmbeddedTimelineServerEnabled()) { - if (!timelineServer.isPresent()) { - // Run Embedded Timeline Server - try { - timelineServer = EmbeddedTimelineServerHelper.createEmbeddedTimelineService(context, config); - } catch (IOException e) { - LOG.warn("Unable to start timeline service. Proceeding as if embedded server is disabled", e); - stopEmbeddedServerView(false); - } - } else { - LOG.info("Timeline Server already running. Not restarting the service"); - } - } else { - LOG.info("Embedded Timeline Server is disabled. 
Not starting timeline service"); - } - } - - public HoodieWriteConfig getConfig() { - return config; - } - - protected HoodieTableMetaClient createMetaClient(boolean loadActiveTimelineOnLoad) { - return new HoodieTableMetaClient(hadoopConf, config.getBasePath(), loadActiveTimelineOnLoad, - config.getConsistencyGuardConfig(), - Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))); - } - - public Option getTimelineServer() { - return timelineServer; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java deleted file mode 100644 index 222e1ab2ca5b2..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java +++ /dev/null @@ -1,806 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.client; - -import com.codahale.metrics.Timer; -import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.avro.model.HoodieCleanMetadata; -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.avro.model.HoodieRestoreMetadata; -import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.callback.HoodieWriteCommitCallback; -import org.apache.hudi.callback.common.HoodieWriteCommitCallbackMessage; -import org.apache.hudi.callback.util.HoodieCommitCallbackFactory; -import org.apache.hudi.client.embedded.EmbeddedTimelineService; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.CommitUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCommitException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieRestoreException; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.exception.HoodieSavepointException; -import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.metrics.HoodieMetrics; -import 
org.apache.hudi.table.BulkInsertPartitioner; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; -import org.apache.hudi.table.MarkerFiles; -import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.savepoint.SavepointHelpers; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.text.ParseException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -/** - * Abstract Write Client providing functionality for performing commit, index updates and rollback - * Reused for regular write operations like upsert/insert/bulk-insert.. as well as bootstrap - * - * @param Sub type of HoodieRecordPayload - * @param Type of inputs - * @param Type of keys - * @param Type of outputs - */ -public abstract class AbstractHoodieWriteClient extends AbstractHoodieClient { - - protected static final String LOOKUP_STR = "lookup"; - private static final long serialVersionUID = 1L; - private static final Logger LOG = LogManager.getLogger(AbstractHoodieWriteClient.class); - - protected final transient HoodieMetrics metrics; - private final transient HoodieIndex index; - - protected transient Timer.Context writeTimer = null; - protected transient Timer.Context compactionTimer; - - private transient WriteOperationType operationType; - private transient HoodieWriteCommitCallback commitCallback; - protected final boolean rollbackPending; - protected transient AsyncCleanerService asyncCleanerService; - - /** - * Create a write client, without cleaning up failed/inflight commits. - * - * @param context HoodieEngineContext - * @param clientConfig instance of HoodieWriteConfig - */ - public AbstractHoodieWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { - this(context, clientConfig, false); - } - - /** - * Create a write client, with new hudi index. - * - * @param context HoodieEngineContext - * @param writeConfig instance of HoodieWriteConfig - * @param rollbackPending whether need to cleanup pending commits - */ - public AbstractHoodieWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending) { - this(context, writeConfig, rollbackPending, Option.empty()); - } - - /** - * Create a write client, allows to specify all parameters. - * - * @param context HoodieEngineContext - * @param writeConfig instance of HoodieWriteConfig - * @param rollbackPending whether need to cleanup pending commits - * @param timelineService Timeline Service that runs as part of write client. - */ - public AbstractHoodieWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending, - Option timelineService) { - super(context, writeConfig, timelineService); - this.metrics = new HoodieMetrics(config, config.getTableName()); - this.rollbackPending = rollbackPending; - this.index = createIndex(writeConfig); - } - - protected abstract HoodieIndex createIndex(HoodieWriteConfig writeConfig); - - public void setOperationType(WriteOperationType operationType) { - this.operationType = operationType; - } - - public WriteOperationType getOperationType() { - return this.operationType; - } - - /** - * Commit changes performed at the given instantTime marker. 
- */ - public boolean commit(String instantTime, O writeStatuses) { - return commit(instantTime, writeStatuses, Option.empty()); - } - - /** - * - * Commit changes performed at the given instantTime marker. - */ - public boolean commit(String instantTime, O writeStatuses, Option> extraMetadata) { - HoodieTableMetaClient metaClient = createMetaClient(false); - String actionType = metaClient.getCommitActionType(); - return commit(instantTime, writeStatuses, extraMetadata, actionType, Collections.emptyMap()); - } - - public abstract boolean commit(String instantTime, O writeStatuses, Option> extraMetadata, - String commitActionType, Map> partitionToReplacedFileIds); - - public boolean commitStats(String instantTime, List stats, Option> extraMetadata, - String commitActionType) { - return commitStats(instantTime, stats, extraMetadata, commitActionType, Collections.emptyMap()); - } - - public boolean commitStats(String instantTime, List stats, Option> extraMetadata, - String commitActionType, Map> partitionToReplaceFileIds) { - LOG.info("Committing " + instantTime + " action " + commitActionType); - // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = createTable(config, hadoopConf); - - HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieCommitMetadata metadata = CommitUtils.buildMetadata(stats, partitionToReplaceFileIds, extraMetadata, operationType, config.getSchema(), commitActionType); - // Finalize write - finalizeWrite(table, instantTime, stats); - - try { - activeTimeline.saveAsComplete(new HoodieInstant(true, commitActionType, instantTime), - Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - postCommit(table, metadata, instantTime, extraMetadata); - emitCommitMetrics(instantTime, metadata, commitActionType); - LOG.info("Committed " + instantTime); - } catch (IOException e) { - throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime, - e); - } - - // callback if needed. - if (config.writeCommitCallbackOn()) { - if (null == commitCallback) { - commitCallback = HoodieCommitCallbackFactory.create(config); - } - commitCallback.call(new HoodieWriteCommitCallbackMessage(instantTime, config.getTableName(), config.getBasePath())); - } - return true; - } - - protected abstract HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf); - - void emitCommitMetrics(String instantTime, HoodieCommitMetadata metadata, String actionType) { - try { - - if (writeTimer != null) { - long durationInMs = metrics.getDurationInMs(writeTimer.stop()); - metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(instantTime).getTime(), durationInMs, - metadata, actionType); - writeTimer = null; - } - } catch (ParseException e) { - throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime - + "Instant time is not of valid format", e); - } - } - - /** - * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication. - * - * @param hoodieRecords Input Hoodie records. - * @return A subset of hoodieRecords, with existing records filtered out. - */ - public abstract I filterExists(I hoodieRecords); - - /** - * Main API to run bootstrap to hudi. 
- */ - public void bootstrap(Option> extraMetadata) { - if (rollbackPending) { - rollBackInflightBootstrap(); - } - HoodieTable table = getTableAndInitCtx(WriteOperationType.UPSERT, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS); - table.bootstrap(context, extraMetadata); - } - - /** - * Main API to rollback pending bootstrap. - */ - protected void rollBackInflightBootstrap() { - LOG.info("Rolling back pending bootstrap if present"); - HoodieTable table = createTable(config, hadoopConf); - HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); - Option instant = Option.fromJavaOptional( - inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp).findFirst()); - if (instant.isPresent() && HoodieTimeline.compareTimestamps(instant.get(), HoodieTimeline.LESSER_THAN_OR_EQUALS, - HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) { - LOG.info("Found pending bootstrap instants. Rolling them back"); - table.rollbackBootstrap(context, HoodieActiveTimeline.createNewInstantTime()); - LOG.info("Finished rolling back pending bootstrap"); - } - - } - - /** - * Upsert a batch of new records into Hoodie table at the supplied instantTime. - * - * @param records hoodieRecords to upsert - * @param instantTime Instant time of the commit - * @return WriteStatus to inspect errors and counts - */ - public abstract O upsert(I records, final String instantTime); - - /** - * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. - *
- * This implementation requires that the input records are already tagged, and de-duped if needed. - * - * @param preppedRecords Prepared HoodieRecords to upsert - * @param instantTime Instant time of the commit - * @return Collection of WriteStatus to inspect errors and counts - */ - public abstract O upsertPreppedRecords(I preppedRecords, final String instantTime); - - /** - * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal writes. - *
- * This implementation skips the index check and is able to leverage benefits such as small file handling/blocking - * alignment, as with upsert(), by profiling the workload - * - * @param records HoodieRecords to insert - * @param instantTime Instant time of the commit - * @return Collection of WriteStatus to inspect errors and counts - */ - public abstract O insert(I records, final String instantTime); - - /** - * Inserts the given prepared records into the Hoodie table, at the supplied instantTime. - *

- * This implementation skips the index check, skips de-duping and is able to leverage benefits such as small file - * handling/blocking alignment, as with insert(), by profiling the workload. The prepared HoodieRecords should be - * de-duped if needed. - * - * @param preppedRecords HoodieRecords to insert - * @param instantTime Instant time of the commit - * @return Collection of WriteStatus to inspect errors and counts - */ - public abstract O insertPreppedRecords(I preppedRecords, final String instantTime); - - /** - * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie - * table for the very first time (e.g: converting an existing table to Hoodie). - *

- * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control - * the numbers of files with less memory compared to the {@link AbstractHoodieWriteClient#insert(I, String)} - * - * @param records HoodieRecords to insert - * @param instantTime Instant time of the commit - * @return Collection of WriteStatus to inspect errors and counts - */ - public abstract O bulkInsert(I records, final String instantTime); - - /** - * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie - * table for the very first time (e.g: converting an existing table to Hoodie). - *

- * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control - * the numbers of files with less memory compared to the {@link AbstractHoodieWriteClient#insert(I, String)}. Optionally - * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See - * {@link BulkInsertPartitioner}. - * - * @param records HoodieRecords to insert - * @param instantTime Instant time of the commit - * @param userDefinedBulkInsertPartitioner If specified then it will be used to partition input records before they are inserted - * into hoodie. - * @return Collection of WriteStatus to inspect errors and counts - */ - public abstract O bulkInsert(I records, final String instantTime, - Option> userDefinedBulkInsertPartitioner); - - - /** - * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie - * table for the very first time (e.g: converting an existing table to Hoodie). The input records should contain no - * duplicates if needed. - *
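The bulk-insert javadocs above refer to sortBy-style range partitioning based on reservoir sampling. The following is a self-contained, engine-agnostic toy illustration of that idea only; it is not Hudi's implementation, which delegates the sort/repartition to the engine or to a user-supplied BulkInsertPartitioner.

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

// Toy sketch: sample record keys with a bounded reservoir, then derive approximate
// range boundaries from the sorted sample so records can be routed to sorted partitions.
final class RangeBoundarySketch {

  // Classic reservoir sampling: keeps a uniform sample of `sampleSize` keys in one pass.
  static List<String> reservoirSample(Iterable<String> keys, int sampleSize, Random rng) {
    List<String> reservoir = new ArrayList<>(sampleSize);
    long seen = 0;
    for (String key : keys) {
      seen++;
      if (reservoir.size() < sampleSize) {
        reservoir.add(key);
      } else {
        long j = (long) (rng.nextDouble() * seen);
        if (j < sampleSize) {
          reservoir.set((int) j, key);
        }
      }
    }
    return reservoir;
  }

  // Pick (numPartitions - 1) boundaries from the sorted sample.
  static List<String> boundaries(List<String> sample, int numPartitions) {
    List<String> sorted = new ArrayList<>(sample);
    Collections.sort(sorted);
    List<String> bounds = new ArrayList<>();
    for (int i = 1; i < numPartitions && !sorted.isEmpty(); i++) {
      bounds.add(sorted.get(i * sorted.size() / numPartitions));
    }
    return bounds;
  }

  // Route a key to the partition whose range contains it.
  static int partitionFor(String key, List<String> bounds) {
    int p = 0;
    while (p < bounds.size() && key.compareTo(bounds.get(p)) > 0) {
      p++;
    }
    return p;
  }
}
```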

- * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control - * the numbers of files with less memory compared to the {@link AbstractHoodieWriteClient#insert(I, String)}. Optionally - * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See - * {@link BulkInsertPartitioner}. - * - * @param preppedRecords HoodieRecords to insert - * @param instantTime Instant time of the commit - * @param bulkInsertPartitioner If specified then it will be used to partition input records before they are inserted - * into hoodie. - * @return Collection of WriteStatus to inspect errors and counts - */ - public abstract O bulkInsertPreppedRecords(I preppedRecords, final String instantTime, - Option> bulkInsertPartitioner); - - /** - * Deletes a list of {@link HoodieKey}s from the Hoodie table, at the supplied instantTime {@link HoodieKey}s will be - * de-duped and non existent keys will be removed before deleting. - * - * @param keys {@link List} of {@link HoodieKey}s to be deleted - * @param instantTime Commit time handle - * @return Collection of WriteStatus to inspect errors and counts - */ - public abstract O delete(K keys, final String instantTime); - - /** - * Common method containing steps to be performed after write (upsert/insert/..) operations including auto-commit. - * @param result Commit Action Result - * @param instantTime Instant Time - * @param hoodieTable Hoodie Table - * @return Write Status - */ - protected abstract O postWrite(HoodieWriteMetadata result, String instantTime, HoodieTable hoodieTable); - - /** - * Post Commit Hook. Derived classes use this method to perform post-commit processing - * - * @param table table to commit on - * @param metadata Commit Metadata corresponding to committed instant - * @param instantTime Instant Time - * @param extraMetadata Additional Metadata passed by user - */ - protected void postCommit(HoodieTable table, HoodieCommitMetadata metadata, String instantTime, Option> extraMetadata) { - try { - - // Delete the marker directory for the instant. - new MarkerFiles(table, instantTime).quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); - - // Do an inline compaction if enabled - if (config.isInlineCompaction()) { - runAnyPendingCompactions(table); - metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "true"); - inlineCompact(extraMetadata); - } else { - metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "false"); - } - // We cannot have unbounded commit files. Archive commits if we have to archive - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(config, table); - archiveLog.archiveIfRequired(context); - autoCleanOnCommit(instantTime); - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } - } - - protected void runAnyPendingCompactions(HoodieTable table) { - table.getActiveTimeline().getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().getInstants() - .forEach(instant -> { - LOG.info("Running previously failed inflight compaction at instant " + instant); - compact(instant.getTimestamp(), true); - }); - } - - /** - * Handle auto clean during commit. - * - * @param instantTime - */ - protected void autoCleanOnCommit(String instantTime) { - if (config.isAutoClean()) { - // Call clean to cleanup if there is anything to cleanup after the commit, - if (config.isAsyncClean()) { - LOG.info("Cleaner has been spawned already. 
Waiting for it to finish"); - AsyncCleanerService.waitForCompletion(asyncCleanerService); - LOG.info("Cleaner has finished"); - } else { - LOG.info("Auto cleaning is enabled. Running cleaner now"); - clean(instantTime); - } - } - } - - /** - * Create a savepoint based on the latest commit action on the timeline. - * - * @param user - User creating the savepoint - * @param comment - Comment for the savepoint - */ - public void savepoint(String user, String comment) { - HoodieTable table = createTable(config, hadoopConf); - if (table.getCompletedCommitsTimeline().empty()) { - throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty"); - } - - String latestCommit = table.getCompletedCommitsTimeline().lastInstant().get().getTimestamp(); - LOG.info("Savepointing latest commit " + latestCommit); - savepoint(latestCommit, user, comment); - } - - /** - * Savepoint a specific commit instant time. Latest version of data files as of the passed in instantTime - * will be referenced in the savepoint and will never be cleaned. The savepointed commit will never be rolledback or archived. - *
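A hedged usage sketch of the savepoint APIs in this class (savepoint here, plus restoreToSavepoint and deleteSavepoint defined just below); the user/comment strings are placeholders and the concrete client type is an engine-specific assumption.

```java
import org.apache.hudi.client.AbstractHoodieWriteClient;

// Illustrative savepoint lifecycle (raw generics; the concrete client type is engine-specific).
final class SavepointSketch {
  @SuppressWarnings("rawtypes")
  static void savepointAndRestore(AbstractHoodieWriteClient client, String savepointTime) {
    // Pin the latest completed commit so cleaner/archival will not remove its files.
    client.savepoint("ops-user", "nightly checkpoint");

    // Later, rewind the table to a previously created savepoint.
    // All writers and compactions must be stopped before doing this.
    client.restoreToSavepoint(savepointTime);

    // Release the savepoint once it is no longer needed so cleaning can resume.
    client.deleteSavepoint(savepointTime);
  }
}
```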

- * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be manually created and - * deleted. - *

- * Savepoint should be on a commit that could not have been cleaned. - * - * @param instantTime - commit that should be savepointed - * @param user - User creating the savepoint - * @param comment - Comment for the savepoint - */ - public void savepoint(String instantTime, String user, String comment) { - HoodieTable table = createTable(config, hadoopConf); - table.savepoint(context, instantTime, user, comment); - } - - /** - * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback and cleaner may - * clean up data files. - * - * @param savepointTime - delete the savepoint - * @return true if the savepoint was deleted successfully - */ - public void deleteSavepoint(String savepointTime) { - HoodieTable table = createTable(config, hadoopConf); - SavepointHelpers.deleteSavepoint(table, savepointTime); - } - - /** - * Restore the data to the savepoint. - * - * WARNING: This rolls back recent commits and deleted data files and also pending compactions after savepoint time. - * Queries accessing the files will mostly fail. This is expected to be a manual operation and no concurrent write or - * compaction is expected to be running - * - * @param savepointTime - savepoint time to rollback to - * @return true if the savepoint was restored to successfully - */ - public void restoreToSavepoint(String savepointTime) { - HoodieTable table = createTable(config, hadoopConf); - SavepointHelpers.validateSavepointPresence(table, savepointTime); - restoreToInstant(savepointTime); - SavepointHelpers.validateSavepointRestore(table, savepointTime); - } - - /** - * Rollback the inflight record changes with the given commit time. - * - * @param commitInstantTime Instant time of the commit - * @throws HoodieRollbackException if rollback cannot be performed successfully - */ - public boolean rollback(final String commitInstantTime) throws HoodieRollbackException { - LOG.info("Begin rollback of instant " + commitInstantTime); - final String rollbackInstantTime = HoodieActiveTimeline.createNewInstantTime(); - final Timer.Context timerContext = this.metrics.getRollbackCtx(); - try { - HoodieTable table = createTable(config, hadoopConf); - Option commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants() - .filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime)) - .findFirst()); - if (commitInstantOpt.isPresent()) { - HoodieRollbackMetadata rollbackMetadata = table.rollback(context, rollbackInstantTime, commitInstantOpt.get(), true); - if (timerContext != null) { - long durationInMs = metrics.getDurationInMs(timerContext.stop()); - metrics.updateRollbackMetrics(durationInMs, rollbackMetadata.getTotalFilesDeleted()); - } - return true; - } else { - LOG.warn("Cannot find instant " + commitInstantTime + " in the timeline, for rollback"); - return false; - } - } catch (Exception e) { - throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commitInstantTime, e); - } - } - - /** - * NOTE : This action requires all writers (ingest and compact) to a table to be stopped before proceeding. Revert - * the (inflight/committed) record changes for all commits after the provided instant time. 
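To make the distinction above concrete: rollback() undoes a single commit, while restoreToInstant() reverts everything written after the given instant. A minimal sketch, assuming an engine-specific client instance:

```java
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.client.AbstractHoodieWriteClient;

final class UndoSketch {
  @SuppressWarnings("rawtypes")
  static void undoWrites(AbstractHoodieWriteClient client, String instantTime) {
    // Undo one (inflight or completed) commit; returns false if the instant is not on the timeline.
    boolean rolledBack = client.rollback(instantTime);

    // Or revert the table to its state as of instantTime, discarding every later commit
    // (requires all writers, both ingestion and compaction, to be stopped).
    HoodieRestoreMetadata restoreMetadata = client.restoreToInstant(instantTime);
    System.out.println("rolled back: " + rolledBack + ", restored: " + (restoreMetadata != null));
  }
}
```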
- * - * @param instantTime Instant time to which restoration is requested - */ - public HoodieRestoreMetadata restoreToInstant(final String instantTime) throws HoodieRestoreException { - LOG.info("Begin restore to instant " + instantTime); - final String restoreInstantTime = HoodieActiveTimeline.createNewInstantTime(); - Timer.Context timerContext = metrics.getRollbackCtx(); - try { - HoodieTable table = createTable(config, hadoopConf); - HoodieRestoreMetadata restoreMetadata = table.restore(context, restoreInstantTime, instantTime); - if (timerContext != null) { - final long durationInMs = metrics.getDurationInMs(timerContext.stop()); - final long totalFilesDeleted = restoreMetadata.getHoodieRestoreMetadata().values().stream() - .flatMap(Collection::stream) - .mapToLong(HoodieRollbackMetadata::getTotalFilesDeleted) - .sum(); - metrics.updateRollbackMetrics(durationInMs, totalFilesDeleted); - } - return restoreMetadata; - } catch (Exception e) { - throw new HoodieRestoreException("Failed to restore to " + instantTime, e); - } - } - - /** - * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the - * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be - * cleaned) - */ - public HoodieCleanMetadata clean(String cleanInstantTime) throws HoodieIOException { - LOG.info("Cleaner started"); - final Timer.Context timerContext = metrics.getCleanCtx(); - HoodieCleanMetadata metadata = createTable(config, hadoopConf).clean(context, cleanInstantTime); - if (timerContext != null && metadata != null) { - long durationMs = metrics.getDurationInMs(timerContext.stop()); - metrics.updateCleanMetrics(durationMs, metadata.getTotalFilesDeleted()); - LOG.info("Cleaned " + metadata.getTotalFilesDeleted() + " files" - + " Earliest Retained Instant :" + metadata.getEarliestCommitToRetain() - + " cleanerElapsedMs" + durationMs); - } - return metadata; - } - - public HoodieCleanMetadata clean() { - return clean(HoodieActiveTimeline.createNewInstantTime()); - } - - /** - * Provides a new commit time for a write operation (insert/update/delete). - */ - public String startCommit() { - // NOTE : Need to ensure that rollback is done before a new commit is started - if (rollbackPending) { - // Only rollback pending commit/delta-commits. Do not touch compaction commits - rollbackPendingCommits(); - } - String instantTime = HoodieActiveTimeline.createNewInstantTime(); - startCommitWithTime(instantTime); - return instantTime; - } - - /** - * Provides a new commit time for a write operation (insert/update/delete). - * - * @param instantTime Instant time to be generated - */ - public void startCommitWithTime(String instantTime) { - HoodieTableMetaClient metaClient = createMetaClient(true); - startCommitWithTime(instantTime, metaClient.getCommitActionType(), metaClient); - } - - /** - * Completes a new commit time for a write operation (insert/update/delete) with specified action. - */ - public void startCommitWithTime(String instantTime, String actionType) { - HoodieTableMetaClient metaClient = createMetaClient(true); - startCommitWithTime(instantTime, actionType, metaClient); - } - - /** - * Completes a new commit time for a write operation (insert/update/delete) with specified action. 
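The clean API above returns a HoodieCleanMetadata summarizing what was removed. A short hedged usage sketch (concrete client type assumed):

```java
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.client.AbstractHoodieWriteClient;

final class CleanSketch {
  @SuppressWarnings("rawtypes")
  static void cleanNow(AbstractHoodieWriteClient client) {
    // Triggers cleaning under a freshly generated instant time.
    HoodieCleanMetadata cleanMetadata = client.clean();
    if (cleanMetadata != null) {
      System.out.println("Cleaned " + cleanMetadata.getTotalFilesDeleted()
          + " files, earliest retained instant: " + cleanMetadata.getEarliestCommitToRetain());
    }
  }
}
```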
- */ - private void startCommitWithTime(String instantTime, String actionType, HoodieTableMetaClient metaClient) { - // NOTE : Need to ensure that rollback is done before a new commit is started - if (rollbackPending) { - // Only rollback inflight commit/delta-commits. Do not touch compaction commits - rollbackPendingCommits(); - } - startCommit(instantTime, actionType, metaClient); - } - - private void startCommit(String instantTime, String actionType, HoodieTableMetaClient metaClient) { - LOG.info("Generate a new instant time: " + instantTime + " action: " + actionType); - // if there are pending compactions, their instantTime must not be greater than that of this instant time - metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().ifPresent(latestPending -> - ValidationUtils.checkArgument( - HoodieTimeline.compareTimestamps(latestPending.getTimestamp(), HoodieTimeline.LESSER_THAN, instantTime), - "Latest pending compaction instant time must be earlier than this instant time. Latest Compaction :" - + latestPending + ", Ingesting at " + instantTime)); - metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.REQUESTED, actionType, - instantTime)); - } - - /** - * Schedules a new compaction instant. - * - * @param extraMetadata Extra Metadata to be stored - */ - public Option scheduleCompaction(Option> extraMetadata) throws HoodieIOException { - String instantTime = HoodieActiveTimeline.createNewInstantTime(); - return scheduleCompactionAtInstant(instantTime, extraMetadata) ? Option.of(instantTime) : Option.empty(); - } - - /** - * Schedules a new compaction instant with passed-in instant time. - * - * @param instantTime Compaction Instant Time - * @param extraMetadata Extra Metadata to be stored - */ - public boolean scheduleCompactionAtInstant(String instantTime, Option> extraMetadata) throws HoodieIOException { - LOG.info("Scheduling compaction at instant time :" + instantTime); - Option plan = createTable(config, hadoopConf) - .scheduleCompaction(context, instantTime, extraMetadata); - return plan.isPresent(); - } - - /** - * Performs Compaction for the workload stored in instant-time. - * - * @param compactionInstantTime Compaction Instant Time - * @return Collection of WriteStatus to inspect errors and counts - */ - public O compact(String compactionInstantTime) { - return compact(compactionInstantTime, config.shouldAutoCommit()); - } - - /** - * Commit a compaction operation. Allow passing additional meta-data to be stored in commit instant file. - * - * @param compactionInstantTime Compaction Instant Time - * @param writeStatuses Collection of WriteStatus to inspect errors and counts - * @param extraMetadata Extra Metadata to be stored - */ - public abstract void commitCompaction(String compactionInstantTime, O writeStatuses, - Option> extraMetadata) throws IOException; - - /** - * Commit Compaction and track metrics. - */ - protected abstract void completeCompaction(HoodieCommitMetadata metadata, O writeStatuses, - HoodieTable table, String compactionCommitTime); - - - /** - * Rollback failed compactions. 
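For the compaction scheduling APIs above, a hedged usage sketch: scheduleCompaction() yields the planned compaction instant only if a plan was generated, and compact() then executes it (whether it auto-commits depends on configuration). The concrete client type is an engine-specific assumption.

```java
import org.apache.hudi.client.AbstractHoodieWriteClient;
import org.apache.hudi.common.util.Option;

final class CompactionSketch {
  @SuppressWarnings({"rawtypes", "unchecked"})
  static void compactIfNeeded(AbstractHoodieWriteClient client) {
    Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
    if (compactionInstant.isPresent()) {
      // The returned write statuses are the engine-specific output type (O).
      Object writeStatuses = client.compact(compactionInstant.get());
      System.out.println("Compacted at " + compactionInstant.get() + ": " + (writeStatuses != null));
    }
  }
}
```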
Inflight rollbacks for compactions revert the .inflight file to the .requested file - * - * @param inflightInstant Inflight Compaction Instant - * @param table Hoodie Table - */ - public void rollbackInflightCompaction(HoodieInstant inflightInstant, HoodieTable table) { - table.rollback(context, HoodieActiveTimeline.createNewInstantTime(), inflightInstant, false); - table.getActiveTimeline().revertCompactionInflightToRequested(inflightInstant); - } - - /** - * Cleanup all pending commits. - */ - private void rollbackPendingCommits() { - HoodieTable table = createTable(config, hadoopConf); - HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); - List commits = inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - for (String commit : commits) { - if (HoodieTimeline.compareTimestamps(commit, HoodieTimeline.LESSER_THAN_OR_EQUALS, - HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) { - rollBackInflightBootstrap(); - break; - } else { - rollback(commit); - } - } - } - - /** - * Ensures compaction instant is in expected state and performs Compaction for the workload stored in instant-time. - * - * @param compactionInstantTime Compaction Instant Time - * @return Collection of Write Status - */ - protected abstract O compact(String compactionInstantTime, boolean shouldComplete); - - /** - * Performs a compaction operation on a table, serially before or after an insert/upsert action. - */ - protected Option inlineCompact(Option> extraMetadata) { - Option compactionInstantTimeOpt = scheduleCompaction(extraMetadata); - compactionInstantTimeOpt.ifPresent(compactionInstantTime -> { - // inline compaction should auto commit as the user is never given control - compact(compactionInstantTime, true); - }); - return compactionInstantTimeOpt; - } - - /** - * Finalize Write operation. - * - * @param table HoodieTable - * @param instantTime Instant Time - * @param stats Hoodie Write Stat - */ - protected void finalizeWrite(HoodieTable table, String instantTime, List stats) { - try { - final Timer.Context finalizeCtx = metrics.getFinalizeCtx(); - table.finalizeWrite(context, instantTime, stats); - if (finalizeCtx != null) { - Option durationInMs = Option.of(metrics.getDurationInMs(finalizeCtx.stop())); - durationInMs.ifPresent(duration -> { - LOG.info("Finalize write elapsed time (milliseconds): " + duration); - metrics.updateFinalizeWriteMetrics(duration, stats.size()); - }); - } - } catch (HoodieIOException ioe) { - throw new HoodieCommitException("Failed to complete commit " + instantTime + " due to finalize errors.", ioe); - } - } - - public HoodieMetrics getMetrics() { - return metrics; - } - - public HoodieIndex getIndex() { - return index; - } - - /** - * Get HoodieTable and init {@link Timer.Context}. - * - * @param operationType write operation type - * @param instantTime current inflight instant time - * @return HoodieTable - */ - protected abstract HoodieTable getTableAndInitCtx(WriteOperationType operationType, String instantTime); - - /** - * Sets write schema from last instant since deletes may not have schema set in the config. 
- */ - protected void setWriteSchemaForDeletes(HoodieTableMetaClient metaClient) { - try { - HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); - Option lastInstant = - activeTimeline.filterCompletedInstants().filter(s -> s.getAction().equals(metaClient.getCommitActionType())) - .lastInstant(); - if (lastInstant.isPresent()) { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - activeTimeline.getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class); - if (commitMetadata.getExtraMetadata().containsKey(HoodieCommitMetadata.SCHEMA_KEY)) { - config.setSchema(commitMetadata.getExtraMetadata().get(HoodieCommitMetadata.SCHEMA_KEY)); - } else { - throw new HoodieIOException("Latest commit does not have any schema in commit metadata"); - } - } else { - throw new HoodieIOException("Deletes issued without any prior commits"); - } - } catch (IOException e) { - throw new HoodieIOException("IOException thrown while reading last commit metadata", e); - } - } - - @Override - public void close() { - // release AsyncCleanerService - AsyncCleanerService.forceShutdown(asyncCleanerService); - asyncCleanerService = null; - - // Stop timeline-server if running - super.close(); - // Calling this here releases any resources used by your index, so make sure to finish any related operations - // before this point - this.index.close(); - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AsyncCleanerService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AsyncCleanerService.java deleted file mode 100644 index e8016c957be65..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AsyncCleanerService.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.client; - -import org.apache.hudi.async.HoodieAsyncService; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieException; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -/** - * Clean service running concurrently with write operation. 
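The AsyncCleanerService whose body follows wraps the clean call in a CompletableFuture running on a single-thread executor, so cleaning proceeds concurrently with the write and the commit path simply waits for it. A standalone sketch of that pattern, deliberately independent of Hudi's classes:

```java
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Standalone illustration of the "fire the cleaner async, wait at commit time" pattern.
final class AsyncWorkSketch {
  private final ExecutorService executor = Executors.newSingleThreadExecutor();
  private CompletableFuture<Boolean> pending;

  void start(Runnable cleanAction) {
    // Kick off the background work; the future completes when the action finishes.
    pending = CompletableFuture.supplyAsync(() -> {
      cleanAction.run();
      return true;
    }, executor);
  }

  void waitForCompletion() {
    if (pending != null) {
      pending.join(); // analogous to AsyncCleanerService.waitForCompletion(...)
    }
  }

  void forceShutdown() {
    executor.shutdownNow(); // analogous to forceShutdown(...): stop without waiting
  }
}
```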
- */ -class AsyncCleanerService extends HoodieAsyncService { - - private static final Logger LOG = LogManager.getLogger(AsyncCleanerService.class); - - private final AbstractHoodieWriteClient writeClient; - private final String cleanInstantTime; - private final transient ExecutorService executor = Executors.newSingleThreadExecutor(); - - protected AsyncCleanerService(AbstractHoodieWriteClient writeClient, String cleanInstantTime) { - this.writeClient = writeClient; - this.cleanInstantTime = cleanInstantTime; - } - - @Override - protected Pair startService() { - return Pair.of(CompletableFuture.supplyAsync(() -> { - writeClient.clean(cleanInstantTime); - return true; - }), executor); - } - - public static AsyncCleanerService startAsyncCleaningIfEnabled(AbstractHoodieWriteClient writeClient, - String instantTime) { - AsyncCleanerService asyncCleanerService = null; - if (writeClient.getConfig().isAutoClean() && writeClient.getConfig().isAsyncClean()) { - LOG.info("Auto cleaning is enabled. Running cleaner async to write operation"); - asyncCleanerService = new AsyncCleanerService(writeClient, instantTime); - asyncCleanerService.start(null); - } else { - LOG.info("Auto cleaning is not enabled. Not running cleaner now"); - } - return asyncCleanerService; - } - - public static void waitForCompletion(AsyncCleanerService asyncCleanerService) { - if (asyncCleanerService != null) { - LOG.info("Waiting for async cleaner to finish"); - try { - asyncCleanerService.waitForShutdown(); - } catch (Exception e) { - throw new HoodieException("Error waiting for async cleaning to finish", e); - } - } - } - - public static void forceShutdown(AsyncCleanerService asyncCleanerService) { - if (asyncCleanerService != null) { - LOG.info("Shutting down async cleaner"); - asyncCleanerService.shutdown(true); - } - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseClusterer.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseClusterer.java new file mode 100644 index 0000000000000..648ce805b0825 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseClusterer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; + +import java.io.IOException; +import java.io.Serializable; + +/** + * Client will run one round of clustering. 
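BaseClusterer (whose body follows) and the analogous BaseCompactor only fix a small contract: hold a write client and run one round of work for a given instant. Below is a hypothetical, engine-agnostic skeleton of a subclass, using raw generics purely for illustration; real engines provide their own concrete implementations wired to their write client.

```java
import java.io.IOException;

import org.apache.hudi.client.BaseClusterer;
import org.apache.hudi.client.BaseHoodieWriteClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;

// Hypothetical skeleton only: records which instant would be clustered instead of doing work.
@SuppressWarnings("rawtypes")
class LoggingClusterer extends BaseClusterer {

  LoggingClusterer(BaseHoodieWriteClient writeClient) {
    super(writeClient);
  }

  @Override
  public void cluster(HoodieInstant instant) throws IOException {
    // A real implementation would execute the clustering plan for this instant
    // via the write client held in `clusteringClient`.
    System.out.println("Would run clustering for instant " + instant.getTimestamp());
  }
}
```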
+ */ +public abstract class BaseClusterer implements Serializable { + + private static final long serialVersionUID = 1L; + + protected transient BaseHoodieWriteClient clusteringClient; + + public BaseClusterer(BaseHoodieWriteClient clusteringClient) { + this.clusteringClient = clusteringClient; + } + + /** + * Run clustering for the instant. + * @param instant + * @throws IOException + */ + public abstract void cluster(HoodieInstant instant) throws IOException; + + /** + * Update the write client used by async clustering. + * @param writeClient + */ + public void updateWriteClient(BaseHoodieWriteClient writeClient) { + this.clusteringClient = writeClient; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseCompactor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseCompactor.java new file mode 100644 index 0000000000000..88737dbcf1d7e --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseCompactor.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; + +import java.io.IOException; +import java.io.Serializable; + +/** + * Run one round of compaction. + */ +public abstract class BaseCompactor implements Serializable { + + private static final long serialVersionUID = 1L; + + protected transient BaseHoodieWriteClient compactionClient; + + public BaseCompactor(BaseHoodieWriteClient compactionClient) { + this.compactionClient = compactionClient; + } + + public abstract void compact(HoodieInstant instant) throws IOException; + + public void updateWriteClient(BaseHoodieWriteClient writeClient) { + this.compactionClient = writeClient; + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java new file mode 100644 index 0000000000000..b41747d83a85e --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.client.embedded.EmbeddedTimelineServerHelper; +import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.client.heartbeat.HoodieHeartbeatClient; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; + +/** + * Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs) Also, manages + * embedded timeline-server if enabled. + */ +public abstract class BaseHoodieClient implements Serializable, AutoCloseable { + + private static final Logger LOG = LogManager.getLogger(BaseHoodieClient.class); + + protected final transient FileSystem fs; + protected final transient HoodieEngineContext context; + protected final transient Configuration hadoopConf; + protected final HoodieWriteConfig config; + protected final String basePath; + protected final HoodieHeartbeatClient heartbeatClient; + + /** + * Timeline Server has the same lifetime as that of Client. Any operations done on the same timeline service will be + * able to take advantage of the cached file-system view. New completed actions will be synced automatically in an + * incremental fashion. + */ + private transient Option timelineServer; + private final boolean shouldStopTimelineServer; + + protected BaseHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { + this(context, clientConfig, Option.empty()); + } + + protected BaseHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig, + Option timelineServer) { + this.hadoopConf = context.getHadoopConf().get(); + this.fs = FSUtils.getFs(clientConfig.getBasePath(), hadoopConf); + this.context = context; + this.basePath = clientConfig.getBasePath(); + this.config = clientConfig; + this.timelineServer = timelineServer; + shouldStopTimelineServer = !timelineServer.isPresent(); + this.heartbeatClient = new HoodieHeartbeatClient(this.fs, this.basePath, + clientConfig.getHoodieClientHeartbeatIntervalInMs(), clientConfig.getHoodieClientHeartbeatTolerableMisses()); + startEmbeddedServerView(); + initWrapperFSMetrics(); + } + + /** + * Releases any resources used by the client. 
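Since BaseHoodieClient is AutoCloseable and close() stops the embedded timeline server it owns, client instances are most safely scoped with try-with-resources. A hypothetical shape only; how the concrete engine client is constructed is outside this diff:

```java
import java.util.function.Supplier;

import org.apache.hudi.client.BaseHoodieWriteClient;

final class ClientScopeSketch {
  @SuppressWarnings("rawtypes")
  static void withClient(Supplier<BaseHoodieWriteClient> clientFactory) {
    try (BaseHoodieWriteClient client = clientFactory.get()) {
      String instantTime = client.startCommit();
      // ... perform writes / table services against instantTime ...
    } // close() stops the owned embedded timeline server and releases client resources
  }
}
```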
+ */ + @Override + public void close() { + stopEmbeddedServerView(true); + this.context.setJobStatus("", ""); + } + + private synchronized void stopEmbeddedServerView(boolean resetViewStorageConfig) { + if (timelineServer.isPresent() && shouldStopTimelineServer) { + // Stop only if owner + LOG.info("Stopping Timeline service !!"); + timelineServer.get().stop(); + } + + timelineServer = Option.empty(); + // Reset Storage Config to Client specified config + if (resetViewStorageConfig) { + config.resetViewStorageConfig(); + } + } + + private synchronized void startEmbeddedServerView() { + if (config.isEmbeddedTimelineServerEnabled()) { + if (!timelineServer.isPresent()) { + // Run Embedded Timeline Server + try { + timelineServer = EmbeddedTimelineServerHelper.createEmbeddedTimelineService(context, config); + } catch (IOException e) { + LOG.warn("Unable to start timeline service. Proceeding as if embedded server is disabled", e); + stopEmbeddedServerView(false); + } + } else { + LOG.info("Timeline Server already running. Not restarting the service"); + } + } else { + LOG.info("Embedded Timeline Server is disabled. Not starting timeline service"); + } + } + + public HoodieWriteConfig getConfig() { + return config; + } + + public HoodieEngineContext getEngineContext() { + return context; + } + + protected void initWrapperFSMetrics() { + // no-op. + } + + protected HoodieTableMetaClient createMetaClient(boolean loadActiveTimelineOnLoad) { + return HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(config.getBasePath()) + .setLoadActiveTimelineOnLoad(loadActiveTimelineOnLoad).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()) + .setProperties(config.getProps()).build(); + } + + public Option getTimelineServer() { + return timelineServer; + } + + public HoodieHeartbeatClient getHeartbeatClient() { + return heartbeatClient; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java new file mode 100644 index 0000000000000..b4958f5692db4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java @@ -0,0 +1,1723 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client; + +import org.apache.hudi.async.AsyncArchiveService; +import org.apache.hudi.async.AsyncCleanerService; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieIndexCommitMetadata; +import org.apache.hudi.avro.model.HoodieIndexPlan; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.callback.HoodieWriteCommitCallback; +import org.apache.hudi.callback.common.HoodieWriteCommitCallbackMessage; +import org.apache.hudi.callback.util.HoodieCommitCallbackFactory; +import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.client.heartbeat.HeartbeatUtils; +import org.apache.hudi.client.transaction.TransactionManager; +import org.apache.hudi.client.utils.TransactionUtils; +import org.apache.hudi.common.HoodiePendingRollbackInfo; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.TableServiceType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstant.State; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CleanerUtils; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.exception.HoodieRestoreException; +import org.apache.hudi.exception.HoodieRollbackException; +import org.apache.hudi.exception.HoodieSavepointException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.action.InternalSchemaChangeApplier; +import org.apache.hudi.internal.schema.action.TableChange; +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; +import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; 
+import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils; +import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; +import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.metrics.HoodieMetrics; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.rollback.RollbackUtils; +import org.apache.hudi.table.action.savepoint.SavepointHelpers; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.table.upgrade.SupportsUpgradeDowngrade; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; + +import com.codahale.metrics.Timer; +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY; + +/** + * Abstract Write Client providing functionality for performing commit, index updates and rollback + * Reused for regular write operations like upsert/insert/bulk-insert.. as well as bootstrap + * + * @param Sub type of HoodieRecordPayload + * @param Type of inputs + * @param Type of keys + * @param Type of outputs + */ +public abstract class BaseHoodieWriteClient extends BaseHoodieClient + implements RunsTableService { + + protected static final String LOOKUP_STR = "lookup"; + private static final long serialVersionUID = 1L; + private static final Logger LOG = LogManager.getLogger(BaseHoodieWriteClient.class); + + private final transient HoodieIndex index; + private final SupportsUpgradeDowngrade upgradeDowngradeHelper; + private transient WriteOperationType operationType; + private transient HoodieWriteCommitCallback commitCallback; + + protected final transient HoodieMetrics metrics; + protected transient Timer.Context writeTimer = null; + protected transient Timer.Context compactionTimer; + protected transient Timer.Context clusteringTimer; + + protected transient AsyncCleanerService asyncCleanerService; + protected transient AsyncArchiveService asyncArchiveService; + protected final TransactionManager txnManager; + protected Option>> lastCompletedTxnAndMetadata = Option.empty(); + protected Set pendingInflightAndRequestedInstants; + + /** + * Create a write client, with new hudi index. + * @param context HoodieEngineContext + * @param writeConfig instance of HoodieWriteConfig + * @param upgradeDowngradeHelper engine-specific instance of {@link SupportsUpgradeDowngrade} + */ + @Deprecated + public BaseHoodieWriteClient(HoodieEngineContext context, + HoodieWriteConfig writeConfig, + SupportsUpgradeDowngrade upgradeDowngradeHelper) { + this(context, writeConfig, Option.empty(), upgradeDowngradeHelper); + } + + /** + * Create a write client, allows to specify all parameters. 
+ * + * @param context HoodieEngineContext + * @param writeConfig instance of HoodieWriteConfig + * @param timelineService Timeline Service that runs as part of write client. + */ + @Deprecated + public BaseHoodieWriteClient(HoodieEngineContext context, + HoodieWriteConfig writeConfig, + Option timelineService, + SupportsUpgradeDowngrade upgradeDowngradeHelper) { + super(context, writeConfig, timelineService); + this.metrics = new HoodieMetrics(config); + this.index = createIndex(writeConfig); + this.txnManager = new TransactionManager(config, fs); + this.upgradeDowngradeHelper = upgradeDowngradeHelper; + } + + protected abstract HoodieIndex createIndex(HoodieWriteConfig writeConfig); + + public void setOperationType(WriteOperationType operationType) { + this.operationType = operationType; + } + + public WriteOperationType getOperationType() { + return this.operationType; + } + + /** + * Commit changes performed at the given instantTime marker. + */ + public boolean commit(String instantTime, O writeStatuses) { + return commit(instantTime, writeStatuses, Option.empty()); + } + + /** + * + * Commit changes performed at the given instantTime marker. + */ + public boolean commit(String instantTime, O writeStatuses, Option> extraMetadata) { + HoodieTableMetaClient metaClient = createMetaClient(false); + String actionType = metaClient.getCommitActionType(); + return commit(instantTime, writeStatuses, extraMetadata, actionType, Collections.emptyMap()); + } + + public abstract boolean commit(String instantTime, O writeStatuses, Option> extraMetadata, + String commitActionType, Map> partitionToReplacedFileIds); + + public boolean commitStats(String instantTime, List stats, Option> extraMetadata, + String commitActionType) { + return commitStats(instantTime, stats, extraMetadata, commitActionType, Collections.emptyMap()); + } + + public boolean commitStats(String instantTime, List stats, Option> extraMetadata, + String commitActionType, Map> partitionToReplaceFileIds) { + // Skip the empty commit if not allowed + if (!config.allowEmptyCommit() && stats.isEmpty()) { + return true; + } + LOG.info("Committing " + instantTime + " action " + commitActionType); + // Create a Hoodie table which encapsulated the commits and files visible + HoodieTable table = createTable(config, hadoopConf); + HoodieCommitMetadata metadata = CommitUtils.buildMetadata(stats, partitionToReplaceFileIds, + extraMetadata, operationType, config.getWriteSchema(), commitActionType); + HoodieInstant inflightInstant = new HoodieInstant(State.INFLIGHT, table.getMetaClient().getCommitActionType(), instantTime); + HeartbeatUtils.abortIfHeartbeatExpired(instantTime, table, heartbeatClient, config); + this.txnManager.beginTransaction(Option.of(inflightInstant), + lastCompletedTxnAndMetadata.isPresent() ? Option.of(lastCompletedTxnAndMetadata.get().getLeft()) : Option.empty()); + try { + preCommit(inflightInstant, metadata); + commit(table, commitActionType, instantTime, metadata, stats); + // already within lock, and so no lock requried for archival + postCommit(table, metadata, instantTime, extraMetadata, false); + LOG.info("Committed " + instantTime); + releaseResources(); + } catch (IOException e) { + throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime, e); + } finally { + this.txnManager.endTransaction(Option.of(inflightInstant)); + } + + // We don't want to fail the commit if hoodie.fail.writes.on.inline.table.service.exception is false. 
We catch warn if false + try { + // do this outside of lock since compaction, clustering can be time taking and we don't need a lock for the entire execution period + runTableServicesInline(table, metadata, extraMetadata); + } catch (Exception e) { + if (config.isFailOnInlineTableServiceExceptionEnabled()) { + throw e; + } + LOG.warn("Inline compaction or clustering failed with exception: " + e.getMessage() + + ". Moving further since \"hoodie.fail.writes.on.inline.table.service.exception\" is set to false."); + } + + emitCommitMetrics(instantTime, metadata, commitActionType); + + // callback if needed. + if (config.writeCommitCallbackOn()) { + if (null == commitCallback) { + commitCallback = HoodieCommitCallbackFactory.create(config); + } + commitCallback.call(new HoodieWriteCommitCallbackMessage(instantTime, config.getTableName(), config.getBasePath(), stats)); + } + return true; + } + + protected void commit(HoodieTable table, String commitActionType, String instantTime, HoodieCommitMetadata metadata, + List stats) throws IOException { + LOG.info("Committing " + instantTime + " action " + commitActionType); + HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); + // Finalize write + finalizeWrite(table, instantTime, stats); + // do save internal schema to support Implicitly add columns in write process + if (!metadata.getExtraMetadata().containsKey(SerDeHelper.LATEST_SCHEMA) + && metadata.getExtraMetadata().containsKey(SCHEMA_KEY) && table.getConfig().getSchemaEvolutionEnable()) { + saveInternalSchema(table, instantTime, metadata); + } + // update Metadata table + writeTableMetadata(table, instantTime, commitActionType, metadata); + activeTimeline.saveAsComplete(new HoodieInstant(true, commitActionType, instantTime), + Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + } + + // Save internal schema + private void saveInternalSchema(HoodieTable table, String instantTime, HoodieCommitMetadata metadata) { + TableSchemaResolver schemaUtil = new TableSchemaResolver(table.getMetaClient()); + String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElse(""); + FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(table.getMetaClient()); + if (!historySchemaStr.isEmpty() || Boolean.parseBoolean(config.getString(HoodieCommonConfig.RECONCILE_SCHEMA.key()))) { + InternalSchema internalSchema; + Schema avroSchema = HoodieAvroUtils.createHoodieWriteSchema(config.getSchema(), config.allowOperationMetadataField()); + if (historySchemaStr.isEmpty()) { + internalSchema = AvroInternalSchemaConverter.convert(avroSchema); + internalSchema.setSchemaId(Long.parseLong(instantTime)); + } else { + internalSchema = InternalSchemaUtils.searchSchema(Long.parseLong(instantTime), + SerDeHelper.parseSchemas(historySchemaStr)); + } + InternalSchema evolvedSchema = AvroSchemaEvolutionUtils.reconcileSchema(avroSchema, internalSchema); + if (evolvedSchema.equals(internalSchema)) { + metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, SerDeHelper.toJson(evolvedSchema)); + //TODO save history schema by metaTable + schemasManager.persistHistorySchemaStr(instantTime, historySchemaStr.isEmpty() ? 
SerDeHelper.inheritSchemas(evolvedSchema, "") : historySchemaStr); + } else { + evolvedSchema.setSchemaId(Long.parseLong(instantTime)); + String newSchemaStr = SerDeHelper.toJson(evolvedSchema); + metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, newSchemaStr); + schemasManager.persistHistorySchemaStr(instantTime, SerDeHelper.inheritSchemas(evolvedSchema, historySchemaStr)); + } + // update SCHEMA_KEY + metadata.addMetadata(SCHEMA_KEY, AvroInternalSchemaConverter.convert(evolvedSchema, avroSchema.getName()).toString()); + } + } + + protected abstract HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf); + + void emitCommitMetrics(String instantTime, HoodieCommitMetadata metadata, String actionType) { + if (writeTimer != null) { + long durationInMs = metrics.getDurationInMs(writeTimer.stop()); + // instantTime could be a non-standard value, so use `parseDateFromInstantTimeSafely` + // e.g. INIT_INSTANT_TS, METADATA_BOOTSTRAP_INSTANT_TS and FULL_BOOTSTRAP_INSTANT_TS in HoodieTimeline + HoodieActiveTimeline.parseDateFromInstantTimeSafely(instantTime).ifPresent(parsedInstant -> + metrics.updateCommitMetrics(parsedInstant.getTime(), durationInMs, metadata, actionType) + ); + writeTimer = null; + } + } + + /** + * Any pre-commit actions like conflict resolution goes here. + * @param inflightInstant instant of inflight operation. + * @param metadata commit metadata for which pre commit is being invoked. + */ + protected void preCommit(HoodieInstant inflightInstant, HoodieCommitMetadata metadata) { + // To be overridden by specific engines to perform conflict resolution if any. + } + + /** + * Write the HoodieCommitMetadata to metadata table if available. + * @param table {@link HoodieTable} of interest. + * @param instantTime instant time of the commit. + * @param actionType action type of the commit. + * @param metadata instance of {@link HoodieCommitMetadata}. + */ + protected void writeTableMetadata(HoodieTable table, String instantTime, String actionType, HoodieCommitMetadata metadata) { + context.setJobStatus(this.getClass().getSimpleName(), "Committing to metadata table: " + config.getTableName()); + table.getMetadataWriter(instantTime).ifPresent(w -> ((HoodieTableMetadataWriter) w).update(metadata, instantTime, + table.isTableServiceAction(actionType, instantTime))); + } + + /** + * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication. + * + * @param hoodieRecords Input Hoodie records. + * @return A subset of hoodieRecords, with existing records filtered out. + */ + public abstract I filterExists(I hoodieRecords); + + /** + * Main API to run bootstrap to hudi. + */ + public void bootstrap(Option> extraMetadata) { + // TODO : MULTIWRITER -> check if failed bootstrap files can be cleaned later + if (config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { + throw new HoodieException("Cannot bootstrap the table in multi-writer mode"); + } + HoodieTable table = initTable(WriteOperationType.UPSERT, Option.ofNullable(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)); + rollbackFailedBootstrap(); + table.bootstrap(context, extraMetadata); + } + + /** + * Main API to rollback failed bootstrap. 
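A hedged call sketch for the bootstrap entry point above; it assumes single-writer mode (the method throws in multi-writer mode) and that the bootstrap source path and related settings are already present on the client's write config:

```java
import org.apache.hudi.client.BaseHoodieWriteClient;
import org.apache.hudi.common.util.Option;

final class BootstrapSketch {
  @SuppressWarnings("rawtypes")
  static void bootstrapTable(BaseHoodieWriteClient client) {
    // One-time bootstrap of an existing table into Hudi; extra commit metadata is optional.
    client.bootstrap(Option.empty());
  }
}
```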
+ */ + protected void rollbackFailedBootstrap() { + LOG.info("Rolling back pending bootstrap if present"); + HoodieTable table = createTable(config, hadoopConf); + HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); + Option instant = Option.fromJavaOptional( + inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp).findFirst()); + if (instant.isPresent() && HoodieTimeline.compareTimestamps(instant.get(), HoodieTimeline.LESSER_THAN_OR_EQUALS, + HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) { + LOG.info("Found pending bootstrap instants. Rolling them back"); + table.rollbackBootstrap(context, HoodieActiveTimeline.createNewInstantTime()); + LOG.info("Finished rolling back pending bootstrap"); + } + } + + /** + * Upsert a batch of new records into Hoodie table at the supplied instantTime. + * + * @param records hoodieRecords to upsert + * @param instantTime Instant time of the commit + * @return WriteStatus to inspect errors and counts + */ + public abstract O upsert(I records, final String instantTime); + + /** + * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. + *

+ * This implementation requires that the input records are already tagged, and de-duped if needed. + * + * @param preppedRecords Prepared HoodieRecords to upsert + * @param instantTime Instant time of the commit + * @return Collection of WriteStatus to inspect errors and counts + */ + public abstract O upsertPreppedRecords(I preppedRecords, final String instantTime); + + /** + * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal writes. + *

+ * This implementation skips the index check and is able to leverage benefits such as small file handling/blocking + * alignment, as with upsert(), by profiling the workload + * + * @param records HoodieRecords to insert + * @param instantTime Instant time of the commit + * @return Collection of WriteStatus to inspect errors and counts + */ + public abstract O insert(I records, final String instantTime); + + /** + * Inserts the given prepared records into the Hoodie table, at the supplied instantTime. + *

+ * This implementation skips the index check, skips de-duping and is able to leverage benefits such as small file + * handling/blocking alignment, as with insert(), by profiling the workload. The prepared HoodieRecords should be + * de-duped if needed. + * + * @param preppedRecords HoodieRecords to insert + * @param instantTime Instant time of the commit + * @return Collection of WriteStatus to inspect errors and counts + */ + public abstract O insertPreppedRecords(I preppedRecords, final String instantTime); + + /** + * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie + * table for the very first time (e.g: converting an existing table to Hoodie). + *

+ * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control + * the numbers of files with less memory compared to the {@link BaseHoodieWriteClient#insert(I, String)} + * + * @param records HoodieRecords to insert + * @param instantTime Instant time of the commit + * @return Collection of WriteStatus to inspect errors and counts + */ + public abstract O bulkInsert(I records, final String instantTime); + + /** + * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie + * table for the very first time (e.g: converting an existing table to Hoodie). + *

+ * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control + * the numbers of files with less memory compared to the {@link BaseHoodieWriteClient#insert(I, String)}. Optionally + * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See + * {@link BulkInsertPartitioner}. + * + * @param records HoodieRecords to insert + * @param instantTime Instant time of the commit + * @param userDefinedBulkInsertPartitioner If specified then it will be used to partition input records before they are inserted + * into hoodie. + * @return Collection of WriteStatus to inspect errors and counts + */ + public abstract O bulkInsert(I records, final String instantTime, + Option userDefinedBulkInsertPartitioner); + + /** + * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie + * table for the very first time (e.g: converting an existing table to Hoodie). The input records should contain no + * duplicates if needed. + *

+ * This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control + * the numbers of files with less memory compared to the {@link BaseHoodieWriteClient#insert(I, String)}. Optionally + * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See + * {@link BulkInsertPartitioner}. + * + * @param preppedRecords HoodieRecords to insert + * @param instantTime Instant time of the commit + * @param bulkInsertPartitioner If specified then it will be used to partition input records before they are inserted + * into hoodie. + * @return Collection of WriteStatus to inspect errors and counts + */ + public abstract O bulkInsertPreppedRecords(I preppedRecords, final String instantTime, + Option bulkInsertPartitioner); + + /** + * Deletes a list of {@link HoodieKey}s from the Hoodie table, at the supplied instantTime {@link HoodieKey}s will be + * de-duped and non existent keys will be removed before deleting. + * + * @param keys {@link List} of {@link HoodieKey}s to be deleted + * @param instantTime Commit time handle + * @return Collection of WriteStatus to inspect errors and counts + */ + public abstract O delete(K keys, final String instantTime); + + /** + * Common method containing steps to be performed before write (upsert/insert/... + * @param instantTime + * @param writeOperationType + * @param metaClient + */ + public void preWrite(String instantTime, WriteOperationType writeOperationType, + HoodieTableMetaClient metaClient) { + setOperationType(writeOperationType); + this.lastCompletedTxnAndMetadata = txnManager.isOptimisticConcurrencyControlEnabled() + ? TransactionUtils.getLastCompletedTxnInstantAndMetadata(metaClient) : Option.empty(); + this.pendingInflightAndRequestedInstants = TransactionUtils.getInflightAndRequestedInstants(metaClient); + this.pendingInflightAndRequestedInstants.remove(instantTime); + if (null == this.asyncCleanerService) { + this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this); + } else { + this.asyncCleanerService.start(null); + } + if (null == this.asyncArchiveService) { + this.asyncArchiveService = AsyncArchiveService.startAsyncArchiveIfEnabled(this); + } else { + this.asyncArchiveService.start(null); + } + } + + /** + * Common method containing steps to be performed after write (upsert/insert/..) operations including auto-commit. + * @param result Commit Action Result + * @param instantTime Instant Time + * @param hoodieTable Hoodie Table + * @return Write Status + */ + protected abstract O postWrite(HoodieWriteMetadata result, String instantTime, HoodieTable hoodieTable); + + /** + * Post Commit Hook. Derived classes use this method to perform post-commit processing + * + * @param table table to commit on + * @param metadata Commit Metadata corresponding to committed instant + * @param instantTime Instant Time + * @param extraMetadata Additional Metadata passed by user + * @param acquireLockForArchival true if lock has to be acquired for archival. false otherwise. + */ + protected void postCommit(HoodieTable table, HoodieCommitMetadata metadata, String instantTime, Option> extraMetadata, + boolean acquireLockForArchival) { + try { + // Delete the marker directory for the instant. 
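As a rough usage sketch (not part of this patch), the block below shows how the abstract write path declared above is typically driven from an engine-specific subclass; `SparkRDDWriteClient` and the record RDD are assumed to be configured elsewhere, and names outside this hunk come from the Hudi Spark client module.

```java
// Illustrative sketch only: one upsert round trip through an engine-specific
// subclass of BaseHoodieWriteClient (here the Spark client).
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;

import org.apache.spark.api.java.JavaRDD;

public class UpsertRoundTrip {

  static JavaRDD<WriteStatus> upsertOnce(SparkRDDWriteClient<HoodieAvroPayload> client,
                                         JavaRDD<HoodieRecord<HoodieAvroPayload>> records) {
    // startCommit() creates a new requested instant on the active timeline.
    String instantTime = client.startCommit();
    // upsert(I, instantTime) tags, indexes and writes the batch; with the default
    // auto-commit setting the client also completes the commit, after which the
    // postCommit hook above deletes markers and runs auto clean/archive.
    return client.upsert(records, instantTime);
  }
}
```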
+ WriteMarkersFactory.get(config.getMarkersType(), table, instantTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + autoCleanOnCommit(); + autoArchiveOnCommit(table, acquireLockForArchival); + } finally { + this.heartbeatClient.stop(instantTime); + } + } + + protected void runTableServicesInline(HoodieTable table, HoodieCommitMetadata metadata, Option> extraMetadata) { + if (!tableServicesEnabled(config)) { + return; + } + if (config.areAnyTableServicesExecutedInline() || config.areAnyTableServicesScheduledInline()) { + if (config.isMetadataTableEnabled()) { + table.getHoodieView().sync(); + } + // Do an inline compaction if enabled + if (config.inlineCompactionEnabled()) { + runAnyPendingCompactions(table); + metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT.key(), "true"); + inlineCompaction(extraMetadata); + } else { + metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT.key(), "false"); + } + + // if just inline schedule is enabled + if (!config.inlineCompactionEnabled() && config.scheduleInlineCompaction() + && !table.getActiveTimeline().getWriteTimeline().filterPendingCompactionTimeline().getInstants().findAny().isPresent()) { + // proceed only if there are no pending compactions + metadata.addMetadata(HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT.key(), "true"); + inlineScheduleCompaction(extraMetadata); + } + + // Do an inline clustering if enabled + if (config.inlineClusteringEnabled()) { + runAnyPendingClustering(table); + metadata.addMetadata(HoodieClusteringConfig.INLINE_CLUSTERING.key(), "true"); + inlineClustering(extraMetadata); + } else { + metadata.addMetadata(HoodieClusteringConfig.INLINE_CLUSTERING.key(), "false"); + } + + // if just inline schedule is enabled + if (!config.inlineClusteringEnabled() && config.scheduleInlineClustering() + && !table.getActiveTimeline().filterPendingReplaceTimeline().getInstants().findAny().isPresent()) { + // proceed only if there are no pending clustering + metadata.addMetadata(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING.key(), "true"); + inlineScheduleClustering(extraMetadata); + } + } + } + + protected void runAnyPendingCompactions(HoodieTable table) { + table.getActiveTimeline().getWriteTimeline().filterPendingCompactionTimeline().getInstants() + .forEach(instant -> { + LOG.info("Running previously failed inflight compaction at instant " + instant); + compact(instant.getTimestamp(), true); + }); + } + + protected void runAnyPendingClustering(HoodieTable table) { + table.getActiveTimeline().filterPendingReplaceTimeline().getInstants().forEach(instant -> { + Option> instantPlan = ClusteringUtils.getClusteringPlan(table.getMetaClient(), instant); + if (instantPlan.isPresent()) { + LOG.info("Running pending clustering at instant " + instantPlan.get().getLeft()); + cluster(instant.getTimestamp(), true); + } + }); + } + + protected void autoCleanOnCommit() { + if (!config.isAutoClean()) { + return; + } + + if (config.isAsyncClean()) { + LOG.info("Async cleaner has been spawned. Waiting for it to finish"); + AsyncCleanerService.waitForCompletion(asyncCleanerService); + LOG.info("Async cleaner has finished"); + } else { + LOG.info("Start to clean synchronously."); + // Do not reuse instantTime for clean as metadata table requires all changes to have unique instant timestamps. 
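A hedged configuration sketch for the inline table services consulted in `runTableServicesInline` above; the config keys are the ones referenced in this hunk, while the builder calls, table path and table name are placeholder assumptions.

```java
// Illustrative sketch only: enabling inline compaction (and explicitly leaving
// inline clustering off) via the same config keys runTableServicesInline reads.
import java.util.Properties;

import org.apache.hudi.config.HoodieClusteringConfig;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

public class InlineServicesConfig {

  static HoodieWriteConfig build() {
    Properties props = new Properties();
    props.setProperty(HoodieCompactionConfig.INLINE_COMPACT.key(), "true");     // compact after commits
    props.setProperty(HoodieClusteringConfig.INLINE_CLUSTERING.key(), "false"); // cluster out of band instead
    return HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hudi/example_table")   // placeholder base path
        .forTable("example_table")             // placeholder table name
        .withProps(props)
        .build();
  }
}
```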
+ clean(true); + } + } + + protected void autoArchiveOnCommit(HoodieTable table, boolean acquireLockForArchival) { + if (!config.isAutoArchive()) { + return; + } + + if (config.isAsyncArchive()) { + LOG.info("Async archiver has been spawned. Waiting for it to finish"); + AsyncArchiveService.waitForCompletion(asyncArchiveService); + LOG.info("Async archiver has finished"); + } else { + LOG.info("Start to archive synchronously."); + archive(table, acquireLockForArchival); + } + } + + /** + * Run any pending compactions. + */ + public void runAnyPendingCompactions() { + runAnyPendingCompactions(createTable(config, hadoopConf)); + } + + /** + * Create a savepoint based on the latest commit action on the timeline. + * + * @param user - User creating the savepoint + * @param comment - Comment for the savepoint + */ + public void savepoint(String user, String comment) { + HoodieTable table = createTable(config, hadoopConf); + if (table.getCompletedCommitsTimeline().empty()) { + throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty"); + } + + String latestCommit = table.getCompletedCommitsTimeline().lastInstant().get().getTimestamp(); + LOG.info("Savepointing latest commit " + latestCommit); + savepoint(latestCommit, user, comment); + } + + /** + * Savepoint a specific commit instant time. Latest version of data files as of the passed in instantTime + * will be referenced in the savepoint and will never be cleaned. The savepointed commit will never be rolledback or archived. + *

+ * This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be manually created and + * deleted. + *

+ * Savepoint should be on a commit that could not have been cleaned. + * + * @param instantTime - commit that should be savepointed + * @param user - User creating the savepoint + * @param comment - Comment for the savepoint + */ + public void savepoint(String instantTime, String user, String comment) { + HoodieTable table = createTable(config, hadoopConf); + table.savepoint(context, instantTime, user, comment); + } + + /** + * Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback and cleaner may + * clean up data files. + * + * @param savepointTime - delete the savepoint + * @return true if the savepoint was deleted successfully + */ + public void deleteSavepoint(String savepointTime) { + HoodieTable table = createTable(config, hadoopConf); + SavepointHelpers.deleteSavepoint(table, savepointTime); + } + + /** + * Restore the data to the savepoint. + * + * WARNING: This rolls back recent commits and deleted data files and also pending compactions after savepoint time. + * Queries accessing the files will mostly fail. This is expected to be a manual operation and no concurrent write or + * compaction is expected to be running + * + * @param savepointTime - savepoint time to rollback to + * @return true if the savepoint was restored to successfully + */ + public void restoreToSavepoint(String savepointTime) { + boolean initialMetadataTableIfNecessary = config.isMetadataTableEnabled(); + if (initialMetadataTableIfNecessary) { + try { + // Delete metadata table directly when users trigger savepoint rollback if mdt existed and beforeTimelineStarts + String metadataTableBasePathStr = HoodieTableMetadata.getMetadataTableBasePath(config.getBasePath()); + HoodieTableMetaClient mdtClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePathStr).build(); + // Same as HoodieTableMetadataUtil#processRollbackMetadata + HoodieInstant syncedInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, savepointTime); + // The instant required to sync rollback to MDT has been archived and the mdt syncing will be failed + // So that we need to delete the whole MDT here. + if (mdtClient.getCommitsTimeline().isBeforeTimelineStarts(syncedInstant.getTimestamp())) { + mdtClient.getFs().delete(new Path(metadataTableBasePathStr), true); + // rollbackToSavepoint action will try to bootstrap MDT at first but sync to MDT will fail at the current scenario. + // so that we need to disable metadata initialized here. 
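A usage sketch for the savepoint APIs above (illustrative, not part of the patch): the instant time, user and comment are placeholders, and the client is any engine-specific subclass of this class.

```java
// Illustrative sketch only: pin a commit with a savepoint, restore back to it if
// needed, then release it so clean/archive can move past it again.
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.model.HoodieAvroPayload;

public class SavepointFlow {

  static void guard(SparkRDDWriteClient<HoodieAvroPayload> client, String commitInstant) {
    // Never clean/archive the file versions referenced by this commit.
    client.savepoint(commitInstant, "ops-user", "before nightly backfill");

    // ... risky writes happen here ...

    // Manual recovery path: requires that no concurrent writer or compaction is running.
    client.restoreToSavepoint(commitInstant);

    // Drop the savepoint once it is no longer needed.
    client.deleteSavepoint(commitInstant);
  }
}
```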
+ initialMetadataTableIfNecessary = false; + } + } catch (Exception e) { + // Metadata directory does not exist + } + } + + HoodieTable table = initTable(WriteOperationType.UNKNOWN, Option.empty(), initialMetadataTableIfNecessary); + SavepointHelpers.validateSavepointPresence(table, savepointTime); + ValidationUtils.checkArgument(!config.shouldArchiveBeyondSavepoint(), "Restore is not supported when " + HoodieArchivalConfig.ARCHIVE_BEYOND_SAVEPOINT.key() + + " is enabled"); + restoreToInstant(savepointTime, initialMetadataTableIfNecessary); + SavepointHelpers.validateSavepointRestore(table, savepointTime); + } + + @Deprecated + public boolean rollback(final String commitInstantTime) throws HoodieRollbackException { + HoodieTable table = initTable(WriteOperationType.UNKNOWN, Option.empty()); + Option pendingRollbackInfo = getPendingRollbackInfo(table.getMetaClient(), commitInstantTime); + return rollback(commitInstantTime, pendingRollbackInfo, false); + } + + /** + * @Deprecated + * Rollback the inflight record changes with the given commit time. This + * will be removed in future in favor of {@link BaseHoodieWriteClient#restoreToInstant(String, boolean) + * + * @param commitInstantTime Instant time of the commit + * @param pendingRollbackInfo pending rollback instant and plan if rollback failed from previous attempt. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + * @throws HoodieRollbackException if rollback cannot be performed successfully + */ + @Deprecated + public boolean rollback(final String commitInstantTime, Option pendingRollbackInfo, boolean skipLocking) throws HoodieRollbackException { + LOG.info("Begin rollback of instant " + commitInstantTime); + final String rollbackInstantTime = pendingRollbackInfo.map(entry -> entry.getRollbackInstant().getTimestamp()).orElse(HoodieActiveTimeline.createNewInstantTime()); + final Timer.Context timerContext = this.metrics.getRollbackCtx(); + try { + HoodieTable table = createTable(config, hadoopConf); + Option commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants() + .filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime)) + .findFirst()); + if (commitInstantOpt.isPresent() || pendingRollbackInfo.isPresent()) { + LOG.info(String.format("Scheduling Rollback at instant time : %s " + + "(exists in active timeline: %s), with rollback plan: %s", + rollbackInstantTime, commitInstantOpt.isPresent(), pendingRollbackInfo.isPresent())); + Option rollbackPlanOption = pendingRollbackInfo.map(entry -> Option.of(entry.getRollbackPlan())) + .orElseGet(() -> table.scheduleRollback(context, rollbackInstantTime, commitInstantOpt.get(), false, config.shouldRollbackUsingMarkers())); + if (rollbackPlanOption.isPresent()) { + // There can be a case where the inflight rollback failed after the instant files + // are deleted for commitInstantTime, so that commitInstantOpt is empty as it is + // not present in the timeline. In such a case, the hoodie instant instance + // is reconstructed to allow the rollback to be reattempted, and the deleteInstants + // is set to false since they are already deleted. + // Execute rollback + HoodieRollbackMetadata rollbackMetadata = commitInstantOpt.isPresent() + ? 
table.rollback(context, rollbackInstantTime, commitInstantOpt.get(), true, skipLocking) + : table.rollback(context, rollbackInstantTime, new HoodieInstant( + true, rollbackPlanOption.get().getInstantToRollback().getAction(), commitInstantTime), + false, skipLocking); + if (timerContext != null) { + long durationInMs = metrics.getDurationInMs(timerContext.stop()); + metrics.updateRollbackMetrics(durationInMs, rollbackMetadata.getTotalFilesDeleted()); + } + return true; + } else { + throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commitInstantTime); + } + } else { + LOG.warn("Cannot find instant " + commitInstantTime + " in the timeline, for rollback"); + return false; + } + } catch (Exception e) { + throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commitInstantTime, e); + } + } + + /** + * NOTE : This action requires all writers (ingest and compact) to a table to be stopped before proceeding. Revert + * the (inflight/committed) record changes for all commits after the provided instant time. + * + * @param instantTime Instant time to which restoration is requested + */ + public HoodieRestoreMetadata restoreToInstant(final String instantTime, boolean initialMetadataTableIfNecessary) throws HoodieRestoreException { + LOG.info("Begin restore to instant " + instantTime); + final String restoreInstantTime = HoodieActiveTimeline.createNewInstantTime(); + Timer.Context timerContext = metrics.getRollbackCtx(); + try { + HoodieTable table = initTable(WriteOperationType.UNKNOWN, Option.of(restoreInstantTime), initialMetadataTableIfNecessary); + Option restorePlanOption = table.scheduleRestore(context, restoreInstantTime, instantTime); + if (restorePlanOption.isPresent()) { + HoodieRestoreMetadata restoreMetadata = table.restore(context, restoreInstantTime, instantTime); + if (timerContext != null) { + final long durationInMs = metrics.getDurationInMs(timerContext.stop()); + final long totalFilesDeleted = restoreMetadata.getHoodieRestoreMetadata().values().stream() + .flatMap(Collection::stream) + .mapToLong(HoodieRollbackMetadata::getTotalFilesDeleted) + .sum(); + metrics.updateRollbackMetrics(durationInMs, totalFilesDeleted); + } + return restoreMetadata; + } else { + throw new HoodieRestoreException("Failed to restore " + config.getBasePath() + " to commit " + instantTime); + } + } catch (Exception e) { + throw new HoodieRestoreException("Failed to restore to " + instantTime, e); + } + } + + /** + * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be + * cleaned) + */ + public HoodieCleanMetadata clean(String cleanInstantTime) throws HoodieIOException { + return clean(cleanInstantTime, true, false); + } + + /** + * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be + * cleaned) + * @param cleanInstantTime instant time for clean. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + * @return instance of {@link HoodieCleanMetadata}. 
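For reference, a sketch of invoking cleaning and archival explicitly rather than relying on the automatic post-commit runs; the no-arg `clean()` and `archive()` overloads appear a little further down in this hunk, and the concrete client type is an assumption.

```java
// Illustrative sketch only: manual housekeeping using the public clean()/archive()
// entry points. clean() may return null when table services are disabled.
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.model.HoodieAvroPayload;

public class ManualHousekeeping {

  static void run(SparkRDDWriteClient<HoodieAvroPayload> client) {
    HoodieCleanMetadata cleanMetadata = client.clean();
    if (cleanMetadata != null) {
      System.out.println("Cleaned " + cleanMetadata.getTotalFilesDeleted() + " files");
    }
    // Move old completed instants from the active to the archived timeline.
    client.archive();
  }
}
```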
+ */ + public HoodieCleanMetadata clean(String cleanInstantTime, boolean skipLocking) throws HoodieIOException { + return clean(cleanInstantTime, true, skipLocking); + } + + /** + * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be + * cleaned). This API provides the flexibility to schedule clean instant asynchronously via + * {@link BaseHoodieWriteClient#scheduleTableService(String, Option, TableServiceType)} and disable inline scheduling + * of clean. + * @param cleanInstantTime instant time for clean. + * @param scheduleInline true if needs to be scheduled inline. false otherwise. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + */ + public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline, boolean skipLocking) throws HoodieIOException { + if (!tableServicesEnabled(config)) { + return null; + } + final Timer.Context timerContext = metrics.getCleanCtx(); + CleanerUtils.rollbackFailedWrites(config.getFailedWritesCleanPolicy(), + HoodieTimeline.CLEAN_ACTION, () -> rollbackFailedWrites(skipLocking)); + + HoodieTable table = createTable(config, hadoopConf); + if (config.allowMultipleCleans() || !table.getActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().firstInstant().isPresent()) { + LOG.info("Cleaner started"); + // proceed only if multiple clean schedules are enabled or if there are no pending cleans. + if (scheduleInline) { + scheduleTableServiceInternal(cleanInstantTime, Option.empty(), TableServiceType.CLEAN); + table.getMetaClient().reloadActiveTimeline(); + } + } + + // Proceeds to execute any requested or inflight clean instances in the timeline + HoodieCleanMetadata metadata = table.clean(context, cleanInstantTime, skipLocking); + if (timerContext != null && metadata != null) { + long durationMs = metrics.getDurationInMs(timerContext.stop()); + metrics.updateCleanMetrics(durationMs, metadata.getTotalFilesDeleted()); + LOG.info("Cleaned " + metadata.getTotalFilesDeleted() + " files" + + " Earliest Retained Instant :" + metadata.getEarliestCommitToRetain() + + " cleanerElapsedMs" + durationMs); + } + return metadata; + } + + public HoodieCleanMetadata clean() { + return clean(false); + } + + /** + * Triggers clean for the table. This refers to Clean up any stale/old files/data lying around (either on file storage or index storage) based on the + * * configurations and CleaningPolicy used. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + * @return instance of {@link HoodieCleanMetadata}. + */ + public HoodieCleanMetadata clean(boolean skipLocking) { + return clean(HoodieActiveTimeline.createNewInstantTime(), skipLocking); + } + + /** + * Trigger archival for the table. This ensures that the number of commits do not explode + * and keep increasing unbounded over time. + * @param table table to commit on. + * @param acquireLockForArchival true if lock has to be acquired for archival. false otherwise. + */ + protected void archive(HoodieTable table, boolean acquireLockForArchival) { + if (!tableServicesEnabled(config)) { + return; + } + try { + // We cannot have unbounded commit files. 
Archive commits if we have to archive + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(config, table); + archiver.archiveIfRequired(context, acquireLockForArchival); + } catch (IOException ioe) { + throw new HoodieIOException("Failed to archive", ioe); + } + } + + /** + * Trigger archival for the table. This ensures that the number of commits do not explode + * and keep increasing unbounded over time. + */ + public void archive() { + // Create a Hoodie table which encapsulated the commits and files visible + HoodieTable table = createTable(config, hadoopConf); + archive(table, true); + } + + /** + * Provides a new commit time for a write operation (insert/update/delete). + */ + public String startCommit() { + HoodieTableMetaClient metaClient = createMetaClient(true); + return startCommit(metaClient.getCommitActionType(), metaClient); + } + + /** + * Provides a new commit time for a write operation (insert/update/delete/insert_overwrite/insert_overwrite_table) with specified action. + */ + public String startCommit(String actionType, HoodieTableMetaClient metaClient) { + CleanerUtils.rollbackFailedWrites(config.getFailedWritesCleanPolicy(), + HoodieTimeline.COMMIT_ACTION, () -> rollbackFailedWrites()); + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + startCommit(instantTime, actionType, metaClient); + return instantTime; + } + + /** + * Provides a new commit time for a write operation (insert/update/delete/insert_overwrite/insert_overwrite_table) without specified action. + * @param instantTime Instant time to be generated + */ + public void startCommitWithTime(String instantTime) { + HoodieTableMetaClient metaClient = createMetaClient(true); + startCommitWithTime(instantTime, metaClient.getCommitActionType(), metaClient); + } + + /** + * Completes a new commit time for a write operation (insert/update/delete/insert_overwrite/insert_overwrite_table) with specified action. + */ + public void startCommitWithTime(String instantTime, String actionType) { + HoodieTableMetaClient metaClient = createMetaClient(true); + startCommitWithTime(instantTime, actionType, metaClient); + } + + /** + * Completes a new commit time for a write operation (insert/update/delete) with specified action. + */ + private void startCommitWithTime(String instantTime, String actionType, HoodieTableMetaClient metaClient) { + CleanerUtils.rollbackFailedWrites(config.getFailedWritesCleanPolicy(), + HoodieTimeline.COMMIT_ACTION, () -> rollbackFailedWrites()); + startCommit(instantTime, actionType, metaClient); + } + + private void startCommit(String instantTime, String actionType, HoodieTableMetaClient metaClient) { + LOG.info("Generate a new instant time: " + instantTime + " action: " + actionType); + // if there are pending compactions, their instantTime must not be greater than that of this instant time + metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().ifPresent(latestPending -> + ValidationUtils.checkArgument( + HoodieTimeline.compareTimestamps(latestPending.getTimestamp(), HoodieTimeline.LESSER_THAN, instantTime), + "Latest pending compaction instant time must be earlier than this instant time. 
Latest Compaction :" + + latestPending + ", Ingesting at " + instantTime)); + if (config.getFailedWritesCleanPolicy().isLazy()) { + this.heartbeatClient.start(instantTime); + } + + if (actionType.equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + metaClient.getActiveTimeline().createRequestedReplaceCommit(instantTime, actionType); + } else { + metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.REQUESTED, actionType, + instantTime)); + } + } + + /** + * Schedules a new compaction instant. + * @param extraMetadata Extra Metadata to be stored + */ + public Option scheduleCompaction(Option> extraMetadata) throws HoodieIOException { + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + return scheduleCompactionAtInstant(instantTime, extraMetadata) ? Option.of(instantTime) : Option.empty(); + } + + /** + * Schedules a new compaction instant with passed-in instant time. + * @param instantTime Compaction Instant Time + * @param extraMetadata Extra Metadata to be stored + */ + public boolean scheduleCompactionAtInstant(String instantTime, Option> extraMetadata) throws HoodieIOException { + return scheduleTableService(instantTime, extraMetadata, TableServiceType.COMPACT).isPresent(); + } + + /** + * Schedules INDEX action. + * + * @param partitionTypes - list of {@link MetadataPartitionType} which needs to be indexed + * @return instant time for the requested INDEX action + */ + public Option scheduleIndexing(List partitionTypes) { + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + Option indexPlan = createTable(config, hadoopConf) + .scheduleIndexing(context, instantTime, partitionTypes); + return indexPlan.isPresent() ? Option.of(instantTime) : Option.empty(); + } + + /** + * Runs INDEX action to build out the metadata partitions as planned for the given instant time. + * + * @param indexInstantTime - instant time for the requested INDEX action + * @return {@link Option} after successful indexing. + */ + public Option index(String indexInstantTime) { + return createTable(config, hadoopConf).index(context, indexInstantTime); + } + + /** + * Drops the index and removes the metadata partitions. + * + * @param partitionTypes - list of {@link MetadataPartitionType} which needs to be indexed + */ + public void dropIndex(List partitionTypes) { + HoodieTable table = createTable(config, hadoopConf); + String dropInstant = HoodieActiveTimeline.createNewInstantTime(); + HoodieInstant ownerInstant = new HoodieInstant(true, HoodieTimeline.INDEXING_ACTION, dropInstant); + this.txnManager.beginTransaction(Option.of(ownerInstant), Option.empty()); + try { + context.setJobStatus(this.getClass().getSimpleName(), "Dropping partitions from metadata table: " + config.getTableName()); + table.getMetadataWriter(dropInstant).ifPresent(w -> { + try { + ((HoodieTableMetadataWriter) w).dropMetadataPartitions(partitionTypes); + } catch (IOException e) { + throw new HoodieIndexException("Failed to drop metadata index. ", e); + } + }); + } finally { + this.txnManager.endTransaction(Option.of(ownerInstant)); + } + } + + /** + * Performs Compaction for the workload stored in instant-time. + * + * @param compactionInstantTime Compaction Instant Time + * @return Collection of WriteStatus to inspect errors and counts + */ + public HoodieWriteMetadata compact(String compactionInstantTime) { + return compact(compactionInstantTime, config.shouldAutoCommit()); + } + + /** + * Commit a compaction operation. 
Allow passing additional meta-data to be stored in commit instant file. + * + * @param compactionInstantTime Compaction Instant Time + * @param metadata All the metadata that gets stored along with a commit + * @param extraMetadata Extra Metadata to be stored + */ + public abstract void commitCompaction(String compactionInstantTime, HoodieCommitMetadata metadata, + Option> extraMetadata); + + /** + * Commit Compaction and track metrics. + */ + protected abstract void completeCompaction(HoodieCommitMetadata metadata, HoodieTable table, String compactionCommitTime); + + /** + * Get inflight time line exclude compaction and clustering. + * @param metaClient + * @return + */ + private HoodieTimeline getInflightTimelineExcludeCompactionAndClustering(HoodieTableMetaClient metaClient) { + HoodieTimeline inflightTimelineWithReplaceCommit = metaClient.getCommitsTimeline().filterPendingExcludingCompaction(); + HoodieTimeline inflightTimelineExcludeClusteringCommit = inflightTimelineWithReplaceCommit.filter(instant -> { + if (instant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + Option> instantPlan = ClusteringUtils.getClusteringPlan(metaClient, instant); + return !instantPlan.isPresent(); + } else { + return true; + } + }); + return inflightTimelineExcludeClusteringCommit; + } + + protected Option getPendingRollbackInfo(HoodieTableMetaClient metaClient, String commitToRollback) { + return getPendingRollbackInfo(metaClient, commitToRollback, true); + } + + public Option getPendingRollbackInfo(HoodieTableMetaClient metaClient, String commitToRollback, boolean ignoreCompactionAndClusteringInstants) { + return getPendingRollbackInfos(metaClient, ignoreCompactionAndClusteringInstants).getOrDefault(commitToRollback, Option.empty()); + } + + protected Map> getPendingRollbackInfos(HoodieTableMetaClient metaClient) { + return getPendingRollbackInfos(metaClient, true); + } + + /** + * Fetch map of pending commits to be rolled-back to {@link HoodiePendingRollbackInfo}. + * @param metaClient instance of {@link HoodieTableMetaClient} to use. + * @return map of pending commits to be rolled-back instants to Rollback Instant and Rollback plan Pair. + */ + protected Map> getPendingRollbackInfos(HoodieTableMetaClient metaClient, boolean ignoreCompactionAndClusteringInstants) { + List instants = metaClient.getActiveTimeline().filterPendingRollbackTimeline().getInstants().collect(Collectors.toList()); + Map> infoMap = new HashMap<>(); + for (HoodieInstant rollbackInstant : instants) { + HoodieRollbackPlan rollbackPlan; + try { + rollbackPlan = RollbackUtils.getRollbackPlan(metaClient, rollbackInstant); + } catch (Exception e) { + if (rollbackInstant.isRequested()) { + LOG.warn("Fetching rollback plan failed for " + rollbackInstant + ", deleting the plan since it's in REQUESTED state", e); + try { + metaClient.getActiveTimeline().deletePending(rollbackInstant); + } catch (HoodieIOException he) { + LOG.warn("Cannot delete " + rollbackInstant, he); + continue; + } + } else { + // Here we assume that if the rollback is inflight, the rollback plan is intact + // in instant.rollback.requested. The exception here can be due to other reasons. 
+ LOG.warn("Fetching rollback plan failed for " + rollbackInstant + ", skip the plan", e); + } + continue; + } + + try { + String action = rollbackPlan.getInstantToRollback().getAction(); + if (ignoreCompactionAndClusteringInstants) { + if (!HoodieTimeline.COMPACTION_ACTION.equals(action)) { + boolean isClustering = HoodieTimeline.REPLACE_COMMIT_ACTION.equals(action) + && ClusteringUtils.getClusteringPlan(metaClient, new HoodieInstant(true, rollbackPlan.getInstantToRollback().getAction(), + rollbackPlan.getInstantToRollback().getCommitTime())).isPresent(); + if (!isClustering) { + String instantToRollback = rollbackPlan.getInstantToRollback().getCommitTime(); + infoMap.putIfAbsent(instantToRollback, Option.of(new HoodiePendingRollbackInfo(rollbackInstant, rollbackPlan))); + } + } + } else { + infoMap.putIfAbsent(rollbackPlan.getInstantToRollback().getCommitTime(), Option.of(new HoodiePendingRollbackInfo(rollbackInstant, rollbackPlan))); + } + } catch (Exception e) { + LOG.warn("Processing rollback plan failed for " + rollbackInstant + ", skip the plan", e); + } + } + return infoMap; + } + + /** + * Rollback all failed writes. + */ + protected Boolean rollbackFailedWrites() { + return rollbackFailedWrites(false); + } + + /** + * Rollback all failed writes. + * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. + */ + protected Boolean rollbackFailedWrites(boolean skipLocking) { + HoodieTable table = createTable(config, hadoopConf); + List instantsToRollback = getInstantsToRollback(table.getMetaClient(), config.getFailedWritesCleanPolicy(), Option.empty()); + Map> pendingRollbacks = getPendingRollbackInfos(table.getMetaClient()); + instantsToRollback.forEach(entry -> pendingRollbacks.putIfAbsent(entry, Option.empty())); + rollbackFailedWrites(pendingRollbacks, skipLocking); + return true; + } + + protected void rollbackFailedWrites(Map> instantsToRollback, boolean skipLocking) { + // sort in reverse order of commit times + LinkedHashMap> reverseSortedRollbackInstants = instantsToRollback.entrySet() + .stream().sorted((i1, i2) -> i2.getKey().compareTo(i1.getKey())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new)); + for (Map.Entry> entry : reverseSortedRollbackInstants.entrySet()) { + if (HoodieTimeline.compareTimestamps(entry.getKey(), HoodieTimeline.LESSER_THAN_OR_EQUALS, + HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) { + // do we need to handle failed rollback of a bootstrap + rollbackFailedBootstrap(); + HeartbeatUtils.deleteHeartbeatFile(fs, basePath, entry.getKey(), config); + break; + } else { + rollback(entry.getKey(), entry.getValue(), skipLocking); + HeartbeatUtils.deleteHeartbeatFile(fs, basePath, entry.getKey(), config); + } + } + } + + protected List getInstantsToRollback(HoodieTableMetaClient metaClient, HoodieFailedWritesCleaningPolicy cleaningPolicy, Option curInstantTime) { + Stream inflightInstantsStream = getInflightTimelineExcludeCompactionAndClustering(metaClient) + .getReverseOrderedInstants(); + if (cleaningPolicy.isEager()) { + return inflightInstantsStream.map(HoodieInstant::getTimestamp).filter(entry -> { + if (curInstantTime.isPresent()) { + return !entry.equals(curInstantTime.get()); + } else { + return true; + } + }).collect(Collectors.toList()); + } else if (cleaningPolicy.isLazy()) { + return inflightInstantsStream.filter(instant -> { + try { + return heartbeatClient.isHeartbeatExpired(instant.getTimestamp()); + } catch (IOException io) { + throw new 
HoodieException("Failed to check heartbeat for instant " + instant, io); + } + }).map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + } else if (cleaningPolicy.isNever()) { + return Collections.EMPTY_LIST; + } else { + throw new IllegalArgumentException("Invalid Failed Writes Cleaning Policy " + config.getFailedWritesCleanPolicy()); + } + } + + /** + * Ensures compaction instant is in expected state and performs Compaction for the workload stored in instant-time. + * + * @param compactionInstantTime Compaction Instant Time + * @return Collection of Write Status + */ + protected abstract HoodieWriteMetadata compact(String compactionInstantTime, boolean shouldComplete); + + /** + * Performs a compaction operation on a table, serially before or after an insert/upsert action. + * Scheduling and execution is done inline. + */ + protected Option inlineCompaction(Option> extraMetadata) { + Option compactionInstantTimeOpt = inlineScheduleCompaction(extraMetadata); + compactionInstantTimeOpt.ifPresent(compactInstantTime -> { + // inline compaction should auto commit as the user is never given control + compact(compactInstantTime, true); + }); + return compactionInstantTimeOpt; + } + + /*** + * Schedules compaction inline. + * @param extraMetadata extrametada to be used. + * @return compaction instant if scheduled. + */ + protected Option inlineScheduleCompaction(Option> extraMetadata) { + return scheduleCompaction(extraMetadata); + } + + /** + * Schedules a new clustering instant. + * @param extraMetadata Extra Metadata to be stored + */ + public Option scheduleClustering(Option> extraMetadata) throws HoodieIOException { + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + return scheduleClusteringAtInstant(instantTime, extraMetadata) ? Option.of(instantTime) : Option.empty(); + } + + /** + * Schedules a new clustering instant with passed-in instant time. + * @param instantTime clustering Instant Time + * @param extraMetadata Extra Metadata to be stored + */ + public boolean scheduleClusteringAtInstant(String instantTime, Option> extraMetadata) throws HoodieIOException { + return scheduleTableService(instantTime, extraMetadata, TableServiceType.CLUSTER).isPresent(); + } + + /** + * Schedules a new cleaning instant. + * @param extraMetadata Extra Metadata to be stored + */ + protected Option scheduleCleaning(Option> extraMetadata) throws HoodieIOException { + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + return scheduleCleaningAtInstant(instantTime, extraMetadata) ? Option.of(instantTime) : Option.empty(); + } + + /** + * Schedules a new cleaning instant with passed-in instant time. + * @param instantTime cleaning Instant Time + * @param extraMetadata Extra Metadata to be stored + */ + protected boolean scheduleCleaningAtInstant(String instantTime, Option> extraMetadata) throws HoodieIOException { + return scheduleTableService(instantTime, extraMetadata, TableServiceType.CLEAN).isPresent(); + } + + /** + * Ensures clustering instant is in expected state and performs clustering for the plan stored in metadata. + * @param clusteringInstant Clustering Instant Time + * @return Collection of Write Status + */ + public abstract HoodieWriteMetadata cluster(String clusteringInstant, boolean shouldComplete); + + /** + * Schedule table services such as clustering, compaction & cleaning. 
+ * + * @param extraMetadata Metadata to pass onto the scheduled service instant + * @param tableServiceType Type of table service to schedule + * @return + */ + public Option scheduleTableService(Option> extraMetadata, TableServiceType tableServiceType) { + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + return scheduleTableService(instantTime, extraMetadata, tableServiceType); + } + + /** + * Schedule table services such as clustering, compaction & cleaning. + * + * @param extraMetadata Metadata to pass onto the scheduled service instant + * @param tableServiceType Type of table service to schedule + * @return + */ + public Option scheduleTableService(String instantTime, Option> extraMetadata, + TableServiceType tableServiceType) { + // A lock is required to guard against race conditions between an on-going writer and scheduling a table service. + final Option inflightInstant = Option.of(new HoodieInstant(HoodieInstant.State.REQUESTED, + tableServiceType.getAction(), instantTime)); + try { + this.txnManager.beginTransaction(inflightInstant, Option.empty()); + LOG.info("Scheduling table service " + tableServiceType); + return scheduleTableServiceInternal(instantTime, extraMetadata, tableServiceType); + } finally { + this.txnManager.endTransaction(inflightInstant); + } + } + + private Option scheduleTableServiceInternal(String instantTime, Option> extraMetadata, + TableServiceType tableServiceType) { + if (!tableServicesEnabled(config)) { + return Option.empty(); + } + switch (tableServiceType) { + case ARCHIVE: + LOG.info("Scheduling archiving is not supported. Skipping."); + return Option.empty(); + case CLUSTER: + LOG.info("Scheduling clustering at instant time :" + instantTime); + Option clusteringPlan = createTable(config, hadoopConf) + .scheduleClustering(context, instantTime, extraMetadata); + return clusteringPlan.isPresent() ? Option.of(instantTime) : Option.empty(); + case COMPACT: + LOG.info("Scheduling compaction at instant time :" + instantTime); + Option compactionPlan = createTable(config, hadoopConf) + .scheduleCompaction(context, instantTime, extraMetadata); + return compactionPlan.isPresent() ? Option.of(instantTime) : Option.empty(); + case CLEAN: + LOG.info("Scheduling cleaning at instant time :" + instantTime); + Option cleanerPlan = createTable(config, hadoopConf) + .scheduleCleaning(context, instantTime, extraMetadata); + return cleanerPlan.isPresent() ? Option.of(instantTime) : Option.empty(); + default: + throw new IllegalArgumentException("Invalid TableService " + tableServiceType); + } + } + + /** + * Executes a clustering plan on a table, serially before or after an insert/upsert action. + * Schedules and executes clustering inline. + */ + protected Option inlineClustering(Option> extraMetadata) { + Option clusteringInstantOpt = inlineScheduleClustering(extraMetadata); + clusteringInstantOpt.ifPresent(clusteringInstant -> { + // inline cluster should auto commit as the user is never given control + cluster(clusteringInstant, true); + }); + return clusteringInstantOpt; + } + + /** + * Schedules clustering inline. + * @param extraMetadata extrametadata to use. + * @return clustering instant if scheduled. + */ + protected Option inlineScheduleClustering(Option> extraMetadata) { + return scheduleClustering(extraMetadata); + } + + /** + * Finalize Write operation. 
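Analogous to the compaction sketch earlier, a hedged example of scheduling and executing clustering through the APIs above; the concrete client type is an assumption.

```java
// Illustrative sketch only: schedule a clustering (replace-commit) plan and run it.
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.util.Option;

public class ClusterOnce {

  static void run(SparkRDDWriteClient<HoodieAvroPayload> client) {
    Option<String> clusteringInstant = client.scheduleClustering(Option.empty());
    // shouldComplete=true lets the client commit the replace commit itself.
    clusteringInstant.ifPresent(instant -> client.cluster(instant, true));
  }
}
```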
+ * + * @param table HoodieTable + * @param instantTime Instant Time + * @param stats Hoodie Write Stat + */ + protected void finalizeWrite(HoodieTable table, String instantTime, List stats) { + try { + final Timer.Context finalizeCtx = metrics.getFinalizeCtx(); + table.finalizeWrite(context, instantTime, stats); + if (finalizeCtx != null) { + Option durationInMs = Option.of(metrics.getDurationInMs(finalizeCtx.stop())); + durationInMs.ifPresent(duration -> { + LOG.info("Finalize write elapsed time (milliseconds): " + duration); + metrics.updateFinalizeWriteMetrics(duration, stats.size()); + }); + } + } catch (HoodieIOException ioe) { + throw new HoodieCommitException("Failed to complete commit " + instantTime + " due to finalize errors.", ioe); + } + } + + public HoodieMetrics getMetrics() { + return metrics; + } + + public HoodieIndex getIndex() { + return index; + } + + /** + * Instantiates engine-specific instance of {@link HoodieTable} as well as performs necessary + * bootstrapping operations (for ex, validating whether Metadata Table has to be bootstrapped) + * + * NOTE: THIS OPERATION IS EXECUTED UNDER LOCK, THEREFORE SHOULD AVOID ANY OPERATIONS + * NOT REQUIRING EXTERNAL SYNCHRONIZATION + * + * @param metaClient instance of {@link HoodieTableMetaClient} + * @param instantTime current inflight instant time + * @return instantiated {@link HoodieTable} + */ + protected abstract HoodieTable doInitTable(HoodieTableMetaClient metaClient, Option instantTime, boolean initialMetadataTableIfNecessary); + + /** + * Instantiates and initializes instance of {@link HoodieTable}, performing crucial bootstrapping + * operations such as: + * + * NOTE: This method is engine-agnostic and SHOULD NOT be overloaded, please check on + * {@link #doInitTable(HoodieTableMetaClient, Option, boolean)} instead + * + *

+ *
+ * <ul>
+ *   <li>Checking whether upgrade/downgrade is required</li>
+ *   <li>Bootstrapping Metadata Table (if required)</li>
+ *   <li>Initializing metrics contexts</li>
+ * </ul>
+ */ + protected final HoodieTable initTable(WriteOperationType operationType, Option instantTime, boolean initialMetadataTableIfNecessary) { + HoodieTableMetaClient metaClient = createMetaClient(true); + // Setup write schemas for deletes + if (operationType == WriteOperationType.DELETE) { + setWriteSchemaForDeletes(metaClient); + } + + HoodieTable table; + Option ownerInstant = Option.empty(); + if (instantTime.isPresent()) { + ownerInstant = Option.of(new HoodieInstant(true, CommitUtils.getCommitActionType(operationType, metaClient.getTableType()), instantTime.get())); + } + this.txnManager.beginTransaction(ownerInstant, Option.empty()); + try { + tryUpgrade(metaClient, instantTime); + table = doInitTable(metaClient, instantTime, initialMetadataTableIfNecessary); + } finally { + this.txnManager.endTransaction(ownerInstant); + } + + // Validate table properties + metaClient.validateTableProperties(config.getProps()); + + switch (operationType) { + case INSERT: + case INSERT_PREPPED: + case UPSERT: + case UPSERT_PREPPED: + case BULK_INSERT: + case BULK_INSERT_PREPPED: + case INSERT_OVERWRITE: + case INSERT_OVERWRITE_TABLE: + setWriteTimer(table); + break; + case CLUSTER: + clusteringTimer = metrics.getClusteringCtx(); + break; + case COMPACT: + compactionTimer = metrics.getCompactionCtx(); + break; + default: + } + + return table; + } + + protected final HoodieTable initTable(WriteOperationType operationType, Option instantTime) { + return initTable(operationType, instantTime, config.isMetadataTableEnabled()); + } + + /** + * Sets write schema from last instant since deletes may not have schema set in the config. + */ + protected void setWriteSchemaForDeletes(HoodieTableMetaClient metaClient) { + try { + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + Option lastInstant = + activeTimeline.filterCompletedInstants().filter(s -> s.getAction().equals(metaClient.getCommitActionType()) + || s.getAction().equals(HoodieActiveTimeline.REPLACE_COMMIT_ACTION)) + .lastInstant(); + if (lastInstant.isPresent()) { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + activeTimeline.getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class); + if (commitMetadata.getExtraMetadata().containsKey(SCHEMA_KEY)) { + config.setSchema(commitMetadata.getExtraMetadata().get(SCHEMA_KEY)); + } else { + throw new HoodieIOException("Latest commit does not have any schema in commit metadata"); + } + } else { + throw new HoodieIOException("Deletes issued without any prior commits"); + } + } catch (IOException e) { + throw new HoodieIOException("IOException thrown while reading last commit metadata", e); + } + } + + /** + * Called after each write, to release any resources used. 
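Since `setWriteSchemaForDeletes` above resolves the write schema from the last commit, a delete call needs only the keys; a hedged sketch, with the client type and key RDD assumed:

```java
// Illustrative sketch only: delete-by-key round trip. The write schema is picked
// up from the previous commit, as described in setWriteSchemaForDeletes.
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieKey;

import org.apache.spark.api.java.JavaRDD;

public class DeleteByKey {

  static JavaRDD<WriteStatus> deleteKeys(SparkRDDWriteClient<HoodieAvroPayload> client,
                                         JavaRDD<HoodieKey> keys) {
    String instantTime = client.startCommit();
    // Keys are de-duped and non-existent keys dropped before the delete is applied;
    // with the default auto-commit setting the client completes the commit itself.
    return client.delete(keys, instantTime);
  }
}
```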
+ */ + protected void releaseResources() { + // do nothing here + } + + @Override + public void close() { + AsyncArchiveService.forceShutdown(asyncArchiveService); + asyncArchiveService = null; + AsyncCleanerService.forceShutdown(asyncCleanerService); + asyncCleanerService = null; + // Stop timeline-server if running + super.close(); + // Calling this here releases any resources used by your index, so make sure to finish any related operations + // before this point + this.index.close(); + this.heartbeatClient.stop(); + this.txnManager.close(); + } + + private void setWriteTimer(HoodieTable table) { + String commitType = table.getMetaClient().getCommitActionType(); + if (commitType.equals(HoodieTimeline.COMMIT_ACTION)) { + writeTimer = metrics.getCommitCtx(); + } else if (commitType.equals(HoodieTimeline.DELTA_COMMIT_ACTION)) { + writeTimer = metrics.getDeltaCommitCtx(); + } + } + + protected void tryUpgrade(HoodieTableMetaClient metaClient, Option instantTime) { + UpgradeDowngrade upgradeDowngrade = + new UpgradeDowngrade(metaClient, config, context, upgradeDowngradeHelper); + + if (upgradeDowngrade.needsUpgradeOrDowngrade(HoodieTableVersion.current())) { + metaClient = HoodieTableMetaClient.reload(metaClient); + // Ensure no inflight commits by setting EAGER policy and explicitly cleaning all failed commits + List instantsToRollback = getInstantsToRollback(metaClient, HoodieFailedWritesCleaningPolicy.EAGER, instantTime); + + if (!instantsToRollback.isEmpty()) { + Map> pendingRollbacks = getPendingRollbackInfos(metaClient); + instantsToRollback.forEach(entry -> pendingRollbacks.putIfAbsent(entry, Option.empty())); + + rollbackFailedWrites(pendingRollbacks, true); + } + + new UpgradeDowngrade(metaClient, config, context, upgradeDowngradeHelper) + .run(HoodieTableVersion.current(), instantTime.orElse(null)); + + metaClient.reloadActiveTimeline(); + } + } + + /** + * add columns to table. + * + * @param colName col name to be added. if we want to add col to a nested filed, the fullName should be specify + * @param schema col type to be added. + * @param doc col doc to be added. + * @param position col position to be added + * @param positionType col position change type. now support three change types: first/after/before + */ + public void addColumn(String colName, Schema schema, String doc, String position, TableChange.ColumnPositionChange.ColumnPositionType positionType) { + Pair pair = getInternalSchemaAndMetaClient(); + InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()) + .applyAddChange(colName, AvroInternalSchemaConverter.convertToField(schema), doc, position, positionType); + commitTableChange(newSchema, pair.getRight()); + } + + public void addColumn(String colName, Schema schema) { + addColumn(colName, schema, null, "", TableChange.ColumnPositionChange.ColumnPositionType.NO_OPERATION); + } + + /** + * delete columns to table. + * + * @param colNames col name to be deleted. if we want to delete col from a nested filed, the fullName should be specify + */ + public void deleteColumns(String... colNames) { + Pair pair = getInternalSchemaAndMetaClient(); + InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyDeleteChange(colNames); + commitTableChange(newSchema, pair.getRight()); + } + + /** + * rename col name for hudi table. + * + * @param colName col name to be renamed. if we want to rename col from a nested filed, the fullName should be specify + * @param newName new name for current col. no need to specify fullName. 
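A small sketch of the schema-evolution helpers declared in this class (`addColumn`, `renameColumn`, `deleteColumns`); each call commits an ALTER_SCHEMA instant internally. Column names are placeholders and the concrete client type is an assumption.

```java
// Illustrative sketch only: evolving the table schema through the write client.
import org.apache.avro.Schema;

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.model.HoodieAvroPayload;

public class EvolveSchema {

  static void run(SparkRDDWriteClient<HoodieAvroPayload> client) {
    // Add a new string column (nested fields would use their full dotted name).
    client.addColumn("customer_segment", Schema.create(Schema.Type.STRING));

    // Rename an existing column.
    client.renameColumn("cust_id", "customer_id");

    // Drop columns that are no longer needed.
    client.deleteColumns("legacy_flag", "tmp_debug_col");
  }
}
```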
+ */ + public void renameColumn(String colName, String newName) { + Pair pair = getInternalSchemaAndMetaClient(); + InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyRenameChange(colName, newName); + commitTableChange(newSchema, pair.getRight()); + } + + /** + * update col nullable attribute for hudi table. + * + * @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify + * @param nullable . + */ + public void updateColumnNullability(String colName, boolean nullable) { + Pair pair = getInternalSchemaAndMetaClient(); + InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyColumnNullabilityChange(colName, nullable); + commitTableChange(newSchema, pair.getRight()); + } + + /** + * update col Type for hudi table. + * only support update primitive type to primitive type. + * cannot update nest type to nest type or primitive type eg: RecordType -> MapType, MapType -> LongType. + * + * @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify + * @param newType . + */ + public void updateColumnType(String colName, Type newType) { + Pair pair = getInternalSchemaAndMetaClient(); + InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyColumnTypeChange(colName, newType); + commitTableChange(newSchema, pair.getRight()); + } + + /** + * update col comment for hudi table. + * + * @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify + * @param doc . + */ + public void updateColumnComment(String colName, String doc) { + Pair pair = getInternalSchemaAndMetaClient(); + InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyColumnCommentChange(colName, doc); + commitTableChange(newSchema, pair.getRight()); + } + + /** + * reorder the position of col. + * + * @param colName column which need to be reordered. if we want to change col from a nested filed, the fullName should be specify. + * @param referColName reference position. + * @param orderType col position change type. 
now support three change types: first/after/before + */ + public void reOrderColPosition(String colName, String referColName, TableChange.ColumnPositionChange.ColumnPositionType orderType) { + if (colName == null || orderType == null || referColName == null) { + return; + } + //get internalSchema + Pair pair = getInternalSchemaAndMetaClient(); + InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()) + .applyReOrderColPositionChange(colName, referColName, orderType); + commitTableChange(newSchema, pair.getRight()); + } + + private Pair getInternalSchemaAndMetaClient() { + HoodieTableMetaClient metaClient = createMetaClient(true); + TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); + return Pair.of(getInternalSchema(schemaUtil), metaClient); + } + + private void commitTableChange(InternalSchema newSchema, HoodieTableMetaClient metaClient) { + TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); + String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElseGet( + () -> SerDeHelper.inheritSchemas(getInternalSchema(schemaUtil), "")); + Schema schema = AvroInternalSchemaConverter.convert(newSchema, config.getTableName()); + String commitActionType = CommitUtils.getCommitActionType(WriteOperationType.ALTER_SCHEMA, metaClient.getTableType()); + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + startCommitWithTime(instantTime, commitActionType, metaClient); + config.setSchema(schema.toString()); + HoodieActiveTimeline timeLine = metaClient.getActiveTimeline(); + HoodieInstant requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime); + HoodieCommitMetadata metadata = new HoodieCommitMetadata(); + metadata.setOperationType(WriteOperationType.ALTER_SCHEMA); + try { + timeLine.transitionRequestedToInflight(requested, Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + } catch (IOException io) { + throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", io); + } + Map extraMeta = new HashMap<>(); + extraMeta.put(SerDeHelper.LATEST_SCHEMA, SerDeHelper.toJson(newSchema.setSchemaId(Long.parseLong(instantTime)))); + // try to save history schemas + FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(metaClient); + schemasManager.persistHistorySchemaStr(instantTime, SerDeHelper.inheritSchemas(newSchema, historySchemaStr)); + commitStats(instantTime, Collections.emptyList(), Option.of(extraMeta), commitActionType); + } + + private InternalSchema getInternalSchema(TableSchemaResolver schemaUtil) { + return schemaUtil.getTableInternalSchemaFromCommitMetadata().orElseGet(() -> { + try { + return AvroInternalSchemaConverter.convert(schemaUtil.getTableAvroSchema()); + } catch (Exception e) { + throw new HoodieException(String.format("cannot find schema for current table: %s", config.getBasePath())); + } + }); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java index a2ecb67277afe..a394c6d905543 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java @@ -18,11 +18,9 @@ package org.apache.hudi.client; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; 
import org.apache.hudi.avro.model.HoodieCompactionOperation; import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.CompactionOperation; import org.apache.hudi.common.model.FileSlice; @@ -44,6 +42,9 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.action.compact.OperationResult; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -61,7 +62,7 @@ /** * Client to perform admin operations related to compaction. */ -public class CompactionAdminClient extends AbstractHoodieClient { +public class CompactionAdminClient extends BaseHoodieClient { private static final Logger LOG = LogManager.getLogger(CompactionAdminClient.class); @@ -85,7 +86,7 @@ public List validateCompactionPlan(HoodieTableMetaClient met if (plan.getOperations() != null) { List ops = plan.getOperations().stream() .map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList()); - context.setJobStatus(this.getClass().getSimpleName(), "Validate compaction operations"); + context.setJobStatus(this.getClass().getSimpleName(), "Validate compaction operations: " + config.getTableName()); return context.map(ops, op -> { try { return validateCompactionOperation(metaClient, compactionInstant, op, Option.of(fsView)); @@ -172,7 +173,7 @@ public List unscheduleCompactionFileId(HoodieFileGroupId fgId, b Path inflightPath = new Path(metaClient.getMetaPath(), inflight.getFileName()); if (metaClient.getFs().exists(inflightPath)) { // revert if in inflight state - metaClient.getActiveTimeline().revertCompactionInflightToRequested(inflight); + metaClient.getActiveTimeline().revertInstantFromInflightToRequested(inflight); } // Overwrite compaction plan with updated info metaClient.getActiveTimeline().saveToCompactionRequested( @@ -351,7 +352,7 @@ private List runRenamingOps(HoodieTableMetaClient metaClient, } else { LOG.info("The following compaction renaming operations needs to be performed to un-schedule"); if (!dryRun) { - context.setJobStatus(this.getClass().getSimpleName(), "Execute unschedule operations"); + context.setJobStatus(this.getClass().getSimpleName(), "Execute unschedule operations: " + config.getTableName()); return context.map(renameActions, lfPair -> { try { LOG.info("RENAME " + lfPair.getLeft().getPath() + " => " + lfPair.getRight().getPath()); @@ -383,7 +384,7 @@ private List runRenamingOps(HoodieTableMetaClient metaClient, * @return list of pairs of log-files (old, new) and for each pair, rename must be done to successfully unschedule * compaction. */ - protected List> getRenamingActionsForUnschedulingCompactionPlan( + public List> getRenamingActionsForUnschedulingCompactionPlan( HoodieTableMetaClient metaClient, String compactionInstant, int parallelism, Option fsViewOpt, boolean skipValidation) throws IOException { HoodieTableFileSystemView fsView = fsViewOpt.isPresent() ? 
fsViewOpt.get() @@ -394,7 +395,7 @@ protected List> getRenamingActionsForUnschedu "Number of Compaction Operations :" + plan.getOperations().size() + " for instant :" + compactionInstant); List ops = plan.getOperations().stream() .map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList()); - context.setJobStatus(this.getClass().getSimpleName(), "Generate compaction unscheduling operations"); + context.setJobStatus(this.getClass().getSimpleName(), "Generate compaction unscheduling operations: " + config.getTableName()); return context.flatMap(ops, op -> { try { return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieInternalWriteStatus.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieInternalWriteStatus.java index 87a117bb595e9..808eda5071e1f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieInternalWriteStatus.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieInternalWriteStatus.java @@ -55,6 +55,10 @@ public HoodieInternalWriteStatus(Boolean trackSuccessRecords, Double failureFrac this.random = new Random(RANDOM_SEED); } + public boolean isTrackingSuccessfulWrites() { + return trackSuccessRecords; + } + public void markSuccess(String recordKey) { if (trackSuccessRecords) { this.successRecordKeys.add(recordKey); @@ -62,6 +66,10 @@ public void markSuccess(String recordKey) { totalRecords++; } + public void markSuccess() { + totalRecords++; + } + public void markFailure(String recordKey, Throwable t) { if (failedRecordKeys.isEmpty() || (random.nextDouble() <= failureFraction)) { failedRecordKeys.add(Pair.of(recordKey, t)); @@ -141,10 +149,27 @@ public void setSuccessRecordKeys(List successRecordKeys) { this.successRecordKeys = successRecordKeys; } + public double getFailureFraction() { + return failureFraction; + } + + public boolean isTrackSuccessRecords() { + return trackSuccessRecords; + } + @Override public String toString() { return "PartitionPath " + partitionPath + ", FileID " + fileId + ", Success records " + totalRecords + ", errored Rows " + totalErrorRecords + ", global error " + (globalError != null); } + + public WriteStatus toWriteStatus() { + WriteStatus status = new WriteStatus(trackSuccessRecords, failureFraction); + status.setFileId(fileId); + status.setTotalRecords(totalRecords); + status.setPartitionPath(partitionPath); + status.setStat(stat); + return status; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java new file mode 100644 index 0000000000000..2992f4abd4c9e --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java @@ -0,0 +1,684 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; +import org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan; +import org.apache.hudi.client.transaction.TransactionManager; +import org.apache.hudi.client.utils.MetadataConversionUtils; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.fs.StorageSchemes; +import org.apache.hudi.common.model.HoodieArchivedLogFile; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; +import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.compact.CompactionTriggerStrategy; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN; +import 
static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.compareTimestamps; + +/** + * Archiver to bound the growth of files under .hoodie meta path. + */ +public class HoodieTimelineArchiver { + + private static final Logger LOG = LogManager.getLogger(HoodieTimelineArchiver.class); + + private final Path archiveFilePath; + private final HoodieWriteConfig config; + private Writer writer; + private final int maxInstantsToKeep; + private final int minInstantsToKeep; + private final HoodieTable table; + private final HoodieTableMetaClient metaClient; + private final TransactionManager txnManager; + + public HoodieTimelineArchiver(HoodieWriteConfig config, HoodieTable table) { + this.config = config; + this.table = table; + this.metaClient = table.getMetaClient(); + this.archiveFilePath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath()); + this.maxInstantsToKeep = config.getMaxCommitsToKeep(); + this.minInstantsToKeep = config.getMinCommitsToKeep(); + this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); + } + + private Writer openWriter() { + try { + if (this.writer == null) { + return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent()) + .withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION) + .withFs(metaClient.getFs()).overBaseCommit("").build(); + } else { + return this.writer; + } + } catch (IOException e) { + throw new HoodieException("Unable to initialize HoodieLogFormat writer", e); + } + } + + public Writer reOpenWriter() { + try { + if (this.writer != null) { + this.writer.close(); + this.writer = null; + } + this.writer = openWriter(); + return writer; + } catch (IOException e) { + throw new HoodieException("Unable to initialize HoodieLogFormat writer", e); + } + } + + private void close() { + try { + if (this.writer != null) { + this.writer.close(); + } + } catch (IOException e) { + throw new HoodieException("Unable to close HoodieLogFormat writer", e); + } + } + + public boolean archiveIfRequired(HoodieEngineContext context) throws IOException { + return archiveIfRequired(context, false); + } + + /** + * Check if commits need to be archived. If yes, archive commits. + */ + public boolean archiveIfRequired(HoodieEngineContext context, boolean acquireLock) throws IOException { + try { + if (acquireLock) { + // there is no owner or instant time per se for archival. + txnManager.beginTransaction(Option.empty(), Option.empty()); + } + List instantsToArchive = getInstantsToArchive().collect(Collectors.toList()); + verifyLastMergeArchiveFilesIfNecessary(context); + boolean success = true; + if (!instantsToArchive.isEmpty()) { + this.writer = openWriter(); + LOG.info("Archiving instants " + instantsToArchive); + archive(context, instantsToArchive); + LOG.info("Deleting archived instants " + instantsToArchive); + success = deleteArchivedInstants(instantsToArchive, context); + } else { + LOG.info("No Instants to archive"); + } + + if (shouldMergeSmallArchiveFies()) { + mergeArchiveFilesIfNecessary(context); + } + return success; + } finally { + close(); + if (acquireLock) { + txnManager.endTransaction(Option.empty()); + } + } + } + + public boolean shouldMergeSmallArchiveFies() { + return config.getArchiveMergeEnable() && !StorageSchemes.isAppendSupported(metaClient.getFs().getScheme()); + } + + /** + * Here Hoodie can merge the small archive files into a new larger one. 
+ * Only used for filesystem which does not support append operation. + * The whole merge small archive files operation has four stages: + * 1. Build merge plan with merge candidates/merged file name infos. + * 2. Do merge. + * 3. Delete all the candidates. + * 4. Delete the merge plan. + * @param context HoodieEngineContext + * @throws IOException + */ + private void mergeArchiveFilesIfNecessary(HoodieEngineContext context) throws IOException { + Path planPath = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME); + // Flush remained content if existed and open a new write + reOpenWriter(); + // List all archive files + FileStatus[] fsStatuses = metaClient.getFs().globStatus( + new Path(metaClient.getArchivePath() + "/.commits_.archive*")); + // Sort files by version suffix in reverse (implies reverse chronological order) + Arrays.sort(fsStatuses, new HoodieArchivedTimeline.ArchiveFileVersionComparator()); + + int archiveMergeFilesBatchSize = config.getArchiveMergeFilesBatchSize(); + long smallFileLimitBytes = config.getArchiveMergeSmallFileLimitBytes(); + + List mergeCandidate = getMergeCandidates(smallFileLimitBytes, fsStatuses); + + if (mergeCandidate.size() >= archiveMergeFilesBatchSize) { + List candidateFiles = mergeCandidate.stream().map(fs -> fs.getPath().toString()).collect(Collectors.toList()); + // before merge archive files build merge plan + String logFileName = computeLogFileName(); + buildArchiveMergePlan(candidateFiles, planPath, logFileName); + // merge archive files + mergeArchiveFiles(mergeCandidate); + // after merge, delete the small archive files. + deleteFilesParallelize(metaClient, candidateFiles, context, true); + LOG.info("Success to delete replaced small archive files."); + // finally, delete archiveMergePlan which means merging small archive files operation is succeed. + metaClient.getFs().delete(planPath, false); + LOG.info("Success to merge small archive files."); + } + } + + /** + * Find the latest 'huge archive file' index as a break point and only check/merge newer archive files. + * Because we need to keep the original order of archive files which is important when loading archived instants with time filter. + * {@link HoodieArchivedTimeline} loadInstants(TimeRangeFilter filter, boolean loadInstantDetails, Function commitsFilter) + * @param smallFileLimitBytes small File Limit Bytes + * @param fsStatuses Sort by version suffix in reverse + * @return merge candidates + */ + private List getMergeCandidates(long smallFileLimitBytes, FileStatus[] fsStatuses) { + int index = 0; + for (; index < fsStatuses.length; index++) { + if (fsStatuses[index].getLen() > smallFileLimitBytes) { + break; + } + } + return Arrays.stream(fsStatuses).limit(index).collect(Collectors.toList()); + } + + /** + * Get final written archive file name based on storageSchemes which does not support append. + */ + private String computeLogFileName() throws IOException { + String logWriteToken = writer.getLogFile().getLogWriteToken(); + HoodieLogFile hoodieLogFile = writer.getLogFile().rollOver(metaClient.getFs(), logWriteToken); + return hoodieLogFile.getFileName(); + } + + /** + * Check/Solve if there is any failed and unfinished merge small archive files operation + * @param context HoodieEngineContext used for parallelize to delete small archive files if necessary. 
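The selection rule in getMergeCandidates is easy to lose inside the FileStatus handling, so here is a standalone restatement of the same idea over plain sizes. The class, method, and concrete numbers are invented for illustration; only the newest-first prefix rule and the batch-size gate mirror the source.

    import java.util.ArrayList;
    import java.util.List;

    public class MergeCandidateSketch {

      // Leading run of files at or below the small-file limit, scanning newest first,
      // stopping at the first "huge" archive file, exactly like getMergeCandidates.
      static List<Long> mergeCandidates(List<Long> sizesNewestFirst, long smallFileLimitBytes) {
        List<Long> candidates = new ArrayList<>();
        for (long size : sizesNewestFirst) {
          if (size > smallFileLimitBytes) {
            break;
          }
          candidates.add(size);
        }
        return candidates;
      }

      public static void main(String[] args) {
        long smallFileLimitBytes = 20L * 1024 * 1024;  // stand-in for getArchiveMergeSmallFileLimitBytes()
        int batchSize = 3;                             // stand-in for getArchiveMergeFilesBatchSize()
        List<Long> sizes = List.of(1_000_000L, 2_000_000L, 500_000L, 50_000_000L, 800_000L);
        List<Long> candidates = mergeCandidates(sizes, smallFileLimitBytes);
        // Merge only fires once enough small files have piled up at the head of the list.
        System.out.println("candidates=" + candidates + " merge=" + (candidates.size() >= batchSize));
      }
    }

With these illustrative numbers, the first three files become candidates (the 50 MB file acts as the break point) and the merge is triggered because three candidates meet the batch size of three.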
+ * @throws IOException + */ + private void verifyLastMergeArchiveFilesIfNecessary(HoodieEngineContext context) throws IOException { + if (shouldMergeSmallArchiveFies()) { + Path planPath = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME); + HoodieWrapperFileSystem fs = metaClient.getFs(); + // If plan exist, last merge small archive files was failed. + // we need to revert or complete last action. + if (fs.exists(planPath)) { + HoodieMergeArchiveFilePlan plan = null; + try { + plan = TimelineMetadataUtils.deserializeAvroMetadata(FileIOUtils.readDataFromPath(fs, planPath).get(), HoodieMergeArchiveFilePlan.class); + } catch (IOException e) { + LOG.warn("Parsing merge archive plan failed.", e); + // Reading partial plan file which means last merge action is failed during writing plan file. + fs.delete(planPath); + return; + } + Path mergedArchiveFile = new Path(metaClient.getArchivePath(), plan.getMergedArchiveFileName()); + List candidates = plan.getCandidate().stream().map(Path::new).collect(Collectors.toList()); + if (candidateAllExists(candidates)) { + // Last merge action is failed during writing merged archive file. + // But all the small archive files are not deleted. + // Revert last action by deleting mergedArchiveFile if existed. + if (fs.exists(mergedArchiveFile)) { + fs.delete(mergedArchiveFile, false); + } + } else { + // Last merge action is failed during deleting small archive files. + // But the merged files is completed. + // Try to complete last action + if (fs.exists(mergedArchiveFile)) { + deleteFilesParallelize(metaClient, plan.getCandidate(), context, true); + } + } + + fs.delete(planPath); + } + } + } + + /** + * If all the candidate small archive files existed, last merge operation was failed during writing the merged archive file. + * If at least one of candidate small archive files existed, the merged archive file was created and last operation was failed during deleting the small archive files. + */ + private boolean candidateAllExists(List candidates) throws IOException { + for (Path archiveFile : candidates) { + if (!metaClient.getFs().exists(archiveFile)) { + // candidate is deleted + return false; + } + } + return true; + } + + public void buildArchiveMergePlan(List compactCandidate, Path planPath, String compactedArchiveFileName) throws IOException { + LOG.info("Start to build archive merge plan."); + HoodieMergeArchiveFilePlan plan = HoodieMergeArchiveFilePlan.newBuilder() + .setCandidate(compactCandidate) + .setMergedArchiveFileName(compactedArchiveFileName) + .build(); + Option content = TimelineMetadataUtils.serializeAvroMetadata(plan, HoodieMergeArchiveFilePlan.class); + // building merge archive files plan. 
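To make the crash-recovery branches above easier to follow, here is a self-contained restatement of the decision table as a pure function. The enum and names are invented for the sketch; in the real method the plan file is deleted at the end of every branch once the chosen action is done.

    public class MergeRecoverySketch {

      enum Action { DELETE_PLAN_ONLY, REVERT_DELETE_MERGED_FILE, COMPLETE_DELETE_CANDIDATES }

      static Action decide(boolean planReadable, boolean allCandidatesExist, boolean mergedFileExists) {
        if (!planReadable) {
          // Crash while writing the plan itself: just drop the partial plan.
          return Action.DELETE_PLAN_ONLY;
        }
        if (allCandidatesExist) {
          // Crash while writing the merged archive file: revert by removing it if present.
          return mergedFileExists ? Action.REVERT_DELETE_MERGED_FILE : Action.DELETE_PLAN_ONLY;
        }
        // Some candidates are already gone, so the merged file is complete:
        // finish the interrupted deletion of the remaining small archive files.
        return mergedFileExists ? Action.COMPLETE_DELETE_CANDIDATES : Action.DELETE_PLAN_ONLY;
      }

      public static void main(String[] args) {
        System.out.println(decide(true, true, true));    // REVERT_DELETE_MERGED_FILE
        System.out.println(decide(true, false, true));   // COMPLETE_DELETE_CANDIDATES
        System.out.println(decide(false, true, false));  // DELETE_PLAN_ONLY
      }
    }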
+ FileIOUtils.createFileInPath(metaClient.getFs(), planPath, content); + LOG.info("Success to build archive merge plan"); + } + + public void mergeArchiveFiles(List compactCandidate) throws IOException { + LOG.info("Starting to merge small archive files."); + Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); + try { + List records = new ArrayList<>(); + for (FileStatus fs : compactCandidate) { + // Read the archived file + try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(), + new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) { + // Read the avro blocks + while (reader.hasNext()) { + HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); + blk.getRecordIterator().forEachRemaining(records::add); + if (records.size() >= this.config.getCommitArchivalBatchSize()) { + writeToFile(wrapperSchema, records); + } + } + } + } + writeToFile(wrapperSchema, records); + } catch (Exception e) { + throw new HoodieCommitException("Failed to merge small archive files", e); + } finally { + writer.close(); + } + LOG.info("Success to merge small archive files."); + } + + private Map deleteFilesParallelize(HoodieTableMetaClient metaClient, List paths, HoodieEngineContext context, boolean ignoreFailed) { + + return FSUtils.parallelizeFilesProcess(context, + metaClient.getFs(), + config.getArchiveDeleteParallelism(), + pairOfSubPathAndConf -> { + Path file = new Path(pairOfSubPathAndConf.getKey()); + try { + FileSystem fs = metaClient.getFs(); + if (fs.exists(file)) { + return fs.delete(file, false); + } + return true; + } catch (IOException e) { + if (!ignoreFailed) { + throw new HoodieIOException("Failed to delete : " + file, e); + } else { + LOG.warn("Ignore failed deleting : " + file); + return true; + } + } + }, + paths); + } + + private Stream getCleanInstantsToArchive() { + HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline() + .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION, HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants(); + return cleanAndRollbackTimeline.getInstants() + .collect(Collectors.groupingBy(HoodieInstant::getAction)).values().stream() + .map(hoodieInstants -> { + if (hoodieInstants.size() > this.maxInstantsToKeep) { + return hoodieInstants.subList(0, hoodieInstants.size() - this.minInstantsToKeep); + } else { + return new ArrayList(); + } + }).flatMap(Collection::stream); + } + + private Stream getCommitInstantsToArchive() { + // TODO (na) : Add a way to return actions associated with a timeline and then merge/unify + // with logic above to avoid Stream.concat + HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); + + Option oldestPendingCompactionAndReplaceInstant = table.getActiveTimeline() + .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMPACTION_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)) + .filter(s -> !s.isCompleted()) + .firstInstant(); + + Option oldestInflightCommitInstant = + table.getActiveTimeline() + .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION)) + .filterInflights().firstInstant(); + + // NOTE: We cannot have any holes in the commit timeline. + // We cannot archive any commits which are made after the first savepoint present, + // unless HoodieArchivalConfig#ARCHIVE_BEYOND_SAVEPOINT is enabled. 
+ Option firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); + Set savepointTimestamps = table.getSavepointTimestamps(); + if (!commitTimeline.empty() && commitTimeline.countInstants() > maxInstantsToKeep) { + // For Merge-On-Read table, inline or async compaction is enabled + // We need to make sure that there are enough delta commits in the active timeline + // to trigger compaction scheduling, when the trigger strategy of compaction is + // NUM_COMMITS or NUM_AND_TIME. + Option oldestInstantToRetainForCompaction = + (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ + && (config.getInlineCompactTriggerStrategy() == CompactionTriggerStrategy.NUM_COMMITS + || config.getInlineCompactTriggerStrategy() == CompactionTriggerStrategy.NUM_AND_TIME)) + ? CompactionUtils.getOldestInstantToRetainForCompaction( + table.getActiveTimeline(), config.getInlineCompactDeltaCommitMax()) + : Option.empty(); + + // Actually do the commits + Stream instantToArchiveStream = commitTimeline.getInstants() + .filter(s -> { + if (config.shouldArchiveBeyondSavepoint()) { + // skip savepoint commits and proceed further + return !savepointTimestamps.contains(s.getTimestamp()); + } else { + // if no savepoint present, then don't filter + // stop at first savepoint commit + return !(firstSavepoint.isPresent() && compareTimestamps(firstSavepoint.get().getTimestamp(), LESSER_THAN_OR_EQUALS, s.getTimestamp())); + } + }).filter(s -> { + // Ensure commits >= oldest pending compaction commit is retained + return oldestPendingCompactionAndReplaceInstant + .map(instant -> compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp())) + .orElse(true); + }).filter(s -> { + // We need this to ensure that when multiple writers are performing conflict resolution, eligible instants don't + // get archived, i.e, instants after the oldestInflight are retained on the timeline + if (config.getFailedWritesCleanPolicy() == HoodieFailedWritesCleaningPolicy.LAZY) { + return oldestInflightCommitInstant.map(instant -> + compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp())) + .orElse(true); + } + return true; + }).filter(s -> + oldestInstantToRetainForCompaction.map(instantToRetain -> + compareTimestamps(s.getTimestamp(), LESSER_THAN, instantToRetain.getTimestamp())) + .orElse(true) + ); + return instantToArchiveStream.limit(commitTimeline.countInstants() - minInstantsToKeep); + } else { + return Stream.empty(); + } + } + + private Stream getInstantsToArchive() { + Stream instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive()); + if (config.isMetastoreEnabled()) { + return Stream.empty(); + } + + // For archiving and cleaning instants, we need to include intermediate state files if they exist + HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); + Map, List> groupByTsAction = rawActiveTimeline.getInstants() + .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(), + HoodieInstant.getComparableAction(i.getAction())))); + + // If metadata table is enabled, do not archive instants which are more recent than the last compaction on the + // metadata table. 
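Both archival paths share the same sizing rule: nothing is archived until the timeline exceeds maxInstantsToKeep, and archival then trims the oldest instants so that minInstantsToKeep remain (before the savepoint, pending-compaction, and inflight filters above are applied). A minimal standalone sketch, with illustrative values in place of HoodieWriteConfig#getMinCommitsToKeep and #getMaxCommitsToKeep:

    public class ArchiveBoundarySketch {

      // How many of the oldest instants become archival candidates.
      static int numInstantsToArchive(int totalInstants, int minInstantsToKeep, int maxInstantsToKeep) {
        return totalInstants > maxInstantsToKeep ? totalInstants - minInstantsToKeep : 0;
      }

      public static void main(String[] args) {
        int min = 20;
        int max = 30;
        System.out.println(numInstantsToArchive(25, min, max));  // 0: still under the max, nothing archived
        System.out.println(numInstantsToArchive(31, min, max));  // 11: archive the oldest 11, keep the newest 20
      }
    }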
+ if (config.isMetadataTableEnabled()) { + try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(), + config.getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) { + Option latestCompactionTime = tableMetadata.getLatestCompactionTime(); + if (!latestCompactionTime.isPresent()) { + LOG.info("Not archiving as there is no compaction yet on the metadata table"); + instants = Stream.empty(); + } else { + LOG.info("Limiting archiving of instants to latest compaction on metadata table at " + latestCompactionTime.get()); + instants = instants.filter(instant -> compareTimestamps(instant.getTimestamp(), LESSER_THAN, + latestCompactionTime.get())); + } + } catch (Exception e) { + throw new HoodieException("Error limiting instant archival based on metadata table", e); + } + } + + if (HoodieTableMetadata.isMetadataTable(config.getBasePath())) { + HoodieTableMetaClient dataMetaClient = HoodieTableMetaClient.builder() + .setBasePath(HoodieTableMetadata.getDatasetBasePath(config.getBasePath())) + .setConf(metaClient.getHadoopConf()) + .build(); + Option earliestActiveDatasetCommit = dataMetaClient.getActiveTimeline().firstInstant(); + + if (config.shouldArchiveBeyondSavepoint()) { + // There are chances that there could be holes in the timeline due to archival and savepoint interplay. + // So, the first non-savepoint commit in the data timeline is considered as beginning of the active timeline. + Option firstNonSavepointCommit = dataMetaClient.getActiveTimeline().getFirstNonSavepointCommit(); + if (firstNonSavepointCommit.isPresent()) { + String firstNonSavepointCommitTime = firstNonSavepointCommit.get().getTimestamp(); + instants = instants.filter(instant -> + compareTimestamps(instant.getTimestamp(), LESSER_THAN, firstNonSavepointCommitTime)); + } + } else { + // Do not archive the commits that live in data set active timeline. + // This is required by metadata table, see HoodieTableMetadataUtil#processRollbackMetadata for details. 
+ if (earliestActiveDatasetCommit.isPresent()) { + instants = instants.filter(instant -> + compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN, earliestActiveDatasetCommit.get().getTimestamp())); + } + } + } + + return instants.flatMap(hoodieInstant -> { + List instantsToStream = groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(), + HoodieInstant.getComparableAction(hoodieInstant.getAction()))); + if (instantsToStream != null) { + return instantsToStream.stream(); + } else { + // if a concurrent writer archived the instant + return Stream.empty(); + } + }); + } + + private boolean deleteArchivedInstants(List archivedInstants, HoodieEngineContext context) throws IOException { + LOG.info("Deleting instants " + archivedInstants); + + List pendingInstantFiles = new ArrayList<>(); + List completedInstantFiles = new ArrayList<>(); + + for (HoodieInstant instant : archivedInstants) { + String filePath = new Path(metaClient.getMetaPath(), instant.getFileName()).toString(); + if (instant.isCompleted()) { + completedInstantFiles.add(filePath); + } else { + pendingInstantFiles.add(filePath); + } + } + + context.setJobStatus(this.getClass().getSimpleName(), "Delete archived instants: " + config.getTableName()); + // Delete the metadata files + // in HoodieInstant.State sequence: requested -> inflight -> completed, + // this is important because when a COMPLETED metadata file is removed first, + // other monitors on the timeline(such as the compaction or clustering services) would + // mistakenly recognize the pending file as a pending operation, + // then all kinds of weird bugs occur. + boolean success = deleteArchivedInstantFiles(context, true, pendingInstantFiles); + success &= deleteArchivedInstantFiles(context, success, completedInstantFiles); + + // Remove older meta-data from auxiliary path too + Option latestCommitted = Option.fromJavaOptional(archivedInstants.stream().filter(i -> i.isCompleted() && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION) + || (i.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)))).max(Comparator.comparing(HoodieInstant::getTimestamp))); + LOG.info("Latest Committed Instant=" + latestCommitted); + if (latestCommitted.isPresent()) { + success &= deleteAllInstantsOlderOrEqualsInAuxMetaFolder(latestCommitted.get()); + } + return success; + } + + private boolean deleteArchivedInstantFiles(HoodieEngineContext context, boolean success, List files) { + Map resultDeleteInstantFiles = deleteFilesParallelize(metaClient, files, context, false); + + for (Map.Entry result : resultDeleteInstantFiles.entrySet()) { + LOG.info("Archived and deleted instant file " + result.getKey() + " : " + result.getValue()); + success &= result.getValue(); + } + return success; + } + + /** + * Remove older instants from auxiliary meta folder. + * + * @param thresholdInstant Hoodie Instant + * @return success if all eligible file deleted successfully + * @throws IOException in case of error + */ + private boolean deleteAllInstantsOlderOrEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) throws IOException { + List instants = null; + boolean success = true; + try { + instants = + metaClient.scanHoodieInstantsFromFileSystem( + new Path(metaClient.getMetaAuxiliaryPath()), + HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE, + false); + } catch (FileNotFoundException e) { + /* + * On some FSs deletion of all files in the directory can auto remove the directory itself. + * GCS is one example, as it doesn't have real directories and subdirectories. 
When client + * removes all the files from a "folder" on GCS is has to create a special "/" to keep the folder + * around. If this doesn't happen (timeout, mis configured client, ...) folder will be deleted and + * in this case we should not break when aux folder is not found. + * GCS information: (https://cloud.google.com/storage/docs/gsutil/addlhelp/HowSubdirectoriesWork) + */ + LOG.warn("Aux path not found. Skipping: " + metaClient.getMetaAuxiliaryPath()); + return true; + } + + List instantsToBeDeleted = + instants.stream().filter(instant1 -> compareTimestamps(instant1.getTimestamp(), + LESSER_THAN_OR_EQUALS, thresholdInstant.getTimestamp())).collect(Collectors.toList()); + + for (HoodieInstant deleteInstant : instantsToBeDeleted) { + LOG.info("Deleting instant " + deleteInstant + " in auxiliary meta path " + metaClient.getMetaAuxiliaryPath()); + Path metaFile = new Path(metaClient.getMetaAuxiliaryPath(), deleteInstant.getFileName()); + if (metaClient.getFs().exists(metaFile)) { + success &= metaClient.getFs().delete(metaFile, false); + LOG.info("Deleted instant file in auxiliary meta path : " + metaFile); + } + } + return success; + } + + public void archive(HoodieEngineContext context, List instants) throws HoodieCommitException { + try { + Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); + LOG.info("Wrapper schema " + wrapperSchema.toString()); + List records = new ArrayList<>(); + for (HoodieInstant hoodieInstant : instants) { + try { + deleteAnyLeftOverMarkers(context, hoodieInstant); + // in local FS and HDFS, there could be empty completed instants due to crash. + if (table.getActiveTimeline().isEmpty(hoodieInstant) && hoodieInstant.isCompleted()) { + // lets add an entry to the archival, even if not for the plan. + records.add(createAvroRecordFromEmptyInstant(hoodieInstant)); + } else { + records.add(convertToAvroRecord(hoodieInstant)); + } + if (records.size() >= this.config.getCommitArchivalBatchSize()) { + writeToFile(wrapperSchema, records); + } + } catch (Exception e) { + LOG.error("Failed to archive commits, .commit file: " + hoodieInstant.getFileName(), e); + if (this.config.isFailOnTimelineArchivingEnabled()) { + throw e; + } + } + } + writeToFile(wrapperSchema, records); + } catch (Exception e) { + throw new HoodieCommitException("Failed to archive commits", e); + } + } + + private void deleteAnyLeftOverMarkers(HoodieEngineContext context, HoodieInstant instant) { + WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), table, instant.getTimestamp()); + if (writeMarkers.deleteMarkerDir(context, config.getMarkersDeleteParallelism())) { + LOG.info("Cleaned up left over marker directory for instant :" + instant); + } + } + + private void writeToFile(Schema wrapperSchema, List records) throws Exception { + if (records.size() > 0) { + Map header = new HashMap<>(); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString()); + final String keyField = table.getMetaClient().getTableConfig().getRecordKeyFieldProp(); + HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header, keyField); + writer.appendBlock(block); + records.clear(); + } + } + + private IndexedRecord convertToAvroRecord(HoodieInstant hoodieInstant) + throws IOException { + return MetadataConversionUtils.createMetaWrapper(hoodieInstant, metaClient); + } + + private IndexedRecord createAvroRecordFromEmptyInstant(HoodieInstant hoodieInstant) throws IOException { + return MetadataConversionUtils.createMetaWrapperForEmptyInstant(hoodieInstant); + } 
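A hedged end-to-end usage sketch of the archiver defined above. How the write config, table, and engine context are obtained is engine specific and elided here; the constructor and archiveIfRequired signatures are the ones in this file.

    import java.io.IOException;

    import org.apache.hudi.client.HoodieTimelineArchiver;
    import org.apache.hudi.common.engine.HoodieEngineContext;
    import org.apache.hudi.config.HoodieWriteConfig;
    import org.apache.hudi.table.HoodieTable;

    class ArchiveUsageSketch {
      // Returns true when archival (and deletion of the archived instants) succeeded.
      static boolean archiveTimeline(HoodieWriteConfig writeConfig, HoodieTable table,
                                     HoodieEngineContext context) throws IOException {
        HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
        // acquireLock = true wraps archival in a transaction, useful when multiple writers are active.
        return archiver.archiveIfRequired(context, true);
      }
    }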
+} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/ReplaceArchivalHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/ReplaceArchivalHelper.java index 515f43e64f108..40eff71c94faf 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/ReplaceArchivalHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/ReplaceArchivalHelper.java @@ -20,24 +20,13 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.hadoop.fs.Path; - -import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieRollingStatMetadata; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.view.TableFileSystemView; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; import java.io.Serializable; -import java.util.List; -import java.util.stream.Stream; /** * Operates on marker files for a given write action (commit, delta commit, compaction). @@ -61,40 +50,4 @@ public static org.apache.hudi.avro.model.HoodieReplaceCommitMetadata convertRepl avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, ""); return avroMetaData; } - - /** - * Delete all files represented by FileSlices in parallel. Return true if all files are deleted successfully. - */ - public static boolean deleteReplacedFileGroups(HoodieEngineContext context, HoodieTableMetaClient metaClient, - TableFileSystemView fileSystemView, - HoodieInstant instant, List replacedPartitions) { - - List f = context.map(replacedPartitions, partition -> { - Stream fileSlices = fileSystemView.getReplacedFileGroupsBeforeOrOn(instant.getTimestamp(), partition) - .flatMap(HoodieFileGroup::getAllRawFileSlices); - return fileSlices.allMatch(slice -> deleteFileSlice(slice, metaClient, instant)); - }, replacedPartitions.size()); - - return f.stream().reduce((x, y) -> x & y).orElse(true); - } - - private static boolean deleteFileSlice(FileSlice fileSlice, HoodieTableMetaClient metaClient, HoodieInstant instant) { - boolean baseFileDeleteSuccess = fileSlice.getBaseFile().map(baseFile -> - deletePath(new Path(baseFile.getPath()), metaClient, instant)).orElse(true); - - boolean logFileSuccess = fileSlice.getLogFiles().map(logFile -> - deletePath(logFile.getPath(), metaClient, instant)).allMatch(x -> x); - return baseFileDeleteSuccess & logFileSuccess; - } - - private static boolean deletePath(Path path, HoodieTableMetaClient metaClient, HoodieInstant instant) { - try { - LOG.info("Deleting " + path + " before archiving " + instant); - metaClient.getFs().delete(path); - return true; - } catch (IOException e) { - LOG.error("unable to delete file groups that are replaced", e); - return false; - } - } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/RunsTableService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/RunsTableService.java new file mode 100644 index 0000000000000..64e540568e8dc --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/RunsTableService.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.config.HoodieWriteConfig; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +public interface RunsTableService { + + Logger LOG = LogManager.getLogger(RunsTableService.class); + + default boolean tableServicesEnabled(HoodieWriteConfig config) { + boolean enabled = config.areTableServicesEnabled(); + if (!enabled) { + LOG.warn(String.format("Table services are disabled. Set `%s` to enable.", HoodieWriteConfig.TABLE_SERVICES_ENABLED)); + } + return enabled; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java index a93f2682bff2d..8f74858669278 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/WriteStatus.java @@ -18,23 +18,34 @@ package org.apache.hudi.client; +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.util.DateTimeUtils; import org.apache.hudi.common.util.Option; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + import java.io.Serializable; +import java.time.DateTimeException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; +import static org.apache.hudi.common.model.DefaultHoodieRecordPayload.METADATA_EVENT_TIME_KEY; + /** * Status of a write operation. */ +@PublicAPIClass(maturity = ApiMaturityLevel.STABLE) public class WriteStatus implements Serializable { + private static final Logger LOG = LogManager.getLogger(WriteStatus.class); private static final long serialVersionUID = 1L; private static final long RANDOM_SEED = 9038412832L; @@ -65,6 +76,12 @@ public WriteStatus(Boolean trackSuccessRecords, Double failureFraction) { this.random = new Random(RANDOM_SEED); } + public WriteStatus() { + this.failureFraction = 0.0d; + this.trackSuccessRecords = false; + this.random = null; + } + /** * Mark write as success, optionally using given parameters for the purpose of calculating some aggregate metrics. * This method is not meant to cache passed arguments, since WriteStatus objects are collected in Spark Driver. 
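Circling back to the RunsTableService interface added a few hunks above, here is a small sketch of how a service runner can mix it in. The class and method below are invented; only tableServicesEnabled and its warning behaviour come from the interface.

    import org.apache.hudi.client.RunsTableService;
    import org.apache.hudi.config.HoodieWriteConfig;

    class ExampleServiceRunner implements RunsTableService {
      void runIfEnabled(HoodieWriteConfig config) {
        if (!tableServicesEnabled(config)) {
          // The default method already logged a warning pointing at TABLE_SERVICES_ENABLED.
          return;
        }
        // ... schedule and execute the actual table service here ...
      }
    }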
@@ -77,6 +94,18 @@ public void markSuccess(HoodieRecord record, Option> optiona writtenRecords.add(record); } totalRecords++; + + // get the min and max event time for calculating latency and freshness + if (optionalRecordMetadata.isPresent()) { + String eventTimeVal = optionalRecordMetadata.get().getOrDefault(METADATA_EVENT_TIME_KEY, null); + try { + long eventTime = DateTimeUtils.parseDateTime(eventTimeVal).toEpochMilli(); + stat.setMinEventTime(eventTime); + stat.setMaxEventTime(eventTime); + } catch (DateTimeException | IllegalArgumentException e) { + LOG.debug(String.format("Fail to parse event time value: %s", eventTimeVal), e); + } + } } /** @@ -172,6 +201,7 @@ public void setTotalErrorRecords(long totalErrorRecords) { public String toString() { final StringBuilder sb = new StringBuilder("WriteStatus {"); sb.append("fileId=").append(fileId); + sb.append(", writeStat=").append(stat); sb.append(", globalError='").append(globalError).append('\''); sb.append(", hasErrors='").append(hasErrors()).append('\''); sb.append(", errorCount='").append(totalErrorRecords).append('\''); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java index fa508e42f120c..a60a0d39f7c55 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java @@ -34,7 +34,7 @@ public BootstrapRecordPayload(GenericRecord record) { } @Override - public BootstrapRecordPayload preCombine(BootstrapRecordPayload another) { + public BootstrapRecordPayload preCombine(BootstrapRecordPayload oldValue) { return this; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java index 0a07ee5aa9f36..1cf1702717295 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/FullRecordBootstrapDataProvider.java @@ -20,7 +20,7 @@ import java.io.Serializable; import org.apache.hudi.avro.model.HoodieFileStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.collection.Pair; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/HoodieBootstrapSchemaProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/HoodieBootstrapSchemaProvider.java index f63345d915da5..c8e552b1a8dd6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/HoodieBootstrapSchemaProvider.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/HoodieBootstrapSchemaProvider.java @@ -20,7 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; diff 
--git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/HoodieEngineContext.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/HoodieEngineContext.java deleted file mode 100644 index 408029126eba8..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/HoodieEngineContext.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.client.common; - -import org.apache.hudi.common.config.SerializableConfiguration; -import org.apache.hudi.client.common.function.SerializableConsumer; -import org.apache.hudi.client.common.function.SerializableFunction; -import org.apache.hudi.client.common.function.SerializablePairFunction; -import org.apache.hudi.common.util.Option; - -import java.util.List; -import java.util.Map; -import java.util.stream.Stream; - -/** - * Base class contains the context information needed by the engine at runtime. It will be extended by different - * engine implementation if needed. - */ -public abstract class HoodieEngineContext { - - /** - * A wrapped hadoop configuration which can be serialized. - */ - private SerializableConfiguration hadoopConf; - - protected TaskContextSupplier taskContextSupplier; - - public HoodieEngineContext(SerializableConfiguration hadoopConf, TaskContextSupplier taskContextSupplier) { - this.hadoopConf = hadoopConf; - this.taskContextSupplier = taskContextSupplier; - } - - public SerializableConfiguration getHadoopConf() { - return hadoopConf; - } - - public TaskContextSupplier getTaskContextSupplier() { - return taskContextSupplier; - } - - public abstract List map(List data, SerializableFunction func, int parallelism); - - public abstract List flatMap(List data, SerializableFunction> func, int parallelism); - - public abstract void foreach(List data, SerializableConsumer consumer, int parallelism); - - public abstract Map mapToPair(List data, SerializablePairFunction func, Integer parallelism); - - public abstract void setProperty(EngineProperty key, String value); - - public abstract Option getProperty(EngineProperty key); - - public abstract void setJobStatus(String activeModule, String activityDescription); - -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/FunctionWrapper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/FunctionWrapper.java deleted file mode 100644 index 4e91bd29d676b..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/FunctionWrapper.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.client.common.function; - -import org.apache.hudi.exception.HoodieException; - -import java.util.function.Consumer; -import java.util.function.Function; -import java.util.stream.Stream; - -import scala.Tuple2; - -/** - * Function wrapper util class, which catches the exception thrown by input function and return a similar function - * with no exception thrown. - */ -public class FunctionWrapper { - - public static Function throwingMapWrapper(SerializableFunction throwingMapFunction) { - return v1 -> { - try { - return throwingMapFunction.apply(v1); - } catch (Exception e) { - throw new HoodieException("Error occurs when executing map", e); - } - }; - } - - public static Function> throwingFlatMapWrapper(SerializableFunction> throwingFlatMapFunction) { - return v1 -> { - try { - return throwingFlatMapFunction.apply(v1); - } catch (Exception e) { - throw new HoodieException("Error occurs when executing flatMap", e); - } - }; - } - - public static Consumer throwingForeachWrapper(SerializableConsumer throwingConsumer) { - return v1 -> { - try { - throwingConsumer.accept(v1); - } catch (Exception e) { - throw new HoodieException("Error occurs when executing foreach", e); - } - }; - } - - public static Function> throwingMapToPairWrapper(SerializablePairFunction throwingPairFunction) { - return v1 -> { - try { - return throwingPairFunction.call(v1); - } catch (Exception e) { - throw new HoodieException("Error occurs when executing mapToPair", e); - } - }; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineServerHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineServerHelper.java index 1d5984794b1bc..573684a4ba54c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineServerHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineServerHelper.java @@ -18,8 +18,8 @@ package org.apache.hudi.client.embedded; -import org.apache.hudi.client.common.EngineProperty; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; @@ -35,6 +35,8 @@ public class EmbeddedTimelineServerHelper { private static final Logger LOG = LogManager.getLogger(EmbeddedTimelineService.class); + private static Option TIMELINE_SERVER = Option.empty(); + /** * Instantiate Embedded Timeline Server. 
* @param context Hoodie Engine Context @@ -42,19 +44,33 @@ public class EmbeddedTimelineServerHelper { * @return TimelineServer if configured to run * @throws IOException */ - public static Option createEmbeddedTimelineService( + public static synchronized Option createEmbeddedTimelineService( HoodieEngineContext context, HoodieWriteConfig config) throws IOException { - Option timelineServer = Option.empty(); + if (config.isEmbeddedTimelineServerReuseEnabled()) { + if (!TIMELINE_SERVER.isPresent() || !TIMELINE_SERVER.get().canReuseFor(config.getBasePath())) { + TIMELINE_SERVER = Option.of(startTimelineService(context, config)); + } else { + updateWriteConfigWithTimelineServer(TIMELINE_SERVER.get(), config); + } + return TIMELINE_SERVER; + } if (config.isEmbeddedTimelineServerEnabled()) { - // Run Embedded Timeline Server - LOG.info("Starting Timeline service !!"); - Option hostAddr = context.getProperty(EngineProperty.EMBEDDED_SERVER_HOST); - timelineServer = Option.of(new EmbeddedTimelineService(context, hostAddr.orElse(null), - config.getEmbeddedTimelineServerPort(), config.getClientSpecifiedViewStorageConfig())); - timelineServer.get().startServer(); - updateWriteConfigWithTimelineServer(timelineServer.get(), config); + return Option.of(startTimelineService(context, config)); + } else { + return Option.empty(); } - return timelineServer; + } + + private static EmbeddedTimelineService startTimelineService( + HoodieEngineContext context, HoodieWriteConfig config) throws IOException { + // Run Embedded Timeline Server + LOG.info("Starting Timeline service !!"); + Option hostAddr = context.getProperty(EngineProperty.EMBEDDED_SERVER_HOST); + EmbeddedTimelineService timelineService = new EmbeddedTimelineService( + context, hostAddr.orElse(null), config); + timelineService.startServer(); + updateWriteConfigWithTimelineServer(timelineService, config); + return timelineService; } /** diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java index 738119cef2854..4d5375894d7e3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java @@ -18,12 +18,15 @@ package org.apache.hudi.client.embedded; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.common.table.view.FileSystemViewManager; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.table.view.FileSystemViewStorageType; import org.apache.hudi.common.util.NetworkUtils; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.timeline.service.TimelineService; import org.apache.log4j.LogManager; @@ -39,36 +42,55 @@ public class EmbeddedTimelineService { private static final Logger LOG = LogManager.getLogger(EmbeddedTimelineService.class); private int serverPort; - private int preferredPort; private String hostAddr; + private HoodieEngineContext context; private final SerializableConfiguration hadoopConf; - private final FileSystemViewStorageConfig config; + private final HoodieWriteConfig writeConfig; 
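A hedged sketch of how callers obtain the (possibly shared) embedded timeline service through the helper above. Construction of the engine context and write config is elided; with the reuse flag enabled (isEmbeddedTimelineServerReuseEnabled), repeated calls for the same base path hand back the shared static instance instead of starting a new server.

    import java.io.IOException;

    import org.apache.hudi.client.embedded.EmbeddedTimelineServerHelper;
    import org.apache.hudi.client.embedded.EmbeddedTimelineService;
    import org.apache.hudi.common.engine.HoodieEngineContext;
    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.config.HoodieWriteConfig;

    class TimelineServiceSketch {
      static Option<EmbeddedTimelineService> startOrReuse(HoodieEngineContext context, HoodieWriteConfig config)
          throws IOException {
        // Returns Option.empty() when the embedded timeline server is disabled in the config.
        return EmbeddedTimelineServerHelper.createEmbeddedTimelineService(context, config);
      }
    }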
+ private final String basePath; + private transient FileSystemViewManager viewManager; private transient TimelineService server; - public EmbeddedTimelineService(HoodieEngineContext context, String embeddedTimelineServiceHostAddr, int embeddedTimelineServerPort, FileSystemViewStorageConfig config) { + public EmbeddedTimelineService(HoodieEngineContext context, String embeddedTimelineServiceHostAddr, HoodieWriteConfig writeConfig) { setHostAddr(embeddedTimelineServiceHostAddr); - this.config = config; + this.context = context; + this.writeConfig = writeConfig; + this.basePath = writeConfig.getBasePath(); this.hadoopConf = context.getHadoopConf(); this.viewManager = createViewManager(); - this.preferredPort = embeddedTimelineServerPort; } private FileSystemViewManager createViewManager() { // Using passed-in configs to build view storage configs FileSystemViewStorageConfig.Builder builder = - FileSystemViewStorageConfig.newBuilder().fromProperties(config.getProps()); + FileSystemViewStorageConfig.newBuilder().fromProperties(writeConfig.getClientSpecifiedViewStorageConfig().getProps()); FileSystemViewStorageType storageType = builder.build().getStorageType(); if (storageType.equals(FileSystemViewStorageType.REMOTE_ONLY) || storageType.equals(FileSystemViewStorageType.REMOTE_FIRST)) { // Reset to default if set to Remote builder.withStorageType(FileSystemViewStorageType.MEMORY); } - return FileSystemViewManager.createViewManager(hadoopConf, builder.build()); + return FileSystemViewManager.createViewManager(context, writeConfig.getMetadataConfig(), builder.build(), writeConfig.getCommonConfig(), basePath); } public void startServer() throws IOException { - server = new TimelineService(preferredPort, viewManager, hadoopConf.newCopy()); + TimelineService.Config.Builder timelineServiceConfBuilder = TimelineService.Config.builder() + .serverPort(writeConfig.getEmbeddedTimelineServerPort()) + .numThreads(writeConfig.getEmbeddedTimelineServerThreads()) + .compress(writeConfig.getEmbeddedTimelineServerCompressOutput()) + .async(writeConfig.getEmbeddedTimelineServerUseAsync()); + // Only passing marker-related write configs to timeline server + // if timeline-server-based markers are used. + if (writeConfig.getMarkersType() == MarkerType.TIMELINE_SERVER_BASED) { + timelineServiceConfBuilder + .enableMarkerRequests(true) + .markerBatchNumThreads(writeConfig.getMarkersTimelineServerBasedBatchNumThreads()) + .markerBatchIntervalMs(writeConfig.getMarkersTimelineServerBasedBatchIntervalMs()) + .markerParallelism(writeConfig.getMarkersDeleteParallelism()); + } + + server = new TimelineService(context, hadoopConf.newCopy(), timelineServiceConfBuilder.build(), + FSUtils.getFs(basePath, hadoopConf.newCopy()), viewManager); serverPort = server.startService(); LOG.info("Started embedded timeline server at " + hostAddr + ":" + serverPort); } @@ -87,16 +109,32 @@ private void setHostAddr(String embeddedTimelineServiceHostAddr) { * Retrieves proper view storage configs for remote clients to access this service. */ public FileSystemViewStorageConfig getRemoteFileSystemViewConfig() { - FileSystemViewStorageType viewStorageType = config.shouldEnableBackupForRemoteFileSystemView() - ? 
FileSystemViewStorageType.REMOTE_FIRST : FileSystemViewStorageType.REMOTE_ONLY; - return FileSystemViewStorageConfig.newBuilder().withStorageType(viewStorageType) - .withRemoteServerHost(hostAddr).withRemoteServerPort(serverPort).build(); + FileSystemViewStorageType viewStorageType = writeConfig.getClientSpecifiedViewStorageConfig() + .shouldEnableBackupForRemoteFileSystemView() + ? FileSystemViewStorageType.REMOTE_FIRST : FileSystemViewStorageType.REMOTE_ONLY; + return FileSystemViewStorageConfig.newBuilder() + .withStorageType(viewStorageType) + .withRemoteServerHost(hostAddr) + .withRemoteServerPort(serverPort) + .withRemoteTimelineClientTimeoutSecs(writeConfig.getClientSpecifiedViewStorageConfig().getRemoteTimelineClientTimeoutSecs()) + .withRemoteTimelineClientRetry(writeConfig.getClientSpecifiedViewStorageConfig().isRemoteTimelineClientRetryEnabled()) + .withRemoteTimelineClientMaxRetryNumbers(writeConfig.getClientSpecifiedViewStorageConfig().getRemoteTimelineClientMaxRetryNumbers()) + .withRemoteTimelineInitialRetryIntervalMs(writeConfig.getClientSpecifiedViewStorageConfig().getRemoteTimelineInitialRetryIntervalMs()) + .withRemoteTimelineClientMaxRetryIntervalMs(writeConfig.getClientSpecifiedViewStorageConfig().getRemoteTimelineClientMaxRetryIntervalMs()) + .withRemoteTimelineClientRetryExceptions(writeConfig.getClientSpecifiedViewStorageConfig().getRemoteTimelineClientRetryExceptions()) + .build(); } public FileSystemViewManager getViewManager() { return viewManager; } + public boolean canReuseFor(String basePath) { + return this.server != null + && this.viewManager != null + && this.basePath.equals(basePath); + } + public void stop() { if (null != server) { LOG.info("Closing Timeline server"); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HeartbeatUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HeartbeatUtils.java new file mode 100644 index 0000000000000..a20469429030a --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HeartbeatUtils.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.heartbeat; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; + +/** + * Helper class to delete heartbeat for completed or failed instants with expired heartbeats. 
+ */ +public class HeartbeatUtils { + + private static final Logger LOG = LogManager.getLogger(HeartbeatUtils.class); + + /** + * Deletes the heartbeat file for the specified instant. + * @param fs + * @param basePath + * @param instantTime + * @return + */ + public static boolean deleteHeartbeatFile(FileSystem fs, String basePath, String instantTime) { + boolean deleted = false; + try { + String heartbeatFolderPath = HoodieTableMetaClient.getHeartbeatFolderPath(basePath); + deleted = fs.delete(new Path(heartbeatFolderPath + Path.SEPARATOR + instantTime), false); + if (!deleted) { + LOG.error("Failed to delete heartbeat for instant " + instantTime); + } else { + LOG.info("Deleted the heartbeat for instant " + instantTime); + } + } catch (IOException io) { + LOG.error("Unable to delete heartbeat for instant " + instantTime, io); + } + return deleted; + } + + /** + * Deletes the heartbeat file for the specified instant. + * @param fs Hadoop FileSystem instance + * @param basePath Hoodie table base path + * @param instantTime Commit instant time + * @param config HoodieWriteConfig instance + * @return Boolean indicating whether heartbeat file was deleted or not + */ + public static boolean deleteHeartbeatFile(FileSystem fs, String basePath, String instantTime, HoodieWriteConfig config) { + if (config.getFailedWritesCleanPolicy().isLazy()) { + return deleteHeartbeatFile(fs, basePath, instantTime); + } + + return false; + } + + /** + * Check if the heartbeat corresponding to instantTime has expired. If yes, abort by throwing an exception. + * @param instantTime + * @param table + * @param heartbeatClient + * @param config + */ + public static void abortIfHeartbeatExpired(String instantTime, HoodieTable table, + HoodieHeartbeatClient heartbeatClient, HoodieWriteConfig config) { + ValidationUtils.checkArgument(heartbeatClient != null); + try { + if (config.getFailedWritesCleanPolicy().isLazy() && heartbeatClient.isHeartbeatExpired(instantTime)) { + throw new HoodieException("Heartbeat for instant " + instantTime + " has expired, last heartbeat " + + HoodieHeartbeatClient.getLastHeartbeatTime(table.getMetaClient().getFs(), config.getBasePath(), instantTime)); + } + } catch (IOException io) { + throw new HoodieException("Unable to read heartbeat", io); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HoodieHeartbeatClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HoodieHeartbeatClient.java new file mode 100644 index 0000000000000..341d72c754a95 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/heartbeat/HoodieHeartbeatClient.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
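Illustrative sketch, not part of the patch: how a write client might use the HeartbeatUtils helpers above around commit finalization. The class and method names in the sketch (HeartbeatGuardSketch, finalizeInstant) are hypothetical; HoodieHeartbeatClient is the heartbeat client class added later in this patch.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.client.heartbeat.HeartbeatUtils;
import org.apache.hudi.client.heartbeat.HoodieHeartbeatClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;

public class HeartbeatGuardSketch {
  // Aborts the commit if its heartbeat has already expired, then drops the heartbeat file
  // once the instant is finalized. Collaborators are passed in rather than constructed here.
  public static void finalizeInstant(FileSystem fs, String basePath, String instantTime,
                                     HoodieTable table, HoodieHeartbeatClient heartbeatClient,
                                     HoodieWriteConfig config) {
    // Throws HoodieException when the lazy failed-writes clean policy is on and the heartbeat expired.
    HeartbeatUtils.abortIfHeartbeatExpired(instantTime, table, heartbeatClient, config);
    // ... complete the commit on the timeline ...
    // No-op unless the failed-writes clean policy is LAZY, matching the guard above.
    HeartbeatUtils.deleteHeartbeatFile(fs, basePath, instantTime, config);
  }
}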
+ */ + +package org.apache.hudi.client.heartbeat; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieHeartbeatException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import javax.annotation.concurrent.NotThreadSafe; +import java.io.IOException; +import java.io.OutputStream; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.Timer; +import java.util.TimerTask; + +/** + * This class creates heartbeat for hudi client. This heartbeat is used to ascertain whether the running job is or not. + * NOTE: Due to CPU contention on the driver/client node, the heartbeats could be delayed, hence it's important to set + * the value high enough to avoid that possibility. + */ +@NotThreadSafe +public class HoodieHeartbeatClient implements AutoCloseable, Serializable { + + private static final Logger LOG = LogManager.getLogger(HoodieHeartbeatClient.class); + + private final transient FileSystem fs; + private final String basePath; + // path to the heartbeat folder where all writers are updating their heartbeats + private String heartbeatFolderPath; + // heartbeat interval in millis + private final Long heartbeatIntervalInMs; + private Integer numTolerableHeartbeatMisses; + private final Long maxAllowableHeartbeatIntervalInMs; + private Map instantToHeartbeatMap; + + public HoodieHeartbeatClient(FileSystem fs, String basePath, Long heartbeatIntervalInMs, + Integer numTolerableHeartbeatMisses) { + ValidationUtils.checkArgument(heartbeatIntervalInMs >= 1000, "Cannot set heartbeat lower than 1 second"); + this.fs = fs; + this.basePath = basePath; + this.heartbeatFolderPath = HoodieTableMetaClient.getHeartbeatFolderPath(basePath); + this.heartbeatIntervalInMs = heartbeatIntervalInMs; + this.numTolerableHeartbeatMisses = numTolerableHeartbeatMisses; + this.maxAllowableHeartbeatIntervalInMs = this.heartbeatIntervalInMs * this.numTolerableHeartbeatMisses; + this.instantToHeartbeatMap = new HashMap<>(); + } + + class Heartbeat { + + private String instantTime; + private Boolean isHeartbeatStarted = false; + private Boolean isHeartbeatStopped = false; + private Long lastHeartbeatTime; + private Integer numHeartbeats = 0; + private Timer timer = new Timer(); + + public String getInstantTime() { + return instantTime; + } + + public void setInstantTime(String instantTime) { + this.instantTime = instantTime; + } + + public Boolean isHeartbeatStarted() { + return isHeartbeatStarted; + } + + public void setHeartbeatStarted(Boolean heartbeatStarted) { + isHeartbeatStarted = heartbeatStarted; + } + + public Boolean isHeartbeatStopped() { + return isHeartbeatStopped; + } + + public void setHeartbeatStopped(Boolean heartbeatStopped) { + isHeartbeatStopped = heartbeatStopped; + } + + public Long getLastHeartbeatTime() { + return lastHeartbeatTime; + } + + public void setLastHeartbeatTime(Long lastHeartbeatTime) { + this.lastHeartbeatTime = lastHeartbeatTime; + } + + public Integer getNumHeartbeats() { + return numHeartbeats; + } + + public void setNumHeartbeats(Integer numHeartbeats) { + this.numHeartbeats = numHeartbeats; + } + + public Timer getTimer() { + return timer; 
+ } + + public void setTimer(Timer timer) { + this.timer = timer; + } + + @Override + public String toString() { + return "Heartbeat{" + + "instantTime='" + instantTime + '\'' + + ", isHeartbeatStarted=" + isHeartbeatStarted + + ", isHeartbeatStopped=" + isHeartbeatStopped + + ", lastHeartbeatTime=" + lastHeartbeatTime + + ", numHeartbeats=" + numHeartbeats + + ", timer=" + timer + + '}'; + } + } + + class HeartbeatTask extends TimerTask { + + private final String instantTime; + + HeartbeatTask(String instantTime) { + this.instantTime = instantTime; + } + + @Override + public void run() { + updateHeartbeat(instantTime); + } + } + + /** + * Start a new heartbeat for the specified instant. If there is already one running, this will be a NO_OP + * @param instantTime + */ + public void start(String instantTime) { + LOG.info("Received request to start heartbeat for instant time " + instantTime); + Heartbeat heartbeat = instantToHeartbeatMap.get(instantTime); + ValidationUtils.checkArgument(heartbeat == null || !heartbeat.isHeartbeatStopped(), "Cannot restart a stopped heartbeat for " + instantTime); + if (heartbeat != null && heartbeat.isHeartbeatStarted()) { + // heartbeat already started, NO_OP + } else { + Heartbeat newHeartbeat = new Heartbeat(); + newHeartbeat.setHeartbeatStarted(true); + instantToHeartbeatMap.put(instantTime, newHeartbeat); + // Ensure heartbeat is generated for the first time with this blocking call. + // Since timer submits the task to a thread, no guarantee when that thread will get CPU + // cycles to generate the first heartbeat. + updateHeartbeat(instantTime); + newHeartbeat.getTimer().scheduleAtFixedRate(new HeartbeatTask(instantTime), this.heartbeatIntervalInMs, + this.heartbeatIntervalInMs); + } + } + + /** + * Stops the heartbeat for the specified instant. + * @param instantTime + * @throws HoodieException + */ + public void stop(String instantTime) throws HoodieException { + Heartbeat heartbeat = instantToHeartbeatMap.get(instantTime); + if (heartbeat != null && heartbeat.isHeartbeatStarted() && !heartbeat.isHeartbeatStopped()) { + LOG.info("Stopping heartbeat for instant " + instantTime); + heartbeat.getTimer().cancel(); + heartbeat.setHeartbeatStopped(true); + LOG.info("Stopped heartbeat for instant " + instantTime); + HeartbeatUtils.deleteHeartbeatFile(fs, basePath, instantTime); + LOG.info("Deleted heartbeat file for instant " + instantTime); + } + } + + /** + * Stops all heartbeats started via this instance of the client. 
+ * @throws HoodieException + */ + public void stop() throws HoodieException { + instantToHeartbeatMap.values().forEach(heartbeat -> stop(heartbeat.getInstantTime())); + } + + public static Long getLastHeartbeatTime(FileSystem fs, String basePath, String instantTime) throws IOException { + Path heartbeatFilePath = new Path(HoodieTableMetaClient.getHeartbeatFolderPath(basePath) + Path.SEPARATOR + instantTime); + if (fs.exists(heartbeatFilePath)) { + return fs.getFileStatus(heartbeatFilePath).getModificationTime(); + } else { + // NOTE : This can happen when a writer is upgraded to use lazy cleaning and the last write had failed + return 0L; + } + } + + public static Boolean heartbeatExists(FileSystem fs, String basePath, String instantTime) throws IOException { + Path heartbeatFilePath = new Path(HoodieTableMetaClient.getHeartbeatFolderPath(basePath) + Path.SEPARATOR + instantTime); + if (fs.exists(heartbeatFilePath)) { + return true; + } + return false; + } + + public boolean isHeartbeatExpired(String instantTime) throws IOException { + Long currentTime = System.currentTimeMillis(); + Heartbeat lastHeartbeatForWriter = instantToHeartbeatMap.get(instantTime); + if (lastHeartbeatForWriter == null) { + LOG.info("Heartbeat not found in internal map, falling back to reading from DFS"); + long lastHeartbeatForWriterTime = getLastHeartbeatTime(this.fs, basePath, instantTime); + lastHeartbeatForWriter = new Heartbeat(); + lastHeartbeatForWriter.setLastHeartbeatTime(lastHeartbeatForWriterTime); + lastHeartbeatForWriter.setInstantTime(instantTime); + } + if (currentTime - lastHeartbeatForWriter.getLastHeartbeatTime() > this.maxAllowableHeartbeatIntervalInMs) { + LOG.warn("Heartbeat expired, currentTime = " + currentTime + ", last heartbeat = " + lastHeartbeatForWriter + + ", heartbeat interval = " + this.heartbeatIntervalInMs); + return true; + } + return false; + } + + public List getAllExistingHeartbeatInstants() throws IOException { + Path heartbeatFolder = new Path(heartbeatFolderPath); + if (this.fs.exists(heartbeatFolder)) { + FileStatus[] fileStatus = this.fs.listStatus(new Path(heartbeatFolderPath)); + return Arrays.stream(fileStatus).map(fs -> fs.getPath().getName()).collect(Collectors.toList()); + } + return Collections.EMPTY_LIST; + } + + private void updateHeartbeat(String instantTime) throws HoodieHeartbeatException { + try { + Long newHeartbeatTime = System.currentTimeMillis(); + OutputStream outputStream = + this.fs.create(new Path(heartbeatFolderPath + Path.SEPARATOR + instantTime), true); + outputStream.close(); + Heartbeat heartbeat = instantToHeartbeatMap.get(instantTime); + if (heartbeat.getLastHeartbeatTime() != null && isHeartbeatExpired(instantTime)) { + LOG.error("Aborting, missed generating heartbeat within allowable interval " + this.maxAllowableHeartbeatIntervalInMs); + // Since TimerTask allows only java.lang.Runnable, cannot throw an exception and bubble to the caller thread, hence + // explicitly interrupting the timer thread. 
+ Thread.currentThread().interrupt(); + } + heartbeat.setInstantTime(instantTime); + heartbeat.setLastHeartbeatTime(newHeartbeatTime); + heartbeat.setNumHeartbeats(heartbeat.getNumHeartbeats() + 1); + } catch (IOException io) { + throw new HoodieHeartbeatException("Unable to generate heartbeat ", io); + } + } + + public String getHeartbeatFolderPath() { + return heartbeatFolderPath; + } + + public Heartbeat getHeartbeat(String instantTime) { + return this.instantToHeartbeatMap.get(instantTime); + } + + @Override + public void close() { + this.stop(); + this.instantToHeartbeatMap.clear(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConcurrentOperation.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConcurrentOperation.java new file mode 100644 index 0000000000000..35580229e3867 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConcurrentOperation.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.transaction; + +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.client.utils.MetadataConversionUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieMetadataWrapper; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; + +import java.io.IOException; +import java.util.Collections; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.DELTA_COMMIT_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION; +import static org.apache.hudi.common.util.CommitUtils.getFileIdWithoutSuffixAndRelativePathsFromSpecificRecord; + +/** + * This class is used to hold all information used to identify how to resolve conflicts between instants. + * Since we interchange payload types between AVRO specific records and POJO's, this object serves as + * a common payload to manage these conversions. 
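Illustrative sketch, not part of the patch: driving the HoodieHeartbeatClient defined above for a single write. The wrapper class, the 60-second interval, and the tolerance of two missed heartbeats are hypothetical example values.

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.client.heartbeat.HoodieHeartbeatClient;

public class HeartbeatLifecycleSketch {
  public static void writeWithHeartbeat(FileSystem fs, String basePath, String instantTime) throws IOException {
    // 60s interval, expire after 2 missed heartbeats; close() stops any heartbeats still running.
    try (HoodieHeartbeatClient heartbeats = new HoodieHeartbeatClient(fs, basePath, 60_000L, 2)) {
      heartbeats.start(instantTime);          // the first heartbeat is written synchronously
      // ... write data files for instantTime ...
      if (heartbeats.isHeartbeatExpired(instantTime)) {
        // another process may already treat this write as failed; abort instead of committing
        return;
      }
      // ... commit instantTime ...
      heartbeats.stop(instantTime);           // cancels the timer and deletes the heartbeat file
    }
  }
}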
+ */ +public class ConcurrentOperation { + + private WriteOperationType operationType; + private final HoodieMetadataWrapper metadataWrapper; + private final Option commitMetadataOption; + private final String actionState; + private final String actionType; + private final String instantTime; + private Set mutatedFileIds = Collections.EMPTY_SET; + + public ConcurrentOperation(HoodieInstant instant, HoodieTableMetaClient metaClient) throws IOException { + this.metadataWrapper = new HoodieMetadataWrapper(MetadataConversionUtils.createMetaWrapper(instant, metaClient)); + this.commitMetadataOption = Option.empty(); + this.actionState = instant.getState().name(); + this.actionType = instant.getAction(); + this.instantTime = instant.getTimestamp(); + init(instant); + } + + public ConcurrentOperation(HoodieInstant instant, HoodieCommitMetadata commitMetadata) { + this.commitMetadataOption = Option.of(commitMetadata); + this.metadataWrapper = new HoodieMetadataWrapper(commitMetadata); + this.actionState = instant.getState().name(); + this.actionType = instant.getAction(); + this.instantTime = instant.getTimestamp(); + init(instant); + } + + public String getInstantActionState() { + return actionState; + } + + public String getInstantActionType() { + return actionType; + } + + public String getInstantTimestamp() { + return instantTime; + } + + public WriteOperationType getOperationType() { + return operationType; + } + + public Set getMutatedFileIds() { + return mutatedFileIds; + } + + public Option getCommitMetadataOption() { + return commitMetadataOption; + } + + private void init(HoodieInstant instant) { + if (this.metadataWrapper.isAvroMetadata()) { + switch (getInstantActionType()) { + case COMPACTION_ACTION: + this.operationType = WriteOperationType.COMPACT; + this.mutatedFileIds = this.metadataWrapper.getMetadataFromTimeline().getHoodieCompactionPlan().getOperations() + .stream() + .map(op -> op.getFileId()) + .collect(Collectors.toSet()); + break; + case COMMIT_ACTION: + case DELTA_COMMIT_ACTION: + this.mutatedFileIds = getFileIdWithoutSuffixAndRelativePathsFromSpecificRecord(this.metadataWrapper.getMetadataFromTimeline().getHoodieCommitMetadata() + .getPartitionToWriteStats()).keySet(); + this.operationType = WriteOperationType.fromValue(this.metadataWrapper.getMetadataFromTimeline().getHoodieCommitMetadata().getOperationType()); + break; + case REPLACE_COMMIT_ACTION: + if (instant.isCompleted()) { + this.mutatedFileIds = getFileIdWithoutSuffixAndRelativePathsFromSpecificRecord( + this.metadataWrapper.getMetadataFromTimeline().getHoodieReplaceCommitMetadata().getPartitionToWriteStats()).keySet(); + this.operationType = WriteOperationType.fromValue(this.metadataWrapper.getMetadataFromTimeline().getHoodieReplaceCommitMetadata().getOperationType()); + } else { + // we need to have different handling for requested and inflight replacecommit because + // for requested replacecommit, clustering will generate a plan and HoodieRequestedReplaceMetadata will not be empty, but insert_overwrite/insert_overwrite_table could have empty content + // for inflight replacecommit, clustering will have no content in metadata, but insert_overwrite/insert_overwrite_table will have some commit metadata + HoodieRequestedReplaceMetadata requestedReplaceMetadata = this.metadataWrapper.getMetadataFromTimeline().getHoodieRequestedReplaceMetadata(); + org.apache.hudi.avro.model.HoodieCommitMetadata inflightCommitMetadata = this.metadataWrapper.getMetadataFromTimeline().getHoodieInflightReplaceMetadata(); + if 
(instant.isRequested()) { + // for insert_overwrite/insert_overwrite_table clusteringPlan will be empty + if (requestedReplaceMetadata != null && requestedReplaceMetadata.getClusteringPlan() != null) { + this.mutatedFileIds = getFileIdsFromRequestedReplaceMetadata(requestedReplaceMetadata); + this.operationType = WriteOperationType.CLUSTER; + } + } else { + if (inflightCommitMetadata != null) { + this.mutatedFileIds = getFileIdWithoutSuffixAndRelativePathsFromSpecificRecord(inflightCommitMetadata.getPartitionToWriteStats()).keySet(); + this.operationType = WriteOperationType.fromValue(this.metadataWrapper.getMetadataFromTimeline().getHoodieInflightReplaceMetadata().getOperationType()); + } else if (requestedReplaceMetadata != null) { + // inflight replacecommit metadata is empty due to clustering, read fileIds from requested replacecommit + this.mutatedFileIds = getFileIdsFromRequestedReplaceMetadata(requestedReplaceMetadata); + this.operationType = WriteOperationType.CLUSTER; + } + // NOTE: it cannot be the case that instant is inflight, and both the requested and inflight replacecommit metadata are empty + } + } + break; + default: + throw new IllegalArgumentException("Unsupported Action Type " + getInstantActionType()); + } + } else { + switch (getInstantActionType()) { + case COMMIT_ACTION: + case DELTA_COMMIT_ACTION: + this.mutatedFileIds = CommitUtils.getFileIdWithoutSuffixAndRelativePaths(this.metadataWrapper.getCommitMetadata().getPartitionToWriteStats()).keySet(); + this.operationType = this.metadataWrapper.getCommitMetadata().getOperationType(); + break; + default: + throw new IllegalArgumentException("Unsupported Action Type " + getInstantActionType()); + } + } + } + + private static Set getFileIdsFromRequestedReplaceMetadata(HoodieRequestedReplaceMetadata requestedReplaceMetadata) { + return requestedReplaceMetadata + .getClusteringPlan().getInputGroups() + .stream() + .flatMap(ig -> ig.getSlices().stream()) + .map(file -> file.getFileId()) + .collect(Collectors.toSet()); + } + + @Override + public String toString() { + return "{" + + "actionType=" + this.getInstantActionType() + + ", instantTime=" + this.getInstantTimestamp() + + ", actionState=" + this.getInstantActionState() + + '\'' + '}'; + } +} \ No newline at end of file diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConflictResolutionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConflictResolutionStrategy.java new file mode 100644 index 0000000000000..d1e988adb59ae --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConflictResolutionStrategy.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
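Illustrative sketch, not part of the patch: building the ConcurrentOperation payload above for the current write (from in-memory commit metadata) and for an instant already on the timeline (read back through the meta client). The sketch class name and its parameters are hypothetical.

import java.io.IOException;
import org.apache.hudi.client.transaction.ConcurrentOperation;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;

public class ConcurrentOperationSketch {
  public static void describe(HoodieInstant currentInstant, HoodieCommitMetadata currentMetadata,
                              HoodieInstant committedInstant, HoodieTableMetaClient metaClient) throws IOException {
    // The current write: its metadata is still in memory, so use the POJO-based constructor.
    ConcurrentOperation mine = new ConcurrentOperation(currentInstant, currentMetadata);
    // A write already on the timeline: its metadata is deserialized via the meta client.
    ConcurrentOperation theirs = new ConcurrentOperation(committedInstant, metaClient);
    System.out.println("current write mutates file ids " + mine.getMutatedFileIds());
    System.out.println("timeline " + theirs.getOperationType() + " mutated file ids " + theirs.getMutatedFileIds());
  }
}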
+ */ + +package org.apache.hudi.client.transaction; + +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIMethod; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieWriteConflictException; +import org.apache.hudi.table.HoodieTable; + +import java.util.stream.Stream; + +/** + * Strategy interface for conflict resolution with multiple writers. + * Users can provide pluggable implementations for different kinds of strategies to resolve conflicts when multiple + * writers are mutating the hoodie table. + */ +public interface ConflictResolutionStrategy { + + /** + * Stream of instants to check conflicts against. + * @return + */ + Stream getCandidateInstants(HoodieActiveTimeline activeTimeline, HoodieInstant currentInstant, Option lastSuccessfulInstant); + + /** + * Implementations of this method will determine whether a conflict exists between 2 commits. + * @param thisOperation + * @param otherOperation + * @return + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + boolean hasConflict(ConcurrentOperation thisOperation, ConcurrentOperation otherOperation); + + /** + * Implementations of this method will determine how to resolve a conflict between 2 commits. + * @param thisOperation + * @param otherOperation + * @return + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + Option resolveConflict(HoodieTable table, + ConcurrentOperation thisOperation, ConcurrentOperation otherOperation) throws HoodieWriteConflictException; + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/SimpleConcurrentFileWritesConflictResolutionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/SimpleConcurrentFileWritesConflictResolutionStrategy.java new file mode 100644 index 0000000000000..938a40684a092 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/SimpleConcurrentFileWritesConflictResolutionStrategy.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
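Illustrative sketch, not part of the patch: the pre-commit loop a writer might run over this interface, assuming the stream and option signatures carry the generic type arguments (HoodieInstant, HoodieCommitMetadata) that this rendering of the diff has dropped. The sketch class and parameters are hypothetical.

import java.io.IOException;
import java.util.Iterator;
import org.apache.hudi.client.transaction.ConcurrentOperation;
import org.apache.hudi.client.transaction.ConflictResolutionStrategy;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.HoodieTable;

public class PreCommitConflictCheckSketch {
  public static void checkForConflicts(ConflictResolutionStrategy strategy, HoodieTable table,
                                       HoodieTableMetaClient metaClient, HoodieActiveTimeline activeTimeline,
                                       HoodieInstant currentInstant, HoodieCommitMetadata currentMetadata,
                                       Option<HoodieInstant> lastSuccessfulInstant) throws IOException {
    ConcurrentOperation thisOp = new ConcurrentOperation(currentInstant, currentMetadata);
    Iterator<HoodieInstant> candidates = strategy
        .getCandidateInstants(activeTimeline, currentInstant, lastSuccessfulInstant)
        .iterator();
    while (candidates.hasNext()) {
      ConcurrentOperation otherOp = new ConcurrentOperation(candidates.next(), metaClient);
      if (strategy.hasConflict(thisOp, otherOp)) {
        // Throws HoodieWriteConflictException when the overlap cannot be reconciled.
        strategy.resolveConflict(table, thisOp, otherOp);
      }
    }
  }
}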
+ */ + +package org.apache.hudi.client.transaction; + +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieWriteConflictException; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ConcurrentModificationException; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Stream; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION; + +/** + * This class is a basic implementation of a conflict resolution strategy for concurrent writes {@link ConflictResolutionStrategy}. + */ +public class SimpleConcurrentFileWritesConflictResolutionStrategy + implements ConflictResolutionStrategy { + + private static final Logger LOG = LogManager.getLogger(SimpleConcurrentFileWritesConflictResolutionStrategy.class); + + @Override + public Stream getCandidateInstants(HoodieActiveTimeline activeTimeline, HoodieInstant currentInstant, + Option lastSuccessfulInstant) { + + // To find which instants are conflicting, we apply the following logic + // 1. Get completed instants timeline only for commits that have happened since the last successful write. + // 2. Get any scheduled or completed compaction or clustering operations that have started and/or finished + // after the current instant. We need to check for write conflicts since they may have mutated the same files + // that are being newly created by the current write. + Stream completedCommitsInstantStream = activeTimeline + .getCommitsTimeline() + .filterCompletedInstants() + .findInstantsAfter(lastSuccessfulInstant.isPresent() ? lastSuccessfulInstant.get().getTimestamp() : HoodieTimeline.INIT_INSTANT_TS) + .getInstants(); + + Stream compactionAndClusteringPendingTimeline = activeTimeline + .getTimelineOfActions(CollectionUtils.createSet(REPLACE_COMMIT_ACTION, COMPACTION_ACTION)) + .findInstantsAfter(currentInstant.getTimestamp()) + .filterInflightsAndRequested() + .getInstants(); + return Stream.concat(completedCommitsInstantStream, compactionAndClusteringPendingTimeline); + } + + @Override + public boolean hasConflict(ConcurrentOperation thisOperation, ConcurrentOperation otherOperation) { + // TODO : UUID's can clash even for insert/insert, handle that case. + Set fileIdsSetForFirstInstant = thisOperation.getMutatedFileIds(); + Set fileIdsSetForSecondInstant = otherOperation.getMutatedFileIds(); + Set intersection = new HashSet<>(fileIdsSetForFirstInstant); + intersection.retainAll(fileIdsSetForSecondInstant); + if (!intersection.isEmpty()) { + LOG.info("Found conflicting writes between first operation = " + thisOperation + + ", second operation = " + otherOperation + " , intersecting file ids " + intersection); + return true; + } + return false; + } + + @Override + public Option resolveConflict(HoodieTable table, + ConcurrentOperation thisOperation, ConcurrentOperation otherOperation) { + // A completed COMPACTION action eventually shows up as a COMMIT action on the timeline. 
+ // We need to ensure we handle this during conflict resolution and not treat the commit from a + // compaction operation as a regular commit. Regular commits & deltacommits are candidates for conflict. + // Since the REPLACE action with CLUSTER operation does not support concurrent updates, we have + // to consider it as conflict if we see overlapping file ids. Once concurrent updates are + // supported for CLUSTER (https://issues.apache.org/jira/browse/HUDI-1042), + // add that to the below check so that concurrent updates do not conflict. + if (otherOperation.getOperationType() == WriteOperationType.COMPACT + && HoodieTimeline.compareTimestamps(otherOperation.getInstantTimestamp(), HoodieTimeline.LESSER_THAN, thisOperation.getInstantTimestamp())) { + return thisOperation.getCommitMetadataOption(); + } + // just abort the current write if conflicts are found + throw new HoodieWriteConflictException(new ConcurrentModificationException("Cannot resolve conflicts for overlapping writes")); + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/TransactionManager.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/TransactionManager.java new file mode 100644 index 0000000000000..bcf8ef6ea5045 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/TransactionManager.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.transaction; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hudi.client.transaction.lock.LockManager; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.Serializable; + +/** + * This class allows clients to start and end transactions. Anything done between a start and end transaction is + * guaranteed to be atomic. 
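Illustrative sketch, not part of the patch: the begin/end scope the TransactionManager described above provides around a commit. Both calls are no-ops when optimistic concurrency control is disabled; the wrapper class and the Runnable parameter are hypothetical.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.client.transaction.TransactionManager;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;

public class TransactionScopeSketch {
  public static void commitAtomically(HoodieWriteConfig config, FileSystem fs,
                                      Option<HoodieInstant> currentInstant,
                                      Option<HoodieInstant> lastCompletedInstant,
                                      Runnable commitAction) {
    TransactionManager txnManager = new TransactionManager(config, fs);
    try {
      txnManager.beginTransaction(currentInstant, lastCompletedInstant); // acquires the table lock
      commitAction.run(); // e.g. conflict resolution plus writing the completed instant
    } finally {
      txnManager.endTransaction(currentInstant); // releases the lock if we still own the transaction
      txnManager.close();
    }
  }
}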
+ */ +public class TransactionManager implements Serializable { + + private static final Logger LOG = LogManager.getLogger(TransactionManager.class); + private final LockManager lockManager; + private final boolean isOptimisticConcurrencyControlEnabled; + private Option currentTxnOwnerInstant = Option.empty(); + private Option lastCompletedTxnOwnerInstant = Option.empty(); + + public TransactionManager(HoodieWriteConfig config, FileSystem fs) { + this.lockManager = new LockManager(config, fs); + this.isOptimisticConcurrencyControlEnabled = config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl(); + } + + public void beginTransaction(Option newTxnOwnerInstant, + Option lastCompletedTxnOwnerInstant) { + if (isOptimisticConcurrencyControlEnabled) { + LOG.info("Transaction starting for " + newTxnOwnerInstant + + " with latest completed transaction instant " + lastCompletedTxnOwnerInstant); + lockManager.lock(); + reset(currentTxnOwnerInstant, newTxnOwnerInstant, lastCompletedTxnOwnerInstant); + LOG.info("Transaction started for " + newTxnOwnerInstant + + " with latest completed transaction instant " + lastCompletedTxnOwnerInstant); + } + } + + public void endTransaction(Option currentTxnOwnerInstant) { + if (isOptimisticConcurrencyControlEnabled) { + LOG.info("Transaction ending with transaction owner " + currentTxnOwnerInstant); + if (reset(currentTxnOwnerInstant, Option.empty(), Option.empty())) { + lockManager.unlock(); + LOG.info("Transaction ended with transaction owner " + currentTxnOwnerInstant); + } + } + } + + private synchronized boolean reset(Option callerInstant, + Option newTxnOwnerInstant, + Option lastCompletedTxnOwnerInstant) { + if (!this.currentTxnOwnerInstant.isPresent() || this.currentTxnOwnerInstant.get().equals(callerInstant.get())) { + this.currentTxnOwnerInstant = newTxnOwnerInstant; + this.lastCompletedTxnOwnerInstant = lastCompletedTxnOwnerInstant; + return true; + } + return false; + } + + public void close() { + if (isOptimisticConcurrencyControlEnabled) { + lockManager.close(); + LOG.info("Transaction manager closed"); + } + } + + public LockManager getLockManager() { + return lockManager; + } + + public Option getLastCompletedTransactionOwner() { + return lastCompletedTxnOwnerInstant; + } + + public Option getCurrentTransactionOwner() { + return currentTxnOwnerInstant; + } + + public boolean isOptimisticConcurrencyControlEnabled() { + return isOptimisticConcurrencyControlEnabled; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/FileSystemBasedLockProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/FileSystemBasedLockProvider.java new file mode 100644 index 0000000000000..4135ef9acded4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/FileSystemBasedLockProvider.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client.transaction.lock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.lock.LockProvider; +import org.apache.hudi.common.lock.LockState; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieLockException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_EXPIRE_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; + +/** + * A FileSystem based lock. This {@link LockProvider} implementation allows to lock table operations + * using DFS. Users might need to manually clean the Locker's path if writeClient crash and never run again. + * NOTE: This only works for DFS with atomic create/delete operation + */ +public class FileSystemBasedLockProvider implements LockProvider, Serializable { + + private static final Logger LOG = LogManager.getLogger(FileSystemBasedLockProvider.class); + + private static final String LOCK_FILE_NAME = "lock"; + + private final int lockTimeoutMinutes; + private final transient FileSystem fs; + private final transient Path lockFile; + protected LockConfiguration lockConfiguration; + + public FileSystemBasedLockProvider(final LockConfiguration lockConfiguration, final Configuration configuration) { + checkRequiredProps(lockConfiguration); + this.lockConfiguration = lockConfiguration; + String lockDirectory = lockConfiguration.getConfig().getString(FILESYSTEM_LOCK_PATH_PROP_KEY, null); + if (StringUtils.isNullOrEmpty(lockDirectory)) { + lockDirectory = lockConfiguration.getConfig().getString(HoodieWriteConfig.BASE_PATH.key()) + + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME; + } + this.lockTimeoutMinutes = lockConfiguration.getConfig().getInteger(FILESYSTEM_LOCK_EXPIRE_PROP_KEY); + this.lockFile = new Path(lockDirectory + Path.SEPARATOR + LOCK_FILE_NAME); + this.fs = FSUtils.getFs(this.lockFile.toString(), configuration); + } + + @Override + public void close() { + synchronized (LOCK_FILE_NAME) { + try { + fs.delete(this.lockFile, true); + } catch (IOException e) { + throw new HoodieLockException(generateLogStatement(LockState.FAILED_TO_RELEASE), e); + } + } + } + + @Override + public boolean tryLock(long time, TimeUnit unit) { + try { + synchronized (LOCK_FILE_NAME) { + // Check whether lock is already expired, if so try to delete lock file + if (fs.exists(this.lockFile)) { + if (checkIfExpired()) { + fs.delete(this.lockFile, true); + LOG.warn("Delete expired lock file: " + this.lockFile); + } else { + return 
false; + } + } + acquireLock(); + return fs.exists(this.lockFile); + } + } catch (IOException | HoodieIOException e) { + LOG.info(generateLogStatement(LockState.FAILED_TO_ACQUIRE), e); + return false; + } + } + + @Override + public void unlock() { + synchronized (LOCK_FILE_NAME) { + try { + if (fs.exists(this.lockFile)) { + fs.delete(this.lockFile, true); + } + } catch (IOException io) { + throw new HoodieIOException(generateLogStatement(LockState.FAILED_TO_RELEASE), io); + } + } + } + + @Override + public String getLock() { + return this.lockFile.toString(); + } + + private boolean checkIfExpired() { + if (lockTimeoutMinutes == 0) { + return false; + } + try { + long modificationTime = fs.getFileStatus(this.lockFile).getModificationTime(); + if (System.currentTimeMillis() - modificationTime > lockTimeoutMinutes * 60 * 1000L) { + return true; + } + } catch (IOException | HoodieIOException e) { + LOG.error(generateLogStatement(LockState.ALREADY_RELEASED) + " failed to get lockFile's modification time", e); + } + return false; + } + + private void acquireLock() { + try { + fs.create(this.lockFile, false).close(); + } catch (IOException e) { + throw new HoodieIOException(generateLogStatement(LockState.FAILED_TO_ACQUIRE), e); + } + } + + protected String generateLogStatement(LockState state) { + return StringUtils.join(state.name(), " lock at: ", getLock()); + } + + private void checkRequiredProps(final LockConfiguration config) { + ValidationUtils.checkArgument(config.getConfig().getString(FILESYSTEM_LOCK_PATH_PROP_KEY, null) != null + || config.getConfig().getString(HoodieWriteConfig.BASE_PATH.key(), null) != null); + ValidationUtils.checkArgument(config.getConfig().getInteger(FILESYSTEM_LOCK_EXPIRE_PROP_KEY) >= 0); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/InProcessLockProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/InProcessLockProvider.java new file mode 100644 index 0000000000000..c3cd5742482a0 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/InProcessLockProvider.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
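Illustrative sketch, not part of the patch: configuring and using the FileSystemBasedLockProvider above directly. The lock directory and the 10-minute expiry are example values, and the sketch assumes a LockConfiguration can be built straight from TypedProperties, as the LockManager later in this patch does.

import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.transaction.lock.FileSystemBasedLockProvider;
import org.apache.hudi.common.config.LockConfiguration;
import org.apache.hudi.common.config.TypedProperties;

import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_EXPIRE_PROP_KEY;
import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY;

public class FileSystemLockSketch {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    // Directory that holds the single "lock" file; when unset it defaults to <base path>/.hoodie.
    props.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, "/tmp/hudi_table/.hoodie/.locks");
    // Treat a lock file older than 10 minutes as expired and reclaim it.
    props.setProperty(FILESYSTEM_LOCK_EXPIRE_PROP_KEY, "10");

    FileSystemBasedLockProvider lockProvider =
        new FileSystemBasedLockProvider(new LockConfiguration(props), new Configuration());
    if (lockProvider.tryLock(60, TimeUnit.SECONDS)) {   // this provider ignores the wait-time arguments
      try {
        // ... perform the timeline-mutating operation ...
      } finally {
        lockProvider.unlock();
      }
    }
  }
}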
+ */ + +package org.apache.hudi.client.transaction.lock; + +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.lock.LockProvider; +import org.apache.hudi.common.lock.LockState; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieLockException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.NotNull; + +import java.io.Serializable; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +/** + * InProcess level lock. This {@link LockProvider} implementation is to + * guard table from concurrent operations happening in the local JVM process. + *
+ * <p>
+ * Note: This Lock provider implementation doesn't allow lock reentrancy. + * Attempting to reacquire the lock from the same thread will throw + * HoodieLockException. Threads other than the current lock owner, will + * block on lock() and return false on tryLock(). + */ +public class InProcessLockProvider implements LockProvider, Serializable { + + private static final Logger LOG = LogManager.getLogger(InProcessLockProvider.class); + private static final ReentrantReadWriteLock LOCK = new ReentrantReadWriteLock(); + private final long maxWaitTimeMillis; + + public InProcessLockProvider(final LockConfiguration lockConfiguration, final Configuration conf) { + TypedProperties typedProperties = lockConfiguration.getConfig(); + maxWaitTimeMillis = typedProperties.getLong(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, + LockConfiguration.DEFAULT_ACQUIRE_LOCK_WAIT_TIMEOUT_MS); + } + + @Override + public void lock() { + LOG.info(getLogMessage(LockState.ACQUIRING)); + if (LOCK.isWriteLockedByCurrentThread()) { + throw new HoodieLockException(getLogMessage(LockState.ALREADY_ACQUIRED)); + } + LOCK.writeLock().lock(); + LOG.info(getLogMessage(LockState.ACQUIRED)); + } + + @Override + public boolean tryLock() { + return tryLock(maxWaitTimeMillis, TimeUnit.MILLISECONDS); + } + + @Override + public boolean tryLock(long time, @NotNull TimeUnit unit) { + LOG.info(getLogMessage(LockState.ACQUIRING)); + if (LOCK.isWriteLockedByCurrentThread()) { + throw new HoodieLockException(getLogMessage(LockState.ALREADY_ACQUIRED)); + } + + boolean isLockAcquired; + try { + isLockAcquired = LOCK.writeLock().tryLock(time, unit); + } catch (InterruptedException e) { + throw new HoodieLockException(getLogMessage(LockState.FAILED_TO_ACQUIRE)); + } + + LOG.info(getLogMessage(isLockAcquired ? LockState.ACQUIRED : LockState.FAILED_TO_ACQUIRE)); + return isLockAcquired; + } + + @Override + public void unlock() { + LOG.info(getLogMessage(LockState.RELEASING)); + try { + if (LOCK.isWriteLockedByCurrentThread()) { + LOCK.writeLock().unlock(); + LOG.info(getLogMessage(LockState.RELEASED)); + } else { + LOG.warn("Cannot unlock because the current thread does not hold the lock."); + } + } catch (Exception e) { + throw new HoodieLockException(getLogMessage(LockState.FAILED_TO_RELEASE), e); + } + } + + @Override + public ReentrantReadWriteLock getLock() { + return LOCK; + } + + @Override + public void close() { + if (LOCK.isWriteLockedByCurrentThread()) { + LOCK.writeLock().unlock(); + } + } + + private String getLogMessage(LockState state) { + return StringUtils.join("Thread ", String.valueOf(Thread.currentThread().getName()), " ", + state.name(), " in-process lock."); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java new file mode 100644 index 0000000000000..2c5a8846383e4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
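Illustrative sketch, not part of the patch: two local writer threads contending on the InProcessLockProvider above. An empty LockConfiguration is assumed to be acceptable here because the provider only reads the acquire-wait timeout, for which it has a default; the thread setup is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.transaction.lock.InProcessLockProvider;
import org.apache.hudi.common.config.LockConfiguration;
import org.apache.hudi.common.config.TypedProperties;

public class InProcessLockSketch {
  public static void main(String[] args) throws InterruptedException {
    InProcessLockProvider lockProvider =
        new InProcessLockProvider(new LockConfiguration(new TypedProperties()), new Configuration());

    Runnable writer = () -> {
      lockProvider.lock();           // blocks until the shared JVM-wide write lock is free
      try {
        // ... mutate the table from this writer ...
      } finally {
        lockProvider.unlock();
      }
    };

    Thread first = new Thread(writer);
    Thread second = new Thread(writer);
    first.start();
    second.start();
    first.join();
    second.join();
    lockProvider.close();
  }
}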
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.transaction.lock; + +import org.apache.hudi.client.transaction.lock.metrics.HoodieLockMetrics; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.lock.LockProvider; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.config.HoodieLockConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieLockException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.Serializable; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; + +/** + * This class wraps implementations of {@link LockProvider} and provides an easy way to manage the lifecycle of a lock. + */ +public class LockManager implements Serializable, AutoCloseable { + + private static final Logger LOG = LogManager.getLogger(LockManager.class); + private final HoodieWriteConfig writeConfig; + private final LockConfiguration lockConfiguration; + private final SerializableConfiguration hadoopConf; + private final int maxRetries; + private final long maxWaitTimeInMs; + private transient HoodieLockMetrics metrics; + private volatile LockProvider lockProvider; + + public LockManager(HoodieWriteConfig writeConfig, FileSystem fs) { + this.writeConfig = writeConfig; + this.hadoopConf = new SerializableConfiguration(fs.getConf()); + this.lockConfiguration = new LockConfiguration(writeConfig.getProps()); + maxRetries = lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, + Integer.parseInt(HoodieLockConfig.LOCK_ACQUIRE_CLIENT_NUM_RETRIES.defaultValue())); + maxWaitTimeInMs = lockConfiguration.getConfig().getLong(LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, + Long.parseLong(HoodieLockConfig.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS.defaultValue())); + metrics = new HoodieLockMetrics(writeConfig); + } + + public void lock() { + if (writeConfig.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { + LockProvider lockProvider = getLockProvider(); + int retryCount = 0; + boolean acquired = false; + while (retryCount <= maxRetries) { + try { + metrics.startLockApiTimerContext(); + acquired = lockProvider.tryLock(writeConfig.getLockAcquireWaitTimeoutInMs(), TimeUnit.MILLISECONDS); + if (acquired) { + metrics.updateLockAcquiredMetric(); + break; + } + metrics.updateLockNotAcquiredMetric(); + LOG.info("Retrying to acquire lock..."); + Thread.sleep(maxWaitTimeInMs); + } catch (HoodieLockException | InterruptedException e) { + metrics.updateLockNotAcquiredMetric(); + if (retryCount >= maxRetries) { + throw new HoodieLockException("Unable to acquire lock, lock object " + lockProvider.getLock(), e); + } + try { + Thread.sleep(maxWaitTimeInMs); + } catch (InterruptedException ex) { + // ignore 
InterruptedException here + } + } finally { + retryCount++; + } + } + if (!acquired) { + throw new HoodieLockException("Unable to acquire lock, lock object " + lockProvider.getLock()); + } + } + } + + /** + * We need to take care of the scenarios that current thread may not be the holder of this lock + * and tries to call unlock() + */ + public void unlock() { + if (writeConfig.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { + getLockProvider().unlock(); + metrics.updateLockHeldTimerMetrics(); + } + } + + public synchronized LockProvider getLockProvider() { + // Perform lazy initialization of lock provider only if needed + if (lockProvider == null) { + LOG.info("LockProvider " + writeConfig.getLockProviderClass()); + lockProvider = (LockProvider) ReflectionUtils.loadClass(writeConfig.getLockProviderClass(), + lockConfiguration, hadoopConf.get()); + } + return lockProvider; + } + + @Override + public void close() { + closeQuietly(); + } + + private void closeQuietly() { + try { + if (lockProvider != null) { + lockProvider.close(); + LOG.info("Released connection created for acquiring lock"); + lockProvider = null; + } + } catch (Exception e) { + LOG.error("Unable to close and release connection created for acquiring lock", e); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/ZookeeperBasedLockProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/ZookeeperBasedLockProvider.java new file mode 100644 index 0000000000000..fc5b7a75f7f60 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/ZookeeperBasedLockProvider.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
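Illustrative sketch, not part of the patch: the lock/unlock pattern the LockManager above encapsulates. Both calls only take effect when the write config enables optimistic concurrency control; the wrapper method and Runnable parameter are hypothetical.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.client.transaction.lock.LockManager;
import org.apache.hudi.config.HoodieWriteConfig;

public class LockManagerSketch {
  public static void withTableLock(HoodieWriteConfig config, FileSystem fs, Runnable criticalSection) {
    LockManager lockManager = new LockManager(config, fs);
    try {
      lockManager.lock();        // retries per the configured client retry settings
      criticalSection.run();
    } finally {
      lockManager.unlock();
      lockManager.close();       // releases the lazily created lock provider, if any
    }
  }
}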
+ */ + +package org.apache.hudi.client.transaction.lock; + +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.lock.LockProvider; +import org.apache.hudi.common.lock.LockState; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieLockException; + +import org.apache.curator.framework.CuratorFramework; +import org.apache.curator.framework.CuratorFrameworkFactory; +import org.apache.curator.framework.imps.CuratorFrameworkState; +import org.apache.curator.framework.recipes.locks.InterProcessMutex; +import org.apache.curator.retry.BoundedExponentialBackoffRetry; +import org.apache.hadoop.conf.Configuration; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import javax.annotation.concurrent.NotThreadSafe; + +import java.io.Serializable; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.config.LockConfiguration.DEFAULT_ZK_CONNECTION_TIMEOUT_MS; +import static org.apache.hudi.common.config.LockConfiguration.DEFAULT_ZK_SESSION_TIMEOUT_MS; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_BASE_PATH_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_CONNECTION_TIMEOUT_MS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_CONNECT_URL_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_LOCK_KEY_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_SESSION_TIMEOUT_MS_PROP_KEY; + +/** + * A zookeeper based lock. This {@link LockProvider} implementation allows to lock table operations + * using zookeeper. Users need to have a Zookeeper cluster deployed to be able to use this lock. 
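Illustrative sketch, not part of the patch: instantiating the ZooKeeper-based provider described above with the LockConfiguration keys its constructor and checkRequiredProps read. The quorum address, paths, timeouts, and retry values are example values only.

import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider;
import org.apache.hudi.common.config.LockConfiguration;
import org.apache.hudi.common.config.TypedProperties;

import static org.apache.hudi.common.config.LockConfiguration.*;

public class ZookeeperLockSketch {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    props.setProperty(ZK_CONNECT_URL_PROP_KEY, "zk-host:2181");          // hypothetical quorum
    props.setProperty(ZK_BASE_PATH_PROP_KEY, "/hudi/locks");
    props.setProperty(ZK_LOCK_KEY_PROP_KEY, "my_table");
    props.setProperty(ZK_SESSION_TIMEOUT_MS_PROP_KEY, "60000");
    props.setProperty(ZK_CONNECTION_TIMEOUT_MS_PROP_KEY, "15000");
    props.setProperty(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "1000");
    props.setProperty(LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY, "5000");
    props.setProperty(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY, "3");

    ZookeeperBasedLockProvider lockProvider =
        new ZookeeperBasedLockProvider(new LockConfiguration(props), new Configuration());
    try {
      if (lockProvider.tryLock(60, TimeUnit.SECONDS)) {
        // ... perform the timeline-mutating operation ...
      }
    } finally {
      lockProvider.unlock();
      lockProvider.close();
    }
  }
}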
+ */ +@NotThreadSafe +public class ZookeeperBasedLockProvider implements LockProvider, Serializable { + + private static final Logger LOG = LogManager.getLogger(ZookeeperBasedLockProvider.class); + + private final transient CuratorFramework curatorFrameworkClient; + private volatile InterProcessMutex lock = null; + protected LockConfiguration lockConfiguration; + + public ZookeeperBasedLockProvider(final LockConfiguration lockConfiguration, final Configuration conf) { + checkRequiredProps(lockConfiguration); + this.lockConfiguration = lockConfiguration; + this.curatorFrameworkClient = CuratorFrameworkFactory.builder() + .connectString(lockConfiguration.getConfig().getString(ZK_CONNECT_URL_PROP_KEY)) + .retryPolicy(new BoundedExponentialBackoffRetry(lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY), + lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY), lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY))) + .sessionTimeoutMs(lockConfiguration.getConfig().getInteger(ZK_SESSION_TIMEOUT_MS_PROP_KEY, DEFAULT_ZK_SESSION_TIMEOUT_MS)) + .connectionTimeoutMs(lockConfiguration.getConfig().getInteger(ZK_CONNECTION_TIMEOUT_MS_PROP_KEY, DEFAULT_ZK_CONNECTION_TIMEOUT_MS)) + .build(); + this.curatorFrameworkClient.start(); + } + + // Only used for testing + public ZookeeperBasedLockProvider( + final LockConfiguration lockConfiguration, final CuratorFramework curatorFrameworkClient) { + checkRequiredProps(lockConfiguration); + this.lockConfiguration = lockConfiguration; + this.curatorFrameworkClient = curatorFrameworkClient; + synchronized (this.curatorFrameworkClient) { + if (this.curatorFrameworkClient.getState() != CuratorFrameworkState.STARTED) { + this.curatorFrameworkClient.start(); + } + } + } + + @Override + public boolean tryLock(long time, TimeUnit unit) { + LOG.info(generateLogStatement(LockState.ACQUIRING, generateLogSuffixString())); + try { + acquireLock(time, unit); + LOG.info(generateLogStatement(LockState.ACQUIRED, generateLogSuffixString())); + } catch (HoodieLockException e) { + throw e; + } catch (Exception e) { + throw new HoodieLockException(generateLogStatement(LockState.FAILED_TO_ACQUIRE, generateLogSuffixString()), e); + } + return lock != null && lock.isAcquiredInThisProcess(); + } + + @Override + public void unlock() { + try { + LOG.info(generateLogStatement(LockState.RELEASING, generateLogSuffixString())); + if (lock == null || !lock.isAcquiredInThisProcess()) { + return; + } + lock.release(); + lock = null; + LOG.info(generateLogStatement(LockState.RELEASED, generateLogSuffixString())); + } catch (Exception e) { + throw new HoodieLockException(generateLogStatement(LockState.FAILED_TO_RELEASE, generateLogSuffixString()), e); + } + } + + @Override + public void close() { + try { + if (lock != null) { + lock.release(); + lock = null; + } + this.curatorFrameworkClient.close(); + } catch (Exception e) { + LOG.error(generateLogStatement(LockState.FAILED_TO_RELEASE, generateLogSuffixString())); + } + } + + @Override + public InterProcessMutex getLock() { + return this.lock; + } + + private void acquireLock(long time, TimeUnit unit) throws Exception { + ValidationUtils.checkArgument(this.lock == null, generateLogStatement(LockState.ALREADY_ACQUIRED, generateLogSuffixString())); + InterProcessMutex newLock = new InterProcessMutex( + this.curatorFrameworkClient, lockConfiguration.getConfig().getString(ZK_BASE_PATH_PROP_KEY) + "/" + + 
this.lockConfiguration.getConfig().getString(ZK_LOCK_KEY_PROP_KEY)); + boolean acquired = newLock.acquire(time, unit); + if (!acquired) { + throw new HoodieLockException(generateLogStatement(LockState.FAILED_TO_ACQUIRE, generateLogSuffixString())); + } + if (newLock.isAcquiredInThisProcess()) { + lock = newLock; + } else { + throw new HoodieLockException(generateLogStatement(LockState.FAILED_TO_ACQUIRE, generateLogSuffixString())); + } + } + + private void checkRequiredProps(final LockConfiguration config) { + ValidationUtils.checkArgument(config.getConfig().getString(ZK_CONNECT_URL_PROP_KEY) != null); + ValidationUtils.checkArgument(config.getConfig().getString(ZK_BASE_PATH_PROP_KEY) != null); + ValidationUtils.checkArgument(config.getConfig().getString(ZK_SESSION_TIMEOUT_MS_PROP_KEY) != null); + ValidationUtils.checkArgument(config.getConfig().getString(ZK_CONNECTION_TIMEOUT_MS_PROP_KEY) != null); + ValidationUtils.checkArgument(config.getConfig().getString(ZK_LOCK_KEY_PROP_KEY) != null); + } + + private String generateLogSuffixString() { + String zkBasePath = this.lockConfiguration.getConfig().getString(ZK_BASE_PATH_PROP_KEY); + String lockKey = this.lockConfiguration.getConfig().getString(ZK_LOCK_KEY_PROP_KEY); + return StringUtils.join("ZkBasePath = ", zkBasePath, ", lock key = ", lockKey); + } + + protected String generateLogStatement(LockState state, String suffix) { + return StringUtils.join(state.name(), " lock at", suffix); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/metrics/HoodieLockMetrics.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/metrics/HoodieLockMetrics.java new file mode 100644 index 0000000000000..c33a86bfbe79d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/metrics/HoodieLockMetrics.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.transaction.lock.metrics; + +import com.codahale.metrics.Counter; +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.SlidingWindowReservoir; +import com.codahale.metrics.Timer; + +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metrics.Metrics; + +import java.util.concurrent.TimeUnit; + +public class HoodieLockMetrics { + + public static final String LOCK_ACQUIRE_ATTEMPTS_COUNTER_NAME = "lock.acquire.attempts"; + public static final String LOCK_ACQUIRE_SUCCESS_COUNTER_NAME = "lock.acquire.success"; + public static final String LOCK_ACQUIRE_FAILURES_COUNTER_NAME = "lock.acquire.failure"; + public static final String LOCK_ACQUIRE_DURATION_TIMER_NAME = "lock.acquire.duration"; + public static final String LOCK_REQUEST_LATENCY_TIMER_NAME = "lock.request.latency"; + private final HoodieWriteConfig writeConfig; + private final boolean isMetricsEnabled; + private final int keepLastNtimes = 100; + private final transient HoodieTimer lockDurationTimer = HoodieTimer.create(); + private final transient HoodieTimer lockApiRequestDurationTimer = HoodieTimer.create(); + private transient Counter lockAttempts; + private transient Counter successfulLockAttempts; + private transient Counter failedLockAttempts; + private transient Timer lockDuration; + private transient Timer lockApiRequestDuration; + private static final Object REGISTRY_LOCK = new Object(); + + public HoodieLockMetrics(HoodieWriteConfig writeConfig) { + this.isMetricsEnabled = writeConfig.isLockingMetricsEnabled(); + this.writeConfig = writeConfig; + + if (isMetricsEnabled) { + MetricRegistry registry = Metrics.getInstance().getRegistry(); + + lockAttempts = registry.counter(getMetricsName(LOCK_ACQUIRE_ATTEMPTS_COUNTER_NAME)); + successfulLockAttempts = registry.counter(getMetricsName(LOCK_ACQUIRE_SUCCESS_COUNTER_NAME)); + failedLockAttempts = registry.counter(getMetricsName(LOCK_ACQUIRE_FAILURES_COUNTER_NAME)); + + lockDuration = createTimerForMetrics(registry, LOCK_ACQUIRE_DURATION_TIMER_NAME); + lockApiRequestDuration = createTimerForMetrics(registry, LOCK_REQUEST_LATENCY_TIMER_NAME); + } + } + + private String getMetricsName(String metric) { + return writeConfig == null ? 
null : String.format("%s.%s", writeConfig.getMetricReporterMetricsNamePrefix(), metric); + } + + private Timer createTimerForMetrics(MetricRegistry registry, String metric) { + String metricName = getMetricsName(metric); + synchronized (REGISTRY_LOCK) { + if (registry.getMetrics().get(metricName) == null) { + lockDuration = new Timer(new SlidingWindowReservoir(keepLastNtimes)); + registry.register(metricName, lockDuration); + return lockDuration; + } + } + return (Timer) registry.getMetrics().get(metricName); + } + + public void startLockApiTimerContext() { + if (isMetricsEnabled) { + lockApiRequestDurationTimer.startTimer(); + } + } + + public void updateLockAcquiredMetric() { + if (isMetricsEnabled) { + long durationMs = lockApiRequestDurationTimer.endTimer(); + lockApiRequestDuration.update(durationMs, TimeUnit.MILLISECONDS); + lockAttempts.inc(); + successfulLockAttempts.inc(); + lockDurationTimer.startTimer(); + } + } + + public void updateLockNotAcquiredMetric() { + if (isMetricsEnabled) { + long durationMs = lockApiRequestDurationTimer.endTimer(); + lockApiRequestDuration.update(durationMs, TimeUnit.MILLISECONDS); + failedLockAttempts.inc(); + } + } + + public void updateLockHeldTimerMetrics() { + if (isMetricsEnabled && lockDurationTimer != null) { + long lockDurationInMs = lockDurationTimer.endTimer(); + lockDuration.update(lockDurationInMs, TimeUnit.MILLISECONDS); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/ConcatenatingIterator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/ConcatenatingIterator.java new file mode 100644 index 0000000000000..aa6c29b0844a2 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/ConcatenatingIterator.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.utils; + +import org.apache.hudi.common.util.ValidationUtils; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; + +/** + * Provides iterator interface over List of iterators. Consumes all records from first iterator element + * before moving to next iterator in the list. That is concatenate elements across multiple iterators. + * + * @param + */ +public class ConcatenatingIterator implements Iterator { + + private final Queue> allIterators; + + public ConcatenatingIterator(List> iterators) { + allIterators = new LinkedList<>(iterators); + } + + @Override + public boolean hasNext() { + while (!allIterators.isEmpty()) { + if (allIterators.peek().hasNext()) { + return true; + } + // iterator at current head is done. 
move ahead + allIterators.poll(); + } + + return false; + } + + @Override + public T next() { + ValidationUtils.checkArgument(hasNext(), "No more elements left"); + return allIterators.peek().next(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/FileSliceMetricUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/FileSliceMetricUtils.java new file mode 100644 index 0000000000000..09f022bbd76f4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/FileSliceMetricUtils.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.utils; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieLogFile; + +import java.util.List; +import java.util.Map; + +/** + * A utility class for calculating metrics related to FileSlice. + */ +public class FileSliceMetricUtils { + + public static final String TOTAL_IO_READ_MB = "TOTAL_IO_READ_MB"; + public static final String TOTAL_IO_WRITE_MB = "TOTAL_IO_WRITE_MB"; + public static final String TOTAL_IO_MB = "TOTAL_IO_MB"; + public static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILES_SIZE"; + public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES"; + + public static void addFileSliceCommonMetrics(List fileSlices, Map metrics, long defaultBaseFileSize) { + int numLogFiles = 0; + long totalLogFileSize = 0; + long totalIORead = 0; + long totalIOWrite = 0; + long totalIO = 0; + + for (FileSlice slice : fileSlices) { + numLogFiles += slice.getLogFiles().count(); + // Total size of all the log files + totalLogFileSize += slice.getLogFiles().map(HoodieLogFile::getFileSize).filter(size -> size >= 0) + .reduce(Long::sum).orElse(0L); + + long baseFileSize = slice.getBaseFile().isPresent() ? slice.getBaseFile().get().getFileSize() : 0L; + totalIORead += baseFileSize; + // Total write will be similar to the size of the base file + totalIOWrite += baseFileSize > 0 ? 
baseFileSize : defaultBaseFileSize; + } + // Total read will be the base file + all the log files + totalIORead = FSUtils.getSizeInMB(totalIORead + totalLogFileSize); + totalIOWrite = FSUtils.getSizeInMB(totalIOWrite); + + // Total IO will be the IO for read + write + totalIO = totalIORead + totalIOWrite; + + metrics.put(TOTAL_IO_READ_MB, (double) totalIORead); + metrics.put(TOTAL_IO_WRITE_MB, (double) totalIOWrite); + metrics.put(TOTAL_IO_MB, (double) totalIO); + metrics.put(TOTAL_LOG_FILE_SIZE, (double) totalLogFileSize); + metrics.put(TOTAL_LOG_FILES, (double) numLogFiles); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/LazyIterableIterator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/LazyIterableIterator.java index 020944e7ab9b1..ad54f8c0a0992 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/LazyIterableIterator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/LazyIterableIterator.java @@ -45,7 +45,7 @@ public LazyIterableIterator(Iterator in) { /** * Called once, before any elements are processed. */ - protected abstract void start(); + protected void start() {} /** * Block computation to be overwritten by sub classes. @@ -55,7 +55,7 @@ public LazyIterableIterator(Iterator in) { /** * Called once, after all elements are processed. */ - protected abstract void end(); + protected void end() {} ////////////////// // iterable implementation diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/MetadataConversionUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/MetadataConversionUtils.java new file mode 100644 index 0000000000000..342de74a11395 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/MetadataConversionUtils.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.utils; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieSavepointMetadata; +import org.apache.hudi.client.ReplaceArchivalHelper; +import org.apache.hudi.common.model.ActionType; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieRollingStatMetadata; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.CleanerUtils; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; + +/** + * Helper class to convert between different action related payloads and {@link HoodieArchivedMetaEntry}. + */ +public class MetadataConversionUtils { + + public static HoodieArchivedMetaEntry createMetaWrapper(HoodieInstant hoodieInstant, HoodieTableMetaClient metaClient) throws IOException { + HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); + archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); + archivedMetaWrapper.setActionState(hoodieInstant.getState().name()); + switch (hoodieInstant.getAction()) { + case HoodieTimeline.CLEAN_ACTION: { + if (hoodieInstant.isCompleted()) { + archivedMetaWrapper.setHoodieCleanMetadata(CleanerUtils.getCleanerMetadata(metaClient, hoodieInstant)); + } else { + archivedMetaWrapper.setHoodieCleanerPlan(CleanerUtils.getCleanerPlan(metaClient, hoodieInstant)); + } + archivedMetaWrapper.setActionType(ActionType.clean.name()); + break; + } + case HoodieTimeline.COMMIT_ACTION: { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieCommitMetadata.class); + archivedMetaWrapper.setHoodieCommitMetadata(convertCommitMetadata(commitMetadata)); + archivedMetaWrapper.setActionType(ActionType.commit.name()); + break; + } + case HoodieTimeline.DELTA_COMMIT_ACTION: { + HoodieCommitMetadata deltaCommitMetadata = HoodieCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieCommitMetadata.class); + archivedMetaWrapper.setHoodieCommitMetadata(convertCommitMetadata(deltaCommitMetadata)); + archivedMetaWrapper.setActionType(ActionType.deltacommit.name()); + break; + } + case HoodieTimeline.REPLACE_COMMIT_ACTION: { + if (hoodieInstant.isCompleted()) { + HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieReplaceCommitMetadata.class); + archivedMetaWrapper.setHoodieReplaceCommitMetadata(ReplaceArchivalHelper.convertReplaceCommitMetadata(replaceCommitMetadata)); + } else if (hoodieInstant.isInflight()) { + // inflight replacecommit files have the same meta data body as 
HoodieCommitMetadata + // so we could re-use it without further creating an inflight extension. + // Or inflight replacecommit files are empty under clustering circumstance + Option inflightCommitMetadata = getInflightReplaceMetadata(metaClient, hoodieInstant); + if (inflightCommitMetadata.isPresent()) { + archivedMetaWrapper.setHoodieInflightReplaceMetadata(convertCommitMetadata(inflightCommitMetadata.get())); + } + } else { + // we may have cases with empty HoodieRequestedReplaceMetadata e.g. insert_overwrite_table or insert_overwrite + // without clustering. However, we should revisit the requested commit file standardization + Option requestedReplaceMetadata = getRequestedReplaceMetadata(metaClient, hoodieInstant); + if (requestedReplaceMetadata.isPresent()) { + archivedMetaWrapper.setHoodieRequestedReplaceMetadata(requestedReplaceMetadata.get()); + } + } + archivedMetaWrapper.setActionType(ActionType.replacecommit.name()); + break; + } + case HoodieTimeline.ROLLBACK_ACTION: { + if (hoodieInstant.isCompleted()) { + archivedMetaWrapper.setHoodieRollbackMetadata(TimelineMetadataUtils.deserializeAvroMetadata( + metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class)); + } + archivedMetaWrapper.setActionType(ActionType.rollback.name()); + break; + } + case HoodieTimeline.SAVEPOINT_ACTION: { + archivedMetaWrapper.setHoodieSavePointMetadata(TimelineMetadataUtils.deserializeAvroMetadata( + metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class)); + archivedMetaWrapper.setActionType(ActionType.savepoint.name()); + break; + } + case HoodieTimeline.COMPACTION_ACTION: { + HoodieCompactionPlan plan = CompactionUtils.getCompactionPlan(metaClient, hoodieInstant.getTimestamp()); + archivedMetaWrapper.setHoodieCompactionPlan(plan); + archivedMetaWrapper.setActionType(ActionType.compaction.name()); + break; + } + default: { + throw new UnsupportedOperationException("Action not fully supported yet"); + } + } + return archivedMetaWrapper; + } + + public static HoodieArchivedMetaEntry createMetaWrapperForEmptyInstant(HoodieInstant hoodieInstant) throws IOException { + HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); + archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); + archivedMetaWrapper.setActionState(hoodieInstant.getState().name()); + switch (hoodieInstant.getAction()) { + case HoodieTimeline.CLEAN_ACTION: { + archivedMetaWrapper.setActionType(ActionType.clean.name()); + break; + } + case HoodieTimeline.COMMIT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.commit.name()); + break; + } + case HoodieTimeline.DELTA_COMMIT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.deltacommit.name()); + break; + } + case HoodieTimeline.REPLACE_COMMIT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.replacecommit.name()); + break; + } + case HoodieTimeline.ROLLBACK_ACTION: { + archivedMetaWrapper.setActionType(ActionType.rollback.name()); + break; + } + case HoodieTimeline.SAVEPOINT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.savepoint.name()); + break; + } + case HoodieTimeline.COMPACTION_ACTION: { + archivedMetaWrapper.setActionType(ActionType.compaction.name()); + break; + } + default: { + throw new UnsupportedOperationException("Action not fully supported yet"); + } + } + return archivedMetaWrapper; + } + + public static Option getInflightReplaceMetadata(HoodieTableMetaClient metaClient, HoodieInstant instant) throws IOException { + Option 
inflightContent = metaClient.getActiveTimeline().getInstantDetails(instant); + if (!inflightContent.isPresent() || inflightContent.get().length == 0) { + // inflight files can be empty in some certain cases, e.g. when users opt in clustering + return Option.empty(); + } + return Option.of(HoodieCommitMetadata.fromBytes(inflightContent.get(), HoodieCommitMetadata.class)); + } + + private static Option getRequestedReplaceMetadata(HoodieTableMetaClient metaClient, HoodieInstant instant) throws IOException { + Option requestedContent = metaClient.getActiveTimeline().getInstantDetails(instant); + if (!requestedContent.isPresent() || requestedContent.get().length == 0) { + // requested commit files can be empty in some certain cases, e.g. insert_overwrite or insert_overwrite_table. + // However, it appears requested files are supposed to contain meta data and we should revisit the standardization + // of requested commit files + // TODO revisit requested commit file standardization https://issues.apache.org/jira/browse/HUDI-1739 + return Option.empty(); + } + return Option.of(TimelineMetadataUtils.deserializeRequestedReplaceMetadata(requestedContent.get())); + } + + public static Option getHoodieCommitMetadata(HoodieTableMetaClient metaClient, HoodieInstant hoodieInstant) throws IOException { + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); + + if (hoodieInstant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + return Option.of(HoodieReplaceCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant).get(), + HoodieReplaceCommitMetadata.class)); + } + return Option.of(HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant).get(), + HoodieCommitMetadata.class)); + + } + + public static org.apache.hudi.avro.model.HoodieCommitMetadata convertCommitMetadata( + HoodieCommitMetadata hoodieCommitMetadata) { + ObjectMapper mapper = new ObjectMapper(); + // Need this to ignore other public get() methods + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData = + mapper.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class); + if (hoodieCommitMetadata.getCompacted()) { + avroMetaData.setOperationType(WriteOperationType.COMPACT.name()); + } + // Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer + avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, ""); + return avroMetaData; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/OperationConverter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/OperationConverter.java new file mode 100644 index 0000000000000..05b07bdce03c9 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/OperationConverter.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.utils; + +import org.apache.hudi.common.model.WriteOperationType; + +import com.beust.jcommander.IStringConverter; +import com.beust.jcommander.ParameterException; + +/** + * Converter that converts a string into enum WriteOperationType. + */ +public class OperationConverter implements IStringConverter { + @Override + public WriteOperationType convert(String value) throws ParameterException { + return WriteOperationType.valueOf(value); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java new file mode 100644 index 0000000000000..ec15effdc4663 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.utils; + +import org.apache.hudi.client.transaction.ConcurrentOperation; +import org.apache.hudi.client.transaction.ConflictResolutionStrategy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieWriteConflictException; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Set; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class TransactionUtils { + + private static final Logger LOG = LogManager.getLogger(TransactionUtils.class); + + /** + * Resolve any write conflicts when committing data. 
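+ * <p>Editorial note (not part of this patch): when optimistic concurrency control is enabled, this
+ * gathers candidate instants from the configured {@code ConflictResolutionStrategy} together with any
+ * pending instants that completed during the current write, checks each against the current operation
+ * via {@code hasConflict}, and delegates to {@code resolveConflict}; an unresolvable conflict surfaces
+ * as a {@code HoodieWriteConflictException}.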
+ * + * @param table + * @param currentTxnOwnerInstant + * @param thisCommitMetadata + * @param config + * @param lastCompletedTxnOwnerInstant + * @param pendingInstants + * + * @return + * @throws HoodieWriteConflictException + */ + public static Option resolveWriteConflictIfAny( + final HoodieTable table, + final Option currentTxnOwnerInstant, + final Option thisCommitMetadata, + final HoodieWriteConfig config, + Option lastCompletedTxnOwnerInstant, + boolean reloadActiveTimeline, + Set pendingInstants) throws HoodieWriteConflictException { + if (config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { + // deal with pendingInstants + Stream completedInstantsDuringCurrentWriteOperation = getCompletedInstantsDuringCurrentWriteOperation(table.getMetaClient(), pendingInstants); + + ConflictResolutionStrategy resolutionStrategy = config.getWriteConflictResolutionStrategy(); + Stream instantStream = Stream.concat(resolutionStrategy.getCandidateInstants(reloadActiveTimeline + ? table.getMetaClient().reloadActiveTimeline() : table.getActiveTimeline(), currentTxnOwnerInstant.get(), lastCompletedTxnOwnerInstant), + completedInstantsDuringCurrentWriteOperation); + final ConcurrentOperation thisOperation = new ConcurrentOperation(currentTxnOwnerInstant.get(), thisCommitMetadata.orElse(new HoodieCommitMetadata())); + instantStream.forEach(instant -> { + try { + ConcurrentOperation otherOperation = new ConcurrentOperation(instant, table.getMetaClient()); + if (resolutionStrategy.hasConflict(thisOperation, otherOperation)) { + LOG.info("Conflict encountered between current instant = " + thisOperation + " and instant = " + + otherOperation + ", attempting to resolve it..."); + resolutionStrategy.resolveConflict(table, thisOperation, otherOperation); + } + } catch (IOException io) { + throw new HoodieWriteConflictException("Unable to resolve conflict, if present", io); + } + }); + LOG.info("Successfully resolved conflicts, if any"); + + return thisOperation.getCommitMetadataOption(); + } + return thisCommitMetadata; + } + + /** + * Get the last completed transaction hoodie instant and {@link HoodieCommitMetadata#getExtraMetadata()}. 
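+ * <p>Editorial note (not part of this patch): this inspects the last completed instant on the commits
+ * timeline (commit, delta commit, or replace commit) and returns it paired with that commit's extra
+ * metadata map, or {@code Option.empty()} when no completed commit exists yet.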
+ * + * @param metaClient + * @return + */ + public static Option>> getLastCompletedTxnInstantAndMetadata( + HoodieTableMetaClient metaClient) { + Option hoodieInstantOption = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().lastInstant(); + try { + if (hoodieInstantOption.isPresent()) { + switch (hoodieInstantOption.get().getAction()) { + case HoodieTimeline.REPLACE_COMMIT_ACTION: + HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstantOption.get()).get(), HoodieReplaceCommitMetadata.class); + return Option.of(Pair.of(hoodieInstantOption.get(), replaceCommitMetadata.getExtraMetadata())); + case HoodieTimeline.DELTA_COMMIT_ACTION: + case HoodieTimeline.COMMIT_ACTION: + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstantOption.get()).get(), HoodieCommitMetadata.class); + return Option.of(Pair.of(hoodieInstantOption.get(), commitMetadata.getExtraMetadata())); + default: + throw new IllegalArgumentException("Unknown instant action" + hoodieInstantOption.get().getAction()); + } + } else { + return Option.empty(); + } + } catch (IOException io) { + throw new HoodieIOException("Unable to read metadata for instant " + hoodieInstantOption.get(), io); + } + } + + /** + * Get InflightAndRequest instants. + * + * @param metaClient + * @return + */ + public static Set getInflightAndRequestedInstants(HoodieTableMetaClient metaClient) { + // collect InflightAndRequest instants for deltaCommit/commit/compaction/clustering + Set timelineActions = CollectionUtils + .createImmutableSet(HoodieTimeline.REPLACE_COMMIT_ACTION, HoodieTimeline.COMPACTION_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.COMMIT_ACTION); + return metaClient + .getActiveTimeline() + .getTimelineOfActions(timelineActions) + .filterInflightsAndRequested() + .getInstants() + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()); + } + + public static Stream getCompletedInstantsDuringCurrentWriteOperation(HoodieTableMetaClient metaClient, Set pendingInstants) { + // deal with pendingInstants + // some pending instants maybe finished during current write operation, + // we should check the conflict of those pending operation + return metaClient + .reloadActiveTimeline() + .getCommitsTimeline() + .filterCompletedInstants() + .getInstants() + .filter(f -> pendingInstants.contains(f.getTimestamp())); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java new file mode 100644 index 0000000000000..a042255cdcb1a --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.table.log; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SpillableMapUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodiePayloadConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.HoodieFileReader; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; + +import java.io.IOException; +import java.util.Iterator; +import java.util.stream.StreamSupport; + +/** + * Reads records from base file and merges any updates from log files and provides iterable over all records in the file slice. + */ +public class HoodieFileSliceReader implements Iterator> { + private final Iterator> recordsIterator; + + public static HoodieFileSliceReader getFileSliceReader( + Option baseFileReader, HoodieMergedLogRecordScanner scanner, Schema schema, String payloadClass, + String preCombineField, Option> simpleKeyGenFieldsOpt) throws IOException { + if (baseFileReader.isPresent()) { + Iterator baseIterator = baseFileReader.get().getRecordIterator(schema); + while (baseIterator.hasNext()) { + GenericRecord record = (GenericRecord) baseIterator.next(); + HoodieRecord hoodieRecord = transform( + record, scanner, payloadClass, preCombineField, simpleKeyGenFieldsOpt); + scanner.processNextRecord(hoodieRecord); + } + return new HoodieFileSliceReader(scanner.iterator()); + } else { + Iterable> iterable = () -> scanner.iterator(); + HoodiePayloadConfig payloadConfig = HoodiePayloadConfig.newBuilder().withPayloadOrderingField(preCombineField).build(); + return new HoodieFileSliceReader(StreamSupport.stream(iterable.spliterator(), false) + .map(e -> { + try { + GenericRecord record = (GenericRecord) e.getData().getInsertValue(schema, payloadConfig.getProps()).get(); + return transform(record, scanner, payloadClass, preCombineField, simpleKeyGenFieldsOpt); + } catch (IOException io) { + throw new HoodieIOException("Error while creating reader for file slice with no base file.", io); + } + }).iterator()); + } + } + + private static HoodieRecord transform( + GenericRecord record, HoodieMergedLogRecordScanner scanner, String payloadClass, + String preCombineField, Option> simpleKeyGenFieldsOpt) { + return simpleKeyGenFieldsOpt.isPresent() + ? 
SpillableMapUtils.convertToHoodieRecordPayload(record, + payloadClass, preCombineField, simpleKeyGenFieldsOpt.get(), scanner.isWithOperationField(), Option.empty()) + : SpillableMapUtils.convertToHoodieRecordPayload(record, + payloadClass, preCombineField, scanner.isWithOperationField(), scanner.getPartitionName()); + } + + private HoodieFileSliceReader(Iterator> recordsItr) { + this.recordsIterator = recordsItr; + } + + @Override + public boolean hasNext() { + return recordsIterator.hasNext(); + } + + @Override + public HoodieRecord next() { + return recordsIterator.next(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieArchivalConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieArchivalConfig.java new file mode 100644 index 0000000000000..681ca20baeebf --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieArchivalConfig.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import javax.annotation.concurrent.Immutable; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Archival related config. + */ +@Immutable +@ConfigClassProperty(name = "Archival Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control archival.") +public class HoodieArchivalConfig extends HoodieConfig { + + public static final ConfigProperty AUTO_ARCHIVE = ConfigProperty + .key("hoodie.archive.automatic") + .defaultValue("true") + .withDocumentation("When enabled, the archival table service is invoked immediately after each commit," + + " to archive commits if we cross a maximum value of commits." + + " It's recommended to enable this, to ensure number of active commits is bounded."); + + public static final ConfigProperty ASYNC_ARCHIVE = ConfigProperty + .key("hoodie.archive.async") + .defaultValue("false") + .sinceVersion("0.11.0") + .withDocumentation("Only applies when " + AUTO_ARCHIVE.key() + " is turned on. " + + "When turned on runs archiver async with writing, which can speed up overall write performance."); + + public static final ConfigProperty MAX_COMMITS_TO_KEEP = ConfigProperty + .key("hoodie.keep.max.commits") + .defaultValue("30") + .withDocumentation("Archiving service moves older entries from timeline into an archived log after each write, to" + + " keep the metadata overhead constant, even as the table size grows." 
+ + " This config controls the maximum number of instants to retain in the active timeline. "); + + public static final ConfigProperty DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.archive.delete.parallelism") + .defaultValue(100) + .withDocumentation("Parallelism for deleting archived hoodie commits."); + + public static final ConfigProperty MIN_COMMITS_TO_KEEP = ConfigProperty + .key("hoodie.keep.min.commits") + .defaultValue("20") + .withDocumentation("Similar to " + MAX_COMMITS_TO_KEEP.key() + ", but controls the minimum number of" + + " instants to retain in the active timeline."); + + public static final ConfigProperty COMMITS_ARCHIVAL_BATCH_SIZE = ConfigProperty + .key("hoodie.commits.archival.batch") + .defaultValue(String.valueOf(10)) + .withDocumentation("Archiving of instants is batched in best-effort manner, to pack more instants into a single" + + " archive log. This config controls such archival batch size."); + + public static final ConfigProperty ARCHIVE_MERGE_FILES_BATCH_SIZE = ConfigProperty + .key("hoodie.archive.merge.files.batch.size") + .defaultValue(10) + .withDocumentation("The number of small archive files to be merged at once."); + + public static final ConfigProperty ARCHIVE_MERGE_SMALL_FILE_LIMIT_BYTES = ConfigProperty + .key("hoodie.archive.merge.small.file.limit.bytes") + .defaultValue(20L * 1024 * 1024) + .withDocumentation("This config sets the archive file size limit below which an archive file becomes a candidate to be selected as such a small file."); + + public static final ConfigProperty ARCHIVE_MERGE_ENABLE = ConfigProperty + .key("hoodie.archive.merge.enable") + .defaultValue(false) + .withDocumentation("When enable, hoodie will auto merge several small archive files into larger one. It's" + + " useful when storage scheme doesn't support append operation."); + + public static final ConfigProperty ARCHIVE_BEYOND_SAVEPOINT = ConfigProperty + .key("hoodie.archive.beyond.savepoint") + .defaultValue(false) + .sinceVersion("0.12.0") + .withDocumentation("If enabled, archival will proceed beyond savepoint, skipping savepoint commits." 
+ + " If disabled, archival will stop at the earliest savepoint commit."); + + /** + * @deprecated Use {@link #MAX_COMMITS_TO_KEEP} and its methods instead + */ + @Deprecated + public static final String MAX_COMMITS_TO_KEEP_PROP = MAX_COMMITS_TO_KEEP.key(); + /** + * @deprecated Use {@link #MIN_COMMITS_TO_KEEP} and its methods instead + */ + @Deprecated + public static final String MIN_COMMITS_TO_KEEP_PROP = MIN_COMMITS_TO_KEEP.key(); + /** + * @deprecated Use {@link #COMMITS_ARCHIVAL_BATCH_SIZE} and its methods instead + */ + @Deprecated + public static final String COMMITS_ARCHIVAL_BATCH_SIZE_PROP = COMMITS_ARCHIVAL_BATCH_SIZE.key(); + /** + * @deprecated Use {@link #MAX_COMMITS_TO_KEEP} and its methods instead + */ + @Deprecated + private static final String DEFAULT_MAX_COMMITS_TO_KEEP = MAX_COMMITS_TO_KEEP.defaultValue(); + /** + * @deprecated Use {@link #MIN_COMMITS_TO_KEEP} and its methods instead + */ + @Deprecated + private static final String DEFAULT_MIN_COMMITS_TO_KEEP = MIN_COMMITS_TO_KEEP.defaultValue(); + /** + * @deprecated Use {@link #COMMITS_ARCHIVAL_BATCH_SIZE} and its methods instead + */ + @Deprecated + private static final String DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE = COMMITS_ARCHIVAL_BATCH_SIZE.defaultValue(); + + private HoodieArchivalConfig() { + super(); + } + + public static HoodieArchivalConfig.Builder newBuilder() { + return new HoodieArchivalConfig.Builder(); + } + + public static class Builder { + + private final HoodieArchivalConfig archivalConfig = new HoodieArchivalConfig(); + + public HoodieArchivalConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.archivalConfig.getProps().load(reader); + return this; + } + } + + public HoodieArchivalConfig.Builder fromProperties(Properties props) { + this.archivalConfig.getProps().putAll(props); + return this; + } + + public HoodieArchivalConfig.Builder withAutoArchive(Boolean autoArchive) { + archivalConfig.setValue(AUTO_ARCHIVE, String.valueOf(autoArchive)); + return this; + } + + public HoodieArchivalConfig.Builder withAsyncArchive(Boolean asyncArchive) { + archivalConfig.setValue(ASYNC_ARCHIVE, String.valueOf(asyncArchive)); + return this; + } + + public HoodieArchivalConfig.Builder archiveCommitsWith(int minToKeep, int maxToKeep) { + archivalConfig.setValue(MIN_COMMITS_TO_KEEP, String.valueOf(minToKeep)); + archivalConfig.setValue(MAX_COMMITS_TO_KEEP, String.valueOf(maxToKeep)); + return this; + } + + public HoodieArchivalConfig.Builder withArchiveMergeFilesBatchSize(int number) { + archivalConfig.setValue(ARCHIVE_MERGE_FILES_BATCH_SIZE, String.valueOf(number)); + return this; + } + + public HoodieArchivalConfig.Builder withArchiveMergeSmallFileLimit(long size) { + archivalConfig.setValue(ARCHIVE_MERGE_SMALL_FILE_LIMIT_BYTES, String.valueOf(size)); + return this; + } + + public HoodieArchivalConfig.Builder withArchiveMergeEnable(boolean enable) { + archivalConfig.setValue(ARCHIVE_MERGE_ENABLE, String.valueOf(enable)); + return this; + } + + public HoodieArchivalConfig.Builder withArchiveDeleteParallelism(int archiveDeleteParallelism) { + archivalConfig.setValue(DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE, String.valueOf(archiveDeleteParallelism)); + return this; + } + + public HoodieArchivalConfig.Builder withCommitsArchivalBatchSize(int batchSize) { + archivalConfig.setValue(COMMITS_ARCHIVAL_BATCH_SIZE, String.valueOf(batchSize)); + return this; + } + + public Builder withArchiveBeyondSavepoint(boolean archiveBeyondSavepoint) { + 
archivalConfig.setValue(ARCHIVE_BEYOND_SAVEPOINT, String.valueOf(archiveBeyondSavepoint)); + return this; + } + + public HoodieArchivalConfig build() { + archivalConfig.setDefaults(HoodieArchivalConfig.class.getName()); + return archivalConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java index bde2e03da9f00..0b9116b01c9cc 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java @@ -22,43 +22,172 @@ import org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector; import org.apache.hudi.client.bootstrap.translator.IdentityBootstrapPartitionPathTranslator; import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.keygen.constant.KeyGeneratorType; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.Properties; +import static org.apache.hudi.client.bootstrap.BootstrapMode.FULL_RECORD; +import static org.apache.hudi.client.bootstrap.BootstrapMode.METADATA_ONLY; + /** * Bootstrap specific configs. */ -public class HoodieBootstrapConfig extends DefaultHoodieConfig { - - public static final String BOOTSTRAP_BASE_PATH_PROP = "hoodie.bootstrap.base.path"; - public static final String BOOTSTRAP_MODE_SELECTOR = "hoodie.bootstrap.mode.selector"; - public static final String FULL_BOOTSTRAP_INPUT_PROVIDER = "hoodie.bootstrap.full.input.provider"; - public static final String DEFAULT_FULL_BOOTSTRAP_INPUT_PROVIDER = "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider"; - public static final String BOOTSTRAP_KEYGEN_CLASS = "hoodie.bootstrap.keygen.class"; - public static final String BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS = - "hoodie.bootstrap.partitionpath.translator.class"; - public static final String DEFAULT_BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS = - IdentityBootstrapPartitionPathTranslator.class.getName(); - - public static final String BOOTSTRAP_PARALLELISM = "hoodie.bootstrap.parallelism"; - public static final String DEFAULT_BOOTSTRAP_PARALLELISM = "1500"; - - // Used By BootstrapRegexModeSelector class. When a partition path matches the regex, the corresponding - // mode will be used. Otherwise, the alternative mode will be used. 
- public static final String BOOTSTRAP_MODE_SELECTOR_REGEX = "hoodie.bootstrap.mode.selector.regex"; - public static final String BOOTSTRAP_MODE_SELECTOR_REGEX_MODE = "hoodie.bootstrap.mode.selector.regex.mode"; - public static final String DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX = ".*"; - public static final String DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX_MODE = BootstrapMode.METADATA_ONLY.name(); - - public static final String BOOTSTRAP_INDEX_CLASS_PROP = "hoodie.bootstrap.index.class"; - public static final String DEFAULT_BOOTSTRAP_INDEX_CLASS = HFileBootstrapIndex.class.getName(); - - public HoodieBootstrapConfig(Properties props) { - super(props); +@ConfigClassProperty(name = "Bootstrap Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control how you want to bootstrap your existing tables for the first time into hudi. " + + "The bootstrap operation can flexibly avoid copying data over before you can use Hudi and support running the existing " + + " writers and new hudi writers in parallel, to validate the migration.") +public class HoodieBootstrapConfig extends HoodieConfig { + + public static final ConfigProperty BASE_PATH = ConfigProperty + .key("hoodie.bootstrap.base.path") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Base path of the dataset that needs to be bootstrapped as a Hudi table"); + + public static final ConfigProperty PARTITION_SELECTOR_REGEX_MODE = ConfigProperty + .key("hoodie.bootstrap.mode.selector.regex.mode") + .defaultValue(METADATA_ONLY.name()) + .sinceVersion("0.6.0") + .withValidValues(METADATA_ONLY.name(), FULL_RECORD.name()) + .withDocumentation("Bootstrap mode to apply for partition paths, that match regex above. " + + "METADATA_ONLY will generate just skeleton base files with keys/footers, avoiding full cost of rewriting the dataset. 
" + + "FULL_RECORD will perform a full copy/rewrite of the data as a Hudi table."); + + public static final ConfigProperty MODE_SELECTOR_CLASS_NAME = ConfigProperty + .key("hoodie.bootstrap.mode.selector") + .defaultValue(MetadataOnlyBootstrapModeSelector.class.getCanonicalName()) + .sinceVersion("0.6.0") + .withDocumentation("Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped"); + + public static final ConfigProperty FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME = ConfigProperty + .key("hoodie.bootstrap.full.input.provider") + .defaultValue("org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider") + .sinceVersion("0.6.0") + .withDocumentation("Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD"); + + public static final ConfigProperty KEYGEN_CLASS_NAME = ConfigProperty + .key("hoodie.bootstrap.keygen.class") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Key generator implementation to be used for generating keys from the bootstrapped dataset"); + + public static final ConfigProperty KEYGEN_TYPE = ConfigProperty + .key("hoodie.bootstrap.keygen.type") + .defaultValue(KeyGeneratorType.SIMPLE.name()) + .sinceVersion("0.9.0") + .withDocumentation("Type of build-in key generator, currently support SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE"); + + public static final ConfigProperty PARTITION_PATH_TRANSLATOR_CLASS_NAME = ConfigProperty + .key("hoodie.bootstrap.partitionpath.translator.class") + .defaultValue(IdentityBootstrapPartitionPathTranslator.class.getName()) + .sinceVersion("0.6.0") + .withDocumentation("Translates the partition paths from the bootstrapped data into how is laid out as a Hudi table."); + + public static final ConfigProperty PARALLELISM_VALUE = ConfigProperty + .key("hoodie.bootstrap.parallelism") + .defaultValue("1500") + .sinceVersion("0.6.0") + .withDocumentation("Parallelism value to be used to bootstrap data into hudi"); + + public static final ConfigProperty PARTITION_SELECTOR_REGEX_PATTERN = ConfigProperty + .key("hoodie.bootstrap.mode.selector.regex") + .defaultValue(".*") + .sinceVersion("0.6.0") + .withDocumentation("Matches each bootstrap dataset partition against this regex and applies the mode below to it."); + + public static final ConfigProperty INDEX_CLASS_NAME = ConfigProperty + .key("hoodie.bootstrap.index.class") + .defaultValue(HFileBootstrapIndex.class.getName()) + .sinceVersion("0.6.0") + .withDocumentation("Implementation to use, for mapping a skeleton base file to a boostrap base file."); + + /** + * @deprecated Use {@link #BASE_PATH} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_BASE_PATH_PROP = BASE_PATH.key(); + /** + * @deprecated Use {@link #INDEX_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_INDEX_CLASS_PROP = INDEX_CLASS_NAME.key(); + /** + * @deprecated Use {@link #INDEX_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BOOTSTRAP_INDEX_CLASS = INDEX_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #MODE_SELECTOR_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_MODE_SELECTOR = MODE_SELECTOR_CLASS_NAME.key(); + /** + * @deprecated Use {@link #FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String FULL_BOOTSTRAP_INPUT_PROVIDER = FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME.key(); + /** + 
* @deprecated Use {@link #FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_FULL_BOOTSTRAP_INPUT_PROVIDER = FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #KEYGEN_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_KEYGEN_CLASS = KEYGEN_CLASS_NAME.key(); + /** + * @deprecated Use {@link #PARTITION_PATH_TRANSLATOR_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS = PARTITION_PATH_TRANSLATOR_CLASS_NAME.key(); + /** + * @deprecated Use {@link #PARTITION_PATH_TRANSLATOR_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS = PARTITION_PATH_TRANSLATOR_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_PARALLELISM = PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BOOTSTRAP_PARALLELISM = PARALLELISM_VALUE.defaultValue(); + /** + * @deprecated Use {@link #PARTITION_SELECTOR_REGEX_PATTERN} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_MODE_SELECTOR_REGEX = PARTITION_SELECTOR_REGEX_PATTERN.key(); + /** + * @deprecated Use {@link #PARTITION_SELECTOR_REGEX_MODE} and its methods instead + */ + @Deprecated + public static final String BOOTSTRAP_MODE_SELECTOR_REGEX_MODE = PARTITION_SELECTOR_REGEX_MODE.key(); + /** + * @deprecated Use {@link #PARTITION_SELECTOR_REGEX_PATTERN} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX = PARTITION_SELECTOR_REGEX_PATTERN.defaultValue(); + /** + * @deprecated Use {@link #PARTITION_SELECTOR_REGEX_MODE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX_MODE = PARTITION_SELECTOR_REGEX_MODE.defaultValue(); + + private HoodieBootstrapConfig() { + super(); } public static Builder newBuilder() { @@ -67,78 +196,72 @@ public static Builder newBuilder() { public static class Builder { - private final Properties props = new Properties(); + private final HoodieBootstrapConfig bootstrapConfig = new HoodieBootstrapConfig(); public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.bootstrapConfig.getProps().load(reader); return this; } } public Builder withBootstrapBasePath(String basePath) { - props.setProperty(BOOTSTRAP_BASE_PATH_PROP, basePath); + bootstrapConfig.setValue(BASE_PATH, basePath); return this; } public Builder withBootstrapModeSelector(String partitionSelectorClass) { - props.setProperty(BOOTSTRAP_MODE_SELECTOR, partitionSelectorClass); + bootstrapConfig.setValue(MODE_SELECTOR_CLASS_NAME, partitionSelectorClass); return this; } public Builder withFullBootstrapInputProvider(String partitionSelectorClass) { - props.setProperty(FULL_BOOTSTRAP_INPUT_PROVIDER, partitionSelectorClass); + bootstrapConfig.setValue(FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME, partitionSelectorClass); return this; } public Builder withBootstrapKeyGenClass(String keyGenClass) { - props.setProperty(BOOTSTRAP_KEYGEN_CLASS, keyGenClass); + bootstrapConfig.setValue(KEYGEN_CLASS_NAME, keyGenClass); + return this; + } + + 
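+    // Editorial usage sketch (not part of this patch): a typical metadata-only bootstrap setup built
+    // with this builder; the base path below is hypothetical, and how the resulting config is attached
+    // to the write client (e.g. via HoodieWriteConfig) is assumed rather than shown here:
+    //
+    //   HoodieBootstrapConfig bootstrapConfig = HoodieBootstrapConfig.newBuilder()
+    //       .withBootstrapBasePath("s3://bucket/existing_parquet_table")
+    //       .withBootstrapModeSelector(MetadataOnlyBootstrapModeSelector.class.getCanonicalName())
+    //       .withBootstrapParallelism(1500)
+    //       .build();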
public Builder withBootstrapKeyGenType(String keyGenType) { + bootstrapConfig.setValue(KEYGEN_TYPE, keyGenType); return this; } public Builder withBootstrapPartitionPathTranslatorClass(String partitionPathTranslatorClass) { - props.setProperty(BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS, partitionPathTranslatorClass); + bootstrapConfig + .setValue(PARTITION_PATH_TRANSLATOR_CLASS_NAME, partitionPathTranslatorClass); return this; } public Builder withBootstrapParallelism(int parallelism) { - props.setProperty(BOOTSTRAP_PARALLELISM, String.valueOf(parallelism)); + bootstrapConfig.setValue(PARALLELISM_VALUE, String.valueOf(parallelism)); return this; } public Builder withBootstrapModeSelectorRegex(String regex) { - props.setProperty(BOOTSTRAP_MODE_SELECTOR_REGEX, regex); + bootstrapConfig.setValue(PARTITION_SELECTOR_REGEX_PATTERN, regex); return this; } public Builder withBootstrapModeForRegexMatch(BootstrapMode modeForRegexMatch) { - props.setProperty(BOOTSTRAP_MODE_SELECTOR_REGEX_MODE, modeForRegexMatch.name()); + bootstrapConfig.setValue(PARTITION_SELECTOR_REGEX_MODE, modeForRegexMatch.name()); return this; } public Builder fromProperties(Properties props) { - this.props.putAll(props); + this.bootstrapConfig.getProps().putAll(props); return this; } public HoodieBootstrapConfig build() { - HoodieBootstrapConfig config = new HoodieBootstrapConfig(props); - setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_PARALLELISM), BOOTSTRAP_PARALLELISM, - DEFAULT_BOOTSTRAP_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS), - BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS, DEFAULT_BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS); - setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_MODE_SELECTOR), BOOTSTRAP_MODE_SELECTOR, - MetadataOnlyBootstrapModeSelector.class.getCanonicalName()); - setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_MODE_SELECTOR_REGEX), BOOTSTRAP_MODE_SELECTOR_REGEX, - DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX); - setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_MODE_SELECTOR_REGEX_MODE), - BOOTSTRAP_MODE_SELECTOR_REGEX_MODE, DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX_MODE); - BootstrapMode.valueOf(props.getProperty(BOOTSTRAP_MODE_SELECTOR_REGEX_MODE)); - setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_INDEX_CLASS_PROP), BOOTSTRAP_INDEX_CLASS_PROP, - DEFAULT_BOOTSTRAP_INDEX_CLASS); - setDefaultOnCondition(props, !props.containsKey(FULL_BOOTSTRAP_INPUT_PROVIDER), FULL_BOOTSTRAP_INPUT_PROVIDER, - DEFAULT_FULL_BOOTSTRAP_INPUT_PROVIDER); - return config; + // TODO: use infer function instead + bootstrapConfig.setDefaultValue(INDEX_CLASS_NAME, HoodieTableConfig.getDefaultBootstrapIndexClass( + bootstrapConfig.getProps())); + bootstrapConfig.setDefaults(HoodieBootstrapConfig.class.getName()); + return bootstrapConfig; } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCleanConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCleanConfig.java new file mode 100644 index 0000000000000..30289e1acbab4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCleanConfig.java @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.table.action.clean.CleaningTriggerStrategy; + +import javax.annotation.concurrent.Immutable; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Arrays; +import java.util.Properties; +import java.util.stream.Collectors; + +/** + * Clean related config. + */ +@Immutable +@ConfigClassProperty(name = "Clean Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Cleaning (reclamation of older/unused file groups/slices).") +public class HoodieCleanConfig extends HoodieConfig { + + public static final ConfigProperty AUTO_CLEAN = ConfigProperty + .key("hoodie.clean.automatic") + .defaultValue("true") + .withDocumentation("When enabled, the cleaner table service is invoked immediately after each commit," + + " to delete older file slices. It's recommended to enable this, to ensure metadata and data storage" + + " growth is bounded."); + + public static final ConfigProperty ASYNC_CLEAN = ConfigProperty + .key("hoodie.clean.async") + .defaultValue("false") + .withDocumentation("Only applies when " + AUTO_CLEAN.key() + " is turned on. " + + "When turned on runs cleaner async with writing, which can speed up overall write performance."); + + public static final ConfigProperty CLEANER_COMMITS_RETAINED = ConfigProperty + .key("hoodie.cleaner.commits.retained") + .defaultValue("10") + .withDocumentation("Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits " + + "(scheduled). This also directly translates into how much data retention the table supports for incremental queries."); + + public static final ConfigProperty CLEANER_HOURS_RETAINED = ConfigProperty.key("hoodie.cleaner.hours.retained") + .defaultValue("24") + .withDocumentation("Number of hours for which commits need to be retained. This config provides a more flexible option as" + + "compared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group," + + " corresponding to commits with commit times older than the configured number of hours to be retained are cleaned."); + + public static final ConfigProperty CLEANER_POLICY = ConfigProperty + .key("hoodie.cleaner.policy") + .defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name()) + .withDocumentation("Cleaning policy to be used. The cleaner service deletes older file slices files to re-claim space." 
+ + " By default, cleaner spares the file slices written by the last N commits, determined by " + CLEANER_COMMITS_RETAINED.key() + + " Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had" + + " a chance to run. So, it is good to make sure that the data is retained for more than the maximum query execution time"); + + public static final ConfigProperty CLEAN_TRIGGER_STRATEGY = ConfigProperty + .key("hoodie.clean.trigger.strategy") + .defaultValue(CleaningTriggerStrategy.NUM_COMMITS.name()) + .withDocumentation("Controls how cleaning is scheduled. Valid options: " + + Arrays.stream(CleaningTriggerStrategy.values()).map(Enum::name).collect(Collectors.joining(","))); + + public static final ConfigProperty CLEAN_MAX_COMMITS = ConfigProperty + .key("hoodie.clean.max.commits") + .defaultValue("1") + .withDocumentation("Number of commits after the last clean operation, before scheduling of a new clean is attempted."); + + public static final ConfigProperty CLEANER_FILE_VERSIONS_RETAINED = ConfigProperty + .key("hoodie.cleaner.fileversions.retained") + .defaultValue("3") + .withDocumentation("When " + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS.name() + " cleaning policy is used, " + + " the minimum number of file slices to retain in each file group, during cleaning."); + + public static final ConfigProperty CLEANER_INCREMENTAL_MODE_ENABLE = ConfigProperty + .key("hoodie.cleaner.incremental.mode") + .defaultValue("true") + .withDocumentation("When enabled, the plans for each cleaner service run is computed incrementally off the events " + + " in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full" + + " table for each planning (even with a metadata table)."); + + public static final ConfigProperty FAILED_WRITES_CLEANER_POLICY = ConfigProperty + .key("hoodie.cleaner.policy.failed.writes") + .defaultValue(HoodieFailedWritesCleaningPolicy.EAGER.name()) + .withInferFunction(cfg -> { + Option writeConcurrencyModeOpt = Option.ofNullable(cfg.getString(HoodieWriteConfig.WRITE_CONCURRENCY_MODE)); + if (!writeConcurrencyModeOpt.isPresent() + || !writeConcurrencyModeOpt.get().equals(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.name())) { + return Option.empty(); + } + return Option.of(HoodieFailedWritesCleaningPolicy.LAZY.name()); + }) + .withDocumentation("Cleaning policy for failed writes to be used. Hudi will delete any files written by " + + "failed writes to re-claim space. Choose to perform this rollback of failed writes eagerly before " + + "every writer starts (only supported for single writer) or lazily by the cleaner (required for multi-writers)"); + + public static final ConfigProperty CLEANER_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.cleaner.parallelism") + .defaultValue("200") + .withDocumentation("Parallelism for the cleaning operation. Increase this if cleaning becomes slow."); + + public static final ConfigProperty ALLOW_MULTIPLE_CLEANS = ConfigProperty + .key("hoodie.clean.allow.multiple") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests should be mutually exclusive, " + + ".i.e. 
a 2nd clean will not be scheduled if another clean is not yet completed to avoid repeat cleaning of same files, they might want to disable this config."); + + public static final ConfigProperty CLEANER_BOOTSTRAP_BASE_FILE_ENABLE = ConfigProperty + .key("hoodie.cleaner.delete.bootstrap.base.file") + .defaultValue("false") + .withDocumentation("When set to true, cleaner also deletes the bootstrap base file when it's skeleton base file is " + + " cleaned. Turn this to true, if you want to ensure the bootstrap dataset storage is reclaimed over time, as the" + + " table receives updates/deletes. Another reason to turn this on, would be to ensure data residing in bootstrap " + + " base files are also physically deleted, to comply with data privacy enforcement processes."); + + + /** @deprecated Use {@link #CLEANER_POLICY} and its methods instead */ + @Deprecated + public static final String CLEANER_POLICY_PROP = CLEANER_POLICY.key(); + /** @deprecated Use {@link #AUTO_CLEAN} and its methods instead */ + @Deprecated + public static final String AUTO_CLEAN_PROP = AUTO_CLEAN.key(); + /** @deprecated Use {@link #ASYNC_CLEAN} and its methods instead */ + @Deprecated + public static final String ASYNC_CLEAN_PROP = ASYNC_CLEAN.key(); + /** @deprecated Use {@link #CLEANER_FILE_VERSIONS_RETAINED} and its methods instead */ + @Deprecated + public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = CLEANER_FILE_VERSIONS_RETAINED.key(); + /** + * @deprecated Use {@link #CLEANER_COMMITS_RETAINED} and its methods instead + */ + @Deprecated + public static final String CLEANER_COMMITS_RETAINED_PROP = CLEANER_COMMITS_RETAINED.key(); + /** + * @deprecated Use {@link #CLEANER_INCREMENTAL_MODE_ENABLE} and its methods instead + */ + @Deprecated + public static final String CLEANER_INCREMENTAL_MODE = CLEANER_INCREMENTAL_MODE_ENABLE.key(); + /** + * @deprecated Use {@link #CLEANER_BOOTSTRAP_BASE_FILE_ENABLE} and its methods instead + */ + @Deprecated + public static final String CLEANER_BOOTSTRAP_BASE_FILE_ENABLED = CLEANER_BOOTSTRAP_BASE_FILE_ENABLE.key(); + /** + * @deprecated Use {@link #CLEANER_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String CLEANER_PARALLELISM = CLEANER_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #CLEANER_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLEANER_PARALLELISM = CLEANER_PARALLELISM_VALUE.defaultValue(); + /** @deprecated Use {@link #CLEANER_POLICY} and its methods instead */ + @Deprecated + private static final String DEFAULT_CLEANER_POLICY = CLEANER_POLICY.defaultValue(); + /** @deprecated Use {@link #FAILED_WRITES_CLEANER_POLICY} and its methods instead */ + @Deprecated + public static final String FAILED_WRITES_CLEANER_POLICY_PROP = FAILED_WRITES_CLEANER_POLICY.key(); + /** @deprecated Use {@link #FAILED_WRITES_CLEANER_POLICY} and its methods instead */ + @Deprecated + private static final String DEFAULT_FAILED_WRITES_CLEANER_POLICY = FAILED_WRITES_CLEANER_POLICY.defaultValue(); + /** @deprecated Use {@link #AUTO_CLEAN} and its methods instead */ + @Deprecated + private static final String DEFAULT_AUTO_CLEAN = AUTO_CLEAN.defaultValue(); + /** + * @deprecated Use {@link #ASYNC_CLEAN} and its methods instead + */ + @Deprecated + private static final String DEFAULT_ASYNC_CLEAN = ASYNC_CLEAN.defaultValue(); + /** + * @deprecated Use {@link #CLEANER_INCREMENTAL_MODE_ENABLE} and its methods instead + */ + @Deprecated + private static final String DEFAULT_INCREMENTAL_CLEANER 
= CLEANER_INCREMENTAL_MODE_ENABLE.defaultValue(); + /** @deprecated Use {@link #CLEANER_FILE_VERSIONS_RETAINED} and its methods instead */ + @Deprecated + private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = CLEANER_FILE_VERSIONS_RETAINED.defaultValue(); + /** @deprecated Use {@link #CLEANER_COMMITS_RETAINED} and its methods instead */ + @Deprecated + private static final String DEFAULT_CLEANER_COMMITS_RETAINED = CLEANER_COMMITS_RETAINED.defaultValue(); + /** + * @deprecated Use {@link #CLEANER_BOOTSTRAP_BASE_FILE_ENABLE} and its methods instead + */ + @Deprecated + private static final String DEFAULT_CLEANER_BOOTSTRAP_BASE_FILE_ENABLED = CLEANER_BOOTSTRAP_BASE_FILE_ENABLE.defaultValue(); + + private HoodieCleanConfig() { + super(); + } + + public static HoodieCleanConfig.Builder newBuilder() { + return new HoodieCleanConfig.Builder(); + } + + public static class Builder { + + private final HoodieCleanConfig cleanConfig = new HoodieCleanConfig(); + + public HoodieCleanConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.cleanConfig.getProps().load(reader); + return this; + } + } + + public HoodieCleanConfig.Builder fromProperties(Properties props) { + this.cleanConfig.getProps().putAll(props); + return this; + } + + public HoodieCleanConfig.Builder withAutoClean(Boolean autoClean) { + cleanConfig.setValue(AUTO_CLEAN, String.valueOf(autoClean)); + return this; + } + + public HoodieCleanConfig.Builder withAsyncClean(Boolean asyncClean) { + cleanConfig.setValue(ASYNC_CLEAN, String.valueOf(asyncClean)); + return this; + } + + public HoodieCleanConfig.Builder withIncrementalCleaningMode(Boolean incrementalCleaningMode) { + cleanConfig.setValue(CLEANER_INCREMENTAL_MODE_ENABLE, String.valueOf(incrementalCleaningMode)); + return this; + } + + public HoodieCleanConfig.Builder withCleaningTriggerStrategy(String cleaningTriggerStrategy) { + cleanConfig.setValue(CLEAN_TRIGGER_STRATEGY, cleaningTriggerStrategy); + return this; + } + + public HoodieCleanConfig.Builder withMaxCommitsBeforeCleaning(int maxCommitsBeforeCleaning) { + cleanConfig.setValue(CLEAN_MAX_COMMITS, String.valueOf(maxCommitsBeforeCleaning)); + return this; + } + + public HoodieCleanConfig.Builder withCleanerPolicy(HoodieCleaningPolicy policy) { + cleanConfig.setValue(CLEANER_POLICY, policy.name()); + return this; + } + + public HoodieCleanConfig.Builder retainFileVersions(int fileVersionsRetained) { + cleanConfig.setValue(CLEANER_FILE_VERSIONS_RETAINED, String.valueOf(fileVersionsRetained)); + return this; + } + + public HoodieCleanConfig.Builder retainCommits(int commitsRetained) { + cleanConfig.setValue(CLEANER_COMMITS_RETAINED, String.valueOf(commitsRetained)); + return this; + } + + public HoodieCleanConfig.Builder cleanerNumHoursRetained(int cleanerHoursRetained) { + cleanConfig.setValue(CLEANER_HOURS_RETAINED, String.valueOf(cleanerHoursRetained)); + return this; + } + + public HoodieCleanConfig.Builder allowMultipleCleans(boolean allowMultipleCleanSchedules) { + cleanConfig.setValue(ALLOW_MULTIPLE_CLEANS, String.valueOf(allowMultipleCleanSchedules)); + return this; + } + + public HoodieCleanConfig.Builder withCleanerParallelism(int cleanerParallelism) { + cleanConfig.setValue(CLEANER_PARALLELISM_VALUE, String.valueOf(cleanerParallelism)); + return this; + } + + public HoodieCleanConfig.Builder withCleanBootstrapBaseFileEnabled(Boolean cleanBootstrapSourceFileEnabled) { + cleanConfig.setValue(CLEANER_BOOTSTRAP_BASE_FILE_ENABLE, 
String.valueOf(cleanBootstrapSourceFileEnabled)); + return this; + } + + public HoodieCleanConfig.Builder withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy failedWritesPolicy) { + cleanConfig.setValue(FAILED_WRITES_CLEANER_POLICY, failedWritesPolicy.name()); + return this; + } + + public HoodieCleanConfig build() { + cleanConfig.setDefaults(HoodieCleanConfig.class.getName()); + HoodieCleaningPolicy.valueOf(cleanConfig.getString(CLEANER_POLICY)); + return cleanConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java new file mode 100644 index 0000000000000..1180845a6ed8a --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java @@ -0,0 +1,684 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.util.TypeUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieNotSupportedException; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; + +import javax.annotation.Nonnull; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Map; +import java.util.Properties; + +/** + * Clustering specific configs. 
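Before the clustering configs, a hedged sketch of driving the new HoodieCleanConfig builder. Retention and parallelism values simply restate the defaults declared above, async cleaning is switched on purely for illustration, and nothing here is part of the patch itself.

    import org.apache.hudi.common.model.HoodieCleaningPolicy;
    import org.apache.hudi.config.HoodieCleanConfig;

    public class CleanConfigExample {
      public static void main(String[] args) {
        // KEEP_LATEST_COMMITS, 10 retained commits and parallelism 200 mirror the declared defaults.
        HoodieCleanConfig cleanConfig = HoodieCleanConfig.newBuilder()
            .withAutoClean(true)
            .withAsyncClean(true) // run cleaning concurrently with ingestion (default is false)
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
            .retainCommits(10)
            .withCleanerParallelism(200)
            .build();
        // build() validates the policy string by round-tripping it through the enum, as shown above.
        System.out.println(cleanConfig.getString(HoodieCleanConfig.CLEANER_POLICY));
      }
    }
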
+ */ +@ConfigClassProperty(name = "Clustering Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control the clustering table service in hudi, " + + "which optimizes the storage layout for better query performance by sorting and sizing data files.") +public class HoodieClusteringConfig extends HoodieConfig { + + // Any strategy specific params can be saved with this prefix + public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy."; + public static final String SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY = + "org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy"; + public static final String FLINK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY = + "org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy"; + public static final String JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY = + "org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy"; + public static final String SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY = + "org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy"; + public static final String JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY = + "org.apache.hudi.client.clustering.run.strategy.JavaSortAndSizeExecutionStrategy"; + public static final String PLAN_PARTITION_FILTER_MODE = + "hoodie.clustering.plan.partition.filter.mode"; + + // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix + private static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize."; + + public static final ConfigProperty DAYBASED_LOOKBACK_PARTITIONS = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.lookback.partitions") + .defaultValue("2") + .sinceVersion("0.7.0") + .withDocumentation("Number of partitions to list to create ClusteringPlan"); + + public static final ConfigProperty PARTITION_FILTER_BEGIN_PARTITION = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "cluster.begin.partition") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Begin partition used to filter partition (inclusive), only effective when the filter mode '" + + PLAN_PARTITION_FILTER_MODE + "' is " + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()); + + public static final ConfigProperty PARTITION_FILTER_END_PARTITION = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "cluster.end.partition") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("End partition used to filter partition (inclusive), only effective when the filter mode '" + + PLAN_PARTITION_FILTER_MODE + "' is " + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()); + + public static final ConfigProperty PLAN_STRATEGY_SMALL_FILE_LIMIT = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit") + .defaultValue(String.valueOf(300 * 1024 * 1024L)) + .sinceVersion("0.7.0") + .withDocumentation("Files smaller than the size in bytes specified here are candidates for clustering"); + + public static final ConfigProperty PARTITION_REGEX_PATTERN = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "partition.regex.pattern") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Filter clustering partitions that matched regex pattern"); + + public static final ConfigProperty PARTITION_SELECTED = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "partition.selected") + .noDefaultValue() + .sinceVersion("0.11.0") + 
.withDocumentation("Partitions to run clustering"); + + public static final ConfigProperty PLAN_STRATEGY_CLASS_NAME = ConfigProperty + .key("hoodie.clustering.plan.strategy.class") + .defaultValue(SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY) + .sinceVersion("0.7.0") + .withDocumentation("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan " + + "i.e select what file groups are being clustered. Default strategy, looks at the clustering small file size limit (determined by " + + PLAN_STRATEGY_SMALL_FILE_LIMIT.key() + ") to pick the small file slices within partitions for clustering."); + + public static final ConfigProperty EXECUTION_STRATEGY_CLASS_NAME = ConfigProperty + .key("hoodie.clustering.execution.strategy.class") + .defaultValue(SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY) + .sinceVersion("0.7.0") + .withDocumentation("Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the " + + " clustering plan is executed. By default, we sort the file groups in th plan by the specified columns, while " + + " meeting the configured target file sizes."); + + public static final ConfigProperty INLINE_CLUSTERING = ConfigProperty + .key("hoodie.clustering.inline") + .defaultValue("false") + .sinceVersion("0.7.0") + .withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete") + .withAlternatives("hoodie.datasource.clustering.inline.enable"); + + public static final ConfigProperty INLINE_CLUSTERING_MAX_COMMITS = ConfigProperty + .key("hoodie.clustering.inline.max.commits") + .defaultValue("4") + .sinceVersion("0.7.0") + .withDocumentation("Config to control frequency of clustering planning"); + + public static final ConfigProperty ASYNC_CLUSTERING_MAX_COMMITS = ConfigProperty + .key("hoodie.clustering.async.max.commits") + .defaultValue("4") + .sinceVersion("0.9.0") + .withDocumentation("Config to control frequency of async clustering"); + + public static final ConfigProperty PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.skipfromlatest.partitions") + .defaultValue("0") + .sinceVersion("0.9.0") + .withDocumentation("Number of partitions to skip from latest when choosing partitions to create ClusteringPlan"); + + public static final ConfigProperty PLAN_PARTITION_FILTER_MODE_NAME = ConfigProperty + .key(PLAN_PARTITION_FILTER_MODE) + .defaultValue(ClusteringPlanPartitionFilterMode.NONE) + .sinceVersion("0.11.0") + .withDocumentation("Partition filter mode used in the creation of clustering plan. Available values are - " + + "NONE: do not filter table partition and thus the clustering plan will include all partitions that have clustering candidate." + + "RECENT_DAYS: keep a continuous range of partitions, worked together with configs '" + DAYBASED_LOOKBACK_PARTITIONS.key() + "' and '" + + PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST.key() + "." + + "SELECTED_PARTITIONS: keep partitions that are in the specified range ['" + PARTITION_FILTER_BEGIN_PARTITION.key() + "', '" + + PARTITION_FILTER_END_PARTITION.key() + "']."); + + public static final ConfigProperty PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.bytes.per.group") + .defaultValue(String.valueOf(2 * 1024 * 1024 * 1024L)) + .sinceVersion("0.7.0") + .withDocumentation("Each clustering operation can create multiple output file groups. 
Total amount of data processed by clustering operation" + + " is defined by below two properties (CLUSTERING_MAX_BYTES_PER_GROUP * CLUSTERING_MAX_NUM_GROUPS)." + + " Max amount of data to be included in one group"); + + public static final ConfigProperty PLAN_STRATEGY_MAX_GROUPS = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.num.groups") + .defaultValue("30") + .sinceVersion("0.7.0") + .withDocumentation("Maximum number of groups to create as part of ClusteringPlan. Increasing groups will increase parallelism"); + + public static final ConfigProperty PLAN_STRATEGY_TARGET_FILE_MAX_BYTES = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "target.file.max.bytes") + .defaultValue(String.valueOf(1024 * 1024 * 1024L)) + .sinceVersion("0.7.0") + .withDocumentation("Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups"); + + public static final ConfigProperty PLAN_STRATEGY_SORT_COLUMNS = ConfigProperty + .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "sort.columns") + .noDefaultValue() + .sinceVersion("0.7.0") + .withDocumentation("Columns to sort the data by when clustering"); + + public static final ConfigProperty UPDATES_STRATEGY = ConfigProperty + .key("hoodie.clustering.updates.strategy") + .defaultValue("org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy") + .sinceVersion("0.7.0") + .withDocumentation("Determines how to handle updates, deletes to file groups that are under clustering." + + " Default strategy just rejects the update"); + + public static final ConfigProperty SCHEDULE_INLINE_CLUSTERING = ConfigProperty + .key("hoodie.clustering.schedule.inline") + .defaultValue("false") + .withDocumentation("When set to true, clustering service will be attempted for inline scheduling after each write. Users have to ensure " + + "they have a separate job to run async clustering(execution) for the one scheduled by this writer. Users can choose to set both " + + "`hoodie.clustering.inline` and `hoodie.clustering.schedule.inline` to false and have both scheduling and execution triggered by any async process, on which " + + "case `hoodie.clustering.async.enabled` is expected to be set to true. But if `hoodie.clustering.inline` is set to false, and `hoodie.clustering.schedule.inline` " + + "is set to true, regular writers will schedule clustering inline, but users are expected to trigger async job for execution. If `hoodie.clustering.inline` is set " + + "to true, regular writers will do both scheduling and execution inline for clustering"); + + public static final ConfigProperty ASYNC_CLUSTERING_ENABLE = ConfigProperty + .key("hoodie.clustering.async.enabled") + .defaultValue("false") + .sinceVersion("0.7.0") + .withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.") + .withAlternatives("hoodie.datasource.clustering.async.enable"); + + public static final ConfigProperty PRESERVE_COMMIT_METADATA = ConfigProperty + .key("hoodie.clustering.preserve.commit.metadata") + .defaultValue(true) + .sinceVersion("0.9.0") + .withDocumentation("When rewriting data, preserves existing hoodie_commit_time"); + + /** + * @deprecated this setting has no effect. 
Please refer to clustering configuration, as well as + * {@link #LAYOUT_OPTIMIZE_STRATEGY} config to enable advanced record layout optimization strategies + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_ENABLE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "enable") + .defaultValue(false) + .sinceVersion("0.10.0") + .deprecatedAfter("0.11.0") + .withDocumentation("This setting has no effect. Please refer to clustering configuration, as well as " + + "LAYOUT_OPTIMIZE_STRATEGY config to enable advanced record layout optimization strategies"); + + /** + * Determines ordering strategy in for records layout optimization. + * Currently, following strategies are supported + *

+ * <ul>
+ *   <li>Linear: simply orders records lexicographically</li>
+ *   <li>Z-order: orders records along Z-order spatial-curve</li>
+ *   <li>Hilbert: orders records along Hilbert's spatial-curve</li>
+ * </ul>
+ * + * NOTE: "z-order", "hilbert" strategies may consume considerably more compute, than "linear". + * Make sure to perform small-scale local testing for your dataset before applying globally. + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") + .defaultValue("linear") + .sinceVersion("0.10.0") + .withDocumentation("Determines ordering strategy used in records layout optimization. " + + "Currently supported strategies are \"linear\", \"z-order\" and \"hilbert\" values are supported."); + + /** + * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_STRATEGY} value is set to + * either "z-order" or "hilbert" (ie leveraging space-filling curves) + * + * Currently, two methods to order records along the curve are supported "build" and "sample": + * + *
+ * <ul>
+ *   <li>Direct: entails that spatial curve will be built in full, "filling in" all of the individual
+ *       points corresponding to each individual record</li>
+ *   <li>Sample: leverages boundary-based interleaved index method (described in more detail in
+ *       Amazon DynamoDB blog [1])</li>
+ * </ul>
+ * + * NOTE: Boundary-based interleaved Index method has better generalization, + * but is slower than direct method. + * + * Please refer to RFC-28 for specific elaboration on both flows. + * + * [1] https://aws.amazon.com/cn/blogs/database/tag/z-order/ + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.build.method") + .defaultValue("direct") + .sinceVersion("0.10.0") + .withDocumentation("Controls how data is sampled to build the space-filling curves. " + + "Two methods: \"direct\", \"sample\". The direct method is faster than the sampling, " + + "however sample method would produce a better data layout."); + + /** + * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD} value + * is set to "sample" + * + * Determines target sample size used by the Boundary-based Interleaved Index method. + * Larger sample size entails better layout optimization outcomes, at the expense of higher memory + * footprint. + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.size") + .defaultValue("200000") + .sinceVersion("0.10.0") + .withDocumentation("Determines target sample size used by the Boundary-based Interleaved Index method " + + "of building space-filling curve. Larger sample size entails better layout optimization outcomes, " + + "at the expense of higher memory footprint."); + + /** + * @deprecated this setting has no effect + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty + .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable") + .defaultValue(true) + .sinceVersion("0.10.0") + .deprecatedAfter("0.11.0") + .withDocumentation("Enable data skipping by collecting statistics once layout optimization is complete."); + + public static final ConfigProperty ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT = ConfigProperty + .key("hoodie.clustering.rollback.pending.replacecommit.on.conflict") + .defaultValue(false) + .sinceVersion("0.10.0") + .withDocumentation("If updates are allowed to file groups pending clustering, then set this config to rollback failed or pending clustering instants. " + + "Pending clustering will be rolled back ONLY IF there is conflict between incoming upsert and filegroup to be clustered. " + + "Please exercise caution while setting this config, especially when clustering is done very frequently. 
This could lead to race condition in " + + "rare scenarios, for example, when the clustering completes after instants are fetched but before rollback completed."); + + /** + * @deprecated Use {@link #PLAN_STRATEGY_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_PLAN_STRATEGY_CLASS = PLAN_STRATEGY_CLASS_NAME.key(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_PLAN_STRATEGY_CLASS = PLAN_STRATEGY_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #EXECUTION_STRATEGY_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_EXECUTION_STRATEGY_CLASS = EXECUTION_STRATEGY_CLASS_NAME.key(); + /** + * @deprecated Use {@link #EXECUTION_STRATEGY_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_EXECUTION_STRATEGY_CLASS = EXECUTION_STRATEGY_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #INLINE_CLUSTERING} and its methods instead + */ + @Deprecated + public static final String INLINE_CLUSTERING_PROP = INLINE_CLUSTERING.key(); + /** + * @deprecated Use {@link #INLINE_CLUSTERING} and its methods instead + */ + @Deprecated + private static final String DEFAULT_INLINE_CLUSTERING = INLINE_CLUSTERING.defaultValue(); + /** + * @deprecated Use {@link #INLINE_CLUSTERING_MAX_COMMITS} and its methods instead + */ + @Deprecated + public static final String INLINE_CLUSTERING_MAX_COMMIT_PROP = INLINE_CLUSTERING_MAX_COMMITS.key(); + /** + * @deprecated Use {@link #INLINE_CLUSTERING_MAX_COMMITS} and its methods instead + */ + @Deprecated + private static final String DEFAULT_INLINE_CLUSTERING_NUM_COMMITS = INLINE_CLUSTERING_MAX_COMMITS.defaultValue(); + /** + * @deprecated Use {@link #DAYBASED_LOOKBACK_PARTITIONS} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_TARGET_PARTITIONS = DAYBASED_LOOKBACK_PARTITIONS.key(); + /** + * @deprecated Use {@link #DAYBASED_LOOKBACK_PARTITIONS} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_TARGET_PARTITIONS = DAYBASED_LOOKBACK_PARTITIONS.defaultValue(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_SMALL_FILE_LIMIT} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_PLAN_SMALL_FILE_LIMIT = PLAN_STRATEGY_SMALL_FILE_LIMIT.key(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_SMALL_FILE_LIMIT} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_PLAN_SMALL_FILE_LIMIT = PLAN_STRATEGY_SMALL_FILE_LIMIT.defaultValue(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_MAX_BYTES_PER_GROUP = PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP.key(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_MAX_GROUP_SIZE = PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP.defaultValue(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_MAX_GROUPS} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_MAX_NUM_GROUPS = PLAN_STRATEGY_MAX_GROUPS.key(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_MAX_GROUPS} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_MAX_NUM_GROUPS = 
PLAN_STRATEGY_MAX_GROUPS.defaultValue(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_TARGET_FILE_MAX_BYTES} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_TARGET_FILE_MAX_BYTES = PLAN_STRATEGY_TARGET_FILE_MAX_BYTES.key(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_TARGET_FILE_MAX_BYTES} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_TARGET_FILE_MAX_BYTES = PLAN_STRATEGY_TARGET_FILE_MAX_BYTES.defaultValue(); + /** + * @deprecated Use {@link #PLAN_STRATEGY_SORT_COLUMNS} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_SORT_COLUMNS_PROPERTY = PLAN_STRATEGY_SORT_COLUMNS.key(); + /** + * @deprecated Use {@link #UPDATES_STRATEGY} and its methods instead + */ + @Deprecated + public static final String CLUSTERING_UPDATES_STRATEGY_PROP = UPDATES_STRATEGY.key(); + /** + * @deprecated Use {@link #UPDATES_STRATEGY} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CLUSTERING_UPDATES_STRATEGY = UPDATES_STRATEGY.defaultValue(); + /** + * @deprecated Use {@link #ASYNC_CLUSTERING_ENABLE} and its methods instead + */ + @Deprecated + public static final String ASYNC_CLUSTERING_ENABLE_OPT_KEY = ASYNC_CLUSTERING_ENABLE.key(); + /** @deprecated Use {@link #ASYNC_CLUSTERING_ENABLE} and its methods instead */ + @Deprecated + public static final String DEFAULT_ASYNC_CLUSTERING_ENABLE_OPT_VAL = ASYNC_CLUSTERING_ENABLE.defaultValue(); + + // NOTE: This ctor is required for appropriate deserialization + public HoodieClusteringConfig() { + super(); + } + + public boolean isAsyncClusteringEnabled() { + return getBooleanOrDefault(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE); + } + + public boolean isInlineClusteringEnabled() { + return getBooleanOrDefault(HoodieClusteringConfig.INLINE_CLUSTERING); + } + + public static HoodieClusteringConfig from(TypedProperties props) { + return HoodieClusteringConfig.newBuilder().fromProperties(props).build(); + } + + public static Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final HoodieClusteringConfig clusteringConfig = new HoodieClusteringConfig(); + private EngineType engineType = EngineType.SPARK; + + public Builder withEngineType(EngineType engineType) { + this.engineType = engineType; + return this; + } + + public Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.clusteringConfig.getProps().load(reader); + return this; + } + } + + public Builder withClusteringPlanStrategyClass(String clusteringStrategyClass) { + clusteringConfig.setValue(PLAN_STRATEGY_CLASS_NAME, clusteringStrategyClass); + return this; + } + + public Builder withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode mode) { + clusteringConfig.setValue(PLAN_PARTITION_FILTER_MODE_NAME.key(), mode.toString()); + return this; + } + + public Builder withClusteringExecutionStrategyClass(String runClusteringStrategyClass) { + clusteringConfig.setValue(EXECUTION_STRATEGY_CLASS_NAME, runClusteringStrategyClass); + return this; + } + + public Builder withClusteringTargetPartitions(int clusteringTargetPartitions) { + clusteringConfig.setValue(DAYBASED_LOOKBACK_PARTITIONS, String.valueOf(clusteringTargetPartitions)); + return this; + } + + public Builder withClusteringPartitionRegexPattern(String pattern) { + clusteringConfig.setValue(PARTITION_REGEX_PATTERN, pattern); + return this; + } + + public Builder 
withClusteringPartitionSelected(String partitionSelected) { + clusteringConfig.setValue(PARTITION_SELECTED, partitionSelected); + return this; + } + + public Builder withClusteringSkipPartitionsFromLatest(int clusteringSkipPartitionsFromLatest) { + clusteringConfig.setValue(PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST, String.valueOf(clusteringSkipPartitionsFromLatest)); + return this; + } + + public Builder withClusteringPartitionFilterBeginPartition(String begin) { + clusteringConfig.setValue(PARTITION_FILTER_BEGIN_PARTITION, begin); + return this; + } + + public Builder withClusteringPartitionFilterEndPartition(String end) { + clusteringConfig.setValue(PARTITION_FILTER_END_PARTITION, end); + return this; + } + + public Builder withClusteringPlanSmallFileLimit(long clusteringSmallFileLimit) { + clusteringConfig.setValue(PLAN_STRATEGY_SMALL_FILE_LIMIT, String.valueOf(clusteringSmallFileLimit)); + return this; + } + + public Builder withClusteringSortColumns(String sortColumns) { + clusteringConfig.setValue(PLAN_STRATEGY_SORT_COLUMNS, sortColumns); + return this; + } + + public Builder withClusteringMaxBytesInGroup(long clusteringMaxGroupSize) { + clusteringConfig.setValue(PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP, String.valueOf(clusteringMaxGroupSize)); + return this; + } + + public Builder withClusteringMaxNumGroups(int maxNumGroups) { + clusteringConfig.setValue(PLAN_STRATEGY_MAX_GROUPS, String.valueOf(maxNumGroups)); + return this; + } + + public Builder withClusteringTargetFileMaxBytes(long targetFileSize) { + clusteringConfig.setValue(PLAN_STRATEGY_TARGET_FILE_MAX_BYTES, String.valueOf(targetFileSize)); + return this; + } + + public Builder withInlineClustering(Boolean inlineClustering) { + clusteringConfig.setValue(INLINE_CLUSTERING, String.valueOf(inlineClustering)); + return this; + } + + public Builder withScheduleInlineClustering(Boolean scheduleInlineClustering) { + clusteringConfig.setValue(SCHEDULE_INLINE_CLUSTERING, String.valueOf(scheduleInlineClustering)); + return this; + } + + public Builder withInlineClusteringNumCommits(int numCommits) { + clusteringConfig.setValue(INLINE_CLUSTERING_MAX_COMMITS, String.valueOf(numCommits)); + return this; + } + + public Builder withAsyncClusteringMaxCommits(int numCommits) { + clusteringConfig.setValue(ASYNC_CLUSTERING_MAX_COMMITS, String.valueOf(numCommits)); + return this; + } + + public Builder fromProperties(Properties props) { + // TODO this should cherry-pick only clustering properties + this.clusteringConfig.getProps().putAll(props); + return this; + } + + public Builder withClusteringUpdatesStrategy(String updatesStrategyClass) { + clusteringConfig.setValue(UPDATES_STRATEGY, updatesStrategyClass); + return this; + } + + public Builder withAsyncClustering(Boolean asyncClustering) { + clusteringConfig.setValue(ASYNC_CLUSTERING_ENABLE, String.valueOf(asyncClustering)); + return this; + } + + public Builder withPreserveHoodieCommitMetadata(Boolean preserveHoodieCommitMetadata) { + clusteringConfig.setValue(PRESERVE_COMMIT_METADATA, String.valueOf(preserveHoodieCommitMetadata)); + return this; + } + + public Builder withRollbackPendingClustering(Boolean rollbackPendingClustering) { + clusteringConfig.setValue(ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT, String.valueOf(rollbackPendingClustering)); + return this; + } + + public Builder withDataOptimizeStrategy(String strategy) { + clusteringConfig.setValue(LAYOUT_OPTIMIZE_STRATEGY, strategy); + return this; + } + + public Builder withDataOptimizeBuildCurveStrategy(String method) { + 
clusteringConfig.setValue(LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD, method); + return this; + } + + public Builder withDataOptimizeBuildCurveSampleNumber(int sampleNumber) { + clusteringConfig.setValue(LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE, String.valueOf(sampleNumber)); + return this; + } + + public HoodieClusteringConfig build() { + clusteringConfig.setDefaultValue( + PLAN_STRATEGY_CLASS_NAME, getDefaultPlanStrategyClassName(engineType)); + clusteringConfig.setDefaultValue( + EXECUTION_STRATEGY_CLASS_NAME, getDefaultExecutionStrategyClassName(engineType)); + clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName()); + + boolean inlineCluster = clusteringConfig.getBoolean(HoodieClusteringConfig.INLINE_CLUSTERING); + boolean inlineClusterSchedule = clusteringConfig.getBoolean(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING); + ValidationUtils.checkArgument(!(inlineCluster && inlineClusterSchedule), String.format("Either of inline clustering (%s) or " + + "schedule inline clustering (%s) can be enabled. Both can't be set to true at the same time. %s,%s", HoodieClusteringConfig.INLINE_CLUSTERING.key(), + HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING.key(), inlineCluster, inlineClusterSchedule)); + return clusteringConfig; + } + + private String getDefaultPlanStrategyClassName(EngineType engineType) { + switch (engineType) { + case SPARK: + return SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY; + case FLINK: + case JAVA: + return JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY; + default: + throw new HoodieNotSupportedException("Unsupported engine " + engineType); + } + } + + private String getDefaultExecutionStrategyClassName(EngineType engineType) { + switch (engineType) { + case SPARK: + return SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY; + case FLINK: + case JAVA: + return JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY; + default: + throw new HoodieNotSupportedException("Unsupported engine " + engineType); + } + } + } + + /** + * Type of a strategy for building Z-order/Hilbert space-filling curves. 
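To tie the clustering builder together, a hedged usage sketch: the sort columns and file-size target are placeholders, "z-order" is one of the documented layout-optimization strategy values, and only builder methods defined earlier in this class are used.

    import org.apache.hudi.config.HoodieClusteringConfig;

    public class ClusteringConfigExample {
      public static void main(String[] args) {
        // Inline clustering every 4 commits, sorting by placeholder columns with a z-order layout.
        HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
            .withInlineClustering(true)
            .withInlineClusteringNumCommits(4)
            .withClusteringSortColumns("city,ts")
            .withDataOptimizeStrategy("z-order")
            .withClusteringTargetFileMaxBytes(1024 * 1024 * 1024L)
            .build();
        // build() rejects enabling both inline clustering and inline schedule-only mode, per the
        // ValidationUtils check shown above.
        System.out.println(clusteringConfig.isInlineClusteringEnabled());
      }
    }
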
+ */ + public enum SpatialCurveCompositionStrategyType { + DIRECT("direct"), + SAMPLE("sample"); + + private static final Map VALUE_TO_ENUM_MAP = + TypeUtils.getValueToEnumMap(SpatialCurveCompositionStrategyType.class, e -> e.value); + + private final String value; + + SpatialCurveCompositionStrategyType(String value) { + this.value = value; + } + + public static SpatialCurveCompositionStrategyType fromValue(String value) { + SpatialCurveCompositionStrategyType enumValue = VALUE_TO_ENUM_MAP.get(value); + if (enumValue == null) { + throw new HoodieException(String.format("Invalid value (%s)", value)); + } + + return enumValue; + } + } + + /** + * Layout optimization strategies such as Z-order/Hilbert space-curves, etc + */ + public enum LayoutOptimizationStrategy { + LINEAR("linear"), + ZORDER("z-order"), + HILBERT("hilbert"); + + private static final Map VALUE_TO_ENUM_MAP = + TypeUtils.getValueToEnumMap(LayoutOptimizationStrategy.class, e -> e.value); + + private final String value; + + LayoutOptimizationStrategy(String value) { + this.value = value; + } + + @Nonnull + public static LayoutOptimizationStrategy fromValue(String value) { + LayoutOptimizationStrategy enumValue = VALUE_TO_ENUM_MAP.get(value); + if (enumValue == null) { + throw new HoodieException(String.format("Invalid value (%s)", value)); + } + + return enumValue; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java index 08f37740725dd..b6fe6d8aa026f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java @@ -18,10 +18,11 @@ package org.apache.hudi.config; -import org.apache.hudi.common.config.DefaultHoodieConfig; -import org.apache.hudi.common.model.HoodieCleaningPolicy; -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; -import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.table.action.compact.CompactionTriggerStrategy; import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; import org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy; @@ -30,98 +31,264 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.util.Arrays; import java.util.Properties; +import java.util.stream.Collectors; /** * Compaction related config. 
*/ @Immutable -public class HoodieCompactionConfig extends DefaultHoodieConfig { - - public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy"; - public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic"; - public static final String ASYNC_CLEAN_PROP = "hoodie.clean.async"; - - // Turn on inline compaction - after fw delta commits a inline compaction will be run - public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline"; - // Run a compaction every N delta commits - public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max.delta.commits"; - public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = "hoodie.cleaner.fileversions.retained"; - public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained"; - public static final String CLEANER_INCREMENTAL_MODE = "hoodie.cleaner.incremental.mode"; - public static final String MAX_COMMITS_TO_KEEP_PROP = "hoodie.keep.max.commits"; - public static final String MIN_COMMITS_TO_KEEP_PROP = "hoodie.keep.min.commits"; - public static final String COMMITS_ARCHIVAL_BATCH_SIZE_PROP = "hoodie.commits.archival.batch"; - // Set true to clean bootstrap source files when necessary - public static final String CLEANER_BOOTSTRAP_BASE_FILE_ENABLED = "hoodie.cleaner.delete.bootstrap.base.file"; - // Upsert uses this file size to compact new data onto existing files.. - public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit"; - // By default, treat any file <= 100MB as a small file. - public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(104857600); - // Hudi will use the previous commit to calculate the estimated record size by totalBytesWritten/totalRecordsWritten. - // If the previous commit is too small to make an accurate estimation, Hudi will search commits in the reverse order, - // until find a commit has totalBytesWritten larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * RECORD_SIZE_ESTIMATION_THRESHOLD) - public static final String RECORD_SIZE_ESTIMATION_THRESHOLD_PROP = "hoodie.record.size.estimation.threshold"; - public static final String DEFAULT_RECORD_SIZE_ESTIMATION_THRESHOLD = "1.0"; +@ConfigClassProperty(name = "Compaction Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control compaction " + + "(merging of log files onto a new base files).") +public class HoodieCompactionConfig extends HoodieConfig { + + public static final ConfigProperty INLINE_COMPACT = ConfigProperty + .key("hoodie.compact.inline") + .defaultValue("false") + .withDocumentation("When set to true, compaction service is triggered after each write. While being " + + " simpler operationally, this adds extra latency on the write path."); + + public static final ConfigProperty SCHEDULE_INLINE_COMPACT = ConfigProperty + .key("hoodie.compact.schedule.inline") + .defaultValue("false") + .withDocumentation("When set to true, compaction service will be attempted for inline scheduling after each write. Users have to ensure " + + "they have a separate job to run async compaction(execution) for the one scheduled by this writer. Users can choose to set both " + + "`hoodie.compact.inline` and `hoodie.compact.schedule.inline` to false and have both scheduling and execution triggered by any async process. 
" + + "But if `hoodie.compact.inline` is set to false, and `hoodie.compact.schedule.inline` is set to true, regular writers will schedule compaction inline, " + + "but users are expected to trigger async job for execution. If `hoodie.compact.inline` is set to true, regular writers will do both scheduling and " + + "execution inline for compaction"); + + public static final ConfigProperty INLINE_COMPACT_NUM_DELTA_COMMITS = ConfigProperty + .key("hoodie.compact.inline.max.delta.commits") + .defaultValue("5") + .withDocumentation("Number of delta commits after the last compaction, before scheduling of a new compaction is attempted. " + + "This config takes effect only for the compaction triggering strategy based on the number of commits, " + + "i.e., NUM_COMMITS, NUM_COMMITS_AFTER_LAST_REQUEST, NUM_AND_TIME, and NUM_OR_TIME."); + + public static final ConfigProperty INLINE_COMPACT_TIME_DELTA_SECONDS = ConfigProperty + .key("hoodie.compact.inline.max.delta.seconds") + .defaultValue(String.valueOf(60 * 60)) + .withDocumentation("Number of elapsed seconds after the last compaction, before scheduling a new one. " + + "This config takes effect only for the compaction triggering strategy based on the elapsed time, " + + "i.e., TIME_ELAPSED, NUM_AND_TIME, and NUM_OR_TIME."); + + public static final ConfigProperty INLINE_COMPACT_TRIGGER_STRATEGY = ConfigProperty + .key("hoodie.compact.inline.trigger.strategy") + .defaultValue(CompactionTriggerStrategy.NUM_COMMITS.name()) + .withDocumentation("Controls how compaction scheduling is triggered, by time or num delta commits or combination of both. " + + "Valid options: " + Arrays.stream(CompactionTriggerStrategy.values()).map(Enum::name).collect(Collectors.joining(","))); + + public static final ConfigProperty PARQUET_SMALL_FILE_LIMIT = ConfigProperty + .key("hoodie.parquet.small.file.limit") + .defaultValue(String.valueOf(104857600)) + .withDocumentation("During upsert operation, we opportunistically expand existing small files on storage, instead of writing" + + " new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage " + + " becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file." + + " Also note that if this set <= 0, will not try to get small files and directly write new files"); + + public static final ConfigProperty RECORD_SIZE_ESTIMATION_THRESHOLD = ConfigProperty + .key("hoodie.record.size.estimation.threshold") + .defaultValue("1.0") + .withDocumentation("We use the previous commits' metadata to calculate the estimated record size and use it " + + " to bin pack records into partitions. If the previous commit is too small to make an accurate estimation, " + + " Hudi will search commits in the reverse order, until we find a commit that has totalBytesWritten " + + " larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * this_threshold)"); + + // 500GB of target IO per compaction (both read and write + public static final ConfigProperty TARGET_IO_PER_COMPACTION_IN_MB = ConfigProperty + .key("hoodie.compaction.target.io") + .defaultValue(String.valueOf(500 * 1024)) + .withDocumentation("Amount of MBs to spend during compaction run for the LogFileSizeBasedCompactionStrategy. 
" + + "This value helps bound ingestion latency while compaction is run inline mode."); + + public static final ConfigProperty COMPACTION_LOG_FILE_SIZE_THRESHOLD = ConfigProperty + .key("hoodie.compaction.logfile.size.threshold") + .defaultValue(0L) + .withDocumentation("Only if the log file size is greater than the threshold in bytes," + + " the file group will be compacted."); + + public static final ConfigProperty COMPACTION_LOG_FILE_NUM_THRESHOLD = ConfigProperty + .key("hoodie.compaction.logfile.num.threshold") + .defaultValue(0L) + .withDocumentation("Only if the log file num is greater than the threshold," + + " the file group will be compacted."); + + public static final ConfigProperty COMPACTION_STRATEGY = ConfigProperty + .key("hoodie.compaction.strategy") + .defaultValue(LogFileSizeBasedCompactionStrategy.class.getName()) + .withDocumentation("Compaction strategy decides which file groups are picked up for " + + "compaction during each compaction run. By default. Hudi picks the log file " + + "with most accumulated unmerged data"); + + public static final ConfigProperty COMPACTION_LAZY_BLOCK_READ_ENABLE = ConfigProperty + .key("hoodie.compaction.lazy.block.read") + .defaultValue("true") + .withDocumentation("When merging the delta log files, this config helps to choose whether the log blocks " + + "should be read lazily or not. Choose true to use lazy block reading (low memory usage, but incurs seeks to each block" + + " header) or false for immediate block read (higher memory usage)"); + + public static final ConfigProperty COMPACTION_REVERSE_LOG_READ_ENABLE = ConfigProperty + .key("hoodie.compaction.reverse.log.read") + .defaultValue("false") + .withDocumentation("HoodieLogFormatReader reads a logfile in the forward direction starting from pos=0 to pos=file_length. " + + "If this config is set to true, the reader reads the logfile in reverse direction, from pos=file_length to pos=0"); + + public static final ConfigProperty TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = ConfigProperty + .key("hoodie.compaction.daybased.target.partitions") + .defaultValue("10") + .withDocumentation("Used by org.apache.hudi.io.compact.strategy.DayBasedCompactionStrategy to denote the number of " + + "latest partitions to compact during a compaction run."); + + public static final ConfigProperty PRESERVE_COMMIT_METADATA = ConfigProperty + .key("hoodie.compaction.preserve.commit.metadata") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("When rewriting data, preserves existing hoodie_commit_time"); /** * Configs related to specific table types. */ - // Number of inserts, that will be put each partition/bucket for writing - public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size"; - // The rationale to pick the insert parallelism is the following. Writing out 100MB files, - // with atleast 1kb records, means 100K records per file. 
we just overprovision to 500K - public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000); - // Config to control whether we control insert split sizes automatically based on average - // record sizes - public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert.auto.split"; - // its off by default - public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(true); - // This value is used as a guesstimate for the record size, if we can't determine this from - // previous commits - public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate"; - // Used to determine how much more can be packed into a small file, before it exceeds the size - // limit. - public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024); - public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism"; - public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200); - public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = "hoodie.compaction.target.io"; - // 500GB of target IO per compaction (both read and write) - public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = String.valueOf(500 * 1024); - public static final String COMPACTION_STRATEGY_PROP = "hoodie.compaction.strategy"; - // 200GB of target IO per compaction - public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName(); - // used to merge records written to log file - public static final String DEFAULT_PAYLOAD_CLASS = OverwriteWithLatestAvroPayload.class.getName(); - public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class"; - - // used to choose a trade off between IO vs Memory when performing compaction process - // Depending on outputfile_size and memory provided, choose true to avoid OOM for large file - // size + small memory - public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = "hoodie.compaction.lazy.block.read"; - public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = "false"; - // used to choose whether to enable reverse log reading (reverse log traversal) - public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = "hoodie.compaction.reverse.log.read"; - public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = "false"; - private static final String DEFAULT_CLEANER_POLICY = HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name(); - private static final String DEFAULT_AUTO_CLEAN = "true"; - private static final String DEFAULT_ASYNC_CLEAN = "false"; - private static final String DEFAULT_INLINE_COMPACT = "false"; - private static final String DEFAULT_INCREMENTAL_CLEANER = "true"; - private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "5"; - private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3"; - private static final String DEFAULT_CLEANER_COMMITS_RETAINED = "10"; - private static final String DEFAULT_MAX_COMMITS_TO_KEEP = "30"; - private static final String DEFAULT_MIN_COMMITS_TO_KEEP = "20"; - private static final String DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE = String.valueOf(10); - private static final String DEFAULT_CLEANER_BOOTSTRAP_BASE_FILE_ENABLED = "false"; - public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = - "hoodie.compaction.daybased.target.partitions"; - // 500GB of target IO per compaction (both read and 
write) - public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = String.valueOf(10); - - private HoodieCompactionConfig(Properties props) { - super(props); + public static final ConfigProperty COPY_ON_WRITE_INSERT_SPLIT_SIZE = ConfigProperty + .key("hoodie.copyonwrite.insert.split.size") + .defaultValue(String.valueOf(500000)) + .withDocumentation("Number of inserts assigned for each partition/bucket for writing. " + + "We based the default on writing out 100MB files, with at least 1kb records (100K records per file), and " + + " over provision to 500K. As long as auto-tuning of splits is turned on, this only affects the first " + + " write, where there is no history to learn record sizes from."); + + public static final ConfigProperty COPY_ON_WRITE_AUTO_SPLIT_INSERTS = ConfigProperty + .key("hoodie.copyonwrite.insert.auto.split") + .defaultValue("true") + .withDocumentation("Config to control whether we control insert split sizes automatically based on average" + + " record sizes. It's recommended to keep this turned on, since hand tuning is otherwise extremely" + + " cumbersome."); + + public static final ConfigProperty COPY_ON_WRITE_RECORD_SIZE_ESTIMATE = ConfigProperty + .key("hoodie.copyonwrite.record.size.estimate") + .defaultValue(String.valueOf(1024)) + .withDocumentation("The average record size. If not explicitly specified, hudi will compute the " + + "record size estimate compute dynamically based on commit metadata. " + + " This is critical in computing the insert parallelism and bin-packing inserts into small files."); + + + /** @deprecated Use {@link #INLINE_COMPACT} and its methods instead */ + @Deprecated + public static final String INLINE_COMPACT_PROP = INLINE_COMPACT.key(); + /** @deprecated Use {@link #INLINE_COMPACT_NUM_DELTA_COMMITS} and its methods instead */ + @Deprecated + public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = INLINE_COMPACT_NUM_DELTA_COMMITS.key(); + /** @deprecated Use {@link #INLINE_COMPACT_TIME_DELTA_SECONDS} and its methods instead */ + @Deprecated + public static final String INLINE_COMPACT_TIME_DELTA_SECONDS_PROP = INLINE_COMPACT_TIME_DELTA_SECONDS.key(); + /** @deprecated Use {@link #INLINE_COMPACT_TRIGGER_STRATEGY} and its methods instead */ + @Deprecated + public static final String INLINE_COMPACT_TRIGGER_STRATEGY_PROP = INLINE_COMPACT_TRIGGER_STRATEGY.key(); + /** + * @deprecated Use {@link #PARQUET_SMALL_FILE_LIMIT} and its methods instead + */ + @Deprecated + public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = PARQUET_SMALL_FILE_LIMIT.key(); + /** + * @deprecated Use {@link #PARQUET_SMALL_FILE_LIMIT} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = PARQUET_SMALL_FILE_LIMIT.defaultValue(); + /** + * @deprecated Use {@link #RECORD_SIZE_ESTIMATION_THRESHOLD} and its methods instead + */ + @Deprecated + public static final String RECORD_SIZE_ESTIMATION_THRESHOLD_PROP = RECORD_SIZE_ESTIMATION_THRESHOLD.key(); + /** + * @deprecated Use {@link #RECORD_SIZE_ESTIMATION_THRESHOLD} and its methods instead + */ + @Deprecated + public static final String DEFAULT_RECORD_SIZE_ESTIMATION_THRESHOLD = RECORD_SIZE_ESTIMATION_THRESHOLD.defaultValue(); + /** + * @deprecated Use {@link #COPY_ON_WRITE_INSERT_SPLIT_SIZE} and its methods instead + */ + @Deprecated + public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = COPY_ON_WRITE_INSERT_SPLIT_SIZE.key(); + /** + * @deprecated Use {@link #COPY_ON_WRITE_INSERT_SPLIT_SIZE} and its methods 
instead + */ + @Deprecated + public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = COPY_ON_WRITE_INSERT_SPLIT_SIZE.defaultValue(); + /** + * @deprecated Use {@link #COPY_ON_WRITE_AUTO_SPLIT_INSERTS} and its methods instead + */ + @Deprecated + public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = COPY_ON_WRITE_AUTO_SPLIT_INSERTS.key(); + /** + * @deprecated Use {@link #COPY_ON_WRITE_AUTO_SPLIT_INSERTS} and its methods instead + */ + @Deprecated + public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = COPY_ON_WRITE_AUTO_SPLIT_INSERTS.defaultValue(); + /** + * @deprecated Use {@link #COPY_ON_WRITE_RECORD_SIZE_ESTIMATE} and its methods instead + */ + @Deprecated + public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = COPY_ON_WRITE_RECORD_SIZE_ESTIMATE.key(); + /** + * @deprecated Use {@link #COPY_ON_WRITE_RECORD_SIZE_ESTIMATE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = COPY_ON_WRITE_RECORD_SIZE_ESTIMATE.defaultValue(); + /** + * @deprecated Use {@link #TARGET_IO_PER_COMPACTION_IN_MB} and its methods instead + */ + @Deprecated + public static final String TARGET_IO_PER_COMPACTION_IN_MB_PROP = TARGET_IO_PER_COMPACTION_IN_MB.key(); + /** + * @deprecated Use {@link #TARGET_IO_PER_COMPACTION_IN_MB} and its methods instead + */ + @Deprecated + public static final String DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB = TARGET_IO_PER_COMPACTION_IN_MB.defaultValue(); + /** + * @deprecated Use {@link #COMPACTION_STRATEGY} and its methods instead + */ + @Deprecated + public static final String COMPACTION_STRATEGY_PROP = COMPACTION_STRATEGY.key(); + /** @deprecated Use {@link #COMPACTION_STRATEGY} and its methods instead */ + @Deprecated + public static final String DEFAULT_COMPACTION_STRATEGY = COMPACTION_STRATEGY.defaultValue(); + /** @deprecated Use {@link #COMPACTION_LAZY_BLOCK_READ_ENABLE} and its methods instead */ + @Deprecated + public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = COMPACTION_LAZY_BLOCK_READ_ENABLE.key(); + /** @deprecated Use {@link #COMPACTION_LAZY_BLOCK_READ_ENABLE} and its methods instead */ + @Deprecated + public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue(); + /** @deprecated Use {@link #COMPACTION_REVERSE_LOG_READ_ENABLE} and its methods instead */ + @Deprecated + public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = COMPACTION_REVERSE_LOG_READ_ENABLE.key(); + /** @deprecated Use {@link #COMPACTION_REVERSE_LOG_READ_ENABLE} and its methods instead */ + @Deprecated + public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #INLINE_COMPACT} and its methods instead + */ + @Deprecated + private static final String DEFAULT_INLINE_COMPACT = INLINE_COMPACT.defaultValue(); + /** @deprecated Use {@link #INLINE_COMPACT_NUM_DELTA_COMMITS} and its methods instead */ + @Deprecated + private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = INLINE_COMPACT_NUM_DELTA_COMMITS.defaultValue(); + /** @deprecated Use {@link #INLINE_COMPACT_TIME_DELTA_SECONDS} and its methods instead */ + @Deprecated + private static final String DEFAULT_INLINE_COMPACT_TIME_DELTA_SECONDS = INLINE_COMPACT_TIME_DELTA_SECONDS.defaultValue(); + /** @deprecated Use {@link #INLINE_COMPACT_TRIGGER_STRATEGY} and its methods instead */ + @Deprecated + private static 
final String DEFAULT_INLINE_COMPACT_TRIGGER_STRATEGY = INLINE_COMPACT_TRIGGER_STRATEGY.defaultValue(); + /** @deprecated Use {@link #TARGET_PARTITIONS_PER_DAYBASED_COMPACTION} and its methods instead */ + @Deprecated + public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = TARGET_PARTITIONS_PER_DAYBASED_COMPACTION.key(); + /** @deprecated Use {@link #TARGET_PARTITIONS_PER_DAYBASED_COMPACTION} and its methods instead */ + @Deprecated + public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = TARGET_PARTITIONS_PER_DAYBASED_COMPACTION.defaultValue(); + + private HoodieCompactionConfig() { + super(); } public static HoodieCompactionConfig.Builder newBuilder() { @@ -130,205 +297,113 @@ public static HoodieCompactionConfig.Builder newBuilder() { public static class Builder { - private final Properties props = new Properties(); + private final HoodieCompactionConfig compactionConfig = new HoodieCompactionConfig(); public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.compactionConfig.getProps().load(reader); return this; } } public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - public Builder withAutoClean(Boolean autoClean) { - props.setProperty(AUTO_CLEAN_PROP, String.valueOf(autoClean)); - return this; - } - - public Builder withAsyncClean(Boolean asyncClean) { - props.setProperty(ASYNC_CLEAN_PROP, String.valueOf(asyncClean)); - return this; - } - - public Builder withIncrementalCleaningMode(Boolean incrementalCleaningMode) { - props.setProperty(CLEANER_INCREMENTAL_MODE, String.valueOf(incrementalCleaningMode)); + this.compactionConfig.getProps().putAll(props); return this; } public Builder withInlineCompaction(Boolean inlineCompaction) { - props.setProperty(INLINE_COMPACT_PROP, String.valueOf(inlineCompaction)); - return this; - } - - public Builder withCleanerPolicy(HoodieCleaningPolicy policy) { - props.setProperty(CLEANER_POLICY_PROP, policy.name()); + compactionConfig.setValue(INLINE_COMPACT, String.valueOf(inlineCompaction)); return this; } - public Builder retainFileVersions(int fileVersionsRetained) { - props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP, String.valueOf(fileVersionsRetained)); + public Builder withScheduleInlineCompaction(Boolean scheduleAsyncCompaction) { + compactionConfig.setValue(SCHEDULE_INLINE_COMPACT, String.valueOf(scheduleAsyncCompaction)); return this; } - public Builder retainCommits(int commitsRetained) { - props.setProperty(CLEANER_COMMITS_RETAINED_PROP, String.valueOf(commitsRetained)); - return this; - } - - public Builder archiveCommitsWith(int minToKeep, int maxToKeep) { - props.setProperty(MIN_COMMITS_TO_KEEP_PROP, String.valueOf(minToKeep)); - props.setProperty(MAX_COMMITS_TO_KEEP_PROP, String.valueOf(maxToKeep)); + public Builder withInlineCompactionTriggerStrategy(CompactionTriggerStrategy compactionTriggerStrategy) { + compactionConfig.setValue(INLINE_COMPACT_TRIGGER_STRATEGY, compactionTriggerStrategy.name()); return this; } public Builder compactionSmallFileSize(long smallFileLimitBytes) { - props.setProperty(PARQUET_SMALL_FILE_LIMIT_BYTES, String.valueOf(smallFileLimitBytes)); + compactionConfig.setValue(PARQUET_SMALL_FILE_LIMIT, String.valueOf(smallFileLimitBytes)); return this; } public Builder compactionRecordSizeEstimateThreshold(double threshold) { - props.setProperty(RECORD_SIZE_ESTIMATION_THRESHOLD_PROP, String.valueOf(threshold)); + 
compactionConfig.setValue(RECORD_SIZE_ESTIMATION_THRESHOLD, String.valueOf(threshold)); return this; } public Builder insertSplitSize(int insertSplitSize) { - props.setProperty(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, String.valueOf(insertSplitSize)); + compactionConfig.setValue(COPY_ON_WRITE_INSERT_SPLIT_SIZE, String.valueOf(insertSplitSize)); return this; } public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) { - props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits)); + compactionConfig.setValue(COPY_ON_WRITE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits)); return this; } public Builder approxRecordSize(int recordSizeEstimate) { - props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate)); - return this; - } - - public Builder withCleanerParallelism(int cleanerParallelism) { - props.setProperty(CLEANER_PARALLELISM, String.valueOf(cleanerParallelism)); + compactionConfig.setValue(COPY_ON_WRITE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate)); return this; } public Builder withCompactionStrategy(CompactionStrategy compactionStrategy) { - props.setProperty(COMPACTION_STRATEGY_PROP, compactionStrategy.getClass().getName()); + compactionConfig.setValue(COMPACTION_STRATEGY, compactionStrategy.getClass().getName()); return this; } - public Builder withPayloadClass(String payloadClassName) { - props.setProperty(PAYLOAD_CLASS_PROP, payloadClassName); + public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { + compactionConfig.setValue(TARGET_IO_PER_COMPACTION_IN_MB, String.valueOf(targetIOPerCompactionInMB)); return this; } - public Builder withTargetIOPerCompactionInMB(long targetIOPerCompactionInMB) { - props.setProperty(TARGET_IO_PER_COMPACTION_IN_MB_PROP, String.valueOf(targetIOPerCompactionInMB)); + public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) { + compactionConfig.setValue(INLINE_COMPACT_NUM_DELTA_COMMITS, String.valueOf(maxNumDeltaCommitsBeforeCompaction)); return this; } - public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) { - props.setProperty(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, String.valueOf(maxNumDeltaCommitsBeforeCompaction)); + public Builder withMaxDeltaSecondsBeforeCompaction(int maxDeltaSecondsBeforeCompaction) { + compactionConfig.setValue(INLINE_COMPACT_TIME_DELTA_SECONDS, String.valueOf(maxDeltaSecondsBeforeCompaction)); return this; } public Builder withCompactionLazyBlockReadEnabled(Boolean compactionLazyBlockReadEnabled) { - props.setProperty(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, String.valueOf(compactionLazyBlockReadEnabled)); + compactionConfig.setValue(COMPACTION_LAZY_BLOCK_READ_ENABLE, String.valueOf(compactionLazyBlockReadEnabled)); return this; } public Builder withCompactionReverseLogReadEnabled(Boolean compactionReverseLogReadEnabled) { - props.setProperty(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, String.valueOf(compactionReverseLogReadEnabled)); + compactionConfig.setValue(COMPACTION_REVERSE_LOG_READ_ENABLE, String.valueOf(compactionReverseLogReadEnabled)); return this; } public Builder withTargetPartitionsPerDayBasedCompaction(int targetPartitionsPerCompaction) { - props.setProperty(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, String.valueOf(targetPartitionsPerCompaction)); + compactionConfig.setValue(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION, String.valueOf(targetPartitionsPerCompaction)); + return this; + } + + public Builder 
withLogFileSizeThresholdBasedCompaction(long logFileSizeThreshold) { + compactionConfig.setValue(COMPACTION_LOG_FILE_SIZE_THRESHOLD, String.valueOf(logFileSizeThreshold)); return this; } - public Builder withCommitsArchivalBatchSize(int batchSize) { - props.setProperty(COMMITS_ARCHIVAL_BATCH_SIZE_PROP, String.valueOf(batchSize)); + public Builder withCompactionLogFileNumThreshold(int logFileNumThreshold) { + compactionConfig.setValue(COMPACTION_LOG_FILE_NUM_THRESHOLD, String.valueOf(logFileNumThreshold)); return this; } - public Builder withCleanBootstrapBaseFileEnabled(Boolean cleanBootstrapSourceFileEnabled) { - props.setProperty(CLEANER_BOOTSTRAP_BASE_FILE_ENABLED, String.valueOf(cleanBootstrapSourceFileEnabled)); + public Builder withPreserveCommitMetadata(boolean preserveCommitMetadata) { + compactionConfig.setValue(PRESERVE_COMMIT_METADATA, String.valueOf(preserveCommitMetadata)); return this; } public HoodieCompactionConfig build() { - HoodieCompactionConfig config = new HoodieCompactionConfig(props); - setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP), AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN); - setDefaultOnCondition(props, !props.containsKey(ASYNC_CLEAN_PROP), ASYNC_CLEAN_PROP, - DEFAULT_ASYNC_CLEAN); - setDefaultOnCondition(props, !props.containsKey(CLEANER_INCREMENTAL_MODE), CLEANER_INCREMENTAL_MODE, - DEFAULT_INCREMENTAL_CLEANER); - setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP), INLINE_COMPACT_PROP, - DEFAULT_INLINE_COMPACT); - setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP), - INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS); - setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP), CLEANER_POLICY_PROP, - DEFAULT_CLEANER_POLICY); - setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP), - CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED); - setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), CLEANER_COMMITS_RETAINED_PROP, - DEFAULT_CLEANER_COMMITS_RETAINED); - setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP_PROP), MAX_COMMITS_TO_KEEP_PROP, - DEFAULT_MAX_COMMITS_TO_KEEP); - setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP_PROP), MIN_COMMITS_TO_KEEP_PROP, - DEFAULT_MIN_COMMITS_TO_KEEP); - setDefaultOnCondition(props, !props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), PARQUET_SMALL_FILE_LIMIT_BYTES, - DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); - setDefaultOnCondition(props, !props.containsKey(RECORD_SIZE_ESTIMATION_THRESHOLD_PROP), RECORD_SIZE_ESTIMATION_THRESHOLD_PROP, - DEFAULT_RECORD_SIZE_ESTIMATION_THRESHOLD); - setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE), - COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE); - setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS), - COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS); - setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), - COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); - setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), CLEANER_PARALLELISM, - DEFAULT_CLEANER_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(COMPACTION_STRATEGY_PROP), COMPACTION_STRATEGY_PROP, - DEFAULT_COMPACTION_STRATEGY); - setDefaultOnCondition(props, 
!props.containsKey(PAYLOAD_CLASS_PROP), PAYLOAD_CLASS_PROP, DEFAULT_PAYLOAD_CLASS); - setDefaultOnCondition(props, !props.containsKey(TARGET_IO_PER_COMPACTION_IN_MB_PROP), - TARGET_IO_PER_COMPACTION_IN_MB_PROP, DEFAULT_TARGET_IO_PER_COMPACTION_IN_MB); - setDefaultOnCondition(props, !props.containsKey(COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP), - COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED); - setDefaultOnCondition(props, !props.containsKey(COMPACTION_REVERSE_LOG_READ_ENABLED_PROP), - COMPACTION_REVERSE_LOG_READ_ENABLED_PROP, DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED); - setDefaultOnCondition(props, !props.containsKey(TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP), - TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP, DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION); - setDefaultOnCondition(props, !props.containsKey(COMMITS_ARCHIVAL_BATCH_SIZE_PROP), - COMMITS_ARCHIVAL_BATCH_SIZE_PROP, DEFAULT_COMMITS_ARCHIVAL_BATCH_SIZE); - setDefaultOnCondition(props, !props.containsKey(CLEANER_BOOTSTRAP_BASE_FILE_ENABLED), - CLEANER_BOOTSTRAP_BASE_FILE_ENABLED, DEFAULT_CLEANER_BOOTSTRAP_BASE_FILE_ENABLED); - - HoodieCleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP)); - - // Ensure minInstantsToKeep > cleanerCommitsRetained, otherwise we will archive some - // commit instant on timeline, that still has not been cleaned. Could miss some data via incr pull - int minInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP)); - int maxInstantsToKeep = Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP)); - int cleanerCommitsRetained = - Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); - ValidationUtils.checkArgument(maxInstantsToKeep > minInstantsToKeep, - String.format( - "Increase %s=%d to be greater than %s=%d.", - HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP, maxInstantsToKeep, - HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, minInstantsToKeep)); - ValidationUtils.checkArgument(minInstantsToKeep > cleanerCommitsRetained, - String.format( - "Increase %s=%d to be greater than %s=%d. 
Otherwise, there is risk of incremental pull " - + "missing data from few instants.", - HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, minInstantsToKeep, - HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, cleanerCommitsRetained)); - return config; + compactionConfig.setDefaults(HoodieCompactionConfig.class.getName()); + return compactionConfig; } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java index 5d79776b19b83..d52c407028a21 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieHBaseIndexConfig.java @@ -18,7 +18,10 @@ package org.apache.hudi.config; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.index.hbase.DefaultHBaseQPSResourceAllocator; import java.io.File; @@ -26,95 +29,330 @@ import java.io.IOException; import java.util.Properties; -public class HoodieHBaseIndexConfig extends DefaultHoodieConfig { +@ConfigClassProperty(name = "HBase Index Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control indexing behavior " + + "(when HBase based indexing is enabled), which tags incoming " + + "records as either inserts or updates to older records.") +public class HoodieHBaseIndexConfig extends HoodieConfig { + + public static final ConfigProperty ZKQUORUM = ConfigProperty + .key("hoodie.index.hbase.zkquorum") + .noDefaultValue() + .withDocumentation("Only applies if index type is HBASE. HBase ZK Quorum url to connect to"); + + public static final ConfigProperty ZKPORT = ConfigProperty + .key("hoodie.index.hbase.zkport") + .noDefaultValue() + .withDocumentation("Only applies if index type is HBASE. HBase ZK Quorum port to connect to"); + + public static final ConfigProperty TABLENAME = ConfigProperty + .key("hoodie.index.hbase.table") + .noDefaultValue() + .withDocumentation("Only applies if index type is HBASE. HBase Table name to use as the index. " + + "Hudi stores the row_key and [partition_path, fileID, commitTime] mapping in the table"); + + public static final ConfigProperty GET_BATCH_SIZE = ConfigProperty + .key("hoodie.index.hbase.get.batch.size") + .defaultValue(100) + .withDocumentation("Controls the batch size for performing gets against HBase. " + + "Batching improves throughput, by saving round trips."); + + public static final ConfigProperty ZK_NODE_PATH = ConfigProperty + .key("hoodie.index.hbase.zknode.path") + .noDefaultValue() + .withDocumentation("Only applies if index type is HBASE. This is the root znode that will contain " + + "all the znodes created/used by HBase"); + + public static final ConfigProperty PUT_BATCH_SIZE = ConfigProperty + .key("hoodie.index.hbase.put.batch.size") + .defaultValue(100) + .withDocumentation("Controls the batch size for performing puts against HBase. 
" + + "Batching improves throughput, by saving round trips."); + + public static final ConfigProperty QPS_ALLOCATOR_CLASS_NAME = ConfigProperty + .key("hoodie.index.hbase.qps.allocator.class") + .defaultValue(DefaultHBaseQPSResourceAllocator.class.getName()) + .withDocumentation("Property to set which implementation of HBase QPS resource allocator to be used, which" + + "controls the batching rate dynamically."); + + public static final ConfigProperty PUT_BATCH_SIZE_AUTO_COMPUTE = ConfigProperty + .key("hoodie.index.hbase.put.batch.size.autocompute") + .defaultValue(false) + .withDocumentation("Property to set to enable auto computation of put batch size"); + + public static final ConfigProperty QPS_FRACTION = ConfigProperty + .key("hoodie.index.hbase.qps.fraction") + .defaultValue(0.5f) + .withDocumentation("Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3" + + " jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then" + + " this fraction for the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively." + + " Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers."); + + public static final ConfigProperty MAX_QPS_PER_REGION_SERVER = ConfigProperty + .key("hoodie.index.hbase.max.qps.per.region.server") + .defaultValue(1000) + .withDocumentation("Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to\n" + + " limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this\n" + + " value based on global indexing throughput needs and most importantly, how much the HBase installation in use is\n" + + " able to tolerate without Region Servers going down."); + + public static final ConfigProperty COMPUTE_QPS_DYNAMICALLY = ConfigProperty + .key("hoodie.index.hbase.dynamic_qps") + .defaultValue(false) + .withDocumentation("Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on write volume."); + + public static final ConfigProperty MIN_QPS_FRACTION = ConfigProperty + .key("hoodie.index.hbase.min.qps.fraction") + .noDefaultValue() + .withDocumentation("Minimum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads"); + + public static final ConfigProperty MAX_QPS_FRACTION = ConfigProperty + .key("hoodie.index.hbase.max.qps.fraction") + .noDefaultValue() + .withDocumentation("Maximum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads"); + + public static final ConfigProperty DESIRED_PUTS_TIME_IN_SECONDS = ConfigProperty + .key("hoodie.index.hbase.desired_puts_time_in_secs") + .defaultValue(600) + .withDocumentation(""); + + public static final ConfigProperty SLEEP_MS_FOR_PUT_BATCH = ConfigProperty + .key("hoodie.index.hbase.sleep.ms.for.put.batch") + .noDefaultValue() + .withDocumentation(""); + + public static final ConfigProperty SLEEP_MS_FOR_GET_BATCH = ConfigProperty + .key("hoodie.index.hbase.sleep.ms.for.get.batch") + .noDefaultValue() + .withDocumentation(""); + + public static final ConfigProperty ZK_SESSION_TIMEOUT_MS = ConfigProperty + .key("hoodie.index.hbase.zk.session_timeout_ms") + .defaultValue(60 * 1000) + .withDocumentation("Session timeout value to use for Zookeeper failure detection, for the HBase client." 
+ + "Lower this value, if you want to fail faster."); + + public static final ConfigProperty ZK_CONNECTION_TIMEOUT_MS = ConfigProperty + .key("hoodie.index.hbase.zk.connection_timeout_ms") + .defaultValue(15 * 1000) + .withDocumentation("Timeout to use for establishing connection with zookeeper, from HBase client."); + + public static final ConfigProperty ZKPATH_QPS_ROOT = ConfigProperty + .key("hoodie.index.hbase.zkpath.qps_root") + .defaultValue("/QPS_ROOT") + .withDocumentation("chroot in zookeeper, to use for all qps allocation co-ordination."); + + public static final ConfigProperty UPDATE_PARTITION_PATH_ENABLE = ConfigProperty + .key("hoodie.hbase.index.update.partition.path") + .defaultValue(false) + .withDocumentation("Only applies if index type is HBASE. " + + "When an already existing record is upserted to a new partition compared to whats in storage, " + + "this config when set, will delete old record in old partition " + + "and will insert it as new record in new partition."); + + public static final ConfigProperty ROLLBACK_SYNC_ENABLE = ConfigProperty + .key("hoodie.index.hbase.rollback.sync") + .defaultValue(false) + .withDocumentation("When set to true, the rollback method will delete the last failed task index. " + + "The default value is false. Because deleting the index will add extra load on the Hbase cluster for each rollback"); + + public static final ConfigProperty SECURITY_AUTHENTICATION = ConfigProperty + .key("hoodie.index.hbase.security.authentication") + .defaultValue("simple") + .withDocumentation("Property to decide if the hbase cluster secure authentication is enabled or not. " + + "Possible values are 'simple' (no authentication), and 'kerberos'."); + + public static final ConfigProperty KERBEROS_USER_KEYTAB = ConfigProperty + .key("hoodie.index.hbase.kerberos.user.keytab") + .noDefaultValue() + .withDocumentation("File name of the kerberos keytab file for connecting to the hbase cluster."); + + public static final ConfigProperty KERBEROS_USER_PRINCIPAL = ConfigProperty + .key("hoodie.index.hbase.kerberos.user.principal") + .noDefaultValue() + .withDocumentation("The kerberos principal name for connecting to the hbase cluster."); + + public static final ConfigProperty REGIONSERVER_PRINCIPAL = ConfigProperty + .key("hoodie.index.hbase.regionserver.kerberos.principal") + .noDefaultValue() + .withDocumentation("The value of hbase.regionserver.kerberos.principal in hbase cluster."); + + public static final ConfigProperty MASTER_PRINCIPAL = ConfigProperty + .key("hoodie.index.hbase.master.kerberos.principal") + .noDefaultValue() + .withDocumentation("The value of hbase.master.kerberos.principal in hbase cluster."); + + public static final ConfigProperty BUCKET_NUMBER = ConfigProperty + .key("hoodie.index.hbase.bucket.number") + .defaultValue(8) + .withDocumentation("Only applicable when using RebalancedSparkHoodieHBaseIndex, same as hbase regions count can get the best performance"); - public static final String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum"; - public static final String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport"; - public static final String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table"; - public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size"; - public static final String HBASE_ZK_ZNODEPARENT = "hoodie.index.hbase.zknode.path"; /** - * Note that if HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP is set to true, this batch size will not be honored for HBase - * Puts. 
+ * @deprecated Use {@link #ZKQUORUM} and its methods instead */ - public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size"; - + @Deprecated + public static final String HBASE_ZKQUORUM_PROP = ZKQUORUM.key(); + /** + * @deprecated Use {@link #ZKPORT} and its methods instead + */ + @Deprecated + public static final String HBASE_ZKPORT_PROP = ZKPORT.key(); + /** + * @deprecated Use {@link #TABLENAME} and its methods instead + */ + @Deprecated + public static final String HBASE_TABLENAME_PROP = TABLENAME.key(); + /** + * @deprecated Use {@link #GET_BATCH_SIZE} and its methods instead + */ + @Deprecated + public static final String HBASE_GET_BATCH_SIZE_PROP = GET_BATCH_SIZE.key(); + /** + * @deprecated Use {@link #ZK_NODE_PATH} and its methods instead + */ + @Deprecated + public static final String HBASE_ZK_ZNODEPARENT = ZK_NODE_PATH.key(); + /** + * @deprecated Use {@link #PUT_BATCH_SIZE} and its methods instead + */ + @Deprecated + public static final String HBASE_PUT_BATCH_SIZE_PROP = PUT_BATCH_SIZE.key(); + /** + * @deprecated Use {@link #QPS_ALLOCATOR_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String HBASE_INDEX_QPS_ALLOCATOR_CLASS = QPS_ALLOCATOR_CLASS_NAME.key(); /** - * Property to set which implementation of HBase QPS resource allocator to be used. + * @deprecated Use {@link #QPS_ALLOCATOR_CLASS_NAME} and its methods instead */ - public static final String HBASE_INDEX_QPS_ALLOCATOR_CLASS = "hoodie.index.hbase.qps.allocator.class"; - public static final String DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS = DefaultHBaseQPSResourceAllocator.class.getName(); + @Deprecated + public static final String DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS = QPS_ALLOCATOR_CLASS_NAME.defaultValue(); /** - * Property to set to enable auto computation of put batch size. + * @deprecated Use {@link #PUT_BATCH_SIZE_AUTO_COMPUTE} and its methods instead */ - public static final String HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = "hoodie.index.hbase.put.batch.size.autocompute"; - public static final String DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE = "false"; + @Deprecated + public static final String HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = PUT_BATCH_SIZE_AUTO_COMPUTE.key(); /** - * Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 - * jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then - * this fraction for the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively. + * @deprecated Use {@link #PUT_BATCH_SIZE_AUTO_COMPUTE} and its methods instead */ - public static final String HBASE_QPS_FRACTION_PROP = "hoodie.index.hbase.qps.fraction"; + @Deprecated + public static final Boolean DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE = PUT_BATCH_SIZE_AUTO_COMPUTE.defaultValue(); /** - * Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to - * limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this - * value based on global indexing throughput needs and most importantly, how much the HBase installation in use is - * able to tolerate without Region Servers going down. 
+ * @deprecated Use {@link #MAX_QPS_FRACTION} and its methods instead */ - public static final String HBASE_MAX_QPS_PER_REGION_SERVER_PROP = "hoodie.index.hbase.max.qps.per.region.server"; + @Deprecated + public static final String HBASE_QPS_FRACTION_PROP = QPS_FRACTION.key(); /** - * Default batch size, used only for Get, but computed for Put. + * @deprecated Use {@link #MAX_QPS_PER_REGION_SERVER} and its methods instead */ + @Deprecated + public static final String HBASE_MAX_QPS_PER_REGION_SERVER_PROP = MAX_QPS_PER_REGION_SERVER.key(); + @Deprecated public static final int DEFAULT_HBASE_BATCH_SIZE = 100; /** - * A low default value. + * @deprecated Use {@link #MAX_QPS_PER_REGION_SERVER} and its methods instead */ - public static final int DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER = 1000; + @Deprecated + public static final int DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER = MAX_QPS_PER_REGION_SERVER.defaultValue(); /** - * Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers. + * @deprecated Use {@link #QPS_FRACTION} and its methods instead */ - public static final float DEFAULT_HBASE_QPS_FRACTION = 0.5f; - + @Deprecated + public static final float DEFAULT_HBASE_QPS_FRACTION = QPS_FRACTION.defaultValue(); /** - * Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on volume. + * @deprecated Use {@link #COMPUTE_QPS_DYNAMICALLY} and its methods instead */ - public static final String HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = "hoodie.index.hbase.dynamic_qps"; - public static final boolean DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = false; + @Deprecated + public static final String HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = COMPUTE_QPS_DYNAMICALLY.key(); /** - * Min and Max for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads. + * @deprecated Use {@link #COMPUTE_QPS_DYNAMICALLY} and its methods instead */ - public static final String HBASE_MIN_QPS_FRACTION_PROP = "hoodie.index.hbase.min.qps.fraction"; - - public static final String HBASE_MAX_QPS_FRACTION_PROP = "hoodie.index.hbase.max.qps.fraction"; - + @Deprecated + public static final boolean DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = COMPUTE_QPS_DYNAMICALLY.defaultValue(); /** - * Hoodie index desired puts operation time in seconds. 
+ * @deprecated Use {@link #MIN_QPS_FRACTION} and its methods instead */ - public static final String HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = "hoodie.index.hbase.desired_puts_time_in_secs"; - public static final int DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = 600; - public static final String HBASE_SLEEP_MS_PUT_BATCH_PROP = "hoodie.index.hbase.sleep.ms.for.put.batch"; - public static final String HBASE_SLEEP_MS_GET_BATCH_PROP = "hoodie.index.hbase.sleep.ms.for.get.batch"; - public static final String HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS = "hoodie.index.hbase.zk.session_timeout_ms"; - public static final int DEFAULT_ZK_SESSION_TIMEOUT_MS = 60 * 1000; - public static final String HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS = - "hoodie.index.hbase.zk.connection_timeout_ms"; - public static final int DEFAULT_ZK_CONNECTION_TIMEOUT_MS = 15 * 1000; - public static final String HBASE_ZK_PATH_QPS_ROOT = "hoodie.index.hbase.zkpath.qps_root"; - public static final String DEFAULT_HBASE_ZK_PATH_QPS_ROOT = "/QPS_ROOT"; - + @Deprecated + public static final String HBASE_MIN_QPS_FRACTION_PROP = MIN_QPS_FRACTION.key(); + /** + * @deprecated Use {@link #MAX_QPS_FRACTION} and its methods instead + */ + @Deprecated + public static final String HBASE_MAX_QPS_FRACTION_PROP = MAX_QPS_FRACTION.key(); + /** + * @deprecated Use {@link #DESIRED_PUTS_TIME_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final String HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = DESIRED_PUTS_TIME_IN_SECONDS.key(); + /** + * @deprecated Use {@link #DESIRED_PUTS_TIME_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final int DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = DESIRED_PUTS_TIME_IN_SECONDS.defaultValue(); + /** + * @deprecated Use {@link #SLEEP_MS_FOR_PUT_BATCH} and its methods instead + */ + @Deprecated + public static final String HBASE_SLEEP_MS_PUT_BATCH_PROP = SLEEP_MS_FOR_PUT_BATCH.key(); + /** + * @deprecated Use {@link #SLEEP_MS_FOR_GET_BATCH} and its methods instead + */ + @Deprecated + public static final String HBASE_SLEEP_MS_GET_BATCH_PROP = SLEEP_MS_FOR_GET_BATCH.key(); + /** + * @deprecated Use {@link #ZK_SESSION_TIMEOUT_MS} and its methods instead + */ + @Deprecated + public static final String HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS = ZK_SESSION_TIMEOUT_MS.key(); + /** + * @deprecated Use {@link #ZK_SESSION_TIMEOUT_MS} and its methods instead + */ + @Deprecated + public static final int DEFAULT_ZK_SESSION_TIMEOUT_MS = ZK_SESSION_TIMEOUT_MS.defaultValue(); /** - * Only applies if index type is Hbase. - *

- * When set to true, an update to a record with a different partition from its existing one - * will insert the record to the new partition and delete it from the old partition. - *

- * When set to false, a record will be updated to the old partition. + * @deprecated Use {@link #ZK_CONNECTION_TIMEOUT_MS} and its methods instead */ - public static final String HBASE_INDEX_UPDATE_PARTITION_PATH = "hoodie.hbase.index.update.partition.path"; - public static final Boolean DEFAULT_HBASE_INDEX_UPDATE_PARTITION_PATH = false; + @Deprecated + public static final String HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS = ZK_CONNECTION_TIMEOUT_MS.key(); + /** + * @deprecated Use {@link #ZK_CONNECTION_TIMEOUT_MS} and its methods instead + */ + @Deprecated + public static final int DEFAULT_ZK_CONNECTION_TIMEOUT_MS = ZK_CONNECTION_TIMEOUT_MS.defaultValue(); + /** + * @deprecated Use {@link #ZKPATH_QPS_ROOT} and its methods instead + */ + @Deprecated + public static final String HBASE_ZK_PATH_QPS_ROOT = ZKPATH_QPS_ROOT.key(); + /** + * @deprecated Use {@link #ZKPATH_QPS_ROOT} and its methods instead + */ + @Deprecated + public static final String DEFAULT_HBASE_ZK_PATH_QPS_ROOT = ZKPATH_QPS_ROOT.defaultValue(); + /** + * @deprecated Use {@link #UPDATE_PARTITION_PATH_ENABLE} and its methods instead + */ + @Deprecated + public static final String HBASE_INDEX_UPDATE_PARTITION_PATH = UPDATE_PARTITION_PATH_ENABLE.key(); + /** + * @deprecated Use {@link #UPDATE_PARTITION_PATH_ENABLE} and its methods instead + */ + @Deprecated + public static final Boolean DEFAULT_HBASE_INDEX_UPDATE_PARTITION_PATH = UPDATE_PARTITION_PATH_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #ROLLBACK_SYNC_ENABLE} and its methods instead + */ + @Deprecated + public static final String HBASE_INDEX_ROLLBACK_SYNC = ROLLBACK_SYNC_ENABLE.key(); + /** + * @deprecated Use {@link #ROLLBACK_SYNC_ENABLE} and its methods instead + */ + @Deprecated + public static final Boolean DEFAULT_HBASE_INDEX_ROLLBACK_SYNC = ROLLBACK_SYNC_ENABLE.defaultValue(); - public HoodieHBaseIndexConfig(final Properties props) { - super(props); + private HoodieHBaseIndexConfig() { + super(); } public static HoodieHBaseIndexConfig.Builder newBuilder() { @@ -123,112 +361,142 @@ public static HoodieHBaseIndexConfig.Builder newBuilder() { public static class Builder { - private final Properties props = new Properties(); + private final HoodieHBaseIndexConfig hBaseIndexConfig = new HoodieHBaseIndexConfig(); public HoodieHBaseIndexConfig.Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.hBaseIndexConfig.getProps().load(reader); return this; } } public HoodieHBaseIndexConfig.Builder fromProperties(Properties props) { - this.props.putAll(props); + this.hBaseIndexConfig.getProps().putAll(props); return this; } public HoodieHBaseIndexConfig.Builder hbaseZkQuorum(String zkString) { - props.setProperty(HBASE_ZKQUORUM_PROP, zkString); + hBaseIndexConfig.setValue(ZKQUORUM, zkString); return this; } public HoodieHBaseIndexConfig.Builder hbaseZkPort(int port) { - props.setProperty(HBASE_ZKPORT_PROP, String.valueOf(port)); + hBaseIndexConfig.setValue(ZKPORT, String.valueOf(port)); return this; } public HoodieHBaseIndexConfig.Builder hbaseTableName(String tableName) { - props.setProperty(HBASE_TABLENAME_PROP, tableName); + hBaseIndexConfig.setValue(TABLENAME, tableName); return this; } public Builder hbaseZkZnodeQPSPath(String zkZnodeQPSPath) { - props.setProperty(HBASE_ZK_PATH_QPS_ROOT, zkZnodeQPSPath); + hBaseIndexConfig.setValue(ZKPATH_QPS_ROOT, zkZnodeQPSPath); return this; } public Builder hbaseIndexGetBatchSize(int getBatchSize) { - 
props.setProperty(HBASE_GET_BATCH_SIZE_PROP, String.valueOf(getBatchSize)); + hBaseIndexConfig.setValue(GET_BATCH_SIZE, String.valueOf(getBatchSize)); return this; } public Builder hbaseIndexPutBatchSize(int putBatchSize) { - props.setProperty(HBASE_PUT_BATCH_SIZE_PROP, String.valueOf(putBatchSize)); + hBaseIndexConfig.setValue(PUT_BATCH_SIZE, String.valueOf(putBatchSize)); return this; } public Builder hbaseIndexPutBatchSizeAutoCompute(boolean putBatchSizeAutoCompute) { - props.setProperty(HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP, String.valueOf(putBatchSizeAutoCompute)); + hBaseIndexConfig.setValue(PUT_BATCH_SIZE_AUTO_COMPUTE, String.valueOf(putBatchSizeAutoCompute)); return this; } public Builder hbaseIndexDesiredPutsTime(int desiredPutsTime) { - props.setProperty(HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS, String.valueOf(desiredPutsTime)); + hBaseIndexConfig.setValue(DESIRED_PUTS_TIME_IN_SECONDS, String.valueOf(desiredPutsTime)); return this; } public Builder hbaseIndexShouldComputeQPSDynamically(boolean shouldComputeQPsDynamically) { - props.setProperty(HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY, String.valueOf(shouldComputeQPsDynamically)); + hBaseIndexConfig.setValue(COMPUTE_QPS_DYNAMICALLY, String.valueOf(shouldComputeQPsDynamically)); return this; } public Builder hbaseIndexQPSFraction(float qpsFraction) { - props.setProperty(HBASE_QPS_FRACTION_PROP, String.valueOf(qpsFraction)); + hBaseIndexConfig.setValue(QPS_FRACTION, String.valueOf(qpsFraction)); return this; } public Builder hbaseIndexMinQPSFraction(float minQPSFraction) { - props.setProperty(HBASE_MIN_QPS_FRACTION_PROP, String.valueOf(minQPSFraction)); + hBaseIndexConfig.setValue(MIN_QPS_FRACTION, String.valueOf(minQPSFraction)); return this; } public Builder hbaseIndexMaxQPSFraction(float maxQPSFraction) { - props.setProperty(HBASE_MAX_QPS_FRACTION_PROP, String.valueOf(maxQPSFraction)); + hBaseIndexConfig.setValue(MAX_QPS_FRACTION, String.valueOf(maxQPSFraction)); return this; } public Builder hbaseIndexSleepMsBetweenPutBatch(int sleepMsBetweenPutBatch) { - props.setProperty(HBASE_SLEEP_MS_PUT_BATCH_PROP, String.valueOf(sleepMsBetweenPutBatch)); + hBaseIndexConfig.setValue(SLEEP_MS_FOR_PUT_BATCH, String.valueOf(sleepMsBetweenPutBatch)); return this; } public Builder hbaseIndexSleepMsBetweenGetBatch(int sleepMsBetweenGetBatch) { - props.setProperty(HBASE_SLEEP_MS_GET_BATCH_PROP, String.valueOf(sleepMsBetweenGetBatch)); + hBaseIndexConfig.setValue(SLEEP_MS_FOR_GET_BATCH, String.valueOf(sleepMsBetweenGetBatch)); return this; } public Builder hbaseIndexUpdatePartitionPath(boolean updatePartitionPath) { - props.setProperty(HBASE_INDEX_UPDATE_PARTITION_PATH, String.valueOf(updatePartitionPath)); + hBaseIndexConfig.setValue(UPDATE_PARTITION_PATH_ENABLE, String.valueOf(updatePartitionPath)); + return this; + } + + public Builder hbaseIndexRollbackSync(boolean rollbackSync) { + hBaseIndexConfig.setValue(ROLLBACK_SYNC_ENABLE, String.valueOf(rollbackSync)); return this; } public Builder withQPSResourceAllocatorType(String qpsResourceAllocatorClass) { - props.setProperty(HBASE_INDEX_QPS_ALLOCATOR_CLASS, qpsResourceAllocatorClass); + hBaseIndexConfig.setValue(QPS_ALLOCATOR_CLASS_NAME, qpsResourceAllocatorClass); return this; } public Builder hbaseIndexZkSessionTimeout(int zkSessionTimeout) { - props.setProperty(HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS, String.valueOf(zkSessionTimeout)); + hBaseIndexConfig.setValue(ZK_SESSION_TIMEOUT_MS, String.valueOf(zkSessionTimeout)); return this; } public Builder hbaseIndexZkConnectionTimeout(int 
zkConnectionTimeout) { - props.setProperty(HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS, String.valueOf(zkConnectionTimeout)); + hBaseIndexConfig.setValue(ZK_CONNECTION_TIMEOUT_MS, String.valueOf(zkConnectionTimeout)); return this; } public Builder hbaseZkZnodeParent(String zkZnodeParent) { - props.setProperty(HBASE_ZK_ZNODEPARENT, zkZnodeParent); + hBaseIndexConfig.setValue(ZK_NODE_PATH, zkZnodeParent); + return this; + } + + public Builder hbaseSecurityAuthentication(String authentication) { + hBaseIndexConfig.setValue(SECURITY_AUTHENTICATION, authentication); + return this; + } + + public Builder hbaseKerberosUserKeytab(String keytab) { + hBaseIndexConfig.setValue(KERBEROS_USER_KEYTAB, keytab); + return this; + } + + public Builder hbaseKerberosUserPrincipal(String principal) { + hBaseIndexConfig.setValue(KERBEROS_USER_PRINCIPAL, principal); + return this; + } + + public Builder hbaseKerberosRegionserverPrincipal(String principal) { + hBaseIndexConfig.setValue(REGIONSERVER_PRINCIPAL, principal); + return this; + } + + public Builder hbaseKerberosMasterPrincipal(String principal) { + hBaseIndexConfig.setValue(MASTER_PRINCIPAL, principal); return this; } @@ -244,40 +512,14 @@ public Builder hbaseZkZnodeParent(String zkZnodeParent) { */ public HoodieHBaseIndexConfig.Builder hbaseIndexMaxQPSPerRegionServer(int maxQPSPerRegionServer) { // This should be same across various jobs - props.setProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP, + hBaseIndexConfig.setValue(HoodieHBaseIndexConfig.MAX_QPS_PER_REGION_SERVER, String.valueOf(maxQPSPerRegionServer)); return this; } public HoodieHBaseIndexConfig build() { - HoodieHBaseIndexConfig config = new HoodieHBaseIndexConfig(props); - setDefaultOnCondition(props, !props.containsKey(HBASE_GET_BATCH_SIZE_PROP), HBASE_GET_BATCH_SIZE_PROP, - String.valueOf(DEFAULT_HBASE_BATCH_SIZE)); - setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_PROP), HBASE_PUT_BATCH_SIZE_PROP, - String.valueOf(DEFAULT_HBASE_BATCH_SIZE)); - setDefaultOnCondition(props, !props.containsKey(HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP), - HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP, DEFAULT_HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE); - setDefaultOnCondition(props, !props.containsKey(HBASE_QPS_FRACTION_PROP), HBASE_QPS_FRACTION_PROP, - String.valueOf(DEFAULT_HBASE_QPS_FRACTION)); - setDefaultOnCondition(props, !props.containsKey(HBASE_MAX_QPS_PER_REGION_SERVER_PROP), - HBASE_MAX_QPS_PER_REGION_SERVER_PROP, String.valueOf(DEFAULT_HBASE_MAX_QPS_PER_REGION_SERVER)); - setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY), - HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY, String.valueOf(DEFAULT_HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY)); - setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS, - String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS)); - setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS), - HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS, String.valueOf(DEFAULT_HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS)); - setDefaultOnCondition(props, !props.containsKey(HBASE_ZK_PATH_QPS_ROOT), HBASE_ZK_PATH_QPS_ROOT, - DEFAULT_HBASE_ZK_PATH_QPS_ROOT); - setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS), - HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_SESSION_TIMEOUT_MS)); - setDefaultOnCondition(props, !props.containsKey(HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS), - 
HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS, String.valueOf(DEFAULT_ZK_CONNECTION_TIMEOUT_MS)); - setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_QPS_ALLOCATOR_CLASS), HBASE_INDEX_QPS_ALLOCATOR_CLASS, - String.valueOf(DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS)); - setDefaultOnCondition(props, !props.containsKey(HBASE_INDEX_UPDATE_PARTITION_PATH), HBASE_INDEX_UPDATE_PARTITION_PATH, - String.valueOf(DEFAULT_HBASE_INDEX_UPDATE_PARTITION_PATH)); - return config; + hBaseIndexConfig.setDefaults(HoodieHBaseIndexConfig.class.getName()); + return hBaseIndexConfig; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java index 48b652aa58845..ee5b83a43a169 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java @@ -19,76 +19,190 @@ package org.apache.hudi.config; import org.apache.hudi.common.bloom.BloomFilterTypeCode; -import org.apache.hudi.common.config.DefaultHoodieConfig; -import org.apache.hudi.client.common.EngineType; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import javax.annotation.concurrent.Immutable; import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.util.Arrays; import java.util.Properties; +import java.util.stream.Collectors; + +import static org.apache.hudi.config.HoodieHBaseIndexConfig.GET_BATCH_SIZE; +import static org.apache.hudi.config.HoodieHBaseIndexConfig.PUT_BATCH_SIZE; +import static org.apache.hudi.config.HoodieHBaseIndexConfig.TABLENAME; +import static org.apache.hudi.config.HoodieHBaseIndexConfig.ZKPORT; +import static org.apache.hudi.config.HoodieHBaseIndexConfig.ZKQUORUM; +import static org.apache.hudi.index.HoodieIndex.IndexType.BLOOM; +import static org.apache.hudi.index.HoodieIndex.IndexType.BUCKET; +import static org.apache.hudi.index.HoodieIndex.IndexType.GLOBAL_BLOOM; +import static org.apache.hudi.index.HoodieIndex.IndexType.GLOBAL_SIMPLE; +import static org.apache.hudi.index.HoodieIndex.IndexType.HBASE; +import static org.apache.hudi.index.HoodieIndex.IndexType.INMEMORY; +import static org.apache.hudi.index.HoodieIndex.IndexType.SIMPLE; /** * Indexing related config. 
*/ @Immutable -public class HoodieIndexConfig extends DefaultHoodieConfig { - - public static final String INDEX_TYPE_PROP = "hoodie.index.type"; - - public static final String INDEX_CLASS_PROP = "hoodie.index.class"; - public static final String DEFAULT_INDEX_CLASS = ""; +@ConfigClassProperty(name = "Index Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control indexing behavior, " + + "which tags incoming records as either inserts or updates to older records.") +public class HoodieIndexConfig extends HoodieConfig { + + public static final ConfigProperty INDEX_TYPE = ConfigProperty + .key("hoodie.index.type") + // Builder#getDefaultIndexType has already set it according to engine type + .noDefaultValue() + .withValidValues(HBASE.name(), INMEMORY.name(), BLOOM.name(), GLOBAL_BLOOM.name(), + SIMPLE.name(), GLOBAL_SIMPLE.name(), BUCKET.name()) + .withDocumentation("Type of index to use. Default is SIMPLE on Spark engine, " + + "and INMEMORY on Flink and Java engines. " + + "Possible options are [BLOOM | GLOBAL_BLOOM |SIMPLE | GLOBAL_SIMPLE | INMEMORY | HBASE | BUCKET]. " + + "Bloom filters removes the dependency on a external system " + + "and is stored in the footer of the Parquet Data Files"); + + public static final ConfigProperty INDEX_CLASS_NAME = ConfigProperty + .key("hoodie.index.class") + .defaultValue("") + .withDocumentation("Full path of user-defined index class and must be a subclass of HoodieIndex class. " + + "It will take precedence over the hoodie.index.type configuration if specified"); // ***** Bloom Index configs ***** - public static final String BLOOM_FILTER_NUM_ENTRIES = "hoodie.index.bloom.num_entries"; - public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = "60000"; - public static final String BLOOM_FILTER_FPP = "hoodie.index.bloom.fpp"; - public static final String DEFAULT_BLOOM_FILTER_FPP = "0.000000001"; - public static final String BLOOM_INDEX_PARALLELISM_PROP = "hoodie.bloom.index.parallelism"; - // Disable explicit bloom index parallelism setting by default - hoodie auto computes - public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = "0"; - public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = "hoodie.bloom.index.prune.by.ranges"; - public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = "true"; - public static final String BLOOM_INDEX_USE_CACHING_PROP = "hoodie.bloom.index.use.caching"; - public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = "true"; - public static final String BLOOM_INDEX_TREE_BASED_FILTER_PROP = "hoodie.bloom.index.use.treebased.filter"; - public static final String DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER = "true"; + public static final ConfigProperty BLOOM_FILTER_NUM_ENTRIES_VALUE = ConfigProperty + .key("hoodie.index.bloom.num_entries") + .defaultValue("60000") + .withDocumentation("Only applies if index type is BLOOM. " + + "This is the number of entries to be stored in the bloom filter. " + + "The rationale for the default: Assume the maxParquetFileSize is 128MB and averageRecordSize is 1kb and " + + "hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. " + + "Warning: Setting this very low, will generate a lot of false positives and index lookup " + + "will have to scan a lot more files than it has to and setting this to a very high number will " + + "increase the size every base file linearly (roughly 4KB for every 50000 entries). 
" + + "This config is also used with DYNAMIC bloom filter which determines the initial size for the bloom."); + + public static final ConfigProperty BLOOM_FILTER_FPP_VALUE = ConfigProperty + .key("hoodie.index.bloom.fpp") + .defaultValue("0.000000001") + .withDocumentation("Only applies if index type is BLOOM. " + + "Error rate allowed given the number of entries. This is used to calculate how many bits should be " + + "assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), " + + "we like to tradeoff disk space for lower false positives. " + + "If the number of entries added to bloom filter exceeds the configured value (hoodie.index.bloom.num_entries), " + + "then this fpp may not be honored."); + + public static final ConfigProperty BLOOM_INDEX_PARALLELISM = ConfigProperty + .key("hoodie.bloom.index.parallelism") + .defaultValue("0") + .withDocumentation("Only applies if index type is BLOOM. " + + "This is the amount of parallelism for index lookup, which involves a shuffle. " + + "By default, this is auto computed based on input workload characteristics."); + + public static final ConfigProperty BLOOM_INDEX_PRUNE_BY_RANGES = ConfigProperty + .key("hoodie.bloom.index.prune.by.ranges") + .defaultValue("true") + .withDocumentation("Only applies if index type is BLOOM. " + + "When true, range information from files to leveraged speed up index lookups. Particularly helpful, " + + "if the key has a monotonously increasing prefix, such as timestamp. " + + "If the record key is completely random, it is better to turn this off, since range pruning will only " + + " add extra overhead to the index lookup."); + + public static final ConfigProperty BLOOM_INDEX_USE_CACHING = ConfigProperty + .key("hoodie.bloom.index.use.caching") + .defaultValue("true") + .withDocumentation("Only applies if index type is BLOOM." + + "When true, the input RDD will cached to speed up index lookup by reducing IO " + + "for computing parallelism or affected partitions"); + + public static final ConfigProperty BLOOM_INDEX_USE_METADATA = ConfigProperty + .key("hoodie.bloom.index.use.metadata") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("Only applies if index type is BLOOM." + + "When true, the index lookup uses bloom filters and column stats from metadata " + + "table when available to speed up the process."); + + public static final ConfigProperty BLOOM_INDEX_TREE_BASED_FILTER = ConfigProperty + .key("hoodie.bloom.index.use.treebased.filter") + .defaultValue("true") + .withDocumentation("Only applies if index type is BLOOM. " + + "When true, interval tree based file pruning optimization is enabled. " + + "This mode speeds-up file-pruning based on key ranges when compared with the brute-force mode"); + // TODO: On by default. Once stable, we will remove the other mode. 
- public static final String BLOOM_INDEX_BUCKETIZED_CHECKING_PROP = "hoodie.bloom.index.bucketized.checking"; - public static final String DEFAULT_BLOOM_INDEX_BUCKETIZED_CHECKING = "true"; - public static final String BLOOM_INDEX_FILTER_TYPE = "hoodie.bloom.index.filter.type"; - public static final String DEFAULT_BLOOM_INDEX_FILTER_TYPE = BloomFilterTypeCode.SIMPLE.name(); - public static final String HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES = "hoodie.bloom.index.filter.dynamic.max.entries"; - public static final String DEFAULT_HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES = "100000"; - public static final String SIMPLE_INDEX_USE_CACHING_PROP = "hoodie.simple.index.use.caching"; - public static final String DEFAULT_SIMPLE_INDEX_USE_CACHING = "true"; - public static final String SIMPLE_INDEX_PARALLELISM_PROP = "hoodie.simple.index.parallelism"; - public static final String DEFAULT_SIMPLE_INDEX_PARALLELISM = "50"; - public static final String GLOBAL_SIMPLE_INDEX_PARALLELISM_PROP = "hoodie.global.simple.index.parallelism"; - public static final String DEFAULT_GLOBAL_SIMPLE_INDEX_PARALLELISM = "100"; + public static final ConfigProperty BLOOM_INDEX_BUCKETIZED_CHECKING = ConfigProperty + .key("hoodie.bloom.index.bucketized.checking") + .defaultValue("true") + .withDocumentation("Only applies if index type is BLOOM. " + + "When true, bucketized bloom filtering is enabled. " + + "This reduces skew seen in sort based bloom index lookup"); + + public static final ConfigProperty BLOOM_FILTER_TYPE = ConfigProperty + .key("hoodie.bloom.index.filter.type") + .defaultValue(BloomFilterTypeCode.DYNAMIC_V0.name()) + .withValidValues(BloomFilterTypeCode.SIMPLE.name(), BloomFilterTypeCode.DYNAMIC_V0.name()) + .withDocumentation("Filter type used. Default is BloomFilterTypeCode.DYNAMIC_V0. " + + "Available values are [BloomFilterTypeCode.SIMPLE , BloomFilterTypeCode.DYNAMIC_V0]. " + + "Dynamic bloom filters auto size themselves based on number of keys."); + + public static final ConfigProperty BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES = ConfigProperty + .key("hoodie.bloom.index.filter.dynamic.max.entries") + .defaultValue("100000") + .withDocumentation("The threshold for the maximum number of keys to record in a dynamic Bloom filter row. " + + "Only applies if filter type is BloomFilterTypeCode.DYNAMIC_V0."); + + public static final ConfigProperty SIMPLE_INDEX_USE_CACHING = ConfigProperty + .key("hoodie.simple.index.use.caching") + .defaultValue("true") + .withDocumentation("Only applies if index type is SIMPLE. " + + "When true, the incoming writes will cached to speed up index lookup by reducing IO " + + "for computing parallelism or affected partitions"); + + public static final ConfigProperty SIMPLE_INDEX_PARALLELISM = ConfigProperty + .key("hoodie.simple.index.parallelism") + .defaultValue("100") + .withDocumentation("Only applies if index type is SIMPLE. " + + "This is the amount of parallelism for index lookup, which involves a Spark Shuffle"); + + public static final ConfigProperty GLOBAL_SIMPLE_INDEX_PARALLELISM = ConfigProperty + .key("hoodie.global.simple.index.parallelism") + .defaultValue("100") + .withDocumentation("Only applies if index type is GLOBAL_SIMPLE. " + + "This is the amount of parallelism for index lookup, which involves a Spark Shuffle"); // 1B bloom filter checks happen in 250 seconds. 500ms to read a bloom filter. // 10M checks in 2500ms, thus amortizing the cost of reading bloom filter across partitions. 
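Because every ConfigProperty keeps its original hoodie.* key, configuration written against the old *_PROP constants (for example, a plain properties file) should still resolve through Builder#fromProperties. A small sketch under that assumption, with the class name and values chosen only for illustration; the keys-per-bucket property that the trailing comment above describes is defined right after this example.

```java
import java.util.Properties;

import org.apache.hudi.config.HoodieIndexConfig;

// Hypothetical example class; the keys are the ones declared in this file.
public class SimpleIndexConfigExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("hoodie.index.type", "SIMPLE");
    props.setProperty("hoodie.simple.index.parallelism", "100");
    props.setProperty("hoodie.simple.index.use.caching", "true");

    HoodieIndexConfig indexConfig = HoodieIndexConfig.newBuilder()
        .fromProperties(props)  // copies the raw entries into the config
        .build();               // unset options fall back to their declared defaults
    System.out.println(indexConfig.getProps());
  }
}
```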
- public static final String BLOOM_INDEX_KEYS_PER_BUCKET_PROP = "hoodie.bloom.index.keys.per.bucket"; - public static final String DEFAULT_BLOOM_INDEX_KEYS_PER_BUCKET = "10000000"; - - // ***** HBase Index Configs ***** - public static final String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum"; - public static final String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport"; - public static final String HBASE_ZK_ZNODEPARENT = "hoodie.index.hbase.zknode.path"; - public static final String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table"; - public static final String HBASE_GET_BATCH_SIZE_PROP = "hoodie.index.hbase.get.batch.size"; - public static final String HBASE_PUT_BATCH_SIZE_PROP = "hoodie.index.hbase.put.batch.size"; - public static final String DEFAULT_HBASE_BATCH_SIZE = "100"; - - - public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = "hoodie.bloom.index.input.storage.level"; - public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; - public static final String SIMPLE_INDEX_INPUT_STORAGE_LEVEL = "hoodie.simple.index.input.storage.level"; - public static final String DEFAULT_SIMPLE_INDEX_INPUT_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; + public static final ConfigProperty BLOOM_INDEX_KEYS_PER_BUCKET = ConfigProperty + .key("hoodie.bloom.index.keys.per.bucket") + .defaultValue("10000000") + .withDocumentation("Only applies if bloomIndexBucketizedChecking is enabled and index type is bloom. " + + "This configuration controls the “bucket” size which tracks the number of record-key checks made against " + + "a single file and is the unit of work allocated to each partition performing bloom filter lookup. " + + "A higher value would amortize the fixed cost of reading a bloom filter to memory."); + + public static final ConfigProperty BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE = ConfigProperty + .key("hoodie.bloom.index.input.storage.level") + .defaultValue("MEMORY_AND_DISK_SER") + .withDocumentation("Only applies when #bloomIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. " + + "Refer to org.apache.spark.storage.StorageLevel for different values"); + + public static final ConfigProperty SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE = ConfigProperty + .key("hoodie.simple.index.input.storage.level") + .defaultValue("MEMORY_AND_DISK_SER") + .withDocumentation("Only applies when #simpleIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. " + + "Refer to org.apache.spark.storage.StorageLevel for different values"); /** * Only applies if index type is GLOBAL_BLOOM. @@ -98,23 +212,261 @@ public class HoodieIndexConfig extends DefaultHoodieConfig { *

* When set to false, a record will be updated to the old partition. */ - public static final String BLOOM_INDEX_UPDATE_PARTITION_PATH = "hoodie.bloom.index.update.partition.path"; - public static final String DEFAULT_BLOOM_INDEX_UPDATE_PARTITION_PATH = "false"; + public static final ConfigProperty BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE = ConfigProperty + .key("hoodie.bloom.index.update.partition.path") + .defaultValue("true") + .withDocumentation("Only applies if index type is GLOBAL_BLOOM. " + + "When set to true, an update including the partition path of a record that already exists will result in " + + "inserting the incoming record into the new partition and deleting the original record in the old partition. " + + "When set to false, the original record will only be updated in the old partition"); + + public static final ConfigProperty SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE = ConfigProperty + .key("hoodie.simple.index.update.partition.path") + .defaultValue("true") + .withDocumentation("Similar to " + BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE + ", but for simple index."); - public static final String SIMPLE_INDEX_UPDATE_PARTITION_PATH = "hoodie.simple.index.update.partition.path"; - public static final String DEFAULT_SIMPLE_INDEX_UPDATE_PARTITION_PATH = "false"; + /** + * ***** Bucket Index Configs ***** + * Bucket Index is targeted to locate the record fast by hash in big data scenarios. + * A bucket size is recommended less than 3GB to avoid being too small. + * For more details and progress, see [HUDI-3039]. + */ + + /** + * Bucket Index Engine Type: implementation of bucket index + * + * SIMPLE: + * 0. Check `HoodieSimpleBucketLayout` for its supported operations. + * 1. Bucket num is fixed and requires rewriting the partition if we want to change it. + * + * CONSISTENT_HASHING: + * 0. Check `HoodieConsistentBucketLayout` for its supported operations. + * 1. Bucket num will auto-adjust by running clustering (still in progress) + */ + public static final ConfigProperty BUCKET_INDEX_ENGINE_TYPE = ConfigProperty + .key("hoodie.index.bucket.engine") + .defaultValue("SIMPLE") + .sinceVersion("0.11.0") + .withDocumentation("Type of bucket index engine to use. Default is SIMPLE bucket index, with fixed number of bucket." + + "Possible options are [SIMPLE | CONSISTENT_HASHING]." + + "Consistent hashing supports dynamic resizing of the number of bucket, solving potential data skew and file size " + + "issues of the SIMPLE hashing engine."); + + /** + * Bucket num equals file groups num in each partition. + * Bucket num can be set according to partition size and file group size. + * + * In dynamic bucket index cases (e.g., using CONSISTENT_HASHING), this config of number of bucket serves as a initial bucket size + */ + public static final ConfigProperty BUCKET_INDEX_NUM_BUCKETS = ConfigProperty + .key("hoodie.bucket.index.num.buckets") + .defaultValue(256) + .withDocumentation("Only applies if index type is BUCKET. Determine the number of buckets in the hudi table, " + + "and each partition is divided to N buckets."); + + public static final ConfigProperty BUCKET_INDEX_HASH_FIELD = ConfigProperty + .key("hoodie.bucket.index.hash.field") + .noDefaultValue() + .withDocumentation("Index key. It is used to index the record and find its file group. " + + "If not set, use record key field as default"); + + /** + * Deprecated configs. These are now part of {@link HoodieHBaseIndexConfig}. 
+ */ + @Deprecated + public static final String HBASE_ZKQUORUM_PROP = ZKQUORUM.key(); + @Deprecated + public static final String HBASE_ZKPORT_PROP = ZKPORT.key(); + @Deprecated + public static final String HBASE_ZK_ZNODEPARENT = HoodieHBaseIndexConfig.ZK_NODE_PATH.key(); + @Deprecated + public static final String HBASE_TABLENAME_PROP = TABLENAME.key(); + @Deprecated + public static final String HBASE_GET_BATCH_SIZE_PROP = GET_BATCH_SIZE.key(); + @Deprecated + public static final String HBASE_PUT_BATCH_SIZE_PROP = PUT_BATCH_SIZE.key(); + @Deprecated + public static final String DEFAULT_HBASE_BATCH_SIZE = "100"; + /** @deprecated Use {@link #INDEX_TYPE} and its methods instead */ + @Deprecated + public static final String INDEX_TYPE_PROP = INDEX_TYPE.key(); + /** + * @deprecated Use {@link #INDEX_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String INDEX_CLASS_PROP = INDEX_CLASS_NAME.key(); + /** + * @deprecated Use {@link #INDEX_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_INDEX_CLASS = INDEX_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_FILTER_NUM_ENTRIES_VALUE} and its methods instead + */ + @Deprecated + public static final String BLOOM_FILTER_NUM_ENTRIES = BLOOM_FILTER_NUM_ENTRIES_VALUE.key(); + /** + * @deprecated Use {@link #BLOOM_FILTER_NUM_ENTRIES_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = BLOOM_FILTER_NUM_ENTRIES_VALUE.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_FILTER_FPP_VALUE} and its methods instead + */ + @Deprecated + public static final String BLOOM_FILTER_FPP = BLOOM_FILTER_FPP_VALUE.key(); + /** + * @deprecated Use {@link #BLOOM_FILTER_FPP_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_FILTER_FPP = BLOOM_FILTER_FPP_VALUE.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_INDEX_PARALLELISM} and its methods instead + */ + @Deprecated + public static final String BLOOM_INDEX_PARALLELISM_PROP = BLOOM_INDEX_PARALLELISM.key(); + /** + * @deprecated Use {@link #BLOOM_INDEX_PARALLELISM} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_PARALLELISM = BLOOM_INDEX_PARALLELISM.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_INDEX_PRUNE_BY_RANGES} and its methods instead + */ + @Deprecated + public static final String BLOOM_INDEX_PRUNE_BY_RANGES_PROP = BLOOM_INDEX_PRUNE_BY_RANGES.key(); + /** @deprecated Use {@link #BLOOM_INDEX_PRUNE_BY_RANGES} and its methods instead */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES = BLOOM_INDEX_PRUNE_BY_RANGES.defaultValue(); + /** @deprecated Use {@link #BLOOM_INDEX_USE_CACHING} and its methods instead */ + @Deprecated + public static final String BLOOM_INDEX_USE_CACHING_PROP = BLOOM_INDEX_USE_CACHING.key(); + /** @deprecated Use {@link #BLOOM_INDEX_USE_CACHING} and its methods instead */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_USE_CACHING = BLOOM_INDEX_USE_CACHING.defaultValue(); + /** @deprecated Use {@link #BLOOM_INDEX_TREE_BASED_FILTER} and its methods instead */ + @Deprecated + public static final String BLOOM_INDEX_TREE_BASED_FILTER_PROP = BLOOM_INDEX_TREE_BASED_FILTER.key(); + /** @deprecated Use {@link #BLOOM_INDEX_TREE_BASED_FILTER} and its methods instead */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER = BLOOM_INDEX_TREE_BASED_FILTER.defaultValue(); + /** + * @deprecated 
Use {@link #BLOOM_INDEX_BUCKETIZED_CHECKING} and its methods instead + */ + @Deprecated + public static final String BLOOM_INDEX_BUCKETIZED_CHECKING_PROP = BLOOM_INDEX_BUCKETIZED_CHECKING.key(); + /** + * @deprecated Use {@link #BLOOM_INDEX_BUCKETIZED_CHECKING} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_BUCKETIZED_CHECKING = BLOOM_INDEX_BUCKETIZED_CHECKING.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_FILTER_TYPE} and its methods instead + */ + @Deprecated + public static final String BLOOM_INDEX_FILTER_TYPE = BLOOM_FILTER_TYPE.key(); + /** + * @deprecated Use {@link #BLOOM_FILTER_TYPE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_FILTER_TYPE = BLOOM_FILTER_TYPE.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES} and its methods instead + */ + @Deprecated + public static final String HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES = BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.key(); + /** + * @deprecated Use {@link #BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES} and its methods instead + */ + @Deprecated + public static final String DEFAULT_HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES = BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.defaultValue(); + /** + * @deprecated Use {@link #SIMPLE_INDEX_USE_CACHING} and its methods instead + */ + @Deprecated + public static final String SIMPLE_INDEX_USE_CACHING_PROP = SIMPLE_INDEX_USE_CACHING.key(); + /** + * @deprecated Use {@link #SIMPLE_INDEX_USE_CACHING} and its methods instead + */ + @Deprecated + public static final String DEFAULT_SIMPLE_INDEX_USE_CACHING = SIMPLE_INDEX_USE_CACHING.defaultValue(); + /** @deprecated Use {@link #SIMPLE_INDEX_PARALLELISM} and its methods instead */ + @Deprecated + public static final String SIMPLE_INDEX_PARALLELISM_PROP = SIMPLE_INDEX_PARALLELISM.key(); + /** @deprecated Use {@link #SIMPLE_INDEX_PARALLELISM} and its methods instead */ + @Deprecated + public static final String DEFAULT_SIMPLE_INDEX_PARALLELISM = SIMPLE_INDEX_PARALLELISM.defaultValue(); + /** @deprecated Use {@link #GLOBAL_SIMPLE_INDEX_PARALLELISM} and its methods instead */ + @Deprecated + public static final String GLOBAL_SIMPLE_INDEX_PARALLELISM_PROP = GLOBAL_SIMPLE_INDEX_PARALLELISM.key(); + /** + * @deprecated Use {@link #GLOBAL_SIMPLE_INDEX_PARALLELISM} and its methods instead + */ + @Deprecated + public static final String DEFAULT_GLOBAL_SIMPLE_INDEX_PARALLELISM = GLOBAL_SIMPLE_INDEX_PARALLELISM.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_INDEX_KEYS_PER_BUCKET} and its methods instead + */ + @Deprecated + public static final String BLOOM_INDEX_KEYS_PER_BUCKET_PROP = BLOOM_INDEX_KEYS_PER_BUCKET.key(); + /** + * @deprecated Use {@link #BLOOM_INDEX_KEYS_PER_BUCKET} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_KEYS_PER_BUCKET = BLOOM_INDEX_KEYS_PER_BUCKET.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE} and its methods instead + */ + @Deprecated + public static final String BLOOM_INDEX_INPUT_STORAGE_LEVEL = BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE.key(); + /** + * @deprecated Use {@link #BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL = BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE.defaultValue(); + /** + * @deprecated Use {@link #SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE} and its methods instead + */ + @Deprecated + public static final 
String SIMPLE_INDEX_INPUT_STORAGE_LEVEL = SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE.key(); + /** + * @deprecated Use {@link #SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_SIMPLE_INDEX_INPUT_STORAGE_LEVEL = SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE.defaultValue(); + /** + * @deprecated Use {@link #BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE} and its methods instead + */ + @Deprecated + public static final String BLOOM_INDEX_UPDATE_PARTITION_PATH = BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE.key(); + /** + * @deprecated Use {@link #BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BLOOM_INDEX_UPDATE_PARTITION_PATH = BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE} and its methods instead + */ + @Deprecated + public static final String SIMPLE_INDEX_UPDATE_PARTITION_PATH = SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE.key(); + /** + * @deprecated Use {@link #SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_SIMPLE_INDEX_UPDATE_PARTITION_PATH = SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE.defaultValue(); private EngineType engineType; /** * Use Spark engine by default. */ - private HoodieIndexConfig(Properties props) { - this(EngineType.SPARK, props); + + private HoodieIndexConfig() { + this(EngineType.SPARK); } - private HoodieIndexConfig(EngineType engineType, Properties props) { - super(props); + private HoodieIndexConfig(EngineType engineType) { + super(); this.engineType = engineType; } @@ -125,127 +477,117 @@ public static HoodieIndexConfig.Builder newBuilder() { public static class Builder { private EngineType engineType = EngineType.SPARK; - private final Properties props = new Properties(); + private final HoodieIndexConfig hoodieIndexConfig = new HoodieIndexConfig(); public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.hoodieIndexConfig.getProps().load(reader); return this; } } public Builder fromProperties(Properties props) { - this.props.putAll(props); + this.hoodieIndexConfig.getProps().putAll(props); return this; } public Builder withIndexType(HoodieIndex.IndexType indexType) { - props.setProperty(INDEX_TYPE_PROP, indexType.name()); + hoodieIndexConfig.setValue(INDEX_TYPE, indexType.name()); + return this; + } + + public Builder withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType bucketType) { + hoodieIndexConfig.setValue(BUCKET_INDEX_ENGINE_TYPE, bucketType.name()); return this; } public Builder withIndexClass(String indexClass) { - props.setProperty(INDEX_CLASS_PROP, indexClass); + hoodieIndexConfig.setValue(INDEX_CLASS_NAME, indexClass); return this; } public Builder withHBaseIndexConfig(HoodieHBaseIndexConfig hBaseIndexConfig) { - props.putAll(hBaseIndexConfig.getProps()); + hoodieIndexConfig.getProps().putAll(hBaseIndexConfig.getProps()); return this; } public Builder bloomFilterNumEntries(int numEntries) { - props.setProperty(BLOOM_FILTER_NUM_ENTRIES, String.valueOf(numEntries)); + hoodieIndexConfig.setValue(BLOOM_FILTER_NUM_ENTRIES_VALUE, String.valueOf(numEntries)); return this; } public Builder bloomFilterFPP(double fpp) { - props.setProperty(BLOOM_FILTER_FPP, String.valueOf(fpp)); - return this; - } - - public Builder hbaseZkQuorum(String zkString) { - 
props.setProperty(HBASE_ZKQUORUM_PROP, zkString); - return this; - } - - public Builder hbaseZkPort(int port) { - props.setProperty(HBASE_ZKPORT_PROP, String.valueOf(port)); - return this; - } - - public Builder hbaseZkZnodeParent(String zkZnodeParent) { - props.setProperty(HBASE_ZK_ZNODEPARENT, zkZnodeParent); - return this; - } - - public Builder hbaseTableName(String tableName) { - props.setProperty(HBASE_TABLENAME_PROP, tableName); + hoodieIndexConfig.setValue(BLOOM_FILTER_FPP_VALUE, String.valueOf(fpp)); return this; } public Builder bloomIndexParallelism(int parallelism) { - props.setProperty(BLOOM_INDEX_PARALLELISM_PROP, String.valueOf(parallelism)); + hoodieIndexConfig.setValue(BLOOM_INDEX_PARALLELISM, String.valueOf(parallelism)); return this; } public Builder bloomIndexPruneByRanges(boolean pruneRanges) { - props.setProperty(BLOOM_INDEX_PRUNE_BY_RANGES_PROP, String.valueOf(pruneRanges)); + hoodieIndexConfig.setValue(BLOOM_INDEX_PRUNE_BY_RANGES, String.valueOf(pruneRanges)); return this; } public Builder bloomIndexUseCaching(boolean useCaching) { - props.setProperty(BLOOM_INDEX_USE_CACHING_PROP, String.valueOf(useCaching)); + hoodieIndexConfig.setValue(BLOOM_INDEX_USE_CACHING, String.valueOf(useCaching)); + return this; + } + + public Builder bloomIndexUseMetadata(boolean useMetadata) { + hoodieIndexConfig.setValue(BLOOM_INDEX_USE_METADATA, String.valueOf(useMetadata)); return this; } public Builder bloomIndexTreebasedFilter(boolean useTreeFilter) { - props.setProperty(BLOOM_INDEX_TREE_BASED_FILTER_PROP, String.valueOf(useTreeFilter)); + hoodieIndexConfig.setValue(BLOOM_INDEX_TREE_BASED_FILTER, String.valueOf(useTreeFilter)); return this; } public Builder bloomIndexBucketizedChecking(boolean bucketizedChecking) { - props.setProperty(BLOOM_INDEX_BUCKETIZED_CHECKING_PROP, String.valueOf(bucketizedChecking)); + hoodieIndexConfig.setValue(BLOOM_INDEX_BUCKETIZED_CHECKING, String.valueOf(bucketizedChecking)); return this; } public Builder bloomIndexKeysPerBucket(int keysPerBucket) { - props.setProperty(BLOOM_INDEX_KEYS_PER_BUCKET_PROP, String.valueOf(keysPerBucket)); + hoodieIndexConfig.setValue(BLOOM_INDEX_KEYS_PER_BUCKET, String.valueOf(keysPerBucket)); return this; } public Builder withBloomIndexInputStorageLevel(String level) { - props.setProperty(BLOOM_INDEX_INPUT_STORAGE_LEVEL, level); + hoodieIndexConfig.setValue(BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE, level); return this; } public Builder withBloomIndexUpdatePartitionPath(boolean updatePartitionPath) { - props.setProperty(BLOOM_INDEX_UPDATE_PARTITION_PATH, String.valueOf(updatePartitionPath)); + hoodieIndexConfig.setValue(BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE, String.valueOf(updatePartitionPath)); return this; } public Builder withSimpleIndexParallelism(int parallelism) { - props.setProperty(SIMPLE_INDEX_PARALLELISM_PROP, String.valueOf(parallelism)); + hoodieIndexConfig.setValue(SIMPLE_INDEX_PARALLELISM, String.valueOf(parallelism)); return this; } public Builder simpleIndexUseCaching(boolean useCaching) { - props.setProperty(SIMPLE_INDEX_USE_CACHING_PROP, String.valueOf(useCaching)); + hoodieIndexConfig.setValue(SIMPLE_INDEX_USE_CACHING, String.valueOf(useCaching)); return this; } public Builder withSimpleIndexInputStorageLevel(String level) { - props.setProperty(SIMPLE_INDEX_INPUT_STORAGE_LEVEL, level); + hoodieIndexConfig.setValue(SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE, level); return this; } public Builder withGlobalSimpleIndexParallelism(int parallelism) { - props.setProperty(GLOBAL_SIMPLE_INDEX_PARALLELISM_PROP, 
String.valueOf(parallelism)); + hoodieIndexConfig.setValue(GLOBAL_SIMPLE_INDEX_PARALLELISM, String.valueOf(parallelism)); return this; } public Builder withGlobalSimpleIndexUpdatePartitionPath(boolean updatePartitionPath) { - props.setProperty(SIMPLE_INDEX_UPDATE_PARTITION_PATH, String.valueOf(updatePartitionPath)); + hoodieIndexConfig.setValue(SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE, String.valueOf(updatePartitionPath)); return this; } @@ -254,53 +596,32 @@ public Builder withEngineType(EngineType engineType) { return this; } + public Builder withBucketNum(String bucketNum) { + hoodieIndexConfig.setValue(BUCKET_INDEX_NUM_BUCKETS, bucketNum); + return this; + } + + public Builder withIndexKeyField(String keyField) { + hoodieIndexConfig.setValue(BUCKET_INDEX_HASH_FIELD, keyField); + return this; + } + public HoodieIndexConfig build() { - HoodieIndexConfig config = new HoodieIndexConfig(engineType, props); - setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), INDEX_TYPE_PROP, getDefaultIndexType(engineType)); - setDefaultOnCondition(props, !props.containsKey(INDEX_CLASS_PROP), INDEX_CLASS_PROP, DEFAULT_INDEX_CLASS); - setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), BLOOM_FILTER_NUM_ENTRIES, - DEFAULT_BLOOM_FILTER_NUM_ENTRIES); - setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PARALLELISM_PROP), BLOOM_INDEX_PARALLELISM_PROP, - DEFAULT_BLOOM_INDEX_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_PRUNE_BY_RANGES_PROP), - BLOOM_INDEX_PRUNE_BY_RANGES_PROP, DEFAULT_BLOOM_INDEX_PRUNE_BY_RANGES); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_USE_CACHING_PROP), BLOOM_INDEX_USE_CACHING_PROP, - DEFAULT_BLOOM_INDEX_USE_CACHING); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_INPUT_STORAGE_LEVEL), BLOOM_INDEX_INPUT_STORAGE_LEVEL, - DEFAULT_BLOOM_INDEX_INPUT_STORAGE_LEVEL); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_UPDATE_PARTITION_PATH), - BLOOM_INDEX_UPDATE_PARTITION_PATH, DEFAULT_BLOOM_INDEX_UPDATE_PARTITION_PATH); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_TREE_BASED_FILTER_PROP), - BLOOM_INDEX_TREE_BASED_FILTER_PROP, DEFAULT_BLOOM_INDEX_TREE_BASED_FILTER); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_BUCKETIZED_CHECKING_PROP), - BLOOM_INDEX_BUCKETIZED_CHECKING_PROP, DEFAULT_BLOOM_INDEX_BUCKETIZED_CHECKING); - setDefaultOnCondition(props, !props.containsKey(BLOOM_INDEX_KEYS_PER_BUCKET_PROP), - BLOOM_INDEX_KEYS_PER_BUCKET_PROP, DEFAULT_BLOOM_INDEX_KEYS_PER_BUCKET); - setDefaultOnCondition(props, !props.contains(BLOOM_INDEX_FILTER_TYPE), - BLOOM_INDEX_FILTER_TYPE, DEFAULT_BLOOM_INDEX_FILTER_TYPE); - setDefaultOnCondition(props, !props.contains(HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES), - HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES, DEFAULT_HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES); - setDefaultOnCondition(props, !props.containsKey(SIMPLE_INDEX_PARALLELISM_PROP), SIMPLE_INDEX_PARALLELISM_PROP, - DEFAULT_SIMPLE_INDEX_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(SIMPLE_INDEX_USE_CACHING_PROP), SIMPLE_INDEX_USE_CACHING_PROP, - DEFAULT_SIMPLE_INDEX_USE_CACHING); - setDefaultOnCondition(props, !props.containsKey(SIMPLE_INDEX_INPUT_STORAGE_LEVEL), SIMPLE_INDEX_INPUT_STORAGE_LEVEL, - DEFAULT_SIMPLE_INDEX_INPUT_STORAGE_LEVEL); - setDefaultOnCondition(props, 
!props.containsKey(GLOBAL_SIMPLE_INDEX_PARALLELISM_PROP), GLOBAL_SIMPLE_INDEX_PARALLELISM_PROP, - DEFAULT_GLOBAL_SIMPLE_INDEX_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(SIMPLE_INDEX_UPDATE_PARTITION_PATH), - SIMPLE_INDEX_UPDATE_PARTITION_PATH, DEFAULT_SIMPLE_INDEX_UPDATE_PARTITION_PATH); + hoodieIndexConfig.setDefaultValue(INDEX_TYPE, getDefaultIndexType(engineType)); + hoodieIndexConfig.setDefaults(HoodieIndexConfig.class.getName()); + // Throws IllegalArgumentException if the value set is not a known Hoodie Index Type - HoodieIndex.IndexType.valueOf(props.getProperty(INDEX_TYPE_PROP)); - return config; + HoodieIndex.IndexType.valueOf(hoodieIndexConfig.getString(INDEX_TYPE)); + validateBucketIndexConfig(); + return hoodieIndexConfig; } private String getDefaultIndexType(EngineType engineType) { switch (engineType) { case SPARK: - return HoodieIndex.IndexType.BLOOM.name(); + return HoodieIndex.IndexType.SIMPLE.name(); case FLINK: + case JAVA: return HoodieIndex.IndexType.INMEMORY.name(); default: throw new HoodieNotSupportedException("Unsupported engine " + engineType); @@ -310,5 +631,27 @@ private String getDefaultIndexType(EngineType engineType) { public EngineType getEngineType() { return engineType; } + + private void validateBucketIndexConfig() { + if (hoodieIndexConfig.getString(INDEX_TYPE).equalsIgnoreCase(HoodieIndex.IndexType.BUCKET.toString())) { + // check the bucket index hash field + if (StringUtils.isNullOrEmpty(hoodieIndexConfig.getString(BUCKET_INDEX_HASH_FIELD))) { + hoodieIndexConfig.setValue(BUCKET_INDEX_HASH_FIELD, + hoodieIndexConfig.getStringOrDefault(KeyGeneratorOptions.RECORDKEY_FIELD_NAME)); + } else { + boolean valid = Arrays + .stream(hoodieIndexConfig.getStringOrDefault(KeyGeneratorOptions.RECORDKEY_FIELD_NAME).split(",")) + .collect(Collectors.toSet()) + .containsAll(Arrays.asList(hoodieIndexConfig.getString(BUCKET_INDEX_HASH_FIELD).split(","))); + if (!valid) { + throw new HoodieIndexException("Bucket index key (if configured) must be subset of record key."); + } + } + // check the bucket num + if (hoodieIndexConfig.getIntOrDefault(BUCKET_INDEX_NUM_BUCKETS) <= 0) { + throw new HoodieIndexException("When using bucket index, hoodie.bucket.index.num.buckets cannot be negative."); + } + } + } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieInternalConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieInternalConfig.java new file mode 100644 index 0000000000000..f6e1e6ec9a115 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieInternalConfig.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +/** + * Configs/params used for internal purposes. + */ +public class HoodieInternalConfig extends HoodieConfig { + + private static final long serialVersionUID = 0L; + + public static final String BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED = "hoodie.bulkinsert.are.partitioner.records.sorted"; + public static final Boolean DEFAULT_BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED = false; + + public static final ConfigProperty BULKINSERT_INPUT_DATA_SCHEMA_DDL = ConfigProperty + .key("hoodie.bulkinsert.schema.ddl") + .noDefaultValue() + .withDocumentation("Schema set for row writer/bulk insert."); + + /** + * Returns if partition records are sorted or not. + * + * @param propertyValue value for property BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED. + * @return the property value. + */ + public static Boolean getBulkInsertIsPartitionRecordsSorted(String propertyValue) { + return propertyValue != null ? Boolean.parseBoolean(propertyValue) : DEFAULT_BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLayoutConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLayoutConfig.java new file mode 100644 index 0000000000000..0579ee3d623bd --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLayoutConfig.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.storage.HoodieStorageLayout; + +import javax.annotation.concurrent.Immutable; +import java.util.Properties; + +/** + * Storage layout related config. + */ +@Immutable +@ConfigClassProperty(name = "Layout Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control storage layout and data distribution, " + + "which defines how the files are organized within a table.") +public class HoodieLayoutConfig extends HoodieConfig { + + public static final ConfigProperty LAYOUT_TYPE = ConfigProperty + .key("hoodie.storage.layout.type") + .defaultValue("DEFAULT") + .withDocumentation("Type of storage layout. 
Possible options are [DEFAULT | BUCKET]"); + + public static final ConfigProperty LAYOUT_PARTITIONER_CLASS_NAME = ConfigProperty + .key("hoodie.storage.layout.partitioner.class") + .noDefaultValue() + .withDocumentation("Partitioner class, it is used to distribute data in a specific way."); + + public static final String SIMPLE_BUCKET_LAYOUT_PARTITIONER_CLASS_NAME = + "org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner"; + + private HoodieLayoutConfig() { + super(); + } + + public static HoodieLayoutConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + public HoodieLayoutConfig layoutConfig = new HoodieLayoutConfig(); + + public Builder fromProperties(Properties props) { + this.layoutConfig.getProps().putAll(props); + return this; + } + + public Builder withLayoutType(String type) { + layoutConfig.setValue(LAYOUT_TYPE, type); + return this; + } + + public Builder withLayoutPartitioner(String partitionerClass) { + layoutConfig.setValue(LAYOUT_PARTITIONER_CLASS_NAME, partitionerClass); + return this; + } + + public HoodieLayoutConfig build() { + setDefault(); + return layoutConfig; + } + + private void setDefault() { + if (layoutConfig.contains(HoodieIndexConfig.INDEX_TYPE.key()) + && layoutConfig.getString(HoodieIndexConfig.INDEX_TYPE.key()).equals(HoodieIndex.IndexType.BUCKET.name())) { + layoutConfig.setDefaultValue(LAYOUT_TYPE, HoodieStorageLayout.LayoutType.BUCKET.name()); + + // Currently, the partitioner of the SIMPLE bucket index is supported by SparkBucketIndexPartitioner only. + if ("SIMPLE".equals(layoutConfig.getString(HoodieIndexConfig.BUCKET_INDEX_ENGINE_TYPE))) { + layoutConfig.setDefaultValue(LAYOUT_PARTITIONER_CLASS_NAME, SIMPLE_BUCKET_LAYOUT_PARTITIONER_CLASS_NAME); + } + } + layoutConfig.setDefaultValue(LAYOUT_TYPE, LAYOUT_TYPE.defaultValue()); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLockConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLockConfig.java new file mode 100644 index 0000000000000..3623a04232be2 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLockConfig.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.config; + +import org.apache.hudi.client.transaction.ConflictResolutionStrategy; +import org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy; +import org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.lock.LockProvider; +import org.apache.hudi.common.util.Option; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.apache.hudi.common.config.LockConfiguration.DEFAULT_LOCK_ACQUIRE_NUM_RETRIES; +import static org.apache.hudi.common.config.LockConfiguration.DEFAULT_LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS; +import static org.apache.hudi.common.config.LockConfiguration.DEFAULT_ZK_CONNECTION_TIMEOUT_MS; +import static org.apache.hudi.common.config.LockConfiguration.DEFAULT_ZK_SESSION_TIMEOUT_MS; +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_EXPIRE_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.HIVE_DATABASE_NAME_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.HIVE_METASTORE_URI_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.HIVE_TABLE_NAME_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_PREFIX; +import static org.apache.hudi.common.config.LockConfiguration.ZK_BASE_PATH_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_CONNECTION_TIMEOUT_MS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_CONNECT_URL_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_LOCK_KEY_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_PORT_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_SESSION_TIMEOUT_MS_PROP_KEY; + +/** + * Hoodie Configs for Locks. + */ +@ConfigClassProperty(name = "Locks Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configs that control locking mechanisms required for concurrency control " + + " between writers to a Hudi table. 
Concurrency between Hudi's own table services " + + " are auto managed internally.") +public class HoodieLockConfig extends HoodieConfig { + + public static final ConfigProperty LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS = ConfigProperty + .key(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY) + .defaultValue(DEFAULT_LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS) + .sinceVersion("0.8.0") + .withDocumentation("Initial amount of time to wait between retries to acquire locks, " + + " subsequent retries will exponentially backoff."); + + public static final ConfigProperty LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS = ConfigProperty + .key(LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY) + .defaultValue(String.valueOf(5000L)) + .sinceVersion("0.8.0") + .withDocumentation("Maximum amount of time to wait between retries by lock provider client. This bounds" + + " the maximum delay from the exponential backoff. Currently used by ZK based lock provider only."); + + public static final ConfigProperty LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS = ConfigProperty + .key(LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY) + .defaultValue(String.valueOf(10000L)) + .sinceVersion("0.8.0") + .withDocumentation("Amount of time to wait between retries on the lock provider by the lock manager"); + + public static final ConfigProperty LOCK_ACQUIRE_NUM_RETRIES = ConfigProperty + .key(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY) + .defaultValue(DEFAULT_LOCK_ACQUIRE_NUM_RETRIES) + .sinceVersion("0.8.0") + .withDocumentation("Maximum number of times to retry lock acquire, at each lock provider"); + + public static final ConfigProperty LOCK_ACQUIRE_CLIENT_NUM_RETRIES = ConfigProperty + .key(LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY) + .defaultValue(String.valueOf(10)) + .sinceVersion("0.8.0") + .withDocumentation("Maximum number of times to retry to acquire lock additionally from the lock manager."); + + public static final ConfigProperty LOCK_ACQUIRE_WAIT_TIMEOUT_MS = ConfigProperty + .key(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY) + .defaultValue(60 * 1000) + .sinceVersion("0.8.0") + .withDocumentation("Timeout in ms, to wait on an individual lock acquire() call, at the lock provider."); + + public static final ConfigProperty FILESYSTEM_LOCK_PATH = ConfigProperty + .key(FILESYSTEM_LOCK_PATH_PROP_KEY) + .noDefaultValue() + .sinceVersion("0.8.0") + .withDocumentation("For DFS based lock providers, path to store the locks under. 
use Table's meta path as default"); + + public static final ConfigProperty FILESYSTEM_LOCK_EXPIRE = ConfigProperty + .key(FILESYSTEM_LOCK_EXPIRE_PROP_KEY) + .defaultValue(0) + .sinceVersion("0.12.0") + .withDocumentation("For DFS based lock providers, expire time in minutes, must be a nonnegative number, default means no expire"); + + public static final ConfigProperty HIVE_DATABASE_NAME = ConfigProperty + .key(HIVE_DATABASE_NAME_PROP_KEY) + .noDefaultValue() + .sinceVersion("0.8.0") + .withDocumentation("For Hive based lock provider, the Hive database to acquire lock against"); + + public static final ConfigProperty HIVE_TABLE_NAME = ConfigProperty + .key(HIVE_TABLE_NAME_PROP_KEY) + .noDefaultValue() + .sinceVersion("0.8.0") + .withDocumentation("For Hive based lock provider, the Hive table to acquire lock against"); + + public static final ConfigProperty HIVE_METASTORE_URI = ConfigProperty + .key(HIVE_METASTORE_URI_PROP_KEY) + .noDefaultValue() + .sinceVersion("0.8.0") + .withDocumentation("For Hive based lock provider, the Hive metastore URI to acquire locks against."); + + public static final ConfigProperty ZK_BASE_PATH = ConfigProperty + .key(ZK_BASE_PATH_PROP_KEY) + .noDefaultValue() + .sinceVersion("0.8.0") + .withDocumentation("The base path on Zookeeper under which to create lock related ZNodes. " + + "This should be same for all concurrent writers to the same table"); + + public static final ConfigProperty ZK_SESSION_TIMEOUT_MS = ConfigProperty + .key(ZK_SESSION_TIMEOUT_MS_PROP_KEY) + .defaultValue(DEFAULT_ZK_SESSION_TIMEOUT_MS) + .sinceVersion("0.8.0") + .withDocumentation("Timeout in ms, to wait after losing connection to ZooKeeper, before the session is expired"); + + public static final ConfigProperty ZK_CONNECTION_TIMEOUT_MS = ConfigProperty + .key(ZK_CONNECTION_TIMEOUT_MS_PROP_KEY) + .defaultValue(DEFAULT_ZK_CONNECTION_TIMEOUT_MS) + .sinceVersion("0.8.0") + .withDocumentation("Timeout in ms, to wait for establishing connection with Zookeeper."); + + public static final ConfigProperty ZK_CONNECT_URL = ConfigProperty + .key(ZK_CONNECT_URL_PROP_KEY) + .noDefaultValue() + .sinceVersion("0.8.0") + .withDocumentation("Zookeeper URL to connect to."); + + public static final ConfigProperty ZK_PORT = ConfigProperty + .key(ZK_PORT_PROP_KEY) + .noDefaultValue() + .sinceVersion("0.8.0") + .withDocumentation("Zookeeper port to connect to."); + + public static final ConfigProperty ZK_LOCK_KEY = ConfigProperty + .key(ZK_LOCK_KEY_PROP_KEY) + .noDefaultValue() + .withInferFunction(p -> Option.ofNullable(p.getStringOrDefault(HoodieWriteConfig.TBL_NAME, null))) + .sinceVersion("0.8.0") + .withDocumentation("Key name under base_path at which to create a ZNode and acquire lock. " + + "Final path on zk will look like base_path/lock_key. 
If this parameter is not set, we would " + + "set it as the table name"); + + // Pluggable type of lock provider + public static final ConfigProperty LOCK_PROVIDER_CLASS_NAME = ConfigProperty + .key(LOCK_PREFIX + "provider") + .defaultValue(ZookeeperBasedLockProvider.class.getName()) + .sinceVersion("0.8.0") + .withDocumentation("Lock provider class name, user can provide their own implementation of LockProvider " + + "which should be subclass of org.apache.hudi.common.lock.LockProvider"); + + // Pluggable strategies to use when resolving conflicts + public static final ConfigProperty WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME = ConfigProperty + .key(LOCK_PREFIX + "conflict.resolution.strategy") + .defaultValue(SimpleConcurrentFileWritesConflictResolutionStrategy.class.getName()) + .sinceVersion("0.8.0") + .withDocumentation("Lock provider class name, this should be subclass of " + + "org.apache.hudi.client.transaction.ConflictResolutionStrategy"); + + /** @deprecated Use {@link #WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME} and its methods instead */ + @Deprecated + public static final String WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_PROP = WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME.key(); + /** @deprecated Use {@link #WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME} and its methods instead */ + @Deprecated + public static final String DEFAULT_WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS = WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME.defaultValue(); + /** @deprecated Use {@link #LOCK_PROVIDER_CLASS_NAME} and its methods instead */ + @Deprecated + public static final String LOCK_PROVIDER_CLASS_PROP = LOCK_PROVIDER_CLASS_NAME.key(); + /** @deprecated Use {@link #LOCK_PROVIDER_CLASS_NAME} and its methods instead */ + @Deprecated + public static final String DEFAULT_LOCK_PROVIDER_CLASS = LOCK_PROVIDER_CLASS_NAME.defaultValue(); + + private HoodieLockConfig() { + super(); + } + + public static HoodieLockConfig.Builder newBuilder() { + return new HoodieLockConfig.Builder(); + } + + public static class Builder { + + private final HoodieLockConfig lockConfig = new HoodieLockConfig(); + + public HoodieLockConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.lockConfig.getProps().load(reader); + return this; + } + } + + public HoodieLockConfig.Builder fromProperties(Properties props) { + this.lockConfig.getProps().putAll(props); + return this; + } + + public HoodieLockConfig.Builder withLockProvider(Class lockProvider) { + lockConfig.setValue(LOCK_PROVIDER_CLASS_NAME, lockProvider.getName()); + return this; + } + + public HoodieLockConfig.Builder withHiveDatabaseName(String databaseName) { + lockConfig.setValue(HIVE_DATABASE_NAME, databaseName); + return this; + } + + public HoodieLockConfig.Builder withHiveTableName(String tableName) { + lockConfig.setValue(HIVE_TABLE_NAME, tableName); + return this; + } + + public HoodieLockConfig.Builder withHiveMetastoreURIs(String hiveMetastoreURIs) { + lockConfig.setValue(HIVE_METASTORE_URI, hiveMetastoreURIs); + return this; + } + + public HoodieLockConfig.Builder withZkQuorum(String zkQuorum) { + lockConfig.setValue(ZK_CONNECT_URL, zkQuorum); + return this; + } + + public HoodieLockConfig.Builder withZkBasePath(String zkBasePath) { + lockConfig.setValue(ZK_BASE_PATH, zkBasePath); + return this; + } + + public HoodieLockConfig.Builder withZkPort(String zkPort) { + lockConfig.setValue(ZK_PORT, zkPort); + return this; + } + + public HoodieLockConfig.Builder withZkLockKey(String 
zkLockKey) { + lockConfig.setValue(ZK_LOCK_KEY, zkLockKey); + return this; + } + + public HoodieLockConfig.Builder withZkConnectionTimeoutInMs(Long connectionTimeoutInMs) { + lockConfig.setValue(ZK_CONNECTION_TIMEOUT_MS, String.valueOf(connectionTimeoutInMs)); + return this; + } + + public HoodieLockConfig.Builder withZkSessionTimeoutInMs(Long sessionTimeoutInMs) { + lockConfig.setValue(ZK_SESSION_TIMEOUT_MS, String.valueOf(sessionTimeoutInMs)); + return this; + } + + public HoodieLockConfig.Builder withNumRetries(int numRetries) { + lockConfig.setValue(LOCK_ACQUIRE_NUM_RETRIES, String.valueOf(numRetries)); + return this; + } + + public HoodieLockConfig.Builder withRetryWaitTimeInMillis(Long retryWaitTimeInMillis) { + lockConfig.setValue(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS, String.valueOf(retryWaitTimeInMillis)); + return this; + } + + public HoodieLockConfig.Builder withRetryMaxWaitTimeInMillis(Long retryMaxWaitTimeInMillis) { + lockConfig.setValue(LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS, String.valueOf(retryMaxWaitTimeInMillis)); + return this; + } + + public HoodieLockConfig.Builder withClientNumRetries(int clientNumRetries) { + lockConfig.setValue(LOCK_ACQUIRE_CLIENT_NUM_RETRIES, String.valueOf(clientNumRetries)); + return this; + } + + public HoodieLockConfig.Builder withClientRetryWaitTimeInMillis(Long clientRetryWaitTimeInMillis) { + lockConfig.setValue(LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS, String.valueOf(clientRetryWaitTimeInMillis)); + return this; + } + + public HoodieLockConfig.Builder withLockWaitTimeInMillis(Long waitTimeInMillis) { + lockConfig.setValue(LOCK_ACQUIRE_WAIT_TIMEOUT_MS, String.valueOf(waitTimeInMillis)); + return this; + } + + public HoodieLockConfig.Builder withConflictResolutionStrategy(ConflictResolutionStrategy conflictResolutionStrategy) { + lockConfig.setValue(WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME, conflictResolutionStrategy.getClass().getName()); + return this; + } + + public HoodieLockConfig.Builder withFileSystemLockPath(String path) { + lockConfig.setValue(FILESYSTEM_LOCK_PATH, path); + return this; + } + + public HoodieLockConfig.Builder withFileSystemLockExpire(Integer expireTime) { + lockConfig.setValue(FILESYSTEM_LOCK_EXPIRE, String.valueOf(expireTime)); + return this; + } + + public HoodieLockConfig build() { + lockConfig.setDefaults(HoodieLockConfig.class.getName()); + return lockConfig; + } + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java index 687033f45c6d8..960ec61dc0b0e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java @@ -18,10 +18,14 @@ package org.apache.hudi.config; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.Option; import javax.annotation.concurrent.Immutable; - import java.io.File; import java.io.FileReader; import java.io.IOException; @@ -31,39 +35,104 @@ * Memory related config. 
*/ @Immutable -public class HoodieMemoryConfig extends DefaultHoodieConfig { +@ConfigClassProperty(name = "Memory Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Controls memory usage for compaction " + + "and merges, performed internally by Hudi.") +public class HoodieMemoryConfig extends HoodieConfig { - // This fraction is multiplied with the spark.memory.fraction to get a final fraction of heap space to use - // during merge. This makes it easier to scale this value as one increases the spark.executor.memory - public static final String MAX_MEMORY_FRACTION_FOR_MERGE_PROP = "hoodie.memory.merge.fraction"; // Default max memory fraction during hash-merge, excess spills to disk - public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE = String.valueOf(0.6); - public static final String MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP = "hoodie.memory.compaction.fraction"; + public static final ConfigProperty MAX_MEMORY_FRACTION_FOR_MERGE = ConfigProperty + .key("hoodie.memory.merge.fraction") + .defaultValue(String.valueOf(0.6)) + .withDocumentation("This fraction is multiplied with the user memory fraction (1 - spark.memory.fraction) " + + "to get a final fraction of heap space to use during merge"); + // Default max memory fraction during compaction, excess spills to disk - public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION = String.valueOf(0.6); + public static final ConfigProperty MAX_MEMORY_FRACTION_FOR_COMPACTION = ConfigProperty + .key("hoodie.memory.compaction.fraction") + .defaultValue(String.valueOf(0.6)) + .withDocumentation("HoodieCompactedLogScanner reads logblocks, converts records to HoodieRecords and then " + + "merges these log blocks and records. At any point, the number of entries in a log block can be " + + "less than or equal to the number of entries in the corresponding parquet file. This can lead to " + + "OOM in the Scanner. Hence, a spillable map helps alleviate the memory pressure. Use this config to " + + "set the max allowable inMemory footprint of the spillable map"); + // Default memory size (1GB) per compaction (used if SparkEnv is absent), excess spills to disk public static final long DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES = 1024 * 1024 * 1024L; // Minimum memory size (100MB) for the spillable map. public static final long DEFAULT_MIN_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES = 100 * 1024 * 1024L; - // Property to set the max memory for merge - public static final String MAX_MEMORY_FOR_MERGE_PROP = "hoodie.memory.merge.max.size"; - // Property to set the max memory for compaction - public static final String MAX_MEMORY_FOR_COMPACTION_PROP = "hoodie.memory.compaction.max.size"; - // Property to set the max memory for dfs inputstream buffer size - public static final String MAX_DFS_STREAM_BUFFER_SIZE_PROP = "hoodie.memory.dfs.buffer.max.size"; - public static final int DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE = 16 * 1024 * 1024; // 16MB - public static final String SPILLABLE_MAP_BASE_PATH_PROP = "hoodie.memory.spillable.map.path"; - // Default file path prefix for spillable file - public static final String DEFAULT_SPILLABLE_MAP_BASE_PATH = "/tmp/"; - - // Property to control how what fraction of the failed record, exceptions we report back to driver. - public static final String WRITESTATUS_FAILURE_FRACTION_PROP = "hoodie.memory.writestatus.failure.fraction"; - // Default is 10%. If set to 100%, with lot of failures, this can cause memory pressure, cause OOMs and - // mask actual data errors. 
- public static final double DEFAULT_WRITESTATUS_FAILURE_FRACTION = 0.1; - - private HoodieMemoryConfig(Properties props) { - super(props); + + public static final ConfigProperty MAX_MEMORY_FOR_MERGE = ConfigProperty + .key("hoodie.memory.merge.max.size") + .defaultValue(DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES) + .withDocumentation("Maximum amount of memory used in bytes for merge operations, before spilling to local storage."); + + public static final ConfigProperty MAX_MEMORY_FOR_COMPACTION = ConfigProperty + .key("hoodie.memory.compaction.max.size") + .noDefaultValue() + .withDocumentation("Maximum amount of memory used in bytes for compaction operations in bytes , before spilling to local storage."); + + public static final ConfigProperty MAX_DFS_STREAM_BUFFER_SIZE = ConfigProperty + .key("hoodie.memory.dfs.buffer.max.size") + .defaultValue(16 * 1024 * 1024) + .withDocumentation("Property to control the max memory in bytes for dfs input stream buffer size"); + + public static final ConfigProperty SPILLABLE_MAP_BASE_PATH = ConfigProperty + .key("hoodie.memory.spillable.map.path") + .defaultValue("/tmp/") + .withInferFunction(cfg -> { + String[] localDirs = FileIOUtils.getConfiguredLocalDirs(); + return (localDirs != null && localDirs.length > 0) ? Option.of(localDirs[0]) : Option.empty(); + }) + .withDocumentation("Default file path for spillable map"); + + public static final ConfigProperty WRITESTATUS_FAILURE_FRACTION = ConfigProperty + .key("hoodie.memory.writestatus.failure.fraction") + .defaultValue(0.1) + .withDocumentation("Property to control how what fraction of the failed record, exceptions we report back to driver. " + + "Default is 10%. If set to 100%, with lot of failures, this can cause memory pressure, cause OOMs and " + + "mask actual data errors."); + + /** @deprecated Use {@link #MAX_MEMORY_FRACTION_FOR_MERGE} and its methods instead */ + @Deprecated + public static final String MAX_MEMORY_FRACTION_FOR_MERGE_PROP = MAX_MEMORY_FRACTION_FOR_MERGE.key(); + /** @deprecated Use {@link #MAX_MEMORY_FRACTION_FOR_MERGE} and its methods instead */ + @Deprecated + public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE = MAX_MEMORY_FRACTION_FOR_MERGE.defaultValue(); + /** @deprecated Use {@link #MAX_MEMORY_FRACTION_FOR_COMPACTION} and its methods instead */ + @Deprecated + public static final String MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP = MAX_MEMORY_FRACTION_FOR_COMPACTION.key(); + /** @deprecated Use {@link #MAX_MEMORY_FRACTION_FOR_COMPACTION} and its methods instead */ + @Deprecated + public static final String DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION = MAX_MEMORY_FRACTION_FOR_COMPACTION.defaultValue(); + /** @deprecated Use {@link #MAX_MEMORY_FOR_MERGE} and its methods instead */ + @Deprecated + public static final String MAX_MEMORY_FOR_MERGE_PROP = MAX_MEMORY_FOR_MERGE.key(); + /** @deprecated Use {@link #MAX_MEMORY_FOR_COMPACTION} and its methods instead */ + @Deprecated + public static final String MAX_MEMORY_FOR_COMPACTION_PROP = MAX_MEMORY_FOR_COMPACTION.key(); + /** @deprecated Use {@link #MAX_DFS_STREAM_BUFFER_SIZE} and its methods instead */ + @Deprecated + public static final String MAX_DFS_STREAM_BUFFER_SIZE_PROP = MAX_DFS_STREAM_BUFFER_SIZE.key(); + /** @deprecated Use {@link #MAX_DFS_STREAM_BUFFER_SIZE} and its methods instead */ + @Deprecated + public static final int DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE = MAX_DFS_STREAM_BUFFER_SIZE.defaultValue(); + /** @deprecated Use {@link #SPILLABLE_MAP_BASE_PATH} and its methods instead */ + @Deprecated + 
public static final String SPILLABLE_MAP_BASE_PATH_PROP = SPILLABLE_MAP_BASE_PATH.key(); + /** @deprecated Use {@link #SPILLABLE_MAP_BASE_PATH} and its methods instead */ + @Deprecated + public static final String DEFAULT_SPILLABLE_MAP_BASE_PATH = SPILLABLE_MAP_BASE_PATH.defaultValue(); + /** @deprecated Use {@link #WRITESTATUS_FAILURE_FRACTION} and its methods instead */ + @Deprecated + public static final String WRITESTATUS_FAILURE_FRACTION_PROP = WRITESTATUS_FAILURE_FRACTION.key(); + /** @deprecated Use {@link #WRITESTATUS_FAILURE_FRACTION} and its methods instead */ + @Deprecated + public static final double DEFAULT_WRITESTATUS_FAILURE_FRACTION = WRITESTATUS_FAILURE_FRACTION.defaultValue(); + + private HoodieMemoryConfig() { + super(); } public static HoodieMemoryConfig.Builder newBuilder() { @@ -72,57 +141,49 @@ public static HoodieMemoryConfig.Builder newBuilder() { public static class Builder { - private final Properties props = new Properties(); + private final HoodieMemoryConfig memoryConfig = new HoodieMemoryConfig(); public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.memoryConfig.getProps().load(reader); return this; } } public Builder fromProperties(Properties props) { - this.props.putAll(props); + this.memoryConfig.getProps().putAll(props); return this; } public Builder withMaxMemoryFractionPerPartitionMerge(double maxMemoryFractionPerPartitionMerge) { - props.setProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, String.valueOf(maxMemoryFractionPerPartitionMerge)); + memoryConfig.setValue(MAX_MEMORY_FRACTION_FOR_MERGE, String.valueOf(maxMemoryFractionPerPartitionMerge)); return this; } public Builder withMaxMemoryMaxSize(long mergeMaxSize, long compactionMaxSize) { - props.setProperty(MAX_MEMORY_FOR_MERGE_PROP, String.valueOf(mergeMaxSize)); - props.setProperty(MAX_MEMORY_FOR_COMPACTION_PROP, String.valueOf(compactionMaxSize)); + memoryConfig.setValue(MAX_MEMORY_FOR_MERGE, String.valueOf(mergeMaxSize)); + memoryConfig.setValue(MAX_MEMORY_FOR_COMPACTION, String.valueOf(compactionMaxSize)); return this; } public Builder withMaxMemoryFractionPerCompaction(double maxMemoryFractionPerCompaction) { - props.setProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, String.valueOf(maxMemoryFractionPerCompaction)); + memoryConfig.setValue(MAX_MEMORY_FRACTION_FOR_COMPACTION, String.valueOf(maxMemoryFractionPerCompaction)); return this; } public Builder withMaxDFSStreamBufferSize(int maxStreamBufferSize) { - props.setProperty(MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(maxStreamBufferSize)); + memoryConfig.setValue(MAX_DFS_STREAM_BUFFER_SIZE, String.valueOf(maxStreamBufferSize)); return this; } public Builder withWriteStatusFailureFraction(double failureFraction) { - props.setProperty(WRITESTATUS_FAILURE_FRACTION_PROP, String.valueOf(failureFraction)); + memoryConfig.setValue(WRITESTATUS_FAILURE_FRACTION, String.valueOf(failureFraction)); return this; } public HoodieMemoryConfig build() { - HoodieMemoryConfig config = new HoodieMemoryConfig(props); - setDefaultOnCondition(props, !props.containsKey(MAX_DFS_STREAM_BUFFER_SIZE_PROP), MAX_DFS_STREAM_BUFFER_SIZE_PROP, - String.valueOf(DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)); - setDefaultOnCondition(props, !props.containsKey(SPILLABLE_MAP_BASE_PATH_PROP), SPILLABLE_MAP_BASE_PATH_PROP, - DEFAULT_SPILLABLE_MAP_BASE_PATH); - setDefaultOnCondition(props, !props.containsKey(MAX_MEMORY_FOR_MERGE_PROP), MAX_MEMORY_FOR_MERGE_PROP, - 
String.valueOf(DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)); - setDefaultOnCondition(props, !props.containsKey(WRITESTATUS_FAILURE_FRACTION_PROP), - WRITESTATUS_FAILURE_FRACTION_PROP, String.valueOf(DEFAULT_WRITESTATUS_FAILURE_FRACTION)); - return config; + memoryConfig.setDefaults(HoodieMemoryConfig.class.getName()); + return memoryConfig; } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java deleted file mode 100644 index 800c75f824fd4..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsConfig.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.config; - -import org.apache.hudi.common.config.DefaultHoodieConfig; -import org.apache.hudi.metrics.MetricsReporterType; - -import javax.annotation.concurrent.Immutable; - -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.Properties; - -/** - * Fetch the configurations used by the Metrics system. 
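As a quick illustration of the ConfigProperty-based HoodieMemoryConfig builder above, a hedged usage sketch; the wrapper class is hypothetical and the values simply mirror the documented defaults:

    import org.apache.hudi.config.HoodieMemoryConfig;

    class MemoryConfigExample {
      static HoodieMemoryConfig build() {
        // Values match the defaults documented above; shown only to illustrate the builder API.
        return HoodieMemoryConfig.newBuilder()
            .withMaxMemoryFractionPerPartitionMerge(0.6)
            .withMaxDFSStreamBufferSize(16 * 1024 * 1024)
            .withWriteStatusFailureFraction(0.1)
            .build();
      }
    }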
- */ -@Immutable -public class HoodieMetricsConfig extends DefaultHoodieConfig { - - public static final String METRIC_PREFIX = "hoodie.metrics"; - public static final String METRICS_ON = METRIC_PREFIX + ".on"; - public static final boolean DEFAULT_METRICS_ON = false; - public static final String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type"; - public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = MetricsReporterType.GRAPHITE; - - // Graphite - public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; - public static final String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host"; - public static final String DEFAULT_GRAPHITE_SERVER_HOST = "localhost"; - - public static final String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port"; - public static final int DEFAULT_GRAPHITE_SERVER_PORT = 4756; - - // Jmx - public static final String JMX_PREFIX = METRIC_PREFIX + ".jmx"; - public static final String JMX_HOST = JMX_PREFIX + ".host"; - public static final String DEFAULT_JMX_HOST = "localhost"; - - public static final String JMX_PORT = JMX_PREFIX + ".port"; - public static final int DEFAULT_JMX_PORT = 9889; - - public static final String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix"; - - // User defined - public static final String METRICS_REPORTER_CLASS = METRIC_PREFIX + ".reporter.class"; - public static final String DEFAULT_METRICS_REPORTER_CLASS = ""; - - private HoodieMetricsConfig(Properties props) { - super(props); - } - - public static HoodieMetricsConfig.Builder newBuilder() { - return new Builder(); - } - - public static class Builder { - - private final Properties props = new Properties(); - - public Builder fromFile(File propertiesFile) throws IOException { - try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); - return this; - } - } - - public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - public Builder on(boolean metricsOn) { - props.setProperty(METRICS_ON, String.valueOf(metricsOn)); - return this; - } - - public Builder withReporterType(String reporterType) { - props.setProperty(METRICS_REPORTER_TYPE, reporterType); - return this; - } - - public Builder toGraphiteHost(String host) { - props.setProperty(GRAPHITE_SERVER_HOST, host); - return this; - } - - public Builder onGraphitePort(int port) { - props.setProperty(GRAPHITE_SERVER_PORT, String.valueOf(port)); - return this; - } - - public Builder toJmxHost(String host) { - props.setProperty(JMX_HOST, host); - return this; - } - - public Builder onJmxPort(String port) { - props.setProperty(JMX_PORT, port); - return this; - } - - public Builder usePrefix(String prefix) { - props.setProperty(GRAPHITE_METRIC_PREFIX, prefix); - return this; - } - - public Builder withReporterClass(String className) { - props.setProperty(METRICS_REPORTER_CLASS, className); - return this; - } - - public HoodieMetricsConfig build() { - HoodieMetricsConfig config = new HoodieMetricsConfig(props); - setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, String.valueOf(DEFAULT_METRICS_ON)); - setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), METRICS_REPORTER_TYPE, - DEFAULT_METRICS_REPORTER_TYPE.name()); - setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), GRAPHITE_SERVER_HOST, - DEFAULT_GRAPHITE_SERVER_HOST); - setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), GRAPHITE_SERVER_PORT, - String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); - 
setDefaultOnCondition(props, !props.containsKey(JMX_HOST), JMX_HOST, - DEFAULT_JMX_HOST); - setDefaultOnCondition(props, !props.containsKey(JMX_PORT), JMX_PORT, - String.valueOf(DEFAULT_JMX_PORT)); - MetricsReporterType reporterType = MetricsReporterType.valueOf(props.getProperty(METRICS_REPORTER_TYPE)); - setDefaultOnCondition(props, reporterType == MetricsReporterType.DATADOG, - HoodieMetricsDatadogConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_CLASS), - METRICS_REPORTER_CLASS, DEFAULT_METRICS_REPORTER_CLASS); - setDefaultOnCondition(props, reporterType == MetricsReporterType.PROMETHEUS_PUSHGATEWAY, - HoodieMetricsPrometheusConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, reporterType == MetricsReporterType.PROMETHEUS, - HoodieMetricsPrometheusConfig.newBuilder().fromProperties(props).build()); - - return config; - } - } - -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsDatadogConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsDatadogConfig.java deleted file mode 100644 index e6dcc282c1d63..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsDatadogConfig.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.config; - -import org.apache.hudi.common.config.DefaultHoodieConfig; - -import javax.annotation.concurrent.Immutable; - -import java.util.Properties; - -import static org.apache.hudi.config.HoodieMetricsConfig.METRIC_PREFIX; - -/** - * Configs for Datadog reporter type. - *

- * {@link org.apache.hudi.metrics.MetricsReporterType#DATADOG} - */ -@Immutable -public class HoodieMetricsDatadogConfig extends DefaultHoodieConfig { - - public static final String DATADOG_PREFIX = METRIC_PREFIX + ".datadog"; - public static final String DATADOG_REPORT_PERIOD_SECONDS = DATADOG_PREFIX + ".report.period.seconds"; - public static final int DEFAULT_DATADOG_REPORT_PERIOD_SECONDS = 30; - public static final String DATADOG_API_SITE = DATADOG_PREFIX + ".api.site"; - public static final String DATADOG_API_KEY = DATADOG_PREFIX + ".api.key"; - public static final String DATADOG_API_KEY_SKIP_VALIDATION = DATADOG_PREFIX + ".api.key.skip.validation"; - public static final boolean DEFAULT_DATADOG_API_KEY_SKIP_VALIDATION = false; - public static final String DATADOG_API_KEY_SUPPLIER = DATADOG_PREFIX + ".api.key.supplier"; - public static final String DATADOG_API_TIMEOUT_SECONDS = DATADOG_PREFIX + ".api.timeout.seconds"; - public static final int DEFAULT_DATADOG_API_TIMEOUT_SECONDS = 3; - public static final String DATADOG_METRIC_PREFIX = DATADOG_PREFIX + ".metric.prefix"; - public static final String DATADOG_METRIC_HOST = DATADOG_PREFIX + ".metric.host"; - public static final String DATADOG_METRIC_TAGS = DATADOG_PREFIX + ".metric.tags"; - - private HoodieMetricsDatadogConfig(Properties props) { - super(props); - } - - public static HoodieMetricsDatadogConfig.Builder newBuilder() { - return new Builder(); - } - - public static class Builder { - - private final Properties props = new Properties(); - - public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - public Builder withDatadogReportPeriodSeconds(int period) { - props.setProperty(DATADOG_REPORT_PERIOD_SECONDS, String.valueOf(period)); - return this; - } - - public Builder withDatadogApiSite(String apiSite) { - props.setProperty(DATADOG_API_SITE, apiSite); - return this; - } - - public Builder withDatadogApiKey(String apiKey) { - props.setProperty(DATADOG_API_KEY, apiKey); - return this; - } - - public Builder withDatadogApiKeySkipValidation(boolean skip) { - props.setProperty(DATADOG_API_KEY_SKIP_VALIDATION, String.valueOf(skip)); - return this; - } - - public Builder withDatadogApiKeySupplier(String apiKeySupplier) { - props.setProperty(DATADOG_API_KEY_SUPPLIER, apiKeySupplier); - return this; - } - - public Builder withDatadogApiTimeoutSeconds(int timeout) { - props.setProperty(DATADOG_API_TIMEOUT_SECONDS, String.valueOf(timeout)); - return this; - } - - public Builder withDatadogPrefix(String prefix) { - props.setProperty(DATADOG_METRIC_PREFIX, prefix); - return this; - } - - public Builder withDatadogHost(String host) { - props.setProperty(DATADOG_METRIC_HOST, host); - return this; - } - - public Builder withDatadogTags(String tags) { - props.setProperty(DATADOG_METRIC_TAGS, tags); - return this; - } - - public HoodieMetricsDatadogConfig build() { - HoodieMetricsDatadogConfig config = new HoodieMetricsDatadogConfig(props); - setDefaultOnCondition(props, !props.containsKey(DATADOG_REPORT_PERIOD_SECONDS), - DATADOG_REPORT_PERIOD_SECONDS, - String.valueOf(DEFAULT_DATADOG_REPORT_PERIOD_SECONDS)); - setDefaultOnCondition(props, !props.containsKey(DATADOG_API_KEY_SKIP_VALIDATION), - DATADOG_API_KEY_SKIP_VALIDATION, - String.valueOf(DEFAULT_DATADOG_API_KEY_SKIP_VALIDATION)); - setDefaultOnCondition(props, !props.containsKey(DATADOG_API_TIMEOUT_SECONDS), - DATADOG_API_TIMEOUT_SECONDS, - String.valueOf(DEFAULT_DATADOG_API_TIMEOUT_SECONDS)); - return config; - } - } -} diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsPrometheusConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsPrometheusConfig.java deleted file mode 100644 index 3e2d50f5a0c4a..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMetricsPrometheusConfig.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.config; - -import org.apache.hudi.common.config.DefaultHoodieConfig; - -import java.util.Properties; - -import static org.apache.hudi.config.HoodieMetricsConfig.METRIC_PREFIX; - -public class HoodieMetricsPrometheusConfig extends DefaultHoodieConfig { - - // Prometheus PushGateWay - public static final String PUSHGATEWAY_PREFIX = METRIC_PREFIX + ".pushgateway"; - - public static final String PUSHGATEWAY_HOST = PUSHGATEWAY_PREFIX + ".host"; - public static final String DEFAULT_PUSHGATEWAY_HOST = "localhost"; - - public static final String PUSHGATEWAY_PORT = PUSHGATEWAY_PREFIX + ".port"; - public static final int DEFAULT_PUSHGATEWAY_PORT = 9091; - - public static final String PUSHGATEWAY_REPORT_PERIOD_SECONDS = PUSHGATEWAY_PREFIX + ".report.period.seconds"; - public static final int DEFAULT_PUSHGATEWAY_REPORT_PERIOD_SECONDS = 30; - - public static final String PUSHGATEWAY_DELETE_ON_SHUTDOWN = PUSHGATEWAY_PREFIX + ".delete.on.shutdown"; - public static final boolean DEFAULT_PUSHGATEWAY_DELETE_ON_SHUTDOWN = true; - - public static final String PUSHGATEWAY_JOB_NAME = PUSHGATEWAY_PREFIX + ".job.name"; - public static final String DEFAULT_PUSHGATEWAY_JOB_NAME = ""; - - public static final String PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX = PUSHGATEWAY_PREFIX + ".random.job.name.suffix"; - public static final boolean DEFAULT_PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX = true; - - - // Prometheus HttpServer - public static final String PROMETHEUS_PREFIX = METRIC_PREFIX + ".prometheus"; - public static final String PROMETHEUS_PORT = PROMETHEUS_PREFIX + ".port"; - public static final int DEFAULT_PROMETHEUS_PORT = 9090; - - public HoodieMetricsPrometheusConfig(Properties props) { - super(props); - } - - public static HoodieMetricsPrometheusConfig.Builder newBuilder() { - return new HoodieMetricsPrometheusConfig.Builder(); - } - - @Override - public Properties getProps() { - return super.getProps(); - } - - public static class Builder { - - private Properties props = new Properties(); - - public Builder fromProperties(Properties props) { - this.props.putAll(props); - return this; - } - - public HoodieMetricsPrometheusConfig build() { - HoodieMetricsPrometheusConfig config = new HoodieMetricsPrometheusConfig(props); - setDefaultOnCondition(props, !props.containsKey(PROMETHEUS_PORT), 
PROMETHEUS_PORT, - String.valueOf(DEFAULT_PROMETHEUS_PORT)); - setDefaultOnCondition(props, !props.containsKey(PUSHGATEWAY_HOST), - PUSHGATEWAY_HOST, - DEFAULT_PUSHGATEWAY_HOST); - setDefaultOnCondition(props, !props.containsKey(PUSHGATEWAY_PORT), - PUSHGATEWAY_PORT, - String.valueOf(DEFAULT_PUSHGATEWAY_PORT)); - setDefaultOnCondition(props, !props.containsKey(PUSHGATEWAY_REPORT_PERIOD_SECONDS), - PUSHGATEWAY_REPORT_PERIOD_SECONDS, - String.valueOf(DEFAULT_PUSHGATEWAY_REPORT_PERIOD_SECONDS)); - setDefaultOnCondition(props, !props.containsKey(PUSHGATEWAY_DELETE_ON_SHUTDOWN), - PUSHGATEWAY_DELETE_ON_SHUTDOWN, - String.valueOf(DEFAULT_PUSHGATEWAY_DELETE_ON_SHUTDOWN)); - setDefaultOnCondition(props, !props.containsKey(PUSHGATEWAY_JOB_NAME), - PUSHGATEWAY_JOB_NAME, DEFAULT_PUSHGATEWAY_JOB_NAME); - setDefaultOnCondition(props, !props.containsKey(PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX), - PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX, - String.valueOf(DEFAULT_PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX)); - return config; - } - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePayloadConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePayloadConfig.java new file mode 100644 index 0000000000000..2a05752aa6e1e --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePayloadConfig.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.apache.hudi.common.model.HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY; +import static org.apache.hudi.common.model.HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY; + +/** + * Hoodie payload related configs. 
+ */ +@ConfigClassProperty(name = "Payload Configurations", + groupName = ConfigGroups.Names.RECORD_PAYLOAD, + description = "Payload related configs, that can be leveraged to " + + "control merges based on specific business fields in the data.") +public class HoodiePayloadConfig extends HoodieConfig { + + public static final ConfigProperty ORDERING_FIELD = ConfigProperty + .key(PAYLOAD_ORDERING_FIELD_PROP_KEY) + .defaultValue("ts") + .withDocumentation("Table column/field name to order records that have the same key, before " + + "merging and writing to storage."); + + public static final ConfigProperty EVENT_TIME_FIELD = ConfigProperty + .key(PAYLOAD_EVENT_TIME_FIELD_PROP_KEY) + .defaultValue("ts") + .withDocumentation("Table column/field name to derive timestamp associated with the records. This can" + + "be useful for e.g, determining the freshness of the table."); + + public static final ConfigProperty PAYLOAD_CLASS_NAME = ConfigProperty + .key("hoodie.compaction.payload.class") + .defaultValue(OverwriteWithLatestAvroPayload.class.getName()) + .withDocumentation("This needs to be same as class used during insert/upserts. Just like writing, compaction also uses " + + "the record payload class to merge records in the log against each other, merge again with the base file and " + + "produce the final record to be written after compaction."); + + /** @deprecated Use {@link #PAYLOAD_CLASS_NAME} and its methods instead */ + @Deprecated + public static final String DEFAULT_PAYLOAD_CLASS = PAYLOAD_CLASS_NAME.defaultValue(); + /** @deprecated Use {@link #PAYLOAD_CLASS_NAME} and its methods instead */ + @Deprecated + public static final String PAYLOAD_CLASS_PROP = PAYLOAD_CLASS_NAME.key(); + + private HoodiePayloadConfig() { + super(); + } + + public static HoodiePayloadConfig.Builder newBuilder() { + return new HoodiePayloadConfig.Builder(); + } + + public static class Builder { + + private final HoodiePayloadConfig payloadConfig = new HoodiePayloadConfig(); + + public Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.payloadConfig.getProps().load(reader); + return this; + } + } + + public Builder fromProperties(Properties props) { + this.payloadConfig.getProps().putAll(props); + return this; + } + + public Builder withPayloadOrderingField(String payloadOrderingField) { + payloadConfig.setValue(ORDERING_FIELD, String.valueOf(payloadOrderingField)); + return this; + } + + public Builder withPayloadEventTimeField(String payloadEventTimeField) { + payloadConfig.setValue(EVENT_TIME_FIELD, String.valueOf(payloadEventTimeField)); + return this; + } + + public HoodiePayloadConfig.Builder withPayloadClass(String payloadClassName) { + payloadConfig.setValue(PAYLOAD_CLASS_NAME, payloadClassName); + return this; + } + + public HoodiePayloadConfig build() { + payloadConfig.setDefaults(HoodiePayloadConfig.class.getName()); + return payloadConfig; + } + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePreCommitValidatorConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePreCommitValidatorConfig.java new file mode 100644 index 0000000000000..e65e35475f878 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePreCommitValidatorConfig.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
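A minimal sketch of the new HoodiePayloadConfig builder above; the wrapper class and the "event_time" column name are hypothetical:

    import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
    import org.apache.hudi.config.HoodiePayloadConfig;

    class PayloadConfigExample {
      static HoodiePayloadConfig build() {
        // The ordering field already defaults to "ts"; both fields are set explicitly here for clarity.
        return HoodiePayloadConfig.newBuilder()
            .withPayloadOrderingField("ts")
            .withPayloadEventTimeField("event_time")
            .withPayloadClass(OverwriteWithLatestAvroPayload.class.getName())
            .build();
      }
    }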
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import javax.annotation.concurrent.Immutable; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Storage related config. + */ +@Immutable +@ConfigClassProperty(name = "PreCommit Validator Configurations", + groupName = ConfigGroups.Names.SPARK_DATASOURCE, + description = "The following set of configurations help validate new data before commits.") +public class HoodiePreCommitValidatorConfig extends HoodieConfig { + + public static final ConfigProperty VALIDATOR_CLASS_NAMES = ConfigProperty + .key("hoodie.precommit.validators") + .defaultValue("") + .withDocumentation("Comma separated list of class names that can be invoked to validate commit"); + public static final String VALIDATOR_TABLE_VARIABLE = ""; + + public static final ConfigProperty EQUALITY_SQL_QUERIES = ConfigProperty + .key("hoodie.precommit.validators.equality.sql.queries") + .defaultValue("") + .withDocumentation("Spark SQL queries to run on table before committing new data to validate state before and after commit." + + " Multiple queries separated by ';' delimiter are supported." + + " Example: \"select count(*) from \\" + + " Note \\ is replaced by table state before and after commit."); + + public static final ConfigProperty SINGLE_VALUE_SQL_QUERIES = ConfigProperty + .key("hoodie.precommit.validators.single.value.sql.queries") + .defaultValue("") + .withDocumentation("Spark SQL queries to run on table before committing new data to validate state after commit." + + "Multiple queries separated by ';' delimiter are supported." + + "Expected result is included as part of query separated by '#'. Example query: 'query1#result1:query2#result2'" + + "Note \\ variable is expected to be present in query."); + + /** + * Spark SQL queries to run on table before committing new data to validate state before and after commit. + * Multiple queries separated by ';' delimiter are supported. + * Example query: 'select count(*) from \ where col=null' + * Note \ variable is expected to be present in query. + */ + public static final ConfigProperty INEQUALITY_SQL_QUERIES = ConfigProperty + .key("hoodie.precommit.validators.inequality.sql.queries") + .defaultValue("") + .withDocumentation("Spark SQL queries to run on table before committing new data to validate state before and after commit." + + "Multiple queries separated by ';' delimiter are supported." 
+ + "Example query: 'select count(*) from \\ where col=null'" + + "Note \\ variable is expected to be present in query."); + + private HoodiePreCommitValidatorConfig() { + super(); + } + + public static HoodiePreCommitValidatorConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final HoodiePreCommitValidatorConfig preCommitValidatorConfig = new HoodiePreCommitValidatorConfig(); + + public Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.preCommitValidatorConfig.getProps().load(reader); + return this; + } + } + + public Builder fromProperties(Properties props) { + this.preCommitValidatorConfig.getProps().putAll(props); + return this; + } + + public Builder withPreCommitValidator(String preCommitValidators) { + preCommitValidatorConfig.setValue(VALIDATOR_CLASS_NAMES, preCommitValidators); + return this; + } + + public Builder withPrecommitValidatorEqualitySqlQueries(String preCommitValidators) { + preCommitValidatorConfig.setValue(EQUALITY_SQL_QUERIES, preCommitValidators); + return this; + } + + public Builder withPrecommitValidatorSingleResultSqlQueries(String preCommitValidators) { + preCommitValidatorConfig.setValue(SINGLE_VALUE_SQL_QUERIES, preCommitValidators); + return this; + } + + public Builder withPrecommitValidatorInequalitySqlQueries(String preCommitValidators) { + preCommitValidatorConfig.setValue(INEQUALITY_SQL_QUERIES, preCommitValidators); + return this; + } + + public HoodiePreCommitValidatorConfig build() { + preCommitValidatorConfig.setDefaults(HoodiePreCommitValidatorConfig.class.getName()); + return preCommitValidatorConfig; + } + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java index 50b45f335b3ca..40c53fae9686b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java @@ -18,7 +18,10 @@ package org.apache.hudi.config; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; import javax.annotation.concurrent.Immutable; @@ -31,38 +34,242 @@ * Storage related config. 
*/ @Immutable -public class HoodieStorageConfig extends DefaultHoodieConfig { - - public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size"; - public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024); - public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size"; - public static final String DEFAULT_PARQUET_BLOCK_SIZE_BYTES = DEFAULT_PARQUET_FILE_MAX_BYTES; - public static final String PARQUET_PAGE_SIZE_BYTES = "hoodie.parquet.page.size"; - public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024); - public static final String HFILE_FILE_MAX_BYTES = "hoodie.hfile.max.file.size"; - public static final String HFILE_BLOCK_SIZE_BYTES = "hoodie.hfile.block.size"; - public static final String DEFAULT_HFILE_BLOCK_SIZE_BYTES = String.valueOf(1 * 1024 * 1024); - public static final String DEFAULT_HFILE_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024); - // used to size log files - public static final String LOGFILE_SIZE_MAX_BYTES = "hoodie.logfile.max.size"; - public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = String.valueOf(1024 * 1024 * 1024); // 1 GB - // used to size data blocks in log file - public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size"; - public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256 * 1024 * 1024); // 256 MB - public static final String PARQUET_COMPRESSION_RATIO = "hoodie.parquet.compression.ratio"; - // Default compression ratio for parquet - public static final String DEFAULT_STREAM_COMPRESSION_RATIO = String.valueOf(0.1); - public static final String PARQUET_COMPRESSION_CODEC = "hoodie.parquet.compression.codec"; - public static final String HFILE_COMPRESSION_ALGORITHM = "hoodie.hfile.compression.algorithm"; +@ConfigClassProperty(name = "Storage Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control aspects around writing, sizing, reading base and log files.") +public class HoodieStorageConfig extends HoodieConfig { + + public static final ConfigProperty PARQUET_MAX_FILE_SIZE = ConfigProperty + .key("hoodie.parquet.max.file.size") + .defaultValue(String.valueOf(120 * 1024 * 1024)) + .withDocumentation("Target size in bytes for parquet files produced by Hudi write phases. " + + "For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance."); + + public static final ConfigProperty PARQUET_BLOCK_SIZE = ConfigProperty + .key("hoodie.parquet.block.size") + .defaultValue(String.valueOf(120 * 1024 * 1024)) + .withDocumentation("Parquet RowGroup size in bytes. It's recommended to make this large enough that scan costs can be" + + " amortized by packing enough column values into a single row group."); + + public static final ConfigProperty PARQUET_PAGE_SIZE = ConfigProperty + .key("hoodie.parquet.page.size") + .defaultValue(String.valueOf(1 * 1024 * 1024)) + .withDocumentation("Parquet page size in bytes. Page is the unit of read within a parquet file. 
" + + "Within a block, pages are compressed separately."); + + public static final ConfigProperty ORC_FILE_MAX_SIZE = ConfigProperty + .key("hoodie.orc.max.file.size") + .defaultValue(String.valueOf(120 * 1024 * 1024)) + .withDocumentation("Target file size in bytes for ORC base files."); + + public static final ConfigProperty ORC_STRIPE_SIZE = ConfigProperty + .key("hoodie.orc.stripe.size") + .defaultValue(String.valueOf(64 * 1024 * 1024)) + .withDocumentation("Size of the memory buffer in bytes for writing"); + + public static final ConfigProperty ORC_BLOCK_SIZE = ConfigProperty + .key("hoodie.orc.block.size") + .defaultValue(ORC_FILE_MAX_SIZE.defaultValue()) + .withDocumentation("ORC block size, recommended to be aligned with the target file size."); + + public static final ConfigProperty HFILE_MAX_FILE_SIZE = ConfigProperty + .key("hoodie.hfile.max.file.size") + .defaultValue(String.valueOf(120 * 1024 * 1024)) + .withDocumentation("Target file size in bytes for HFile base files."); + + public static final ConfigProperty HFILE_BLOCK_SIZE = ConfigProperty + .key("hoodie.hfile.block.size") + .defaultValue(String.valueOf(1024 * 1024)) + .withDocumentation("Lower values increase the size in bytes of metadata tracked within HFile, but can offer potentially " + + "faster lookup times."); + + public static final ConfigProperty LOGFILE_DATA_BLOCK_FORMAT = ConfigProperty + .key("hoodie.logfile.data.block.format") + .noDefaultValue() + .withDocumentation("Format of the data block within delta logs. Following formats are currently supported \"avro\", \"hfile\", \"parquet\""); + + public static final ConfigProperty LOGFILE_MAX_SIZE = ConfigProperty + .key("hoodie.logfile.max.size") + .defaultValue(String.valueOf(1024 * 1024 * 1024)) // 1 GB + .withDocumentation("LogFile max size in bytes. This is the maximum size allowed for a log file " + + "before it is rolled over to the next version."); + + public static final ConfigProperty LOGFILE_DATA_BLOCK_MAX_SIZE = ConfigProperty + .key("hoodie.logfile.data.block.max.size") + .defaultValue(String.valueOf(256 * 1024 * 1024)) + .withDocumentation("LogFile Data block max size in bytes. This is the maximum size allowed for a single data block " + + "to be appended to a log file. This helps to make sure the data appended to the log file is broken up " + + "into sizable blocks to prevent from OOM errors. This size should be greater than the JVM memory."); + + public static final ConfigProperty PARQUET_COMPRESSION_RATIO_FRACTION = ConfigProperty + .key("hoodie.parquet.compression.ratio") + .defaultValue(String.valueOf(0.1)) + .withDocumentation("Expected compression of parquet data used by Hudi, when it tries to size new parquet files. 
" + + "Increase this value, if bulk_insert is producing smaller than expected sized files"); + // Default compression codec for parquet - public static final String DEFAULT_PARQUET_COMPRESSION_CODEC = "gzip"; - public static final String DEFAULT_HFILE_COMPRESSION_ALGORITHM = "GZ"; - public static final String LOGFILE_TO_PARQUET_COMPRESSION_RATIO = "hoodie.logfile.to.parquet.compression.ratio"; + public static final ConfigProperty PARQUET_COMPRESSION_CODEC_NAME = ConfigProperty + .key("hoodie.parquet.compression.codec") + .defaultValue("gzip") + .withDocumentation("Compression Codec for parquet files"); + + public static final ConfigProperty PARQUET_DICTIONARY_ENABLED = ConfigProperty + .key("hoodie.parquet.dictionary.enabled") + .defaultValue(true) + .withDocumentation("Whether to use dictionary encoding"); + + public static final ConfigProperty PARQUET_WRITE_LEGACY_FORMAT_ENABLED = ConfigProperty + .key("hoodie.parquet.writelegacyformat.enabled") + .defaultValue("false") + .withDocumentation("Sets spark.sql.parquet.writeLegacyFormat. If true, data will be written in a way of Spark 1.4 and earlier. " + + "For example, decimal values will be written in Parquet's fixed-length byte array format which other systems such as Apache Hive and Apache Impala use. " + + "If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format."); + + public static final ConfigProperty PARQUET_OUTPUT_TIMESTAMP_TYPE = ConfigProperty + .key("hoodie.parquet.outputtimestamptype") + .defaultValue("TIMESTAMP_MICROS") + .withDocumentation("Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files."); + + // SPARK-38094 Spark 3.3 checks if this field is enabled. Hudi has to provide this or there would be NPE thrown + // Would ONLY be effective with Spark 3.3+ + // default value is true which is in accordance with Spark 3.3 + public static final ConfigProperty PARQUET_FIELD_ID_WRITE_ENABLED = ConfigProperty + .key("hoodie.parquet.field_id.write.enabled") + .defaultValue("true") + .sinceVersion("0.12.0") + .withDocumentation("Would only be effective with Spark 3.3+. Sets spark.sql.parquet.fieldId.write.enabled. " + + "If enabled, Spark will write out parquet native field ids that are stored inside StructField's metadata as parquet.field.id to parquet files."); + + public static final ConfigProperty HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty + .key("hoodie.hfile.compression.algorithm") + .defaultValue("GZ") + .withDocumentation("Compression codec to use for hfile base files."); + + public static final ConfigProperty ORC_COMPRESSION_CODEC_NAME = ConfigProperty + .key("hoodie.orc.compression.codec") + .defaultValue("ZLIB") + .withDocumentation("Compression codec to use for ORC base files."); + // Default compression ratio for log file to parquet, general 3x - public static final String DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO = String.valueOf(0.35); + public static final ConfigProperty LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION = ConfigProperty + .key("hoodie.logfile.to.parquet.compression.ratio") + .defaultValue(String.valueOf(0.35)) + .withDocumentation("Expected additional compression as records move from log files to parquet. 
Used for merge_on_read " + + "table to send inserts into log files & control the size of compacted parquet file."); - private HoodieStorageConfig(Properties props) { - super(props); + /** + * @deprecated Use {@link #PARQUET_MAX_FILE_SIZE} and its methods instead + */ + @Deprecated + public static final String PARQUET_FILE_MAX_BYTES = PARQUET_MAX_FILE_SIZE.key(); + /** + * @deprecated Use {@link #PARQUET_MAX_FILE_SIZE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = PARQUET_MAX_FILE_SIZE.defaultValue(); + /** + * @deprecated Use {@link #PARQUET_BLOCK_SIZE} and its methods instead + */ + @Deprecated + public static final String PARQUET_BLOCK_SIZE_BYTES = PARQUET_BLOCK_SIZE.key(); + /** + * @deprecated Use {@link #PARQUET_BLOCK_SIZE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PARQUET_BLOCK_SIZE_BYTES = PARQUET_BLOCK_SIZE.defaultValue(); + /** + * @deprecated Use {@link #PARQUET_PAGE_SIZE} and its methods instead + */ + @Deprecated + public static final String PARQUET_PAGE_SIZE_BYTES = PARQUET_PAGE_SIZE.key(); + /** + * @deprecated Use {@link #PARQUET_PAGE_SIZE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = PARQUET_PAGE_SIZE.defaultValue(); + /** + * @deprecated Use {@link #HFILE_MAX_FILE_SIZE} and its methods instead + */ + @Deprecated + public static final String HFILE_FILE_MAX_BYTES = HFILE_MAX_FILE_SIZE.key(); + /** + * @deprecated Use {@link #HFILE_MAX_FILE_SIZE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_HFILE_FILE_MAX_BYTES = HFILE_MAX_FILE_SIZE.defaultValue(); + /** + * @deprecated Use {@link #HFILE_BLOCK_SIZE} and its methods instead + */ + @Deprecated + public static final String HFILE_BLOCK_SIZE_BYTES = HFILE_BLOCK_SIZE.defaultValue(); + /** + * @deprecated Use {@link #HFILE_BLOCK_SIZE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_HFILE_BLOCK_SIZE_BYTES = HFILE_BLOCK_SIZE.defaultValue(); + /** + * @deprecated Use {@link #LOGFILE_MAX_SIZE} and its methods instead + */ + @Deprecated + public static final String LOGFILE_SIZE_MAX_BYTES = LOGFILE_MAX_SIZE.key(); + /** + * @deprecated Use {@link #LOGFILE_MAX_SIZE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = LOGFILE_MAX_SIZE.defaultValue(); + /** + * @deprecated Use {@link #LOGFILE_DATA_BLOCK_MAX_SIZE} and its methods instead + */ + @Deprecated + public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = LOGFILE_DATA_BLOCK_MAX_SIZE.key(); + /** + * @deprecated Use {@link #LOGFILE_DATA_BLOCK_MAX_SIZE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = LOGFILE_DATA_BLOCK_MAX_SIZE.defaultValue(); + /** + * @deprecated Use {@link #PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead + */ + @Deprecated + public static final String PARQUET_COMPRESSION_RATIO = PARQUET_COMPRESSION_RATIO_FRACTION.key(); + /** + * @deprecated Use {@link #PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead + */ + @Deprecated + public static final String DEFAULT_STREAM_COMPRESSION_RATIO = PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue(); + /** + * @deprecated Use {@link #PARQUET_COMPRESSION_CODEC_NAME} and its methods instead + */ + @Deprecated + public static final String PARQUET_COMPRESSION_CODEC = PARQUET_COMPRESSION_CODEC_NAME.key(); + /** + * @deprecated Use {@link 
#HFILE_COMPRESSION_ALGORITHM_NAME} and its methods instead + */ + @Deprecated + public static final String HFILE_COMPRESSION_ALGORITHM = HFILE_COMPRESSION_ALGORITHM_NAME.key(); + /** + * @deprecated Use {@link #PARQUET_COMPRESSION_CODEC_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PARQUET_COMPRESSION_CODEC = PARQUET_COMPRESSION_CODEC_NAME.defaultValue(); + /** + * @deprecated Use {@link #HFILE_COMPRESSION_ALGORITHM_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_HFILE_COMPRESSION_ALGORITHM = HFILE_COMPRESSION_ALGORITHM_NAME.defaultValue(); + /** + * @deprecated Use {@link #LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead + */ + @Deprecated + public static final String LOGFILE_TO_PARQUET_COMPRESSION_RATIO = LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION.key(); + /** + * @deprecated Use {@link #LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead + */ + @Deprecated + public static final String DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO = LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue(); + + private HoodieStorageConfig() { + super(); } public static HoodieStorageConfig.Builder newBuilder() { @@ -71,103 +278,113 @@ public static HoodieStorageConfig.Builder newBuilder() { public static class Builder { - private final Properties props = new Properties(); + private final HoodieStorageConfig storageConfig = new HoodieStorageConfig(); public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.storageConfig.getProps().load(reader); return this; } } public Builder fromProperties(Properties props) { - this.props.putAll(props); + this.storageConfig.getProps().putAll(props); return this; } public Builder parquetMaxFileSize(long maxFileSize) { - props.setProperty(PARQUET_FILE_MAX_BYTES, String.valueOf(maxFileSize)); + storageConfig.setValue(PARQUET_MAX_FILE_SIZE, String.valueOf(maxFileSize)); return this; } public Builder parquetBlockSize(int blockSize) { - props.setProperty(PARQUET_BLOCK_SIZE_BYTES, String.valueOf(blockSize)); + storageConfig.setValue(PARQUET_BLOCK_SIZE, String.valueOf(blockSize)); return this; } public Builder parquetPageSize(int pageSize) { - props.setProperty(PARQUET_PAGE_SIZE_BYTES, String.valueOf(pageSize)); + storageConfig.setValue(PARQUET_PAGE_SIZE, String.valueOf(pageSize)); return this; } public Builder hfileMaxFileSize(long maxFileSize) { - props.setProperty(HFILE_FILE_MAX_BYTES, String.valueOf(maxFileSize)); + storageConfig.setValue(HFILE_MAX_FILE_SIZE, String.valueOf(maxFileSize)); return this; } public Builder hfileBlockSize(int blockSize) { - props.setProperty(HFILE_BLOCK_SIZE_BYTES, String.valueOf(blockSize)); + storageConfig.setValue(HFILE_BLOCK_SIZE, String.valueOf(blockSize)); return this; } public Builder logFileDataBlockMaxSize(int dataBlockSize) { - props.setProperty(LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, String.valueOf(dataBlockSize)); + storageConfig.setValue(LOGFILE_DATA_BLOCK_MAX_SIZE, String.valueOf(dataBlockSize)); return this; } - public Builder logFileMaxSize(int logFileSize) { - props.setProperty(LOGFILE_SIZE_MAX_BYTES, String.valueOf(logFileSize)); + public Builder logFileMaxSize(long logFileSize) { + storageConfig.setValue(LOGFILE_MAX_SIZE, String.valueOf(logFileSize)); return this; } public Builder parquetCompressionRatio(double parquetCompressionRatio) { - props.setProperty(PARQUET_COMPRESSION_RATIO, 
String.valueOf(parquetCompressionRatio)); + storageConfig.setValue(PARQUET_COMPRESSION_RATIO_FRACTION, String.valueOf(parquetCompressionRatio)); return this; } public Builder parquetCompressionCodec(String parquetCompressionCodec) { - props.setProperty(PARQUET_COMPRESSION_CODEC, parquetCompressionCodec); + storageConfig.setValue(PARQUET_COMPRESSION_CODEC_NAME, parquetCompressionCodec); + return this; + } + + public Builder parquetWriteLegacyFormat(String parquetWriteLegacyFormat) { + storageConfig.setValue(PARQUET_WRITE_LEGACY_FORMAT_ENABLED, parquetWriteLegacyFormat); + return this; + } + + public Builder parquetOutputTimestampType(String parquetOutputTimestampType) { + storageConfig.setValue(PARQUET_OUTPUT_TIMESTAMP_TYPE, parquetOutputTimestampType); + return this; + } + + public Builder parquetFieldIdWrite(String parquetFieldIdWrite) { + storageConfig.setValue(PARQUET_FIELD_ID_WRITE_ENABLED, parquetFieldIdWrite); return this; } public Builder hfileCompressionAlgorithm(String hfileCompressionAlgorithm) { - props.setProperty(HFILE_COMPRESSION_ALGORITHM, hfileCompressionAlgorithm); + storageConfig.setValue(HFILE_COMPRESSION_ALGORITHM_NAME, hfileCompressionAlgorithm); return this; } public Builder logFileToParquetCompressionRatio(double logFileToParquetCompressionRatio) { - props.setProperty(LOGFILE_TO_PARQUET_COMPRESSION_RATIO, String.valueOf(logFileToParquetCompressionRatio)); + storageConfig.setValue(LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION, String.valueOf(logFileToParquetCompressionRatio)); + return this; + } + + public Builder orcMaxFileSize(long maxFileSize) { + storageConfig.setValue(ORC_FILE_MAX_SIZE, String.valueOf(maxFileSize)); + return this; + } + + public Builder orcStripeSize(int orcStripeSize) { + storageConfig.setValue(ORC_STRIPE_SIZE, String.valueOf(orcStripeSize)); + return this; + } + + public Builder orcBlockSize(int orcBlockSize) { + storageConfig.setValue(ORC_BLOCK_SIZE, String.valueOf(orcBlockSize)); + return this; + } + + public Builder orcCompressionCodec(String orcCompressionCodec) { + storageConfig.setValue(ORC_COMPRESSION_CODEC_NAME, orcCompressionCodec); return this; } public HoodieStorageConfig build() { - HoodieStorageConfig config = new HoodieStorageConfig(props); - setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), PARQUET_FILE_MAX_BYTES, - DEFAULT_PARQUET_FILE_MAX_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), PARQUET_BLOCK_SIZE_BYTES, - DEFAULT_PARQUET_BLOCK_SIZE_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), PARQUET_PAGE_SIZE_BYTES, - DEFAULT_PARQUET_PAGE_SIZE_BYTES); - setDefaultOnCondition(props, !props.containsKey(LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES), - LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES); - setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES), LOGFILE_SIZE_MAX_BYTES, - DEFAULT_LOGFILE_SIZE_MAX_BYTES); - setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO), PARQUET_COMPRESSION_RATIO, - DEFAULT_STREAM_COMPRESSION_RATIO); - setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_CODEC), PARQUET_COMPRESSION_CODEC, - DEFAULT_PARQUET_COMPRESSION_CODEC); - setDefaultOnCondition(props, !props.containsKey(LOGFILE_TO_PARQUET_COMPRESSION_RATIO), - LOGFILE_TO_PARQUET_COMPRESSION_RATIO, DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO); - - setDefaultOnCondition(props, !props.containsKey(HFILE_BLOCK_SIZE_BYTES), HFILE_BLOCK_SIZE_BYTES, - DEFAULT_HFILE_BLOCK_SIZE_BYTES); - 
setDefaultOnCondition(props, !props.containsKey(HFILE_COMPRESSION_ALGORITHM), HFILE_COMPRESSION_ALGORITHM, - DEFAULT_HFILE_COMPRESSION_ALGORITHM); - setDefaultOnCondition(props, !props.containsKey(HFILE_FILE_MAX_BYTES), HFILE_FILE_MAX_BYTES, - DEFAULT_HFILE_FILE_MAX_BYTES); - - return config; + storageConfig.setDefaults(HoodieStorageConfig.class.getName()); + return storageConfig; } } - } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteCommitCallbackConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteCommitCallbackConfig.java index 126e4f01917b0..09c2b09be0321 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteCommitCallbackConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteCommitCallbackConfig.java @@ -17,7 +17,10 @@ package org.apache.hudi.config; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; import java.io.File; import java.io.FileReader; @@ -27,24 +30,94 @@ /** * Write callback related config. */ -public class HoodieWriteCommitCallbackConfig extends DefaultHoodieConfig { +@ConfigClassProperty(name = "Write commit callback configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Controls callback behavior into HTTP endpoints, to push " + + " notifications on commits on hudi tables.") +public class HoodieWriteCommitCallbackConfig extends HoodieConfig { public static final String CALLBACK_PREFIX = "hoodie.write.commit.callback."; - public static final String CALLBACK_ON = CALLBACK_PREFIX + "on"; - public static final boolean DEFAULT_CALLBACK_ON = false; - public static final String CALLBACK_CLASS_PROP = CALLBACK_PREFIX + "class"; - public static final String DEFAULT_CALLBACK_CLASS_PROP = "org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback"; + public static final ConfigProperty TURN_CALLBACK_ON = ConfigProperty + .key(CALLBACK_PREFIX + "on") + .defaultValue(false) + .sinceVersion("0.6.0") + .withDocumentation("Turn commit callback on/off. 
off by default."); + + public static final ConfigProperty CALLBACK_CLASS_NAME = ConfigProperty + .key(CALLBACK_PREFIX + "class") + .defaultValue("org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback") + .sinceVersion("0.6.0") + .withDocumentation("Full path of callback class and must be a subclass of HoodieWriteCommitCallback class, " + + "org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback by default"); // ***** HTTP callback configs ***** - public static final String CALLBACK_HTTP_URL_PROP = CALLBACK_PREFIX + "http.url"; - public static final String CALLBACK_HTTP_API_KEY = CALLBACK_PREFIX + "http.api.key"; - public static final String DEFAULT_CALLBACK_HTTP_API_KEY = "hudi_write_commit_http_callback"; - public static final String CALLBACK_HTTP_TIMEOUT_SECONDS = CALLBACK_PREFIX + "http.timeout.seconds"; - public static final int DEFAULT_CALLBACK_HTTP_TIMEOUT_SECONDS = 3; - - private HoodieWriteCommitCallbackConfig(Properties props) { - super(props); + public static final ConfigProperty CALLBACK_HTTP_URL = ConfigProperty + .key(CALLBACK_PREFIX + "http.url") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Callback host to be sent along with callback messages"); + + public static final ConfigProperty CALLBACK_HTTP_API_KEY_VALUE = ConfigProperty + .key(CALLBACK_PREFIX + "http.api.key") + .defaultValue("hudi_write_commit_http_callback") + .sinceVersion("0.6.0") + .withDocumentation("Http callback API key. hudi_write_commit_http_callback by default"); + + public static final ConfigProperty CALLBACK_HTTP_TIMEOUT_IN_SECONDS = ConfigProperty + .key(CALLBACK_PREFIX + "http.timeout.seconds") + .defaultValue(3) + .sinceVersion("0.6.0") + .withDocumentation("Callback timeout in seconds. 3 by default"); + + /** + * @deprecated Use {@link #TURN_CALLBACK_ON} and its methods instead + */ + @Deprecated + public static final String CALLBACK_ON = TURN_CALLBACK_ON.key(); + /** + * @deprecated Use {@link #TURN_CALLBACK_ON} and its methods instead + */ + @Deprecated + public static final boolean DEFAULT_CALLBACK_ON = TURN_CALLBACK_ON.defaultValue(); + /** + * @deprecated Use {@link #CALLBACK_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String CALLBACK_CLASS_PROP = CALLBACK_CLASS_NAME.key(); + /** + * @deprecated Use {@link #CALLBACK_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CALLBACK_CLASS_PROP = CALLBACK_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #CALLBACK_HTTP_URL} and its methods instead + */ + @Deprecated + public static final String CALLBACK_HTTP_URL_PROP = CALLBACK_HTTP_URL.key(); + /** + * @deprecated Use {@link #CALLBACK_HTTP_API_KEY_VALUE} and its methods instead + */ + @Deprecated + public static final String CALLBACK_HTTP_API_KEY = CALLBACK_HTTP_API_KEY_VALUE.key(); + /** + * @deprecated Use {@link #CALLBACK_HTTP_API_KEY_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_CALLBACK_HTTP_API_KEY = CALLBACK_HTTP_API_KEY_VALUE.defaultValue(); + /** + * @deprecated Use {@link #CALLBACK_HTTP_TIMEOUT_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final String CALLBACK_HTTP_TIMEOUT_SECONDS = CALLBACK_HTTP_TIMEOUT_IN_SECONDS.key(); + /** + * @deprecated Use {@link #CALLBACK_HTTP_TIMEOUT_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final int DEFAULT_CALLBACK_HTTP_TIMEOUT_SECONDS = CALLBACK_HTTP_TIMEOUT_IN_SECONDS.defaultValue(); + + private HoodieWriteCommitCallbackConfig() { + super(); } public 
static HoodieWriteCommitCallbackConfig.Builder newBuilder() { @@ -53,54 +126,48 @@ public static HoodieWriteCommitCallbackConfig.Builder newBuilder() { public static class Builder { - private final Properties props = new Properties(); + private final HoodieWriteCommitCallbackConfig writeCommitCallbackConfig = new HoodieWriteCommitCallbackConfig(); public HoodieWriteCommitCallbackConfig.Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.writeCommitCallbackConfig.getProps().load(reader); return this; } } public HoodieWriteCommitCallbackConfig.Builder fromProperties(Properties props) { - this.props.putAll(props); + this.writeCommitCallbackConfig.getProps().putAll(props); return this; } public HoodieWriteCommitCallbackConfig.Builder writeCommitCallbackOn(String callbackOn) { - props.setProperty(CALLBACK_ON, callbackOn); + writeCommitCallbackConfig.setValue(TURN_CALLBACK_ON, callbackOn); return this; } public HoodieWriteCommitCallbackConfig.Builder withCallbackClass(String callbackClass) { - props.setProperty(CALLBACK_CLASS_PROP, callbackClass); + writeCommitCallbackConfig.setValue(CALLBACK_CLASS_NAME, callbackClass); return this; } public HoodieWriteCommitCallbackConfig.Builder withCallbackHttpUrl(String url) { - props.setProperty(CALLBACK_HTTP_URL_PROP, url); + writeCommitCallbackConfig.setValue(CALLBACK_HTTP_URL, url); return this; } public Builder withCallbackHttpTimeoutSeconds(String timeoutSeconds) { - props.setProperty(CALLBACK_HTTP_TIMEOUT_SECONDS, timeoutSeconds); + writeCommitCallbackConfig.setValue(CALLBACK_HTTP_TIMEOUT_IN_SECONDS, timeoutSeconds); return this; } public Builder withCallbackHttpApiKey(String apiKey) { - props.setProperty(CALLBACK_HTTP_API_KEY, apiKey); + writeCommitCallbackConfig.setValue(CALLBACK_HTTP_API_KEY_VALUE, apiKey); return this; } public HoodieWriteCommitCallbackConfig build() { - HoodieWriteCommitCallbackConfig config = new HoodieWriteCommitCallbackConfig(props); - setDefaultOnCondition(props, !props.containsKey(CALLBACK_ON), CALLBACK_ON, String.valueOf(DEFAULT_CALLBACK_ON)); - setDefaultOnCondition(props, !props.containsKey(CALLBACK_CLASS_PROP), CALLBACK_CLASS_PROP, DEFAULT_CALLBACK_CLASS_PROP); - setDefaultOnCondition(props, !props.containsKey(CALLBACK_HTTP_API_KEY), CALLBACK_HTTP_API_KEY, DEFAULT_CALLBACK_HTTP_API_KEY); - setDefaultOnCondition(props, !props.containsKey(CALLBACK_HTTP_TIMEOUT_SECONDS), CALLBACK_HTTP_TIMEOUT_SECONDS, - String.valueOf(DEFAULT_CALLBACK_HTTP_TIMEOUT_SECONDS)); - - return config; + writeCommitCallbackConfig.setDefaults(HoodieWriteCommitCallbackConfig.class.getName()); + return writeCommitCallbackConfig; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 8c22cab3067e0..19ee7ff21464d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -18,22 +18,61 @@ package org.apache.hudi.config; -import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.bootstrap.BootstrapMode; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.client.transaction.ConflictResolutionStrategy; +import 
org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.HoodieMetastoreConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.ConsistencyGuardConfig; -import org.apache.hudi.client.common.EngineType; +import org.apache.hudi.common.fs.FileSystemRetryConfig; import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.metrics.HoodieMetricsCloudWatchConfig; +import org.apache.hudi.config.metrics.HoodieMetricsConfig; +import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig; +import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; +import org.apache.hudi.config.metrics.HoodieMetricsPrometheusConfig; +import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.SimpleAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.keygen.constant.KeyGeneratorType; import org.apache.hudi.metrics.MetricsReporterType; import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite; +import org.apache.hudi.table.RandomFileIdPrefixProvider; +import org.apache.hudi.table.action.clean.CleaningTriggerStrategy; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; +import org.apache.hudi.table.action.compact.CompactionTriggerStrategy; import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; +import org.apache.hudi.table.storage.HoodieStorageLayout; +import org.apache.hadoop.hbase.io.compress.Compression; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.orc.CompressionKind; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import javax.annotation.concurrent.Immutable; @@ -50,77 +89,354 @@ import java.util.function.Supplier; import java.util.stream.Collectors; +import static org.apache.hudi.config.HoodieCleanConfig.CLEANER_POLICY; + /** * Class storing configs for the HoodieWriteClient. 
*/ @Immutable -public class HoodieWriteConfig extends DefaultHoodieConfig { +@ConfigClassProperty(name = "Write Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control write behavior on Hudi tables. These can be directly passed down from even " + + "higher level frameworks (e.g Spark datasources, Flink sink) and utilities (e.g DeltaStreamer).") +public class HoodieWriteConfig extends HoodieConfig { + private static final Logger LOG = LogManager.getLogger(HoodieWriteConfig.class); private static final long serialVersionUID = 0L; - public static final String TABLE_NAME = "hoodie.table.name"; - public static final String DEFAULT_ROLLBACK_USING_MARKERS = "false"; - public static final String ROLLBACK_USING_MARKERS = "hoodie.rollback.using.markers"; - public static final String TIMELINE_LAYOUT_VERSION = "hoodie.timeline.layout.version"; - public static final String BASE_PATH_PROP = "hoodie.base.path"; - public static final String AVRO_SCHEMA = "hoodie.avro.schema"; - public static final String AVRO_SCHEMA_VALIDATE = "hoodie.avro.schema.validate"; - public static final String DEFAULT_AVRO_SCHEMA_VALIDATE = "false"; - public static final String DEFAULT_PARALLELISM = "1500"; - public static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism"; - public static final String BULKINSERT_PARALLELISM = "hoodie.bulkinsert.shuffle.parallelism"; - public static final String BULKINSERT_USER_DEFINED_PARTITIONER_CLASS = "hoodie.bulkinsert.user.defined.partitioner.class"; - public static final String UPSERT_PARALLELISM = "hoodie.upsert.shuffle.parallelism"; - public static final String DELETE_PARALLELISM = "hoodie.delete.shuffle.parallelism"; - public static final String DEFAULT_ROLLBACK_PARALLELISM = "100"; - public static final String ROLLBACK_PARALLELISM = "hoodie.rollback.parallelism"; - public static final String WRITE_BUFFER_LIMIT_BYTES = "hoodie.write.buffer.limit.bytes"; - public static final String DEFAULT_WRITE_BUFFER_LIMIT_BYTES = String.valueOf(4 * 1024 * 1024); - public static final String COMBINE_BEFORE_INSERT_PROP = "hoodie.combine.before.insert"; - public static final String DEFAULT_COMBINE_BEFORE_INSERT = "false"; - public static final String COMBINE_BEFORE_UPSERT_PROP = "hoodie.combine.before.upsert"; - public static final String DEFAULT_COMBINE_BEFORE_UPSERT = "true"; - public static final String COMBINE_BEFORE_DELETE_PROP = "hoodie.combine.before.delete"; - public static final String DEFAULT_COMBINE_BEFORE_DELETE = "true"; - public static final String WRITE_STATUS_STORAGE_LEVEL = "hoodie.write.status.storage.level"; - public static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; - public static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; - public static final String DEFAULT_HOODIE_AUTO_COMMIT = "true"; - public static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date.partitioning"; - public static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false"; - public static final String HOODIE_WRITE_STATUS_CLASS_PROP = "hoodie.writestatus.class"; - public static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WriteStatus.class.getName(); - public static final String FINALIZE_WRITE_PARALLELISM = "hoodie.finalize.write.parallelism"; - public static final String DEFAULT_FINALIZE_WRITE_PARALLELISM = DEFAULT_PARALLELISM; - public static final String MARKERS_DELETE_PARALLELISM = "hoodie.markers.delete.parallelism"; - public static final String 
DEFAULT_MARKERS_DELETE_PARALLELISM = "100"; - public static final String BULKINSERT_SORT_MODE = "hoodie.bulkinsert.sort.mode"; - public static final String DEFAULT_BULKINSERT_SORT_MODE = BulkInsertSortMode.GLOBAL_SORT - .toString(); - - public static final String EMBEDDED_TIMELINE_SERVER_ENABLED = "hoodie.embed.timeline.server"; - public static final String DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED = "true"; - public static final String EMBEDDED_TIMELINE_SERVER_PORT = "hoodie.embed.timeline.server.port"; - public static final String DEFAULT_EMBEDDED_TIMELINE_SERVER_PORT = "0"; - - public static final String FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP = "hoodie.fail.on.timeline.archiving"; - public static final String DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED = "true"; - // time between successive attempts to ensure written data's metadata is consistent on storage - public static final String INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP = - "hoodie.consistency.check.initial_interval_ms"; - public static long DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS = 2000L; - - // max interval time - public static final String MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP = "hoodie.consistency.check.max_interval_ms"; - public static long DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS = 300000L; - - // maximum number of checks, for consistency of written data. Will wait upto 256 Secs - public static final String MAX_CONSISTENCY_CHECKS_PROP = "hoodie.consistency.check.max_checks"; - public static int DEFAULT_MAX_CONSISTENCY_CHECKS = 7; - - // Data validation check performed during merges before actual commits - private static final String MERGE_DATA_VALIDATION_CHECK_ENABLED = "hoodie.merge.data.validation.enabled"; - private static final String DEFAULT_MERGE_DATA_VALIDATION_CHECK_ENABLED = "false"; + // This is a constant as is should never be changed via config (will invalidate previous commits) + // It is here so that both the client and deltastreamer use the same reference + public static final String DELTASTREAMER_CHECKPOINT_KEY = "deltastreamer.checkpoint.key"; + + public static final ConfigProperty TBL_NAME = ConfigProperty + .key(HoodieTableConfig.HOODIE_TABLE_NAME_KEY) + .noDefaultValue() + .withDocumentation("Table name that will be used for registering with metastores like HMS. Needs to be same across runs."); + + public static final ConfigProperty PRECOMBINE_FIELD_NAME = ConfigProperty + .key("hoodie.datasource.write.precombine.field") + .defaultValue("ts") + .withDocumentation("Field used in preCombining before actual write. When two records have the same key value, " + + "we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)"); + + public static final ConfigProperty WRITE_PAYLOAD_CLASS_NAME = ConfigProperty + .key("hoodie.datasource.write.payload.class") + .defaultValue(OverwriteWithLatestAvroPayload.class.getName()) + .withDocumentation("Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. 
" + + "This will render any value set for PRECOMBINE_FIELD_OPT_VAL in-effective"); + + public static final ConfigProperty KEYGENERATOR_CLASS_NAME = ConfigProperty + .key("hoodie.datasource.write.keygenerator.class") + .noDefaultValue() + .withDocumentation("Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator` " + + "extract a key out of incoming records."); + + public static final ConfigProperty KEYGENERATOR_TYPE = ConfigProperty + .key("hoodie.datasource.write.keygenerator.type") + .defaultValue(KeyGeneratorType.SIMPLE.name()) + .withDocumentation("Easily configure one the built-in key generators, instead of specifying the key generator class." + + "Currently supports SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE"); + + public static final ConfigProperty ROLLBACK_USING_MARKERS_ENABLE = ConfigProperty + .key("hoodie.rollback.using.markers") + .defaultValue("true") + .withDocumentation("Enables a more efficient mechanism for rollbacks based on the marker files generated " + + "during the writes. Turned on by default."); + + public static final ConfigProperty TIMELINE_LAYOUT_VERSION_NUM = ConfigProperty + .key("hoodie.timeline.layout.version") + .defaultValue(Integer.toString(TimelineLayoutVersion.VERSION_1)) + .sinceVersion("0.5.1") + .withDocumentation("Controls the layout of the timeline. Version 0 relied on renames, Version 1 (default) models " + + "the timeline as an immutable log relying only on atomic writes for object storage."); + + public static final ConfigProperty BASE_FILE_FORMAT = ConfigProperty + .key("hoodie.table.base.file.format") + .defaultValue(HoodieFileFormat.PARQUET) + .withAlternatives("hoodie.table.ro.file.format") + .withDocumentation("Base file format to store all the base file data."); + + public static final ConfigProperty BASE_PATH = ConfigProperty + .key("hoodie.base.path") + .noDefaultValue() + .withDocumentation("Base path on lake storage, under which all the table data is stored. " + + "Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). " + + "Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs " + + "etc in .hoodie directory under this base path directory."); + + public static final ConfigProperty AVRO_SCHEMA_STRING = ConfigProperty + .key("hoodie.avro.schema") + .noDefaultValue() + .withDocumentation("Schema string representing the current write schema of the table. Hudi passes this to " + + "implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema " + + "evolving records during an update."); + + public static final ConfigProperty INTERNAL_SCHEMA_STRING = ConfigProperty + .key("hoodie.internal.schema") + .noDefaultValue() + .withDocumentation("Schema string representing the latest schema of the table. 
Hudi passes this to " + + "implementations of evolution of schema"); + + public static final ConfigProperty ENABLE_INTERNAL_SCHEMA_CACHE = ConfigProperty + .key("hoodie.schema.cache.enable") + .defaultValue(false) + .withDocumentation("cache query internalSchemas in driver/executor side"); + + public static final ConfigProperty AVRO_SCHEMA_VALIDATE_ENABLE = ConfigProperty + .key("hoodie.avro.schema.validate") + .defaultValue("false") + .withDocumentation("Validate the schema used for the write against the latest schema, for backwards compatibility."); + + public static final ConfigProperty INSERT_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.insert.shuffle.parallelism") + .defaultValue("200") + .withDocumentation("Parallelism for inserting records into the table. Inserts can shuffle data before writing to tune file sizes and optimize the storage layout."); + + public static final ConfigProperty BULKINSERT_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.bulkinsert.shuffle.parallelism") + .defaultValue("200") + .withDocumentation("For large initial imports using bulk_insert operation, controls the parallelism to use for sort modes or custom partitioning done" + + "before writing records to the table."); + + public static final ConfigProperty BULKINSERT_USER_DEFINED_PARTITIONER_SORT_COLUMNS = ConfigProperty + .key("hoodie.bulkinsert.user.defined.partitioner.sort.columns") + .noDefaultValue() + .withDocumentation("Columns to sort the data by when use org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner as user defined partitioner during bulk_insert. " + + "For example 'column1,column2'"); + + public static final ConfigProperty BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME = ConfigProperty + .key("hoodie.bulkinsert.user.defined.partitioner.class") + .noDefaultValue() + .withDocumentation("If specified, this class will be used to re-partition records before they are bulk inserted. This can be used to sort, pack, cluster data" + + " optimally for common query patterns. For now we support a build-in user defined bulkinsert partitioner org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner" + + " which can does sorting based on specified column values set by " + BULKINSERT_USER_DEFINED_PARTITIONER_SORT_COLUMNS.key()); + + public static final ConfigProperty UPSERT_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.upsert.shuffle.parallelism") + .defaultValue("200") + .withDocumentation("Parallelism to use for upsert operation on the table. Upserts can shuffle data to perform index lookups, file sizing, bin packing records optimally" + + "into file groups."); + + public static final ConfigProperty DELETE_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.delete.shuffle.parallelism") + .defaultValue("200") + .withDocumentation("Parallelism used for “delete” operation. Delete operations also performs shuffles, similar to upsert operation."); + + public static final ConfigProperty ROLLBACK_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.rollback.parallelism") + .defaultValue("100") + .withDocumentation("Parallelism for rollback of commits. 
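// A brief sketch of supplying the core write options above through their new ConfigProperty
// keys. The table name, base path, and field values are placeholder assumptions; the keys and
// defaults are the ones defined in this hunk.
Properties writeProps = new Properties();
writeProps.setProperty(HoodieWriteConfig.TBL_NAME.key(), "trips");                     // hoodie.table.name
writeProps.setProperty(HoodieWriteConfig.BASE_PATH.key(), "s3://bucket/hudi/trips");   // hoodie.base.path
writeProps.setProperty(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts");           // hoodie.datasource.write.precombine.field, default "ts"
writeProps.setProperty(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key(), "200");       // hoodie.upsert.shuffle.parallelism, default "200"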
Rollbacks perform delete of files or logging delete blocks to file groups on storage in parallel."); + + public static final ConfigProperty WRITE_BUFFER_LIMIT_BYTES_VALUE = ConfigProperty + .key("hoodie.write.buffer.limit.bytes") + .defaultValue(String.valueOf(4 * 1024 * 1024)) + .withDocumentation("Size of in-memory buffer used for parallelizing network reads and lake storage writes."); + + public static final ConfigProperty COMBINE_BEFORE_INSERT = ConfigProperty + .key("hoodie.combine.before.insert") + .defaultValue("false") + .withDocumentation("When inserted records share same key, controls whether they should be first combined (i.e de-duplicated) before" + + " writing to storage."); + + public static final ConfigProperty COMBINE_BEFORE_UPSERT = ConfigProperty + .key("hoodie.combine.before.upsert") + .defaultValue("true") + .withDocumentation("When upserted records share same key, controls whether they should be first combined (i.e de-duplicated) before" + + " writing to storage. This should be turned off only if you are absolutely certain that there are no duplicates incoming, " + + " otherwise it can lead to duplicate keys and violate the uniqueness guarantees."); + + public static final ConfigProperty COMBINE_BEFORE_DELETE = ConfigProperty + .key("hoodie.combine.before.delete") + .defaultValue("true") + .withDocumentation("During delete operations, controls whether we should combine deletes (and potentially also upserts) before " + + " writing to storage."); + + public static final ConfigProperty WRITE_STATUS_STORAGE_LEVEL_VALUE = ConfigProperty + .key("hoodie.write.status.storage.level") + .defaultValue("MEMORY_AND_DISK_SER") + .withDocumentation("Write status objects hold metadata about a write (stats, errors), that is not yet committed to storage. " + + "This controls the how that information is cached for inspection by clients. We rarely expect this to be changed."); + + public static final ConfigProperty AUTO_COMMIT_ENABLE = ConfigProperty + .key("hoodie.auto.commit") + .defaultValue("true") + .withDocumentation("Controls whether a write operation should auto commit. This can be turned off to perform inspection" + + " of the uncommitted write before deciding to commit."); + + public static final ConfigProperty WRITE_STATUS_CLASS_NAME = ConfigProperty + .key("hoodie.writestatus.class") + .defaultValue(WriteStatus.class.getName()) + .withDocumentation("Subclass of " + WriteStatus.class.getName() + " to be used to collect information about a write. Can be " + + "overridden to collection additional metrics/statistics about the data if needed."); + + public static final ConfigProperty FINALIZE_WRITE_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.finalize.write.parallelism") + .defaultValue("200") + .withDocumentation("Parallelism for the write finalization internal operation, which involves removing any partially written " + + "files from lake storage, before committing the write. Reduce this value, if the high number of tasks incur delays for smaller tables " + + "or low latency writes."); + + public static final ConfigProperty MARKERS_TYPE = ConfigProperty + .key("hoodie.write.markers.type") + .defaultValue(MarkerType.TIMELINE_SERVER_BASED.toString()) + .sinceVersion("0.9.0") + .withDocumentation("Marker type to use. Two modes are supported: " + + "- DIRECT: individual marker file corresponding to each data file is directly " + + "created by the writer. " + + "- TIMELINE_SERVER_BASED: marker operations are all handled at the timeline service " + + "which serves as a proxy. 
New marker entries are batch processed and stored " + + "in a limited number of underlying files for efficiency. If HDFS is used or " + + "timeline server is disabled, DIRECT markers are used as fallback even if this " + + "is configure. For Spark structured streaming, this configuration does not " + + "take effect, i.e., DIRECT markers are always used for Spark structured streaming."); + + public static final ConfigProperty MARKERS_TIMELINE_SERVER_BASED_BATCH_NUM_THREADS = ConfigProperty + .key("hoodie.markers.timeline_server_based.batch.num_threads") + .defaultValue(20) + .sinceVersion("0.9.0") + .withDocumentation("Number of threads to use for batch processing marker " + + "creation requests at the timeline server"); + + public static final ConfigProperty MARKERS_TIMELINE_SERVER_BASED_BATCH_INTERVAL_MS = ConfigProperty + .key("hoodie.markers.timeline_server_based.batch.interval_ms") + .defaultValue(50L) + .sinceVersion("0.9.0") + .withDocumentation("The batch interval in milliseconds for marker creation batch processing"); + + public static final ConfigProperty MARKERS_DELETE_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.markers.delete.parallelism") + .defaultValue("100") + .withDocumentation("Determines the parallelism for deleting marker files, which are used to track all files (valid or invalid/partial) written during " + + "a write operation. Increase this value if delays are observed, with large batch writes."); + + public static final ConfigProperty BULK_INSERT_SORT_MODE = ConfigProperty + .key("hoodie.bulkinsert.sort.mode") + .defaultValue(BulkInsertSortMode.NONE.toString()) + .withDocumentation("Sorting modes to use for sorting records for bulk insert. This is use when user " + + BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME.key() + "is not configured. Available values are - " + + "GLOBAL_SORT: this ensures best file sizes, with lowest memory overhead at cost of sorting. " + + "PARTITION_SORT: Strikes a balance by only sorting within a partition, still keeping the memory overhead of writing " + + "lowest and best effort file sizing. " + + "PARTITION_PATH_REPARTITION: this ensures that the data for a single physical partition in the table is written " + + "by the same Spark executor, best for input data evenly distributed across different partition paths. " + + "This can cause imbalance among Spark executors if the input data is skewed, i.e., most records are intended for " + + "a handful of partition paths among all. " + + "PARTITION_PATH_REPARTITION_AND_SORT: this ensures that the data for a single physical partition in the table is written " + + "by the same Spark executor, best for input data evenly distributed across different partition paths. " + + "Compared to PARTITION_PATH_REPARTITION, this sort mode does an additional step of sorting the records " + + "based on the partition path within a single Spark partition, given that data for multiple physical partitions " + + "can be sent to the same Spark partition and executor. " + + "This can cause imbalance among Spark executors if the input data is skewed, i.e., most records are intended for " + + "a handful of partition paths among all. " + + "NONE: No sorting. 
Fastest and matches `spark.write.parquet()` in terms of number of files, overheads"); + + public static final ConfigProperty EMBEDDED_TIMELINE_SERVER_ENABLE = ConfigProperty + .key("hoodie.embed.timeline.server") + .defaultValue("true") + .withDocumentation("When true, spins up an instance of the timeline server (meta server that serves cached file listings, statistics)," + + "running on each writer's driver process, accepting requests during the write from executors."); + + public static final ConfigProperty EMBEDDED_TIMELINE_SERVER_REUSE_ENABLED = ConfigProperty + .key("hoodie.embed.timeline.server.reuse.enabled") + .defaultValue(false) + .withDocumentation("Controls whether the timeline server instance should be cached and reused across the JVM (across task lifecycles)" + + "to avoid startup costs. This should rarely be changed."); + + public static final ConfigProperty EMBEDDED_TIMELINE_SERVER_PORT_NUM = ConfigProperty + .key("hoodie.embed.timeline.server.port") + .defaultValue("0") + .withDocumentation("Port at which the timeline server listens for requests. When running embedded in each writer, it picks " + + "a free port and communicates to all the executors. This should rarely be changed."); + + public static final ConfigProperty EMBEDDED_TIMELINE_NUM_SERVER_THREADS = ConfigProperty + .key("hoodie.embed.timeline.server.threads") + .defaultValue("-1") + .withDocumentation("Number of threads to serve requests in the timeline server. By default, auto configured based on the number of underlying cores."); + + public static final ConfigProperty EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE = ConfigProperty + .key("hoodie.embed.timeline.server.gzip") + .defaultValue("true") + .withDocumentation("Controls whether gzip compression is used, for large responses from the timeline server, to improve latency."); + + public static final ConfigProperty EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE = ConfigProperty + .key("hoodie.embed.timeline.server.async") + .defaultValue("false") + .withDocumentation("Controls whether or not, the requests to the timeline server are processed in asynchronous fashion, " + + "potentially improving throughput."); + + public static final ConfigProperty FAIL_ON_TIMELINE_ARCHIVING_ENABLE = ConfigProperty + .key("hoodie.fail.on.timeline.archiving") + .defaultValue("true") + .withDocumentation("Timeline archiving removes older instants from the timeline, after each write operation, to minimize metadata overhead. " + + "Controls whether or not, the write should be failed as well, if such archiving fails."); + + public static final ConfigProperty FAIL_ON_INLINE_TABLE_SERVICE_EXCEPTION = ConfigProperty + .key("hoodie.fail.writes.on.inline.table.service.exception") + .defaultValue("true") + .withDocumentation("Table services such as compaction and clustering can fail and prevent syncing to " + + "the metaclient. Set this to true to fail writes when table services fail"); + + public static final ConfigProperty INITIAL_CONSISTENCY_CHECK_INTERVAL_MS = ConfigProperty + .key("hoodie.consistency.check.initial_interval_ms") + .defaultValue(2000L) + .withDocumentation("Initial time between successive attempts to ensure written data's metadata is consistent on storage. 
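// To make the marker and embedded-timeline-server semantics above concrete, a small sketch.
// Only the key strings and the allowed values come from this hunk; choosing them is illustrative.
Properties markerProps = new Properties();
markerProps.setProperty(HoodieWriteConfig.MARKERS_TYPE.key(), "DIRECT");                     // hoodie.write.markers.type: DIRECT or TIMELINE_SERVER_BASED
markerProps.setProperty(HoodieWriteConfig.EMBEDDED_TIMELINE_SERVER_ENABLE.key(), "false");   // hoodie.embed.timeline.server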
Grows with exponential" + + " backoff after the initial value."); + + public static final ConfigProperty MAX_CONSISTENCY_CHECK_INTERVAL_MS = ConfigProperty + .key("hoodie.consistency.check.max_interval_ms") + .defaultValue(300000L) + .withDocumentation("Max time to wait between successive attempts at performing consistency checks"); + + public static final ConfigProperty MAX_CONSISTENCY_CHECKS = ConfigProperty + .key("hoodie.consistency.check.max_checks") + .defaultValue(7) + .withDocumentation("Maximum number of checks, for consistency of written data."); + + public static final ConfigProperty MERGE_DATA_VALIDATION_CHECK_ENABLE = ConfigProperty + .key("hoodie.merge.data.validation.enabled") + .defaultValue("false") + .withDocumentation("When enabled, data validation checks are performed during merges to ensure expected " + + "number of records after merge operation."); + + public static final ConfigProperty MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE = ConfigProperty + .key("hoodie.merge.allow.duplicate.on.inserts") + .defaultValue("false") + .withDocumentation("When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing)." + + " This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained."); + + public static final ConfigProperty MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT = ConfigProperty + .key("hoodie.merge.small.file.group.candidates.limit") + .defaultValue(1) + .withDocumentation("Limits number of file groups, whose base file satisfies small-file limit, to consider for appending records during upsert operation. " + + "Only applicable to MOR tables"); + + public static final ConfigProperty CLIENT_HEARTBEAT_INTERVAL_IN_MS = ConfigProperty + .key("hoodie.client.heartbeat.interval_in_ms") + .defaultValue(60 * 1000) + .withDocumentation("Writers perform heartbeats to indicate liveness. Controls how often (in ms), such heartbeats are registered to lake storage."); + + public static final ConfigProperty CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES = ConfigProperty + .key("hoodie.client.heartbeat.tolerable.misses") + .defaultValue(2) + .withDocumentation("Number of heartbeat misses, before a writer is deemed not alive and all pending writes are aborted."); + + public static final ConfigProperty WRITE_CONCURRENCY_MODE = ConfigProperty + .key("hoodie.write.concurrency.mode") + .defaultValue(WriteConcurrencyMode.SINGLE_WRITER.name()) + .withDocumentation("Enable different concurrency modes. Options are " + + "SINGLE_WRITER: Only one active writer to the table. Maximizes throughput" + + "OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table and exactly one of them succeed " + + "if a conflict (writes affect the same file group) is detected."); + + /** + * Currently the use this to specify the write schema. + */ + public static final ConfigProperty WRITE_SCHEMA = ConfigProperty + .key("hoodie.write.schema") + .noDefaultValue() + .withDocumentation("The specified write schema. In most case, we do not need set this parameter," + + " but for the case the write schema is not equal to the specified table schema, we can" + + " specify the write schema by this parameter. 
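// Illustrative sketch of the consistency-check retry semantics documented above (not the
// actual ConsistencyGuard implementation): start at the initial interval, back off
// exponentially up to the max interval, and stop after the configured number of checks.
// With the defaults (2000 ms initial, 300000 ms cap, 7 checks) the total wait is roughly
// 2 + 4 + 8 + 16 + 32 + 64 + 128 = 254 seconds, in line with the removed "upto 256 Secs" comment.
long intervalMs = 2000L;        // hoodie.consistency.check.initial_interval_ms default
long maxIntervalMs = 300000L;   // hoodie.consistency.check.max_interval_ms default
int maxChecks = 7;              // hoodie.consistency.check.max_checks default
for (int check = 0; check < maxChecks; check++) {
  // ... perform the metadata consistency check here; break out once it passes ...
  try {
    Thread.sleep(intervalMs);
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    break;
  }
  intervalMs = Math.min(intervalMs * 2, maxIntervalMs);
}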
Used by MergeIntoHoodieTableCommand"); /** * HUDI-858 : There are users who had been directly using RDD APIs and have relied on a behavior in 0.4.x to allow @@ -132,27 +448,450 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { * Given the importance of supporting such cases for the user's migration to 0.5.x, we are proposing a safety flag * (disabled by default) which will allow this old behavior. */ - public static final String ALLOW_MULTI_WRITE_ON_SAME_INSTANT = - "_.hoodie.allow.multi.write.on.same.instant"; - public static final String DEFAULT_ALLOW_MULTI_WRITE_ON_SAME_INSTANT = "false"; - - public static final String EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION = AVRO_SCHEMA + ".externalTransformation"; - public static final String DEFAULT_EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION = "false"; + public static final ConfigProperty ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE = ConfigProperty + .key("_.hoodie.allow.multi.write.on.same.instant") + .defaultValue("false") + .withDocumentation(""); + + public static final ConfigProperty AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE = ConfigProperty + .key(AVRO_SCHEMA_STRING.key() + ".external.transformation") + .defaultValue("false") + .withAlternatives(AVRO_SCHEMA_STRING.key() + ".externalTransformation") + .withDocumentation("When enabled, records in older schema are rewritten into newer schema during upsert,delete and background" + + " compaction,clustering operations."); + + public static final ConfigProperty ALLOW_EMPTY_COMMIT = ConfigProperty + .key("hoodie.allow.empty.commit") + .defaultValue(true) + .withDocumentation("Whether to allow generation of empty commits, even if no data was written in the commit. " + + "It's useful in cases where extra metadata needs to be published regardless e.g tracking source offsets when ingesting data"); + + public static final ConfigProperty ALLOW_OPERATION_METADATA_FIELD = ConfigProperty + .key("hoodie.allow.operation.metadata.field") + .defaultValue(false) + .sinceVersion("0.9.0") + .withDocumentation("Whether to include '_hoodie_operation' in the metadata fields. 
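// Illustrative only: opting into multi-writer mode through the concurrency key defined above.
// The default remains SINGLE_WRITER; the value below is one of the documented options.
Properties ccProps = new Properties();
ccProps.setProperty(HoodieWriteConfig.WRITE_CONCURRENCY_MODE.key(), "OPTIMISTIC_CONCURRENCY_CONTROL"); // hoodie.write.concurrency.mode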
" + + "Once enabled, all the changes of a record are persisted to the delta log directly without merge"); + + public static final ConfigProperty FILEID_PREFIX_PROVIDER_CLASS = ConfigProperty + .key("hoodie.fileid.prefix.provider.class") + .defaultValue(RandomFileIdPrefixProvider.class.getName()) + .sinceVersion("0.10.0") + .withDocumentation("File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider`"); + + public static final ConfigProperty TABLE_SERVICES_ENABLED = ConfigProperty + .key("hoodie.table.services.enabled") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("Master control to disable all table services including archive, clean, compact, cluster, etc."); + + public static final ConfigProperty RELEASE_RESOURCE_ENABLE = ConfigProperty + .key("hoodie.release.resource.on.completion.enable") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("Control to enable release all persist rdds when the spark job finish."); + + public static final ConfigProperty AUTO_ADJUST_LOCK_CONFIGS = ConfigProperty + .key("hoodie.auto.adjust.lock.configs") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("Auto adjust lock configurations when metadata table is enabled and for async table services."); + + public static final ConfigProperty SKIP_DEFAULT_PARTITION_VALIDATION = ConfigProperty + .key("hoodie.skip.default.partition.validation") + .defaultValue(false) + .sinceVersion("0.12.0") + .withDocumentation("When table is upgraded from pre 0.12 to 0.12, we check for \"default\" partition and fail if found one. " + + "Users are expected to rewrite the data in those partitions. Enabling this config will bypass this validation"); private ConsistencyGuardConfig consistencyGuardConfig; + private FileSystemRetryConfig fileSystemRetryConfig; // Hoodie Write Client transparently rewrites File System View config when embedded mode is enabled // We keep track of original config and rewritten config private final FileSystemViewStorageConfig clientSpecifiedViewStorageConfig; private FileSystemViewStorageConfig viewStorageConfig; - + private HoodiePayloadConfig hoodiePayloadConfig; + private HoodieMetadataConfig metadataConfig; + private HoodieMetastoreConfig metastoreConfig; + private HoodieCommonConfig commonConfig; private EngineType engineType; + /** + * @deprecated Use {@link #TBL_NAME} and its methods instead + */ + @Deprecated + public static final String TABLE_NAME = TBL_NAME.key(); + /** + * @deprecated Use {@link #PRECOMBINE_FIELD_NAME} and its methods instead + */ + @Deprecated + public static final String PRECOMBINE_FIELD_PROP = PRECOMBINE_FIELD_NAME.key(); + /** + * @deprecated Use {@link #WRITE_PAYLOAD_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String WRITE_PAYLOAD_CLASS = WRITE_PAYLOAD_CLASS_NAME.key(); + /** + * @deprecated Use {@link #WRITE_PAYLOAD_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_WRITE_PAYLOAD_CLASS = WRITE_PAYLOAD_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #KEYGENERATOR_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String KEYGENERATOR_CLASS_PROP = KEYGENERATOR_CLASS_NAME.key(); + /** + * @deprecated Use {@link #KEYGENERATOR_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_KEYGENERATOR_CLASS = SimpleAvroKeyGenerator.class.getName(); + /** + * @deprecated Use {@link #ROLLBACK_USING_MARKERS_ENABLE} and its methods instead + */ + 
@Deprecated + public static final String DEFAULT_ROLLBACK_USING_MARKERS = ROLLBACK_USING_MARKERS_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #ROLLBACK_USING_MARKERS_ENABLE} and its methods instead + */ + @Deprecated + public static final String ROLLBACK_USING_MARKERS = ROLLBACK_USING_MARKERS_ENABLE.key(); + /** + * @deprecated Use {@link #TIMELINE_LAYOUT_VERSION_NUM} and its methods instead + */ + @Deprecated + public static final String TIMELINE_LAYOUT_VERSION = TIMELINE_LAYOUT_VERSION_NUM.key(); + /** + * @deprecated Use {@link #BASE_PATH} and its methods instead + */ + @Deprecated + public static final String BASE_PATH_PROP = BASE_PATH.key(); + /** + * @deprecated Use {@link #AVRO_SCHEMA_STRING} and its methods instead + */ + @Deprecated + public static final String AVRO_SCHEMA = AVRO_SCHEMA_STRING.key(); + /** + * @deprecated Use {@link #AVRO_SCHEMA_VALIDATE_ENABLE} and its methods instead + */ + @Deprecated + public static final String AVRO_SCHEMA_VALIDATE = AVRO_SCHEMA_VALIDATE_ENABLE.key(); + /** + * @deprecated Use {@link #AVRO_SCHEMA_VALIDATE_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_AVRO_SCHEMA_VALIDATE = AVRO_SCHEMA_VALIDATE_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #INSERT_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PARALLELISM = INSERT_PARALLELISM_VALUE.defaultValue(); + /** + * @deprecated Use {@link #INSERT_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String INSERT_PARALLELISM = INSERT_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #BULKINSERT_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String BULKINSERT_PARALLELISM = BULKINSERT_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String BULKINSERT_USER_DEFINED_PARTITIONER_CLASS = BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME.key(); + @Deprecated + public static final String BULKINSERT_INPUT_DATA_SCHEMA_DDL = "hoodie.bulkinsert.schema.ddl"; + /** + * @deprecated Use {@link #UPSERT_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String UPSERT_PARALLELISM = UPSERT_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #DELETE_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String DELETE_PARALLELISM = DELETE_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #ROLLBACK_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_ROLLBACK_PARALLELISM = ROLLBACK_PARALLELISM_VALUE.defaultValue(); + /** + * @deprecated Use {@link #ROLLBACK_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String ROLLBACK_PARALLELISM = ROLLBACK_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #WRITE_BUFFER_LIMIT_BYTES_VALUE} and its methods instead + */ + @Deprecated + public static final String WRITE_BUFFER_LIMIT_BYTES = WRITE_BUFFER_LIMIT_BYTES_VALUE.key(); + /** + * @deprecated Use {@link #WRITE_BUFFER_LIMIT_BYTES_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_WRITE_BUFFER_LIMIT_BYTES = WRITE_BUFFER_LIMIT_BYTES_VALUE.defaultValue(); + /** + * @deprecated Use {@link #COMBINE_BEFORE_INSERT} and its methods instead + */ + @Deprecated + public static final String COMBINE_BEFORE_INSERT_PROP = COMBINE_BEFORE_INSERT.key(); + 
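// The deprecated aliases in this block keep old call sites compiling: each flat String
// constant now simply resolves to the key (or default value) of the corresponding
// ConfigProperty. A small sketch of that equivalence, purely illustrative:
String legacyKey = HoodieWriteConfig.TABLE_NAME;      // deprecated alias
String currentKey = HoodieWriteConfig.TBL_NAME.key(); // "hoodie.table.name"
boolean sameKey = legacyKey.equals(currentKey);       // true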
/** + * @deprecated Use {@link #COMBINE_BEFORE_INSERT} and its methods instead + */ + @Deprecated + public static final String DEFAULT_COMBINE_BEFORE_INSERT = COMBINE_BEFORE_INSERT.defaultValue(); + /** + * @deprecated Use {@link #COMBINE_BEFORE_UPSERT} and its methods instead + */ + @Deprecated + public static final String COMBINE_BEFORE_UPSERT_PROP = COMBINE_BEFORE_UPSERT.key(); + /** + * @deprecated Use {@link #COMBINE_BEFORE_UPSERT} and its methods instead + */ + @Deprecated + public static final String DEFAULT_COMBINE_BEFORE_UPSERT = COMBINE_BEFORE_UPSERT.defaultValue(); + /** + * @deprecated Use {@link #COMBINE_BEFORE_DELETE} and its methods instead + */ + @Deprecated + public static final String COMBINE_BEFORE_DELETE_PROP = COMBINE_BEFORE_DELETE.key(); + /** + * @deprecated Use {@link #COMBINE_BEFORE_DELETE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_COMBINE_BEFORE_DELETE = COMBINE_BEFORE_DELETE.defaultValue(); + /** + * @deprecated Use {@link #WRITE_STATUS_STORAGE_LEVEL_VALUE} and its methods instead + */ + @Deprecated + public static final String WRITE_STATUS_STORAGE_LEVEL = WRITE_STATUS_STORAGE_LEVEL_VALUE.key(); + /** + * @deprecated Use {@link #WRITE_STATUS_STORAGE_LEVEL_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = WRITE_STATUS_STORAGE_LEVEL_VALUE.defaultValue(); + /** + * @deprecated Use {@link #AUTO_COMMIT_ENABLE} and its methods instead + */ + @Deprecated + public static final String HOODIE_AUTO_COMMIT_PROP = AUTO_COMMIT_ENABLE.key(); + /** + * @deprecated Use {@link #AUTO_COMMIT_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_HOODIE_AUTO_COMMIT = AUTO_COMMIT_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #WRITE_STATUS_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String HOODIE_WRITE_STATUS_CLASS_PROP = WRITE_STATUS_CLASS_NAME.key(); + /** + * @deprecated Use {@link #WRITE_STATUS_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_HOODIE_WRITE_STATUS_CLASS = WRITE_STATUS_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #FINALIZE_WRITE_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String FINALIZE_WRITE_PARALLELISM = FINALIZE_WRITE_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #FINALIZE_WRITE_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_FINALIZE_WRITE_PARALLELISM = FINALIZE_WRITE_PARALLELISM_VALUE.defaultValue(); + /** + * @deprecated Use {@link #MARKERS_DELETE_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String MARKERS_DELETE_PARALLELISM = MARKERS_DELETE_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #MARKERS_DELETE_PARALLELISM_VALUE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_MARKERS_DELETE_PARALLELISM = MARKERS_DELETE_PARALLELISM_VALUE.defaultValue(); + /** + * @deprecated Use {@link #BULK_INSERT_SORT_MODE} and its methods instead + */ + @Deprecated + public static final String BULKINSERT_SORT_MODE = BULK_INSERT_SORT_MODE.key(); + /** + * @deprecated Use {@link #BULK_INSERT_SORT_MODE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_BULKINSERT_SORT_MODE = BULK_INSERT_SORT_MODE.defaultValue(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_ENABLE} and its methods instead + */ + @Deprecated + public static final 
String EMBEDDED_TIMELINE_SERVER_ENABLED = EMBEDDED_TIMELINE_SERVER_ENABLE.key(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED = EMBEDDED_TIMELINE_SERVER_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String EMBEDDED_TIMELINE_SERVER_PORT = EMBEDDED_TIMELINE_SERVER_PORT_NUM.key(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String DEFAULT_EMBEDDED_TIMELINE_SERVER_PORT = EMBEDDED_TIMELINE_SERVER_PORT_NUM.defaultValue(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_NUM_SERVER_THREADS} and its methods instead + */ + @Deprecated + public static final String EMBEDDED_TIMELINE_SERVER_THREADS = EMBEDDED_TIMELINE_NUM_SERVER_THREADS.key(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_NUM_SERVER_THREADS} and its methods instead + */ + @Deprecated + public static final String DEFAULT_EMBEDDED_TIMELINE_SERVER_THREADS = EMBEDDED_TIMELINE_NUM_SERVER_THREADS.defaultValue(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE} and its methods instead + */ + @Deprecated + public static final String EMBEDDED_TIMELINE_SERVER_COMPRESS_OUTPUT = EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE.key(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_EMBEDDED_TIMELINE_COMPRESS_OUTPUT = EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE} and its methods instead + */ + @Deprecated + public static final String EMBEDDED_TIMELINE_SERVER_USE_ASYNC = EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE.key(); + /** + * @deprecated Use {@link #EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_EMBEDDED_TIMELINE_SERVER_ASYNC = EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #FAIL_ON_TIMELINE_ARCHIVING_ENABLE} and its methods instead + */ + @Deprecated + public static final String FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP = FAIL_ON_TIMELINE_ARCHIVING_ENABLE.key(); + /** + * @deprecated Use {@link #FAIL_ON_TIMELINE_ARCHIVING_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED = FAIL_ON_TIMELINE_ARCHIVING_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #INITIAL_CONSISTENCY_CHECK_INTERVAL_MS} and its methods instead + */ + @Deprecated + public static final String INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP = INITIAL_CONSISTENCY_CHECK_INTERVAL_MS.key(); + /** + * @deprecated Use {@link #INITIAL_CONSISTENCY_CHECK_INTERVAL_MS} and its methods instead + */ + @Deprecated + public static long DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS = INITIAL_CONSISTENCY_CHECK_INTERVAL_MS.defaultValue(); + /** + * @deprecated Use {@link #MAX_CONSISTENCY_CHECK_INTERVAL_MS} and its methods instead + */ + @Deprecated + public static final String MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP = MAX_CONSISTENCY_CHECK_INTERVAL_MS.key(); + /** + * @deprecated Use {@link #MAX_CONSISTENCY_CHECK_INTERVAL_MS} and its methods instead + */ + @Deprecated + public static long DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS = 
MAX_CONSISTENCY_CHECK_INTERVAL_MS.defaultValue(); + /** + * @deprecated Use {@link #MAX_CONSISTENCY_CHECKS} and its methods instead + */ + @Deprecated + public static final String MAX_CONSISTENCY_CHECKS_PROP = MAX_CONSISTENCY_CHECKS.key(); + /** + * @deprecated Use {@link #MAX_CONSISTENCY_CHECKS} and its methods instead + */ + @Deprecated + public static int DEFAULT_MAX_CONSISTENCY_CHECKS = MAX_CONSISTENCY_CHECKS.defaultValue(); + /** + * @deprecated Use {@link #MERGE_DATA_VALIDATION_CHECK_ENABLE} and its methods instead + */ + @Deprecated + private static final String MERGE_DATA_VALIDATION_CHECK_ENABLED = MERGE_DATA_VALIDATION_CHECK_ENABLE.key(); + /** + * @deprecated Use {@link #MERGE_DATA_VALIDATION_CHECK_ENABLE} and its methods instead + */ + @Deprecated + private static final String DEFAULT_MERGE_DATA_VALIDATION_CHECK_ENABLED = MERGE_DATA_VALIDATION_CHECK_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE} and its methods instead + */ + @Deprecated + private static final String MERGE_ALLOW_DUPLICATE_ON_INSERTS = MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE.key(); + /** + * @deprecated Use {@link #MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE} and its methods instead + */ + @Deprecated + private static final String DEFAULT_MERGE_ALLOW_DUPLICATE_ON_INSERTS = MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #CLIENT_HEARTBEAT_INTERVAL_IN_MS} and its methods instead + */ + @Deprecated + public static final String CLIENT_HEARTBEAT_INTERVAL_IN_MS_PROP = CLIENT_HEARTBEAT_INTERVAL_IN_MS.key(); + /** + * @deprecated Use {@link #CLIENT_HEARTBEAT_INTERVAL_IN_MS} and its methods instead + */ + @Deprecated + public static final Integer DEFAULT_CLIENT_HEARTBEAT_INTERVAL_IN_MS = CLIENT_HEARTBEAT_INTERVAL_IN_MS.defaultValue(); + /** + * @deprecated Use {@link #CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES} and its methods instead + */ + @Deprecated + public static final String CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES_PROP = CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES.key(); + /** + * @deprecated Use {@link #CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES} and its methods instead + */ + @Deprecated + public static final Integer DEFAULT_CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES = CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES.defaultValue(); + /** + * @deprecated Use {@link #WRITE_CONCURRENCY_MODE} and its methods instead + */ + @Deprecated + public static final String WRITE_CONCURRENCY_MODE_PROP = WRITE_CONCURRENCY_MODE.key(); + /** + * @deprecated Use {@link #WRITE_CONCURRENCY_MODE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_WRITE_CONCURRENCY_MODE = WRITE_CONCURRENCY_MODE.defaultValue(); + /** + * @deprecated Use {@link #ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE} and its methods instead + */ + @Deprecated + public static final String ALLOW_MULTI_WRITE_ON_SAME_INSTANT = ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE.key(); + /** + * @deprecated Use {@link #ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE} and its methods instead + */ + @Deprecated + public static final String DEFAULT_ALLOW_MULTI_WRITE_ON_SAME_INSTANT = ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE} and its methods instead + */ + @Deprecated + public static final String EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION = AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE.key(); + /** + * @deprecated Use {@link #AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE} and its methods instead + */ + @Deprecated + public 
static final String DEFAULT_EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION = AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE.defaultValue();
+
   /**
    * Use Spark engine by default.
    */
-  protected HoodieWriteConfig(Properties props) {
-    this(EngineType.SPARK, props);
+  protected HoodieWriteConfig() {
+    super();
+    this.engineType = EngineType.SPARK;
+    this.clientSpecifiedViewStorageConfig = null;
   }

   protected HoodieWriteConfig(EngineType engineType, Properties props) {
@@ -161,8 +900,13 @@ protected HoodieWriteConfig(EngineType engineType, Properties props) {
     newProps.putAll(props);
     this.engineType = engineType;
     this.consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().fromProperties(newProps).build();
+    this.fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().fromProperties(newProps).build();
     this.clientSpecifiedViewStorageConfig = FileSystemViewStorageConfig.newBuilder().fromProperties(newProps).build();
     this.viewStorageConfig = clientSpecifiedViewStorageConfig;
+    this.hoodiePayloadConfig = HoodiePayloadConfig.newBuilder().fromProperties(newProps).build();
+    this.metadataConfig = HoodieMetadataConfig.newBuilder().fromProperties(props).build();
+    this.metastoreConfig = HoodieMetastoreConfig.newBuilder().fromProperties(props).build();
+    this.commonConfig = HoodieCommonConfig.newBuilder().fromProperties(props).build();
   }

   public static HoodieWriteConfig.Builder newBuilder() {
@@ -173,314 +917,643 @@ public static HoodieWriteConfig.Builder newBuilder() {
    * base properties.
    */
   public String getBasePath() {
-    return props.getProperty(BASE_PATH_PROP);
+    return getString(BASE_PATH);
   }

   public String getSchema() {
-    return props.getProperty(AVRO_SCHEMA);
+    return getString(AVRO_SCHEMA_STRING);
   }

   public void setSchema(String schemaStr) {
-    props.setProperty(AVRO_SCHEMA, schemaStr);
+    setValue(AVRO_SCHEMA_STRING, schemaStr);
+  }
+
+  public String getInternalSchema() {
+    return getString(INTERNAL_SCHEMA_STRING);
+  }
+
+  public boolean getInternalSchemaCacheEnable() {
+    return getBoolean(ENABLE_INTERNAL_SCHEMA_CACHE);
+  }
+
+  public void setInternalSchemaString(String internalSchemaString) {
+    setValue(INTERNAL_SCHEMA_STRING, internalSchemaString);
+  }
+
+  public void setInternalSchemaCacheEnable(boolean enable) {
+    setValue(ENABLE_INTERNAL_SCHEMA_CACHE, String.valueOf(enable));
+  }
+
+  public boolean getSchemaEvolutionEnable() {
+    return getBoolean(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE);
+  }
+
+  public void setSchemaEvolutionEnable(boolean enable) {
+    setValue(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE, String.valueOf(enable));
+  }
+
+  /**
+   * Get the write schema for written records.
+   *
+   * If WRITE_SCHEMA is specified, it is used as the write schema;
+   * otherwise, AVRO_SCHEMA is used as the write schema.
+ * @return + */ + public String getWriteSchema() { + if (props.containsKey(WRITE_SCHEMA.key())) { + return getString(WRITE_SCHEMA); + } + return getSchema(); } public boolean getAvroSchemaValidate() { - return Boolean.parseBoolean(props.getProperty(AVRO_SCHEMA_VALIDATE)); + return getBoolean(AVRO_SCHEMA_VALIDATE_ENABLE); } public String getTableName() { - return props.getProperty(TABLE_NAME); + return getString(TBL_NAME); + } + + public HoodieTableType getTableType() { + return HoodieTableType.valueOf(getStringOrDefault( + HoodieTableConfig.TYPE, HoodieTableConfig.TYPE.defaultValue().name()).toUpperCase()); + } + + public String getPreCombineField() { + return getString(PRECOMBINE_FIELD_NAME); + } + + public String getWritePayloadClass() { + return getString(WRITE_PAYLOAD_CLASS_NAME); + } + + public String getKeyGeneratorClass() { + return getString(KEYGENERATOR_CLASS_NAME); + } + + public boolean isConsistentLogicalTimestampEnabled() { + return getBooleanOrDefault(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED); } public Boolean shouldAutoCommit() { - return Boolean.parseBoolean(props.getProperty(HOODIE_AUTO_COMMIT_PROP)); + return getBoolean(AUTO_COMMIT_ENABLE); } public Boolean shouldAssumeDatePartitioning() { - return Boolean.parseBoolean(props.getProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP)); + return metadataConfig.shouldAssumeDatePartitioning(); } public boolean shouldUseExternalSchemaTransformation() { - return Boolean.parseBoolean(props.getProperty(EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION)); + return getBoolean(AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE); } public Integer getTimelineLayoutVersion() { - return Integer.parseInt(props.getProperty(TIMELINE_LAYOUT_VERSION)); + return getInt(TIMELINE_LAYOUT_VERSION_NUM); } public int getBulkInsertShuffleParallelism() { - return Integer.parseInt(props.getProperty(BULKINSERT_PARALLELISM)); + return getInt(BULKINSERT_PARALLELISM_VALUE); } public String getUserDefinedBulkInsertPartitionerClass() { - return props.getProperty(BULKINSERT_USER_DEFINED_PARTITIONER_CLASS); + return getString(BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME); + } + + public String getUserDefinedBulkInsertPartitionerSortColumns() { + return getString(BULKINSERT_USER_DEFINED_PARTITIONER_SORT_COLUMNS); } public int getInsertShuffleParallelism() { - return Integer.parseInt(props.getProperty(INSERT_PARALLELISM)); + return getInt(INSERT_PARALLELISM_VALUE); } public int getUpsertShuffleParallelism() { - return Integer.parseInt(props.getProperty(UPSERT_PARALLELISM)); + return getInt(UPSERT_PARALLELISM_VALUE); } public int getDeleteShuffleParallelism() { - return Math.max(Integer.parseInt(props.getProperty(DELETE_PARALLELISM)), 1); + return Math.max(getInt(DELETE_PARALLELISM_VALUE), 1); } public int getRollbackParallelism() { - return Integer.parseInt(props.getProperty(ROLLBACK_PARALLELISM)); + return getInt(ROLLBACK_PARALLELISM_VALUE); + } + + public int getFileListingParallelism() { + return metadataConfig.getFileListingParallelism(); } public boolean shouldRollbackUsingMarkers() { - return Boolean.parseBoolean(props.getProperty(ROLLBACK_USING_MARKERS)); + return getBoolean(ROLLBACK_USING_MARKERS_ENABLE); } public int getWriteBufferLimitBytes() { - return Integer.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES)); + return Integer.parseInt(getStringOrDefault(WRITE_BUFFER_LIMIT_BYTES_VALUE)); } public boolean shouldCombineBeforeInsert() { - return 
Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_INSERT_PROP)); + return getBoolean(COMBINE_BEFORE_INSERT); } public boolean shouldCombineBeforeUpsert() { - return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_UPSERT_PROP)); + return getBoolean(COMBINE_BEFORE_UPSERT); } public boolean shouldCombineBeforeDelete() { - return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_DELETE_PROP)); + return getBoolean(COMBINE_BEFORE_DELETE); } public boolean shouldAllowMultiWriteOnSameInstant() { - return Boolean.parseBoolean(props.getProperty(ALLOW_MULTI_WRITE_ON_SAME_INSTANT)); + return getBoolean(ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE); } public String getWriteStatusClassName() { - return props.getProperty(HOODIE_WRITE_STATUS_CLASS_PROP); + return getString(WRITE_STATUS_CLASS_NAME); } public int getFinalizeWriteParallelism() { - return Integer.parseInt(props.getProperty(FINALIZE_WRITE_PARALLELISM)); + return getInt(FINALIZE_WRITE_PARALLELISM_VALUE); + } + + public MarkerType getMarkersType() { + String markerType = getString(MARKERS_TYPE); + return MarkerType.valueOf(markerType.toUpperCase()); + } + + public boolean isHiveStylePartitioningEnabled() { + return getBooleanOrDefault(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE); + } + + public int getMarkersTimelineServerBasedBatchNumThreads() { + return getInt(MARKERS_TIMELINE_SERVER_BASED_BATCH_NUM_THREADS); + } + + public long getMarkersTimelineServerBasedBatchIntervalMs() { + return getLong(MARKERS_TIMELINE_SERVER_BASED_BATCH_INTERVAL_MS); } public int getMarkersDeleteParallelism() { - return Integer.parseInt(props.getProperty(MARKERS_DELETE_PARALLELISM)); + return getInt(MARKERS_DELETE_PARALLELISM_VALUE); } public boolean isEmbeddedTimelineServerEnabled() { - return Boolean.parseBoolean(props.getProperty(EMBEDDED_TIMELINE_SERVER_ENABLED)); + return getBoolean(EMBEDDED_TIMELINE_SERVER_ENABLE); + } + + public boolean isEmbeddedTimelineServerReuseEnabled() { + return getBoolean(EMBEDDED_TIMELINE_SERVER_REUSE_ENABLED); } public int getEmbeddedTimelineServerPort() { - return Integer.parseInt(props.getProperty(EMBEDDED_TIMELINE_SERVER_PORT, DEFAULT_EMBEDDED_TIMELINE_SERVER_PORT)); + return Integer.parseInt(getStringOrDefault(EMBEDDED_TIMELINE_SERVER_PORT_NUM)); + } + + public int getEmbeddedTimelineServerThreads() { + return Integer.parseInt(getStringOrDefault(EMBEDDED_TIMELINE_NUM_SERVER_THREADS)); + } + + public boolean getEmbeddedTimelineServerCompressOutput() { + return Boolean.parseBoolean(getStringOrDefault(EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE)); + } + + public boolean getEmbeddedTimelineServerUseAsync() { + return Boolean.parseBoolean(getStringOrDefault(EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE)); } public boolean isFailOnTimelineArchivingEnabled() { - return Boolean.parseBoolean(props.getProperty(FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP)); + return getBoolean(FAIL_ON_TIMELINE_ARCHIVING_ENABLE); + } + + public boolean isFailOnInlineTableServiceExceptionEnabled() { + return getBoolean(FAIL_ON_INLINE_TABLE_SERVICE_EXCEPTION); } public int getMaxConsistencyChecks() { - return Integer.parseInt(props.getProperty(MAX_CONSISTENCY_CHECKS_PROP)); + return getInt(MAX_CONSISTENCY_CHECKS); } public int getInitialConsistencyCheckIntervalMs() { - return Integer.parseInt(props.getProperty(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP)); + return getInt(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS); } public int getMaxConsistencyCheckIntervalMs() { - return Integer.parseInt(props.getProperty(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP)); + return 
getInt(MAX_CONSISTENCY_CHECK_INTERVAL_MS); } public BulkInsertSortMode getBulkInsertSortMode() { - String sortMode = props.getProperty(BULKINSERT_SORT_MODE); + String sortMode = getStringOrDefault(BULK_INSERT_SORT_MODE); return BulkInsertSortMode.valueOf(sortMode.toUpperCase()); } public boolean isMergeDataValidationCheckEnabled() { - return Boolean.parseBoolean(props.getProperty(MERGE_DATA_VALIDATION_CHECK_ENABLED)); + return getBoolean(MERGE_DATA_VALIDATION_CHECK_ENABLE); + } + + public boolean allowDuplicateInserts() { + return getBoolean(MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE); + } + + public int getSmallFileGroupCandidatesLimit() { + return getInt(MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT); } public EngineType getEngineType() { return engineType; } + public boolean populateMetaFields() { + return getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS); + } + /** * compaction properties. */ public HoodieCleaningPolicy getCleanerPolicy() { - return HoodieCleaningPolicy.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP)); + return HoodieCleaningPolicy.valueOf(getString(CLEANER_POLICY)); } public int getCleanerFileVersionsRetained() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP)); + return getInt(HoodieCleanConfig.CLEANER_FILE_VERSIONS_RETAINED); } public int getCleanerCommitsRetained() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); + return getInt(HoodieCleanConfig.CLEANER_COMMITS_RETAINED); + } + + public int getCleanerHoursRetained() { + return getInt(HoodieCleanConfig.CLEANER_HOURS_RETAINED); } public int getMaxCommitsToKeep() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP)); + return getInt(HoodieArchivalConfig.MAX_COMMITS_TO_KEEP); } public int getMinCommitsToKeep() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP)); + return getInt(HoodieArchivalConfig.MIN_COMMITS_TO_KEEP); + } + + public int getArchiveMergeFilesBatchSize() { + return getInt(HoodieArchivalConfig.ARCHIVE_MERGE_FILES_BATCH_SIZE); } public int getParquetSmallFileLimit() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES)); + return getInt(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT); } public double getRecordSizeEstimationThreshold() { - return Double.parseDouble(props.getProperty(HoodieCompactionConfig.RECORD_SIZE_ESTIMATION_THRESHOLD_PROP)); + return getDouble(HoodieCompactionConfig.RECORD_SIZE_ESTIMATION_THRESHOLD); } public int getCopyOnWriteInsertSplitSize() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE)); + return getInt(HoodieCompactionConfig.COPY_ON_WRITE_INSERT_SPLIT_SIZE); } public int getCopyOnWriteRecordSizeEstimate() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE)); + return getInt(HoodieCompactionConfig.COPY_ON_WRITE_RECORD_SIZE_ESTIMATE); + } + + public boolean allowMultipleCleans() { + return getBoolean(HoodieCleanConfig.ALLOW_MULTIPLE_CLEANS); } public boolean shouldAutoTuneInsertSplits() { - return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS)); + return getBoolean(HoodieCompactionConfig.COPY_ON_WRITE_AUTO_SPLIT_INSERTS); } public int getCleanerParallelism() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_PARALLELISM)); 
+ return getInt(HoodieCleanConfig.CLEANER_PARALLELISM_VALUE); + } + + public int getCleaningMaxCommits() { + return getInt(HoodieCleanConfig.CLEAN_MAX_COMMITS); + } + + public CleaningTriggerStrategy getCleaningTriggerStrategy() { + return CleaningTriggerStrategy.valueOf(getString(HoodieCleanConfig.CLEAN_TRIGGER_STRATEGY)); } public boolean isAutoClean() { - return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.AUTO_CLEAN_PROP)); + return getBoolean(HoodieCleanConfig.AUTO_CLEAN); + } + + public boolean getArchiveMergeEnable() { + return getBooleanOrDefault(HoodieArchivalConfig.ARCHIVE_MERGE_ENABLE); + } + + public boolean shouldArchiveBeyondSavepoint() { + return getBooleanOrDefault(HoodieArchivalConfig.ARCHIVE_BEYOND_SAVEPOINT); + } + + public long getArchiveMergeSmallFileLimitBytes() { + return getLong(HoodieArchivalConfig.ARCHIVE_MERGE_SMALL_FILE_LIMIT_BYTES); + } + + public boolean isAutoArchive() { + return getBoolean(HoodieArchivalConfig.AUTO_ARCHIVE); + } + + public boolean isAsyncArchive() { + return getBoolean(HoodieArchivalConfig.ASYNC_ARCHIVE); } public boolean isAsyncClean() { - return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.ASYNC_CLEAN_PROP)); + return getBoolean(HoodieCleanConfig.ASYNC_CLEAN); } public boolean incrementalCleanerModeEnabled() { - return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.CLEANER_INCREMENTAL_MODE)); + return getBoolean(HoodieCleanConfig.CLEANER_INCREMENTAL_MODE_ENABLE); } - public boolean isInlineCompaction() { - return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_PROP)); + public boolean inlineCompactionEnabled() { + return getBoolean(HoodieCompactionConfig.INLINE_COMPACT); + } + + public boolean scheduleInlineCompaction() { + return getBoolean(HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT); + } + + public CompactionTriggerStrategy getInlineCompactTriggerStrategy() { + return CompactionTriggerStrategy.valueOf(getString(HoodieCompactionConfig.INLINE_COMPACT_TRIGGER_STRATEGY)); } public int getInlineCompactDeltaCommitMax() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP)); + return getInt(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS); + } + + public int getInlineCompactDeltaSecondsMax() { + return getInt(HoodieCompactionConfig.INLINE_COMPACT_TIME_DELTA_SECONDS); } public CompactionStrategy getCompactionStrategy() { - return ReflectionUtils.loadClass(props.getProperty(HoodieCompactionConfig.COMPACTION_STRATEGY_PROP)); + return ReflectionUtils.loadClass(getString(HoodieCompactionConfig.COMPACTION_STRATEGY)); } public Long getTargetIOPerCompactionInMB() { - return Long.parseLong(props.getProperty(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB_PROP)); + return getLong(HoodieCompactionConfig.TARGET_IO_PER_COMPACTION_IN_MB); + } + + public Long getCompactionLogFileSizeThreshold() { + return getLong(HoodieCompactionConfig.COMPACTION_LOG_FILE_SIZE_THRESHOLD); + } + + public Long getCompactionLogFileNumThreshold() { + return getLong(HoodieCompactionConfig.COMPACTION_LOG_FILE_NUM_THRESHOLD); } public Boolean getCompactionLazyBlockReadEnabled() { - return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP)); + return getBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE); } public Boolean getCompactionReverseLogReadEnabled() { - return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLED_PROP)); + return 
getBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE); + } + + public int getArchiveDeleteParallelism() { + return getInt(HoodieArchivalConfig.DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE); + } + + public boolean inlineClusteringEnabled() { + return getBoolean(HoodieClusteringConfig.INLINE_CLUSTERING); + } + + public boolean scheduleInlineClustering() { + return getBoolean(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING); + } + + public boolean isAsyncClusteringEnabled() { + return getBoolean(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE); + } + + public boolean isPreserveHoodieCommitMetadataForClustering() { + return getBoolean(HoodieClusteringConfig.PRESERVE_COMMIT_METADATA); + } + + public boolean isPreserveHoodieCommitMetadataForCompaction() { + return getBoolean(HoodieCompactionConfig.PRESERVE_COMMIT_METADATA); + } + + public boolean isClusteringEnabled() { + // TODO: future support async clustering + return inlineClusteringEnabled() || isAsyncClusteringEnabled(); + } + + public boolean isRollbackPendingClustering() { + return getBoolean(HoodieClusteringConfig.ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT); + } + + public int getInlineClusterMaxCommits() { + return getInt(HoodieClusteringConfig.INLINE_CLUSTERING_MAX_COMMITS); + } + + public int getAsyncClusterMaxCommits() { + return getInt(HoodieClusteringConfig.ASYNC_CLUSTERING_MAX_COMMITS); } public String getPayloadClass() { - return props.getProperty(HoodieCompactionConfig.PAYLOAD_CLASS_PROP); + return getString(HoodiePayloadConfig.PAYLOAD_CLASS_NAME); } public int getTargetPartitionsPerDayBasedCompaction() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP)); + return getInt(HoodieCompactionConfig.TARGET_PARTITIONS_PER_DAYBASED_COMPACTION); } public int getCommitArchivalBatchSize() { - return Integer.parseInt(props.getProperty(HoodieCompactionConfig.COMMITS_ARCHIVAL_BATCH_SIZE_PROP)); + return getInt(HoodieArchivalConfig.COMMITS_ARCHIVAL_BATCH_SIZE); } public Boolean shouldCleanBootstrapBaseFile() { - return Boolean.valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_BOOTSTRAP_BASE_FILE_ENABLED)); + return getBoolean(HoodieCleanConfig.CLEANER_BOOTSTRAP_BASE_FILE_ENABLE); + } + + public String getClusteringUpdatesStrategyClass() { + return getString(HoodieClusteringConfig.UPDATES_STRATEGY); + } + + public HoodieFailedWritesCleaningPolicy getFailedWritesCleanPolicy() { + return HoodieFailedWritesCleaningPolicy + .valueOf(getString(HoodieCleanConfig.FAILED_WRITES_CLEANER_POLICY)); + } + + /** + * Clustering properties. 
+ */ + public String getClusteringPlanStrategyClass() { + return getString(HoodieClusteringConfig.PLAN_STRATEGY_CLASS_NAME); + } + + public ClusteringPlanPartitionFilterMode getClusteringPlanPartitionFilterMode() { + String mode = getString(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME); + return ClusteringPlanPartitionFilterMode.valueOf(mode); + } + + public String getBeginPartitionForClustering() { + return getString(HoodieClusteringConfig.PARTITION_FILTER_BEGIN_PARTITION); + } + + public String getEndPartitionForClustering() { + return getString(HoodieClusteringConfig.PARTITION_FILTER_END_PARTITION); + } + + public String getClusteringExecutionStrategyClass() { + return getString(HoodieClusteringConfig.EXECUTION_STRATEGY_CLASS_NAME); + } + + public long getClusteringMaxBytesInGroup() { + return getLong(HoodieClusteringConfig.PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP); + } + + public long getClusteringSmallFileLimit() { + return getLong(HoodieClusteringConfig.PLAN_STRATEGY_SMALL_FILE_LIMIT); + } + + public String getClusteringPartitionSelected() { + return getString(HoodieClusteringConfig.PARTITION_SELECTED); + } + + public String getClusteringPartitionFilterRegexPattern() { + return getString(HoodieClusteringConfig.PARTITION_REGEX_PATTERN); + } + + public int getClusteringMaxNumGroups() { + return getInt(HoodieClusteringConfig.PLAN_STRATEGY_MAX_GROUPS); + } + + public long getClusteringTargetFileMaxBytes() { + return getLong(HoodieClusteringConfig.PLAN_STRATEGY_TARGET_FILE_MAX_BYTES); + } + + public int getTargetPartitionsForClustering() { + return getInt(HoodieClusteringConfig.DAYBASED_LOOKBACK_PARTITIONS); + } + + public int getSkipPartitionsFromLatestForClustering() { + return getInt(HoodieClusteringConfig.PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST); + } + + public String getClusteringSortColumns() { + return getString(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS); + } + + public HoodieClusteringConfig.LayoutOptimizationStrategy getLayoutOptimizationStrategy() { + return HoodieClusteringConfig.LayoutOptimizationStrategy.fromValue( + getStringOrDefault(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY) + ); + } + + public HoodieClusteringConfig.SpatialCurveCompositionStrategyType getLayoutOptimizationCurveBuildMethod() { + return HoodieClusteringConfig.SpatialCurveCompositionStrategyType.fromValue( + getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD)); + } + + public int getLayoutOptimizationSampleSize() { + return getInt(HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE); } /** * index properties. 
*/ public HoodieIndex.IndexType getIndexType() { - return HoodieIndex.IndexType.valueOf(props.getProperty(HoodieIndexConfig.INDEX_TYPE_PROP)); + return HoodieIndex.IndexType.valueOf(getString(HoodieIndexConfig.INDEX_TYPE)); } public String getIndexClass() { - return props.getProperty(HoodieIndexConfig.INDEX_CLASS_PROP); + return getString(HoodieIndexConfig.INDEX_CLASS_NAME); + } + + public HoodieIndex.BucketIndexEngineType getBucketIndexEngineType() { + return HoodieIndex.BucketIndexEngineType.valueOf(getString(HoodieIndexConfig.BUCKET_INDEX_ENGINE_TYPE)); } public int getBloomFilterNumEntries() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES)); + return getInt(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES_VALUE); } public double getBloomFilterFPP() { - return Double.parseDouble(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_FPP)); + return getDouble(HoodieIndexConfig.BLOOM_FILTER_FPP_VALUE); } public String getHbaseZkQuorum() { - return props.getProperty(HoodieHBaseIndexConfig.HBASE_ZKQUORUM_PROP); + return getString(HoodieHBaseIndexConfig.ZKQUORUM); } public int getHbaseZkPort() { - return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_ZKPORT_PROP)); + return getInt(HoodieHBaseIndexConfig.ZKPORT); } public String getHBaseZkZnodeParent() { - return props.getProperty(HoodieIndexConfig.HBASE_ZK_ZNODEPARENT); + return getString(HoodieHBaseIndexConfig.ZK_NODE_PATH); } public String getHbaseTableName() { - return props.getProperty(HoodieHBaseIndexConfig.HBASE_TABLENAME_PROP); + return getString(HoodieHBaseIndexConfig.TABLENAME); } public int getHbaseIndexGetBatchSize() { - return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_GET_BATCH_SIZE_PROP)); + return getInt(HoodieHBaseIndexConfig.GET_BATCH_SIZE); + } + + public Boolean getHBaseIndexRollbackSync() { + return getBoolean(HoodieHBaseIndexConfig.ROLLBACK_SYNC_ENABLE); } public int getHbaseIndexPutBatchSize() { - return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_PUT_BATCH_SIZE_PROP)); + return getInt(HoodieHBaseIndexConfig.PUT_BATCH_SIZE); } - public Boolean getHbaseIndexPutBatchSizeAutoCompute() { - return Boolean.valueOf(props.getProperty(HoodieHBaseIndexConfig.HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP)); + public boolean getHbaseIndexPutBatchSizeAutoCompute() { + return getBooleanOrDefault(HoodieHBaseIndexConfig.PUT_BATCH_SIZE_AUTO_COMPUTE); } public String getHBaseQPSResourceAllocatorClass() { - return props.getProperty(HoodieHBaseIndexConfig.HBASE_INDEX_QPS_ALLOCATOR_CLASS); + return getString(HoodieHBaseIndexConfig.QPS_ALLOCATOR_CLASS_NAME); } public String getHBaseQPSZKnodePath() { - return props.getProperty(HoodieHBaseIndexConfig.HBASE_ZK_PATH_QPS_ROOT); + return getString(HoodieHBaseIndexConfig.ZKPATH_QPS_ROOT); } public String getHBaseZkZnodeSessionTimeout() { - return props.getProperty(HoodieHBaseIndexConfig.HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS); + return getString(HoodieHBaseIndexConfig.ZK_SESSION_TIMEOUT_MS); } public String getHBaseZkZnodeConnectionTimeout() { - return props.getProperty(HoodieHBaseIndexConfig.HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS); + return getString(HoodieHBaseIndexConfig.ZK_CONNECTION_TIMEOUT_MS); + } + + public boolean getHBaseIndexShouldComputeQPSDynamically() { + return getBoolean(HoodieHBaseIndexConfig.COMPUTE_QPS_DYNAMICALLY); + } + + public String getHBaseIndexSecurityAuthentication() { + return getString(HoodieHBaseIndexConfig.SECURITY_AUTHENTICATION); + } + + public String 
getHBaseIndexKerberosUserKeytab() { + return getString(HoodieHBaseIndexConfig.KERBEROS_USER_KEYTAB); + } + + public String getHBaseIndexKerberosUserPrincipal() { + return getString(HoodieHBaseIndexConfig.KERBEROS_USER_PRINCIPAL); + } + + public String getHBaseIndexRegionserverPrincipal() { + return getString(HoodieHBaseIndexConfig.REGIONSERVER_PRINCIPAL); } - public boolean getHBaseIndexShouldComputeQPSDynamically() { - return Boolean.parseBoolean(props.getProperty(HoodieHBaseIndexConfig.HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY)); + public String getHBaseIndexMasterPrincipal() { + return getString(HoodieHBaseIndexConfig.MASTER_PRINCIPAL); } public int getHBaseIndexDesiredPutsTime() { - return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS)); + return getInt(HoodieHBaseIndexConfig.DESIRED_PUTS_TIME_IN_SECONDS); } public String getBloomFilterType() { - return props.getProperty(HoodieIndexConfig.BLOOM_INDEX_FILTER_TYPE); + return getString(HoodieIndexConfig.BLOOM_FILTER_TYPE); } public int getDynamicBloomFilterMaxNumEntries() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.HOODIE_BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES)); + return getInt(HoodieIndexConfig.BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES); } /** @@ -489,15 +1562,15 @@ public int getDynamicBloomFilterMaxNumEntries() { * the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively. */ public float getHbaseIndexQPSFraction() { - return Float.parseFloat(props.getProperty(HoodieHBaseIndexConfig.HBASE_QPS_FRACTION_PROP)); + return getFloat(HoodieHBaseIndexConfig.QPS_FRACTION); } public float getHBaseIndexMinQPSFraction() { - return Float.parseFloat(props.getProperty(HoodieHBaseIndexConfig.HBASE_MIN_QPS_FRACTION_PROP)); + return getFloat(HoodieHBaseIndexConfig.MIN_QPS_FRACTION); } public float getHBaseIndexMaxQPSFraction() { - return Float.parseFloat(props.getProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_FRACTION_PROP)); + return getFloat(HoodieHBaseIndexConfig.MAX_QPS_FRACTION); } /** @@ -505,225 +1578,344 @@ public float getHBaseIndexMaxQPSFraction() { * Hoodie jobs to an Hbase Region Server */ public int getHbaseIndexMaxQPSPerRegionServer() { - return Integer.parseInt(props.getProperty(HoodieHBaseIndexConfig.HBASE_MAX_QPS_PER_REGION_SERVER_PROP)); + return getInt(HoodieHBaseIndexConfig.MAX_QPS_PER_REGION_SERVER); } public boolean getHbaseIndexUpdatePartitionPath() { - return Boolean.parseBoolean(props.getProperty(HoodieHBaseIndexConfig.HBASE_INDEX_UPDATE_PARTITION_PATH)); + return getBooleanOrDefault(HoodieHBaseIndexConfig.UPDATE_PARTITION_PATH_ENABLE); + } + + public int getHBaseIndexRegionCount() { + return getInt(HoodieHBaseIndexConfig.BUCKET_NUMBER); } public int getBloomIndexParallelism() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PARALLELISM_PROP)); + return getInt(HoodieIndexConfig.BLOOM_INDEX_PARALLELISM); } public boolean getBloomIndexPruneByRanges() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES_PROP)); + return getBoolean(HoodieIndexConfig.BLOOM_INDEX_PRUNE_BY_RANGES); } public boolean getBloomIndexUseCaching() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_USE_CACHING_PROP)); + return getBoolean(HoodieIndexConfig.BLOOM_INDEX_USE_CACHING); + } + + public boolean getBloomIndexUseMetadata() { + return getBooleanOrDefault(HoodieIndexConfig.BLOOM_INDEX_USE_METADATA); } public boolean useBloomIndexTreebasedFilter() { - return 
Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_TREE_BASED_FILTER_PROP)); + return getBoolean(HoodieIndexConfig.BLOOM_INDEX_TREE_BASED_FILTER); } public boolean useBloomIndexBucketizedChecking() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_BUCKETIZED_CHECKING_PROP)); + return getBoolean(HoodieIndexConfig.BLOOM_INDEX_BUCKETIZED_CHECKING); + } + + public boolean isMetadataBloomFilterIndexEnabled() { + return isMetadataTableEnabled() && getMetadataConfig().isBloomFilterIndexEnabled(); + } + + public boolean isMetadataColumnStatsIndexEnabled() { + return isMetadataTableEnabled() && getMetadataConfig().isColumnStatsIndexEnabled(); + } + + public List getColumnsEnabledForColumnStatsIndex() { + return getMetadataConfig().getColumnsEnabledForColumnStatsIndex(); + } + + public List getColumnsEnabledForBloomFilterIndex() { + return getMetadataConfig().getColumnsEnabledForBloomFilterIndex(); + } + + public int getIndexingCheckTimeoutSeconds() { + return getMetadataConfig().getIndexingCheckTimeoutSeconds(); + } + + public int getMetadataBloomFilterIndexParallelism() { + return metadataConfig.getBloomFilterIndexParallelism(); + } + + public int getColumnStatsIndexParallelism() { + return metadataConfig.getColumnStatsIndexParallelism(); } public int getBloomIndexKeysPerBucket() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_KEYS_PER_BUCKET_PROP)); + return getInt(HoodieIndexConfig.BLOOM_INDEX_KEYS_PER_BUCKET); } public boolean getBloomIndexUpdatePartitionPath() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH)); + return getBoolean(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE); } public int getSimpleIndexParallelism() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.SIMPLE_INDEX_PARALLELISM_PROP)); + return getInt(HoodieIndexConfig.SIMPLE_INDEX_PARALLELISM); } public boolean getSimpleIndexUseCaching() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.SIMPLE_INDEX_USE_CACHING_PROP)); + return getBoolean(HoodieIndexConfig.SIMPLE_INDEX_USE_CACHING); } public int getGlobalSimpleIndexParallelism() { - return Integer.parseInt(props.getProperty(HoodieIndexConfig.GLOBAL_SIMPLE_INDEX_PARALLELISM_PROP)); + return getInt(HoodieIndexConfig.GLOBAL_SIMPLE_INDEX_PARALLELISM); } public boolean getGlobalSimpleIndexUpdatePartitionPath() { - return Boolean.parseBoolean(props.getProperty(HoodieIndexConfig.SIMPLE_INDEX_UPDATE_PARTITION_PATH)); + return getBoolean(HoodieIndexConfig.SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE); + } + + public int getBucketIndexNumBuckets() { + return getIntOrDefault(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS); + } + + public String getBucketIndexHashField() { + return getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD); } /** * storage properties. 
*/ public long getParquetMaxFileSize() { - return Long.parseLong(props.getProperty(HoodieStorageConfig.PARQUET_FILE_MAX_BYTES)); + return getLong(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE); } public int getParquetBlockSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_BLOCK_SIZE_BYTES)); + return getInt(HoodieStorageConfig.PARQUET_BLOCK_SIZE); } public int getParquetPageSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_PAGE_SIZE_BYTES)); + return getInt(HoodieStorageConfig.PARQUET_PAGE_SIZE); } public int getLogFileDataBlockMaxSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES)); - } - - public int getLogFileMaxSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_SIZE_MAX_BYTES)); + return getInt(HoodieStorageConfig.LOGFILE_DATA_BLOCK_MAX_SIZE); } public double getParquetCompressionRatio() { - return Double.parseDouble(props.getProperty(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO)); + return getDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION); } public CompressionCodecName getParquetCompressionCodec() { - return CompressionCodecName.fromConf(props.getProperty(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC)); + String codecName = getString(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME); + return CompressionCodecName.fromConf(StringUtils.isNullOrEmpty(codecName) ? null : codecName); + } + + public boolean parquetDictionaryEnabled() { + return getBoolean(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED); + } + + public String parquetWriteLegacyFormatEnabled() { + return getString(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED); + } + + public String parquetOutputTimestampType() { + return getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE); + } + + public String parquetFieldIdWriteEnabled() { + return getString(HoodieStorageConfig.PARQUET_FIELD_ID_WRITE_ENABLED); + } + + public Option getLogDataBlockFormat() { + return Option.ofNullable(getString(HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT)) + .map(HoodieLogBlock.HoodieLogBlockType::fromId); + } + + public long getLogFileMaxSize() { + return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE); } public double getLogFileToParquetCompressionRatio() { - return Double.parseDouble(props.getProperty(HoodieStorageConfig.LOGFILE_TO_PARQUET_COMPRESSION_RATIO)); + return getDouble(HoodieStorageConfig.LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION); } public long getHFileMaxFileSize() { - return Long.parseLong(props.getProperty(HoodieStorageConfig.HFILE_FILE_MAX_BYTES)); + return getLong(HoodieStorageConfig.HFILE_MAX_FILE_SIZE); } public int getHFileBlockSize() { - return Integer.parseInt(props.getProperty(HoodieStorageConfig.HFILE_BLOCK_SIZE_BYTES)); + return getInt(HoodieStorageConfig.HFILE_BLOCK_SIZE); } public Compression.Algorithm getHFileCompressionAlgorithm() { - return Compression.Algorithm.valueOf(props.getProperty(HoodieStorageConfig.HFILE_COMPRESSION_ALGORITHM)); + return Compression.Algorithm.valueOf(getString(HoodieStorageConfig.HFILE_COMPRESSION_ALGORITHM_NAME)); + } + + public long getOrcMaxFileSize() { + return getLong(HoodieStorageConfig.ORC_FILE_MAX_SIZE); + } + + public int getOrcStripeSize() { + return getInt(HoodieStorageConfig.ORC_STRIPE_SIZE); + } + + public int getOrcBlockSize() { + return getInt(HoodieStorageConfig.ORC_BLOCK_SIZE); + } + + public CompressionKind getOrcCompressionCodec() { + return 
CompressionKind.valueOf(getString(HoodieStorageConfig.ORC_COMPRESSION_CODEC_NAME)); } /** * metrics properties. */ public boolean isMetricsOn() { - return Boolean.parseBoolean(props.getProperty(HoodieMetricsConfig.METRICS_ON)); + return getBoolean(HoodieMetricsConfig.TURN_METRICS_ON); + } + + public boolean isExecutorMetricsEnabled() { + return Boolean.parseBoolean( + getStringOrDefault(HoodieMetricsConfig.EXECUTOR_METRICS_ENABLE, "false")); + } + + public boolean isLockingMetricsEnabled() { + return getBoolean(HoodieMetricsConfig.LOCK_METRICS_ENABLE); } public MetricsReporterType getMetricsReporterType() { - return MetricsReporterType.valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE)); + return MetricsReporterType.valueOf(getString(HoodieMetricsConfig.METRICS_REPORTER_TYPE_VALUE)); } public String getGraphiteServerHost() { - return props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_HOST); + return getString(HoodieMetricsGraphiteConfig.GRAPHITE_SERVER_HOST_NAME); } public int getGraphiteServerPort() { - return Integer.parseInt(props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_PORT)); + return getInt(HoodieMetricsGraphiteConfig.GRAPHITE_SERVER_PORT_NUM); } public String getGraphiteMetricPrefix() { - return props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX); + return getString(HoodieMetricsGraphiteConfig.GRAPHITE_METRIC_PREFIX_VALUE); + } + + public int getGraphiteReportPeriodSeconds() { + return getInt(HoodieMetricsGraphiteConfig.GRAPHITE_REPORT_PERIOD_IN_SECONDS); } public String getJmxHost() { - return props.getProperty(HoodieMetricsConfig.JMX_HOST); + return getString(HoodieMetricsJmxConfig.JMX_HOST_NAME); } public String getJmxPort() { - return props.getProperty(HoodieMetricsConfig.JMX_PORT); + return getString(HoodieMetricsJmxConfig.JMX_PORT_NUM); } public int getDatadogReportPeriodSeconds() { - return Integer.parseInt(props.getProperty(HoodieMetricsDatadogConfig.DATADOG_REPORT_PERIOD_SECONDS)); + return getInt(HoodieMetricsDatadogConfig.REPORT_PERIOD_IN_SECONDS); } public ApiSite getDatadogApiSite() { - return ApiSite.valueOf(props.getProperty(HoodieMetricsDatadogConfig.DATADOG_API_SITE)); + return ApiSite.valueOf(getString(HoodieMetricsDatadogConfig.API_SITE_VALUE)); } public String getDatadogApiKey() { - if (props.containsKey(HoodieMetricsDatadogConfig.DATADOG_API_KEY)) { - return props.getProperty(HoodieMetricsDatadogConfig.DATADOG_API_KEY); + if (props.containsKey(HoodieMetricsDatadogConfig.API_KEY.key())) { + return getString(HoodieMetricsDatadogConfig.API_KEY); } else { Supplier apiKeySupplier = ReflectionUtils.loadClass( - props.getProperty(HoodieMetricsDatadogConfig.DATADOG_API_KEY_SUPPLIER)); + getString(HoodieMetricsDatadogConfig.API_KEY_SUPPLIER)); return apiKeySupplier.get(); } } public boolean getDatadogApiKeySkipValidation() { - return Boolean.parseBoolean(props.getProperty(HoodieMetricsDatadogConfig.DATADOG_API_KEY_SKIP_VALIDATION)); + return getBoolean(HoodieMetricsDatadogConfig.API_KEY_SKIP_VALIDATION); } public int getDatadogApiTimeoutSeconds() { - return Integer.parseInt(props.getProperty(HoodieMetricsDatadogConfig.DATADOG_API_TIMEOUT_SECONDS)); + return getInt(HoodieMetricsDatadogConfig.API_TIMEOUT_IN_SECONDS); } public String getDatadogMetricPrefix() { - return props.getProperty(HoodieMetricsDatadogConfig.DATADOG_METRIC_PREFIX); + return getString(HoodieMetricsDatadogConfig.METRIC_PREFIX_VALUE); } public String getDatadogMetricHost() { - return props.getProperty(HoodieMetricsDatadogConfig.DATADOG_METRIC_HOST); + return 
getString(HoodieMetricsDatadogConfig.METRIC_HOST_NAME); } public List getDatadogMetricTags() { - return Arrays.stream(props.getProperty( - HoodieMetricsDatadogConfig.DATADOG_METRIC_TAGS).split("\\s*,\\s*")).collect(Collectors.toList()); + return Arrays.stream(getStringOrDefault( + HoodieMetricsDatadogConfig.METRIC_TAG_VALUES, ",").split("\\s*,\\s*")).collect(Collectors.toList()); + } + + public int getCloudWatchReportPeriodSeconds() { + return getInt(HoodieMetricsCloudWatchConfig.REPORT_PERIOD_SECONDS); + } + + public String getCloudWatchMetricPrefix() { + return getString(HoodieMetricsCloudWatchConfig.METRIC_PREFIX); + } + + public String getCloudWatchMetricNamespace() { + return getString(HoodieMetricsCloudWatchConfig.METRIC_NAMESPACE); + } + + public int getCloudWatchMaxDatumsPerRequest() { + return getInt(HoodieMetricsCloudWatchConfig.MAX_DATUMS_PER_REQUEST); } public String getMetricReporterClassName() { - return props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_CLASS); + return getString(HoodieMetricsConfig.METRICS_REPORTER_CLASS_NAME); } public int getPrometheusPort() { - return Integer.parseInt(props.getProperty(HoodieMetricsPrometheusConfig.PROMETHEUS_PORT)); + return getInt(HoodieMetricsPrometheusConfig.PROMETHEUS_PORT_NUM); } public String getPushGatewayHost() { - return props.getProperty(HoodieMetricsPrometheusConfig.PUSHGATEWAY_HOST); + return getString(HoodieMetricsPrometheusConfig.PUSHGATEWAY_HOST_NAME); } public int getPushGatewayPort() { - return Integer.parseInt(props.getProperty(HoodieMetricsPrometheusConfig.PUSHGATEWAY_PORT)); + return getInt(HoodieMetricsPrometheusConfig.PUSHGATEWAY_PORT_NUM); } public int getPushGatewayReportPeriodSeconds() { - return Integer.parseInt(props.getProperty(HoodieMetricsPrometheusConfig.PUSHGATEWAY_REPORT_PERIOD_SECONDS)); + return getInt(HoodieMetricsPrometheusConfig.PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS); } public boolean getPushGatewayDeleteOnShutdown() { - return Boolean.parseBoolean(props.getProperty(HoodieMetricsPrometheusConfig.PUSHGATEWAY_DELETE_ON_SHUTDOWN)); + return getBoolean(HoodieMetricsPrometheusConfig.PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE); } public String getPushGatewayJobName() { - return props.getProperty(HoodieMetricsPrometheusConfig.PUSHGATEWAY_JOB_NAME); + return getString(HoodieMetricsPrometheusConfig.PUSHGATEWAY_JOBNAME); } public boolean getPushGatewayRandomJobNameSuffix() { - return Boolean.parseBoolean(props.getProperty(HoodieMetricsPrometheusConfig.PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX)); + return getBoolean(HoodieMetricsPrometheusConfig.PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX); + } + + public String getMetricReporterMetricsNamePrefix() { + return getStringOrDefault(HoodieMetricsConfig.METRICS_REPORTER_PREFIX); } /** * memory configs. 
*/ public int getMaxDFSStreamBufferSize() { - return Integer.parseInt(props.getProperty(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP)); + return getInt(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE); } public String getSpillableMapBasePath() { - return props.getProperty(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH_PROP); + return getString(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH); } public double getWriteStatusFailureFraction() { - return Double.parseDouble(props.getProperty(HoodieMemoryConfig.WRITESTATUS_FAILURE_FRACTION_PROP)); + return getDouble(HoodieMemoryConfig.WRITESTATUS_FAILURE_FRACTION); } public ConsistencyGuardConfig getConsistencyGuardConfig() { return consistencyGuardConfig; } + public FileSystemRetryConfig getFileSystemRetryConfig() { + return fileSystemRetryConfig; + } + public void setConsistencyGuardConfig(ConsistencyGuardConfig consistencyGuardConfig) { this.consistencyGuardConfig = consistencyGuardConfig; } @@ -744,66 +1936,250 @@ public FileSystemViewStorageConfig getClientSpecifiedViewStorageConfig() { return clientSpecifiedViewStorageConfig; } + public HoodiePayloadConfig getPayloadConfig() { + return hoodiePayloadConfig; + } + + public HoodieMetadataConfig getMetadataConfig() { + return metadataConfig; + } + + public HoodieCommonConfig getCommonConfig() { + return commonConfig; + } + /** * Commit call back configs. */ public boolean writeCommitCallbackOn() { - return Boolean.parseBoolean(props.getProperty(HoodieWriteCommitCallbackConfig.CALLBACK_ON)); + return getBoolean(HoodieWriteCommitCallbackConfig.TURN_CALLBACK_ON); } public String getCallbackClass() { - return props.getProperty(HoodieWriteCommitCallbackConfig.CALLBACK_CLASS_PROP); + return getString(HoodieWriteCommitCallbackConfig.CALLBACK_CLASS_NAME); } public String getBootstrapSourceBasePath() { - return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP); + return getString(HoodieBootstrapConfig.BASE_PATH); } public String getBootstrapModeSelectorClass() { - return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR); + return getString(HoodieBootstrapConfig.MODE_SELECTOR_CLASS_NAME); } public String getFullBootstrapInputProvider() { - return props.getProperty(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER); + return getString(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME); } public String getBootstrapKeyGeneratorClass() { - return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS); + return getString(HoodieBootstrapConfig.KEYGEN_CLASS_NAME); + } + + public String getBootstrapKeyGeneratorType() { + return getString(HoodieBootstrapConfig.KEYGEN_TYPE); } public String getBootstrapModeSelectorRegex() { - return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX); + return getString(HoodieBootstrapConfig.PARTITION_SELECTOR_REGEX_PATTERN); } public BootstrapMode getBootstrapModeForRegexMatch() { - return BootstrapMode.valueOf(props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX_MODE)); + return BootstrapMode.valueOf(getString(HoodieBootstrapConfig.PARTITION_SELECTOR_REGEX_MODE)); } public String getBootstrapPartitionPathTranslatorClass() { - return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS); + return getString(HoodieBootstrapConfig.PARTITION_PATH_TRANSLATOR_CLASS_NAME); } public int getBootstrapParallelism() { - return Integer.parseInt(props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM)); + return getInt(HoodieBootstrapConfig.PARALLELISM_VALUE); } public Long 
getMaxMemoryPerPartitionMerge() { - return Long.valueOf(props.getProperty(HoodieMemoryConfig.MAX_MEMORY_FOR_MERGE_PROP)); + return getLong(HoodieMemoryConfig.MAX_MEMORY_FOR_MERGE); + } + + public Long getHoodieClientHeartbeatIntervalInMs() { + return getLong(CLIENT_HEARTBEAT_INTERVAL_IN_MS); + } + + public Integer getHoodieClientHeartbeatTolerableMisses() { + return getInt(CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES); + } + + /** + * File listing metadata configs. + */ + public boolean isMetadataTableEnabled() { + return metadataConfig.enabled(); + } + + public int getMetadataInsertParallelism() { + return getInt(HoodieMetadataConfig.INSERT_PARALLELISM_VALUE); + } + + public int getMetadataCompactDeltaCommitMax() { + return getInt(HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS); + } + + public boolean isMetadataAsyncClean() { + return getBoolean(HoodieMetadataConfig.ASYNC_CLEAN_ENABLE); + } + + public boolean isMetadataAsyncIndex() { + return getBooleanOrDefault(HoodieMetadataConfig.ASYNC_INDEX_ENABLE); + } + + public int getMetadataMaxCommitsToKeep() { + return getInt(HoodieMetadataConfig.MAX_COMMITS_TO_KEEP); + } + + public int getMetadataMinCommitsToKeep() { + return getInt(HoodieMetadataConfig.MIN_COMMITS_TO_KEEP); + } + + public int getMetadataCleanerCommitsRetained() { + return getInt(HoodieMetadataConfig.CLEANER_COMMITS_RETAINED); + } + + /** + * Hoodie Client Lock Configs. + * @return + */ + public boolean isAutoAdjustLockConfigs() { + return getBooleanOrDefault(AUTO_ADJUST_LOCK_CONFIGS); + } + + public String getLockProviderClass() { + return getString(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME); + } + + public String getLockHiveDatabaseName() { + return getString(HoodieLockConfig.HIVE_DATABASE_NAME); + } + + public String getLockHiveTableName() { + return getString(HoodieLockConfig.HIVE_TABLE_NAME); + } + + public ConflictResolutionStrategy getWriteConflictResolutionStrategy() { + return ReflectionUtils.loadClass(getString(HoodieLockConfig.WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME)); + } + + public Long getLockAcquireWaitTimeoutInMs() { + return getLong(HoodieLockConfig.LOCK_ACQUIRE_WAIT_TIMEOUT_MS); + } + + public WriteConcurrencyMode getWriteConcurrencyMode() { + return WriteConcurrencyMode.fromValue(getString(WRITE_CONCURRENCY_MODE)); + } + + // misc configs + public Boolean doSkipDefaultPartitionValidation() { + return getBoolean(SKIP_DEFAULT_PARTITION_VALIDATION); + } + + /** + * Are any table services configured to run inline for both scheduling and execution? + * + * @return True if any table services are configured to run inline, false otherwise. + */ + public Boolean areAnyTableServicesExecutedInline() { + return areTableServicesEnabled() + && (inlineClusteringEnabled() || inlineCompactionEnabled() + || (isAutoClean() && !isAsyncClean()) || (isAutoArchive() && !isAsyncArchive())); + } + + /** + * Are any table services configured to run async? + * + * @return True if any table services are configured to run async, false otherwise. 
+ */ + public Boolean areAnyTableServicesAsync() { + return areTableServicesEnabled() + && (isAsyncClusteringEnabled() + || (getTableType() == HoodieTableType.MERGE_ON_READ && !inlineCompactionEnabled()) + || (isAutoClean() && isAsyncClean()) || (isAutoArchive() && isAsyncArchive())); + } + + public Boolean areAnyTableServicesScheduledInline() { + return scheduleInlineCompaction() || scheduleInlineClustering(); + } + + public String getPreCommitValidators() { + return getString(HoodiePreCommitValidatorConfig.VALIDATOR_CLASS_NAMES); + } + + public String getPreCommitValidatorEqualitySqlQueries() { + return getString(HoodiePreCommitValidatorConfig.EQUALITY_SQL_QUERIES); + } + + public String getPreCommitValidatorSingleResultSqlQueries() { + return getString(HoodiePreCommitValidatorConfig.SINGLE_VALUE_SQL_QUERIES); + } + + public String getPreCommitValidatorInequalitySqlQueries() { + return getString(HoodiePreCommitValidatorConfig.INEQUALITY_SQL_QUERIES); + } + + public boolean allowEmptyCommit() { + return getBooleanOrDefault(ALLOW_EMPTY_COMMIT); + } + + public boolean allowOperationMetadataField() { + return getBooleanOrDefault(ALLOW_OPERATION_METADATA_FIELD); + } + + public String getFileIdPrefixProviderClassName() { + return getString(FILEID_PREFIX_PROVIDER_CLASS); + } + + public boolean areTableServicesEnabled() { + return getBooleanOrDefault(TABLE_SERVICES_ENABLED); + } + + public boolean areReleaseResourceEnabled() { + return getBooleanOrDefault(RELEASE_RESOURCE_ENABLE); + } + + /** + * Layout configs. + */ + public HoodieStorageLayout.LayoutType getLayoutType() { + return HoodieStorageLayout.LayoutType.valueOf(getString(HoodieLayoutConfig.LAYOUT_TYPE)); + } + + /** + * Metastore configs. + */ + public boolean isMetastoreEnabled() { + return metastoreConfig.enableMetastore(); } public static class Builder { - protected final Properties props = new Properties(); + protected final HoodieWriteConfig writeConfig = new HoodieWriteConfig(); protected EngineType engineType = EngineType.SPARK; private boolean isIndexConfigSet = false; private boolean isStorageConfigSet = false; private boolean isCompactionConfigSet = false; + private boolean isCleanConfigSet = false; + private boolean isArchivalConfigSet = false; + private boolean isClusteringConfigSet = false; + private boolean isOptimizeConfigSet = false; private boolean isMetricsConfigSet = false; private boolean isBootstrapConfigSet = false; private boolean isMemoryConfigSet = false; private boolean isViewConfigSet = false; private boolean isConsistencyGuardSet = false; private boolean isCallbackConfigSet = false; + private boolean isPayloadConfigSet = false; + private boolean isMetadataConfigSet = false; + private boolean isLockConfigSet = false; + private boolean isPreCommitValidationConfigSet = false; + private boolean isMetricsJmxConfigSet = false; + private boolean isMetricsGraphiteConfigSet = false; + private boolean isLayoutConfigSet = false; public Builder withEngineType(EngineType engineType) { this.engineType = engineType; @@ -812,14 +2188,14 @@ public Builder withEngineType(EngineType engineType) { public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - this.props.load(reader); + this.writeConfig.getProps().load(reader); return this; } } public Builder fromInputStream(InputStream inputStream) throws IOException { try { - this.props.load(inputStream); + this.writeConfig.getProps().load(inputStream); return this; } finally { inputStream.close(); @@ -827,283 
+2203,522 @@ public Builder fromInputStream(InputStream inputStream) throws IOException { } public Builder withProps(Map kvprops) { - props.putAll(kvprops); + writeConfig.getProps().putAll(kvprops); return this; } public Builder withPath(String basePath) { - props.setProperty(BASE_PATH_PROP, basePath); + writeConfig.setValue(BASE_PATH, basePath); return this; } public Builder withSchema(String schemaStr) { - props.setProperty(AVRO_SCHEMA, schemaStr); + writeConfig.setValue(AVRO_SCHEMA_STRING, schemaStr); + return this; + } + + public Builder withSchemaEvolutionEnable(boolean enable) { + writeConfig.setValue(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE, String.valueOf(enable)); + return this; + } + + public Builder withInternalSchemaCacheEnable(boolean enable) { + writeConfig.setValue(ENABLE_INTERNAL_SCHEMA_CACHE, String.valueOf(enable)); return this; } public Builder withAvroSchemaValidate(boolean enable) { - props.setProperty(AVRO_SCHEMA_VALIDATE, String.valueOf(enable)); + writeConfig.setValue(AVRO_SCHEMA_VALIDATE_ENABLE, String.valueOf(enable)); return this; } public Builder forTable(String tableName) { - props.setProperty(TABLE_NAME, tableName); + writeConfig.setValue(TBL_NAME, tableName); + return this; + } + + public Builder withPreCombineField(String preCombineField) { + writeConfig.setValue(PRECOMBINE_FIELD_NAME, preCombineField); + return this; + } + + public Builder withWritePayLoad(String payload) { + writeConfig.setValue(WRITE_PAYLOAD_CLASS_NAME, payload); + return this; + } + + public Builder withKeyGenerator(String keyGeneratorClass) { + writeConfig.setValue(KEYGENERATOR_CLASS_NAME, keyGeneratorClass); return this; } public Builder withTimelineLayoutVersion(int version) { - props.setProperty(TIMELINE_LAYOUT_VERSION, String.valueOf(version)); + writeConfig.setValue(TIMELINE_LAYOUT_VERSION_NUM, String.valueOf(version)); return this; } public Builder withBulkInsertParallelism(int bulkInsertParallelism) { - props.setProperty(BULKINSERT_PARALLELISM, String.valueOf(bulkInsertParallelism)); + writeConfig.setValue(BULKINSERT_PARALLELISM_VALUE, String.valueOf(bulkInsertParallelism)); return this; } public Builder withUserDefinedBulkInsertPartitionerClass(String className) { - props.setProperty(BULKINSERT_USER_DEFINED_PARTITIONER_CLASS, className); + writeConfig.setValue(BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME, className); + return this; + } + + public Builder withUserDefinedBulkInsertPartitionerSortColumns(String columns) { + writeConfig.setValue(BULKINSERT_USER_DEFINED_PARTITIONER_SORT_COLUMNS, columns); return this; } public Builder withDeleteParallelism(int parallelism) { - props.setProperty(DELETE_PARALLELISM, String.valueOf(parallelism)); + writeConfig.setValue(DELETE_PARALLELISM_VALUE, String.valueOf(parallelism)); + return this; + } + + public Builder withFailureOnInlineTableServiceException(boolean fail) { + writeConfig.setValue(FAIL_ON_INLINE_TABLE_SERVICE_EXCEPTION, String.valueOf(fail)); return this; } public Builder withParallelism(int insertShuffleParallelism, int upsertShuffleParallelism) { - props.setProperty(INSERT_PARALLELISM, String.valueOf(insertShuffleParallelism)); - props.setProperty(UPSERT_PARALLELISM, String.valueOf(upsertShuffleParallelism)); + writeConfig.setValue(INSERT_PARALLELISM_VALUE, String.valueOf(insertShuffleParallelism)); + writeConfig.setValue(UPSERT_PARALLELISM_VALUE, String.valueOf(upsertShuffleParallelism)); return this; } public Builder withRollbackParallelism(int rollbackParallelism) { - props.setProperty(ROLLBACK_PARALLELISM, 
String.valueOf(rollbackParallelism));
+      writeConfig.setValue(ROLLBACK_PARALLELISM_VALUE, String.valueOf(rollbackParallelism));
       return this;
     }

     public Builder withRollbackUsingMarkers(boolean rollbackUsingMarkers) {
-      props.setProperty(ROLLBACK_USING_MARKERS, String.valueOf(rollbackUsingMarkers));
+      writeConfig.setValue(ROLLBACK_USING_MARKERS_ENABLE, String.valueOf(rollbackUsingMarkers));
       return this;
     }

     public Builder withWriteBufferLimitBytes(int writeBufferLimit) {
-      props.setProperty(WRITE_BUFFER_LIMIT_BYTES, String.valueOf(writeBufferLimit));
+      writeConfig.setValue(WRITE_BUFFER_LIMIT_BYTES_VALUE, String.valueOf(writeBufferLimit));
       return this;
     }

     public Builder combineInput(boolean onInsert, boolean onUpsert) {
-      props.setProperty(COMBINE_BEFORE_INSERT_PROP, String.valueOf(onInsert));
-      props.setProperty(COMBINE_BEFORE_UPSERT_PROP, String.valueOf(onUpsert));
+      writeConfig.setValue(COMBINE_BEFORE_INSERT, String.valueOf(onInsert));
+      writeConfig.setValue(COMBINE_BEFORE_UPSERT, String.valueOf(onUpsert));
       return this;
     }

     public Builder combineDeleteInput(boolean onDelete) {
-      props.setProperty(COMBINE_BEFORE_DELETE_PROP, String.valueOf(onDelete));
+      writeConfig.setValue(COMBINE_BEFORE_DELETE, String.valueOf(onDelete));
       return this;
     }

     public Builder withWriteStatusStorageLevel(String level) {
-      props.setProperty(WRITE_STATUS_STORAGE_LEVEL, level);
+      writeConfig.setValue(WRITE_STATUS_STORAGE_LEVEL_VALUE, level);
       return this;
     }

     public Builder withIndexConfig(HoodieIndexConfig indexConfig) {
-      props.putAll(indexConfig.getProps());
+      writeConfig.getProps().putAll(indexConfig.getProps());
       isIndexConfigSet = true;
       return this;
     }

     public Builder withStorageConfig(HoodieStorageConfig storageConfig) {
-      props.putAll(storageConfig.getProps());
+      writeConfig.getProps().putAll(storageConfig.getProps());
       isStorageConfigSet = true;
       return this;
     }

     public Builder withCompactionConfig(HoodieCompactionConfig compactionConfig) {
-      props.putAll(compactionConfig.getProps());
+      writeConfig.getProps().putAll(compactionConfig.getProps());
       isCompactionConfigSet = true;
       return this;
     }

+    public Builder withCleanConfig(HoodieCleanConfig cleanConfig) {
+      writeConfig.getProps().putAll(cleanConfig.getProps());
+      isCleanConfigSet = true;
+      return this;
+    }
+
+    public Builder withArchivalConfig(HoodieArchivalConfig archivalConfig) {
+      writeConfig.getProps().putAll(archivalConfig.getProps());
+      isArchivalConfigSet = true;
+      return this;
+    }
+
+    public Builder withClusteringConfig(HoodieClusteringConfig clusteringConfig) {
+      writeConfig.getProps().putAll(clusteringConfig.getProps());
+      isClusteringConfigSet = true;
+      return this;
+    }
+
+    public Builder withLockConfig(HoodieLockConfig lockConfig) {
+      writeConfig.getProps().putAll(lockConfig.getProps());
+      isLockConfigSet = true;
+      return this;
+    }
+
+    public Builder withMetricsJmxConfig(HoodieMetricsJmxConfig metricsJmxConfig) {
+      writeConfig.getProps().putAll(metricsJmxConfig.getProps());
+      isMetricsJmxConfigSet = true;
+      return this;
+    }
+
+    public Builder withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig metricsGraphiteConfig) {
+      writeConfig.getProps().putAll(metricsGraphiteConfig.getProps());
+      isMetricsGraphiteConfigSet = true;
+      return this;
+    }
+
+    public Builder withPreCommitValidatorConfig(HoodiePreCommitValidatorConfig validatorConfig) {
+      writeConfig.getProps().putAll(validatorConfig.getProps());
+      isPreCommitValidationConfigSet = true;
+      return this;
+    }
+
     public Builder withMetricsConfig(HoodieMetricsConfig metricsConfig) {
-      props.putAll(metricsConfig.getProps());
+
writeConfig.getProps().putAll(metricsConfig.getProps()); isMetricsConfigSet = true; return this; } public Builder withMemoryConfig(HoodieMemoryConfig memoryConfig) { - props.putAll(memoryConfig.getProps()); + writeConfig.getProps().putAll(memoryConfig.getProps()); isMemoryConfigSet = true; return this; } public Builder withBootstrapConfig(HoodieBootstrapConfig bootstrapConfig) { - props.putAll(bootstrapConfig.getProps()); + writeConfig.getProps().putAll(bootstrapConfig.getProps()); isBootstrapConfigSet = true; return this; } - public Builder withAutoCommit(boolean autoCommit) { - props.setProperty(HOODIE_AUTO_COMMIT_PROP, String.valueOf(autoCommit)); + public Builder withPayloadConfig(HoodiePayloadConfig payloadConfig) { + writeConfig.getProps().putAll(payloadConfig.getProps()); + isPayloadConfigSet = true; + return this; + } + + public Builder withMetadataConfig(HoodieMetadataConfig metadataConfig) { + writeConfig.getProps().putAll(metadataConfig.getProps()); + isMetadataConfigSet = true; return this; } - public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) { - props.setProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP, String.valueOf(assumeDatePartitioning)); + public Builder withAutoCommit(boolean autoCommit) { + writeConfig.setValue(AUTO_COMMIT_ENABLE, String.valueOf(autoCommit)); return this; } public Builder withWriteStatusClass(Class writeStatusClass) { - props.setProperty(HOODIE_WRITE_STATUS_CLASS_PROP, writeStatusClass.getName()); + writeConfig.setValue(WRITE_STATUS_CLASS_NAME, writeStatusClass.getName()); return this; } public Builder withFileSystemViewConfig(FileSystemViewStorageConfig viewStorageConfig) { - props.putAll(viewStorageConfig.getProps()); + writeConfig.getProps().putAll(viewStorageConfig.getProps()); isViewConfigSet = true; return this; } public Builder withConsistencyGuardConfig(ConsistencyGuardConfig consistencyGuardConfig) { - props.putAll(consistencyGuardConfig.getProps()); + writeConfig.getProps().putAll(consistencyGuardConfig.getProps()); isConsistencyGuardSet = true; return this; } public Builder withCallbackConfig(HoodieWriteCommitCallbackConfig callbackConfig) { - props.putAll(callbackConfig.getProps()); + writeConfig.getProps().putAll(callbackConfig.getProps()); isCallbackConfigSet = true; return this; } + public Builder withLayoutConfig(HoodieLayoutConfig layoutConfig) { + writeConfig.getProps().putAll(layoutConfig.getProps()); + isLayoutConfigSet = true; + return this; + } + public Builder withFinalizeWriteParallelism(int parallelism) { - props.setProperty(FINALIZE_WRITE_PARALLELISM, String.valueOf(parallelism)); + writeConfig.setValue(FINALIZE_WRITE_PARALLELISM_VALUE, String.valueOf(parallelism)); + return this; + } + + public Builder withMarkersType(String markerType) { + writeConfig.setValue(MARKERS_TYPE, markerType); + return this; + } + + public Builder withMarkersTimelineServerBasedBatchNumThreads(int numThreads) { + writeConfig.setValue(MARKERS_TIMELINE_SERVER_BASED_BATCH_NUM_THREADS, String.valueOf(numThreads)); + return this; + } + + public Builder withMarkersTimelineServerBasedBatchIntervalMs(long intervalMs) { + writeConfig.setValue(MARKERS_TIMELINE_SERVER_BASED_BATCH_INTERVAL_MS, String.valueOf(intervalMs)); return this; } public Builder withMarkersDeleteParallelism(int parallelism) { - props.setProperty(MARKERS_DELETE_PARALLELISM, String.valueOf(parallelism)); + writeConfig.setValue(MARKERS_DELETE_PARALLELISM_VALUE, String.valueOf(parallelism)); return this; } public Builder withEmbeddedTimelineServerEnabled(boolean enabled) 
{ - props.setProperty(EMBEDDED_TIMELINE_SERVER_ENABLED, String.valueOf(enabled)); + writeConfig.setValue(EMBEDDED_TIMELINE_SERVER_ENABLE, String.valueOf(enabled)); + return this; + } + + public Builder withEmbeddedTimelineServerReuseEnabled(boolean enabled) { + writeConfig.setValue(EMBEDDED_TIMELINE_SERVER_REUSE_ENABLED, String.valueOf(enabled)); return this; } public Builder withEmbeddedTimelineServerPort(int port) { - props.setProperty(EMBEDDED_TIMELINE_SERVER_PORT, String.valueOf(port)); + writeConfig.setValue(EMBEDDED_TIMELINE_SERVER_PORT_NUM, String.valueOf(port)); return this; } public Builder withBulkInsertSortMode(String mode) { - props.setProperty(BULKINSERT_SORT_MODE, mode); + writeConfig.setValue(BULK_INSERT_SORT_MODE, mode); return this; } public Builder withAllowMultiWriteOnSameInstant(boolean allow) { - props.setProperty(ALLOW_MULTI_WRITE_ON_SAME_INSTANT, String.valueOf(allow)); + writeConfig.setValue(ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE, String.valueOf(allow)); return this; } public Builder withExternalSchemaTrasformation(boolean enabled) { - props.setProperty(EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION, String.valueOf(enabled)); + writeConfig.setValue(AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE, String.valueOf(enabled)); return this; } public Builder withMergeDataValidationCheckEnabled(boolean enabled) { - props.setProperty(MERGE_DATA_VALIDATION_CHECK_ENABLED, String.valueOf(enabled)); + writeConfig.setValue(MERGE_DATA_VALIDATION_CHECK_ENABLE, String.valueOf(enabled)); + return this; + } + + public Builder withMergeAllowDuplicateOnInserts(boolean routeInsertsToNewFiles) { + writeConfig.setValue(MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE, String.valueOf(routeInsertsToNewFiles)); + return this; + } + + public Builder withMergeSmallFileGroupCandidatesLimit(int limit) { + writeConfig.setValue(MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT, String.valueOf(limit)); + return this; + } + + public Builder withHeartbeatIntervalInMs(Integer heartbeatIntervalInMs) { + writeConfig.setValue(CLIENT_HEARTBEAT_INTERVAL_IN_MS, String.valueOf(heartbeatIntervalInMs)); + return this; + } + + public Builder withHeartbeatTolerableMisses(Integer heartbeatTolerableMisses) { + writeConfig.setValue(CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES, String.valueOf(heartbeatTolerableMisses)); + return this; + } + + public Builder withWriteConcurrencyMode(WriteConcurrencyMode concurrencyMode) { + writeConfig.setValue(WRITE_CONCURRENCY_MODE, concurrencyMode.value()); + return this; + } + + public Builder withPopulateMetaFields(boolean populateMetaFields) { + writeConfig.setValue(HoodieTableConfig.POPULATE_META_FIELDS, Boolean.toString(populateMetaFields)); + return this; + } + + public Builder withAllowOperationMetadataField(boolean allowOperationMetadataField) { + writeConfig.setValue(ALLOW_OPERATION_METADATA_FIELD, Boolean.toString(allowOperationMetadataField)); + return this; + } + + public Builder withFileIdPrefixProviderClassName(String fileIdPrefixProviderClassName) { + writeConfig.setValue(FILEID_PREFIX_PROVIDER_CLASS, fileIdPrefixProviderClassName); + return this; + } + + public Builder withTableServicesEnabled(boolean enabled) { + writeConfig.setValue(TABLE_SERVICES_ENABLED, Boolean.toString(enabled)); + return this; + } + + public Builder withReleaseResourceEnabled(boolean enabled) { + writeConfig.setValue(RELEASE_RESOURCE_ENABLE, Boolean.toString(enabled)); return this; } public Builder withProperties(Properties properties) { - this.props.putAll(properties); + this.writeConfig.getProps().putAll(properties); + 
return this; + } + + public Builder withAutoAdjustLockConfigs(boolean autoAdjustLockConfigs) { + writeConfig.setValue(AUTO_ADJUST_LOCK_CONFIGS, String.valueOf(autoAdjustLockConfigs)); + return this; + } + + public Builder doSkipDefaultPartitionValidation(boolean skipDefaultPartitionValidation) { + writeConfig.setValue(SKIP_DEFAULT_PARTITION_VALIDATION, String.valueOf(skipDefaultPartitionValidation)); return this; } protected void setDefaults() { + writeConfig.setDefaultValue(MARKERS_TYPE, getDefaultMarkersType(engineType)); // Check for mandatory properties - setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM, - DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(DELETE_PARALLELISM), DELETE_PARALLELISM, DEFAULT_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(ROLLBACK_PARALLELISM), ROLLBACK_PARALLELISM, - DEFAULT_ROLLBACK_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(ROLLBACK_USING_MARKERS), ROLLBACK_USING_MARKERS, - DEFAULT_ROLLBACK_USING_MARKERS); - setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), COMBINE_BEFORE_INSERT_PROP, - DEFAULT_COMBINE_BEFORE_INSERT); - setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), COMBINE_BEFORE_UPSERT_PROP, - DEFAULT_COMBINE_BEFORE_UPSERT); - setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_DELETE_PROP), COMBINE_BEFORE_DELETE_PROP, - DEFAULT_COMBINE_BEFORE_DELETE); - setDefaultOnCondition(props, !props.containsKey(ALLOW_MULTI_WRITE_ON_SAME_INSTANT), - ALLOW_MULTI_WRITE_ON_SAME_INSTANT, DEFAULT_ALLOW_MULTI_WRITE_ON_SAME_INSTANT); - setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL, - DEFAULT_WRITE_STATUS_STORAGE_LEVEL); - setDefaultOnCondition(props, !props.containsKey(HOODIE_AUTO_COMMIT_PROP), HOODIE_AUTO_COMMIT_PROP, - DEFAULT_HOODIE_AUTO_COMMIT); - setDefaultOnCondition(props, !props.containsKey(HOODIE_ASSUME_DATE_PARTITIONING_PROP), - HOODIE_ASSUME_DATE_PARTITIONING_PROP, DEFAULT_ASSUME_DATE_PARTITIONING); - setDefaultOnCondition(props, !props.containsKey(HOODIE_WRITE_STATUS_CLASS_PROP), HOODIE_WRITE_STATUS_CLASS_PROP, - DEFAULT_HOODIE_WRITE_STATUS_CLASS); - setDefaultOnCondition(props, !props.containsKey(FINALIZE_WRITE_PARALLELISM), FINALIZE_WRITE_PARALLELISM, - DEFAULT_FINALIZE_WRITE_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(MARKERS_DELETE_PARALLELISM), MARKERS_DELETE_PARALLELISM, - DEFAULT_MARKERS_DELETE_PARALLELISM); - setDefaultOnCondition(props, !props.containsKey(EMBEDDED_TIMELINE_SERVER_ENABLED), - EMBEDDED_TIMELINE_SERVER_ENABLED, DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED); - setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP), - INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS)); - setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP), - MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS)); - setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), MAX_CONSISTENCY_CHECKS_PROP, - String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS)); - setDefaultOnCondition(props, 
!props.containsKey(FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP), - FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP, DEFAULT_FAIL_ON_TIMELINE_ARCHIVING_ENABLED); - setDefaultOnCondition(props, !props.containsKey(AVRO_SCHEMA_VALIDATE), AVRO_SCHEMA_VALIDATE, DEFAULT_AVRO_SCHEMA_VALIDATE); - setDefaultOnCondition(props, !props.containsKey(BULKINSERT_SORT_MODE), - BULKINSERT_SORT_MODE, DEFAULT_BULKINSERT_SORT_MODE); - setDefaultOnCondition(props, !props.containsKey(MERGE_DATA_VALIDATION_CHECK_ENABLED), - MERGE_DATA_VALIDATION_CHECK_ENABLED, DEFAULT_MERGE_DATA_VALIDATION_CHECK_ENABLED); - + writeConfig.setDefaults(HoodieWriteConfig.class.getName()); + // Set default values of HoodieHBaseIndexConfig + writeConfig.setDefaults(HoodieHBaseIndexConfig.class.getName()); // Make sure the props is propagated - setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().withEngineType(engineType).fromProperties(props).build()); - setDefaultOnCondition(props, !isStorageConfigSet, HoodieStorageConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isCompactionConfigSet, - HoodieCompactionConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isBootstrapConfigSet, - HoodieBootstrapConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isMemoryConfigSet, HoodieMemoryConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isViewConfigSet, - FileSystemViewStorageConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isConsistencyGuardSet, - ConsistencyGuardConfig.newBuilder().fromProperties(props).build()); - setDefaultOnCondition(props, !isCallbackConfigSet, - HoodieWriteCommitCallbackConfig.newBuilder().fromProperties(props).build()); - - setDefaultOnCondition(props, !props.containsKey(EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION), - EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION, DEFAULT_EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION); - setDefaultOnCondition(props, !props.containsKey(TIMELINE_LAYOUT_VERSION), TIMELINE_LAYOUT_VERSION, - String.valueOf(TimelineLayoutVersion.CURR_VERSION)); + writeConfig.setDefaultOnCondition( + !isIndexConfigSet, HoodieIndexConfig.newBuilder().withEngineType(engineType).fromProperties( + writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isStorageConfigSet, HoodieStorageConfig.newBuilder().fromProperties( + writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isCompactionConfigSet, + HoodieCompactionConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isCleanConfigSet, + HoodieCleanConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isArchivalConfigSet, + HoodieArchivalConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isClusteringConfigSet, + HoodieClusteringConfig.newBuilder().withEngineType(engineType) + .fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties( + writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isBootstrapConfigSet, + HoodieBootstrapConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isMemoryConfigSet, 
HoodieMemoryConfig.newBuilder().fromProperties( + writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isViewConfigSet, + FileSystemViewStorageConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isConsistencyGuardSet, + ConsistencyGuardConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isCallbackConfigSet, + HoodieWriteCommitCallbackConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isPayloadConfigSet, + HoodiePayloadConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isMetadataConfigSet, + HoodieMetadataConfig.newBuilder().withEngineType(engineType).fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isPreCommitValidationConfigSet, + HoodiePreCommitValidatorConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isLayoutConfigSet, + HoodieLayoutConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultValue(TIMELINE_LAYOUT_VERSION_NUM, String.valueOf(TimelineLayoutVersion.CURR_VERSION)); + + // isLockProviderPropertySet must be fetched before setting defaults of HoodieLockConfig + final TypedProperties writeConfigProperties = writeConfig.getProps(); + final boolean isLockProviderPropertySet = writeConfigProperties.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME) + || writeConfigProperties.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_PROP); + writeConfig.setDefaultOnCondition(!isLockConfigSet, + HoodieLockConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + + autoAdjustConfigsForConcurrencyMode(isLockProviderPropertySet); + } + + private void autoAdjustConfigsForConcurrencyMode(boolean isLockProviderPropertySet) { + if (writeConfig.isAutoAdjustLockConfigs()) { + // auto adjustment is required only for deltastreamer and spark streaming where async table services can be executed in the same JVM. + boolean isMetadataTableEnabled = writeConfig.getBoolean(HoodieMetadataConfig.ENABLE); + + if (isMetadataTableEnabled) { + // When metadata table is enabled, optimistic concurrency control must be used for + // single writer with async table services. + // Async table services can update the metadata table and a lock provider is + // needed to guard against any concurrent table write operations. If user has + // not configured any lock provider, let's use the InProcess lock provider. 
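[Hedged editorial sketch, not a hunk of this patch: the comment above describes the adjustment in terms of internal config writes. For context, the equivalent explicit user configuration for a single writer with async table services would look roughly like the snippet below. It assumes the pre-existing HoodieWriteConfig.newBuilder() entry point, HoodieLockConfig.Builder#withLockProvider, and the listed package names, none of which appear in this diff; the base path and table name are placeholders.]

// Hedged sketch: settings a user would pass explicitly to get the same behavior the
// auto-adjustment applies (optimistic concurrency control + in-process lock provider).
// Package names and newBuilder()/withLockProvider() are assumed from the existing API.
import org.apache.hudi.client.transaction.lock.InProcessLockProvider;
import org.apache.hudi.common.model.WriteConcurrencyMode;
import org.apache.hudi.config.HoodieLockConfig;
import org.apache.hudi.config.HoodieWriteConfig;

class SingleWriterAsyncServicesExample {
  static HoodieWriteConfig exampleConfig(String basePath) {
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .forTable("example_table")                               // placeholder table name
        .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL)
        .withLockConfig(HoodieLockConfig.newBuilder()
            .withLockProvider(InProcessLockProvider.class)       // assumed builder method
            .build())
        .build();
  }
}

[Note that the patch only overrides WRITE_CONCURRENCY_MODE and HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME when no lock provider was supplied, so an explicit user setting like the one sketched above is never clobbered.]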
+ boolean areTableServicesEnabled = writeConfig.areTableServicesEnabled(); + boolean areAsyncTableServicesEnabled = writeConfig.areAnyTableServicesAsync(); + if (!isLockProviderPropertySet && areTableServicesEnabled && areAsyncTableServicesEnabled) { + // This is targeted at Single writer with async table services + // If user does not set the lock provider, likely that the concurrency mode is not set either + // Override the configs for metadata table + writeConfig.setValue(WRITE_CONCURRENCY_MODE.key(), + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.value()); + writeConfig.setValue(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), + InProcessLockProvider.class.getName()); + LOG.info(String.format("Automatically set %s=%s and %s=%s since user has not set the " + + "lock provider for single writer with async table services", + WRITE_CONCURRENCY_MODE.key(), WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.value(), + HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), InProcessLockProvider.class.getName())); + } + } + } + + // We check if "hoodie.cleaner.policy.failed.writes" + // is properly set to LAZY for optimistic concurrency control + String writeConcurrencyMode = writeConfig.getString(WRITE_CONCURRENCY_MODE); + if (WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.value() + .equalsIgnoreCase(writeConcurrencyMode)) { + // In this case, we assume that the user takes care of setting the lock provider used + writeConfig.setValue(HoodieCleanConfig.FAILED_WRITES_CLEANER_POLICY.key(), + HoodieFailedWritesCleaningPolicy.LAZY.name()); + LOG.info(String.format("Automatically set %s=%s since optimistic concurrency control is used", + HoodieCleanConfig.FAILED_WRITES_CLEANER_POLICY.key(), + HoodieFailedWritesCleaningPolicy.LAZY.name())); + } } private void validate() { - String layoutVersion = props.getProperty(TIMELINE_LAYOUT_VERSION); + String layoutVersion = writeConfig.getString(TIMELINE_LAYOUT_VERSION_NUM); // Ensure Layout Version is good new TimelineLayoutVersion(Integer.parseInt(layoutVersion)); - Objects.requireNonNull(props.getProperty(BASE_PATH_PROP)); + Objects.requireNonNull(writeConfig.getString(BASE_PATH)); + if (writeConfig.getString(WRITE_CONCURRENCY_MODE) + .equalsIgnoreCase(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.value())) { + ValidationUtils.checkArgument(!writeConfig.getString(HoodieCleanConfig.FAILED_WRITES_CLEANER_POLICY) + .equals(HoodieFailedWritesCleaningPolicy.EAGER.name()), "To enable optimistic concurrency control, set hoodie.cleaner.policy.failed.writes=LAZY"); + } + + HoodieCleaningPolicy.valueOf(writeConfig.getString(CLEANER_POLICY)); + // Ensure minInstantsToKeep > cleanerCommitsRetained, otherwise we will archive some + // commit instant on timeline, that still has not been cleaned. 
Could miss some data via incr pull + int minInstantsToKeep = Integer.parseInt(writeConfig.getStringOrDefault(HoodieArchivalConfig.MIN_COMMITS_TO_KEEP)); + int maxInstantsToKeep = Integer.parseInt(writeConfig.getStringOrDefault(HoodieArchivalConfig.MAX_COMMITS_TO_KEEP)); + int cleanerCommitsRetained = + Integer.parseInt(writeConfig.getStringOrDefault(HoodieCleanConfig.CLEANER_COMMITS_RETAINED)); + ValidationUtils.checkArgument(maxInstantsToKeep > minInstantsToKeep, + String.format( + "Increase %s=%d to be greater than %s=%d.", + HoodieArchivalConfig.MAX_COMMITS_TO_KEEP.key(), maxInstantsToKeep, + HoodieArchivalConfig.MIN_COMMITS_TO_KEEP.key(), minInstantsToKeep)); + ValidationUtils.checkArgument(minInstantsToKeep > cleanerCommitsRetained, + String.format( + "Increase %s=%d to be greater than %s=%d. Otherwise, there is risk of incremental pull " + + "missing data from few instants.", + HoodieArchivalConfig.MIN_COMMITS_TO_KEEP.key(), minInstantsToKeep, + HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key(), cleanerCommitsRetained)); + + boolean inlineCompact = writeConfig.getBoolean(HoodieCompactionConfig.INLINE_COMPACT); + boolean inlineCompactSchedule = writeConfig.getBoolean(HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT); + ValidationUtils.checkArgument(!(inlineCompact && inlineCompactSchedule), String.format("Either of inline compaction (%s) or " + + "schedule inline compaction (%s) can be enabled. Both can't be set to true at the same time. %s, %s", HoodieCompactionConfig.INLINE_COMPACT.key(), + HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT.key(), inlineCompact, inlineCompactSchedule)); } public HoodieWriteConfig build() { setDefaults(); validate(); // Build WriteConfig at the end - HoodieWriteConfig config = new HoodieWriteConfig(engineType, props); - return config; + return new HoodieWriteConfig(engineType, writeConfig.getProps()); + } + + private String getDefaultMarkersType(EngineType engineType) { + switch (engineType) { + case SPARK: + return MarkerType.TIMELINE_SERVER_BASED.toString(); + case FLINK: + case JAVA: + // Timeline-server-based marker is not supported for Flink and Java engines + return MarkerType.DIRECT.toString(); + default: + throw new HoodieNotSupportedException("Unsupported engine " + engineType); + } } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsCloudWatchConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsCloudWatchConfig.java new file mode 100644 index 0000000000000..3c4b860e69230 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsCloudWatchConfig.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.util.Properties; + +@ConfigClassProperty( + name = "Metrics Configurations for Amazon CloudWatch", + groupName = ConfigGroups.Names.METRICS, + description = + "Enables reporting on Hudi metrics using Amazon CloudWatch. " + + " Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsCloudWatchConfig extends HoodieConfig { + + public static final String CLOUDWATCH_PREFIX = "hoodie.metrics.cloudwatch"; + + public static final ConfigProperty REPORT_PERIOD_SECONDS = ConfigProperty + .key(CLOUDWATCH_PREFIX + ".report.period.seconds") + .defaultValue(60) + .sinceVersion("0.10.0") + .withDocumentation("Reporting interval in seconds"); + + public static final ConfigProperty METRIC_PREFIX = ConfigProperty + .key(CLOUDWATCH_PREFIX + ".metric.prefix") + .defaultValue("") + .sinceVersion("0.10.0") + .withDocumentation("Metric prefix of reporter"); + + public static final ConfigProperty METRIC_NAMESPACE = ConfigProperty + .key(CLOUDWATCH_PREFIX + ".namespace") + .defaultValue("Hudi") + .sinceVersion("0.10.0") + .withDocumentation("Namespace of reporter"); + /* + Amazon CloudWatch allows a maximum of 20 metrics per request. Choosing this as the default maximum. + Reference: https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutMetricData.html + */ + public static final ConfigProperty MAX_DATUMS_PER_REQUEST = + ConfigProperty.key(CLOUDWATCH_PREFIX + ".maxDatumsPerRequest") + .defaultValue(20) + .sinceVersion("0.10.0") + .withDocumentation("Max number of Datums per request"); + + public HoodieMetricsCloudWatchConfig() { + super(); + } + + static Builder newBuilder() { + return new Builder(); + } + + static class Builder { + + private final HoodieMetricsCloudWatchConfig hoodieMetricsCloudWatchConfig = new HoodieMetricsCloudWatchConfig(); + + public HoodieMetricsCloudWatchConfig.Builder fromProperties(Properties props) { + this.hoodieMetricsCloudWatchConfig.getProps().putAll(props); + return this; + } + + public HoodieMetricsCloudWatchConfig build() { + hoodieMetricsCloudWatchConfig.setDefaults(HoodieMetricsCloudWatchConfig.class.getName()); + return hoodieMetricsCloudWatchConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java new file mode 100644 index 0000000000000..957b439051a81 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.metrics.MetricsReporterType; + +import javax.annotation.concurrent.Immutable; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Fetch the configurations used by the Metrics system. + */ +@Immutable +@ConfigClassProperty(name = "Metrics Configurations", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics. Hudi publishes metrics on " + + "every commit, clean, rollback etc. The following sections list the supported reporters.") +public class HoodieMetricsConfig extends HoodieConfig { + + public static final String METRIC_PREFIX = "hoodie.metrics"; + + public static final ConfigProperty TURN_METRICS_ON = ConfigProperty + .key(METRIC_PREFIX + ".on") + .defaultValue(false) + .sinceVersion("0.5.0") + .withDocumentation("Turn on/off metrics reporting. off by default."); + + public static final ConfigProperty METRICS_REPORTER_TYPE_VALUE = ConfigProperty + .key(METRIC_PREFIX + ".reporter.type") + .defaultValue(MetricsReporterType.GRAPHITE) + .sinceVersion("0.5.0") + .withDocumentation("Type of metrics reporter."); + + // User defined + public static final ConfigProperty METRICS_REPORTER_CLASS_NAME = ConfigProperty + .key(METRIC_PREFIX + ".reporter.class") + .defaultValue("") + .sinceVersion("0.6.0") + .withDocumentation(""); + + public static final ConfigProperty METRICS_REPORTER_PREFIX = ConfigProperty + .key(METRIC_PREFIX + ".reporter.metricsname.prefix") + .defaultValue("") + .sinceVersion("0.11.0") + .withInferFunction(cfg -> { + if (cfg.contains(HoodieTableConfig.NAME)) { + return Option.of(cfg.getString(HoodieTableConfig.NAME)); + } + return Option.empty(); + }) + .withDocumentation("The prefix given to the metrics names."); + + // Enable metrics collection from executors + public static final ConfigProperty EXECUTOR_METRICS_ENABLE = ConfigProperty + .key(METRIC_PREFIX + ".executor.enable") + .noDefaultValue() + .sinceVersion("0.7.0") + .withDocumentation(""); + + public static final ConfigProperty LOCK_METRICS_ENABLE = ConfigProperty + .key(METRIC_PREFIX + ".lock.enable") + .defaultValue(false) + .withInferFunction(cfg -> { + if (cfg.contains(TURN_METRICS_ON)) { + return Option.of(cfg.getBoolean(TURN_METRICS_ON)); + } + return Option.empty(); + }) + .withDocumentation("Enable metrics for locking infra. 
Useful when operating in multiwriter mode"); + + /** + * @deprecated Use {@link #TURN_METRICS_ON} and its methods instead + */ + @Deprecated + public static final String METRICS_ON = TURN_METRICS_ON.key(); + /** + * @deprecated Use {@link #TURN_METRICS_ON} and its methods instead + */ + @Deprecated + public static final boolean DEFAULT_METRICS_ON = TURN_METRICS_ON.defaultValue(); + /** + * @deprecated Use {@link #METRICS_REPORTER_TYPE_VALUE} and its methods instead + */ + @Deprecated + public static final String METRICS_REPORTER_TYPE = METRICS_REPORTER_TYPE_VALUE.key(); + /** + * @deprecated Use {@link #METRICS_REPORTER_TYPE_VALUE} and its methods instead + */ + @Deprecated + public static final MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = METRICS_REPORTER_TYPE_VALUE.defaultValue(); + /** + * @deprecated Use {@link #METRICS_REPORTER_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String METRICS_REPORTER_CLASS = METRICS_REPORTER_CLASS_NAME.key(); + /** + * @deprecated Use {@link #METRICS_REPORTER_CLASS_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_METRICS_REPORTER_CLASS = METRICS_REPORTER_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #EXECUTOR_METRICS_ENABLE} and its methods instead + */ + @Deprecated + public static final String ENABLE_EXECUTOR_METRICS = EXECUTOR_METRICS_ENABLE.key(); + + private HoodieMetricsConfig() { + super(); + } + + public static HoodieMetricsConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final HoodieMetricsConfig hoodieMetricsConfig = new HoodieMetricsConfig(); + + public Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.hoodieMetricsConfig.getProps().load(reader); + return this; + } + } + + public Builder fromProperties(Properties props) { + this.hoodieMetricsConfig.getProps().putAll(props); + return this; + } + + public Builder on(boolean metricsOn) { + hoodieMetricsConfig.setValue(TURN_METRICS_ON, String.valueOf(metricsOn)); + return this; + } + + public Builder withReporterType(String reporterType) { + hoodieMetricsConfig.setValue(METRICS_REPORTER_TYPE_VALUE, reporterType); + return this; + } + + public Builder withReporterClass(String className) { + hoodieMetricsConfig.setValue(METRICS_REPORTER_CLASS_NAME, className); + return this; + } + + public Builder withExecutorMetrics(boolean enable) { + hoodieMetricsConfig.setValue(EXECUTOR_METRICS_ENABLE, String.valueOf(enable)); + return this; + } + + public Builder withLockingMetrics(boolean enable) { + hoodieMetricsConfig.setValue(LOCK_METRICS_ENABLE, String.valueOf(enable)); + return this; + } + + public HoodieMetricsConfig build() { + + hoodieMetricsConfig.setDefaults(HoodieMetricsConfig.class.getName()); + + MetricsReporterType reporterType = MetricsReporterType.valueOf(hoodieMetricsConfig.getString(METRICS_REPORTER_TYPE_VALUE)); + + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.DATADOG, + HoodieMetricsDatadogConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.PROMETHEUS_PUSHGATEWAY, + HoodieMetricsPrometheusConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.PROMETHEUS, + 
HoodieMetricsPrometheusConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.JMX, + HoodieMetricsJmxConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.GRAPHITE, + HoodieMetricsGraphiteConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.CLOUDWATCH, + HoodieMetricsCloudWatchConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + return hoodieMetricsConfig; + } + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsDatadogConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsDatadogConfig.java new file mode 100644 index 0000000000000..3fc306b8cdd3b --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsDatadogConfig.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import javax.annotation.concurrent.Immutable; + +import java.util.Properties; + +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; + +/** + * Configs for Datadog reporter type. + *
<p>
+ * {@link org.apache.hudi.metrics.MetricsReporterType#DATADOG} + */ +@Immutable +@ConfigClassProperty(name = "Metrics Configurations for Datadog reporter", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics using the Datadog reporter type. " + + "Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsDatadogConfig extends HoodieConfig { + + public static final String DATADOG_PREFIX = METRIC_PREFIX + ".datadog"; + + public static final ConfigProperty REPORT_PERIOD_IN_SECONDS = ConfigProperty + .key(DATADOG_PREFIX + ".report.period.seconds") + .defaultValue(30) + .sinceVersion("0.6.0") + .withDocumentation("Datadog reporting period in seconds. Default to 30."); + + public static final ConfigProperty API_SITE_VALUE = ConfigProperty + .key(DATADOG_PREFIX + ".api.site") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Datadog API site: EU or US"); + + public static final ConfigProperty API_KEY = ConfigProperty + .key(DATADOG_PREFIX + ".api.key") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Datadog API key"); + + public static final ConfigProperty API_KEY_SKIP_VALIDATION = ConfigProperty + .key(DATADOG_PREFIX + ".api.key.skip.validation") + .defaultValue(false) + .sinceVersion("0.6.0") + .withDocumentation("Before sending metrics via Datadog API, whether to skip validating Datadog API key or not. " + + "Default to false."); + + public static final ConfigProperty API_KEY_SUPPLIER = ConfigProperty + .key(DATADOG_PREFIX + ".api.key.supplier") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Datadog API key supplier to supply the API key at runtime. " + + "This will take effect if hoodie.metrics.datadog.api.key is not set."); + + public static final ConfigProperty API_TIMEOUT_IN_SECONDS = ConfigProperty + .key(DATADOG_PREFIX + ".api.timeout.seconds") + .defaultValue(3) + .sinceVersion("0.6.0") + .withDocumentation("Datadog API timeout in seconds. Default to 3."); + + public static final ConfigProperty METRIC_PREFIX_VALUE = ConfigProperty + .key(DATADOG_PREFIX + ".metric.prefix") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Datadog metric prefix to be prepended to each metric name with a dot as delimiter. " + + "For example, if it is set to foo, foo. 
will be prepended."); + + public static final ConfigProperty METRIC_HOST_NAME = ConfigProperty + .key(DATADOG_PREFIX + ".metric.host") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Datadog metric host to be sent along with metrics data."); + + public static final ConfigProperty METRIC_TAG_VALUES = ConfigProperty + .key(DATADOG_PREFIX + ".metric.tags") + .noDefaultValue() + .sinceVersion("0.6.0") + .withDocumentation("Datadog metric tags (comma-delimited) to be sent along with metrics data."); + + /** + * @deprecated Use {@link #REPORT_PERIOD_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final String DATADOG_REPORT_PERIOD_SECONDS = REPORT_PERIOD_IN_SECONDS.key(); + /** + * @deprecated Use {@link #REPORT_PERIOD_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final int DEFAULT_DATADOG_REPORT_PERIOD_SECONDS = REPORT_PERIOD_IN_SECONDS.defaultValue(); + /** + * @deprecated Use {@link #API_SITE_VALUE} and its methods instead + */ + @Deprecated + public static final String DATADOG_API_SITE = API_SITE_VALUE.key(); + /** + * @deprecated Use {@link #API_KEY} and its methods instead + */ + @Deprecated + public static final String DATADOG_API_KEY = API_KEY.key(); + /** + * @deprecated Use {@link #API_KEY_SKIP_VALIDATION} and its methods instead + */ + @Deprecated + public static final String DATADOG_API_KEY_SKIP_VALIDATION = API_KEY_SKIP_VALIDATION.key(); + /** + * @deprecated Use {@link #API_KEY_SKIP_VALIDATION} and its methods instead + */ + @Deprecated + public static final boolean DEFAULT_DATADOG_API_KEY_SKIP_VALIDATION = API_KEY_SKIP_VALIDATION.defaultValue(); + /** + * @deprecated Use {@link #API_KEY_SUPPLIER} and its methods instead + */ + @Deprecated + public static final String DATADOG_API_KEY_SUPPLIER = API_KEY_SUPPLIER.key(); + /** + * @deprecated Use {@link #API_TIMEOUT_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final String DATADOG_API_TIMEOUT_SECONDS = API_TIMEOUT_IN_SECONDS.key(); + /** + * @deprecated Use {@link #API_TIMEOUT_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final int DEFAULT_DATADOG_API_TIMEOUT_SECONDS = API_TIMEOUT_IN_SECONDS.defaultValue(); + /** + * @deprecated Use {@link #METRIC_PREFIX_VALUE} and its methods instead + */ + @Deprecated + public static final String DATADOG_METRIC_PREFIX = METRIC_PREFIX_VALUE.key(); + /** + * @deprecated Use {@link #METRIC_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String DATADOG_METRIC_HOST = METRIC_HOST_NAME.key(); + /** + * @deprecated Use {@link #METRIC_TAG_VALUES} and its methods instead + */ + @Deprecated + public static final String DATADOG_METRIC_TAGS = METRIC_TAG_VALUES.key(); + + private HoodieMetricsDatadogConfig() { + super(); + } + + public static HoodieMetricsDatadogConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + + private final HoodieMetricsDatadogConfig metricsDatadogConfig = new HoodieMetricsDatadogConfig(); + + public Builder fromProperties(Properties props) { + this.metricsDatadogConfig.getProps().putAll(props); + return this; + } + + public Builder withDatadogReportPeriodSeconds(int period) { + metricsDatadogConfig.setValue(REPORT_PERIOD_IN_SECONDS, String.valueOf(period)); + return this; + } + + public Builder withDatadogApiSite(String apiSite) { + metricsDatadogConfig.setValue(API_SITE_VALUE, apiSite); + return this; + } + + public Builder withDatadogApiKey(String apiKey) { + metricsDatadogConfig.setValue(API_KEY, 
apiKey); + return this; + } + + public Builder withDatadogApiKeySkipValidation(boolean skip) { + metricsDatadogConfig.setValue(API_KEY_SKIP_VALIDATION, String.valueOf(skip)); + return this; + } + + public Builder withDatadogApiKeySupplier(String apiKeySupplier) { + metricsDatadogConfig.setValue(API_KEY_SUPPLIER, apiKeySupplier); + return this; + } + + public Builder withDatadogApiTimeoutSeconds(int timeout) { + metricsDatadogConfig.setValue(API_TIMEOUT_IN_SECONDS, String.valueOf(timeout)); + return this; + } + + public Builder withDatadogPrefix(String prefix) { + metricsDatadogConfig.setValue(METRIC_PREFIX_VALUE, prefix); + return this; + } + + public Builder withDatadogHost(String host) { + metricsDatadogConfig.setValue(METRIC_HOST_NAME, host); + return this; + } + + public Builder withDatadogTags(String tags) { + metricsDatadogConfig.setValue(METRIC_TAG_VALUES, tags); + return this; + } + + public HoodieMetricsDatadogConfig build() { + metricsDatadogConfig.setDefaults(HoodieMetricsDatadogConfig.class.getName()); + return metricsDatadogConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsGraphiteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsGraphiteConfig.java new file mode 100644 index 0000000000000..25c4c6af4a4c2 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsGraphiteConfig.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; + +/** + * Configs for Graphite reporter type. + *
<p>
+ * {@link org.apache.hudi.metrics.MetricsReporterType#GRAPHITE} + */ +@ConfigClassProperty(name = "Metrics Configurations for Graphite", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics using Graphite. " + + " Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsGraphiteConfig extends HoodieConfig { + + public static final String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; + + public static final ConfigProperty GRAPHITE_SERVER_HOST_NAME = ConfigProperty + .key(GRAPHITE_PREFIX + ".host") + .defaultValue("localhost") + .sinceVersion("0.5.0") + .withDocumentation("Graphite host to connect to."); + + public static final ConfigProperty GRAPHITE_SERVER_PORT_NUM = ConfigProperty + .key(GRAPHITE_PREFIX + ".port") + .defaultValue(4756) + .sinceVersion("0.5.0") + .withDocumentation("Graphite port to connect to."); + + public static final ConfigProperty GRAPHITE_METRIC_PREFIX_VALUE = ConfigProperty + .key(GRAPHITE_PREFIX + ".metric.prefix") + .noDefaultValue() + .sinceVersion("0.5.1") + .withDocumentation("Standard prefix applied to all metrics. This helps to add datacenter, environment information for e.g"); + + public static final ConfigProperty GRAPHITE_REPORT_PERIOD_IN_SECONDS = ConfigProperty + .key(GRAPHITE_PREFIX + ".report.period.seconds") + .defaultValue(30) + .sinceVersion("0.10.0") + .withDocumentation("Graphite reporting period in seconds. Default to 30."); + + /** + * @deprecated Use {@link #GRAPHITE_SERVER_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String GRAPHITE_SERVER_HOST = GRAPHITE_SERVER_HOST_NAME.key(); + /** + * @deprecated Use {@link #GRAPHITE_SERVER_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_GRAPHITE_SERVER_HOST = GRAPHITE_SERVER_HOST_NAME.defaultValue(); + /** + * @deprecated Use {@link #GRAPHITE_SERVER_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String GRAPHITE_SERVER_PORT = GRAPHITE_SERVER_PORT_NUM.key(); + /** + * @deprecated Use {@link #GRAPHITE_SERVER_PORT_NUM} and its methods instead + */ + @Deprecated + public static final int DEFAULT_GRAPHITE_SERVER_PORT = GRAPHITE_SERVER_PORT_NUM.defaultValue(); + /** + * @deprecated Use {@link #GRAPHITE_METRIC_PREFIX_VALUE} and its methods instead + */ + @Deprecated + public static final String GRAPHITE_METRIC_PREFIX = GRAPHITE_METRIC_PREFIX_VALUE.key(); + + private HoodieMetricsGraphiteConfig() { + super(); + } + + public static HoodieMetricsGraphiteConfig.Builder newBuilder() { + return new HoodieMetricsGraphiteConfig.Builder(); + } + + public static class Builder { + + private final HoodieMetricsGraphiteConfig hoodieMetricsGraphiteConfig = new HoodieMetricsGraphiteConfig(); + + public HoodieMetricsGraphiteConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.hoodieMetricsGraphiteConfig.getProps().load(reader); + return this; + } + } + + public HoodieMetricsGraphiteConfig.Builder fromProperties(Properties props) { + this.hoodieMetricsGraphiteConfig.getProps().putAll(props); + return this; + } + + public HoodieMetricsGraphiteConfig.Builder toGraphiteHost(String host) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_SERVER_HOST_NAME, host); + return this; + } + + public HoodieMetricsGraphiteConfig.Builder onGraphitePort(int port) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_SERVER_PORT_NUM, String.valueOf(port)); + return this; + } + + public 
HoodieMetricsGraphiteConfig.Builder usePrefix(String prefix) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_METRIC_PREFIX_VALUE, prefix); + return this; + } + + public HoodieMetricsGraphiteConfig.Builder periodSeconds(String periodSeconds) { + hoodieMetricsGraphiteConfig.setValue(GRAPHITE_REPORT_PERIOD_IN_SECONDS, periodSeconds); + return this; + } + + public HoodieMetricsGraphiteConfig build() { + hoodieMetricsGraphiteConfig.setDefaults(HoodieMetricsGraphiteConfig.class.getName()); + return hoodieMetricsGraphiteConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsJmxConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsJmxConfig.java new file mode 100644 index 0000000000000..e3a57a1c5caf4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsJmxConfig.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; + +/** + * Configs for Jmx reporter type. + *
<p>
+ * {@link org.apache.hudi.metrics.MetricsReporterType#JMX} + */ +@ConfigClassProperty(name = "Metrics Configurations for Jmx", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics using Jmx. " + + " Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsJmxConfig extends HoodieConfig { + + public static final String JMX_PREFIX = METRIC_PREFIX + ".jmx"; + + public static final ConfigProperty JMX_HOST_NAME = ConfigProperty + .key(JMX_PREFIX + ".host") + .defaultValue("localhost") + .sinceVersion("0.5.1") + .withDocumentation("Jmx host to connect to"); + + public static final ConfigProperty JMX_PORT_NUM = ConfigProperty + .key(JMX_PREFIX + ".port") + .defaultValue(9889) + .sinceVersion("0.5.1") + .withDocumentation("Jmx port to connect to"); + + /** + * @deprecated Use {@link #JMX_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String JMX_HOST = JMX_HOST_NAME.key(); + /** + * @deprecated Use {@link #JMX_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_JMX_HOST = JMX_HOST_NAME.defaultValue(); + /** + * @deprecated Use {@link #JMX_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String JMX_PORT = JMX_PORT_NUM.key(); + /** + * @deprecated Use {@link #JMX_PORT_NUM} and its methods instead + */ + @Deprecated + public static final int DEFAULT_JMX_PORT = JMX_PORT_NUM.defaultValue(); + + private HoodieMetricsJmxConfig() { + super(); + } + + public static HoodieMetricsJmxConfig.Builder newBuilder() { + return new HoodieMetricsJmxConfig.Builder(); + } + + public static class Builder { + + private final HoodieMetricsJmxConfig hoodieMetricsJmxConfig = new HoodieMetricsJmxConfig(); + + public HoodieMetricsJmxConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.hoodieMetricsJmxConfig.getProps().load(reader); + return this; + } + } + + public HoodieMetricsJmxConfig.Builder fromProperties(Properties props) { + this.hoodieMetricsJmxConfig.getProps().putAll(props); + return this; + } + + public HoodieMetricsJmxConfig.Builder toJmxHost(String host) { + hoodieMetricsJmxConfig.setValue(JMX_HOST_NAME, host); + return this; + } + + public HoodieMetricsJmxConfig.Builder onJmxPort(String port) { + hoodieMetricsJmxConfig.setValue(JMX_PORT_NUM, port); + return this; + } + + public HoodieMetricsJmxConfig build() { + hoodieMetricsJmxConfig.setDefaults(HoodieMetricsJmxConfig.class.getName()); + return hoodieMetricsJmxConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsPrometheusConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsPrometheusConfig.java new file mode 100644 index 0000000000000..e27ff1bcb0897 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsPrometheusConfig.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.util.Properties; + +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; + +/** + * Configs for Prometheus/Pushgaeway reporter type. + *
<p>
+ * {@link org.apache.hudi.metrics.MetricsReporterType#PROMETHEUS} + * {@link org.apache.hudi.metrics.MetricsReporterType#PROMETHEUS_PUSHGATEWAY} + */ +@ConfigClassProperty(name = "Metrics Configurations for Prometheus", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics using Prometheus. " + + " Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsPrometheusConfig extends HoodieConfig { + + // Prometheus PushGateWay + public static final String PUSHGATEWAY_PREFIX = METRIC_PREFIX + ".pushgateway"; + + public static final ConfigProperty PUSHGATEWAY_HOST_NAME = ConfigProperty + .key(PUSHGATEWAY_PREFIX + ".host") + .defaultValue("localhost") + .sinceVersion("0.6.0") + .withDocumentation("Hostname of the prometheus push gateway."); + + public static final ConfigProperty PUSHGATEWAY_PORT_NUM = ConfigProperty + .key(PUSHGATEWAY_PREFIX + ".port") + .defaultValue(9091) + .sinceVersion("0.6.0") + .withDocumentation("Port for the push gateway."); + + public static final ConfigProperty PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS = ConfigProperty + .key(PUSHGATEWAY_PREFIX + ".report.period.seconds") + .defaultValue(30) + .sinceVersion("0.6.0") + .withDocumentation("Reporting interval in seconds."); + + public static final ConfigProperty PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE = ConfigProperty + .key(PUSHGATEWAY_PREFIX + ".delete.on.shutdown") + .defaultValue(true) + .sinceVersion("0.6.0") + .withDocumentation("Delete the pushgateway info or not when job shutdown, true by default."); + + public static final ConfigProperty PUSHGATEWAY_JOBNAME = ConfigProperty + .key(PUSHGATEWAY_PREFIX + ".job.name") + .defaultValue("") + .sinceVersion("0.6.0") + .withDocumentation("Name of the push gateway job."); + + public static final ConfigProperty PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX = ConfigProperty + .key(PUSHGATEWAY_PREFIX + ".random.job.name.suffix") + .defaultValue(true) + .sinceVersion("0.6.0") + .withDocumentation("Whether the pushgateway name need a random suffix , default true."); + + // Prometheus HttpServer + public static final String PROMETHEUS_PREFIX = METRIC_PREFIX + ".prometheus"; + + public static final ConfigProperty PROMETHEUS_PORT_NUM = ConfigProperty + .key(PROMETHEUS_PREFIX + ".port") + .defaultValue(9090) + .sinceVersion("0.6.0") + .withDocumentation("Port for prometheus server."); + + /** + * @deprecated Use {@link #PUSHGATEWAY_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String PUSHGATEWAY_HOST = PUSHGATEWAY_HOST_NAME.key(); + /** + * @deprecated Use {@link #PUSHGATEWAY_HOST_NAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PUSHGATEWAY_HOST = PUSHGATEWAY_HOST_NAME.defaultValue(); + /** + * @deprecated Use {@link #PUSHGATEWAY_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String PUSHGATEWAY_PORT = PUSHGATEWAY_PORT_NUM.key(); + /** + * @deprecated Use {@link #PUSHGATEWAY_PORT_NUM} and its methods instead + */ + @Deprecated + public static final int DEFAULT_PUSHGATEWAY_PORT = PUSHGATEWAY_PORT_NUM.defaultValue(); + /** + * @deprecated Use {@link #PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final String PUSHGATEWAY_REPORT_PERIOD_SECONDS = PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS.key(); + /** + * @deprecated Use {@link #PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS} and its methods instead + */ + @Deprecated + public static final int DEFAULT_PUSHGATEWAY_REPORT_PERIOD_SECONDS = 
PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS.defaultValue(); + /** + * @deprecated Use {@link #PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE} and its methods instead + */ + @Deprecated + public static final String PUSHGATEWAY_DELETE_ON_SHUTDOWN = PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE.key(); + /** + * @deprecated Use {@link #PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE} and its methods instead + */ + @Deprecated + public static final boolean DEFAULT_PUSHGATEWAY_DELETE_ON_SHUTDOWN = PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #PUSHGATEWAY_JOBNAME} and its methods instead + */ + @Deprecated + public static final String PUSHGATEWAY_JOB_NAME = PUSHGATEWAY_JOBNAME.key(); + /** + * @deprecated Use {@link #PUSHGATEWAY_JOBNAME} and its methods instead + */ + @Deprecated + public static final String DEFAULT_PUSHGATEWAY_JOB_NAME = PUSHGATEWAY_JOBNAME.defaultValue(); + /** + * @deprecated Use {@link #PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX} and its methods instead + */ + @Deprecated + public static final String PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX = PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX.key(); + /** + * @deprecated Use {@link #PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX} and its methods instead + */ + @Deprecated + public static final boolean DEFAULT_PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX = PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX.defaultValue(); + /** + * @deprecated Use {@link #PROMETHEUS_PORT_NUM} and its methods instead + */ + @Deprecated + public static final String PROMETHEUS_PORT = PROMETHEUS_PORT_NUM.key(); + /** + * @deprecated Use {@link #PROMETHEUS_PORT_NUM} and its methods instead + */ + @Deprecated + public static final int DEFAULT_PROMETHEUS_PORT = PROMETHEUS_PORT_NUM.defaultValue(); + + private HoodieMetricsPrometheusConfig() { + super(); + } + + public static HoodieMetricsPrometheusConfig.Builder newBuilder() { + return new HoodieMetricsPrometheusConfig.Builder(); + } + + public static class Builder { + + private HoodieMetricsPrometheusConfig hoodieMetricsPrometheusConfig = new HoodieMetricsPrometheusConfig(); + + public Builder fromProperties(Properties props) { + this.hoodieMetricsPrometheusConfig.getProps().putAll(props); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayHostName(String hostName) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_HOST_NAME, String.valueOf(hostName)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayPortNum(Integer pushgatewayPortNum) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_PORT_NUM, String.valueOf(pushgatewayPortNum)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayReportPeriodInSeconds(String periodTime) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS, periodTime); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayDeleteOnShutdownEnable(boolean deleteOnShutdownEnable) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE, String.valueOf(deleteOnShutdownEnable)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayJobname(String jobname) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_JOBNAME, jobname); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder withPushgatewayRandomJobnameSuffix(boolean randomJobnameSuffix) { + hoodieMetricsPrometheusConfig.setValue(PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX, String.valueOf(randomJobnameSuffix)); + return this; + } + + public HoodieMetricsPrometheusConfig.Builder 
withPrometheusPortNum(int prometheusPortNum) { + hoodieMetricsPrometheusConfig.setValue(PROMETHEUS_PORT_NUM, String.valueOf(prometheusPortNum)); + return this; + } + + public HoodieMetricsPrometheusConfig build() { + hoodieMetricsPrometheusConfig.setDefaults(HoodieMetricsPrometheusConfig.class.getName()); + return hoodieMetricsPrometheusConfig; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieClusteringException.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieClusteringException.java new file mode 100644 index 0000000000000..bb6aaa24777cf --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieClusteringException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +public class HoodieClusteringException extends HoodieException { + + public HoodieClusteringException(String msg) { + super(msg); + } + + public HoodieClusteringException(String msg, Throwable e) { + super(msg, e); + } +} + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieClusteringUpdateException.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieClusteringUpdateException.java new file mode 100644 index 0000000000000..68b62a5421706 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieClusteringUpdateException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
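For orientation, a minimal usage sketch of the builder introduced above (illustrative only; the job name is made up, and how the resulting HoodieMetricsPrometheusConfig is attached to the overall write config is outside this diff):

HoodieMetricsPrometheusConfig prometheusConfig = HoodieMetricsPrometheusConfig.newBuilder()
    .withPushgatewayHostName("localhost")
    .withPushgatewayPortNum(9091)
    // Note: this builder method takes the report period as a String.
    .withPushgatewayReportPeriodInSeconds("30")
    .withPushgatewayJobname("hudi-ingestion")
    .withPushgatewayRandomJobnameSuffix(true)
    .withPushgatewayDeleteOnShutdownEnable(true)
    .build();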
+ */ + +package org.apache.hudi.exception; + +public class HoodieClusteringUpdateException extends HoodieException { + public HoodieClusteringUpdateException(String msg) { + super(msg); + } + + public HoodieClusteringUpdateException(String msg, Throwable e) { + super(msg, e); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieDeletePartitionException.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieDeletePartitionException.java new file mode 100644 index 0000000000000..34eb734b32423 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieDeletePartitionException.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + *

+ * Exception thrown for any higher-level errors encountered while deleting partitions. + *

+ */ +public class HoodieDeletePartitionException extends HoodieException { + + public HoodieDeletePartitionException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieDeletePartitionException(String msg) { + super(msg); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java index 4c83ebc849bc4..f221f363771e3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieDependentSystemUnavailableException.java @@ -27,8 +27,8 @@ public class HoodieDependentSystemUnavailableException extends HoodieException { public static final String HBASE = "HBASE"; - public HoodieDependentSystemUnavailableException(String system, String connectURL) { - super(getLogMessage(system, connectURL)); + public HoodieDependentSystemUnavailableException(String system, String connectURL, Throwable t) { + super(getLogMessage(system, connectURL), t); } private static String getLogMessage(String system, String connectURL) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java index c6c9076f51bae..baad53aba5941 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java @@ -23,4 +23,8 @@ public class HoodieRestoreException extends HoodieException { public HoodieRestoreException(String msg, Throwable e) { super(msg, e); } + + public HoodieRestoreException(String msg) { + super(msg); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/CopyOnWriteInsertHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/CopyOnWriteInsertHandler.java index 8af72f351fff0..5e1f832b7f239 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/CopyOnWriteInsertHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/CopyOnWriteInsertHandler.java @@ -19,7 +19,7 @@ package org.apache.hudi.execution; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; @@ -71,7 +71,7 @@ public CopyOnWriteInsertHandler(HoodieWriteConfig config, String instantTime, public void consumeOneRecord(HoodieInsertValueGenResult payload) { final HoodieRecord insertPayload = payload.record; String partitionPath = insertPayload.getPartitionPath(); - HoodieWriteHandle handle = handles.get(partitionPath); + HoodieWriteHandle handle = handles.get(partitionPath); if (handle == null) { // If the records are sorted, this means that we encounter a new partition path // and the records for the previous partition path are all written, @@ -87,7 +87,7 @@ public void consumeOneRecord(HoodieInsertValueGenResult payload) { if (!handle.canWrite(payload.record)) { // Handle is full. 
Close the handle and add the WriteStatus - statuses.add(handle.close()); + statuses.addAll(handle.close()); // Open new handle handle = writeHandleFactory.create(config, instantTime, hoodieTable, insertPayload.getPartitionPath(), idPrefix, taskContextSupplier); @@ -108,8 +108,8 @@ public List getResult() { } private void closeOpenHandles() { - for (HoodieWriteHandle handle : handles.values()) { - statuses.add(handle.close()); + for (HoodieWriteHandle handle : handles.values()) { + statuses.addAll(handle.close()); } handles.clear(); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java index b435c68de5e2b..1754836c91c4a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java @@ -20,9 +20,10 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.utils.LazyIterableIterator; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.io.CreateHandleFactory; @@ -34,6 +35,7 @@ import java.util.Iterator; import java.util.List; +import java.util.Properties; import java.util.function.Function; /** @@ -82,10 +84,10 @@ public static class HoodieInsertValueGenResult { // It caches the exception seen while fetching insert value. public Option exception = Option.empty(); - public HoodieInsertValueGenResult(T record, Schema schema) { + public HoodieInsertValueGenResult(T record, Schema schema, Properties properties) { this.record = record; try { - this.insertValue = record.getData().getInsertValue(schema); + this.insertValue = ((HoodieRecordPayload) record.getData()).getInsertValue(schema, properties); } catch (Exception e) { this.exception = Option.of(e); } @@ -96,9 +98,14 @@ public HoodieInsertValueGenResult(T record, Schema schema) { * Transformer function to help transform a HoodieRecord. This transformer is used by BufferedIterator to offload some * expensive operations of transformation to the reader thread. 
*/ + static Function, HoodieInsertValueGenResult> getTransformFunction( + Schema schema, HoodieWriteConfig config) { + return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema, config.getProps()); + } + static Function, HoodieInsertValueGenResult> getTransformFunction( Schema schema) { - return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema); + return hoodieRecord -> new HoodieInsertValueGenResult(hoodieRecord, schema, CollectionUtils.emptyProps()); } @Override diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertSortMode.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertSortMode.java index d171b8cd7264c..6fd06545ae562 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertSortMode.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertSortMode.java @@ -22,7 +22,9 @@ * Bulk insert sort mode. */ public enum BulkInsertSortMode { - NONE, - GLOBAL_SORT, - PARTITION_SORT + NONE, + GLOBAL_SORT, + PARTITION_SORT, + PARTITION_PATH_REPARTITION, + PARTITION_PATH_REPARTITION_AND_SORT } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java index 6d04594cbab63..7ebd94748accf 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java @@ -21,12 +21,16 @@ import org.apache.hudi.ApiMaturityLevel; import org.apache.hudi.PublicAPIClass; import org.apache.hudi.PublicAPIMethod; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.table.HoodieTable; import java.io.Serializable; @@ -34,13 +38,11 @@ /** * Base class for different types of indexes to determine the mapping from uuid. * - * @param Sub type of HoodieRecordPayload - * @param Type of inputs - * @param Type of keys - * @param Type of outputs + * @param Type of inputs for deprecated APIs + * @param Type of outputs for deprecated APIs */ @PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) -public abstract class HoodieIndex implements Serializable { +public abstract class HoodieIndex implements Serializable { protected final HoodieWriteConfig config; @@ -52,18 +54,39 @@ protected HoodieIndex(HoodieWriteConfig config) { * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually * present). 
*/ - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) - public abstract I tagLocation(I records, HoodieEngineContext context, - HoodieTable hoodieTable) throws HoodieIndexException; + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + public I tagLocation(I records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + throw new HoodieNotSupportedException("Deprecated API should not be called"); + } /** * Extracts the location of written records, and updates the index. - *

- * TODO(vc): We may need to propagate the record as well in a WriteStatus class */ - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) - public abstract O updateLocation(O writeStatuses, HoodieEngineContext context, - HoodieTable hoodieTable) throws HoodieIndexException; + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + public O updateLocation(O writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + throw new HoodieNotSupportedException("Deprecated API should not be called"); + } + + /** + * Looks up the index and tags each incoming record with a location of a file that contains + * the row (if it is actually present). + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public abstract HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; + + /** + * Extracts the location of written records, and updates the index. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public abstract HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; /** * Rollback the effects of the commit made at instantTime. @@ -76,13 +99,13 @@ public abstract O updateLocation(O writeStatuses, HoodieEngineContext context, * implementation is able to obtain the same mapping, for two hoodie keys with same `recordKey` but different * `partitionPath` * - * @return whether or not, the index implementation is global in nature + * @return whether the index implementation is global in nature */ @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) public abstract boolean isGlobal(); /** - * This is used by storage to determine, if its safe to send inserts, straight to the log, i.e having a + * This is used by storage to determine, if it is safe to send inserts, straight to the log, i.e. having a * {@link FileSlice}, with no data file. * * @return Returns true/false depending on whether the impl has this capability @@ -98,12 +121,30 @@ public abstract O updateLocation(O writeStatuses, HoodieEngineContext context, public abstract boolean isImplicitWithStorage(); /** - * Each index type should implement it's own logic to release any resources acquired during the process. + * To indicate if an operation type requires location tagging before writing + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public boolean requiresTagging(WriteOperationType operationType) { + switch (operationType) { + case DELETE: + case UPSERT: + return true; + default: + return false; + } + } + + /** + * Each index type should implement its own logic to release any resources acquired during the process. 
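As a rough caller-side sketch (not part of this change) of how the refactored, HoodieData-based API above can be driven, using only types already imported in this file; the surrounding wiring is assumed:

// Skip the index lookup entirely for operations that do not need tagging,
// and tag upserts/deletes through the new engine-agnostic tagLocation(...) signature.
static <R> HoodieData<HoodieRecord<R>> maybeTag(HoodieIndex<?, ?> index,
                                                HoodieData<HoodieRecord<R>> records,
                                                WriteOperationType operationType,
                                                HoodieEngineContext context,
                                                HoodieTable table) {
  return index.requiresTagging(operationType)
      ? index.tagLocation(records, context, table)
      : records;
}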
*/ public void close() { } public enum IndexType { - HBASE, INMEMORY, BLOOM, GLOBAL_BLOOM, SIMPLE, GLOBAL_SIMPLE + HBASE, INMEMORY, BLOOM, GLOBAL_BLOOM, SIMPLE, GLOBAL_SIMPLE, BUCKET, FLINK_STATE + } + + public enum BucketIndexEngineType { + SIMPLE, CONSISTENT_HASHING } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java index ad7807b707dcb..e3c2651718fd9 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java @@ -18,17 +18,32 @@ package org.apache.hudi.index; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.table.HoodieTable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Set; +import java.util.TreeSet; import static java.util.stream.Collectors.toList; @@ -37,6 +52,48 @@ */ public class HoodieIndexUtils { + private static final Logger LOG = LogManager.getLogger(HoodieIndexUtils.class); + + /** + * Fetches Pair of partition path and {@link HoodieBaseFile}s for interested partitions. + * + * @param partition Partition of interest + * @param hoodieTable Instance of {@link HoodieTable} of interest + * @return the list of {@link HoodieBaseFile} + */ + public static List getLatestBaseFilesForPartition( + final String partition, + final HoodieTable hoodieTable) { + Option latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline() + .filterCompletedInstants().lastInstant(); + if (latestCommitTime.isPresent()) { + return hoodieTable.getBaseFileOnlyView() + .getLatestBaseFilesBeforeOrOn(partition, latestCommitTime.get().getTimestamp()) + .collect(toList()); + } + return Collections.emptyList(); + } + + /** + * Fetches Pair of partition path and {@link FileSlice}s for interested partitions. 
+ * + * @param partition Partition of interest + * @param hoodieTable Instance of {@link HoodieTable} of interest + * @return the list of {@link FileSlice} + */ + public static List getLatestFileSlicesForPartition( + final String partition, + final HoodieTable hoodieTable) { + Option latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline() + .filterCompletedInstants().lastInstant(); + if (latestCommitTime.isPresent()) { + return hoodieTable.getHoodieView() + .getLatestFileSlicesBeforeOrOn(partition, latestCommitTime.get().getTimestamp(), true) + .collect(toList()); + } + return Collections.emptyList(); + } + /** * Fetches Pair of partition path and {@link HoodieBaseFile}s for interested partitions. * @@ -48,17 +105,13 @@ public class HoodieIndexUtils { public static List> getLatestBaseFilesForAllPartitions(final List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { - context.setJobStatus(HoodieIndexUtils.class.getSimpleName(), "Load latest base files from all partitions"); + context.setJobStatus(HoodieIndexUtils.class.getSimpleName(), "Load latest base files from all partitions: " + hoodieTable.getConfig().getTableName()); return context.flatMap(partitions, partitionPath -> { - Option latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline() - .filterCompletedInstants().lastInstant(); - List> filteredFiles = new ArrayList<>(); - if (latestCommitTime.isPresent()) { - filteredFiles = hoodieTable.getBaseFileOnlyView() - .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp()) - .map(f -> Pair.of(partitionPath, f)) - .collect(toList()); - } + List> filteredFiles = + getLatestBaseFilesForPartition(partitionPath, hoodieTable).stream() + .map(baseFile -> Pair.of(partitionPath, baseFile)) + .collect(toList()); + return filteredFiles.stream(); }, Math.max(partitions.size(), 1)); } @@ -71,18 +124,47 @@ public static List> getLatestBaseFilesForAllPartiti * @return the tagged {@link HoodieRecord} */ public static HoodieRecord getTaggedRecord(HoodieRecord inputRecord, Option location) { - HoodieRecord record = inputRecord; + HoodieRecord record = inputRecord; if (location.isPresent()) { // When you have a record in multiple files in the same partition, then collection // will have 2 entries with the same exact in memory copy of the HoodieRecord and the 2 // separate filenames that the record is found in. This will result in setting // currentLocation 2 times and it will fail the second time. So creating a new in memory // copy of the hoodie record. - record = new HoodieRecord<>(inputRecord); + record = inputRecord.newInstance(); record.unseal(); record.setCurrentLocation(location.get()); record.seal(); } return record; } + + /** + * Given a list of row keys and one file, return only row keys existing in that file. 
+ * + * @param filePath - File to filter keys from + * @param candidateRecordKeys - Candidate keys to filter + * @return List of candidate keys that are available in the file + */ + public static List filterKeysFromFile(Path filePath, List candidateRecordKeys, + Configuration configuration) throws HoodieIndexException { + ValidationUtils.checkArgument(FSUtils.isBaseFile(filePath)); + List foundRecordKeys = new ArrayList<>(); + try (HoodieFileReader fileReader = HoodieFileReaderFactory.getFileReader(configuration, filePath)) { + // Load all rowKeys from the file, to double-confirm + if (!candidateRecordKeys.isEmpty()) { + HoodieTimer timer = new HoodieTimer().startTimer(); + Set fileRowKeys = fileReader.filterRowKeys(new TreeSet<>(candidateRecordKeys)); + foundRecordKeys.addAll(fileRowKeys); + LOG.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath, + timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size())); + if (LOG.isDebugEnabled()) { + LOG.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); + } + } + } catch (Exception e) { + throw new HoodieIndexException("Error checking candidate keys against file.", e); + } + return foundRecordKeys; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java new file mode 100644 index 0000000000000..9430d9bb5e50b --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; + +/** + * Helper for {@link HoodieBloomIndex} containing engine-specific logic. + */ +public abstract class BaseHoodieBloomIndexHelper implements Serializable { + /** + * Find out pair. + * + * @param config Write config. + * @param context {@link HoodieEngineContext} instance to use. + * @param hoodieTable {@link HoodieTable} instance to use. + * @param partitionRecordKeyPairs Pairs of partition path and record key. + * @param fileComparisonPairs Pairs of filename and record key based on file comparisons. 
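A small usage sketch of the filterKeysFromFile helper added above. The file path and candidate keys are invented for illustration; the path must point at a Hudi base file, otherwise the ValidationUtils check fails:

List<String> presentKeys = HoodieIndexUtils.filterKeysFromFile(
    new Path("/tmp/hudi_table/2021/01/01/abc123_0-1-2_20210101000000.parquet"),
    Arrays.asList("key-1", "key-2", "key-3"),
    new Configuration());
// presentKeys now holds only the candidates whose rows actually exist in that file.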
+ * @param partitionToFileInfo Partition path to {@link BloomIndexFileInfo} map. + * @param recordsPerPartition Number of records per partition in a map. + * @return {@link HoodiePairData} of {@link HoodieKey} and {@link HoodieRecordLocation} pairs. + */ + public abstract HoodiePairData findMatchingFilesForRecordKeys( + HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, + HoodiePairData partitionRecordKeyPairs, + HoodieData> fileComparisonPairs, + Map> partitionToFileInfo, + Map recordsPerPartition); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndexCheckFunction.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndexCheckFunction.java new file mode 100644 index 0000000000000..80031f4e8f025 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndexCheckFunction.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.client.utils.LazyIterableIterator; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.io.HoodieKeyLookupHandle; +import org.apache.hudi.io.HoodieKeyLookupResult; +import org.apache.hudi.table.HoodieTable; + +import java.util.function.Function; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Function performing actual checking of list containing (fileId, hoodieKeys) against the actual files. + */ +public class HoodieBaseBloomIndexCheckFunction + implements Function>, Iterator>> { + + private final HoodieTable hoodieTable; + + private final HoodieWriteConfig config; + + public HoodieBaseBloomIndexCheckFunction(HoodieTable hoodieTable, HoodieWriteConfig config) { + this.hoodieTable = hoodieTable; + this.config = config; + } + + @Override + public Iterator> apply(Iterator> filePartitionRecordKeyTripletItr) { + return new LazyKeyCheckIterator(filePartitionRecordKeyTripletItr); + } + + class LazyKeyCheckIterator extends LazyIterableIterator, List> { + + private HoodieKeyLookupHandle keyLookupHandle; + + LazyKeyCheckIterator(Iterator> filePartitionRecordKeyTripletItr) { + super(filePartitionRecordKeyTripletItr); + } + + @Override + protected void start() { + } + + @Override + protected List computeNext() { + List ret = new ArrayList<>(); + try { + // process one file in each go. 
+ while (inputItr.hasNext()) { + Pair currentTuple = inputItr.next(); + String fileId = currentTuple.getLeft(); + String partitionPath = currentTuple.getRight().getPartitionPath(); + String recordKey = currentTuple.getRight().getRecordKey(); + Pair partitionPathFilePair = Pair.of(partitionPath, fileId); + + // lazily init state + if (keyLookupHandle == null) { + keyLookupHandle = new HoodieKeyLookupHandle(config, hoodieTable, partitionPathFilePair); + } + + // if continue on current file + if (keyLookupHandle.getPartitionPathFileIDPair().equals(partitionPathFilePair)) { + keyLookupHandle.addKey(recordKey); + } else { + // do the actual checking of file & break out + ret.add(keyLookupHandle.getLookupResult()); + keyLookupHandle = new HoodieKeyLookupHandle(config, hoodieTable, partitionPathFilePair); + keyLookupHandle.addKey(recordKey); + break; + } + } + + // handle case, where we ran out of input, close pending work, update return val + if (!inputItr.hasNext()) { + ret.add(keyLookupHandle.getLookupResult()); + } + } catch (Throwable e) { + if (e instanceof HoodieException) { + throw e; + } + throw new HoodieIndexException("Error checking bloom filter index. ", e); + } + return ret; + } + + @Override + protected void end() { + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java new file mode 100644 index 0000000000000..1417e40a9f587 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.exception.MetadataNotFoundException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.HoodieRangeInfoHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static java.util.stream.Collectors.groupingBy; +import static java.util.stream.Collectors.mapping; +import static java.util.stream.Collectors.toList; +import static org.apache.hudi.common.util.CollectionUtils.isNullOrEmpty; +import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; +import static org.apache.hudi.metadata.HoodieMetadataPayload.unwrapStatisticValueWrapper; +import static org.apache.hudi.metadata.MetadataPartitionType.COLUMN_STATS; + +/** + * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata. + */ +public class HoodieBloomIndex extends HoodieIndex { + private static final Logger LOG = LogManager.getLogger(HoodieBloomIndex.class); + + private final BaseHoodieBloomIndexHelper bloomIndexHelper; + + public HoodieBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelper bloomIndexHelper) { + super(config); + this.bloomIndexHelper = bloomIndexHelper; + } + + @Override + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + // Step 0: cache the input records if needed + if (config.getBloomIndexUseCaching()) { + records.persist(new HoodieConfig(config.getProps()) + .getString(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE)); + } + + // Step 1: Extract out thinner pairs of (partitionPath, recordKey) + HoodiePairData partitionRecordKeyPairs = records.mapToPair( + record -> new ImmutablePair<>(record.getPartitionPath(), record.getRecordKey())); + + // Step 2: Lookup indexes for all the partition/recordkey pair + HoodiePairData keyFilenamePairs = + lookupIndex(partitionRecordKeyPairs, context, hoodieTable); + + // Cache the result, for subsequent stages. 
+ if (config.getBloomIndexUseCaching()) { + keyFilenamePairs.persist("MEMORY_AND_DISK_SER"); + } + if (LOG.isDebugEnabled()) { + long totalTaggedRecords = keyFilenamePairs.count(); + LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); + } + + // Step 3: Tag the incoming records, as inserts or updates, by joining with existing record keys + HoodieData> taggedRecords = tagLocationBacktoRecords(keyFilenamePairs, records); + + if (config.getBloomIndexUseCaching()) { + records.unpersist(); + keyFilenamePairs.unpersist(); + } + + return taggedRecords; + } + + /** + * Lookup the location for each record key and return the pair for all record keys already + * present and drop the record keys if not present. + */ + private HoodiePairData lookupIndex( + HoodiePairData partitionRecordKeyPairs, final HoodieEngineContext context, + final HoodieTable hoodieTable) { + // Step 1: Obtain records per partition, in the incoming records + Map recordsPerPartition = partitionRecordKeyPairs.countByKey(); + List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); + + // Step 2: Load all involved files as pairs + List> fileInfoList = getBloomIndexFileInfoForPartitions(context, hoodieTable, affectedPartitionPathList); + final Map> partitionToFileInfo = + fileInfoList.stream().collect(groupingBy(Pair::getLeft, mapping(Pair::getRight, toList()))); + + // Step 3: Obtain a HoodieData, for each incoming record, that already exists, with the file id, + // that contains it. + HoodieData> fileComparisonPairs = + explodeRecordsWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairs); + + return bloomIndexHelper.findMatchingFilesForRecordKeys(config, context, hoodieTable, + partitionRecordKeyPairs, fileComparisonPairs, partitionToFileInfo, recordsPerPartition); + } + + private List> getBloomIndexFileInfoForPartitions(HoodieEngineContext context, + HoodieTable hoodieTable, + List affectedPartitionPathList) { + List> fileInfoList = new ArrayList<>(); + + if (config.getBloomIndexPruneByRanges()) { + // load column ranges from metadata index if column stats index is enabled and column_stats metadata partition is available + if (config.getBloomIndexUseMetadata() + && hoodieTable.getMetaClient().getTableConfig().getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())) { + fileInfoList = loadColumnRangesFromMetaIndex(affectedPartitionPathList, context, hoodieTable); + } + // fallback to loading column ranges from files + if (isNullOrEmpty(fileInfoList)) { + fileInfoList = loadColumnRangesFromFiles(affectedPartitionPathList, context, hoodieTable); + } + } else { + fileInfoList = getFileInfoForLatestBaseFiles(affectedPartitionPathList, context, hoodieTable); + } + + return fileInfoList; + } + + /** + * Load all involved files as pair List. + */ + List> loadColumnRangesFromFiles( + List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + // Obtain the latest data files from all the partitions. 
+ List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() + .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) + .collect(toList()); + + context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on): " + config.getTableName()); + return context.map(partitionPathFileIDList, pf -> { + try { + HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); + String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); + } catch (MetadataNotFoundException me) { + LOG.warn("Unable to find range metadata in file :" + pf); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); + } + }, Math.max(partitionPathFileIDList.size(), 1)); + } + + /** + * Get BloomIndexFileInfo for all the latest base files for the requested partitions. + * + * @param partitions - List of partitions to get the base files for + * @param context - Engine context + * @param hoodieTable - Hoodie Table + * @return List of partition and file column range info pairs + */ + private List> getFileInfoForLatestBaseFiles( + List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, + hoodieTable).stream() + .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) + .collect(toList()); + return partitionPathFileIDList.stream() + .map(pf -> Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); + } + + /** + * Load the column stats index as BloomIndexFileInfo for all the involved files in the partition. + * + * @param partitions - List of partitions for which column stats need to be loaded + * @param context - Engine context + * @param hoodieTable - Hoodie table + * @return List of partition and file column range info pairs + */ + protected List> loadColumnRangesFromMetaIndex( + List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + // also obtain file ranges, if range pruning is enabled + context.setJobStatus(this.getClass().getName(), "Load meta index key ranges for file slices: " + config.getTableName()); + + final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp(); + return context.flatMap(partitions, partitionName -> { + // Partition and file name pairs + List> partitionFileNameList = HoodieIndexUtils.getLatestBaseFilesForPartition(partitionName, + hoodieTable).stream().map(baseFile -> Pair.of(partitionName, baseFile.getFileName())) + .sorted() + .collect(toList()); + if (partitionFileNameList.isEmpty()) { + return Stream.empty(); + } + try { + Map, HoodieMetadataColumnStats> fileToColumnStatsMap = + hoodieTable.getMetadataTable().getColumnStats(partitionFileNameList, keyField); + List> result = new ArrayList<>(); + for (Map.Entry, HoodieMetadataColumnStats> entry : fileToColumnStatsMap.entrySet()) { + result.add(Pair.of(entry.getKey().getLeft(), + new BloomIndexFileInfo( + FSUtils.getFileId(entry.getKey().getRight()), + // NOTE: Here we assume that the type of the primary key field is string + (String) unwrapStatisticValueWrapper(entry.getValue().getMinValue()), + (String) unwrapStatisticValueWrapper(entry.getValue().getMaxValue()) + ))); + } + return result.stream(); + } catch (MetadataNotFoundException me) { + throw new HoodieMetadataException("Unable to find column range 
metadata for partition:" + partitionName, me); + } + }, Math.max(partitions.size(), 1)); + } + + @Override + public boolean rollbackCommit(String instantTime) { + // Nope, don't need to do anything. + return true; + } + + /** + * This is not global, since we depend on the partitionPath to do the lookup. + */ + @Override + public boolean isGlobal() { + return false; + } + + /** + * No indexes into log files yet. + */ + @Override + public boolean canIndexLogFiles() { + return false; + } + + /** + * Bloom filters are stored, into the same data files. + */ + @Override + public boolean isImplicitWithStorage() { + return true; + } + + /** + * For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be + * checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files + * to be compared gets cut down a lot from range pruning. + *

+ * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on + * recordKey ranges in the index info. + */ + HoodieData> explodeRecordsWithFileComparisons( + final Map> partitionToFileIndexInfo, + HoodiePairData partitionRecordKeyPairs) { + IndexFileFilter indexFileFilter = + config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo) + : new ListBasedIndexFileFilter(partitionToFileIndexInfo); + + return partitionRecordKeyPairs.map(partitionRecordKeyPair -> { + String recordKey = partitionRecordKeyPair.getRight(); + String partitionPath = partitionRecordKeyPair.getLeft(); + + return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() + .map(partitionFileIdPair -> (Pair) new ImmutablePair<>(partitionFileIdPair.getRight(), + new HoodieKey(recordKey, partitionPath))) + .collect(Collectors.toList()); + }).flatMap(List::iterator); + } + + /** + * Tag the back to the original HoodieRecord List. + */ + protected HoodieData> tagLocationBacktoRecords( + HoodiePairData keyFilenamePair, + HoodieData> records) { + HoodiePairData> keyRecordPairs = + records.mapToPair(record -> new ImmutablePair<>(record.getKey(), record)); + // Here as the records might have more data than keyFilenamePairs (some row keys' fileId is null), + // so we do left outer join. + return keyRecordPairs.leftOuterJoin(keyFilenamePair).values() + .map(v -> HoodieIndexUtils.getTaggedRecord(v.getLeft(), Option.ofNullable(v.getRight().orElse(null)))); + } + + @Override + public HoodieData updateLocation( + HoodieData writeStatusData, HoodieEngineContext context, + HoodieTable hoodieTable) { + return writeStatusData; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java new file mode 100644 index 0000000000000..5f2007ea53668 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
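To make the range pruning in explodeRecordsWithFileComparisons above concrete, a small sketch; the key ranges and file ids are invented, and it assumes package-level access to the filter classes (i.e. it would live in org.apache.hudi.index.bloom):

Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = Collections.singletonMap(
    "2021/01/01", Arrays.asList(
        new BloomIndexFileInfo("file-1", "key-a", "key-f"),
        new BloomIndexFileInfo("file-2", "key-g", "key-m")));
IndexFileFilter indexFileFilter = new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo);
// "key-h" only falls inside file-2's [key-g, key-m] range, so exactly one
// (partition, fileId) comparison pair is produced for it instead of one per file.
indexFileFilter.getMatchingFilesAndPartition("2021/01/01", "key-h")
    .forEach(partitionFileIdPair -> System.out.println(partitionFileIdPair));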
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.table.HoodieTable; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * This filter will only work with hoodie table since it will only load partitions + * with .hoodie_partition_metadata file in it. + */ +public class HoodieGlobalBloomIndex extends HoodieBloomIndex { + public HoodieGlobalBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelper bloomIndexHelper) { + super(config, bloomIndexHelper); + } + + /** + * Load all involved files as pairs from all partitions in the table. + */ + @Override + List> loadColumnRangesFromFiles(List partitions, final HoodieEngineContext context, + final HoodieTable hoodieTable) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + List allPartitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); + return super.loadColumnRangesFromFiles(allPartitionPaths, context, hoodieTable); + } + + /** + * For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be + * checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files + * to be compared gets cut down a lot from range pruning. + *

+ * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on + * recordKey ranges in the index info. the partition path of the incoming record (partitionRecordKeyPairs._2()) will + * be ignored since the search scope should be bigger than that + */ + + @Override + HoodieData> explodeRecordsWithFileComparisons( + final Map> partitionToFileIndexInfo, + HoodiePairData partitionRecordKeyPairs) { + + IndexFileFilter indexFileFilter = + config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo) + : new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo); + + return partitionRecordKeyPairs.map(partitionRecordKeyPair -> { + String recordKey = partitionRecordKeyPair.getRight(); + String partitionPath = partitionRecordKeyPair.getLeft(); + + return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() + .map(partitionFileIdPair -> (Pair) new ImmutablePair<>(partitionFileIdPair.getRight(), + new HoodieKey(recordKey, partitionFileIdPair.getLeft()))) + .collect(Collectors.toList()); + }).flatMap(List::iterator); + } + + /** + * Tagging for global index should only consider the record key. + */ + @Override + protected HoodieData> tagLocationBacktoRecords( + HoodiePairData keyLocationPairs, + HoodieData> records) { + + HoodiePairData> incomingRowKeyRecordPairs = + records.mapToPair(record -> new ImmutablePair<>(record.getRecordKey(), record)); + + HoodiePairData> existingRecordKeyToRecordLocationHoodieKeyMap = + keyLocationPairs.mapToPair(p -> new ImmutablePair<>( + p.getKey().getRecordKey(), new ImmutablePair<>(p.getValue(), p.getKey()))); + + // Here as the records might have more data than rowKeys (some rowKeys' fileId is null), so we do left outer join. + return incomingRowKeyRecordPairs.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> { + final HoodieRecord hoodieRecord = record.getLeft(); + final Option> recordLocationHoodieKeyPair = record.getRight(); + if (recordLocationHoodieKeyPair.isPresent()) { + // Record key matched to file + if (config.getBloomIndexUpdatePartitionPath() + && !recordLocationHoodieKeyPair.get().getRight().getPartitionPath().equals(hoodieRecord.getPartitionPath())) { + // Create an empty record to delete the record in the old partition + HoodieRecord deleteRecord = new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), + new EmptyHoodieRecordPayload()); + deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get().getLeft()); + deleteRecord.seal(); + // Tag the incoming record for inserting to the new partition + HoodieRecord insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty()); + return Arrays.asList(deleteRecord, insertRecord).iterator(); + } else { + // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not. + // When it differs, the record will still be updated at its old partition. 
+ return Collections.singletonList( + (HoodieRecord) HoodieIndexUtils.getTaggedRecord(new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), (HoodieRecordPayload) hoodieRecord.getData()), + Option.ofNullable(recordLocationHoodieKeyPair.get().getLeft()))).iterator(); + } + } else { + return Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator(); + } + }); + } + + @Override + public boolean isGlobal() { + return true; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java index 50d31f9cc50d2..18b094890081b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/IntervalTreeBasedGlobalIndexFileFilter.java @@ -46,8 +46,8 @@ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter { IntervalTreeBasedGlobalIndexFileFilter(final Map> partitionToFileIndexInfo) { List allIndexFiles = new ArrayList<>(); - partitionToFileIndexInfo.forEach((parition, bloomIndexFileInfoList) -> bloomIndexFileInfoList.forEach(file -> { - fileIdToPartitionPathMap.put(file.getFileId(), parition); + partitionToFileIndexInfo.forEach((partition, bloomIndexFileInfoList) -> bloomIndexFileInfoList.forEach(file -> { + fileIdToPartitionPathMap.put(file.getFileId(), partition); allIndexFiles.add(file); })); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java new file mode 100644 index 0000000000000..cffee5ee74081 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
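A hedged configuration sketch for the partition-path update behavior handled above: when a record key reappears under a different partition and the option is enabled, HoodieGlobalBloomIndex emits a delete for the old partition plus a tagged insert for the new one. Both option keys below are assumptions for illustration and are not defined in this diff:

Map<String, String> writeOptions = new HashMap<>();
writeOptions.put("hoodie.index.type", "GLOBAL_BLOOM");
// Assumed key; it would back config.getBloomIndexUpdatePartitionPath() in the branch above.
writeOptions.put("hoodie.bloom.index.update.partition.path", "true");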
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieKeyLookupResult; +import org.apache.hudi.table.HoodieTable; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import static java.util.stream.Collectors.toList; + +/** + * Helper for {@link HoodieBloomIndex} containing Java {@link List}-based logic. + */ +public class ListBasedHoodieBloomIndexHelper extends BaseHoodieBloomIndexHelper { + + private static final ListBasedHoodieBloomIndexHelper SINGLETON_INSTANCE = new ListBasedHoodieBloomIndexHelper(); + + protected ListBasedHoodieBloomIndexHelper() { + } + + public static ListBasedHoodieBloomIndexHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodiePairData findMatchingFilesForRecordKeys( + HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, + HoodiePairData partitionRecordKeyPairs, + HoodieData> fileComparisonPairs, + Map> partitionToFileInfo, Map recordsPerPartition) { + List> fileComparisonPairList = + fileComparisonPairs.collectAsList().stream() + .sorted(Comparator.comparing(Pair::getLeft)).collect(toList()); + + List keyLookupResults = new ArrayList<>(); + Iterator> iterator = new HoodieBaseBloomIndexCheckFunction( + hoodieTable, config).apply(fileComparisonPairList.iterator()); + while (iterator.hasNext()) { + keyLookupResults.addAll(iterator.next()); + } + + keyLookupResults = keyLookupResults.stream().filter( + lr -> lr.getMatchingRecordKeys().size() > 0).collect(toList()); + return context.parallelize(keyLookupResults).flatMap(lookupResult -> + lookupResult.getMatchingRecordKeys().stream() + .map(recordKey -> new ImmutablePair<>(lookupResult, recordKey)).iterator() + ).mapToPair(pair -> { + HoodieKeyLookupResult lookupResult = pair.getLeft(); + String recordKey = pair.getRight(); + return new ImmutablePair<>( + new HoodieKey(recordKey, lookupResult.getPartitionPath()), + new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())); + }); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIdentifier.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIdentifier.java new file mode 100644 index 0000000000000..48ccce1d1740c --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIdentifier.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +public class BucketIdentifier implements Serializable { + // Compatible with the spark bucket name + private static final Pattern BUCKET_NAME = Pattern.compile(".*_(\\d+)(?:\\..*)?$"); + + public static int getBucketId(HoodieRecord record, String indexKeyFields, int numBuckets) { + return getBucketId(record.getKey(), indexKeyFields, numBuckets); + } + + public static int getBucketId(HoodieKey hoodieKey, String indexKeyFields, int numBuckets) { + return (getHashKeys(hoodieKey, indexKeyFields).hashCode() & Integer.MAX_VALUE) % numBuckets; + } + + public static int getBucketId(HoodieKey hoodieKey, List indexKeyFields, int numBuckets) { + return (getHashKeys(hoodieKey.getRecordKey(), indexKeyFields).hashCode() & Integer.MAX_VALUE) % numBuckets; + } + + public static int getBucketId(String recordKey, String indexKeyFields, int numBuckets) { + return getBucketId(getHashKeys(recordKey, indexKeyFields), numBuckets); + } + + public static int getBucketId(List hashKeyFields, int numBuckets) { + return (hashKeyFields.hashCode() & Integer.MAX_VALUE) % numBuckets; + } + + public static List getHashKeys(HoodieKey hoodieKey, String indexKeyFields) { + return getHashKeys(hoodieKey.getRecordKey(), indexKeyFields); + } + + protected static List getHashKeys(String recordKey, String indexKeyFields) { + return !recordKey.contains(":") ? Collections.singletonList(recordKey) : + getHashKeysUsingIndexFields(recordKey, Arrays.asList(indexKeyFields.split(","))); + } + + protected static List getHashKeys(String recordKey, List indexKeyFields) { + return !recordKey.contains(":") ? 
Collections.singletonList(recordKey) : + getHashKeysUsingIndexFields(recordKey, indexKeyFields); + } + + private static List getHashKeysUsingIndexFields(String recordKey, List indexKeyFields) { + Map recordKeyPairs = Arrays.stream(recordKey.split(",")) + .map(p -> p.split(":")) + .collect(Collectors.toMap(p -> p[0], p -> p[1])); + return indexKeyFields.stream() + .map(recordKeyPairs::get).collect(Collectors.toList()); + } + + public static String partitionBucketIdStr(String partition, int bucketId) { + return String.format("%s_%s", partition, bucketIdStr(bucketId)); + } + + public static int bucketIdFromFileId(String fileId) { + return Integer.parseInt(fileId.substring(0, 8)); + } + + public static String bucketIdStr(int n) { + return String.format("%08d", n); + } + + public static String newBucketFileIdPrefix(int bucketId) { + return newBucketFileIdPrefix(bucketIdStr(bucketId)); + } + + public static String newBucketFileIdPrefix(String bucketId) { + return FSUtils.createNewFileIdPfx().replaceFirst(".{8}", bucketId); + } + + public static boolean isBucketFileName(String name) { + return BUCKET_NAME.matcher(name).matches(); + } + + public static int mod(int x, int y) { + return x % y; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIndexLocationMapper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIndexLocationMapper.java new file mode 100644 index 0000000000000..4955087333a25 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIndexLocationMapper.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.Option; + +import java.io.Serializable; + +public interface BucketIndexLocationMapper extends Serializable { + + /** + * Get record location given hoodie key and partition path + */ + Option getRecordLocation(HoodieKey key, String partitionPath); + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/ConsistentBucketIdentifier.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/ConsistentBucketIdentifier.java new file mode 100644 index 0000000000000..c44a8a6ccfb0c --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/ConsistentBucketIdentifier.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.ConsistentHashingNode; +import org.apache.hudi.common.model.HoodieConsistentHashingMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.util.hash.HashID; + +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +public class ConsistentBucketIdentifier extends BucketIdentifier { + + /** + * Hashing metadata of a partition + */ + private final HoodieConsistentHashingMetadata metadata; + /** + * In-memory structure to speed up ring mapping (hashing value -> hashing node) + */ + private final TreeMap ring; + /** + * Mapping from fileId -> hashing node + */ + private final Map fileIdToBucket; + + public ConsistentBucketIdentifier(HoodieConsistentHashingMetadata metadata) { + this.metadata = metadata; + this.fileIdToBucket = new HashMap<>(); + this.ring = new TreeMap<>(); + initialize(); + } + + public Collection getNodes() { + return ring.values(); + } + + public HoodieConsistentHashingMetadata getMetadata() { + return metadata; + } + + public int getNumBuckets() { + return ring.size(); + } + + /** + * Get bucket of the given file group + * + * @param fileId the file group id. NOTE: not filePfx (i.e., uuid) + */ + public ConsistentHashingNode getBucketByFileId(String fileId) { + return fileIdToBucket.get(fileId); + } + + public ConsistentHashingNode getBucket(HoodieKey hoodieKey, List indexKeyFields) { + return getBucket(getHashKeys(hoodieKey.getRecordKey(), indexKeyFields)); + } + + protected ConsistentHashingNode getBucket(List hashKeys) { + int hashValue = HashID.getXXHash32(String.join("", hashKeys), 0); + return getBucket(hashValue & HoodieConsistentHashingMetadata.HASH_VALUE_MASK); + } + + protected ConsistentHashingNode getBucket(int hashValue) { + SortedMap tailMap = ring.tailMap(hashValue); + return tailMap.isEmpty() ? ring.firstEntry().getValue() : tailMap.get(tailMap.firstKey()); + } + + /** + * Initialize necessary data structure to facilitate bucket identifying. + * Specifically, we construct: + * - An in-memory tree (ring) to speed up range mapping searching. + * - A hash table (fileIdToBucket) to allow lookup of bucket using fileId. 
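The ring lookup in `getBucket` above is classic consistent hashing over a `TreeMap`: take the tail map at the key's hash and pick the first node, wrapping around to the start of the ring when the tail is empty. The standalone sketch below reproduces just that lookup with illustrative node values; the real class first hashes the joined key fields with XXHash32 and masks the result before the lookup.

```java
import java.util.SortedMap;
import java.util.TreeMap;

// Standalone illustration of the ring lookup used by ConsistentBucketIdentifier.
final class RingLookupSketch {
  private final TreeMap<Integer, String> ring = new TreeMap<>();

  void addNode(int hashValue, String nodeId) {
    ring.put(hashValue, nodeId);
  }

  String nodeFor(int keyHash) {
    // First node whose position is >= the key's hash owns the key; otherwise wrap around.
    SortedMap<Integer, String> tail = ring.tailMap(keyHash);
    return tail.isEmpty() ? ring.firstEntry().getValue() : tail.get(tail.firstKey());
  }

  public static void main(String[] args) {
    RingLookupSketch sketch = new RingLookupSketch();
    sketch.addNode(100, "bucket-a");
    sketch.addNode(200, "bucket-b");
    System.out.println(sketch.nodeFor(150)); // bucket-b (next node clockwise)
    System.out.println(sketch.nodeFor(250)); // bucket-a (wraps around)
  }
}
```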
+ */ + private void initialize() { + for (ConsistentHashingNode p : metadata.getNodes()) { + ring.put(p.getValue(), p); + // One bucket has only one file group, so append 0 directly + fileIdToBucket.put(FSUtils.createNewFileId(p.getFileIdPrefix(), 0), p); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieBucketIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieBucketIndex.java new file mode 100644 index 0000000000000..cbb3b07f4457f --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieBucketIndex.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.utils.LazyIterableIterator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Arrays; +import java.util.List; + +/** + * Hash indexing mechanism. + */ +public abstract class HoodieBucketIndex extends HoodieIndex { + + private static final Logger LOG = LogManager.getLogger(HoodieBucketIndex.class); + + protected final int numBuckets; + protected final List indexKeyFields; + + public HoodieBucketIndex(HoodieWriteConfig config) { + super(config); + + this.numBuckets = config.getBucketIndexNumBuckets(); + this.indexKeyFields = Arrays.asList(config.getBucketIndexHashField().split(",")); + LOG.info("Use bucket index, numBuckets = " + numBuckets + ", indexFields: " + indexKeyFields); + } + + @Override + public HoodieData updateLocation(HoodieData writeStatuses, + HoodieEngineContext context, + HoodieTable hoodieTable) + throws HoodieIndexException { + return writeStatuses; + } + + @Override + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) + throws HoodieIndexException { + // Initialize necessary information before tagging. 
e.g., hashing metadata + List partitions = records.map(HoodieRecord::getPartitionPath).distinct().collectAsList(); + LOG.info("Initializing hashing metadata for partitions: " + partitions); + BucketIndexLocationMapper mapper = getLocationMapper(hoodieTable, partitions); + + return records.mapPartitions(iterator -> + new LazyIterableIterator, HoodieRecord>(iterator) { + @Override + protected HoodieRecord computeNext() { + // TODO maybe batch the operation to improve performance + HoodieRecord record = inputItr.next(); + Option loc = mapper.getRecordLocation(record.getKey(), record.getPartitionPath()); + return HoodieIndexUtils.getTaggedRecord(record, loc); + } + }, + false + ); + } + + @Override + public boolean requiresTagging(WriteOperationType operationType) { + switch (operationType) { + case INSERT: + case INSERT_OVERWRITE: + case UPSERT: + case DELETE: + return true; + default: + return false; + } + } + + @Override + public boolean rollbackCommit(String instantTime) { + return true; + } + + @Override + public boolean isGlobal() { + return false; + } + + @Override + public boolean canIndexLogFiles() { + return true; + } + + @Override + public boolean isImplicitWithStorage() { + return true; + } + + public int getNumBuckets() { + return numBuckets; + } + + /** + * Get a location mapper for the given table & partitionPath + */ + protected abstract BucketIndexLocationMapper getLocationMapper(HoodieTable table, List partitionPath); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieSimpleBucketIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieSimpleBucketIndex.java new file mode 100644 index 0000000000000..8dc580998a00c --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieSimpleBucketIndex.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Simple bucket index implementation, with fixed bucket number. 
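For the fixed-bucket variant that follows, the mapping from record key to file group is purely arithmetic, as defined in `BucketIdentifier` earlier in this diff: the hash of the configured key fields modulo the bucket count gives the bucket id, the bucket id is zero-padded to eight digits to form the file id prefix, and the bucket id can be read back from the first eight characters of any file id. The sketch below replays that round trip; the key and bucket count are illustrative.

```java
import java.util.Arrays;
import java.util.List;

// How a record key maps to a fixed bucket and how that bucket is encoded in the file id.
final class SimpleBucketSketch {
  static int bucketId(List<String> hashKeys, int numBuckets) {
    return (hashKeys.hashCode() & Integer.MAX_VALUE) % numBuckets;
  }

  static String bucketIdStr(int bucketId) {
    return String.format("%08d", bucketId); // becomes the first 8 chars of the file id
  }

  static int bucketIdFromFileId(String fileId) {
    return Integer.parseInt(fileId.substring(0, 8));
  }

  public static void main(String[] args) {
    int id = bucketId(Arrays.asList("uuid-42"), 16);
    String fileId = bucketIdStr(id) + "-some-uuid"; // e.g. "00000007-some-uuid"
    System.out.println(id + " -> " + fileId + " -> " + bucketIdFromFileId(fileId));
  }
}
```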
+ */ +public class HoodieSimpleBucketIndex extends HoodieBucketIndex { + + private static final Logger LOG = LogManager.getLogger(HoodieSimpleBucketIndex.class); + + public HoodieSimpleBucketIndex(HoodieWriteConfig config) { + super(config); + } + + private Map loadPartitionBucketIdFileIdMapping( + HoodieTable hoodieTable, + String partition) { + // bucketId -> fileIds + Map bucketIdToFileIdMapping = new HashMap<>(); + hoodieTable.getMetaClient().reloadActiveTimeline(); + HoodieIndexUtils + .getLatestFileSlicesForPartition(partition, hoodieTable) + .forEach(fileSlice -> { + String fileId = fileSlice.getFileId(); + String commitTime = fileSlice.getBaseInstantTime(); + + int bucketId = BucketIdentifier.bucketIdFromFileId(fileId); + if (!bucketIdToFileIdMapping.containsKey(bucketId)) { + bucketIdToFileIdMapping.put(bucketId, new HoodieRecordLocation(commitTime, fileId)); + } else { + // Check if bucket data is valid + throw new HoodieIOException("Find multiple files at partition path=" + + partition + " belongs to the same bucket id = " + bucketId); + } + }); + return bucketIdToFileIdMapping; + } + + @Override + public boolean canIndexLogFiles() { + return false; + } + + @Override + protected BucketIndexLocationMapper getLocationMapper(HoodieTable table, List partitionPath) { + return new SimpleBucketIndexLocationMapper(table, partitionPath); + } + + public class SimpleBucketIndexLocationMapper implements BucketIndexLocationMapper { + + /** + * Mapping from partitionPath -> bucketId -> fileInfo + */ + private final Map> partitionPathFileIDList; + + public SimpleBucketIndexLocationMapper(HoodieTable table, List partitions) { + partitionPathFileIDList = partitions.stream().collect(Collectors.toMap(p -> p, p -> loadPartitionBucketIdFileIdMapping(table, p))); + } + + @Override + public Option getRecordLocation(HoodieKey key, String partitionPath) { + int bucketId = BucketIdentifier.getBucketId(key, indexKeyFields, numBuckets); + Map bucketIdToFileIdMapping = partitionPathFileIDList.get(partitionPath); + return Option.ofNullable(bucketIdToFileIdMapping.getOrDefault(bucketId, null)); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java new file mode 100644 index 0000000000000..42dcc1b97d760 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.index.inmemory; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieTable; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +/** + * Hoodie Index implementation backed by an in-memory Hash map. + *

+ * ONLY USE FOR LOCAL TESTING + */ +public class HoodieInMemoryHashIndex + extends HoodieIndex { + + private static ConcurrentMap recordLocationMap; + + public HoodieInMemoryHashIndex(HoodieWriteConfig config) { + super(config); + synchronized (HoodieInMemoryHashIndex.class) { + if (recordLocationMap == null) { + recordLocationMap = new ConcurrentHashMap<>(); + } + } + } + + @Override + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return records.mapPartitions(hoodieRecordIterator -> { + List> taggedRecords = new ArrayList<>(); + while (hoodieRecordIterator.hasNext()) { + HoodieRecord record = hoodieRecordIterator.next(); + if (recordLocationMap.containsKey(record.getKey())) { + record.unseal(); + record.setCurrentLocation(recordLocationMap.get(record.getKey())); + record.seal(); + } + taggedRecords.add(record); + } + return taggedRecords.iterator(); + }, true); + } + + @Override + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) { + return writeStatuses.map(writeStatus -> { + for (HoodieRecord record : writeStatus.getWrittenRecords()) { + if (!writeStatus.isErrored(record.getKey())) { + HoodieKey key = record.getKey(); + Option newLocation = record.getNewLocation(); + if (newLocation.isPresent()) { + recordLocationMap.put(key, newLocation.get()); + } else { + // Delete existing index for a deleted record + recordLocationMap.remove(key); + } + } + } + return writeStatus; + }); + } + + @Override + public boolean rollbackCommit(String instantTime) { + return true; + } + + /** + * Only looks up by recordKey. + */ + @Override + public boolean isGlobal() { + return true; + } + + /** + * Mapping is available in HBase already. + */ + @Override + public boolean canIndexLogFiles() { + return true; + } + + /** + * Index needs to be explicitly updated after storage write. + */ + @Override + public boolean isImplicitWithStorage() { + return false; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java new file mode 100644 index 0000000000000..805ae462a1128 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.index.simple; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.table.HoodieTable; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; + +/** + * A global simple index which reads interested fields(record key and partition path) from base files and + * joins with incoming records to find the tagged location. + */ +public class HoodieGlobalSimpleIndex extends HoodieSimpleIndex { + public HoodieGlobalSimpleIndex(HoodieWriteConfig config, Option keyGeneratorOpt) { + super(config, keyGeneratorOpt); + } + + @Override + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return tagLocationInternal(records, context, hoodieTable); + } + + /** + * Tags records location for incoming records. + * + * @param inputRecords {@link HoodieData} of incoming records + * @param context instance of {@link HoodieEngineContext} to use + * @param hoodieTable instance of {@link HoodieTable} to use + * @return {@link HoodieData} of records with record locations set + */ + @Override + protected HoodieData> tagLocationInternal( + HoodieData> inputRecords, HoodieEngineContext context, + HoodieTable hoodieTable) { + + HoodiePairData> keyedInputRecords = + inputRecords.mapToPair(entry -> new ImmutablePair<>(entry.getRecordKey(), entry)); + HoodiePairData allRecordLocationsInTable = + fetchAllRecordLocations(context, hoodieTable, config.getGlobalSimpleIndexParallelism()); + return getTaggedRecords(keyedInputRecords, allRecordLocationsInTable); + } + + /** + * Fetch record locations for passed in {@link HoodieKey}s. + * + * @param context instance of {@link HoodieEngineContext} to use + * @param hoodieTable instance of {@link HoodieTable} of interest + * @param parallelism parallelism to use + * @return {@link HoodiePairData} of {@link HoodieKey} and {@link HoodieRecordLocation} + */ + protected HoodiePairData fetchAllRecordLocations( + HoodieEngineContext context, HoodieTable hoodieTable, int parallelism) { + List> latestBaseFiles = getAllBaseFilesInTable(context, hoodieTable); + return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); + } + + /** + * Load all files for all partitions as pair data. 
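Because this index is global, the join that follows keys existing locations by record key alone, so a match found in any partition of the table counts. A plain-Java model of that left outer join is sketched below; the map contents and names are illustrative, and the real implementation performs the join over `HoodiePairData` built from every partition's latest base files.

```java
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

// Plain-Java model of the record-key left outer join performed by the global simple index.
final class GlobalKeyJoinSketch {
  public static void main(String[] args) {
    // record key -> existing location ("partitionPath:fileId"), built from all base files
    Map<String, String> existing = new HashMap<>();
    existing.put("id-1", "2021/01/01:file-a");

    List<String> incomingKeys = Arrays.asList("id-1", "id-2");
    for (String key : incomingKeys) {
      Optional<String> location = Optional.ofNullable(existing.get(key));
      // id-1 is tagged as an update against its old partition, id-2 stays an untagged insert
      System.out.println(key + " -> " + location.orElse("untagged insert"));
    }
  }
}
```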
+ */ + protected List> getAllBaseFilesInTable( + final HoodieEngineContext context, final HoodieTable hoodieTable) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + List allPartitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); + // Obtain the latest data files from all the partitions. + return getLatestBaseFilesForAllPartitions(allPartitionPaths, context, hoodieTable); + } + + /** + * Tag records with right {@link HoodieRecordLocation}. + * + * @param incomingRecords incoming {@link HoodieRecord}s + * @param existingRecords existing records with {@link HoodieRecordLocation}s + * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s + */ + private HoodieData> getTaggedRecords( + HoodiePairData> incomingRecords, + HoodiePairData existingRecords) { + HoodiePairData> existingRecordByRecordKey = + existingRecords.mapToPair( + entry -> new ImmutablePair<>(entry.getLeft().getRecordKey(), + Pair.of(entry.getLeft().getPartitionPath(), entry.getRight()))); + + return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values() + .flatMap(entry -> { + HoodieRecord inputRecord = entry.getLeft(); + Option> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null)); + List> taggedRecords; + + if (partitionPathLocationPair.isPresent()) { + String partitionPath = partitionPathLocationPair.get().getKey(); + HoodieRecordLocation location = partitionPathLocationPair.get().getRight(); + if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) { + // Create an empty record to delete the record in the old partition + HoodieRecord deleteRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload()); + deleteRecord.setCurrentLocation(location); + deleteRecord.seal(); + // Tag the incoming record for inserting to the new partition + HoodieRecord insertRecord = (HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()); + taggedRecords = Arrays.asList(deleteRecord, insertRecord); + } else { + // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not. + // When it differs, the record will still be updated at its old partition. + HoodieRecord newRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), (HoodieRecordPayload) inputRecord.getData()); + taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location))); + } + } else { + taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty())); + } + return taggedRecords.iterator(); + }); + } + + @Override + public boolean isGlobal() { + return true; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java new file mode 100644 index 0000000000000..95823ff51e35d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.index.simple; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.HoodieKeyLocationFetchHandle; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.table.HoodieTable; + +import java.util.List; + +import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; + +/** + * A simple index which reads interested fields(record key and partition path) from base files and + * joins with incoming records to find the tagged location. + */ +public class HoodieSimpleIndex + extends HoodieIndex { + + private final Option keyGeneratorOpt; + + public HoodieSimpleIndex(HoodieWriteConfig config, Option keyGeneratorOpt) { + super(config); + this.keyGeneratorOpt = keyGeneratorOpt; + } + + @Override + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) { + return writeStatuses; + } + + @Override + public boolean rollbackCommit(String commitTime) { + return true; + } + + @Override + public boolean isGlobal() { + return false; + } + + @Override + public boolean canIndexLogFiles() { + return false; + } + + @Override + public boolean isImplicitWithStorage() { + return true; + } + + @Override + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return tagLocationInternal(records, context, hoodieTable); + } + + /** + * Tags records location for incoming records. 
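Before the tagging internals that follow, it may help to see how a writer would opt into this index. The sketch below uses the standard `HoodieWriteConfig` / `HoodieIndexConfig` builders and the `SIMPLE` index type (or `GLOBAL_SIMPLE` for the global variant); builder method names should be verified against the Hudi version in use, and the base path is a placeholder.

```java
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;

// Hedged sketch: selecting the simple index through the write config builders.
final class SimpleIndexConfigSketch {
  static HoodieWriteConfig simpleIndexConfig(String basePath) {
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.SIMPLE) // or IndexType.GLOBAL_SIMPLE
            .build())
        .build();
  }
}
```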
+ * + * @param inputRecords {@link HoodieData} of incoming records + * @param context instance of {@link HoodieEngineContext} to use + * @param hoodieTable instance of {@link HoodieTable} to use + * @return {@link HoodieData} of records with record locations set + */ + protected HoodieData> tagLocationInternal( + HoodieData> inputRecords, HoodieEngineContext context, + HoodieTable hoodieTable) { + if (config.getSimpleIndexUseCaching()) { + inputRecords.persist(new HoodieConfig(config.getProps()) + .getString(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE)); + } + + HoodiePairData> keyedInputRecords = + inputRecords.mapToPair(record -> new ImmutablePair<>(record.getKey(), record)); + HoodiePairData existingLocationsOnTable = + fetchRecordLocationsForAffectedPartitions(keyedInputRecords.keys(), context, hoodieTable, + config.getSimpleIndexParallelism()); + + HoodieData> taggedRecords = + keyedInputRecords.leftOuterJoin(existingLocationsOnTable).map(entry -> { + final HoodieRecord untaggedRecord = entry.getRight().getLeft(); + final Option location = Option.ofNullable(entry.getRight().getRight().orElse(null)); + return HoodieIndexUtils.getTaggedRecord(untaggedRecord, location); + }); + + if (config.getSimpleIndexUseCaching()) { + inputRecords.unpersist(); + } + return taggedRecords; + } + + /** + * Fetch record locations for passed in {@link HoodieKey}s. + * + * @param hoodieKeys {@link HoodieData} of {@link HoodieKey}s for which locations are fetched + * @param context instance of {@link HoodieEngineContext} to use + * @param hoodieTable instance of {@link HoodieTable} of interest + * @param parallelism parallelism to use + * @return {@link HoodiePairData} of {@link HoodieKey} and {@link HoodieRecordLocation} + */ + protected HoodiePairData fetchRecordLocationsForAffectedPartitions( + HoodieData hoodieKeys, HoodieEngineContext context, HoodieTable hoodieTable, + int parallelism) { + List affectedPartitionPathList = + hoodieKeys.map(HoodieKey::getPartitionPath).distinct().collectAsList(); + List> latestBaseFiles = + getLatestBaseFilesForAllPartitions(affectedPartitionPathList, context, hoodieTable); + return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); + } + + protected HoodiePairData fetchRecordLocations( + HoodieEngineContext context, HoodieTable hoodieTable, int parallelism, + List> baseFiles) { + int fetchParallelism = Math.max(1, Math.min(baseFiles.size(), parallelism)); + + return context.parallelize(baseFiles, fetchParallelism) + .flatMap(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile, keyGeneratorOpt) + .locations().iterator()) + .mapToPair(e -> (Pair) e); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/AppendHandleFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/AppendHandleFactory.java index 5c54dce31cd60..b4c83c141b2bc 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/AppendHandleFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/AppendHandleFactory.java @@ -18,7 +18,7 @@ package org.apache.hudi.io; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/CreateHandleFactory.java 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/CreateHandleFactory.java index 67ebadb2dd300..09131b421f402 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/CreateHandleFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/CreateHandleFactory.java @@ -18,19 +18,31 @@ package org.apache.hudi.io; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; -public class CreateHandleFactory extends WriteHandleFactory { +import java.io.Serializable; + +public class CreateHandleFactory extends WriteHandleFactory implements Serializable { + + private boolean preserveMetadata = false; + + public CreateHandleFactory() { + this(false); + } + + public CreateHandleFactory(boolean preserveMetadata) { + this.preserveMetadata = preserveMetadata; + } @Override public HoodieWriteHandle create(final HoodieWriteConfig hoodieConfig, final String commitTime, - final HoodieTable hoodieTable, final String partitionPath, - final String fileIdPrefix, TaskContextSupplier taskContextSupplier) { + final HoodieTable hoodieTable, final String partitionPath, + final String fileIdPrefix, TaskContextSupplier taskContextSupplier) { return new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, partitionPath, - getNextFileId(fileIdPrefix), taskContextSupplier); + getNextFileId(fileIdPrefix), taskContextSupplier, preserveMetadata); } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index 0c590fe8818c4..e0d40642a6a24 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -18,49 +18,63 @@ package org.apache.hudi.io; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.DeleteRecord; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieDeltaWriteStat; -import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.model.HoodiePayloadProps; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.log.AppendResult; import org.apache.hudi.common.table.log.HoodieLogFormat; import 
org.apache.hudi.common.table.log.HoodieLogFormat.Writer; -import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; +import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.SizeEstimator; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieAppendException; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.table.HoodieTable; - -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Properties; +import java.util.Set; import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.collectColumnRangeMetadata; /** * IO Operation to append data onto an existing file. @@ -69,48 +83,52 @@ public class HoodieAppendHandle extends private static final Logger LOG = LogManager.getLogger(HoodieAppendHandle.class); // This acts as the sequenceID for records written - private static AtomicLong recordIndex = new AtomicLong(1); - private final String fileId; + private static final AtomicLong RECORD_COUNTER = new AtomicLong(1); + + protected final String fileId; // Buffer for holding records in memory before they are flushed to disk - private List recordList = new ArrayList<>(); + private final List recordList = new ArrayList<>(); // Buffer for holding records (to be deleted) in memory before they are flushed to disk - private List keysToDelete = new ArrayList<>(); + private final List recordsToDelete = new ArrayList<>(); + // Incoming records to be written to logs. + protected Iterator> recordItr; + // Writer to log into the file group's latest slice. + protected Writer writer; - private Iterator> recordItr; + protected final List statuses; // Total number of records written during an append - private long recordsWritten = 0; + protected long recordsWritten = 0; // Total number of records deleted during an append - private long recordsDeleted = 0; + protected long recordsDeleted = 0; // Total number of records updated during an append - private long updatedRecordsWritten = 0; + protected long updatedRecordsWritten = 0; + // Total number of new records inserted into the delta file + protected long insertRecordsWritten = 0; + // Average record size for a HoodieRecord. 
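The `averageRecordSize` field introduced here drives the buffering strategy of the append handle: records are accumulated in memory and flushed as one log block once the estimated buffered size (average record size times buffered count) would exceed the configured max block size, with a final flush for the tail. The sketch below is a simplified model of that pattern, with illustrative sizes and plain strings standing in for records.

```java
import java.util.ArrayList;
import java.util.List;

// Simplified model of the buffer-and-flush strategy in HoodieAppendHandle.
final class LogBlockBufferSketch {
  private final List<String> buffer = new ArrayList<>();
  private final long maxBlockSizeBytes;
  private final long averageRecordSizeBytes;

  LogBlockBufferSketch(long maxBlockSizeBytes, long averageRecordSizeBytes) {
    this.maxBlockSizeBytes = maxBlockSizeBytes;
    this.averageRecordSizeBytes = averageRecordSizeBytes;
  }

  void write(String record) {
    // Flush before adding the record if the estimated buffered size would exceed the block limit.
    if ((buffer.size() + 1) * averageRecordSizeBytes > maxBlockSizeBytes) {
      flush();
    }
    buffer.add(record);
  }

  void flush() {
    if (!buffer.isEmpty()) {
      System.out.println("appending data block with " + buffer.size() + " records");
      buffer.clear();
    }
  }

  public static void main(String[] args) {
    LogBlockBufferSketch sketch = new LogBlockBufferSketch(1024, 300);
    for (int i = 0; i < 10; i++) {
      sketch.write("record-" + i);
    }
    sketch.flush(); // flush the tail, mirroring the final appendDataAndDeleteBlocks call
  }
}
```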
This size is updated at the end of every log block flushed to disk private long averageRecordSize = 0; - private HoodieLogFile currentLogFile; - private Writer writer; // Flag used to initialize some metadata private boolean doInit = true; // Total number of bytes written during this append phase (an estimation) - private long estimatedNumberOfBytesWritten; - // Total number of bytes written to file - private long sizeInBytes = 0; + protected long estimatedNumberOfBytesWritten; // Number of records that must be written to meet the max block size for a log block private int numberOfRecords = 0; // Max block size to limit to for a log block - private int maxBlockSize = config.getLogFileDataBlockMaxSize(); + private final int maxBlockSize = config.getLogFileDataBlockMaxSize(); // Header metadata for a log block - private Map header = new HashMap<>(); - // Total number of new records inserted into the delta file - private long insertRecordsWritten = 0; - + protected final Map header = new HashMap<>(); private SizeEstimator sizeEstimator; + private Properties recordProperties = new Properties(); + public HoodieAppendHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, String partitionPath, String fileId, Iterator> recordItr, TaskContextSupplier taskContextSupplier) { super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier); - writeStatus.setStat(new HoodieDeltaWriteStat()); this.fileId = fileId; this.recordItr = recordItr; sizeEstimator = new DefaultSizeEstimator(); + this.statuses = new ArrayList<>(); + this.recordProperties.putAll(config.getProps()); } public HoodieAppendHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, @@ -125,62 +143,92 @@ private void init(HoodieRecord record) { Option fileSlice = rtView.getLatestFileSlice(partitionPath, fileId); // Set the base commit time as the current instantTime for new inserts into log files String baseInstantTime; + String baseFile = ""; + List logFiles = new ArrayList<>(); if (fileSlice.isPresent()) { baseInstantTime = fileSlice.get().getBaseInstantTime(); + baseFile = fileSlice.get().getBaseFile().map(BaseFile::getFileName).orElse(""); + logFiles = fileSlice.get().getLogFiles().map(HoodieLogFile::getFileName).collect(Collectors.toList()); } else { baseInstantTime = instantTime; // This means there is no base data file, start appending to a new log file fileSlice = Option.of(new FileSlice(partitionPath, baseInstantTime, this.fileId)); - LOG.info("New InsertHandle for partition :" + partitionPath); + LOG.info("New AppendHandle for partition :" + partitionPath); } - writeStatus.getStat().setPrevCommit(baseInstantTime); + + // Prepare the first write status + writeStatus.setStat(new HoodieDeltaWriteStat()); writeStatus.setFileId(fileId); writeStatus.setPartitionPath(partitionPath); - writeStatus.getStat().setPartitionPath(partitionPath); - writeStatus.getStat().setFileId(fileId); averageRecordSize = sizeEstimator.sizeEstimate(record); + + HoodieDeltaWriteStat deltaWriteStat = (HoodieDeltaWriteStat) writeStatus.getStat(); + deltaWriteStat.setPrevCommit(baseInstantTime); + deltaWriteStat.setPartitionPath(partitionPath); + deltaWriteStat.setFileId(fileId); + deltaWriteStat.setBaseFile(baseFile); + deltaWriteStat.setLogFiles(logFiles); + try { - //save hoodie partition meta in the partition path + // Save hoodie partition meta in the partition path HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, baseInstantTime, - new Path(config.getBasePath()), 
FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); + new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath), + hoodieTable.getPartitionMetafileFormat()); partitionMetadata.trySave(getPartitionId()); // Since the actual log file written to can be different based on when rollover happens, we use the // base file to denote some log appends happened on a slice. writeToken will still fence concurrent // writers. - createMarkerFile(partitionPath, FSUtils.makeDataFileName(baseInstantTime, writeToken, fileId, hoodieTable.getBaseFileExtension())); + // https://issues.apache.org/jira/browse/HUDI-1517 + createMarkerFile(partitionPath, FSUtils.makeBaseFileName(baseInstantTime, writeToken, fileId, hoodieTable.getBaseFileExtension())); this.writer = createLogWriter(fileSlice, baseInstantTime); - this.currentLogFile = writer.getLogFile(); - ((HoodieDeltaWriteStat) writeStatus.getStat()).setLogVersion(currentLogFile.getLogVersion()); - ((HoodieDeltaWriteStat) writeStatus.getStat()).setLogOffset(writer.getCurrentSize()); } catch (Exception e) { LOG.error("Error in update task at commit " + instantTime, e); writeStatus.setGlobalError(e); throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " - + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + partitionPath, e); + + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + "/" + partitionPath, e); } - Path path = partitionPath.length() == 0 ? new Path(writer.getLogFile().getFileName()) - : new Path(partitionPath, writer.getLogFile().getFileName()); - writeStatus.getStat().setPath(path.toString()); doInit = false; } } + /** + * Returns whether the hoodie record is an UPDATE. + */ + protected boolean isUpdateRecord(HoodieRecord hoodieRecord) { + // If currentLocation is present, then this is an update + return hoodieRecord.getCurrentLocation() != null; + } + private Option getIndexedRecord(HoodieRecord hoodieRecord) { - Option recordMetadata = hoodieRecord.getData().getMetadata(); + Option> recordMetadata = hoodieRecord.getData().getMetadata(); try { - Option avroRecord = hoodieRecord.getData().getInsertValue(writerSchema); + // Pass the isUpdateRecord to the props for HoodieRecordPayload to judge + // Whether it is an update or insert record. + boolean isUpdateRecord = isUpdateRecord(hoodieRecord); + // If the format can not record the operation field, nullify the DELETE payload manually. + boolean nullifyPayload = HoodieOperation.isDelete(hoodieRecord.getOperation()) && !config.allowOperationMetadataField(); + recordProperties.put(HoodiePayloadProps.PAYLOAD_IS_UPDATE_RECORD_FOR_MOR, String.valueOf(isUpdateRecord)); + Option avroRecord = nullifyPayload ? 
Option.empty() : hoodieRecord.getData().getInsertValue(tableSchema, recordProperties); if (avroRecord.isPresent()) { + if (avroRecord.get().equals(IGNORE_RECORD)) { + return avroRecord; + } // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema - avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get())); + GenericRecord rewriteRecord = rewriteRecord((GenericRecord) avroRecord.get()); + avroRecord = Option.of(rewriteRecord); String seqId = - HoodieRecord.generateSequenceId(instantTime, getPartitionId(), recordIndex.getAndIncrement()); - HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(), - hoodieRecord.getPartitionPath(), fileId); - HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId); - // If currentLocation is present, then this is an update - if (hoodieRecord.getCurrentLocation() != null) { + HoodieRecord.generateSequenceId(instantTime, getPartitionId(), RECORD_COUNTER.getAndIncrement()); + if (config.populateMetaFields()) { + HoodieAvroUtils.addHoodieKeyToRecord(rewriteRecord, hoodieRecord.getRecordKey(), + hoodieRecord.getPartitionPath(), fileId); + HoodieAvroUtils.addCommitMetadataToRecord(rewriteRecord, instantTime, seqId); + } + if (config.allowOperationMetadataField()) { + HoodieAvroUtils.addOperationToRecord(rewriteRecord, hoodieRecord.getOperation()); + } + if (isUpdateRecord) { updatedRecordsWritten++; } else { insertRecordsWritten++; @@ -203,6 +251,127 @@ private Option getIndexedRecord(HoodieRecord hoodieRecord) { return Option.empty(); } + private void initNewStatus() { + HoodieDeltaWriteStat prevStat = (HoodieDeltaWriteStat) this.writeStatus.getStat(); + // Make a new write status and copy basic fields over. + HoodieDeltaWriteStat stat = new HoodieDeltaWriteStat(); + stat.setFileId(fileId); + stat.setPartitionPath(partitionPath); + stat.setPrevCommit(prevStat.getPrevCommit()); + stat.setBaseFile(prevStat.getBaseFile()); + stat.setLogFiles(new ArrayList<>(prevStat.getLogFiles())); + + this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(), + !hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction()); + this.writeStatus.setFileId(fileId); + this.writeStatus.setPartitionPath(partitionPath); + this.writeStatus.setStat(stat); + } + + private String makeFilePath(HoodieLogFile logFile) { + return partitionPath.length() == 0 + ? 
new Path(logFile.getFileName()).toString() + : new Path(partitionPath, logFile.getFileName()).toString(); + } + + private void resetWriteCounts() { + recordsWritten = 0; + updatedRecordsWritten = 0; + insertRecordsWritten = 0; + recordsDeleted = 0; + } + + private void updateWriteCounts(HoodieDeltaWriteStat stat, AppendResult result) { + stat.setNumWrites(recordsWritten); + stat.setNumUpdateWrites(updatedRecordsWritten); + stat.setNumInserts(insertRecordsWritten); + stat.setNumDeletes(recordsDeleted); + stat.setTotalWriteBytes(result.size()); + } + + private void accumulateWriteCounts(HoodieDeltaWriteStat stat, AppendResult result) { + stat.setNumWrites(stat.getNumWrites() + recordsWritten); + stat.setNumUpdateWrites(stat.getNumUpdateWrites() + updatedRecordsWritten); + stat.setNumInserts(stat.getNumInserts() + insertRecordsWritten); + stat.setNumDeletes(stat.getNumDeletes() + recordsDeleted); + stat.setTotalWriteBytes(stat.getTotalWriteBytes() + result.size()); + } + + private void updateWriteStat(HoodieDeltaWriteStat stat, AppendResult result) { + stat.setPath(makeFilePath(result.logFile())); + stat.setLogOffset(result.offset()); + stat.setLogVersion(result.logFile().getLogVersion()); + if (!stat.getLogFiles().contains(result.logFile().getFileName())) { + stat.addLogFiles(result.logFile().getFileName()); + } + stat.setFileSizeInBytes(result.size()); + } + + private void updateRuntimeStats(HoodieDeltaWriteStat stat) { + RuntimeStats runtimeStats = new RuntimeStats(); + runtimeStats.setTotalUpsertTime(timer.endTimer()); + stat.setRuntimeStats(runtimeStats); + } + + private void accumulateRuntimeStats(HoodieDeltaWriteStat stat) { + RuntimeStats runtimeStats = stat.getRuntimeStats(); + assert runtimeStats != null; + runtimeStats.setTotalUpsertTime(runtimeStats.getTotalUpsertTime() + timer.endTimer()); + } + + private void updateWriteStatus(HoodieDeltaWriteStat stat, AppendResult result) { + updateWriteStat(stat, result); + updateWriteCounts(stat, result); + updateRuntimeStats(stat); + statuses.add(this.writeStatus); + } + + private void processAppendResult(AppendResult result, List recordList) { + HoodieDeltaWriteStat stat = (HoodieDeltaWriteStat) this.writeStatus.getStat(); + + if (stat.getPath() == null) { + // first time writing to this log block. + updateWriteStatus(stat, result); + } else if (stat.getPath().endsWith(result.logFile().getFileName())) { + // append/continued writing to the same log file + stat.setLogOffset(Math.min(stat.getLogOffset(), result.offset())); + stat.setFileSizeInBytes(stat.getFileSizeInBytes() + result.size()); + accumulateWriteCounts(stat, result); + accumulateRuntimeStats(stat); + } else { + // written to a newer log file, due to rollover/otherwise. 
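The three branches around this point reduce to a comparison between the write stat's current path and the log file the append actually landed in: no path yet means this is the first block, a matching file name means counts and sizes are accumulated into the same status, and anything else means the writer rolled over and a fresh write status is opened. A minimal routing sketch of that decision follows; the file names are illustrative.

```java
// Simplified decision model of processAppendResult's three cases.
final class AppendResultRoutingSketch {
  enum Outcome { FIRST_BLOCK, SAME_LOG_FILE, ROLLED_OVER }

  static Outcome route(String statPath, String appendedLogFileName) {
    if (statPath == null) {
      return Outcome.FIRST_BLOCK;   // first block: initialize the stat
    }
    if (statPath.endsWith(appendedLogFileName)) {
      return Outcome.SAME_LOG_FILE; // same file: accumulate counts and sizes
    }
    return Outcome.ROLLED_OVER;     // rollover: open a fresh write status
  }

  public static void main(String[] args) {
    System.out.println(route(null, ".f1.log.1"));              // FIRST_BLOCK
    System.out.println(route("part/.f1.log.1", ".f1.log.1"));  // SAME_LOG_FILE
    System.out.println(route("part/.f1.log.1", ".f1.log.2"));  // ROLLED_OVER
  }
}
```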
+ initNewStatus(); + stat = (HoodieDeltaWriteStat) this.writeStatus.getStat(); + updateWriteStatus(stat, result); + } + + if (config.isMetadataColumnStatsIndexEnabled()) { + final List fieldsToIndex; + // If column stats index is enabled but columns not configured then we assume that + // all columns should be indexed + if (config.getColumnsEnabledForColumnStatsIndex().isEmpty()) { + fieldsToIndex = writeSchemaWithMetaFields.getFields(); + } else { + Set columnsToIndexSet = new HashSet<>(config.getColumnsEnabledForColumnStatsIndex()); + + fieldsToIndex = writeSchemaWithMetaFields.getFields().stream() + .filter(field -> columnsToIndexSet.contains(field.name())) + .collect(Collectors.toList()); + } + + Map> columnRangesMetadataMap = + collectColumnRangeMetadata(recordList, fieldsToIndex, stat.getPath()); + + stat.setRecordsStats(columnRangesMetadataMap); + } + + resetWriteCounts(); + assert stat.getRuntimeStats() != null; + LOG.info(String.format("AppendHandle for partitionPath %s filePath %s, took %d ms.", partitionPath, + stat.getPath(), stat.getRuntimeStats().getTotalUpsertTime())); + timer.startTimer(); + } + public void doAppend() { while (recordItr.hasNext()) { HoodieRecord record = recordItr.next(); @@ -210,24 +379,35 @@ public void doAppend() { flushToDiskIfRequired(record); writeToBuffer(record); } - doAppend(header); + appendDataAndDeleteBlocks(header); estimatedNumberOfBytesWritten += averageRecordSize * numberOfRecords; } - private void doAppend(Map header) { + protected void appendDataAndDeleteBlocks(Map header) { try { header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writerSchemaWithMetafields.toString()); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchemaWithMetaFields.toString()); + List blocks = new ArrayList<>(2); if (recordList.size() > 0) { - writer = writer.appendBlock(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header)); - recordList.clear(); + String keyField = config.populateMetaFields() + ? 
HoodieRecord.RECORD_KEY_METADATA_FIELD + : hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp(); + + blocks.add(getBlock(config, pickLogDataBlockFormat(), recordList, header, keyField)); } - if (keysToDelete.size() > 0) { - writer = writer.appendBlock(new HoodieDeleteBlock(keysToDelete.toArray(new HoodieKey[keysToDelete.size()]), header)); - keysToDelete.clear(); + + if (recordsToDelete.size() > 0) { + blocks.add(new HoodieDeleteBlock(recordsToDelete.toArray(new DeleteRecord[0]), header)); + } + + if (blocks.size() > 0) { + AppendResult appendResult = writer.appendBlocks(blocks); + processAppendResult(appendResult, recordList); + recordList.clear(); + recordsToDelete.clear(); } } catch (Exception e) { - throw new HoodieAppendException("Failed while appending records to " + currentLogFile.getPath(), e); + throw new HoodieAppendException("Failed while appending records to " + writer.getLogFile().getPath(), e); } } @@ -239,7 +419,7 @@ public boolean canWrite(HoodieRecord record) { @Override public void write(HoodieRecord record, Option insertValue) { - Option recordMetadata = record.getData().getMetadata(); + Option> recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata(); try { init(record); flushToDiskIfRequired(record); @@ -253,62 +433,62 @@ public void write(HoodieRecord record, Option insertValue) { } @Override - public WriteStatus close() { + public List close() { try { // flush any remaining records to disk - doAppend(header); - + appendDataAndDeleteBlocks(header); + recordItr = null; if (writer != null) { - sizeInBytes = writer.getCurrentSize(); writer.close(); - } + writer = null; - HoodieWriteStat stat = writeStatus.getStat(); - stat.setFileId(this.fileId); - stat.setNumWrites(recordsWritten); - stat.setNumUpdateWrites(updatedRecordsWritten); - stat.setNumInserts(insertRecordsWritten); - stat.setNumDeletes(recordsDeleted); - stat.setTotalWriteBytes(estimatedNumberOfBytesWritten); - stat.setFileSizeInBytes(sizeInBytes); - stat.setTotalWriteErrors(writeStatus.getTotalErrorRecords()); - RuntimeStats runtimeStats = new RuntimeStats(); - runtimeStats.setTotalUpsertTime(timer.endTimer()); - stat.setRuntimeStats(runtimeStats); - - LOG.info(String.format("AppendHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(), - stat.getFileId(), runtimeStats.getTotalUpsertTime())); - - return writeStatus; + // update final size, once for all log files + // TODO we can actually deduce file size purely from AppendResult (based on offset and size + // of the appended block) + for (WriteStatus status : statuses) { + long logFileSize = FSUtils.getFileSize(fs, new Path(config.getBasePath(), status.getStat().getPath())); + status.getStat().setFileSizeInBytes(logFileSize); + } + } + return statuses; } catch (IOException e) { throw new HoodieUpsertException("Failed to close UpdateHandle", e); } } - @Override - public WriteStatus getWriteStatus() { - return writeStatus; - } - @Override public IOType getIOType() { return IOType.APPEND; } + public List writeStatuses() { + return statuses; + } + private Writer createLogWriter(Option fileSlice, String baseCommitTime) - throws IOException, InterruptedException { + throws IOException { Option latestLogFile = fileSlice.get().getLatestLogFile(); return HoodieLogFormat.newWriterBuilder() .onParentPath(FSUtils.getPartitionPath(hoodieTable.getMetaClient().getBasePath(), partitionPath)) - .withFileId(fileId).overBaseCommit(baseCommitTime) + .withFileId(fileId) + .overBaseCommit(baseCommitTime) 
.withLogVersion(latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION)) - .withSizeThreshold(config.getLogFileMaxSize()).withFs(fs) - .withLogWriteToken(latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken)) + .withFileSize(latestLogFile.map(HoodieLogFile::getFileSize).orElse(0L)) + .withSizeThreshold(config.getLogFileMaxSize()) + .withFs(fs) .withRolloverLogWriteToken(writeToken) + .withLogWriteToken(latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken)) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); } + /** + * Whether there is need to update the record location. + */ + protected boolean needsUpdateLocation() { + return true; + } + private void writeToBuffer(HoodieRecord record) { if (!partitionPath.equals(record.getPartitionPath())) { HoodieUpsertException failureEx = new HoodieUpsertException("mismatched partition path, record partition: " @@ -318,14 +498,21 @@ private void writeToBuffer(HoodieRecord record) { } // update the new location of the record, so we know where to find it next - record.unseal(); - record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); - record.seal(); + if (needsUpdateLocation()) { + record.unseal(); + record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); + record.seal(); + } + // fetch the ordering val first in case the record was deflated. + final Comparable orderingVal = record.getData().getOrderingValue(); Option indexedRecord = getIndexedRecord(record); if (indexedRecord.isPresent()) { - recordList.add(indexedRecord.get()); + // Skip the ignored record. + if (!indexedRecord.get().equals(IGNORE_RECORD)) { + recordList.add(indexedRecord.get()); + } } else { - keysToDelete.add(record.getKey()); + recordsToDelete.add(DeleteRecord.create(record.getKey(), orderingVal)); } numberOfRecords++; } @@ -340,9 +527,46 @@ private void flushToDiskIfRequired(HoodieRecord record) { // avg of new and old LOG.info("AvgRecordSize => " + averageRecordSize); averageRecordSize = (averageRecordSize + sizeEstimator.sizeEstimate(record)) / 2; - doAppend(header); + appendDataAndDeleteBlocks(header); estimatedNumberOfBytesWritten += averageRecordSize * numberOfRecords; numberOfRecords = 0; } } + + private HoodieLogBlock.HoodieLogBlockType pickLogDataBlockFormat() { + Option logBlockTypeOpt = config.getLogDataBlockFormat(); + if (logBlockTypeOpt.isPresent()) { + return logBlockTypeOpt.get(); + } + + // Fallback to deduce data-block type based on the base file format + switch (hoodieTable.getBaseFileFormat()) { + case PARQUET: + case ORC: + return HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK; + case HFILE: + return HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK; + default: + throw new HoodieException("Base file format " + hoodieTable.getBaseFileFormat() + + " does not have associated log block type"); + } + } + + private static HoodieLogBlock getBlock(HoodieWriteConfig writeConfig, + HoodieLogBlock.HoodieLogBlockType logDataBlockFormat, + List recordList, + Map header, + String keyField) { + switch (logDataBlockFormat) { + case AVRO_DATA_BLOCK: + return new HoodieAvroDataBlock(recordList, header, keyField); + case HFILE_DATA_BLOCK: + return new HoodieHFileDataBlock( + recordList, header, writeConfig.getHFileCompressionAlgorithm(), new Path(writeConfig.getBasePath())); + case PARQUET_DATA_BLOCK: + return new HoodieParquetDataBlock(recordList, header, keyField, writeConfig.getParquetCompressionCodec()); + default: + throw new 
HoodieException("Data block format " + logDataBlockFormat + " not implemented"); + } + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieBootstrapHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieBootstrapHandle.java index b2eaedd53ce7d..8e7f66467a6d0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieBootstrapHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieBootstrapHandle.java @@ -19,10 +19,10 @@ package org.apache.hudi.io; import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -38,8 +38,7 @@ public class HoodieBootstrapHandle exten public HoodieBootstrapHandle(HoodieWriteConfig config, String commitTime, HoodieTable hoodieTable, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) { super(config, commitTime, hoodieTable, partitionPath, fileId, - Pair.of(HoodieAvroUtils.RECORD_KEY_SCHEMA, - HoodieAvroUtils.addMetadataFields(HoodieAvroUtils.RECORD_KEY_SCHEMA)), taskContextSupplier); + Option.of(HoodieAvroUtils.RECORD_KEY_SCHEMA), taskContextSupplier); } @Override diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java new file mode 100644 index 0000000000000..ca245e0c391ba --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.generic.GenericRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import javax.annotation.concurrent.NotThreadSafe; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.Map; + +/** + * Handle to concatenate new records to old records w/o any merging. If Operation is set to Inserts, and if {{@link HoodieWriteConfig#allowDuplicateInserts()}} + * is set, this handle will be used instead of {@link HoodieMergeHandle}. + * + * Simplified Logic: + * For every existing record + * Write the record as is + * For all incoming records, write to file as is, without de-duplicating based on the record key. + * + * Illustration with simple data. + * Incoming data: + * rec1_2, rec1_3, rec4_2, rec5_1, rec6_1 + * Existing data: + * rec1_1, rec2_1, rec3_1, rec4_1 + * + * For every existing record, write to storage as is. + * => rec1_1, rec2_1, rec3_1 and rec4_1 is written to storage + * Write all records from incoming set to storage + * => rec1_2, rec1_3, rec4_2, rec5_1 and rec6_1 + * + * Final snapshot in storage + * rec1_1, rec2_1, rec3_1, rec4_1, rec1_2, rec1_3, rec4_2, rec5_1, rec6_1 + * + * Users should ensure there are no duplicates when "insert" operation is used and if the respective config is enabled. So, above scenario should not + * happen and every batch should have new records to be inserted. Above example is for illustration purposes only. + */ +@NotThreadSafe +public class HoodieConcatHandle extends HoodieMergeHandle { + + private static final Logger LOG = LogManager.getLogger(HoodieConcatHandle.class); + // a representation of incoming records that tolerates duplicate keys + private final Iterator> recordItr; + + public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier, Option keyGeneratorOpt) { + super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier, keyGeneratorOpt); + this.recordItr = recordItr; + } + + public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Map> keyToNewRecords, String partitionPath, String fileId, + HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) { + super(config, instantTime, hoodieTable, Collections.emptyMap(), partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, + Option.empty()); + this.recordItr = keyToNewRecords.values().iterator(); + } + + /** + * Write old record as is w/o merging with incoming record. 
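+ * The record key is read from the old record (via the configured key generator when meta
+ * fields are not populated) and the record is copied into the new file with its metadata preserved.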
+ */ + @Override + public void write(GenericRecord oldRecord) { + String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt); + try { + // NOTE: We're enforcing preservation of the record metadata to keep existing semantic + writeToFile(new HoodieKey(key, partitionPath), oldRecord, true); + } catch (IOException | RuntimeException e) { + String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", + key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); + LOG.debug("Old record is " + oldRecord); + throw new HoodieUpsertException(errMsg, e); + } + recordsWritten++; + } + + @Override + protected void writeIncomingRecords() throws IOException { + while (recordItr.hasNext()) { + HoodieRecord record = recordItr.next(); + if (needsUpdateLocation()) { + record.unseal(); + record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); + record.seal(); + } + writeInsertRecord(record); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java index 6a8e7735093d5..738e2d6b48d13 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java @@ -19,9 +19,13 @@ package org.apache.hudi.io; import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -30,57 +34,75 @@ import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; import org.apache.hudi.common.model.IOType; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; import org.apache.hudi.io.storage.HoodieFileWriter; import org.apache.hudi.io.storage.HoodieFileWriterFactory; import org.apache.hudi.table.HoodieTable; - -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import javax.annotation.concurrent.NotThreadSafe; + import java.io.IOException; +import java.util.Collections; import java.util.Iterator; +import java.util.List; import java.util.Map; +@NotThreadSafe public class HoodieCreateHandle extends HoodieWriteHandle { private static final Logger LOG = LogManager.getLogger(HoodieCreateHandle.class); - private final HoodieFileWriter fileWriter; - private final Path path; - private long recordsWritten = 0; - private long insertRecordsWritten = 0; - private long recordsDeleted = 0; + protected final HoodieFileWriter fileWriter; + protected final Path path; + protected long recordsWritten = 0; + protected long insertRecordsWritten = 0; + protected long recordsDeleted = 0; private Map> recordMap; private boolean useWriterSchema = false; + 
private final boolean preserveMetadata; public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) { - this(config, instantTime, hoodieTable, partitionPath, fileId, getWriterSchemaIncludingAndExcludingMetadataPair(config), - taskContextSupplier); + this(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(), + taskContextSupplier, false); } public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, - String partitionPath, String fileId, Pair writerSchemaIncludingAndExcludingMetadataPair, + String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, + boolean preserveMetadata) { + this(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(), + taskContextSupplier, preserveMetadata); + } + + public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + String partitionPath, String fileId, Option overriddenSchema, TaskContextSupplier taskContextSupplier) { - super(config, instantTime, partitionPath, fileId, hoodieTable, writerSchemaIncludingAndExcludingMetadataPair, + this(config, instantTime, hoodieTable, partitionPath, fileId, overriddenSchema, taskContextSupplier, false); + } + + public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + String partitionPath, String fileId, Option overriddenSchema, + TaskContextSupplier taskContextSupplier, boolean preserveMetadata) { + super(config, instantTime, partitionPath, fileId, hoodieTable, overriddenSchema, taskContextSupplier); + this.preserveMetadata = preserveMetadata; writeStatus.setFileId(fileId); writeStatus.setPartitionPath(partitionPath); + writeStatus.setStat(new HoodieWriteStat()); this.path = makeNewPath(partitionPath); try { HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime, - new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); + new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath), + hoodieTable.getPartitionMetafileFormat()); partitionMetadata.trySave(getPartitionId()); - createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, this.writeToken, this.fileId, hoodieTable.getBaseFileExtension())); - this.fileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, writerSchemaWithMetafields, this.taskContextSupplier); + createMarkerFile(partitionPath, FSUtils.makeBaseFileName(this.instantTime, this.writeToken, this.fileId, hoodieTable.getBaseFileExtension())); + this.fileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, + writeSchemaWithMetaFields, this.taskContextSupplier); } catch (IOException e) { throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e); } @@ -93,14 +115,15 @@ public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTa public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, String partitionPath, String fileId, Map> recordMap, TaskContextSupplier taskContextSupplier) { - this(config, instantTime, hoodieTable, partitionPath, fileId, taskContextSupplier); + this(config, instantTime, hoodieTable, partitionPath, fileId, taskContextSupplier, config.isPreserveHoodieCommitMetadataForCompaction()); this.recordMap = recordMap; this.useWriterSchema 
= true; } @Override public boolean canWrite(HoodieRecord record) { - return fileWriter.canWrite() && record.getPartitionPath().equals(writeStatus.getPartitionPath()); + return (fileWriter.canWrite() && record.getPartitionPath().equals(writeStatus.getPartitionPath())) + || layoutControlsNumFiles(); } /** @@ -108,12 +131,22 @@ public boolean canWrite(HoodieRecord record) { */ @Override public void write(HoodieRecord record, Option avroRecord) { - Option recordMetadata = record.getData().getMetadata(); + Option recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata(); + if (HoodieOperation.isDelete(record.getOperation())) { + avroRecord = Option.empty(); + } try { if (avroRecord.isPresent()) { + if (avroRecord.get().equals(IGNORE_RECORD)) { + return; + } // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema - IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get()); - fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record); + if (preserveMetadata) { + fileWriter.writeAvro(record.getRecordKey(), + rewriteRecordWithMetadata((GenericRecord) avroRecord.get(), path.getName())); + } else { + fileWriter.writeAvroWithMetadata(record.getKey(), rewriteRecord((GenericRecord) avroRecord.get())); + } // update the new location of record, so we know where to find it next record.unseal(); record.setNewLocation(new HoodieRecordLocation(instantTime, writeStatus.getFileId())); @@ -152,9 +185,9 @@ public void write() { final String key = keyIterator.next(); HoodieRecord record = recordMap.get(key); if (useWriterSchema) { - write(record, record.getData().getInsertValue(writerSchemaWithMetafields)); + write(record, record.getData().getInsertValue(tableSchemaWithMetaFields, config.getProps())); } else { - write(record, record.getData().getInsertValue(writerSchema)); + write(record, record.getData().getInsertValue(tableSchema, config.getProps())); } } } catch (IOException io) { @@ -162,11 +195,6 @@ public void write() { } } - @Override - public WriteStatus getWriteStatus() { - return writeStatus; - } - @Override public IOType getIOType() { return IOType.CREATE; @@ -176,36 +204,46 @@ public IOType getIOType() { * Performs actions to durably, persist the current changes and returns a WriteStatus object. 
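+ * The result is returned as a single-element list of {@link WriteStatus} for this handle's file.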
*/ @Override - public WriteStatus close() { - LOG - .info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " + recordsWritten); + public List close() { + LOG.info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " + recordsWritten); try { fileWriter.close(); - HoodieWriteStat stat = new HoodieWriteStat(); - stat.setPartitionPath(writeStatus.getPartitionPath()); - stat.setNumWrites(recordsWritten); - stat.setNumDeletes(recordsDeleted); - stat.setNumInserts(insertRecordsWritten); - stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); - stat.setFileId(writeStatus.getFileId()); - stat.setPath(new Path(config.getBasePath()), path); - long fileSizeInBytes = FSUtils.getFileSize(fs, path); - stat.setTotalWriteBytes(fileSizeInBytes); - stat.setFileSizeInBytes(fileSizeInBytes); - stat.setTotalWriteErrors(writeStatus.getTotalErrorRecords()); - RuntimeStats runtimeStats = new RuntimeStats(); - runtimeStats.setTotalCreateTime(timer.endTimer()); - stat.setRuntimeStats(runtimeStats); - writeStatus.setStat(stat); - - LOG.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(), - stat.getFileId(), runtimeStats.getTotalCreateTime())); - - return writeStatus; + setupWriteStatus(); + + LOG.info(String.format("CreateHandle for partitionPath %s fileID %s, took %d ms.", + writeStatus.getStat().getPartitionPath(), writeStatus.getStat().getFileId(), + writeStatus.getStat().getRuntimeStats().getTotalCreateTime())); + + return Collections.singletonList(writeStatus); } catch (IOException e) { throw new HoodieInsertException("Failed to close the Insert Handle for path " + path, e); } } + + /** + * Set up the write status. + * + * @throws IOException if error occurs + */ + protected void setupWriteStatus() throws IOException { + HoodieWriteStat stat = writeStatus.getStat(); + stat.setPartitionPath(writeStatus.getPartitionPath()); + stat.setNumWrites(recordsWritten); + stat.setNumDeletes(recordsDeleted); + stat.setNumInserts(insertRecordsWritten); + stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); + stat.setFileId(writeStatus.getFileId()); + stat.setPath(new Path(config.getBasePath()), path); + stat.setTotalWriteErrors(writeStatus.getTotalErrorRecords()); + + long fileSize = FSUtils.getFileSize(fs, path); + stat.setTotalWriteBytes(fileSize); + stat.setFileSizeInBytes(fileSize); + + RuntimeStats runtimeStats = new RuntimeStats(); + runtimeStats.setTotalCreateTime(timer.endTimer()); + stat.setRuntimeStats(runtimeStats); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java index c6f9dddef30db..1ad28d14b3a8d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java @@ -19,6 +19,8 @@ package org.apache.hudi.io; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -31,8 +33,8 @@ public abstract class HoodieIOHandle { protected final FileSystem fs; protected final HoodieTable hoodieTable; - HoodieIOHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable) { - this.instantTime = instantTime; + HoodieIOHandle(HoodieWriteConfig config, Option 
instantTime, HoodieTable hoodieTable) { + this.instantTime = instantTime.orElse(StringUtils.EMPTY_STRING); this.config = config; this.hoodieTable = hoodieTable; this.fs = getFileSystem(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java index 9194fc042f988..ab8b83c14aeec 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java @@ -22,16 +22,18 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.hudi.table.HoodieTable; import org.apache.hadoop.fs.Path; -import java.util.Iterator; - -import scala.Tuple2; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; /** * {@link HoodieRecordLocation} fetch handle for all records from {@link HoodieBaseFile} of interest. @@ -41,17 +43,26 @@ public class HoodieKeyLocationFetchHandle extends HoodieReadHandle { private final Pair partitionPathBaseFilePair; + private final Option keyGeneratorOpt; public HoodieKeyLocationFetchHandle(HoodieWriteConfig config, HoodieTable hoodieTable, - Pair partitionPathBaseFilePair) { - super(config, null, hoodieTable, Pair.of(partitionPathBaseFilePair.getLeft(), partitionPathBaseFilePair.getRight().getFileId())); + Pair partitionPathBaseFilePair, Option keyGeneratorOpt) { + super(config, hoodieTable, Pair.of(partitionPathBaseFilePair.getLeft(), partitionPathBaseFilePair.getRight().getFileId())); this.partitionPathBaseFilePair = partitionPathBaseFilePair; + this.keyGeneratorOpt = keyGeneratorOpt; } - public Iterator> locations() { + public Stream> locations() { HoodieBaseFile baseFile = partitionPathBaseFilePair.getRight(); - return ParquetUtils.fetchRecordKeyPartitionPathFromParquet(hoodieTable.getHadoopConf(), new Path(baseFile.getPath())).stream() - .map(entry -> new Tuple2<>(entry, - new HoodieRecordLocation(baseFile.getCommitTime(), baseFile.getFileId()))).iterator(); + BaseFileUtils baseFileUtils = BaseFileUtils.getInstance(baseFile.getPath()); + List hoodieKeyList = new ArrayList<>(); + if (keyGeneratorOpt.isPresent()) { + hoodieKeyList = baseFileUtils.fetchHoodieKeys(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()), keyGeneratorOpt); + } else { + hoodieKeyList = baseFileUtils.fetchHoodieKeys(hoodieTable.getHadoopConf(), new Path(baseFile.getPath())); + } + return hoodieKeyList.stream() + .map(entry -> Pair.of(entry, + new HoodieRecordLocation(baseFile.getCommitTime(), baseFile.getFileId()))); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java index ad84e3e974af8..a38ae7f1f149b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java @@ -21,23 +21,23 @@ 
import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.table.HoodieTable; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; + +import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS; /** * Takes a bunch of keys and returns ones that are present in the file group. @@ -46,52 +46,37 @@ public class HoodieKeyLookupHandle exten private static final Logger LOG = LogManager.getLogger(HoodieKeyLookupHandle.class); - private final HoodieTableType tableType; - private final BloomFilter bloomFilter; - private final List candidateRecordKeys; - private long totalKeysChecked; public HoodieKeyLookupHandle(HoodieWriteConfig config, HoodieTable hoodieTable, - Pair partitionPathFilePair) { - super(config, null, hoodieTable, partitionPathFilePair); - this.tableType = hoodieTable.getMetaClient().getTableType(); + Pair partitionPathFileIDPair) { + super(config, hoodieTable, partitionPathFileIDPair); this.candidateRecordKeys = new ArrayList<>(); this.totalKeysChecked = 0; - HoodieTimer timer = new HoodieTimer().startTimer(); - - try { - this.bloomFilter = createNewFileReader().readBloomFilter(); - } catch (IOException e) { - throw new HoodieIndexException(String.format("Error reading bloom filter from %s: %s", partitionPathFilePair, e)); - } - LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFilePair, timer.endTimer())); + this.bloomFilter = getBloomFilter(); } - /** - * Given a list of row keys and one file, return only row keys existing in that file. - */ - public List checkCandidatesAgainstFile(Configuration configuration, List candidateRecordKeys, - Path filePath) throws HoodieIndexException { - List foundRecordKeys = new ArrayList<>(); + private BloomFilter getBloomFilter() { + BloomFilter bloomFilter = null; + HoodieTimer timer = new HoodieTimer().startTimer(); try { - // Load all rowKeys from the file, to double-confirm - if (!candidateRecordKeys.isEmpty()) { - HoodieTimer timer = new HoodieTimer().startTimer(); - Set fileRowKeys = createNewFileReader().filterRowKeys(new HashSet<>(candidateRecordKeys)); - foundRecordKeys.addAll(fileRowKeys); - LOG.info(String.format("Checked keys against file %s, in %d ms. 
#candidates (%d) #found (%d)", filePath, - timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size())); - if (LOG.isDebugEnabled()) { - LOG.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); + if (config.getBloomIndexUseMetadata() + && hoodieTable.getMetaClient().getTableConfig().getMetadataPartitions() + .contains(BLOOM_FILTERS.getPartitionPath())) { + bloomFilter = hoodieTable.getMetadataTable().getBloomFilter(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight()) + .orElseThrow(() -> new HoodieIndexException("BloomFilter missing for " + partitionPathFileIDPair.getRight())); + } else { + try (HoodieFileReader reader = createNewFileReader()) { + bloomFilter = reader.readBloomFilter(); } } - } catch (Exception e) { - throw new HoodieIndexException("Error checking candidate keys against file.", e); + } catch (IOException e) { + throw new HoodieIndexException(String.format("Error reading bloom filter from %s", getPartitionPathFileIDPair()), e); } - return foundRecordKeys; + LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFileIDPair, timer.endTimer())); + return bloomFilter; } /** @@ -101,7 +86,7 @@ public void addKey(String recordKey) { // check record key against bloom filter of current file & add to possible keys if needed if (bloomFilter.mightContain(recordKey)) { if (LOG.isDebugEnabled()) { - LOG.debug("Record key " + recordKey + " matches bloom filter in " + partitionPathFilePair); + LOG.debug("Record key " + recordKey + " matches bloom filter in " + partitionPathFileIDPair); } candidateRecordKeys.add(recordKey); } @@ -111,53 +96,18 @@ public void addKey(String recordKey) { /** * Of all the keys, that were added, return a list of keys that were actually found in the file group. */ - public KeyLookupResult getLookupResult() { + public HoodieKeyLookupResult getLookupResult() { if (LOG.isDebugEnabled()) { - LOG.debug("#The candidate row keys for " + partitionPathFilePair + " => " + candidateRecordKeys); + LOG.debug("#The candidate row keys for " + partitionPathFileIDPair + " => " + candidateRecordKeys); } HoodieBaseFile dataFile = getLatestDataFile(); - List matchingKeys = - checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, new Path(dataFile.getPath())); + List matchingKeys = HoodieIndexUtils.filterKeysFromFile(new Path(dataFile.getPath()), candidateRecordKeys, + hoodieTable.getHadoopConf()); LOG.info( String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size())); - return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(), + return new HoodieKeyLookupResult(partitionPathFileIDPair.getRight(), partitionPathFileIDPair.getLeft(), dataFile.getCommitTime(), matchingKeys); } - - /** - * Encapsulates the result from a key lookup. 
- */ - public static class KeyLookupResult { - - private final String fileId; - private final String baseInstantTime; - private final List matchingRecordKeys; - private final String partitionPath; - - public KeyLookupResult(String fileId, String partitionPath, String baseInstantTime, - List matchingRecordKeys) { - this.fileId = fileId; - this.partitionPath = partitionPath; - this.baseInstantTime = baseInstantTime; - this.matchingRecordKeys = matchingRecordKeys; - } - - public String getFileId() { - return fileId; - } - - public String getBaseInstantTime() { - return baseInstantTime; - } - - public String getPartitionPath() { - return partitionPath; - } - - public List getMatchingRecordKeys() { - return matchingRecordKeys; - } - } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupResult.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupResult.java new file mode 100644 index 0000000000000..19096a21d8700 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupResult.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import java.util.List; + +/** + * Encapsulates the result from a key lookup. 
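+ * Holds the file id, partition path and base instant time of the probed file group, along
+ * with the record keys that were actually found in it.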
+ */ +public class HoodieKeyLookupResult { + + private final String fileId; + private final String baseInstantTime; + private final List matchingRecordKeys; + private final String partitionPath; + + public HoodieKeyLookupResult(String fileId, String partitionPath, String baseInstantTime, + List matchingRecordKeys) { + this.fileId = fileId; + this.partitionPath = partitionPath; + this.baseInstantTime = baseInstantTime; + this.matchingRecordKeys = matchingRecordKeys; + } + + public String getFileId() { + return fileId; + } + + public String getBaseInstantTime() { + return baseInstantTime; + } + + public String getPartitionPath() { + return partitionPath; + } + + public List getMatchingRecordKeys() { + return matchingRecordKeys; + } +} + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index cab7283f4e06b..6e172d01a6520 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -19,9 +19,11 @@ package org.apache.hudi.io; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -32,6 +34,7 @@ import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.HoodieRecordSizeEstimator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieCorruptedDataException; @@ -40,6 +43,8 @@ import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.io.storage.HoodieFileWriter; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; @@ -49,36 +54,80 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import javax.annotation.concurrent.NotThreadSafe; + import java.io.IOException; +import java.util.Collections; import java.util.HashSet; import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; import java.util.Map; import java.util.Set; @SuppressWarnings("Duplicates") +/** + * Handle to merge incoming records to those in storage. + *
+ * Simplified Logic: + * For every existing record + * Check if there is a new record coming in. If yes, merge two records and write to file + * else write the record as is + * For all pending records from incoming batch, write to file. + * + * Illustration with simple data. + * Incoming data: + * rec1_2, rec4_2, rec5_1, rec6_1 + * Existing data: + * rec1_1, rec2_1, rec3_1, rec4_1 + * + * For every existing record, merge w/ incoming if required and write to storage. + * => rec1_1 and rec1_2 is merged to write rec1_2 to storage + * => rec2_1 is written as is + * => rec3_1 is written as is + * => rec4_2 and rec4_1 is merged to write rec4_2 to storage + * Write all pending records from incoming set to storage + * => rec5_1 and rec6_1 + * + * Final snapshot in storage + * rec1_2, rec2_1, rec3_1, rec4_2, rec5_1, rec6_1 + * + *
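+ * Incoming records are buffered in a spillable map keyed by record key, so merges larger than
+ * the configured memory budget can spill to disk.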
+ */ +@NotThreadSafe public class HoodieMergeHandle extends HoodieWriteHandle { private static final Logger LOG = LogManager.getLogger(HoodieMergeHandle.class); protected Map> keyToNewRecords; protected Set writtenRecordKeys; - private HoodieFileWriter fileWriter; - - private Path newFilePath; - private Path oldFilePath; - private long recordsWritten = 0; - private long recordsDeleted = 0; - private long updatedRecordsWritten = 0; + protected HoodieFileWriter fileWriter; + private boolean preserveMetadata = false; + + protected Path newFilePath; + protected Path oldFilePath; + protected long recordsWritten = 0; + protected long recordsDeleted = 0; + protected long updatedRecordsWritten = 0; protected long insertRecordsWritten = 0; - protected boolean useWriterSchema; + protected boolean useWriterSchemaForCompaction; + protected Option keyGeneratorOpt; private HoodieBaseFile baseFileToMerge; public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator> recordItr, String partitionPath, String fileId, - TaskContextSupplier taskContextSupplier) { + TaskContextSupplier taskContextSupplier, Option keyGeneratorOpt) { + this(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, + getLatestBaseFile(hoodieTable, partitionPath, fileId), keyGeneratorOpt); + } + + public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier, HoodieBaseFile baseFile, Option keyGeneratorOpt) { super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier); init(fileId, recordItr); - init(fileId, partitionPath, hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId).get()); + init(fileId, partitionPath, baseFile); + validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields()); } /** @@ -86,20 +135,35 @@ public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTab */ public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map> keyToNewRecords, String partitionPath, String fileId, - HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) { + HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier, Option keyGeneratorOpt) { super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier); this.keyToNewRecords = keyToNewRecords; - this.useWriterSchema = true; + this.useWriterSchemaForCompaction = true; + this.preserveMetadata = config.isPreserveHoodieCommitMetadataForCompaction(); init(fileId, this.partitionPath, dataFileToBeMerged); + validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields()); + } + + private void validateAndSetAndKeyGenProps(Option keyGeneratorOpt, boolean populateMetaFields) { + ValidationUtils.checkArgument(populateMetaFields == !keyGeneratorOpt.isPresent()); + this.keyGeneratorOpt = keyGeneratorOpt; + } + + public static HoodieBaseFile getLatestBaseFile(HoodieTable hoodieTable, String partitionPath, String fileId) { + Option baseFileOp = hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId); + if (!baseFileOp.isPresent()) { + throw new NoSuchElementException(String.format("FileID %s of partition path %s does not exist.", fileId, partitionPath)); + } + return baseFileOp.get(); } @Override - public Schema getWriterSchemaWithMetafields() { - return writerSchemaWithMetafields; + public 
Schema getWriterSchemaWithMetaFields() { + return writeSchemaWithMetaFields; } public Schema getWriterSchema() { - return writerSchema; + return writeSchema; } /** @@ -115,14 +179,12 @@ private void init(String fileId, String partitionPath, HoodieBaseFile baseFileTo writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath)); HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime, - new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); + new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath), + hoodieTable.getPartitionMetafileFormat()); partitionMetadata.trySave(getPartitionId()); - oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath); - String newFileName = FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension()); - String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/") - + newFileName).toString(); - newFilePath = new Path(config.getBasePath(), relativePath); + String newFileName = FSUtils.makeBaseFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension()); + makeOldAndNewFilePaths(partitionPath, latestValidFilePath, newFileName); LOG.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(), newFilePath.toString())); @@ -131,13 +193,16 @@ private void init(String fileId, String partitionPath, HoodieBaseFile baseFileTo writeStatus.setPartitionPath(partitionPath); writeStatus.getStat().setPartitionPath(partitionPath); writeStatus.getStat().setFileId(fileId); - writeStatus.getStat().setPath(new Path(config.getBasePath()), newFilePath); + setWriteStatusPath(); - // Create Marker file - createMarkerFile(partitionPath, newFileName); + // Create Marker file, + // uses name of `newFilePath` instead of `newFileName` + // in case the sub-class may roll over the file handle name. + createMarkerFile(partitionPath, newFilePath.getName()); // Create the writer for writing the new version file - fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config, writerSchemaWithMetafields, taskContextSupplier); + fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config, + writeSchemaWithMetaFields, taskContextSupplier); } catch (IOException io) { LOG.error("Error in update task at commit " + instantTime, io); writeStatus.setGlobalError(io); @@ -146,44 +211,96 @@ private void init(String fileId, String partitionPath, HoodieBaseFile baseFileTo } } + protected void setWriteStatusPath() { + writeStatus.getStat().setPath(new Path(config.getBasePath()), newFilePath); + } + + protected void makeOldAndNewFilePaths(String partitionPath, String oldFileName, String newFileName) { + oldFilePath = makeNewFilePath(partitionPath, oldFileName); + newFilePath = makeNewFilePath(partitionPath, newFileName); + } + /** - * Load the new incoming records in a map and return partitionPath. + * Initialize a spillable map for incoming records. 
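+ * Records spill from memory to disk once the per-partition merge memory budget is exhausted.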
*/ - private void init(String fileId, Iterator> newRecordsItr) { + protected void initializeIncomingRecordsMap() { try { // Load the new records in a map - long memoryForMerge = IOUtils.getMaxMemoryPerPartitionMerge(taskContextSupplier, config.getProps()); + long memoryForMerge = IOUtils.getMaxMemoryPerPartitionMerge(taskContextSupplier, config); LOG.info("MaxMemoryPerPartitionMerge => " + memoryForMerge); this.keyToNewRecords = new ExternalSpillableMap<>(memoryForMerge, config.getSpillableMapBasePath(), - new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(writerSchema)); + new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(tableSchema), + config.getCommonConfig().getSpillableDiskMapType(), + config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()); } catch (IOException io) { throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io); } + } + + /** + * Whether there is need to update the record location. + */ + boolean needsUpdateLocation() { + return true; + } + + /** + * Load the new incoming records in a map and return partitionPath. + */ + protected void init(String fileId, Iterator> newRecordsItr) { + initializeIncomingRecordsMap(); while (newRecordsItr.hasNext()) { HoodieRecord record = newRecordsItr.next(); // update the new location of the record, so we know where to find it next - record.unseal(); - record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); - record.seal(); + if (needsUpdateLocation()) { + record.unseal(); + record.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); + record.seal(); + } // NOTE: Once Records are added to map (spillable-map), DO NOT change it as they won't persist keyToNewRecords.put(record.getRecordKey(), record); } LOG.info("Number of entries in MemoryBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries() - + "Total size in bytes of MemoryBasedMap => " - + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + "Number of entries in DiskBasedMap => " - + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + "Size of file spilled to disk => " + + ", Total size in bytes of MemoryBasedMap => " + + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize() + ", Number of entries in BitCaskDiskMap => " + + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries() + ", Size of file spilled to disk => " + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes()); } - private boolean writeUpdateRecord(HoodieRecord hoodieRecord, Option indexedRecord) { + private boolean writeUpdateRecord(HoodieRecord hoodieRecord, GenericRecord oldRecord, Option indexedRecord) { + boolean isDelete = false; if (indexedRecord.isPresent()) { updatedRecordsWritten++; + GenericRecord record = (GenericRecord) indexedRecord.get(); + if (oldRecord != record) { + // the incoming record is chosen + isDelete = HoodieOperation.isDelete(hoodieRecord.getOperation()); + } else { + // the incoming record is dropped + return false; + } + } + return writeRecord(hoodieRecord, indexedRecord, isDelete); + } + + protected void writeInsertRecord(HoodieRecord hoodieRecord) throws IOException { + Schema schema = useWriterSchemaForCompaction ? 
tableSchemaWithMetaFields : tableSchema; + Option insertRecord = hoodieRecord.getData().getInsertValue(schema, config.getProps()); + // just skip the ignored record + if (insertRecord.isPresent() && insertRecord.get().equals(IGNORE_RECORD)) { + return; + } + if (writeRecord(hoodieRecord, insertRecord, HoodieOperation.isDelete(hoodieRecord.getOperation()))) { + insertRecordsWritten++; } - return writeRecord(hoodieRecord, indexedRecord); } protected boolean writeRecord(HoodieRecord hoodieRecord, Option indexedRecord) { + return writeRecord(hoodieRecord, indexedRecord, false); + } + + protected boolean writeRecord(HoodieRecord hoodieRecord, Option indexedRecord, boolean isDelete) { Option recordMetadata = hoodieRecord.getData().getMetadata(); if (!partitionPath.equals(hoodieRecord.getPartitionPath())) { HoodieUpsertException failureEx = new HoodieUpsertException("mismatched partition path, record partition: " @@ -192,10 +309,8 @@ protected boolean writeRecord(HoodieRecord hoodieRecord, Option hoodieRecord, Option hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key)); + HoodieRecord hoodieRecord = keyToNewRecords.get(key).newInstance(); try { Option combinedAvroRecord = - hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchemaWithMetafields : writerSchema); - if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) { + hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, + useWriterSchemaForCompaction ? tableSchemaWithMetaFields : tableSchema, + config.getPayloadConfig().getProps()); + + if (combinedAvroRecord.isPresent() && combinedAvroRecord.get().equals(IGNORE_RECORD)) { + // If it is an IGNORE_RECORD, just copy the old record, and do not update the new record. + copyOldRecord = true; + } else if (writeUpdateRecord(hoodieRecord, oldRecord, combinedAvroRecord)) { /* - * ONLY WHEN 1) we have an update for this key AND 2) We are able to successfully write the the combined new - * value + * ONLY WHEN 1) we have an update for this key AND 2) We are able to successfully + * write the combined new value * * We no longer need to copy the old record over. 
*/ @@ -243,12 +364,12 @@ public void write(GenericRecord oldRecord) { } if (copyOldRecord) { - // this should work as it is, since this is an existing record try { - fileWriter.writeAvro(key, oldRecord); + // NOTE: We're enforcing preservation of the record metadata to keep existing semantic + writeToFile(new HoodieKey(key, partitionPath), oldRecord, true); } catch (IOException | RuntimeException e) { String errMsg = String.format("Failed to merge old record into new file for key %s from old file %s to new file %s with writerSchema %s", - key, getOldFilePath(), newFilePath, writerSchemaWithMetafields.toString(true)); + key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); LOG.debug("Old record is " + oldRecord); throw new HoodieUpsertException(errMsg, e); } @@ -256,29 +377,43 @@ public void write(GenericRecord oldRecord) { } } + protected void writeToFile(HoodieKey key, GenericRecord avroRecord, boolean shouldPreserveRecordMetadata) throws IOException { + if (shouldPreserveRecordMetadata) { + // NOTE: `FILENAME_METADATA_FIELD` has to be rewritten to correctly point to the + // file holding this record even in cases when overall metadata is preserved + fileWriter.writeAvro(key.getRecordKey(), rewriteRecordWithMetadata(avroRecord, newFilePath.getName())); + } else { + fileWriter.writeAvroWithMetadata(key, rewriteRecord(avroRecord)); + } + } + + protected void writeIncomingRecords() throws IOException { + // write out any pending records (this can happen when inserts are turned into updates) + Iterator> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap) + ? ((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator(); + while (newRecordsItr.hasNext()) { + HoodieRecord hoodieRecord = newRecordsItr.next(); + if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) { + writeInsertRecord(hoodieRecord); + } + } + } + @Override - public WriteStatus close() { + public List close() { try { - // write out any pending records (this can happen when inserts are turned into updates) - Iterator> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap) - ? 
((ExternalSpillableMap)keyToNewRecords).iterator() : keyToNewRecords.values().iterator(); - while (newRecordsItr.hasNext()) { - HoodieRecord hoodieRecord = newRecordsItr.next(); - if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) { - if (useWriterSchema) { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchemaWithMetafields)); - } else { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchema)); - } - insertRecordsWritten++; - } - } + writeIncomingRecords(); - ((ExternalSpillableMap) keyToNewRecords).close(); + if (keyToNewRecords instanceof ExternalSpillableMap) { + ((ExternalSpillableMap) keyToNewRecords).close(); + } else { + keyToNewRecords.clear(); + } writtenRecordKeys.clear(); if (fileWriter != null) { fileWriter.close(); + fileWriter = null; } long fileSizeInBytes = FSUtils.getFileSize(fs, newFilePath); @@ -300,7 +435,7 @@ public WriteStatus close() { LOG.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime())); - return writeStatus; + return Collections.singletonList(writeStatus); } catch (IOException e) { throw new HoodieUpsertException("Failed to close UpdateHandle", e); } @@ -312,8 +447,7 @@ public void performMergeDataValidationCheck(WriteStatus writeStatus) { } long oldNumWrites = 0; - try { - HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), oldFilePath); + try (HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), oldFilePath)) { oldNumWrites = reader.getTotalRecords(); } catch (IOException e) { throw new HoodieUpsertException("Failed to check for merge data validation", e); @@ -332,11 +466,6 @@ public Path getOldFilePath() { return oldFilePath; } - @Override - public WriteStatus getWriteStatus() { - return writeStatus; - } - @Override public IOType getIOType() { return IOType.MERGE; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java index 78fa9be690367..abe4a9befef9b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.table.HoodieTable; import java.io.IOException; @@ -32,10 +33,12 @@ public class HoodieRangeInfoHandle exten public HoodieRangeInfoHandle(HoodieWriteConfig config, HoodieTable hoodieTable, Pair partitionPathFilePair) { - super(config, null, hoodieTable, partitionPathFilePair); + super(config, hoodieTable, partitionPathFilePair); } public String[] getMinMaxKeys() throws IOException { - return createNewFileReader().readMinMaxRecordKeys(); + try (HoodieFileReader reader = createNewFileReader()) { + return reader.readMinMaxRecordKeys(); + } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java index a771c33c40661..fee75b22decd7 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java +++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java @@ -18,8 +18,11 @@ package org.apache.hudi.io; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.io.storage.HoodieFileReader; @@ -28,20 +31,17 @@ import java.io.IOException; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - /** * Base class for read operations done logically on the file group. */ public abstract class HoodieReadHandle extends HoodieIOHandle { - protected final Pair partitionPathFilePair; + protected final Pair partitionPathFileIDPair; - public HoodieReadHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, - Pair partitionPathFilePair) { - super(config, instantTime, hoodieTable); - this.partitionPathFilePair = partitionPathFilePair; + public HoodieReadHandle(HoodieWriteConfig config, HoodieTable hoodieTable, + Pair partitionPathFileIDPair) { + super(config, Option.empty(), hoodieTable); + this.partitionPathFileIDPair = partitionPathFileIDPair; } @Override @@ -49,17 +49,17 @@ protected FileSystem getFileSystem() { return hoodieTable.getMetaClient().getFs(); } - public Pair getPartitionPathFilePair() { - return partitionPathFilePair; + public Pair getPartitionPathFileIDPair() { + return partitionPathFileIDPair; } public String getFileId() { - return partitionPathFilePair.getRight(); + return partitionPathFileIDPair.getRight(); } protected HoodieBaseFile getLatestDataFile() { return hoodieTable.getBaseFileOnlyView() - .getLatestBaseFile(partitionPathFilePair.getLeft(), partitionPathFilePair.getRight()).get(); + .getLatestBaseFile(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight()).get(); } protected HoodieFileReader createNewFileReader() throws IOException { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java index 71610b1aa9c2f..7dce31a4c349b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java @@ -19,18 +19,24 @@ package org.apache.hudi.io; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.table.HoodieTable; import org.apache.avro.generic.GenericRecord; +import javax.annotation.concurrent.NotThreadSafe; + import java.io.IOException; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.PriorityQueue; import java.util.Queue; @@ -41,13 +47,15 @@ * The implementation performs a merge-sort by comparing the key of the record being written to the list of * keys in 
newRecordKeys (sorted in-memory). */ +@NotThreadSafe public class HoodieSortedMergeHandle extends HoodieMergeHandle { - private Queue newRecordKeysSorted = new PriorityQueue<>(); + private final Queue newRecordKeysSorted = new PriorityQueue<>(); public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, - Iterator> recordItr, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) { - super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier); + Iterator> recordItr, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, + Option keyGeneratorOpt) { + super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt); newRecordKeysSorted.addAll(keyToNewRecords.keySet()); } @@ -56,9 +64,9 @@ public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, Hoo */ public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Map> keyToNewRecordsOrig, String partitionPath, String fileId, - HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier) { + HoodieBaseFile dataFileToBeMerged, TaskContextSupplier taskContextSupplier, Option keyGeneratorOpt) { super(config, instantTime, hoodieTable, keyToNewRecordsOrig, partitionPath, fileId, dataFileToBeMerged, - taskContextSupplier); + taskContextSupplier, keyGeneratorOpt); newRecordKeysSorted.addAll(keyToNewRecords.keySet()); } @@ -68,7 +76,7 @@ public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, Hoo */ @Override public void write(GenericRecord oldRecord) { - String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt); // To maintain overall sorted order across updates and inserts, write any new inserts whose keys are less than // the oldRecord's key. 
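To make the merge-sort described in the comment above concrete, here is a minimal standalone sketch (not part of this patch); SortedMergeSketch, onOldRecord, writeInsert and writeUpdateOrCopy are illustrative names, not Hudi APIs.

import java.util.PriorityQueue;
import java.util.Queue;

class SortedMergeSketch {
  // Keys of incoming (new) records, kept in ascending order.
  private final Queue<String> newRecordKeysSorted = new PriorityQueue<>();

  // Called once per record read from the old base file, in key order.
  void onOldRecord(String oldKey) {
    // Flush pending inserts whose keys sort before the old record's key,
    // so the rewritten file stays globally sorted.
    while (!newRecordKeysSorted.isEmpty() && newRecordKeysSorted.peek().compareTo(oldKey) < 0) {
      writeInsert(newRecordKeysSorted.poll());
    }
    writeUpdateOrCopy(oldKey);
  }

  // Called after the last old record: everything left sorts after it.
  void close() {
    while (!newRecordKeysSorted.isEmpty()) {
      writeInsert(newRecordKeysSorted.poll());
    }
  }

  private void writeInsert(String key) { /* write the new record for this key */ }

  private void writeUpdateOrCopy(String key) { /* merge the update or copy the old record */ }
}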
@@ -80,15 +88,15 @@ public void write(GenericRecord oldRecord) { } // This is a new insert - HoodieRecord hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(keyToPreWrite)); + HoodieRecord hoodieRecord = keyToNewRecords.get(keyToPreWrite).newInstance(); if (writtenRecordKeys.contains(keyToPreWrite)) { throw new HoodieUpsertException("Insert/Update not in sorted order"); } try { - if (useWriterSchema) { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchemaWithMetafields)); + if (useWriterSchemaForCompaction) { + writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchemaWithMetaFields, config.getProps())); } else { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchema)); + writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchema, config.getProps())); } insertRecordsWritten++; writtenRecordKeys.add(keyToPreWrite); @@ -101,23 +109,24 @@ public void write(GenericRecord oldRecord) { } @Override - public WriteStatus close() { + public List close() { // write out any pending records (this can happen when inserts are turned into updates) - newRecordKeysSorted.stream().forEach(key -> { + while (!newRecordKeysSorted.isEmpty()) { try { + String key = newRecordKeysSorted.poll(); HoodieRecord hoodieRecord = keyToNewRecords.get(key); if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) { - if (useWriterSchema) { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchemaWithMetafields)); + if (useWriterSchemaForCompaction) { + writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchemaWithMetaFields, config.getProps())); } else { - writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchema)); + writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(tableSchema, config.getProps())); } insertRecordsWritten++; } } catch (IOException e) { throw new HoodieUpsertException("Failed to close UpdateHandle", e); } - }); + } newRecordKeysSorted.clear(); keyToNewRecords.clear(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieUnboundedCreateHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieUnboundedCreateHandle.java new file mode 100644 index 0000000000000..ebbc7a5c28ea1 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieUnboundedCreateHandle.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import javax.annotation.concurrent.NotThreadSafe; + +/** + * A HoodieCreateHandle which writes all data into a single file. + *
+ * Please use this with caution. This can end up creating very large files if not used correctly. + */ +@NotThreadSafe +public class HoodieUnboundedCreateHandle extends HoodieCreateHandle { + + private static final Logger LOG = LogManager.getLogger(HoodieUnboundedCreateHandle.class); + + public HoodieUnboundedCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, + boolean preserveHoodieMetadata) { + super(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(), + taskContextSupplier, preserveHoodieMetadata); + } + + @Override + public boolean canWrite(HoodieRecord record) { + return true; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java index 85898bccc7052..b7fdbecfd56d1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java @@ -20,7 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -28,13 +28,12 @@ import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.storage.HoodieFileWriter; import org.apache.hudi.io.storage.HoodieFileWriterFactory; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; +import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; @@ -45,6 +44,11 @@ import org.apache.log4j.Logger; import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.HashMap; + +import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; /** * Base class for all write operations logically performed at the file group level. @@ -53,47 +57,94 @@ public abstract class HoodieWriteHandle private static final Logger LOG = LogManager.getLogger(HoodieWriteHandle.class); - protected final Schema writerSchema; - protected final Schema writerSchemaWithMetafields; + /** + * A special record returned by {@link HoodieRecordPayload}, which means + * {@link HoodieWriteHandle} should just skip this record. + * This record is only used for {@link HoodieRecordPayload} currently, so it should not + * be shuffled through the network; we can compare the record locally using the equals method. + * The HoodieRecordPayload#combineAndGetUpdateValue and HoodieRecordPayload#getInsertValue + * have three kinds of return values: + * 1) Option.empty + * This means we should delete this record. + * 2) IGNORE_RECORD + * This means we should not process this record, just skip it. + * 3) Any other non-empty record + * This means we should process this record.
+ * + * We can see the usage of IGNORE_RECORD in + * org.apache.spark.sql.hudi.command.payload.ExpressionPayload + */ + public static IgnoreRecord IGNORE_RECORD = new IgnoreRecord(); + + /** + * The specified schema of the table. ("specified" denotes that this is configured by the client, + * as opposed to being implicitly fetched out of the commit metadata) + */ + protected final Schema tableSchema; + protected final Schema tableSchemaWithMetaFields; + + /** + * The write schema. In most cases the write schema is the same as the + * input schema. But if HoodieWriteConfig#WRITE_SCHEMA is specified, + * we use the WRITE_SCHEMA as the write schema. + * + * This is useful for custom HoodieRecordPayload implementations that transform the + * incoming record, e.g. ExpressionPayload applies the SQL expression conversion + * to the input. + */ + protected final Schema writeSchema; + protected final Schema writeSchemaWithMetaFields; + protected HoodieTimer timer; - protected final WriteStatus writeStatus; + protected WriteStatus writeStatus; protected final String partitionPath; protected final String fileId; protected final String writeToken; protected final TaskContextSupplier taskContextSupplier; + // For full schema evolution + protected final boolean schemaOnReadEnabled; public HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String partitionPath, String fileId, HoodieTable hoodieTable, TaskContextSupplier taskContextSupplier) { this(config, instantTime, partitionPath, fileId, hoodieTable, - getWriterSchemaIncludingAndExcludingMetadataPair(config), taskContextSupplier); + Option.empty(), taskContextSupplier); } protected HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String partitionPath, String fileId, - HoodieTable hoodieTable, Pair writerSchemaIncludingAndExcludingMetadataPair, + HoodieTable hoodieTable, Option overriddenSchema, TaskContextSupplier taskContextSupplier) { - super(config, instantTime, hoodieTable); + super(config, Option.of(instantTime), hoodieTable); this.partitionPath = partitionPath; this.fileId = fileId; - this.writerSchema = writerSchemaIncludingAndExcludingMetadataPair.getKey(); - this.writerSchemaWithMetafields = writerSchemaIncludingAndExcludingMetadataPair.getValue(); + this.tableSchema = overriddenSchema.orElseGet(() -> getSpecifiedTableSchema(config)); + this.tableSchemaWithMetaFields = HoodieAvroUtils.addMetadataFields(tableSchema, config.allowOperationMetadataField()); + this.writeSchema = overriddenSchema.orElseGet(() -> getWriteSchema(config)); + this.writeSchemaWithMetaFields = HoodieAvroUtils.addMetadataFields(writeSchema, config.allowOperationMetadataField()); this.timer = new HoodieTimer().startTimer(); this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(), !hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction()); this.taskContextSupplier = taskContextSupplier; this.writeToken = makeWriteToken(); + schemaOnReadEnabled = !isNullOrEmpty(hoodieTable.getConfig().getInternalSchema()); } /** - * Returns writer schema pairs containing - * (a) Writer Schema from client - * (b) (a) with hoodie metadata fields. - * @param config Write Config + * Get the specified table schema.
+ * @param config * @return */ - protected static Pair getWriterSchemaIncludingAndExcludingMetadataPair(HoodieWriteConfig config) { - Schema originalSchema = new Schema.Parser().parse(config.getSchema()); - Schema hoodieSchema = HoodieAvroUtils.addMetadataFields(originalSchema); - return Pair.of(originalSchema, hoodieSchema); + private static Schema getSpecifiedTableSchema(HoodieWriteConfig config) { + return new Schema.Parser().parse(config.getSchema()); + } + + /** + * Get the schema, of the actual write. + * + * @param config + * @return + */ + private static Schema getWriteSchema(HoodieWriteConfig config) { + return new Schema.Parser().parse(config.getWriteSchema()); } /** @@ -106,27 +157,38 @@ private String makeWriteToken() { public Path makeNewPath(String partitionPath) { Path path = FSUtils.getPartitionPath(config.getBasePath(), partitionPath); try { - fs.mkdirs(path); // create a new partition as needed. + if (!fs.exists(path)) { + fs.mkdirs(path); // create a new partition as needed. + } } catch (IOException e) { throw new HoodieIOException("Failed to make dir " + path, e); } - return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, writeToken, fileId, + return new Path(path.toString(), FSUtils.makeBaseFileName(instantTime, writeToken, fileId, hoodieTable.getMetaClient().getTableConfig().getBaseFileFormat().getFileExtension())); } + /** + * Make new file path with given file name. + */ + protected Path makeNewFilePath(String partitionPath, String fileName) { + String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/") + + fileName).toString(); + return new Path(config.getBasePath(), relativePath); + } + /** * Creates an empty marker file corresponding to storage writer path. * * @param partitionPath Partition path */ protected void createMarkerFile(String partitionPath, String dataFileName) { - MarkerFiles markerFiles = new MarkerFiles(hoodieTable, instantTime); - markerFiles.create(partitionPath, dataFileName, getIOType()); + WriteMarkersFactory.get(config.getMarkersType(), hoodieTable, instantTime) + .create(partitionPath, dataFileName, getIOType()); } - public Schema getWriterSchemaWithMetafields() { - return writerSchemaWithMetafields; + public Schema getWriterSchemaWithMetaFields() { + return writeSchemaWithMetaFields; } /** @@ -139,6 +201,10 @@ public boolean canWrite(HoodieRecord record) { return false; } + boolean layoutControlsNumFiles() { + return hoodieTable.getStorageLayout().determinesNumFileGroups(); + } + /** * Perform the actual writing of the given record into the backing file. */ @@ -150,7 +216,7 @@ public void write(HoodieRecord record, Option insertValue) { * Perform the actual writing of the given record into the backing file. */ public void write(HoodieRecord record, Option avroRecord, Option exception) { - Option recordMetadata = record.getData().getMetadata(); + Option recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata(); if (exception.isPresent() && exception.get() instanceof Throwable) { // Not throwing exception from here, since we don't want to fail the entire job for a single record writeStatus.markFailure(record, exception.get(), recordMetadata); @@ -164,12 +230,24 @@ public void write(HoodieRecord record, Option avroRecord, Option< * Rewrite the GenericRecord with the Schema containing the Hoodie Metadata fields. */ protected GenericRecord rewriteRecord(GenericRecord record) { - return HoodieAvroUtils.rewriteRecord(record, writerSchemaWithMetafields); + return schemaOnReadEnabled ? 
HoodieAvroUtils.rewriteRecordWithNewSchema(record, writeSchemaWithMetaFields, new HashMap<>()) + : HoodieAvroUtils.rewriteRecord(record, writeSchemaWithMetaFields); + } + + protected GenericRecord rewriteRecordWithMetadata(GenericRecord record, String fileName) { + return schemaOnReadEnabled ? HoodieAvroUtils.rewriteEvolutionRecordWithMetadata(record, writeSchemaWithMetaFields, fileName) + : HoodieAvroUtils.rewriteRecordWithMetadata(record, writeSchemaWithMetaFields, fileName); } - public abstract WriteStatus close(); + public abstract List close(); - public abstract WriteStatus getWriteStatus(); + public List writeStatuses() { + return Collections.singletonList(writeStatus); + } + + public String getPartitionPath() { + return partitionPath; + } public abstract IOType getIOType(); @@ -194,4 +272,32 @@ protected HoodieFileWriter createNewFileWriter(String instantTime, Path path, Ho HoodieWriteConfig config, Schema schema, TaskContextSupplier taskContextSupplier) throws IOException { return HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, schema, taskContextSupplier); } + + private static class IgnoreRecord implements GenericRecord { + + @Override + public void put(int i, Object v) { + + } + + @Override + public Object get(int i) { + return null; + } + + @Override + public Schema getSchema() { + return null; + } + + @Override + public void put(String key, Object v) { + + } + + @Override + public Object get(String key) { + return null; + } + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/IOUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/IOUtils.java index 03719157fd998..7636384c3a6d3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/IOUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/IOUtils.java @@ -18,37 +18,38 @@ package org.apache.hudi.io; -import org.apache.hudi.client.common.EngineProperty; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.util.Option; -import java.util.Properties; - import static org.apache.hudi.config.HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES; -import static org.apache.hudi.config.HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION; -import static org.apache.hudi.config.HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE; import static org.apache.hudi.config.HoodieMemoryConfig.DEFAULT_MIN_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES; -import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FOR_COMPACTION_PROP; -import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FOR_MERGE_PROP; -import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP; -import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_MERGE_PROP; +import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FOR_COMPACTION; +import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FOR_MERGE; +import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION; +import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_MERGE; public class IOUtils { /** - * Dynamic calculation of max memory to use for for spillable map. 
user.available.memory = executor.memory * - * (1 - memory.fraction) spillable.available.memory = user.available.memory * hoodie.memory.fraction. Anytime - * the engine memory fractions/total memory is changed, the memory used for spillable map changes - * accordingly + * Dynamic calculation of max memory to use for spillable map. There is always more than one task + * running on a executor and the each task maintains a spillable map. + * user.available.memory = executor.memory * (1 - memory.fraction) + * spillable.available.memory = user.available.memory * hoodie.memory.fraction / executor.cores. + * Anytime the engine memory fractions/total memory is changed, the memory used for spillable map + * changes accordingly. */ public static long getMaxMemoryAllowedForMerge(TaskContextSupplier context, String maxMemoryFraction) { Option totalMemoryOpt = context.getProperty(EngineProperty.TOTAL_MEMORY_AVAILABLE); Option memoryFractionOpt = context.getProperty(EngineProperty.MEMORY_FRACTION_IN_USE); + Option totalCoresOpt = context.getProperty(EngineProperty.TOTAL_CORES_PER_EXECUTOR); - if (totalMemoryOpt.isPresent() && memoryFractionOpt.isPresent()) { + if (totalMemoryOpt.isPresent() && memoryFractionOpt.isPresent() && totalCoresOpt.isPresent()) { long executorMemoryInBytes = Long.parseLong(totalMemoryOpt.get()); double memoryFraction = Double.parseDouble(memoryFractionOpt.get()); double maxMemoryFractionForMerge = Double.parseDouble(maxMemoryFraction); - double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction); + long executorCores = Long.parseLong(totalCoresOpt.get()); + double userAvailableMemory = executorMemoryInBytes * (1 - memoryFraction) / executorCores; long maxMemoryForMerge = (long) Math.floor(userAvailableMemory * maxMemoryFractionForMerge); return Math.max(DEFAULT_MIN_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, maxMemoryForMerge); } else { @@ -56,19 +57,19 @@ public static long getMaxMemoryAllowedForMerge(TaskContextSupplier context, Stri } } - public static long getMaxMemoryPerPartitionMerge(TaskContextSupplier context, Properties properties) { - if (properties.containsKey(MAX_MEMORY_FOR_MERGE_PROP)) { - return Long.parseLong(properties.getProperty(MAX_MEMORY_FOR_MERGE_PROP)); + public static long getMaxMemoryPerPartitionMerge(TaskContextSupplier context, HoodieConfig hoodieConfig) { + if (hoodieConfig.contains(MAX_MEMORY_FOR_MERGE)) { + return hoodieConfig.getLong(MAX_MEMORY_FOR_MERGE); } - String fraction = properties.getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE); + String fraction = hoodieConfig.getStringOrDefault(MAX_MEMORY_FRACTION_FOR_MERGE); return getMaxMemoryAllowedForMerge(context, fraction); } - public static long getMaxMemoryPerCompaction(TaskContextSupplier context, Properties properties) { - if (properties.containsKey(MAX_MEMORY_FOR_COMPACTION_PROP)) { - return Long.parseLong(properties.getProperty(MAX_MEMORY_FOR_COMPACTION_PROP)); + public static long getMaxMemoryPerCompaction(TaskContextSupplier context, HoodieConfig hoodieConfig) { + if (hoodieConfig.contains(MAX_MEMORY_FOR_COMPACTION)) { + return hoodieConfig.getLong(MAX_MEMORY_FOR_COMPACTION); } - String fraction = properties.getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION); + String fraction = hoodieConfig.getStringOrDefault(MAX_MEMORY_FRACTION_FOR_COMPACTION); return getMaxMemoryAllowedForMerge(context, fraction); } } diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/SingleFileHandleCreateFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/SingleFileHandleCreateFactory.java new file mode 100644 index 0000000000000..a3f7c04ef23a8 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/SingleFileHandleCreateFactory.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; + +import java.io.Serializable; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * A SingleFileHandleCreateFactory is used to write all data in the spark partition into a single data file. + *
+ * Please use this with caution. This can end up creating very large files if not used correctly. + */ +public class SingleFileHandleCreateFactory extends CreateHandleFactory implements Serializable { + + private final AtomicBoolean isHandleCreated = new AtomicBoolean(false); + private final String fileId; + private final boolean preserveHoodieMetadata; + + public SingleFileHandleCreateFactory(String fileId, boolean preserveHoodieMetadata) { + super(); + this.fileId = fileId; + this.preserveHoodieMetadata = preserveHoodieMetadata; + } + + @Override + public HoodieWriteHandle create(final HoodieWriteConfig hoodieConfig, final String commitTime, + final HoodieTable hoodieTable, final String partitionPath, + final String fileIdPrefix, TaskContextSupplier taskContextSupplier) { + + if (isHandleCreated.compareAndSet(false, true)) { + return new HoodieUnboundedCreateHandle(hoodieConfig, commitTime, hoodieTable, partitionPath, + fileId, // ignore idPfx, always use same fileId + taskContextSupplier, preserveHoodieMetadata); + } + + throw new HoodieIOException("Fixed handle create is only expected to be invoked once"); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/WriteHandleFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/WriteHandleFactory.java index c66442a48d5ad..c267b5969d801 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/WriteHandleFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/WriteHandleFactory.java @@ -18,18 +18,21 @@ package org.apache.hudi.io; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; -public abstract class WriteHandleFactory { +import java.io.Serializable; + +public abstract class WriteHandleFactory implements Serializable { private int numFilesWritten = 0; public abstract HoodieWriteHandle create(HoodieWriteConfig config, String commitTime, HoodieTable hoodieTable, String partitionPath, String fileIdPrefix, TaskContextSupplier taskContextSupplier); protected String getNextFileId(String idPfx) { - return String.format("%s-%d", idPfx, numFilesWritten++); + return FSUtils.createNewFileId(idPfx, numFilesWritten++); } } \ No newline at end of file diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java deleted file mode 100644 index f934a8a83784f..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.hudi.avro.HoodieAvroWriteSupport; - -import org.apache.hadoop.conf.Configuration; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; - -/** - * ParquetConfig for writing avro records in Parquet files. - */ -public class HoodieAvroParquetConfig extends HoodieBaseParquetConfig { - - public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName, - int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, - double compressionRatio) { - super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio); - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetWriter.java new file mode 100644 index 0000000000000..06631dc53fb1c --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetWriter.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieKey; + +import javax.annotation.concurrent.NotThreadSafe; + +import java.io.IOException; + +/** + * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. 
Provides a way to check if + * the current file can take more records with the canWrite() + * + * ATTENTION: HoodieParquetWriter is not thread safe and developer should take care of the order of write and close + */ +@NotThreadSafe +public class HoodieAvroParquetWriter + extends HoodieBaseParquetWriter + implements HoodieFileWriter { + + private final String fileName; + private final String instantTime; + private final TaskContextSupplier taskContextSupplier; + private final boolean populateMetaFields; + private final HoodieAvroWriteSupport writeSupport; + + @SuppressWarnings({"unchecked", "rawtypes"}) + public HoodieAvroParquetWriter(Path file, + HoodieParquetConfig parquetConfig, + String instantTime, + TaskContextSupplier taskContextSupplier, + boolean populateMetaFields) throws IOException { + super(file, (HoodieParquetConfig) parquetConfig); + this.fileName = file.getName(); + this.writeSupport = parquetConfig.getWriteSupport(); + this.instantTime = instantTime; + this.taskContextSupplier = taskContextSupplier; + this.populateMetaFields = populateMetaFields; + } + + @Override + public void writeAvroWithMetadata(HoodieKey key, R avroRecord) throws IOException { + if (populateMetaFields) { + prepRecordWithMetadata(key, avroRecord, instantTime, + taskContextSupplier.getPartitionIdSupplier().get(), getWrittenRecordCount(), fileName); + super.write(avroRecord); + writeSupport.add(key.getRecordKey()); + } else { + super.write(avroRecord); + } + } + + @Override + public void writeAvro(String key, IndexedRecord object) throws IOException { + super.write(object); + if (populateMetaFields) { + writeSupport.add(key); + } + } + + @Override + public void close() throws IOException { + super.close(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java deleted file mode 100644 index 6e6f66c5eac6d..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.hadoop.conf.Configuration; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; - -/** - * Base ParquetConfig to hold config params for writing to Parquet. 
- * @param - */ -public class HoodieBaseParquetConfig { - private final T writeSupport; - private CompressionCodecName compressionCodecName; - private int blockSize; - private int pageSize; - private long maxFileSize; - private Configuration hadoopConf; - private double compressionRatio; - - public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize, - int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) { - this.writeSupport = writeSupport; - this.compressionCodecName = compressionCodecName; - this.blockSize = blockSize; - this.pageSize = pageSize; - this.maxFileSize = maxFileSize; - this.hadoopConf = hadoopConf; - this.compressionRatio = compressionRatio; - } - - public CompressionCodecName getCompressionCodecName() { - return compressionCodecName; - } - - public int getBlockSize() { - return blockSize; - } - - public int getPageSize() { - return pageSize; - } - - public long getMaxFileSize() { - return maxFileSize; - } - - public Configuration getHadoopConf() { - return hadoopConf; - } - - public double getCompressionRatio() { - return compressionRatio; - } - - public T getWriteSupport() { - return writeSupport; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java new file mode 100644 index 0000000000000..e38b41d422a74 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.WriteSupport; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Base class of Hudi's custom {@link ParquetWriter} implementations + * + * @param target type of the object being written into Parquet files (for ex, + * {@code IndexedRecord}, {@code InternalRow}) + */ +public abstract class HoodieBaseParquetWriter extends ParquetWriter { + + private static final int WRITTEN_RECORDS_THRESHOLD_FOR_FILE_SIZE_CHECK = 1000; + + private final AtomicLong writtenRecordCount = new AtomicLong(0); + private final long maxFileSize; + private long lastCachedDataSize = -1; + + public HoodieBaseParquetWriter(Path file, + HoodieParquetConfig> parquetConfig) throws IOException { + super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), + ParquetFileWriter.Mode.CREATE, + parquetConfig.getWriteSupport(), + parquetConfig.getCompressionCodecName(), + parquetConfig.getBlockSize(), + parquetConfig.getPageSize(), + parquetConfig.getPageSize(), + parquetConfig.dictionaryEnabled(), + DEFAULT_IS_VALIDATING_ENABLED, + DEFAULT_WRITER_VERSION, + FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf())); + + // We cannot accurately measure the snappy compressed output file size. We are choosing a + // conservative 10% + // TODO - compute this compression ratio dynamically by looking at the bytes written to the + // stream and the actual file size reported by HDFS + this.maxFileSize = parquetConfig.getMaxFileSize() + + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio()); + } + + public boolean canWrite() { + // TODO we can actually do evaluation more accurately: + // if we cache last data size check, since we account for how many records + // were written we can accurately project avg record size, and therefore + // estimate how many more records we can write before cut off + if (lastCachedDataSize == -1 || getWrittenRecordCount() % WRITTEN_RECORDS_THRESHOLD_FOR_FILE_SIZE_CHECK == 0) { + lastCachedDataSize = getDataSize(); + } + return lastCachedDataSize < maxFileSize; + } + + @Override + public void write(R object) throws IOException { + super.write(object); + writtenRecordCount.incrementAndGet(); + } + + protected long getWrittenRecordCount() { + return writtenRecordCount.get(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java index ea9ecad6e31a9..cce59d3b6624a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java @@ -18,19 +18,28 @@ package org.apache.hudi.io.storage; -import org.apache.hudi.common.model.HoodieRecord; - +import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; import java.io.IOException; public interface HoodieFileWriter { - void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws 
IOException; + void writeAvroWithMetadata(HoodieKey key, R newRecord) throws IOException; boolean canWrite(); void close() throws IOException; void writeAvro(String key, R oldRecord) throws IOException; + + default void prepRecordWithMetadata(HoodieKey key, R avroRecord, String instantTime, Integer partitionId, long recordIndex, String fileName) { + String seqId = HoodieRecord.generateSequenceId(instantTime, partitionId, recordIndex); + HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, key.getRecordKey(), key.getPartitionPath(), fileName); + HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, instantTime, seqId); + return; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java index 5f4eec0887003..9ee8571ebd066 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java @@ -19,23 +19,30 @@ package org.apache.hudi.io.storage; import org.apache.hudi.avro.HoodieAvroWriteSupport; -import org.apache.hudi.client.common.TaskContextSupplier; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroSchemaConverter; import java.io.IOException; -import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; import static org.apache.hudi.common.model.HoodieFileFormat.HFILE; +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; +import static org.apache.hudi.io.storage.HoodieHFileConfig.CACHE_DATA_IN_L1; +import static org.apache.hudi.io.storage.HoodieHFileConfig.DROP_BEHIND_CACHE_COMPACTION; +import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR; +import static org.apache.hudi.io.storage.HoodieHFileConfig.PREFETCH_ON_OPEN; public class HoodieFileWriterFactory { @@ -44,37 +51,59 @@ public static TaskContextSupplier taskContextSupplier) throws IOException { final String extension = FSUtils.getFileExtension(path.getName()); if (PARQUET.getFileExtension().equals(extension)) { - return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier); + return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, config.populateMetaFields()); } if (HFILE.getFileExtension().equals(extension)) { - return newHFileFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier); + return newHFileFileWriter( + instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier); + } + if (ORC.getFileExtension().equals(extension)) { + return newOrcFileWriter( + instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier); } throw new UnsupportedOperationException(extension + " format not supported yet."); } 
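The extension-based dispatch performed by getFileWriter above can be summarized with a minimal sketch (not part of this patch); FileWriterDispatchSketch, WriterKind and resolveWriterKind are illustrative names, and the extension strings assume the usual ".parquet", ".hfile" and ".orc" base-file suffixes.

class FileWriterDispatchSketch {

  enum WriterKind { PARQUET, HFILE, ORC }

  static WriterKind resolveWriterKind(String fileName) {
    // Derive the format from the base file's extension, mirroring the checks above.
    String extension = fileName.substring(fileName.lastIndexOf('.')); // e.g. ".parquet"
    if (".parquet".equals(extension)) {
      return WriterKind.PARQUET;
    }
    if (".hfile".equals(extension)) {
      return WriterKind.HFILE;
    }
    if (".orc".equals(extension)) {
      return WriterKind.ORC;
    }
    throw new UnsupportedOperationException(extension + " format not supported yet.");
  }
}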
private static HoodieFileWriter newParquetFileWriter( String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable, - TaskContextSupplier taskContextSupplier) throws IOException { - BloomFilter filter = createBloomFilter(config); - HoodieAvroWriteSupport writeSupport = - new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); + TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException { + return newParquetFileWriter(instantTime, path, config, schema, hoodieTable.getHadoopConf(), + taskContextSupplier, populateMetaFields, populateMetaFields); + } - HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(), + private static HoodieFileWriter newParquetFileWriter( + String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf, + TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException { + Option filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty(); + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter); + + HoodieParquetConfig parquetConfig = new HoodieParquetConfig<>(writeSupport, config.getParquetCompressionCodec(), config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(), - hoodieTable.getHadoopConf(), config.getParquetCompressionRatio()); + conf, config.getParquetCompressionRatio(), config.parquetDictionaryEnabled()); - return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier); + return new HoodieAvroParquetWriter<>(path, parquetConfig, instantTime, taskContextSupplier, populateMetaFields); } - private static HoodieFileWriter newHFileFileWriter( - String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable, + static HoodieFileWriter newHFileFileWriter( + String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf, TaskContextSupplier taskContextSupplier) throws IOException { BloomFilter filter = createBloomFilter(config); - HoodieHFileConfig hfileConfig = new HoodieHFileConfig(hoodieTable.getHadoopConf(), - config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(), filter); + HoodieHFileConfig hfileConfig = new HoodieHFileConfig(conf, + config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(), + HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, + filter, HFILE_COMPARATOR); + + return new HoodieHFileWriter<>(instantTime, path, hfileConfig, schema, taskContextSupplier, config.populateMetaFields()); + } - return new HoodieHFileWriter<>(instantTime, path, hfileConfig, schema, taskContextSupplier); + private static HoodieFileWriter newOrcFileWriter( + String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf, + TaskContextSupplier taskContextSupplier) throws IOException { + BloomFilter filter = createBloomFilter(config); + HoodieOrcConfig orcConfig = new HoodieOrcConfig(conf, config.getOrcCompressionCodec(), + config.getOrcStripeSize(), config.getOrcBlockSize(), config.getOrcMaxFileSize(), filter); + return new HoodieOrcWriter<>(instantTime, path, orcConfig, schema, taskContextSupplier); } private static BloomFilter createBloomFilter(HoodieWriteConfig config) 
{ diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java index 031f92cccdfaf..5ce377901a4ba 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java @@ -18,35 +18,36 @@ package org.apache.hudi.io.storage; +import org.apache.hudi.common.bloom.BloomFilter; + import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.CellComparator; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.io.hfile.CacheConfig; -import org.apache.hudi.common.bloom.BloomFilter; public class HoodieHFileConfig { - private Compression.Algorithm compressionAlgorithm; - private int blockSize; - private long maxFileSize; - private boolean prefetchBlocksOnOpen; - private boolean cacheDataInL1; - private boolean dropBehindCacheCompaction; - private Configuration hadoopConf; - private BloomFilter bloomFilter; - + public static final CellComparator HFILE_COMPARATOR = new HoodieHBaseKVComparator(); + public static final boolean PREFETCH_ON_OPEN = CacheConfig.DEFAULT_PREFETCH_ON_OPEN; + public static final boolean CACHE_DATA_IN_L1 = HColumnDescriptor.DEFAULT_CACHE_DATA_IN_L1; // This is private in CacheConfig so have been copied here. - private static boolean DROP_BEHIND_CACHE_COMPACTION_DEFAULT = true; + public static final boolean DROP_BEHIND_CACHE_COMPACTION = true; + + private final Compression.Algorithm compressionAlgorithm; + private final int blockSize; + private final long maxFileSize; + private final boolean prefetchBlocksOnOpen; + private final boolean cacheDataInL1; + private final boolean dropBehindCacheCompaction; + private final Configuration hadoopConf; + private final BloomFilter bloomFilter; + private final CellComparator hfileComparator; + private final String keyFieldName; public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compressionAlgorithm, int blockSize, - long maxFileSize, BloomFilter bloomFilter) { - this(hadoopConf, compressionAlgorithm, blockSize, maxFileSize, CacheConfig.DEFAULT_PREFETCH_ON_OPEN, - HColumnDescriptor.DEFAULT_CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION_DEFAULT, bloomFilter); - } - - public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compressionAlgorithm, int blockSize, - long maxFileSize, boolean prefetchBlocksOnOpen, boolean cacheDataInL1, - boolean dropBehindCacheCompaction, BloomFilter bloomFilter) { + long maxFileSize, String keyFieldName, boolean prefetchBlocksOnOpen, boolean cacheDataInL1, + boolean dropBehindCacheCompaction, BloomFilter bloomFilter, CellComparator hfileComparator) { this.hadoopConf = hadoopConf; this.compressionAlgorithm = compressionAlgorithm; this.blockSize = blockSize; @@ -55,6 +56,8 @@ public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compres this.cacheDataInL1 = cacheDataInL1; this.dropBehindCacheCompaction = dropBehindCacheCompaction; this.bloomFilter = bloomFilter; + this.hfileComparator = hfileComparator; + this.keyFieldName = keyFieldName; } public Configuration getHadoopConf() { @@ -92,4 +95,12 @@ public boolean useBloomFilter() { public BloomFilter getBloomFilter() { return bloomFilter; } + + public CellComparator getHFileComparator() { + return hfileComparator; + } + + public String 
getKeyFieldName() { + return keyFieldName; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java index 3684f9d4cd931..f065608b29bd5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java @@ -18,13 +18,6 @@ package org.apache.hudi.io.storage; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.client.common.TaskContextSupplier; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.fs.HoodieWrapperFileSystem; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; @@ -37,6 +30,15 @@ import org.apache.hadoop.hbase.io.hfile.HFileContext; import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; import org.apache.hadoop.io.Writable; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import java.io.DataInput; import java.io.DataOutput; @@ -61,6 +63,9 @@ public class HoodieHFileWriter keyFieldSchema; private HFile.Writer writer; private String minRecordKey; private String maxRecordKey; @@ -69,12 +74,14 @@ public class HoodieHFileWriter= 0 ? maxRecordKey : recordKey; - } else { - maxRecordKey = recordKey; - } + maxRecordKey = recordKey; } } @@ -156,7 +178,8 @@ public void write(DataOutput out) throws IOException { } @Override - public void readFields(DataInput in) throws IOException { } + public void readFields(DataInput in) throws IOException { + } }); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcConfig.java new file mode 100644 index 0000000000000..c45e02452e32b --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcConfig.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.orc.CompressionKind; + +public class HoodieOrcConfig { + static final String AVRO_SCHEMA_METADATA_KEY = "orc.avro.schema"; + + private final CompressionKind compressionKind; + private final int stripeSize; + private final int blockSize; + private final long maxFileSize; + private final Configuration hadoopConf; + private final BloomFilter bloomFilter; + + public HoodieOrcConfig(Configuration hadoopConf, CompressionKind compressionKind, int stripeSize, + int blockSize, long maxFileSize, BloomFilter bloomFilter) { + this.hadoopConf = hadoopConf; + this.compressionKind = compressionKind; + this.stripeSize = stripeSize; + this.blockSize = blockSize; + this.maxFileSize = maxFileSize; + this.bloomFilter = bloomFilter; + } + + public Configuration getHadoopConf() { + return hadoopConf; + } + + public CompressionKind getCompressionKind() { + return compressionKind; + } + + public int getStripeSize() { + return stripeSize; + } + + public int getBlockSize() { + return blockSize; + } + + public long getMaxFileSize() { + return maxFileSize; + } + + public boolean useBloomFilter() { + return bloomFilter != null; + } + + public BloomFilter getBloomFilter() { + return bloomFilter; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcWriter.java new file mode 100644 index 0000000000000..4bcab2cec8a1f --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcWriter.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; + +import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY; + +public class HoodieOrcWriter + implements HoodieFileWriter, Closeable { + private static final AtomicLong RECORD_INDEX = new AtomicLong(1); + + private final long maxFileSize; + private final Schema avroSchema; + private final List fieldTypes; + private final List fieldNames; + private final VectorizedRowBatch batch; + private final Writer writer; + + private final Path file; + private final HoodieWrapperFileSystem fs; + private final String instantTime; + private final TaskContextSupplier taskContextSupplier; + + private HoodieOrcConfig orcConfig; + private String minRecordKey; + private String maxRecordKey; + + public HoodieOrcWriter(String instantTime, Path file, HoodieOrcConfig config, Schema schema, + TaskContextSupplier taskContextSupplier) throws IOException { + + Configuration conf = FSUtils.registerFileSystem(file, config.getHadoopConf()); + this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, conf); + this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(conf); + this.instantTime = instantTime; + this.taskContextSupplier = taskContextSupplier; + + this.avroSchema = schema; + final TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(avroSchema); + this.fieldTypes = orcSchema.getChildren(); + this.fieldNames = orcSchema.getFieldNames(); + this.maxFileSize = config.getMaxFileSize(); + this.batch = orcSchema.createRowBatch(); + OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf) + .blockSize(config.getBlockSize()) + .stripeSize(config.getStripeSize()) + .compress(config.getCompressionKind()) + .bufferSize(config.getBlockSize()) + .fileSystem(fs) + .setSchema(orcSchema); + this.writer = OrcFile.createWriter(this.file, writerOptions); + this.orcConfig = config; + } + + @Override + public void writeAvroWithMetadata(HoodieKey key, R avroRecord) throws IOException { + prepRecordWithMetadata(key, avroRecord, instantTime, + taskContextSupplier.getPartitionIdSupplier().get(), RECORD_INDEX.getAndIncrement(), file.getName()); + writeAvro(key.getRecordKey(), avroRecord); + } + + @Override + public boolean canWrite() { + return fs.getBytesWritten(file) < maxFileSize; + } + + @Override + public void writeAvro(String recordKey, IndexedRecord object) throws IOException { + for (int col = 0; col < batch.numCols; col++) { + ColumnVector colVector = 
batch.cols[col]; + final String thisField = fieldNames.get(col); + final TypeDescription type = fieldTypes.get(col); + + Object fieldValue = ((GenericRecord) object).get(thisField); + Schema.Field avroField = avroSchema.getField(thisField); + AvroOrcUtils.addToVector(type, colVector, avroField.schema(), fieldValue, batch.size); + } + + batch.size++; + + // Batch size corresponds to the number of written rows out of 1024 total rows (by default) + // in the row batch, add the batch to file once all rows are filled and reset. + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + batch.size = 0; + } + + if (orcConfig.useBloomFilter()) { + orcConfig.getBloomFilter().add(recordKey); + if (minRecordKey != null) { + minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey; + } else { + minRecordKey = recordKey; + } + + if (maxRecordKey != null) { + maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey; + } else { + maxRecordKey = recordKey; + } + } + } + + @Override + public void close() throws IOException { + if (batch.size != 0) { + writer.addRowBatch(batch); + batch.reset(); + } + + if (orcConfig.useBloomFilter()) { + final BloomFilter bloomFilter = orcConfig.getBloomFilter(); + writer.addUserMetadata(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, ByteBuffer.wrap(bloomFilter.serializeToString().getBytes())); + if (minRecordKey != null && maxRecordKey != null) { + writer.addUserMetadata(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, ByteBuffer.wrap(minRecordKey.getBytes())); + writer.addUserMetadata(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER, ByteBuffer.wrap(maxRecordKey.getBytes())); + } + if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) { + writer.addUserMetadata(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE, ByteBuffer.wrap(bloomFilter.getBloomFilterTypeCode().name().getBytes())); + } + } + writer.addUserMetadata(HoodieOrcConfig.AVRO_SCHEMA_METADATA_KEY, ByteBuffer.wrap(avroSchema.toString().getBytes())); + + writer.close(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java deleted file mode 100644 index 166e2bc42ab98..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.io.storage; - -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.HoodieAvroWriteSupport; -import org.apache.hudi.client.common.TaskContextSupplier; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.fs.HoodieWrapperFileSystem; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetWriter; - -import java.io.IOException; -import java.util.concurrent.atomic.AtomicLong; - -/** - * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. Provides a way to check if - * the current file can take more records with the canWrite() - */ -public class HoodieParquetWriter - extends ParquetWriter implements HoodieFileWriter { - - private static AtomicLong recordIndex = new AtomicLong(1); - - private final Path file; - private final HoodieWrapperFileSystem fs; - private final long maxFileSize; - private final HoodieAvroWriteSupport writeSupport; - private final String instantTime; - private final TaskContextSupplier taskContextSupplier; - - public HoodieParquetWriter(String instantTime, Path file, HoodieAvroParquetConfig parquetConfig, - Schema schema, TaskContextSupplier taskContextSupplier) throws IOException { - super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), - ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(), - parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(), - DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED, - DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf())); - this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); - this.fs = - (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf())); - // We cannot accurately measure the snappy compressed output file size. 
We are choosing a - // conservative 10% - // TODO - compute this compression ratio dynamically by looking at the bytes written to the - // stream and the actual file size reported by HDFS - this.maxFileSize = parquetConfig.getMaxFileSize() - + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio()); - this.writeSupport = parquetConfig.getWriteSupport(); - this.instantTime = instantTime; - this.taskContextSupplier = taskContextSupplier; - } - - @Override - public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { - String seqId = - HoodieRecord.generateSequenceId(instantTime, taskContextSupplier.getPartitionIdSupplier().get(), recordIndex.getAndIncrement()); - HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(), record.getPartitionPath(), - file.getName()); - HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, instantTime, seqId); - super.write(avroRecord); - writeSupport.add(record.getRecordKey()); - } - - @Override - public boolean canWrite() { - return fs.getBytesWritten(file) < maxFileSize; - } - - @Override - public void writeAvro(String key, IndexedRecord object) throws IOException { - super.write(object); - writeSupport.add(key); - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/BaseKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/BaseKeyGenerator.java deleted file mode 100644 index 8020be8ab720b..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/BaseKeyGenerator.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.keygen; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.exception.HoodieKeyException; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; - -import java.util.List; -import java.util.stream.Collectors; - -public abstract class BaseKeyGenerator extends KeyGenerator { - - protected List recordKeyFields; - protected List partitionPathFields; - protected final boolean encodePartitionPath; - protected final boolean hiveStylePartitioning; - - protected BaseKeyGenerator(TypedProperties config) { - super(config); - this.encodePartitionPath = config.getBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING_OPT_KEY, - Boolean.parseBoolean(KeyGeneratorOptions.DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL)); - this.hiveStylePartitioning = config.getBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY, - Boolean.parseBoolean(KeyGeneratorOptions.DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL)); - } - - /** - * Generate a record Key out of provided generic record. - */ - public abstract String getRecordKey(GenericRecord record); - - /** - * Generate a partition path out of provided generic record. - */ - public abstract String getPartitionPath(GenericRecord record); - - /** - * Generate a Hoodie Key out of provided generic record. - */ - @Override - public final HoodieKey getKey(GenericRecord record) { - if (getRecordKeyFields() == null || getPartitionPathFields() == null) { - throw new HoodieKeyException("Unable to find field names for record key or partition path in cfg"); - } - return new HoodieKey(getRecordKey(record), getPartitionPath(record)); - } - - @Override - public final List getRecordKeyFieldNames() { - // For nested columns, pick top level column name - return getRecordKeyFields().stream().map(k -> { - int idx = k.indexOf('.'); - return idx > 0 ? 
k.substring(0, idx) : k; - }).collect(Collectors.toList()); - } - - public List getRecordKeyFields() { - return recordKeyFields; - } - - public List getPartitionPathFields() { - return partitionPathFields; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/ComplexAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/ComplexAvroKeyGenerator.java index edc1ad9cebc15..9ff5c522e4527 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/ComplexAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/ComplexAvroKeyGenerator.java @@ -32,19 +32,23 @@ public class ComplexAvroKeyGenerator extends BaseKeyGenerator { public ComplexAvroKeyGenerator(TypedProperties props) { super(props); - this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY) - .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); - this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY) - .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); + this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); } @Override public String getRecordKey(GenericRecord record) { - return KeyGenUtils.getRecordKey(record, getRecordKeyFields()); + return KeyGenUtils.getRecordKey(record, getRecordKeyFieldNames(), isConsistentLogicalTimestampEnabled()); } @Override public String getPartitionPath(GenericRecord record) { - return KeyGenUtils.getRecordPartitionPath(record, getPartitionPathFields(), hiveStylePartitioning, encodePartitionPath); + return KeyGenUtils.getRecordPartitionPath(record, getPartitionPathFields(), hiveStylePartitioning, encodePartitionPath, isConsistentLogicalTimestampEnabled()); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/CustomAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/CustomAvroKeyGenerator.java index 6266fd15c6b84..77377de7ab8c7 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/CustomAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/CustomAvroKeyGenerator.java @@ -44,7 +44,7 @@ public class CustomAvroKeyGenerator extends BaseKeyGenerator { private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/"; - private static final String SPLIT_REGEX = ":"; + public static final String SPLIT_REGEX = ":"; /** * Used as a part of config in CustomKeyGenerator.java. 
@@ -55,8 +55,8 @@ public enum PartitionKeyType { public CustomAvroKeyGenerator(TypedProperties props) { super(props); - this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY).split(",")).map(String::trim).collect(Collectors.toList()); - this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY).split(",")).map(String::trim).collect(Collectors.toList()); + this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")).map(String::trim).collect(Collectors.toList()); + this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()).split(",")).map(String::trim).collect(Collectors.toList()); } @Override @@ -88,7 +88,7 @@ public String getPartitionPath(GenericRecord record) { try { partitionPath.append(new TimestampBasedAvroKeyGenerator(config, partitionPathField).getPartitionPath(record)); } catch (IOException e) { - throw new HoodieKeyGeneratorException("Unable to initialise TimestampBasedKeyGenerator class"); + throw new HoodieKeyGeneratorException("Unable to initialise TimestampBasedKeyGenerator class", e); } break; default: @@ -103,13 +103,13 @@ public String getPartitionPath(GenericRecord record) { @Override public String getRecordKey(GenericRecord record) { validateRecordKeyFields(); - return getRecordKeyFields().size() == 1 + return getRecordKeyFieldNames().size() == 1 ? new SimpleAvroKeyGenerator(config).getRecordKey(record) : new ComplexAvroKeyGenerator(config).getRecordKey(record); } private void validateRecordKeyFields() { - if (getRecordKeyFields() == null || getRecordKeyFields().isEmpty()) { + if (getRecordKeyFieldNames() == null || getRecordKeyFieldNames().isEmpty()) { throw new HoodieKeyException("Unable to find field names for record key in cfg"); } } @@ -117,8 +117,4 @@ private void validateRecordKeyFields() { public String getDefaultPartitionPathSeparator() { return DEFAULT_PARTITION_PATH_SEPARATOR; } - - public String getSplitRegex() { - return SPLIT_REGEX; - } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/GlobalAvroDeleteKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/GlobalAvroDeleteKeyGenerator.java index b074a25450ab6..dc0bc3cef2f00 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/GlobalAvroDeleteKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/GlobalAvroDeleteKeyGenerator.java @@ -35,12 +35,12 @@ public class GlobalAvroDeleteKeyGenerator extends BaseKeyGenerator { public GlobalAvroDeleteKeyGenerator(TypedProperties config) { super(config); - this.recordKeyFields = Arrays.asList(config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY).split(",")); + this.recordKeyFields = Arrays.asList(config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")); } @Override public String getRecordKey(GenericRecord record) { - return KeyGenUtils.getRecordKey(record, getRecordKeyFields()); + return KeyGenUtils.getRecordKey(record, getRecordKeyFieldNames(), isConsistentLogicalTimestampEnabled()); } @Override diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java index 1f59bab266adb..d28263574b763 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java +++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java @@ -21,16 +21,18 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.PartitionPathEncodeUtils; import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieKeyException; import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser; +import org.apache.hudi.keygen.parser.BaseHoodieDateTimeParser; import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.List; public class KeyGenUtils { @@ -38,14 +40,58 @@ public class KeyGenUtils { protected static final String NULL_RECORDKEY_PLACEHOLDER = "__null__"; protected static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__"; - protected static final String DEFAULT_PARTITION_PATH = "default"; - protected static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/"; + protected static final String HUDI_DEFAULT_PARTITION_PATH = PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH; + public static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/"; + public static final String DEFAULT_RECORD_KEY_PARTS_SEPARATOR = ","; - public static String getRecordKey(GenericRecord record, List<String> recordKeyFields) { + /** + * Fetches record key from the GenericRecord. + * @param genericRecord generic record of interest. + * @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used. + * @return the record key for the passed in generic record. + */ + public static String getRecordKeyFromGenericRecord(GenericRecord genericRecord, Option<BaseKeyGenerator> keyGeneratorOpt) { + return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getRecordKey(genericRecord) : genericRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + } + + /** + * Fetches partition path from the GenericRecord. + * @param genericRecord generic record of interest. + * @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used. + * @return the partition path for the passed in generic record. + */ + public static String getPartitionPathFromGenericRecord(GenericRecord genericRecord, Option<BaseKeyGenerator> keyGeneratorOpt) { + return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getPartitionPath(genericRecord) : genericRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + } + + /** + * Extracts the record key fields in strings out of the given record key, + * this is the reverse operation of {@link #getRecordKey(GenericRecord, String, boolean)}. 
+ * + * @see SimpleAvroKeyGenerator + * @see org.apache.hudi.keygen.ComplexAvroKeyGenerator + */ + public static String[] extractRecordKeys(String recordKey) { + String[] fieldKV = recordKey.split(","); + return Arrays.stream(fieldKV).map(kv -> { + final String[] kvArray = kv.split(":", 2); + if (kvArray.length == 1) { + return kvArray[0]; + } else if (kvArray[1].equals(NULL_RECORDKEY_PLACEHOLDER)) { + return null; + } else if (kvArray[1].equals(EMPTY_RECORDKEY_PLACEHOLDER)) { + return ""; + } else { + return kvArray[1]; + } + }).toArray(String[]::new); + } + + public static String getRecordKey(GenericRecord record, List recordKeyFields, boolean consistentLogicalTimestampEnabled) { boolean keyIsNullEmpty = true; StringBuilder recordKey = new StringBuilder(); for (String recordKeyField : recordKeyFields) { - String recordKeyValue = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true); + String recordKeyValue = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true, consistentLogicalTimestampEnabled); if (recordKeyValue == null) { recordKey.append(recordKeyField + ":" + NULL_RECORDKEY_PLACEHOLDER + ","); } else if (recordKeyValue.isEmpty()) { @@ -64,24 +110,20 @@ public static String getRecordKey(GenericRecord record, List recordKeyFi } public static String getRecordPartitionPath(GenericRecord record, List partitionPathFields, - boolean hiveStylePartitioning, boolean encodePartitionPath) { + boolean hiveStylePartitioning, boolean encodePartitionPath, boolean consistentLogicalTimestampEnabled) { if (partitionPathFields.isEmpty()) { return ""; } StringBuilder partitionPath = new StringBuilder(); for (String partitionPathField : partitionPathFields) { - String fieldVal = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true); + String fieldVal = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true, consistentLogicalTimestampEnabled); if (fieldVal == null || fieldVal.isEmpty()) { - partitionPath.append(hiveStylePartitioning ? partitionPathField + "=" + DEFAULT_PARTITION_PATH - : DEFAULT_PARTITION_PATH); + partitionPath.append(hiveStylePartitioning ? partitionPathField + "=" + HUDI_DEFAULT_PARTITION_PATH + : HUDI_DEFAULT_PARTITION_PATH); } else { if (encodePartitionPath) { - try { - fieldVal = URLEncoder.encode(fieldVal, StandardCharsets.UTF_8.toString()); - } catch (UnsupportedEncodingException uoe) { - throw new HoodieException(uoe.getMessage(), uoe); - } + fieldVal = PartitionPathEncodeUtils.escapePathName(fieldVal); } partitionPath.append(hiveStylePartitioning ? 
partitionPathField + "=" + fieldVal : fieldVal); } @@ -91,8 +133,8 @@ public static String getRecordPartitionPath(GenericRecord record, List p return partitionPath.toString(); } - public static String getRecordKey(GenericRecord record, String recordKeyField) { - String recordKey = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true); + public static String getRecordKey(GenericRecord record, String recordKeyField, boolean consistentLogicalTimestampEnabled) { + String recordKey = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true, consistentLogicalTimestampEnabled); if (recordKey == null || recordKey.isEmpty()) { throw new HoodieKeyException("recordKey value: \"" + recordKey + "\" for field: \"" + recordKeyField + "\" cannot be null or empty."); } @@ -100,17 +142,13 @@ public static String getRecordKey(GenericRecord record, String recordKeyField) { } public static String getPartitionPath(GenericRecord record, String partitionPathField, - boolean hiveStylePartitioning, boolean encodePartitionPath) { - String partitionPath = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true); + boolean hiveStylePartitioning, boolean encodePartitionPath, boolean consistentLogicalTimestampEnabled) { + String partitionPath = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true, consistentLogicalTimestampEnabled); if (partitionPath == null || partitionPath.isEmpty()) { - partitionPath = DEFAULT_PARTITION_PATH; + partitionPath = HUDI_DEFAULT_PARTITION_PATH; } if (encodePartitionPath) { - try { - partitionPath = URLEncoder.encode(partitionPath, StandardCharsets.UTF_8.toString()); - } catch (UnsupportedEncodingException uoe) { - throw new HoodieException(uoe.getMessage(), uoe); - } + partitionPath = PartitionPathEncodeUtils.escapePathName(partitionPath); } if (hiveStylePartitioning) { partitionPath = partitionPathField + "=" + partitionPath; @@ -121,9 +159,9 @@ public static String getPartitionPath(GenericRecord record, String partitionPath /** * Create a date time parser class for TimestampBasedKeyGenerator, passing in any configs needed. */ - public static AbstractHoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException { + public static BaseHoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException { try { - return (AbstractHoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props); + return (BaseHoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props); } catch (Throwable e) { throw new IOException("Could not load date time parser class " + parserClass, e); } @@ -136,4 +174,24 @@ public static void checkRequiredProperties(TypedProperties props, List c } }); } -} \ No newline at end of file + + /** + * Create a key generator class via reflection, passing in any configs needed. + *

+ This method is for user-defined classes. To create hudi's built-in key generators, please set proper + * {@link org.apache.hudi.keygen.constant.KeyGeneratorType} conf, and use the relevant factory, see + * {@link org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory}. + */ + public static KeyGenerator createKeyGeneratorByClassName(TypedProperties props) throws IOException { + KeyGenerator keyGenerator = null; + String keyGeneratorClass = props.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), null); + if (!StringUtils.isNullOrEmpty(keyGeneratorClass)) { + try { + keyGenerator = (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props); + } catch (Throwable e) { + throw new IOException("Could not load key generator class " + keyGeneratorClass, e); + } + } + return keyGenerator; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/NonpartitionedAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/NonpartitionedAvroKeyGenerator.java index a5272b38bbff3..5b5cedcbf8855 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/NonpartitionedAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/NonpartitionedAvroKeyGenerator.java @@ -19,20 +19,26 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; /** * Avro simple Key generator for unpartitioned Hive Tables. */ -public class NonpartitionedAvroKeyGenerator extends SimpleAvroKeyGenerator { +public class NonpartitionedAvroKeyGenerator extends BaseKeyGenerator { private static final String EMPTY_PARTITION = ""; private static final List<String> EMPTY_PARTITION_FIELD_LIST = new ArrayList<>(); public NonpartitionedAvroKeyGenerator(TypedProperties props) { super(props); + this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()) + .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); + this.partitionPathFields = EMPTY_PARTITION_FIELD_LIST; } @Override @@ -45,6 +51,17 @@ public List<String> getPartitionPathFields() { return EMPTY_PARTITION_FIELD_LIST; } + @Override + public String getRecordKey(GenericRecord record) { + // for backward compatibility, we need to use the right format according to the number of record key fields + // 1. if there is only one record key field, the format of record key is just "<value>" + // 2. if there are multiple record key fields, the format is "<field1>:<value1>,<field2>:<value2>,..." 
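+ //    e.g. (illustrative values, not from this patch): a single key field "id" with value "u1" gives "u1"; + //    multiple key fields "id,name" give "id:u1,name:alice"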
+ if (getRecordKeyFieldNames().size() == 1) { + return KeyGenUtils.getRecordKey(record, getRecordKeyFieldNames().get(0), isConsistentLogicalTimestampEnabled()); + } + return KeyGenUtils.getRecordKey(record, getRecordKeyFieldNames(), isConsistentLogicalTimestampEnabled()); + } + public String getEmptyPartition() { return EMPTY_PARTITION; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/SimpleAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/SimpleAvroKeyGenerator.java index 59fe6be313030..c7398e94ecea0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/SimpleAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/SimpleAvroKeyGenerator.java @@ -29,8 +29,8 @@ public class SimpleAvroKeyGenerator extends BaseKeyGenerator { public SimpleAvroKeyGenerator(TypedProperties props) { - this(props, props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY), - props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY)); + this(props, props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()), + props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); } SimpleAvroKeyGenerator(TypedProperties props, String partitionPathField) { @@ -47,11 +47,11 @@ public SimpleAvroKeyGenerator(TypedProperties props) { @Override public String getRecordKey(GenericRecord record) { - return KeyGenUtils.getRecordKey(record, getRecordKeyFields().get(0)); + return KeyGenUtils.getRecordKey(record, getRecordKeyFieldNames().get(0), isConsistentLogicalTimestampEnabled()); } @Override public String getPartitionPath(GenericRecord record) { - return KeyGenUtils.getPartitionPath(record, getPartitionPathFields().get(0), hiveStylePartitioning, encodePartitionPath); + return KeyGenUtils.getPartitionPath(record, getPartitionPathFields().get(0), hiveStylePartitioning, encodePartitionPath, isConsistentLogicalTimestampEnabled()); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java index 28048a16b88da..60ccc694f947d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java @@ -21,12 +21,13 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.PartitionPathEncodeUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieKeyGeneratorException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser; -import org.apache.hudi.keygen.parser.HoodieDateTimeParserImpl; +import org.apache.hudi.keygen.parser.BaseHoodieDateTimeParser; +import org.apache.hudi.keygen.parser.HoodieDateTimeParser; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; @@ -34,9 +35,9 @@ import java.io.IOException; import java.io.Serializable; -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; +import java.math.BigDecimal; +import java.sql.Timestamp; +import 
java.time.LocalDate; import java.util.TimeZone; import java.util.concurrent.TimeUnit; @@ -56,7 +57,7 @@ public enum TimestampType implements Serializable { private final String outputDateFormat; private transient Option inputFormatter; private transient DateTimeFormatter partitionFormatter; - private final AbstractHoodieDateTimeParser parser; + private final BaseHoodieDateTimeParser parser; // TimeZone detailed settings reference // https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html @@ -65,32 +66,9 @@ public enum TimestampType implements Serializable { protected final boolean encodePartitionPath; - /** - * Supported configs. - */ - public static class Config { - - // One value from TimestampType above - public static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen.timebased.timestamp.type"; - public static final String INPUT_TIME_UNIT = - "hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit"; - //This prop can now accept list of input date formats. - public static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = - "hoodie.deltastreamer.keygen.timebased.input.dateformat"; - public static final String TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP = "hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex"; - public static final String TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.input.timezone"; - public static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = - "hoodie.deltastreamer.keygen.timebased.output.dateformat"; - //still keeping this prop for backward compatibility so that functionality for existing users does not break. - public static final String TIMESTAMP_TIMEZONE_FORMAT_PROP = - "hoodie.deltastreamer.keygen.timebased.timezone"; - public static final String TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.output.timezone"; - static final String DATE_TIME_PARSER_PROP = "hoodie.deltastreamer.keygen.datetime.parser.class"; - } - public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException { - this(config, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY), - config.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY)); + this(config, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()), + config.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); } TimestampBasedAvroKeyGenerator(TypedProperties config, String partitionPathField) throws IOException { @@ -99,12 +77,12 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException TimestampBasedAvroKeyGenerator(TypedProperties config, String recordKeyField, String partitionPathField) throws IOException { super(config, recordKeyField, partitionPathField); - String dateTimeParserClass = config.getString(Config.DATE_TIME_PARSER_PROP, HoodieDateTimeParserImpl.class.getName()); + String dateTimeParserClass = config.getString(KeyGeneratorOptions.Config.DATE_TIME_PARSER_PROP, HoodieDateTimeParser.class.getName()); this.parser = KeyGenUtils.createDateTimeParser(config, dateTimeParserClass); this.inputDateTimeZone = parser.getInputDateTimeZone(); this.outputDateTimeZone = parser.getOutputDateTimeZone(); this.outputDateFormat = parser.getOutputDateFormat(); - this.timestampType = TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP)); + this.timestampType = TimestampType.valueOf(config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP)); switch (this.timestampType) { case EPOCHMILLISECONDS: @@ 
-114,26 +92,26 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException timeUnit = SECONDS; break; case SCALAR: - String timeUnitStr = config.getString(Config.INPUT_TIME_UNIT, TimeUnit.SECONDS.toString()); + String timeUnitStr = config.getString(KeyGeneratorOptions.Config.INPUT_TIME_UNIT, TimeUnit.SECONDS.toString()); timeUnit = TimeUnit.valueOf(timeUnitStr.toUpperCase()); break; default: timeUnit = null; } - this.encodePartitionPath = config.getBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING_OPT_KEY, - Boolean.parseBoolean(KeyGeneratorOptions.DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL)); + this.encodePartitionPath = config.getBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING.key(), + Boolean.parseBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING.defaultValue())); } @Override public String getPartitionPath(GenericRecord record) { - Object partitionVal = HoodieAvroUtils.getNestedFieldVal(record, getPartitionPathFields().get(0), true); + Object partitionVal = HoodieAvroUtils.getNestedFieldVal(record, getPartitionPathFields().get(0), true, isConsistentLogicalTimestampEnabled()); if (partitionVal == null) { partitionVal = getDefaultPartitionVal(); } try { return getPartitionPath(partitionVal); } catch (Exception e) { - throw new HoodieKeyGeneratorException("Unable to parse input partition field :" + partitionVal, e); + throw new HoodieKeyGeneratorException("Unable to parse input partition field: " + partitionVal, e); } } @@ -148,7 +126,7 @@ public Object getDefaultPartitionVal() { // {Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP} won't be null, it has been checked in the initialization process of // inputFormatter String delimiter = parser.getConfigInputDateFormatDelimiter(); - String format = config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "").split(delimiter)[0]; + String format = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "").split(delimiter)[0]; // if both input and output timeZone are not configured, use GMT. if (null != inputDateTimeZone) { @@ -192,9 +170,18 @@ public String getPartitionPath(Object partitionVal) { timeMs = convertLongTimeToMillis(((Float) partitionVal).longValue()); } else if (partitionVal instanceof Long) { timeMs = convertLongTimeToMillis((Long) partitionVal); + } else if (partitionVal instanceof Timestamp && isConsistentLogicalTimestampEnabled()) { + timeMs = ((Timestamp) partitionVal).getTime(); + } else if (partitionVal instanceof Integer) { + timeMs = convertLongTimeToMillis(((Integer) partitionVal).longValue()); + } else if (partitionVal instanceof BigDecimal) { + timeMs = convertLongTimeToMillis(((BigDecimal) partitionVal).longValue()); + } else if (partitionVal instanceof LocalDate) { + // Avro uses LocalDate to represent the Date value internal. + timeMs = convertLongTimeToMillis(((LocalDate) partitionVal).toEpochDay()); } else if (partitionVal instanceof CharSequence) { if (!inputFormatter.isPresent()) { - throw new HoodieException("Missing inputformatter. Ensure " + Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " config is set when timestampType is DATE_STRING or MIXED!"); + throw new HoodieException("Missing input formatter. 
Ensure " + KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " config is set when timestampType is DATE_STRING or MIXED!"); } DateTime parsedDateTime = inputFormatter.get().parseDateTime(partitionVal.toString()); if (this.outputDateTimeZone == null) { @@ -210,11 +197,7 @@ public String getPartitionPath(Object partitionVal) { DateTime timestamp = new DateTime(timeMs, outputDateTimeZone); String partitionPath = timestamp.toString(partitionFormatter); if (encodePartitionPath) { - try { - partitionPath = URLEncoder.encode(partitionPath, StandardCharsets.UTF_8.toString()); - } catch (UnsupportedEncodingException uoe) { - throw new HoodieException(uoe.getMessage(), uoe); - } + partitionPath = PartitionPathEncodeUtils.escapePathName(partitionPath); } return hiveStylePartitioning ? getPartitionPathFields().get(0) + "=" + partitionPath : partitionPath; } @@ -222,9 +205,8 @@ public String getPartitionPath(Object partitionVal) { private long convertLongTimeToMillis(Long partitionVal) { if (timeUnit == null) { // should not be possible - throw new RuntimeException(Config.INPUT_TIME_UNIT + " is not specified but scalar it supplied as time value"); + throw new RuntimeException(KeyGeneratorOptions.Config.INPUT_TIME_UNIT + " is not specified but scalar is supplied as time value"); } return MILLISECONDS.convert(partitionVal, timeUnit); } - } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java deleted file mode 100644 index da567e078790a..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.keygen.constant; - -public class KeyGeneratorOptions { - - /** - * Flag to indicate whether to use Hive style partitioning. - * If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. - * By default false (the names of partition folders are only partition values) - */ - public static final String URL_ENCODE_PARTITIONING_OPT_KEY = "hoodie.datasource.write.partitionpath.urlencode"; - public static final String DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL = "false"; - public static final String HIVE_STYLE_PARTITIONING_OPT_KEY = "hoodie.datasource.write.hive_style_partitioning"; - public static final String DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL = "false"; - - /** - * Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value - * will be obtained by invoking .toString() on the field value. 
Nested fields can be specified using - * the dot notation eg: `a.b.c` - */ - public static final String RECORDKEY_FIELD_OPT_KEY = "hoodie.datasource.write.recordkey.field"; - public static final String PARTITIONPATH_FIELD_OPT_KEY = "hoodie.datasource.write.partitionpath.field"; -} - diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/factory/HoodieAvroKeyGeneratorFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/factory/HoodieAvroKeyGeneratorFactory.java new file mode 100644 index 0000000000000..b24b9a8e2d9b4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/factory/HoodieAvroKeyGeneratorFactory.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.keygen.factory; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieKeyGeneratorException; +import org.apache.hudi.keygen.ComplexAvroKeyGenerator; +import org.apache.hudi.keygen.CustomAvroKeyGenerator; +import org.apache.hudi.keygen.GlobalAvroDeleteKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; +import org.apache.hudi.keygen.SimpleAvroKeyGenerator; +import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Locale; +import java.util.Objects; + +/** + * Factory help to create {@link org.apache.hudi.keygen.KeyGenerator}. + *

+ * This factory will try {@link HoodieWriteConfig#KEYGENERATOR_CLASS_NAME} firstly, this ensures the class prop + * will not be overwritten by {@link KeyGeneratorType} + */ +public class HoodieAvroKeyGeneratorFactory { + + private static final Logger LOG = LoggerFactory.getLogger(HoodieAvroKeyGeneratorFactory.class); + + public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { + // keyGenerator class name has higher priority + KeyGenerator keyGenerator = KeyGenUtils.createKeyGeneratorByClassName(props); + return Objects.isNull(keyGenerator) ? createAvroKeyGeneratorByType(props) : keyGenerator; + } + + public static KeyGenerator createAvroKeyGeneratorByType(TypedProperties props) throws IOException { + // Use KeyGeneratorType.SIMPLE as default keyGeneratorType + String keyGeneratorType = + props.getString(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), null); + + if (StringUtils.isNullOrEmpty(keyGeneratorType)) { + LOG.info("The value of {} is empty, using SIMPLE", HoodieWriteConfig.KEYGENERATOR_TYPE.key()); + keyGeneratorType = KeyGeneratorType.SIMPLE.name(); + } + + KeyGeneratorType keyGeneratorTypeEnum; + try { + keyGeneratorTypeEnum = KeyGeneratorType.valueOf(keyGeneratorType.toUpperCase(Locale.ROOT)); + } catch (IllegalArgumentException e) { + throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); + } + + switch (keyGeneratorTypeEnum) { + case SIMPLE: + return new SimpleAvroKeyGenerator(props); + case COMPLEX: + return new ComplexAvroKeyGenerator(props); + case TIMESTAMP: + return new TimestampBasedAvroKeyGenerator(props); + case CUSTOM: + return new CustomAvroKeyGenerator(props); + case NON_PARTITION: + return new NonpartitionedAvroKeyGenerator(props); + case GLOBAL_DELETE: + return new GlobalAvroDeleteKeyGenerator(props); + default: + throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); + } + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/AbstractHoodieDateTimeParser.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/AbstractHoodieDateTimeParser.java deleted file mode 100644 index 6fb05c30be11a..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/AbstractHoodieDateTimeParser.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.keygen.parser; - -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.Config; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormatter; - -import java.io.Serializable; - -public abstract class AbstractHoodieDateTimeParser implements Serializable { - - protected final TypedProperties config; - protected final String configInputDateFormatDelimiter; - - public AbstractHoodieDateTimeParser(TypedProperties config) { - this.config = config; - this.configInputDateFormatDelimiter = initInputDateFormatDelimiter(); - } - - private String initInputDateFormatDelimiter() { - String inputDateFormatDelimiter = config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, ",").trim(); - inputDateFormatDelimiter = inputDateFormatDelimiter.isEmpty() ? "," : inputDateFormatDelimiter; - return inputDateFormatDelimiter; - } - - /** - * Returns the output date format in which the partition paths will be created for the hudi dataset. - */ - public String getOutputDateFormat() { - return config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); - } - - /** - * Returns input formats in which datetime based values might be coming in incoming records. - */ - public abstract Option getInputFormatter(); - - /** - * Returns the datetime zone one should expect the incoming values into. - */ - public abstract DateTimeZone getInputDateTimeZone(); - - /** - * Returns the datetime zone using which the final partition paths for hudi dataset are created. - */ - public abstract DateTimeZone getOutputDateTimeZone(); - - /** - * Returns the input date format delimiter, comma by default. - */ - public String getConfigInputDateFormatDelimiter() { - return this.configInputDateFormatDelimiter; - } - -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/BaseHoodieDateTimeParser.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/BaseHoodieDateTimeParser.java new file mode 100644 index 0000000000000..74c62fc63f537 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/BaseHoodieDateTimeParser.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.keygen.parser; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormatter; + +import java.io.Serializable; + +public abstract class BaseHoodieDateTimeParser implements Serializable { + + protected final TypedProperties config; + protected final String configInputDateFormatDelimiter; + + public BaseHoodieDateTimeParser(TypedProperties config) { + this.config = config; + this.configInputDateFormatDelimiter = initInputDateFormatDelimiter(); + } + + private String initInputDateFormatDelimiter() { + String inputDateFormatDelimiter = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, ",").trim(); + inputDateFormatDelimiter = inputDateFormatDelimiter.isEmpty() ? "," : inputDateFormatDelimiter; + return inputDateFormatDelimiter; + } + + /** + * Returns the output date format in which the partition paths will be created for the hudi dataset. + */ + public String getOutputDateFormat() { + return config.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); + } + + /** + * Returns input formats in which datetime based values might be coming in incoming records. + */ + public abstract Option getInputFormatter(); + + /** + * Returns the datetime zone one should expect the incoming values into. + */ + public abstract DateTimeZone getInputDateTimeZone(); + + /** + * Returns the datetime zone using which the final partition paths for hudi dataset are created. + */ + public abstract DateTimeZone getOutputDateTimeZone(); + + /** + * Returns the input date format delimiter, comma by default. + */ + public String getConfigInputDateFormatDelimiter() { + return this.configInputDateFormatDelimiter; + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParser.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParser.java new file mode 100644 index 0000000000000..c15d484df7a53 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParser.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.keygen.parser; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.TimestampType; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.DateTimeFormatterBuilder; +import org.joda.time.format.DateTimeParser; + +import java.util.Arrays; +import java.util.Collections; +import java.util.TimeZone; + +public class HoodieDateTimeParser extends BaseHoodieDateTimeParser { + + private String configInputDateFormatList; + + // TimeZone detailed settings reference + // https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html + private final DateTimeZone inputDateTimeZone; + + public HoodieDateTimeParser(TypedProperties config) { + super(config); + KeyGenUtils.checkRequiredProperties(config, Arrays.asList(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)); + this.inputDateTimeZone = getInputDateTimeZone(); + } + + private DateTimeFormatter getInputDateFormatter() { + if (this.configInputDateFormatList.isEmpty()) { + throw new IllegalArgumentException(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " configuration is required"); + } + + DateTimeFormatter formatter = new DateTimeFormatterBuilder() + .append( + null, + Arrays.stream( + this.configInputDateFormatList.split(super.configInputDateFormatDelimiter)) + .map(String::trim) + .map(DateTimeFormat::forPattern) + .map(DateTimeFormatter::getParser) + .toArray(DateTimeParser[]::new)) + .toFormatter(); + if (this.inputDateTimeZone != null) { + formatter = formatter.withZone(this.inputDateTimeZone); + } else { + formatter = formatter.withOffsetParsed(); + } + + return formatter; + } + + @Override + public String getOutputDateFormat() { + return config.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); + } + + @Override + public Option getInputFormatter() { + TimestampType timestampType = TimestampType.valueOf(config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP)); + if (timestampType == TimestampType.DATE_STRING || timestampType == TimestampType.MIXED) { + KeyGenUtils.checkRequiredProperties(config, + Collections.singletonList(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); + this.configInputDateFormatList = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, ""); + return Option.of(getInputDateFormatter()); + } + + return Option.empty(); + } + + @Override + public DateTimeZone getInputDateTimeZone() { + String inputTimeZone; + if (config.containsKey(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { + inputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); + } else { + inputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, ""); + } + return !inputTimeZone.trim().isEmpty() ? 
DateTimeZone.forTimeZone(TimeZone.getTimeZone(inputTimeZone)) : null; + } + + @Override + public DateTimeZone getOutputDateTimeZone() { + String outputTimeZone; + if (config.containsKey(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { + outputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); + } else { + outputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, ""); + } + return !outputTimeZone.trim().isEmpty() ? DateTimeZone.forTimeZone(TimeZone.getTimeZone(outputTimeZone)) : null; + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParserImpl.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParserImpl.java deleted file mode 100644 index 81960ea168391..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParserImpl.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.keygen.parser; - -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.TimestampType; -import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.Config; -import org.apache.hudi.keygen.KeyGenUtils; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.DateTimeFormatterBuilder; -import org.joda.time.format.DateTimeParser; - -import java.util.Arrays; -import java.util.Collections; -import java.util.TimeZone; - -public class HoodieDateTimeParserImpl extends AbstractHoodieDateTimeParser { - - private String configInputDateFormatList; - - // TimeZone detailed settings reference - // https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html - private final DateTimeZone inputDateTimeZone; - - public HoodieDateTimeParserImpl(TypedProperties config) { - super(config); - KeyGenUtils.checkRequiredProperties(config, Arrays.asList(Config.TIMESTAMP_TYPE_FIELD_PROP, Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)); - this.inputDateTimeZone = getInputDateTimeZone(); - } - - private DateTimeFormatter getInputDateFormatter() { - if (this.configInputDateFormatList.isEmpty()) { - throw new IllegalArgumentException(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " configuration is required"); - } - - DateTimeFormatter formatter = new DateTimeFormatterBuilder() - .append( - null, - Arrays.stream( - this.configInputDateFormatList.split(super.configInputDateFormatDelimiter)) - .map(String::trim) - .map(DateTimeFormat::forPattern) - .map(DateTimeFormatter::getParser) - .toArray(DateTimeParser[]::new)) - .toFormatter(); - if (this.inputDateTimeZone != null) { - formatter = formatter.withZone(this.inputDateTimeZone); - } else { - formatter = formatter.withOffsetParsed(); - } - - return formatter; - } - - @Override - public String getOutputDateFormat() { - return config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); - } - - @Override - public Option getInputFormatter() { - TimestampType timestampType = TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP)); - if (timestampType == TimestampType.DATE_STRING || timestampType == TimestampType.MIXED) { - KeyGenUtils.checkRequiredProperties(config, - Collections.singletonList(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); - this.configInputDateFormatList = config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, ""); - return Option.of(getInputDateFormatter()); - } - - return Option.empty(); - } - - @Override - public DateTimeZone getInputDateTimeZone() { - String inputTimeZone; - if (config.containsKey(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { - inputTimeZone = config.getString(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); - } else { - inputTimeZone = config.getString(Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, ""); - } - return !inputTimeZone.trim().isEmpty() ? DateTimeZone.forTimeZone(TimeZone.getTimeZone(inputTimeZone)) : null; - } - - @Override - public DateTimeZone getOutputDateTimeZone() { - String outputTimeZone; - if (config.containsKey(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { - outputTimeZone = config.getString(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); - } else { - outputTimeZone = config.getString(Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, ""); - } - return !outputTimeZone.trim().isEmpty() ? 
DateTimeZone.forTimeZone(TimeZone.getTimeZone(outputTimeZone)) : null; - } - -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java new file mode 100644 index 0000000000000..9607080d1e74e --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -0,0 +1,1200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieIndexPartitionInfo; +import org.apache.hudi.avro.model.HoodieIndexPlan; +import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.DeleteRecord; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.HoodieTimer; +import 
org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsConfig; +import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.hadoop.SerializablePath; + +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.getIndexInflightInstant; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.deserializeIndexPlan; +import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; +import static org.apache.hudi.metadata.HoodieTableMetadata.METADATA_TABLE_NAME_SUFFIX; +import static org.apache.hudi.metadata.HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightAndCompletedMetadataPartitions; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightMetadataPartitions; + +/** + * Writer implementation backed by an internal hudi table. Partition and file listing are saved within an internal MOR table + * called Metadata Table. This table is created by listing files and partitions (first time) + * and kept in sync using the instants on the main dataset. + */ +public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMetadataWriter { + + private static final Logger LOG = LogManager.getLogger(HoodieBackedTableMetadataWriter.class); + + // Virtual keys support for metadata table. This Field is + // from the metadata payload schema. + private static final String RECORD_KEY_FIELD_NAME = HoodieMetadataPayload.KEY_FIELD_NAME; + + protected HoodieWriteConfig metadataWriteConfig; + protected HoodieWriteConfig dataWriteConfig; + protected String tableName; + + protected HoodieBackedTableMetadata metadata; + protected HoodieTableMetaClient metadataMetaClient; + protected HoodieTableMetaClient dataMetaClient; + protected Option metrics; + protected boolean enabled; + protected SerializableConfiguration hadoopConf; + protected final transient HoodieEngineContext engineContext; + protected final List enabledPartitionTypes; + + /** + * Hudi backed table metadata writer. 
+ * + * @param hadoopConf - Hadoop configuration to use for the metadata writer + * @param writeConfig - Writer config + * @param engineContext - Engine context + * @param actionMetadata - Optional action metadata to help decide initialize operations + * @param - Action metadata types extending Avro generated SpecificRecordBase + * @param inflightInstantTimestamp - Timestamp of any instant in progress + */ + protected HoodieBackedTableMetadataWriter(Configuration hadoopConf, + HoodieWriteConfig writeConfig, + HoodieEngineContext engineContext, + Option actionMetadata, + Option inflightInstantTimestamp) { + this.dataWriteConfig = writeConfig; + this.engineContext = engineContext; + this.hadoopConf = new SerializableConfiguration(hadoopConf); + this.metrics = Option.empty(); + this.enabledPartitionTypes = new ArrayList<>(); + + if (writeConfig.isMetadataTableEnabled()) { + this.tableName = writeConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX; + this.metadataWriteConfig = createMetadataWriteConfig(writeConfig); + enabled = true; + + // Inline compaction and auto clean is required as we dont expose this table outside + ValidationUtils.checkArgument(!this.metadataWriteConfig.isAutoClean(), + "Cleaning is controlled internally for Metadata table."); + ValidationUtils.checkArgument(!this.metadataWriteConfig.inlineCompactionEnabled(), + "Compaction is controlled internally for metadata table."); + // Metadata Table cannot have metadata listing turned on. (infinite loop, much?) + ValidationUtils.checkArgument(this.metadataWriteConfig.shouldAutoCommit(), + "Auto commit is required for Metadata Table"); + ValidationUtils.checkArgument(!this.metadataWriteConfig.isMetadataTableEnabled(), + "File listing cannot be used for Metadata Table"); + + this.dataMetaClient = + HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(dataWriteConfig.getBasePath()).build(); + enablePartitions(); + initRegistry(); + initialize(engineContext, actionMetadata, inflightInstantTimestamp); + initTableMetadata(); + } else { + enabled = false; + } + } + + public HoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteConfig writeConfig, + HoodieEngineContext engineContext) { + this(hadoopConf, writeConfig, engineContext, Option.empty(), Option.empty()); + } + + /** + * Enable metadata table partitions based on config. + */ + private void enablePartitions() { + final HoodieMetadataConfig metadataConfig = dataWriteConfig.getMetadataConfig(); + boolean isBootstrapCompleted; + Option metaClient = Option.empty(); + try { + isBootstrapCompleted = dataMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(), HoodieTableMetaClient.METAFOLDER_NAME)); + if (isBootstrapCompleted) { + metaClient = Option.of(HoodieTableMetaClient.builder().setConf(hadoopConf.get()) + .setBasePath(metadataWriteConfig.getBasePath()).build()); + } + } catch (IOException e) { + throw new HoodieException("Failed to enable metadata partitions!", e); + } + + Option fsView = Option.ofNullable( + metaClient.isPresent() ? 
HoodieTableMetadataUtil.getFileSystemView(metaClient.get()) : null); + enablePartition(MetadataPartitionType.FILES, metadataConfig, metaClient, fsView, isBootstrapCompleted); + if (metadataConfig.isBloomFilterIndexEnabled()) { + enablePartition(MetadataPartitionType.BLOOM_FILTERS, metadataConfig, metaClient, fsView, isBootstrapCompleted); + } + if (metadataConfig.isColumnStatsIndexEnabled()) { + enablePartition(MetadataPartitionType.COLUMN_STATS, metadataConfig, metaClient, fsView, isBootstrapCompleted); + } + } + + /** + * Enable metadata table partition. + * + * @param partitionType - Metadata table partition type + * @param metadataConfig - Table config + * @param metaClient - Meta client for the metadata table + * @param fsView - Metadata table filesystem view to use + * @param isBootstrapCompleted - Is metadata table initializing completed + */ + private void enablePartition(final MetadataPartitionType partitionType, final HoodieMetadataConfig metadataConfig, + final Option metaClient, Option fsView, boolean isBootstrapCompleted) { + final int fileGroupCount = HoodieTableMetadataUtil.getPartitionFileGroupCount(partitionType, metaClient, fsView, + metadataConfig, isBootstrapCompleted); + partitionType.setFileGroupCount(fileGroupCount); + this.enabledPartitionTypes.add(partitionType); + } + + protected abstract void initRegistry(); + + /** + * Create a {@code HoodieWriteConfig} to use for the Metadata Table. + * + * @param writeConfig {@code HoodieWriteConfig} of the main dataset writer + */ + private HoodieWriteConfig createMetadataWriteConfig(HoodieWriteConfig writeConfig) { + int parallelism = writeConfig.getMetadataInsertParallelism(); + + int minCommitsToKeep = Math.max(writeConfig.getMetadataMinCommitsToKeep(), writeConfig.getMinCommitsToKeep()); + int maxCommitsToKeep = Math.max(writeConfig.getMetadataMaxCommitsToKeep(), writeConfig.getMaxCommitsToKeep()); + + // Create the write config for the metadata table by borrowing options from the main write config. 
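+ // Illustrative summary of the builder that follows: the metadata table takes its parallelism from the data
+ // table's metadata insert parallelism, copies the consistency guard settings, always runs as a single writer
+ // with direct markers and auto commit, disables metadata-based file listing for itself, and leaves cleaning,
+ // archival and compaction to be triggered manually so that instant times stay under this writer's control.
+ // For a data table based at /tmp/trips, the metadata table path typically resolves to /tmp/trips/.hoodie/metadata.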
+ HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder() + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() + .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled()) + .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs()) + .withMaxConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getMaxConsistencyCheckIntervalMs()) + .withMaxConsistencyChecks(writeConfig.getConsistencyGuardConfig().getMaxConsistencyChecks()) + .build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.SINGLE_WRITER) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).withFileListingParallelism(writeConfig.getFileListingParallelism()).build()) + .withAutoCommit(true) + .withAvroSchemaValidate(true) + .withEmbeddedTimelineServerEnabled(false) + .withMarkersType(MarkerType.DIRECT.name()) + .withRollbackUsingMarkers(false) + .withPath(HoodieTableMetadata.getMetadataTableBasePath(writeConfig.getBasePath())) + .withSchema(HoodieMetadataRecord.getClassSchema().toString()) + .forTable(tableName) + // we will trigger cleaning manually, to control the instant times + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withAsyncClean(writeConfig.isMetadataAsyncClean()) + .withAutoClean(false) + .withCleanerParallelism(parallelism) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .retainCommits(writeConfig.getMetadataCleanerCommitsRetained()) + .build()) + // we will trigger archive manually, to ensure only regular writer invokes it + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep) + .withAutoArchive(false) + .build()) + // we will trigger compaction manually, to control the instant times + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(false) + .withMaxNumDeltaCommitsBeforeCompaction(writeConfig.getMetadataCompactDeltaCommitMax()) + // by default, the HFile does not keep the metadata fields, set up as false + // to always use the metadata of the new record. 
+ .withPreserveCommitMetadata(false) + .build()) + .withParallelism(parallelism, parallelism) + .withDeleteParallelism(parallelism) + .withRollbackParallelism(parallelism) + .withFinalizeWriteParallelism(parallelism) + .withAllowMultiWriteOnSameInstant(true) + .withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName()) + .withPopulateMetaFields(dataWriteConfig.getMetadataConfig().populateMetaFields()); + + // RecordKey properties are needed for the metadata table records + final Properties properties = new Properties(); + properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD_NAME); + properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD_NAME); + builder.withProperties(properties); + + if (writeConfig.isMetricsOn()) { + // Table Name is needed for metric reporters prefix + Properties commonProperties = new Properties(); + commonProperties.put(HoodieWriteConfig.TBL_NAME.key(), tableName); + + builder.withMetricsConfig(HoodieMetricsConfig.newBuilder() + .fromProperties(commonProperties) + .withReporterType(writeConfig.getMetricsReporterType().toString()) + .withExecutorMetrics(writeConfig.isExecutorMetricsEnabled()) + .on(true).build()); + switch (writeConfig.getMetricsReporterType()) { + case GRAPHITE: + builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() + .onGraphitePort(writeConfig.getGraphiteServerPort()) + .toGraphiteHost(writeConfig.getGraphiteServerHost()) + .usePrefix(writeConfig.getGraphiteMetricPrefix()).build()); + break; + case JMX: + builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder() + .onJmxPort(writeConfig.getJmxPort()) + .toJmxHost(writeConfig.getJmxHost()) + .build()); + break; + case DATADOG: + case PROMETHEUS: + case PROMETHEUS_PUSHGATEWAY: + case CONSOLE: + case INMEMORY: + case CLOUDWATCH: + break; + default: + throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType()); + } + } + return builder.build(); + } + + public HoodieWriteConfig getWriteConfig() { + return metadataWriteConfig; + } + + public HoodieBackedTableMetadata getTableMetadata() { + return metadata; + } + + public List getEnabledPartitionTypes() { + return this.enabledPartitionTypes; + } + + /** + * Initialize the metadata table if it does not exist. + *
+ * If the metadata table does not exist, then file and partition listing is used to initialize the table. + * + * @param engineContext + * @param actionMetadata Action metadata types extending Avro generated SpecificRecordBase + * @param inflightInstantTimestamp Timestamp of an instant in progress on the dataset. This instant is ignored + * while deciding to initialize the metadata table. + */ + protected abstract void initialize(HoodieEngineContext engineContext, + Option actionMetadata, + Option inflightInstantTimestamp); + + public void initTableMetadata() { + try { + if (this.metadata != null) { + this.metadata.close(); + } + this.metadata = new HoodieBackedTableMetadata(engineContext, dataWriteConfig.getMetadataConfig(), + dataWriteConfig.getBasePath(), dataWriteConfig.getSpillableMapBasePath()); + this.metadataMetaClient = metadata.getMetadataMetaClient(); + } catch (Exception e) { + throw new HoodieException("Error initializing metadata table for reads", e); + } + } + + /** + * Initialize the metadata table if needed. + * + * @param dataMetaClient - meta client for the data table + * @param actionMetadata - optional action metadata + * @param inflightInstantTimestamp - timestamp of an instant in progress on the dataset + * @param - action metadata types extending Avro generated SpecificRecordBase + * @throws IOException + */ + protected void initializeIfNeeded(HoodieTableMetaClient dataMetaClient, + Option actionMetadata, + Option inflightInstantTimestamp) throws IOException { + HoodieTimer timer = new HoodieTimer().startTimer(); + + boolean exists = metadataTableExists(dataMetaClient, actionMetadata); + + if (!exists) { + // Initialize for the first time by listing partitions and files directly from the file system + if (initializeFromFilesystem(dataMetaClient, inflightInstantTimestamp)) { + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); + } + return; + } + + // if metadata table exists, then check if any of the enabled partition types needs to be initialized + // NOTE: It needs to be guarded by async index config because if that is enabled then initialization happens through the index scheduler. 
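+ // Sketch of the branch below: partition types that are enabled in the write config but not yet recorded as
+ // inflight or completed in the table config (FILES is excluded since it is bootstrapped when the metadata table
+ // is first created) are initialized here, provided no other data table instant is pending. For example, turning
+ // on the column stats index for an existing table would typically have its COLUMN_STATS file groups created and
+ // committed through this path on the next write, assuming async indexing stays disabled.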
+ if (!dataWriteConfig.isMetadataAsyncIndex()) { + Set inflightAndCompletedPartitions = getInflightAndCompletedMetadataPartitions(dataMetaClient.getTableConfig()); + LOG.info("Async metadata indexing enabled and following partitions already initialized: " + inflightAndCompletedPartitions); + List partitionsToInit = this.enabledPartitionTypes.stream() + .filter(p -> !inflightAndCompletedPartitions.contains(p.getPartitionPath()) && !MetadataPartitionType.FILES.equals(p)) + .collect(Collectors.toList()); + // if there are no partitions to initialize or there is a pending operation, then don't initialize in this round + if (partitionsToInit.isEmpty() || anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { + return; + } + + String createInstantTime = getInitialCommitInstantTime(dataMetaClient); + initTableMetadata(); // re-init certain flags in BaseTableMetadata + initializeEnabledFileGroups(dataMetaClient, createInstantTime, partitionsToInit); + initialCommit(createInstantTime, partitionsToInit); + updateInitializedPartitionsInTableConfig(partitionsToInit); + } + } + + private boolean metadataTableExists(HoodieTableMetaClient dataMetaClient, + Option actionMetadata) throws IOException { + boolean exists = dataMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(), + HoodieTableMetaClient.METAFOLDER_NAME)); + boolean reInitialize = false; + + // If the un-synced instants have been archived, then + // the metadata table will need to be initialized again. + if (exists) { + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()) + .setBasePath(metadataWriteConfig.getBasePath()).build(); + + if (dataWriteConfig.getMetadataConfig().populateMetaFields() != metadataMetaClient.getTableConfig().populateMetaFields()) { + LOG.info("Re-initiating metadata table properties since populate meta fields have changed"); + metadataMetaClient = initializeMetaClient(dataWriteConfig.getMetadataConfig().populateMetaFields()); + } + + final Option latestMetadataInstant = + metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant(); + + reInitialize = isBootstrapNeeded(latestMetadataInstant, actionMetadata); + } + + if (reInitialize) { + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.REBOOTSTRAP_STR, 1)); + LOG.info("Deleting Metadata Table directory so that it can be re-initialized"); + dataMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath()), true); + exists = false; + } + + return exists; + } + + /** + * Whether initialize operation needed for this metadata table. + *
+ * Rollback of the first commit would look like un-synced instants in the metadata table. + * Action metadata is needed to verify the instant time and avoid erroneous initializing. + *
+ * TODO: Revisit this logic and validate that filtering for all + * commits timeline is the right thing to do + * + * @return True if the initialize is not needed, False otherwise + */ + private boolean isBootstrapNeeded(Option latestMetadataInstant, + Option actionMetadata) { + if (!latestMetadataInstant.isPresent()) { + LOG.warn("Metadata Table will need to be re-initialized as no instants were found"); + return true; + } + + final String latestMetadataInstantTimestamp = latestMetadataInstant.get().getTimestamp(); + if (latestMetadataInstantTimestamp.equals(SOLO_COMMIT_TIMESTAMP)) { + return false; + } + + // Detect the commit gaps if any from the data and the metadata active timeline + if (dataMetaClient.getActiveTimeline().getAllCommitsTimeline().isBeforeTimelineStarts( + latestMetadataInstant.get().getTimestamp()) + && !isCommitRevertedByInFlightAction(actionMetadata, latestMetadataInstantTimestamp)) { + LOG.error("Metadata Table will need to be re-initialized as un-synced instants have been archived." + + " latestMetadataInstant=" + latestMetadataInstant.get().getTimestamp() + + ", latestDataInstant=" + dataMetaClient.getActiveTimeline().firstInstant().get().getTimestamp()); + return true; + } + + return false; + } + + /** + * Is the latest commit instant reverted by the in-flight instant action? + * + * @param actionMetadata - In-flight instant action metadata + * @param latestMetadataInstantTimestamp - Metadata table latest instant timestamp + * @param - ActionMetadata type + * @return True if the latest instant action is reverted by the action + */ + private boolean isCommitRevertedByInFlightAction(Option actionMetadata, + final String latestMetadataInstantTimestamp) { + if (!actionMetadata.isPresent()) { + return false; + } + + final String INSTANT_ACTION = (actionMetadata.get() instanceof HoodieRollbackMetadata + ? HoodieTimeline.ROLLBACK_ACTION + : (actionMetadata.get() instanceof HoodieRestoreMetadata ? HoodieTimeline.RESTORE_ACTION : EMPTY_STRING)); + + List affectedInstantTimestamps; + switch (INSTANT_ACTION) { + case HoodieTimeline.ROLLBACK_ACTION: + List rollbackedInstants = + ((HoodieRollbackMetadata) actionMetadata.get()).getInstantsRollback(); + affectedInstantTimestamps = rollbackedInstants.stream().map(HoodieInstantInfo::getCommitTime).collect(Collectors.toList()); + + if (affectedInstantTimestamps.contains(latestMetadataInstantTimestamp)) { + return true; + } + break; + case HoodieTimeline.RESTORE_ACTION: + List restoredInstants = + ((HoodieRestoreMetadata) actionMetadata.get()).getRestoreInstantInfo(); + affectedInstantTimestamps = restoredInstants.stream().map(HoodieInstantInfo::getCommitTime).collect(Collectors.toList()); + + if (affectedInstantTimestamps.contains(latestMetadataInstantTimestamp)) { + return true; + } + break; + default: + return false; + } + + return false; + } + + /** + * Initialize the Metadata Table by listing files and partitions from the file system. + * + * @param dataMetaClient - {@code HoodieTableMetaClient} for the dataset. 
+ * @param inflightInstantTimestamp - Current action instant responsible for this initialization + */ + private boolean initializeFromFilesystem(HoodieTableMetaClient dataMetaClient, + Option inflightInstantTimestamp) throws IOException { + if (anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { + return false; + } + + String createInstantTime = getInitialCommitInstantTime(dataMetaClient); + + initializeMetaClient(dataWriteConfig.getMetadataConfig().populateMetaFields()); + initTableMetadata(); + // if async metadata indexing is enabled, + // then only initialize files partition as other partitions will be built using HoodieIndexer + List enabledPartitionTypes = new ArrayList<>(); + if (dataWriteConfig.isMetadataAsyncIndex()) { + enabledPartitionTypes.add(MetadataPartitionType.FILES); + } else { + // all enabled ones should be initialized + enabledPartitionTypes = this.enabledPartitionTypes; + } + initializeEnabledFileGroups(dataMetaClient, createInstantTime, enabledPartitionTypes); + initialCommit(createInstantTime, enabledPartitionTypes); + updateInitializedPartitionsInTableConfig(enabledPartitionTypes); + return true; + } + + private String getInitialCommitInstantTime(HoodieTableMetaClient dataMetaClient) { + // If there is no commit on the dataset yet, use the SOLO_COMMIT_TIMESTAMP as the instant time for initial commit + // Otherwise, we use the timestamp of the latest completed action. + String createInstantTime = dataMetaClient.getActiveTimeline().filterCompletedInstants() + .getReverseOrderedInstants().findFirst().map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); + LOG.info("Creating a new metadata table in " + metadataWriteConfig.getBasePath() + " at instant " + createInstantTime); + return createInstantTime; + } + + private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Option inflightInstantTimestamp) { + ValidationUtils.checkState(enabled, "Metadata table cannot be initialized as it is not enabled"); + + // We can only initialize if there are no pending operations on the dataset + List pendingDataInstant = dataMetaClient.getActiveTimeline() + .getInstants().filter(i -> !i.isCompleted()) + .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) + // regular writers should not be blocked due to pending indexing action + .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) + .collect(Collectors.toList()); + + if (!pendingDataInstant.isEmpty()) { + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1)); + LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: " + + Arrays.toString(pendingDataInstant.toArray())); + return true; + } + return false; + } + + private void updateInitializedPartitionsInTableConfig(List partitionTypes) { + Set completedPartitions = dataMetaClient.getTableConfig().getMetadataPartitions(); + completedPartitions.addAll(partitionTypes.stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet())); + dataMetaClient.getTableConfig().setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS.key(), String.join(",", completedPartitions)); + HoodieTableConfig.update(dataMetaClient.getFs(), new Path(dataMetaClient.getMetaPath()), dataMetaClient.getTableConfig().getProps()); + } + + private HoodieTableMetaClient initializeMetaClient(boolean populateMetaFields) throws IOException { + return HoodieTableMetaClient.withPropertyBuilder() + 
.setTableType(HoodieTableType.MERGE_ON_READ) + .setTableName(tableName) + .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) + .setPayloadClassName(HoodieMetadataPayload.class.getName()) + .setBaseFileFormat(HoodieFileFormat.HFILE.toString()) + .setRecordKeyFields(RECORD_KEY_FIELD_NAME) + .setPopulateMetaFields(populateMetaFields) + .setKeyGeneratorClassProp(HoodieTableMetadataKeyGenerator.class.getCanonicalName()) + .initTable(hadoopConf.get(), metadataWriteConfig.getBasePath()); + } + + /** + * Function to find hoodie partitions and list files in them in parallel. + * + * @param datasetMetaClient data set meta client instance. + * @return Map of partition names to a list of FileStatus for all the files in the partition + */ + private List listAllPartitions(HoodieTableMetaClient datasetMetaClient) { + List pathsToList = new LinkedList<>(); + pathsToList.add(new SerializablePath(new CachingPath(dataWriteConfig.getBasePath()))); + + List partitionsToBootstrap = new LinkedList<>(); + final int fileListingParallelism = metadataWriteConfig.getFileListingParallelism(); + SerializableConfiguration conf = new SerializableConfiguration(datasetMetaClient.getHadoopConf()); + final String dirFilterRegex = dataWriteConfig.getMetadataConfig().getDirectoryFilterRegex(); + final String datasetBasePath = datasetMetaClient.getBasePath(); + SerializablePath serializableBasePath = new SerializablePath(new CachingPath(datasetBasePath)); + + while (!pathsToList.isEmpty()) { + // In each round we will list a section of directories + int numDirsToList = Math.min(fileListingParallelism, pathsToList.size()); + // List all directories in parallel + List processedDirectories = engineContext.map(pathsToList.subList(0, numDirsToList), path -> { + FileSystem fs = path.get().getFileSystem(conf.get()); + String relativeDirPath = FSUtils.getRelativePartitionPath(serializableBasePath.get(), path.get()); + return new DirectoryInfo(relativeDirPath, fs.listStatus(path.get())); + }, numDirsToList); + + pathsToList = new LinkedList<>(pathsToList.subList(numDirsToList, pathsToList.size())); + + // If the listing reveals a directory, add it to queue. If the listing reveals a hoodie partition, add it to + // the results. + for (DirectoryInfo dirInfo : processedDirectories) { + if (!dirFilterRegex.isEmpty()) { + final String relativePath = dirInfo.getRelativePath(); + if (!relativePath.isEmpty()) { + Path partitionPath = new Path(datasetBasePath, relativePath); + if (partitionPath.getName().matches(dirFilterRegex)) { + LOG.info("Ignoring directory " + partitionPath + " which matches the filter regex " + dirFilterRegex); + continue; + } + } + } + + if (dirInfo.isHoodiePartition()) { + // Add to result + partitionsToBootstrap.add(dirInfo); + } else { + // Add sub-dirs to the queue + pathsToList.addAll(dirInfo.getSubDirectories().stream() + .map(path -> new SerializablePath(new CachingPath(path.toUri()))) + .collect(Collectors.toList())); + } + } + } + + return partitionsToBootstrap; + } + + /** + * Initialize file groups for all the enabled partition types. 
+ * + * @param dataMetaClient - Meta client for the data table + * @param createInstantTime - Metadata table create instant time + * @throws IOException + */ + private void initializeEnabledFileGroups(HoodieTableMetaClient dataMetaClient, String createInstantTime, List partitionTypes) throws IOException { + for (MetadataPartitionType enabledPartitionType : partitionTypes) { + initializeFileGroups(dataMetaClient, enabledPartitionType, createInstantTime, + enabledPartitionType.getFileGroupCount()); + } + } + + public void initializeMetadataPartitions(HoodieTableMetaClient dataMetaClient, List metadataPartitions, String instantTime) throws IOException { + for (MetadataPartitionType partitionType : metadataPartitions) { + initializeFileGroups(dataMetaClient, partitionType, instantTime, partitionType.getFileGroupCount()); + } + } + + /** + * Initialize file groups for a partition. For file listing, we just have one file group. + * + * All FileGroups for a given metadata partition has a fixed prefix as per the {@link MetadataPartitionType#getFileIdPrefix()}. + * Each file group is suffixed with 4 digits with increments of 1 starting with 0000. + * + * Lets say we configure 10 file groups for record level index partition, and prefix as "record-index-bucket-" + * File groups will be named as : + * record-index-bucket-0000, .... -> ..., record-index-bucket-0009 + */ + private void initializeFileGroups(HoodieTableMetaClient dataMetaClient, MetadataPartitionType metadataPartition, String instantTime, + int fileGroupCount) throws IOException { + final HashMap blockHeader = new HashMap<>(); + blockHeader.put(HeaderMetadataType.INSTANT_TIME, instantTime); + // Archival of data table has a dependency on compaction(base files) in metadata table. + // It is assumed that as of time Tx of base instant (/compaction time) in metadata table, + // all commits in data table is in sync with metadata table. So, we always start with log file for any fileGroup. 
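+ // The empty delete block below is only a placeholder: appending it materializes one log file per file group at
+ // the chosen instant time. File ids are the partition's prefix plus a zero-padded index, so with, for example,
+ // fileGroupCount = 3 and a prefix such as "col-stats-" (shown purely for illustration; the real prefix comes from
+ // MetadataPartitionType#getFileIdPrefix), the bootstrap creates col-stats-0000, col-stats-0001 and col-stats-0002.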
+ final HoodieDeleteBlock block = new HoodieDeleteBlock(new DeleteRecord[0], blockHeader); + + LOG.info(String.format("Creating %d file groups for partition %s with base fileId %s at instant time %s", + fileGroupCount, metadataPartition.getPartitionPath(), metadataPartition.getFileIdPrefix(), instantTime)); + for (int i = 0; i < fileGroupCount; ++i) { + final String fileGroupFileId = String.format("%s%04d", metadataPartition.getFileIdPrefix(), i); + try { + HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), metadataPartition.getPartitionPath())) + .withFileId(fileGroupFileId).overBaseCommit(instantTime) + .withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION) + .withFileSize(0L) + .withSizeThreshold(metadataWriteConfig.getLogFileMaxSize()) + .withFs(dataMetaClient.getFs()) + .withRolloverLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN) + .withLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + writer.appendBlock(block); + writer.close(); + } catch (InterruptedException e) { + throw new HoodieException("Failed to created fileGroup " + fileGroupFileId + " for partition " + metadataPartition.getPartitionPath(), e); + } + } + } + + public void dropMetadataPartitions(List metadataPartitions) throws IOException { + Set completedIndexes = dataMetaClient.getTableConfig().getMetadataPartitions(); + Set inflightIndexes = getInflightMetadataPartitions(dataMetaClient.getTableConfig()); + + for (MetadataPartitionType partitionType : metadataPartitions) { + String partitionPath = partitionType.getPartitionPath(); + // first update table config + if (inflightIndexes.contains(partitionPath)) { + inflightIndexes.remove(partitionPath); + dataMetaClient.getTableConfig().setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT.key(), String.join(",", inflightIndexes)); + } else if (completedIndexes.contains(partitionPath)) { + completedIndexes.remove(partitionPath); + dataMetaClient.getTableConfig().setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS.key(), String.join(",", completedIndexes)); + } + HoodieTableConfig.update(dataMetaClient.getFs(), new Path(dataMetaClient.getMetaPath()), dataMetaClient.getTableConfig().getProps()); + LOG.warn("Deleting Metadata Table partitions: " + partitionPath); + dataMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath(), partitionPath), true); + // delete corresponding pending indexing instant file in the timeline + LOG.warn("Deleting pending indexing instant from the timeline for partition: " + partitionPath); + deletePendingIndexingInstant(dataMetaClient, partitionPath); + } + closeInternal(); + } + + /** + * Deletes any pending indexing instant, if it exists. + * It reads the plan from indexing.requested file and deletes both requested and inflight instants, + * if the partition path in the plan matches with the given partition path. 
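+ * For example, if a pending indexing plan targets the column_stats partition and that partition is being dropped,
+ * both the requested and the inflight indexing instant files are removed (partition name shown for illustration).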
+ */ + private static void deletePendingIndexingInstant(HoodieTableMetaClient metaClient, String partitionPath) { + metaClient.reloadActiveTimeline().filterPendingIndexTimeline().getInstants().filter(instant -> REQUESTED.equals(instant.getState())) + .forEach(instant -> { + try { + HoodieIndexPlan indexPlan = deserializeIndexPlan(metaClient.getActiveTimeline().readIndexPlanAsBytes(instant).get()); + if (indexPlan.getIndexPartitionInfos().stream() + .anyMatch(indexPartitionInfo -> indexPartitionInfo.getMetadataPartitionPath().equals(partitionPath))) { + metaClient.getActiveTimeline().deleteInstantFileIfExists(instant); + metaClient.getActiveTimeline().deleteInstantFileIfExists(getIndexInflightInstant(instant.getTimestamp())); + } + } catch (IOException e) { + LOG.error("Failed to delete the instant file corresponding to " + instant); + } + }); + } + + private MetadataRecordsGenerationParams getRecordsGenerationParams() { + return new MetadataRecordsGenerationParams( + dataMetaClient, + enabledPartitionTypes, + dataWriteConfig.getBloomFilterType(), + dataWriteConfig.getMetadataBloomFilterIndexParallelism(), + dataWriteConfig.isMetadataColumnStatsIndexEnabled(), + dataWriteConfig.getColumnStatsIndexParallelism(), + dataWriteConfig.getColumnsEnabledForColumnStatsIndex(), + dataWriteConfig.getColumnsEnabledForBloomFilterIndex()); + } + + /** + * Interface to assist in converting commit metadata to List of HoodieRecords to be written to metadata table. + * Updates of different commit metadata uses the same method to convert to HoodieRecords and hence. + */ + private interface ConvertMetadataFunction { + Map> convertMetadata(); + } + + /** + * Processes commit metadata from data table and commits to metadata table. + * + * @param instantTime instant time of interest. + * @param convertMetadataFunction converter function to convert the respective metadata to List of HoodieRecords to be written to metadata table. + * @param type of commit metadata. + * @param canTriggerTableService true if table services can be triggered. false otherwise. + */ + private void processAndCommit(String instantTime, ConvertMetadataFunction convertMetadataFunction, boolean canTriggerTableService) { + if (!dataWriteConfig.isMetadataTableEnabled()) { + return; + } + Set partitionsToUpdate = getMetadataPartitionsToUpdate(); + Set inflightIndexes = getInflightMetadataPartitions(dataMetaClient.getTableConfig()); + // if indexing is inflight then do not trigger table service + boolean doNotTriggerTableService = partitionsToUpdate.stream().anyMatch(inflightIndexes::contains); + + if (enabled && metadata != null) { + // convert metadata and filter only the entries whose partition path are in partitionsToUpdate + Map> partitionRecordsMap = convertMetadataFunction.convertMetadata().entrySet().stream() + .filter(entry -> partitionsToUpdate.contains(entry.getKey().getPartitionPath())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + commit(instantTime, partitionRecordsMap, !doNotTriggerTableService && canTriggerTableService); + } + } + + private Set getMetadataPartitionsToUpdate() { + // fetch partitions to update from table config + Set partitionsToUpdate = dataMetaClient.getTableConfig().getMetadataPartitions(); + // add inflight indexes as well because the file groups have already been initialized, so writers can log updates + // NOTE: Async HoodieIndexer can move some partition to inflight. While that partition is still being built, + // the regular ingestion writers should not be blocked. 
They can go ahead and log updates to the metadata partition. + // Instead of depending on enabledPartitionTypes, the table config becomes the source of truth for which partitions to update. + partitionsToUpdate.addAll(getInflightMetadataPartitions(dataMetaClient.getTableConfig())); + if (!partitionsToUpdate.isEmpty()) { + return partitionsToUpdate; + } + // fallback to all enabled partitions if table config returned no partitions + LOG.warn("There are no partitions to update according to table config. Falling back to enabled partition types in the write config."); + return getEnabledPartitionTypes().stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet()); + } + + @Override + public void buildMetadataPartitions(HoodieEngineContext engineContext, List indexPartitionInfos) { + if (indexPartitionInfos.isEmpty()) { + LOG.warn("No partition to index in the plan"); + return; + } + String indexUptoInstantTime = indexPartitionInfos.get(0).getIndexUptoInstant(); + List partitionTypes = new ArrayList<>(); + indexPartitionInfos.forEach(indexPartitionInfo -> { + String relativePartitionPath = indexPartitionInfo.getMetadataPartitionPath(); + LOG.info(String.format("Creating a new metadata index for partition '%s' under path %s upto instant %s", + relativePartitionPath, metadataWriteConfig.getBasePath(), indexUptoInstantTime)); + try { + // file group should have already been initialized while scheduling index for this partition + if (!dataMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(), relativePartitionPath))) { + throw new HoodieIndexException(String.format("File group not initialized for metadata partition: %s, indexUptoInstant: %s. Looks like index scheduling failed!", + relativePartitionPath, indexUptoInstantTime)); + } + } catch (IOException e) { + throw new HoodieIndexException(String.format("Unable to check whether file group is initialized for metadata partition: %s, indexUptoInstant: %s", + relativePartitionPath, indexUptoInstantTime)); + } + + // return early and populate enabledPartitionTypes correctly (check in initialCommit) + MetadataPartitionType partitionType = MetadataPartitionType.valueOf(relativePartitionPath.toUpperCase(Locale.ROOT)); + if (!enabledPartitionTypes.contains(partitionType)) { + throw new HoodieIndexException(String.format("Indexing for metadata partition: %s is not enabled", partitionType)); + } + partitionTypes.add(partitionType); + }); + // before initial commit update inflight indexes in table config + Set inflightIndexes = getInflightMetadataPartitions(dataMetaClient.getTableConfig()); + inflightIndexes.addAll(indexPartitionInfos.stream().map(HoodieIndexPartitionInfo::getMetadataPartitionPath).collect(Collectors.toSet())); + dataMetaClient.getTableConfig().setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT.key(), String.join(",", inflightIndexes)); + HoodieTableConfig.update(dataMetaClient.getFs(), new Path(dataMetaClient.getMetaPath()), dataMetaClient.getTableConfig().getProps()); + initialCommit(indexUptoInstantTime, partitionTypes); + } + + /** + * Update from {@code HoodieCommitMetadata}. + * + * @param commitMetadata {@code HoodieCommitMetadata} + * @param instantTime Timestamp at which the commit was performed + * @param isTableServiceAction {@code true} if commit metadata is pertaining to a table service. {@code false} otherwise. 
+ */ + @Override + public void update(HoodieCommitMetadata commitMetadata, String instantTime, boolean isTableServiceAction) { + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords( + engineContext, commitMetadata, instantTime, getRecordsGenerationParams()), !isTableServiceAction); + closeInternal(); + } + + /** + * Update from {@code HoodieCleanMetadata}. + * + * @param cleanMetadata {@code HoodieCleanMetadata} + * @param instantTime Timestamp at which the clean was completed + */ + @Override + public void update(HoodieCleanMetadata cleanMetadata, String instantTime) { + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, + cleanMetadata, getRecordsGenerationParams(), instantTime), false); + closeInternal(); + } + + /** + * Update from {@code HoodieRestoreMetadata}. + * + * @param restoreMetadata {@code HoodieRestoreMetadata} + * @param instantTime Timestamp at which the restore was performed + */ + @Override + public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) { + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, + metadataMetaClient.getActiveTimeline(), restoreMetadata, getRecordsGenerationParams(), instantTime, + metadata.getSyncedInstantTime()), false); + closeInternal(); + } + + /** + * Update from {@code HoodieRollbackMetadata}. + * + * @param rollbackMetadata {@code HoodieRollbackMetadata} + * @param instantTime Timestamp at which the rollback was performed + */ + @Override + public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) { + if (enabled && metadata != null) { + // Is this rollback of an instant that has been synced to the metadata table? + String rollbackInstant = rollbackMetadata.getCommitsRollback().get(0); + boolean wasSynced = metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, rollbackInstant)); + if (!wasSynced) { + // A compaction may have taken place on metadata table which would have included this instant being rolled back. + // Revisit this logic to relax the compaction fencing : https://issues.apache.org/jira/browse/HUDI-2458 + Option latestCompaction = metadata.getLatestCompactionTime(); + if (latestCompaction.isPresent()) { + wasSynced = HoodieTimeline.compareTimestamps(rollbackInstant, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCompaction.get()); + } + } + + Map> records = + HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, metadataMetaClient.getActiveTimeline(), + rollbackMetadata, getRecordsGenerationParams(), instantTime, + metadata.getSyncedInstantTime(), wasSynced); + commit(instantTime, records, false); + closeInternal(); + } + } + + @Override + public void close() throws Exception { + if (metadata != null) { + metadata.close(); + } + } + + /** + * Commit the {@code HoodieRecord}s to Metadata Table as a new delta-commit. + * + * @param instantTime - Action instant time for this commit + * @param partitionRecordsMap - Map of partition name to its records to commit + * @param canTriggerTableService true if table services can be scheduled and executed. false otherwise. + */ + protected abstract void commit( + String instantTime, Map> partitionRecordsMap, + boolean canTriggerTableService); + + /** + * Tag each record with the location in the given partition. + * The record is tagged with respective file slice's location based on its record key. 
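+ * The mapping is deterministic: the record key is mapped to a file group index (a hash of the key modulo the
+ * partition's file group count, see HoodieTableMetadataUtil#mapRecordKeyToFileGroupIndex in the body below), so a
+ * given key always routes to the same file group.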
+ */ + protected HoodieData prepRecords(Map> partitionRecordsMap) { + // The result set + HoodieData allPartitionRecords = engineContext.emptyHoodieData(); + + HoodieTableFileSystemView fsView = HoodieTableMetadataUtil.getFileSystemView(metadataMetaClient); + for (Map.Entry> entry : partitionRecordsMap.entrySet()) { + final String partitionName = entry.getKey().getPartitionPath(); + final int fileGroupCount = entry.getKey().getFileGroupCount(); + HoodieData records = entry.getValue(); + + List fileSlices = + HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, Option.ofNullable(fsView), partitionName); + if (fileSlices.isEmpty()) { + // scheduling of INDEX only initializes the file group and not add commit + // so if there are no committed file slices, look for inflight slices + fileSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlicesIncludingInflight(metadataMetaClient, Option.ofNullable(fsView), partitionName); + } + ValidationUtils.checkArgument(fileSlices.size() == fileGroupCount, + String.format("Invalid number of file groups for partition:%s, found=%d, required=%d", + partitionName, fileSlices.size(), fileGroupCount)); + + List finalFileSlices = fileSlices; + HoodieData rddSinglePartitionRecords = records.map(r -> { + FileSlice slice = finalFileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), + fileGroupCount)); + r.setCurrentLocation(new HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId())); + return r; + }); + + allPartitionRecords = allPartitionRecords.union(rddSinglePartitionRecords); + } + return allPartitionRecords; + } + + /** + * Perform a compaction on the Metadata Table. + * + * Cases to be handled: + * 1. We cannot perform compaction if there are previous inflight operations on the dataset. This is because + * a compacted metadata base file at time Tx should represent all the actions on the dataset till time Tx. + * + * 2. In multi-writer scenario, a parallel operation with a greater instantTime may have completed creating a + * deltacommit. + */ + protected void compactIfNecessary(BaseHoodieWriteClient writeClient, String instantTime) { + // finish off any pending compactions if any from previous attempt. + writeClient.runAnyPendingCompactions(); + + String latestDeltaCommitTime = metadataMetaClient.reloadActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant() + .get().getTimestamp(); + List pendingInstants = dataMetaClient.reloadActiveTimeline().filterInflightsAndRequested() + .findInstantsBefore(instantTime).getInstants().collect(Collectors.toList()); + + if (!pendingInstants.isEmpty()) { + LOG.info(String.format("Cannot compact metadata table as there are %d inflight instants before latest deltacommit %s: %s", + pendingInstants.size(), latestDeltaCommitTime, Arrays.toString(pendingInstants.toArray()))); + return; + } + + // Trigger compaction with suffixes based on the same instant time. This ensures that any future + // delta commits synced over will not have an instant time lesser than the last completed instant on the + // metadata table. 
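+ // For example (timestamps are illustrative): if the latest completed metadata delta commit is 20220101120000, the + // compaction below is scheduled at 20220101120000001, which sorts after that delta commit but still before a later + // data table instant such as 20220101120001.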
+ final String compactionInstantTime = latestDeltaCommitTime + "001"; + if (writeClient.scheduleCompactionAtInstant(compactionInstantTime, Option.empty())) { + writeClient.compact(compactionInstantTime); + } + } + + protected void cleanIfNecessary(BaseHoodieWriteClient writeClient, String instantTime) { + Option lastCompletedCompactionInstant = metadataMetaClient.reloadActiveTimeline() + .getCommitTimeline().filterCompletedInstants().lastInstant(); + if (lastCompletedCompactionInstant.isPresent() + && metadataMetaClient.getActiveTimeline().filterCompletedInstants() + .findInstantsAfter(lastCompletedCompactionInstant.get().getTimestamp()).countInstants() < 3) { + // Do not clean the log files immediately after compaction; give metadata table readers some buffer time, + // because a reader may have already opened its log file readers before the compaction completed. + // If cleaning deletes those log files while they are still being read, a FileNotFoundException (for LogFormatReader) + // or an NPE (for HFileReader) would be thrown. + + // A buffer of 3 completed instants is assumed to be enough for metadata table readers. + return; + } + // Trigger cleaning with suffixes based on the same instant time. This ensures that any future + // delta commits synced over will not have an instant time lesser than the last completed instant on the + // metadata table. + writeClient.clean(instantTime + "002"); + } + + /** + * This is invoked to initialize the metadata table for a dataset. + * The initial commit needs special handling because of its scale compared to regular commits: + * during cold startup, the list of files to be committed can be huge, so creating a HoodieCommitMetadata + * out of that many files and calling the existing update(HoodieCommitMetadata) function does not scale well. + * Hence, we have a special commit just for the initialization scenario.
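+ * <p> + * For example, when the FILES partition is being initialized, the initial commit writes one record per data partition + * (listing the files in that partition) plus a single record listing all partitions, which is why the code below + * checks for {@code partitions.size() + 1} records.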
+ */ + private void initialCommit(String createInstantTime, List partitionTypes) { + // List all partitions in the basePath of the containing dataset + LOG.info("Initializing metadata table by using file listings in " + dataWriteConfig.getBasePath()); + engineContext.setJobStatus(this.getClass().getSimpleName(), "Initializing metadata table by listing files and partitions: " + dataWriteConfig.getTableName()); + + Map> partitionToRecordsMap = new HashMap<>(); + + List partitionInfoList = listAllPartitions(dataMetaClient); + Map> partitionToFilesMap = partitionInfoList.stream() + .map(p -> { + String partitionName = HoodieTableMetadataUtil.getPartitionIdentifier(p.getRelativePath()); + return Pair.of(partitionName, p.getFileNameToSizeMap()); + }) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + + int totalDataFilesCount = partitionToFilesMap.values().stream().mapToInt(Map::size).sum(); + List partitions = new ArrayList<>(partitionToFilesMap.keySet()); + + if (partitionTypes.contains(MetadataPartitionType.FILES)) { + // Record which saves the list of all partitions + HoodieRecord allPartitionRecord = HoodieMetadataPayload.createPartitionListRecord(partitions); + HoodieData filesPartitionRecords = getFilesPartitionRecords(createInstantTime, partitionInfoList, allPartitionRecord); + ValidationUtils.checkState(filesPartitionRecords.count() == (partitions.size() + 1)); + partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecords); + } + + if (partitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS) && totalDataFilesCount > 0) { + final HoodieData recordsRDD = HoodieTableMetadataUtil.convertFilesToBloomFilterRecords( + engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams(), createInstantTime); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, recordsRDD); + } + + if (partitionTypes.contains(MetadataPartitionType.COLUMN_STATS) && totalDataFilesCount > 0) { + final HoodieData recordsRDD = HoodieTableMetadataUtil.convertFilesToColumnStatsRecords( + engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams()); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, recordsRDD); + } + + LOG.info("Committing " + partitions.size() + " partitions and " + totalDataFilesCount + " files to metadata"); + + commit(createInstantTime, partitionToRecordsMap, false); + } + + private HoodieData getFilesPartitionRecords(String createInstantTime, List partitionInfoList, HoodieRecord allPartitionRecord) { + HoodieData filesPartitionRecords = engineContext.parallelize(Arrays.asList(allPartitionRecord), 1); + if (partitionInfoList.isEmpty()) { + return filesPartitionRecords; + } + + HoodieData fileListRecords = engineContext.parallelize(partitionInfoList, partitionInfoList.size()).map(partitionInfo -> { + Map fileNameToSizeMap = partitionInfo.getFileNameToSizeMap(); + // filter for files that are part of the completed commits + Map validFileNameToSizeMap = fileNameToSizeMap.entrySet().stream().filter(fileSizePair -> { + String commitTime = FSUtils.getCommitTime(fileSizePair.getKey()); + return HoodieTimeline.compareTimestamps(commitTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, createInstantTime); + }).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + // Record which saves files within a partition + return HoodieMetadataPayload.createPartitionFilesRecord( + HoodieTableMetadataUtil.getPartitionIdentifier(partitionInfo.getRelativePath()), Option.of(validFileNameToSizeMap), 
Option.empty()); + }); + + return filesPartitionRecords.union(fileListRecords); + } + + protected void closeInternal() { + try { + close(); + } catch (Exception e) { + throw new HoodieException("Failed to close HoodieMetadata writer ", e); + } + } + + /** + * A class which represents a directory and the files and directories inside it. + *

+ * A {@code DirectoryInfo} object saves the name of the partition and only those properties of each file + * that are required for initializing the metadata table. Saving limited properties reduces the total memory footprint when + * a very large number of files are present in the dataset being initialized. + */ + static class DirectoryInfo implements Serializable { + // Relative path of the directory (relative to the base directory) + private final String relativePath; + // Map of filenames within this partition to their respective sizes + private final HashMap filenameToSizeMap; + // List of directories within this partition + private final List subDirectories = new ArrayList<>(); + // Is this a hoodie partition + private boolean isHoodiePartition = false; + + public DirectoryInfo(String relativePath, FileStatus[] fileStatus) { + this.relativePath = relativePath; + + // Pre-allocate with the maximum length possible + filenameToSizeMap = new HashMap<>(fileStatus.length); + + for (FileStatus status : fileStatus) { + if (status.isDirectory()) { + // Ignore .hoodie directory as there cannot be any partitions inside it + if (!status.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) { + this.subDirectories.add(status.getPath()); + } + } else if (status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) { + // Presence of partition meta file implies this is a HUDI partition + this.isHoodiePartition = true; + } else if (FSUtils.isDataFile(status.getPath())) { + // Regular HUDI data file (base file or log file) + filenameToSizeMap.put(status.getPath().getName(), status.getLen()); + } + } + } + + String getRelativePath() { + return relativePath; + } + + int getTotalFiles() { + return filenameToSizeMap.size(); + } + + boolean isHoodiePartition() { + return isHoodiePartition; + } + + List getSubDirectories() { + return subDirectories; + } + + // Returns a map of filenames mapped to their lengths + Map getFileNameToSizeMap() { + return filenameToSizeMap; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataKeyGenerator.java new file mode 100644 index 0000000000000..332be73b14f57 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataKeyGenerator.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.apache.hudi.metadata; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; + +/** + * Custom key generator for the Hoodie table metadata. The metadata table record payload + * has an internal schema with a known key field HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY. + * With or without the virtual keys, getting the key from the metadata table record is always + * via the above field and there is no real need for a key generator. But, when a write + * client is instantiated for the metadata table, when virtual keys are enabled, and when + * key generator class is not configured, the default SimpleKeyGenerator will be used. + * To avoid using any other key generators for the metadata table which rely on certain + * config properties, we need this custom key generator exclusively for the metadata table. + */ +public class HoodieTableMetadataKeyGenerator extends BaseKeyGenerator { + + public HoodieTableMetadataKeyGenerator(TypedProperties config) { + super(config); + } + + @Override + public String getRecordKey(GenericRecord record) { + return KeyGenUtils.getRecordKey(record, HoodieMetadataPayload.KEY_FIELD_NAME, isConsistentLogicalTimestampEnabled()); + } + + @Override + public String getPartitionPath(GenericRecord record) { + return ""; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java new file mode 100644 index 0000000000000..83fe186727b32 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieIndexPartitionInfo; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; + +/** + * Interface that supports updating metadata for a given table, as actions complete. + */ +public interface HoodieTableMetadataWriter extends Serializable, AutoCloseable { + + /** + * Builds the given metadata partitions to create index. 
+ * + * @param engineContext + * @param indexPartitionInfos - information about partitions to build such as partition type and base instant time + */ + void buildMetadataPartitions(HoodieEngineContext engineContext, List indexPartitionInfos); + + /** + * Initialize file groups for the given metadata partitions when indexing is requested. + * + * @param dataMetaClient - meta client for the data table + * @param metadataPartitions - metadata partitions for which file groups needs to be initialized + * @param instantTime - instant time of the index action + * @throws IOException + */ + void initializeMetadataPartitions(HoodieTableMetaClient dataMetaClient, List metadataPartitions, String instantTime) throws IOException; + + /** + * Drop the given metadata partitions. + * + * @param metadataPartitions + * @throws IOException + */ + void dropMetadataPartitions(List metadataPartitions) throws IOException; + + /** + * Update the metadata table due to a COMMIT operation. + * + * @param commitMetadata commit metadata of the operation of interest. + * @param instantTime instant time of the commit. + * @param isTableServiceAction true if caller is a table service. false otherwise. Only regular write operations can trigger metadata table services and this argument + * will assist in this. + */ + void update(HoodieCommitMetadata commitMetadata, String instantTime, boolean isTableServiceAction); + + /** + * Update the metadata table due to a CLEAN operation. + * + * @param cleanMetadata clean metadata of the operation of interest. + * @param instantTime instant time of the commit. + */ + void update(HoodieCleanMetadata cleanMetadata, String instantTime); + + /** + * Update the metadata table due to a RESTORE operation. + * + * @param restoreMetadata restore metadata of the operation of interest. + * @param instantTime instant time of the commit. + */ + void update(HoodieRestoreMetadata restoreMetadata, String instantTime); + + /** + * Update the metadata table due to a ROLLBACK operation. + * + * @param rollbackMetadata rollback metadata of the operation of interest. + * @param instantTime instant time of the commit. + */ + void update(HoodieRollbackMetadata rollbackMetadata, String instantTime); + + /** + * Deletes the given metadata partitions. This path reuses DELETE_PARTITION operation. + * + * @param instantTime - instant time when replacecommit corresponding to the drop will be recorded in the metadata timeline + * @param partitions - list of {@link MetadataPartitionType} to drop + */ + void deletePartitions(String instantTime, List partitions); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java index b65c4ade88a82..5664240c627a0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java @@ -18,15 +18,13 @@ package org.apache.hudi.metrics; -import java.io.Closeable; -import java.util.concurrent.TimeUnit; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import com.codahale.metrics.ConsoleReporter; import com.codahale.metrics.MetricFilter; import com.codahale.metrics.MetricRegistry; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.concurrent.TimeUnit; /** * Hudi Console metrics reporter. 
Reports the metrics by printing them to the stdout on the console. @@ -61,11 +59,6 @@ public void report() { } } - @Override - public Closeable getReporter() { - return consoleReporter; - } - @Override public void stop() { if (consoleReporter != null) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java index 292039b83ad41..69ef7917b284f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java @@ -20,8 +20,11 @@ import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import com.codahale.metrics.Counter; import com.codahale.metrics.Timer; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -37,9 +40,13 @@ public class HoodieMetrics { public String cleanTimerName = null; public String commitTimerName = null; public String deltaCommitTimerName = null; + public String replaceCommitTimerName = null; public String finalizeTimerName = null; public String compactionTimerName = null; public String indexTimerName = null; + private String conflictResolutionTimerName = null; + private String conflictResolutionSuccessCounterName = null; + private String conflictResolutionFailureCounterName = null; private HoodieWriteConfig config; private String tableName; private Timer rollbackTimer = null; @@ -48,20 +55,28 @@ public class HoodieMetrics { private Timer deltaCommitTimer = null; private Timer finalizeTimer = null; private Timer compactionTimer = null; + private Timer clusteringTimer = null; private Timer indexTimer = null; + private Timer conflictResolutionTimer = null; + private Counter conflictResolutionSuccessCounter = null; + private Counter conflictResolutionFailureCounter = null; - public HoodieMetrics(HoodieWriteConfig config, String tableName) { + public HoodieMetrics(HoodieWriteConfig config) { this.config = config; - this.tableName = tableName; + this.tableName = config.getTableName(); if (config.isMetricsOn()) { Metrics.init(config); this.rollbackTimerName = getMetricsName("timer", HoodieTimeline.ROLLBACK_ACTION); this.cleanTimerName = getMetricsName("timer", HoodieTimeline.CLEAN_ACTION); this.commitTimerName = getMetricsName("timer", HoodieTimeline.COMMIT_ACTION); this.deltaCommitTimerName = getMetricsName("timer", HoodieTimeline.DELTA_COMMIT_ACTION); + this.replaceCommitTimerName = getMetricsName("timer", HoodieTimeline.REPLACE_COMMIT_ACTION); this.finalizeTimerName = getMetricsName("timer", "finalize"); this.compactionTimerName = getMetricsName("timer", HoodieTimeline.COMPACTION_ACTION); this.indexTimerName = getMetricsName("timer", "index"); + this.conflictResolutionTimerName = getMetricsName("timer", "conflict_resolution"); + this.conflictResolutionSuccessCounterName = getMetricsName("counter", "conflict_resolution.success"); + this.conflictResolutionFailureCounterName = getMetricsName("counter", "conflict_resolution.failure"); } } @@ -83,6 +98,13 @@ public Timer.Context getCompactionCtx() { return compactionTimer == null ? 
null : compactionTimer.time(); } + public Timer.Context getClusteringCtx() { + if (config.isMetricsOn() && clusteringTimer == null) { + clusteringTimer = createTimer(replaceCommitTimerName); + } + return clusteringTimer == null ? null : clusteringTimer.time(); + } + public Timer.Context getCleanCtx() { if (config.isMetricsOn() && cleanTimer == null) { cleanTimer = createTimer(cleanTimerName); @@ -118,8 +140,36 @@ public Timer.Context getIndexCtx() { return indexTimer == null ? null : indexTimer.time(); } + public Timer.Context getConflictResolutionCtx() { + if (config.isLockingMetricsEnabled() && conflictResolutionTimer == null) { + conflictResolutionTimer = createTimer(conflictResolutionTimerName); + } + return conflictResolutionTimer == null ? null : conflictResolutionTimer.time(); + } + + public void updateMetricsForEmptyData(String actionType) { + if (!config.isMetricsOn() || !config.getMetricsReporterType().equals(MetricsReporterType.PROMETHEUS_PUSHGATEWAY)) { + // No-op if metrics are not of type PROMETHEUS_PUSHGATEWAY. + return; + } + Metrics.registerGauge(getMetricsName(actionType, "totalPartitionsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalFilesInsert"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalFilesUpdate"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalRecordsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalUpdateRecordsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalInsertRecordsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalBytesWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalScanTime"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalCreateTime"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalUpsertTime"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalCompactedRecordsUpdated"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalLogFilesCompacted"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalLogFilesSize"), 0); + } + public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata, String actionType) { + updateCommitTimingMetrics(commitEpochTimeInMs, durationInMs, metadata, actionType); if (config.isMetricsOn()) { long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten(); long totalFilesInsert = metadata.fetchTotalFilesInsert(); @@ -134,7 +184,6 @@ public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, Hoo long totalCompactedRecordsUpdated = metadata.getTotalCompactedRecordsUpdated(); long totalLogFilesCompacted = metadata.getTotalLogFilesCompacted(); long totalLogFilesSize = metadata.getTotalLogFilesSize(); - Metrics.registerGauge(getMetricsName(actionType, "duration"), durationInMs); Metrics.registerGauge(getMetricsName(actionType, "totalPartitionsWritten"), totalPartitionsWritten); Metrics.registerGauge(getMetricsName(actionType, "totalFilesInsert"), totalFilesInsert); Metrics.registerGauge(getMetricsName(actionType, "totalFilesUpdate"), totalFilesUpdate); @@ -142,7 +191,6 @@ public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, Hoo Metrics.registerGauge(getMetricsName(actionType, "totalUpdateRecordsWritten"), totalUpdateRecordsWritten); Metrics.registerGauge(getMetricsName(actionType, "totalInsertRecordsWritten"), totalInsertRecordsWritten); Metrics.registerGauge(getMetricsName(actionType, "totalBytesWritten"), totalBytesWritten); - 
Metrics.registerGauge(getMetricsName(actionType, "commitTime"), commitEpochTimeInMs); Metrics.registerGauge(getMetricsName(actionType, "totalScanTime"), totalTimeTakenByScanner); Metrics.registerGauge(getMetricsName(actionType, "totalCreateTime"), totalTimeTakenForInsert); Metrics.registerGauge(getMetricsName(actionType, "totalUpsertTime"), totalTimeTakenForUpsert); @@ -152,6 +200,23 @@ public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, Hoo } } + private void updateCommitTimingMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata, + String actionType) { + if (config.isMetricsOn()) { + Pair, Option> eventTimePairMinMax = metadata.getMinAndMaxEventTime(); + if (eventTimePairMinMax.getLeft().isPresent()) { + long commitLatencyInMs = commitEpochTimeInMs + durationInMs - eventTimePairMinMax.getLeft().get(); + Metrics.registerGauge(getMetricsName(actionType, "commitLatencyInMs"), commitLatencyInMs); + } + if (eventTimePairMinMax.getRight().isPresent()) { + long commitFreshnessInMs = commitEpochTimeInMs + durationInMs - eventTimePairMinMax.getRight().get(); + Metrics.registerGauge(getMetricsName(actionType, "commitFreshnessInMs"), commitFreshnessInMs); + } + Metrics.registerGauge(getMetricsName(actionType, "commitTime"), commitEpochTimeInMs); + Metrics.registerGauge(getMetricsName(actionType, "duration"), durationInMs); + } + } + public void updateRollbackMetrics(long durationInMs, long numFilesDeleted) { if (config.isMetricsOn()) { LOG.info( @@ -187,7 +252,7 @@ public void updateIndexMetrics(final String action, final long durationInMs) { } String getMetricsName(String action, String metric) { - return config == null ? null : String.format("%s.%s.%s", tableName, action, metric); + return config == null ? null : String.format("%s.%s.%s", config.getMetricReporterMetricsNamePrefix(), action, metric); } /** @@ -196,4 +261,27 @@ String getMetricsName(String action, String metric) { public long getDurationInMs(long ctxDuration) { return ctxDuration / 1000000; } + + public void emitConflictResolutionSuccessful() { + if (config.isLockingMetricsEnabled()) { + LOG.info("Sending conflict resolution success metric"); + conflictResolutionSuccessCounter = getCounter(conflictResolutionSuccessCounter, conflictResolutionSuccessCounterName); + conflictResolutionSuccessCounter.inc(); + } + } + + public void emitConflictResolutionFailed() { + if (config.isLockingMetricsEnabled()) { + LOG.info("Sending conflict resolution failure metric"); + conflictResolutionFailureCounter = getCounter(conflictResolutionFailureCounter, conflictResolutionFailureCounterName); + conflictResolutionFailureCounter.inc(); + } + } + + private Counter getCounter(Counter counter, String name) { + if (counter == null) { + return Metrics.getInstance().getRegistry().counter(name); + } + return counter; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java index a145024574d76..96439c3b31075 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java @@ -18,8 +18,6 @@ package org.apache.hudi.metrics; -import java.io.Closeable; - /** * Used for testing. 
*/ @@ -31,11 +29,6 @@ public void start() {} @Override public void report() {} - @Override - public Closeable getReporter() { - return null; - } - @Override public void stop() { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java index 309981a9d85cf..a909f62355bed 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java @@ -26,12 +26,10 @@ import javax.management.MBeanServer; -import java.io.Closeable; import java.lang.management.ManagementFactory; import java.util.Objects; import java.util.stream.IntStream; - /** * Implementation of Jmx reporter, which used to report jmx metric. */ @@ -86,11 +84,6 @@ public void start() { public void report() { } - @Override - public Closeable getReporter() { - return jmxReporterServer.getReporter(); - } - @Override public void stop() { Objects.requireNonNull(jmxReporterServer, "jmxReporterServer is not running."); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java index 5667a66a54934..8f3e49748121b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java @@ -27,7 +27,6 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.Closeable; import java.util.Map; /** @@ -46,25 +45,22 @@ public class Metrics { private Metrics(HoodieWriteConfig metricConfig) { registry = new MetricRegistry(); - commonMetricPrefix = metricConfig.getTableName(); + commonMetricPrefix = metricConfig.getMetricReporterMetricsNamePrefix(); reporter = MetricsReporterFactory.createReporter(metricConfig, registry); if (reporter == null) { throw new RuntimeException("Cannot initialize Reporter."); } reporter.start(); - Runtime.getRuntime().addShutdownHook(new Thread(() -> { - reportAndCloseReporter(); - })); + Runtime.getRuntime().addShutdownHook(new Thread(Metrics::shutdown)); } - private void reportAndCloseReporter() { + private void reportAndStopReporter() { try { registerHoodieCommonMetrics(); reporter.report(); - if (getReporter() != null) { - getReporter().close(); - } + LOG.info("Stopping the metrics reporter..."); + reporter.stop(); } catch (Exception e) { LOG.warn("Error while closing reporter", e); } @@ -106,7 +102,7 @@ public static synchronized void shutdown() { if (!initialized) { return; } - instance.reportAndCloseReporter(); + instance.reportAndStopReporter(); initialized = false; } @@ -138,7 +134,7 @@ public MetricRegistry getRegistry() { return registry; } - public Closeable getReporter() { - return reporter.getReporter(); + public static boolean isInitialized() { + return initialized; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java index 9855ac0b0272d..34221f2c2f832 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsGraphiteReporter.java @@ -27,7 +27,6 @@ import org.apache.log4j.LogManager; import 
org.apache.log4j.Logger; -import java.io.Closeable; import java.net.InetSocketAddress; import java.util.concurrent.TimeUnit; @@ -42,6 +41,7 @@ public class MetricsGraphiteReporter extends MetricsReporter { private final HoodieWriteConfig config; private String serverHost; private int serverPort; + private final int periodSeconds; public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) { this.registry = registry; @@ -56,12 +56,13 @@ public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry } this.graphiteReporter = createGraphiteReport(); + this.periodSeconds = config.getGraphiteReportPeriodSeconds(); } @Override public void start() { if (graphiteReporter != null) { - graphiteReporter.start(30, TimeUnit.SECONDS); + graphiteReporter.start(periodSeconds, TimeUnit.SECONDS); } else { LOG.error("Cannot start as the graphiteReporter is null."); } @@ -76,11 +77,6 @@ public void report() { } } - @Override - public Closeable getReporter() { - return graphiteReporter; - } - private GraphiteReporter createGraphiteReport() { Graphite graphite = new Graphite(new InetSocketAddress(serverHost, serverPort)); String reporterPrefix = config.getGraphiteMetricPrefix(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporter.java index 773bb3bfd8141..64a0ae561383f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporter.java @@ -18,8 +18,6 @@ package org.apache.hudi.metrics; -import java.io.Closeable; - /** * Interface for implementing a Reporter. */ @@ -35,8 +33,6 @@ public abstract class MetricsReporter { */ public abstract void report(); - public abstract Closeable getReporter(); - /** * Stop this reporter. Should be used to stop channels, streams and release resources. 
*/ diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java index 66cdeebe97f08..d81e337b28d7a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java @@ -22,12 +22,13 @@ import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.metrics.cloudwatch.CloudWatchMetricsReporter; +import org.apache.hudi.metrics.custom.CustomizableMetricsReporter; import org.apache.hudi.metrics.datadog.DatadogMetricsReporter; - -import com.codahale.metrics.MetricRegistry; import org.apache.hudi.metrics.prometheus.PrometheusReporter; import org.apache.hudi.metrics.prometheus.PushGatewayMetricsReporter; -import org.apache.hudi.metrics.userdefined.AbstractUserDefinedMetricsReporter; + +import com.codahale.metrics.MetricRegistry; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -41,20 +42,20 @@ public class MetricsReporterFactory { private static final Logger LOG = LogManager.getLogger(MetricsReporterFactory.class); public static MetricsReporter createReporter(HoodieWriteConfig config, MetricRegistry registry) { - MetricsReporterType type = config.getMetricsReporterType(); - MetricsReporter reporter = null; + String reporterClassName = config.getMetricReporterClassName(); - if (!StringUtils.isNullOrEmpty(config.getMetricReporterClassName())) { - Object instance = ReflectionUtils - .loadClass(config.getMetricReporterClassName(), - new Class[] {Properties.class, MetricRegistry.class}, config.getProps(), registry); - if (!(instance instanceof AbstractUserDefinedMetricsReporter)) { + if (!StringUtils.isNullOrEmpty(reporterClassName)) { + Object instance = ReflectionUtils.loadClass( + reporterClassName, new Class[] {Properties.class, MetricRegistry.class}, config.getProps(), registry); + if (!(instance instanceof CustomizableMetricsReporter)) { throw new HoodieException(config.getMetricReporterClassName() - + " is not a subclass of AbstractUserDefinedMetricsReporter"); + + " is not a subclass of CustomizableMetricsReporter"); } return (MetricsReporter) instance; } + MetricsReporterType type = config.getMetricsReporterType(); + MetricsReporter reporter = null; switch (type) { case GRAPHITE: reporter = new MetricsGraphiteReporter(config, registry); @@ -77,6 +78,9 @@ public static MetricsReporter createReporter(HoodieWriteConfig config, MetricReg case CONSOLE: reporter = new ConsoleMetricsReporter(registry); break; + case CLOUDWATCH: + reporter = new CloudWatchMetricsReporter(config, registry); + break; default: LOG.error("Reporter type[" + type + "] is not supported."); break; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java index 36b15a89ac883..3c8600159287c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java @@ -22,5 +22,5 @@ * Types of the reporter supported, hudi also supports user defined reporter. 
*/ public enum MetricsReporterType { - GRAPHITE, INMEMORY, JMX, DATADOG, CONSOLE, PROMETHEUS_PUSHGATEWAY, PROMETHEUS + GRAPHITE, INMEMORY, JMX, DATADOG, CONSOLE, PROMETHEUS_PUSHGATEWAY, PROMETHEUS, CLOUDWATCH } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/cloudwatch/CloudWatchMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/cloudwatch/CloudWatchMetricsReporter.java new file mode 100644 index 0000000000000..a0eb01abd0c26 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/cloudwatch/CloudWatchMetricsReporter.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics.cloudwatch; + +import org.apache.hudi.aws.cloudwatch.CloudWatchReporter; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metrics.MetricsReporter; + +import com.codahale.metrics.MetricRegistry; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.concurrent.TimeUnit; + +/** + * Hudi Amazon CloudWatch metrics reporter. Responsible for reading Hoodie metrics configurations and hooking up with + * {@link org.apache.hudi.metrics.Metrics}. Internally delegates reporting tasks to {@link CloudWatchReporter}. 
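+ * <p> + * A rough usage sketch (the config keys shown are assumed from the standard Hudi metrics configs and may differ): + * <pre>{@code + * hoodie.metrics.on=true + * hoodie.metrics.reporter.type=CLOUDWATCH + * }</pre>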
+ */ +public class CloudWatchMetricsReporter extends MetricsReporter { + + private static final Logger LOG = LogManager.getLogger(CloudWatchMetricsReporter.class); + + private final MetricRegistry registry; + private final HoodieWriteConfig config; + private final CloudWatchReporter reporter; + + public CloudWatchMetricsReporter(HoodieWriteConfig config, MetricRegistry registry) { + this.config = config; + this.registry = registry; + this.reporter = createCloudWatchReporter(); + } + + CloudWatchMetricsReporter(HoodieWriteConfig config, MetricRegistry registry, CloudWatchReporter reporter) { + this.config = config; + this.registry = registry; + this.reporter = reporter; + } + + private CloudWatchReporter createCloudWatchReporter() { + return CloudWatchReporter.forRegistry(registry) + .prefixedWith(config.getCloudWatchMetricPrefix()) + .namespace(config.getCloudWatchMetricNamespace()) + .maxDatumsPerRequest(config.getCloudWatchMaxDatumsPerRequest()) + .build(config.getProps()); + } + + @Override + public void start() { + LOG.info("Starting CloudWatch Metrics Reporter."); + reporter.start(config.getCloudWatchReportPeriodSeconds(), TimeUnit.SECONDS); + } + + @Override + public void report() { + reporter.report(); + } + + @Override + public void stop() { + LOG.info("Stopping CloudWatch Metrics Reporter."); + reporter.stop(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/custom/CustomizableMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/custom/CustomizableMetricsReporter.java new file mode 100644 index 0000000000000..13574b1e15693 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/custom/CustomizableMetricsReporter.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics.custom; + +import org.apache.hudi.metrics.MetricsReporter; + +import com.codahale.metrics.MetricRegistry; + +import java.util.Properties; + +/** + * Extensible metrics reporter for custom implementation. 
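+ * <p> + * A minimal sketch of a subclass (the class name and stdout behavior are hypothetical; it assumes the base + * {@code MetricsReporter} declares {@code start()}, {@code report()} and {@code stop()}): + * <pre>{@code + * public class StdoutMetricsReporter extends CustomizableMetricsReporter { + *   public StdoutMetricsReporter(Properties props, MetricRegistry registry) { + *     super(props, registry); + *   } + *   @Override public void start() {} + *   @Override public void report() { + *     // print every registered gauge via the registry exposed by CustomizableMetricsReporter + *     getRegistry().getGauges().forEach((name, g) -> System.out.println(name + "=" + g.getValue())); + *   } + *   @Override public void stop() {} + * } + * }</pre> + * Such a reporter can then be plugged in through the custom metrics reporter class config handled by + * {@code MetricsReporterFactory}.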
+ */ +public abstract class CustomizableMetricsReporter extends MetricsReporter { + private Properties props; + private MetricRegistry registry; + + public CustomizableMetricsReporter(Properties props, MetricRegistry registry) { + this.props = props; + this.registry = registry; + } + + public Properties getProps() { + return props; + } + + public MetricRegistry getRegistry() { + return registry; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/datadog/DatadogMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/datadog/DatadogMetricsReporter.java index 0830ef4c5bad1..fdbd0cc7e4097 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/datadog/DatadogMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/datadog/DatadogMetricsReporter.java @@ -28,7 +28,6 @@ import com.codahale.metrics.MetricFilter; import com.codahale.metrics.MetricRegistry; -import java.io.Closeable; import java.util.List; import java.util.concurrent.TimeUnit; @@ -81,11 +80,6 @@ public void report() { reporter.report(); } - @Override - public Closeable getReporter() { - return reporter; - } - @Override public void stop() { reporter.stop(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PrometheusReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PrometheusReporter.java index 81c89b6e1ccc4..1b53e9ea89452 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PrometheusReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PrometheusReporter.java @@ -18,17 +18,17 @@ package org.apache.hudi.metrics.prometheus; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.metrics.MetricsReporter; + import com.codahale.metrics.MetricRegistry; import io.prometheus.client.CollectorRegistry; import io.prometheus.client.dropwizard.DropwizardExports; import io.prometheus.client.exporter.HTTPServer; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.metrics.MetricsReporter; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.Closeable; import java.net.InetSocketAddress; /** @@ -65,11 +65,6 @@ public void start() { public void report() { } - @Override - public Closeable getReporter() { - return null; - } - @Override public void stop() { collectorRegistry.unregister(metricExports); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayMetricsReporter.java index 17c4d7b92392c..e2bfa6a67b026 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayMetricsReporter.java @@ -18,12 +18,12 @@ package org.apache.hudi.metrics.prometheus; -import com.codahale.metrics.MetricFilter; -import com.codahale.metrics.MetricRegistry; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.metrics.MetricsReporter; -import java.io.Closeable; +import com.codahale.metrics.MetricFilter; +import com.codahale.metrics.MetricRegistry; + import 
java.util.Random; import java.util.concurrent.TimeUnit; @@ -50,7 +50,8 @@ public PushGatewayMetricsReporter(HoodieWriteConfig config, MetricRegistry regis TimeUnit.SECONDS, TimeUnit.SECONDS, getJobName(), - serverHost + ":" + serverPort, + serverHost, + serverPort, deleteShutdown); } @@ -64,11 +65,6 @@ public void report() { pushGatewayReporter.report(null, null, null, null, null); } - @Override - public Closeable getReporter() { - return pushGatewayReporter; - } - @Override public void stop() { pushGatewayReporter.stop(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayReporter.java index 3b1988259a15b..5f82b6679ffd0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/prometheus/PushGatewayReporter.java @@ -29,6 +29,8 @@ import io.prometheus.client.CollectorRegistry; import io.prometheus.client.dropwizard.DropwizardExports; import io.prometheus.client.exporter.PushGateway; +import java.net.MalformedURLException; +import java.net.URL; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -51,17 +53,30 @@ protected PushGatewayReporter(MetricRegistry registry, TimeUnit rateUnit, TimeUnit durationUnit, String jobName, - String address, + String serverHost, + int serverPort, boolean deleteShutdown) { super(registry, "hudi-push-gateway-reporter", filter, rateUnit, durationUnit); this.jobName = jobName; this.deleteShutdown = deleteShutdown; collectorRegistry = new CollectorRegistry(); metricExports = new DropwizardExports(registry); - pushGateway = new PushGateway(address); + pushGateway = createPushGatewayClient(serverHost, serverPort); metricExports.register(collectorRegistry); } + private PushGateway createPushGatewayClient(String serverHost, int serverPort) { + if (serverPort == 443) { + try { + return new PushGateway(new URL("https://" + serverHost + ":" + serverPort)); + } catch (MalformedURLException e) { + e.printStackTrace(); + throw new IllegalArgumentException("Malformed pushgateway host: " + serverHost); + } + } + return new PushGateway(serverHost + ":" + serverPort); + } + @Override public void report(SortedMap gauges, SortedMap counters, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java index 0a0d7bbe123a6..715b9564c5f70 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java @@ -7,38 +7,31 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ package org.apache.hudi.metrics.userdefined; +import org.apache.hudi.metrics.custom.CustomizableMetricsReporter; + import com.codahale.metrics.MetricRegistry; -import org.apache.hudi.metrics.MetricsReporter; + import java.util.Properties; /** - * Abstract class of user defined metrics reporter. + * @deprecated Extend {@link CustomizableMetricsReporter} instead. */ -public abstract class AbstractUserDefinedMetricsReporter extends MetricsReporter { - private Properties props; - private MetricRegistry registry; +@Deprecated +public abstract class AbstractUserDefinedMetricsReporter extends CustomizableMetricsReporter { public AbstractUserDefinedMetricsReporter(Properties props, MetricRegistry registry) { - this.props = props; - this.registry = registry; - } - - public Properties getProps() { - return props; - } - - public MetricRegistry getRegistry() { - return registry; + super(props, registry); } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/HilbertCurveUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/HilbertCurveUtils.java new file mode 100644 index 0000000000000..0f216abeee748 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/HilbertCurveUtils.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.optimize; + +import org.davidmoten.hilbert.HilbertCurve; + +import java.math.BigInteger; + +/** + * Utils for Hilbert Curve. 
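+ * <p> + * For example (values chosen for illustration), {@code paddingToNByte(new byte[] {0x5}, 4)} left-pads with zeros and + * returns {@code [0x0, 0x0, 0x0, 0x5]}, so Hilbert index bytes of different magnitudes compare correctly as + * fixed-width byte arrays.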
+ */ +public class HilbertCurveUtils { + public static byte[] indexBytes(HilbertCurve hilbertCurve, long[] points, int paddingNum) { + BigInteger index = hilbertCurve.index(points); + return paddingToNByte(index.toByteArray(), paddingNum); + } + + public static byte[] paddingToNByte(byte[] a, int paddingNum) { + if (a.length == paddingNum) { + return a; + } + if (a.length > paddingNum) { + byte[] result = new byte[paddingNum]; + System.arraycopy(a, 0, result, 0, paddingNum); + return result; + } + int paddingSize = paddingNum - a.length; + byte[] result = new byte[paddingNum]; + for (int i = 0; i < paddingSize; i++) { + result[i] = 0; + } + System.arraycopy(a, 0, result, paddingSize, a.length); + return result; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java index fd1558a8232bb..844e9f4f0f8ac 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java @@ -18,24 +18,58 @@ package org.apache.hudi.table; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode; +import org.apache.hudi.io.WriteHandleFactory; + +import java.io.Serializable; + /** - * Repartition input records into at least expected number of output spark partitions. It should give below guarantees - - * Output spark partition will have records from only one hoodie partition. - Average records per output spark - * partitions should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid possible skews. + * Partitions the input records for bulk insert operation. + *

+ * The actual implementation of {@link BulkInsertPartitioner} is determined by the bulk insert + * sort mode, {@link BulkInsertSortMode}, specified by + * {@code HoodieWriteConfig.BULK_INSERT_SORT_MODE} (`hoodie.bulkinsert.sort.mode`). */ -public interface BulkInsertPartitioner { +public interface BulkInsertPartitioner extends Serializable { /** - * Repartitions the input records into at least expected number of output spark partitions. + * Partitions the input records based on the number of output partitions as a hint. + *

+ * Note that the number of output partitions may or may not be enforced, depending on the + * specific implementation. * - * @param records Input Hoodie records - * @param outputSparkPartitions Expected number of output partitions + * @param records Input Hoodie records. + * @param outputPartitions Expected number of output partitions as a hint. * @return */ - I repartitionRecords(I records, int outputSparkPartitions); + I repartitionRecords(I records, int outputPartitions); /** * @return {@code true} if the records within a partition are sorted; {@code false} otherwise. */ boolean arePartitionRecordsSorted(); + + /** + * Returns the file group id prefix for the given data partition. + * By default, returns a new file group id prefix, so that incoming records are routed to a fresh file group. + * + * @param partitionId data partition + * @return + */ + default String getFileIdPfx(int partitionId) { + return FSUtils.createNewFileIdPfx(); + } + + /** + * Returns the write handle factory for the given partition. + * + * @param partitionId data partition + * @return + */ + default Option getWriteHandleFactory(int partitionId) { + return Option.empty(); + } + } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/FileIdPrefixProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/FileIdPrefixProvider.java new file mode 100644 index 0000000000000..0fc0823184ea0 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/FileIdPrefixProvider.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.common.config.TypedProperties; + +public abstract class FileIdPrefixProvider { + + private final TypedProperties props; + + public FileIdPrefixProvider(TypedProperties props) { + this.props = props; + } + + public TypedProperties getProps() { + return props; + } + + public abstract String createFilePrefix(String partitionPath); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieCompactionHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieCompactionHandler.java new file mode 100644 index 0000000000000..eeb287abd543c --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieCompactionHandler.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * Interface for insert and update operations in compaction. + * + * @param HoodieRecordPayload type. + */ +public interface HoodieCompactionHandler { + Iterator> handleUpdate(String instantTime, String partitionPath, String fileId, + Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException; + + Iterator> handleInsert(String instantTime, String partitionPath, String fileId, + Map> recordMap); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index a33ad99e03a33..35eb0edfbfc61 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -18,19 +18,24 @@ package org.apache.hudi.table; -import org.apache.avro.Schema; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieIndexCommitMetadata; +import org.apache.hudi.avro.model.HoodieIndexPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; -import org.apache.hudi.client.common.TaskContextSupplier; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.HoodiePendingRollbackInfo; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.ConsistencyGuard; import org.apache.hudi.common.fs.ConsistencyGuard.FileVisibility; import org.apache.hudi.common.fs.ConsistencyGuardConfig; @@ -39,29 +44,48 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import 
org.apache.hudi.common.table.TableSchemaResolver; -import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.table.view.SyncableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.Functions; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.table.storage.HoodieLayoutFactory; +import org.apache.hudi.table.storage.HoodieStorageLayout; + +import org.apache.avro.Schema; +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -72,9 +96,16 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.TimeoutException; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.common.table.HoodieTableConfig.TABLE_METADATA_PARTITIONS; +import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.deleteMetadataPartition; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.deleteMetadataTable; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists; + /** * Abstract implementation of a HoodieTable. 
* @@ -89,32 +120,49 @@ public abstract class HoodieTable implem protected final HoodieWriteConfig config; protected final HoodieTableMetaClient metaClient; - protected final HoodieIndex index; - + protected final HoodieIndex index; private SerializableConfiguration hadoopConfiguration; - private transient FileSystemViewManager viewManager; - protected final TaskContextSupplier taskContextSupplier; + private final HoodieTableMetadata metadata; + private final HoodieStorageLayout storageLayout; + + private transient FileSystemViewManager viewManager; + protected final transient HoodieEngineContext context; protected HoodieTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { this.config = config; this.hadoopConfiguration = context.getHadoopConf(); - this.viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration, - config.getViewStorageConfig()); + this.context = context; + + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().fromProperties(config.getMetadataConfig().getProps()) + .build(); + this.metadata = HoodieTableMetadata.create(context, metadataConfig, config.getBasePath(), + FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue()); + + this.viewManager = FileSystemViewManager.createViewManager(context, config.getMetadataConfig(), config.getViewStorageConfig(), config.getCommonConfig(), () -> metadata); this.metaClient = metaClient; this.index = getIndex(config, context); + this.storageLayout = getStorageLayout(config); this.taskContextSupplier = context.getTaskContextSupplier(); } - protected abstract HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context); + protected abstract HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context); + + protected HoodieStorageLayout getStorageLayout(HoodieWriteConfig config) { + return HoodieLayoutFactory.createLayout(config); + } private synchronized FileSystemViewManager getViewManager() { if (null == viewManager) { - viewManager = FileSystemViewManager.createViewManager(hadoopConfiguration, config.getViewStorageConfig()); + viewManager = FileSystemViewManager.createViewManager(getContext(), config.getMetadataConfig(), config.getViewStorageConfig(), config.getCommonConfig(), () -> metadata); } return viewManager; } + public HoodieTableMetadata getMetadata() { + return metadata; + } + /** * Upsert a batch of new records into Hoodie table at the supplied instantTime. * @param context HoodieEngineContext @@ -144,7 +192,7 @@ public abstract HoodieWriteMetadata insert(HoodieEngineContext context, Strin * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata bulkInsert(HoodieEngineContext context, String instantTime, - I records, Option> bulkInsertPartitioner); + I records, Option bulkInsertPartitioner); /** * Deletes a list of {@link HoodieKey}s from the Hoodie table, at the supplied instantTime {@link HoodieKey}s will be @@ -157,6 +205,15 @@ public abstract HoodieWriteMetadata bulkInsert(HoodieEngineContext context, S */ public abstract HoodieWriteMetadata delete(HoodieEngineContext context, String instantTime, K keys); + /** + * Deletes all data of partitions. 
+ * @param context HoodieEngineContext + * @param instantTime Instant Time for the action + * @param partitions {@link List} of partition to be deleted + * @return HoodieWriteMetadata + */ + public abstract HoodieWriteMetadata deletePartitions(HoodieEngineContext context, String instantTime, List partitions); + /** * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. *

@@ -192,7 +249,7 @@ public abstract HoodieWriteMetadata insertPrepped(HoodieEngineContext context * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata bulkInsertPrepped(HoodieEngineContext context, String instantTime, - I preppedRecords, Option> bulkInsertPartitioner); + I preppedRecords, Option bulkInsertPartitioner); /** * Replaces all the existing records and inserts the specified new records into Hoodie table at the supplied instantTime, @@ -224,6 +281,13 @@ public HoodieTableMetaClient getMetaClient() { return metaClient; } + /** + * @return if the table is physically partitioned, based on the partition fields stored in the table config. + */ + public boolean isPartitioned() { + return getMetaClient().getTableConfig().isTablePartitioned(); + } + public Configuration getHadoopConf() { return metaClient.getHadoopConf(); } @@ -291,6 +355,20 @@ public HoodieTimeline getCleanTimeline() { return getActiveTimeline().getCleanerTimeline(); } + /** + * Get rollback timeline. + */ + public HoodieTimeline getRollbackTimeline() { + return getActiveTimeline().getRollbackTimeline(); + } + + /** + * Get restore timeline. + */ + public HoodieTimeline getRestoreTimeline() { + return getActiveTimeline().getRestoreTimeline(); + } + /** * Get only the completed (no-inflights) savepoint timeline. */ @@ -299,10 +377,10 @@ public HoodieTimeline getCompletedSavepointTimeline() { } /** - * Get the list of savepoints in this table. + * Get the list of savepoint timestamps in this table. */ - public List getSavepoints() { - return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + public Set getSavepointTimestamps() { + return getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); } public HoodieActiveTimeline getActiveTimeline() { @@ -312,10 +390,14 @@ public HoodieActiveTimeline getActiveTimeline() { /** * Return the index. */ - public HoodieIndex getIndex() { + public HoodieIndex getIndex() { return index; } + public HoodieStorageLayout getStorageLayout() { + return storageLayout; + } + /** * Schedule compaction for the instant time. * @@ -331,11 +413,31 @@ public abstract Option scheduleCompaction(HoodieEngineCont /** * Run Compaction on the table. Compaction arranges the data so that it is optimized for data access. * - * @param context HoodieEngineContext + * @param context HoodieEngineContext * @param compactionInstantTime Instant Time */ public abstract HoodieWriteMetadata compact(HoodieEngineContext context, - String compactionInstantTime); + String compactionInstantTime); + + /** + * Schedule clustering for the instant time. + * + * @param context HoodieEngineContext + * @param instantTime Instant Time for scheduling clustering + * @param extraMetadata additional metadata to write into plan + * @return HoodieClusteringPlan, if there is enough data for clustering. + */ + public abstract Option scheduleClustering(HoodieEngineContext context, + String instantTime, + Option> extraMetadata); + + /** + * Execute Clustering on the table. Clustering re-arranges the data so that it is optimized for data access. + * + * @param context HoodieEngineContext + * @param clusteringInstantTime Instant Time + */ + public abstract HoodieWriteMetadata cluster(HoodieEngineContext context, String clusteringInstantTime); /** * Perform metadata/full bootstrap of a Hudi table. 
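The clustering hooks added to `HoodieTable` in the hunk above follow the same schedule-then-execute split as compaction. A minimal usage sketch, with raw generics for brevity and assuming `scheduleClustering` returns `Option<HoodieClusteringPlan>` as declared (the concrete behaviour lives in the engine-specific table implementations):

```java
import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;

public class ClusteringSketch {
  // Hypothetical helper, not part of the change: schedule a clustering plan and,
  // only if one was produced, execute it under the same instant time.
  @SuppressWarnings({"rawtypes", "unchecked"})
  public static void clusterIfPossible(HoodieTable table, HoodieEngineContext context) {
    String clusteringInstant = HoodieActiveTimeline.createNewInstantTime();
    Option<HoodieClusteringPlan> plan =
        table.scheduleClustering(context, clusteringInstant, Option.empty());
    if (plan.isPresent()) {
      // Re-arranges the data for better access patterns; returns the usual write metadata.
      HoodieWriteMetadata result = table.cluster(context, clusteringInstant);
    }
  }
}
```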
@@ -351,12 +453,38 @@ public abstract HoodieWriteMetadata compact(HoodieEngineContext context, */ public abstract void rollbackBootstrap(HoodieEngineContext context, String instantTime); + /** + * Schedule cleaning for the instant time. + * + * @param context HoodieEngineContext + * @param instantTime Instant Time for scheduling cleaning + * @param extraMetadata additional metadata to write into plan + * @return HoodieCleanerPlan, if there is anything to clean. + */ + public abstract Option scheduleCleaning(HoodieEngineContext context, + String instantTime, + Option> extraMetadata); + /** * Executes a new clean action. * * @return information on cleaned file slices */ - public abstract HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime); + public abstract HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime, boolean skipLocking); + + /** + * Schedule rollback for the instant time. + * + * @param context HoodieEngineContext + * @param instantTime Instant Time for scheduling rollback + * @param instantToRollback instant to be rolled back + * @param shouldRollbackUsingMarkers uses marker based rollback strategy when set to true. uses list based rollback when false. + * @return HoodieRollbackPlan containing info on rollback. + */ + public abstract Option scheduleRollback(HoodieEngineContext context, + String instantTime, + HoodieInstant instantToRollback, + boolean skipTimelinePublish, boolean shouldRollbackUsingMarkers); /** * Rollback the (inflight/committed) record changes with the given commit time. @@ -371,7 +499,27 @@ public abstract HoodieWriteMetadata compact(HoodieEngineContext context, public abstract HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, - boolean deleteInstants); + boolean deleteInstants, + boolean skipLocking); + + /** + * Schedules Indexing for the table to the given instant. + * + * @param context HoodieEngineContext + * @param indexInstantTime Instant time for scheduling index action. + * @param partitionsToIndex List of {@link MetadataPartitionType} that should be indexed. + * @return HoodieIndexPlan containing metadata partitions and instant upto which they should be indexed. + */ + public abstract Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex); + + /** + * Execute requested index action. + * + * @param context HoodieEngineContext + * @param indexInstantTime Instant time for which index action was scheduled. + * @return HoodieIndexCommitMetadata containing write stats for each metadata partition. + */ + public abstract Option index(HoodieEngineContext context, String indexInstantTime); /** * Create a savepoint at the specified instant, so that the table can be restored @@ -390,11 +538,61 @@ public abstract HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore); + /** + * Schedules Restore for the table to the given instant. + */ + public abstract Option scheduleRestore(HoodieEngineContext context, + String restoreInstantTime, + String instantToRestore); + + public void rollbackInflightCompaction(HoodieInstant inflightInstant) { + rollbackInflightCompaction(inflightInstant, s -> Option.empty()); + } + + /** + * Rollback failed compactions. Inflight rollbacks for compactions revert the .inflight file + * to the .requested file. 
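Similarly, the metadata-indexing entry points added in the hunk above split planning from execution: `scheduleIndexing` persists a `HoodieIndexPlan` for the requested metadata partitions, and `index` carries out a previously scheduled plan. A hedged sketch (raw generics, engine-specific table assumed; real callers go through the write client and the async indexer):

```java
import java.util.Collections;

import org.apache.hudi.avro.model.HoodieIndexCommitMetadata;
import org.apache.hudi.avro.model.HoodieIndexPlan;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.metadata.MetadataPartitionType;
import org.apache.hudi.table.HoodieTable;

public class IndexingSketch {
  // Hypothetical helper, not part of the change: build the column-stats partition
  // of the metadata table via the schedule-then-execute pattern.
  @SuppressWarnings({"rawtypes", "unchecked"})
  public static void buildColumnStatsIndex(HoodieTable table, HoodieEngineContext context) {
    String indexInstant = HoodieActiveTimeline.createNewInstantTime();
    Option<HoodieIndexPlan> plan = table.scheduleIndexing(
        context, indexInstant, Collections.singletonList(MetadataPartitionType.COLUMN_STATS));
    if (plan.isPresent()) {
      Option<HoodieIndexCommitMetadata> commitMetadata = table.index(context, indexInstant);
    }
  }
}
```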
+ * + * @param inflightInstant Inflight Compaction Instant + */ + public void rollbackInflightCompaction(HoodieInstant inflightInstant, + Function> getPendingRollbackInstantFunc) { + ValidationUtils.checkArgument(inflightInstant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)); + rollbackInflightInstant(inflightInstant, getPendingRollbackInstantFunc); + } + + /** + * Rollback inflight clustering instant to requested clustering instant + * + * @param inflightInstant Inflight clustering instant + * @param getPendingRollbackInstantFunc Function to get rollback instant + */ + public void rollbackInflightClustering(HoodieInstant inflightInstant, + Function> getPendingRollbackInstantFunc) { + ValidationUtils.checkArgument(inflightInstant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)); + rollbackInflightInstant(inflightInstant, getPendingRollbackInstantFunc); + } + + /** + * Rollback inflight instant to requested instant + * + * @param inflightInstant Inflight instant + * @param getPendingRollbackInstantFunc Function to get rollback instant + */ + private void rollbackInflightInstant(HoodieInstant inflightInstant, + Function> getPendingRollbackInstantFunc) { + final String commitTime = getPendingRollbackInstantFunc.apply(inflightInstant.getTimestamp()).map(entry + -> entry.getRollbackInstant().getTimestamp()).orElse(HoodieActiveTimeline.createNewInstantTime()); + scheduleRollback(context, commitTime, inflightInstant, false, config.shouldRollbackUsingMarkers()); + rollback(context, commitTime, inflightInstant, false, false); + getActiveTimeline().revertInstantFromInflightToRequested(inflightInstant); + } + /** * Finalize the written data onto storage. Perform any final cleanups. * * @param context HoodieEngineContext - * @param stats List of HoodieWriteStats + * @param stats List of HoodieWriteStats * @throws HoodieIOException if some paths can't be finalized on storage */ public void finalizeWrite(HoodieEngineContext context, String instantTs, List stats) throws HoodieIOException { @@ -403,6 +601,7 @@ public void finalizeWrite(HoodieEngineContext context, String instantTs, List>> invalidFilesByPartition) { // Now delete partially written files + context.setJobStatus(this.getClass().getSimpleName(), "Delete invalid files generated during the write operation: " + config.getTableName()); context.map(new ArrayList<>(invalidFilesByPartition.values()), partitionWithFileList -> { final FileSystem fileSystem = metaClient.getFs(); LOG.info("Deleting invalid data files=" + partitionWithFileList); @@ -422,6 +621,13 @@ private void deleteInvalidFilesByPartitions(HoodieEngineContext context, Map getInvalidDataPaths(WriteMarkers markers) throws IOException { + return markers.createdAndMergedDataPaths(context, config.getFinalizeWriteParallelism()); + } + /** * Reconciles WriteStats and marker files to detect and safely delete duplicate data files created because of Spark * retries. @@ -440,7 +646,7 @@ protected void reconcileAgainstMarkers(HoodieEngineContext context, // Reconcile marker and data files with WriteStats so that partially written data-files due to failed // (but succeeded on retry) tasks are removed. String basePath = getMetaClient().getBasePath(); - MarkerFiles markers = new MarkerFiles(this, instantTs); + WriteMarkers markers = WriteMarkersFactory.get(config.getMarkersType(), this, instantTs); if (!markers.doesMarkerDirExist()) { // can happen if it was an empty write say. 
@@ -448,7 +654,7 @@ protected void reconcileAgainstMarkers(HoodieEngineContext context, } // we are not including log appends here, since they are already fail-safe. - Set invalidDataPaths = markers.createdAndMergedDataPaths(context, config.getFinalizeWriteParallelism()); + Set invalidDataPaths = getInvalidDataPaths(markers); Set validDataPaths = stats.stream() .map(HoodieWriteStat::getPath) .filter(p -> p.endsWith(this.getBaseFileExtension())) @@ -458,9 +664,9 @@ protected void reconcileAgainstMarkers(HoodieEngineContext context, invalidDataPaths.removeAll(validDataPaths); if (!invalidDataPaths.isEmpty()) { - LOG.info("Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths); + LOG.info("Removing duplicate data files created due to task retries before committing. Paths=" + invalidDataPaths); Map>> invalidPathsByPartition = invalidDataPaths.stream() - .map(dp -> Pair.of(new Path(dp).getParent().toString(), new Path(basePath, dp).toString())) + .map(dp -> Pair.of(new Path(basePath, dp).getParent().toString(), new Path(basePath, dp).toString())) .collect(Collectors.groupingBy(Pair::getKey)); // Ensure all files in delete list is actually present. This is mandatory for an eventually consistent FS. @@ -471,7 +677,7 @@ protected void reconcileAgainstMarkers(HoodieEngineContext context, } // Now delete partially written files - context.setJobStatus(this.getClass().getSimpleName(), "Delete all partially written files"); + context.setJobStatus(this.getClass().getSimpleName(), "Delete all partially written files: " + config.getTableName()); deleteInvalidFilesByPartitions(context, invalidPathsByPartition); // Now ensure the deleted files disappear @@ -494,7 +700,7 @@ protected void reconcileAgainstMarkers(HoodieEngineContext context, */ private void waitForAllFiles(HoodieEngineContext context, Map>> groupByPartition, FileVisibility visibility) { // This will either ensure all files to be deleted are present. 
- context.setJobStatus(this.getClass().getSimpleName(), "Wait for all files to appear/disappear"); + context.setJobStatus(this.getClass().getSimpleName(), "Wait for all files to appear/disappear: " + config.getTableName()); boolean checkPassed = context.map(new ArrayList<>(groupByPartition.entrySet()), partitionWithFileList -> waitForCondition(partitionWithFileList.getKey(), partitionWithFileList.getValue().stream(), visibility), config.getFinalizeWriteParallelism()) @@ -551,9 +757,9 @@ private void validateSchema() throws HoodieUpsertException, HoodieInsertExceptio Schema writerSchema; boolean isValid; try { - TableSchemaResolver schemaUtil = new TableSchemaResolver(getMetaClient()); + TableSchemaResolver schemaResolver = new TableSchemaResolver(getMetaClient()); writerSchema = HoodieAvroUtils.createHoodieWriteSchema(config.getSchema()); - tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaUtil.getTableAvroSchemaWithoutMetadataFields()); + tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaResolver.getTableAvroSchemaWithoutMetadataFields()); isValid = TableSchemaResolver.isSchemaCompatible(tableSchema, writerSchema); } catch (Exception e) { throw new HoodieException("Failed to read schema/check compatibility for base path " + metaClient.getBasePath(), e); @@ -589,16 +795,8 @@ public HoodieFileFormat getLogFileFormat() { return metaClient.getTableConfig().getLogFileFormat(); } - public HoodieLogBlockType getLogDataBlockFormat() { - switch (getBaseFileFormat()) { - case PARQUET: - return HoodieLogBlockType.AVRO_DATA_BLOCK; - case HFILE: - return HoodieLogBlockType.HFILE_DATA_BLOCK; - default: - throw new HoodieException("Base file format " + getBaseFileFormat() - + " does not have associated log block format"); - } + public Option getPartitionMetafileFormat() { + return metaClient.getTableConfig().getPartitionMetafileFormat(); } public String getBaseFileExtension() { @@ -608,4 +806,156 @@ public String getBaseFileExtension() { public boolean requireSortedRecords() { return getBaseFileFormat() == HoodieFileFormat.HFILE; } + + public HoodieEngineContext getContext() { + // This is to handle scenarios where this is called at the executor tasks which do not have access + // to engine context, and it ends up being null (as its not serializable and marked transient here). + return context == null ? new HoodieLocalEngineContext(hadoopConfiguration.get()) : context; + } + + /** + * Get Table metadata writer. + * + * @param triggeringInstantTimestamp - The instant that is triggering this metadata write + * @return instance of {@link HoodieTableMetadataWriter} + */ + public final Option getMetadataWriter(String triggeringInstantTimestamp) { + return getMetadataWriter(triggeringInstantTimestamp, Option.empty()); + } + + /** + * Check if action type is a table service. + * @param actionType action type of the instant + * @param instantTime instant time of the instant. + * @return true if action represents a table service. false otherwise. 
+ */ + public boolean isTableServiceAction(String actionType, String instantTime) { + if (actionType.equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + Option> instantPlan = ClusteringUtils.getClusteringPlan(metaClient, new HoodieInstant(HoodieInstant.State.NIL, actionType, instantTime)); + // only clustering is table service with replace commit action + return instantPlan.isPresent(); + } else { + if (this.metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + return !actionType.equals(HoodieTimeline.COMMIT_ACTION); + } else { + return !actionType.equals(HoodieTimeline.DELTA_COMMIT_ACTION); + } + } + } + + /** + * Get Table metadata writer. + *

+ * Note: + * Get the metadata writer for the conf. If the metadata table doesn't exist, + * this wil trigger the creation of the table and the initial bootstrapping. + * Since this call is under the transaction lock, other concurrent writers + * are blocked from doing the similar initial metadata table creation and + * the bootstrapping. + * + * @param triggeringInstantTimestamp - The instant that is triggering this metadata write + * @return instance of {@link HoodieTableMetadataWriter} + */ + public Option getMetadataWriter(String triggeringInstantTimestamp, + Option actionMetadata) { + // Each engine is expected to override this and + // provide the actual metadata writer, if enabled. + return Option.empty(); + } + + /** + * Deletes the metadata table if the writer disables metadata table with hoodie.metadata.enable=false + */ + public void maybeDeleteMetadataTable() { + if (shouldExecuteMetadataTableDeletion()) { + try { + LOG.info("Deleting metadata table because it is disabled in writer."); + deleteMetadataTable(config.getBasePath(), context); + clearMetadataTablePartitionsConfig(Option.empty(), true); + } catch (HoodieMetadataException e) { + throw new HoodieException("Failed to delete metadata table.", e); + } + } + } + + /** + * Deletes the metadata partition if the writer disables any metadata index. + */ + public void deleteMetadataIndexIfNecessary() { + Stream.of(MetadataPartitionType.values()).forEach(partitionType -> { + if (shouldDeleteMetadataPartition(partitionType)) { + try { + LOG.info("Deleting metadata partition because it is disabled in writer: " + partitionType.name()); + if (metadataPartitionExists(metaClient.getBasePath(), context, partitionType)) { + deleteMetadataPartition(metaClient.getBasePath(), context, partitionType); + } + clearMetadataTablePartitionsConfig(Option.of(partitionType), false); + } catch (HoodieMetadataException e) { + throw new HoodieException("Failed to delete metadata partition: " + partitionType.name(), e); + } + } + }); + } + + private boolean shouldDeleteMetadataPartition(MetadataPartitionType partitionType) { + // Only delete metadata table partition when all the following conditions are met: + // (1) This is data table. + // (2) Index corresponding to this metadata partition is disabled in HoodieWriteConfig. + // (3) The completed metadata partitions in table config contains this partition. + // NOTE: Inflight metadata partitions are not considered as they could have been inflight due to async indexer. + if (HoodieTableMetadata.isMetadataTable(metaClient.getBasePath()) || !config.isMetadataTableEnabled()) { + return false; + } + boolean metadataIndexDisabled; + switch (partitionType) { + // NOTE: FILES partition type is always considered in sync with hoodie.metadata.enable. + // It cannot be the case that metadata is enabled but FILES is disabled. 
+ case COLUMN_STATS: + metadataIndexDisabled = !config.isMetadataColumnStatsIndexEnabled(); + break; + case BLOOM_FILTERS: + metadataIndexDisabled = !config.isMetadataBloomFilterIndexEnabled(); + break; + default: + LOG.debug("Not a valid metadata partition type: " + partitionType.name()); + return false; + } + return metadataIndexDisabled + && metaClient.getTableConfig().getMetadataPartitions().contains(partitionType.getPartitionPath()); + } + + private boolean shouldExecuteMetadataTableDeletion() { + // Only execute metadata table deletion when all the following conditions are met + // (1) This is data table + // (2) Metadata table is disabled in HoodieWriteConfig for the writer + // (3) Check `HoodieTableConfig.TABLE_METADATA_PARTITIONS`. Either the table config + // does not exist, or the table config is non-empty indicating that metadata table + // partitions are ready to use + return !HoodieTableMetadata.isMetadataTable(metaClient.getBasePath()) + && !config.isMetadataTableEnabled() + && !metaClient.getTableConfig().getMetadataPartitions().isEmpty(); + } + + /** + * Clears hoodie.table.metadata.partitions in hoodie.properties + */ + private void clearMetadataTablePartitionsConfig(Option partitionType, boolean clearAll) { + Set partitions = metaClient.getTableConfig().getMetadataPartitions(); + if (clearAll && partitions.size() > 0) { + LOG.info("Clear hoodie.table.metadata.partitions in hoodie.properties"); + metaClient.getTableConfig().setValue(TABLE_METADATA_PARTITIONS.key(), EMPTY_STRING); + HoodieTableConfig.update(metaClient.getFs(), new Path(metaClient.getMetaPath()), metaClient.getTableConfig().getProps()); + } else if (partitions.remove(partitionType.get().getPartitionPath())) { + metaClient.getTableConfig().setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS.key(), String.join(",", partitions)); + HoodieTableConfig.update(metaClient.getFs(), new Path(metaClient.getMetaPath()), metaClient.getTableConfig().getProps()); + } + } + + public HoodieTableMetadata getMetadataTable() { + return this.metadata; + } + + public Runnable getPreExecuteRunnable() { + return Functions.noop(); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java deleted file mode 100644 index a6b11af23dc36..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.avro.model.HoodieSavepointMetadata; -import org.apache.hudi.client.ReplaceArchivalHelper; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.ActionType; -import org.apache.hudi.common.model.HoodieArchivedLogFile; -import org.apache.hudi.common.model.HoodieAvroPayload; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; -import org.apache.hudi.common.model.HoodieRollingStatMetadata; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; -import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; -import org.apache.hudi.common.table.view.TableFileSystemView; -import org.apache.hudi.common.util.CleanerUtils; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCommitException; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; -import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; - -/** - * Archiver to bound the growth of files under .hoodie meta path. 
- */ -public class HoodieTimelineArchiveLog { - - private static final Logger LOG = LogManager.getLogger(HoodieTimelineArchiveLog.class); - - private final Path archiveFilePath; - private final HoodieWriteConfig config; - private Writer writer; - private final int maxInstantsToKeep; - private final int minInstantsToKeep; - private final HoodieTable table; - private final HoodieTableMetaClient metaClient; - - public HoodieTimelineArchiveLog(HoodieWriteConfig config, HoodieTable table) { - this.config = config; - this.table = table; - this.metaClient = table.getMetaClient(); - this.archiveFilePath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath()); - this.maxInstantsToKeep = config.getMaxCommitsToKeep(); - this.minInstantsToKeep = config.getMinCommitsToKeep(); - } - - private Writer openWriter() { - try { - if (this.writer == null) { - return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent()) - .withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION) - .withFs(metaClient.getFs()).overBaseCommit("").build(); - } else { - return this.writer; - } - } catch (InterruptedException | IOException e) { - throw new HoodieException("Unable to initialize HoodieLogFormat writer", e); - } - } - - private void close() { - try { - if (this.writer != null) { - this.writer.close(); - } - } catch (IOException e) { - throw new HoodieException("Unable to close HoodieLogFormat writer", e); - } - } - - /** - * Check if commits need to be archived. If yes, archive commits. - */ - public boolean archiveIfRequired(HoodieEngineContext context) throws IOException { - try { - List instantsToArchive = getInstantsToArchive().collect(Collectors.toList()); - - boolean success = true; - if (!instantsToArchive.isEmpty()) { - this.writer = openWriter(); - LOG.info("Archiving instants " + instantsToArchive); - archive(context, instantsToArchive); - LOG.info("Deleting archived instants " + instantsToArchive); - success = deleteArchivedInstants(instantsToArchive); - } else { - LOG.info("No Instants to archive"); - } - - return success; - } finally { - close(); - } - } - - private Stream getCleanInstantsToArchive() { - HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline() - .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION, HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants(); - return cleanAndRollbackTimeline.getInstants() - .collect(Collectors.groupingBy(HoodieInstant::getAction)).values().stream() - .map(hoodieInstants -> { - if (hoodieInstants.size() > this.maxInstantsToKeep) { - return hoodieInstants.subList(0, hoodieInstants.size() - this.minInstantsToKeep); - } else { - return new ArrayList(); - } - }).flatMap(Collection::stream); - } - - private Stream getCommitInstantsToArchive() { - // TODO (na) : Add a way to return actions associated with a timeline and then merge/unify - // with logic above to avoid Stream.concats - HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); - Option oldestPendingCompactionInstant = - table.getActiveTimeline().filterPendingCompactionTimeline().firstInstant(); - - // We cannot have any holes in the commit timeline. We cannot archive any commits which are - // made after the first savepoint present. 
- Option firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); - if (!commitTimeline.empty() && commitTimeline.countInstants() > maxInstantsToKeep) { - // Actually do the commits - return commitTimeline.getInstants() - .filter(s -> { - // if no savepoint present, then dont filter - return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(), LESSER_THAN_OR_EQUALS, s.getTimestamp())); - }).filter(s -> { - // Ensure commits >= oldest pending compaction commit is retained - return oldestPendingCompactionInstant - .map(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp())) - .orElse(true); - }).limit(commitTimeline.countInstants() - minInstantsToKeep); - } else { - return Stream.empty(); - } - } - - private Stream getInstantsToArchive() { - Stream instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive()); - - // For archiving and cleaning instants, we need to include intermediate state files if they exist - HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); - Map, List> groupByTsAction = rawActiveTimeline.getInstants() - .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(), - HoodieInstant.getComparableAction(i.getAction())))); - - return instants.flatMap(hoodieInstant -> - groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(), - HoodieInstant.getComparableAction(hoodieInstant.getAction()))).stream()); - } - - private boolean deleteArchivedInstants(List archivedInstants) throws IOException { - LOG.info("Deleting instants " + archivedInstants); - boolean success = true; - for (HoodieInstant archivedInstant : archivedInstants) { - Path commitFile = new Path(metaClient.getMetaPath(), archivedInstant.getFileName()); - try { - if (metaClient.getFs().exists(commitFile)) { - success &= metaClient.getFs().delete(commitFile, false); - LOG.info("Archived and deleted instant file " + commitFile); - } - } catch (IOException e) { - throw new HoodieIOException("Failed to delete archived instant " + archivedInstant, e); - } - } - - // Remove older meta-data from auxiliary path too - Option latestCommitted = Option.fromJavaOptional(archivedInstants.stream().filter(i -> i.isCompleted() && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION) - || (i.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)))).max(Comparator.comparing(HoodieInstant::getTimestamp))); - LOG.info("Latest Committed Instant=" + latestCommitted); - if (latestCommitted.isPresent()) { - success &= deleteAllInstantsOlderorEqualsInAuxMetaFolder(latestCommitted.get()); - } - return success; - } - - /** - * Remove older instants from auxiliary meta folder. - * - * @param thresholdInstant Hoodie Instant - * @return success if all eligible file deleted successfully - * @throws IOException in case of error - */ - private boolean deleteAllInstantsOlderorEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) throws IOException { - List instants = null; - boolean success = true; - try { - instants = - metaClient.scanHoodieInstantsFromFileSystem( - new Path(metaClient.getMetaAuxiliaryPath()), - HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE, - false); - } catch (FileNotFoundException e) { - /* - * On some FSs deletion of all files in the directory can auto remove the directory itself. - * GCS is one example, as it doesn't have real directories and subdirectories. 
When client - * removes all the files from a "folder" on GCS is has to create a special "/" to keep the folder - * around. If this doesn't happen (timeout, misconfigured client, ...) folder will be deleted and - * in this case we should not break when aux folder is not found. - * GCS information: (https://cloud.google.com/storage/docs/gsutil/addlhelp/HowSubdirectoriesWork) - */ - LOG.warn("Aux path not found. Skipping: " + metaClient.getMetaAuxiliaryPath()); - return success; - } - - List instantsToBeDeleted = - instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(), - LESSER_THAN_OR_EQUALS, thresholdInstant.getTimestamp())).collect(Collectors.toList()); - - for (HoodieInstant deleteInstant : instantsToBeDeleted) { - LOG.info("Deleting instant " + deleteInstant + " in auxiliary meta path " + metaClient.getMetaAuxiliaryPath()); - Path metaFile = new Path(metaClient.getMetaAuxiliaryPath(), deleteInstant.getFileName()); - if (metaClient.getFs().exists(metaFile)) { - success &= metaClient.getFs().delete(metaFile, false); - LOG.info("Deleted instant file in auxiliary metapath : " + metaFile); - } - } - return success; - } - - public void archive(HoodieEngineContext context, List instants) throws HoodieCommitException { - try { - HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants(); - Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema(); - LOG.info("Wrapper schema " + wrapperSchema.toString()); - List records = new ArrayList<>(); - for (HoodieInstant hoodieInstant : instants) { - boolean deleteSuccess = deleteReplacedFileGroups(context, hoodieInstant); - if (!deleteSuccess) { - // throw error and stop archival if deleting replaced file groups failed. 
- throw new HoodieCommitException("Unable to delete file(s) for " + hoodieInstant.getFileName()); - } - try { - deleteAnyLeftOverMarkerFiles(context, hoodieInstant); - records.add(convertToAvroRecord(commitTimeline, hoodieInstant)); - if (records.size() >= this.config.getCommitArchivalBatchSize()) { - writeToFile(wrapperSchema, records); - } - } catch (Exception e) { - LOG.error("Failed to archive commits, .commit file: " + hoodieInstant.getFileName(), e); - if (this.config.isFailOnTimelineArchivingEnabled()) { - throw e; - } - } - } - writeToFile(wrapperSchema, records); - } catch (Exception e) { - throw new HoodieCommitException("Failed to archive commits", e); - } - } - - private void deleteAnyLeftOverMarkerFiles(HoodieEngineContext context, HoodieInstant instant) { - MarkerFiles markerFiles = new MarkerFiles(table, instant.getTimestamp()); - if (markerFiles.deleteMarkerDir(context, config.getMarkersDeleteParallelism())) { - LOG.info("Cleaned up left over marker directory for instant :" + instant); - } - } - - private boolean deleteReplacedFileGroups(HoodieEngineContext context, HoodieInstant instant) { - if (!instant.isCompleted() || !HoodieTimeline.REPLACE_COMMIT_ACTION.equals(instant.getAction())) { - // only delete files for completed replace instants - return true; - } - - TableFileSystemView fileSystemView = this.table.getFileSystemView(); - List replacedPartitions = getReplacedPartitions(instant); - return ReplaceArchivalHelper.deleteReplacedFileGroups(context, metaClient, fileSystemView, instant, replacedPartitions); - } - - private List getReplacedPartitions(HoodieInstant instant) { - try { - HoodieReplaceCommitMetadata metadata = HoodieReplaceCommitMetadata.fromBytes( - metaClient.getActiveTimeline().getInstantDetails(instant).get(), - HoodieReplaceCommitMetadata.class); - - return new ArrayList<>(metadata.getPartitionToReplaceFileIds().keySet()); - } catch (IOException e) { - throw new HoodieCommitException("Failed to archive because cannot delete replace files", e); - } - } - - private void writeToFile(Schema wrapperSchema, List records) throws Exception { - if (records.size() > 0) { - Map header = new HashMap<>(); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString()); - HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header); - this.writer = writer.appendBlock(block); - records.clear(); - } - } - - private IndexedRecord convertToAvroRecord(HoodieTimeline commitTimeline, HoodieInstant hoodieInstant) - throws IOException { - HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); - archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); - archivedMetaWrapper.setActionState(hoodieInstant.getState().name()); - switch (hoodieInstant.getAction()) { - case HoodieTimeline.CLEAN_ACTION: { - if (hoodieInstant.isCompleted()) { - archivedMetaWrapper.setHoodieCleanMetadata(CleanerUtils.getCleanerMetadata(metaClient, hoodieInstant)); - } else { - archivedMetaWrapper.setHoodieCleanerPlan(CleanerUtils.getCleanerPlan(metaClient, hoodieInstant)); - } - archivedMetaWrapper.setActionType(ActionType.clean.name()); - break; - } - case HoodieTimeline.COMMIT_ACTION: - case HoodieTimeline.DELTA_COMMIT_ACTION: { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieCommitMetadata.class); - archivedMetaWrapper.setHoodieCommitMetadata(convertCommitMetadata(commitMetadata)); - archivedMetaWrapper.setActionType(ActionType.commit.name()); - break; - } - 
case HoodieTimeline.REPLACE_COMMIT_ACTION: { - HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieReplaceCommitMetadata.class); - archivedMetaWrapper.setHoodieReplaceCommitMetadata(ReplaceArchivalHelper.convertReplaceCommitMetadata(replaceCommitMetadata)); - archivedMetaWrapper.setActionType(ActionType.replacecommit.name()); - break; - } - case HoodieTimeline.ROLLBACK_ACTION: { - archivedMetaWrapper.setHoodieRollbackMetadata(TimelineMetadataUtils.deserializeAvroMetadata( - commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieRollbackMetadata.class)); - archivedMetaWrapper.setActionType(ActionType.rollback.name()); - break; - } - case HoodieTimeline.SAVEPOINT_ACTION: { - archivedMetaWrapper.setHoodieSavePointMetadata(TimelineMetadataUtils.deserializeAvroMetadata( - commitTimeline.getInstantDetails(hoodieInstant).get(), HoodieSavepointMetadata.class)); - archivedMetaWrapper.setActionType(ActionType.savepoint.name()); - break; - } - case HoodieTimeline.COMPACTION_ACTION: { - HoodieCompactionPlan plan = CompactionUtils.getCompactionPlan(metaClient, hoodieInstant.getTimestamp()); - archivedMetaWrapper.setHoodieCompactionPlan(plan); - archivedMetaWrapper.setActionType(ActionType.compaction.name()); - break; - } - default: { - throw new UnsupportedOperationException("Action not fully supported yet"); - } - } - return archivedMetaWrapper; - } - - public static org.apache.hudi.avro.model.HoodieCommitMetadata convertCommitMetadata( - HoodieCommitMetadata hoodieCommitMetadata) { - ObjectMapper mapper = new ObjectMapper(); - // Need this to ignore other public get() methods - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - org.apache.hudi.avro.model.HoodieCommitMetadata avroMetaData = - mapper.convertValue(hoodieCommitMetadata, org.apache.hudi.avro.model.HoodieCommitMetadata.class); - // Do not archive Rolling Stats, cannot set to null since AVRO will throw null pointer - avroMetaData.getExtraMetadata().put(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, ""); - return avroMetaData; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/MarkerFiles.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/MarkerFiles.java deleted file mode 100644 index 8826204cdafe1..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/MarkerFiles.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.config.SerializableConfiguration; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -/** - * Operates on marker files for a given write action (commit, delta commit, compaction). - */ -public class MarkerFiles implements Serializable { - - private static final Logger LOG = LogManager.getLogger(MarkerFiles.class); - - private final String instantTime; - private final transient FileSystem fs; - private final transient Path markerDirPath; - private final String basePath; - - public MarkerFiles(FileSystem fs, String basePath, String markerFolderPath, String instantTime) { - this.instantTime = instantTime; - this.fs = fs; - this.markerDirPath = new Path(markerFolderPath); - this.basePath = basePath; - } - - public MarkerFiles(HoodieTable table, String instantTime) { - this(table.getMetaClient().getFs(), - table.getMetaClient().getBasePath(), - table.getMetaClient().getMarkerFolderPath(instantTime), - instantTime); - } - - public void quietDeleteMarkerDir(HoodieEngineContext context, int parallelism) { - try { - deleteMarkerDir(context, parallelism); - } catch (HoodieIOException ioe) { - LOG.warn("Error deleting marker directory for instant " + instantTime, ioe); - } - } - - /** - * Delete Marker directory corresponding to an instant. - * - * @param context HoodieEngineContext. - * @param parallelism parallelism for deletion. 
- */ - public boolean deleteMarkerDir(HoodieEngineContext context, int parallelism) { - try { - if (fs.exists(markerDirPath)) { - FileStatus[] fileStatuses = fs.listStatus(markerDirPath); - List markerDirSubPaths = Arrays.stream(fileStatuses) - .map(fileStatus -> fileStatus.getPath().toString()) - .collect(Collectors.toList()); - - if (markerDirSubPaths.size() > 0) { - SerializableConfiguration conf = new SerializableConfiguration(fs.getConf()); - parallelism = Math.min(markerDirSubPaths.size(), parallelism); - context.foreach(markerDirSubPaths, subPathStr -> { - Path subPath = new Path(subPathStr); - FileSystem fileSystem = subPath.getFileSystem(conf.get()); - fileSystem.delete(subPath, true); - }, parallelism); - } - - boolean result = fs.delete(markerDirPath, true); - LOG.info("Removing marker directory at " + markerDirPath); - return result; - } - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } - return false; - } - - public boolean doesMarkerDirExist() throws IOException { - return fs.exists(markerDirPath); - } - - public Set createdAndMergedDataPaths(HoodieEngineContext context, int parallelism) throws IOException { - Set dataFiles = new HashSet<>(); - - FileStatus[] topLevelStatuses = fs.listStatus(markerDirPath); - List subDirectories = new ArrayList<>(); - for (FileStatus topLevelStatus: topLevelStatuses) { - if (topLevelStatus.isFile()) { - String pathStr = topLevelStatus.getPath().toString(); - if (pathStr.contains(HoodieTableMetaClient.MARKER_EXTN) && !pathStr.endsWith(IOType.APPEND.name())) { - dataFiles.add(translateMarkerToDataPath(pathStr)); - } - } else { - subDirectories.add(topLevelStatus.getPath().toString()); - } - } - - if (subDirectories.size() > 0) { - parallelism = Math.min(subDirectories.size(), parallelism); - SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf()); - dataFiles.addAll(context.flatMap(subDirectories, directory -> { - Path path = new Path(directory); - FileSystem fileSystem = path.getFileSystem(serializedConf.get()); - RemoteIterator itr = fileSystem.listFiles(path, true); - List result = new ArrayList<>(); - while (itr.hasNext()) { - FileStatus status = itr.next(); - String pathStr = status.getPath().toString(); - if (pathStr.contains(HoodieTableMetaClient.MARKER_EXTN) && !pathStr.endsWith(IOType.APPEND.name())) { - result.add(translateMarkerToDataPath(pathStr)); - } - } - return result.stream(); - }, parallelism)); - } - - return dataFiles; - } - - private String translateMarkerToDataPath(String markerPath) { - String rPath = stripMarkerFolderPrefix(markerPath); - return MarkerFiles.stripMarkerSuffix(rPath); - } - - public static String stripMarkerSuffix(String path) { - return path.substring(0, path.indexOf(HoodieTableMetaClient.MARKER_EXTN)); - } - - public List allMarkerFilePaths() throws IOException { - List markerFiles = new ArrayList<>(); - FSUtils.processFiles(fs, markerDirPath.toString(), fileStatus -> { - markerFiles.add(stripMarkerFolderPrefix(fileStatus.getPath().toString())); - return true; - }, false); - return markerFiles; - } - - private String stripMarkerFolderPrefix(String fullMarkerPath) { - ValidationUtils.checkArgument(fullMarkerPath.contains(HoodieTableMetaClient.MARKER_EXTN)); - String markerRootPath = Path.getPathWithoutSchemeAndAuthority( - new Path(String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTime))).toString(); - int begin = fullMarkerPath.indexOf(markerRootPath); - ValidationUtils.checkArgument(begin >= 0, - "Not in 
marker dir. Marker Path=" + fullMarkerPath + ", Expected Marker Root=" + markerRootPath); - return fullMarkerPath.substring(begin + markerRootPath.length() + 1); - } - - /** - * The marker path will be /.hoodie/.temp//2019/04/25/filename.marker.writeIOType. - */ - public Path create(String partitionPath, String dataFileName, IOType type) { - Path path = FSUtils.getPartitionPath(markerDirPath, partitionPath); - try { - fs.mkdirs(path); // create a new partition as needed. - } catch (IOException e) { - throw new HoodieIOException("Failed to make dir " + path, e); - } - String markerFileName = String.format("%s%s.%s", dataFileName, HoodieTableMetaClient.MARKER_EXTN, type.name()); - Path markerPath = new Path(path, markerFileName); - try { - LOG.info("Creating Marker Path=" + markerPath); - fs.create(markerPath, false).close(); - } catch (IOException e) { - throw new HoodieException("Failed to create marker file " + markerPath, e); - } - return markerPath; - } - -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/RandomFileIdPrefixProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/RandomFileIdPrefixProvider.java new file mode 100644 index 0000000000000..5ad3eedf437c2 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/RandomFileIdPrefixProvider.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; + +public class RandomFileIdPrefixProvider extends FileIdPrefixProvider { + + public RandomFileIdPrefixProvider(TypedProperties props) { + super(props); + } + + @Override + public String createFilePrefix(String partitionPath) { + return FSUtils.createNewFileIdPfx(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java index a56710bfb736d..8e6160b095483 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java @@ -18,6 +18,7 @@ package org.apache.hudi.table; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.collection.Pair; import java.io.Serializable; @@ -32,18 +33,41 @@ public class WorkloadProfile implements Serializable { /** - * Computed workload profile. + * Computed workload stats. 
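For orientation on the `WorkloadProfile`/`WorkloadStat` changes in this and the following hunk: the profile now keeps separate input and output partition stats, and `WorkloadStat` accumulates per-file-group counts instead of overwriting them. A small sketch of the accumulation behaviour, assuming the two-argument `addInserts`/`addUpdates` overloads shown below and the existing `getNumInserts` getter:

```java
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.table.WorkloadStat;

public class WorkloadStatSketch {
  public static void main(String[] args) {
    WorkloadStat stat = new WorkloadStat();
    HoodieRecordLocation location = new HoodieRecordLocation("20220101000000", "file-group-1");

    // Two batches routed to the same file group now add up to 30 inserts for
    // "file-group-1", rather than the second call overwriting the first.
    stat.addInserts(location, 10);
    stat.addInserts(location, 20);

    long total = stat.getNumInserts();                    // 30
    long forFileGroup = stat.getInsertLocationToCount()
        .get("file-group-1").getRight();                  // 30
  }
}
```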
*/ - protected final HashMap partitionPathStatMap; + protected final HashMap inputPartitionPathStatMap; + + /** + * Execution/Output workload stats + */ + protected final HashMap outputPartitionPathStatMap; /** * Global workloadStat. */ protected final WorkloadStat globalStat; + /** + * Write operation type. + */ + private WriteOperationType operationType; + + private final boolean hasOutputWorkLoadStats; + public WorkloadProfile(Pair, WorkloadStat> profile) { - this.partitionPathStatMap = profile.getLeft(); + this(profile, false); + } + + public WorkloadProfile(Pair, WorkloadStat> profile, boolean hasOutputWorkLoadStats) { + this.inputPartitionPathStatMap = profile.getLeft(); this.globalStat = profile.getRight(); + this.outputPartitionPathStatMap = new HashMap<>(); + this.hasOutputWorkLoadStats = hasOutputWorkLoadStats; + } + + public WorkloadProfile(Pair, WorkloadStat> profile, WriteOperationType operationType, boolean hasOutputWorkLoadStats) { + this(profile, hasOutputWorkLoadStats); + this.operationType = operationType; } public WorkloadStat getGlobalStat() { @@ -51,22 +75,50 @@ public WorkloadStat getGlobalStat() { } public Set getPartitionPaths() { - return partitionPathStatMap.keySet(); + return inputPartitionPathStatMap.keySet(); } - public HashMap getPartitionPathStatMap() { - return partitionPathStatMap; + public Set getOutputPartitionPaths() { + return hasOutputWorkLoadStats ? outputPartitionPathStatMap.keySet() : inputPartitionPathStatMap.keySet(); + } + + public HashMap getInputPartitionPathStatMap() { + return inputPartitionPathStatMap; + } + + public HashMap getOutputPartitionPathStatMap() { + return outputPartitionPathStatMap; + } + + public boolean hasOutputWorkLoadStats() { + return hasOutputWorkLoadStats; + } + + public void updateOutputPartitionPathStatMap(String partitionPath, WorkloadStat workloadStat) { + if (hasOutputWorkLoadStats) { + outputPartitionPathStatMap.put(partitionPath, workloadStat); + } } public WorkloadStat getWorkloadStat(String partitionPath) { - return partitionPathStatMap.get(partitionPath); + return inputPartitionPathStatMap.get(partitionPath); + } + + public WorkloadStat getOutputWorkloadStat(String partitionPath) { + return hasOutputWorkLoadStats ? 
outputPartitionPathStatMap.get(partitionPath) : inputPartitionPathStatMap.get(partitionPath); + } + + public WriteOperationType getOperationType() { + return operationType; } @Override public String toString() { final StringBuilder sb = new StringBuilder("WorkloadProfile {"); sb.append("globalStat=").append(globalStat).append(", "); - sb.append("partitionStat=").append(partitionPathStatMap); + sb.append("InputPartitionStat=").append(inputPartitionPathStatMap).append(", "); + sb.append("OutputPartitionStat=").append(outputPartitionPathStatMap).append(", "); + sb.append("operationType=").append(operationType); sb.append('}'); return sb.toString(); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java index 6fdb217a0dcaf..327a5a3ae7980 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java @@ -33,9 +33,12 @@ public class WorkloadStat implements Serializable { private long numUpdates = 0L; + private HashMap> insertLocationToCount; + private HashMap> updateLocationToCount; public WorkloadStat() { + insertLocationToCount = new HashMap<>(); updateLocationToCount = new HashMap<>(); } @@ -43,8 +46,25 @@ public long addInserts(long numInserts) { return this.numInserts += numInserts; } + public long addInserts(HoodieRecordLocation location, long numInserts) { + long accNumInserts = 0; + if (insertLocationToCount.containsKey(location.getFileId())) { + accNumInserts = insertLocationToCount.get(location.getFileId()).getRight(); + } + insertLocationToCount.put( + location.getFileId(), + Pair.of(location.getInstantTime(), numInserts + accNumInserts)); + return this.numInserts += numInserts; + } + public long addUpdates(HoodieRecordLocation location, long numUpdates) { - updateLocationToCount.put(location.getFileId(), Pair.of(location.getInstantTime(), numUpdates)); + long accNumUpdates = 0; + if (updateLocationToCount.containsKey(location.getFileId())) { + accNumUpdates = updateLocationToCount.get(location.getFileId()).getRight(); + } + updateLocationToCount.put( + location.getFileId(), + Pair.of(location.getInstantTime(), numUpdates + accNumUpdates)); return this.numUpdates += numUpdates; } @@ -60,6 +80,10 @@ public HashMap> getUpdateLocationToCount() { return updateLocationToCount; } + public HashMap> getInsertLocationToCount() { + return insertLocationToCount; + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("WorkloadStat {"); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java index 96189e7cfa522..d2b2cef2f604b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java @@ -21,8 +21,13 @@ import java.io.Serializable; import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.engine.HoodieEngineContext; +import 
org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -46,4 +51,37 @@ public BaseActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, } public abstract R execute(); + + /** + * Writes commits metadata to table metadata. + * @param metadata commit metadata of interest. + */ + protected final void writeTableMetadata(HoodieCommitMetadata metadata, String actionType) { + table.getMetadataWriter(instantTime).ifPresent(w -> w.update( + metadata, instantTime, table.isTableServiceAction(actionType, instantTime))); + } + + /** + * Writes clean metadata to table metadata. + * @param metadata clean metadata of interest. + */ + protected final void writeTableMetadata(HoodieCleanMetadata metadata, String instantTime) { + table.getMetadataWriter(instantTime).ifPresent(w -> w.update(metadata, instantTime)); + } + + /** + * Writes rollback metadata to table metadata. + * @param metadata rollback metadata of interest. + */ + protected final void writeTableMetadata(HoodieRollbackMetadata metadata) { + table.getMetadataWriter(instantTime, Option.of(metadata)).ifPresent(w -> w.update(metadata, instantTime)); + } + + /** + * Writes restore metadata to table metadata. + * @param metadata restore metadata of interest. + */ + protected final void writeTableMetadata(HoodieRestoreMetadata metadata) { + table.getMetadataWriter(instantTime, Option.of(metadata)).ifPresent(w -> w.update(metadata, instantTime)); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java index 5ef204f9706db..d771a574e37e5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/HoodieWriteMetadata.java @@ -46,6 +46,36 @@ public class HoodieWriteMetadata { public HoodieWriteMetadata() { } + /** + * Clones the write metadata with transformed write statuses. 
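Hedged usage sketch for the clone helper added below: an engine-specific executor can swap the write-status container (for example, collecting a distributed collection into a List) while keeping the durations, write stats, and commit metadata. The WriteStatus import path is an assumption; the clone signature follows this diff.

import java.util.List;

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.table.action.HoodieWriteMetadata;

class WriteMetadataCloneSketch {
  // Re-types the write-status payload without losing the rest of the bookkeeping.
  static <I> HoodieWriteMetadata<List<WriteStatus>> toListBacked(
      HoodieWriteMetadata<I> engineResult, List<WriteStatus> collectedStatuses) {
    return engineResult.clone(collectedStatuses);
  }
}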
+ * + * @param transformedWriteStatuses transformed write statuses + * @param type of transformed write statuses + * @return Cloned {@link HoodieWriteMetadata} instance + */ + public HoodieWriteMetadata clone(T transformedWriteStatuses) { + HoodieWriteMetadata newMetadataInstance = new HoodieWriteMetadata<>(); + newMetadataInstance.setWriteStatuses(transformedWriteStatuses); + if (indexLookupDuration.isPresent()) { + newMetadataInstance.setIndexLookupDuration(indexLookupDuration.get()); + } + newMetadataInstance.setCommitted(isCommitted); + newMetadataInstance.setCommitMetadata(commitMetadata); + if (writeStats.isPresent()) { + newMetadataInstance.setWriteStats(writeStats.get()); + } + if (indexUpdateDuration.isPresent()) { + newMetadataInstance.setIndexUpdateDuration(indexUpdateDuration.get()); + } + if (finalizeDuration.isPresent()) { + newMetadataInstance.setFinalizeDuration(finalizeDuration.get()); + } + if (partitionToReplaceFileIds.isPresent()) { + newMetadataInstance.setPartitionToReplaceFileIds(partitionToReplaceFileIds.get()); + } + return newMetadataInstance; + } + public O getWriteStatuses() { return writeStatuses; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java index 7ee240d96c5ce..8966a5d51c7cb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.bootstrap; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.HoodieBootstrapHandle; @@ -39,7 +40,8 @@ public BootstrapRecordConsumer(HoodieBootstrapHandle bootstrapHandle) { @Override protected void consumeOneRecord(HoodieRecord record) { try { - bootstrapHandle.write(record, record.getData().getInsertValue(bootstrapHandle.getWriterSchemaWithMetafields())); + bootstrapHandle.write(record, ((HoodieRecordPayload) record.getData()) + .getInsertValue(bootstrapHandle.getWriterSchemaWithMetaFields())); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapUtils.java index ade2b1885a117..9b0db44ecf45f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapUtils.java @@ -26,8 +26,8 @@ import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hudi.avro.model.HoodieFileStatus; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.collection.Pair; @@ -70,7 +70,7 @@ public static List>> getAllLeafFoldersWithFi Integer level = 
(int) relativePath.chars().filter(ch -> ch == '/').count(); HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(topLevelStatus); result.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath))); - } else if (metaPathFilter.accept(topLevelStatus.getPath())) { + } else if (topLevelStatus.isDirectory() && metaPathFilter.accept(topLevelStatus.getPath())) { subDirectories.add(topLevelStatus.getPath().toString()); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanActionExecutor.java deleted file mode 100644 index 588437c5149a2..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/BaseCleanActionExecutor.java +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.clean; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.avro.model.HoodieActionInstant; -import org.apache.hudi.avro.model.HoodieCleanMetadata; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.avro.model.HoodieCleanFileInfo; -import org.apache.hudi.common.HoodieCleanStat; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieCleaningPolicy; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; -import org.apache.hudi.common.util.CleanerUtils; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.HoodieTimer; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.BaseActionExecutor; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -public abstract class BaseCleanActionExecutor extends BaseActionExecutor { - - private static final long serialVersionUID = 1L; - private static final Logger LOG = LogManager.getLogger(BaseCleanActionExecutor.class); - - public BaseCleanActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime) { - super(context, 
config, table, instantTime); - } - - /** - * Generates List of files to be cleaned. - * - * @param context HoodieEngineContext - * @return Cleaner Plan - */ - HoodieCleanerPlan requestClean(HoodieEngineContext context) { - try { - CleanPlanner planner = new CleanPlanner<>(table, config); - Option earliestInstant = planner.getEarliestCommitToRetain(); - List partitionsToClean = planner.getPartitionPathsToClean(earliestInstant); - - if (partitionsToClean.isEmpty()) { - LOG.info("Nothing to clean here. It is already clean"); - return HoodieCleanerPlan.newBuilder().setPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name()).build(); - } - LOG.info("Total Partitions to clean : " + partitionsToClean.size() + ", with policy " + config.getCleanerPolicy()); - int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); - LOG.info("Using cleanerParallelism: " + cleanerParallelism); - - context.setJobStatus(this.getClass().getSimpleName(), "Generates list of file slices to be cleaned"); - - Map> cleanOps = context - .map(partitionsToClean, partitionPathToClean -> Pair.of(partitionPathToClean, planner.getDeletePaths(partitionPathToClean)), cleanerParallelism) - .stream() - .collect(Collectors.toMap(Pair::getKey, y -> CleanerUtils.convertToHoodieCleanFileInfoList(y.getValue()))); - - return new HoodieCleanerPlan(earliestInstant - .map(x -> new HoodieActionInstant(x.getTimestamp(), x.getAction(), x.getState().name())).orElse(null), - config.getCleanerPolicy().name(), CollectionUtils.createImmutableMap(), - CleanPlanner.LATEST_CLEAN_PLAN_VERSION, cleanOps); - } catch (IOException e) { - throw new HoodieIOException("Failed to schedule clean operation", e); - } - } - - protected static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException { - Path deletePath = new Path(deletePathStr); - LOG.debug("Working on delete path :" + deletePath); - try { - boolean deleteResult = fs.delete(deletePath, false); - if (deleteResult) { - LOG.debug("Cleaned file at path :" + deletePath); - } - return deleteResult; - } catch (FileNotFoundException fio) { - // With cleanPlan being used for retried cleaning operations, its possible to clean a file twice - return false; - } - } - - /** - * Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles - * skews in partitions to clean by making files to clean as the unit of task distribution. - * - * @throws IllegalArgumentException if unknown cleaning policy is provided - */ - abstract List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan); - - /** - * Creates a Cleaner plan if there are files to be cleaned and stores them in instant file. - * Cleaner Plan contains absolute file paths. 
- * - * @param startCleanTime Cleaner Instant Time - * @return Cleaner Plan if generated - */ - Option requestClean(String startCleanTime) { - final HoodieCleanerPlan cleanerPlan = requestClean(context); - if ((cleanerPlan.getFilePathsToBeDeletedPerPartition() != null) - && !cleanerPlan.getFilePathsToBeDeletedPerPartition().isEmpty() - && cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).sum() > 0) { - // Only create cleaner plan which does some work - final HoodieInstant cleanInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.CLEAN_ACTION, startCleanTime); - // Save to both aux and timeline folder - try { - table.getActiveTimeline().saveToCleanRequested(cleanInstant, TimelineMetadataUtils.serializeCleanerPlan(cleanerPlan)); - LOG.info("Requesting Cleaning with instant time " + cleanInstant); - } catch (IOException e) { - LOG.error("Got exception when saving cleaner requested file", e); - throw new HoodieIOException(e.getMessage(), e); - } - return Option.of(cleanerPlan); - } - return Option.empty(); - } - - /** - * Executes the Cleaner plan stored in the instant metadata. - */ - void runPendingClean(HoodieTable table, HoodieInstant cleanInstant) { - try { - HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(table.getMetaClient(), cleanInstant); - runClean(table, cleanInstant, cleanerPlan); - } catch (IOException e) { - throw new HoodieIOException(e.getMessage(), e); - } - } - - private HoodieCleanMetadata runClean(HoodieTable table, HoodieInstant cleanInstant, HoodieCleanerPlan cleanerPlan) { - ValidationUtils.checkArgument(cleanInstant.getState().equals(HoodieInstant.State.REQUESTED) - || cleanInstant.getState().equals(HoodieInstant.State.INFLIGHT)); - - try { - final HoodieInstant inflightInstant; - final HoodieTimer timer = new HoodieTimer(); - timer.startTimer(); - if (cleanInstant.isRequested()) { - inflightInstant = table.getActiveTimeline().transitionCleanRequestedToInflight(cleanInstant, - TimelineMetadataUtils.serializeCleanerPlan(cleanerPlan)); - } else { - inflightInstant = cleanInstant; - } - - List cleanStats = clean(context, cleanerPlan); - if (cleanStats.isEmpty()) { - return HoodieCleanMetadata.newBuilder().build(); - } - - table.getMetaClient().reloadActiveTimeline(); - HoodieCleanMetadata metadata = CleanerUtils.convertCleanMetadata( - inflightInstant.getTimestamp(), - Option.of(timer.endTimer()), - cleanStats - ); - - table.getActiveTimeline().transitionCleanInflightToComplete(inflightInstant, - TimelineMetadataUtils.serializeCleanMetadata(metadata)); - LOG.info("Marked clean started on " + inflightInstant.getTimestamp() + " as complete"); - return metadata; - } catch (IOException e) { - throw new HoodieIOException("Failed to clean up after commit", e); - } - } - - @Override - public HoodieCleanMetadata execute() { - // If there are inflight(failed) or previously requested clean operation, first perform them - List pendingCleanInstants = table.getCleanTimeline() - .filterInflightsAndRequested().getInstants().collect(Collectors.toList()); - if (pendingCleanInstants.size() > 0) { - pendingCleanInstants.forEach(hoodieInstant -> { - LOG.info("Finishing previously unfinished cleaner instant=" + hoodieInstant); - try { - runPendingClean(table, hoodieInstant); - } catch (Exception e) { - LOG.warn("Failed to perform previous clean operation, instant: " + hoodieInstant, e); - } - }); - table.getMetaClient().reloadActiveTimeline(); - } - - // Plan and execute a new clean action - Option cleanerPlanOpt = 
requestClean(instantTime); - if (cleanerPlanOpt.isPresent()) { - table.getMetaClient().reloadActiveTimeline(); - HoodieCleanerPlan cleanerPlan = cleanerPlanOpt.get(); - if ((cleanerPlan.getFilePathsToBeDeletedPerPartition() != null) && !cleanerPlan.getFilePathsToBeDeletedPerPartition().isEmpty()) { - return runClean(table, HoodieTimeline.getCleanRequestedInstant(instantTime), cleanerPlan); - } - } - return null; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java new file mode 100644 index 0000000000000..56b01ec77b62b --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.clean; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import org.apache.hudi.avro.model.HoodieActionInstant; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.client.transaction.TransactionManager; +import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.CleanFileInfo; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.CleanerUtils; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class CleanActionExecutor extends BaseActionExecutor { + + private static final long serialVersionUID = 1L; + private static final Logger LOG = LogManager.getLogger(CleanActionExecutor.class); + private final TransactionManager txnManager; 
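Orientation note, illustrative only: the TransactionManager declared here guards the metadata-table update further down in runClean. A minimal sketch of that guard pattern, with a Runnable standing in for the metadata write and timeline transition:

import org.apache.hudi.client.transaction.TransactionManager;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;

class CleanTxnGuardSketch {
  // Mirrors the begin/end transaction calls used in runClean; skipLocking bypasses the lock
  // when the caller already holds it.
  static void guarded(TransactionManager txnManager, boolean skipLocking,
                      Option<HoodieInstant> inflightInstant, Runnable metadataWrite) {
    try {
      if (!skipLocking) {
        txnManager.beginTransaction(inflightInstant, Option.empty());
      }
      metadataWrite.run();
    } finally {
      if (!skipLocking) {
        txnManager.endTransaction(inflightInstant);
      }
    }
  }
}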
+ private final boolean skipLocking; + + public CleanActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime) { + this(context, config, table, instantTime, false); + } + + public CleanActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, boolean skipLocking) { + super(context, config, table, instantTime); + this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); + this.skipLocking = skipLocking; + } + + private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr) throws IOException { + Path deletePath = new Path(deletePathStr); + LOG.debug("Working on delete path :" + deletePath); + try { + boolean isDirectory = fs.isDirectory(deletePath); + boolean deleteResult = fs.delete(deletePath, isDirectory); + if (deleteResult) { + LOG.debug("Cleaned file at path :" + deletePath); + } + return deleteResult; + } catch (FileNotFoundException fio) { + // With cleanPlan being used for retried cleaning operations, its possible to clean a file twice + return false; + } + } + + private static Stream> deleteFilesFunc(Iterator> cleanFileInfo, HoodieTable table) { + Map partitionCleanStatMap = new HashMap<>(); + FileSystem fs = table.getMetaClient().getFs(); + + cleanFileInfo.forEachRemaining(partitionDelFileTuple -> { + String partitionPath = partitionDelFileTuple.getLeft(); + Path deletePath = new Path(partitionDelFileTuple.getRight().getFilePath()); + String deletePathStr = deletePath.toString(); + Boolean deletedFileResult = null; + try { + deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); + + } catch (IOException e) { + LOG.error("Delete file failed: " + deletePathStr); + } + final PartitionCleanStat partitionCleanStat = + partitionCleanStatMap.computeIfAbsent(partitionPath, k -> new PartitionCleanStat(partitionPath)); + boolean isBootstrapBasePathFile = partitionDelFileTuple.getRight().isBootstrapBaseFile(); + + if (isBootstrapBasePathFile) { + // For Bootstrap Base file deletions, store the full file path. + partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true); + partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true); + } else { + partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false); + partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false); + } + }); + return partitionCleanStatMap.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())); + } + + /** + * Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles + * skews in partitions to clean by making files to clean as the unit of task distribution. 
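The "files as the unit of task distribution" point above can be pictured with a small engine-agnostic sketch (hypothetical types; the executor itself uses context.mapPartitionsToPairAndReduceByKey with PartitionCleanStat::merge):

import java.util.HashMap;
import java.util.List;
import java.util.Map;

class CleanSkewSketch {
  // Each work item is one (partitionPath, filePath) pair, so a partition with many files to
  // delete is spread across tasks; results are then reduced back to per-partition stats.
  static Map<String, Long> deletedPerPartition(List<Map.Entry<String, String>> partitionFilePairs) {
    Map<String, Long> deleteCounts = new HashMap<>();
    for (Map.Entry<String, String> pair : partitionFilePairs) {
      // the real code calls fs.delete(filePath) here and records success or failure per pattern
      deleteCounts.merge(pair.getKey(), 1L, Long::sum); // reduce-by-key step
    }
    return deleteCounts;
  }
}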
+ * + * @throws IllegalArgumentException if unknown cleaning policy is provided + */ + List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) { + int cleanerParallelism = Math.min( + (int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()), + config.getCleanerParallelism()); + LOG.info("Using cleanerParallelism: " + cleanerParallelism); + + context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions: " + config.getTableName()); + + Stream> filesToBeDeletedPerPartition = + cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream() + .flatMap(x -> x.getValue().stream().map(y -> new ImmutablePair<>(x.getKey(), + new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile())))); + + Stream> partitionCleanStats = + context.mapPartitionsToPairAndReduceByKey(filesToBeDeletedPerPartition, + iterator -> deleteFilesFunc(iterator, table), PartitionCleanStat::merge, cleanerParallelism); + + Map partitionCleanStatsMap = partitionCleanStats + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + + List partitionsToBeDeleted = cleanerPlan.getPartitionsToBeDeleted() != null ? cleanerPlan.getPartitionsToBeDeleted() : new ArrayList<>(); + partitionsToBeDeleted.forEach(entry -> { + try { + deleteFileAndGetResult(table.getMetaClient().getFs(), table.getMetaClient().getBasePath() + "/" + entry); + } catch (IOException e) { + LOG.warn("Partition deletion failed " + entry); + } + }); + + // Return PartitionCleanStat for each partition passed. + return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> { + PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath) + ? partitionCleanStatsMap.get(partitionPath) + : new PartitionCleanStat(partitionPath); + HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain(); + return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath) + .withEarliestCommitRetained(Option.ofNullable( + actionInstant != null + ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), + actionInstant.getAction(), actionInstant.getTimestamp()) + : null)) + .withLastCompletedCommitTimestamp(cleanerPlan.getLastCompletedCommitTimestamp()) + .withDeletePathPattern(partitionCleanStat.deletePathPatterns()) + .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles()) + .withFailedDeletes(partitionCleanStat.failedDeleteFiles()) + .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns()) + .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles()) + .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles()) + .isPartitionDeleted(partitionsToBeDeleted.contains(partitionPath)) + .build(); + }).collect(Collectors.toList()); + } + + + /** + * Executes the Cleaner plan stored in the instant metadata. 
+ */ + HoodieCleanMetadata runPendingClean(HoodieTable table, HoodieInstant cleanInstant) { + try { + HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(table.getMetaClient(), cleanInstant); + return runClean(table, cleanInstant, cleanerPlan); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + } + + private HoodieCleanMetadata runClean(HoodieTable table, HoodieInstant cleanInstant, HoodieCleanerPlan cleanerPlan) { + ValidationUtils.checkArgument(cleanInstant.getState().equals(HoodieInstant.State.REQUESTED) + || cleanInstant.getState().equals(HoodieInstant.State.INFLIGHT)); + + HoodieInstant inflightInstant = null; + try { + final HoodieTimer timer = new HoodieTimer(); + timer.startTimer(); + if (cleanInstant.isRequested()) { + inflightInstant = table.getActiveTimeline().transitionCleanRequestedToInflight(cleanInstant, + TimelineMetadataUtils.serializeCleanerPlan(cleanerPlan)); + } else { + inflightInstant = cleanInstant; + } + + List cleanStats = clean(context, cleanerPlan); + if (cleanStats.isEmpty()) { + return HoodieCleanMetadata.newBuilder().build(); + } + + table.getMetaClient().reloadActiveTimeline(); + HoodieCleanMetadata metadata = CleanerUtils.convertCleanMetadata( + inflightInstant.getTimestamp(), + Option.of(timer.endTimer()), + cleanStats + ); + if (!skipLocking) { + this.txnManager.beginTransaction(Option.of(inflightInstant), Option.empty()); + } + writeTableMetadata(metadata, inflightInstant.getTimestamp()); + table.getActiveTimeline().transitionCleanInflightToComplete(inflightInstant, + TimelineMetadataUtils.serializeCleanMetadata(metadata)); + LOG.info("Marked clean started on " + inflightInstant.getTimestamp() + " as complete"); + return metadata; + } catch (IOException e) { + throw new HoodieIOException("Failed to clean up after commit", e); + } finally { + if (!skipLocking) { + this.txnManager.endTransaction(Option.of(inflightInstant)); + } + } + } + + @Override + public HoodieCleanMetadata execute() { + List cleanMetadataList = new ArrayList<>(); + // If there are inflight(failed) or previously requested clean operation, first perform them + List pendingCleanInstants = table.getCleanTimeline() + .filterInflightsAndRequested().getInstants().collect(Collectors.toList()); + if (pendingCleanInstants.size() > 0) { + // try to clean old history schema. + try { + FileBasedInternalSchemaStorageManager fss = new FileBasedInternalSchemaStorageManager(table.getMetaClient()); + fss.cleanOldFiles(pendingCleanInstants.stream().map(is -> is.getTimestamp()).collect(Collectors.toList())); + } catch (Exception e) { + // we should not affect original clean logic. Swallow exception and log warn. 
+ LOG.warn("failed to clean old history schema"); + } + pendingCleanInstants.forEach(hoodieInstant -> { + if (table.getCleanTimeline().isEmpty(hoodieInstant)) { + table.getActiveTimeline().deleteEmptyInstantIfExists(hoodieInstant); + } else { + LOG.info("Finishing previously unfinished cleaner instant=" + hoodieInstant); + try { + cleanMetadataList.add(runPendingClean(table, hoodieInstant)); + } catch (Exception e) { + LOG.warn("Failed to perform previous clean operation, instant: " + hoodieInstant, e); + } + } + table.getMetaClient().reloadActiveTimeline(); + if (config.isMetadataTableEnabled()) { + table.getHoodieView().sync(); + } + }); + } + + // return the last clean metadata for now + // TODO (NA) : Clean only the earliest pending clean just like how we do for other table services + // This requires the CleanActionExecutor to be refactored as BaseCommitActionExecutor + return cleanMetadataList.size() > 0 ? cleanMetadataList.get(cleanMetadataList.size() - 1) : null; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java new file mode 100644 index 0000000000000..7f3b437178fd4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.clean; + +import org.apache.hudi.avro.model.HoodieActionInstant; +import org.apache.hudi.avro.model.HoodieCleanFileInfo; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.CleanFileInfo; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.CleanerUtils; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class CleanPlanActionExecutor extends BaseActionExecutor> { + + private static final Logger LOG = LogManager.getLogger(CleanPlanner.class); + + private final Option> extraMetadata; + + public CleanPlanActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + Option> extraMetadata) { + super(context, config, table, instantTime); + this.extraMetadata = extraMetadata; + } + + private int getCommitsSinceLastCleaning() { + Option lastCleanInstant = table.getActiveTimeline().getCleanerTimeline().filterCompletedInstants().lastInstant(); + HoodieTimeline commitTimeline = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + + int numCommits; + if (lastCleanInstant.isPresent() && !table.getActiveTimeline().isEmpty(lastCleanInstant.get())) { + try { + HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils + .deserializeHoodieCleanMetadata(table.getActiveTimeline().getInstantDetails(lastCleanInstant.get()).get()); + String lastCompletedCommitTimestamp = cleanMetadata.getLastCompletedCommitTimestamp(); + numCommits = commitTimeline.findInstantsAfter(lastCompletedCommitTimestamp).countInstants(); + } catch (IOException e) { + throw new HoodieIOException("Parsing of last clean instant " + lastCleanInstant.get() + " failed", e); + } + } else { + numCommits = commitTimeline.countInstants(); + } + + return numCommits; + } + + private boolean needsCleaning(CleaningTriggerStrategy strategy) { + if (strategy == CleaningTriggerStrategy.NUM_COMMITS) { + int numberOfCommits = getCommitsSinceLastCleaning(); + int maxInlineCommitsForNextClean = config.getCleaningMaxCommits(); + return numberOfCommits >= maxInlineCommitsForNextClean; + } else { + throw new HoodieException("Unsupported cleaning trigger strategy: " + config.getCleaningTriggerStrategy()); + } + } + + /** + * Generates List of files to be cleaned. 
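The NUM_COMMITS trigger evaluated just above boils down to a single comparison; a tiny sketch (the threshold mirrors config.getCleaningMaxCommits() used in this file):

class CleaningTriggerSketch {
  // Clean only once enough commits have completed since the last successful clean.
  static boolean shouldTriggerClean(int commitsSinceLastClean, int maxCommitsBeforeClean) {
    return commitsSinceLastClean >= maxCommitsBeforeClean;
  }
}

For example, with the threshold set to 4, three new commits since the last clean do not trigger a new one, while the fourth does.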
+ * + * @param context HoodieEngineContext + * @return Cleaner Plan + */ + HoodieCleanerPlan requestClean(HoodieEngineContext context) { + try { + CleanPlanner planner = new CleanPlanner<>(context, table, config); + Option earliestInstant = planner.getEarliestCommitToRetain(); + context.setJobStatus(this.getClass().getSimpleName(), "Obtaining list of partitions to be cleaned: " + config.getTableName()); + List partitionsToClean = planner.getPartitionPathsToClean(earliestInstant); + + if (partitionsToClean.isEmpty()) { + LOG.info("Nothing to clean here. It is already clean"); + return HoodieCleanerPlan.newBuilder().setPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name()).build(); + } + LOG.info("Total Partitions to clean : " + partitionsToClean.size() + ", with policy " + config.getCleanerPolicy()); + int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); + LOG.info("Using cleanerParallelism: " + cleanerParallelism); + + context.setJobStatus(this.getClass().getSimpleName(), "Generating list of file slices to be cleaned: " + config.getTableName()); + + Map>> cleanOpsWithPartitionMeta = context + .map(partitionsToClean, partitionPathToClean -> Pair.of(partitionPathToClean, planner.getDeletePaths(partitionPathToClean)), cleanerParallelism) + .stream() + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + + Map> cleanOps = cleanOpsWithPartitionMeta.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, + e -> CleanerUtils.convertToHoodieCleanFileInfoList(e.getValue().getValue()))); + + List partitionsToDelete = cleanOpsWithPartitionMeta.entrySet().stream().filter(entry -> entry.getValue().getKey()).map(Map.Entry::getKey) + .collect(Collectors.toList()); + + return new HoodieCleanerPlan(earliestInstant + .map(x -> new HoodieActionInstant(x.getTimestamp(), x.getAction(), x.getState().name())).orElse(null), + planner.getLastCompletedCommitTimestamp(), + config.getCleanerPolicy().name(), CollectionUtils.createImmutableMap(), + CleanPlanner.LATEST_CLEAN_PLAN_VERSION, cleanOps, partitionsToDelete); + } catch (IOException e) { + throw new HoodieIOException("Failed to schedule clean operation", e); + } + } + + /** + * Creates a Cleaner plan if there are files to be cleaned and stores them in instant file. + * Cleaner Plan contains absolute file paths. 
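Aside, illustrative only: the partitionsToDelete list built in requestClean above is simply the set of partitions whose planner flag came back true; a stand-alone version of that filter:

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

class PartitionsToDeleteSketch {
  // Keep only partitions the planner marked as fully cleanable (no surviving file groups).
  static List<String> partitionsToDelete(Map<String, Boolean> partitionToDeleteFlag) {
    return partitionToDeleteFlag.entrySet().stream()
        .filter(Map.Entry::getValue)
        .map(Map.Entry::getKey)
        .collect(Collectors.toList());
  }
}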
+ * + * @param startCleanTime Cleaner Instant Time + * @return Cleaner Plan if generated + */ + protected Option requestClean(String startCleanTime) { + final HoodieCleanerPlan cleanerPlan = requestClean(context); + if ((cleanerPlan.getFilePathsToBeDeletedPerPartition() != null) + && !cleanerPlan.getFilePathsToBeDeletedPerPartition().isEmpty() + && cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).sum() > 0) { + // Only create cleaner plan which does some work + final HoodieInstant cleanInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.CLEAN_ACTION, startCleanTime); + // Save to both aux and timeline folder + try { + table.getActiveTimeline().saveToCleanRequested(cleanInstant, TimelineMetadataUtils.serializeCleanerPlan(cleanerPlan)); + LOG.info("Requesting Cleaning with instant time " + cleanInstant); + } catch (IOException e) { + LOG.error("Got exception when saving cleaner requested file", e); + throw new HoodieIOException(e.getMessage(), e); + } + return Option.of(cleanerPlan); + } + return Option.empty(); + } + + @Override + public Option execute() { + if (!needsCleaning(config.getCleaningTriggerStrategy())) { + return Option.empty(); + } + // Plan a new clean action + return requestClean(instantTime); + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java index 405fc81b7a0c8..64e69b1d2a9bd 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java @@ -20,7 +20,7 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieSavepointMetadata; -import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.CleanFileInfo; import org.apache.hudi.common.model.CompactionOperation; import org.apache.hudi.common.model.FileSlice; @@ -30,7 +30,9 @@ import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; @@ -42,14 +44,20 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieSavepointException; +import org.apache.hudi.metadata.FileSystemBackedTableMetadata; import org.apache.hudi.table.HoodieTable; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; import java.io.Serializable; +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.Collections; +import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -76,11 +84,13 @@ public class CleanPlanner implements Ser private final Map fgIdToPendingCompactionOperations; private HoodieTable hoodieTable; private HoodieWriteConfig config; + private 
transient HoodieEngineContext context; - public CleanPlanner(HoodieTable hoodieTable, HoodieWriteConfig config) { + public CleanPlanner(HoodieEngineContext context, HoodieTable hoodieTable, HoodieWriteConfig config) { + this.context = context; this.hoodieTable = hoodieTable; this.fileSystemView = hoodieTable.getHoodieView(); - this.commitTimeline = hoodieTable.getCompletedCommitTimeline(); + this.commitTimeline = hoodieTable.getCompletedCommitsTimeline(); this.config = config; this.fgIdToPendingCompactionOperations = ((SyncableFileSystemView) hoodieTable.getSliceView()).getPendingCompactionOperations() @@ -94,7 +104,7 @@ public CleanPlanner(HoodieTable hoodieTable, HoodieWriteConfig confi * Get the list of data file names savepointed. */ public Stream getSavepointedDataFiles(String savepointTime) { - if (!hoodieTable.getSavepoints().contains(savepointTime)) { + if (!hoodieTable.getSavepointTimestamps().contains(savepointTime)) { throw new HoodieSavepointException( "Could not get data files for savepoint " + savepointTime + ". No such savepoint."); } @@ -112,14 +122,15 @@ public Stream getSavepointedDataFiles(String savepointTime) { /** * Returns list of partitions where clean operations needs to be performed. * - * @param newInstantToRetain New instant to be retained after this cleanup operation + * @param earliestRetainedInstant New instant to be retained after this cleanup operation * @return list of partitions to scan for cleaning * @throws IOException when underlying file-system throws this exception */ - public List getPartitionPathsToClean(Option newInstantToRetain) throws IOException { + public List getPartitionPathsToClean(Option earliestRetainedInstant) throws IOException { switch (config.getCleanerPolicy()) { case KEEP_LATEST_COMMITS: - return getPartitionPathsForCleanByCommits(newInstantToRetain); + case KEEP_LATEST_BY_HOURS: + return getPartitionPathsForCleanByCommits(earliestRetainedInstant); case KEEP_LATEST_FILE_VERSIONS: return getPartitionPathsForFullCleaning(); default: @@ -142,11 +153,15 @@ private List getPartitionPathsForCleanByCommits(Option in if (config.incrementalCleanerModeEnabled()) { Option lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant(); if (lastClean.isPresent()) { - HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils - .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get()); - if ((cleanMetadata.getEarliestCommitToRetain() != null) - && (cleanMetadata.getEarliestCommitToRetain().length() > 0)) { - return getPartitionPathsForIncrementalCleaning(cleanMetadata, instantToRetain); + if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) { + hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get()); + } else { + HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils + .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get()); + if ((cleanMetadata.getEarliestCommitToRetain() != null) + && (cleanMetadata.getEarliestCommitToRetain().length() > 0)) { + return getPartitionPathsForIncrementalCleaning(cleanMetadata, instantToRetain); + } } } } @@ -161,7 +176,7 @@ private List getPartitionPathsForCleanByCommits(Option in */ private List getPartitionPathsForIncrementalCleaning(HoodieCleanMetadata cleanMetadata, Option newInstantToRetain) { - LOG.warn("Incremental Cleaning mode is enabled. Looking up partition-paths that have since changed " + LOG.info("Incremental Cleaning mode is enabled. 
Looking up partition-paths that have since changed " + "since last cleaned at " + cleanMetadata.getEarliestCommitToRetain() + ". New Instant to retain : " + newInstantToRetain); return hoodieTable.getCompletedCommitsTimeline().getInstants().filter( @@ -169,10 +184,16 @@ private List getPartitionPathsForIncrementalCleaning(HoodieCleanMetadata cleanMetadata.getEarliestCommitToRetain()) && HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN, newInstantToRetain.get().getTimestamp())).flatMap(instant -> { try { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(hoodieTable.getActiveTimeline().getInstantDetails(instant).get(), - HoodieCommitMetadata.class); - return commitMetadata.getPartitionToWriteStats().keySet().stream(); + if (HoodieTimeline.REPLACE_COMMIT_ACTION.equals(instant.getAction())) { + HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata.fromBytes( + hoodieTable.getActiveTimeline().getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class); + return Stream.concat(replaceCommitMetadata.getPartitionToReplaceFileIds().keySet().stream(), replaceCommitMetadata.getPartitionToWriteStats().keySet().stream()); + } else { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(hoodieTable.getActiveTimeline().getInstantDetails(instant).get(), + HoodieCommitMetadata.class); + return commitMetadata.getPartitionToWriteStats().keySet().stream(); + } } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -180,14 +201,20 @@ private List getPartitionPathsForIncrementalCleaning(HoodieCleanMetadata } /** - * Scan and list all paritions for cleaning. + * Scan and list all partitions for cleaning. * @return all partitions paths for the dataset. - * @throws IOException */ - private List getPartitionPathsForFullCleaning() throws IOException { + private List getPartitionPathsForFullCleaning() { // Go to brute force mode of scanning all partitions - return FSUtils.getAllPartitionPaths(hoodieTable.getMetaClient().getFs(), hoodieTable.getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning()); + try { + // Because the partition of BaseTableMetadata has been deleted, + // all partition information can only be obtained from FileSystemBackedTableMetadata. + FileSystemBackedTableMetadata fsBackedTableMetadata = new FileSystemBackedTableMetadata(context, + context.getHadoopConf(), config.getBasePath(), config.shouldAssumeDatePartitioning()); + return fsBackedTableMetadata.getAllPartitionPaths(); + } catch (IOException e) { + return Collections.emptyList(); + } } /** @@ -195,16 +222,20 @@ private List getPartitionPathsForFullCleaning() throws IOException { * policy is useful, if you are simply interested in querying the table, and you don't want too many versions for a * single file (i.e run it with versionsRetained = 1) */ - private List getFilesToCleanKeepingLatestVersions(String partitionPath) { + private Pair> getFilesToCleanKeepingLatestVersions(String partitionPath) { LOG.info("Cleaning " + partitionPath + ", retaining latest " + config.getCleanerFileVersionsRetained() + " file versions. 
"); - List fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList()); List deletePaths = new ArrayList<>(); // Collect all the datafiles savepointed by all the savepoints - List savepointedFiles = hoodieTable.getSavepoints().stream() + List savepointedFiles = hoodieTable.getSavepointTimestamps().stream() .flatMap(this::getSavepointedDataFiles) .collect(Collectors.toList()); + // In this scenario, we will assume that once replaced a file group automatically becomes eligible for cleaning completely + // In other words, the file versions only apply to the active file groups. + deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, Option.empty())); + boolean toDeletePartition = false; + List fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList()); for (HoodieFileGroup fileGroup : fileGroups) { int keepVersions = config.getCleanerFileVersionsRetained(); // do not cleanup slice required for pending compaction @@ -217,32 +248,29 @@ private List getFilesToCleanKeepingLatestVersions(String partitio while (fileSliceIterator.hasNext() && keepVersions > 0) { // Skip this most recent version - FileSlice nextSlice = fileSliceIterator.next(); - Option dataFile = nextSlice.getBaseFile(); - if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) { - // do not clean up a savepoint data file - continue; - } + fileSliceIterator.next(); keepVersions--; } // Delete the remaining files while (fileSliceIterator.hasNext()) { FileSlice nextSlice = fileSliceIterator.next(); - if (nextSlice.getBaseFile().isPresent()) { - HoodieBaseFile dataFile = nextSlice.getBaseFile().get(); - deletePaths.add(new CleanFileInfo(dataFile.getPath(), false)); - if (dataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) { - deletePaths.add(new CleanFileInfo(dataFile.getBootstrapBaseFile().get().getPath(), true)); - } - } - if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { - // If merge on read, then clean the log files for the commits as well - deletePaths.addAll(nextSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false)) - .collect(Collectors.toList())); + Option dataFile = nextSlice.getBaseFile(); + if (dataFile.isPresent() && savepointedFiles.contains(dataFile.get().getFileName())) { + // do not clean up a savepoint data file + continue; } + deletePaths.addAll(getCleanFileInfoForSlice(nextSlice)); } } - return deletePaths; + // if there are no valid file groups for the partition, mark it to be deleted + if (fileGroups.isEmpty()) { + toDeletePartition = true; + } + return Pair.of(toDeletePartition, deletePaths); + } + + private Pair> getFilesToCleanKeepingLatestCommits(String partitionPath) { + return getFilesToCleanKeepingLatestCommits(partitionPath, config.getCleanerCommitsRetained(), HoodieCleaningPolicy.KEEP_LATEST_COMMITS); } /** @@ -258,20 +286,27 @@ private List getFilesToCleanKeepingLatestVersions(String partitio * retain 10 commits, and commit batch time is 30 mins, then you have 5 hrs of lookback) *

* This policy is the default. + * + * @return A {@link Pair} whose left is boolean indicating whether partition itself needs to be deleted, + * and right is a list of {@link CleanFileInfo} about the files in the partition that needs to be deleted. */ - private List getFilesToCleanKeepingLatestCommits(String partitionPath) { - int commitsRetained = config.getCleanerCommitsRetained(); + private Pair> getFilesToCleanKeepingLatestCommits(String partitionPath, int commitsRetained, HoodieCleaningPolicy policy) { LOG.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); List deletePaths = new ArrayList<>(); // Collect all the datafiles savepointed by all the savepoints - List savepointedFiles = hoodieTable.getSavepoints().stream() + List savepointedFiles = hoodieTable.getSavepointTimestamps().stream() .flatMap(this::getSavepointedDataFiles) .collect(Collectors.toList()); // determine if we have enough commits, to start cleaning. + boolean toDeletePartition = false; if (commitTimeline.countInstants() > commitsRetained) { - HoodieInstant earliestCommitToRetain = getEarliestCommitToRetain().get(); + Option earliestCommitToRetainOption = getEarliestCommitToRetain(); + HoodieInstant earliestCommitToRetain = earliestCommitToRetainOption.get(); + // all replaced file groups before earliestCommitToRetain are eligible to clean + deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, earliestCommitToRetainOption)); + // add active files List fileGroups = fileSystemView.getAllFileGroups(partitionPath).collect(Collectors.toList()); for (HoodieFileGroup fileGroup : fileGroups) { List fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList()); @@ -293,14 +328,24 @@ private List getFilesToCleanKeepingLatestCommits(String partition // do not clean up a savepoint data file continue; } - // Dont delete the latest commit and also the last commit before the earliest commit we - // are retaining - // The window of commit retain == max query run time. So a query could be running which - // still - // uses this file. - if (fileCommitTime.equals(lastVersion) || (fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) { - // move on to the next file - continue; + + if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) { + // Dont delete the latest commit and also the last commit before the earliest commit we + // are retaining + // The window of commit retain == max query run time. So a query could be running which + // still + // uses this file. + if (fileCommitTime.equals(lastVersion) || (fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) { + // move on to the next file + continue; + } + } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) { + // This block corresponds to KEEP_LATEST_BY_HOURS policy + // Do not delete the latest commit. + if (fileCommitTime.equals(lastVersion)) { + // move on to the next file + continue; + } } // Always keep the last commit @@ -314,15 +359,46 @@ private List getFilesToCleanKeepingLatestCommits(String partition } }); if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { - // If merge on read, then clean the log files for the commits as well + // 1. If merge on read, then clean the log files for the commits as well; + // 2. If change log capture is enabled, clean the log files no matter the table type is mor or cow. 
deletePaths.addAll(aSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false)) .collect(Collectors.toList())); } } } } + // if there are no valid file groups for the partition, mark it to be deleted + if (fileGroups.isEmpty()) { + toDeletePartition = true; + } } - return deletePaths; + return Pair.of(toDeletePartition, deletePaths); + } + + /** + * This method finds the files to be cleaned based on the number of hours. If {@code config.getCleanerHoursRetained()} is set to 5, + * all the files with commit time earlier than 5 hours will be removed. Also the latest file for any file group is retained. + * This policy gives much more flexibility to users for retaining data for running incremental queries as compared to + * KEEP_LATEST_COMMITS cleaning policy. The default number of hours is 5. + * @param partitionPath partition path to check + * @return list of files to clean + */ + private Pair> getFilesToCleanKeepingLatestHours(String partitionPath) { + return getFilesToCleanKeepingLatestCommits(partitionPath, 0, HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS); + } + + private List getReplacedFilesEligibleToClean(List savepointedFiles, String partitionPath, Option earliestCommitToRetain) { + final Stream replacedGroups; + if (earliestCommitToRetain.isPresent()) { + replacedGroups = fileSystemView.getReplacedFileGroupsBefore(earliestCommitToRetain.get().getTimestamp(), partitionPath); + } else { + replacedGroups = fileSystemView.getAllReplacedFileGroups(partitionPath); + } + return replacedGroups.flatMap(HoodieFileGroup::getAllFileSlices) + // do not delete savepointed files (archival will make sure corresponding replacecommit file is not deleted) + .filter(slice -> !slice.getBaseFile().isPresent() || !savepointedFiles.contains(slice.getBaseFile().get().getFileName())) + .flatMap(slice -> getCleanFileInfoForSlice(slice).stream()) + .collect(Collectors.toList()); } /** @@ -341,21 +417,42 @@ private String getLatestVersionBeforeCommit(List fileSliceList, Hoodi return null; } + private List getCleanFileInfoForSlice(FileSlice nextSlice) { + List cleanPaths = new ArrayList<>(); + if (nextSlice.getBaseFile().isPresent()) { + HoodieBaseFile dataFile = nextSlice.getBaseFile().get(); + cleanPaths.add(new CleanFileInfo(dataFile.getPath(), false)); + if (dataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) { + cleanPaths.add(new CleanFileInfo(dataFile.getBootstrapBaseFile().get().getPath(), true)); + } + } + if (hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { + // If merge on read, then clean the log files for the commits as well + cleanPaths.addAll(nextSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false)) + .collect(Collectors.toList())); + } + return cleanPaths; + } + /** * Returns files to be cleaned for the given partitionPath based on cleaning policy. 
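/*
 * A simplified stand-alone model (plain Java) of getCleanFileInfoForSlice above. The Optional
 * and List parameters stand in for FileSlice/HoodieBaseFile accessors; the real CleanFileInfo
 * additionally records whether a path is a bootstrap base path.
 */
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

final class CleanFileInfoSketch {

  static List<String> pathsToClean(Optional<String> basePath,
                                   Optional<String> bootstrapBasePath,
                                   List<String> logPaths,
                                   boolean mergeOnRead,
                                   boolean cleanBootstrapBase) {
    List<String> paths = new ArrayList<>();
    basePath.ifPresent(paths::add);
    if (cleanBootstrapBase) {
      bootstrapBasePath.ifPresent(paths::add);
    }
    if (mergeOnRead) {
      paths.addAll(logPaths); // log files are only cleaned for merge-on-read tables
    }
    return paths;
  }
}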
*/ - public List getDeletePaths(String partitionPath) { + public Pair> getDeletePaths(String partitionPath) { HoodieCleaningPolicy policy = config.getCleanerPolicy(); - List deletePaths; + Pair> deletePaths; if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) { deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath); } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) { deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath); + } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) { + deletePaths = getFilesToCleanKeepingLatestHours(partitionPath); } else { throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name()); } - LOG.info(deletePaths.size() + " patterns used to delete in partition path:" + partitionPath); - + LOG.info(deletePaths.getValue().size() + " patterns used to delete in partition path:" + partitionPath); + if (deletePaths.getKey()) { + LOG.info("Partition " + partitionPath + " to be deleted"); + } return deletePaths; } @@ -365,16 +462,34 @@ public List getDeletePaths(String partitionPath) { public Option getEarliestCommitToRetain() { Option earliestCommitToRetain = Option.empty(); int commitsRetained = config.getCleanerCommitsRetained(); + int hoursRetained = config.getCleanerHoursRetained(); if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS && commitTimeline.countInstants() > commitsRetained) { - earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); + earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); //15 instants total, 10 commits to retain, this gives 6th instant in the list + } else if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) { + Instant instant = Instant.now(); + ZonedDateTime currentDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault()); + String earliestTimeToRetain = HoodieActiveTimeline.formatDate(Date.from(currentDateTime.minusHours(hoursRetained).toInstant())); + earliestCommitToRetain = Option.fromJavaOptional(commitTimeline.getInstants().filter(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), + HoodieTimeline.GREATER_THAN_OR_EQUALS, earliestTimeToRetain)).findFirst()); } return earliestCommitToRetain; } + /** + * Returns the last completed commit timestamp before clean. + */ + public String getLastCompletedCommitTimestamp() { + if (commitTimeline.lastInstant().isPresent()) { + return commitTimeline.lastInstant().get().getTimestamp(); + } else { + return ""; + } + } + /** * Determine if file slice needed to be preserved for pending compaction. - * + * * @param fileSlice File Slice * @return true if file slice needs to be preserved, false otherwise. */ diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleaningTriggerStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleaningTriggerStrategy.java new file mode 100644 index 0000000000000..f1ffad261694d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleaningTriggerStrategy.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
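/*
 * A stand-alone sketch of the KEEP_LATEST_BY_HOURS cutoff computed in getEarliestCommitToRetain
 * above. The fixed "yyyyMMddHHmmss" instant format is an assumption made for the sketch
 * (HoodieActiveTimeline.formatDate handles formatting in the real code); lexicographic comparison
 * of same-width timestamps stands in for HoodieTimeline.compareTimestamps.
 */
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.Optional;

final class HoursRetainedSketch {

  private static final DateTimeFormatter INSTANT_FMT = DateTimeFormatter.ofPattern("yyyyMMddHHmmss");

  /** First completed instant (list sorted ascending) that is recent enough to be retained. */
  static Optional<String> earliestInstantToRetain(List<String> completedInstants, int hoursRetained) {
    String cutoff = ZonedDateTime.now(ZoneId.systemDefault()).minusHours(hoursRetained).format(INSTANT_FMT);
    return completedInstants.stream()
        .filter(ts -> ts.compareTo(cutoff) >= 0)
        .findFirst();
  }
}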
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.clean; + +public enum CleaningTriggerStrategy { + // trigger cleaning when reach n commits + NUM_COMMITS +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java new file mode 100644 index 0000000000000..ab97204c079b1 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.cluster; + +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; +import org.apache.hudi.table.action.cluster.strategy.ClusteringPlanStrategy; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; + +public class ClusteringPlanActionExecutor extends BaseActionExecutor> { + + private static final Logger LOG = LogManager.getLogger(ClusteringPlanActionExecutor.class); + + private final Option> extraMetadata; + + public ClusteringPlanActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + Option> extraMetadata) { + super(context, config, table, instantTime); + this.extraMetadata = extraMetadata; + } + + protected Option createClusteringPlan() { + LOG.info("Checking if clustering needs to be run on " + config.getBasePath()); + Option lastClusteringInstant = table.getActiveTimeline() + .filter(s -> s.getAction().equalsIgnoreCase(HoodieTimeline.REPLACE_COMMIT_ACTION)).lastInstant(); + + int commitsSinceLastClustering = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants() + .findInstantsAfter(lastClusteringInstant.map(HoodieInstant::getTimestamp).orElse("0"), Integer.MAX_VALUE) + .countInstants(); + + if (config.inlineClusteringEnabled() && config.getInlineClusterMaxCommits() > commitsSinceLastClustering) { + LOG.info("Not scheduling inline clustering as only " + commitsSinceLastClustering + + " commits was found since last clustering " + lastClusteringInstant + ". Waiting for " + + config.getInlineClusterMaxCommits()); + return Option.empty(); + } + + if (config.isAsyncClusteringEnabled() && config.getAsyncClusterMaxCommits() > commitsSinceLastClustering) { + LOG.info("Not scheduling async clustering as only " + commitsSinceLastClustering + + " commits was found since last clustering " + lastClusteringInstant + ". 
Waiting for " + + config.getAsyncClusterMaxCommits()); + return Option.empty(); + } + + LOG.info("Generating clustering plan for table " + config.getBasePath()); + ClusteringPlanStrategy strategy = (ClusteringPlanStrategy) + ReflectionUtils.loadClass(ClusteringPlanStrategy.checkAndGetClusteringPlanStrategy(config), table, context, config); + + return strategy.generateClusteringPlan(); + } + + @Override + public Option execute() { + Option planOption = createClusteringPlan(); + if (planOption.isPresent()) { + HoodieInstant clusteringInstant = + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, instantTime); + try { + HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder() + .setOperationType(WriteOperationType.CLUSTER.name()) + .setExtraMetadata(extraMetadata.orElse(Collections.emptyMap())) + .setClusteringPlan(planOption.get()) + .build(); + table.getActiveTimeline().saveToPendingReplaceCommit(clusteringInstant, + TimelineMetadataUtils.serializeRequestedReplaceMetadata(requestedReplaceMetadata)); + } catch (IOException ioe) { + throw new HoodieIOException("Exception scheduling clustering", ioe); + } + } + return planOption; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanPartitionFilter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanPartitionFilter.java new file mode 100644 index 0000000000000..3a889de753d86 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanPartitionFilter.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.cluster; + +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; + +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Partition filter utilities. 
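/*
 * A simplified stand-alone version of the scheduling guard in createClusteringPlan above:
 * clustering is only scheduled once enough commits have completed since the last replace
 * commit. Timeline access is replaced by a plain list of instant times; the threshold
 * corresponds to the getInlineClusterMaxCommits()/getAsyncClusterMaxCommits() values used
 * in the patch.
 */
import java.util.List;
import java.util.Optional;

final class ClusteringTriggerSketch {

  static boolean shouldSchedule(List<String> completedCommitTimes,
                                Optional<String> lastClusteringInstant,
                                int maxCommitsBetweenClustering) {
    String floor = lastClusteringInstant.orElse("0");
    long commitsSinceLastClustering = completedCommitTimes.stream()
        .filter(ts -> ts.compareTo(floor) > 0)
        .count();
    return commitsSinceLastClustering >= maxCommitsBetweenClustering;
  }
}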
Currently, we support three mode: + * NONE: skip filter + * RECENT DAYS: output recent partition given skip num and days lookback config + * SELECTED_PARTITIONS: output partition falls in the [start, end] condition + */ +public class ClusteringPlanPartitionFilter { + + public static List filter(List partitions, HoodieWriteConfig config) { + ClusteringPlanPartitionFilterMode mode = config.getClusteringPlanPartitionFilterMode(); + switch (mode) { + case NONE: + return partitions; + case RECENT_DAYS: + return recentDaysFilter(partitions, config); + case SELECTED_PARTITIONS: + return selectedPartitionsFilter(partitions, config); + default: + throw new HoodieClusteringException("Unknown partition filter, filter mode: " + mode); + } + } + + private static List recentDaysFilter(List partitions, HoodieWriteConfig config) { + int targetPartitionsForClustering = config.getTargetPartitionsForClustering(); + int skipPartitionsFromLatestForClustering = config.getSkipPartitionsFromLatestForClustering(); + return partitions.stream() + .sorted(Comparator.reverseOrder()) + .skip(Math.max(skipPartitionsFromLatestForClustering, 0)) + .limit(targetPartitionsForClustering > 0 ? targetPartitionsForClustering : partitions.size()) + .collect(Collectors.toList()); + } + + private static List selectedPartitionsFilter(List partitions, HoodieWriteConfig config) { + Stream filteredPartitions = partitions.stream(); + + String beginPartition = config.getBeginPartitionForClustering(); + if (beginPartition != null) { + filteredPartitions = filteredPartitions.filter(path -> path.compareTo(beginPartition) >= 0); + } + + String endPartition = config.getEndPartitionForClustering(); + if (endPartition != null) { + filteredPartitions = filteredPartitions.filter(path -> path.compareTo(endPartition) <= 0); + } + + return filteredPartitions.collect(Collectors.toList()); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanPartitionFilterMode.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanPartitionFilterMode.java new file mode 100644 index 0000000000000..fbaf79797f00d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanPartitionFilterMode.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
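/*
 * A stand-alone illustration of the RECENT_DAYS and SELECTED_PARTITIONS modes implemented by
 * ClusteringPlanPartitionFilter above, operating on plain partition-path strings (date-style
 * paths sort lexicographically, which is what the recent-days filter relies on). Config values
 * are passed directly instead of being read from HoodieWriteConfig.
 */
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

final class PartitionFilterSketch {

  // RECENT_DAYS: newest partitions first, optionally skipping the very latest ones
  static List<String> recentDays(List<String> partitions, int skipFromLatest, int target) {
    return partitions.stream()
        .sorted(Comparator.reverseOrder())
        .skip(Math.max(skipFromLatest, 0))
        .limit(target > 0 ? target : partitions.size())
        .collect(Collectors.toList());
  }

  // SELECTED_PARTITIONS: keep paths in the inclusive [begin, end] range; null bounds are open
  static List<String> selected(List<String> partitions, String begin, String end) {
    return partitions.stream()
        .filter(p -> begin == null || p.compareTo(begin) >= 0)
        .filter(p -> end == null || p.compareTo(end) <= 0)
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<String> parts = Arrays.asList("2021/01/01", "2021/01/02", "2021/01/03");
    System.out.println(recentDays(parts, 1, 1));              // [2021/01/02]
    System.out.println(selected(parts, "2021/01/02", null));  // [2021/01/02, 2021/01/03]
  }
}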
+ */ + +package org.apache.hudi.table.action.cluster; + +/** + * Clustering partition filter mode + */ +public enum ClusteringPlanPartitionFilterMode { + NONE, + RECENT_DAYS, + SELECTED_PARTITIONS +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringExecutionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringExecutionStrategy.java new file mode 100644 index 0000000000000..163947fa34481 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringExecutionStrategy.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.cluster.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.Serializable; + +/** + * Pluggable implementation for writing data into new file groups based on ClusteringPlan. + */ +public abstract class ClusteringExecutionStrategy implements Serializable { + private static final Logger LOG = LogManager.getLogger(ClusteringExecutionStrategy.class); + + private final HoodieTable hoodieTable; + private final transient HoodieEngineContext engineContext; + private final HoodieWriteConfig writeConfig; + + public ClusteringExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + this.writeConfig = writeConfig; + this.hoodieTable = table; + this.engineContext = engineContext; + } + + /** + * Execute clustering to write inputRecords into new files as defined by rules in strategy parameters. The number of new + * file groups created is bounded by numOutputGroups. + * Note that commit is not done as part of strategy. commit is callers responsibility. 
+ */ + public abstract HoodieWriteMetadata performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime); + + protected HoodieTable getHoodieTable() { + return this.hoodieTable; + } + + protected HoodieEngineContext getEngineContext() { + return this.engineContext; + } + + protected HoodieWriteConfig getWriteConfig() { + return this.writeConfig; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java new file mode 100644 index 0000000000000..34b35d2ba946d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.cluster.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieSliceInfo; +import org.apache.hudi.client.utils.FileSliceMetricUtils; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.view.SyncableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.Serializable; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Pluggable implementation for scheduling clustering and creating ClusteringPlan. + */ +public abstract class ClusteringPlanStrategy implements Serializable { + private static final Logger LOG = LogManager.getLogger(ClusteringPlanStrategy.class); + + public static final int CLUSTERING_PLAN_VERSION_1 = 1; + + private final HoodieTable hoodieTable; + private final transient HoodieEngineContext engineContext; + private final HoodieWriteConfig writeConfig; + + /** + * Check if the given class is deprecated. + * If it is, then try to convert it to suitable one and update the write config accordingly. 
+ * @param config write config + * @return class name of clustering plan strategy + */ + public static String checkAndGetClusteringPlanStrategy(HoodieWriteConfig config) { + String className = config.getClusteringPlanStrategyClass(); + String sparkSizeBasedClassName = HoodieClusteringConfig.SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY; + String sparkSelectedPartitionsClassName = "org.apache.hudi.client.clustering.plan.strategy.SparkSelectedPartitionsClusteringPlanStrategy"; + String sparkRecentDaysClassName = "org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy"; + String javaSelectedPartitionClassName = "org.apache.hudi.client.clustering.plan.strategy.JavaRecentDaysClusteringPlanStrategy"; + String javaSizeBasedClassName = HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY; + + String logStr = "The clustering plan '%s' is deprecated. Please set the plan as '%s' and set '%s' as '%s' to achieve the same behaviour"; + if (sparkRecentDaysClassName.equals(className)) { + config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.RECENT_DAYS.name()); + LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.RECENT_DAYS.name())); + return sparkSizeBasedClassName; + } else if (sparkSelectedPartitionsClassName.equals(className)) { + config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()); + LOG.warn(String.format(logStr, className, sparkSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name())); + return sparkSizeBasedClassName; + } else if (javaSelectedPartitionClassName.equals(className)) { + config.setValue(HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME, ClusteringPlanPartitionFilterMode.RECENT_DAYS.name()); + LOG.warn(String.format(logStr, className, javaSizeBasedClassName, HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key(), ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name())); + return javaSizeBasedClassName; + } + return className; + } + + public ClusteringPlanStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + this.writeConfig = writeConfig; + this.hoodieTable = table; + this.engineContext = engineContext; + } + + /** + * Generate metadata for grouping eligible files and create a plan. Note that data is not moved around + * as part of this step. + * + * If there is no data available to cluster, return None. + */ + public abstract Option generateClusteringPlan(); + + /** + * Return file slices eligible for clustering. FileIds in pending clustering/compaction are not eligible for clustering. 
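/*
 * A stand-alone sketch of the remapping performed by checkAndGetClusteringPlanStrategy above:
 * the deprecated recent-days / selected-partitions plan strategies are rewritten to the
 * size-based strategy plus a partition filter mode. The deprecated class names are copied from
 * the patch; the replacement names are abbreviated here and the Map/SimpleEntry plumbing is
 * illustrative only.
 */
import java.util.AbstractMap.SimpleEntry;
import java.util.HashMap;
import java.util.Map;

final class DeprecatedPlanStrategySketch {

  // deprecated plan strategy class -> (replacement strategy, partition filter mode to set)
  private static final Map<String, Map.Entry<String, String>> REMAP = new HashMap<>();
  static {
    REMAP.put("org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy",
        new SimpleEntry<>("SparkSizeBasedClusteringPlanStrategy", "RECENT_DAYS"));
    REMAP.put("org.apache.hudi.client.clustering.plan.strategy.SparkSelectedPartitionsClusteringPlanStrategy",
        new SimpleEntry<>("SparkSizeBasedClusteringPlanStrategy", "SELECTED_PARTITIONS"));
    REMAP.put("org.apache.hudi.client.clustering.plan.strategy.JavaRecentDaysClusteringPlanStrategy",
        new SimpleEntry<>("JavaSizeBasedClusteringPlanStrategy", "RECENT_DAYS"));
  }

  /** Returns the effective (strategy, filter mode) pair, defaulting to what was configured. */
  static Map.Entry<String, String> resolve(String configuredStrategy, String configuredFilterMode) {
    return REMAP.getOrDefault(configuredStrategy,
        new SimpleEntry<>(configuredStrategy, configuredFilterMode));
  }
}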
+ */ + protected Stream getFileSlicesEligibleForClustering(String partition) { + SyncableFileSystemView fileSystemView = (SyncableFileSystemView) getHoodieTable().getSliceView(); + Set fgIdsInPendingCompactionAndClustering = fileSystemView.getPendingCompactionOperations() + .map(instantTimeOpPair -> instantTimeOpPair.getValue().getFileGroupId()) + .collect(Collectors.toSet()); + fgIdsInPendingCompactionAndClustering.addAll(fileSystemView.getFileGroupsInPendingClustering().map(Pair::getKey).collect(Collectors.toSet())); + + return hoodieTable.getSliceView().getLatestFileSlices(partition) + // file ids already in clustering are not eligible + .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId())); + } + + /** + * Get parameters specific to strategy. These parameters are passed from 'schedule clustering' step to + * 'execute clustering' step. 'execute clustering' step is typically async. So these params help with passing any required + * context from schedule to run step. + */ + protected abstract Map getStrategyParams(); + + /** + * Returns any specific parameters to be stored as part of clustering metadata. + */ + protected Map getExtraMetadata() { + return Collections.emptyMap(); + } + + /** + * Version to support future changes for plan. + */ + protected int getPlanVersion() { + return CLUSTERING_PLAN_VERSION_1; + } + + /** + * Transform {@link FileSlice} to {@link HoodieSliceInfo}. + */ + protected static List getFileSliceInfo(List slices) { + return slices.stream().map(slice -> new HoodieSliceInfo().newBuilder() + .setPartitionPath(slice.getPartitionPath()) + .setFileId(slice.getFileId()) + .setDataFilePath(slice.getBaseFile().map(BaseFile::getPath).orElse(StringUtils.EMPTY_STRING)) + .setDeltaFilePaths(slice.getLogFiles().map(f -> f.getPath().toString()).collect(Collectors.toList())) + .setBootstrapFilePath(slice.getBaseFile().map(bf -> bf.getBootstrapBaseFile().map(bbf -> bbf.getPath()).orElse(StringUtils.EMPTY_STRING)).orElse(StringUtils.EMPTY_STRING)) + .build()).collect(Collectors.toList()); + } + + /** + * Generate metrics for the data to be clustered. + */ + protected Map buildMetrics(List fileSlices) { + Map metrics = new HashMap<>(); + FileSliceMetricUtils.addFileSliceCommonMetrics(fileSlices, metrics, getWriteConfig().getParquetMaxFileSize()); + return metrics; + } + + protected HoodieTable getHoodieTable() { + return this.hoodieTable; + } + + protected HoodieEngineContext getEngineContext() { + return this.engineContext; + } + + protected HoodieWriteConfig getWriteConfig() { + return this.writeConfig; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/PartitionAwareClusteringPlanStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/PartitionAwareClusteringPlanStrategy.java new file mode 100644 index 0000000000000..5d62ef390233f --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/PartitionAwareClusteringPlanStrategy.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
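/*
 * A simplified stand-alone version of getFileSlicesEligibleForClustering above. File groups are
 * reduced to String ids; in the real code they are HoodieFileGroupId instances and the pending
 * compaction/clustering sets come from the file-system view.
 */
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

final class EligibleSlicesSketch {

  static List<String> eligibleFileGroupIds(List<String> latestFileGroupIds,
                                           Set<String> pendingCompaction,
                                           Set<String> pendingClustering) {
    Set<String> excluded = new HashSet<>(pendingCompaction);
    excluded.addAll(pendingClustering); // file groups already owned by a pending service are skipped
    return latestFileGroupIds.stream()
        .filter(id -> !excluded.contains(id))
        .collect(Collectors.toList());
  }
}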
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.cluster.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieClusteringStrategy; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilter; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Scheduling strategy with restriction that clustering groups can only contain files from same partition. + */ +public abstract class PartitionAwareClusteringPlanStrategy extends ClusteringPlanStrategy { + private static final Logger LOG = LogManager.getLogger(PartitionAwareClusteringPlanStrategy.class); + + public PartitionAwareClusteringPlanStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + /** + * Create Clustering group based on files eligible for clustering in the partition. + */ + protected abstract Stream buildClusteringGroupsForPartition(String partitionPath, + List fileSlices); + + /** + * Return list of partition paths to be considered for clustering. 
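/*
 * buildClusteringGroupsForPartition is abstract here; the concrete size-based strategies
 * (not part of this hunk) bin-pack small file slices into groups bounded by a byte budget.
 * The sketch below is a generic illustration of that idea under that assumption, not the
 * PR's actual implementation; the greedy packing and the use of raw sizes are simplifications.
 */
import java.util.ArrayList;
import java.util.List;

final class ClusteringGroupingSketch {

  /** Greedily packs slice sizes (bytes) into groups no larger than maxGroupBytes. */
  static List<List<Long>> packBySize(List<Long> sliceSizes, long maxGroupBytes) {
    List<List<Long>> groups = new ArrayList<>();
    List<Long> current = new ArrayList<>();
    long currentBytes = 0;
    for (long size : sliceSizes) {
      if (!current.isEmpty() && currentBytes + size > maxGroupBytes) {
        groups.add(current);
        current = new ArrayList<>();
        currentBytes = 0;
      }
      current.add(size);
      currentBytes += size;
    }
    if (!current.isEmpty()) {
      groups.add(current);
    }
    return groups;
  }
}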
+ */ + protected List filterPartitionPaths(List partitionPaths) { + List filteredPartitions = ClusteringPlanPartitionFilter.filter(partitionPaths, getWriteConfig()); + LOG.debug("Filtered to the following partitions: " + filteredPartitions); + return filteredPartitions; + } + + @Override + public Option generateClusteringPlan() { + HoodieTableMetaClient metaClient = getHoodieTable().getMetaClient(); + LOG.info("Scheduling clustering for " + metaClient.getBasePath()); + HoodieWriteConfig config = getWriteConfig(); + List partitionPaths = FSUtils.getAllPartitionPaths(getEngineContext(), config.getMetadataConfig(), metaClient.getBasePath()); + + // get matched partitions if set + partitionPaths = getMatchedPartitions(config, partitionPaths); + // filter the partition paths if needed to reduce list status + partitionPaths = filterPartitionPaths(partitionPaths); + + if (partitionPaths.isEmpty()) { + // In case no partitions could be picked, return no clustering plan + return Option.empty(); + } + + List clusteringGroups = getEngineContext() + .flatMap( + partitionPaths, + partitionPath -> { + List fileSlicesEligible = getFileSlicesEligibleForClustering(partitionPath).collect(Collectors.toList()); + return buildClusteringGroupsForPartition(partitionPath, fileSlicesEligible).limit(getWriteConfig().getClusteringMaxNumGroups()); + }, + partitionPaths.size()) + .stream() + .limit(getWriteConfig().getClusteringMaxNumGroups()) + .collect(Collectors.toList()); + + if (clusteringGroups.isEmpty()) { + LOG.info("No data available to cluster"); + return Option.empty(); + } + + HoodieClusteringStrategy strategy = HoodieClusteringStrategy.newBuilder() + .setStrategyClassName(getWriteConfig().getClusteringExecutionStrategyClass()) + .setStrategyParams(getStrategyParams()) + .build(); + + return Option.of(HoodieClusteringPlan.newBuilder() + .setStrategy(strategy) + .setInputGroups(clusteringGroups) + .setExtraMetadata(getExtraMetadata()) + .setVersion(getPlanVersion()) + .setPreserveHoodieMetadata(getWriteConfig().isPreserveHoodieCommitMetadataForClustering()) + .build()); + } + + public List getMatchedPartitions(HoodieWriteConfig config, List partitionPaths) { + String partitionSelected = config.getClusteringPartitionSelected(); + if (!StringUtils.isNullOrEmpty(partitionSelected)) { + return Arrays.asList(partitionSelected.split(",")); + } else { + return getRegexPatternMatchedPartitions(config, partitionPaths); + } + } + + public List getRegexPatternMatchedPartitions(HoodieWriteConfig config, List partitionPaths) { + String pattern = config.getClusteringPartitionFilterRegexPattern(); + if (!StringUtils.isNullOrEmpty(pattern)) { + partitionPaths = partitionPaths.stream() + .filter(partition -> Pattern.matches(pattern, partition)) + .collect(Collectors.toList()); + } + return partitionPaths; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/UpdateStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/UpdateStrategy.java new file mode 100644 index 0000000000000..4e33eb06038cd --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/UpdateStrategy.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
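/*
 * A stand-alone version of getMatchedPartitions / getRegexPatternMatchedPartitions above: an
 * explicit comma-separated list of selected partitions wins, otherwise an optional regex is
 * applied, otherwise all partitions pass through. Config access is replaced by plain String
 * parameters.
 */
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

final class PartitionSelectionSketch {

  static List<String> matchedPartitions(List<String> allPartitions, String selectedCsv, String regex) {
    if (selectedCsv != null && !selectedCsv.isEmpty()) {
      return Arrays.asList(selectedCsv.split(","));
    }
    if (regex == null || regex.isEmpty()) {
      return allPartitions;
    }
    Pattern pattern = Pattern.compile(regex);
    return allPartitions.stream()
        .filter(partition -> pattern.matcher(partition).matches())
        .collect(Collectors.toList());
  }
}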
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.cluster.strategy; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.Set; + +/** + * When file groups in clustering, write records to these file group need to check. + */ +public abstract class UpdateStrategy { + + protected final HoodieEngineContext engineContext; + protected Set fileGroupsInPendingClustering; + + protected UpdateStrategy(HoodieEngineContext engineContext, Set fileGroupsInPendingClustering) { + this.engineContext = engineContext; + this.fileGroupsInPendingClustering = fileGroupsInPendingClustering; + } + + /** + * Check the update records to the file group in clustering. + * @param taggedRecordsRDD the records to write, tagged with target file id, + * future can update tagged records location to a different fileId. + * @return the recordsRDD strategy updated and a set of file groups to be updated while pending clustering. + */ + public abstract Pair> handleUpdate(I taggedRecordsRDD); + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractBulkInsertHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractBulkInsertHelper.java deleted file mode 100644 index 3ead7a07d9698..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractBulkInsertHelper.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
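/*
 * UpdateStrategy above lets each engine decide what happens when an upsert targets a file group
 * that is already part of a pending clustering plan. The sketch below shows the simplest
 * possible policy, rejecting such writes outright; it is illustrative only and not the strategy
 * shipped with this change.
 */
import java.util.List;
import java.util.Set;

final class RejectConflictingUpdateSketch {

  /** Throws if any tagged record targets a file group that is pending clustering. */
  static void validateNoConflicts(List<String> taggedFileGroupIds, Set<String> pendingClusteringGroups) {
    for (String fileGroupId : taggedFileGroupIds) {
      if (pendingClusteringGroups.contains(fileGroupId)) {
        throw new IllegalStateException("File group " + fileGroupId
            + " has a pending clustering plan; retry the write after clustering completes");
      }
    }
  }
}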
- */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.BulkInsertPartitioner; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.HoodieWriteMetadata; - -public abstract class AbstractBulkInsertHelper { - - public abstract HoodieWriteMetadata bulkInsert(I inputRecords, String instantTime, - HoodieTable table, HoodieWriteConfig config, - BaseCommitActionExecutor executor, boolean performDedupe, - Option> userDefinedBulkInsertPartitioner); -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractDeleteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractDeleteHelper.java deleted file mode 100644 index c2d2df7ebac99..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractDeleteHelper.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.HoodieWriteMetadata; - -/** - * Helper class to perform delete keys on hoodie table. - * - * @param - */ -public abstract class AbstractDeleteHelper { - - /** - * Deduplicate Hoodie records, using the given deduplication function. - * - * @param keys HoodieKeys to deduplicate - * @param table target Hoodie table for deduplicating - * @param parallelism parallelism or partitions to be used while reducing/deduplicating - * @return HoodieKey already be deduplicated - */ - public abstract K deduplicateKeys(K keys, HoodieTable table, int parallelism); - - public abstract HoodieWriteMetadata execute(String instantTime, - K keys, HoodieEngineContext context, - HoodieWriteConfig config, HoodieTable table, - BaseCommitActionExecutor deleteExecutor); -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractMergeHelper.java deleted file mode 100644 index e318fe304c67b..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractMergeHelper.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.client.utils.MergingIterator; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.io.HoodieMergeHandle; -import org.apache.hudi.io.storage.HoodieFileReader; -import org.apache.hudi.io.storage.HoodieFileReaderFactory; -import org.apache.hudi.table.HoodieTable; - -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.io.BinaryDecoder; -import org.apache.avro.io.BinaryEncoder; -import org.apache.avro.io.DecoderFactory; -import org.apache.avro.io.EncoderFactory; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.Iterator; - -/** - * Helper to read records from previous version of base file and run Merge. - */ -public abstract class AbstractMergeHelper { - - /** - * Read records from previous version of base file and merge. - * @param table Hoodie Table - * @param upsertHandle Merge Handle - * @throws IOException in case of error - */ - public abstract void runMerge(HoodieTable table, HoodieMergeHandle upsertHandle) throws IOException; - - protected GenericRecord transformRecordBasedOnNewSchema(GenericDatumReader gReader, GenericDatumWriter gWriter, - ThreadLocal encoderCache, ThreadLocal decoderCache, - GenericRecord gRec) { - ByteArrayOutputStream inStream = null; - try { - inStream = new ByteArrayOutputStream(); - BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(inStream, encoderCache.get()); - encoderCache.set(encoder); - gWriter.write(gRec, encoder); - encoder.flush(); - - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inStream.toByteArray(), decoderCache.get()); - decoderCache.set(decoder); - GenericRecord transformedRec = gReader.read(null, decoder); - return transformedRec; - } catch (IOException e) { - throw new HoodieException(e); - } finally { - try { - inStream.close(); - } catch (IOException ioe) { - throw new HoodieException(ioe.getMessage(), ioe); - } - } - } - - /** - * Create Parquet record iterator that provides a stitched view of record read from skeleton and bootstrap file. - * Skeleton file is a representation of the bootstrap file inside the table, with just the bare bone fields needed - * for indexing, writing and other functionality. 
- * - */ - protected Iterator getMergingIterator(HoodieTable table, HoodieMergeHandle mergeHandle, - HoodieBaseFile baseFile, HoodieFileReader reader, - Schema readSchema, boolean externalSchemaTransformation) throws IOException { - Path externalFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath()); - Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf()); - HoodieFileReader bootstrapReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, externalFilePath); - Schema bootstrapReadSchema; - if (externalSchemaTransformation) { - bootstrapReadSchema = bootstrapReader.getSchema(); - } else { - bootstrapReadSchema = mergeHandle.getWriterSchema(); - } - - return new MergingIterator<>(reader.getRecordIterator(readSchema), bootstrapReader.getRecordIterator(bootstrapReadSchema), - (inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetafields())); - } - - /** - * Consumer that dequeues records from queue and sends to Merge Handle. - */ - protected static class UpdateHandler extends BoundedInMemoryQueueConsumer { - - private final HoodieMergeHandle upsertHandle; - - protected UpdateHandler(HoodieMergeHandle upsertHandle) { - this.upsertHandle = upsertHandle; - } - - @Override - protected void consumeOneRecord(GenericRecord record) { - upsertHandle.write(record); - } - - @Override - protected void finish() {} - - @Override - protected Void getResult() { - return null; - } - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java deleted file mode 100644 index caa6ecdb953a7..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.exception.HoodieUpsertException; -import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.table.HoodieTable; - -import org.apache.hudi.table.action.HoodieWriteMetadata; - -import java.time.Duration; -import java.time.Instant; - -public abstract class AbstractWriteHelper { - - public HoodieWriteMetadata write(String instantTime, - I inputRecords, - HoodieEngineContext context, - HoodieTable table, - boolean shouldCombine, - int shuffleParallelism, - BaseCommitActionExecutor executor, - boolean performTagging) { - try { - // De-dupe/merge if needed - I dedupedRecords = - combineOnCondition(shouldCombine, inputRecords, shuffleParallelism, table); - - Instant lookupBegin = Instant.now(); - I taggedRecords = dedupedRecords; - if (performTagging) { - // perform index loop up to get existing location of records - taggedRecords = tag(dedupedRecords, context, table); - } - Duration indexLookupDuration = Duration.between(lookupBegin, Instant.now()); - - HoodieWriteMetadata result = executor.execute(taggedRecords); - result.setIndexLookupDuration(indexLookupDuration); - return result; - } catch (Throwable e) { - if (e instanceof HoodieUpsertException) { - throw (HoodieUpsertException) e; - } - throw new HoodieUpsertException("Failed to upsert for commit time " + instantTime, e); - } - } - - private I tag( - I dedupedRecords, HoodieEngineContext context, HoodieTable table) { - // perform index loop up to get existing location of records - return table.getIndex().tagLocation(dedupedRecords, context, table); - } - - public I combineOnCondition( - boolean condition, I records, int parallelism, HoodieTable table) { - return condition ? deduplicateRecords(records, table, parallelism) : records; - } - - /** - * Deduplicate Hoodie records, using the given deduplication function. - * - * @param records hoodieRecords to deduplicate - * @param parallelism parallelism or partitions to be used while reducing/deduplicating - * @return Collection of HoodieRecord already be deduplicated - */ - public I deduplicateRecords( - I records, HoodieTable table, int parallelism) { - return deduplicateRecords(records, table.getIndex(), parallelism); - } - - public abstract I deduplicateRecords( - I records, HoodieIndex index, int parallelism); -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java new file mode 100644 index 0000000000000..5355194ff75bf --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.WriteHandleFactory; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +public abstract class BaseBulkInsertHelper { + + /** + * Mark instant as inflight, write input records, update index and return result. + */ + public abstract HoodieWriteMetadata bulkInsert(I inputRecords, String instantTime, + HoodieTable table, HoodieWriteConfig config, + BaseCommitActionExecutor executor, boolean performDedupe, + Option userDefinedBulkInsertPartitioner); + + /** + * Only write input records. Does not change timeline/index. Return information about new files created. + */ + public abstract O bulkInsert(I inputRecords, String instantTime, + HoodieTable table, HoodieWriteConfig config, + boolean performDedupe, + BulkInsertPartitioner partitioner, + boolean addMetadataFields, + int parallelism, + WriteHandleFactory writeHandleFactory); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java index 71de9b6fc6f73..fa70004af4c95 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java @@ -18,10 +18,18 @@ package org.apache.hudi.table.action.commit; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.client.transaction.TransactionManager; +import org.apache.hudi.client.utils.TransactionUtils; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat; @@ -29,8 +37,15 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieInstant.State; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; +import 
org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.HoodieTable; @@ -38,7 +53,9 @@ import org.apache.hudi.table.WorkloadStat; import org.apache.hudi.table.action.BaseActionExecutor; import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; +import org.apache.avro.Schema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -49,6 +66,9 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; public abstract class BaseCommitActionExecutor extends BaseActionExecutor { @@ -58,6 +78,9 @@ public abstract class BaseCommitActionExecutor> extraMetadata; protected final WriteOperationType operationType; protected final TaskContextSupplier taskContextSupplier; + protected final TransactionManager txnManager; + protected Option>> lastCompletedTxn; + protected Set pendingInflightAndRequestedInstants; public BaseCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType, @@ -66,6 +89,16 @@ public BaseCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig c this.operationType = operationType; this.extraMetadata = extraMetadata; this.taskContextSupplier = context.getTaskContextSupplier(); + // TODO : Remove this once we refactor and move out autoCommit method from here, since the TxnManager is held in {@link BaseHoodieWriteClient}. + this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); + this.lastCompletedTxn = txnManager.isOptimisticConcurrencyControlEnabled() + ? TransactionUtils.getLastCompletedTxnInstantAndMetadata(table.getMetaClient()) : Option.empty(); + this.pendingInflightAndRequestedInstants = TransactionUtils.getInflightAndRequestedInstants(table.getMetaClient()); + this.pendingInflightAndRequestedInstants.remove(instantTime); + if (!table.getStorageLayout().writeOperationSupported(operationType)) { + throw new UnsupportedOperationException("Executor " + this.getClass().getSimpleName() + + " is not compatible with table layout " + table.getStorageLayout().getClass().getSimpleName()); + } } public abstract HoodieWriteMetadata execute(I inputRecords); @@ -80,22 +113,32 @@ void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, String insta throws HoodieCommitException { try { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); - profile.getPartitionPaths().forEach(path -> { - WorkloadStat partitionStat = profile.getWorkloadStat(path.toString()); + profile.getOutputPartitionPaths().forEach(path -> { + WorkloadStat partitionStat = profile.getOutputWorkloadStat(path); HoodieWriteStat insertStat = new HoodieWriteStat(); insertStat.setNumInserts(partitionStat.getNumInserts()); insertStat.setFileId(""); insertStat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); metadata.addWriteStat(path, insertStat); - - partitionStat.getUpdateLocationToCount().forEach((key, value) -> { - HoodieWriteStat writeStat = new HoodieWriteStat(); - writeStat.setFileId(key); - // TODO : Write baseCommitTime is possible here ? 
- writeStat.setPrevCommit(value.getKey()); - writeStat.setNumUpdateWrites(value.getValue()); - metadata.addWriteStat(path, writeStat); - }); + Map> updateLocationMap = partitionStat.getUpdateLocationToCount(); + Map> insertLocationMap = partitionStat.getInsertLocationToCount(); + Stream.concat(updateLocationMap.keySet().stream(), insertLocationMap.keySet().stream()) + .distinct() + .forEach(fileId -> { + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId(fileId); + Pair updateLocation = updateLocationMap.get(fileId); + Pair insertLocation = insertLocationMap.get(fileId); + // TODO : Write baseCommitTime is possible here ? + writeStat.setPrevCommit(updateLocation != null ? updateLocation.getKey() : insertLocation.getKey()); + if (updateLocation != null) { + writeStat.setNumUpdateWrites(updateLocation.getValue()); + } + if (insertLocation != null) { + writeStat.setNumInserts(insertLocation.getValue()); + } + metadata.addWriteStat(path, writeStat); + }); }); metadata.setOperationType(operationType); @@ -114,15 +157,46 @@ protected String getCommitActionType() { return table.getMetaClient().getCommitActionType(); } + + /** + * Check if any validators are configured and run those validations. If any of the validations fail, throws HoodieValidationException. + */ + protected void runPrecommitValidators(HoodieWriteMetadata writeMetadata) { + if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) { + return; + } + throw new HoodieIOException("Precommit validation not implemented for all engines yet"); + } + protected void commitOnAutoCommit(HoodieWriteMetadata result) { + // validate commit action before committing result + runPrecommitValidators(result); if (config.shouldAutoCommit()) { LOG.info("Auto commit enabled: Committing " + instantTime); - commit(extraMetadata, result); + autoCommit(extraMetadata, result); } else { LOG.info("Auto commit disabled for " + instantTime); } } + protected void autoCommit(Option> extraMetadata, HoodieWriteMetadata result) { + final Option inflightInstant = Option.of(new HoodieInstant(State.INFLIGHT, + getCommitActionType(), instantTime)); + this.txnManager.beginTransaction(inflightInstant, + lastCompletedTxn.isPresent() ? Option.of(lastCompletedTxn.get().getLeft()) : Option.empty()); + try { + setCommitMetadata(result); + // reload active timeline so as to get all updates after current transaction have started. hence setting last arg to true. + TransactionUtils.resolveWriteConflictIfAny(table, this.txnManager.getCurrentTransactionOwner(), + result.getCommitMetadata(), config, this.txnManager.getLastCompletedTransactionOwner(), true, pendingInflightAndRequestedInstants); + commit(extraMetadata, result); + } finally { + this.txnManager.endTransaction(inflightInstant); + } + } + + protected abstract void setCommitMetadata(HoodieWriteMetadata result); + protected abstract void commit(Option> extraMetadata, HoodieWriteMetadata result); /** @@ -156,4 +230,67 @@ protected abstract Iterator> handleInsert(String idPfx, protected abstract Iterator> handleUpdate(String partitionPath, String fileId, Iterator> recordItr) throws IOException; + + protected HoodieWriteMetadata> executeClustering(HoodieClusteringPlan clusteringPlan) { + HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime); + // Mark instant as clustering inflight + table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty()); + table.getMetaClient().reloadActiveTimeline(); + + // Disable auto commit. 
Strategy is only expected to write data in new files. + config.setValue(HoodieWriteConfig.AUTO_COMMIT_ENABLE, Boolean.FALSE.toString()); + + final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); + HoodieWriteMetadata> writeMetadata = ( + (ClusteringExecutionStrategy>, HoodieData, HoodieData>) + ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(), + new Class[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config)) + .performClustering(clusteringPlan, schema, instantTime); + HoodieData writeStatusList = writeMetadata.getWriteStatuses(); + HoodieData statuses = updateIndex(writeStatusList, writeMetadata); + writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collectAsList()); + writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata)); + commitOnAutoCommit(writeMetadata); + if (!writeMetadata.getCommitMetadata().isPresent()) { + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(), + extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + writeMetadata.setCommitMetadata(Option.of(commitMetadata)); + } + return writeMetadata; + } + + private HoodieData updateIndex(HoodieData writeStatuses, HoodieWriteMetadata> result) { + Instant indexStartTime = Instant.now(); + // Update the index back + HoodieData statuses = table.getIndex().updateLocation(writeStatuses, context, table); + result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); + result.setWriteStatuses(statuses); + return statuses; + } + + private Map> getPartitionToReplacedFileIds(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata> writeMetadata) { + Set newFilesWritten = writeMetadata.getWriteStats().get().stream() + .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet()); + + return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan) + .filter(fg -> "org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy" + .equals(config.getClusteringExecutionStrategyClass()) + || !newFilesWritten.contains(fg)) + .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList()))); + } + + /** + * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written. + * But we can extend this to add more validation. E.g. number of records read = number of records written etc. + * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions. 
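+   * Note: the check below treats an empty set of write statuses as a failed clustering execution and fails fast with a HoodieClusteringException.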
+ */ + private void validateWriteResult(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata> writeMetadata) { + if (writeMetadata.getWriteStatuses().isEmpty()) { + throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime + + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least " + + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum() + + " write statuses"); + } + } + } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseDeleteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseDeleteHelper.java new file mode 100644 index 0000000000000..b119587f47535 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseDeleteHelper.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +/** + * Helper class to perform delete keys on hoodie table. + * + * @param + */ +public abstract class BaseDeleteHelper { + + /** + * Deduplicate Hoodie records, using the given deduplication function. + * + * @param keys HoodieKeys to deduplicate + * @param table target Hoodie table for deduplicating + * @param parallelism parallelism or partitions to be used while reducing/deduplicating + * @return HoodieKey already be deduplicated + */ + public abstract K deduplicateKeys(K keys, HoodieTable table, int parallelism); + + public abstract HoodieWriteMetadata execute(String instantTime, + K keys, HoodieEngineContext context, + HoodieWriteConfig config, HoodieTable table, + BaseCommitActionExecutor deleteExecutor); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java new file mode 100644 index 0000000000000..8c34e3c3a74ca --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.HoodieMergeHandle; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.io.BinaryDecoder; +import org.apache.avro.io.BinaryEncoder; +import org.apache.avro.io.DecoderFactory; +import org.apache.avro.io.EncoderFactory; +import org.apache.avro.generic.GenericRecord; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +/** + * Helper to read records from previous version of base file and run Merge. + */ +public abstract class BaseMergeHelper { + + /** + * Read records from previous version of base file and merge. + * @param table Hoodie Table + * @param upsertHandle Merge Handle + * @throws IOException in case of error + */ + public abstract void runMerge(HoodieTable table, HoodieMergeHandle upsertHandle) throws IOException; + + protected GenericRecord transformRecordBasedOnNewSchema(GenericDatumReader gReader, GenericDatumWriter gWriter, + ThreadLocal encoderCache, ThreadLocal decoderCache, + GenericRecord gRec) { + ByteArrayOutputStream inStream = null; + try { + inStream = new ByteArrayOutputStream(); + BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(inStream, encoderCache.get()); + encoderCache.set(encoder); + gWriter.write(gRec, encoder); + encoder.flush(); + + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inStream.toByteArray(), decoderCache.get()); + decoderCache.set(decoder); + GenericRecord transformedRec = gReader.read(null, decoder); + return transformedRec; + } catch (IOException e) { + throw new HoodieException(e); + } finally { + try { + inStream.close(); + } catch (IOException ioe) { + throw new HoodieException(ioe.getMessage(), ioe); + } + } + } + + /** + * Consumer that dequeues records from queue and sends to Merge Handle. 
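+   * Each record dequeued from the bounded in-memory queue is handed directly to {@link HoodieMergeHandle#write}.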
+ */ + protected static class UpdateHandler extends BoundedInMemoryQueueConsumer { + + private final HoodieMergeHandle upsertHandle; + + protected UpdateHandler(HoodieMergeHandle upsertHandle) { + this.upsertHandle = upsertHandle; + } + + @Override + protected void consumeOneRecord(GenericRecord record) { + upsertHandle.write(record); + } + + @Override + protected void finish() {} + + @Override + protected Void getResult() { + return null; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java new file mode 100644 index 0000000000000..846afec7c1db3 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.time.Duration; +import java.time.Instant; + +public abstract class BaseWriteHelper { + + public HoodieWriteMetadata write(String instantTime, + I inputRecords, + HoodieEngineContext context, + HoodieTable table, + boolean shouldCombine, + int shuffleParallelism, + BaseCommitActionExecutor executor, + WriteOperationType operationType) { + try { + // De-dupe/merge if needed + I dedupedRecords = + combineOnCondition(shouldCombine, inputRecords, shuffleParallelism, table); + + Instant lookupBegin = Instant.now(); + I taggedRecords = dedupedRecords; + if (table.getIndex().requiresTagging(operationType)) { + // perform index loop up to get existing location of records + context.setJobStatus(this.getClass().getSimpleName(), "Tagging: " + table.getConfig().getTableName()); + taggedRecords = tag(dedupedRecords, context, table); + } + Duration indexLookupDuration = Duration.between(lookupBegin, Instant.now()); + + HoodieWriteMetadata result = executor.execute(taggedRecords); + result.setIndexLookupDuration(indexLookupDuration); + return result; + } catch (Throwable e) { + if (e instanceof HoodieUpsertException) { + throw (HoodieUpsertException) e; + } + throw new HoodieUpsertException("Failed to upsert for commit time " + instantTime, e); + } + } + + protected abstract I tag( + I dedupedRecords, HoodieEngineContext context, HoodieTable table); + + public I combineOnCondition( + boolean condition, I records, int 
parallelism, HoodieTable table) { + return condition ? deduplicateRecords(records, table, parallelism) : records; + } + + /** + * Deduplicate Hoodie records, using the given deduplication function. + * + * @param records hoodieRecords to deduplicate + * @param parallelism parallelism or partitions to be used while reducing/deduplicating + * @return Collection of HoodieRecord already be deduplicated + */ + public I deduplicateRecords( + I records, HoodieTable table, int parallelism) { + return deduplicateRecords(records, table.getIndex(), parallelism); + } + + public abstract I deduplicateRecords( + I records, HoodieIndex index, int parallelism); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BucketInfo.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BucketInfo.java index 1d98ad49e77fb..6547da6425460 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BucketInfo.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BucketInfo.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.commit; import java.io.Serializable; +import java.util.Objects; /** * Helper class for a bucket's type (INSERT and UPDATE) and its file location. @@ -29,6 +30,24 @@ public class BucketInfo implements Serializable { String fileIdPrefix; String partitionPath; + public BucketInfo(BucketType bucketType, String fileIdPrefix, String partitionPath) { + this.bucketType = bucketType; + this.fileIdPrefix = fileIdPrefix; + this.partitionPath = partitionPath; + } + + public BucketType getBucketType() { + return bucketType; + } + + public String getFileIdPrefix() { + return fileIdPrefix; + } + + public String getPartitionPath() { + return partitionPath; + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("BucketInfo {"); @@ -38,4 +57,23 @@ public String toString() { sb.append('}'); return sb.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + BucketInfo that = (BucketInfo) o; + return bucketType == that.bucketType + && fileIdPrefix.equals(that.fileIdPrefix) + && partitionPath.equals(that.partitionPath); + } + + @Override + public int hashCode() { + return Objects.hash(bucketType, fileIdPrefix, partitionPath); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieDeleteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieDeleteHelper.java new file mode 100644 index 0000000000000..fff52eb24d736 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieDeleteHelper.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.WorkloadStat; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.time.Duration; +import java.time.Instant; +import java.util.HashMap; + +/** + * A spark implementation of {@link BaseDeleteHelper}. + * + * @param + */ +@SuppressWarnings("checkstyle:LineLength") +public class HoodieDeleteHelper extends + BaseDeleteHelper>, HoodieData, HoodieData, R> { + private HoodieDeleteHelper() { + } + + private static class DeleteHelperHolder { + private static final HoodieDeleteHelper HOODIE_DELETE_HELPER = new HoodieDeleteHelper<>(); + } + + public static HoodieDeleteHelper newInstance() { + return DeleteHelperHolder.HOODIE_DELETE_HELPER; + } + + @Override + public HoodieData deduplicateKeys(HoodieData keys, HoodieTable>, HoodieData, HoodieData> table, int parallelism) { + boolean isIndexingGlobal = table.getIndex().isGlobal(); + if (isIndexingGlobal) { + return keys.distinctWithKey(HoodieKey::getRecordKey, parallelism); + } else { + return keys.distinct(parallelism); + } + } + + @Override + public HoodieWriteMetadata> execute(String instantTime, + HoodieData keys, + HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable>, HoodieData, HoodieData> table, + BaseCommitActionExecutor>, HoodieData, HoodieData, R> deleteExecutor) { + try { + HoodieData dedupedKeys = keys; + final int parallelism = config.getDeleteShuffleParallelism(); + if (config.shouldCombineBeforeDelete()) { + // De-dupe/merge if needed + dedupedKeys = deduplicateKeys(keys, table, parallelism); + } else if (!keys.isEmpty()) { + dedupedKeys = keys.repartition(parallelism); + } + + HoodieData> dedupedRecords = + dedupedKeys.map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload())); + Instant beginTag = Instant.now(); + // perform index loop up to get existing location of records + HoodieData> taggedRecords = table.getIndex().tagLocation(dedupedRecords, context, table); + Duration tagLocationDuration = Duration.between(beginTag, Instant.now()); + + // filter out non existent keys/records + HoodieData> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown); + HoodieWriteMetadata> result; + if (!taggedValidRecords.isEmpty()) { + result = deleteExecutor.execute(taggedValidRecords); + result.setIndexLookupDuration(tagLocationDuration); + } else { + // if entire set of keys are non existent + 
deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime); + result = new HoodieWriteMetadata<>(); + result.setWriteStatuses(context.emptyHoodieData()); + deleteExecutor.commitOnAutoCommit(result); + } + return result; + } catch (Throwable e) { + if (e instanceof HoodieUpsertException) { + throw (HoodieUpsertException) e; + } + throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e); + } + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java new file mode 100644 index 0000000000000..21ebf5fe28249 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.avro.SchemaCompatibility; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.utils.MergingIterator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.InternalSchemaCache; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.action.InternalSchemaMerger; +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; +import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils; +import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; +import org.apache.hudi.io.HoodieMergeHandle; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryDecoder; +import org.apache.avro.io.BinaryEncoder; +import org.apache.hadoop.conf.Configuration; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import 
java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +public class HoodieMergeHelper extends + BaseMergeHelper>, HoodieData, HoodieData> { + + private HoodieMergeHelper() { + } + + private static class MergeHelperHolder { + private static final HoodieMergeHelper HOODIE_MERGE_HELPER = new HoodieMergeHelper<>(); + } + + public static HoodieMergeHelper newInstance() { + return MergeHelperHolder.HOODIE_MERGE_HELPER; + } + + @Override + public void runMerge(HoodieTable>, HoodieData, HoodieData> table, + HoodieMergeHandle>, HoodieData, HoodieData> mergeHandle) throws IOException { + final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation(); + HoodieBaseFile baseFile = mergeHandle.baseFileForMerge(); + + Configuration hadoopConf = new Configuration(table.getHadoopConf()); + HoodieFileReader baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf, mergeHandle.getOldFilePath()); + HoodieFileReader bootstrapFileReader = null; + + final GenericDatumWriter gWriter; + final GenericDatumReader gReader; + Schema readSchema; + if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) { + readSchema = baseFileReader.getSchema(); + gWriter = new GenericDatumWriter<>(readSchema); + gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields()); + } else { + gReader = null; + gWriter = null; + readSchema = mergeHandle.getWriterSchemaWithMetaFields(); + } + + BoundedInMemoryExecutor wrapper = null; + + Option querySchemaOpt = SerDeHelper.fromJson(table.getConfig().getInternalSchema()); + boolean needToReWriteRecord = false; + Map renameCols = new HashMap<>(); + // TODO support bootstrap + if (querySchemaOpt.isPresent() && !baseFile.getBootstrapBaseFile().isPresent()) { + // check implicitly add columns, and position reorder(spark sql may change cols order) + InternalSchema querySchema = AvroSchemaEvolutionUtils.reconcileSchema(readSchema, querySchemaOpt.get()); + long commitInstantTime = Long.valueOf(FSUtils.getCommitTime(mergeHandle.getOldFilePath().getName())); + InternalSchema writeInternalSchema = InternalSchemaCache.searchSchemaAndCache(commitInstantTime, table.getMetaClient(), table.getConfig().getInternalSchemaCacheEnable()); + if (writeInternalSchema.isEmptySchema()) { + throw new HoodieException(String.format("cannot find file schema for current commit %s", commitInstantTime)); + } + List colNamesFromQuerySchema = querySchema.getAllColsFullName(); + List colNamesFromWriteSchema = writeInternalSchema.getAllColsFullName(); + List sameCols = colNamesFromWriteSchema.stream() + .filter(f -> colNamesFromQuerySchema.contains(f) + && writeInternalSchema.findIdByName(f) == querySchema.findIdByName(f) + && writeInternalSchema.findIdByName(f) != -1 + && writeInternalSchema.findType(writeInternalSchema.findIdByName(f)).equals(querySchema.findType(writeInternalSchema.findIdByName(f)))).collect(Collectors.toList()); + readSchema = AvroInternalSchemaConverter + .convert(new InternalSchemaMerger(writeInternalSchema, querySchema, true, false, false).mergeSchema(), readSchema.getName()); + Schema writeSchemaFromFile = AvroInternalSchemaConverter.convert(writeInternalSchema, readSchema.getName()); + needToReWriteRecord = sameCols.size() != colNamesFromWriteSchema.size() + || SchemaCompatibility.checkReaderWriterCompatibility(readSchema, writeSchemaFromFile).getType() == org.apache.avro.SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE; + if (needToReWriteRecord) { + renameCols = 
InternalSchemaUtils.collectRenameCols(writeInternalSchema, querySchema); + } + } + + try { + final Iterator readerIterator; + if (baseFile.getBootstrapBaseFile().isPresent()) { + Path bootstrapFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath()); + Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf()); + bootstrapFileReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, bootstrapFilePath); + // NOTE: It's important for us to rely on writer's schema here + // - When records will be read by Parquet reader, if schema will be decoded from the + // file itself by taking its Parquet one and converting it to Avro. This will be problematic + // w/ schema validations of the records since Avro's schemas also validate corresponding + // qualified names of the structs, which could not be reconstructed when converting from + // Parquet to Avro (b/c Parquet doesn't bear these) + Schema bootstrapSchema = externalSchemaTransformation ? bootstrapFileReader.getSchema() : mergeHandle.getWriterSchema(); + readerIterator = new MergingIterator<>( + baseFileReader.getRecordIterator(readSchema), + bootstrapFileReader.getRecordIterator(bootstrapSchema), + (inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetaFields())); + } else { + if (needToReWriteRecord) { + readerIterator = HoodieAvroUtils.rewriteRecordWithNewSchema(baseFileReader.getRecordIterator(), readSchema, renameCols); + } else { + readerIterator = baseFileReader.getRecordIterator(readSchema); + } + } + + ThreadLocal encoderCache = new ThreadLocal<>(); + ThreadLocal decoderCache = new ThreadLocal<>(); + wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), readerIterator, + new UpdateHandler(mergeHandle), record -> { + if (!externalSchemaTransformation) { + return record; + } + return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record); + }, table.getPreExecuteRunnable()); + wrapper.execute(); + } catch (Exception e) { + throw new HoodieException(e); + } finally { + // HUDI-2875: mergeHandle is not thread safe, we should totally terminate record inputting + // and executor firstly and then close mergeHandle. + baseFileReader.close(); + if (bootstrapFileReader != null) { + bootstrapFileReader.close(); + } + if (null != wrapper) { + wrapper.shutdownNow(); + wrapper.awaitTermination(); + } + mergeHandle.close(); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieWriteHelper.java new file mode 100644 index 0000000000000..b359550e8a7b6 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieWriteHelper.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieTable; + +public class HoodieWriteHelper extends BaseWriteHelper>, + HoodieData, HoodieData, R> { + + private HoodieWriteHelper() { + } + + private static class WriteHelperHolder { + private static final HoodieWriteHelper HOODIE_WRITE_HELPER = new HoodieWriteHelper<>(); + } + + public static HoodieWriteHelper newInstance() { + return WriteHelperHolder.HOODIE_WRITE_HELPER; + } + + @Override + protected HoodieData> tag(HoodieData> dedupedRecords, HoodieEngineContext context, + HoodieTable>, HoodieData, HoodieData> table) { + return table.getIndex().tagLocation(dedupedRecords, context, table); + } + + @Override + public HoodieData> deduplicateRecords( + HoodieData> records, HoodieIndex index, int parallelism) { + boolean isIndexingGlobal = index.isGlobal(); + // Auto-tunes the parallelism for reduce transformation based on the number of data partitions + // in engine-specific representation + int reduceParallelism = Math.max(1, Math.min(records.getNumPartitions(), parallelism)); + return records.mapToPair(record -> { + HoodieKey hoodieKey = record.getKey(); + // If index used is global, then records are expected to differ in their partitionPath + Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; + return Pair.of(key, record); + }).reduceByKey((rec1, rec2) -> { + @SuppressWarnings("unchecked") + T reducedData = (T) rec2.getData().preCombine(rec1.getData(), CollectionUtils.emptyProps()); + HoodieKey reducedKey = rec1.getData().equals(reducedData) ? 
rec1.getKey() : rec2.getKey(); + + return new HoodieAvroRecord<>(reducedKey, reducedData); + }, reduceParallelism).map(Pair::getRight); + } + +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/Partitioner.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/Partitioner.java similarity index 100% rename from hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/Partitioner.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/Partitioner.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/AbstractCompactHelpers.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/AbstractCompactHelpers.java deleted file mode 100644 index 3ff9e625e8c7c..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/AbstractCompactHelpers.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieCompactionException; -import org.apache.hudi.table.HoodieTable; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; - -/** - * Base class helps to perform compact. 
- * - * @param Type of payload in {@link org.apache.hudi.common.model.HoodieRecord} - * @param Type of inputs - * @param Type of keys - * @param Type of outputs - */ -public abstract class AbstractCompactHelpers { - public abstract HoodieCommitMetadata createCompactionMetadata(HoodieTable table, - String compactionInstantTime, - O writeStatuses, - String schema) throws IOException; - - public void completeInflightCompaction(HoodieTable table, String compactionCommitTime, HoodieCommitMetadata commitMetadata) { - HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - try { - activeTimeline.transitionCompactionInflightToComplete( - new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime), - Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - } catch (IOException e) { - throw new HoodieCompactionException( - "Failed to commit " + table.getMetaClient().getBasePath() + " at time " + compactionCommitTime, e); - } - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/BaseScheduleCompactionActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/BaseScheduleCompactionActionExecutor.java deleted file mode 100644 index d21c7d96d113a..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/BaseScheduleCompactionActionExecutor.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.BaseActionExecutor; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -public abstract class BaseScheduleCompactionActionExecutor extends BaseActionExecutor> { - - private final Option> extraMetadata; - - public BaseScheduleCompactionActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - Option> extraMetadata) { - super(context, config, table, instantTime); - this.extraMetadata = extraMetadata; - } - - protected abstract HoodieCompactionPlan scheduleCompaction(); - - @Override - public Option execute() { - // if there are inflight writes, their instantTime must not be less than that of compaction instant time - table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant() - .ifPresent(earliestInflight -> ValidationUtils.checkArgument( - HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime), - "Earliest write inflight instant time must be later than compaction time. 
Earliest :" + earliestInflight - + ", Compaction scheduled at " + instantTime)); - - // Committed and pending compaction instants should have strictly lower timestamps - List conflictingInstants = table.getActiveTimeline() - .getCommitsAndCompactionTimeline().getInstants() - .filter(instant -> HoodieTimeline.compareTimestamps( - instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime)) - .collect(Collectors.toList()); - ValidationUtils.checkArgument(conflictingInstants.isEmpty(), - "Following instants have timestamps >= compactionInstant (" + instantTime + ") Instants :" - + conflictingInstants); - - HoodieCompactionPlan plan = scheduleCompaction(); - if (plan != null && (plan.getOperations() != null) && (!plan.getOperations().isEmpty())) { - extraMetadata.ifPresent(plan::setExtraMetadata); - HoodieInstant compactionInstant = - new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime); - try { - table.getActiveTimeline().saveToCompactionRequested(compactionInstant, - TimelineMetadataUtils.serializeCompactionPlan(plan)); - } catch (IOException ioe) { - throw new HoodieIOException("Exception scheduling compaction", ioe); - } - return Option.of(plan); - } - return Option.empty(); - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactHelpers.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactHelpers.java new file mode 100644 index 0000000000000..3379d16f4c035 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactHelpers.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieCompactionException; +import org.apache.hudi.table.HoodieTable; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.List; + +/** + * Base class helps to perform compact. 
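+ * Provides helpers to build compaction commit metadata from write statuses and to transition an inflight compaction instant to complete.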
+ * + * @param Type of payload in {@link org.apache.hudi.common.model.HoodieRecord} + * @param Type of inputs + * @param Type of keys + * @param Type of outputs + */ +public class CompactHelpers { + + private static final CompactHelpers SINGLETON_INSTANCE = new CompactHelpers(); + + private CompactHelpers() { + } + + public static CompactHelpers getInstance() { + return SINGLETON_INSTANCE; + } + + public HoodieCommitMetadata createCompactionMetadata( + HoodieTable table, String compactionInstantTime, HoodieData writeStatuses, + String schema) throws IOException { + byte[] planBytes = table.getActiveTimeline().readCompactionPlanAsBytes( + HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get(); + HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planBytes); + List updateStatusMap = writeStatuses.map(WriteStatus::getStat).collectAsList(); + HoodieCommitMetadata metadata = new HoodieCommitMetadata(true); + for (HoodieWriteStat stat : updateStatusMap) { + metadata.addWriteStat(stat.getPartitionPath(), stat); + } + metadata.addMetadata(org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY, schema); + if (compactionPlan.getExtraMetadata() != null) { + compactionPlan.getExtraMetadata().forEach(metadata::addMetadata); + } + return metadata; + } + + public void completeInflightCompaction(HoodieTable table, String compactionCommitTime, HoodieCommitMetadata commitMetadata) { + HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); + try { + activeTimeline.transitionCompactionInflightToComplete( + HoodieTimeline.getCompactionInflightInstant(compactionCommitTime), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + } catch (IOException e) { + throw new HoodieCompactionException( + "Failed to commit " + table.getMetaClient().getBasePath() + " at time " + compactionCommitTime, e); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactionTriggerStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactionTriggerStrategy.java new file mode 100644 index 0000000000000..ec85978552c23 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/CompactionTriggerStrategy.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.compact; + +public enum CompactionTriggerStrategy { + // trigger compaction when reach N delta commits + NUM_COMMITS, + // trigger compaction when reach N delta commits since last compaction request + NUM_COMMITS_AFTER_LAST_REQUEST, + // trigger compaction when time elapsed > N seconds since last compaction + TIME_ELAPSED, + // trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied + NUM_AND_TIME, + // trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied + NUM_OR_TIME +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java index e5212e8f43a5b..75954872aedd5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java @@ -18,39 +18,313 @@ package org.apache.hudi.table.action.compact; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieCompactionOperation; import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.CompactionOperation; +import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.apache.hudi.io.IOUtils; +import org.apache.hudi.table.HoodieCompactionHandler; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; import java.util.Set; +import 
java.util.stream.StreamSupport; + +import static java.util.stream.Collectors.toList; /** * A HoodieCompactor runs compaction on a hoodie table. */ -public interface HoodieCompactor extends Serializable { +public abstract class HoodieCompactor implements Serializable { + + private static final Logger LOG = LogManager.getLogger(HoodieCompactor.class); /** - * Generate a new compaction plan for scheduling. + * Handles the compaction timeline based on the compaction instant before actual compaction. * - * @param context HoodieEngineContext - * @param hoodieTable Hoodie Table - * @param config Hoodie Write Configuration - * @param compactionCommitTime scheduled compaction commit time - * @param fgIdsInPendingCompactions partition-fileId pairs for which compaction is pending - * @return Compaction Plan - * @throws IOException when encountering errors + * @param table {@link HoodieTable} instance to use. + * @param pendingCompactionTimeline pending compaction timeline. + * @param compactionInstantTime compaction instant + */ + public abstract void preCompact( + HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime); + + /** + * Maybe persist write status. + * + * @param writeStatus {@link HoodieData} of {@link WriteStatus}. */ - HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, HoodieTable hoodieTable, HoodieWriteConfig config, - String compactionCommitTime, Set fgIdsInPendingCompactions) throws IOException; + public abstract void maybePersist(HoodieData writeStatus, HoodieWriteConfig config); /** * Execute compaction operations and report back status. */ - O compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, - HoodieWriteConfig config, String compactionInstantTime) throws IOException; + public HoodieData compact( + HoodieEngineContext context, HoodieCompactionPlan compactionPlan, + HoodieTable table, HoodieWriteConfig config, String compactionInstantTime, + HoodieCompactionHandler compactionHandler) { + if (compactionPlan == null || (compactionPlan.getOperations() == null) + || (compactionPlan.getOperations().isEmpty())) { + return context.emptyHoodieData(); + } + HoodieActiveTimeline timeline = table.getActiveTimeline(); + HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); + // Mark instant as compaction inflight + timeline.transitionCompactionRequestedToInflight(instant); + table.getMetaClient().reloadActiveTimeline(); + + HoodieTableMetaClient metaClient = table.getMetaClient(); + TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); + + // Here we firstly use the table schema as the reader schema to read + // log file.That is because in the case of MergeInto, the config.getSchema may not + // the same with the table schema. + try { + if (StringUtils.isNullOrEmpty(config.getInternalSchema())) { + Schema readerSchema = schemaResolver.getTableAvroSchema(false); + config.setSchema(readerSchema.toString()); + } + } catch (Exception e) { + // If there is no commit in the table, just ignore the exception. 
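+      // In that case, the schema already present in the write config is used as the reader schema for the log files.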
+ } + + // Compacting is very similar to applying updates to existing file + List operations = compactionPlan.getOperations().stream() + .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList()); + LOG.info("Compactor compacting " + operations + " files"); + + context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices: " + config.getTableName()); + TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier(); + return context.parallelize(operations).map(operation -> compact( + compactionHandler, metaClient, config, operation, compactionInstantTime, taskContextSupplier)) + .flatMap(List::iterator); + } + + /** + * Execute a single compaction operation and report back status. + */ + public List compact(HoodieCompactionHandler compactionHandler, + HoodieTableMetaClient metaClient, + HoodieWriteConfig config, + CompactionOperation operation, + String instantTime, + TaskContextSupplier taskContextSupplier) throws IOException { + FileSystem fs = metaClient.getFs(); + Schema readerSchema; + Option internalSchemaOption = Option.empty(); + if (!StringUtils.isNullOrEmpty(config.getInternalSchema())) { + readerSchema = new Schema.Parser().parse(config.getSchema()); + internalSchemaOption = SerDeHelper.fromJson(config.getInternalSchema()); + // its safe to modify config here, since we running in task side. + ((HoodieTable) compactionHandler).getConfig().setDefault(config); + } else { + readerSchema = HoodieAvroUtils.addMetadataFields( + new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField()); + } + LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames() + + " for commit " + instantTime); + // TODO - FIX THIS + // Reads the entire avro file. Always only specific blocks should be read from the avro file + // (failure recover). + // Load all the delta commits since the last compaction commit and get all the blocks to be + // loaded and load it using CompositeAvroLogReader + // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. 
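+    // maxInstantTime below is the timestamp of the latest completed commit/rollback/delta-commit on the timeline; the merged log record scanner only reads log records up to that instant.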
+ String maxInstantTime = metaClient + .getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, + HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION)) + .filterCompletedInstants().lastInstant().get().getTimestamp(); + long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(taskContextSupplier, config); + LOG.info("MaxMemoryPerCompaction => " + maxMemoryPerCompaction); + + List logFiles = operation.getDeltaFileNames().stream().map( + p -> new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), operation.getPartitionPath()), p).toString()) + .collect(toList()); + HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(metaClient.getBasePath()) + .withLogFilePaths(logFiles) + .withReaderSchema(readerSchema) + .withLatestInstantTime(maxInstantTime) + .withInternalSchema(internalSchemaOption.orElse(InternalSchema.getEmptyInternalSchema())) + .withMaxMemorySizeInBytes(maxMemoryPerCompaction) + .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled()) + .withReverseReader(config.getCompactionReverseLogReadEnabled()) + .withBufferSize(config.getMaxDFSStreamBufferSize()) + .withSpillableMapBasePath(config.getSpillableMapBasePath()) + .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .withOperationField(config.allowOperationMetadataField()) + .withPartition(operation.getPartitionPath()) + .build(); + + Option oldDataFileOpt = + operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath()); + + // Considering following scenario: if all log blocks in this fileSlice is rollback, it returns an empty scanner. + // But in this case, we need to give it a base file. Otherwise, it will lose base file in following fileSlice. + if (!scanner.iterator().hasNext()) { + if (!oldDataFileOpt.isPresent()) { + scanner.close(); + return new ArrayList<>(); + } else { + // TODO: we may directly rename original parquet file if there is not evolution/devolution of schema + /* + TaskContextSupplier taskContextSupplier = hoodieCopyOnWriteTable.getTaskContextSupplier(); + String newFileName = FSUtils.makeDataFileName(instantTime, + FSUtils.makeWriteToken(taskContextSupplier.getPartitionIdSupplier().get(), taskContextSupplier.getStageIdSupplier().get(), taskContextSupplier.getAttemptIdSupplier().get()), + operation.getFileId(), hoodieCopyOnWriteTable.getBaseFileExtension()); + Path oldFilePath = new Path(oldDataFileOpt.get().getPath()); + Path newFilePath = new Path(oldFilePath.getParent(), newFileName); + FileUtil.copy(fs,oldFilePath, fs, newFilePath, false, fs.getConf()); + */ + } + } + + // Compacting is very similar to applying updates to existing file + Iterator> result; + // If the dataFile is present, perform updates else perform inserts into a new base file. 
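A hedged sketch of the update-vs-insert dispatch described in the comment above; Handler is a hypothetical stand-in for HoodieCompactionHandler and the signatures are simplified:

import java.util.List;
import java.util.Optional;

public class CompactionDispatchSketch {

  interface Handler {
    List<String> handleUpdate(String instantTime, String partition, String fileId, String baseFile);

    List<String> handleInsert(String instantTime, String partition, String fileId);
  }

  static List<String> compactSlice(Handler handler, String instantTime, String partition,
                                   String fileId, Optional<String> baseFile) {
    // With a base file present, merge the scanned log records into it (update path);
    // otherwise the merged records seed a brand-new base file (insert path).
    return baseFile.isPresent()
        ? handler.handleUpdate(instantTime, partition, fileId, baseFile.get())
        : handler.handleInsert(instantTime, partition, fileId);
  }

  public static void main(String[] args) {
    Handler handler = new Handler() {
      public List<String> handleUpdate(String t, String p, String f, String base) {
        return List.of("updated " + base);
      }

      public List<String> handleInsert(String t, String p, String f) {
        return List.of("inserted new base file for " + f);
      }
    };
    System.out.println(compactSlice(handler, "20220101000000", "2022/01/01", "f1", Optional.empty()));
  }
}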
+ if (oldDataFileOpt.isPresent()) { + result = compactionHandler.handleUpdate(instantTime, operation.getPartitionPath(), + operation.getFileId(), scanner.getRecords(), + oldDataFileOpt.get()); + } else { + result = compactionHandler.handleInsert(instantTime, operation.getPartitionPath(), operation.getFileId(), + scanner.getRecords()); + } + scanner.close(); + Iterable> resultIterable = () -> result; + return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> { + s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog()); + s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles()); + s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); + s.getStat().setPartitionPath(operation.getPartitionPath()); + s.getStat() + .setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue()); + s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks()); + s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks()); + s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks()); + RuntimeStats runtimeStats = new RuntimeStats(); + runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks()); + s.getStat().setRuntimeStats(runtimeStats); + }).collect(toList()); + } + + /** + * Generate a new compaction plan for scheduling. + * + * @param context HoodieEngineContext + * @param hoodieTable Hoodie Table + * @param config Hoodie Write Configuration + * @param compactionCommitTime scheduled compaction commit time + * @param fgIdsInPendingCompactionAndClustering partition-fileId pairs for which compaction is pending + * @return Compaction Plan + * @throws IOException when encountering errors + */ + HoodieCompactionPlan generateCompactionPlan( + HoodieEngineContext context, HoodieTable hoodieTable, HoodieWriteConfig config, + String compactionCommitTime, Set fgIdsInPendingCompactionAndClustering) throws IOException { + // Accumulator to keep track of total log files for a table + HoodieAccumulator totalLogFiles = context.newAccumulator(); + // Accumulator to keep track of total log file slices for a table + HoodieAccumulator totalFileSlices = context.newAccumulator(); + + ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, + "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " + + hoodieTable.getMetaClient().getTableType().name()); + + // TODO : check if maxMemory is not greater than JVM or executor memory + // TODO - rollback any compactions in flight + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); + List partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); + + // filter the partition paths if needed to reduce list status + partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths); + + if (partitionPaths.isEmpty()) { + // In case no partitions could be picked, return no compaction plan + return null; + } + + SliceView fileSystemView = hoodieTable.getSliceView(); + LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); + context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact: " + config.getTableName()); + + List operations = context.flatMap(partitionPaths, partitionPath -> fileSystemView + 
.getLatestFileSlices(partitionPath) + .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId())) + .map(s -> { + List logFiles = + s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(toList()); + totalLogFiles.add(logFiles.size()); + totalFileSlices.add(1L); + // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO + // for Map operations and collecting them finally in Avro generated classes for storing + // into meta files. + Option dataFile = s.getBaseFile(); + return new CompactionOperation(dataFile, partitionPath, logFiles, + config.getCompactionStrategy().captureMetrics(config, s)); + }) + .filter(c -> !c.getDeltaFileNames().isEmpty()), partitionPaths.size()).stream() + .map(CompactionUtils::buildHoodieCompactionOperation).collect(toList()); + + LOG.info("Total of " + operations.size() + " compactions are retrieved"); + LOG.info("Total number of latest files slices " + totalFileSlices.value()); + LOG.info("Total number of log files " + totalLogFiles.value()); + LOG.info("Total number of file slices " + totalFileSlices.value()); + // Filter the compactions with the passed in filter. This lets us choose most effective + // compactions only + HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, + CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList())); + ValidationUtils.checkArgument( + compactionPlan.getOperations().stream().noneMatch( + op -> fgIdsInPendingCompactionAndClustering.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), + "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " + + "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactionAndClustering + + ", Selected workload :" + compactionPlan); + if (compactionPlan.getOperations().isEmpty()) { + LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath()); + } + return compactionPlan; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/RunCompactionActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/RunCompactionActionExecutor.java new file mode 100644 index 0000000000000..fc4ae986e6d55 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/RunCompactionActionExecutor.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.InternalSchemaCache; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieCompactionException; +import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.apache.hudi.table.HoodieCompactionHandler; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.io.IOException; +import java.util.List; + +@SuppressWarnings("checkstyle:LineLength") +public class RunCompactionActionExecutor extends + BaseActionExecutor>, HoodieData, HoodieData, HoodieWriteMetadata>> { + + private final HoodieCompactor compactor; + private final HoodieCompactionHandler compactionHandler; + + public RunCompactionActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + HoodieCompactor compactor, + HoodieCompactionHandler compactionHandler) { + super(context, config, table, instantTime); + this.compactor = compactor; + this.compactionHandler = compactionHandler; + } + + @Override + public HoodieWriteMetadata> execute() { + HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); + compactor.preCompact(table, pendingCompactionTimeline, instantTime); + + HoodieWriteMetadata> compactionMetadata = new HoodieWriteMetadata<>(); + try { + // generate compaction plan + // should support configurable commit metadata + HoodieCompactionPlan compactionPlan = + CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime); + + // try to load internalSchema to support schema Evolution + HoodieWriteConfig configCopy = config; + Pair, Option> schemaPair = InternalSchemaCache + .getInternalSchemaAndAvroSchemaForClusteringAndCompaction(table.getMetaClient(), instantTime); + if (schemaPair.getLeft().isPresent() && schemaPair.getRight().isPresent()) { + // should not influence the original config, just copy it + configCopy = HoodieWriteConfig.newBuilder().withProperties(config.getProps()).build(); + configCopy.setInternalSchemaString(schemaPair.getLeft().get()); + configCopy.setSchema(schemaPair.getRight().get()); + } + + HoodieData statuses = compactor.compact( + context, compactionPlan, table, configCopy, instantTime, compactionHandler); + + compactor.maybePersist(statuses, config); + context.setJobStatus(this.getClass().getSimpleName(), "Preparing compaction metadata: " + config.getTableName()); + List updateStatusMap = statuses.map(WriteStatus::getStat).collectAsList(); + HoodieCommitMetadata metadata = new HoodieCommitMetadata(true); + for (HoodieWriteStat stat : updateStatusMap) { + metadata.addWriteStat(stat.getPartitionPath(), stat); + 
} + metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema()); + if (schemaPair.getLeft().isPresent()) { + metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, schemaPair.getLeft().get()); + metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schemaPair.getRight().get()); + } + metadata.setOperationType(WriteOperationType.COMPACT); + compactionMetadata.setWriteStatuses(statuses); + compactionMetadata.setCommitted(false); + compactionMetadata.setCommitMetadata(Option.of(metadata)); + } catch (IOException e) { + throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e); + } + + return compactionMetadata; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java new file mode 100644 index 0000000000000..4fb5f9f7ddba5 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.view.SyncableFileSystemView; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieCompactionException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +public class ScheduleCompactionActionExecutor extends BaseActionExecutor> { + + private static final Logger LOG = LogManager.getLogger(ScheduleCompactionActionExecutor.class); + + private final Option> extraMetadata; + private final HoodieCompactor compactor; + + public ScheduleCompactionActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + Option> extraMetadata, + HoodieCompactor compactor) { + super(context, config, table, instantTime); + this.extraMetadata = extraMetadata; + this.compactor = compactor; + } + + @Override + public Option execute() { + if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl() + && !config.getFailedWritesCleanPolicy().isLazy()) { + // TODO(yihua): this validation is removed for Java client used by kafka-connect. Need to revisit this. + if (config.getEngineType() == EngineType.SPARK) { + // if there are inflight writes, their instantTime must not be less than that of compaction instant time + table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant() + .ifPresent(earliestInflight -> ValidationUtils.checkArgument( + HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime), + "Earliest write inflight instant time must be later than compaction time. 
Earliest :" + earliestInflight + + ", Compaction scheduled at " + instantTime)); + } + // Committed and pending compaction instants should have strictly lower timestamps + List conflictingInstants = table.getActiveTimeline() + .getWriteTimeline().filterCompletedAndCompactionInstants().getInstants() + .filter(instant -> HoodieTimeline.compareTimestamps( + instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime)) + .collect(Collectors.toList()); + ValidationUtils.checkArgument(conflictingInstants.isEmpty(), + "Following instants have timestamps >= compactionInstant (" + instantTime + ") Instants :" + + conflictingInstants); + } + + HoodieCompactionPlan plan = scheduleCompaction(); + if (plan != null && (plan.getOperations() != null) && (!plan.getOperations().isEmpty())) { + extraMetadata.ifPresent(plan::setExtraMetadata); + HoodieInstant compactionInstant = + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime); + try { + table.getActiveTimeline().saveToCompactionRequested(compactionInstant, + TimelineMetadataUtils.serializeCompactionPlan(plan)); + } catch (IOException ioe) { + throw new HoodieIOException("Exception scheduling compaction", ioe); + } + return Option.of(plan); + } + return Option.empty(); + } + + private HoodieCompactionPlan scheduleCompaction() { + LOG.info("Checking if compaction needs to be run on " + config.getBasePath()); + // judge if we need to compact according to num delta commits and time elapsed + boolean compactable = needCompact(config.getInlineCompactTriggerStrategy()); + if (compactable) { + LOG.info("Generating compaction plan for merge on read table " + config.getBasePath()); + try { + SyncableFileSystemView fileSystemView = (SyncableFileSystemView) table.getSliceView(); + Set fgInPendingCompactionAndClustering = fileSystemView.getPendingCompactionOperations() + .map(instantTimeOpPair -> instantTimeOpPair.getValue().getFileGroupId()) + .collect(Collectors.toSet()); + // exclude files in pending clustering from compaction. 
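A small illustrative sketch of the exclusion rule above, using plain strings in place of HoodieFileGroupId: file groups already targeted by a pending compaction or a pending clustering plan are unioned into one set and removed from the schedulable candidates.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class PendingFileGroupExclusionSketch {
  public static void main(String[] args) {
    Set<String> pendingCompaction = new HashSet<>(Arrays.asList("p1:f1"));
    Set<String> pendingClustering = new HashSet<>(Arrays.asList("p1:f2"));

    // Union the two sets, as fgInPendingCompactionAndClustering does in the executor above.
    Set<String> excluded = new HashSet<>(pendingCompaction);
    excluded.addAll(pendingClustering);

    List<String> candidates = Arrays.asList("p1:f1", "p1:f2", "p1:f3");
    List<String> schedulable = candidates.stream()
        .filter(fg -> !excluded.contains(fg))
        .collect(Collectors.toList());
    System.out.println(schedulable); // [p1:f3]
  }
}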
+ fgInPendingCompactionAndClustering.addAll(fileSystemView.getFileGroupsInPendingClustering().map(Pair::getLeft).collect(Collectors.toSet())); + context.setJobStatus(this.getClass().getSimpleName(), "Compaction: generating compaction plan: " + config.getTableName()); + return compactor.generateCompactionPlan(context, table, config, instantTime, fgInPendingCompactionAndClustering); + } catch (IOException e) { + throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e); + } + } + + return new HoodieCompactionPlan(); + } + + private Option> getLatestDeltaCommitInfo() { + Option> deltaCommitsInfo = + CompactionUtils.getDeltaCommitsSinceLatestCompaction(table.getActiveTimeline()); + if (deltaCommitsInfo.isPresent()) { + return Option.of(Pair.of( + deltaCommitsInfo.get().getLeft().countInstants(), + deltaCommitsInfo.get().getRight().getTimestamp())); + } + return Option.empty(); + } + + private Option> getLatestDeltaCommitInfoSinceLastCompactionRequest() { + Option> deltaCommitsInfo = + CompactionUtils.getDeltaCommitsSinceLatestCompactionRequest(table.getActiveTimeline()); + if (deltaCommitsInfo.isPresent()) { + return Option.of(Pair.of( + deltaCommitsInfo.get().getLeft().countInstants(), + deltaCommitsInfo.get().getRight().getTimestamp())); + } + return Option.empty(); + } + + private boolean needCompact(CompactionTriggerStrategy compactionTriggerStrategy) { + boolean compactable; + // get deltaCommitsSinceLastCompaction and lastCompactionTs + Option> latestDeltaCommitInfoOption = getLatestDeltaCommitInfo(); + if (!latestDeltaCommitInfoOption.isPresent()) { + return false; + } + Pair latestDeltaCommitInfo = latestDeltaCommitInfoOption.get(); + int inlineCompactDeltaCommitMax = config.getInlineCompactDeltaCommitMax(); + int inlineCompactDeltaSecondsMax = config.getInlineCompactDeltaSecondsMax(); + switch (compactionTriggerStrategy) { + case NUM_COMMITS: + compactable = inlineCompactDeltaCommitMax <= latestDeltaCommitInfo.getLeft(); + if (compactable) { + LOG.info(String.format("The delta commits >= %s, trigger compaction scheduler.", inlineCompactDeltaCommitMax)); + } + break; + case NUM_COMMITS_AFTER_LAST_REQUEST: + latestDeltaCommitInfoOption = getLatestDeltaCommitInfoSinceLastCompactionRequest(); + + if (!latestDeltaCommitInfoOption.isPresent()) { + return false; + } + latestDeltaCommitInfo = latestDeltaCommitInfoOption.get(); + compactable = inlineCompactDeltaCommitMax <= latestDeltaCommitInfo.getLeft(); + if (compactable) { + LOG.info(String.format("The delta commits >= %s since the last compaction request, trigger compaction scheduler.", inlineCompactDeltaCommitMax)); + } + break; + case TIME_ELAPSED: + compactable = inlineCompactDeltaSecondsMax <= parsedToSeconds(instantTime) - parsedToSeconds(latestDeltaCommitInfo.getRight()); + if (compactable) { + LOG.info(String.format("The elapsed time >=%ss, trigger compaction scheduler.", inlineCompactDeltaSecondsMax)); + } + break; + case NUM_OR_TIME: + compactable = inlineCompactDeltaCommitMax <= latestDeltaCommitInfo.getLeft() + || inlineCompactDeltaSecondsMax <= parsedToSeconds(instantTime) - parsedToSeconds(latestDeltaCommitInfo.getRight()); + if (compactable) { + LOG.info(String.format("The delta commits >= %s or elapsed_time >=%ss, trigger compaction scheduler.", inlineCompactDeltaCommitMax, + inlineCompactDeltaSecondsMax)); + } + break; + case NUM_AND_TIME: + compactable = inlineCompactDeltaCommitMax <= latestDeltaCommitInfo.getLeft() + && inlineCompactDeltaSecondsMax <= parsedToSeconds(instantTime) - 
parsedToSeconds(latestDeltaCommitInfo.getRight()); + if (compactable) { + LOG.info(String.format("The delta commits >= %s and elapsed_time >=%ss, trigger compaction scheduler.", inlineCompactDeltaCommitMax, + inlineCompactDeltaSecondsMax)); + } + break; + default: + throw new HoodieCompactionException("Unsupported compaction trigger strategy: " + config.getInlineCompactTriggerStrategy()); + } + return compactable; + } + + private Long parsedToSeconds(String time) { + long timestamp; + try { + timestamp = HoodieActiveTimeline.parseDateFromInstantTime(time).getTime() / 1000; + } catch (ParseException e) { + throw new HoodieCompactionException(e.getMessage(), e); + } + return timestamp; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/BoundedPartitionAwareCompactionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/BoundedPartitionAwareCompactionStrategy.java index 747e0b2f3c47d..09c19b1aabe85 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/BoundedPartitionAwareCompactionStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/BoundedPartitionAwareCompactionStrategy.java @@ -39,14 +39,17 @@ */ public class BoundedPartitionAwareCompactionStrategy extends DayBasedCompactionStrategy { - SimpleDateFormat dateFormat = new SimpleDateFormat(DayBasedCompactionStrategy.DATE_PARTITION_FORMAT); + // NOTE: {@code SimpleDataFormat} is NOT thread-safe + // TODO replace w/ DateTimeFormatter + private final ThreadLocal dateFormat = + ThreadLocal.withInitial(() -> new SimpleDateFormat(DayBasedCompactionStrategy.DATE_PARTITION_FORMAT)); @Override public List orderAndFilter(HoodieWriteConfig writeConfig, List operations, List pendingCompactionPlans) { // The earliest partition to compact - current day minus the target partitions limit String earliestPartitionPathToCompact = - dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); + dateFormat.get().format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); // Filter out all partitions greater than earliestPartitionPathToCompact return operations.stream().collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet() @@ -59,7 +62,7 @@ public List orderAndFilter(HoodieWriteConfig writeCon public List filterPartitionPaths(HoodieWriteConfig writeConfig, List partitionPaths) { // The earliest partition to compact - current day minus the target partitions limit String earliestPartitionPathToCompact = - dateFormat.format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); + dateFormat.get().format(getDateAtOffsetFromToday(-1 * writeConfig.getTargetPartitionsPerDayBasedCompaction())); // Get all partitions and sort them return partitionPaths.stream().map(partition -> partition.replace("/", "-")) .sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/")) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/CompactionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/CompactionStrategy.java index 6c631c462bda1..ff2dfbd4c1ecb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/CompactionStrategy.java +++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/CompactionStrategy.java @@ -20,14 +20,13 @@ import org.apache.hudi.avro.model.HoodieCompactionOperation; import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; +import org.apache.hudi.client.utils.FileSliceMetricUtils; import org.apache.hudi.config.HoodieWriteConfig; import java.io.Serializable; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -50,32 +49,14 @@ public abstract class CompactionStrategy implements Serializable { * Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the metrics they need * to decide on the priority. * - * @param dataFile - Base file to compact - * @param partitionPath - Partition path - * @param logFiles - List of log files to compact with the base file + * @param writeConfig write configuration. + * @param slice fileSlice to capture metrics for. * @return Map[String, Object] - metrics captured */ - public Map captureMetrics(HoodieWriteConfig writeConfig, Option dataFile, - String partitionPath, List logFiles) { + public Map captureMetrics(HoodieWriteConfig writeConfig, FileSlice slice) { Map metrics = new HashMap<>(); long defaultMaxParquetFileSize = writeConfig.getParquetMaxFileSize(); - // Total size of all the log files - Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0) - .reduce(Long::sum).orElse(0L); - // Total read will be the base file + all the log files - Long totalIORead = - FSUtils.getSizeInMB((dataFile.isPresent() ? dataFile.get().getFileSize() : 0L) + totalLogFileSize); - // Total write will be similar to the size of the base file - Long totalIOWrite = - FSUtils.getSizeInMB(dataFile.isPresent() ? 
dataFile.get().getFileSize() : defaultMaxParquetFileSize); - // Total IO will the the IO for read + write - long totalIO = totalIORead + totalIOWrite; - // Save these metrics and we will use during the filter - metrics.put(TOTAL_IO_READ_MB, totalIORead.doubleValue()); - metrics.put(TOTAL_IO_WRITE_MB, totalIOWrite.doubleValue()); - metrics.put(TOTAL_IO_MB, (double) totalIO); - metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue()); - metrics.put(TOTAL_LOG_FILES, (double) logFiles.size()); + FileSliceMetricUtils.addFileSliceCommonMetrics(Collections.singletonList(slice), metrics, defaultMaxParquetFileSize); return metrics; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/DayBasedCompactionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/DayBasedCompactionStrategy.java index 4a12bb8a08b72..424ce51d1ef51 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/DayBasedCompactionStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/DayBasedCompactionStrategy.java @@ -41,12 +41,18 @@ public class DayBasedCompactionStrategy extends CompactionStrategy { // For now, use SimpleDateFormat as default partition format protected static final String DATE_PARTITION_FORMAT = "yyyy/MM/dd"; // Sorts compaction in LastInFirstCompacted order + + // NOTE: {@code SimpleDataFormat} is NOT thread-safe + // TODO replace w/ DateTimeFormatter + private static final ThreadLocal DATE_FORMAT = + ThreadLocal.withInitial(() -> new SimpleDateFormat(DATE_PARTITION_FORMAT, Locale.ENGLISH)); + protected static Comparator comparator = (String leftPartition, String rightPartition) -> { try { leftPartition = getPartitionPathWithoutPartitionKeys(leftPartition); rightPartition = getPartitionPathWithoutPartitionKeys(rightPartition); - Date left = new SimpleDateFormat(DATE_PARTITION_FORMAT, Locale.ENGLISH).parse(leftPartition); - Date right = new SimpleDateFormat(DATE_PARTITION_FORMAT, Locale.ENGLISH).parse(rightPartition); + Date left = DATE_FORMAT.get().parse(leftPartition); + Date right = DATE_FORMAT.get().parse(rightPartition); return left.after(right) ? -1 : right.after(left) ? 
1 : 0; } catch (ParseException e) { throw new HoodieException("Invalid Partition Date Format", e); @@ -72,7 +78,8 @@ public List orderAndFilter(HoodieWriteConfig writeCon public List filterPartitionPaths(HoodieWriteConfig writeConfig, List allPartitionPaths) { return allPartitionPaths.stream().map(partition -> partition.replace("/", "-")) .sorted(Comparator.reverseOrder()).map(partitionPath -> partitionPath.replace("-", "/")) - .collect(Collectors.toList()).subList(0, writeConfig.getTargetPartitionsPerDayBasedCompaction()); + .collect(Collectors.toList()).subList(0, Math.min(allPartitionPaths.size(), + writeConfig.getTargetPartitionsPerDayBasedCompaction())); } /** diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/LogFileNumBasedCompactionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/LogFileNumBasedCompactionStrategy.java new file mode 100644 index 0000000000000..6f79b684d0a46 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/LogFileNumBasedCompactionStrategy.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.compact.strategy; + +import org.apache.hudi.avro.model.HoodieCompactionOperation; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +/** + * LogFileLengthBasedCompactionStrategy orders the compactions based on the total log files num, + * filters the file group which log files length is greater than the threshold and limits the compactions within a configured IO bound. 
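A minimal sketch of that ordering/filtering rule, with Op as a placeholder for HoodieCompactionOperation and the parent strategy's IO bound omitted:

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

public class LogFileNumOrderingSketch {

  static class Op {
    final String fileId;
    final int deltaFileCount;

    Op(String fileId, int deltaFileCount) {
      this.fileId = fileId;
      this.deltaFileCount = deltaFileCount;
    }
  }

  static List<Op> orderAndFilter(List<Op> operations, int numThreshold) {
    // Drop operations with fewer than numThreshold delta (log) files,
    // then compact the ones with the most delta files first.
    return operations.stream()
        .filter(op -> op.deltaFileCount >= numThreshold)
        .sorted(Comparator.comparingInt((Op op) -> op.deltaFileCount).reversed())
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<Op> ops = Arrays.asList(new Op("f1", 2), new Op("f2", 7), new Op("f3", 4));
    orderAndFilter(ops, 3).forEach(op -> System.out.println(op.fileId)); // f2 then f3; f1 is dropped
  }
}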
+ */ +public class LogFileNumBasedCompactionStrategy extends BoundedIOCompactionStrategy + implements Comparator { + + @Override + public List orderAndFilter(HoodieWriteConfig writeConfig, List operations, List pendingCompactionPlans) { + Long numThreshold = writeConfig.getCompactionLogFileNumThreshold(); + List filterOperator = operations.stream() + .filter(e -> e.getDeltaFilePaths().size() >= numThreshold) + .sorted(this).collect(Collectors.toList()); + return super.orderAndFilter(writeConfig, filterOperator, pendingCompactionPlans); + } + + @Override + public int compare(HoodieCompactionOperation hco1, HoodieCompactionOperation hco2) { + return hco2.getDeltaFilePaths().size() - hco1.getDeltaFilePaths().size(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/LogFileSizeBasedCompactionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/LogFileSizeBasedCompactionStrategy.java index c9a811a1cceeb..c165141dfc5ee 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/LogFileSizeBasedCompactionStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/strategy/LogFileSizeBasedCompactionStrategy.java @@ -20,18 +20,15 @@ import org.apache.hudi.avro.model.HoodieCompactionOperation; import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import java.util.Comparator; import java.util.List; -import java.util.Map; import java.util.stream.Collectors; /** - * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size and limits the + * LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size, + * filters the file group which log files size is greater than the threshold and limits the * compactions within a configured IO bound. * * @see BoundedIOCompactionStrategy @@ -40,26 +37,15 @@ public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy implements Comparator { - private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE"; - - @Override - public Map captureMetrics(HoodieWriteConfig config, Option dataFile, - String partitionPath, List logFiles) { - Map metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles); - - // Total size of all the log files - Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(size -> size >= 0) - .reduce(Long::sum).orElse(0L); - // save the metrics needed during the order - metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue()); - return metrics; - } - @Override public List orderAndFilter(HoodieWriteConfig writeConfig, List operations, List pendingCompactionPlans) { + // Filter the file group which log files size is greater than the threshold in bytes. 
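A minimal sketch of the size-threshold filter, assuming each operation's metrics map carries a TOTAL_LOG_FILE_SIZE value in bytes as captured by the strategy; the types are simplified placeholders, not Hudi classes.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class LogFileSizeFilterSketch {
  static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE";

  static List<Map<String, Double>> filterBySize(List<Map<String, Double>> operations, long thresholdBytes) {
    // Keep only operations whose accumulated log size reaches the configured threshold.
    return operations.stream()
        .filter(metrics -> metrics.getOrDefault(TOTAL_LOG_FILE_SIZE, 0d) >= thresholdBytes)
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    Map<String, Double> small = new HashMap<>();
    small.put(TOTAL_LOG_FILE_SIZE, 10_000d);
    Map<String, Double> large = new HashMap<>();
    large.put(TOTAL_LOG_FILE_SIZE, 300_000_000d);
    System.out.println(filterBySize(Arrays.asList(small, large), 100_000_000L).size()); // 1
  }
}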
// Order the operations based on the reverse size of the logs and limit them by the IO - return super.orderAndFilter(writeConfig, operations.stream().sorted(this).collect(Collectors.toList()), + long threshold = writeConfig.getCompactionLogFileSizeThreshold(); + return super.orderAndFilter(writeConfig, operations.stream() + .filter(e -> e.getMetrics().getOrDefault(TOTAL_LOG_FILE_SIZE, 0d) >= threshold) + .sorted(this).collect(Collectors.toList()), pendingCompactionPlans); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java new file mode 100644 index 0000000000000..96d46928e7c3a --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java @@ -0,0 +1,409 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.index; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieIndexCommitMetadata; +import org.apache.hudi.avro.model.HoodieIndexPartitionInfo; +import org.apache.hudi.avro.model.HoodieIndexPlan; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.client.transaction.TransactionManager; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.CleanerUtils; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import 
java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.model.WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL; +import static org.apache.hudi.common.table.HoodieTableConfig.TABLE_METADATA_PARTITIONS; +import static org.apache.hudi.common.table.HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.COMPLETED; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.CLEAN_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.INDEXING_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.RESTORE_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_CONCURRENCY_MODE; +import static org.apache.hudi.metadata.HoodieTableMetadata.getMetadataTableBasePath; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.deleteMetadataPartition; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightAndCompletedMetadataPartitions; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightMetadataPartitions; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists; + +/** + * Reads the index plan and executes the plan. + * It also reconciles updates on data timeline while indexing was in progress. + */ +public class RunIndexActionExecutor extends BaseActionExecutor> { + + private static final Logger LOG = LogManager.getLogger(RunIndexActionExecutor.class); + private static final Integer INDEX_COMMIT_METADATA_VERSION_1 = 1; + private static final Integer LATEST_INDEX_COMMIT_METADATA_VERSION = INDEX_COMMIT_METADATA_VERSION_1; + private static final int MAX_CONCURRENT_INDEXING = 1; + private static final int TIMELINE_RELOAD_INTERVAL_MILLIS = 5000; + + // we use this to update the latest instant in data timeline that has been indexed in metadata table + // this needs to be volatile as it can be updated in the IndexingCheckTask spawned by this executor + // assumption is that only one indexer can execute at a time + private volatile String currentCaughtupInstant; + + private final TransactionManager txnManager; + + public RunIndexActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime) { + super(context, config, table, instantTime); + this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); + } + + @Override + public Option execute() { + HoodieTimer indexTimer = new HoodieTimer(); + indexTimer.startTimer(); + + HoodieInstant indexInstant = validateAndGetIndexInstant(); + // read HoodieIndexPlan + HoodieIndexPlan indexPlan; + try { + indexPlan = TimelineMetadataUtils.deserializeIndexPlan(table.getActiveTimeline().readIndexPlanAsBytes(indexInstant).get()); + } catch (IOException e) { + throw new HoodieIndexException("Failed to read the index plan for instant: " + indexInstant); + } + List indexPartitionInfos = indexPlan.getIndexPartitionInfos(); + try { + if (indexPartitionInfos == null || indexPartitionInfos.isEmpty()) { + throw new HoodieIndexException(String.format("No partitions to index for instant: %s", instantTime)); + } + boolean 
firstTimeInitializingMetadataTable = false; + HoodieIndexPartitionInfo fileIndexPartitionInfo = null; + if (indexPartitionInfos.size() == 1 && indexPartitionInfos.get(0).getMetadataPartitionPath().equals(MetadataPartitionType.FILES.getPartitionPath())) { + firstTimeInitializingMetadataTable = true; + fileIndexPartitionInfo = indexPartitionInfos.get(0); + } + // ensure the metadata partitions for the requested indexes are not already available (or inflight) + Set indexesInflightOrCompleted = getInflightAndCompletedMetadataPartitions(table.getMetaClient().getTableConfig()); + Set requestedPartitions = indexPartitionInfos.stream() + .map(HoodieIndexPartitionInfo::getMetadataPartitionPath).collect(Collectors.toSet()); + requestedPartitions.retainAll(indexesInflightOrCompleted); + if (!firstTimeInitializingMetadataTable && !requestedPartitions.isEmpty()) { + throw new HoodieIndexException(String.format("Following partitions already exist or inflight: %s", requestedPartitions)); + } + + // transition requested indexInstant to inflight + table.getActiveTimeline().transitionIndexRequestedToInflight(indexInstant, Option.empty()); + List finalIndexPartitionInfos = null; + if (!firstTimeInitializingMetadataTable) { + // start indexing for each partition + HoodieTableMetadataWriter metadataWriter = table.getMetadataWriter(instantTime) + .orElseThrow(() -> new HoodieIndexException(String.format("Could not get metadata writer to run index action for instant: %s", instantTime))); + // this will only build index upto base instant as generated by the plan, we will be doing catchup later + String indexUptoInstant = indexPartitionInfos.get(0).getIndexUptoInstant(); + LOG.info("Starting Index Building with base instant: " + indexUptoInstant); + metadataWriter.buildMetadataPartitions(context, indexPartitionInfos); + + // get remaining instants to catchup + List instantsToCatchup = getInstantsToCatchup(indexUptoInstant); + LOG.info("Total remaining instants to index: " + instantsToCatchup.size()); + + // reconcile with metadata table timeline + String metadataBasePath = getMetadataTableBasePath(table.getMetaClient().getBasePath()); + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataBasePath).build(); + Set metadataCompletedTimestamps = getCompletedArchivedAndActiveInstantsAfter(indexUptoInstant, metadataMetaClient).stream() + .map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); + + // index catchup for all remaining instants with a timeout + currentCaughtupInstant = indexUptoInstant; + catchupWithInflightWriters(metadataWriter, instantsToCatchup, metadataMetaClient, metadataCompletedTimestamps); + // save index commit metadata and update table config + finalIndexPartitionInfos = indexPartitionInfos.stream() + .map(info -> new HoodieIndexPartitionInfo( + info.getVersion(), + info.getMetadataPartitionPath(), + currentCaughtupInstant)) + .collect(Collectors.toList()); + } else { + String indexUptoInstant = fileIndexPartitionInfo.getIndexUptoInstant(); + // save index commit metadata and update table config + finalIndexPartitionInfos = Collections.singletonList(fileIndexPartitionInfo).stream() + .map(info -> new HoodieIndexPartitionInfo( + info.getVersion(), + info.getMetadataPartitionPath(), + indexUptoInstant)) + .collect(Collectors.toList()); + } + + HoodieIndexCommitMetadata indexCommitMetadata = HoodieIndexCommitMetadata.newBuilder() + 
.setVersion(LATEST_INDEX_COMMIT_METADATA_VERSION).setIndexPartitionInfos(finalIndexPartitionInfos).build(); + updateTableConfigAndTimeline(indexInstant, finalIndexPartitionInfos, indexCommitMetadata); + return Option.of(indexCommitMetadata); + } catch (IOException e) { + // abort gracefully + abort(indexInstant, indexPartitionInfos.stream().map(HoodieIndexPartitionInfo::getMetadataPartitionPath).collect(Collectors.toSet())); + throw new HoodieIndexException(String.format("Unable to index instant: %s", indexInstant)); + } + } + + private void abort(HoodieInstant indexInstant, Set requestedPartitions) { + Set inflightPartitions = getInflightMetadataPartitions(table.getMetaClient().getTableConfig()); + Set completedPartitions = table.getMetaClient().getTableConfig().getMetadataPartitions(); + // update table config + requestedPartitions.forEach(partition -> { + inflightPartitions.remove(partition); + completedPartitions.remove(partition); + }); + table.getMetaClient().getTableConfig().setValue(TABLE_METADATA_PARTITIONS_INFLIGHT.key(), String.join(",", inflightPartitions)); + table.getMetaClient().getTableConfig().setValue(TABLE_METADATA_PARTITIONS.key(), String.join(",", completedPartitions)); + HoodieTableConfig.update(table.getMetaClient().getFs(), new Path(table.getMetaClient().getMetaPath()), table.getMetaClient().getTableConfig().getProps()); + + // delete metadata partition + requestedPartitions.forEach(partition -> { + MetadataPartitionType partitionType = MetadataPartitionType.valueOf(partition.toUpperCase(Locale.ROOT)); + if (metadataPartitionExists(table.getMetaClient().getBasePath(), context, partitionType)) { + deleteMetadataPartition(table.getMetaClient().getBasePath(), context, partitionType); + } + }); + + // delete inflight instant + table.getMetaClient().reloadActiveTimeline().deleteInstantFileIfExists(HoodieTimeline.getIndexInflightInstant(indexInstant.getTimestamp())); + } + + private List getInstantsToCatchup(String indexUptoInstant) { + // since only write timeline was considered while scheduling index, which gives us the indexUpto instant + // here we consider other valid actions to pick catchupStart instant + Set validActions = CollectionUtils.createSet(CLEAN_ACTION, RESTORE_ACTION, ROLLBACK_ACTION); + Option catchupStartInstant = table.getMetaClient().reloadActiveTimeline() + .getTimelineOfActions(validActions) + .filterInflightsAndRequested() + .findInstantsBefore(indexUptoInstant) + .firstInstant(); + // get all instants since the plan completed (both from active timeline and archived timeline) + List instantsToIndex; + if (catchupStartInstant.isPresent()) { + instantsToIndex = getRemainingArchivedAndActiveInstantsSince(catchupStartInstant.get().getTimestamp(), table.getMetaClient()); + } else { + instantsToIndex = getRemainingArchivedAndActiveInstantsSince(indexUptoInstant, table.getMetaClient()); + } + return instantsToIndex; + } + + private HoodieInstant validateAndGetIndexInstant() { + // ensure lock provider configured + if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl() || StringUtils.isNullOrEmpty(config.getLockProviderClass())) { + throw new HoodieIndexException(String.format("Need to set %s as %s and configure lock provider class", + WRITE_CONCURRENCY_MODE.key(), OPTIMISTIC_CONCURRENCY_CONTROL.name())); + } + + return table.getActiveTimeline() + .filterPendingIndexTimeline() + .filter(instant -> instant.getTimestamp().equals(instantTime) && REQUESTED.equals(instant.getState())) + .lastInstant() + .orElseThrow(() -> new 
HoodieIndexException(String.format("No requested index instant found: %s", instantTime))); + } + + private void updateTableConfigAndTimeline(HoodieInstant indexInstant, + List finalIndexPartitionInfos, + HoodieIndexCommitMetadata indexCommitMetadata) throws IOException { + try { + // update the table config and timeline in a lock as there could be another indexer running + txnManager.beginTransaction(Option.of(indexInstant), Option.empty()); + updateMetadataPartitionsTableConfig(table.getMetaClient(), + finalIndexPartitionInfos.stream().map(HoodieIndexPartitionInfo::getMetadataPartitionPath).collect(Collectors.toSet())); + table.getActiveTimeline().saveAsComplete( + new HoodieInstant(true, INDEXING_ACTION, indexInstant.getTimestamp()), + TimelineMetadataUtils.serializeIndexCommitMetadata(indexCommitMetadata)); + } finally { + txnManager.endTransaction(Option.of(indexInstant)); + } + } + + private void catchupWithInflightWriters(HoodieTableMetadataWriter metadataWriter, List instantsToIndex, + HoodieTableMetaClient metadataMetaClient, Set metadataCompletedTimestamps) { + ExecutorService executorService = Executors.newFixedThreadPool(MAX_CONCURRENT_INDEXING); + Future indexingCatchupTaskFuture = executorService.submit( + new IndexingCatchupTask(metadataWriter, instantsToIndex, metadataCompletedTimestamps, table.getMetaClient(), metadataMetaClient)); + try { + LOG.info("Starting index catchup task"); + indexingCatchupTaskFuture.get(config.getIndexingCheckTimeoutSeconds(), TimeUnit.SECONDS); + } catch (Exception e) { + indexingCatchupTaskFuture.cancel(true); + throw new HoodieIndexException(String.format("Index catchup failed. Current indexed instant = %s. Aborting!", currentCaughtupInstant), e); + } finally { + executorService.shutdownNow(); + } + } + + private static List getRemainingArchivedAndActiveInstantsSince(String instant, HoodieTableMetaClient metaClient) { + List remainingInstantsToIndex = metaClient.getArchivedTimeline().getInstants() + .filter(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), GREATER_THAN_OR_EQUALS, instant)) + .filter(i -> !INDEXING_ACTION.equals(i.getAction())) + .collect(Collectors.toList()); + remainingInstantsToIndex.addAll(metaClient.getActiveTimeline().findInstantsAfter(instant).getInstants() + .filter(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), GREATER_THAN_OR_EQUALS, instant)) + .filter(i -> !INDEXING_ACTION.equals(i.getAction())) + .collect(Collectors.toList())); + return remainingInstantsToIndex; + } + + private static List getCompletedArchivedAndActiveInstantsAfter(String instant, HoodieTableMetaClient metaClient) { + List completedInstants = metaClient.getArchivedTimeline().filterCompletedInstants().findInstantsAfter(instant) + .getInstants().filter(i -> !INDEXING_ACTION.equals(i.getAction())).collect(Collectors.toList()); + completedInstants.addAll(metaClient.reloadActiveTimeline().filterCompletedInstants().findInstantsAfter(instant) + .getInstants().filter(i -> !INDEXING_ACTION.equals(i.getAction())).collect(Collectors.toList())); + return completedInstants; + } + + private void updateMetadataPartitionsTableConfig(HoodieTableMetaClient metaClient, Set metadataPartitions) { + // remove from inflight and update completed indexes + Set inflightPartitions = getInflightMetadataPartitions(metaClient.getTableConfig()); + Set completedPartitions = metaClient.getTableConfig().getMetadataPartitions(); + inflightPartitions.removeAll(metadataPartitions); + completedPartitions.addAll(metadataPartitions); + // update table config + 
metaClient.getTableConfig().setValue(TABLE_METADATA_PARTITIONS_INFLIGHT.key(), String.join(",", inflightPartitions)); + metaClient.getTableConfig().setValue(TABLE_METADATA_PARTITIONS.key(), String.join(",", completedPartitions)); + HoodieTableConfig.update(metaClient.getFs(), new Path(metaClient.getMetaPath()), metaClient.getTableConfig().getProps()); + } + + /** + * Indexing check runs for instants that completed after the base instant (in the index plan). + * It will check if these later instants have logged updates to metadata table or not. + * If not, then it will do the update. If a later instant is inflight, it will wait until it is completed or the task times out. + */ + class IndexingCatchupTask implements Runnable { + + private final HoodieTableMetadataWriter metadataWriter; + private final List instantsToIndex; + private final Set metadataCompletedInstants; + private final HoodieTableMetaClient metaClient; + private final HoodieTableMetaClient metadataMetaClient; + + IndexingCatchupTask(HoodieTableMetadataWriter metadataWriter, + List instantsToIndex, + Set metadataCompletedInstants, + HoodieTableMetaClient metaClient, + HoodieTableMetaClient metadataMetaClient) { + this.metadataWriter = metadataWriter; + this.instantsToIndex = instantsToIndex; + this.metadataCompletedInstants = metadataCompletedInstants; + this.metaClient = metaClient; + this.metadataMetaClient = metadataMetaClient; + } + + @Override + public void run() { + for (HoodieInstant instant : instantsToIndex) { + // metadata index already updated for this instant + if (!metadataCompletedInstants.isEmpty() && metadataCompletedInstants.contains(instant.getTimestamp())) { + currentCaughtupInstant = instant.getTimestamp(); + continue; + } + while (!instant.isCompleted()) { + try { + LOG.warn("instant not completed, reloading timeline " + instant); + // reload timeline and fetch instant details again wait until timeout + String instantTime = instant.getTimestamp(); + Option currentInstant = metaClient.reloadActiveTimeline() + .filterCompletedInstants().filter(i -> i.getTimestamp().equals(instantTime)).firstInstant(); + instant = currentInstant.orElse(instant); + // so that timeline is not reloaded very frequently + Thread.sleep(TIMELINE_RELOAD_INTERVAL_MILLIS); + } catch (InterruptedException e) { + throw new HoodieIndexException(String.format("Thread interrupted while running indexing check for instant: %s", instant), e); + } + } + // if instant completed, ensure that there was metadata commit, else update metadata for this completed instant + if (COMPLETED.equals(instant.getState())) { + String instantTime = instant.getTimestamp(); + Option metadataInstant = metadataMetaClient.reloadActiveTimeline() + .filterCompletedInstants().filter(i -> i.getTimestamp().equals(instantTime)).firstInstant(); + if (metadataInstant.isPresent()) { + currentCaughtupInstant = instantTime; + continue; + } + try { + // we need take a lock here as inflight writer could also try to update the timeline + txnManager.beginTransaction(Option.of(instant), Option.empty()); + LOG.info("Updating metadata table for instant: " + instant); + switch (instant.getAction()) { + // TODO: see if this can be moved to metadata writer itself + case HoodieTimeline.COMMIT_ACTION: + case HoodieTimeline.DELTA_COMMIT_ACTION: + case HoodieTimeline.REPLACE_COMMIT_ACTION: + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + table.getActiveTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class); + // do not trigger any table service as 
partition is not fully built out yet + metadataWriter.update(commitMetadata, instant.getTimestamp(), false); + break; + case CLEAN_ACTION: + HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(table.getMetaClient(), instant); + metadataWriter.update(cleanMetadata, instant.getTimestamp()); + break; + case RESTORE_ACTION: + HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeHoodieRestoreMetadata( + table.getActiveTimeline().getInstantDetails(instant).get()); + metadataWriter.update(restoreMetadata, instant.getTimestamp()); + break; + case ROLLBACK_ACTION: + HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata( + table.getActiveTimeline().getInstantDetails(instant).get()); + metadataWriter.update(rollbackMetadata, instant.getTimestamp()); + break; + default: + throw new IllegalStateException("Unexpected value: " + instant.getAction()); + } + } catch (IOException e) { + throw new HoodieIndexException(String.format("Could not update metadata partition for instant: %s", instant), e); + } finally { + txnManager.endTransaction(Option.of(instant)); + } + } + } + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/ScheduleIndexActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/ScheduleIndexActionExecutor.java new file mode 100644 index 0000000000000..d562dec671d14 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/ScheduleIndexActionExecutor.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
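As a side note on the catch-up flow above (catchupWithInflightWriters submitting an IndexingCatchupTask and aborting it on timeout), the timeout-and-cancel handling can be read as the plain java.util.concurrent pattern below. This is a minimal, framework-free sketch under stated assumptions: CatchupRunner and runWithTimeout are illustrative names, not Hudi APIs.

import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class CatchupRunner {
  // Runs a catch-up task and aborts it if it does not finish within the timeout,
  // mirroring the submit / get-with-timeout / cancel / shutdownNow sequence above.
  static void runWithTimeout(Runnable catchupTask, long timeoutSeconds) {
    ExecutorService pool = Executors.newSingleThreadExecutor();
    Future<?> future = pool.submit(catchupTask);
    try {
      future.get(timeoutSeconds, TimeUnit.SECONDS);
    } catch (InterruptedException | ExecutionException | TimeoutException e) {
      future.cancel(true); // interrupt the task so it stops polling
      Thread.currentThread().interrupt();
      throw new RuntimeException("Catch-up did not complete within " + timeoutSeconds + "s", e);
    } finally {
      pool.shutdownNow();
    }
  }
}
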
+ */ + +package org.apache.hudi.table.action.index; + +import org.apache.hudi.avro.model.HoodieIndexPartitionInfo; +import org.apache.hudi.avro.model.HoodieIndexPlan; +import org.apache.hudi.client.transaction.TransactionManager; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.EnumSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.model.WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_CONCURRENCY_MODE; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.deleteMetadataPartition; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightAndCompletedMetadataPartitions; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists; + +/** + * Schedules INDEX action. + *

+ * 1. Fetch last completed instant on data timeline. + * 2. Write the index plan to the .index.requested. + * 3. Initialize file groups for the enabled partition types within a transaction. + *
  • + */ +public class ScheduleIndexActionExecutor extends BaseActionExecutor> { + + private static final Logger LOG = LogManager.getLogger(ScheduleIndexActionExecutor.class); + private static final Integer INDEX_PLAN_VERSION_1 = 1; + private static final Integer LATEST_INDEX_PLAN_VERSION = INDEX_PLAN_VERSION_1; + + private final List partitionIndexTypes; + private final TransactionManager txnManager; + + public ScheduleIndexActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List partitionIndexTypes) { + super(context, config, table, instantTime); + this.partitionIndexTypes = partitionIndexTypes; + this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); + } + + @Override + public Option execute() { + validateBeforeScheduling(); + // make sure that it is idempotent, check with previously pending index operations. + Set indexesInflightOrCompleted = getInflightAndCompletedMetadataPartitions(table.getMetaClient().getTableConfig()); + Set requestedPartitions = partitionIndexTypes.stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet()); + requestedPartitions.removeAll(indexesInflightOrCompleted); + if (!requestedPartitions.isEmpty()) { + LOG.warn(String.format("Following partitions already exist or inflight: %s. Going to schedule indexing of only these partitions: %s", + indexesInflightOrCompleted, requestedPartitions)); + } else { + LOG.error("All requested index types are inflight or completed: " + partitionIndexTypes); + return Option.empty(); + } + List finalPartitionsToIndex = partitionIndexTypes.stream() + .filter(p -> requestedPartitions.contains(p.getPartitionPath())).collect(Collectors.toList()); + final HoodieInstant indexInstant = HoodieTimeline.getIndexRequestedInstant(instantTime); + try { + this.txnManager.beginTransaction(Option.of(indexInstant), Option.empty()); + // get last completed instant + Option indexUptoInstant = table.getActiveTimeline().getContiguousCompletedWriteTimeline().lastInstant(); + if (indexUptoInstant.isPresent()) { + // start initializing file groups + // in case FILES partition itself was not initialized before (i.e. metadata was never enabled), this will initialize synchronously + HoodieTableMetadataWriter metadataWriter = table.getMetadataWriter(instantTime) + .orElseThrow(() -> new HoodieIndexException(String.format("Could not get metadata writer to initialize filegroups for indexing for instant: %s", instantTime))); + if (!finalPartitionsToIndex.get(0).getPartitionPath().equals(MetadataPartitionType.FILES.getPartitionPath())) { + // initialize metadata partition only if not for FILES partition. 
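The scheduling path above first removes partitions that are already inflight or completed so that re-running the scheduler is a no-op. A minimal sketch of that idempotency check, with IndexScheduleCheck as a hypothetical stand-alone helper (not a Hudi class, and using plain strings instead of MetadataPartitionType), could look like:

import java.util.HashSet;
import java.util.Set;

class IndexScheduleCheck {
  // Returns the partitions that still need an index plan: everything requested
  // minus whatever is already inflight or completed.
  static Set<String> partitionsToSchedule(Set<String> requested, Set<String> inflightOrCompleted) {
    Set<String> remaining = new HashSet<>(requested);
    remaining.removeAll(inflightOrCompleted);
    return remaining; // empty => nothing to schedule, caller returns Option.empty()
  }
}
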
+ metadataWriter.initializeMetadataPartitions(table.getMetaClient(), finalPartitionsToIndex, indexUptoInstant.get().getTimestamp()); + } + + // for each partitionToIndex add that time to the plan + List indexPartitionInfos = finalPartitionsToIndex.stream() + .map(p -> new HoodieIndexPartitionInfo(LATEST_INDEX_PLAN_VERSION, p.getPartitionPath(), indexUptoInstant.get().getTimestamp())) + .collect(Collectors.toList()); + HoodieIndexPlan indexPlan = new HoodieIndexPlan(LATEST_INDEX_PLAN_VERSION, indexPartitionInfos); + // update data timeline with requested instant + table.getActiveTimeline().saveToPendingIndexAction(indexInstant, TimelineMetadataUtils.serializeIndexPlan(indexPlan)); + return Option.of(indexPlan); + } + } catch (IOException e) { + LOG.error("Could not initialize file groups", e); + // abort gracefully + abort(indexInstant); + throw new HoodieIOException(e.getMessage(), e); + } finally { + this.txnManager.endTransaction(Option.of(indexInstant)); + } + + return Option.empty(); + } + + private void validateBeforeScheduling() { + if (!EnumSet.allOf(MetadataPartitionType.class).containsAll(partitionIndexTypes)) { + throw new HoodieIndexException("Not all index types are valid: " + partitionIndexTypes); + } + // ensure lock provider configured + if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl() || StringUtils.isNullOrEmpty(config.getLockProviderClass())) { + throw new HoodieIndexException(String.format("Need to set %s as %s and configure lock provider class", + WRITE_CONCURRENCY_MODE.key(), OPTIMISTIC_CONCURRENCY_CONTROL.name())); + } + } + + private void abort(HoodieInstant indexInstant) { + // delete metadata partition + partitionIndexTypes.forEach(partitionType -> { + if (metadataPartitionExists(table.getMetaClient().getBasePath(), context, partitionType)) { + deleteMetadataPartition(table.getMetaClient().getBasePath(), context, partitionType); + } + }); + // delete requested instant + table.getMetaClient().reloadActiveTimeline().deleteInstantFileIfExists(indexInstant); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java index 0bbc0a05f9e59..62ecbe2a31f8f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java @@ -18,24 +18,31 @@ package org.apache.hudi.table.action.restore; +import org.apache.hudi.avro.model.HoodieInstantInfo; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.client.transaction.TransactionManager; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import 
org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieRestoreException; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.BaseActionExecutor; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -47,6 +54,7 @@ public abstract class BaseRestoreActionExecutor instantsToRollback = table.getActiveTimeline().getCommitsAndCompactionTimeline() - .getReverseOrderedInstants() - .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)) - .collect(Collectors.toList()); + Option restoreInstant = table.getRestoreTimeline() + .filterInflightsAndRequested() + .filter(instant -> instant.getTimestamp().equals(instantTime)) + .firstInstant(); + if (!restoreInstant.isPresent()) { + throw new HoodieRollbackException("No pending restore instants found to execute restore"); + } + try { + List instantsToRollback = getInstantsToRollback(restoreInstant.get()); + ValidationUtils.checkArgument(restoreInstant.get().getState().equals(HoodieInstant.State.REQUESTED) + || restoreInstant.get().getState().equals(HoodieInstant.State.INFLIGHT)); + Map> instantToMetadata = new HashMap<>(); + if (restoreInstant.get().isRequested()) { + table.getActiveTimeline().transitionRestoreRequestedToInflight(restoreInstant.get()); + } - Map> instantToMetadata = new HashMap<>(); - table.getActiveTimeline().createNewInstant(new HoodieInstant(true, HoodieTimeline.RESTORE_ACTION, instantTime)); - instantsToRollback.forEach(instant -> { - instantToMetadata.put(instant.getTimestamp(), Collections.singletonList(rollbackInstant(instant))); - LOG.info("Deleted instant " + instant); - }); + instantsToRollback.forEach(instant -> { + instantToMetadata.put(instant.getTimestamp(), Collections.singletonList(rollbackInstant(instant))); + LOG.info("Deleted instant " + instant); + }); - try { return finishRestore(instantToMetadata, instantsToRollback, restoreTimer.endTimer() ); } catch (IOException io) { - throw new HoodieRollbackException("unable to rollback instants " + instantsToRollback, io); + throw new HoodieRestoreException("unable to Restore instant " + restoreInstant.get(), io); + } + } + + private List getInstantsToRollback(HoodieInstant restoreInstant) throws IOException { + List instantsToRollback = new ArrayList<>(); + HoodieRestorePlan restorePlan = RestoreUtils.getRestorePlan(table.getMetaClient(), restoreInstant); + for (HoodieInstantInfo instantInfo : restorePlan.getInstantsToRollback()) { + // If restore crashed mid-way, there are chances that some commits are already rolled back, + // but some are not. so, we can ignore those commits which are fully rolledback in previous attempt if any. 
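The comment above describes how a restore that crashed mid-way skips instants that were already rolled back in the previous attempt. A simplified, Hudi-agnostic sketch of that filtering is shown here; RestoreResume and its string-based arguments are illustrative, while the real code works on HoodieInstant entries from the restore plan and the write timeline.

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

class RestoreResume {
  // Keeps only the planned instants that are still present on the write timeline;
  // instants already rolled back by an earlier, crashed restore attempt are skipped.
  static List<String> stillToRollback(List<String> plannedInstants, Set<String> instantsOnTimeline) {
    return plannedInstants.stream()
        .filter(instantsOnTimeline::contains)
        .collect(Collectors.toList());
  }
}
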
+ Option rollbackInstantOpt = table.getActiveTimeline().getWriteTimeline() + .filter(instant -> instant.getTimestamp().equals(instantInfo.getCommitTime()) && instant.getAction().equals(instantInfo.getAction())).firstInstant(); + if (rollbackInstantOpt.isPresent()) { + instantsToRollback.add(rollbackInstantOpt.get()); + } else { + LOG.warn("Ignoring already rolledback instant " + instantInfo.toString()); + } } + return instantsToRollback; } protected abstract HoodieRollbackMetadata rollbackInstant(HoodieInstant rollbackInstant); @@ -93,20 +126,37 @@ private HoodieRestoreMetadata finishRestore(Map instantsToRollback = table.getActiveTimeline().getRollbackTimeline() + .getReverseOrderedInstants() + .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)) + .collect(Collectors.toList()); + instantsToRollback.forEach(entry -> { + if (entry.isCompleted()) { + table.getActiveTimeline().deleteCompletedRollback(entry); + } + table.getActiveTimeline().deletePending(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.ROLLBACK_ACTION, entry.getTimestamp())); + table.getActiveTimeline().deletePending(new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.ROLLBACK_ACTION, entry.getTimestamp())); + }); LOG.info("Commits " + instantsRolledBack + " rollback is complete. Restored table to " + restoreInstantTime); + return restoreMetadata; + } - if (!table.getActiveTimeline().getCleanerTimeline().empty()) { - LOG.info("Cleaning up older restore meta files"); - // Cleanup of older cleaner meta files - // TODO - make the commit archival generic and archive rollback metadata - FSUtils.deleteOlderRollbackMetaFiles( - table.getMetaClient().getFs(), - table.getMetaClient().getMetaPath(), - table.getActiveTimeline().getRestoreTimeline().getInstants() - ); + /** + * Update metadata table if available. Any update to metadata table happens within data table lock. + * + * @param restoreMetadata instance of {@link HoodieRestoreMetadata} to be applied to metadata. + */ + private void writeToMetadata(HoodieRestoreMetadata restoreMetadata, HoodieInstant restoreInflightInstant) { + try { + this.txnManager.beginTransaction(Option.of(restoreInflightInstant), Option.empty()); + writeTableMetadata(restoreMetadata); + } finally { + this.txnManager.endTransaction(Option.of(restoreInflightInstant)); } - return restoreMetadata; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/CopyOnWriteRestoreActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/CopyOnWriteRestoreActionExecutor.java new file mode 100644 index 0000000000000..f6e104e3dcdc5 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/CopyOnWriteRestoreActionExecutor.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.restore; + +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieRollbackException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; + +public class CopyOnWriteRestoreActionExecutor + extends BaseRestoreActionExecutor { + public CopyOnWriteRestoreActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + String restoreInstantTime) { + super(context, config, table, instantTime, restoreInstantTime); + } + + @Override + protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) { + if (!instantToRollback.getAction().equals(HoodieTimeline.COMMIT_ACTION) + && !instantToRollback.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + throw new HoodieRollbackException("Unsupported action in rollback instant:" + instantToRollback); + } + table.getMetaClient().reloadActiveTimeline(); + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + table.scheduleRollback(context, newInstantTime, instantToRollback, false, false); + table.getMetaClient().reloadActiveTimeline(); + CopyOnWriteRollbackActionExecutor rollbackActionExecutor = new CopyOnWriteRollbackActionExecutor( + context, + config, + table, + newInstantTime, + instantToRollback, + true, + true, + false); + return rollbackActionExecutor.execute(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java new file mode 100644 index 0000000000000..01c3d44fabc93 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.action.restore; + +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor; + +public class MergeOnReadRestoreActionExecutor + extends BaseRestoreActionExecutor { + public MergeOnReadRestoreActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, String restoreInstantTime) { + super(context, config, table, instantTime, restoreInstantTime); + } + + @Override + protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) { + switch (instantToRollback.getAction()) { + case HoodieTimeline.COMMIT_ACTION: + case HoodieTimeline.DELTA_COMMIT_ACTION: + case HoodieTimeline.COMPACTION_ACTION: + case HoodieTimeline.REPLACE_COMMIT_ACTION: + // TODO : Get file status and create a rollback stat and file + // TODO : Delete the .aux files along with the instant file, okay for now since the archival process will + // delete these files when it does not see a corresponding instant file under .hoodie + break; + default: + throw new IllegalArgumentException("invalid action name " + instantToRollback.getAction()); + } + table.getMetaClient().reloadActiveTimeline(); + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + table.scheduleRollback(context, instantTime, instantToRollback, false, false); + table.getMetaClient().reloadActiveTimeline(); + MergeOnReadRollbackActionExecutor rollbackActionExecutor = new MergeOnReadRollbackActionExecutor( + context, + config, + table, + instantTime, + instantToRollback, + true, + true, + false); + + // TODO : Get file status and create a rollback stat and file + // TODO : Delete the .aux files along with the instant file, okay for now since the archival process will + // delete these files when it does not see a corresponding instant file under .hoodie + return rollbackActionExecutor.execute(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/RestoreUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/RestoreUtils.java new file mode 100644 index 0000000000000..24c57a0709b1a --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/RestoreUtils.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.restore; + +import org.apache.hudi.avro.model.HoodieRestorePlan; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; + +import java.io.IOException; + +public class RestoreUtils { + + /** + * Get Latest version of Restore plan corresponding to a restore instant. + * + * @param metaClient Hoodie Table Meta Client + * @param restoreInstant Instant referring to restore action + * @return Rollback plan corresponding to rollback instant + * @throws IOException + */ + public static HoodieRestorePlan getRestorePlan(HoodieTableMetaClient metaClient, HoodieInstant restoreInstant) + throws IOException { + final HoodieInstant requested = HoodieTimeline.getRollbackRequestedInstant(restoreInstant); + return TimelineMetadataUtils.deserializeAvroMetadata( + metaClient.getActiveTimeline().readRestoreInfoAsBytes(requested).get(), HoodieRestorePlan.class); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/AbstractMarkerBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/AbstractMarkerBasedRollbackStrategy.java deleted file mode 100644 index 40526b86f2cdb..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/AbstractMarkerBasedRollbackStrategy.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hadoop.fs.Path; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.Collections; -import java.util.Map; - -/** - * Performs rollback using marker files generated during the write.. 
- */ -public abstract class AbstractMarkerBasedRollbackStrategy implements BaseRollbackActionExecutor.RollbackStrategy { - - private static final Logger LOG = LogManager.getLogger(AbstractMarkerBasedRollbackStrategy.class); - - protected final HoodieTable table; - - protected final transient HoodieEngineContext context; - - protected final HoodieWriteConfig config; - - private final String basePath; - - private final String instantTime; - - public AbstractMarkerBasedRollbackStrategy(HoodieTable table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) { - this.table = table; - this.context = context; - this.basePath = table.getMetaClient().getBasePath(); - this.config = config; - this.instantTime = instantTime; - } - - protected HoodieRollbackStat undoMerge(String mergedBaseFilePath) throws IOException { - LOG.info("Rolling back by deleting the merged base file:" + mergedBaseFilePath); - return deleteBaseFile(mergedBaseFilePath); - } - - protected HoodieRollbackStat undoCreate(String createdBaseFilePath) throws IOException { - LOG.info("Rolling back by deleting the created base file:" + createdBaseFilePath); - return deleteBaseFile(createdBaseFilePath); - } - - private HoodieRollbackStat deleteBaseFile(String baseFilePath) throws IOException { - Path fullDeletePath = new Path(basePath, baseFilePath); - String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), fullDeletePath.getParent()); - boolean isDeleted = table.getMetaClient().getFs().delete(fullDeletePath); - return HoodieRollbackStat.newBuilder() - .withPartitionPath(partitionPath) - .withDeletedFileResult(baseFilePath, isDeleted) - .build(); - } - - protected HoodieRollbackStat undoAppend(String appendBaseFilePath, HoodieInstant instantToRollback) throws IOException, InterruptedException { - Path baseFilePathForAppend = new Path(basePath, appendBaseFilePath); - String fileId = FSUtils.getFileIdFromFilePath(baseFilePathForAppend); - String baseCommitTime = FSUtils.getCommitTime(baseFilePathForAppend.getName()); - String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), new Path(basePath, appendBaseFilePath).getParent()); - - HoodieLogFormat.Writer writer = null; - try { - Path partitionFullPath = FSUtils.getPartitionPath(basePath, partitionPath); - - if (!table.getMetaClient().getFs().exists(partitionFullPath)) { - return HoodieRollbackStat.newBuilder() - .withPartitionPath(partitionPath) - .build(); - } - writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(partitionFullPath) - .withFileId(fileId) - .overBaseCommit(baseCommitTime) - .withFs(table.getMetaClient().getFs()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - - // generate metadata - Map header = RollbackUtils.generateHeader(instantToRollback.getTimestamp(), instantTime); - // if update belongs to an existing log file - writer = writer.appendBlock(new HoodieCommandBlock(header)); - } finally { - try { - if (writer != null) { - writer.close(); - } - } catch (IOException io) { - throw new HoodieIOException("Error closing append of rollback block..", io); - } - } - - return HoodieRollbackStat.newBuilder() - .withPartitionPath(partitionPath) - // we don't use this field per se. Avoiding the extra file status call. 
- .withRollbackBlockAppendResults(Collections.emptyMap()) - .build(); - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseCopyOnWriteRollbackActionExecutor.java deleted file mode 100644 index c60cb7f5e7932..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseCopyOnWriteRollbackActionExecutor.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.HoodieTimer; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.util.ArrayList; -import java.util.List; - -public abstract class BaseCopyOnWriteRollbackActionExecutor extends BaseRollbackActionExecutor { - - private static final Logger LOG = LogManager.getLogger(BaseCopyOnWriteRollbackActionExecutor.class); - - public BaseCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public BaseCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected List executeRollback() { - HoodieTimer rollbackTimer = new HoodieTimer(); - rollbackTimer.startTimer(); - - List stats = new ArrayList<>(); - HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieInstant resolvedInstant = instantToRollback; - - if (instantToRollback.isCompleted()) { - LOG.info("Unpublishing instant " + instantToRollback); - resolvedInstant = activeTimeline.revertToInflight(instantToRollback); - // reload meta-client to reflect latest timeline status - table.getMetaClient().reloadActiveTimeline(); - } - - // For Requested State (like failure during index lookup), there is nothing to do 
rollback other than - // deleting the timeline file - if (!resolvedInstant.isRequested()) { - // delete all the data files for this commit - LOG.info("Clean out all base files generated for commit: " + resolvedInstant); - stats = getRollbackStrategy().execute(resolvedInstant); - } - - dropBootstrapIndexIfNeeded(instantToRollback); - - // Delete Inflight instant if enabled - deleteInflightAndRequestedInstant(deleteInstants, activeTimeline, resolvedInstant); - LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer()); - return stats; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseMergeOnReadRollbackActionExecutor.java deleted file mode 100644 index 0e747a58dc673..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseMergeOnReadRollbackActionExecutor.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.HoodieTimer; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -public abstract class BaseMergeOnReadRollbackActionExecutor extends BaseRollbackActionExecutor { - - private static final Logger LOG = LogManager.getLogger(BaseMergeOnReadRollbackActionExecutor.class); - - public BaseMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public BaseMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected List executeRollback() throws IOException { - HoodieTimer rollbackTimer = new HoodieTimer(); - rollbackTimer.startTimer(); - - LOG.info("Rolling back instant " + instantToRollback); - - HoodieInstant resolvedInstant = instantToRollback; - // Atomically un-publish all non-inflight commits - if (instantToRollback.isCompleted()) { - LOG.info("Un-publishing instant " + instantToRollback + ", deleteInstants=" + deleteInstants); - resolvedInstant = table.getActiveTimeline().revertToInflight(instantToRollback); - // reload meta-client to reflect latest timeline status - table.getMetaClient().reloadActiveTimeline(); - } - - List allRollbackStats = new ArrayList<>(); - - // At the moment, MOR table type does not support bulk nested rollbacks. Nested rollbacks is an experimental - // feature that is expensive. To perform nested rollbacks, initiate multiple requests of client.rollback - // (commitToRollback). - // NOTE {@link HoodieCompactionConfig#withCompactionLazyBlockReadEnabled} needs to be set to TRUE. This is - // required to avoid OOM when merging multiple LogBlocks performed during nested rollbacks. 
- - // For Requested State (like failure during index lookup), there is nothing to do rollback other than - // deleting the timeline file - if (!resolvedInstant.isRequested()) { - LOG.info("Unpublished " + resolvedInstant); - allRollbackStats = getRollbackStrategy().execute(resolvedInstant); - } - - dropBootstrapIndexIfNeeded(resolvedInstant); - - // Delete Inflight instants if enabled - deleteInflightAndRequestedInstant(deleteInstants, table.getActiveTimeline(), resolvedInstant); - LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer()); - return allRollbackStats; - } -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java index 40206ca45e1c3..4add51886fe3a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java @@ -19,15 +19,18 @@ package org.apache.hudi.table.action.rollback; import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.client.heartbeat.HoodieHeartbeatClient; +import org.apache.hudi.client.transaction.TransactionManager; import org.apache.hudi.common.HoodieRollbackStat; import org.apache.hudi.common.bootstrap.index.BootstrapIndex; -import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.ClusteringUtils; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; @@ -35,14 +38,13 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; import org.apache.hudi.table.action.BaseActionExecutor; +import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; -import java.io.Serializable; import java.util.Collections; import java.util.List; import java.util.Objects; @@ -52,24 +54,22 @@ public abstract class BaseRollbackActionExecutor execute(HoodieInstant instantToRollback); - } - protected final HoodieInstant instantToRollback; protected final boolean deleteInstants; protected final boolean skipTimelinePublish; - protected final boolean useMarkerBasedStrategy; + private final TransactionManager txnManager; + private final boolean skipLocking; + + protected HoodieInstant resolvedInstant; public BaseRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - HoodieInstant instantToRollback, - boolean deleteInstants) { - this(context, config, table, instantTime, instantToRollback, deleteInstants, - false, config.shouldRollbackUsingMarkers()); + HoodieWriteConfig config, + HoodieTable table, 
+ String instantTime, + HoodieInstant instantToRollback, + boolean deleteInstants, + boolean skipLocking) { + this(context, config, table, instantTime, instantToRollback, deleteInstants, false, skipLocking); } public BaseRollbackActionExecutor(HoodieEngineContext context, @@ -79,43 +79,65 @@ public BaseRollbackActionExecutor(HoodieEngineContext context, HoodieInstant instantToRollback, boolean deleteInstants, boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { + boolean skipLocking) { super(context, config, table, instantTime); this.instantToRollback = instantToRollback; + this.resolvedInstant = instantToRollback; this.deleteInstants = deleteInstants; this.skipTimelinePublish = skipTimelinePublish; - this.useMarkerBasedStrategy = useMarkerBasedStrategy; - if (useMarkerBasedStrategy) { - ValidationUtils.checkArgument(!instantToRollback.isCompleted(), - "Cannot use marker based rollback strategy on completed instant:" + instantToRollback); - } + this.skipLocking = skipLocking; + this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); } - protected abstract RollbackStrategy getRollbackStrategy(); - - protected abstract List executeRollback() throws IOException; + /** + * Execute actual rollback and fetch list of RollbackStats. + * @param hoodieRollbackPlan instance of {@link HoodieRollbackPlan} that needs to be executed. + * @return a list of {@link HoodieRollbackStat}s. + * @throws IOException + */ + protected abstract List executeRollback(HoodieRollbackPlan hoodieRollbackPlan) throws IOException; - protected abstract List executeRollbackUsingFileListing(HoodieInstant instantToRollback); + private HoodieRollbackMetadata runRollback(HoodieTable table, HoodieInstant rollbackInstant, HoodieRollbackPlan rollbackPlan) { + ValidationUtils.checkArgument(rollbackInstant.getState().equals(HoodieInstant.State.REQUESTED) + || rollbackInstant.getState().equals(HoodieInstant.State.INFLIGHT)); + final HoodieInstant inflightInstant = rollbackInstant.isRequested() + ? table.getActiveTimeline().transitionRollbackRequestedToInflight(rollbackInstant) + : rollbackInstant; - @Override - public HoodieRollbackMetadata execute() { HoodieTimer rollbackTimer = new HoodieTimer().startTimer(); - List stats = doRollbackAndGetStats(); + List stats = doRollbackAndGetStats(rollbackPlan); HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.convertRollbackMetadata( instantTime, Option.of(rollbackTimer.endTimer()), Collections.singletonList(instantToRollback), stats); - if (!skipTimelinePublish) { - finishRollback(rollbackMetadata); - } + finishRollback(inflightInstant, rollbackMetadata); - // Finally, remove the marker files post rollback. - new MarkerFiles(table, instantToRollback.getTimestamp()).quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + // Finally, remove the markers post rollback. 
+ WriteMarkersFactory.get(config.getMarkersType(), table, instantToRollback.getTimestamp()) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); return rollbackMetadata; } + @Override + public HoodieRollbackMetadata execute() { + table.getMetaClient().reloadActiveTimeline(); + Option rollbackInstant = table.getRollbackTimeline() + .filterInflightsAndRequested() + .filter(instant -> instant.getTimestamp().equals(instantTime)) + .firstInstant(); + if (!rollbackInstant.isPresent()) { + throw new HoodieRollbackException("No pending rollback instants found to execute rollback"); + } + try { + HoodieRollbackPlan rollbackPlan = RollbackUtils.getRollbackPlan(table.getMetaClient(), rollbackInstant.get()); + return runRollback(table, rollbackInstant.get(), rollbackPlan); + } catch (IOException e) { + throw new HoodieIOException("Failed to fetch rollback plan for commit " + instantTime, e); + } + } + private void validateSavepointRollbacks() { // Check if any of the commits is a savepoint - do not allow rollback on those commits List savepoints = table.getCompletedSavepointTimeline().getInstants() @@ -130,23 +152,42 @@ private void validateSavepointRollbacks() { } private void validateRollbackCommitSequence() { - final String instantTimeToRollback = instantToRollback.getTimestamp(); - HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); - HoodieTimeline inflightAndRequestedCommitTimeline = table.getPendingCommitTimeline(); - // Make sure only the last n commits are being rolled back - // If there is a commit in-between or after that is not rolled back, then abort - if ((instantTimeToRollback != null) && !commitTimeline.empty() - && !commitTimeline.findInstantsAfter(instantTimeToRollback, Integer.MAX_VALUE).empty()) { - throw new HoodieRollbackException( - "Found commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); - } + // Continue to provide the same behavior if policy is EAGER (similar to pendingRollback logic). This is required + // since with LAZY rollback we support parallel writing which can allow a new inflight while rollback is ongoing + // Remove this once we support LAZY rollback of failed writes by default as parallel writing becomes the default + // writer mode. 
+ if (config.getFailedWritesCleanPolicy().isEager()) { + final String instantTimeToRollback = instantToRollback.getTimestamp(); + HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); + HoodieTimeline inflightAndRequestedCommitTimeline = table.getPendingCommitTimeline(); + // Make sure only the last n commits are being rolled back + // If there is a commit in-between or after that is not rolled back, then abort + if ((instantTimeToRollback != null) && !commitTimeline.empty() + && !commitTimeline.findInstantsAfter(instantTimeToRollback, Integer.MAX_VALUE).empty()) { + // check if remnants are from a previous LAZY rollback config, if yes, let out of order rollback continue + try { + if (!HoodieHeartbeatClient.heartbeatExists(table.getMetaClient().getFs(), + config.getBasePath(), instantTimeToRollback)) { + throw new HoodieRollbackException( + "Found commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); + } + } catch (IOException io) { + throw new HoodieRollbackException("Unable to rollback commits ", io); + } + } - List inflights = inflightAndRequestedCommitTimeline.getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - if ((instantTimeToRollback != null) && !inflights.isEmpty() - && (inflights.indexOf(instantTimeToRollback) != inflights.size() - 1)) { - throw new HoodieRollbackException( - "Found in-flight commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); + List inflights = inflightAndRequestedCommitTimeline.getInstants().filter(instant -> { + if (!instant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + return true; + } + return !ClusteringUtils.isPendingClusteringInstant(table.getMetaClient(), instant); + }).map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + if ((instantTimeToRollback != null) && !inflights.isEmpty() + && (inflights.indexOf(instantTimeToRollback) != inflights.size() - 1)) { + throw new HoodieRollbackException( + "Found in-flight commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); + } } } @@ -157,17 +198,20 @@ private void rollBackIndex() { LOG.info("Index rolled back for commits " + instantToRollback); } - public List doRollbackAndGetStats() { + public List doRollbackAndGetStats(HoodieRollbackPlan hoodieRollbackPlan) { final String instantTimeToRollback = instantToRollback.getTimestamp(); final boolean isPendingCompaction = Objects.equals(HoodieTimeline.COMPACTION_ACTION, instantToRollback.getAction()) && !instantToRollback.isCompleted(); + + final boolean isPendingClustering = Objects.equals(HoodieTimeline.REPLACE_COMMIT_ACTION, instantToRollback.getAction()) + && !instantToRollback.isCompleted() && ClusteringUtils.getClusteringPlan(table.getMetaClient(), instantToRollback).isPresent(); validateSavepointRollbacks(); - if (!isPendingCompaction) { + if (!isPendingCompaction && !isPendingClustering) { validateRollbackCommitSequence(); } try { - List stats = executeRollback(); + List stats = executeRollback(hoodieRollbackPlan); LOG.info("Rolled back inflight instant " + instantTimeToRollback); if (!isPendingCompaction) { rollBackIndex(); @@ -178,22 +222,45 @@ public List doRollbackAndGetStats() { } } - protected void finishRollback(HoodieRollbackMetadata rollbackMetadata) throws HoodieIOException { + /** + * Execute rollback and fetch rollback stats. + * @param instantToRollback instant to be rolled back. 
+ * @param rollbackPlan instance of {@link HoodieRollbackPlan} for which rollback needs to be executed. + * @return list of {@link HoodieRollbackStat}s. + */ + protected List executeRollback(HoodieInstant instantToRollback, HoodieRollbackPlan rollbackPlan) { + return new BaseRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackPlan.getRollbackRequests()); + } + + protected void finishRollback(HoodieInstant inflightInstant, HoodieRollbackMetadata rollbackMetadata) throws HoodieIOException { + boolean enableLocking = (!skipLocking && !skipTimelinePublish); try { - table.getActiveTimeline().createNewInstant( - new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.ROLLBACK_ACTION, instantTime)); - table.getActiveTimeline().saveAsComplete( - new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, instantTime), - TimelineMetadataUtils.serializeRollbackMetadata(rollbackMetadata)); - LOG.info("Rollback of Commits " + rollbackMetadata.getCommitsRollback() + " is complete"); - if (!table.getActiveTimeline().getCleanerTimeline().empty()) { - LOG.info("Cleaning up older rollback meta files"); - FSUtils.deleteOlderRollbackMetaFiles(table.getMetaClient().getFs(), - table.getMetaClient().getMetaPath(), - table.getActiveTimeline().getRollbackTimeline().getInstants()); + if (enableLocking) { + this.txnManager.beginTransaction(Option.of(inflightInstant), Option.empty()); + } + + // If publish the rollback to the timeline, we first write the rollback metadata + // to metadata table + if (!skipTimelinePublish) { + writeTableMetadata(rollbackMetadata); + } + + // Then we delete the inflight instant in the data table timeline if enabled + deleteInflightAndRequestedInstant(deleteInstants, table.getActiveTimeline(), resolvedInstant); + + // If publish the rollback to the timeline, we finally transition the inflight rollback + // to complete in the data table timeline + if (!skipTimelinePublish) { + table.getActiveTimeline().transitionRollbackInflightToComplete(inflightInstant, + TimelineMetadataUtils.serializeRollbackMetadata(rollbackMetadata)); + LOG.info("Rollback of Commits " + rollbackMetadata.getCommitsRollback() + " is complete"); } } catch (IOException e) { throw new HoodieIOException("Error executing rollback at instant " + instantTime, e); + } finally { + if (enableLocking) { + this.txnManager.endTransaction(Option.of(inflightInstant)); + } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackHelper.java new file mode 100644 index 0000000000000..16cdd77b06357 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackHelper.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
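finishRollback above writes to the metadata table and completes the instant strictly inside a beginTransaction/endTransaction pair. The same guard-and-always-release shape is sketched below, with java.util.concurrent's ReentrantLock standing in for Hudi's TransactionManager and lock provider; GuardedPublish is an illustrative name, not part of the codebase.

import java.util.concurrent.locks.ReentrantLock;

class GuardedPublish {
  private final ReentrantLock lock = new ReentrantLock(); // stand-in for a table-level lock provider

  // Publish metadata and complete the instant only while holding the lock,
  // and always release it, mirroring the try/finally around begin/endTransaction above.
  void publish(Runnable writeMetadata, Runnable completeInstant) {
    lock.lock();
    try {
      writeMetadata.run();    // e.g. update the metadata table first
      completeInstant.run();  // then transition the instant to COMPLETED on the timeline
    } finally {
      lock.unlock();
    }
  }
}
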
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieCommandBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieRollbackException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Contains common methods to be used across engines for rollback operation. + */ +public class BaseRollbackHelper implements Serializable { + + private static final Logger LOG = LogManager.getLogger(BaseRollbackHelper.class); + protected static final String EMPTY_STRING = ""; + + protected final HoodieTableMetaClient metaClient; + protected final HoodieWriteConfig config; + + public BaseRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { + this.metaClient = metaClient; + this.config = config; + } + + /** + * Performs all rollback actions that we have collected in parallel. + */ + public List performRollback(HoodieEngineContext context, HoodieInstant instantToRollback, + List rollbackRequests) { + int parallelism = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1); + context.setJobStatus(this.getClass().getSimpleName(), "Perform rollback actions: " + config.getTableName()); + // If not for conversion to HoodieRollbackInternalRequests, code fails. Using avro model (HoodieRollbackRequest) within spark.parallelize + // is failing with com.esotericsoftware.kryo.KryoException + // stack trace: https://gist.github.com/nsivabalan/b6359e7d5038484f8043506c8bc9e1c8 + // related stack overflow post: https://issues.apache.org/jira/browse/SPARK-3601. Avro deserializes list as GenericData.Array. + List serializableRequests = rollbackRequests.stream().map(SerializableHoodieRollbackRequest::new).collect(Collectors.toList()); + return context.reduceByKey(maybeDeleteAndCollectStats(context, instantToRollback, serializableRequests, true, parallelism), + RollbackUtils::mergeRollbackStat, parallelism); + } + + /** + * Collect all file info that needs to be rollbacked. 
+ */ + public List collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback, + List rollbackRequests) { + int parallelism = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1); + context.setJobStatus(this.getClass().getSimpleName(), "Collect rollback stats for upgrade/downgrade: " + config.getTableName()); + // If not for conversion to HoodieRollbackInternalRequests, code fails. Using avro model (HoodieRollbackRequest) within spark.parallelize + // is failing with com.esotericsoftware.kryo.KryoException + // stack trace: https://gist.github.com/nsivabalan/b6359e7d5038484f8043506c8bc9e1c8 + // related stack overflow post: https://issues.apache.org/jira/browse/SPARK-3601. Avro deserializes list as GenericData.Array. + List serializableRequests = rollbackRequests.stream().map(SerializableHoodieRollbackRequest::new).collect(Collectors.toList()); + return context.reduceByKey(maybeDeleteAndCollectStats(context, instantToRollback, serializableRequests, false, parallelism), + RollbackUtils::mergeRollbackStat, parallelism); + } + + /** + * May be delete interested files and collect stats or collect stats only. + * + * @param context instance of {@link HoodieEngineContext} to use. + * @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested. + * @param rollbackRequests List of {@link ListingBasedRollbackRequest} to be operated on. + * @param doDelete {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes. + * @return stats collected with or w/o actual deletions. + */ + List> maybeDeleteAndCollectStats(HoodieEngineContext context, + HoodieInstant instantToRollback, + List rollbackRequests, + boolean doDelete, int numPartitions) { + return context.flatMap(rollbackRequests, (SerializableFunction>>) rollbackRequest -> { + List filesToBeDeleted = rollbackRequest.getFilesToBeDeleted(); + if (!filesToBeDeleted.isEmpty()) { + List rollbackStats = deleteFiles(metaClient, filesToBeDeleted, doDelete); + List> partitionToRollbackStats = new ArrayList<>(); + rollbackStats.forEach(entry -> partitionToRollbackStats.add(Pair.of(entry.getPartitionPath(), entry))); + return partitionToRollbackStats.stream(); + } else if (!rollbackRequest.getLogBlocksToBeDeleted().isEmpty()) { + HoodieLogFormat.Writer writer = null; + final Path filePath; + try { + String fileId = rollbackRequest.getFileId(); + String latestBaseInstant = rollbackRequest.getLatestBaseInstant(); + + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath())) + .withFileId(fileId) + .overBaseCommit(latestBaseInstant) + .withFs(metaClient.getFs()) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + + // generate metadata + if (doDelete) { + Map header = generateHeader(instantToRollback.getTimestamp()); + // if update belongs to an existing log file + // use the log file path from AppendResult in case the file handle may roll over + filePath = writer.appendBlock(new HoodieCommandBlock(header)).logFile().getPath(); + } else { + filePath = writer.getLogFile().getPath(); + } + } catch (IOException | InterruptedException io) { + throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io); + } finally { + try { + if (writer != null) { + writer.close(); + } + } catch (IOException io) { + throw new HoodieIOException("Error appending rollback block", io); + } + } + + // 
This step is intentionally done after writer is closed. Guarantees that + // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in + // cloud-storage : HUDI-168 + Map filesToNumBlocksRollback = Collections.singletonMap( + metaClient.getFs().getFileStatus(Objects.requireNonNull(filePath)), + 1L + ); + + return Collections.singletonList( + Pair.of(rollbackRequest.getPartitionPath(), + HoodieRollbackStat.newBuilder() + .withPartitionPath(rollbackRequest.getPartitionPath()) + .withRollbackBlockAppendResults(filesToNumBlocksRollback) + .build())) + .stream(); + } else { + return Collections.singletonList( + Pair.of(rollbackRequest.getPartitionPath(), + HoodieRollbackStat.newBuilder() + .withPartitionPath(rollbackRequest.getPartitionPath()) + .build())) + .stream(); + } + }, numPartitions); + } + + /** + * Common method used for cleaning out files during rollback. + */ + protected List deleteFiles(HoodieTableMetaClient metaClient, List filesToBeDeleted, boolean doDelete) throws IOException { + return filesToBeDeleted.stream().map(fileToDelete -> { + String basePath = metaClient.getBasePath(); + try { + Path fullDeletePath = new Path(fileToDelete); + String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), fullDeletePath.getParent()); + boolean isDeleted = true; + if (doDelete) { + try { + isDeleted = metaClient.getFs().delete(fullDeletePath); + } catch (FileNotFoundException e) { + // if first rollback attempt failed and retried again, chances that some files are already deleted. + isDeleted = true; + } + } + return HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath) + .withDeletedFileResult(fullDeletePath.toString(), isDeleted) + .build(); + } catch (IOException e) { + LOG.error("Fetching file status for "); + throw new HoodieIOException("Fetching file status for " + fileToDelete + " failed ", e); + } + }).collect(Collectors.toList()); + } + + protected Map generateHeader(String commit) { + // generate metadata + Map header = new HashMap<>(3); + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); + header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit); + header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, + String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); + return header; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackPlanActionExecutor.java new file mode 100644 index 0000000000000..63b9e8a414b55 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackPlanActionExecutor.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
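The FileNotFoundException handling inside deleteFiles above is what keeps a re-attempted rollback idempotent: files removed by a previous, partially completed attempt are counted as successfully deleted. A minimal stand-alone sketch of that behaviour (helper class name and the non-recursive delete flag are illustrative):

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class IdempotentDeleteSketch {
  // Returns true if the file is gone after the call, whether this attempt deleted it
  // or an earlier, partially completed rollback already did.
  static boolean deleteIfPresent(FileSystem fs, Path path) throws IOException {
    try {
      return fs.delete(path, false); // non-recursive delete of a single base/log file
    } catch (FileNotFoundException e) {
      return true;
    }
  }
}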
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +/** + * Base rollback plan action executor to assist in scheduling rollback requests. This phase serialized {@link HoodieRollbackPlan} + * to rollback.requested instant. + */ +public class BaseRollbackPlanActionExecutor extends BaseActionExecutor> { + + private static final Logger LOG = LogManager.getLogger(BaseRollbackPlanActionExecutor.class); + + protected final HoodieInstant instantToRollback; + private final boolean skipTimelinePublish; + private final boolean shouldRollbackUsingMarkers; + + public static final Integer ROLLBACK_PLAN_VERSION_1 = 1; + public static final Integer LATEST_ROLLBACK_PLAN_VERSION = ROLLBACK_PLAN_VERSION_1; + + public BaseRollbackPlanActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + HoodieInstant instantToRollback, + boolean skipTimelinePublish, + boolean shouldRollbackUsingMarkers) { + super(context, config, table, instantTime); + this.instantToRollback = instantToRollback; + this.skipTimelinePublish = skipTimelinePublish; + this.shouldRollbackUsingMarkers = shouldRollbackUsingMarkers && !instantToRollback.isCompleted(); + } + + /** + * Interface for RollbackStrategy. There are two types supported, listing based and marker based. + */ + interface RollbackStrategy extends Serializable { + + /** + * Fetch list of {@link HoodieRollbackRequest}s to be added to rollback plan. + * @param instantToRollback instant to be rolled back. + * @return list of {@link HoodieRollbackRequest}s to be added to rollback plan + */ + List getRollbackRequests(HoodieInstant instantToRollback); + } + + /** + * Fetch the Rollback strategy used. + * + * @return + */ + private BaseRollbackPlanActionExecutor.RollbackStrategy getRollbackStrategy() { + if (shouldRollbackUsingMarkers) { + return new MarkerBasedRollbackStrategy(table, context, config, instantTime); + } else { + return new ListingBasedRollbackStrategy(table, context, config, instantTime); + } + } + + /** + * Creates a Rollback plan if there are files to be rolledback and stores them in instant file. + * Rollback Plan contains absolute file paths. 
+ * + * @param startRollbackTime Rollback Instant Time + * @return Rollback Plan if generated + */ + protected Option requestRollback(String startRollbackTime) { + final HoodieInstant rollbackInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime); + try { + List rollbackRequests = new ArrayList<>(); + if (!instantToRollback.isRequested()) { + rollbackRequests.addAll(getRollbackStrategy().getRollbackRequests(instantToRollback)); + } + HoodieRollbackPlan rollbackPlan = new HoodieRollbackPlan(new HoodieInstantInfo(instantToRollback.getTimestamp(), + instantToRollback.getAction()), rollbackRequests, LATEST_ROLLBACK_PLAN_VERSION); + if (!skipTimelinePublish) { + if (table.getRollbackTimeline().filterInflightsAndRequested().containsInstant(rollbackInstant.getTimestamp())) { + LOG.warn("Request Rollback found with instant time " + rollbackInstant + ", hence skipping scheduling rollback"); + } else { + table.getActiveTimeline().saveToRollbackRequested(rollbackInstant, TimelineMetadataUtils.serializeRollbackPlan(rollbackPlan)); + table.getMetaClient().reloadActiveTimeline(); + LOG.info("Requesting Rollback with instant time " + rollbackInstant); + } + } + return Option.of(rollbackPlan); + } catch (IOException e) { + LOG.error("Got exception when saving rollback requested file", e); + throw new HoodieIOException(e.getMessage(), e); + } + } + + @Override + public Option execute() { + // Plan a new rollback action + return requestRollback(instantTime); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java new file mode 100644 index 0000000000000..e766dbdc81c09 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/CopyOnWriteRollbackActionExecutor.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.List; + +public class CopyOnWriteRollbackActionExecutor extends BaseRollbackActionExecutor { + + private static final Logger LOG = LogManager.getLogger(CopyOnWriteRollbackActionExecutor.class); + + public CopyOnWriteRollbackActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + HoodieInstant commitInstant, + boolean deleteInstants, + boolean skipLocking) { + super(context, config, table, instantTime, commitInstant, deleteInstants, skipLocking); + } + + public CopyOnWriteRollbackActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + HoodieInstant commitInstant, + boolean deleteInstants, + boolean skipTimelinePublish, + boolean skipLocking) { + super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, skipLocking); + } + + @Override + protected List executeRollback(HoodieRollbackPlan hoodieRollbackPlan) { + HoodieTimer rollbackTimer = new HoodieTimer(); + rollbackTimer.startTimer(); + + List stats = new ArrayList<>(); + HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); + + if (instantToRollback.isCompleted()) { + LOG.info("Unpublishing instant " + instantToRollback); + resolvedInstant = activeTimeline.revertToInflight(instantToRollback); + // reload meta-client to reflect latest timeline status + table.getMetaClient().reloadActiveTimeline(); + } + + // For Requested State (like failure during index lookup), there is nothing to do rollback other than + // deleting the timeline file + if (!resolvedInstant.isRequested()) { + // delete all the data files for this commit + LOG.info("Clean out all base files generated for commit: " + resolvedInstant); + stats = executeRollback(resolvedInstant, hoodieRollbackPlan); + } + + dropBootstrapIndexIfNeeded(instantToRollback); + + LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer()); + return stats; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java index fc369a46711cf..7411231bb7d79 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java @@ -18,12 +18,15 @@ package org.apache.hudi.table.action.rollback; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.util.Option; +import java.io.Serializable; + /** * Request for performing one rollback action. 
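Taken together, rollback now runs in two phases: a plan executor serializes a HoodieRollbackPlan into a rollback.requested instant, and a table-type specific rollback action executor later resolves and executes it. A rough usage sketch, assuming an engine context, write config and COW HoodieTable handle already exist; type parameters are elided to match the presentation above, and the flag values and return-type comment are illustrative assumptions:

import org.apache.hudi.avro.model.HoodieRollbackPlan;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor;
import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor;

class TwoPhaseRollbackSketch {
  static void rollback(HoodieEngineContext context, HoodieWriteConfig writeConfig,
                       HoodieTable table, HoodieInstant instantToRollback) {
    String rollbackInstantTime = HoodieActiveTimeline.createNewInstantTime();

    // Phase 1: schedule - serializes a HoodieRollbackPlan to a rollback.requested instant.
    Option<HoodieRollbackPlan> plan = new BaseRollbackPlanActionExecutor(
        context, writeConfig, table, rollbackInstantTime, instantToRollback,
        false /* skipTimelinePublish */, writeConfig.shouldRollbackUsingMarkers()).execute();

    // Phase 2: execute - the table-type specific executor reads the plan back and performs the
    // deletes / log-block appends; execute() is assumed here to return the completed rollback metadata.
    if (plan.isPresent()) {
      new CopyOnWriteRollbackActionExecutor(
          context, writeConfig, table, rollbackInstantTime, instantToRollback,
          true /* deleteInstants */, false /* skipLocking */).execute();
    }
  }
}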
*/ -public class ListingBasedRollbackRequest { +public class ListingBasedRollbackRequest implements Serializable { /** * Rollback commands, that trigger a specific handling for rollback. @@ -49,32 +52,42 @@ public enum Type { */ private final Option latestBaseInstant; + /** + * TODO + */ + private final Option writeStat; + private final Type type; + public ListingBasedRollbackRequest(String partitionPath, Type type) { + this(partitionPath, Option.empty(), Option.empty(), Option.empty(), type); + } + public ListingBasedRollbackRequest(String partitionPath, Option fileId, Option latestBaseInstant, + Option writeStat, Type type) { this.partitionPath = partitionPath; this.fileId = fileId; this.latestBaseInstant = latestBaseInstant; + this.writeStat = writeStat; this.type = type; } public static ListingBasedRollbackRequest createRollbackRequestWithDeleteDataFilesOnlyAction(String partitionPath) { - return new ListingBasedRollbackRequest(partitionPath, Option.empty(), Option.empty(), - Type.DELETE_DATA_FILES_ONLY); + return new ListingBasedRollbackRequest(partitionPath, Type.DELETE_DATA_FILES_ONLY); } public static ListingBasedRollbackRequest createRollbackRequestWithDeleteDataAndLogFilesAction(String partitionPath) { - return new ListingBasedRollbackRequest(partitionPath, Option.empty(), Option.empty(), - Type.DELETE_DATA_AND_LOG_FILES); + return new ListingBasedRollbackRequest(partitionPath, Type.DELETE_DATA_AND_LOG_FILES); } - public static ListingBasedRollbackRequest createRollbackRequestWithAppendRollbackBlockAction(String partitionPath, String fileId, - String baseInstant) { - return new ListingBasedRollbackRequest(partitionPath, Option.of(fileId), Option.of(baseInstant), - Type.APPEND_ROLLBACK_BLOCK); + public static ListingBasedRollbackRequest createRollbackRequestWithAppendRollbackBlockAction(String partitionPath, + String fileId, + String baseInstant, + HoodieWriteStat writeStat) { + return new ListingBasedRollbackRequest(partitionPath, Option.of(fileId), Option.of(baseInstant), Option.of(writeStat), Type.APPEND_ROLLBACK_BLOCK); } public String getPartitionPath() { @@ -89,6 +102,10 @@ public Option getLatestBaseInstant() { return latestBaseInstant; } + public Option getWriteStat() { + return writeStat; + } + public Type getType() { return type; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java new file mode 100644 index 0000000000000..10b02d7fff200 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieRollbackException; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.NotNull; + +import static org.apache.hudi.client.utils.MetadataConversionUtils.getHoodieCommitMetadata; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.table.action.rollback.BaseRollbackHelper.EMPTY_STRING; + +/** + * Listing based rollback strategy to fetch list of {@link HoodieRollbackRequest}s. 
+ */ +public class ListingBasedRollbackStrategy implements BaseRollbackPlanActionExecutor.RollbackStrategy { + + private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackStrategy.class); + + protected final HoodieTable table; + + protected final transient HoodieEngineContext context; + + protected final HoodieWriteConfig config; + + protected final String instantTime; + + public ListingBasedRollbackStrategy(HoodieTable table, + HoodieEngineContext context, + HoodieWriteConfig config, + String instantTime) { + this.table = table; + this.context = context; + this.config = config; + this.instantTime = instantTime; + } + + @Override + public List getRollbackRequests(HoodieInstant instantToRollback) { + try { + HoodieTableMetaClient metaClient = table.getMetaClient(); + List partitionPaths = + FSUtils.getAllPartitionPaths(context, table.getMetaClient().getBasePath(), false, false); + int numPartitions = Math.max(Math.min(partitionPaths.size(), config.getRollbackParallelism()), 1); + + context.setJobStatus(this.getClass().getSimpleName(), "Creating Listing Rollback Plan: " + config.getTableName()); + + HoodieTableType tableType = table.getMetaClient().getTableType(); + String baseFileExtension = getBaseFileExtension(metaClient); + Option commitMetadataOptional = getHoodieCommitMetadata(metaClient, instantToRollback); + Boolean isCommitMetadataCompleted = checkCommitMetadataCompleted(instantToRollback, commitMetadataOptional); + + return context.flatMap(partitionPaths, partitionPath -> { + List hoodieRollbackRequests = new ArrayList<>(partitionPaths.size()); + FileStatus[] filesToDelete = + fetchFilesFromInstant(instantToRollback, partitionPath, metaClient.getBasePath(), baseFileExtension, + metaClient.getFs(), commitMetadataOptional, isCommitMetadataCompleted); + + if (HoodieTableType.COPY_ON_WRITE == tableType) { + hoodieRollbackRequests.add(getHoodieRollbackRequest(partitionPath, filesToDelete)); + } else if (HoodieTableType.MERGE_ON_READ == tableType) { + String commit = instantToRollback.getTimestamp(); + HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline(); + switch (instantToRollback.getAction()) { + case HoodieTimeline.COMMIT_ACTION: + case HoodieTimeline.REPLACE_COMMIT_ACTION: + hoodieRollbackRequests.add(getHoodieRollbackRequest(partitionPath, filesToDelete)); + break; + case HoodieTimeline.COMPACTION_ACTION: + // If there is no delta commit present after the current commit (if compaction), no action, else we + // need to make sure that a compaction commit rollback also deletes any log files written as part of the + // succeeding deltacommit. + boolean higherDeltaCommits = + !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants().findInstantsAfter(commit, 1) + .empty(); + if (higherDeltaCommits) { + // Rollback of a compaction action with no higher deltacommit means that the compaction is scheduled + // and has not yet finished. In this scenario we should delete only the newly created base files + // and not corresponding base commit log files created with this as baseCommit since updates would + // have been written to the log files. + hoodieRollbackRequests.add(getHoodieRollbackRequest(partitionPath, + listFilesToBeDeleted(instantToRollback.getTimestamp(), baseFileExtension, partitionPath, + metaClient.getFs()))); + } else { + // No deltacommits present after this compaction commit (inflight or requested). In this case, we + // can also delete any log files that were created with this compaction commit as base + // commit. 
+ hoodieRollbackRequests.add(getHoodieRollbackRequest(partitionPath, filesToDelete)); + } + break; + case HoodieTimeline.DELTA_COMMIT_ACTION: + // -------------------------------------------------------------------------------------------------- + // (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal + // -------------------------------------------------------------------------------------------------- + // (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In + // this scenario we would want to delete these log files. + // (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario, + // HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks. + // (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is + // being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime and + // and hence will end up deleting these log files. This is done so there are no orphan log files + // lying around. + // (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions + // taken in this scenario is a combination of (A.2) and (A.3) + // --------------------------------------------------------------------------------------------------- + // (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal + // --------------------------------------------------------------------------------------------------- + // (B.1) Failed first commit - Inserts were written to base files and HoodieWriteStat has no entries. + // In this scenario, we delete all the base files written for the failed commit. + // (B.2) Failed recurring commits - Inserts were written to base files and updates to log files. In + // this scenario, perform (A.1) and for updates written to log files, write rollback blocks. + // (B.3) Rollback triggered for first commit - Same as (B.1) + // (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files + // as well if the base base file gets deleted. 
+ HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + table.getMetaClient().getCommitTimeline().getInstantDetails(instantToRollback).get(), + HoodieCommitMetadata.class); + + // In case all data was inserts and the commit failed, delete the file belonging to that commit + // We do not know fileIds for inserts (first inserts are either log files or base files), + // delete all files for the corresponding failed commit, if present (same as COW) + hoodieRollbackRequests.add(getHoodieRollbackRequest(partitionPath, filesToDelete)); + + // append rollback blocks for updates and inserts as A.2 and B.2 + if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) { + hoodieRollbackRequests.addAll( + getRollbackRequestToAppend(partitionPath, instantToRollback, commitMetadata, table)); + } + break; + default: + throw new HoodieRollbackException("Unknown listing type, during rollback of " + instantToRollback); + } + } else { + throw new HoodieRollbackException( + String.format("Unsupported table type: %s, during listing rollback of %s", tableType, instantToRollback)); + } + return hoodieRollbackRequests.stream(); + }, numPartitions); + } catch (Exception e) { + LOG.error("Generating rollback requests failed for " + instantToRollback.getTimestamp(), e); + throw new HoodieRollbackException("Generating rollback requests failed for " + instantToRollback.getTimestamp(), e); + } + } + + private String getBaseFileExtension(HoodieTableMetaClient metaClient) { + return metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); + } + + @NotNull + private HoodieRollbackRequest getHoodieRollbackRequest(String partitionPath, FileStatus[] filesToDeletedStatus) { + List filesToDelete = getFilesToBeDeleted(filesToDeletedStatus); + return new HoodieRollbackRequest( + partitionPath, EMPTY_STRING, EMPTY_STRING, filesToDelete, Collections.emptyMap()); + } + + @NotNull + private List getFilesToBeDeleted(FileStatus[] dataFilesToDeletedStatus) { + return Arrays.stream(dataFilesToDeletedStatus).map(fileStatus -> { + String dataFileToBeDeleted = fileStatus.getPath().toString(); + // strip scheme E.g: file:/var/folders + return dataFileToBeDeleted.substring(dataFileToBeDeleted.indexOf(":") + 1); + }).collect(Collectors.toList()); + } + + private FileStatus[] listFilesToBeDeleted(String commit, String basefileExtension, String partitionPath, + FileSystem fs) throws IOException { + LOG.info("Collecting files to be cleaned/rolledback up for path " + partitionPath + " and commit " + commit); + PathFilter filter = (path) -> { + if (path.toString().contains(basefileExtension)) { + String fileCommitTime = FSUtils.getCommitTime(path.getName()); + return commit.equals(fileCommitTime); + } + return false; + }; + return fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); + } + + private FileStatus[] fetchFilesFromInstant(HoodieInstant instantToRollback, String partitionPath, String basePath, + String baseFileExtension, HoodieWrapperFileSystem fs, + Option commitMetadataOptional, + Boolean isCommitMetadataCompleted) throws IOException { + if (isCommitMetadataCompleted) { + return fetchFilesFromCommitMetadata(instantToRollback, partitionPath, basePath, commitMetadataOptional.get(), + baseFileExtension, fs); + } else { + return fetchFilesFromListFiles(instantToRollback, partitionPath, basePath, baseFileExtension, fs); + } + } + + private FileStatus[] fetchFilesFromCommitMetadata(HoodieInstant instantToRollback, String partitionPath, + String basePath, 
HoodieCommitMetadata commitMetadata, + String baseFileExtension, HoodieWrapperFileSystem fs) + throws IOException { + SerializablePathFilter pathFilter = getSerializablePathFilter(baseFileExtension, instantToRollback.getTimestamp()); + Path[] filePaths = getFilesFromCommitMetadata(basePath, commitMetadata, partitionPath); + + return fs.listStatus(Arrays.stream(filePaths).filter(entry -> { + try { + return fs.exists(entry); + } catch (Exception e) { + LOG.error("Exists check failed for " + entry.toString(), e); + } + // if any Exception is thrown, do not ignore. let's try to add the file of interest to be deleted. we can't miss any files to be rolled back. + return true; + }).toArray(Path[]::new), pathFilter); + } + + private FileStatus[] fetchFilesFromListFiles(HoodieInstant instantToRollback, String partitionPath, String basePath, + String baseFileExtension, HoodieWrapperFileSystem fs) + throws IOException { + SerializablePathFilter pathFilter = getSerializablePathFilter(baseFileExtension, instantToRollback.getTimestamp()); + Path[] filePaths = listFilesToBeDeleted(basePath, partitionPath); + + return fs.listStatus(filePaths, pathFilter); + } + + private Boolean checkCommitMetadataCompleted(HoodieInstant instantToRollback, + Option commitMetadataOptional) { + return commitMetadataOptional.isPresent() && instantToRollback.isCompleted() + && !WriteOperationType.UNKNOWN.equals(commitMetadataOptional.get().getOperationType()); + } + + private static Path[] listFilesToBeDeleted(String basePath, String partitionPath) { + return new Path[] {FSUtils.getPartitionPath(basePath, partitionPath)}; + } + + private static Path[] getFilesFromCommitMetadata(String basePath, HoodieCommitMetadata commitMetadata, String partitionPath) { + List fullPaths = commitMetadata.getFullPathsByPartitionPath(basePath, partitionPath); + return fullPaths.stream().map(Path::new).toArray(Path[]::new); + } + + @NotNull + private static SerializablePathFilter getSerializablePathFilter(String basefileExtension, String commit) { + return (path) -> { + if (path.toString().endsWith(basefileExtension)) { + String fileCommitTime = FSUtils.getCommitTime(path.getName()); + return commit.equals(fileCommitTime); + } else if (FSUtils.isLogFile(path)) { + // Since the baseCommitTime is the only commit for new log files, it's okay here + String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); + return commit.equals(fileCommitTime); + } + return false; + }; + } + + public static List getRollbackRequestToAppend(String partitionPath, HoodieInstant rollbackInstant, + HoodieCommitMetadata commitMetadata, HoodieTable table) { + List hoodieRollbackRequests = new ArrayList<>(); + checkArgument(rollbackInstant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)); + + // wStat.getPrevCommit() might not give the right commit time in the following + // scenario : If a compaction was scheduled, the new commitTime associated with the requested compaction will be + // used to write the new log files. In this case, the commit time for the log file is the compaction requested time. + // But the index (global) might store the baseCommit of the base and not the requested, hence get the + // baseCommit always by listing the file slice + // With multi writers, rollbacks could be lazy. 
and so we need to use getLatestFileSlicesBeforeOrOn() instead of getLatestFileSlices() + Map latestFileSlices = table.getSliceView() + .getLatestFileSlicesBeforeOrOn(partitionPath, rollbackInstant.getTimestamp(), true) + .collect(Collectors.toMap(FileSlice::getFileId, Function.identity())); + + List hoodieWriteStats = commitMetadata.getPartitionToWriteStats().get(partitionPath) + .stream() + .filter(writeStat -> { + // Filter out stats without prevCommit since they are all inserts + boolean validForRollback = (writeStat != null) && (!writeStat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) + && (writeStat.getPrevCommit() != null) && latestFileSlices.containsKey(writeStat.getFileId()); + + if (!validForRollback) { + return false; + } + + FileSlice latestFileSlice = latestFileSlices.get(writeStat.getFileId()); + + // For sanity, log-file base-instant time can never be less than base-commit on which we are rolling back + checkArgument( + HoodieTimeline.compareTimestamps(latestFileSlice.getBaseInstantTime(), + HoodieTimeline.LESSER_THAN_OR_EQUALS, rollbackInstant.getTimestamp()), + "Log-file base-instant could not be less than the instant being rolled back"); + + // Command block "rolling back" the preceding block {@link HoodieCommandBlockTypeEnum#ROLLBACK_PREVIOUS_BLOCK} + // w/in the latest file-slice is appended iff base-instant of the log-file is _strictly_ less + // than the instant of the Delta Commit being rolled back. Otherwise, log-file will be cleaned up + // in a different branch of the flow. + return HoodieTimeline.compareTimestamps(latestFileSlice.getBaseInstantTime(), HoodieTimeline.LESSER_THAN, rollbackInstant.getTimestamp()); + }) + .collect(Collectors.toList()); + + for (HoodieWriteStat writeStat : hoodieWriteStats) { + FileSlice latestFileSlice = latestFileSlices.get(writeStat.getFileId()); + String fileId = writeStat.getFileId(); + String latestBaseInstant = latestFileSlice.getBaseInstantTime(); + + Path fullLogFilePath = FSUtils.getPartitionPath(table.getConfig().getBasePath(), writeStat.getPath()); + + Map logFilesWithBlocksToRollback = + Collections.singletonMap(fullLogFilePath.toString(), writeStat.getTotalWriteBytes()); + + hoodieRollbackRequests.add(new HoodieRollbackRequest(partitionPath, fileId, latestBaseInstant, + Collections.emptyList(), logFilesWithBlocksToRollback)); + } + + return hoodieRollbackRequests; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java new file mode 100644 index 0000000000000..87ee7d94723d9 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieRollbackException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.MarkerBasedRollbackUtils; +import org.apache.hudi.table.marker.WriteMarkers; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.table.action.rollback.BaseRollbackHelper.EMPTY_STRING; + +/** + * Performs rollback using marker files generated during the write.. + */ +public class MarkerBasedRollbackStrategy implements BaseRollbackPlanActionExecutor.RollbackStrategy { + + private static final Logger LOG = LogManager.getLogger(MarkerBasedRollbackStrategy.class); + + protected final HoodieTable table; + + protected final transient HoodieEngineContext context; + + protected final HoodieWriteConfig config; + + protected final String basePath; + + protected final String instantTime; + + public MarkerBasedRollbackStrategy(HoodieTable table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) { + this.table = table; + this.context = context; + this.basePath = table.getMetaClient().getBasePath(); + this.config = config; + this.instantTime = instantTime; + } + + @Override + public List getRollbackRequests(HoodieInstant instantToRollback) { + try { + List markerPaths = MarkerBasedRollbackUtils.getAllMarkerPaths( + table, context, instantToRollback.getTimestamp(), config.getRollbackParallelism()); + int parallelism = Math.max(Math.min(markerPaths.size(), config.getRollbackParallelism()), 1); + return context.map(markerPaths, markerFilePath -> { + String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1); + IOType type = IOType.valueOf(typeStr); + switch (type) { + case MERGE: + case CREATE: + String fileToDelete = WriteMarkers.stripMarkerSuffix(markerFilePath); + Path fullDeletePath = new Path(basePath, fileToDelete); + String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), fullDeletePath.getParent()); + return new HoodieRollbackRequest(partitionPath, EMPTY_STRING, EMPTY_STRING, + Collections.singletonList(fullDeletePath.toString()), + Collections.emptyMap()); + case APPEND: + // NOTE: This marker file-path does NOT correspond to a log-file, but rather is a phony + // path serving as a "container" for the following components: + // - Base file's file-id + // - Base file's commit instant + // - Partition path + return getRollbackRequestForAppend(WriteMarkers.stripMarkerSuffix(markerFilePath)); + default: + throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback); + } + }, parallelism); + } catch (Exception e) { + throw new HoodieRollbackException("Error rolling back 
using marker files written for " + instantToRollback, e); + } + } + + protected HoodieRollbackRequest getRollbackRequestForAppend(String markerFilePath) throws IOException { + Path baseFilePathForAppend = new Path(basePath, markerFilePath); + String fileId = FSUtils.getFileIdFromFilePath(baseFilePathForAppend); + String baseCommitTime = FSUtils.getCommitTime(baseFilePathForAppend.getName()); + String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), baseFilePathForAppend.getParent()); + Path partitionPath = FSUtils.getPartitionPath(config.getBasePath(), relativePartitionPath); + + // NOTE: Since we're rolling back incomplete Delta Commit, it only could have appended its + // block to the latest log-file + // TODO(HUDI-1517) use provided marker-file's path instead + Option latestLogFileOption = FSUtils.getLatestLogFile(table.getMetaClient().getFs(), partitionPath, fileId, + HoodieFileFormat.HOODIE_LOG.getFileExtension(), baseCommitTime); + + Map logFilesWithBlocsToRollback = new HashMap<>(); + if (latestLogFileOption.isPresent()) { + HoodieLogFile latestLogFile = latestLogFileOption.get(); + // NOTE: Marker's don't carry information about the cumulative size of the blocks that have been appended, + // therefore we simply stub this value. + logFilesWithBlocsToRollback = Collections.singletonMap(latestLogFile.getFileStatus().getPath().toString(), -1L); + } + + return new HoodieRollbackRequest(relativePartitionPath, fileId, baseCommitTime, Collections.emptyList(), + logFilesWithBlocsToRollback); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java new file mode 100644 index 0000000000000..46d4d84ebf21d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
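The marker handling above hinges on the marker file naming: the suffix after the last '.' encodes the IOType of the write, and stripping the marker suffix yields the path of the data file the marker tracks. A small sketch of that parsing, assuming the usual "<data-file-name>.marker.<IO_TYPE>" convention (the example path is hypothetical):

import org.apache.hudi.common.model.IOType;
import org.apache.hudi.table.marker.WriteMarkers;

class MarkerPathSketch {
  // e.g. "2021/01/01/abc-123_0-1-2_20220101000000.parquet.marker.CREATE"
  //   -> ioType = CREATE, dataFile = "2021/01/01/abc-123_0-1-2_20220101000000.parquet"
  static IOType ioTypeOf(String markerFilePath) {
    return IOType.valueOf(markerFilePath.substring(markerFilePath.lastIndexOf('.') + 1));
  }

  static String dataFileOf(String markerFilePath) {
    return WriteMarkers.stripMarkerSuffix(markerFilePath);
  }
}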
+ */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.List; + +public class MergeOnReadRollbackActionExecutor extends BaseRollbackActionExecutor { + + private static final Logger LOG = LogManager.getLogger(MergeOnReadRollbackActionExecutor.class); + + public MergeOnReadRollbackActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + HoodieInstant commitInstant, + boolean deleteInstants, + boolean skipLocking) { + super(context, config, table, instantTime, commitInstant, deleteInstants, skipLocking); + } + + public MergeOnReadRollbackActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + HoodieInstant commitInstant, + boolean deleteInstants, + boolean skipTimelinePublish, + boolean skipLocking) { + super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, skipLocking); + } + + @Override + protected List executeRollback(HoodieRollbackPlan hoodieRollbackPlan) { + HoodieTimer rollbackTimer = new HoodieTimer(); + rollbackTimer.startTimer(); + + LOG.info("Rolling back instant " + instantToRollback); + + // Atomically un-publish all non-inflight commits + if (instantToRollback.isCompleted()) { + LOG.info("Un-publishing instant " + instantToRollback + ", deleteInstants=" + deleteInstants); + resolvedInstant = table.getActiveTimeline().revertToInflight(instantToRollback); + // reload meta-client to reflect latest timeline status + table.getMetaClient().reloadActiveTimeline(); + } + + List allRollbackStats = new ArrayList<>(); + + // At the moment, MOR table type does not support bulk nested rollbacks. Nested rollbacks is an experimental + // feature that is expensive. To perform nested rollbacks, initiate multiple requests of client.rollback + // (commitToRollback). + // NOTE {@link HoodieCompactionConfig#withCompactionLazyBlockReadEnabled} needs to be set to TRUE. This is + // required to avoid OOM when merging multiple LogBlocks performed during nested rollbacks. 
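The note above points at HoodieCompactionConfig#withCompactionLazyBlockReadEnabled for nested rollbacks. A minimal sketch of a write config that enables it, assuming the standard HoodieWriteConfig/HoodieCompactionConfig builders; the base path and table name are placeholders:

import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

class LazyBlockReadConfigSketch {
  static HoodieWriteConfig build(String basePath) {
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .forTable("rollback_demo") // placeholder table name
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            // Avoid materializing every log block in memory while merging during nested rollbacks.
            .withCompactionLazyBlockReadEnabled(true)
            .build())
        .build();
  }
}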
+ + // For Requested State (like failure during index lookup), there is nothing to do rollback other than + // deleting the timeline file + if (!resolvedInstant.isRequested()) { + LOG.info("Unpublished " + resolvedInstant); + allRollbackStats = executeRollback(instantToRollback, hoodieRollbackPlan); + } + + dropBootstrapIndexIfNeeded(resolvedInstant); + + LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer()); + return allRollbackStats; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java new file mode 100644 index 0000000000000..e33dffcb7b953 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieRestorePlan; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Plans the restore action and add a restore.requested meta file to timeline. 
+ */ +public class RestorePlanActionExecutor extends BaseActionExecutor> { + + + private static final Logger LOG = LogManager.getLogger(RestorePlanActionExecutor.class); + + public static final Integer RESTORE_PLAN_VERSION_1 = 1; + public static final Integer LATEST_RESTORE_PLAN_VERSION = RESTORE_PLAN_VERSION_1; + private final String restoreInstantTime; + + public RestorePlanActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + String restoreInstantTime) { + super(context, config, table, instantTime); + this.restoreInstantTime = restoreInstantTime; + } + + @Override + public Option execute() { + final HoodieInstant restoreInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.RESTORE_ACTION, instantTime); + try { + // Get all the commits on the timeline after the provided commit time + // rollback pending clustering instants first before other instants (See HUDI-3362) + List pendingClusteringInstantsToRollback = table.getActiveTimeline().filterPendingReplaceTimeline() + // filter only clustering related replacecommits (Not insert_overwrite related commits) + .filter(instant -> ClusteringUtils.isPendingClusteringInstant(table.getMetaClient(), instant)) + .getReverseOrderedInstants() + .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)) + .collect(Collectors.toList()); + + // Get all the commits on the timeline after the provided commit time + List commitInstantsToRollback = table.getActiveTimeline().getWriteTimeline() + .getReverseOrderedInstants() + .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)) + .filter(instant -> !pendingClusteringInstantsToRollback.contains(instant)) + .collect(Collectors.toList()); + + // Combine both lists - first rollback pending clustering and then rollback all other commits + List instantsToRollback = Stream.concat(pendingClusteringInstantsToRollback.stream(), commitInstantsToRollback.stream()) + .map(entry -> new HoodieInstantInfo(entry.getTimestamp(), entry.getAction())) + .collect(Collectors.toList()); + + HoodieRestorePlan restorePlan = new HoodieRestorePlan(instantsToRollback, LATEST_RESTORE_PLAN_VERSION); + table.getActiveTimeline().saveToRestoreRequested(restoreInstant, TimelineMetadataUtils.serializeRestorePlan(restorePlan)); + table.getMetaClient().reloadActiveTimeline(); + LOG.info("Requesting Restore with instant time " + restoreInstant); + return Option.of(restorePlan); + } catch (IOException e) { + LOG.error("Got exception when saving restore requested file", e); + throw new HoodieIOException(e.getMessage(), e); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java index 897b448fb5340..ce7a18515137b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java @@ -19,39 +19,46 @@ package org.apache.hudi.table.action.rollback; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - +import org.apache.hudi.avro.model.HoodieRollbackPlan; import 
org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.block.HoodieCommandBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; public class RollbackUtils { private static final Logger LOG = LogManager.getLogger(RollbackUtils.class); + /** + * Get Latest version of Rollback plan corresponding to a clean instant. + * + * @param metaClient Hoodie Table Meta Client + * @param rollbackInstant Instant referring to rollback action + * @return Rollback plan corresponding to rollback instant + * @throws IOException + */ + public static HoodieRollbackPlan getRollbackPlan(HoodieTableMetaClient metaClient, HoodieInstant rollbackInstant) + throws IOException { + // TODO: add upgrade step if required. + final HoodieInstant requested = HoodieTimeline.getRollbackRequestedInstant(rollbackInstant); + return TimelineMetadataUtils.deserializeAvroMetadata( + metaClient.getActiveTimeline().readRollbackInfoAsBytes(requested).get(), HoodieRollbackPlan.class); + } + static Map generateHeader(String instantToRollback, String rollbackInstantTime) { // generate metadata Map header = new HashMap<>(3); @@ -70,11 +77,11 @@ static Map generateHeader(String inst * @return Merged HoodieRollbackStat */ static HoodieRollbackStat mergeRollbackStat(HoodieRollbackStat stat1, HoodieRollbackStat stat2) { - ValidationUtils.checkArgument(stat1.getPartitionPath().equals(stat2.getPartitionPath())); + checkArgument(stat1.getPartitionPath().equals(stat2.getPartitionPath())); final List successDeleteFiles = new ArrayList<>(); final List failedDeleteFiles = new ArrayList<>(); final Map commandBlocksCount = new HashMap<>(); - final List filesToRollback = new ArrayList<>(); + final Map writtenLogFileSizeMap = new HashMap<>(); Option.ofNullable(stat1.getSuccessDeleteFiles()).ifPresent(successDeleteFiles::addAll); Option.ofNullable(stat2.getSuccessDeleteFiles()).ifPresent(successDeleteFiles::addAll); Option.ofNullable(stat1.getFailedDeleteFiles()).ifPresent(failedDeleteFiles::addAll); @@ -84,156 +91,4 @@ static HoodieRollbackStat mergeRollbackStat(HoodieRollbackStat stat1, HoodieRoll return new HoodieRollbackStat(stat1.getPartitionPath(), successDeleteFiles, failedDeleteFiles, commandBlocksCount); } - /** - * Generate all rollback requests that needs rolling back this action without actually performing rollback for COW table type. - * @param fs instance of {@link FileSystem} to use. 
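With getRollbackPlan in place, an executor can resolve the serialized plan from the rollback.requested instant and hand its requests to the shared helper. A condensed sketch of that resolution path, assuming the requested instant was written by the plan executor earlier in this change (locking, metadata-table updates and timeline transitions are elided):

import java.io.IOException;
import java.util.List;

import org.apache.hudi.avro.model.HoodieRollbackPlan;
import org.apache.hudi.common.HoodieRollbackStat;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.action.rollback.BaseRollbackHelper;
import org.apache.hudi.table.action.rollback.RollbackUtils;

class RollbackPlanResolutionSketch {
  static List<HoodieRollbackStat> resolveAndExecute(HoodieEngineContext context,
                                                    HoodieTableMetaClient metaClient,
                                                    HoodieWriteConfig config,
                                                    HoodieInstant rollbackInstant,
                                                    HoodieInstant instantToRollback) throws IOException {
    // Deserialize the HoodieRollbackPlan stored under the rollback.requested instant ...
    HoodieRollbackPlan plan = RollbackUtils.getRollbackPlan(metaClient, rollbackInstant);
    // ... and let the shared helper delete files / append rollback command blocks per request.
    return new BaseRollbackHelper(metaClient, config)
        .performRollback(context, instantToRollback, plan.getRollbackRequests());
  }
}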
- * @param basePath base path of interest. - * @param shouldAssumeDatePartitioning {@code true} if date partitioning should be assumed. {@code false} otherwise. - * @return {@link List} of {@link ListingBasedRollbackRequest}s thus collected. - */ - public static List generateRollbackRequestsByListingCOW(FileSystem fs, String basePath, boolean shouldAssumeDatePartitioning) { - try { - return FSUtils.getAllPartitionPaths(fs, basePath, shouldAssumeDatePartitioning).stream() - .map(ListingBasedRollbackRequest::createRollbackRequestWithDeleteDataAndLogFilesAction) - .collect(Collectors.toList()); - } catch (IOException e) { - throw new HoodieIOException("Error generating rollback requests", e); - } - } - - /** - * Generate all rollback requests that we need to perform for rolling back this action without actually performing rolling back for MOR table type. - * - * @param instantToRollback Instant to Rollback - * @param table instance of {@link HoodieTable} to use. - * @param context instance of {@link HoodieEngineContext} to use. - * @return list of rollback requests - */ - public static List generateRollbackRequestsUsingFileListingMOR(HoodieInstant instantToRollback, HoodieTable table, HoodieEngineContext context) throws IOException { - String commit = instantToRollback.getTimestamp(); - HoodieWriteConfig config = table.getConfig(); - List partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning()); - int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1); - context.setJobStatus(RollbackUtils.class.getSimpleName(), "Generate all rollback requests"); - return context.flatMap(partitions, partitionPath -> { - HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline(); - List partitionRollbackRequests = new ArrayList<>(); - switch (instantToRollback.getAction()) { - case HoodieTimeline.COMMIT_ACTION: - LOG.info("Rolling back commit action."); - partitionRollbackRequests.add( - ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); - break; - case HoodieTimeline.COMPACTION_ACTION: - // If there is no delta commit present after the current commit (if compaction), no action, else we - // need to make sure that a compaction commit rollback also deletes any log files written as part of the - // succeeding deltacommit. - boolean higherDeltaCommits = - !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants().findInstantsAfter(commit, 1).empty(); - if (higherDeltaCommits) { - // Rollback of a compaction action with no higher deltacommit means that the compaction is scheduled - // and has not yet finished. In this scenario we should delete only the newly created parquet files - // and not corresponding base commit log files created with this as baseCommit since updates would - // have been written to the log files. - LOG.info("Rolling back compaction. There are higher delta commits. So only deleting data files"); - partitionRollbackRequests.add( - ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataFilesOnlyAction(partitionPath)); - } else { - // No deltacommits present after this compaction commit (inflight or requested). In this case, we - // can also delete any log files that were created with this compaction commit as base - // commit. - LOG.info("Rolling back compaction plan. There are NO higher delta commits. 
So deleting both data and" - + " log files"); - partitionRollbackRequests.add( - ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); - } - break; - case HoodieTimeline.DELTA_COMMIT_ACTION: - // -------------------------------------------------------------------------------------------------- - // (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal - // -------------------------------------------------------------------------------------------------- - // (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In - // this scenario we would want to delete these log files. - // (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario, - // HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks. - // (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is - // being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime and - // and hence will end up deleting these log files. This is done so there are no orphan log files - // lying around. - // (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions - // taken in this scenario is a combination of (A.2) and (A.3) - // --------------------------------------------------------------------------------------------------- - // (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal - // --------------------------------------------------------------------------------------------------- - // (B.1) Failed first commit - Inserts were written to parquet files and HoodieWriteStat has no entries. - // In this scenario, we delete all the parquet files written for the failed commit. - // (B.2) Failed recurring commits - Inserts were written to parquet files and updates to log files. In - // this scenario, perform (A.1) and for updates written to log files, write rollback blocks. - // (B.3) Rollback triggered for first commit - Same as (B.1) - // (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files - // as well if the base parquet file gets deleted. 
- try { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - table.getMetaClient().getCommitTimeline() - .getInstantDetails(new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp())) - .get(), - HoodieCommitMetadata.class); - - // In case all data was inserts and the commit failed, delete the file belonging to that commit - // We do not know fileIds for inserts (first inserts are either log files or parquet files), - // delete all files for the corresponding failed commit, if present (same as COW) - partitionRollbackRequests.add( - ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); - - // append rollback blocks for updates - if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) { - partitionRollbackRequests - .addAll(generateAppendRollbackBlocksAction(partitionPath, instantToRollback, commitMetadata, table)); - } - break; - } catch (IOException io) { - throw new HoodieIOException("Failed to collect rollback actions for commit " + commit, io); - } - default: - break; - } - return partitionRollbackRequests.stream(); - }, Math.min(partitions.size(), sparkPartitions)).stream().filter(Objects::nonNull).collect(Collectors.toList()); - } - - private static List generateAppendRollbackBlocksAction(String partitionPath, HoodieInstant rollbackInstant, - HoodieCommitMetadata commitMetadata, HoodieTable table) { - ValidationUtils.checkArgument(rollbackInstant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)); - - // wStat.getPrevCommit() might not give the right commit time in the following - // scenario : If a compaction was scheduled, the new commitTime associated with the requested compaction will be - // used to write the new log files. In this case, the commit time for the log file is the compaction requested time. - // But the index (global) might store the baseCommit of the parquet and not the requested, hence get the - // baseCommit always by listing the file slice - Map fileIdToBaseCommitTimeForLogMap = table.getSliceView().getLatestFileSlices(partitionPath) - .collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime)); - return commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> { - - // Filter out stats without prevCommit since they are all inserts - boolean validForRollback = (wStat != null) && (!wStat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) - && (wStat.getPrevCommit() != null) && fileIdToBaseCommitTimeForLogMap.containsKey(wStat.getFileId()); - - if (validForRollback) { - // For sanity, log instant time can never be less than base-commit on which we are rolling back - ValidationUtils - .checkArgument(HoodieTimeline.compareTimestamps(fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId()), - HoodieTimeline.LESSER_THAN_OR_EQUALS, rollbackInstant.getTimestamp())); - } - - return validForRollback && HoodieTimeline.compareTimestamps(fileIdToBaseCommitTimeForLogMap.get( - // Base Ts should be strictly less. 
If equal (for inserts-to-logs), the caller employs another option
- // to delete and we should not step on it
- wStat.getFileId()), HoodieTimeline.LESSER_THAN, rollbackInstant.getTimestamp());
- }).map(wStat -> {
- String baseCommitTime = fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId());
- return ListingBasedRollbackRequest.createRollbackRequestWithAppendRollbackBlockAction(partitionPath, wStat.getFileId(),
- baseCommitTime);
- }).collect(Collectors.toList());
- }
}
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java
new file mode 100644
index 0000000000000..8f19692ed7c72
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.rollback;
+
+import org.apache.hudi.avro.model.HoodieRollbackRequest;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * HoodieRollbackRequest in HoodieRollbackPlan (an Avro POJO) cannot be used directly within Spark's parallel engine.
+ * Hence it is converted to this {@link SerializableHoodieRollbackRequest} before being used with spark.parallelize.
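+ *
+ * <p>Illustrative usage (a minimal sketch; {@code rollbackRequests} is assumed to be the list of Avro
+ * {@code HoodieRollbackRequest} objects obtained from a deserialized {@code HoodieRollbackPlan}):
+ * <pre>{@code
+ *   List<SerializableHoodieRollbackRequest> serializableRequests = rollbackRequests.stream()
+ *       .map(SerializableHoodieRollbackRequest::new)
+ *       .collect(Collectors.toList());
+ *   // the wrapped requests can then be distributed, e.g. via spark.parallelize as described above
+ * }</pre>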
+ */ +public class SerializableHoodieRollbackRequest implements Serializable { + + private final String partitionPath; + private final String fileId; + private final String latestBaseInstant; + private final List filesToBeDeleted = new ArrayList<>(); + private final Map logBlocksToBeDeleted = new HashMap<>(); + + public SerializableHoodieRollbackRequest(HoodieRollbackRequest rollbackRequest) { + this.partitionPath = rollbackRequest.getPartitionPath(); + this.fileId = rollbackRequest.getFileId(); + this.latestBaseInstant = rollbackRequest.getLatestBaseInstant(); + this.filesToBeDeleted.addAll(rollbackRequest.getFilesToBeDeleted()); + this.logBlocksToBeDeleted.putAll(rollbackRequest.getLogBlocksToBeDeleted()); + } + + public String getPartitionPath() { + return partitionPath; + } + + public String getFileId() { + return fileId; + } + + public String getLatestBaseInstant() { + return latestBaseInstant; + } + + public List getFilesToBeDeleted() { + return filesToBeDeleted; + } + + public Map getLogBlocksToBeDeleted() { + return logBlocksToBeDeleted; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializablePathFilter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializablePathFilter.java new file mode 100644 index 0000000000000..e2affdf5ca891 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializablePathFilter.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hadoop.fs.PathFilter; + +import java.io.Serializable; + +public interface SerializablePathFilter extends PathFilter, Serializable { +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java index 16fd9a481e02d..7f408c1b8d24a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointActionExecutor.java @@ -20,24 +20,24 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieSavepointMetadata; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieSavepointException; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.BaseActionExecutor; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import scala.Tuple2; import java.io.IOException; import java.util.List; @@ -64,13 +64,9 @@ public SavepointActionExecutor(HoodieEngineContext context, @Override public HoodieSavepointMetadata execute() { - if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { - throw new UnsupportedOperationException("Savepointing is not supported or MergeOnRead table types"); - } Option cleanInstant = table.getCompletedCleanTimeline().lastInstant(); - HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime); - if (!table.getCompletedCommitsTimeline().containsInstant(commitInstant)) { - throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant); + if (!table.getCompletedCommitsTimeline().containsInstant(instantTime)) { + throw new HoodieSavepointException("Could not savepoint non-existing commit " + instantTime); } try { @@ -88,16 +84,16 @@ public HoodieSavepointMetadata execute() { ValidationUtils.checkArgument(HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.GREATER_THAN_OR_EQUALS, lastCommitRetained), "Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained); - context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime); - Map> latestFilesMap = context.mapToPair(FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), - table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()), partitionPath -> { - // Scan all partitions files with this commit time - LOG.info("Collecting latest files in partition path " + partitionPath); - 
TableFileSystemView.BaseFileOnlyView view = table.getBaseFileOnlyView(); - List latestFiles = view.getLatestBaseFilesBeforeOrOn(partitionPath, instantTime) - .map(HoodieBaseFile::getFileName).collect(Collectors.toList()); - return new Tuple2<>(partitionPath, latestFiles); - }, null); + context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime + " " + table.getConfig().getTableName()); + List partitions = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), table.getMetaClient().getBasePath()); + Map> latestFilesMap = context.mapToPair(partitions, partitionPath -> { + // Scan all partitions files with this commit time + LOG.info("Collecting latest files in partition path " + partitionPath); + TableFileSystemView.BaseFileOnlyView view = table.getBaseFileOnlyView(); + List latestFiles = view.getLatestBaseFilesBeforeOrOn(partitionPath, instantTime) + .map(HoodieBaseFile::getFileName).collect(Collectors.toList()); + return new ImmutablePair<>(partitionPath, latestFiles); + }, null); HoodieSavepointMetadata metadata = TimelineMetadataUtils.convertSavepointMetadata(user, comment, latestFilesMap); // Nothing to save in the savepoint table.getActiveTimeline().createNewInstant( diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointHelpers.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointHelpers.java index 0d51a639aa03a..f00cd87797f6b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointHelpers.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/savepoint/SavepointHelpers.java @@ -18,7 +18,6 @@ package org.apache.hudi.table.action.savepoint; -import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; @@ -33,9 +32,6 @@ public class SavepointHelpers { private static final Logger LOG = LogManager.getLogger(SavepointHelpers.class); public static void deleteSavepoint(HoodieTable table, String savepointTime) { - if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) { - throw new UnsupportedOperationException("Savepointing is not supported or MergeOnRead table types"); - } HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime); boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint); if (!isSavepointPresent) { @@ -52,7 +48,7 @@ public static void validateSavepointRestore(HoodieTable table, String savepointT // Make sure the restore was successful table.getMetaClient().reloadActiveTimeline(); Option lastInstant = table.getActiveTimeline() - .getCommitsAndCompactionTimeline() + .getWriteTimeline() .filterCompletedAndCompactionInstants() .lastInstant(); ValidationUtils.checkArgument(lastInstant.isPresent()); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/DirectWriteMarkers.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/DirectWriteMarkers.java new file mode 100644 index 0000000000000..e813382079634 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/DirectWriteMarkers.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.MarkerUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Marker operations of directly accessing the file system to create and delete + * marker files. Each data file has a corresponding marker file. + */ +public class DirectWriteMarkers extends WriteMarkers { + + private static final Logger LOG = LogManager.getLogger(DirectWriteMarkers.class); + private final transient FileSystem fs; + + public DirectWriteMarkers(FileSystem fs, String basePath, String markerFolderPath, String instantTime) { + super(basePath, markerFolderPath, instantTime); + this.fs = fs; + } + + public DirectWriteMarkers(HoodieTable table, String instantTime) { + this(table.getMetaClient().getFs(), + table.getMetaClient().getBasePath(), + table.getMetaClient().getMarkerFolderPath(instantTime), + instantTime); + } + + /** + * Deletes Marker directory corresponding to an instant. + * + * @param context HoodieEngineContext. + * @param parallelism parallelism for deletion. + */ + public boolean deleteMarkerDir(HoodieEngineContext context, int parallelism) { + return FSUtils.deleteDir(context, fs, markerDirPath, parallelism); + } + + /** + * @return {@code true} if marker directory exists; {@code false} otherwise. 
+ * @throws IOException + */ + public boolean doesMarkerDirExist() throws IOException { + return fs.exists(markerDirPath); + } + + @Override + public Set createdAndMergedDataPaths(HoodieEngineContext context, int parallelism) throws IOException { + Set dataFiles = new HashSet<>(); + + FileStatus[] topLevelStatuses = fs.listStatus(markerDirPath); + List subDirectories = new ArrayList<>(); + for (FileStatus topLevelStatus: topLevelStatuses) { + if (topLevelStatus.isFile()) { + String pathStr = topLevelStatus.getPath().toString(); + if (pathStr.contains(HoodieTableMetaClient.MARKER_EXTN) && !pathStr.endsWith(IOType.APPEND.name())) { + dataFiles.add(translateMarkerToDataPath(pathStr)); + } + } else { + subDirectories.add(topLevelStatus.getPath().toString()); + } + } + + if (subDirectories.size() > 0) { + parallelism = Math.min(subDirectories.size(), parallelism); + SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf()); + context.setJobStatus(this.getClass().getSimpleName(), "Obtaining marker files for all created, merged paths"); + dataFiles.addAll(context.flatMap(subDirectories, directory -> { + Path path = new Path(directory); + FileSystem fileSystem = path.getFileSystem(serializedConf.get()); + RemoteIterator itr = fileSystem.listFiles(path, true); + List result = new ArrayList<>(); + while (itr.hasNext()) { + FileStatus status = itr.next(); + String pathStr = status.getPath().toString(); + if (pathStr.contains(HoodieTableMetaClient.MARKER_EXTN) && !pathStr.endsWith(IOType.APPEND.name())) { + result.add(translateMarkerToDataPath(pathStr)); + } + } + return result.stream(); + }, parallelism)); + } + + return dataFiles; + } + + private String translateMarkerToDataPath(String markerPath) { + String rPath = MarkerUtils.stripMarkerFolderPrefix(markerPath, basePath, instantTime); + return stripMarkerSuffix(rPath); + } + + @Override + public Set allMarkerFilePaths() throws IOException { + Set markerFiles = new HashSet<>(); + if (doesMarkerDirExist()) { + FSUtils.processFiles(fs, markerDirPath.toString(), fileStatus -> { + markerFiles.add(MarkerUtils.stripMarkerFolderPrefix(fileStatus.getPath().toString(), basePath, instantTime)); + return true; + }, false); + } + return markerFiles; + } + + /** + * Creates a marker file based on the full marker name excluding the base path and instant. + * + * @param markerName the full marker name, e.g., "2021/08/13/file1.marker.CREATE" + * @return path of the marker file + */ + public Option create(String markerName) { + return create(new Path(markerDirPath, markerName), true); + } + + @Override + protected Option create(String partitionPath, String dataFileName, IOType type, boolean checkIfExists) { + return create(getMarkerPath(partitionPath, dataFileName, type), checkIfExists); + } + + private Option create(Path markerPath, boolean checkIfExists) { + HoodieTimer timer = new HoodieTimer().startTimer(); + Path dirPath = markerPath.getParent(); + try { + if (!fs.exists(dirPath)) { + fs.mkdirs(dirPath); // create a new partition as needed. 
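+ // NOTE: the exists() check and mkdirs() above are not atomic, so concurrent writers may both
+ // attempt to create the directory; this is harmless because Hadoop's mkdirs() has
+ // "mkdir -p" semantics and treats an already-existing directory as success.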
+ } + } catch (IOException e) { + throw new HoodieIOException("Failed to make dir " + dirPath, e); + } + try { + if (checkIfExists && fs.exists(markerPath)) { + LOG.warn("Marker Path=" + markerPath + " already exists, cancel creation"); + return Option.empty(); + } + LOG.info("Creating Marker Path=" + markerPath); + fs.create(markerPath, false).close(); + } catch (IOException e) { + throw new HoodieException("Failed to create marker file " + markerPath, e); + } + LOG.info("[direct] Created marker file " + markerPath.toString() + + " in " + timer.endTimer() + " ms"); + return Option.of(markerPath); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/MarkerBasedRollbackUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/MarkerBasedRollbackUtils.java new file mode 100644 index 0000000000000..4d2f9e4e80630 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/MarkerBasedRollbackUtils.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.marker.MarkerType.DIRECT; +import static org.apache.hudi.common.table.marker.MarkerType.TIMELINE_SERVER_BASED; +import static org.apache.hudi.common.util.MarkerUtils.MARKER_TYPE_FILENAME; +import static org.apache.hudi.common.util.MarkerUtils.readMarkerType; +import static org.apache.hudi.common.util.MarkerUtils.readTimelineServerBasedMarkersFromFileSystem; + +/** + * A utility class for marker-based rollback. + */ +public class MarkerBasedRollbackUtils { + + private static final Logger LOG = LogManager.getLogger(MarkerBasedRollbackUtils.class); + + /** + * Gets all marker paths. 
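+ *
+ * <p>Typical call site (an illustrative sketch; {@code table}, {@code context}, {@code instantToRollback}
+ * and {@code parallelism} are assumed to be in scope):
+ * <pre>{@code
+ *   List<String> markerPaths = MarkerBasedRollbackUtils.getAllMarkerPaths(
+ *       table, context, instantToRollback.getTimestamp(), parallelism);
+ *   // When no MARKERS.type file exists, DIRECT markers are tried first, falling back to
+ *   // TIMELINE_SERVER_BASED markers if that read fails.
+ * }</pre>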
+ * + * @param table instance of {@code HoodieTable} to use + * @param context instance of {@code HoodieEngineContext} to use + * @param instant instant of interest to rollback + * @param parallelism parallelism to use + * @return a list of all markers + * @throws IOException + */ + public static List getAllMarkerPaths(HoodieTable table, HoodieEngineContext context, + String instant, int parallelism) throws IOException { + String markerDir = table.getMetaClient().getMarkerFolderPath(instant); + FileSystem fileSystem = table.getMetaClient().getFs(); + Option markerTypeOption = readMarkerType(fileSystem, markerDir); + + // If there is no marker type file "MARKERS.type", first assume "DIRECT" markers are used. + // If not, then fallback to "TIMELINE_SERVER_BASED" markers. + if (!markerTypeOption.isPresent()) { + WriteMarkers writeMarkers = WriteMarkersFactory.get(DIRECT, table, instant); + try { + return new ArrayList<>(writeMarkers.allMarkerFilePaths()); + } catch (IOException | IllegalArgumentException e) { + LOG.warn(String.format("%s not present and %s marker failed with error: %s. So, falling back to %s marker", + MARKER_TYPE_FILENAME, DIRECT, e.getMessage(), TIMELINE_SERVER_BASED)); + return getTimelineServerBasedMarkers(context, parallelism, markerDir, fileSystem); + } + } + + switch (markerTypeOption.get()) { + case TIMELINE_SERVER_BASED: + // Reads all markers written by the timeline server + return getTimelineServerBasedMarkers(context, parallelism, markerDir, fileSystem); + default: + throw new HoodieException( + "The marker type \"" + markerTypeOption.get().name() + "\" is not supported."); + } + } + + private static List getTimelineServerBasedMarkers(HoodieEngineContext context, int parallelism, String markerDir, FileSystem fileSystem) { + Map> markersMap = readTimelineServerBasedMarkersFromFileSystem(markerDir, fileSystem, context, parallelism); + return markersMap.values().stream() + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/TimelineServerBasedWriteMarkers.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/TimelineServerBasedWriteMarkers.java new file mode 100644 index 0000000000000..4879e0bc60c94 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/TimelineServerBasedWriteMarkers.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieRemoteException; +import org.apache.hudi.table.HoodieTable; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.fs.Path; +import org.apache.http.client.fluent.Request; +import org.apache.http.client.fluent.Response; +import org.apache.http.client.utils.URIBuilder; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.marker.MarkerOperation.ALL_MARKERS_URL; +import static org.apache.hudi.common.table.marker.MarkerOperation.CREATE_AND_MERGE_MARKERS_URL; +import static org.apache.hudi.common.table.marker.MarkerOperation.CREATE_MARKER_URL; +import static org.apache.hudi.common.table.marker.MarkerOperation.DELETE_MARKER_DIR_URL; +import static org.apache.hudi.common.table.marker.MarkerOperation.MARKERS_DIR_EXISTS_URL; +import static org.apache.hudi.common.table.marker.MarkerOperation.MARKER_DIR_PATH_PARAM; +import static org.apache.hudi.common.table.marker.MarkerOperation.MARKER_NAME_PARAM; + +/** + * Marker operations of using timeline server as a proxy to create and delete markers. + * Each data file has a corresponding marker entry, which is stored in a limited number of + * underlying files maintained by the timeline server (each file contains multiple marker + * entries). 
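+ *
+ * <p>Usage mirrors any other {@code WriteMarkers} implementation (a sketch; {@code table},
+ * {@code instantTime}, {@code partitionPath} and {@code dataFileName} are assumed):
+ * <pre>{@code
+ *   WriteMarkers markers = new TimelineServerBasedWriteMarkers(table, instantTime);
+ *   markers.createIfNotExists(partitionPath, dataFileName, IOType.CREATE);
+ *   // The call above results in an HTTP POST to the timeline server's CREATE_MARKER_URL endpoint.
+ * }</pre>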
+ */ +public class TimelineServerBasedWriteMarkers extends WriteMarkers { + private static final Logger LOG = LogManager.getLogger(TimelineServerBasedWriteMarkers.class); + private final ObjectMapper mapper; + private final String timelineServerHost; + private final int timelineServerPort; + private final int timeoutSecs; + + public TimelineServerBasedWriteMarkers(HoodieTable table, String instantTime) { + this(table.getMetaClient().getBasePath(), + table.getMetaClient().getMarkerFolderPath(instantTime), instantTime, + table.getConfig().getViewStorageConfig().getRemoteViewServerHost(), + table.getConfig().getViewStorageConfig().getRemoteViewServerPort(), + table.getConfig().getViewStorageConfig().getRemoteTimelineClientTimeoutSecs()); + } + + TimelineServerBasedWriteMarkers(String basePath, String markerFolderPath, String instantTime, + String timelineServerHost, int timelineServerPort, int timeoutSecs) { + super(basePath, markerFolderPath, instantTime); + this.mapper = new ObjectMapper(); + this.timelineServerHost = timelineServerHost; + this.timelineServerPort = timelineServerPort; + this.timeoutSecs = timeoutSecs; + } + + @Override + public boolean deleteMarkerDir(HoodieEngineContext context, int parallelism) { + Map paramsMap = Collections.singletonMap(MARKER_DIR_PATH_PARAM, markerDirPath.toString()); + try { + return executeRequestToTimelineServer( + DELETE_MARKER_DIR_URL, paramsMap, new TypeReference() {}, RequestMethod.POST); + } catch (IOException e) { + throw new HoodieRemoteException("Failed to delete marker directory " + markerDirPath.toString(), e); + } + } + + @Override + public boolean doesMarkerDirExist() { + Map paramsMap = Collections.singletonMap(MARKER_DIR_PATH_PARAM, markerDirPath.toString()); + try { + return executeRequestToTimelineServer( + MARKERS_DIR_EXISTS_URL, paramsMap, new TypeReference() {}, RequestMethod.GET); + } catch (IOException e) { + throw new HoodieRemoteException("Failed to check marker directory " + markerDirPath.toString(), e); + } + } + + @Override + public Set createdAndMergedDataPaths(HoodieEngineContext context, int parallelism) throws IOException { + Map paramsMap = Collections.singletonMap(MARKER_DIR_PATH_PARAM, markerDirPath.toString()); + try { + Set markerPaths = executeRequestToTimelineServer( + CREATE_AND_MERGE_MARKERS_URL, paramsMap, new TypeReference>() {}, RequestMethod.GET); + return markerPaths.stream().map(WriteMarkers::stripMarkerSuffix).collect(Collectors.toSet()); + } catch (IOException e) { + throw new HoodieRemoteException("Failed to get CREATE and MERGE data file paths in " + + markerDirPath.toString(), e); + } + } + + @Override + public Set allMarkerFilePaths() { + Map paramsMap = Collections.singletonMap(MARKER_DIR_PATH_PARAM, markerDirPath.toString()); + try { + return executeRequestToTimelineServer( + ALL_MARKERS_URL, paramsMap, new TypeReference>() {}, RequestMethod.GET); + } catch (IOException e) { + throw new HoodieRemoteException("Failed to get all markers in " + markerDirPath.toString(), e); + } + } + + @Override + protected Option create(String partitionPath, String dataFileName, IOType type, boolean checkIfExists) { + HoodieTimer timer = new HoodieTimer().startTimer(); + String markerFileName = getMarkerFileName(dataFileName, type); + + Map paramsMap = new HashMap<>(); + paramsMap.put(MARKER_DIR_PATH_PARAM, markerDirPath.toString()); + if (StringUtils.isNullOrEmpty(partitionPath)) { + paramsMap.put(MARKER_NAME_PARAM, markerFileName); + } else { + paramsMap.put(MARKER_NAME_PARAM, partitionPath + "/" + 
markerFileName); + } + + boolean success; + try { + success = executeRequestToTimelineServer( + CREATE_MARKER_URL, paramsMap, new TypeReference() { + }, RequestMethod.POST); + } catch (IOException e) { + throw new HoodieRemoteException("Failed to create marker file " + partitionPath + "/" + markerFileName, e); + } + LOG.info("[timeline-server-based] Created marker file " + partitionPath + "/" + markerFileName + + " in " + timer.endTimer() + " ms"); + if (success) { + return Option.of(new Path(FSUtils.getPartitionPath(markerDirPath, partitionPath), markerFileName)); + } else { + return Option.empty(); + } + } + + private T executeRequestToTimelineServer(String requestPath, Map queryParameters, + TypeReference reference, RequestMethod method) throws IOException { + URIBuilder builder = + new URIBuilder().setHost(timelineServerHost).setPort(timelineServerPort).setPath(requestPath).setScheme("http"); + + queryParameters.forEach(builder::addParameter); + + String url = builder.toString(); + LOG.info("Sending request : (" + url + ")"); + Response response; + int timeout = this.timeoutSecs * 1000; // msec + switch (method) { + case GET: + response = Request.Get(url).connectTimeout(timeout).socketTimeout(timeout).execute(); + break; + case POST: + default: + response = Request.Post(url).connectTimeout(timeout).socketTimeout(timeout).execute(); + break; + } + String content = response.returnContent().asString(); + return (T) mapper.readValue(content, reference); + } + + private enum RequestMethod { + GET, POST + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/WriteMarkers.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/WriteMarkers.java new file mode 100644 index 0000000000000..07428dd936469 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/WriteMarkers.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Set; + +/** + * Operates on markers for a given write action (commit, delta commit, compaction). + * + * This abstract class provides abstract methods of different marker operations, so that + * different marker write mechanism can be implemented. 
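+ *
+ * <p>Rough lifecycle from a writer's perspective (a sketch; {@code writeMarkers}, {@code context},
+ * {@code partitionPath}, {@code dataFileName} and {@code parallelism} are assumed):
+ * <pre>{@code
+ *   // while producing a data file
+ *   writeMarkers.createIfNotExists(partitionPath, dataFileName, IOType.MERGE);
+ *   // once the write action is finalized
+ *   writeMarkers.quietDeleteMarkerDir(context, parallelism);
+ * }</pre>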
+ */ +public abstract class WriteMarkers implements Serializable { + + private static final Logger LOG = LogManager.getLogger(WriteMarkers.class); + + protected final String basePath; + protected final transient Path markerDirPath; + protected final String instantTime; + + public WriteMarkers(String basePath, String markerFolderPath, String instantTime) { + this.basePath = basePath; + this.markerDirPath = new Path(markerFolderPath); + this.instantTime = instantTime; + } + + /** + * Creates a marker without checking if the marker already exists. + * + * @param partitionPath partition path in the table + * @param dataFileName data file name + * @param type write IO type + * @return the marker path + */ + public Option create(String partitionPath, String dataFileName, IOType type) { + return create(partitionPath, dataFileName, type, false); + } + + /** + * Creates a marker if the marker does not exist. + * + * @param partitionPath partition path in the table + * @param dataFileName data file name + * @param type write IO type + * @return the marker path or empty option if already exists + */ + public Option createIfNotExists(String partitionPath, String dataFileName, IOType type) { + return create(partitionPath, dataFileName, type, true); + } + + /** + * Quietly deletes the marker directory. + * + * @param context {@code HoodieEngineContext} instance. + * @param parallelism parallelism for deleting the marker files in the directory. + */ + public void quietDeleteMarkerDir(HoodieEngineContext context, int parallelism) { + try { + context.setJobStatus(this.getClass().getSimpleName(), "Deleting marker directory: " + basePath); + deleteMarkerDir(context, parallelism); + } catch (Exception e) { + LOG.warn("Error deleting marker directory for instant " + instantTime, e); + } + } + + /** + * Strips the marker file suffix from the input path, i.e., ".marker.[IO_type]". + * + * @param path file path + * @return Stripped path + */ + public static String stripMarkerSuffix(String path) { + return path.substring(0, path.indexOf(HoodieTableMetaClient.MARKER_EXTN)); + } + + /** + * Gets the marker file name, in the format of "[data_file_name].marker.[IO_type]". + * + * @param dataFileName data file name + * @param type IO type + * @return the marker file name + */ + protected String getMarkerFileName(String dataFileName, IOType type) { + return String.format("%s%s.%s", dataFileName, HoodieTableMetaClient.MARKER_EXTN, type.name()); + } + + /** + * Returns the marker path. Would create the partition path first if not exists + * + * @param partitionPath The partition path + * @param dataFileName The data file name + * @param type The IO type + * @return path of the marker file + */ + protected Path getMarkerPath(String partitionPath, String dataFileName, IOType type) { + Path path = FSUtils.getPartitionPath(markerDirPath, partitionPath); + String markerFileName = getMarkerFileName(dataFileName, type); + return new Path(path, markerFileName); + } + + /** + * Deletes the marker directory. + * + * @param context {@code HoodieEngineContext} instance. + * @param parallelism parallelism for deleting the marker files in the directory. + * @return {@true} if successful; {@false} otherwise. + */ + public abstract boolean deleteMarkerDir(HoodieEngineContext context, int parallelism); + + /** + * @return {@true} if the marker directory exists in the file system; {@false} otherwise. 
+ * @throws IOException + */ + public abstract boolean doesMarkerDirExist() throws IOException; + + /** + * @param context {@code HoodieEngineContext} instance. + * @param parallelism parallelism for reading the marker files in the directory. + * @return all the data file paths of write IO type "CREATE" and "MERGE" + * @throws IOException + */ + public abstract Set createdAndMergedDataPaths(HoodieEngineContext context, int parallelism) throws IOException; + + /** + * @return all the marker paths + * @throws IOException + */ + public abstract Set allMarkerFilePaths() throws IOException; + + /** + * Creates a marker. + * + * @param partitionPath partition path in the table + * @param dataFileName data file name + * @param type write IO type + * @param checkIfExists whether to check if the marker already exists + * @return the marker path or empty option if already exists and {@code checkIfExists} is true + */ + abstract Option create(String partitionPath, String dataFileName, IOType type, boolean checkIfExists); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/WriteMarkersFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/WriteMarkersFactory.java new file mode 100644 index 0000000000000..dfd55f2958125 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/marker/WriteMarkersFactory.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.StorageSchemes; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieTable; + +import com.esotericsoftware.minlog.Log; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +/** + * A factory to generate {@code WriteMarkers} instance based on the {@code MarkerType}. 
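+ *
+ * <p>Example (a minimal sketch; {@code table} and {@code instantTime} are assumed to be available):
+ * <pre>{@code
+ *   WriteMarkers writeMarkers = WriteMarkersFactory.get(MarkerType.TIMELINE_SERVER_BASED, table, instantTime);
+ *   // Falls back to DirectWriteMarkers when the embedded timeline server is disabled
+ *   // or when the table base path is on HDFS.
+ * }</pre>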
+ */ +public class WriteMarkersFactory { + private static final Logger LOG = LogManager.getLogger(WriteMarkersFactory.class); + + /** + * @param markerType the type of markers to use + * @param table {@code HoodieTable} instance + * @param instantTime current instant time + * @return {@code WriteMarkers} instance based on the {@code MarkerType} + */ + public static WriteMarkers get(MarkerType markerType, HoodieTable table, String instantTime) { + LOG.debug("Instantiated MarkerFiles with marker type: " + markerType.toString()); + switch (markerType) { + case DIRECT: + return new DirectWriteMarkers(table, instantTime); + case TIMELINE_SERVER_BASED: + if (!table.getConfig().isEmbeddedTimelineServerEnabled()) { + Log.warn("Timeline-server-based markers are configured as the marker type " + + "but embedded timeline server is not enabled. Falling back to direct markers."); + return new DirectWriteMarkers(table, instantTime); + } + String basePath = table.getMetaClient().getBasePath(); + if (StorageSchemes.HDFS.getScheme().equals( + FSUtils.getFs(basePath, table.getContext().getHadoopConf().newCopy()).getScheme())) { + Log.warn("Timeline-server-based markers are not supported for HDFS: " + + "base path " + basePath + ". Falling back to direct markers."); + return new DirectWriteMarkers(table, instantTime); + } + return new TimelineServerBasedWriteMarkers(table, instantTime); + default: + throw new HoodieException("The marker type \"" + markerType.name() + "\" is not supported."); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/repair/RepairUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/repair/RepairUtils.java new file mode 100644 index 0000000000000..5aa03a4bddf84 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/repair/RepairUtils.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.repair; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.DELTA_COMMIT_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION; + +/** + * Utils for table repair tool. + */ +public final class RepairUtils { + /** + * Tags the instant time of each base or log file from the input file paths. + * + * @param basePath Base path of the table. + * @param allPaths A {@link List} of file paths to tag. + * @return A {@link Map} of instant time in {@link String} to a {@link List} of relative file paths. + */ + public static Map> tagInstantsOfBaseAndLogFiles( + String basePath, List allPaths) { + // Instant time -> Set of base and log file paths + Map> instantToFilesMap = new HashMap<>(); + allPaths.forEach(path -> { + String instantTime = FSUtils.getCommitTime(path.getName()); + instantToFilesMap.computeIfAbsent(instantTime, k -> new ArrayList<>()); + instantToFilesMap.get(instantTime).add( + FSUtils.getRelativePartitionPath(new Path(basePath), path)); + }); + return instantToFilesMap; + } + + /** + * Gets the base and log file paths written for a given instant from the timeline. + * This reads the details of the instant metadata. + * + * @param timeline {@link HoodieTimeline} instance, can be active or archived timeline. + * @param instant Instant for lookup. + * @return A {@link Option} of {@link Set} of relative file paths to base path + * if the instant action is supported; empty {@link Option} otherwise. + * @throws IOException if reading instant details fail. 
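+ *
+ * <p>For example (a sketch; {@code timeline} and a completed {@code instant} are assumed):
+ * <pre>{@code
+ *   Option<Set<String>> committedPaths = RepairUtils.getBaseAndLogFilePathsFromTimeline(timeline, instant);
+ *   // committedPaths is empty for actions other than commit, deltacommit and replacecommit,
+ *   // and the call throws for an instant that is not completed.
+ * }</pre>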
+ */ + public static Option> getBaseAndLogFilePathsFromTimeline( + HoodieTimeline timeline, HoodieInstant instant) throws IOException { + if (!instant.isCompleted()) { + throw new HoodieException("Cannot get base and log file paths from " + + "instant not completed: " + instant.getTimestamp()); + } + + switch (instant.getAction()) { + case COMMIT_ACTION: + case DELTA_COMMIT_ACTION: + final HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes( + timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); + return Option.of(commitMetadata.getPartitionToWriteStats().values().stream().flatMap(List::stream) + .map(HoodieWriteStat::getPath).collect(Collectors.toSet())); + case REPLACE_COMMIT_ACTION: + final HoodieReplaceCommitMetadata replaceCommitMetadata = + HoodieReplaceCommitMetadata.fromBytes( + timeline.getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class); + return Option.of(replaceCommitMetadata.getPartitionToWriteStats().values().stream().flatMap(List::stream) + .map(HoodieWriteStat::getPath).collect(Collectors.toSet())); + default: + return Option.empty(); + } + } + + /** + * Finds the dangling files to remove for a given instant to repair. + * + * @param instantToRepair Instant timestamp to repair. + * @param baseAndLogFilesFromFs A {@link List} of base and log files based on the file system. + * @param activeTimeline {@link HoodieActiveTimeline} instance. + * @param archivedTimeline {@link HoodieArchivedTimeline} instance. + * @return A {@link List} of relative file paths to base path for removing. + */ + public static List findInstantFilesToRemove( + String instantToRepair, List baseAndLogFilesFromFs, + HoodieActiveTimeline activeTimeline, HoodieArchivedTimeline archivedTimeline) { + // Skips the instant if it is requested or inflight in active timeline + if (activeTimeline.filter(instant -> instant.getTimestamp().equals(instantToRepair) + && !instant.isCompleted()).getInstants().findAny().isPresent()) { + return Collections.emptyList(); + } + + try { + boolean doesInstantExist = false; + Option> filesFromTimeline = Option.empty(); + Option instantOption = activeTimeline.filterCompletedInstants().filter( + instant -> instant.getTimestamp().equals(instantToRepair)).firstInstant(); + if (instantOption.isPresent()) { + // Completed instant in active timeline + doesInstantExist = true; + filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline( + activeTimeline, instantOption.get()); + } else { + instantOption = archivedTimeline.filterCompletedInstants().filter( + instant -> instant.getTimestamp().equals(instantToRepair)).firstInstant(); + if (instantOption.isPresent()) { + // Completed instant in archived timeline + doesInstantExist = true; + filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline( + archivedTimeline, instantOption.get()); + } + } + + if (doesInstantExist) { + if (!filesFromTimeline.isPresent() || filesFromTimeline.get().isEmpty()) { + // Skips if no instant details + return Collections.emptyList(); + } + // Excludes committed base and log files from timeline + Set filesToRemove = new HashSet<>(baseAndLogFilesFromFs); + filesToRemove.removeAll(filesFromTimeline.get()); + return new ArrayList<>(filesToRemove); + } else { + // The instant does not exist in the whole timeline (neither completed nor requested/inflight), + // this means the files from this instant are dangling, which should be removed + return baseAndLogFilesFromFs; + } + } catch (IOException e) { + // In case of failure, does not remove any 
files for the instant + return Collections.emptyList(); + } + } + + /** + * Serializable path filter class for Spark job. + */ + public interface SerializablePathFilter extends PathFilter, Serializable { + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieConsistentBucketLayout.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieConsistentBucketLayout.java new file mode 100644 index 0000000000000..0ed2b9c939a7b --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieConsistentBucketLayout.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.storage; + +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.util.Set; + +/** + * Storage layout when using consistent hashing bucket index. + */ +public class HoodieConsistentBucketLayout extends HoodieStorageLayout { + public static final Set SUPPORTED_OPERATIONS = CollectionUtils.createImmutableSet( + WriteOperationType.INSERT, + WriteOperationType.INSERT_PREPPED, + WriteOperationType.UPSERT, + WriteOperationType.UPSERT_PREPPED, + WriteOperationType.INSERT_OVERWRITE, + WriteOperationType.DELETE, + WriteOperationType.COMPACT, + WriteOperationType.DELETE_PARTITION + ); + + public HoodieConsistentBucketLayout(HoodieWriteConfig config) { + super(config); + } + + /** + * Bucketing controls the number of file groups directly. + */ + @Override + public boolean determinesNumFileGroups() { + return true; + } + + /** + * Consistent hashing will tag all incoming records, so we could go ahead reusing an existing Partitioner + */ + @Override + public Option layoutPartitionerClass() { + return Option.empty(); + } + + @Override + public boolean writeOperationSupported(WriteOperationType operationType) { + return SUPPORTED_OPERATIONS.contains(operationType); + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieDefaultLayout.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieDefaultLayout.java new file mode 100644 index 0000000000000..28fe37c9b8fe0 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieDefaultLayout.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.storage; + +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; + +/** + * Default storage layout with non-constraints. + */ +public class HoodieDefaultLayout extends HoodieStorageLayout { + + public HoodieDefaultLayout(HoodieWriteConfig config) { + super(config); + } + + @Override + public boolean determinesNumFileGroups() { + return false; + } + + @Override + public Option layoutPartitionerClass() { + return Option.empty(); + } + + @Override + public boolean writeOperationSupported(WriteOperationType operationType) { + return true; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieLayoutFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieLayoutFactory.java new file mode 100644 index 0000000000000..e78c15b3a4b22 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieLayoutFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.storage; + +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieNotSupportedException; + +/** + * A factory to generate layout. 
+ */ +public final class HoodieLayoutFactory { + public static HoodieStorageLayout createLayout(HoodieWriteConfig config) { + switch (config.getLayoutType()) { + case DEFAULT: + return new HoodieDefaultLayout(config); + case BUCKET: + switch (config.getBucketIndexEngineType()) { + case SIMPLE: + return new HoodieSimpleBucketLayout(config); + case CONSISTENT_HASHING: + return new HoodieConsistentBucketLayout(config); + default: + throw new HoodieNotSupportedException("Unknown bucket index engine type: " + config.getBucketIndexEngineType()); + } + default: + throw new HoodieNotSupportedException("Unknown layout type, set " + config.getLayoutType()); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieSimpleBucketLayout.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieSimpleBucketLayout.java new file mode 100644 index 0000000000000..be048a23b058c --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieSimpleBucketLayout.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.storage; + +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieLayoutConfig; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.util.Set; + +/** + * Storage layout when using bucket index. Data distribution and files organization are in a specific way. + */ +public class HoodieSimpleBucketLayout extends HoodieStorageLayout { + + public static final Set SUPPORTED_OPERATIONS = CollectionUtils.createImmutableSet( + WriteOperationType.INSERT, + WriteOperationType.INSERT_PREPPED, + WriteOperationType.UPSERT, + WriteOperationType.UPSERT_PREPPED, + WriteOperationType.INSERT_OVERWRITE, + WriteOperationType.DELETE, + WriteOperationType.COMPACT, + WriteOperationType.DELETE_PARTITION + ); + + public HoodieSimpleBucketLayout(HoodieWriteConfig config) { + super(config); + } + + /** + * Bucketing controls the number of file groups directly. + */ + @Override + public boolean determinesNumFileGroups() { + return true; + } + + @Override + public Option layoutPartitionerClass() { + return config.contains(HoodieLayoutConfig.LAYOUT_PARTITIONER_CLASS_NAME) + ? 
Option.of(config.getString(HoodieLayoutConfig.LAYOUT_PARTITIONER_CLASS_NAME.key())) + : Option.empty(); + } + + @Override + public boolean writeOperationSupported(WriteOperationType operationType) { + return SUPPORTED_OPERATIONS.contains(operationType); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieStorageLayout.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieStorageLayout.java new file mode 100644 index 0000000000000..36be1a8bef6a8 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieStorageLayout.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.storage; + +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.io.Serializable; + +/** + * Storage layout defines how the files are organized within a table. + */ +public abstract class HoodieStorageLayout implements Serializable { + + protected final HoodieWriteConfig config; + + public HoodieStorageLayout(HoodieWriteConfig config) { + this.config = config; + } + + /** + * By default, layout does not directly control the total number of files. + */ + public abstract boolean determinesNumFileGroups(); + + /** + * Return the layout specific partitioner for writing data, if any. + */ + public abstract Option layoutPartitionerClass(); + + /** + * Determines if the operation is supported by the layout. + */ + public abstract boolean writeOperationSupported(WriteOperationType operationType); + + public enum LayoutType { + DEFAULT, BUCKET + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/AbstractUpgradeDowngrade.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/AbstractUpgradeDowngrade.java deleted file mode 100644 index cafb8167b6fc5..0000000000000 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/AbstractUpgradeDowngrade.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
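A rough, self-contained sketch of the dispatch performed by `HoodieLayoutFactory.createLayout` in this patch: the configured layout type picks the storage layout, and for bucketed tables the bucket index engine type picks between the simple and consistent-hashing variants. The local enums and string return values below are illustrative stand-ins, not the real Hudi config API.

```java
// Hypothetical stand-ins for the layout selection; only the dispatch logic is shown.
class LayoutDispatchSketch {
    enum LayoutType { DEFAULT, BUCKET }
    enum BucketIndexEngineType { SIMPLE, CONSISTENT_HASHING }

    static String selectLayout(LayoutType layoutType, BucketIndexEngineType engineType) {
        switch (layoutType) {
            case DEFAULT:
                return "HoodieDefaultLayout";            // no constraints on write operations
            case BUCKET:
                switch (engineType) {
                    case SIMPLE:
                        return "HoodieSimpleBucketLayout";
                    case CONSISTENT_HASHING:
                        return "HoodieConsistentBucketLayout";
                    default:
                        throw new IllegalArgumentException("Unknown bucket index engine: " + engineType);
                }
            default:
                throw new IllegalArgumentException("Unknown layout type: " + layoutType);
        }
    }

    public static void main(String[] args) {
        System.out.println(selectLayout(LayoutType.BUCKET, BucketIndexEngineType.CONSISTENT_HASHING));
    }
}
```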
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.table.HoodieTableConfig; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.HoodieTableVersion; -import org.apache.hudi.common.util.FileIOUtils; -import org.apache.hudi.config.HoodieWriteConfig; - -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.Date; -import java.util.Properties; - -/** - * Helper class to assist in upgrading/downgrading Hoodie when there is a version change. - */ -public abstract class AbstractUpgradeDowngrade { - - private static final Logger LOG = LogManager.getLogger(AbstractUpgradeDowngrade.class); - public static final String HOODIE_UPDATED_PROPERTY_FILE = "hoodie.properties.updated"; - - private HoodieTableMetaClient metaClient; - protected HoodieWriteConfig config; - protected HoodieEngineContext context; - private transient FileSystem fs; - private Path updatedPropsFilePath; - private Path propsFilePath; - - /** - * Perform Upgrade or Downgrade steps if required and updated table version if need be. - *
<p>
    - * Starting from version 0.6.0, this upgrade/downgrade step will be added in all write paths. - * - * Essentially, if a dataset was created using any pre 0.6.0(for eg 0.5.3), and Hoodie version was upgraded to 0.6.0, - * Hoodie table version gets bumped to 1 and there are some upgrade steps need to be executed before doing any writes. - * Similarly, if a dataset was created using Hoodie version 0.6.0 or Hoodie table version 1 and then hoodie was downgraded - * to pre 0.6.0 or to Hoodie table version 0, then some downgrade steps need to be executed before proceeding w/ any writes. - * - * On a high level, these are the steps performed - * - * Step1 : Understand current hoodie table version and table version from hoodie.properties file - * Step2 : Delete any left over .updated from previous upgrade/downgrade - * Step3 : If version are different, perform upgrade/downgrade. - * Step4 : Copy hoodie.properties -> hoodie.properties.updated with the version updated - * Step6 : Rename hoodie.properties.updated to hoodie.properties - *
<p>
    - * - * @param metaClient instance of {@link HoodieTableMetaClient} to use - * @param toVersion version to which upgrade or downgrade has to be done. - * @param config instance of {@link HoodieWriteConfig} to use. - * @param context instance of {@link HoodieEngineContext} to use. - * @param instantTime current instant time that should not be touched. - */ - public abstract void run(HoodieTableMetaClient metaClient, HoodieTableVersion toVersion, HoodieWriteConfig config, - HoodieEngineContext context, String instantTime); - - protected AbstractUpgradeDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context) { - this.metaClient = metaClient; - this.config = config; - this.context = context; - this.fs = metaClient.getFs(); - this.updatedPropsFilePath = new Path(metaClient.getMetaPath(), HOODIE_UPDATED_PROPERTY_FILE); - this.propsFilePath = new Path(metaClient.getMetaPath(), HoodieTableConfig.HOODIE_PROPERTIES_FILE); - } - - protected void run(HoodieTableVersion toVersion, String instantTime) throws IOException { - // Fetch version from property file and current version - HoodieTableVersion fromVersion = metaClient.getTableConfig().getTableVersion(); - if (toVersion.versionCode() == fromVersion.versionCode()) { - return; - } - - if (fs.exists(updatedPropsFilePath)) { - // this can be left over .updated file from a failed attempt before. Many cases exist here. - // a) We failed while writing the .updated file and it's content is partial (e.g hdfs) - // b) We failed without renaming the file to hoodie.properties. We will re-attempt everything now anyway - // c) rename() is not atomic in cloud stores. so hoodie.properties is fine, but we failed before deleting the .updated file - // All cases, it simply suffices to delete the file and proceed. - LOG.info("Deleting existing .updated file with content :" + FileIOUtils.readAsUTFString(fs.open(updatedPropsFilePath))); - fs.delete(updatedPropsFilePath, false); - } - - // Perform the actual upgrade/downgrade; this has to be idempotent, for now. - LOG.info("Attempting to move table from version " + fromVersion + " to " + toVersion); - if (fromVersion.versionCode() < toVersion.versionCode()) { - // upgrade - upgrade(fromVersion, toVersion, instantTime); - } else { - // downgrade - downgrade(fromVersion, toVersion, instantTime); - } - - // Write out the current version in hoodie.properties.updated file - metaClient.getTableConfig().setTableVersion(toVersion); - createUpdatedFile(metaClient.getTableConfig().getProperties()); - - // because for different fs the fs.rename have different action,such as: - // a) for hdfs : if propsFilePath already exist,fs.rename will not replace propsFilePath, but just return false - // b) for localfs: if propsFilePath already exist,fs.rename will replace propsFilePath, and return ture - // c) for aliyun ossfs: if propsFilePath already exist,will throw FileAlreadyExistsException - // so we should delete the old propsFilePath. also upgrade and downgrade is Idempotent - if (fs.exists(propsFilePath)) { - fs.delete(propsFilePath, false); - } - // Rename the .updated file to hoodie.properties. This is atomic in hdfs, but not in cloud stores. - // But as long as this does not leave a partial hoodie.properties file, we are okay. 
- fs.rename(updatedPropsFilePath, propsFilePath); - } - - private void createUpdatedFile(Properties props) throws IOException { - try (FSDataOutputStream outputStream = fs.create(updatedPropsFilePath)) { - props.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); - } - } - - protected abstract void upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime); - - protected abstract void downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime); -} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java index 27389d923bd6a..45bbd78c3fb36 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java @@ -18,9 +18,12 @@ package org.apache.hudi.table.upgrade; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.config.HoodieWriteConfig; +import java.util.Map; + /** * Interface to assist in downgrading Hoodie table. */ @@ -29,9 +32,13 @@ public interface DowngradeHandler { /** * to be invoked to downgrade hoodie table from one version to a lower version. * - * @param config instance of {@link HoodieWriteConfig} to be used. - * @param context instance of {@link HoodieEngineContext} to be used. - * @param instantTime current instant time that should not touched. + * @param config instance of {@link HoodieWriteConfig} to be used. + * @param context instance of {@link HoodieEngineContext} to be used. + * @param instantTime current instant time that should not touched. + * @param upgradeDowngradeHelper instance of {@link SupportsUpgradeDowngrade} to be used. + * @return Map of config properties and its values to be added to table properties. */ - void downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime); + Map downgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + SupportsUpgradeDowngrade upgradeDowngradeHelper); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FiveToFourDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FiveToFourDowngradeHandler.java new file mode 100644 index 0000000000000..51da9810f6a2f --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FiveToFourDowngradeHandler.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.util.HashMap; +import java.util.Map; + +public class FiveToFourDowngradeHandler implements DowngradeHandler { + + @Override + public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + return new HashMap<>(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToFiveUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToFiveUpgradeHandler.java new file mode 100644 index 0000000000000..4b1484ed3174b --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToFiveUpgradeHandler.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEPRECATED_DEFAULT_PARTITION_PATH; + +/** + * Upgrade handler to upgrade Hudi's table version from 4 to 5. + */ +public class FourToFiveUpgradeHandler implements UpgradeHandler { + + private static final Logger LOG = LogManager.getLogger(FourToFiveUpgradeHandler.class); + + @Override + public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + try { + FileSystem fs = new Path(config.getBasePath()).getFileSystem(context.getHadoopConf().get()); + if (!config.doSkipDefaultPartitionValidation() && fs.exists(new Path(config.getBasePath() + "/" + DEPRECATED_DEFAULT_PARTITION_PATH))) { + LOG.error(String.format("\"%s\" partition detected. From 0.12, we are changing the default partition in hudi to %s " + + " Please read and write back the data in \"%s\" partition in hudi to new partition path \"%s\". 
\"\n" + + " Sample spark command to use to re-write the data: \n\n" + + " val df = spark.read.format(\"hudi\").load(HUDI_TABLE_PATH).filter(col(\"PARTITION_PATH_COLUMN\") === \"%s\"); \t \n\n" + + " df.drop(\"_hoodie_commit_time\").drop(\"_hoodie_commit_seqno\").drop(\"_hoodie_record_key\")\"\n" + + " .drop(\"_hoodie_partition_path\").drop(\"_hoodie_file_name\").withColumn(PARTITION_PATH_COLUMN,\"%s\")\"\n" + + " .write.options(writeOptions).mode(Append).save(HUDI_TABLE_PATH);\t\n\"\n" + + " Please fix values for PARTITION_PATH_COLUMN, HUDI_TABLE_PATH and set all write configs in above command before running. " + + " Also do delete the records in old partition once above command succeeds. " + + " Sample spark command to delete old partition records: \n\n" + + " val df = spark.read.format(\"hudi\").load(HUDI_TABLE_PATH).filter(col(\"PARTITION_PATH_COLUMN\") === \"%s\"); \t \n\n" + + " df.write.option(\"hoodie.datasource.write.operation\",\"delete\").options(writeOptions).mode(Append).save(HUDI_TABLE_PATH);\t\n\"\n", + DEPRECATED_DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH, DEPRECATED_DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH, + DEPRECATED_DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH, DEPRECATED_DEFAULT_PARTITION_PATH)); + throw new HoodieException(String.format("Old deprecated \"%s\" partition found in hudi table. This needs a migration step before we can upgrade ", + DEPRECATED_DEFAULT_PARTITION_PATH)); + } + } catch (IOException e) { + LOG.error("Fetching file system instance failed", e); + throw new HoodieException("Fetching FileSystem instance failed ", e); + } + return new HashMap<>(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToThreeDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToThreeDowngradeHandler.java new file mode 100644 index 0000000000000..86a594af17c5e --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToThreeDowngradeHandler.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; + +import java.util.Collections; +import java.util.Map; + +/** + * DowngradeHandler to assist in downgrading {@link org.apache.hudi.table.HoodieTable} from version 4 to 3. 
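A simplified sketch of the guard `FourToFiveUpgradeHandler` adds above: before moving to table version 5, the upgrade refuses to proceed if the deprecated "default" partition directory is still present and validation has not been explicitly skipped. `java.nio` and the partition names below are illustrative assumptions; the real handler goes through Hadoop's `FileSystem` and Hudi's constants.

```java
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Hypothetical guard: block the upgrade while data still lives under the deprecated partition name.
class DefaultPartitionGuardSketch {
    static final String DEPRECATED_DEFAULT_PARTITION = "default";                 // pre-0.12 value (illustrative)
    static final String NEW_DEFAULT_PARTITION = "__HIVE_DEFAULT_PARTITION__";     // 0.12+ value (illustrative)

    static void checkBeforeUpgrade(String basePath, boolean skipValidation) {
        Path deprecatedDir = Paths.get(basePath, DEPRECATED_DEFAULT_PARTITION);
        if (!skipValidation && Files.exists(deprecatedDir)) {
            throw new IllegalStateException(
                "Deprecated \"" + DEPRECATED_DEFAULT_PARTITION + "\" partition found under " + basePath
                    + "; rewrite that data into \"" + NEW_DEFAULT_PARTITION + "\" before upgrading.");
        }
    }

    public static void main(String[] args) {
        checkBeforeUpgrade("/tmp/hudi_table", false); // passes when the directory is absent
    }
}
```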
+ */ +public class FourToThreeDowngradeHandler implements DowngradeHandler { + + @Override + public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + if (config.isMetadataTableEnabled()) { + // Metadata Table in version 4 has a schema that is not forward compatible. + // Hence, it is safe to delete the metadata table, which will be re-initialized in subsequent commit. + HoodieTableMetadataUtil.deleteMetadataTable(config.getBasePath(), context); + } + return Collections.emptyMap(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java new file mode 100644 index 0000000000000..dbf4d6159dcbd --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import java.util.Hashtable; +import java.util.Map; + +/** + * Upgrade handle to assist in upgrading hoodie table from version 1 to 2. + */ +public class OneToTwoUpgradeHandler implements UpgradeHandler { + + @Override + public Map upgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + SupportsUpgradeDowngrade upgradeDowngradeHelper) { + Map tablePropsToAdd = new Hashtable<>(); + tablePropsToAdd.put(HoodieTableConfig.PARTITION_FIELDS, upgradeDowngradeHelper.getPartitionColumns(config)); + tablePropsToAdd.put(HoodieTableConfig.RECORDKEY_FIELDS, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())); + tablePropsToAdd.put(HoodieTableConfig.BASE_FILE_FORMAT, config.getString(HoodieTableConfig.BASE_FILE_FORMAT)); + return tablePropsToAdd; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java new file mode 100644 index 0000000000000..14fe8e2b88713 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Downgrade handle to assist in downgrading hoodie table from version 1 to 0. + */ +public class OneToZeroDowngradeHandler implements DowngradeHandler { + + @Override + public Map downgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + SupportsUpgradeDowngrade upgradeDowngradeHelper) { + HoodieTable table = upgradeDowngradeHelper.getTable(config, context); + // fetch pending commit info + HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); + List commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList()); + for (HoodieInstant inflightInstant : commits) { + // delete existing markers + WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), table, inflightInstant.getTimestamp()); + writeMarkers.quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + } + return Collections.EMPTY_MAP; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/SupportsUpgradeDowngrade.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/SupportsUpgradeDowngrade.java new file mode 100644 index 0000000000000..a30396b63ea40 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/SupportsUpgradeDowngrade.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
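The reworked `UpgradeHandler`/`DowngradeHandler` contract in this patch has each step return a map of table properties instead of writing them itself, so the caller can merge and persist everything once. A small sketch of that shape, with plain `String` keys standing in for `ConfigProperty` and illustrative property names:

```java
import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch of the "return properties, persist later" contract.
class UpgradeStepSketch {
    interface Step {
        Map<String, String> apply(); // properties this hop wants added to hoodie.properties
    }

    public static void main(String[] args) {
        Step oneToTwo = () -> {
            Map<String, String> props = new HashMap<>();
            props.put("hoodie.table.partition.fields", "partition_path"); // illustrative key/value
            props.put("hoodie.table.recordkey.fields", "uuid");           // illustrative key/value
            return props;
        };

        Map<String, String> merged = new HashMap<>(oneToTwo.apply());
        // The caller (see UpgradeDowngrade further below) sets these on the table config
        // and rewrites hoodie.properties exactly once, after all hops have run.
        System.out.println(merged);
    }
}
```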
+ */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import java.io.Serializable; + +/** + * Interface for engine-specific logic needed for upgrade and downgrade actions. + */ +public interface SupportsUpgradeDowngrade extends Serializable { + /** + * @param config Write config. + * @param context {@link HoodieEngineContext} instance to use. + * @return A new Hudi table for upgrade and downgrade actions. + */ + HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context); + + /** + * @param config Write config. + * @return partition columns in String. + */ + String getPartitionColumns(HoodieWriteConfig config); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java new file mode 100644 index 0000000000000..4da675ea82004 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.MetadataPartitionType; + +import java.util.Hashtable; +import java.util.Map; + +import static org.apache.hudi.common.table.HoodieTableConfig.TABLE_CHECKSUM; +import static org.apache.hudi.common.table.HoodieTableConfig.TABLE_METADATA_PARTITIONS; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists; + +/** + * UpgradeHandler to assist in upgrading {@link org.apache.hudi.table.HoodieTable} from version 3 to 4. 
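`SupportsUpgradeDowngrade`, shown above, is a small strategy interface through which engine-specific pieces (table instantiation, partition-column lookup) are injected into the otherwise engine-agnostic upgrade/downgrade flow. A toy sketch of the same pattern, with hypothetical names rather than the real Hudi types:

```java
// Hypothetical illustration of the engine-hook pattern; not the real Hudi interfaces.
interface EngineHookSketch {
    String tableImplementation(); // stands in for getTable(config, context)
    String partitionColumns();    // stands in for getPartitionColumns(config)
}

class SparkEngineHookSketch implements EngineHookSketch {
    @Override
    public String tableImplementation() {
        return "HoodieSparkTable";
    }

    @Override
    public String partitionColumns() {
        return "region,event_date";
    }

    public static void main(String[] args) {
        EngineHookSketch hook = new SparkEngineHookSketch();
        System.out.println(hook.tableImplementation() + " partitioned by " + hook.partitionColumns());
    }
}
```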
+ */ +public class ThreeToFourUpgradeHandler implements UpgradeHandler { + + @Override + public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + Map tablePropsToAdd = new Hashtable<>(); + tablePropsToAdd.put(TABLE_CHECKSUM, String.valueOf(HoodieTableConfig.generateChecksum(config.getProps()))); + // if metadata is enabled and files partition exist then update TABLE_METADATA_INDEX_COMPLETED + // schema for the files partition is same between the two versions + if (config.isMetadataTableEnabled() && metadataPartitionExists(config.getBasePath(), context, MetadataPartitionType.FILES)) { + tablePropsToAdd.put(TABLE_METADATA_PARTITIONS, MetadataPartitionType.FILES.getPartitionPath()); + } + return tablePropsToAdd; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java new file mode 100644 index 0000000000000..4f209f05ffc9b --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; + +import java.util.Collections; +import java.util.Map; + +/** + * Downgrade handler to assist in downgrading hoodie table from version 3 to 2. + */ +public class ThreeToTwoDowngradeHandler implements DowngradeHandler { + + @Override + public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + if (config.isMetadataTableEnabled()) { + // Metadata Table in version 3 is synchronous and in version 2 is asynchronous. Downgrading to asynchronous + // removes the checks in code to decide whether to use a LogBlock or not. Also, the schema for the + // table has been updated and is not forward compatible. Hence, we need to delete the table. 
+ HoodieTableMetadataUtil.deleteMetadataTable(config.getBasePath(), context); + } + return Collections.emptyMap(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java new file mode 100644 index 0000000000000..de1a1067fe111 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.MarkerUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.DirectWriteMarkers; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.MarkerUtils.MARKERS_FILENAME_PREFIX; +/** + * Downgrade handler to assist in downgrading hoodie table from version 2 to 1. 
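The downgrade handler introduced next rewrites timeline-server-based markers back into direct markers. A compact sketch of that conversion, using `java.nio` in place of Hadoop's `FileSystem` and assuming the marker entries have already been read from the consolidated MARKERS files:

```java
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

// Hypothetical sketch: materialize one direct marker file per entry, then drop the consolidated files.
class MarkerConversionSketch {
    static void toDirectMarkers(Path markerDir, List<String> markerEntries) throws IOException {
        for (String relativeMarker : markerEntries) {
            Path marker = markerDir.resolve(relativeMarker);
            Files.createDirectories(marker.getParent()); // partition sub-directory under the marker dir
            if (!Files.exists(marker)) {
                Files.createFile(marker);                // empty file; its name alone carries the information
            }
        }
        // Remove MARKERS0, MARKERS1, ... and MARKERS.type so only direct markers remain.
        try (DirectoryStream<Path> consolidated = Files.newDirectoryStream(markerDir, "MARKERS*")) {
            for (Path leftover : consolidated) {
                Files.delete(leftover);
            }
        }
    }
}
```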
+ */ +public class TwoToOneDowngradeHandler implements DowngradeHandler { + + @Override + public Map downgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + SupportsUpgradeDowngrade upgradeDowngradeHelper) { + HoodieTable table = upgradeDowngradeHelper.getTable(config, context); + HoodieTableMetaClient metaClient = table.getMetaClient(); + + // re-create marker files if any partial timeline server based markers are found + HoodieTimeline inflightTimeline = metaClient.getCommitsTimeline().filterPendingExcludingCompaction(); + List commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList()); + for (HoodieInstant inflightInstant : commits) { + // Converts the markers in new format to old format of direct markers + try { + convertToDirectMarkers( + inflightInstant.getTimestamp(), table, context, config.getMarkersDeleteParallelism()); + } catch (IOException e) { + throw new HoodieException("Converting marker files to DIRECT style failed during downgrade", e); + } + } + return Collections.EMPTY_MAP; + } + + /** + * Converts the markers in new format(timeline server based) to old format of direct markers, + * i.e., one marker file per data file, without MARKERS.type file. + * This needs to be idempotent. + * 1. read all markers from timeline server based marker files + * 2. create direct style markers + * 3. delete marker type file + * 4. delete timeline server based marker files + * + * @param commitInstantTime instant of interest for marker conversion. + * @param table instance of {@link HoodieTable} to use + * @param context instance of {@link HoodieEngineContext} to use + * @param parallelism parallelism to use + */ + private void convertToDirectMarkers(final String commitInstantTime, + HoodieTable table, + HoodieEngineContext context, + int parallelism) throws IOException { + String markerDir = table.getMetaClient().getMarkerFolderPath(commitInstantTime); + FileSystem fileSystem = FSUtils.getFs(markerDir, context.getHadoopConf().newCopy()); + Option markerTypeOption = MarkerUtils.readMarkerType(fileSystem, markerDir); + if (markerTypeOption.isPresent()) { + switch (markerTypeOption.get()) { + case TIMELINE_SERVER_BASED: + // Reads all markers written by the timeline server + Map> markersMap = + MarkerUtils.readTimelineServerBasedMarkersFromFileSystem( + markerDir, fileSystem, context, parallelism); + DirectWriteMarkers directWriteMarkers = new DirectWriteMarkers(table, commitInstantTime); + // Recreates the markers in the direct format + markersMap.values().stream().flatMap(Collection::stream) + .forEach(directWriteMarkers::create); + // Deletes marker type file + MarkerUtils.deleteMarkerTypeFile(fileSystem, markerDir); + // Deletes timeline server based markers + deleteTimelineBasedMarkerFiles(context, markerDir, fileSystem, parallelism); + break; + default: + throw new HoodieException("The marker type \"" + markerTypeOption.get().name() + + "\" is not supported for rollback."); + } + } else { + if (fileSystem.exists(new Path(markerDir))) { + // In case of partial failures during downgrade, there is a chance that marker type file was deleted, + // but timeline server based marker files are left. So deletes them if any + deleteTimelineBasedMarkerFiles(context, markerDir, fileSystem, parallelism); + } + } + } + + private void deleteTimelineBasedMarkerFiles(HoodieEngineContext context, String markerDir, + FileSystem fileSystem, int parallelism) throws IOException { + // Deletes timeline based marker files if any. 
+ Predicate prefixFilter = fileStatus -> + fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX); + FSUtils.parallelizeSubPathProcess(context, fileSystem, new Path(markerDir), parallelism, + prefixFilter, pairOfSubPathAndConf -> + FSUtils.deleteSubPath(pairOfSubPathAndConf.getKey(), pairOfSubPathAndConf.getValue(), false)); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java new file mode 100644 index 0000000000000..8352ada1126e7 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; + +import java.util.Hashtable; +import java.util.Map; + +/** + * UpgradeHandler to assist in upgrading {@link org.apache.hudi.table.HoodieTable} from version 2 to 3. + */ +public class TwoToThreeUpgradeHandler implements UpgradeHandler { + public static final String SPARK_SIMPLE_KEY_GENERATOR = "org.apache.hudi.keygen.SimpleKeyGenerator"; + + @Override + public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + if (config.isMetadataTableEnabled()) { + // Metadata Table in version 2 is asynchronous and in version 3 is synchronous. Synchronous table will not + // sync any instants not already synced. So its simpler to re-bootstrap the table. Also, the schema for the + // table has been updated and is not backward compatible. 
+ HoodieTableMetadataUtil.deleteMetadataTable(config.getBasePath(), context); + } + Map tablePropsToAdd = new Hashtable<>(); + tablePropsToAdd.put(HoodieTableConfig.URL_ENCODE_PARTITIONING, config.getStringOrDefault(HoodieTableConfig.URL_ENCODE_PARTITIONING)); + tablePropsToAdd.put(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE, config.getStringOrDefault(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)); + String keyGenClassName = Option.ofNullable(config.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)) + .orElse(config.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME)); + if (keyGenClassName == null && config.getEngineType() == EngineType.SPARK) { + // For Spark, if the key generator class is not configured by user, + // set it to SimpleKeyGenerator as default + keyGenClassName = SPARK_SIMPLE_KEY_GENERATOR; + } + ValidationUtils.checkState(keyGenClassName != null, String.format("Missing config: %s or %s", + HoodieTableConfig.KEY_GENERATOR_CLASS_NAME, HoodieWriteConfig.KEYGENERATOR_CLASS_NAME)); + tablePropsToAdd.put(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME, keyGenClassName); + return tablePropsToAdd; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java new file mode 100644 index 0000000000000..246daf01b249f --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpgradeDowngradeException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Hashtable; +import java.util.Map; + +/** + * Helper class to assist in upgrading/downgrading Hoodie when there is a version change. 
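The helper class that follows walks table versions one hop at a time, letting each hop contribute properties that are merged and written to `hoodie.properties` once at the end. A self-contained sketch of that loop, with hypothetical per-hop steps and illustrative property keys:

```java
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Supplier;

// Hypothetical sketch of the one-version-at-a-time upgrade walk.
class VersionWalkSketch {
    public static void main(String[] args) {
        Map<Integer, Supplier<Map<String, String>>> upgradeFrom = new HashMap<>();
        upgradeFrom.put(2, () -> Collections.singletonMap(
            "hoodie.table.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator")); // 2 -> 3 (illustrative)
        upgradeFrom.put(3, () -> Collections.singletonMap(
            "hoodie.table.checksum", "123456789"));                                           // 3 -> 4 (illustrative)

        int fromVersion = 2;
        int toVersion = 4;
        Map<String, String> tableProps = new HashMap<>();
        while (fromVersion < toVersion) {
            tableProps.putAll(upgradeFrom.get(fromVersion).get()); // each hop returns props to add
            fromVersion++;                                         // advance exactly one version
        }
        // Persisted once, after all hops, alongside the new table version.
        System.out.println("Properties to write to hoodie.properties: " + tableProps);
    }
}
```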
+ */ +public class UpgradeDowngrade { + + private static final Logger LOG = LogManager.getLogger(UpgradeDowngrade.class); + public static final String HOODIE_UPDATED_PROPERTY_FILE = "hoodie.properties.updated"; + + private final SupportsUpgradeDowngrade upgradeDowngradeHelper; + private HoodieTableMetaClient metaClient; + protected HoodieWriteConfig config; + protected HoodieEngineContext context; + private transient FileSystem fs; + private Path updatedPropsFilePath; + private Path propsFilePath; + + public UpgradeDowngrade( + HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context, + SupportsUpgradeDowngrade upgradeDowngradeHelper) { + this.metaClient = metaClient; + this.config = config; + this.context = context; + this.fs = metaClient.getFs(); + this.updatedPropsFilePath = new Path(metaClient.getMetaPath(), HOODIE_UPDATED_PROPERTY_FILE); + this.propsFilePath = new Path(metaClient.getMetaPath(), HoodieTableConfig.HOODIE_PROPERTIES_FILE); + this.upgradeDowngradeHelper = upgradeDowngradeHelper; + } + + public boolean needsUpgradeOrDowngrade(HoodieTableVersion toVersion) { + HoodieTableVersion fromVersion = metaClient.getTableConfig().getTableVersion(); + // Ensure versions are same + return toVersion.versionCode() != fromVersion.versionCode(); + } + + /** + * Perform Upgrade or Downgrade steps if required and updated table version if need be. + *
<p>
    + * Starting from version 0.6.0, this upgrade/downgrade step will be added in all write paths. + *
<p>
    + * Essentially, if a dataset was created using an previous table version in an older release, + * and Hoodie version was upgraded to a new release with new table version supported, + * Hoodie table version gets bumped to the new version and there are some upgrade steps need + * to be executed before doing any writes. + *
<p>
    + * Similarly, if a dataset was created using an newer table version in an newer release, + * and then hoodie was downgraded to an older release or to older Hoodie table version, + * then some downgrade steps need to be executed before proceeding w/ any writes. + *
<p>
    + * Below shows the table version corresponding to the Hudi release: + * Hudi release -> table version + * pre 0.6.0 -> v0 + * 0.6.0 to 0.8.0 -> v1 + * 0.9.0 -> v2 + * 0.10.0 to current -> v3 + *
<p>
    + * On a high level, these are the steps performed + *
<p>
    + * Step1 : Understand current hoodie table version and table version from hoodie.properties file + * Step2 : Delete any left over .updated from previous upgrade/downgrade + * Step3 : If version are different, perform upgrade/downgrade. + * Step4 : Copy hoodie.properties -> hoodie.properties.updated with the version updated + * Step6 : Rename hoodie.properties.updated to hoodie.properties + *
<p>
    + * + * @param toVersion version to which upgrade or downgrade has to be done. + * @param instantTime current instant time that should not be touched. + */ + public void run(HoodieTableVersion toVersion, String instantTime) { + // Fetch version from property file and current version + HoodieTableVersion fromVersion = metaClient.getTableConfig().getTableVersion(); + if (!needsUpgradeOrDowngrade(toVersion)) { + return; + } + + // Perform the actual upgrade/downgrade; this has to be idempotent, for now. + LOG.info("Attempting to move table from version " + fromVersion + " to " + toVersion); + Map tableProps = new Hashtable<>(); + if (fromVersion.versionCode() < toVersion.versionCode()) { + // upgrade + while (fromVersion.versionCode() < toVersion.versionCode()) { + HoodieTableVersion nextVersion = HoodieTableVersion.versionFromCode(fromVersion.versionCode() + 1); + tableProps.putAll(upgrade(fromVersion, nextVersion, instantTime)); + fromVersion = nextVersion; + } + } else { + // downgrade + while (fromVersion.versionCode() > toVersion.versionCode()) { + HoodieTableVersion prevVersion = HoodieTableVersion.versionFromCode(fromVersion.versionCode() - 1); + tableProps.putAll(downgrade(fromVersion, prevVersion, instantTime)); + fromVersion = prevVersion; + } + } + + // Write out the current version in hoodie.properties.updated file + for (Map.Entry entry : tableProps.entrySet()) { + metaClient.getTableConfig().setValue(entry.getKey(), entry.getValue()); + } + metaClient.getTableConfig().setTableVersion(toVersion); + + HoodieTableConfig.update(metaClient.getFs(), new Path(metaClient.getMetaPath()), metaClient.getTableConfig().getProps()); + } + + protected Map upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { + if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) { + return new ZeroToOneUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.TWO) { + return new OneToTwoUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.TWO && toVersion == HoodieTableVersion.THREE) { + return new TwoToThreeUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.THREE && toVersion == HoodieTableVersion.FOUR) { + return new ThreeToFourUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.FOUR && toVersion == HoodieTableVersion.FIVE) { + return new FourToFiveUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else { + throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true); + } + } + + protected Map downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { + if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.ZERO) { + return new OneToZeroDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.TWO && toVersion == HoodieTableVersion.ONE) { + return new TwoToOneDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.THREE && toVersion == HoodieTableVersion.TWO) { + return new ThreeToTwoDowngradeHandler().downgrade(config, context, 
instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.FOUR && toVersion == HoodieTableVersion.THREE) { + return new FourToThreeDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.FIVE && toVersion == HoodieTableVersion.FOUR) { + return new FiveToFourDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else { + throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java index fdf1261b31201..147aa4d8ab2dd 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java @@ -18,9 +18,12 @@ package org.apache.hudi.table.upgrade; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.config.HoodieWriteConfig; +import java.util.Map; + /** * Interface to assist in upgrading Hoodie table. */ @@ -29,9 +32,13 @@ public interface UpgradeHandler { /** * to be invoked to upgrade hoodie table from one version to a higher version. * - * @param config instance of {@link HoodieWriteConfig} to be used. - * @param context instance of {@link HoodieEngineContext} to be used. - * @param instantTime current instant time that should not be touched. + * @param config instance of {@link HoodieWriteConfig} to be used. + * @param context instance of {@link HoodieEngineContext} to be used. + * @param instantTime current instant time that should not be touched. + * @param upgradeDowngradeHelper instance of {@link SupportsUpgradeDowngrade} to be used. + * @return Map of config properties and its values to be added to table properties. */ - void upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime); + Map upgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + SupportsUpgradeDowngrade upgradeDowngradeHelper); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java new file mode 100644 index 0000000000000..95f22bba27d5f --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieRollbackException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.rollback.BaseRollbackHelper; +import org.apache.hudi.table.action.rollback.ListingBasedRollbackStrategy; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Upgrade handle to assist in upgrading hoodie table from version 0 to 1. + */ +public class ZeroToOneUpgradeHandler implements UpgradeHandler { + + @Override + public Map upgrade( + HoodieWriteConfig config, HoodieEngineContext context, String instantTime, + SupportsUpgradeDowngrade upgradeDowngradeHelper) { + // fetch pending commit info + HoodieTable table = upgradeDowngradeHelper.getTable(config, context); + HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); + List commits = inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + if (commits.size() > 0 && instantTime != null) { + // ignore the latest inflight commit since a new commit would have been started and we need to fix any pending commits from previous launch + commits.remove(instantTime); + } + for (String commit : commits) { + // for every pending commit, delete old markers and re-create markers in new format + recreateMarkers(commit, table, context, config.getMarkersDeleteParallelism()); + } + return Collections.EMPTY_MAP; + } + + /** + * Recreate markers in new format. + * Step1: Delete existing markers + * Step2: Collect all rollback file info. + * Step3: recreate markers for all interested files. + * + * @param commitInstantTime instant of interest for which markers need to be recreated. + * @param table instance of {@link HoodieTable} to use + * @param context instance of {@link HoodieEngineContext} to use + * @throws HoodieRollbackException on any exception during upgrade. 
+ */ + protected void recreateMarkers(final String commitInstantTime, + HoodieTable table, + HoodieEngineContext context, + int parallelism) throws HoodieRollbackException { + try { + // fetch hoodie instant + Option commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants() + .filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime)) + .findFirst()); + if (commitInstantOpt.isPresent()) { + // delete existing markers + WriteMarkers writeMarkers = WriteMarkersFactory.get(MarkerType.DIRECT, table, commitInstantTime); + writeMarkers.quietDeleteMarkerDir(context, parallelism); + + // generate rollback stats + List rollbackStats = getListBasedRollBackStats(table, context, commitInstantOpt); + + // recreate markers adhering to marker based rollback + for (HoodieRollbackStat rollbackStat : rollbackStats) { + for (String path : rollbackStat.getSuccessDeleteFiles()) { + String dataFileName = path.substring(path.lastIndexOf("/") + 1); + // not feasible to differentiate MERGE from CREATE. hence creating with MERGE IOType for all base files. + writeMarkers.create(rollbackStat.getPartitionPath(), dataFileName, IOType.MERGE); + } + for (FileStatus fileStatus : rollbackStat.getCommandBlocksCount().keySet()) { + writeMarkers.create(rollbackStat.getPartitionPath(), getFileNameForMarkerFromLogFile(fileStatus.getPath().toString(), table), IOType.APPEND); + } + } + } + } catch (Exception e) { + throw new HoodieRollbackException("Exception thrown while upgrading Hoodie Table from version 0 to 1", e); + } + } + + List getListBasedRollBackStats(HoodieTable table, HoodieEngineContext context, Option commitInstantOpt) { + List hoodieRollbackRequests = + new ListingBasedRollbackStrategy(table, context, table.getConfig(), commitInstantOpt.get().getTimestamp()) + .getRollbackRequests(commitInstantOpt.get()); + return new BaseRollbackHelper(table.getMetaClient(), table.getConfig()) + .collectRollbackStats(context, commitInstantOpt.get(), hoodieRollbackRequests); + } + + /** + * Curates file name for marker from existing log file path. + * log file format : partitionpath/.fileid_baseInstant.log.writetoken + * marker file format : partitionpath/fileId_writetoken_baseinstant.basefileExtn.marker.APPEND + * + * @param logFilePath log file path for which marker file name needs to be generated. + * @param table {@link HoodieTable} instance to use + * @return the marker file name thus curated. + */ + private static String getFileNameForMarkerFromLogFile(String logFilePath, HoodieTable table) { + Path logPath = new Path(table.getMetaClient().getBasePath(), logFilePath); + String fileId = FSUtils.getFileIdFromLogPath(logPath); + String baseInstant = FSUtils.getBaseCommitTimeFromLogPath(logPath); + String writeToken = FSUtils.getWriteTokenFromLogPath(logPath); + + return FSUtils.makeBaseFileName(baseInstant, writeToken, fileId, table.getBaseFileFormat().getFileExtension()); + } +} diff --git a/hudi-client/hudi-client-common/src/main/resources/log4j.properties b/hudi-client/hudi-client-common/src/main/resources/log4j.properties deleted file mode 100644 index ff268faf6363c..0000000000000 --- a/hudi-client/hudi-client-common/src/main/resources/log4j.properties +++ /dev/null @@ -1,23 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=INFO, A1 -# A1 is set to be a ConsoleAppender. -log4j.appender.A1=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.A1.layout=org.apache.log4j.PatternLayout -log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/DummyTaskContextSupplier.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/DummyTaskContextSupplier.java index 022170567fa35..d2c07e35509c1 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/DummyTaskContextSupplier.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/DummyTaskContextSupplier.java @@ -18,8 +18,8 @@ package org.apache.hudi; -import org.apache.hudi.client.common.EngineProperty; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.util.Option; import java.util.function.Supplier; diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/HoodieTestCommitGenerator.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/HoodieTestCommitGenerator.java new file mode 100644 index 0000000000000..3146c9d6b4928 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/HoodieTestCommitGenerator.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +import static org.apache.hudi.common.table.log.HoodieLogFormat.DEFAULT_WRITE_TOKEN; + +public class HoodieTestCommitGenerator { + public static final String BASE_FILE_WRITE_TOKEN = "1-0-1"; + public static final String LOG_FILE_WRITE_TOKEN = DEFAULT_WRITE_TOKEN; + private static final Logger LOG = LogManager.getLogger(HoodieTestCommitGenerator.class); + + public static void initCommitInfoForRepairTests( + Map>> baseFileInfo, + Map>> logFileInfo) { + baseFileInfo.clear(); + logFileInfo.clear(); + baseFileInfo.put("000", CollectionUtils.createImmutableList( + new ImmutablePair<>("2022/01/01", UUID.randomUUID().toString()), + new ImmutablePair<>("2022/01/02", UUID.randomUUID().toString()), + new ImmutablePair<>("2022/01/03", UUID.randomUUID().toString()) + )); + baseFileInfo.put("001", CollectionUtils.createImmutableList( + new ImmutablePair<>("2022/01/04", UUID.randomUUID().toString()), + new ImmutablePair<>("2022/01/05", UUID.randomUUID().toString()) + )); + baseFileInfo.put("002", CollectionUtils.createImmutableList( + new ImmutablePair<>("2022/01/06", UUID.randomUUID().toString()) + )); + logFileInfo.put("001", CollectionUtils.createImmutableList( + new ImmutablePair<>("2022/01/03", UUID.randomUUID().toString()), + new ImmutablePair<>("2022/01/06", UUID.randomUUID().toString()) + )); + } + + public static void setupTimelineInFS( + String basePath, + Map>> baseFileInfo, + Map>> logFileInfo, + Map>>> instantInfoMap) throws IOException { + instantInfoMap.clear(); + for (String instantTime : baseFileInfo.keySet()) { + Map>> partitionPathToFileIdAndNameMap = new HashMap<>(); + baseFileInfo.getOrDefault(instantTime, new ArrayList<>()) + .forEach(e -> { + List> fileInfoList = partitionPathToFileIdAndNameMap + .computeIfAbsent(e.getKey(), k -> new ArrayList<>()); + String fileId = e.getValue(); + fileInfoList.add(new ImmutablePair<>(fileId, getBaseFilename(instantTime, fileId))); + }); + logFileInfo.getOrDefault(instantTime, new ArrayList<>()) + .forEach(e -> { + List> fileInfoList = partitionPathToFileIdAndNameMap + .computeIfAbsent(e.getKey(), k -> new ArrayList<>()); + String fileId = e.getValue(); + fileInfoList.add(new ImmutablePair<>(fileId, getLogFilename(instantTime, fileId))); + }); + createCommitAndDataFiles(basePath, instantTime, partitionPathToFileIdAndNameMap); + instantInfoMap.put(instantTime, partitionPathToFileIdAndNameMap); + } + } + + public static String getBaseFilename(String instantTime, String fileId) { + return FSUtils.makeBaseFileName(instantTime, BASE_FILE_WRITE_TOKEN, fileId); 
+ } + + public static String getLogFilename(String instantTime, String fileId) { + return FSUtils.makeLogFileName( + fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), instantTime, 1, LOG_FILE_WRITE_TOKEN); + } + + public static void createCommitAndDataFiles( + String basePath, String instantTime, + Map>> partitionPathToFileIdAndNameMap) throws IOException { + String commitFilename = HoodieTimeline.makeCommitFileName(instantTime); + HoodieCommitMetadata commitMetadata = + generateCommitMetadata(partitionPathToFileIdAndNameMap, Collections.emptyMap()); + String content = commitMetadata.toJsonString(); + createCommitFileWithMetadata(basePath, new Configuration(), commitFilename, content); + for (String partitionPath : partitionPathToFileIdAndNameMap.keySet()) { + partitionPathToFileIdAndNameMap.get(partitionPath) + .forEach(fileInfo -> { + String filename = fileInfo.getValue(); + try { + createDataFile(basePath, new Configuration(), partitionPath, filename); + } catch (IOException e) { + LOG.error(String.format("Failed to create data file: %s/%s/%s", + basePath, partitionPath, filename)); + } + }); + } + } + + public static HoodieCommitMetadata generateCommitMetadata( + Map>> partitionPathToFileIdAndNameMap, + Map extraMetadata) { + HoodieCommitMetadata metadata = new HoodieCommitMetadata(); + for (Map.Entry entry : extraMetadata.entrySet()) { + metadata.addMetadata(entry.getKey(), entry.getValue()); + } + partitionPathToFileIdAndNameMap.forEach((partitionPath, fileInfoList) -> + fileInfoList.forEach(fileInfo -> { + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setPartitionPath(partitionPath); + writeStat.setPath(new Path(partitionPath, fileInfo.getValue()).toString()); + writeStat.setFileId(fileInfo.getKey()); + // Below are dummy values + writeStat.setTotalWriteBytes(10000); + writeStat.setPrevCommit("000"); + writeStat.setNumWrites(10); + writeStat.setNumUpdateWrites(15); + writeStat.setTotalLogBlocks(2); + writeStat.setTotalLogRecords(100); + metadata.addWriteStat(partitionPath, writeStat); + })); + return metadata; + } + + public static void createCommitFileWithMetadata( + String basePath, Configuration configuration, + String filename, String content) throws IOException { + Path commitFilePath = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + filename); + try (FSDataOutputStream os = FSUtils.getFs(basePath, configuration).create(commitFilePath, true)) { + os.writeBytes(new String(content.getBytes(StandardCharsets.UTF_8))); + } + } + + public static void createDataFile( + String basePath, Configuration configuration, + String partitionPath, String filename) throws IOException { + FileSystem fs = FSUtils.getFs(basePath, configuration); + Path filePath = new Path(new Path(basePath, partitionPath), filename); + Path parent = filePath.getParent(); + if (!fs.exists(parent)) { + fs.mkdirs(parent); + } + if (!fs.exists(filePath)) { + fs.create(filePath); + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestAsyncArchiveService.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestAsyncArchiveService.java new file mode 100644 index 0000000000000..9dad8b8020a1f --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestAsyncArchiveService.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.concurrent.ExecutionException; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TestAsyncArchiveService { + + @Mock + BaseHoodieWriteClient writeClient; + @Mock + HoodieWriteConfig config; + + @Test + void startAsyncArchiveReturnsNullWhenAutoArchiveDisabled() { + when(config.isAutoArchive()).thenReturn(false); + when(writeClient.getConfig()).thenReturn(config); + assertNull(AsyncArchiveService.startAsyncArchiveIfEnabled(writeClient)); + } + + @Test + void startAsyncArchiveReturnsNullWhenAsyncArchiveDisabled() { + when(config.isAutoArchive()).thenReturn(true); + when(config.isAsyncArchive()).thenReturn(false); + when(writeClient.getConfig()).thenReturn(config); + assertNull(AsyncArchiveService.startAsyncArchiveIfEnabled(writeClient)); + } + + @Test + void startAsyncArchiveIfEnabled() { + when(config.isAutoArchive()).thenReturn(true); + when(config.isAsyncArchive()).thenReturn(true); + when(writeClient.getConfig()).thenReturn(config); + assertNotNull(AsyncArchiveService.startAsyncArchiveIfEnabled(writeClient)); + } + + @Test + void startServiceShouldInvokeCallArchiveMethod() throws ExecutionException, InterruptedException { + AsyncArchiveService service = new AsyncArchiveService(writeClient); + assertEquals(true, service.startService().getLeft().get()); + verify(writeClient).archive(); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestHoodieAsyncTableService.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestHoodieAsyncTableService.java new file mode 100644 index 0000000000000..0c19576d042bf --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestHoodieAsyncTableService.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.async; + +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TestHoodieAsyncTableService { + + @Test + void tableServiceShouldNotStartIfDisabled(@Mock HoodieWriteConfig config) { + when(config.areTableServicesEnabled()).thenReturn(false); + HoodieAsyncTableService service = new DummyAsyncTableService(config); + service.start(null); + assertFalse(service.isStarted()); + } + + private static class DummyAsyncTableService extends HoodieAsyncTableService { + + protected DummyAsyncTableService(HoodieWriteConfig writeConfig) { + super(writeConfig); + } + + @Override + protected Pair startService() { + return null; + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroParquetWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroParquetWriter.java new file mode 100644 index 0000000000000..df879dc816399 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroParquetWriter.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.avro; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.DummyTaskContextSupplier; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.io.storage.HoodieAvroParquetWriter; +import org.apache.hudi.io.storage.HoodieParquetConfig; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieAvroParquetWriter { + + @TempDir java.nio.file.Path tmpDir; + + @Test + public void testProperWriting() throws IOException { + Configuration hadoopConf = new Configuration(); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(0xDEED); + List records = dataGen.generateGenericRecords(10); + + Schema schema = records.get(0).getSchema(); + + BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, 10000, + BloomFilterTypeCode.DYNAMIC_V0.name()); + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), + schema, Option.of(filter)); + + HoodieParquetConfig parquetConfig = + new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, ParquetWriter.DEFAULT_BLOCK_SIZE, + ParquetWriter.DEFAULT_PAGE_SIZE, 1024 * 1024 * 1024, hadoopConf, 0.1); + + Path filePath = new Path(tmpDir.resolve("test.parquet").toAbsolutePath().toString()); + + try (HoodieAvroParquetWriter writer = + new HoodieAvroParquetWriter<>(filePath, parquetConfig, "001", new DummyTaskContextSupplier(), true)) { + for (GenericRecord record : records) { + writer.writeAvro((String) record.get("_row_key"), record); + } + } + + ParquetUtils utils = new ParquetUtils(); + + // Step 1: Make sure records are written appropriately + List readRecords = utils.readAvroRecords(hadoopConf, filePath); + + assertEquals(toJson(records), toJson(readRecords)); + + // Step 2: Assert Parquet metadata was written appropriately + List recordKeys = records.stream().map(r -> (String) r.get("_row_key")).collect(Collectors.toList()); + + String minKey = recordKeys.stream().min(Comparator.naturalOrder()).get(); + String maxKey = recordKeys.stream().max(Comparator.naturalOrder()).get(); + + FileMetaData parquetMetadata = ParquetUtils.readMetadata(hadoopConf, filePath).getFileMetaData(); + + Map extraMetadata = parquetMetadata.getKeyValueMetaData(); + + assertEquals(extraMetadata.get(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER), minKey); + assertEquals(extraMetadata.get(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER), maxKey); + assertEquals(extraMetadata.get(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE), BloomFilterTypeCode.DYNAMIC_V0.name()); + + // Step 3: Make sure Bloom 
Filter contains all the record keys + BloomFilter bloomFilter = utils.readBloomFilterFromMetadata(hadoopConf, filePath); + recordKeys.forEach(recordKey -> { + assertTrue(bloomFilter.mightContain(recordKey)); + }); + } + + private static List toJson(List records) { + return records.stream().map(r -> { + try { + return new String(HoodieAvroUtils.avroToJson(r, true)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/callback/http/TestCallbackHttpClient.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/callback/http/TestCallbackHttpClient.java index 616dc31734b79..49b948dd8c0dc 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/callback/http/TestCallbackHttpClient.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/callback/http/TestCallbackHttpClient.java @@ -18,14 +18,18 @@ package org.apache.hudi.callback.http; +import org.apache.hudi.callback.client.http.HoodieWriteCommitHttpCallbackClient; + import org.apache.http.StatusLine; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.hudi.callback.client.http.HoodieWriteCommitHttpCallbackClient; -import org.apache.log4j.AppenderSkeleton; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.spi.LoggingEvent; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.Appender; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.ArgumentCaptor; @@ -34,11 +38,13 @@ import org.mockito.junit.jupiter.MockitoExtension; import java.io.IOException; +import java.util.UUID; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.reset; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -49,10 +55,10 @@ public class TestCallbackHttpClient { @Mock - AppenderSkeleton appender; + Appender appender; @Captor - ArgumentCaptor logCaptor; + ArgumentCaptor logCaptor; @Mock CloseableHttpClient httpClient; @@ -63,6 +69,27 @@ public class TestCallbackHttpClient { @Mock StatusLine statusLine; + private Level initialLogLevel; + + @BeforeEach + void prepareAppender() { + when(appender.getName()).thenReturn("MockAppender-" + UUID.randomUUID()); + when(appender.isStarted()).thenReturn(true); + when(appender.isStopped()).thenReturn(false); + Logger logger = (Logger) LogManager.getLogger(HoodieWriteCommitHttpCallbackClient.class); + initialLogLevel = logger.getLevel(); + logger.setLevel(Level.DEBUG); + logger.addAppender(appender); + } + + @AfterEach + void resetMocks() { + Logger logger = (Logger) LogManager.getLogger(HoodieWriteCommitHttpCallbackClient.class); + logger.setLevel(initialLogLevel); + logger.removeAppender(appender); + reset(appender, httpClient, httpResponse, statusLine); + } + private void mockResponse(int statusCode) { when(statusLine.getStatusCode()).thenReturn(statusCode); when(httpResponse.getStatusLine()).thenReturn(statusLine); 
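// A minimal, self-contained sketch of the Log4j2 appender-capture pattern the rewritten test above relies on:
// a Mockito-mocked org.apache.logging.log4j.core.Appender is attached to the core Logger and the emitted
// LogEvent is captured, replacing the log4j 1.x AppenderSkeleton/LoggingEvent approach. The class name,
// logger name, and log message below are illustrative assumptions, not taken from this patch.
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.core.Appender;
import org.apache.logging.log4j.core.LogEvent;
import org.apache.logging.log4j.core.Logger;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

@ExtendWith(MockitoExtension.class)
class Log4j2AppenderCaptureSketch {

  @Mock
  Appender appender;

  @Test
  void capturesWarningEvent() {
    // Log4j2 only delivers events to appenders that report a name and a started state.
    when(appender.getName()).thenReturn("MockAppender");
    when(appender.isStarted()).thenReturn(true);
    Logger logger = (Logger) LogManager.getLogger("sketch.logger");
    logger.setLevel(Level.WARN);
    logger.addAppender(appender);

    // The code under test would log here; a direct call keeps the sketch self-contained.
    logger.warn("Failed to send callback.");

    ArgumentCaptor<LogEvent> logCaptor = ArgumentCaptor.forClass(LogEvent.class);
    verify(appender).append(logCaptor.capture());
    assertEquals(Level.WARN, logCaptor.getValue().getLevel());
    assertEquals("Failed to send callback.", logCaptor.getValue().getMessage().getFormattedMessage());

    // Detach so the mocked appender does not leak into other tests.
    logger.removeAppender(appender);
  }
}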
@@ -75,21 +102,19 @@ private void mockResponse(int statusCode) { @Test public void sendPayloadShouldLogWhenRequestFailed() throws IOException { - Logger.getRootLogger().addAppender(appender); when(httpClient.execute(any())).thenThrow(IOException.class); HoodieWriteCommitHttpCallbackClient hoodieWriteCommitCallBackHttpClient = new HoodieWriteCommitHttpCallbackClient("fake_api_key", "fake_url", httpClient); hoodieWriteCommitCallBackHttpClient.send("{}"); - verify(appender).doAppend(logCaptor.capture()); - assertEquals("Failed to send callback.", logCaptor.getValue().getRenderedMessage()); + verify(appender).append(logCaptor.capture()); + assertEquals("Failed to send callback.", logCaptor.getValue().getMessage().getFormattedMessage()); assertEquals(Level.WARN, logCaptor.getValue().getLevel()); } @Test public void sendPayloadShouldLogUnsuccessfulSending() { - Logger.getRootLogger().addAppender(appender); mockResponse(401); when(httpResponse.toString()).thenReturn("unauthorized"); @@ -97,22 +122,21 @@ public void sendPayloadShouldLogUnsuccessfulSending() { new HoodieWriteCommitHttpCallbackClient("fake_api_key", "fake_url", httpClient); hoodieWriteCommitCallBackHttpClient.send("{}"); - verify(appender).doAppend(logCaptor.capture()); - assertEquals("Failed to send callback message. Response was unauthorized", logCaptor.getValue().getRenderedMessage()); + verify(appender).append(logCaptor.capture()); + assertEquals("Failed to send callback message. Response was unauthorized", logCaptor.getValue().getMessage().getFormattedMessage()); assertEquals(Level.WARN, logCaptor.getValue().getLevel()); } @Test public void sendPayloadShouldLogSuccessfulSending() { - Logger.getRootLogger().addAppender(appender); mockResponse(202); HoodieWriteCommitHttpCallbackClient hoodieWriteCommitCallBackHttpClient = new HoodieWriteCommitHttpCallbackClient("fake_api_key", "fake_url", httpClient); hoodieWriteCommitCallBackHttpClient.send("{}"); - verify(appender).doAppend(logCaptor.capture()); - assertTrue(logCaptor.getValue().getRenderedMessage().startsWith("Sent Callback data")); + verify(appender).append(logCaptor.capture()); + assertTrue(logCaptor.getValue().getMessage().getFormattedMessage().startsWith("Sent Callback data")); assertEquals(Level.INFO, logCaptor.getValue().getLevel()); } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/heartbeat/TestHoodieHeartbeatClient.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/heartbeat/TestHoodieHeartbeatClient.java new file mode 100644 index 0000000000000..88fe28edb4e7c --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/heartbeat/TestHoodieHeartbeatClient.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.heartbeat; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieHeartbeatClient extends HoodieCommonTestHarness { + + private static String instantTime1 = "100"; + private static String instantTime2 = "101"; + private static Long heartBeatInterval = 1000L; + private static int numTolerableMisses = 1; + + @BeforeEach + public void init() throws IOException { + initMetaClient(); + } + + @Test + public void testStartHeartbeat() throws IOException { + HoodieHeartbeatClient hoodieHeartbeatClient = + new HoodieHeartbeatClient(metaClient.getFs(), metaClient.getBasePath(), heartBeatInterval, numTolerableMisses); + hoodieHeartbeatClient.start(instantTime1); + FileStatus [] fs = metaClient.getFs().listStatus(new Path(hoodieHeartbeatClient.getHeartbeatFolderPath())); + assertTrue(fs.length == 1); + assertTrue(fs[0].getPath().toString().contains(instantTime1)); + } + + @Test + public void testStopHeartbeat() { + HoodieHeartbeatClient hoodieHeartbeatClient = + new HoodieHeartbeatClient(metaClient.getFs(), metaClient.getBasePath(), heartBeatInterval, numTolerableMisses); + hoodieHeartbeatClient.start(instantTime1); + hoodieHeartbeatClient.stop(instantTime1); + await().atMost(5, SECONDS).until(() -> hoodieHeartbeatClient.getHeartbeat(instantTime1).getNumHeartbeats() > 0); + Integer numHeartBeats = hoodieHeartbeatClient.getHeartbeat(instantTime1).getNumHeartbeats(); + assertTrue(numHeartBeats == 1); + } + + @Test + public void testIsHeartbeatExpired() throws IOException { + HoodieHeartbeatClient hoodieHeartbeatClient = + new HoodieHeartbeatClient(metaClient.getFs(), metaClient.getBasePath(), heartBeatInterval, numTolerableMisses); + hoodieHeartbeatClient.start(instantTime1); + hoodieHeartbeatClient.stop(instantTime1); + assertFalse(hoodieHeartbeatClient.isHeartbeatExpired(instantTime1)); + } + + @Test + public void testNumHeartbeatsGenerated() { + Long heartBeatInterval = 5000L; + HoodieHeartbeatClient hoodieHeartbeatClient = + new HoodieHeartbeatClient(metaClient.getFs(), metaClient.getBasePath(), heartBeatInterval, numTolerableMisses); + hoodieHeartbeatClient.start("100"); + await().atMost(5, SECONDS).until(() -> hoodieHeartbeatClient.getHeartbeat(instantTime1).getNumHeartbeats() >= 1); + } + + @Test + public void testDeleteWrongHeartbeat() throws IOException { + HoodieHeartbeatClient hoodieHeartbeatClient = + new HoodieHeartbeatClient(metaClient.getFs(), metaClient.getBasePath(), heartBeatInterval, numTolerableMisses); + hoodieHeartbeatClient.start(instantTime1); + hoodieHeartbeatClient.stop(instantTime1); + assertFalse(HeartbeatUtils.deleteHeartbeatFile(metaClient.getFs(), basePath, instantTime2)); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/FileSystemBasedLockProviderTestClass.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/FileSystemBasedLockProviderTestClass.java new file mode 100644 index 0000000000000..97ad050e7240e --- /dev/null +++ 
b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/FileSystemBasedLockProviderTestClass.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.transaction; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.lock.LockProvider; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieLockException; + +import java.io.IOException; +import java.io.Serializable; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; + +/** + * This lock provider is used for testing purposes only. It provides a simple file system based lock + * using filesystem's atomic create operation. This lock does not support cleaning/expiring the lock + * after a failed write. Must not be used in production environments. 
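 *
 * <p>A minimal usage sketch (the lock directory, retry settings, and timeout below are illustrative
 * assumptions, not defaults of this class):
 * <pre>{@code
 * TypedProperties props = new TypedProperties();
 * props.setProperty(LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY, "/tmp/hudi-test-locks");
 * props.setProperty(LockConfiguration.LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "100");
 * props.setProperty(LockConfiguration.LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY, "3");
 * FileSystemBasedLockProviderTestClass provider =
 *     new FileSystemBasedLockProviderTestClass(new LockConfiguration(props), new Configuration());
 * if (provider.tryLock(1, TimeUnit.SECONDS)) {
 *   try {
 *     // critical section guarded by the lock file
 *   } finally {
 *     provider.unlock();
 *   }
 * }
 * }</pre>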
+ */ +public class FileSystemBasedLockProviderTestClass implements LockProvider, Serializable { + + private static final String LOCK = "lock"; + + private final int retryMaxCount; + private final int retryWaitTimeMs; + private transient FileSystem fs; + private transient Path lockFile; + protected LockConfiguration lockConfiguration; + + public FileSystemBasedLockProviderTestClass(final LockConfiguration lockConfiguration, final Configuration configuration) { + this.lockConfiguration = lockConfiguration; + final String lockDirectory = lockConfiguration.getConfig().getString(FILESYSTEM_LOCK_PATH_PROP_KEY); + this.retryWaitTimeMs = lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY); + this.retryMaxCount = lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY); + this.lockFile = new Path(lockDirectory + "/" + LOCK); + this.fs = FSUtils.getFs(this.lockFile.toString(), configuration); + } + + @Override + public void close() { + synchronized (LOCK) { + try { + fs.delete(this.lockFile, true); + } catch (IOException e) { + throw new HoodieLockException("Unable to release lock: " + getLock(), e); + } + } + } + + @Override + public boolean tryLock(long time, TimeUnit unit) { + try { + int numRetries = 0; + synchronized (LOCK) { + while (fs.exists(this.lockFile)) { + LOCK.wait(retryWaitTimeMs); + numRetries++; + if (numRetries > retryMaxCount) { + return false; + } + } + acquireLock(); + return fs.exists(this.lockFile); + } + } catch (IOException | InterruptedException e) { + throw new HoodieLockException("Failed to acquire lock: " + getLock(), e); + } + } + + @Override + public void unlock() { + synchronized (LOCK) { + try { + if (fs.exists(this.lockFile)) { + fs.delete(this.lockFile, true); + } + } catch (IOException io) { + throw new HoodieIOException("Unable to delete lock " + getLock() + "on disk", io); + } + } + } + + @Override + public String getLock() { + return this.lockFile.toString(); + } + + private void acquireLock() { + try { + fs.create(this.lockFile, false).close(); + } catch (IOException e) { + throw new HoodieIOException("Failed to acquire lock: " + getLock(), e); + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestInProcessLockProvider.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestInProcessLockProvider.java new file mode 100644 index 0000000000000..6d6c526785dc5 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestInProcessLockProvider.java @@ -0,0 +1,294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.transaction; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.exception.HoodieLockException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestInProcessLockProvider { + + private static final Logger LOG = LogManager.getLogger(TestInProcessLockProvider.class); + private final Configuration hadoopConfiguration = new Configuration(); + private final LockConfiguration lockConfiguration = new LockConfiguration(new TypedProperties()); + + @Test + public void testLockAcquisition() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + assertDoesNotThrow(() -> { + inProcessLockProvider.lock(); + }); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testLockReAcquisitionBySameThread() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + assertDoesNotThrow(() -> { + inProcessLockProvider.lock(); + }); + assertThrows(HoodieLockException.class, () -> { + inProcessLockProvider.lock(); + }); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testLockReAcquisitionByDifferentThread() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + final AtomicBoolean writer2Completed = new AtomicBoolean(false); + + // Main test thread + assertDoesNotThrow(() -> { + inProcessLockProvider.lock(); + }); + + // Another writer thread in parallel, should block + // and later acquire the lock once it is released + Thread writer2 = new Thread(new Runnable() { + @Override + public void run() { + assertDoesNotThrow(() -> { + inProcessLockProvider.lock(); + }); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + writer2Completed.set(true); + } + }); + writer2.start(); + + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + + try { + writer2.join(); + } catch (InterruptedException e) { + // + } + Assertions.assertTrue(writer2Completed.get()); + } + + @Test + public void testTryLockAcquisition() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + Assertions.assertTrue(inProcessLockProvider.tryLock()); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testTryLockAcquisitionWithTimeout() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + Assertions.assertTrue(inProcessLockProvider.tryLock(1, TimeUnit.MILLISECONDS)); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testTryLockReAcquisitionBySameThread() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + 
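    // Re-acquisition by the thread that already holds the lock is expected to surface a HoodieLockException
    // (rather than simply returning false); the final unlock releases the lock taken by the first tryLock below.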
Assertions.assertTrue(inProcessLockProvider.tryLock()); + assertThrows(HoodieLockException.class, () -> { + inProcessLockProvider.tryLock(1, TimeUnit.MILLISECONDS); + }); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testTryLockReAcquisitionByDifferentThread() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + final AtomicBoolean writer2Completed = new AtomicBoolean(false); + + // Main test thread + Assertions.assertTrue(inProcessLockProvider.tryLock()); + + // Another writer thread + Thread writer2 = new Thread(() -> { + Assertions.assertFalse(inProcessLockProvider.tryLock(100L, TimeUnit.MILLISECONDS)); + writer2Completed.set(true); + }); + writer2.start(); + try { + writer2.join(); + } catch (InterruptedException e) { + // + } + + Assertions.assertTrue(writer2Completed.get()); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testTryUnLockByDifferentThread() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + final AtomicBoolean writer3Completed = new AtomicBoolean(false); + + // Main test thread + Assertions.assertTrue(inProcessLockProvider.tryLock()); + + // Another writer thread + Thread writer2 = new Thread(() -> { + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + }); + writer2.start(); + try { + writer2.join(); + } catch (InterruptedException e) { + // + } + + // Try acquiring from a different thread; this should fail because the main thread still holds the lock. If the earlier unlock by a different thread had succeeded, this lock + // acquisition would succeed. + Thread writer3 = new Thread(() -> { + Assertions.assertFalse(inProcessLockProvider.tryLock(50, TimeUnit.MILLISECONDS)); + writer3Completed.set(true); + }); + writer3.start(); + try { + writer3.join(); + } catch (InterruptedException e) { + // + } + + Assertions.assertTrue(writer3Completed.get()); + assertDoesNotThrow(() -> { + // unlock by main thread should succeed. + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testTryLockAcquisitionBeforeTimeOutFromTwoThreads() { + final InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + final int threadCount = 3; + final long awaitMaxTimeoutMs = 2000L; + final CountDownLatch latch = new CountDownLatch(threadCount); + final AtomicBoolean writer1Completed = new AtomicBoolean(false); + final AtomicBoolean writer2Completed = new AtomicBoolean(false); + + // Let writer1 get the lock first, then wait for others + // to join the sync up point. + Thread writer1 = new Thread(() -> { + Assertions.assertTrue(inProcessLockProvider.tryLock()); + latch.countDown(); + try { + latch.await(awaitMaxTimeoutMs, TimeUnit.MILLISECONDS); + // Following sleep is to make sure writer2 attempts + // to try lock and to get blocked on the lock which + // this thread is currently holding. + Thread.sleep(50); + } catch (InterruptedException e) { + // + } + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + writer1Completed.set(true); + }); + writer1.start(); + + // Writer2 will block on trying to acquire the lock + // and will eventually get the lock before the timeout.
+ Thread writer2 = new Thread(() -> { + latch.countDown(); + Assertions.assertTrue(inProcessLockProvider.tryLock(awaitMaxTimeoutMs, TimeUnit.MILLISECONDS)); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + writer2Completed.set(true); + }); + writer2.start(); + + // Let writer1 and writer2 wait at the sync up + // point to make sure they run in parallel and + // one get blocked by the other. + latch.countDown(); + try { + writer1.join(); + writer2.join(); + } catch (InterruptedException e) { + // + } + + // Make sure both writers actually completed good + Assertions.assertTrue(writer1Completed.get()); + Assertions.assertTrue(writer2Completed.get()); + } + + @Test + public void testLockReleaseByClose() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + assertDoesNotThrow(() -> { + inProcessLockProvider.lock(); + }); + assertDoesNotThrow(() -> { + inProcessLockProvider.close(); + }); + } + + @Test + public void testRedundantUnlock() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + assertDoesNotThrow(() -> { + inProcessLockProvider.lock(); + }); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } + + @Test + public void testUnlockWithoutLock() { + InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); + assertDoesNotThrow(() -> { + inProcessLockProvider.unlock(); + }); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestSimpleConcurrentFileWritesConflictResolutionStrategy.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestSimpleConcurrentFileWritesConflictResolutionStrategy.java new file mode 100644 index 0000000000000..39b9e1e6dc474 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestSimpleConcurrentFileWritesConflictResolutionStrategy.java @@ -0,0 +1,619 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.transaction; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieCompactionOperation; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieSliceInfo; +import org.apache.hudi.client.utils.TransactionUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstant.State; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieWriteConflictException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestSimpleConcurrentFileWritesConflictResolutionStrategy extends HoodieCommonTestHarness { + + @BeforeEach + public void init() throws IOException { + initMetaClient(); + } + + @Test + public void testNoConcurrentWrites() throws Exception { + String newInstantTime = HoodieTestTable.makeNewCommitTime(); + createCommit(newInstantTime); + // consider commits before this are all successful + + Option lastSuccessfulInstant = metaClient.getCommitsTimeline().filterCompletedInstants().lastInstant(); + newInstantTime = HoodieTestTable.makeNewCommitTime(); + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, newInstantTime)); + + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + Stream candidateInstants = strategy.getCandidateInstants(metaClient.getActiveTimeline(), currentInstant.get(), lastSuccessfulInstant); + Assertions.assertTrue(candidateInstants.count() == 0); + } + + @Test + public void testConcurrentWrites() throws Exception { + String newInstantTime = HoodieTestTable.makeNewCommitTime(); + createCommit(newInstantTime); + // consider commits before this are all successful + // writer 1 + createInflightCommit(HoodieTestTable.makeNewCommitTime()); + // writer 2 + createInflightCommit(HoodieTestTable.makeNewCommitTime()); + Option lastSuccessfulInstant = metaClient.getCommitsTimeline().filterCompletedInstants().lastInstant(); + newInstantTime = HoodieTestTable.makeNewCommitTime(); + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, newInstantTime)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + Stream candidateInstants = 
strategy.getCandidateInstants(metaClient.getActiveTimeline(), currentInstant.get(), lastSuccessfulInstant); + Assertions.assertTrue(candidateInstants.count() == 0); + } + + @Test + public void testConcurrentWritesWithInterleavingSuccesssfulCommit() throws Exception { + createCommit(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // consider commits before this are all successful + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + // writer 2 starts and finishes + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createCommit(newInstantTime); + + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + // writer 1 conflicts with writer 2 + Assertions.assertTrue(candidateInstants.size() == 1); + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(candidateInstants.get(0), metaClient); + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, thatCommitOperation); + Assertions.fail("Cannot reach here, writer 1 and writer 2 should have thrown a conflict"); + } catch (HoodieWriteConflictException e) { + // expected + } + } + + @Test + public void testConcurrentWritesWithReplaceInflightCommit() throws Exception { + createReplaceInflight(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + Option lastSuccessfulInstant = Option.empty(); + + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + + // writer 2 starts and finishes + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createReplaceInflight(newInstantTime); + + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + + // writer 1 conflicts with writer 2 + Assertions.assertTrue(candidateInstants.size() == 1); + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(candidateInstants.get(0), metaClient); + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, 
thatCommitOperation); + Assertions.fail("Cannot reach here, writer 1 and writer 2 should have thrown a conflict"); + } catch (HoodieWriteConflictException e) { + // expected + } + } + + @Test + public void testConcurrentWritesWithInterleavingScheduledCompaction() throws Exception { + createCommit(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // consider commits before this are all successful + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + // compaction 1 gets scheduled + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createCompactionRequested(newInstantTime); + + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + // writer 1 conflicts with scheduled compaction plan 1 + Assertions.assertTrue(candidateInstants.size() == 1); + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(candidateInstants.get(0), metaClient); + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, thatCommitOperation); + Assertions.fail("Cannot reach here, should have thrown a conflict"); + } catch (HoodieWriteConflictException e) { + // expected + } + } + + @Test + public void testConcurrentWritesWithInterleavingSuccessfulCompaction() throws Exception { + createCommit(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // consider commits before this are all successful + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + // compaction 1 gets scheduled and finishes + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createCompaction(newInstantTime); + + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + // writer 1 conflicts with compaction 1 + Assertions.assertTrue(candidateInstants.size() == 1); + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(candidateInstants.get(0), metaClient); + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + 
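+ // The completed compaction and writer 1's commit metadata both report writes to file group "file-1", so the strategy is expected to detect the overlapping file id and raise a conflict on resolve.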
Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, thatCommitOperation); + Assertions.fail("Cannot reach here, should have thrown a conflict"); + } catch (HoodieWriteConflictException e) { + // expected + } + } + + @Test + public void testConcurrentWriteAndCompactionScheduledEarlier() throws Exception { + createCommit(HoodieActiveTimeline.createNewInstantTime()); + // compaction 1 gets scheduled + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createCompaction(newInstantTime); + // consider commits before this are all successful + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + // writer 1 should not conflict with an earlier scheduled compaction 1 with the same file ids + Assertions.assertTrue(candidateInstants.size() == 0); + } + + @Test + public void testConcurrentWritesWithInterleavingScheduledCluster() throws Exception { + createCommit(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // consider commits before this are all successful + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + // clustering 1 gets scheduled + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createReplaceRequested(newInstantTime); + + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + // writer 1 conflicts with scheduled compaction plan 1 + Assertions.assertTrue(candidateInstants.size() == 1); + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(candidateInstants.get(0), metaClient); + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, thatCommitOperation); + Assertions.fail("Cannot reach here, should have thrown a conflict"); + } catch (HoodieWriteConflictException e) { + // expected + } + } + + @Test + public void testConcurrentWritesWithInterleavingSuccessfulCluster() throws 
Exception { + createCommit(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // consider commits before this are all successful + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + // cluster 1 gets scheduled and finishes + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createReplace(newInstantTime, WriteOperationType.CLUSTER); + + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + // writer 1 conflicts with cluster 1 + Assertions.assertTrue(candidateInstants.size() == 1); + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(candidateInstants.get(0), metaClient); + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, thatCommitOperation); + Assertions.fail("Cannot reach here, should have thrown a conflict"); + } catch (HoodieWriteConflictException e) { + // expected + } + } + + @Test + public void testConcurrentWritesWithInterleavingSuccessfulReplace() throws Exception { + createCommit(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // consider commits before this are all successful + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // writer 1 starts + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + // replace 1 gets scheduled and finished + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + createReplace(newInstantTime, WriteOperationType.INSERT_OVERWRITE); + + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant); + timeline = timeline.reload(); + List candidateInstants = strategy.getCandidateInstants(timeline, currentInstant.get(), lastSuccessfulInstant).collect( + Collectors.toList()); + // writer 1 conflicts with replace 1 + Assertions.assertTrue(candidateInstants.size() == 1); + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(candidateInstants.get(0), metaClient); + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, thatCommitOperation); + Assertions.fail("Cannot reach here, should have thrown a conflict"); + } catch 
(HoodieWriteConflictException e) { + // expected + } + } + + private void createCommit(String instantTime) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-1"); + commitMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + commitMetadata.setOperationType(WriteOperationType.INSERT); + HoodieTestTable.of(metaClient) + .addCommit(instantTime, Option.of(commitMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private HoodieCommitMetadata createCommitMetadata(String instantTime, String writeFileName) { + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId(writeFileName); + commitMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + commitMetadata.setOperationType(WriteOperationType.INSERT); + return commitMetadata; + } + + private HoodieCommitMetadata createCommitMetadata(String instantTime) { + return createCommitMetadata(instantTime, "file-1"); + } + + private void createInflightCommit(String instantTime) throws Exception { + String fileId1 = "file-" + instantTime + "-1"; + String fileId2 = "file-" + instantTime + "-2"; + HoodieTestTable.of(metaClient) + .addInflightCommit(instantTime) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createCompactionRequested(String instantTime) throws Exception { + String fileId1 = "file-1"; + HoodieCompactionPlan compactionPlan = new HoodieCompactionPlan(); + compactionPlan.setVersion(TimelineLayoutVersion.CURR_VERSION); + HoodieCompactionOperation operation = new HoodieCompactionOperation(); + operation.setFileId(fileId1); + operation.setPartitionPath(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + operation.setDataFilePath("/file-1"); + operation.setDeltaFilePaths(Arrays.asList("/file-1")); + compactionPlan.setOperations(Arrays.asList(operation)); + HoodieTestTable.of(metaClient) + .addRequestedCompaction(instantTime, compactionPlan); + } + + private void createCompaction(String instantTime) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + commitMetadata.setOperationType(WriteOperationType.COMPACT); + commitMetadata.setCompacted(true); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-1"); + commitMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + HoodieTestTable.of(metaClient) + .addCommit(instantTime, Option.of(commitMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createReplaceRequested(String instantTime) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + // create replace instant to mark fileId1 as deleted + HoodieRequestedReplaceMetadata requestedReplaceMetadata = new HoodieRequestedReplaceMetadata(); + requestedReplaceMetadata.setOperationType(WriteOperationType.CLUSTER.name()); + HoodieClusteringPlan clusteringPlan = new HoodieClusteringPlan(); + 
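+ // The requested clustering plan lists the file slices it intends to rewrite; a single group over fileId1 is sufficient for the interleaving-clustering conflict tests above.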
HoodieClusteringGroup clusteringGroup = new HoodieClusteringGroup(); + HoodieSliceInfo sliceInfo = new HoodieSliceInfo(); + sliceInfo.setFileId(fileId1); + sliceInfo.setPartitionPath(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + clusteringGroup.setSlices(Arrays.asList(sliceInfo)); + clusteringPlan.setInputGroups(Arrays.asList(clusteringGroup)); + requestedReplaceMetadata.setClusteringPlan(clusteringPlan); + requestedReplaceMetadata.setVersion(TimelineLayoutVersion.CURR_VERSION); + HoodieTestTable.of(metaClient) + .addRequestedReplace(instantTime, Option.of(requestedReplaceMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createReplaceInflight(String instantTime) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + HoodieCommitMetadata inflightReplaceMetadata = new HoodieCommitMetadata(); + inflightReplaceMetadata.setOperationType(WriteOperationType.INSERT_OVERWRITE); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-1"); + inflightReplaceMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + HoodieTestTable.of(metaClient) + .addInflightReplace(instantTime, Option.of(inflightReplaceMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createReplace(String instantTime, WriteOperationType writeOperationType) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + // create replace instant to mark fileId1 as deleted + HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); + Map> partitionFileIds = new HashMap<>(); + partitionFileIds.put(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, Arrays.asList(fileId2)); + replaceMetadata.setPartitionToReplaceFileIds(partitionFileIds); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-1"); + replaceMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + replaceMetadata.setOperationType(writeOperationType); + // create replace instant to mark fileId1 as deleted + HoodieRequestedReplaceMetadata requestedReplaceMetadata = new HoodieRequestedReplaceMetadata(); + requestedReplaceMetadata.setOperationType(WriteOperationType.CLUSTER.name()); + HoodieClusteringPlan clusteringPlan = new HoodieClusteringPlan(); + HoodieClusteringGroup clusteringGroup = new HoodieClusteringGroup(); + HoodieSliceInfo sliceInfo = new HoodieSliceInfo(); + sliceInfo.setFileId(fileId1); + sliceInfo.setPartitionPath(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + clusteringGroup.setSlices(Arrays.asList(sliceInfo)); + clusteringPlan.setInputGroups(Arrays.asList(clusteringGroup)); + requestedReplaceMetadata.setClusteringPlan(clusteringPlan); + requestedReplaceMetadata.setVersion(TimelineLayoutVersion.CURR_VERSION); + HoodieTestTable.of(metaClient) + .addReplaceCommit(instantTime, Option.of(requestedReplaceMetadata), Option.empty(), replaceMetadata) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + // try to simulate HUDI-3355 + @Test + public void testConcurrentWritesWithPendingInstants() throws Exception { + // step1: create a pending replace/commit/compact instant: C1,C11,C12 + String newInstantTimeC1 = HoodieActiveTimeline.createNewInstantTime(); + createPendingReplace(newInstantTimeC1, WriteOperationType.CLUSTER); + + String 
newCompactionInstantTimeC11 = HoodieActiveTimeline.createNewInstantTime(); + createPendingCompaction(newCompactionInstantTimeC11); + + String newCommitInstantTimeC12 = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(newCommitInstantTimeC12); + // step2: create a complete commit which has no conflict with C1,C11,C12, named it as C2 + createCommit(HoodieActiveTimeline.createNewInstantTime()); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // consider commits before this are all successful + Option lastSuccessfulInstant = timeline.getCommitsTimeline().filterCompletedInstants().lastInstant(); + // step3: write 1 starts, which has conflict with C1,C11,C12, named it as C3 + String currentWriterInstant = HoodieActiveTimeline.createNewInstantTime(); + createInflightCommit(currentWriterInstant); + // step4: create a requested commit, which has conflict with C3, named it as C4 + String commitC4 = HoodieActiveTimeline.createNewInstantTime(); + createRequestedCommit(commitC4); + // get PendingCommit during write 1 operation + metaClient.reloadActiveTimeline(); + Set pendingInstant = TransactionUtils.getInflightAndRequestedInstants(metaClient); + pendingInstant.remove(currentWriterInstant); + // step5: finished pending cluster/compaction/commit operation + createCompleteReplace(newInstantTimeC1, WriteOperationType.CLUSTER); + createCompleteCompaction(newCompactionInstantTimeC11); + createCompleteCommit(newCommitInstantTimeC12); + createCompleteCommit(commitC4); + + // step6: do check + Option currentInstant = Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, currentWriterInstant)); + SimpleConcurrentFileWritesConflictResolutionStrategy strategy = new SimpleConcurrentFileWritesConflictResolutionStrategy(); + // make sure c3 has conflict with C1,C11,C12,C4; + HoodieCommitMetadata currentMetadata = createCommitMetadata(currentWriterInstant, "file-2"); + timeline.reload(); + List completedInstantsDuringCurrentWriteOperation = TransactionUtils + .getCompletedInstantsDuringCurrentWriteOperation(metaClient, pendingInstant).collect(Collectors.toList()); + // C1,C11,C12,C4 should be included + Assertions.assertTrue(completedInstantsDuringCurrentWriteOperation.size() == 4); + + ConcurrentOperation thisCommitOperation = new ConcurrentOperation(currentInstant.get(), currentMetadata); + // check C3 has conflict with C1,C11,C12,C4 + for (HoodieInstant instant : completedInstantsDuringCurrentWriteOperation) { + ConcurrentOperation thatCommitOperation = new ConcurrentOperation(instant, metaClient); + Assertions.assertTrue(strategy.hasConflict(thisCommitOperation, thatCommitOperation)); + try { + strategy.resolveConflict(null, thisCommitOperation, thatCommitOperation); + } catch (HoodieWriteConflictException e) { + // expected + } + } + } + + private void createPendingReplace(String instantTime, WriteOperationType writeOperationType) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + // create replace instant to mark fileId2 as deleted + HoodieRequestedReplaceMetadata requestedReplaceMetadata = new HoodieRequestedReplaceMetadata(); + requestedReplaceMetadata.setOperationType(WriteOperationType.CLUSTER.name()); + HoodieClusteringPlan clusteringPlan = new HoodieClusteringPlan(); + HoodieClusteringGroup clusteringGroup = new HoodieClusteringGroup(); + HoodieSliceInfo sliceInfo = new HoodieSliceInfo(); + sliceInfo.setFileId(fileId2); + sliceInfo.setPartitionPath(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + 
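+ // This pending clustering plan targets file-2, the same file group the concurrent writer (C3) reports in testConcurrentWritesWithPendingInstants, so it must surface as a conflict once it completes.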
clusteringGroup.setSlices(Arrays.asList(sliceInfo)); + clusteringPlan.setInputGroups(Arrays.asList(clusteringGroup)); + requestedReplaceMetadata.setClusteringPlan(clusteringPlan); + requestedReplaceMetadata.setVersion(TimelineLayoutVersion.CURR_VERSION); + HoodieTestTable.of(metaClient) + .addPendingReplace(instantTime, Option.of(requestedReplaceMetadata), Option.empty()) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createCompleteReplace(String instantTime, WriteOperationType writeOperationType) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + // create replace instant to mark fileId2 as deleted + HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); + Map> partitionFileIds = new HashMap<>(); + partitionFileIds.put(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, Arrays.asList(fileId2)); + replaceMetadata.setPartitionToReplaceFileIds(partitionFileIds); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-2"); + replaceMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + replaceMetadata.setOperationType(writeOperationType); + FileCreateUtils.createReplaceCommit(metaClient.getBasePath(), instantTime, replaceMetadata); + } + + private void createPendingCompaction(String instantTime) throws Exception { + String fileId1 = "file-2"; + HoodieCompactionPlan compactionPlan = new HoodieCompactionPlan(); + compactionPlan.setVersion(TimelineLayoutVersion.CURR_VERSION); + HoodieCompactionOperation operation = new HoodieCompactionOperation(); + operation.setFileId(fileId1); + operation.setPartitionPath(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + operation.setDataFilePath("/file-2"); + operation.setDeltaFilePaths(Arrays.asList("/file-2")); + compactionPlan.setOperations(Arrays.asList(operation)); + HoodieTestTable.of(metaClient) + .addRequestedCompaction(instantTime, compactionPlan); + FileCreateUtils.createPendingInflightCompaction(metaClient.getBasePath(), instantTime); + } + + private void createCompleteCompaction(String instantTime) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + commitMetadata.setOperationType(WriteOperationType.COMPACT); + commitMetadata.setCompacted(true); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-2"); + commitMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + HoodieTestTable.of(metaClient) + .addCommit(instantTime, Option.of(commitMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createRequestedCommit(String instantTime) throws Exception { + HoodieTestTable.of(metaClient) + .addInflightCommit(instantTime); + } + + private void createCompleteCommit(String instantTime) throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-2"); + commitMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + commitMetadata.setOperationType(WriteOperationType.INSERT); + HoodieTestTable.of(metaClient) + .addCommit(instantTime, 
Option.of(commitMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java new file mode 100644 index 0000000000000..afbedc0de39c4 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client.transaction; + +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieLockConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieLockException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestTransactionManager extends HoodieCommonTestHarness { + HoodieWriteConfig writeConfig; + TransactionManager transactionManager; + + @BeforeEach + private void init() throws IOException { + initPath(); + initMetaClient(); + this.writeConfig = getWriteConfig(); + this.transactionManager = new TransactionManager(this.writeConfig, this.metaClient.getFs()); + } + + private HoodieWriteConfig getWriteConfig() { + return HoodieWriteConfig.newBuilder() + .withPath(basePath) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder() + .withLockProvider(InProcessLockProvider.class) + .build()) + .build(); + } + + @Test + public void testSingleWriterTransaction() { + Option lastCompletedInstant = getInstant("0000001"); + Option newTxnOwnerInstant = getInstant("0000002"); + transactionManager.beginTransaction(newTxnOwnerInstant, 
lastCompletedInstant); + transactionManager.endTransaction(newTxnOwnerInstant); + } + + @Test + public void testSingleWriterNestedTransaction() { + Option lastCompletedInstant = getInstant("0000001"); + Option newTxnOwnerInstant = getInstant("0000002"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + + Option lastCompletedInstant1 = getInstant("0000003"); + Option newTxnOwnerInstant1 = getInstant("0000004"); + + assertThrows(HoodieLockException.class, () -> { + transactionManager.beginTransaction(newTxnOwnerInstant1, lastCompletedInstant1); + }); + + transactionManager.endTransaction(newTxnOwnerInstant); + assertDoesNotThrow(() -> { + transactionManager.endTransaction(newTxnOwnerInstant1); + }); + } + + @Test + public void testMultiWriterTransactions() { + final int threadCount = 3; + final long awaitMaxTimeoutMs = 2000L; + final CountDownLatch latch = new CountDownLatch(threadCount); + final AtomicBoolean writer1Completed = new AtomicBoolean(false); + final AtomicBoolean writer2Completed = new AtomicBoolean(false); + + Option lastCompletedInstant1 = getInstant("0000001"); + Option newTxnOwnerInstant1 = getInstant("0000002"); + Option lastCompletedInstant2 = getInstant("0000003"); + Option newTxnOwnerInstant2 = getInstant("0000004"); + + // Let writer1 get the lock first, then wait for others + // to join the sync up point. + Thread writer1 = new Thread(() -> { + assertDoesNotThrow(() -> { + transactionManager.beginTransaction(newTxnOwnerInstant1, lastCompletedInstant1); + }); + latch.countDown(); + try { + latch.await(awaitMaxTimeoutMs, TimeUnit.MILLISECONDS); + // Following sleep is to make sure writer2 attempts + // to try lock and to get blocked on the lock which + // this thread is currently holding. + Thread.sleep(50); + } catch (InterruptedException e) { + // + } + assertDoesNotThrow(() -> { + transactionManager.endTransaction(newTxnOwnerInstant1); + }); + writer1Completed.set(true); + }); + writer1.start(); + + // Writer2 will block on trying to acquire the lock + // and will eventually get the lock before the timeout. + Thread writer2 = new Thread(() -> { + latch.countDown(); + try { + latch.await(awaitMaxTimeoutMs, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // + } + assertDoesNotThrow(() -> { + transactionManager.beginTransaction(newTxnOwnerInstant2, lastCompletedInstant2); + }); + assertDoesNotThrow(() -> { + transactionManager.endTransaction(newTxnOwnerInstant2); + }); + writer2Completed.set(true); + }); + writer2.start(); + + // Let writer1 and writer2 wait at the sync up + // point to make sure they run in parallel and + // one get blocked by the other. + latch.countDown(); + try { + writer1.join(); + writer2.join(); + } catch (InterruptedException e) { + // + } + + // Make sure both writers actually completed good + Assertions.assertTrue(writer1Completed.get()); + Assertions.assertTrue(writer2Completed.get()); + } + + @Test + public void testEndTransactionByDiffOwner() throws InterruptedException { + // 1. 
Begin and end by the same transaction owner + Option lastCompletedInstant = getInstant("0000001"); + Option newTxnOwnerInstant = getInstant("0000002"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + + CountDownLatch countDownLatch = new CountDownLatch(1); + // Another writer thread + Thread writer2 = new Thread(() -> { + Option newTxnOwnerInstant1 = getInstant("0000003"); + transactionManager.endTransaction(newTxnOwnerInstant1); + countDownLatch.countDown(); + }); + + writer2.start(); + countDownLatch.await(30, TimeUnit.SECONDS); + // should not have reset the state within transaction manager since the owner is different. + Assertions.assertTrue(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertTrue(transactionManager.getLastCompletedTransactionOwner().isPresent()); + + transactionManager.endTransaction(newTxnOwnerInstant); + Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + } + + @Test + public void testTransactionsWithInstantTime() { + // 1. Begin and end by the same transaction owner + Option lastCompletedInstant = getInstant("0000001"); + Option newTxnOwnerInstant = getInstant("0000002"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + Assertions.assertTrue(transactionManager.getCurrentTransactionOwner() == newTxnOwnerInstant); + Assertions.assertTrue(transactionManager.getLastCompletedTransactionOwner() == lastCompletedInstant); + transactionManager.endTransaction(newTxnOwnerInstant); + Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + + // 2. Begin transaction with a new txn owner, but end transaction with wrong owner + lastCompletedInstant = getInstant("0000002"); + newTxnOwnerInstant = getInstant("0000003"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + transactionManager.endTransaction(getInstant("0000004")); + // Owner reset would not happen as the end txn was invoked with an incorrect current txn owner + Assertions.assertTrue(transactionManager.getCurrentTransactionOwner() == newTxnOwnerInstant); + Assertions.assertTrue(transactionManager.getLastCompletedTransactionOwner() == lastCompletedInstant); + transactionManager.endTransaction(newTxnOwnerInstant); + + // 3. But, we should be able to begin a new transaction for a new owner + lastCompletedInstant = getInstant("0000003"); + newTxnOwnerInstant = getInstant("0000004"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + Assertions.assertTrue(transactionManager.getCurrentTransactionOwner() == newTxnOwnerInstant); + Assertions.assertTrue(transactionManager.getLastCompletedTransactionOwner() == lastCompletedInstant); + transactionManager.endTransaction(newTxnOwnerInstant); + Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + + // 4. 
Transactions with new instants but with same timestamps should properly reset owners + transactionManager.beginTransaction(getInstant("0000005"), Option.empty()); + Assertions.assertTrue(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + transactionManager.endTransaction(getInstant("0000005")); + Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + + // 6. Transactions with no owners should also go through + transactionManager.beginTransaction(Option.empty(), Option.empty()); + Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + transactionManager.endTransaction(Option.empty()); + Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + } + + private Option getInstant(String timestamp) { + return Option.of(new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMMIT_ACTION, timestamp)); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestZookeeperBasedLockProvider.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestZookeeperBasedLockProvider.java new file mode 100644 index 0000000000000..e9ab49a296676 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestZookeeperBasedLockProvider.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.transaction; + +import org.apache.curator.framework.CuratorFramework; +import org.apache.curator.framework.CuratorFrameworkFactory; +import org.apache.curator.retry.RetryOneTime; +import org.apache.curator.test.TestingServer; +import org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.exception.HoodieLockException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.Properties; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_BASE_PATH_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_CONNECTION_TIMEOUT_MS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_CONNECT_URL_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_LOCK_KEY_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.ZK_SESSION_TIMEOUT_MS_PROP_KEY; + +public class TestZookeeperBasedLockProvider { + + private static final Logger LOG = LogManager.getLogger(TestZookeeperBasedLockProvider.class); + + private static TestingServer server; + private static CuratorFramework client; + private static String basePath = "/hudi/test/lock"; + private static String key = "table1"; + private static LockConfiguration lockConfiguration; + + @BeforeAll + public static void setup() { + while (server == null) { + try { + server = new TestingServer(); + CuratorFrameworkFactory.Builder builder = CuratorFrameworkFactory.builder(); + client = builder.connectString(server.getConnectString()).retryPolicy(new RetryOneTime(1000)).build(); + } catch (Exception e) { + LOG.error("Getting bind exception - retrying to allocate server"); + server = null; + } + } + Properties properties = new Properties(); + properties.setProperty(ZK_BASE_PATH_PROP_KEY, basePath); + properties.setProperty(ZK_LOCK_KEY_PROP_KEY, key); + properties.setProperty(ZK_CONNECT_URL_PROP_KEY, server.getConnectString()); + properties.setProperty(ZK_BASE_PATH_PROP_KEY, server.getTempDirectory().getAbsolutePath()); + properties.setProperty(ZK_SESSION_TIMEOUT_MS_PROP_KEY, "10000"); + properties.setProperty(ZK_CONNECTION_TIMEOUT_MS_PROP_KEY, "10000"); + properties.setProperty(ZK_LOCK_KEY_PROP_KEY, "key"); + properties.setProperty(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "1000"); + lockConfiguration = new LockConfiguration(properties); + } + + @Test + public void testAcquireLock() { + ZookeeperBasedLockProvider zookeeperBasedLockProvider = new ZookeeperBasedLockProvider(lockConfiguration, client); + Assertions.assertTrue(zookeeperBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + zookeeperBasedLockProvider.unlock(); + } + + @Test + public void testUnLock() { + ZookeeperBasedLockProvider zookeeperBasedLockProvider = new ZookeeperBasedLockProvider(lockConfiguration, client); + Assertions.assertTrue(zookeeperBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + zookeeperBasedLockProvider.unlock(); + zookeeperBasedLockProvider.tryLock(lockConfiguration.getConfig() + 
.getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS); + } + + @Test + public void testReentrantLock() { + ZookeeperBasedLockProvider zookeeperBasedLockProvider = new ZookeeperBasedLockProvider(lockConfiguration, client); + Assertions.assertTrue(zookeeperBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + try { + zookeeperBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS); + Assertions.fail(); + } catch (HoodieLockException e) { + // expected + } + zookeeperBasedLockProvider.unlock(); + } + + @Test + public void testUnlockWithoutLock() { + ZookeeperBasedLockProvider zookeeperBasedLockProvider = new ZookeeperBasedLockProvider(lockConfiguration, client); + zookeeperBasedLockProvider.unlock(); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/utils/TestFileSliceMetricUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/utils/TestFileSliceMetricUtils.java new file mode 100644 index 0000000000000..1fbd01d576d6a --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/utils/TestFileSliceMetricUtils.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.utils; + +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestFileSliceMetricUtils { + + @Test + public void testFileSliceMetricUtilsWithoutFile() { + Map metrics = new HashMap<>(); + List fileSlices = new ArrayList<>(); + final long defaultBaseFileSize = 10 * 1024 * 1024; + final double epsilon = 1e-5; + FileSliceMetricUtils.addFileSliceCommonMetrics(fileSlices, metrics, defaultBaseFileSize); + assertEquals(0.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_READ_MB), epsilon); + assertEquals(0.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_WRITE_MB), epsilon); + assertEquals(0.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_MB), epsilon); + assertEquals(0.0, metrics.get(FileSliceMetricUtils.TOTAL_LOG_FILE_SIZE), epsilon); + assertEquals(0.0, metrics.get(FileSliceMetricUtils.TOTAL_LOG_FILES), epsilon); + } + + @Test + public void testFileSliceMetricUtilsWithoutLogFile() { + Map metrics = new HashMap<>(); + List fileSlices = new ArrayList<>(); + final long defaultBaseFileSize = 10 * 1024 * 1024; + final double epsilon = 1e-5; + fileSlices.add(buildFileSlice(15 * 1024 * 1024, new ArrayList<>())); + fileSlices.add(buildFileSlice(20 * 1024 * 1024, new ArrayList<>())); + fileSlices.add(buildFileSlice(0, new ArrayList<>())); + FileSliceMetricUtils.addFileSliceCommonMetrics(fileSlices, metrics, defaultBaseFileSize); + assertEquals(35.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_READ_MB), epsilon); + assertEquals(45.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_WRITE_MB), epsilon); + assertEquals(80.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_MB), epsilon); + assertEquals(0.0, metrics.get(FileSliceMetricUtils.TOTAL_LOG_FILE_SIZE), epsilon); + assertEquals(0.0, metrics.get(FileSliceMetricUtils.TOTAL_LOG_FILES), epsilon); + } + + @Test + public void testFileSliceMetricUtilsWithLogFile() { + Map metrics = new HashMap<>(); + List fileSlices = new ArrayList<>(); + final long defaultBaseFileSize = 10 * 1024 * 1024; + final double epsilon = 1e-5; + fileSlices.add(buildFileSlice(15 * 1024 * 1024, + new ArrayList<>(Arrays.asList(5 * 1024 * 1024L, 3 * 1024 * 1024L)))); + fileSlices.add(buildFileSlice(20 * 1024 * 1024, + new ArrayList<>(Collections.singletonList(2 * 1024 * 1024L)))); + FileSliceMetricUtils.addFileSliceCommonMetrics(fileSlices, metrics, defaultBaseFileSize); + assertEquals(45.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_READ_MB), epsilon); + assertEquals(35.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_WRITE_MB), epsilon); + assertEquals(80.0, metrics.get(FileSliceMetricUtils.TOTAL_IO_MB), epsilon); + assertEquals(10.0 * 1024 * 1024, metrics.get(FileSliceMetricUtils.TOTAL_LOG_FILE_SIZE), epsilon); + assertEquals(3.0, metrics.get(FileSliceMetricUtils.TOTAL_LOG_FILES), epsilon); + } + + private FileSlice buildFileSlice(long baseFileLen, List logFileLens) { + final String baseFilePath = ".b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.log.1"; + FileSlice slice = new FileSlice("partition_0", + HoodieActiveTimeline.createNewInstantTime(), + UUID.randomUUID().toString()); + HoodieBaseFile baseFile 
= new HoodieBaseFile(baseFilePath); + baseFile.setFileLen(baseFileLen); + slice.setBaseFile(baseFile); + int logVersion = 1; + for (long logFileLen : logFileLens) { + String logFilePath = "." + UUID.randomUUID().toString() + "_20170101134598.log." + logVersion; + HoodieLogFile logFile = new HoodieLogFile(logFilePath); + logFile.setFileLen(logFileLen); + slice.addLogFile(logFile); + logVersion++; + } + return slice; + } + +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java new file mode 100644 index 0000000000000..fa0f5df61b183 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.testutils; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; + +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * {@link HoodieTestTable} impl used for testing metadata. This class does synchronous updates to HoodieTableMetadataWriter if non null. + */ +public class HoodieMetadataTestTable extends HoodieTestTable { + + private final HoodieTableMetadataWriter writer; + + protected HoodieMetadataTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, HoodieTableMetadataWriter writer) { + super(basePath, fs, metaClient); + this.writer = writer; + } + + public static HoodieTestTable of(HoodieTableMetaClient metaClient) { + return HoodieMetadataTestTable.of(metaClient, null); + } + + public static HoodieTestTable of(HoodieTableMetaClient metaClient, HoodieTableMetadataWriter writer) { + testTableState = HoodieTestTableState.of(); + return new HoodieMetadataTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, writer); + } + + /** + * Add commits to the requested partitions and update metadata table. 
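+ * When a HoodieTableMetadataWriter is configured, the commit is also applied to the metadata table synchronously (skipped when only an inflight commit is created).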
+ * + * @param commitTime - Commit time for the operation + * @param operationType - Operation type + * @param newPartitionsToAdd - New partitions to add for the operation + * @param partitionToFilesNameLengthMap - Map of partition names to its list of files name and length pair + * @param bootstrap - Whether bootstrapping needed for the operation + * @param createInflightCommit - Whether in flight commit needed for the operation + * @return Commit metadata for the commit operation performed. + * @throws Exception + */ + @Override + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, + Map>> partitionToFilesNameLengthMap, + boolean bootstrap, boolean createInflightCommit) throws Exception { + HoodieCommitMetadata commitMetadata = super.doWriteOperation(commitTime, operationType, newPartitionsToAdd, + partitionToFilesNameLengthMap, bootstrap, createInflightCommit); + if (writer != null && !createInflightCommit) { + writer.update(commitMetadata, commitTime, false); + } + return commitMetadata; + } + + @Override + public HoodieTestTable moveInflightCommitToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + super.moveInflightCommitToComplete(instantTime, metadata); + if (writer != null) { + writer.update(metadata, instantTime, false); + } + return this; + } + + public HoodieTestTable moveInflightCommitToComplete(String instantTime, HoodieCommitMetadata metadata, boolean ignoreWriter) throws IOException { + super.moveInflightCommitToComplete(instantTime, metadata); + if (!ignoreWriter && writer != null) { + writer.update(metadata, instantTime, false); + } + return this; + } + + @Override + public HoodieTestTable moveInflightCompactionToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + super.moveInflightCompactionToComplete(instantTime, metadata); + if (writer != null) { + writer.update(metadata, instantTime, true); + } + return this; + } + + @Override + public HoodieCleanMetadata doClean(String commitTime, Map partitionFileCountsToDelete) throws IOException { + HoodieCleanMetadata cleanMetadata = super.doClean(commitTime, partitionFileCountsToDelete); + if (writer != null) { + writer.update(cleanMetadata, commitTime); + } + return cleanMetadata; + } + + public HoodieTestTable addCompaction(String instantTime, HoodieCommitMetadata commitMetadata) throws Exception { + super.addCompaction(instantTime, commitMetadata); + if (writer != null) { + writer.update(commitMetadata, instantTime, true); + } + return this; + } + + @Override + public HoodieTestTable addRollback(String instantTime, HoodieRollbackMetadata rollbackMetadata) throws IOException { + super.addRollback(instantTime, rollbackMetadata); + if (writer != null) { + writer.update(rollbackMetadata, instantTime); + } + return this; + } + + @Override + public HoodieTestTable addRestore(String instantTime, HoodieRestoreMetadata restoreMetadata) throws IOException { + super.addRestore(instantTime, restoreMetadata); + if (writer != null) { + writer.update(restoreMetadata, instantTime); + } + return this; + } + + @Override + public HoodieTestTable addReplaceCommit( + String instantTime, + Option requestedReplaceMetadata, + Option inflightReplaceMetadata, + HoodieReplaceCommitMetadata completeReplaceMetadata) throws Exception { + super.addReplaceCommit(instantTime, requestedReplaceMetadata, inflightReplaceMetadata, completeReplaceMetadata); + if (writer != null) { + writer.update(completeReplaceMetadata, 
instantTime, true); + } + return this; + } + +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java index cba240959274c..e956668d0c49a 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java @@ -18,11 +18,24 @@ package org.apache.hudi.config; -import org.apache.hudi.client.common.EngineType; +import org.apache.hudi.client.transaction.FileSystemBasedLockProviderTestClass; +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.config.HoodieWriteConfig.Builder; - import org.apache.hudi.index.HoodieIndex; + import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.ValueSource; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -31,18 +44,34 @@ import java.util.HashMap; import java.util.Map; import java.util.Properties; +import java.util.function.Function; +import static org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE; +import static org.apache.hudi.config.HoodieArchivalConfig.ASYNC_ARCHIVE; +import static org.apache.hudi.config.HoodieCleanConfig.ASYNC_CLEAN; +import static org.apache.hudi.config.HoodieCleanConfig.AUTO_CLEAN; +import static org.apache.hudi.config.HoodieCleanConfig.FAILED_WRITES_CLEANER_POLICY; +import static org.apache.hudi.config.HoodieCompactionConfig.INLINE_COMPACT; +import static org.apache.hudi.config.HoodieWriteConfig.TABLE_SERVICES_ENABLED; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_CONCURRENCY_MODE; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class TestHoodieWriteConfig { - @Test - public void testPropertyLoading() throws IOException { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testPropertyLoading(boolean withAlternative) throws IOException { Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp"); Map params = new HashMap<>(3); - params.put(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP, "1"); - params.put(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP_PROP, "5"); - params.put(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP_PROP, "2"); + params.put(HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key(), "1"); + params.put(HoodieArchivalConfig.MAX_COMMITS_TO_KEEP.key(), "5"); + params.put(HoodieArchivalConfig.MIN_COMMITS_TO_KEEP.key(), "2"); + if (withAlternative) { + params.put("hoodie.avro.schema.externalTransformation", "true"); + } else { + params.put("hoodie.avro.schema.external.transformation", "true"); + } ByteArrayOutputStream outStream = saveParamsIntoOutputStream(params); 
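+ // Round-trip the properties through a byte stream so the builder's property-loading path, including the alternative key spelling set above, is exercised.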
ByteArrayInputStream inputStream = new ByteArrayInputStream(outStream.toByteArray()); try { @@ -54,21 +83,318 @@ public void testPropertyLoading() throws IOException { HoodieWriteConfig config = builder.build(); assertEquals(5, config.getMaxCommitsToKeep()); assertEquals(2, config.getMinCommitsToKeep()); + assertTrue(config.shouldUseExternalSchemaTransformation()); } @Test public void testDefaultIndexAccordingToEngineType() { - // default bloom - HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").build(); - assertEquals(HoodieIndex.IndexType.BLOOM, writeConfig.getIndexType()); + testEngineSpecificConfig(HoodieWriteConfig::getIndexType, + constructConfigMap( + EngineType.SPARK, HoodieIndex.IndexType.SIMPLE, + EngineType.FLINK, HoodieIndex.IndexType.INMEMORY, + EngineType.JAVA, HoodieIndex.IndexType.INMEMORY)); + } + + @Test + public void testDefaultClusteringPlanStrategyClassAccordingToEngineType() { + testEngineSpecificConfig(HoodieWriteConfig::getClusteringPlanStrategyClass, + constructConfigMap( + EngineType.SPARK, HoodieClusteringConfig.SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY, + EngineType.FLINK, HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY, + EngineType.JAVA, HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY)); + } + + @Test + public void testDefaultClusteringExecutionStrategyClassAccordingToEngineType() { + testEngineSpecificConfig(HoodieWriteConfig::getClusteringExecutionStrategyClass, + constructConfigMap( + EngineType.SPARK, HoodieClusteringConfig.SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY, + EngineType.FLINK, HoodieClusteringConfig.JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY, + EngineType.JAVA, HoodieClusteringConfig.JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY)); + } + + @Test + public void testDefaultMarkersTypeAccordingToEngineType() { + testEngineSpecificConfig(HoodieWriteConfig::getMarkersType, + constructConfigMap( + EngineType.SPARK, MarkerType.TIMELINE_SERVER_BASED, + EngineType.FLINK, MarkerType.DIRECT, + EngineType.JAVA, MarkerType.DIRECT)); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testAutoConcurrencyConfigAdjustmentWithTableServices(HoodieTableType tableType) { + final String inProcessLockProviderClassName = InProcessLockProvider.class.getCanonicalName(); + // With metadata table enabled by default, any async table service enabled should + // use InProcess lock provider as default when no other lock provider is set. + // 1. Async clustering + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(ASYNC_CLUSTERING_ENABLE.key(), "true"); + put(INLINE_COMPACT.key(), "true"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "false"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + HoodieFailedWritesCleaningPolicy.LAZY, inProcessLockProviderClassName); + + // 2. Async clean + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(ASYNC_CLUSTERING_ENABLE.key(), "false"); + put(INLINE_COMPACT.key(), "true"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "true"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + HoodieFailedWritesCleaningPolicy.LAZY, inProcessLockProviderClassName); + + // 3. 
Async compaction configured + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(ASYNC_CLUSTERING_ENABLE.key(), "false"); + put(INLINE_COMPACT.key(), "false"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "false"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, + tableType == HoodieTableType.MERGE_ON_READ, true, + tableType == HoodieTableType.MERGE_ON_READ + ? WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL + : WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), + tableType == HoodieTableType.MERGE_ON_READ + ? HoodieFailedWritesCleaningPolicy.LAZY + : HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + tableType == HoodieTableType.MERGE_ON_READ + ? inProcessLockProviderClassName + : HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); + + // 4. All inline services + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(ASYNC_CLUSTERING_ENABLE.key(), "false"); + put(INLINE_COMPACT.key(), "true"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "false"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, false, true, + WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), + HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); + + // 5. All async services + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(ASYNC_CLUSTERING_ENABLE.key(), "true"); + put(INLINE_COMPACT.key(), "false"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "true"); + put(ASYNC_ARCHIVE.key(), "true"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, true, false, + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + HoodieFailedWritesCleaningPolicy.LAZY, inProcessLockProviderClassName); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testAutoAdjustLockConfigs(HoodieTableType tableType) { + TypedProperties properties = new TypedProperties(); + properties.setProperty(HoodieTableConfig.TYPE.key(), tableType.name()); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() + .withPath("/tmp") + .withAutoAdjustLockConfigs(false) + .withClusteringConfig(new HoodieClusteringConfig.Builder().withAsyncClustering(true).build()) + .withProperties(properties) + .build(); + + verifyConcurrencyControlRelatedConfigs(writeConfig, + true, true, true, + WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), + HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); + + writeConfig = HoodieWriteConfig.newBuilder() + .withPath("/tmp") + .withAutoAdjustLockConfigs(false) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withClusteringConfig(new HoodieClusteringConfig.Builder().withAsyncClustering(true).build()) + .withProperties(properties) + .build(); + + verifyConcurrencyControlRelatedConfigs(writeConfig, + true, true, true, + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, HoodieFailedWritesCleaningPolicy.LAZY, + HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); + } + + @ParameterizedTest + 
@EnumSource(HoodieTableType.class) + public void testAutoConcurrencyConfigAdjustmentWithUserConfigs(HoodieTableType tableType) { + // 1. User override for the lock provider should always take the precedence + TypedProperties properties = new TypedProperties(); + properties.setProperty(HoodieTableConfig.TYPE.key(), tableType.name()); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() + .withPath("/tmp") + .withLockConfig(HoodieLockConfig.newBuilder() + .withLockProvider(FileSystemBasedLockProviderTestClass.class) + .build()) + .withAutoAdjustLockConfigs(true) + .withProperties(properties) + .build(); + + verifyConcurrencyControlRelatedConfigs(writeConfig, + true, tableType == HoodieTableType.MERGE_ON_READ, true, + WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), + HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + FileSystemBasedLockProviderTestClass.class.getName()); + + // 2. User can set the lock provider via properties + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(ASYNC_CLUSTERING_ENABLE.key(), "false"); + put(INLINE_COMPACT.key(), "true"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "true"); + put(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), + ZookeeperBasedLockProvider.class.getName()); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, true, true, + WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), + HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + ZookeeperBasedLockProvider.class.getName()); + + // 3. Default config should have default lock provider + writeConfig = createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }); + if (writeConfig.areAnyTableServicesAsync()) { + verifyConcurrencyControlRelatedConfigs(writeConfig, + true, true, true, + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + HoodieFailedWritesCleaningPolicy.LAZY, + InProcessLockProvider.class.getName()); + } else { + verifyConcurrencyControlRelatedConfigs(writeConfig, + true, false, true, + WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), + HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); + } + } - // spark default bloom - writeConfig = HoodieWriteConfig.newBuilder().withEngineType(EngineType.SPARK).withPath("/tmp").build(); - assertEquals(HoodieIndex.IndexType.BLOOM, writeConfig.getIndexType()); + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testAutoConcurrencyConfigAdjustmentWithNoTableService(HoodieTableType tableType) { + // 1. 
No table service, concurrency control configs should not be overwritten + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(TABLE_SERVICES_ENABLED.key(), "false"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), false, false, false, + WriteConcurrencyMode.fromValue(WRITE_CONCURRENCY_MODE.defaultValue()), + HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); - // flink default in-memory - writeConfig = HoodieWriteConfig.newBuilder().withEngineType(EngineType.FLINK).withPath("/tmp").build(); - assertEquals(HoodieIndex.IndexType.INMEMORY, writeConfig.getIndexType()); + // 2. No table service, with optimistic concurrency control, + // failed write clean policy should be updated accordingly + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(TABLE_SERVICES_ENABLED.key(), "false"); + put(WRITE_CONCURRENCY_MODE.key(), + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.value()); + put(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), + FileSystemBasedLockProviderTestClass.class.getName()); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), false, false, false, + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + HoodieFailedWritesCleaningPolicy.LAZY, + FileSystemBasedLockProviderTestClass.class.getName()); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testAutoConcurrencyConfigAdjustmentWithMetadataTableDisabled(HoodieTableType tableType) { + // 1. Metadata table disabled, with async table services, concurrency control configs + // should not be changed + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(HoodieMetadataConfig.ENABLE.key(), "false"); + put(ASYNC_CLUSTERING_ENABLE.key(), "true"); + put(INLINE_COMPACT.key(), "true"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "false"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, true, true, + WriteConcurrencyMode.fromValue(WRITE_CONCURRENCY_MODE.defaultValue()), + HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), + HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); + + // 2. 
Metadata table disabled, with optimistic concurrency control, + // failed write clean policy should be updated accordingly + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(ASYNC_CLUSTERING_ENABLE.key(), "true"); + put(INLINE_COMPACT.key(), "true"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "false"); + put(WRITE_CONCURRENCY_MODE.key(), + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.value()); + put(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), + FileSystemBasedLockProviderTestClass.class.getName()); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, true, true, + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + HoodieFailedWritesCleaningPolicy.LAZY, FileSystemBasedLockProviderTestClass.class.getName()); + } + + @Test + public void testSimpleBucketIndexPartitionerConfig() { + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BUCKET) + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE).build()) + .build(); + assertEquals(HoodieLayoutConfig.SIMPLE_BUCKET_LAYOUT_PARTITIONER_CLASS_NAME, writeConfig.getString(HoodieLayoutConfig.LAYOUT_PARTITIONER_CLASS_NAME)); + + HoodieWriteConfig overwritePartitioner = HoodieWriteConfig.newBuilder().withPath("/tmp") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BUCKET) + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE) + .build()) + .withLayoutConfig(HoodieLayoutConfig.newBuilder().withLayoutPartitioner("org.apache.hudi.table.action.commit.UpsertPartitioner").build()) + .build(); + assertEquals("org.apache.hudi.table.action.commit.UpsertPartitioner", overwritePartitioner.getString(HoodieLayoutConfig.LAYOUT_PARTITIONER_CLASS_NAME)); + } + + private HoodieWriteConfig createWriteConfig(Map configs) { + final Properties properties = new Properties(); + configs.forEach(properties::setProperty); + return HoodieWriteConfig.newBuilder() + .withPath("/tmp") + .withProperties(properties) + .build(); } private ByteArrayOutputStream saveParamsIntoOutputStream(Map params) throws IOException { @@ -78,4 +404,60 @@ private ByteArrayOutputStream saveParamsIntoOutputStream(Map par properties.store(outStream, "Saved on " + new Date(System.currentTimeMillis())); return outStream; } + + /** + * Tests the engine-specific configuration values for one configuration key . + * + * @param getConfigFunc Function to get the config value. + * @param expectedConfigMap Expected config map, with key as the engine type + * and value as the corresponding config value for the engine. + */ + private void testEngineSpecificConfig(Function getConfigFunc, + Map expectedConfigMap) { + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").build(); + assertEquals(expectedConfigMap.get(EngineType.SPARK), getConfigFunc.apply(writeConfig)); + + for (EngineType engineType : expectedConfigMap.keySet()) { + writeConfig = HoodieWriteConfig.newBuilder() + .withEngineType(engineType).withPath("/tmp").build(); + assertEquals(expectedConfigMap.get(engineType), getConfigFunc.apply(writeConfig)); + } + } + + /** + * Constructs the map. + * + * @param k1 First engine type. + * @param v1 Config value for the first engine type. + * @param k2 Second engine type. + * @param v2 Config value for the second engine type. + * @param k3 Third engine type. + * @param v3 Config value for the third engine type. 
+ * @return {@link Map} instance, with key as the engine type + * and value as the corresponding config value for the engine. + */ + private Map constructConfigMap( + EngineType k1, Object v1, EngineType k2, Object v2, EngineType k3, Object v3) { + Map mapping = new HashMap<>(); + mapping.put(k1, v1); + mapping.put(k2, v2); + mapping.put(k3, v3); + return mapping; + } + + private void verifyConcurrencyControlRelatedConfigs( + HoodieWriteConfig writeConfig, boolean expectedTableServicesEnabled, + boolean expectedAnyTableServicesAsync, + boolean expectedAnyTableServicesExecutedInline, + WriteConcurrencyMode expectedConcurrencyMode, + HoodieFailedWritesCleaningPolicy expectedCleanPolicy, + String expectedLockProviderName) { + assertEquals(expectedTableServicesEnabled, writeConfig.areTableServicesEnabled()); + assertEquals(expectedAnyTableServicesAsync, writeConfig.areAnyTableServicesAsync()); + assertEquals( + expectedAnyTableServicesExecutedInline, writeConfig.areAnyTableServicesExecutedInline()); + assertEquals(expectedConcurrencyMode, writeConfig.getWriteConcurrencyMode()); + assertEquals(expectedCleanPolicy, writeConfig.getFailedWritesCleanPolicy()); + assertEquals(expectedLockProviderName, writeConfig.getLockProviderClass()); + } } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/index/bucket/TestBucketIdentifier.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/index/bucket/TestBucketIdentifier.java new file mode 100644 index 0000000000000..31f33890ad318 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/index/bucket/TestBucketIdentifier.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.keygen.KeyGenUtils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; + +public class TestBucketIdentifier { + + public static final String NESTED_COL_SCHEMA = "{\"type\":\"record\", \"name\":\"nested_col\",\"fields\": [" + + "{\"name\": \"prop1\",\"type\": \"string\"},{\"name\": \"prop2\", \"type\": \"long\"}]}"; + public static final String EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ " + + "{\"name\": \"timestamp\",\"type\": \"long\"},{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"ts_ms\", \"type\": \"string\"}," + + "{\"name\": \"pii_col\", \"type\": \"string\"}," + + "{\"name\": \"nested_col\",\"type\": " + + NESTED_COL_SCHEMA + "}" + + "]}"; + + public static GenericRecord getRecord() { + return getRecord(getNestedColRecord("val1", 10L)); + } + + public static GenericRecord getNestedColRecord(String prop1Value, Long prop2Value) { + GenericRecord nestedColRecord = new GenericData.Record(new Schema.Parser().parse(NESTED_COL_SCHEMA)); + nestedColRecord.put("prop1", prop1Value); + nestedColRecord.put("prop2", prop2Value); + return nestedColRecord; + } + + public static GenericRecord getRecord(GenericRecord nestedColRecord) { + GenericRecord record = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA)); + record.put("timestamp", 4357686L); + record.put("_row_key", "key1"); + record.put("ts_ms", "2020-03-21"); + record.put("pii_col", "pi"); + record.put("nested_col", nestedColRecord); + return record; + } + + @Test + public void testBucketFileId() { + int[] ids = {0, 4, 8, 16, 32, 64, 128, 256, 512, 1000, 1024, 4096, 10000, 100000}; + for (int id : ids) { + String bucketIdStr = BucketIdentifier.bucketIdStr(id); + String fileId = BucketIdentifier.newBucketFileIdPrefix(bucketIdStr); + assert BucketIdentifier.bucketIdFromFileId(fileId) == id; + } + } + + @Test + public void testBucketIdWithSimpleRecordKey() { + String recordKeyField = "_row_key"; + String indexKeyField = "_row_key"; + GenericRecord record = getRecord(); + HoodieRecord hoodieRecord = new HoodieAvroRecord( + new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null); + int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8); + assert bucketId == BucketIdentifier.getBucketId( + Arrays.asList(record.get(indexKeyField).toString()), 8); + } + + @Test + public void testBucketIdWithComplexRecordKey() { + List recordKeyField = Arrays.asList("_row_key", "ts_ms"); + String indexKeyField = "_row_key"; + GenericRecord record = getRecord(); + HoodieRecord hoodieRecord = new HoodieAvroRecord( + new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null); + int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8); + assert bucketId == BucketIdentifier.getBucketId( + Arrays.asList(record.get(indexKeyField).toString()), 8); + } + + @Test + public void testGetHashKeys() { + BucketIdentifier identifier = new BucketIdentifier(); + List keys = identifier.getHashKeys(new HoodieKey("abc", "partition"), ""); + Assertions.assertEquals(1, keys.size()); + 
Assertions.assertEquals("abc", keys.get(0)); + + keys = identifier.getHashKeys(new HoodieKey("f1:abc", "partition"), "f1"); + Assertions.assertEquals(1, keys.size()); + Assertions.assertEquals("abc", keys.get(0)); + + keys = identifier.getHashKeys(new HoodieKey("f1:abc,f2:bcd", "partition"), "f2"); + Assertions.assertEquals(1, keys.size()); + Assertions.assertEquals("bcd", keys.get(0)); + + keys = identifier.getHashKeys(new HoodieKey("f1:abc,f2:bcd", "partition"), "f1,f2"); + Assertions.assertEquals(2, keys.size()); + Assertions.assertEquals("abc", keys.get(0)); + Assertions.assertEquals("bcd", keys.get(1)); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/index/bucket/TestConsistentBucketIdIdentifier.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/index/bucket/TestConsistentBucketIdIdentifier.java new file mode 100644 index 0000000000000..3ffe6ded188b8 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/index/bucket/TestConsistentBucketIdIdentifier.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.ConsistentHashingNode; +import org.apache.hudi.common.model.HoodieConsistentHashingMetadata; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.apache.hudi.common.model.HoodieConsistentHashingMetadata.HASH_VALUE_MASK; + +/** + * Unit test of consistent bucket identifier + */ +public class TestConsistentBucketIdIdentifier { + + @Test + public void testGetBucket() { + List nodes = Arrays.asList( + new ConsistentHashingNode(100, "0"), + new ConsistentHashingNode(0x2fffffff, "1"), + new ConsistentHashingNode(0x4fffffff, "2")); + HoodieConsistentHashingMetadata meta = new HoodieConsistentHashingMetadata((short) 0, "", "", 3, 0, nodes); + ConsistentBucketIdentifier identifier = new ConsistentBucketIdentifier(meta); + + Assertions.assertEquals(3, identifier.getNumBuckets()); + + // Get bucket by hash keys + Assertions.assertEquals(nodes.get(2), identifier.getBucket(Arrays.asList("Hudi"))); + Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("bucket_index"))); + Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("consistent_hashing"))); + Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("bucket_index", "consistent_hashing"))); + int[] ref1 = {2, 2, 1, 1, 0, 1, 1, 1, 0, 1}; + int[] ref2 = {1, 0, 1, 0, 1, 1, 1, 0, 1, 2}; + for (int i = 0; i < 10; ++i) { + Assertions.assertEquals(nodes.get(ref1[i]), identifier.getBucket(Arrays.asList(Integer.toString(i)))); + Assertions.assertEquals(nodes.get(ref2[i]), identifier.getBucket(Arrays.asList(Integer.toString(i), Integer.toString(i + 1)))); + } + + // Get bucket by hash value + Assertions.assertEquals(nodes.get(0), identifier.getBucket(0)); + Assertions.assertEquals(nodes.get(0), identifier.getBucket(50)); + Assertions.assertEquals(nodes.get(0), identifier.getBucket(100)); + Assertions.assertEquals(nodes.get(1), identifier.getBucket(101)); + Assertions.assertEquals(nodes.get(1), identifier.getBucket(0x1fffffff)); + Assertions.assertEquals(nodes.get(1), identifier.getBucket(0x2fffffff)); + Assertions.assertEquals(nodes.get(2), identifier.getBucket(0x40000000)); + Assertions.assertEquals(nodes.get(2), identifier.getBucket(0x40000001)); + Assertions.assertEquals(nodes.get(2), identifier.getBucket(0x4fffffff)); + Assertions.assertEquals(nodes.get(0), identifier.getBucket(0x50000000)); + Assertions.assertEquals(nodes.get(0), identifier.getBucket(HASH_VALUE_MASK)); + + // Get bucket by file id + Assertions.assertEquals(nodes.get(0), identifier.getBucketByFileId(FSUtils.createNewFileId("0", 0))); + Assertions.assertEquals(nodes.get(1), identifier.getBucketByFileId(FSUtils.createNewFileId("1", 0))); + Assertions.assertEquals(nodes.get(2), identifier.getBucketByFileId(FSUtils.createNewFileId("2", 0))); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java new file mode 100644 index 0000000000000..a45b8a9aaa3a5 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.CellComparatorImpl; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; +import org.mockito.Mockito; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.TreeMap; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM; +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.apache.hudi.common.util.CollectionUtils.toStream; +import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR; +import static org.apache.hudi.io.storage.HoodieHFileReader.SCHEMA_KEY; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.Mockito.when; + +public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase { + private static final String DUMMY_BASE_PATH = "dummy_base_path"; + // Number of records in HFile fixtures for compatibility tests + private static final int NUM_RECORDS_FIXTURE = 50; + private static final String SIMPLE_SCHEMA_HFILE_SUFFIX = "_simple.hfile"; + 
private static final String COMPLEX_SCHEMA_HFILE_SUFFIX = "_complex.hfile"; + private static final String BOOTSTRAP_INDEX_HFILE_SUFFIX = "_bootstrap_index_partitions.hfile"; + + @Override + protected Path getFilePath() { + return new Path(tempDir.toString() + "/f1_1-0-1_000.hfile"); + } + + @Override + protected HoodieFileWriter createWriter( + Schema avroSchema, boolean populateMetaFields) throws Exception { + String instantTime = "000"; + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() + .withPath(DUMMY_BASE_PATH) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .bloomFilterNumEntries(1000).bloomFilterFPP(0.00001).build()) + .withPopulateMetaFields(populateMetaFields) + .build(); + Configuration conf = new Configuration(); + TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class); + Supplier partitionSupplier = Mockito.mock(Supplier.class); + when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier); + when(partitionSupplier.get()).thenReturn(10); + + return HoodieFileWriterFactory.newHFileFileWriter( + instantTime, getFilePath(), writeConfig, avroSchema, conf, mockTaskContextSupplier); + } + + @Override + protected HoodieFileReader createReader( + Configuration conf) throws Exception { + CacheConfig cacheConfig = new CacheConfig(conf); + return new HoodieHFileReader<>(conf, getFilePath(), cacheConfig, getFilePath().getFileSystem(conf)); + } + + @Override + protected void verifyMetadata(Configuration conf) throws IOException { + FileSystem fs = getFilePath().getFileSystem(conf); + HFile.Reader hfileReader = HoodieHFileUtils.createHFileReader(fs, getFilePath(), new CacheConfig(conf), conf); + assertEquals(HFILE_COMPARATOR.getClass(), hfileReader.getComparator().getClass()); + assertEquals(NUM_RECORDS, hfileReader.getEntries()); + } + + @Override + protected void verifySchema(Configuration conf, String schemaPath) throws IOException { + FileSystem fs = getFilePath().getFileSystem(conf); + HFile.Reader hfileReader = HoodieHFileUtils.createHFileReader(fs, getFilePath(), new CacheConfig(conf), conf); + assertEquals(getSchemaFromResource(TestHoodieHFileReaderWriter.class, schemaPath), + new Schema.Parser().parse(new String(hfileReader.getHFileInfo().get(SCHEMA_KEY.getBytes())))); + } + + private static Stream populateMetaFieldsAndTestAvroWithMeta() { + return Arrays.stream(new Boolean[][] { + {true, true}, + {false, true}, + {true, false}, + {false, false} + }).map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("populateMetaFieldsAndTestAvroWithMeta") + public void testWriteReadHFileWithMetaFields(boolean populateMetaFields, boolean testAvroWithMeta) throws Exception { + Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithMetaFields.avsc"); + HoodieFileWriter writer = createWriter(avroSchema, populateMetaFields); + List keys = new ArrayList<>(); + Map recordMap = new TreeMap<>(); + for (int i = 0; i < 100; i++) { + GenericRecord record = new GenericData.Record(avroSchema); + String key = String.format("%s%04d", "key", i); + record.put("_row_key", key); + keys.add(key); + record.put("time", Integer.toString(RANDOM.nextInt())); + record.put("number", i); + if (testAvroWithMeta) { + // payload does not matter. 
GenericRecord passed in is what matters + writer.writeAvroWithMetadata(new HoodieAvroRecord(new HoodieKey((String) record.get("_row_key"), + Integer.toString((Integer) record.get("number"))), new EmptyHoodieRecordPayload()).getKey(), record); + // only HoodieKey will be looked up from the 2nd arg(HoodieRecord). + } else { + writer.writeAvro(key, record); + } + recordMap.put(key, record); + } + writer.close(); + + Configuration conf = new Configuration(); + HoodieHFileReader hoodieHFileReader = (HoodieHFileReader) createReader(conf); + List records = HoodieHFileReader.readAllRecords(hoodieHFileReader); + assertEquals(new ArrayList<>(recordMap.values()), records); + + hoodieHFileReader.close(); + + for (int i = 0; i < 2; i++) { + int randomRowstoFetch = 5 + RANDOM.nextInt(10); + Set rowsToFetch = getRandomKeys(randomRowstoFetch, keys); + + List rowsList = new ArrayList<>(rowsToFetch); + Collections.sort(rowsList); + + List expectedRecords = rowsList.stream().map(recordMap::get).collect(Collectors.toList()); + + hoodieHFileReader = (HoodieHFileReader) createReader(conf); + List result = HoodieHFileReader.readRecords(hoodieHFileReader, rowsList); + + assertEquals(expectedRecords, result); + + result.forEach(entry -> { + if (populateMetaFields && testAvroWithMeta) { + assertNotNull(entry.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } else { + assertNull(entry.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } + }); + hoodieHFileReader.close(); + } + } + + @Override + @Test + public void testWriteReadWithEvolvedSchema() throws Exception { + // Disable the test with evolved schema for HFile since it's not supported + // TODO(HUDI-3683): fix the schema evolution for HFile + } + + @Test + public void testReadHFileFormatRecords() throws Exception { + writeFileWithSimpleSchema(); + FileSystem fs = FSUtils.getFs(getFilePath().toString(), new Configuration()); + byte[] content = FileIOUtils.readAsByteArray( + fs.open(getFilePath()), (int) fs.getFileStatus(getFilePath()).getLen()); + // Reading byte array in HFile format, without actual file path + HoodieHFileReader hfileReader = + new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content, Option.empty()); + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc"); + assertEquals(NUM_RECORDS, hfileReader.getTotalRecords()); + verifySimpleRecords(hfileReader.getRecordIterator(avroSchema)); + } + + @Test + public void testReaderGetRecordIterator() throws Exception { + writeFileWithSimpleSchema(); + HoodieHFileReader hfileReader = + (HoodieHFileReader) createReader(new Configuration()); + List keys = + IntStream.concat(IntStream.range(40, NUM_RECORDS * 2), IntStream.range(10, 20)) + .mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toList()); + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc"); + Iterator iterator = hfileReader.getRecordsByKeysIterator(keys, avroSchema); + + List expectedIds = + IntStream.concat(IntStream.range(40, NUM_RECORDS), IntStream.range(10, 20)) + .boxed().collect(Collectors.toList()); + int index = 0; + while (iterator.hasNext()) { + GenericRecord record = iterator.next(); + String key = "key" + String.format("%02d", expectedIds.get(index)); + assertEquals(key, record.get("_row_key").toString()); + assertEquals(Integer.toString(expectedIds.get(index)), record.get("time").toString()); + assertEquals(expectedIds.get(index), record.get("number")); + index++; + } + } + + @Test + public void 
testReaderGetRecordIteratorByKeyPrefixes() throws Exception { + writeFileWithSimpleSchema(); + HoodieHFileReader hfileReader = + (HoodieHFileReader) createReader(new Configuration()); + + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc"); + + List keyPrefixes = Collections.singletonList("key"); + Iterator iterator = + hfileReader.getRecordsByKeyPrefixIterator(keyPrefixes, avroSchema); + + List recordsByPrefix = toStream(iterator).collect(Collectors.toList()); + + List allRecords = toStream(hfileReader.getRecordIterator()).collect(Collectors.toList()); + + assertEquals(allRecords, recordsByPrefix); + + // filter for "key1" : entries from key10 to key19 should be matched + List expectedKey1s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key1")).collect(Collectors.toList()); + iterator = + hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key1"), avroSchema); + recordsByPrefix = + StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) + .collect(Collectors.toList()); + assertEquals(expectedKey1s, recordsByPrefix); + + // exact match + List expectedKey25 = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key25")).collect(Collectors.toList()); + iterator = + hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key25"), avroSchema); + recordsByPrefix = + StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) + .collect(Collectors.toList()); + assertEquals(expectedKey25, recordsByPrefix); + + // no match. key prefix is beyond entries in file. + iterator = + hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key99"), avroSchema); + recordsByPrefix = + StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) + .collect(Collectors.toList()); + assertEquals(Collections.emptyList(), recordsByPrefix); + + // no match. but keyPrefix is in between the entries found in file. + iterator = + hfileReader.getRecordsByKeyPrefixIterator(Collections.singletonList("key1234"), avroSchema); + recordsByPrefix = + StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) + .collect(Collectors.toList()); + assertEquals(Collections.emptyList(), recordsByPrefix); + + // filter for "key50" and "key1" : entries from key50 and 'key10 to key19' should be matched. + List expectedKey50and1s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key1") + || (entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList()); + iterator = + hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", "key1"), avroSchema); + recordsByPrefix = + StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) + .collect(Collectors.toList()); + assertEquals(expectedKey50and1s, recordsByPrefix); + + // filter for "key50" and "key0" : entries from key50 and 'key00 to key09' should be matched. 
+ List expectedKey50and0s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key0") + || (entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList()); + iterator = + hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", "key0"), avroSchema); + recordsByPrefix = + StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) + .collect(Collectors.toList()); + assertEquals(expectedKey50and0s, recordsByPrefix); + + // filter for "key1" and "key0" : entries from 'key10 to key19' and 'key00 to key09' should be matched. + List expectedKey1sand0s = allRecords.stream() + .filter(entry -> (entry.get("_row_key").toString()).contains("key1") || (entry.get("_row_key").toString()).contains("key0")) + .collect(Collectors.toList()); + iterator = + hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key1", "key0"), avroSchema); + recordsByPrefix = + StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) + .collect(Collectors.toList()); + Collections.sort(recordsByPrefix, new Comparator() { + @Override + public int compare(GenericRecord o1, GenericRecord o2) { + return o1.get("_row_key").toString().compareTo(o2.get("_row_key").toString()); + } + }); + assertEquals(expectedKey1sand0s, recordsByPrefix); + } + + @ParameterizedTest + @ValueSource(strings = { + "/hudi_0_9_hbase_1_2_3", "/hudi_0_10_hbase_1_2_3", "/hudi_0_11_hbase_2_4_9"}) + public void testHoodieHFileCompatibility(String hfilePrefix) throws IOException { + // This fixture is generated from TestHoodieReaderWriterBase#testWriteReadPrimitiveRecord() + // using different Hudi releases + String simpleHFile = hfilePrefix + SIMPLE_SCHEMA_HFILE_SUFFIX; + // This fixture is generated from TestHoodieReaderWriterBase#testWriteReadComplexRecord() + // using different Hudi releases + String complexHFile = hfilePrefix + COMPLEX_SCHEMA_HFILE_SUFFIX; + // This fixture is generated from TestBootstrapIndex#testBootstrapIndex() + // using different Hudi releases. 
The file is copied from .hoodie/.aux/.bootstrap/.partitions/ + String bootstrapIndexFile = hfilePrefix + BOOTSTRAP_INDEX_HFILE_SUFFIX; + + FileSystem fs = FSUtils.getFs(getFilePath().toString(), new Configuration()); + byte[] content = readHFileFromResources(simpleHFile); + verifyHFileReader( + HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content), + hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE); + HoodieHFileReader hfileReader = + new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content, Option.empty()); + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc"); + assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords()); + verifySimpleRecords(hfileReader.getRecordIterator(avroSchema)); + + content = readHFileFromResources(complexHFile); + verifyHFileReader(HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content), + hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE); + hfileReader = new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content, Option.empty()); + avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchemaWithUDT.avsc"); + assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords()); + verifySimpleRecords(hfileReader.getRecordIterator(avroSchema)); + + content = readHFileFromResources(bootstrapIndexFile); + verifyHFileReader(HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content), + hfilePrefix, false, HFileBootstrapIndex.HoodieKVComparator.class, 4); + } + + private Set getRandomKeys(int count, List keys) { + Set rowKeys = new HashSet<>(); + int totalKeys = keys.size(); + while (rowKeys.size() < count) { + int index = RANDOM.nextInt(totalKeys); + if (!rowKeys.contains(index)) { + rowKeys.add(keys.get(index)); + } + } + return rowKeys; + } + + private byte[] readHFileFromResources(String filename) throws IOException { + long size = TestHoodieHFileReaderWriter.class + .getResource(filename).openConnection().getContentLength(); + return FileIOUtils.readAsByteArray( + TestHoodieHFileReaderWriter.class.getResourceAsStream(filename), (int) size); + } + + private void verifyHFileReader( + HFile.Reader reader, String hfileName, boolean mayUseDefaultComparator, + Class clazz, int count) { + // HFile version is 3 + assertEquals(3, reader.getTrailer().getMajorVersion()); + if (mayUseDefaultComparator && hfileName.contains("hudi_0_9")) { + // Pre Hudi 0.10, the default comparator is used for metadata table HFiles + // For bootstrap index HFiles, the custom comparator is always used + assertEquals(CellComparatorImpl.class, reader.getComparator().getClass()); + } else { + assertEquals(clazz, reader.getComparator().getClass()); + } + assertEquals(count, reader.getEntries()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java new file mode 100644 index 0000000000000..373fc31a56272 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.config.HoodieStorageConfig; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.mockito.Mockito; + +import java.io.IOException; +import java.util.function.Supplier; + +import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY; +import static org.apache.hudi.io.storage.HoodieOrcConfig.AVRO_SCHEMA_METADATA_KEY; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.when; + +public class TestHoodieOrcReaderWriter extends TestHoodieReaderWriterBase { + + @Override + protected Path getFilePath() { + return new Path(tempDir.toString() + "/f1_1-0-1_000.orc"); + } + + @Override + protected HoodieFileWriter createWriter( + Schema avroSchema, boolean populateMetaFields) throws Exception { + BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name()); + Configuration conf = new Configuration(); + int orcStripSize = Integer.parseInt(HoodieStorageConfig.ORC_STRIPE_SIZE.defaultValue()); + int orcBlockSize = Integer.parseInt(HoodieStorageConfig.ORC_BLOCK_SIZE.defaultValue()); + int maxFileSize = Integer.parseInt(HoodieStorageConfig.ORC_FILE_MAX_SIZE.defaultValue()); + HoodieOrcConfig config = new HoodieOrcConfig(conf, CompressionKind.ZLIB, orcStripSize, orcBlockSize, maxFileSize, filter); + TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class); + Supplier partitionSupplier = Mockito.mock(Supplier.class); + when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier); + when(partitionSupplier.get()).thenReturn(10); + String instantTime = "000"; + return new HoodieOrcWriter<>(instantTime, getFilePath(), config, avroSchema, mockTaskContextSupplier); + } + + @Override + protected HoodieFileReader createReader( + Configuration conf) throws Exception { + return HoodieFileReaderFactory.getFileReader(conf, getFilePath()); + } + + @Override + protected void verifyMetadata(Configuration conf) throws IOException { + Reader orcReader = OrcFile.createReader(getFilePath(), OrcFile.readerOptions(conf)); + assertEquals(4, orcReader.getMetadataKeys().size()); + assertTrue(orcReader.getMetadataKeys().contains(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER)); + 
assertTrue(orcReader.getMetadataKeys().contains(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)); + assertTrue(orcReader.getMetadataKeys().contains(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY)); + assertTrue(orcReader.getMetadataKeys().contains(AVRO_SCHEMA_METADATA_KEY)); + assertEquals(CompressionKind.ZLIB.name(), orcReader.getCompressionKind().toString()); + assertEquals(NUM_RECORDS, orcReader.getNumberOfRows()); + } + + @Override + protected void verifySchema(Configuration conf, String schemaPath) throws IOException { + Reader orcReader = OrcFile.createReader(getFilePath(), OrcFile.readerOptions(conf)); + if ("/exampleSchema.avsc".equals(schemaPath)) { + assertEquals("struct<_row_key:string,time:string,number:int>", + orcReader.getSchema().toString()); + } else if ("/exampleSchemaWithUDT.avsc".equals(schemaPath)) { + assertEquals("struct<_row_key:string,time:string,number:int,driver:struct,map:map>>", + orcReader.getSchema().toString()); + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java new file mode 100644 index 0000000000000..902f42e38f32b --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.model.HoodieKey; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Abstract class for unit tests of {@link HoodieFileReader} and {@link HoodieFileWriter} + * for different file format + */ +public abstract class TestHoodieReaderWriterBase { + protected static final int NUM_RECORDS = 50; + @TempDir + protected File tempDir; + + protected abstract Path getFilePath(); + + protected abstract HoodieFileWriter createWriter( + Schema avroSchema, boolean populateMetaFields) throws Exception; + + protected abstract HoodieFileReader createReader( + Configuration conf) throws Exception; + + protected abstract void verifyMetadata(Configuration conf) throws IOException; + + protected abstract void verifySchema(Configuration conf, String schemaPath) throws IOException; + + @BeforeEach + @AfterEach + public void clearTempFile() { + File file = new File(getFilePath().toString()); + if (file.exists()) { + file.delete(); + } + } + + @Test + public void testWriteReadMetadata() throws Exception { + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc"); + writeFileWithSimpleSchema(); + + Configuration conf = new Configuration(); + verifyMetadata(conf); + + HoodieFileReader hoodieReader = createReader(conf); + BloomFilter filter = hoodieReader.readBloomFilter(); + for (int i = 0; i < NUM_RECORDS; i++) { + String key = "key" + String.format("%02d", i); + assertTrue(filter.mightContain(key)); + } + assertFalse(filter.mightContain("non-existent-key")); + assertEquals(avroSchema, hoodieReader.getSchema()); + assertEquals(NUM_RECORDS, hoodieReader.getTotalRecords()); + String[] minMaxRecordKeys = hoodieReader.readMinMaxRecordKeys(); + assertEquals(2, minMaxRecordKeys.length); + assertEquals("key00", minMaxRecordKeys[0]); + assertEquals("key" + (NUM_RECORDS - 1), minMaxRecordKeys[1]); + } + + @Test + public void testWriteReadPrimitiveRecord() throws Exception { + String schemaPath = "/exampleSchema.avsc"; + writeFileWithSimpleSchema(); + + Configuration conf = new Configuration(); + verifyMetadata(conf); + verifySchema(conf, schemaPath); + verifySimpleRecords(createReader(conf).getRecordIterator()); + } + + @Test + public void testWriteReadComplexRecord() throws Exception { + String schemaPath = "/exampleSchemaWithUDT.avsc"; + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, schemaPath); + Schema udtSchema = avroSchema.getField("driver").schema().getTypes().get(1); + HoodieFileWriter writer = 
createWriter(avroSchema, true); + for (int i = 0; i < NUM_RECORDS; i++) { + GenericRecord record = new GenericData.Record(avroSchema); + String key = "key" + String.format("%02d", i); + record.put("_row_key", key); + record.put("time", Integer.toString(i)); + record.put("number", i); + GenericRecord innerRecord = new GenericData.Record(udtSchema); + innerRecord.put("driver_name", "driver" + i); + innerRecord.put("list", Collections.singletonList(i)); + innerRecord.put("map", Collections.singletonMap(key, "value" + i)); + record.put("driver", innerRecord); + writer.writeAvro(key, record); + } + writer.close(); + + Configuration conf = new Configuration(); + verifyMetadata(conf); + verifySchema(conf, schemaPath); + verifyComplexRecords(createReader(conf).getRecordIterator()); + } + + @Test + public void testWriteReadWithEvolvedSchema() throws Exception { + writeFileWithSimpleSchema(); + + Configuration conf = new Configuration(); + HoodieFileReader hoodieReader = createReader(conf); + String[] schemaList = new String[] { + "/exampleEvolvedSchema.avsc", "/exampleEvolvedSchemaChangeOrder.avsc", + "/exampleEvolvedSchemaColumnRequire.avsc", "/exampleEvolvedSchemaColumnType.avsc", + "/exampleEvolvedSchemaDeleteColumn.avsc"}; + + for (String evolvedSchemaPath : schemaList) { + verifyReaderWithSchema(evolvedSchemaPath, hoodieReader); + } + } + + @Test + public void testReaderFilterRowKeys() throws Exception { + writeFileWithSchemaWithMeta(); + Configuration conf = new Configuration(); + verifyMetadata(conf); + verifyFilterRowKeys(createReader(conf)); + } + + protected void writeFileWithSimpleSchema() throws Exception { + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc"); + HoodieFileWriter writer = createWriter(avroSchema, true); + for (int i = 0; i < NUM_RECORDS; i++) { + GenericRecord record = new GenericData.Record(avroSchema); + String key = "key" + String.format("%02d", i); + record.put("_row_key", key); + record.put("time", Integer.toString(i)); + record.put("number", i); + writer.writeAvro(key, record); + } + writer.close(); + } + + protected void writeFileWithSchemaWithMeta() throws Exception { + Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchemaWithMetaFields.avsc"); + HoodieFileWriter writer = createWriter(avroSchema, true); + for (int i = 0; i < NUM_RECORDS; i++) { + GenericRecord record = new GenericData.Record(avroSchema); + String key = "key" + String.format("%02d", i); + record.put("_row_key", key); + record.put("time", Integer.toString(i)); + record.put("number", i); + writer.writeAvroWithMetadata(new HoodieKey((String) record.get("_row_key"), + Integer.toString((Integer) record.get("number"))), record); + } + writer.close(); + } + + protected void verifySimpleRecords(Iterator iterator) { + int index = 0; + while (iterator.hasNext()) { + GenericRecord record = iterator.next(); + String key = "key" + String.format("%02d", index); + assertEquals(key, record.get("_row_key").toString()); + assertEquals(Integer.toString(index), record.get("time").toString()); + assertEquals(index, record.get("number")); + index++; + } + } + + protected void verifyComplexRecords(Iterator iterator) { + int index = 0; + while (iterator.hasNext()) { + GenericRecord record = iterator.next(); + String key = "key" + String.format("%02d", index); + assertEquals(key, record.get("_row_key").toString()); + assertEquals(Integer.toString(index), record.get("time").toString()); + assertEquals(index, record.get("number")); + 
GenericRecord innerRecord = (GenericRecord) record.get("driver"); + assertEquals("driver" + index, innerRecord.get("driver_name").toString()); + assertEquals(1, ((List) innerRecord.get("list")).size()); + assertEquals(index, ((List) innerRecord.get("list")).get(0)); + Map mapping = (Map) innerRecord.get("map"); + boolean match = false; + for (Object innerKey : mapping.keySet()) { + // The innerKey may not be in the type of String, so we have to + // use the following logic for validation + if (innerKey.toString().equals(key)) { + assertEquals("value" + index, mapping.get(innerKey).toString()); + match = true; + } + } + assertTrue(match); + index++; + } + } + + private void verifyFilterRowKeys(HoodieFileReader hoodieReader) { + Set candidateRowKeys = IntStream.range(40, NUM_RECORDS * 2) + .mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toCollection(TreeSet::new)); + List expectedKeys = IntStream.range(40, NUM_RECORDS) + .mapToObj(i -> "key" + String.format("%02d", i)).sorted().collect(Collectors.toList()); + assertEquals(expectedKeys, hoodieReader.filterRowKeys(candidateRowKeys) + .stream().sorted().collect(Collectors.toList())); + } + + private void verifyReaderWithSchema(String schemaPath, HoodieFileReader hoodieReader) throws IOException { + Schema evolvedSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, schemaPath); + Iterator iter = hoodieReader.getRecordIterator(evolvedSchema); + int index = 0; + while (iter.hasNext()) { + verifyRecord(schemaPath, iter.next(), index); + index++; + } + } + + private void verifyRecord(String schemaPath, GenericRecord record, int index) { + String numStr = String.format("%02d", index); + assertEquals("key" + numStr, record.get("_row_key").toString()); + assertEquals(Integer.toString(index), record.get("time").toString()); + if ("/exampleEvolvedSchemaColumnType.avsc".equals(schemaPath)) { + assertEquals(Integer.toString(index), record.get("number").toString()); + } else if ("/exampleEvolvedSchemaDeleteColumn.avsc".equals(schemaPath)) { + assertNull(record.get("number")); + } else { + assertEquals(index, record.get("number")); + } + assertNull(record.get("added_field")); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/TestKeyGenUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/TestKeyGenUtils.java new file mode 100644 index 0000000000000..43f5952e4927c --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/TestKeyGenUtils.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.keygen; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class TestKeyGenUtils { + + @Test + public void testExtractRecordKeys() { + // test complex key form: field1:val1,field2:val2,... + String[] s1 = KeyGenUtils.extractRecordKeys("id:1"); + Assertions.assertArrayEquals(new String[] {"1"}, s1); + + String[] s2 = KeyGenUtils.extractRecordKeys("id:1,id:2"); + Assertions.assertArrayEquals(new String[] {"1", "2"}, s2); + + String[] s3 = KeyGenUtils.extractRecordKeys("id:1,id2:__null__,id3:__empty__"); + Assertions.assertArrayEquals(new String[] {"1", null, ""}, s3); + + String[] s4 = KeyGenUtils.extractRecordKeys("id:ab:cd,id2:ef"); + Assertions.assertArrayEquals(new String[] {"ab:cd", "ef"}, s4); + + // test simple key form: val1 + String[] s5 = KeyGenUtils.extractRecordKeys("1"); + Assertions.assertArrayEquals(new String[] {"1"}, s5); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/factory/TestCreateAvroKeyGeneratorByTypeWithFactory.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/factory/TestCreateAvroKeyGeneratorByTypeWithFactory.java new file mode 100644 index 0000000000000..b69d84442bcce --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/factory/TestCreateAvroKeyGeneratorByTypeWithFactory.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
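TestKeyGenUtils above pins down the composite record-key encoding "field1:val1,field2:val2,..." with "__null__" and "__empty__" standing in for null and empty values, plus the bare-value simple form. A rough sketch of that decoding contract (illustrative only, the real KeyGenUtils.extractRecordKeys covers more cases):

import java.util.Arrays;

public class RecordKeySketch {
  static String[] extract(String recordKey) {
    if (!recordKey.contains(":")) {
      return new String[] {recordKey}; // simple key form: the value itself
    }
    String[] parts = recordKey.split(",");
    String[] values = new String[parts.length];
    for (int i = 0; i < parts.length; i++) {
      // cut only at the first ':' so values such as "ab:cd" survive intact
      String value = parts[i].substring(parts[i].indexOf(':') + 1);
      values[i] = "__null__".equals(value) ? null : "__empty__".equals(value) ? "" : value;
    }
    return values;
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(extract("id:ab:cd,id2:ef")));   // [ab:cd, ef]
    System.out.println(Arrays.toString(extract("id:1,id2:__null__"))); // [1, null]
    System.out.println(Arrays.toString(extract("1")));                 // [1]
  }
}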
+ */ + +package org.apache.hudi.keygen.factory; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieKeyGeneratorException; +import org.apache.hudi.keygen.ComplexAvroKeyGenerator; +import org.apache.hudi.keygen.CustomAvroKeyGenerator; +import org.apache.hudi.keygen.GlobalAvroDeleteKeyGenerator; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; +import org.apache.hudi.keygen.SimpleAvroKeyGenerator; +import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.keygen.constant.KeyGeneratorType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.stream.Stream; + +public class TestCreateAvroKeyGeneratorByTypeWithFactory { + + private TypedProperties props; + + private static Stream configParams() { + String[] types = {KeyGeneratorType.SIMPLE.name(), KeyGeneratorType.TIMESTAMP.name(), KeyGeneratorType.COMPLEX.name(), + KeyGeneratorType.CUSTOM.name(), KeyGeneratorType.NON_PARTITION.name(), KeyGeneratorType.GLOBAL_DELETE.name()}; + return Stream.of(types).map(Arguments::of); + } + + @BeforeEach + public void init() { + props = new TypedProperties(); + props.put(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "_row_key"); + props.put(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key(), "true"); + props.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "timestamp"); + + // for timestamp based key generator + props.put("hoodie.deltastreamer.keygen.timebased.timestamp.type", "DATE_STRING"); + props.put("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd"); + props.put("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyyMMdd"); + } + + @AfterEach + public void teardown() { + props = null; + } + + @ParameterizedTest + @MethodSource("configParams") + public void testKeyGeneratorTypes(String keyGenType) throws IOException { + props.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), keyGenType); + KeyGeneratorType keyType = KeyGeneratorType.valueOf(keyGenType); + + KeyGenerator keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator(props); + switch (keyType) { + case SIMPLE: + Assertions.assertEquals(SimpleAvroKeyGenerator.class.getName(), keyGenerator.getClass().getName()); + return; + case COMPLEX: + Assertions.assertEquals(ComplexAvroKeyGenerator.class.getName(), keyGenerator.getClass().getName()); + return; + case TIMESTAMP: + Assertions.assertEquals(TimestampBasedAvroKeyGenerator.class.getName(), keyGenerator.getClass().getName()); + return; + case CUSTOM: + Assertions.assertEquals(CustomAvroKeyGenerator.class.getName(), keyGenerator.getClass().getName()); + return; + case NON_PARTITION: + Assertions.assertEquals(NonpartitionedAvroKeyGenerator.class.getName(), keyGenerator.getClass().getName()); + return; + case GLOBAL_DELETE: + Assertions.assertEquals(GlobalAvroDeleteKeyGenerator.class.getName(), keyGenerator.getClass().getName()); + return; + default: + throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGenType); + } + } +} diff --git 
a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/factory/TestHoodieAvroKeyGeneratorFactory.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/factory/TestHoodieAvroKeyGeneratorFactory.java new file mode 100644 index 0000000000000..c3be6284520e2 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/keygen/factory/TestHoodieAvroKeyGeneratorFactory.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.keygen.factory; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieKeyGeneratorException; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.SimpleAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.keygen.constant.KeyGeneratorType; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestHoodieAvroKeyGeneratorFactory { + @Test + public void testKeyGeneratorFactory() throws IOException { + TypedProperties props = getCommonProps(); + + // set KeyGenerator type only + props.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), KeyGeneratorType.SIMPLE.name()); + KeyGenerator keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator(props); + Assertions.assertEquals(SimpleAvroKeyGenerator.class.getName(), keyGenerator.getClass().getName()); + + // set KeyGenerator class only + props = getCommonProps(); + props.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), SimpleAvroKeyGenerator.class.getName()); + KeyGenerator keyGenerator2 = HoodieAvroKeyGeneratorFactory.createKeyGenerator(props); + Assertions.assertEquals(SimpleAvroKeyGenerator.class.getName(), keyGenerator2.getClass().getName()); + + // set both class name and keyGenerator type + props.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), KeyGeneratorType.CUSTOM.name()); + KeyGenerator keyGenerator3 = HoodieAvroKeyGeneratorFactory.createKeyGenerator(props); + // KEYGENERATOR_TYPE_PROP was overwritten by KEYGENERATOR_CLASS_PROP + Assertions.assertEquals(SimpleAvroKeyGenerator.class.getName(), keyGenerator3.getClass().getName()); + + // set wrong class name + final TypedProperties props2 = getCommonProps(); + props2.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), TestHoodieAvroKeyGeneratorFactory.class.getName()); + assertThrows(IOException.class, () -> HoodieAvroKeyGeneratorFactory.createKeyGenerator(props2)); + + // set wrong keyGenerator type + final TypedProperties props3 = getCommonProps(); + props3.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), "wrong_type"); +
assertThrows(HoodieKeyGeneratorException.class, () -> HoodieAvroKeyGeneratorFactory.createKeyGenerator(props3)); + } + + private TypedProperties getCommonProps() { + TypedProperties properties = new TypedProperties(); + properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "_row_key"); + properties.put(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key(), "true"); + properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "timestamp"); + return properties; + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieConsoleMetrics.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieConsoleMetrics.java index 7424d0b92695d..467a9f7929339 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieConsoleMetrics.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieConsoleMetrics.java @@ -22,21 +22,26 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; import static org.apache.hudi.metrics.Metrics.registerGauge; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +@ExtendWith(MockitoExtension.class) public class TestHoodieConsoleMetrics { - HoodieWriteConfig config = mock(HoodieWriteConfig.class); + @Mock + HoodieWriteConfig config; @BeforeEach public void start() { + when(config.getTableName()).thenReturn("console_metrics_test"); when(config.isMetricsOn()).thenReturn(true); when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.CONSOLE); - new HoodieMetrics(config, "raw_table"); + new HoodieMetrics(config); } @AfterEach diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieGraphiteMetrics.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieGraphiteMetrics.java new file mode 100644 index 0000000000000..6ff7ee88ac8fb --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieGraphiteMetrics.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
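The metrics test changes in this patch repeatedly swap mock(HoodieWriteConfig.class) field initializers for @ExtendWith(MockitoExtension.class) with @Mock fields, and add a getTableName() stub now that HoodieMetrics is constructed from the config alone. The bare pattern, sketched against an illustrative interface rather than a Hudi class:

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.when;

@ExtendWith(MockitoExtension.class)
class MockedConfigSketchTest {
  interface WriteConfig { // illustrative stand-in for the mocked config type
    String getTableName();
  }

  @Mock
  WriteConfig config;

  @Test
  void stubbedValueIsReturned() {
    when(config.getTableName()).thenReturn("raw_table");
    assertEquals("raw_table", config.getTableName());
  }
}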
+ */ + +package org.apache.hudi.metrics; + +import org.apache.hudi.common.testutils.NetworkTestUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import static org.apache.hudi.metrics.Metrics.registerGauge; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.when; + +/** + * Test for the Graphite metrics report. + */ +@ExtendWith(MockitoExtension.class) +public class TestHoodieGraphiteMetrics { + + @Mock + HoodieWriteConfig config; + + @AfterEach + void shutdownMetrics() { + Metrics.shutdown(); + } + + @Test + public void testRegisterGauge() { + when(config.isMetricsOn()).thenReturn(true); + when(config.getTableName()).thenReturn("table1"); + when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.GRAPHITE); + when(config.getGraphiteServerHost()).thenReturn("localhost"); + when(config.getGraphiteServerPort()).thenReturn(NetworkTestUtils.nextFreePort()); + when(config.getGraphiteReportPeriodSeconds()).thenReturn(30); + new HoodieMetrics(config); + registerGauge("graphite_metric", 123L); + assertEquals("123", Metrics.getInstance().getRegistry().getGauges() + .get("graphite_metric").getValue().toString()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieJmxMetrics.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieJmxMetrics.java index 7b63a300f3a98..a752aa36eca9a 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieJmxMetrics.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieJmxMetrics.java @@ -21,27 +21,38 @@ import org.apache.hudi.common.testutils.NetworkTestUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; import static org.apache.hudi.metrics.Metrics.registerGauge; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; /** * Test for the Jmx metrics report. 
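The Graphite test above and the JMX tests that follow share one shape: stub just enough of the config to construct HoodieMetrics, register a gauge, then read it back from the shared registry. The underlying Dropwizard round trip looks roughly like this sketch (names are illustrative):

import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;

public class GaugeSketch {
  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    registry.register("graphite_metric", (Gauge<Long>) () -> 123L);
    // getGauges() exposes the gauge by name, which is what the assertions above rely on
    System.out.println(registry.getGauges().get("graphite_metric").getValue()); // 123
  }
}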
*/ +@ExtendWith(MockitoExtension.class) public class TestHoodieJmxMetrics { - HoodieWriteConfig config = mock(HoodieWriteConfig.class); + @Mock + HoodieWriteConfig config; + + @AfterEach + void shutdownMetrics() { + Metrics.shutdown(); + } @Test public void testRegisterGauge() { when(config.isMetricsOn()).thenReturn(true); + when(config.getTableName()).thenReturn("foo"); when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.JMX); when(config.getJmxHost()).thenReturn("localhost"); when(config.getJmxPort()).thenReturn(String.valueOf(NetworkTestUtils.nextFreePort())); - new HoodieMetrics(config, "raw_table"); + new HoodieMetrics(config); registerGauge("jmx_metric1", 123L); assertEquals("123", Metrics.getInstance().getRegistry().getGauges() .get("jmx_metric1").getValue().toString()); @@ -50,10 +61,11 @@ public void testRegisterGauge() { @Test public void testRegisterGaugeByRangerPort() { when(config.isMetricsOn()).thenReturn(true); + when(config.getTableName()).thenReturn("foo"); when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.JMX); when(config.getJmxHost()).thenReturn("localhost"); when(config.getJmxPort()).thenReturn(String.valueOf(NetworkTestUtils.nextFreePort())); - new HoodieMetrics(config, "raw_table"); + new HoodieMetrics(config); registerGauge("jmx_metric2", 123L); assertEquals("123", Metrics.getInstance().getRegistry().getGauges() .get("jmx_metric2").getValue().toString()); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieMetrics.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieMetrics.java index 41842b1be370f..a5ea531c9280a 100755 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieMetrics.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestHoodieMetrics.java @@ -19,11 +19,17 @@ package org.apache.hudi.metrics; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import com.codahale.metrics.Timer; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; import java.util.Random; import java.util.stream.Stream; @@ -34,16 +40,24 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +@ExtendWith(MockitoExtension.class) public class TestHoodieMetrics { - private HoodieMetrics metrics; + @Mock + HoodieWriteConfig config; + HoodieMetrics metrics; @BeforeEach - public void start() { - HoodieWriteConfig config = mock(HoodieWriteConfig.class); + void setUp() { when(config.isMetricsOn()).thenReturn(true); + when(config.getTableName()).thenReturn("raw_table"); when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.INMEMORY); - metrics = new HoodieMetrics(config, "raw_table"); + metrics = new HoodieMetrics(config); + } + + @AfterEach + void shutdownMetrics() { + Metrics.shutdown(); } @Test @@ -123,6 +137,7 @@ public void testTimerCtx() throws InterruptedException { when(metadata.getTotalCompactedRecordsUpdated()).thenReturn(randomValue + 11); when(metadata.getTotalLogFilesCompacted()).thenReturn(randomValue + 12); when(metadata.getTotalLogFilesSize()).thenReturn(randomValue + 13); + 
when(metadata.getMinAndMaxEventTime()).thenReturn(Pair.of(Option.empty(), Option.empty())); metrics.updateCommitMetrics(randomValue + 14, commitTimer.stop(), metadata, action); String metricname = metrics.getMetricsName(action, "duration"); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java index 317f15230c3c7..390f585ebb73f 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java @@ -19,17 +19,17 @@ package org.apache.hudi.metrics; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.metrics.custom.CustomizableMetricsReporter; import com.codahale.metrics.MetricRegistry; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.metrics.userdefined.AbstractUserDefinedMetricsReporter; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import java.io.Closeable; import java.util.Properties; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -57,12 +57,12 @@ public void metricsReporterFactoryShouldReturnReporter() { public void metricsReporterFactoryShouldReturnUserDefinedReporter() { when(config.getMetricReporterClassName()).thenReturn(DummyMetricsReporter.class.getName()); - Properties props = new Properties(); + TypedProperties props = new TypedProperties(); props.setProperty("testKey", "testValue"); when(config.getProps()).thenReturn(props); MetricsReporter reporter = MetricsReporterFactory.createReporter(config, registry); - assertTrue(reporter instanceof AbstractUserDefinedMetricsReporter); + assertTrue(reporter instanceof CustomizableMetricsReporter); assertEquals(props, ((DummyMetricsReporter) reporter).getProps()); assertEquals(registry, ((DummyMetricsReporter) reporter).getRegistry()); } @@ -70,11 +70,11 @@ public void metricsReporterFactoryShouldReturnUserDefinedReporter() { @Test public void metricsReporterFactoryShouldThrowExceptionWhenMetricsReporterClassIsIllegal() { when(config.getMetricReporterClassName()).thenReturn(IllegalTestMetricsReporter.class.getName()); - when(config.getProps()).thenReturn(new Properties()); + when(config.getProps()).thenReturn(new TypedProperties()); assertThrows(HoodieException.class, () -> MetricsReporterFactory.createReporter(config, registry)); } - public static class DummyMetricsReporter extends AbstractUserDefinedMetricsReporter { + public static class DummyMetricsReporter extends CustomizableMetricsReporter { public DummyMetricsReporter(Properties props, MetricRegistry registry) { super(props, registry); @@ -86,11 +86,6 @@ public void start() {} @Override public void report() {} - @Override - public Closeable getReporter() { - return null; - } - @Override public void stop() {} } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/cloudwatch/TestCloudWatchMetricsReporter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/cloudwatch/TestCloudWatchMetricsReporter.java new file mode 100644 index 0000000000000..7901d80246513 --- /dev/null +++ 
b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/cloudwatch/TestCloudWatchMetricsReporter.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics.cloudwatch; + +import org.apache.hudi.aws.cloudwatch.CloudWatchReporter; +import org.apache.hudi.config.HoodieWriteConfig; + +import com.codahale.metrics.MetricRegistry; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.concurrent.TimeUnit; + +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +public class TestCloudWatchMetricsReporter { + + @Mock + private HoodieWriteConfig config; + + @Mock + private MetricRegistry registry; + + @Mock + private CloudWatchReporter reporter; + + @Test + public void testReporter() { + when(config.getCloudWatchReportPeriodSeconds()).thenReturn(30); + CloudWatchMetricsReporter metricsReporter = new CloudWatchMetricsReporter(config, registry, reporter); + + metricsReporter.start(); + verify(reporter, times(1)).start(30, TimeUnit.SECONDS); + + metricsReporter.report(); + verify(reporter, times(1)).report(); + + metricsReporter.stop(); + verify(reporter, times(1)).stop(); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogHttpClient.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogHttpClient.java index 5767d189d35dc..e968190522465 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogHttpClient.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogHttpClient.java @@ -23,10 +23,13 @@ import org.apache.http.StatusLine; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.log4j.AppenderSkeleton; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.spi.LoggingEvent; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.Appender; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.params.ParameterizedTest; @@ -40,12 +43,14 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.UUID; import static 
org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.reset; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -53,10 +58,10 @@ public class TestDatadogHttpClient { @Mock - AppenderSkeleton appender; + Appender appender; @Captor - ArgumentCaptor logCaptor; + ArgumentCaptor logCaptor; @Mock CloseableHttpClient httpClient; @@ -67,6 +72,27 @@ public class TestDatadogHttpClient { @Mock StatusLine statusLine; + private Level initialLogLevel; + + @BeforeEach + void prepareAppender() { + when(appender.getName()).thenReturn("MockAppender-" + UUID.randomUUID()); + when(appender.isStarted()).thenReturn(true); + when(appender.isStopped()).thenReturn(false); + Logger logger = (Logger) LogManager.getLogger(DatadogHttpClient.class); + initialLogLevel = logger.getLevel(); + logger.setLevel(Level.DEBUG); + logger.addAppender(appender); + } + + @AfterEach + void resetMocks() { + Logger logger = (Logger) LogManager.getLogger(DatadogHttpClient.class); + logger.setLevel(initialLogLevel); + logger.removeAppender(appender); + reset(appender, httpClient, httpResponse, statusLine); + } + private void mockResponse(int statusCode) { when(statusLine.getStatusCode()).thenReturn(statusCode); when(httpResponse.getStatusLine()).thenReturn(statusLine); @@ -99,41 +125,38 @@ public void validateApiKeyShouldThrowExceptionWhenResponseNotSuccessful() { @Test public void sendPayloadShouldLogWhenRequestFailed() throws IOException { - Logger.getRootLogger().addAppender(appender); when(httpClient.execute(any())).thenThrow(IOException.class); DatadogHttpClient ddClient = new DatadogHttpClient(ApiSite.US, "foo", true, httpClient); ddClient.send("{}"); - verify(appender).doAppend(logCaptor.capture()); - assertEquals("Failed to send to Datadog.", logCaptor.getValue().getRenderedMessage()); + verify(appender).append(logCaptor.capture()); + assertEquals("Failed to send to Datadog.", logCaptor.getValue().getMessage().getFormattedMessage()); assertEquals(Level.WARN, logCaptor.getValue().getLevel()); } @Test public void sendPayloadShouldLogUnsuccessfulSending() { - Logger.getRootLogger().addAppender(appender); mockResponse(401); when(httpResponse.toString()).thenReturn("unauthorized"); DatadogHttpClient ddClient = new DatadogHttpClient(ApiSite.US, "foo", true, httpClient); ddClient.send("{}"); - verify(appender).doAppend(logCaptor.capture()); - assertEquals("Failed to send to Datadog. Response was unauthorized", logCaptor.getValue().getRenderedMessage()); + verify(appender).append(logCaptor.capture()); + assertEquals("Failed to send to Datadog. 
Response was unauthorized", logCaptor.getValue().getMessage().getFormattedMessage()); assertEquals(Level.WARN, logCaptor.getValue().getLevel()); } @Test public void sendPayloadShouldLogSuccessfulSending() { - Logger.getRootLogger().addAppender(appender); mockResponse(202); DatadogHttpClient ddClient = new DatadogHttpClient(ApiSite.US, "foo", true, httpClient); ddClient.send("{}"); - verify(appender).doAppend(logCaptor.capture()); - assertTrue(logCaptor.getValue().getRenderedMessage().startsWith("Sent metrics data")); + verify(appender).append(logCaptor.capture()); + assertTrue(logCaptor.getValue().getMessage().getFormattedMessage().startsWith("Sent metrics data")); assertEquals(Level.DEBUG, logCaptor.getValue().getLevel()); } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogMetricsReporter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogMetricsReporter.java index 3cab8f682e027..2514a489563ec 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogMetricsReporter.java @@ -19,9 +19,11 @@ package org.apache.hudi.metrics.datadog; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metrics.Metrics; import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite; import com.codahale.metrics.MetricRegistry; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -43,6 +45,11 @@ public class TestDatadogMetricsReporter { @Mock MetricRegistry registry; + @AfterEach + void shutdownMetrics() { + Metrics.shutdown(); + } + @Test public void instantiationShouldFailWhenNoApiKey() { when(config.getDatadogApiKey()).thenReturn(""); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogReporter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogReporter.java index 1654e1648b053..4166a3e8060b3 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogReporter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestDatadogReporter.java @@ -24,10 +24,12 @@ import com.codahale.metrics.MetricFilter; import com.codahale.metrics.MetricRegistry; -import org.apache.log4j.AppenderSkeleton; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.spi.LoggingEvent; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.Appender; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.ArgumentCaptor; @@ -37,20 +39,23 @@ import java.io.IOException; import java.util.Arrays; +import java.util.UUID; import java.util.concurrent.TimeUnit; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.reset; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; @ExtendWith(MockitoExtension.class) public class TestDatadogReporter { @Mock - AppenderSkeleton appender; + Appender appender; @Captor - 
ArgumentCaptor logCaptor; + ArgumentCaptor logCaptor; @Mock MetricRegistry registry; @@ -58,6 +63,12 @@ public class TestDatadogReporter { @Mock DatadogHttpClient client; + @AfterEach + void resetMocks() { + ((Logger) LogManager.getLogger(DatadogReporter.class)).removeAppender(appender); + reset(appender, registry, client); + } + @Test public void stopShouldCloseEnclosedClient() throws IOException { new DatadogReporter(registry, client, "foo", Option.empty(), Option.empty(), @@ -68,14 +79,18 @@ public void stopShouldCloseEnclosedClient() throws IOException { @Test public void stopShouldLogWhenEnclosedClientFailToClose() throws IOException { - Logger.getRootLogger().addAppender(appender); + when(appender.getName()).thenReturn("MockAppender-" + UUID.randomUUID()); + when(appender.isStarted()).thenReturn(true); + when(appender.isStopped()).thenReturn(false); + ((Logger) LogManager.getLogger(DatadogReporter.class)).addAppender(appender); + doThrow(IOException.class).when(client).close(); new DatadogReporter(registry, client, "foo", Option.empty(), Option.empty(), MetricFilter.ALL, TimeUnit.SECONDS, TimeUnit.SECONDS).stop(); - verify(appender).doAppend(logCaptor.capture()); - assertEquals("Error disconnecting from Datadog.", logCaptor.getValue().getRenderedMessage()); + verify(appender).append(logCaptor.capture()); + assertEquals("Error disconnecting from Datadog.", logCaptor.getValue().getMessage().getFormattedMessage()); assertEquals(Level.WARN, logCaptor.getValue().getLevel()); } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestHoodieMetricsDatadogConfig.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestHoodieMetricsDatadogConfig.java new file mode 100644 index 0000000000000..aa486e9b95245 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/datadog/TestHoodieMetricsDatadogConfig.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.metrics.datadog; + +import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.junit.jupiter.api.Test; + +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.assertIterableEquals; + +public class TestHoodieMetricsDatadogConfig { + + @Test + public void getDatadogMetricTagsShouldReturnEmptyListWhenNotSet() { + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").build(); + writeConfig.getProps().remove(HoodieMetricsDatadogConfig.METRIC_TAG_VALUES.key()); + assertIterableEquals(Collections.emptyList(), writeConfig.getDatadogMetricTags()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPrometheusReporter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPrometheusReporter.java index 6bbd49d485bf3..79b12716530d9 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPrometheusReporter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPrometheusReporter.java @@ -20,24 +20,37 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.metrics.HoodieMetrics; +import org.apache.hudi.metrics.Metrics; import org.apache.hudi.metrics.MetricsReporterType; + +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; -import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +@ExtendWith(MockitoExtension.class) public class TestPrometheusReporter { - HoodieWriteConfig config = mock(HoodieWriteConfig.class); + @Mock + HoodieWriteConfig config; + + @AfterEach + void shutdownMetrics() { + Metrics.shutdown(); + } @Test public void testRegisterGauge() { when(config.isMetricsOn()).thenReturn(true); + when(config.getTableName()).thenReturn("foo"); when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.PROMETHEUS); when(config.getPrometheusPort()).thenReturn(9090); assertDoesNotThrow(() -> { - new HoodieMetrics(config, "raw_table"); + new HoodieMetrics(config); }); } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPushGateWayReporter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPushGateWayReporter.java index 2b94226cf4075..dcbf72c39db6e 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPushGateWayReporter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/prometheus/TestPushGateWayReporter.java @@ -22,26 +22,47 @@ import org.apache.hudi.metrics.HoodieMetrics; import org.apache.hudi.metrics.Metrics; import org.apache.hudi.metrics.MetricsReporterType; + +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; import static org.apache.hudi.metrics.Metrics.registerGauge; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.mock; import static 
org.mockito.Mockito.when; +@ExtendWith(MockitoExtension.class) public class TestPushGateWayReporter { - HoodieWriteConfig config = mock(HoodieWriteConfig.class); + @Mock + HoodieWriteConfig config; + + @AfterEach + void shutdownMetrics() { + Metrics.shutdown(); + } @Test public void testRegisterGauge() { when(config.isMetricsOn()).thenReturn(true); + when(config.getTableName()).thenReturn("foo"); when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.PROMETHEUS_PUSHGATEWAY); when(config.getPushGatewayHost()).thenReturn("localhost"); when(config.getPushGatewayPort()).thenReturn(9091); - new HoodieMetrics(config, "raw_table"); + when(config.getPushGatewayReportPeriodSeconds()).thenReturn(30); + when(config.getPushGatewayDeleteOnShutdown()).thenReturn(true); + when(config.getPushGatewayJobName()).thenReturn("foo"); + when(config.getPushGatewayRandomJobNameSuffix()).thenReturn(false); + + assertDoesNotThrow(() -> { + new HoodieMetrics(config); + }); + registerGauge("pushGateWayReporter_metric", 123L); assertEquals("123", Metrics.getInstance().getRegistry().getGauges() .get("pushGateWayReporter_metric").getValue().toString()); } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestHilbertCurveUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestHilbertCurveUtils.java new file mode 100644 index 0000000000000..5bb482e6d67fe --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/optimize/TestHilbertCurveUtils.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.optimize; + +import org.davidmoten.hilbert.HilbertCurve; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestHilbertCurveUtils { + + private static final HilbertCurve INSTANCE = HilbertCurve.bits(5).dimensions(2); + + @Test + public void testIndex() { + long[] t = {1, 2}; + assertEquals(13, INSTANCE.index(t).intValue()); + long[] t1 = {0, 16}; + assertEquals(256, INSTANCE.index(t1).intValue()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestPartitionAwareClusteringPlanStrategy.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestPartitionAwareClusteringPlanStrategy.java new file mode 100644 index 0000000000000..440bc95615391 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestPartitionAwareClusteringPlanStrategy.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
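TestHilbertCurveUtils above checks the two-dimensional to one-dimensional mapping provided by the davidmoten hilbert-curve library (org.davidmoten.hilbert). A small usage sketch with the same API:

import org.davidmoten.hilbert.HilbertCurve;

public class HilbertSketch {
  public static void main(String[] args) {
    HilbertCurve curve = HilbertCurve.bits(5).dimensions(2); // 5 bits per dimension, 2 dimensions
    long[] p1 = {1, 2};
    long[] p2 = {0, 16};
    System.out.println(curve.index(p1)); // 13, as asserted above
    System.out.println(curve.index(p2)); // 256
  }
}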
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.cluster.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestPartitionAwareClusteringPlanStrategy { + + @Mock + HoodieTable table; + @Mock + HoodieEngineContext context; + HoodieWriteConfig hoodieWriteConfig; + + @BeforeEach + public void setUp() { + Properties props = new Properties(); + props.setProperty("hoodie.clustering.plan.strategy.partition.regex.pattern", "2021072.*"); + this.hoodieWriteConfig = HoodieWriteConfig + .newBuilder() + .withPath("dummy_Table_Path") + .withClusteringConfig(HoodieClusteringConfig + .newBuilder() + .fromProperties(props) + .build()) + .build(); + } + + @Test + public void testFilterPartitionPaths() { + PartitionAwareClusteringPlanStrategy strategyTestRegexPattern = new DummyPartitionAwareClusteringPlanStrategy(table, context, hoodieWriteConfig); + + ArrayList fakeTimeBasedPartitionsPath = new ArrayList<>(); + fakeTimeBasedPartitionsPath.add("20210718"); + fakeTimeBasedPartitionsPath.add("20210715"); + fakeTimeBasedPartitionsPath.add("20210723"); + fakeTimeBasedPartitionsPath.add("20210716"); + fakeTimeBasedPartitionsPath.add("20210719"); + fakeTimeBasedPartitionsPath.add("20210721"); + + List list = strategyTestRegexPattern.getMatchedPartitions(hoodieWriteConfig, fakeTimeBasedPartitionsPath); + assertEquals(2, list.size()); + assertTrue(list.contains("20210721")); + assertTrue(list.contains("20210723")); + } + + class DummyPartitionAwareClusteringPlanStrategy extends PartitionAwareClusteringPlanStrategy { + + public DummyPartitionAwareClusteringPlanStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + protected Stream buildClusteringGroupsForPartition(String partitionPath, List list) { + return null; + } + + @Override + protected Map getStrategyParams() { + return null; + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/cluster/strategy/TestClusteringPlanStrategyConfigCompatibility.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/cluster/strategy/TestClusteringPlanStrategyConfigCompatibility.java new file mode 100644 index 0000000000000..34626a897dad4 
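The clustering plan strategy test above sets hoodie.clustering.plan.strategy.partition.regex.pattern and expects only the matching partition paths to be kept. The filtering it pins down amounts to a full-string regex match over the partition list, roughly:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class PartitionRegexSketch {
  public static void main(String[] args) {
    List<String> partitions = Arrays.asList("20210718", "20210715", "20210723", "20210716", "20210719", "20210721");
    List<String> matched = partitions.stream()
        .filter(p -> p.matches("2021072.*")) // same pattern as the test's setUp
        .collect(Collectors.toList());
    System.out.println(matched); // [20210723, 20210721]
  }
}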
--- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/cluster/strategy/TestClusteringPlanStrategyConfigCompatibility.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.cluster.strategy; + +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; +import org.apache.hudi.table.action.cluster.strategy.ClusteringPlanStrategy; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.stream.Stream; + +public class TestClusteringPlanStrategyConfigCompatibility { + + private static Stream configParams() { + /** + * (user specified class, converted class, filter mode) + */ + Object[][] data = new Object[][] { + {"org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy", + "org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy", + ClusteringPlanPartitionFilterMode.RECENT_DAYS}, + {"org.apache.hudi.client.clustering.plan.strategy.SparkSelectedPartitionsClusteringPlanStrategy", + "org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy", + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS}, + {"org.apache.hudi.client.clustering.plan.strategy.JavaRecentDaysClusteringPlanStrategy", + "org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy", + ClusteringPlanPartitionFilterMode.RECENT_DAYS} + }; + return Stream.of(data).map(Arguments::of); + } + + @ParameterizedTest() + @MethodSource("configParams") + public void testCheckAndGetClusteringPlanStrategy(String oldClass, String newClass, ClusteringPlanPartitionFilterMode mode) { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withPath("") + .withClusteringConfig(HoodieClusteringConfig.newBuilder() + .withClusteringPlanStrategyClass(oldClass) + .build()) + .build(); + + Assertions.assertEquals(newClass, ClusteringPlanStrategy.checkAndGetClusteringPlanStrategy(config)); + Assertions.assertEquals(mode, config.getClusteringPlanPartitionFilterMode()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersFactory.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersFactory.java new file mode 100644 index 0000000000000..21c0e8108a531 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersFactory.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.mockito.Mockito; + +import java.io.IOException; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.any; + +public class TestWriteMarkersFactory extends HoodieCommonTestHarness { + private static final String NON_HDFS_BASE_PATH = "/tmp/dir"; + private static final String HDFS_BASE_PATH = "hdfs://localhost/dir"; + private final HoodieWriteConfig writeConfig = Mockito.mock(HoodieWriteConfig.class); + private final HoodieTableMetaClient metaClient = Mockito.mock(HoodieTableMetaClient.class); + private final HoodieWrapperFileSystem fileSystem = Mockito.mock(HoodieWrapperFileSystem.class); + private final HoodieEngineContext context = Mockito.mock(HoodieEngineContext.class); + private final HoodieTable table = Mockito.mock(HoodieTable.class); + + @BeforeEach + public void init() throws IOException { + initMetaClient(); + } + + public static Stream configParams() { + Object[][] data = new Object[][] { + {NON_HDFS_BASE_PATH, true}, {HDFS_BASE_PATH, false}, + {NON_HDFS_BASE_PATH, true}, {HDFS_BASE_PATH, false}, + }; + return Stream.of(data).map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("configParams") + public void testDirectMarkers(String basePath, boolean isTimelineServerEnabled) { + testWriteMarkersFactory( + MarkerType.DIRECT, basePath, isTimelineServerEnabled, DirectWriteMarkers.class); + } + + @Test + public void testTimelineServerBasedMarkersWithTimelineServerEnabled() { + testWriteMarkersFactory( + MarkerType.TIMELINE_SERVER_BASED, NON_HDFS_BASE_PATH, true, + TimelineServerBasedWriteMarkers.class); + } + + @Test + public void testTimelineServerBasedMarkersWithTimelineServerDisabled() { + // Fallback to direct markers should happen + testWriteMarkersFactory( + MarkerType.TIMELINE_SERVER_BASED, NON_HDFS_BASE_PATH, false, + DirectWriteMarkers.class); + } + + 
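Taken together, the write-marker factory tests in this file describe a simple resolution rule: timeline-server-based markers are used only when that marker type is configured, the embedded timeline server is enabled, and the base path is not on HDFS; every other combination falls back to direct markers. An illustrative restatement of that rule (not the actual WriteMarkersFactory code):

public class MarkerResolutionSketch {
  // Illustrative only: String stand-ins for the marker type and for the
  // WriteMarkers implementation the factory is expected to return.
  static String resolveMarkers(String markerType, String basePath, boolean timelineServerEnabled) {
    boolean onHdfs = basePath.startsWith("hdfs://");
    if ("TIMELINE_SERVER_BASED".equals(markerType) && timelineServerEnabled && !onHdfs) {
      return "TimelineServerBasedWriteMarkers";
    }
    return "DirectWriteMarkers";
  }

  public static void main(String[] args) {
    System.out.println(resolveMarkers("TIMELINE_SERVER_BASED", "/tmp/dir", true));             // TimelineServerBasedWriteMarkers
    System.out.println(resolveMarkers("TIMELINE_SERVER_BASED", "hdfs://localhost/dir", true)); // DirectWriteMarkers
    System.out.println(resolveMarkers("DIRECT", "/tmp/dir", true));                            // DirectWriteMarkers
  }
}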
@Test + public void testTimelineServerBasedMarkersWithHDFS() { + // Fallback to direct markers should happen + testWriteMarkersFactory( + MarkerType.TIMELINE_SERVER_BASED, HDFS_BASE_PATH, true, + DirectWriteMarkers.class); + } + + private void testWriteMarkersFactory( + MarkerType markerTypeConfig, String basePath, boolean isTimelineServerEnabled, + Class expectedWriteMarkersClass) { + String instantTime = "001"; + Mockito.when(table.getConfig()).thenReturn(writeConfig); + Mockito.when(writeConfig.isEmbeddedTimelineServerEnabled()) + .thenReturn(isTimelineServerEnabled); + Mockito.when(table.getMetaClient()).thenReturn(metaClient); + Mockito.when(metaClient.getFs()).thenReturn(fileSystem); + Mockito.when(metaClient.getBasePath()).thenReturn(basePath); + Mockito.when(metaClient.getMarkerFolderPath(any())).thenReturn(basePath + ".hoodie/.temp"); + Mockito.when(table.getContext()).thenReturn(context); + Mockito.when(context.getHadoopConf()).thenReturn(new SerializableConfiguration(new Configuration())); + Mockito.when(writeConfig.getViewStorageConfig()) + .thenReturn(FileSystemViewStorageConfig.newBuilder().build()); + assertEquals(expectedWriteMarkersClass, + WriteMarkersFactory.get(markerTypeConfig, table, instantTime).getClass()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/repair/TestRepairUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/repair/TestRepairUtils.java new file mode 100644 index 0000000000000..4f8fb1dba339b --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/repair/TestRepairUtils.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.repair; + +import org.apache.hudi.HoodieTestCommitGenerator; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; + +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; + +import static org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename; +import static org.apache.hudi.HoodieTestCommitGenerator.getLogFilename; +import static org.apache.hudi.HoodieTestCommitGenerator.initCommitInfoForRepairTests; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestRepairUtils { + // Instant time -> List> + private static final Map>> BASE_FILE_INFO = new HashMap<>(); + private static final Map>> LOG_FILE_INFO = new HashMap<>(); + // instant time -> partitionPathToFileIdAndPathMap + private final Map>>> instantInfoMap = new HashMap<>(); + @TempDir + public static java.nio.file.Path tempDir; + private static String basePath; + private static HoodieTableMetaClient metaClient; + + @BeforeAll + static void initFileInfo() throws IOException { + initCommitInfoForRepairTests(BASE_FILE_INFO, LOG_FILE_INFO); + metaClient = + HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), HoodieTableType.COPY_ON_WRITE); + basePath = metaClient.getBasePath(); + } + + public void setupTimelineInFS() throws IOException { + HoodieTestCommitGenerator.setupTimelineInFS( + basePath, BASE_FILE_INFO, LOG_FILE_INFO, instantInfoMap); + } + + @Test + public void testTagInstantsOfBaseAndLogFiles() { + Map> expectedResult = new HashMap<>(); + List inputPathList = new ArrayList<>(); + + for (Map.Entry>> entry : BASE_FILE_INFO.entrySet()) { + String instantTime = entry.getKey(); + List fileNameList = entry.getValue().stream() + .map(e -> { + String partitionPath = e.getKey(); + String fileId = e.getValue(); + return new Path( + new Path(partitionPath), getBaseFilename(instantTime, fileId)).toString(); + }) + .collect(Collectors.toList()); + List expectedList = expectedResult.computeIfAbsent( + instantTime, k -> new ArrayList<>()); + expectedList.addAll(fileNameList); + inputPathList.addAll(fileNameList.stream() + .map(path -> new Path(basePath, path)).collect(Collectors.toList())); + } + + for (Map.Entry>> entry : LOG_FILE_INFO.entrySet()) { + String instantTime = entry.getKey(); + List fileNameList = entry.getValue().stream() + .map(e -> { + String partitionPath = e.getKey(); + String fileId = e.getValue(); + return new Path( + new Path(partitionPath), getLogFilename(instantTime, fileId)).toString(); + }) + .collect(Collectors.toList()); + List expectedList = expectedResult.computeIfAbsent( + instantTime, k -> new ArrayList<>()); + expectedList.addAll(fileNameList); + inputPathList.addAll(fileNameList.stream() + .map(path -> new Path(basePath, 
path)).collect(Collectors.toList())); + } + + assertEquals(expectedResult, + RepairUtils.tagInstantsOfBaseAndLogFiles(basePath, inputPathList)); + } + + @Test + public void testGetBaseAndLogFilePathsFromTimeline() throws IOException { + setupTimelineInFS(); + HoodieTimeline timeline = metaClient.getActiveTimeline(); + HoodieInstant commitInstant = new HoodieInstant( + HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "001"); + HoodieInstant inflightInstant = new HoodieInstant( + HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "005"); + HoodieInstant compactionInstant = new HoodieInstant( + HoodieInstant.State.COMPLETED, HoodieTimeline.COMPACTION_ACTION, "006"); + + Map>> partitionToFileIdAndNameMap = + instantInfoMap.get(commitInstant.getTimestamp()); + Set expectedPaths = partitionToFileIdAndNameMap.entrySet().stream() + .flatMap(entry -> + entry.getValue().stream() + .map(fileInfo -> new Path(entry.getKey(), fileInfo.getValue()).toString()) + .collect(Collectors.toList()) + .stream() + ).collect(Collectors.toSet()); + assertEquals(Option.of(expectedPaths), + RepairUtils.getBaseAndLogFilePathsFromTimeline(timeline, commitInstant)); + assertThrows(HoodieException.class, + () -> RepairUtils.getBaseAndLogFilePathsFromTimeline(timeline, inflightInstant)); + assertEquals(Option.empty(), + RepairUtils.getBaseAndLogFilePathsFromTimeline(timeline, compactionInstant)); + } + + @Test + public void testFindInstantFilesToRemove() throws IOException { + setupTimelineInFS(); + HoodieInstant existingInstant = new HoodieInstant( + HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "001"); + Map>> partitionToFileIdAndNameMap = + instantInfoMap.get(existingInstant.getTimestamp()); + List fileListFromFs = partitionToFileIdAndNameMap.entrySet().stream() + .flatMap(entry -> + entry.getValue().stream() + .map(fileInfo -> new Path(entry.getKey(), fileInfo.getValue()).toString()) + .collect(Collectors.toList()) + .stream() + ).collect(Collectors.toList()); + String danglingFilePath = new Path("2022/01/02", + getBaseFilename(existingInstant.getTimestamp(), UUID.randomUUID().toString())).toString(); + fileListFromFs.add(danglingFilePath); + // Existing instant + assertEquals(CollectionUtils.createImmutableList(danglingFilePath), + RepairUtils.findInstantFilesToRemove( + existingInstant.getTimestamp(), fileListFromFs, + metaClient.getActiveTimeline(), metaClient.getArchivedTimeline())); + // Non-existing instant + assertEquals(fileListFromFs, + RepairUtils.findInstantFilesToRemove( + "004", fileListFromFs, + metaClient.getActiveTimeline(), metaClient.getArchivedTimeline())); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestTwoToThreeUpgradeHandler.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestTwoToThreeUpgradeHandler.java new file mode 100644 index 0000000000000..d6339a9782e1e --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/upgrade/TestTwoToThreeUpgradeHandler.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
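[Editor's note - illustrative sketch, not part of the patch] The repair flow exercised by these tests boils down to comparing the base and log files present on storage for an instant against the files the timeline says that instant actually wrote. A hedged sketch of that call pattern, reusing only the RepairUtils methods already shown above (findDanglingFiles and its arguments are inventions of the sketch):

    // Hypothetical caller: returns the partition-relative file paths that are not
    // tracked by the given instant on the active or archived timeline.
    static List<String> findDanglingFiles(String instantTime,
                                          List<String> filesOnStorage,
                                          HoodieTableMetaClient metaClient) {
      return RepairUtils.findInstantFilesToRemove(
          instantTime,
          filesOnStorage,
          metaClient.getActiveTimeline(),
          metaClient.getArchivedTimeline());
    }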
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.KeyGenerator; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TestTwoToThreeUpgradeHandler { + + HoodieWriteConfig config; + + @BeforeEach + void setUp() { + config = HoodieWriteConfig.newBuilder() + .forTable("foo") + .withPath("/foo") + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .build(); + } + + @ParameterizedTest + @ValueSource(strings = {"hoodie.table.keygenerator.class", "hoodie.datasource.write.keygenerator.class"}) + void upgradeHandlerShouldRetrieveKeyGeneratorConfig(String keyGenConfigKey) { + config.setValue(keyGenConfigKey, KeyGenerator.class.getName()); + TwoToThreeUpgradeHandler handler = new TwoToThreeUpgradeHandler(); + Map kv = handler.upgrade(config, null, null, null); + assertEquals(KeyGenerator.class.getName(), kv.get(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)); + } + + @ParameterizedTest + @EnumSource(EngineType.class) + void upgradeHandlerWhenKeyGeneratorNotSet(EngineType engineType) { + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() + .withEngineType(engineType) + .forTable("foo") + .withPath("/foo") + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .build(); + TwoToThreeUpgradeHandler handler = new TwoToThreeUpgradeHandler(); + if (engineType == EngineType.SPARK) { + Map kv = handler.upgrade(config, null, null, null); + assertEquals(TwoToThreeUpgradeHandler.SPARK_SIMPLE_KEY_GENERATOR, + kv.get(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)); + } else { + Throwable t = assertThrows(IllegalStateException.class, () -> handler + .upgrade(writeConfig, null, null, null)); + assertTrue(t.getMessage().startsWith("Missing config:")); + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/Assertions.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/Assertions.java new file mode 100644 index 0000000000000..bb2ba84f8c9bd --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/Assertions.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
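[Editor's note - illustrative sketch, not part of the patch] The parameterized tests above pin down the key-generator handling during the table-version two-to-three upgrade: an explicitly configured key generator class is carried into the table config, Spark falls back to its simple key generator, and any other engine must fail fast with a "Missing config:" error. A restatement of that behaviour as a hypothetical helper (expectedKeyGenerator is invented; the constants are the ones referenced by the tests):

    // Invented helper summarizing what the upgrade handler is expected to emit; not Hudi API.
    static String expectedKeyGenerator(HoodieWriteConfig cfg, EngineType engineType) {
      String configured = cfg.getProps().getProperty("hoodie.table.keygenerator.class");
      if (configured == null) {
        configured = cfg.getProps().getProperty("hoodie.datasource.write.keygenerator.class");
      }
      if (configured != null) {
        return configured;                                           // carried over as-is
      }
      if (engineType == EngineType.SPARK) {
        return TwoToThreeUpgradeHandler.SPARK_SIMPLE_KEY_GENERATOR;   // Spark-only default
      }
      throw new IllegalStateException("Missing config: key generator class");
    }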
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.testutils.CheckedFunction; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +/** + * Commonly used assertion functions. + */ +public class Assertions { + + /** + * Assert no failures in writing hoodie files. + */ + public static void assertNoWriteErrors(List statuses) { + assertAll(statuses.stream().map(status -> () -> + assertFalse(status.hasErrors(), "Errors found in write of " + status.getFileId()))); + } + + /** + * Assert each file size equal to its source of truth. + * + * @param fileSizeGetter to retrieve the source of truth of file size. + */ + public static void assertFileSizesEqual(List statuses, CheckedFunction fileSizeGetter) { + assertAll(statuses.stream().map(status -> () -> + assertEquals(fileSizeGetter.apply(status), status.getStat().getFileSizeInBytes()))); + } + +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java new file mode 100644 index 0000000000000..8e7df833cc5df --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
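[Editor's note - illustrative sketch, not part of the patch] The two helpers above are meant to run right after a write call; a minimal usage fragment, where writeClient, records, instantTime, fs and basePath are all assumptions supplied by the surrounding test:

    // Hypothetical test fragment exercising the new assertion helpers.
    List<WriteStatus> statuses = writeClient.upsert(records, instantTime);
    Assertions.assertNoWriteErrors(statuses);
    // Compare each reported file size against the actual length on storage.
    Assertions.assertFileSizesEqual(statuses,
        status -> fs.getFileStatus(
            new org.apache.hadoop.fs.Path(basePath, status.getStat().getPath())).getLen());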
+ */ + +package org.apache.hudi.testutils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.io.storage.HoodieAvroParquetWriter; +import org.apache.hudi.io.storage.HoodieOrcConfig; +import org.apache.hudi.io.storage.HoodieOrcWriter; +import org.apache.hudi.io.storage.HoodieParquetConfig; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.orc.CompressionKind; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +import java.io.IOException; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.FileCreateUtils.baseFileName; + +public class HoodieWriteableTestTable extends HoodieMetadataTestTable { + private static final Logger LOG = LogManager.getLogger(HoodieWriteableTestTable.class); + + protected final Schema schema; + protected final BloomFilter filter; + protected final boolean populateMetaFields; + + protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, + Schema schema, BloomFilter filter) { + this(basePath, fs, metaClient, schema, filter, null); + } + + protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, + BloomFilter filter, HoodieTableMetadataWriter metadataWriter) { + super(basePath, fs, metaClient, metadataWriter); + this.schema = schema; + this.filter = filter; + this.populateMetaFields = metaClient.getTableConfig().populateMetaFields(); + } + + @Override + public HoodieWriteableTestTable addCommit(String instantTime) throws Exception { + return (HoodieWriteableTestTable) super.addCommit(instantTime); + } + + @Override + public HoodieWriteableTestTable forCommit(String instantTime) { + return (HoodieWriteableTestTable) super.forCommit(instantTime); + } + + public Path withInserts(String partition, String fileId, List records, TaskContextSupplier contextSupplier) throws Exception { + FileCreateUtils.createPartitionMetaFile(basePath, partition); + String fileName = baseFileName(currentInstantTime, 
fileId); + + Path baseFilePath = new Path(Paths.get(basePath, partition, fileName).toString()); + if (this.fs.exists(baseFilePath)) { + LOG.warn("Deleting the existing base file " + baseFilePath); + this.fs.delete(baseFilePath, true); + } + + if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.PARQUET)) { + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( + new AvroSchemaConverter().convert(schema), schema, Option.of(filter)); + HoodieParquetConfig config = new HoodieParquetConfig<>(writeSupport, CompressionCodecName.GZIP, + ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, + new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue())); + try (HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter<>( + new Path(Paths.get(basePath, partition, fileName).toString()), config, currentInstantTime, + contextSupplier, populateMetaFields)) { + int seqId = 1; + for (HoodieRecord record : records) { + GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get(); + if (populateMetaFields) { + HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++)); + HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName); + writer.writeAvro(record.getRecordKey(), avroRecord); + filter.add(record.getRecordKey()); + } else { + writer.writeAvro(record.getRecordKey(), avroRecord); + } + } + } + } else if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.ORC)) { + Configuration conf = new Configuration(); + int orcStripSize = Integer.parseInt(HoodieStorageConfig.ORC_STRIPE_SIZE.defaultValue()); + int orcBlockSize = Integer.parseInt(HoodieStorageConfig.ORC_BLOCK_SIZE.defaultValue()); + int maxFileSize = Integer.parseInt(HoodieStorageConfig.ORC_FILE_MAX_SIZE.defaultValue()); + HoodieOrcConfig config = new HoodieOrcConfig(conf, CompressionKind.ZLIB, orcStripSize, orcBlockSize, maxFileSize, filter); + try (HoodieOrcWriter writer = new HoodieOrcWriter( + currentInstantTime, + new Path(Paths.get(basePath, partition, fileName).toString()), + config, schema, contextSupplier)) { + int seqId = 1; + for (HoodieRecord record : records) { + GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get(); + HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++)); + HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName); + writer.writeAvro(record.getRecordKey(), avroRecord); + filter.add(record.getRecordKey()); + } + } + } + + return baseFilePath; + } + + public Map> withLogAppends(String partition, String fileId, List records) throws Exception { + Map> partitionToLogfilesMap = new HashMap<>(); + final Pair appendedLogFile = appendRecordsToLogFile(partition, fileId, records); + partitionToLogfilesMap.computeIfAbsent(appendedLogFile.getKey(), k -> new ArrayList<>()).add(appendedLogFile.getValue()); + return partitionToLogfilesMap; + } + + private Pair appendRecordsToLogFile(String partitionPath, String fileId, List records) throws Exception { + try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath)) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId) + .overBaseCommit(currentInstantTime).withFs(fs).build()) 
{ + Map header = new HashMap<>(); + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, currentInstantTime); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); + logWriter.appendBlock(new HoodieAvroDataBlock(records.stream().map(r -> { + try { + GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get(); + HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), ""); + return (IndexedRecord) val; + } catch (IOException e) { + LOG.warn("Failed to convert record " + r.toString(), e); + return null; + } + }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD)); + return Pair.of(partitionPath, logWriter.getLogFile()); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/MetadataMergeWriteStatus.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/MetadataMergeWriteStatus.java similarity index 100% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/MetadataMergeWriteStatus.java rename to hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/MetadataMergeWriteStatus.java diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieEngineContextProvider.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieEngineContextProvider.java index e876c08bc53f1..3faa4b2c32b2f 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieEngineContextProvider.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieEngineContextProvider.java @@ -19,7 +19,7 @@ package org.apache.hudi.testutils.providers; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; public interface HoodieEngineContextProvider { HoodieEngineContext context(); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java index 9bc559deb5ba4..f67e158c8395d 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java @@ -19,12 +19,12 @@ package org.apache.hudi.testutils.providers; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.config.HoodieWriteConfig; import java.io.IOException; public interface HoodieWriteClientProvider { - AbstractHoodieWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) throws IOException; + BaseHoodieWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) throws IOException; } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestConcatenatingIterator.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestConcatenatingIterator.java new file mode 100644 index 0000000000000..fc591edd761e5 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestConcatenatingIterator.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
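[Editor's note - illustrative sketch, not part of the patch] HoodieWriteableTestTable is typically driven as in the hedged sketch below: lay out a base file for a commit, then append a delta log for the same file group. The helper name layoutFileGroup, the partition value and the record lists are inventions of the sketch, and the generic parameters on the return type are inferred:

    // Hypothetical helper; the concrete test table comes from an engine-specific harness.
    static void layoutFileGroup(HoodieWriteableTestTable testTable,
                                List<HoodieRecord> inserts, List<HoodieRecord> updates,
                                TaskContextSupplier contextSupplier) throws Exception {
      testTable.addCommit("0001");
      Path baseFile = testTable.withInserts("2022/01/01", "file-1", inserts, contextSupplier); // parquet/orc base file
      Map<String, List<HoodieLogFile>> logFiles =
          testTable.withLogAppends("2022/01/01", "file-1", updates);                           // avro data block in a delta log
    }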
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.client.utils.ConcatenatingIterator; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.fail; + +public class TestConcatenatingIterator { + + // Simple test for iterator concatenation + @Test + public void testConcatBasic() { + Iterator i1 = Arrays.asList(5, 3, 2, 1).iterator(); + Iterator i2 = Collections.emptyIterator(); // empty iterator + Iterator i3 = Collections.singletonList(3).iterator(); + + ConcatenatingIterator ci = new ConcatenatingIterator<>(Arrays.asList(i1, i2, i3)); + List allElements = new ArrayList<>(); + while (ci.hasNext()) { + allElements.add(ci.next()); + } + + assertEquals(5, allElements.size()); + assertEquals(Arrays.asList(5, 3, 2, 1, 3), allElements); + } + + @Test + public void testConcatError() { + Iterator i1 = Collections.emptyIterator(); // empty iterator + + ConcatenatingIterator ci = new ConcatenatingIterator<>(Collections.singletonList(i1)); + assertFalse(ci.hasNext()); + try { + ci.next(); + fail("expected error for empty iterator"); + } catch (IllegalArgumentException e) { + // + } + } +} \ No newline at end of file diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java new file mode 100644 index 0000000000000..1bbe10db0f557 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
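[Editor's note - illustrative sketch, not part of the patch] ConcatenatingIterator simply walks the supplied iterators in order, which is exactly what the tests above verify; a minimal usage:

    // Flattens several per-source iterators into one stream of values.
    ConcatenatingIterator<Integer> all = new ConcatenatingIterator<>(Arrays.asList(
        Arrays.asList(1, 2).iterator(),
        Collections.<Integer>emptyIterator(),
        Collections.singletonList(3).iterator()));
    while (all.hasNext()) {
      System.out.println(all.next()); // prints 1, 2, 3
    }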
+ */ + +package org.apache.hudi.utils; + +import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import org.apache.hudi.avro.model.HoodieActionInstant; +import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.avro.model.HoodieSliceInfo; +import org.apache.hudi.client.utils.MetadataConversionUtils; +import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstant.State; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestMetadataConversionUtils extends HoodieCommonTestHarness { + + @BeforeEach + public void init() throws IOException { + initMetaClient(); + } + + @Test + public void testCompletedClean() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createCleanMetadata(newCommitTime); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.COMPLETED, HoodieTimeline.CLEAN_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.COMPLETED.toString()); + assertEquals(metaEntry.getHoodieCleanMetadata().getStartCleanTime(), newCommitTime); + } + + @Test + public void testCompletedReplace() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createReplace(newCommitTime, WriteOperationType.INSERT_OVERWRITE, true); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.COMPLETED, HoodieTimeline.REPLACE_COMMIT_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.COMPLETED.toString()); + assertEquals(metaEntry.getHoodieReplaceCommitMetadata().getOperationType(), WriteOperationType.INSERT_OVERWRITE.toString()); + } + + @Test + public void testEmptyRequestedReplace() throws 
Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createReplace(newCommitTime, WriteOperationType.INSERT_OVERWRITE_TABLE, false); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.REQUESTED.toString()); + assertNull(metaEntry.getHoodieRequestedReplaceMetadata()); + } + + @Test + public void testEmptyInflightReplace() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createReplace(newCommitTime, WriteOperationType.INSERT_OVERWRITE_TABLE, true); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.INFLIGHT, HoodieTimeline.REPLACE_COMMIT_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.INFLIGHT.toString()); + assertNull(metaEntry.getHoodieInflightReplaceMetadata()); + } + + @Test + public void testNonEmptyInflightReplace() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createReplace(newCommitTime, WriteOperationType.INSERT_OVERWRITE_TABLE, false); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.INFLIGHT, HoodieTimeline.REPLACE_COMMIT_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.INFLIGHT.toString()); + assertEquals(metaEntry.getHoodieInflightReplaceMetadata().getOperationType(), WriteOperationType.INSERT_OVERWRITE_TABLE.name()); + } + + @Test + public void testCompletedCommit() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createCommitMetadata(newCommitTime); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.COMPLETED.toString()); + assertEquals(metaEntry.getHoodieCommitMetadata().getOperationType(), WriteOperationType.INSERT.toString()); + } + + @Test + public void testCompletedDeltaCommit() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createDeltaCommitMetadata(newCommitTime); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.COMPLETED.toString()); + assertEquals(metaEntry.getActionType(), HoodieTimeline.DELTA_COMMIT_ACTION); + } + + @Test + public void testCompletedRollback() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createRollbackMetadata(newCommitTime); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.COMPLETED, HoodieTimeline.ROLLBACK_ACTION, newCommitTime), metaClient); + assertEquals(metaEntry.getActionState(), State.COMPLETED.toString()); + assertEquals(metaEntry.getHoodieRollbackMetadata().getStartRollbackTime(), newCommitTime); + } + + @Test + public void testCompletedCompaction() throws Exception { + String newCommitTime = HoodieTestTable.makeNewCommitTime(); + createCompactionMetadata(newCommitTime); + HoodieArchivedMetaEntry metaEntry = MetadataConversionUtils.createMetaWrapper( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, newCommitTime), metaClient); + 
assertEquals(metaEntry.getActionState(), State.COMPLETED.toString()); + assertEquals(metaEntry.getHoodieCommitMetadata().getOperationType(), WriteOperationType.COMPACT.toString()); + } + + @Test + public void testConvertCommitMetadata() { + HoodieCommitMetadata hoodieCommitMetadata = new HoodieCommitMetadata(); + hoodieCommitMetadata.setOperationType(WriteOperationType.INSERT); + org.apache.hudi.avro.model.HoodieCommitMetadata expectedCommitMetadata = MetadataConversionUtils + .convertCommitMetadata(hoodieCommitMetadata); + assertEquals(expectedCommitMetadata.getOperationType(), WriteOperationType.INSERT.toString()); + } + + private void createCompactionMetadata(String instantTime) throws Exception { + String fileId1 = "file-" + instantTime + "-1"; + String fileId2 = "file-" + instantTime + "-2"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + commitMetadata.setOperationType(WriteOperationType.COMPACT); + commitMetadata.setCompacted(true); + HoodieTestTable.of(metaClient) + .addCommit(instantTime, Option.of(commitMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createRollbackMetadata(String instantTime) throws Exception { + HoodieRollbackMetadata rollbackMetadata = new HoodieRollbackMetadata(); + rollbackMetadata.setCommitsRollback(Arrays.asList(instantTime)); + rollbackMetadata.setStartRollbackTime(instantTime); + HoodieRollbackPartitionMetadata rollbackPartitionMetadata = new HoodieRollbackPartitionMetadata(); + rollbackPartitionMetadata.setPartitionPath("p1"); + rollbackPartitionMetadata.setSuccessDeleteFiles(Arrays.asList("f1")); + rollbackPartitionMetadata.setFailedDeleteFiles(new ArrayList<>()); + rollbackPartitionMetadata.setRollbackLogFiles(new HashMap<>()); + Map partitionMetadataMap = new HashMap<>(); + partitionMetadataMap.put("p1", rollbackPartitionMetadata); + rollbackMetadata.setPartitionMetadata(partitionMetadataMap); + rollbackMetadata.setInstantsRollback(Arrays.asList(new HoodieInstantInfo("1", HoodieTimeline.COMMIT_ACTION))); + HoodieTestTable.of(metaClient) + .addRollback(instantTime, rollbackMetadata); + } + + private void createCommitMetadata(String instantTime) throws Exception { + String fileId1 = "file-" + instantTime + "-1"; + String fileId2 = "file-" + instantTime + "-2"; + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + commitMetadata.addMetadata("test", "test"); + commitMetadata.setOperationType(WriteOperationType.INSERT); + HoodieTestTable.of(metaClient) + .addCommit(instantTime, Option.of(commitMetadata)) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createDeltaCommitMetadata(String instantTime) throws Exception { + String fileId1 = "file-" + instantTime + "-1"; + String fileId2 = "file-" + instantTime + "-2"; + HoodieTestTable.of(metaClient) + .addDeltaCommit(instantTime) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createReplace(String instantTime, WriteOperationType writeOperationType, Boolean isClustering) + throws Exception { + String fileId1 = "file-1"; + String fileId2 = "file-2"; + + // create replace instant to mark fileId1 as deleted + HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); + Map> partitionFileIds = new HashMap<>(); + 
partitionFileIds.put(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, Arrays.asList(fileId2)); + replaceMetadata.setPartitionToReplaceFileIds(partitionFileIds); + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId("file-1"); + replaceMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat); + replaceMetadata.setOperationType(writeOperationType); + // some cases requestedReplaceMetadata will be null + // e.g. insert_overwrite_table or insert_overwrite without clustering + HoodieRequestedReplaceMetadata requestedReplaceMetadata = null; + HoodieCommitMetadata inflightReplaceMetadata = null; + if (isClustering) { + requestedReplaceMetadata = new HoodieRequestedReplaceMetadata(); + requestedReplaceMetadata.setOperationType(writeOperationType.name()); + HoodieClusteringPlan clusteringPlan = new HoodieClusteringPlan(); + HoodieClusteringGroup clusteringGroup = new HoodieClusteringGroup(); + HoodieSliceInfo sliceInfo = new HoodieSliceInfo(); + clusteringGroup.setSlices(Arrays.asList(sliceInfo)); + clusteringPlan.setInputGroups(Arrays.asList(clusteringGroup)); + requestedReplaceMetadata.setClusteringPlan(clusteringPlan); + requestedReplaceMetadata.setVersion(TimelineLayoutVersion.CURR_VERSION); + } else { + // inflightReplaceMetadata will be null in clustering but not null + // in insert_overwrite or insert_overwrite_table + inflightReplaceMetadata = new HoodieCommitMetadata(); + inflightReplaceMetadata.setOperationType(writeOperationType); + inflightReplaceMetadata.setCompacted(false); + } + HoodieTestTable.of(metaClient) + .addReplaceCommit(instantTime, Option.ofNullable(requestedReplaceMetadata), Option.ofNullable(inflightReplaceMetadata), replaceMetadata) + .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); + } + + private void createCleanMetadata(String instantTime) throws IOException { + HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), + "", "", new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>(), new ArrayList<>()); + HoodieCleanStat cleanStats = new HoodieCleanStat( + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, + HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)], + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList(), + instantTime, + ""); + HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats)); + HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata); + } +} diff --git a/hudi-client/hudi-client-common/src/test/resources/exampleSchemaWithMetaFields.avsc b/hudi-client/hudi-client-common/src/test/resources/exampleSchemaWithMetaFields.avsc new file mode 100644 index 0000000000000..c3fa82207a0cf --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/resources/exampleSchemaWithMetaFields.avsc @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
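[Editor's note - illustrative sketch, not part of the patch] The conversions tested above are what Hudi's archival path relies on to wrap a completed instant's metadata into a single Avro record; a hedged sketch of the call pattern, reusing only the methods exercised by the tests:

    // Wrap one completed commit from the timeline into an archived meta entry.
    HoodieInstant instant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "0001");
    HoodieArchivedMetaEntry entry = MetadataConversionUtils.createMetaWrapper(instant, metaClient);
    // entry.getActionType(), entry.getActionState() and the action-specific payload
    // (e.g. entry.getHoodieCommitMetadata()) are what ends up in the archived timeline.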
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "namespace": "example.schema", + "type": "record", + "name": "trip", + "fields": [ + { + "name": "_hoodie_commit_time", + "type": ["null","string"], + "default":null + }, + { + "name": "_hoodie_commit_seqno", + "type": ["null","string"], + "default":null + }, + { + "name": "_hoodie_record_key", + "type": ["null","string"], + "default":null + }, + { + "name": "_hoodie_partition_path", + "type": ["null","string"], + "default":null + }, + { + "name": "_hoodie_file_name", + "type": ["null","string"], + "default":null + }, + { + "name": "_hoodie_operation", + "type": ["null","string"], + "default":null + }, + { + "name": "_row_key", + "type": "string" + }, + { + "name": "time", + "type": "string" + }, + { + "name": "number", + "type": ["int", "null"] + } + ] +} diff --git a/hudi-client/hudi-client-common/src/test/resources/exampleSchemaWithUDT.avsc b/hudi-client/hudi-client-common/src/test/resources/exampleSchemaWithUDT.avsc new file mode 100644 index 0000000000000..4c40fb23a03f5 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/resources/exampleSchemaWithUDT.avsc @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
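[Editor's note - illustrative sketch, not part of the patch] exampleSchemaWithMetaFields.avsc is the example trip schema with the Hudi meta columns prepended, so tests can load it straight from the classpath; a small hedged check that the resource parses and carries the expected meta fields (the enclosing test method and assertions are assumptions of the sketch):

    // Parse the schema resource added above and verify the Hudi meta columns exist.
    try (InputStream in = getClass().getResourceAsStream("/exampleSchemaWithMetaFields.avsc")) {
      Schema schema = new Schema.Parser().parse(in);
      assertNotNull(schema.getField("_hoodie_commit_time"));
      assertNotNull(schema.getField("_hoodie_operation"));
    }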
+ */ +{ + "namespace": "example.schema", + "type": "record", + "name": "trip", + "fields": [ + { + "name": "_row_key", + "type": "string" + }, + { + "name": "time", + "type": "string" + }, + { + "name": "number", + "type": ["null", "int"] + }, + { + "name": "driver", + "type": [ + "null", + { + "name": "person", + "type": "record", + "fields": [ + { + "default": null, + "name": "driver_name", + "type": ["null", "string"] + }, + { + "name": "list", + "type": { + "type": "array", + "items": "int" + } + }, + { + "name": "map", + "type": { + "type": "map", + "values": "string" + } + } + ] + } + ] + } + ] +} diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_bootstrap_index_partitions.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_bootstrap_index_partitions.hfile new file mode 100644 index 0000000000000..91e9c7656c39e Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_bootstrap_index_partitions.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_complex.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_complex.hfile new file mode 100644 index 0000000000000..8ce3d0d0b1f6c Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_complex.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_simple.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_simple.hfile new file mode 100644 index 0000000000000..abe0b336eb3c2 Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_simple.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_bootstrap_index_partitions.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_bootstrap_index_partitions.hfile new file mode 100644 index 0000000000000..7f6c5bd353d22 Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_bootstrap_index_partitions.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_complex.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_complex.hfile new file mode 100644 index 0000000000000..f5293c5a249da Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_complex.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_simple.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_simple.hfile new file mode 100644 index 0000000000000..2b570920f1456 Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_simple.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_bootstrap_index_partitions.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_bootstrap_index_partitions.hfile new file mode 100644 index 0000000000000..290af9918e5dc Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_bootstrap_index_partitions.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_complex.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_complex.hfile new file mode 100644 index 
0000000000000..5a16f0ea68989 Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_complex.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_simple.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_simple.hfile new file mode 100644 index 0000000000000..e52d3c556e555 Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_simple.hfile differ diff --git a/hudi-client/hudi-client-common/src/test/resources/log4j-surefire-quiet.properties b/hudi-client/hudi-client-common/src/test/resources/log4j-surefire-quiet.properties deleted file mode 100644 index 2b94ea2903067..0000000000000 --- a/hudi-client/hudi-client-common/src/test/resources/log4j-surefire-quiet.properties +++ /dev/null @@ -1,30 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# CONSOLE is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# CONSOLE uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL \ No newline at end of file diff --git a/hudi-client/hudi-client-common/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-client-common/src/test/resources/log4j-surefire.properties deleted file mode 100644 index 32af462093ae5..0000000000000 --- a/hudi-client/hudi-client-common/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,31 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache=INFO -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# A1 is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-client/hudi-flink-client/pom.xml b/hudi-client/hudi-flink-client/pom.xml index e07f0c672262e..6ad0b3c4785f9 100644 --- a/hudi-client/hudi-flink-client/pom.xml +++ b/hudi-client/hudi-flink-client/pom.xml @@ -17,214 +17,311 @@ --> - - hudi-client - org.apache.hudi - 0.6.1-SNAPSHOT - - 4.0.0 - - hudi-flink-client - ${parent.version} - - hudi-flink-client - jar - - - - - org.apache.hudi - hudi-client-common - ${parent.version} - - - - - org.apache.flink - flink-streaming-java_${scala.binary.version} - - - org.apache.flink - flink-clients_${scala.binary.version} - - - - - org.apache.parquet - parquet-avro - - - - - org.apache.hudi - hudi-common - ${project.version} - tests - test-jar - test - - - org.apache.hudi - hudi-client-common - ${project.version} - tests - test-jar - test - - - org.apache.hudi - hudi-hadoop-mr - ${project.version} - test - - - - - org.apache.hbase - hbase-testing-util - ${hbase.version} - test - - - org.codehaus.jackson - jackson-mapper-asl - - - org.codehaus.jackson - jackson-core-asl - - - javax.xml.bind - * - - - - - - - ${hive.groupid} - hive-exec - ${hive.version} - test - ${hive.exec.classifier} - - - ${hive.groupid} - hive-metastore - ${hive.version} - test - - - - - org.junit.jupiter - junit-jupiter-api - test - - - org.junit.jupiter - junit-jupiter-engine - test - - - org.junit.vintage - junit-vintage-engine - test - - - org.junit.jupiter - junit-jupiter-params - test - - - org.mockito - mockito-junit-jupiter - test - - - org.junit.platform - junit-platform-runner - test - - - org.junit.platform - junit-platform-suite-api - test - - - org.junit.platform - junit-platform-commons - test - - - - - - - org.jacoco - jacoco-maven-plugin - - - net.alchim31.maven - scala-maven-plugin - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - - compile - - compile - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - test-jar - - test-compile - - - - false - - - - org.apache.rat - apache-rat-plugin - - - - - - src/main/resources - - - src/test/resources - - - + + hudi-client + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-flink-client + 0.12.2-dt-SNAPSHOT + + hudi-flink-client + jar + + + ${flink.format.parquet.version} + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-slf4j-impl + + + org.slf4j + slf4j-api + + + + + org.apache.hudi + hudi-client-common + ${project.parent.version} + + + + + org.apache.flink + ${flink.streaming.java.artifactId} + + + org.apache.flink + ${flink.clients.artifactId} + + + org.apache.flink + ${flink.hadoop.compatibility.artifactId} + ${flink.version} + provided + + + org.apache.flink + flink-table-common + ${flink.version} + provided + + + org.apache.flink + ${flink.table.runtime.artifactId} + 
${flink.version} + provided + + + org.apache.flink + ${flink.parquet.artifactId} + ${flink.version} + provided + + + org.apache.parquet + parquet-hadoop + + + + + org.apache.flink + flink-avro + ${flink.version} + provided + + + + + org.apache.parquet + parquet-avro + ${parquet.version} + + + + org.apache.parquet + parquet-column + ${parquet.version} + + + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-client-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + test + + + + + org.apache.hbase + hbase-testing-util + ${hbase.version} + test + + + org.codehaus.jackson + jackson-mapper-asl + + + org.codehaus.jackson + jackson-core-asl + + + javax.xml.bind + * + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + + + + + ${hive.groupid} + hive-exec + ${hive.version} + test + ${hive.exec.classifier} + + + ${hive.groupid} + hive-metastore + ${hive.version} + test + + + + + org.apache.flink + ${flink.test.utils.artifactId} + ${flink.version} + test + + + org.apache.flink + ${flink.runtime.artifactId} + ${flink.version} + test + tests + + + org.apache.flink + ${flink.streaming.java.artifactId} + ${flink.version} + test + tests + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + org.junit.platform + junit-platform-runner + test + + + org.junit.platform + junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + + + + + + + org.jacoco + jacoco-maven-plugin + + + net.alchim31.maven + scala-maven-plugin + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + compile + + compile + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + + + + src/main/resources + + + src/test/resources + + + diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java index bb405e21f961f..aab248fc3cf16 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/FlinkTaskContextSupplier.java @@ -18,8 +18,8 @@ package org.apache.hudi.client; -import org.apache.hudi.client.common.EngineProperty; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.util.Option; import org.apache.flink.api.common.functions.RuntimeContext; diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java index f975406e4505b..551b412ccbc6c 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java +++ 
b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java @@ -18,61 +18,101 @@ package org.apache.hudi.client; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.async.AsyncCleanerService; import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.TableServiceType; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.index.FlinkHoodieIndex; +import org.apache.hudi.index.FlinkHoodieIndexFactory; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.io.FlinkAppendHandle; +import org.apache.hudi.io.FlinkConcatAndReplaceHandle; +import org.apache.hudi.io.FlinkConcatHandle; +import org.apache.hudi.io.FlinkCreateHandle; +import org.apache.hudi.io.FlinkMergeAndReplaceHandle; +import org.apache.hudi.io.FlinkMergeHandle; +import org.apache.hudi.io.HoodieWriteHandle; +import org.apache.hudi.io.MiniBatchHandle; +import org.apache.hudi.metadata.FlinkHoodieBackedTableMetadataWriter; +import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieFlinkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.upgrade.FlinkUpgradeDowngrade; +import org.apache.hudi.table.action.compact.CompactHelpers; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.table.upgrade.FlinkUpgradeDowngradeHelper; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; +import org.apache.hudi.util.FlinkClientUtil; import com.codahale.metrics.Timer; import org.apache.hadoop.conf.Configuration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.text.ParseException; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +/** + * Flink hoodie write client. + * + *

    The client is used both on driver (for starting/committing transactions) + * and executor (for writing dataset). + * + * @param type of the payload + */ @SuppressWarnings("checkstyle:LineLength") public class HoodieFlinkWriteClient extends - AbstractHoodieWriteClient>, List, List> { + BaseHoodieWriteClient>, List, List> { - public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { - super(context, clientConfig); - } + private static final Logger LOG = LoggerFactory.getLogger(HoodieFlinkWriteClient.class); - public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending) { - super(context, writeConfig, rollbackPending); - } + /** + * FileID to write handle mapping in order to record the write handles for each file group, + * so that we can append the mini-batch data buffer incrementally. + */ + private final Map> bucketToHandles; - public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending, - Option timelineService) { - super(context, writeConfig, rollbackPending, timelineService); + public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig) { + super(context, writeConfig, FlinkUpgradeDowngradeHelper.getInstance()); + this.bucketToHandles = new HashMap<>(); } /** * Complete changes performed at the given instantTime marker with specified action. */ @Override - protected HoodieIndex>, List, List> createIndex(HoodieWriteConfig writeConfig) { - return FlinkHoodieIndex.createIndex((HoodieFlinkEngineContext) context, config); + protected HoodieIndex createIndex(HoodieWriteConfig writeConfig) { + return FlinkHoodieIndexFactory.createIndex((HoodieFlinkEngineContext) context, config); } @Override @@ -82,16 +122,16 @@ public boolean commit(String instantTime, List writeStatuses, Optio } @Override - protected HoodieTable>, List, List> createTable(HoodieWriteConfig config, Configuration hadoopConf) { + protected HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf) { return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); } @Override public List> filterExists(List> hoodieRecords) { // Create a Hoodie table which encapsulated the commits and files visible - HoodieFlinkTable table = HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); + HoodieFlinkTable table = getHoodieTable(); Timer.Context indexTimer = metrics.getIndexCtx(); - List> recordsWithLocation = getIndex().tagLocation(hoodieRecords, context, table); + List> recordsWithLocation = getIndex().tagLocation(HoodieListData.eager(hoodieRecords), context, table).collectAsList(); metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 
0L : indexTimer.stop())); return recordsWithLocation.stream().filter(v1 -> !v1.isCurrentLocationKnown()).collect(Collectors.toList()); } @@ -104,11 +144,12 @@ public void bootstrap(Option> extraMetadata) { @Override public List upsert(List> records, String instantTime) { HoodieTable>, List, List> table = - getTableAndInitCtx(WriteOperationType.UPSERT, instantTime); + initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime)); table.validateUpsertSchema(); - setOperationType(WriteOperationType.UPSERT); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.upsert(context, instantTime, records); + preWrite(instantTime, WriteOperationType.UPSERT, table.getMetaClient()); + final HoodieWriteHandle writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), + instantTime, table, records.listIterator()); + HoodieWriteMetadata> result = ((HoodieFlinkTable) table).upsert(context, writeHandle, instantTime, records); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } @@ -117,23 +158,76 @@ public List upsert(List> records, String instantTim @Override public List upsertPreppedRecords(List> preppedRecords, String instantTime) { - throw new HoodieNotSupportedException("UpsertPrepped operation is not supported yet"); + // only used for metadata table, the upsert happens in single thread + HoodieTable>, List, List> table = + initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime)); + table.validateUpsertSchema(); + preWrite(instantTime, WriteOperationType.UPSERT_PREPPED, table.getMetaClient()); + Map>> preppedRecordsByFileId = preppedRecords.stream().parallel() + .collect(Collectors.groupingBy(r -> r.getCurrentLocation().getFileId())); + return preppedRecordsByFileId.values().stream().parallel().map(records -> { + final HoodieWriteHandle writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), + instantTime, table, records.listIterator()); + HoodieWriteMetadata> result = ((HoodieFlinkTable) table).upsertPrepped(context, writeHandle, instantTime, records); + return postWrite(result, instantTime, table); + }).flatMap(Collection::stream).collect(Collectors.toList()); } @Override public List insert(List> records, String instantTime) { HoodieTable>, List, List> table = - getTableAndInitCtx(WriteOperationType.INSERT, instantTime); - table.validateUpsertSchema(); - setOperationType(WriteOperationType.INSERT); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.insert(context, instantTime, records); + initTable(WriteOperationType.INSERT, Option.ofNullable(instantTime)); + table.validateInsertSchema(); + preWrite(instantTime, WriteOperationType.INSERT, table.getMetaClient()); + // create the write handle if not exists + final HoodieWriteHandle writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), + instantTime, table, records.listIterator()); + HoodieWriteMetadata> result = ((HoodieFlinkTable) table).insert(context, writeHandle, instantTime, records); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } return postWrite(result, instantTime, table); } + /** + * Removes all existing records from the partitions affected and inserts the given HoodieRecords, into the table. 
+ * + * @param records HoodieRecords to insert + * @param instantTime Instant time of the commit + * @return list of WriteStatus to inspect errors and counts + */ + public List insertOverwrite( + List> records, final String instantTime) { + HoodieTable>, List, List> table = + initTable(WriteOperationType.INSERT_OVERWRITE, Option.ofNullable(instantTime)); + table.validateInsertSchema(); + preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE, table.getMetaClient()); + // create the write handle if not exists + final HoodieWriteHandle writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), + instantTime, table, records.listIterator()); + HoodieWriteMetadata result = ((HoodieFlinkTable) table).insertOverwrite(context, writeHandle, instantTime, records); + return postWrite(result, instantTime, table); + } + + /** + * Removes all existing records of the Hoodie table and inserts the given HoodieRecords, into the table. + * + * @param records HoodieRecords to insert + * @param instantTime Instant time of the commit + * @return list of WriteStatus to inspect errors and counts + */ + public List insertOverwriteTable( + List> records, final String instantTime) { + HoodieTable table = initTable(WriteOperationType.INSERT_OVERWRITE_TABLE, Option.ofNullable(instantTime)); + table.validateInsertSchema(); + preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE_TABLE, table.getMetaClient()); + // create the write handle if not exists + final HoodieWriteHandle writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), + instantTime, table, records.listIterator()); + HoodieWriteMetadata result = ((HoodieFlinkTable) table).insertOverwriteTable(context, writeHandle, instantTime, records); + return postWrite(result, instantTime, table); + } + @Override public List insertPreppedRecords(List> preppedRecords, String instantTime) { throw new HoodieNotSupportedException("InsertPrepped operation is not supported yet"); @@ -145,74 +239,392 @@ public List bulkInsert(List> records, String instan } @Override - public List bulkInsert(List> records, String instantTime, Option>>> userDefinedBulkInsertPartitioner) { + public List bulkInsert(List> records, String instantTime, Option userDefinedBulkInsertPartitioner) { throw new HoodieNotSupportedException("BulkInsert operation is not supported yet"); } @Override - public List bulkInsertPreppedRecords(List> preppedRecords, String instantTime, Option>>> bulkInsertPartitioner) { + public List bulkInsertPreppedRecords(List> preppedRecords, String instantTime, Option bulkInsertPartitioner) { throw new HoodieNotSupportedException("BulkInsertPrepped operation is not supported yet"); } @Override public List delete(List keys, String instantTime) { HoodieTable>, List, List> table = - getTableAndInitCtx(WriteOperationType.DELETE, instantTime); - setOperationType(WriteOperationType.DELETE); - HoodieWriteMetadata> result = table.delete(context,instantTime, keys); + initTable(WriteOperationType.DELETE, Option.ofNullable(instantTime)); + preWrite(instantTime, WriteOperationType.DELETE, table.getMetaClient()); + HoodieWriteMetadata> result = table.delete(context, instantTime, keys); return postWrite(result, instantTime, table); } + @Override + public void preWrite(String instantTime, WriteOperationType writeOperationType, HoodieTableMetaClient metaClient) { + setOperationType(writeOperationType); + // Note: the code to read the commit metadata is not thread safe for JSON deserialization, + // remove the table metadata sync + + // remove the async cleaning + } + + 
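The write paths above ("create the write handle if not exists") all funnel through the bucketToHandles map introduced in this patch, so that successive mini-batches within one checkpoint append to the same file-group handle instead of opening a new one. A minimal, self-contained sketch of that caching idea, using a hypothetical WriteHandle stand-in rather than Hudi's real handle classes:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

// Illustrative sketch only: keep one write handle per file group (file id)
// so later mini-batches in the same checkpoint reuse the handle created by
// the first one. "WriteHandle" is a hypothetical stand-in type.
class WriteHandleCache<WriteHandle> {
  private final Map<String, WriteHandle> bucketToHandles = new HashMap<>();

  WriteHandle getOrCreate(String fileId, Function<String, WriteHandle> createHandle) {
    // first mini-batch for this file id creates the handle, later ones reuse it
    return bucketToHandles.computeIfAbsent(fileId, createHandle);
  }

  void cleanHandles() {
    // cleared once per checkpoint, after all handles have been closed
    bucketToHandles.clear();
  }
}
```

The real client goes further: getOrCreateWriteHandle, later in this file's diff, swaps in create, append, merge or concat/replace handles depending on the table type and on whether the previous handle for the file id should be replaced.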
@Override + protected void writeTableMetadata(HoodieTable table, String instantTime, String actionType, HoodieCommitMetadata metadata) { + try (HoodieBackedTableMetadataWriter metadataWriter = initMetadataWriter()) { + metadataWriter.update(metadata, instantTime, getHoodieTable().isTableServiceAction(actionType, instantTime)); + } catch (Exception e) { + throw new HoodieException("Failed to update metadata", e); + } + } + + /** + * Initialize the table metadata writer, for e.g, bootstrap the metadata table + * from the filesystem if it does not exist. + */ + public HoodieBackedTableMetadataWriter initMetadataWriter() { + return (HoodieBackedTableMetadataWriter) FlinkHoodieBackedTableMetadataWriter.create( + FlinkClientUtil.getHadoopConf(), this.config, HoodieFlinkEngineContext.DEFAULT); + } + + /** + * Initialized the metadata table on start up, should only be called once on driver. + */ + public void initMetadataTable() { + HoodieFlinkTable table = getHoodieTable(); + if (config.isMetadataTableEnabled()) { + // initialize the metadata table path + try (HoodieBackedTableMetadataWriter metadataWriter = initMetadataWriter()) { + // do nothing + } catch (Exception e) { + throw new HoodieException("Failed to initialize metadata table", e); + } + // clean the obsolete index stats + table.deleteMetadataIndexIfNecessary(); + } else { + // delete the metadata table if it was enabled but is now disabled + table.maybeDeleteMetadataTable(); + } + } + + /** + * Starts async cleaning service for finished commits. + * + *

    The Flink write client is designed to write data set as buckets + * but cleaning action should trigger after all the write actions within a + * checkpoint finish. + */ + public void startAsyncCleaning() { + if (this.asyncCleanerService == null) { + this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this); + } else { + this.asyncCleanerService.start(null); + } + } + + /** + * Blocks and wait for the async cleaning service to finish. + * + *

    The Flink write client is designed to write data set as buckets + * but cleaning action should trigger after all the write actions within a + * checkpoint finish. + */ + public void waitForCleaningFinish() { + if (this.asyncCleanerService != null) { + LOG.info("Cleaner has been spawned already. Waiting for it to finish"); + AsyncCleanerService.waitForCompletion(asyncCleanerService); + LOG.info("Cleaner has finished"); + } + } + @Override protected List postWrite(HoodieWriteMetadata> result, String instantTime, - HoodieTable>, List, List> hoodieTable) { + HoodieTable hoodieTable) { if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis()); } return result.getWriteStatuses(); } + /** + * Post commit is rewrite to be invoked after a successful commit. + * + *

    The Flink write client is designed to write data set as buckets + * but cleaning action should trigger after all the write actions within a + * checkpoint finish. + * + * @param table Table to commit on + * @param metadata Commit Metadata corresponding to committed instant + * @param instantTime Instant Time + * @param extraMetadata Additional Metadata passed by user + */ + @Override + protected void postCommit(HoodieTable table, + HoodieCommitMetadata metadata, + String instantTime, + Option> extraMetadata, + boolean acquireLockForArchival) { + try { + // Delete the marker directory for the instant. + WriteMarkersFactory.get(config.getMarkersType(), createTable(config, hadoopConf), instantTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + autoArchiveOnCommit(table, acquireLockForArchival); + } finally { + this.heartbeatClient.stop(instantTime); + } + } + @Override - public void commitCompaction(String compactionInstantTime, List writeStatuses, Option> extraMetadata) throws IOException { - throw new HoodieNotSupportedException("Compaction is not supported yet"); + public void commitCompaction( + String compactionInstantTime, + HoodieCommitMetadata metadata, + Option> extraMetadata) { + HoodieFlinkTable table = getHoodieTable(); + extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata)); + completeCompaction(metadata, table, compactionInstantTime); } @Override - protected void completeCompaction(HoodieCommitMetadata metadata, List writeStatuses, HoodieTable>, List, List> table, String compactionCommitTime) { - throw new HoodieNotSupportedException("Compaction is not supported yet"); + public void completeCompaction( + HoodieCommitMetadata metadata, + HoodieTable table, + String compactionCommitTime) { + this.context.setJobStatus(this.getClass().getSimpleName(), "Collect compaction write status and commit compaction: " + config.getTableName()); + List writeStats = metadata.getWriteStats(); + final HoodieInstant compactionInstant = HoodieTimeline.getCompactionInflightInstant(compactionCommitTime); + try { + this.txnManager.beginTransaction(Option.of(compactionInstant), Option.empty()); + finalizeWrite(table, compactionCommitTime, writeStats); + // commit to data table after committing to metadata table. + // Do not do any conflict resolution here as we do with regular writes. We take the lock here to ensure all writes to metadata table happens within a + // single lock (single writer). Because more than one write to metadata table will result in conflicts since all of them updates the same partition. + writeTableMetadata(table, compactionCommitTime, compactionInstant.getAction(), metadata); + LOG.info("Committing Compaction {} finished with result {}.", compactionCommitTime, metadata); + CompactHelpers.getInstance().completeInflightCompaction(table, compactionCommitTime, metadata); + } finally { + this.txnManager.endTransaction(Option.of(compactionInstant)); + } + WriteMarkersFactory + .get(config.getMarkersType(), table, compactionCommitTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + if (compactionTimer != null) { + long durationInMs = metrics.getDurationInMs(compactionTimer.stop()); + try { + metrics.updateCommitMetrics(HoodieActiveTimeline.parseDateFromInstantTime(compactionCommitTime).getTime(), + durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION); + } catch (ParseException e) { + throw new HoodieCommitException("Commit time is not of valid format. 
Failed to commit compaction " + + config.getBasePath() + " at time " + compactionCommitTime, e); + } + } + LOG.info("Compacted successfully on commit " + compactionCommitTime); } @Override - protected List compact(String compactionInstantTime, boolean shouldComplete) { - throw new HoodieNotSupportedException("Compaction is not supported yet"); + protected HoodieWriteMetadata> compact(String compactionInstantTime, boolean shouldComplete) { + // only used for metadata table, the compaction happens in single thread + HoodieWriteMetadata> compactionMetadata = getHoodieTable().compact(context, compactionInstantTime); + commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); + return compactionMetadata; } @Override - protected HoodieTable>, List, List> getTableAndInitCtx(WriteOperationType operationType, String instantTime) { - HoodieTableMetaClient metaClient = createMetaClient(true); - new FlinkUpgradeDowngrade(metaClient, config, context).run(metaClient, HoodieTableVersion.current(), config, context, instantTime); - return getTableAndInitCtx(metaClient, operationType); + public HoodieWriteMetadata> cluster(final String clusteringInstant, final boolean shouldComplete) { + throw new HoodieNotSupportedException("Clustering is not supported yet"); } - private HoodieTable>, List, List> getTableAndInitCtx(HoodieTableMetaClient metaClient, WriteOperationType operationType) { - if (operationType == WriteOperationType.DELETE) { - setWriteSchemaForDeletes(metaClient); + private void completeClustering( + HoodieReplaceCommitMetadata metadata, + HoodieTable>, List, List> table, + String clusteringCommitTime) { + this.context.setJobStatus(this.getClass().getSimpleName(), "Collect clustering write status and commit clustering"); + HoodieInstant clusteringInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringCommitTime); + List writeStats = metadata.getPartitionToWriteStats().entrySet().stream().flatMap(e -> + e.getValue().stream()).collect(Collectors.toList()); + if (writeStats.stream().mapToLong(HoodieWriteStat::getTotalWriteErrors).sum() > 0) { + throw new HoodieClusteringException("Clustering failed to write to files:" + + writeStats.stream().filter(s -> s.getTotalWriteErrors() > 0L).map(HoodieWriteStat::getFileId).collect(Collectors.joining(","))); + } + + try { + this.txnManager.beginTransaction(Option.of(clusteringInstant), Option.empty()); + finalizeWrite(table, clusteringCommitTime, writeStats); + // commit to data table after committing to metadata table. + // Do not do any conflict resolution here as we do with regular writes. We take the lock here to ensure all writes to metadata table happens within a + // single lock (single writer). Because more than one write to metadata table will result in conflicts since all of them updates the same partition. 
+ writeTableMetadata(table, clusteringCommitTime, clusteringInstant.getAction(), metadata); + LOG.info("Committing Clustering {} finished with result {}.", clusteringCommitTime, metadata); + table.getActiveTimeline().transitionReplaceInflightToComplete( + HoodieTimeline.getReplaceCommitInflightInstant(clusteringCommitTime), + Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + } catch (IOException e) { + throw new HoodieClusteringException( + "Failed to commit " + table.getMetaClient().getBasePath() + " at time " + clusteringCommitTime, e); + } finally { + this.txnManager.endTransaction(Option.of(clusteringInstant)); } + + WriteMarkersFactory.get(config.getMarkersType(), table, clusteringCommitTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + if (clusteringTimer != null) { + long durationInMs = metrics.getDurationInMs(clusteringTimer.stop()); + try { + metrics.updateCommitMetrics(HoodieActiveTimeline.parseDateFromInstantTime(clusteringCommitTime).getTime(), + durationInMs, metadata, HoodieActiveTimeline.REPLACE_COMMIT_ACTION); + } catch (ParseException e) { + throw new HoodieCommitException("Commit time is not of valid format. Failed to commit compaction " + + config.getBasePath() + " at time " + clusteringCommitTime, e); + } + } + LOG.info("Clustering successfully on commit " + clusteringCommitTime); + } + + @Override + protected HoodieTable doInitTable(HoodieTableMetaClient metaClient, Option instantTime, boolean initialMetadataTableIfNecessary) { // Create a Hoodie table which encapsulated the commits and files visible - HoodieFlinkTable table = HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context, metaClient); - if (table.getMetaClient().getCommitActionType().equals(HoodieTimeline.COMMIT_ACTION)) { - writeTimer = metrics.getCommitCtx(); + return getHoodieTable(); + } + + @Override + protected void tryUpgrade(HoodieTableMetaClient metaClient, Option instantTime) { + // do nothing. + // flink executes the upgrade/downgrade once when initializing the first instant on start up, + // no need to execute the upgrade/downgrade on each write in streaming. + } + + public void completeTableService( + TableServiceType tableServiceType, + HoodieCommitMetadata metadata, + HoodieTable>, List, List> table, + String commitInstant) { + switch (tableServiceType) { + case CLUSTER: + completeClustering((HoodieReplaceCommitMetadata) metadata, table, commitInstant); + break; + case COMPACT: + completeCompaction(metadata, table, commitInstant); + break; + default: + throw new IllegalArgumentException("This table service is not valid " + tableServiceType); + } + } + + /** + * Upgrade downgrade the Hoodie table. + * + *

    This action should only be executed once for each commit. + * The modification of the table properties is not thread safe. + */ + public void upgradeDowngrade(String instantTime, HoodieTableMetaClient metaClient) { + new UpgradeDowngrade(metaClient, config, context, FlinkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.current(), instantTime); + } + + /** + * Clean the write handles within a checkpoint interval. + * All the handles should have been closed already. + */ + public void cleanHandles() { + this.bucketToHandles.clear(); + } + + /** + * Clean the write handles within a checkpoint interval, this operation + * would close the underneath file handles, if any error happens, clean the + * corrupted data file. + */ + public void cleanHandlesGracefully() { + this.bucketToHandles.values() + .forEach(handle -> ((MiniBatchHandle) handle).closeGracefully()); + this.bucketToHandles.clear(); + } + + /** + * Get or create a new write handle in order to reuse the file handles. + * + * @param record The first record in the bucket + * @param config Write config + * @param instantTime The instant time + * @param table The table + * @param recordItr Record iterator + * @return Existing write handle or create a new one + */ + private HoodieWriteHandle getOrCreateWriteHandle( + HoodieRecord record, + HoodieWriteConfig config, + String instantTime, + HoodieTable>, List, List> table, + Iterator> recordItr) { + final HoodieRecordLocation loc = record.getCurrentLocation(); + final String fileID = loc.getFileId(); + final String partitionPath = record.getPartitionPath(); + final boolean insertClustering = config.allowDuplicateInserts(); + + if (bucketToHandles.containsKey(fileID)) { + MiniBatchHandle lastHandle = (MiniBatchHandle) bucketToHandles.get(fileID); + if (lastHandle.shouldReplace()) { + HoodieWriteHandle writeHandle = insertClustering + ? new FlinkConcatAndReplaceHandle<>(config, instantTime, table, recordItr, partitionPath, fileID, + table.getTaskContextSupplier(), lastHandle.getWritePath()) + : new FlinkMergeAndReplaceHandle<>(config, instantTime, table, recordItr, partitionPath, fileID, + table.getTaskContextSupplier(), lastHandle.getWritePath()); + this.bucketToHandles.put(fileID, writeHandle); // override with new replace handle + return writeHandle; + } + } + + final boolean isDelta = table.getMetaClient().getTableType().equals(HoodieTableType.MERGE_ON_READ); + final HoodieWriteHandle writeHandle; + if (isDelta) { + writeHandle = new FlinkAppendHandle<>(config, instantTime, table, partitionPath, fileID, recordItr, + table.getTaskContextSupplier()); + } else if (loc.getInstantTime().equals("I")) { + writeHandle = new FlinkCreateHandle<>(config, instantTime, table, partitionPath, + fileID, table.getTaskContextSupplier()); } else { - writeTimer = metrics.getDeltaCommitCtx(); + writeHandle = insertClustering + ? 
new FlinkConcatHandle<>(config, instantTime, table, recordItr, partitionPath, + fileID, table.getTaskContextSupplier()) + : new FlinkMergeHandle<>(config, instantTime, table, recordItr, partitionPath, + fileID, table.getTaskContextSupplier()); + } + this.bucketToHandles.put(fileID, writeHandle); + return writeHandle; + } + + public HoodieFlinkTable getHoodieTable() { + return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); + } + + public Map> getPartitionToReplacedFileIds( + WriteOperationType writeOperationType, + List writeStatuses) { + HoodieFlinkTable table = getHoodieTable(); + switch (writeOperationType) { + case INSERT_OVERWRITE: + return writeStatuses.stream().map(status -> status.getStat().getPartitionPath()).distinct() + .collect( + Collectors.toMap( + partition -> partition, + partitionPath -> getAllExistingFileIds(table, partitionPath))); + case INSERT_OVERWRITE_TABLE: + Map> partitionToExistingFileIds = new HashMap<>(); + List partitionPaths = + FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), table.getMetaClient().getBasePath()); + if (partitionPaths != null && partitionPaths.size() > 0) { + context.setJobStatus(this.getClass().getSimpleName(), "Getting ExistingFileIds of all partitions: " + config.getTableName()); + partitionToExistingFileIds = partitionPaths.stream().parallel() + .collect( + Collectors.toMap( + partition -> partition, + partition -> getAllExistingFileIds(table, partition))); + } + return partitionToExistingFileIds; + default: + throw new AssertionError(); } - return table; } - public List getInflightsAndRequestedInstants(String commitType) { - HoodieFlinkTable table = HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); - HoodieTimeline unCompletedTimeline = table.getMetaClient().getCommitsTimeline().filterInflightsAndRequested(); - return unCompletedTimeline.getInstants().filter(x -> x.getAction().equals(commitType)).map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); + private List getAllExistingFileIds(HoodieFlinkTable table, String partitionPath) { + // because new commit is not complete. it is safe to mark all existing file Ids as old files + return table.getSliceView().getLatestFileSlices(partitionPath).map(FileSlice::getFileId).distinct().collect(Collectors.toList()); } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkSizeBasedClusteringPlanStrategy.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkSizeBasedClusteringPlanStrategy.java new file mode 100644 index 0000000000000..6ff063f49d445 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkSizeBasedClusteringPlanStrategy.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.clustering.plan.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieFlinkCopyOnWriteTable; +import org.apache.hudi.table.HoodieFlinkMergeOnReadTable; +import org.apache.hudi.table.action.cluster.strategy.PartitionAwareClusteringPlanStrategy; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS; + +/** + * Clustering Strategy based on following. + * 1) Creates clustering groups based on max size allowed per group. + * 2) Excludes files that are greater than 'small.file.limit' from clustering plan. + */ +public class FlinkSizeBasedClusteringPlanStrategy> + extends PartitionAwareClusteringPlanStrategy>, List, List> { + private static final Logger LOG = LogManager.getLogger(FlinkSizeBasedClusteringPlanStrategy.class); + + public FlinkSizeBasedClusteringPlanStrategy(HoodieFlinkCopyOnWriteTable table, + HoodieFlinkEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + public FlinkSizeBasedClusteringPlanStrategy(HoodieFlinkMergeOnReadTable table, + HoodieFlinkEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + protected Stream buildClusteringGroupsForPartition(String partitionPath, List fileSlices) { + HoodieWriteConfig writeConfig = getWriteConfig(); + + List, Integer>> fileSliceGroups = new ArrayList<>(); + List currentGroup = new ArrayList<>(); + long totalSizeSoFar = 0; + + for (FileSlice currentSlice : fileSlices) { + // check if max size is reached and create new group, if needed. + // in now, every clustering group out put is 1 file group. + if (totalSizeSoFar >= writeConfig.getClusteringTargetFileMaxBytes() && !currentGroup.isEmpty()) { + LOG.info("Adding one clustering group " + totalSizeSoFar + " max bytes: " + + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size()); + fileSliceGroups.add(Pair.of(currentGroup, 1)); + currentGroup = new ArrayList<>(); + totalSizeSoFar = 0; + } + + // Add to the current file-group + currentGroup.add(currentSlice); + // assume each file group size is ~= parquet.max.file.size + totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? 
currentSlice.getBaseFile().get().getFileSize() : writeConfig.getParquetMaxFileSize(); + } + + if (!currentGroup.isEmpty()) { + fileSliceGroups.add(Pair.of(currentGroup, 1)); + } + + return fileSliceGroups.stream().map(fileSliceGroup -> + HoodieClusteringGroup.newBuilder() + .setSlices(getFileSliceInfo(fileSliceGroup.getLeft())) + .setNumOutputFileGroups(fileSliceGroup.getRight()) + .setMetrics(buildMetrics(fileSliceGroup.getLeft())) + .build()); + } + + @Override + protected Map getStrategyParams() { + Map params = new HashMap<>(); + if (!StringUtils.isNullOrEmpty(getWriteConfig().getClusteringSortColumns())) { + params.put(PLAN_STRATEGY_SORT_COLUMNS.key(), getWriteConfig().getClusteringSortColumns()); + } + return params; + } + + @Override + protected Stream getFileSlicesEligibleForClustering(final String partition) { + return super.getFileSlicesEligibleForClustering(partition) + // Only files that have basefile size smaller than small file size are eligible. + .filter(slice -> slice.getBaseFile().map(HoodieBaseFile::getFileSize).orElse(0L) < getWriteConfig().getClusteringSmallFileLimit()); + } + +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java index 74c921fd0cb8b..c9136da6bb453 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/common/HoodieFlinkEngineContext.java @@ -19,35 +19,60 @@ package org.apache.hudi.client.common; import org.apache.hudi.client.FlinkTaskContextSupplier; -import org.apache.hudi.client.common.function.SerializableConsumer; -import org.apache.hudi.client.common.function.SerializableFunction; -import org.apache.hudi.client.common.function.SerializablePairFunction; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieAtomicLongAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableConsumer; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; +import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.util.FlinkClientUtil; import org.apache.flink.api.common.functions.RuntimeContext; -import org.apache.hadoop.conf.Configuration; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.Stream; -import scala.Tuple2; - -import static org.apache.hudi.client.common.function.FunctionWrapper.throwingFlatMapWrapper; -import static org.apache.hudi.client.common.function.FunctionWrapper.throwingForeachWrapper; -import static 
org.apache.hudi.client.common.function.FunctionWrapper.throwingMapToPairWrapper; -import static org.apache.hudi.client.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; /** * A flink engine implementation of HoodieEngineContext. */ public class HoodieFlinkEngineContext extends HoodieEngineContext { - private RuntimeContext runtimeContext; + public static final HoodieFlinkEngineContext DEFAULT = new HoodieFlinkEngineContext(); + + private final RuntimeContext runtimeContext; + + private HoodieFlinkEngineContext() { + this(new SerializableConfiguration(FlinkClientUtil.getHadoopConf()), new DefaultTaskContextSupplier()); + } + + public HoodieFlinkEngineContext(org.apache.hadoop.conf.Configuration hadoopConf) { + this(new SerializableConfiguration(hadoopConf), new DefaultTaskContextSupplier()); + } public HoodieFlinkEngineContext(TaskContextSupplier taskContextSupplier) { - this(new SerializableConfiguration(new Configuration()), taskContextSupplier); + this(new SerializableConfiguration(FlinkClientUtil.getHadoopConf()), taskContextSupplier); } public HoodieFlinkEngineContext(SerializableConfiguration hadoopConf, TaskContextSupplier taskContextSupplier) { @@ -55,6 +80,21 @@ public HoodieFlinkEngineContext(SerializableConfiguration hadoopConf, TaskContex this.runtimeContext = ((FlinkTaskContextSupplier) taskContextSupplier).getFlinkRuntimeContext(); } + @Override + public HoodieAccumulator newAccumulator() { + return HoodieAtomicLongAccumulator.create(); + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieListData.eager(Collections.emptyList()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieListData.eager(data); + } + public RuntimeContext getRuntimeContext() { return this.runtimeContext; } @@ -64,6 +104,36 @@ public List map(List data, SerializableFunction func, int par return data.stream().parallel().map(throwingMapWrapper(func)).collect(Collectors.toList()); } + @Override + public List mapToPairAndReduceByKey(List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() + .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( + Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + .filter(Objects::nonNull); + } + + @Override + public List reduceByKey( + 
List> data, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel() + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(Collectors.toList()); @@ -76,7 +146,7 @@ public void foreach(List data, SerializableConsumer consumer, int para @Override public Map mapToPair(List data, SerializablePairFunction func, Integer parallelism) { - return data.stream().map(throwingMapToPairWrapper(func)).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); + return data.stream().parallel().map(throwingMapToPairWrapper(func)).collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); } @Override @@ -94,4 +164,34 @@ public Option getProperty(EngineProperty key) { public void setJobStatus(String activeModule, String activityDescription) { // no operation for now } + + /** + * Override the flink context supplier to return constant write token. + */ + private static class DefaultTaskContextSupplier extends FlinkTaskContextSupplier { + + public DefaultTaskContextSupplier() { + this(null); + } + + public DefaultTaskContextSupplier(RuntimeContext flinkRuntimeContext) { + super(flinkRuntimeContext); + } + + public Supplier getPartitionIdSupplier() { + return () -> 0; + } + + public Supplier getStageIdSupplier() { + return () -> 1; + } + + public Supplier getAttemptIdSupplier() { + return () -> 0L; + } + + public Option getProperty(EngineProperty prop) { + return Option.empty(); + } + } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/AbstractHoodieRowData.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/AbstractHoodieRowData.java new file mode 100644 index 0000000000000..37d100fa78653 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/AbstractHoodieRowData.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.model; + +import org.apache.hudi.common.model.HoodieOperation; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.RowKind; + +/** + * RowData implementation for Hoodie Row. 
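Because the Flink client keeps records in plain in-memory Java lists rather than a distributed collection, the HoodieFlinkEngineContext methods above (mapToPairAndReduceByKey, reduceByKey) are implemented with parallel streams plus Collectors.groupingBy. A standalone sketch of that same shape, with purely illustrative key/value data that is not taken from the patch:

```java
import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Standalone illustration of the list-backed reduceByKey pattern: group the
// pairs by key with java.util.stream and reduce each group in place.
public class ReduceByKeySketch {
  public static void main(String[] args) {
    List<Map.Entry<String, Integer>> pairs = Arrays.asList(
        new SimpleEntry<>("year=2021", 1),
        new SimpleEntry<>("year=2022", 1),
        new SimpleEntry<>("year=2021", 1));
    Map<String, Integer> reduced = pairs.stream().parallel()
        .collect(Collectors.groupingBy(Map.Entry::getKey,
            Collectors.reducing(0, Map.Entry::getValue, Integer::sum)));
    System.out.println(reduced); // e.g. {year=2021=2, year=2022=1}
  }
}
```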
It wraps an {@link RowData} and keeps meta columns locally. But the {@link RowData} + * does include the meta columns as well just that {@link AbstractHoodieRowData} will intercept queries for meta columns and serve from its + * copy rather than fetching from {@link RowData}. + */ +public abstract class AbstractHoodieRowData implements RowData { + private final String[] metaColumns; + protected final RowData row; + protected final int metaColumnsNum; + + public AbstractHoodieRowData(String commitTime, + String commitSeqNumber, + String recordKey, + String partitionPath, + String fileName, + RowData row, + boolean withOperation) { + this.metaColumnsNum = withOperation ? 6 : 5; + this.metaColumns = new String[metaColumnsNum]; + metaColumns[0] = commitTime; + metaColumns[1] = commitSeqNumber; + metaColumns[2] = recordKey; + metaColumns[3] = partitionPath; + metaColumns[4] = fileName; + if (withOperation) { + metaColumns[5] = HoodieOperation.fromValue(row.getRowKind().toByteValue()).getName(); + } + this.row = row; + } + + @Override + public RowKind getRowKind() { + return row.getRowKind(); + } + + @Override + public void setRowKind(RowKind kind) { + this.row.setRowKind(kind); + } + + @Override + public boolean isNullAt(int ordinal) { + if (ordinal < metaColumnsNum) { + return null == getMetaColumnVal(ordinal); + } + return row.isNullAt(rebaseOrdinal(ordinal)); + } + + @Override + public boolean getBoolean(int ordinal) { + return row.getBoolean(rebaseOrdinal(ordinal)); + } + + @Override + public byte getByte(int ordinal) { + return row.getByte(rebaseOrdinal(ordinal)); + } + + @Override + public short getShort(int ordinal) { + return row.getShort(rebaseOrdinal(ordinal)); + } + + @Override + public int getInt(int ordinal) { + return row.getInt(rebaseOrdinal(ordinal)); + } + + @Override + public long getLong(int ordinal) { + return row.getLong(rebaseOrdinal(ordinal)); + } + + @Override + public float getFloat(int ordinal) { + return row.getFloat(rebaseOrdinal(ordinal)); + } + + @Override + public double getDouble(int ordinal) { + return row.getDouble(rebaseOrdinal(ordinal)); + } + + @Override + public DecimalData getDecimal(int ordinal, int precision, int scale) { + return row.getDecimal(rebaseOrdinal(ordinal), precision, scale); + } + + @Override + public TimestampData getTimestamp(int ordinal, int precision) { + return row.getTimestamp(rebaseOrdinal(ordinal), precision); + } + + @Override + public RawValueData getRawValue(int ordinal) { + return row.getRawValue(rebaseOrdinal(ordinal)); + } + + @Override + public StringData getString(int ordinal) { + if (ordinal < metaColumnsNum) { + return StringData.fromString(getMetaColumnVal(ordinal)); + } + return row.getString(rebaseOrdinal(ordinal)); + } + + @Override + public byte[] getBinary(int ordinal) { + return row.getBinary(rebaseOrdinal(ordinal)); + } + + @Override + public RowData getRow(int ordinal, int numFields) { + return row.getRow(rebaseOrdinal(ordinal), numFields); + } + + @Override + public ArrayData getArray(int ordinal) { + return row.getArray(rebaseOrdinal(ordinal)); + } + + @Override + public MapData getMap(int ordinal) { + return row.getMap(rebaseOrdinal(ordinal)); + } + + private String getMetaColumnVal(int ordinal) { + return this.metaColumns[ordinal]; + } + + protected abstract int rebaseOrdinal(int ordinal); +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowData.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowData.java new file mode 100644 
index 0000000000000..d37af8aa5e9f3 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowData.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.model; + +import org.apache.flink.table.data.RowData; + +/** + * RowData implementation for Hoodie Row. It wraps an {@link RowData} and keeps meta columns locally. But the {@link RowData} + * does include the meta columns as well just that {@link HoodieRowData} will intercept queries for meta columns and serve from its + * copy rather than fetching from {@link RowData}. + * + *

    The wrapped {@link RowData} does not contain hoodie metadata fields. + */ +public class HoodieRowData extends AbstractHoodieRowData { + + public HoodieRowData(String commitTime, + String commitSeqNumber, + String recordKey, + String partitionPath, + String fileName, + RowData row, + boolean withOperation) { + super(commitTime, commitSeqNumber, recordKey, partitionPath, fileName, row, withOperation); + } + + @Override + public int getArity() { + return metaColumnsNum + row.getArity(); + } + + protected int rebaseOrdinal(int ordinal) { + // NOTE: In cases when source row does not contain meta fields, we will have to + // rebase ordinal onto its indexes + return ordinal - metaColumnsNum; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowDataCreation.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowDataCreation.java new file mode 100644 index 0000000000000..b4fd5cd74ae5e --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowDataCreation.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.model; + +import org.apache.flink.table.data.RowData; + +/** + * The factory clazz for hoodie row data. + */ +public abstract class HoodieRowDataCreation { + /** + * Creates a {@link AbstractHoodieRowData} instance based on the given configuration. + */ + public static AbstractHoodieRowData create( + String commitTime, + String commitSeqNumber, + String recordKey, + String partitionPath, + String fileName, + RowData row, + boolean withOperation, + boolean withMetaFields) { + return withMetaFields + ? new HoodieRowDataWithMetaFields(commitTime, commitSeqNumber, recordKey, partitionPath, fileName, row, withOperation) + : new HoodieRowData(commitTime, commitSeqNumber, recordKey, partitionPath, fileName, row, withOperation); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowDataWithMetaFields.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowDataWithMetaFields.java new file mode 100644 index 0000000000000..eb1ab49812c05 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/model/HoodieRowDataWithMetaFields.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
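The two row wrappers differ only in how they resolve ordinals: HoodieRowData above subtracts metaColumnsNum because its wrapped row carries no meta fields, while HoodieRowDataWithMetaFields, further down in this patch, passes the ordinal through unchanged. A toy, self-contained model of that lookup rule; the array-backed rows and field names below are illustrative, not Hudi types:

```java
public class OrdinalRebaseSketch {
  // Reads below metaColumns.length are intercepted and served locally;
  // higher ordinals are delegated to the wrapped row, shifted only when the
  // wrapped row does not itself carry the meta fields.
  static String read(String[] metaColumns, String[] wrappedRow,
                     int ordinal, boolean wrappedRowHasMetaFields) {
    int metaColumnsNum = metaColumns.length;
    if (ordinal < metaColumnsNum) {
      return metaColumns[ordinal];
    }
    int rebased = wrappedRowHasMetaFields ? ordinal : ordinal - metaColumnsNum;
    return wrappedRow[rebased];
  }

  public static void main(String[] args) {
    String[] meta = {"commitTime", "seqNo", "recordKey", "partitionPath", "fileName"};
    String[] dataOnlyRow = {"uuid-0001", "42"}; // wrapped row without meta fields
    System.out.println(read(meta, dataOnlyRow, 0, false)); // commitTime
    System.out.println(read(meta, dataOnlyRow, 5, false)); // uuid-0001
  }
}
```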
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.model; + +import org.apache.flink.table.data.RowData; + +/** + * RowData implementation for Hoodie Row. It wraps an {@link RowData} and keeps meta columns locally. But the {@link RowData} + * does include the meta columns as well just that {@link HoodieRowDataWithMetaFields} will intercept queries for meta columns and serve from its + * copy rather than fetching from {@link RowData}. + * + *

    The wrapped {@link RowData} contains hoodie metadata fields. + */ +public class HoodieRowDataWithMetaFields extends AbstractHoodieRowData { + + public HoodieRowDataWithMetaFields(String commitTime, + String commitSeqNumber, + String recordKey, + String partitionPath, + String fileName, + RowData row, + boolean withOperation) { + super(commitTime, commitSeqNumber, recordKey, partitionPath, fileName, row, withOperation); + } + + @Override + public int getArity() { + return row.getArity(); + } + + protected int rebaseOrdinal(int ordinal) { + // NOTE: The source row contains the same number of meta fields of current row + return ordinal; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/ExplicitWriteHandler.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/ExplicitWriteHandler.java new file mode 100644 index 0000000000000..46eff587575cc --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/ExplicitWriteHandler.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; +import org.apache.hudi.io.HoodieWriteHandle; + +import java.util.ArrayList; +import java.util.List; + +/** + * Consumes stream of hoodie records from in-memory queue and writes to one explicit create handle. 
+ */ +public class ExplicitWriteHandler + extends BoundedInMemoryQueueConsumer, List> { + + private final List statuses = new ArrayList<>(); + + private HoodieWriteHandle handle; + + public ExplicitWriteHandler(HoodieWriteHandle handle) { + this.handle = handle; + } + + @Override + public void consumeOneRecord(HoodieLazyInsertIterable.HoodieInsertValueGenResult payload) { + final HoodieRecord insertPayload = payload.record; + handle.write(insertPayload, payload.insertValue, payload.exception); + } + + @Override + public void finish() { + closeOpenHandle(); + assert statuses.size() > 0; + } + + @Override + public List getResult() { + return statuses; + } + + private void closeOpenHandle() { + statuses.addAll(handle.close()); + } +} + diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java index f0e51220d7c98..d0ec4e5ae6b03 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/execution/FlinkLazyInsertIterable.java @@ -19,7 +19,7 @@ package org.apache.hudi.execution; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.Option; @@ -27,7 +27,8 @@ import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.io.WriteHandleFactory; +import org.apache.hudi.io.ExplicitWriteHandleFactory; +import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; @@ -35,16 +36,12 @@ import java.util.Iterator; import java.util.List; +/** + * Flink lazy iterable that supports explicit write handler. 
+ * + * @param type of the payload + */ public class FlinkLazyInsertIterable extends HoodieLazyInsertIterable { - public FlinkLazyInsertIterable(Iterator> recordItr, - boolean areRecordsSorted, - HoodieWriteConfig config, - String instantTime, - HoodieTable hoodieTable, - String idPrefix, - TaskContextSupplier taskContextSupplier) { - super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier); - } public FlinkLazyInsertIterable(Iterator> recordItr, boolean areRecordsSorted, @@ -53,7 +50,7 @@ public FlinkLazyInsertIterable(Iterator> recordItr, HoodieTable hoodieTable, String idPrefix, TaskContextSupplier taskContextSupplier, - WriteHandleFactory writeHandleFactory) { + ExplicitWriteHandleFactory writeHandleFactory) { super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier, writeHandleFactory); } @@ -64,8 +61,8 @@ protected List computeNext() { null; try { final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema()); - bufferedIteratorExecutor = - new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(inputItr), Option.of(getInsertHandler()), getTransformFunction(schema)); + bufferedIteratorExecutor = new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(inputItr), + Option.of(getExplicitInsertHandler()), getTransformFunction(schema, hoodieConfig)); final List result = bufferedIteratorExecutor.execute(); assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining(); return result; @@ -74,7 +71,14 @@ protected List computeNext() { } finally { if (null != bufferedIteratorExecutor) { bufferedIteratorExecutor.shutdownNow(); + bufferedIteratorExecutor.awaitTermination(); } } } + + @SuppressWarnings("rawtypes") + private ExplicitWriteHandler getExplicitInsertHandler() { + HoodieWriteHandle handle = ((ExplicitWriteHandleFactory) writeHandleFactory).getWriteHandle(); + return new ExplicitWriteHandler(handle); + } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java index 427212c6f897b..be2273a8409b8 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java @@ -7,71 +7,69 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
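ExplicitWriteHandler and FlinkLazyInsertIterable above wire a bounded in-memory queue to a single, pre-created write handle: the producer drains the record iterator into the queue, the consumer writes every record to that one handle, and the handle's statuses are collected when it closes. A rough, self-contained sketch of that producer/consumer shape with hypothetical types (a plain BlockingQueue and a toy Handle interface, not Hudi's BoundedInMemoryExecutor):

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class ExplicitHandleSketch {
  interface Handle {
    void write(String record);
    List<String> close();
  }

  public static void main(String[] args) throws InterruptedException {
    BlockingQueue<String> queue = new ArrayBlockingQueue<>(16);
    List<String> written = new ArrayList<>();
    Handle handle = new Handle() {
      @Override public void write(String record) { written.add(record); }
      @Override public List<String> close() { return written; }
    };

    Thread producer = new Thread(() -> {
      try {
        for (String r : new String[] {"r1", "r2", "r3"}) {
          queue.put(r);
        }
        queue.put("EOF"); // poison pill: end of the record stream
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
    });
    producer.start();

    String record;
    while (!(record = queue.take()).equals("EOF")) {
      handle.write(record); // every record is routed to the single explicit handle
    }
    producer.join();
    System.out.println(handle.close()); // statuses returned once the handle closes
  }
}
```

Hudi's actual executor layers bounded memory accounting and a record transform function (visible in computeNext above) on top of this basic shape.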
*/ package org.apache.hudi.index; import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.state.FlinkInMemoryStateIndex; -import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.table.HoodieTable; import java.util.List; +import java.util.stream.Collectors; /** * Base flink implementation of {@link HoodieIndex}. - * @param payload type */ -public abstract class FlinkHoodieIndex extends HoodieIndex>, List, List> { +public abstract class FlinkHoodieIndex extends HoodieIndex>, List> { protected FlinkHoodieIndex(HoodieWriteConfig config) { super(config); } - public static FlinkHoodieIndex createIndex(HoodieFlinkEngineContext context, HoodieWriteConfig config) { - // first use index class config to create index. - if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { - Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); - if (!(instance instanceof HoodieIndex)) { - throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); - } - return (FlinkHoodieIndex) instance; - } - - // TODO more indexes to be added - switch (config.getIndexType()) { - case INMEMORY: - return new FlinkInMemoryStateIndex<>(context, config); - default: - throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); - } - } - @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List updateLocation(List writeStatuses, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List> tagLocation(List> records, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + List> hoodieRecords = tagLocation(records.map(record -> (HoodieRecord) record).collectAsList(), context, hoodieTable); + return HoodieListData.eager(hoodieRecords.stream().map(r -> (HoodieRecord) r).collect(Collectors.toList())); + } + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieListData.eager(updateLocation(writeStatuses.collectAsList(), context, 
hoodieTable)); + } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java new file mode 100644 index 0000000000000..b10014b9183bc --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex; +import org.apache.hudi.index.bloom.ListBasedHoodieBloomIndexHelper; +import org.apache.hudi.index.bucket.HoodieSimpleBucketIndex; +import org.apache.hudi.index.simple.HoodieGlobalSimpleIndex; +import org.apache.hudi.index.simple.HoodieSimpleIndex; +import org.apache.hudi.index.state.FlinkInMemoryStateIndex; + +/** + * A factory to generate Flink {@link HoodieIndex}. + */ +public final class FlinkHoodieIndexFactory { + public static HoodieIndex createIndex(HoodieFlinkEngineContext context, HoodieWriteConfig config) { + // first use index class config to create index. 
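+ // If a custom index class is configured, instantiate it reflectively; it must be a
+ // subclass of HoodieIndex. Otherwise the index type switch below picks the implementation.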
+ if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { + Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); + if (!(instance instanceof HoodieIndex)) { + throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); + } + return (HoodieIndex) instance; + } + + // TODO more indexes to be added + switch (config.getIndexType()) { + case INMEMORY: + return new FlinkInMemoryStateIndex(context, config); + case BLOOM: + return new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + case GLOBAL_BLOOM: + return new HoodieGlobalBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + case SIMPLE: + return new HoodieSimpleIndex(config, Option.empty()); + case GLOBAL_SIMPLE: + return new HoodieGlobalSimpleIndex(config, Option.empty()); + case BUCKET: + return new HoodieSimpleBucketIndex(config); + default: + throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java index d3fdf67d76a92..af9785edbeb0c 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java @@ -19,21 +19,15 @@ package org.apache.hudi.index.state; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.FlinkHoodieIndex; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; -import org.apache.flink.api.common.state.MapState; -import org.apache.flink.api.common.state.MapStateDescriptor; -import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -41,70 +35,27 @@ /** * Hoodie index implementation backed by flink state. 
- * - * @param type of payload */ -public class FlinkInMemoryStateIndex extends FlinkHoodieIndex { +public class FlinkInMemoryStateIndex extends HoodieIndex, List> { private static final Logger LOG = LogManager.getLogger(FlinkInMemoryStateIndex.class); - private MapState mapState; public FlinkInMemoryStateIndex(HoodieFlinkEngineContext context, HoodieWriteConfig config) { super(config); - if (context.getRuntimeContext() != null) { - MapStateDescriptor indexStateDesc = - new MapStateDescriptor<>("indexState", TypeInformation.of(HoodieKey.class), TypeInformation.of(HoodieRecordLocation.class)); - if (context.getRuntimeContext() != null) { - mapState = context.getRuntimeContext().getMapState(indexStateDesc); - } - } } @Override - public List> tagLocation(List> records, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException { - return context.map(records, record -> { - try { - if (mapState.contains(record.getKey())) { - record.unseal(); - record.setCurrentLocation(mapState.get(record.getKey())); - record.seal(); - } - } catch (Exception e) { - LOG.error(String.format("Tag record location failed, key = %s, %s", record.getRecordKey(), e.getMessage())); - } - return record; - }, 0); + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + throw new UnsupportedOperationException("No need to tag location for FlinkInMemoryStateIndex"); } @Override - public List updateLocation(List writeStatuses, - HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException { - return context.map(writeStatuses, writeStatus -> { - for (HoodieRecord record : writeStatus.getWrittenRecords()) { - if (!writeStatus.isErrored(record.getKey())) { - HoodieKey key = record.getKey(); - Option newLocation = record.getNewLocation(); - if (newLocation.isPresent()) { - try { - mapState.put(key, newLocation.get()); - } catch (Exception e) { - LOG.error(String.format("Update record location failed, key = %s, %s", record.getRecordKey(), e.getMessage())); - } - } else { - // Delete existing index for a deleted record - try { - mapState.remove(key); - } catch (Exception e) { - LOG.error(String.format("Remove record location failed, key = %s, %s", record.getRecordKey(), e.getMessage())); - } - } - } - } - return writeStatus; - }, 0); + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + throw new UnsupportedOperationException("No need to update location for FlinkInMemoryStateIndex"); } @Override @@ -130,6 +81,6 @@ public boolean canIndexLogFiles() { */ @Override public boolean isImplicitWithStorage() { - return false; + return true; } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/ExplicitWriteHandleFactory.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/ExplicitWriteHandleFactory.java new file mode 100644 index 0000000000000..e598a033750dd --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/ExplicitWriteHandleFactory.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +/** + * Create handle factory for Flink writer, use the specified write handle directly. + */ +public class ExplicitWriteHandleFactory + extends WriteHandleFactory { + private final HoodieWriteHandle writeHandle; + + public ExplicitWriteHandleFactory(HoodieWriteHandle writeHandle) { + this.writeHandle = writeHandle; + } + + @Override + public HoodieWriteHandle create( + HoodieWriteConfig hoodieConfig, String commitTime, + HoodieTable hoodieTable, String partitionPath, + String fileIdPrefix, TaskContextSupplier taskContextSupplier) { + return writeHandle; + } + + public HoodieWriteHandle getWriteHandle() { + return writeHandle; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkAppendHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkAppendHandle.java new file mode 100644 index 0000000000000..b514896aa1e3a --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkAppendHandle.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Iterator; +import java.util.List; + +/** + * A {@link HoodieAppendHandle} that supports APPEND write incrementally(mini-batches). + * + *
    For the first mini-batch, it initializes and sets up the next file path to write, + * then closes the file writer. The subsequent mini-batches are appended to the same file + * through a different append handle with the same write file name. + * + *
    The back-up writer may rollover on condition(for e.g, the filesystem does not support append + * or the file size hits the configured threshold). + */ +public class FlinkAppendHandle + extends HoodieAppendHandle implements MiniBatchHandle { + + private static final Logger LOG = LoggerFactory.getLogger(FlinkAppendHandle.class); + + private boolean isClosed = false; + private final WriteMarkers writeMarkers; + + public FlinkAppendHandle( + HoodieWriteConfig config, + String instantTime, + HoodieTable hoodieTable, + String partitionPath, + String fileId, + Iterator> recordItr, + TaskContextSupplier taskContextSupplier) { + super(config, instantTime, hoodieTable, partitionPath, fileId, recordItr, taskContextSupplier); + this.writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), hoodieTable, instantTime); + } + + @Override + protected void createMarkerFile(String partitionPath, String dataFileName) { + // In some rare cases, the task was pulled up again with same write file name, + // for e.g, reuse the small log files from last commit instant. + + // Just skip the marker creation if it already exists, the new data would append to + // the file directly. + writeMarkers.createIfNotExists(partitionPath, dataFileName, getIOType()); + } + + @Override + public boolean canWrite(HoodieRecord record) { + return true; + } + + @Override + protected boolean needsUpdateLocation() { + return false; + } + + @Override + protected boolean isUpdateRecord(HoodieRecord hoodieRecord) { + // do not use the HoodieRecord operation because hoodie writer has its own + // INSERT/MERGE bucket for 'UPSERT' semantics. For e.g, a hoodie record with fresh new key + // and operation HoodieCdcOperation.DELETE would be put into either an INSERT bucket or UPDATE bucket. + return hoodieRecord.getCurrentLocation() != null + && hoodieRecord.getCurrentLocation().getInstantTime().equals("U"); + } + + @Override + public List close() { + try { + return super.close(); + } finally { + this.isClosed = true; + } + } + + @Override + public void closeGracefully() { + if (isClosed) { + return; + } + try { + close(); + } catch (Throwable throwable) { + // The intermediate log file can still append based on the incremental MERGE semantics, + // there is no need to delete the file. + LOG.warn("Error while trying to dispose the APPEND handle", throwable); + } + } + + @Override + public Path getWritePath() { + return writer.getLogFile().getPath(); + } + + @Override + public boolean shouldReplace() { + // log files can append new data buffer directly + return false; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatAndReplaceHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatAndReplaceHandle.java new file mode 100644 index 0000000000000..300e8c512bb34 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatAndReplaceHandle.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; + +/** + * A {@link FlinkMergeAndReplaceHandle} that supports CONCAT write incrementally(small data buffers). + * + *
    The records iterator for super constructor is reset as empty thus the initialization for new records + * does nothing. This handle keep the iterator for itself to override the write behavior. + */ +public class FlinkConcatAndReplaceHandle + extends FlinkMergeAndReplaceHandle { + private static final Logger LOG = LoggerFactory.getLogger(FlinkConcatAndReplaceHandle.class); + + // a representation of incoming records that tolerates duplicate keys + private final Iterator> recordItr; + + public FlinkConcatAndReplaceHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier, Path basePath) { + super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier, basePath); + this.recordItr = recordItr; + } + + /** + * Write old record as is w/o merging with incoming record. + */ + @Override + public void write(GenericRecord oldRecord) { + String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt); + try { + fileWriter.writeAvro(key, oldRecord); + } catch (IOException | RuntimeException e) { + String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", + key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); + LOG.debug("Old record is " + oldRecord); + throw new HoodieUpsertException(errMsg, e); + } + recordsWritten++; + } + + @Override + protected void writeIncomingRecords() throws IOException { + while (recordItr.hasNext()) { + HoodieRecord record = recordItr.next(); + writeInsertRecord(record); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatHandle.java new file mode 100644 index 0000000000000..812155c3d2fb0 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkConcatHandle.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io; + +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.generic.GenericRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; + +/** + * Handle to concatenate new records to old records w/o any merging. + * + *
    The records iterator for super constructor is reset as empty thus the initialization for new records + * does nothing. This handle keep the iterator for itself to override the write behavior. + */ +public class FlinkConcatHandle + extends FlinkMergeHandle { + private static final Logger LOG = LoggerFactory.getLogger(FlinkConcatHandle.class); + + // a representation of incoming records that tolerates duplicate keys + private final Iterator> recordItr; + + public FlinkConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier) { + super(config, instantTime, hoodieTable, Collections.emptyIterator(), partitionPath, fileId, taskContextSupplier); + this.recordItr = recordItr; + } + + /** + * Write old record as is w/o merging with incoming record. + */ + @Override + public void write(GenericRecord oldRecord) { + String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt); + try { + fileWriter.writeAvro(key, oldRecord); + } catch (IOException | RuntimeException e) { + String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", + key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); + LOG.debug("Old record is " + oldRecord); + throw new HoodieUpsertException(errMsg, e); + } + recordsWritten++; + } + + @Override + protected void writeIncomingRecords() throws IOException { + while (recordItr.hasNext()) { + HoodieRecord record = recordItr.next(); + writeInsertRecord(record); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkCreateHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkCreateHandle.java new file mode 100644 index 0000000000000..777e228c9510d --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkCreateHandle.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.List; + +/** + * A {@link HoodieCreateHandle} that supports CREATE write incrementally(mini-batches). + * + *
    For the first mini-batch, it initializes and sets up the next file path to write, + * then closes the file writer. The subsequent mini-batches are appended to a file with new name, + * the new file would then rename to this file name, + * behaves like each mini-batch data are appended to the same file. + * + * @see FlinkMergeAndReplaceHandle + */ +public class FlinkCreateHandle + extends HoodieCreateHandle implements MiniBatchHandle { + + private static final Logger LOG = LogManager.getLogger(FlinkCreateHandle.class); + + private boolean isClosed = false; + + public FlinkCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) { + this(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(), + taskContextSupplier); + } + + public FlinkCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + String partitionPath, String fileId, Option schemaOption, + TaskContextSupplier taskContextSupplier) { + super(config, instantTime, hoodieTable, partitionPath, fileId, schemaOption, + taskContextSupplier); + // delete invalid data files generated by task retry. + if (getAttemptId() > 0) { + deleteInvalidDataFile(getAttemptId() - 1); + } + } + + /** + * The flink checkpoints start in sequence and asynchronously, when one write task finish the checkpoint(A) + * (thus the fs view got the written data files some of which may be invalid), + * it goes on with the next round checkpoint(B) write immediately, + * if it tries to reuse the last small data bucket(small file) of an invalid data file, + * finally, when the coordinator receives the checkpoint success event of checkpoint(A), + * the invalid data file would be cleaned, + * and this merger got a FileNotFoundException when it close the write file handle. + * + *
    To solve, deletes the invalid data file eagerly + * so that the invalid file small bucket would never be reused. + * + * @param lastAttemptId The last attempt ID + */ + private void deleteInvalidDataFile(long lastAttemptId) { + final String lastWriteToken = FSUtils.makeWriteToken(getPartitionId(), getStageId(), lastAttemptId); + final String lastDataFileName = FSUtils.makeBaseFileName(instantTime, + lastWriteToken, this.fileId, hoodieTable.getBaseFileExtension()); + final Path path = makeNewFilePath(partitionPath, lastDataFileName); + try { + if (fs.exists(path)) { + LOG.info("Deleting invalid INSERT file due to task retry: " + lastDataFileName); + fs.delete(path, false); + } + } catch (IOException e) { + throw new HoodieException("Error while deleting the INSERT file due to task retry: " + lastDataFileName, e); + } + } + + @Override + protected void createMarkerFile(String partitionPath, String dataFileName) { + WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), hoodieTable, instantTime); + writeMarkers.createIfNotExists(partitionPath, dataFileName, getIOType()); + } + + @Override + public Path makeNewPath(String partitionPath) { + Path path = super.makeNewPath(partitionPath); + // If the data file already exists, it means the write task write new data bucket multiple times + // in one hoodie commit, rolls over to a new name instead. + + // Write to a new file which behaves like a different task write. + try { + int rollNumber = 0; + while (fs.exists(path)) { + Path existing = path; + path = newFilePathWithRollover(rollNumber++); + LOG.warn("Duplicate write for INSERT bucket with path: " + existing + ", rolls over to new path: " + path); + } + return path; + } catch (IOException e) { + throw new HoodieException("Checking existing path for create handle error: " + path, e); + } + } + + @Override + public boolean canWrite(HoodieRecord record) { + return true; + } + + /** + * Use the writeToken + "-" + rollNumber as the new writeToken of a mini-batch write. + */ + private Path newFilePathWithRollover(int rollNumber) { + final String dataFileName = FSUtils.makeBaseFileName(instantTime, writeToken + "-" + rollNumber, fileId, + hoodieTable.getBaseFileExtension()); + return makeNewFilePath(partitionPath, dataFileName); + } + + @Override + public List close() { + try { + return super.close(); + } finally { + this.isClosed = true; + } + } + + @Override + public void closeGracefully() { + if (isClosed) { + return; + } + try { + close(); + } catch (Throwable throwable) { + LOG.warn("Error while trying to dispose the CREATE handle", throwable); + try { + fs.delete(path, false); + LOG.info("Deleting the intermediate CREATE data file: " + path + " success!"); + } catch (IOException e) { + // logging a warning and ignore the exception. + LOG.warn("Deleting the intermediate CREATE data file: " + path + " failed", e); + } + } + } + + @Override + public Path getWritePath() { + return path; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkMergeAndReplaceHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkMergeAndReplaceHandle.java new file mode 100644 index 0000000000000..9fea0a97185cb --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkMergeAndReplaceHandle.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +/** + * A {@link HoodieMergeHandle} that supports MERGE write incrementally(small data buffers). + * + *
    This handle is needed starting from the second mini-batch write of a COW data bucket, + * when the data bucket is written using multiple mini-batches. + * + *
    For the incremental data buffer, it initializes and sets up the next file path to write, + * then closes the file and rename to the old file name, + * behaves like the new data buffer are appended to the old file. + */ +public class FlinkMergeAndReplaceHandle + extends HoodieMergeHandle + implements MiniBatchHandle { + + private static final Logger LOG = LogManager.getLogger(FlinkMergeAndReplaceHandle.class); + + private boolean isClosed = false; + + /** + * Flag saying whether we should replace the old file with new. + */ + private boolean shouldReplace = true; + + public FlinkMergeAndReplaceHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier, Path basePath) { + super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, + new HoodieBaseFile(basePath.toString()), Option.empty()); + // delete invalid data files generated by task retry. + if (getAttemptId() > 0) { + deleteInvalidDataFile(getAttemptId() - 1); + } + } + + /** + * The flink checkpoints start in sequence and asynchronously, when one write task finish the checkpoint(A) + * (thus the fs view got the written data files some of which may be invalid), + * it goes on with the next round checkpoint(B) write immediately, + * if it tries to reuse the last small data bucket(small file) of an invalid data file, + * finally, when the coordinator receives the checkpoint success event of checkpoint(A), + * the invalid data file would be cleaned, + * and this merger got a FileNotFoundException when it close the write file handle. + * + *
    To solve, deletes the invalid data file eagerly + * so that the invalid file small bucket would never be reused. + * + * @param lastAttemptId The last attempt ID + */ + private void deleteInvalidDataFile(long lastAttemptId) { + final String lastWriteToken = FSUtils.makeWriteToken(getPartitionId(), getStageId(), lastAttemptId); + final String lastDataFileName = FSUtils.makeBaseFileName(instantTime, + lastWriteToken, this.fileId, hoodieTable.getBaseFileExtension()); + final Path path = makeNewFilePath(partitionPath, lastDataFileName); + try { + if (fs.exists(path)) { + LOG.info("Deleting invalid MERGE and REPLACE base file due to task retry: " + lastDataFileName); + fs.delete(path, false); + } + } catch (IOException e) { + throw new HoodieException("Error while deleting the MERGE and REPLACE base file due to task retry: " + lastDataFileName, e); + } + } + + @Override + protected void createMarkerFile(String partitionPath, String dataFileName) { + WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), hoodieTable, instantTime); + writeMarkers.createIfNotExists(partitionPath, dataFileName, getIOType()); + } + + @Override + protected void makeOldAndNewFilePaths(String partitionPath, String oldFileName, String newFileName) { + // old and new file name expects to be the same. + if (!FSUtils.getCommitTime(oldFileName).equals(instantTime)) { + LOG.warn("MERGE and REPLACE handle expect the same name for old and new files,\n" + + "while got new file: " + newFileName + " with old file: " + oldFileName + ",\n" + + "this rarely happens when the checkpoint success event was not received yet\n" + + "but the write task flush with new instant time, which does not break the UPSERT semantics"); + shouldReplace = false; + } + super.makeOldAndNewFilePaths(partitionPath, oldFileName, newFileName); + try { + int rollNumber = 0; + while (fs.exists(newFilePath)) { + Path oldPath = newFilePath; + newFileName = newFileNameWithRollover(rollNumber++); + newFilePath = makeNewFilePath(partitionPath, newFileName); + LOG.warn("Duplicate write for MERGE and REPLACE handle with path: " + oldPath + ", rolls over to new path: " + newFilePath); + } + } catch (IOException e) { + throw new HoodieException("Checking existing path for merge and replace handle error: " + newFilePath, e); + } + } + + /** + * Use the writeToken + "-" + rollNumber as the new writeToken of a mini-batch write. + */ + protected String newFileNameWithRollover(int rollNumber) { + return FSUtils.makeBaseFileName(instantTime, writeToken + "-" + rollNumber, + this.fileId, hoodieTable.getBaseFileExtension()); + } + + @Override + protected void setWriteStatusPath() { + // should still report the old file path. + writeStatus.getStat().setPath(new Path(config.getBasePath()), oldFilePath); + } + + boolean needsUpdateLocation() { + // No need to update location for Flink hoodie records because all the records are pre-tagged + // with the desired locations. + return false; + } + + public void finalizeWrite() { + // Behaves like the normal merge handle if the write instant time changes. + if (!shouldReplace) { + return; + } + // The file visibility should be kept by the configured ConsistencyGuard instance. 
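+ // Delete the old base file first and then rename the freshly written file to the old
+ // file name, so that a single base file is left under the original name.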
+ try { + fs.delete(oldFilePath, false); + } catch (IOException e) { + throw new HoodieIOException("Error while cleaning the old base file: " + oldFilePath, e); + } + try { + fs.rename(newFilePath, oldFilePath); + } catch (IOException e) { + throw new HoodieIOException("Error while renaming the temporary rollover file: " + + newFilePath + " to old base file name: " + oldFilePath, e); + } + } + + @Override + public List close() { + try { + List writeStatuses = super.close(); + finalizeWrite(); + return writeStatuses; + } finally { + this.isClosed = true; + } + } + + @Override + public void closeGracefully() { + if (isClosed) { + return; + } + try { + close(); + } catch (Throwable throwable) { + LOG.warn("Error while trying to dispose the MERGE handle", throwable); + try { + fs.delete(newFilePath, false); + LOG.info("Deleting the intermediate MERGE and REPLACE data file: " + newFilePath + " success!"); + } catch (IOException e) { + // logging a warning and ignore the exception. + LOG.warn("Deleting the intermediate MERGE and REPLACE data file: " + newFilePath + " failed", e); + } + } + } + + @Override + public Path getWritePath() { + return oldFilePath; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkMergeHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkMergeHandle.java new file mode 100644 index 0000000000000..a44783f99e437 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/FlinkMergeHandle.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * A {@link HoodieMergeHandle} that supports MERGE write incrementally(small data buffers). + * + *
    For a new data buffer, it initializes and set up the next file path to write, + * and closes the file path when the data buffer write finish. When next data buffer + * write starts, it rolls over to another new file. If all the data buffers write finish + * for a checkpoint round, it renames the last new file path as the desired file name + * (name with the expected file ID). + * + * @see FlinkMergeAndReplaceHandle + */ +public class FlinkMergeHandle + extends HoodieMergeHandle + implements MiniBatchHandle { + + private static final Logger LOG = LogManager.getLogger(FlinkMergeHandle.class); + + private boolean isClosed = false; + + /** + * Records the rolled over file paths. + */ + private List rolloverPaths; + + public FlinkMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, + Iterator> recordItr, String partitionPath, String fileId, + TaskContextSupplier taskContextSupplier) { + super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty()); + if (rolloverPaths == null) { + // #makeOldAndNewFilePaths may already initialize it already + rolloverPaths = new ArrayList<>(); + } + // delete invalid data files generated by task retry. + if (getAttemptId() > 0) { + deleteInvalidDataFile(getAttemptId() - 1); + } + } + + /** + * The flink checkpoints start in sequence and asynchronously, when one write task finish the checkpoint(A) + * (thus the fs view got the written data files some of which may be invalid), + * it goes on with the next round checkpoint(B) write immediately, + * if it tries to reuse the last small data bucket(small file) of an invalid data file, + * finally, when the coordinator receives the checkpoint success event of checkpoint(A), + * the invalid data file would be cleaned, + * and this merger got a FileNotFoundException when it close the write file handle. + * + *
    To solve, deletes the invalid data file eagerly + * so that the invalid file small bucket would never be reused. + * + * @param lastAttemptId The last attempt ID + */ + private void deleteInvalidDataFile(long lastAttemptId) { + final String lastWriteToken = FSUtils.makeWriteToken(getPartitionId(), getStageId(), lastAttemptId); + final String lastDataFileName = FSUtils.makeBaseFileName(instantTime, + lastWriteToken, this.fileId, hoodieTable.getBaseFileExtension()); + final Path path = makeNewFilePath(partitionPath, lastDataFileName); + if (path.equals(oldFilePath)) { + // In some rare cases, the old attempt file is used as the old base file to merge + // because the flink index eagerly records that. + // + // The merge handle has the 'UPSERT' semantics so there is no need to roll over + // and the file can still be used as the merge base file. + return; + } + try { + if (fs.exists(path)) { + LOG.info("Deleting invalid MERGE base file due to task retry: " + lastDataFileName); + fs.delete(path, false); + } + } catch (IOException e) { + throw new HoodieException("Error while deleting the MERGE base file due to task retry: " + lastDataFileName, e); + } + } + + @Override + protected void createMarkerFile(String partitionPath, String dataFileName) { + WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), hoodieTable, instantTime); + writeMarkers.createIfNotExists(partitionPath, dataFileName, getIOType()); + } + + @Override + protected void makeOldAndNewFilePaths(String partitionPath, String oldFileName, String newFileName) { + // If the data file already exists, it means the write task write merge data bucket multiple times + // in one hoodie commit, rolls over to a new name instead. + + // Use the existing file path as the base file path (file1), + // and generates new file path with roll over number (file2). + // the incremental data set would merge into the file2 instead of file1. + // + // When the task finalizes in #finishWrite, the intermediate files would be cleaned. + super.makeOldAndNewFilePaths(partitionPath, oldFileName, newFileName); + rolloverPaths = new ArrayList<>(); + try { + int rollNumber = 0; + while (fs.exists(newFilePath)) { + // in case there is empty file because of task failover attempt. + if (fs.getFileStatus(newFilePath).getLen() <= 0) { + fs.delete(newFilePath, false); + LOG.warn("Delete empty write file for MERGE bucket: " + newFilePath); + break; + } + + rolloverPaths.add(newFilePath); + newFileName = newFileNameWithRollover(rollNumber++); + newFilePath = makeNewFilePath(partitionPath, newFileName); + LOG.warn("Duplicate write for MERGE bucket with path: " + oldFilePath + ", rolls over to new path: " + newFilePath); + } + } catch (IOException e) { + throw new HoodieException("Checking existing path for merge handle error: " + newFilePath, e); + } + } + + /** + * Use the writeToken + "-" + rollNumber as the new writeToken of a mini-batch write. + */ + protected String newFileNameWithRollover(int rollNumber) { + return FSUtils.makeBaseFileName(instantTime, writeToken + "-" + rollNumber, + this.fileId, hoodieTable.getBaseFileExtension()); + } + + @Override + protected void setWriteStatusPath() { + // if there was rollover, should set up the path as the initial new file path. 
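+ // getWritePath() resolves to the first rollover path when a rollover happened,
+ // otherwise to the current new file path.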
+ writeStatus.getStat().setPath(new Path(config.getBasePath()), getWritePath()); + } + + @Override + public List close() { + try { + List writeStatus = super.close(); + finalizeWrite(); + return writeStatus; + } finally { + this.isClosed = true; + } + } + + boolean needsUpdateLocation() { + // No need to update location for Flink hoodie records because all the records are pre-tagged + // with the desired locations. + return false; + } + + public void finalizeWrite() { + // The file visibility should be kept by the configured ConsistencyGuard instance. + if (rolloverPaths.size() == 0) { + // only one flush action, no need to roll over + return; + } + + for (Path path : rolloverPaths) { + try { + fs.delete(path, false); + LOG.info("Delete the rollover data file: " + path + " success!"); + } catch (IOException e) { + throw new HoodieIOException("Error when clean the temporary rollover data file: " + path, e); + } + } + final Path desiredPath = rolloverPaths.get(0); + try { + fs.rename(newFilePath, desiredPath); + } catch (IOException e) { + throw new HoodieIOException("Error when rename the temporary roll file: " + newFilePath + " to: " + desiredPath, e); + } + } + + @Override + public void closeGracefully() { + if (isClosed) { + return; + } + try { + close(); + } catch (Throwable throwable) { + LOG.warn("Error while trying to dispose the MERGE handle", throwable); + try { + fs.delete(newFilePath, false); + LOG.info("Deleting the intermediate MERGE data file: " + newFilePath + " success!"); + } catch (IOException e) { + // logging a warning and ignore the exception. + LOG.warn("Deleting the intermediate MERGE data file: " + newFilePath + " failed", e); + } + } + } + + @Override + public Path getWritePath() { + return rolloverPaths.size() > 0 ? rolloverPaths.get(0) : newFilePath; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/MiniBatchHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/MiniBatchHandle.java new file mode 100644 index 0000000000000..7d3d7fa5ff2c4 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/MiniBatchHandle.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import org.apache.hadoop.fs.Path; + +/** + * Hoodie write handle that supports write as mini-batch. + */ +public interface MiniBatchHandle { + + /** + * Finalize the write of one mini-batch. Usually these mini-bathes + * come from one checkpoint interval. The file handle may roll over to new name + * if the name conflicts, give a chance to clean the intermediate file. 
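+ *
+ * <p>In the handles added here (e.g. {@code FlinkMergeHandle} and
+ * {@code FlinkMergeAndReplaceHandle}) this is invoked from {@code close()}.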
+ */ + default void finalizeWrite() { + } + + /** + * Close the file handle gracefully, if any error happens during the file handle close, + * clean the file to not left corrupted file. + */ + void closeGracefully(); + + /** + * Returns the write file path. + */ + Path getWritePath(); + + /** + * Whether the old write file should be replaced with the same name new file + * using content merged with incremental new data batch. + */ + default boolean shouldReplace() { + return true; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataCreateHandle.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataCreateHandle.java new file mode 100644 index 0000000000000..ec059b23cd97f --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataCreateHandle.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hudi.client.HoodieInternalWriteStatus; +import org.apache.hudi.client.model.HoodieRowData; +import org.apache.hudi.client.model.HoodieRowDataCreation; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Create handle with RowData for datasource implemention of bulk insert. 
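+ *
+ * <p>A rough usage sketch; {@code rows} and {@code recordKey(...)} stand in for the caller's
+ * data iterator and key extraction and are not APIs introduced by this change:
+ * <pre>{@code
+ *   HoodieRowDataCreateHandle handle = new HoodieRowDataCreateHandle(
+ *       table, writeConfig, partitionPath, fileId, instantTime,
+ *       taskPartitionId, taskId, taskEpochId, rowType, false);
+ *   while (handle.canWrite() && rows.hasNext()) {
+ *     RowData row = rows.next();
+ *     handle.write(recordKey(row), partitionPath, row);
+ *   }
+ *   HoodieInternalWriteStatus status = handle.close();
+ * }</pre>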
+ */ +public class HoodieRowDataCreateHandle implements Serializable { + + private static final long serialVersionUID = 1L; + private static final Logger LOG = LogManager.getLogger(HoodieRowDataCreateHandle.class); + private static final AtomicLong SEQGEN = new AtomicLong(1); + + private final String instantTime; + private final int taskPartitionId; + private final long taskId; + private final long taskEpochId; + private final HoodieTable table; + private final HoodieWriteConfig writeConfig; + protected final HoodieRowDataFileWriter fileWriter; + private final String partitionPath; + private final Path path; + private final String fileId; + private final boolean preserveHoodieMetadata; + private final FileSystem fs; + protected final HoodieInternalWriteStatus writeStatus; + private final HoodieTimer currTimer; + + public HoodieRowDataCreateHandle(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId, + String instantTime, int taskPartitionId, long taskId, long taskEpochId, + RowType rowType, boolean preserveHoodieMetadata) { + this.partitionPath = partitionPath; + this.table = table; + this.writeConfig = writeConfig; + this.instantTime = instantTime; + this.taskPartitionId = taskPartitionId; + this.taskId = taskId; + this.taskEpochId = taskEpochId; + this.fileId = fileId; + this.preserveHoodieMetadata = preserveHoodieMetadata; + this.currTimer = new HoodieTimer(); + this.currTimer.startTimer(); + this.fs = table.getMetaClient().getFs(); + this.path = makeNewPath(partitionPath); + this.writeStatus = new HoodieInternalWriteStatus(!table.getIndex().isImplicitWithStorage(), + writeConfig.getWriteStatusFailureFraction()); + writeStatus.setPartitionPath(partitionPath); + writeStatus.setFileId(fileId); + writeStatus.setStat(new HoodieWriteStat()); + try { + HoodiePartitionMetadata partitionMetadata = + new HoodiePartitionMetadata( + fs, + instantTime, + new Path(writeConfig.getBasePath()), + FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath), + table.getPartitionMetafileFormat()); + partitionMetadata.trySave(taskPartitionId); + createMarkerFile(partitionPath, FSUtils.makeBaseFileName(this.instantTime, getWriteToken(), this.fileId, table.getBaseFileExtension())); + this.fileWriter = createNewFileWriter(path, table, writeConfig, rowType); + } catch (IOException e) { + throw new HoodieInsertException("Failed to initialize file writer for path " + path, e); + } + LOG.info("New handle created for partition :" + partitionPath + " with fileId " + fileId); + } + + /** + * Writes an {@link RowData} to the underlying {@link HoodieRowDataFileWriter}. + * Before writing, value for meta columns are computed as required + * and wrapped in {@link HoodieRowData}. {@link HoodieRowData} is what gets written to HoodieRowDataFileWriter. + * + * @param recordKey The record key + * @param partitionPath The partition path + * @param record instance of {@link RowData} that needs to be written to the fileWriter. + * @throws IOException + */ + public void write(String recordKey, String partitionPath, RowData record) throws IOException { + try { + String seqId = preserveHoodieMetadata + ? record.getString(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD_ORD).toString() + : HoodieRecord.generateSequenceId(instantTime, taskPartitionId, SEQGEN.getAndIncrement()); + String commitInstant = preserveHoodieMetadata + ? 
record.getString(HoodieRecord.COMMIT_TIME_METADATA_FIELD_ORD).toString() + : instantTime; + RowData rowData = HoodieRowDataCreation.create(commitInstant, seqId, recordKey, partitionPath, path.getName(), + record, writeConfig.allowOperationMetadataField(), preserveHoodieMetadata); + try { + fileWriter.writeRow(recordKey, rowData); + writeStatus.markSuccess(recordKey); + } catch (Throwable t) { + writeStatus.markFailure(recordKey, t); + } + } catch (Throwable ge) { + writeStatus.setGlobalError(ge); + throw ge; + } + } + + /** + * Returns {@code true} if this handle can take in more writes. else {@code false}. + */ + public boolean canWrite() { + return fileWriter.canWrite(); + } + + /** + * Closes the {@link HoodieRowDataCreateHandle} and returns an instance of {@link HoodieInternalWriteStatus} containing the stats and + * status of the writes to this handle. + * + * @return the {@link HoodieInternalWriteStatus} containing the stats and status of the writes to this handle. + * @throws IOException + */ + public HoodieInternalWriteStatus close() throws IOException { + fileWriter.close(); + HoodieWriteStat stat = writeStatus.getStat(); + stat.setPartitionPath(partitionPath); + stat.setNumWrites(writeStatus.getTotalRecords()); + stat.setNumDeletes(0); + stat.setNumInserts(writeStatus.getTotalRecords()); + stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); + stat.setFileId(fileId); + stat.setPath(new Path(writeConfig.getBasePath()), path); + long fileSizeInBytes = FSUtils.getFileSize(table.getMetaClient().getFs(), path); + stat.setTotalWriteBytes(fileSizeInBytes); + stat.setFileSizeInBytes(fileSizeInBytes); + stat.setTotalWriteErrors(writeStatus.getFailedRowsSize()); + HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats(); + runtimeStats.setTotalCreateTime(currTimer.endTimer()); + stat.setRuntimeStats(runtimeStats); + return writeStatus; + } + + public String getFileName() { + return path.getName(); + } + + private Path makeNewPath(String partitionPath) { + Path path = FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath); + try { + if (!fs.exists(path)) { + fs.mkdirs(path); // create a new partition as needed. + } + } catch (IOException e) { + throw new HoodieIOException("Failed to make dir " + path, e); + } + HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); + return new Path(path.toString(), FSUtils.makeBaseFileName(instantTime, getWriteToken(), fileId, + tableConfig.getBaseFileFormat().getFileExtension())); + } + + /** + * Creates an empty marker file corresponding to storage writer path. 
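+ * The marker name mirrors the data file name built by
+ * {@code FSUtils.makeBaseFileName(instantTime, writeToken, fileId, extension)}, where the write token is
+ * {@code taskPartitionId + "-" + taskId + "-" + taskEpochId} (e.g. {@code "2-3-1"}; the values are illustrative).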
+ * + * @param partitionPath Partition path + */ + private void createMarkerFile(String partitionPath, String dataFileName) { + WriteMarkers writeMarkers = WriteMarkersFactory.get(writeConfig.getMarkersType(), table, instantTime); + writeMarkers.create(partitionPath, dataFileName, IOType.CREATE); + } + + private String getWriteToken() { + return taskPartitionId + "-" + taskId + "-" + taskEpochId; + } + + protected HoodieRowDataFileWriter createNewFileWriter( + Path path, HoodieTable hoodieTable, HoodieWriteConfig config, RowType rowType) + throws IOException { + return HoodieRowDataFileWriterFactory.getRowDataFileWriter( + path, hoodieTable, config, rowType); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataFileWriter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataFileWriter.java new file mode 100644 index 0000000000000..5a03b43adcc7d --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataFileWriter.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.flink.table.data.RowData; + +import java.io.IOException; + +/** + * Abstraction to assist in writing {@link RowData}s to be used in datasource implementation. + */ +public interface HoodieRowDataFileWriter { + + /** + * Returns {@code true} if this RowFileWriter can take in more writes. else {@code false}. + */ + boolean canWrite(); + + /** + * Writes an {@link RowData} to the {@link HoodieRowDataFileWriter}. Also takes in associated record key to be added to bloom filter if required. + * + * @throws IOException on any exception while writing. + */ + void writeRow(String key, RowData row) throws IOException; + + /** + * Writes an {@link RowData} to the {@link HoodieRowDataFileWriter}. + * + * @throws IOException on any exception while writing. + */ + void writeRow(RowData row) throws IOException; + + /** + * Closes the {@link HoodieRowDataFileWriter} and may not take in any more writes. + */ + void close() throws IOException; +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataFileWriterFactory.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataFileWriterFactory.java new file mode 100644 index 0000000000000..98d4a866e0ee9 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataFileWriterFactory.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.storage.HoodieParquetConfig; +import org.apache.hudi.table.HoodieTable; + +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + +/** + * Factory to assist in instantiating a new {@link HoodieRowDataFileWriter}. + */ +public class HoodieRowDataFileWriterFactory { + + /** + * Factory method to assist in instantiating an instance of {@link HoodieRowDataFileWriter}. + * + * @param path path of the RowFileWriter. + * @param hoodieTable instance of {@link HoodieTable} in use. + * @param config instance of {@link HoodieWriteConfig} to use. + * @param schema schema of the dataset in use. + * @return the instantiated {@link HoodieRowDataFileWriter}. + * @throws IOException if format is not supported or if any exception during instantiating the RowFileWriter. 
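+ *
+ * <p>Example call (a sketch; {@code path}, {@code table}, {@code config} and {@code rowType} are assumed to exist):
+ * <pre>{@code
+ *   HoodieRowDataFileWriter writer =
+ *       HoodieRowDataFileWriterFactory.getRowDataFileWriter(path, table, config, rowType);
+ * }</pre>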
+ */ + public static HoodieRowDataFileWriter getRowDataFileWriter( + Path path, HoodieTable hoodieTable, HoodieWriteConfig config, RowType schema) + throws IOException { + final String extension = FSUtils.getFileExtension(path.getName()); + if (PARQUET.getFileExtension().equals(extension)) { + return newParquetInternalRowFileWriter(path, config, schema, hoodieTable); + } + throw new UnsupportedOperationException(extension + " format not supported yet."); + } + + private static HoodieRowDataFileWriter newParquetInternalRowFileWriter( + Path path, HoodieWriteConfig writeConfig, RowType rowType, HoodieTable table) + throws IOException { + BloomFilter filter = BloomFilterFactory.createBloomFilter( + writeConfig.getBloomFilterNumEntries(), + writeConfig.getBloomFilterFPP(), + writeConfig.getDynamicBloomFilterMaxNumEntries(), + writeConfig.getBloomFilterType()); + HoodieRowDataParquetWriteSupport writeSupport = + new HoodieRowDataParquetWriteSupport(table.getHadoopConf(), rowType, filter); + return new HoodieRowDataParquetWriter( + path, new HoodieParquetConfig<>( + writeSupport, + writeConfig.getParquetCompressionCodec(), + writeConfig.getParquetBlockSize(), + writeConfig.getParquetPageSize(), + writeConfig.getParquetMaxFileSize(), + writeSupport.getHadoopConf(), + writeConfig.getParquetCompressionRatio())); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataParquetWriteSupport.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataParquetWriteSupport.java new file mode 100644 index 0000000000000..b939498c3e240 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataParquetWriteSupport.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; + +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.util.Option; +import org.apache.parquet.hadoop.api.WriteSupport; + +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.Map; + +/** + * Hoodie Write Support for directly writing {@link RowData} to Parquet. 
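+ *
+ * <p>Minimal wiring sketch, mirroring {@code HoodieRowDataFileWriterFactory}; {@code hadoopConf},
+ * {@code rowType}, {@code filter} and {@code recordKey} are assumed to exist:
+ * <pre>{@code
+ *   HoodieRowDataParquetWriteSupport writeSupport =
+ *       new HoodieRowDataParquetWriteSupport(hadoopConf, rowType, filter);
+ *   writeSupport.add(recordKey);  // feed each written record key to the (optional) bloom filter
+ *   WriteSupport.FinalizedWriteContext ctx = writeSupport.finalizeWrite();  // bloom filter lands in the footer metadata
+ * }</pre>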
+ */
+public class HoodieRowDataParquetWriteSupport extends RowDataParquetWriteSupport {
+
+  private final Configuration hadoopConf;
+  private final Option<HoodieBloomFilterWriteSupport<String>> bloomFilterWriteSupportOpt;
+
+  public HoodieRowDataParquetWriteSupport(Configuration conf, RowType rowType, BloomFilter bloomFilter) {
+    super(rowType);
+    this.hadoopConf = new Configuration(conf);
+    this.bloomFilterWriteSupportOpt = Option.ofNullable(bloomFilter)
+        .map(HoodieBloomFilterRowDataWriteSupport::new);
+  }
+
+  public Configuration getHadoopConf() {
+    return hadoopConf;
+  }
+
+  @Override
+  public WriteSupport.FinalizedWriteContext finalizeWrite() {
+    Map<String, String> extraMetadata =
+        bloomFilterWriteSupportOpt.map(HoodieBloomFilterWriteSupport::finalizeMetadata)
+            .orElse(Collections.emptyMap());
+
+    return new WriteSupport.FinalizedWriteContext(extraMetadata);
+  }
+
+  public void add(String recordKey) {
+    this.bloomFilterWriteSupportOpt.ifPresent(bloomFilterWriteSupport ->
+        bloomFilterWriteSupport.addKey(recordKey));
+  }
+
+  private static class HoodieBloomFilterRowDataWriteSupport extends HoodieBloomFilterWriteSupport<String> {
+    public HoodieBloomFilterRowDataWriteSupport(BloomFilter bloomFilter) {
+      super(bloomFilter);
+    }
+
+    @Override
+    protected byte[] getUTF8Bytes(String key) {
+      return key.getBytes(StandardCharsets.UTF_8);
+    }
+  }
+}
diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataParquetWriter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataParquetWriter.java
new file mode 100644
index 0000000000000..17b3b6b37cf18
--- /dev/null
+++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowDataParquetWriter.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage.row;
+
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
+import org.apache.hudi.io.storage.HoodieParquetConfig;
+
+import org.apache.flink.table.data.RowData;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.ParquetFileWriter;
+import org.apache.parquet.hadoop.ParquetWriter;
+
+import java.io.IOException;
+
+/**
+ * Parquet's implementation of {@link HoodieRowDataFileWriter} to write {@link RowData}s.
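+ *
+ * <p>Rows are accepted while the bytes written stay below
+ * {@code parquetMaxFileSize + round(parquetMaxFileSize * compressionRatio)}; as a worked example with
+ * hypothetical settings, a 120 MB max file size and a 0.1 compression ratio allow roughly 132 MB to be
+ * written before {@code canWrite()} returns false.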
+ */ +public class HoodieRowDataParquetWriter extends ParquetWriter + implements HoodieRowDataFileWriter { + + private final Path file; + private final HoodieWrapperFileSystem fs; + private final long maxFileSize; + private final HoodieRowDataParquetWriteSupport writeSupport; + + public HoodieRowDataParquetWriter(Path file, HoodieParquetConfig parquetConfig) + throws IOException { + super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), + ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(), + parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(), + DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED, + DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf())); + this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); + this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, + parquetConfig.getHadoopConf())); + this.maxFileSize = parquetConfig.getMaxFileSize() + + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio()); + this.writeSupport = parquetConfig.getWriteSupport(); + } + + @Override + public boolean canWrite() { + return fs.getBytesWritten(file) < maxFileSize; + } + + @Override + public void writeRow(String key, RowData row) throws IOException { + super.write(row); + writeSupport.add(key); + } + + @Override + public void writeRow(RowData row) throws IOException { + super.write(row); + } + + @Override + public void close() throws IOException { + super.close(); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/RowDataParquetWriteSupport.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/RowDataParquetWriteSupport.java new file mode 100644 index 0000000000000..00570aee62ed3 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/RowDataParquetWriteSupport.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hudi.io.storage.row.parquet.ParquetRowDataWriter; +import org.apache.hudi.io.storage.row.parquet.ParquetSchemaConverter; + +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.hadoop.api.WriteSupport; +import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.MessageType; + +import java.util.HashMap; + +/** + * Row data parquet write support. 
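+ *
+ * <p>The row type is converted to a Parquet {@code MessageType} once at construction time and each row is then
+ * delegated to a {@code ParquetRowDataWriter}. A sketch with a hypothetical row type:
+ * <pre>{@code
+ *   RowType rowType = RowType.of(new IntType(), new VarCharType(VarCharType.MAX_LENGTH));
+ *   MessageType parquetSchema = ParquetSchemaConverter.convertToParquetMessageType("flink_schema", rowType);
+ * }</pre>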
+ */ +public class RowDataParquetWriteSupport extends WriteSupport { + + private final RowType rowType; + private final MessageType schema; + private ParquetRowDataWriter writer; + + public RowDataParquetWriteSupport(RowType rowType) { + super(); + this.rowType = rowType; + this.schema = ParquetSchemaConverter.convertToParquetMessageType("flink_schema", rowType); + } + + @Override + public WriteContext init(Configuration configuration) { + return new WriteContext(schema, new HashMap<>()); + } + + @Override + public void prepareForWrite(RecordConsumer recordConsumer) { + // should make the utc timestamp configurable + this.writer = new ParquetRowDataWriter(recordConsumer, rowType, schema, true); + } + + @Override + public void write(RowData record) { + try { + this.writer.write(record); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetRowDataWriter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetRowDataWriter.java new file mode 100644 index 0000000000000..3d9524eaa30e9 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetRowDataWriter.java @@ -0,0 +1,579 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage.row.parquet; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalDataUtils; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.util.Preconditions; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.GroupType; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.sql.Timestamp; +import java.util.Arrays; + +import static org.apache.flink.formats.parquet.utils.ParquetSchemaConverter.computeMinBytesForDecimalPrecision; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.JULIAN_EPOCH_OFFSET_DAYS; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.MILLIS_IN_DAY; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_MILLISECOND; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_SECOND; + +/** + * Writes a record to the Parquet API with the expected schema in order to be written to a file. + * + *
<p>
    Reference {@code org.apache.flink.formats.parquet.row.ParquetRowDataWriter} + * to support timestamp of INT64 8 bytes and complex data types. + */ +public class ParquetRowDataWriter { + + private final RecordConsumer recordConsumer; + private final boolean utcTimestamp; + + private final FieldWriter[] filedWriters; + private final String[] fieldNames; + + public ParquetRowDataWriter( + RecordConsumer recordConsumer, + RowType rowType, + GroupType schema, + boolean utcTimestamp) { + this.recordConsumer = recordConsumer; + this.utcTimestamp = utcTimestamp; + + this.filedWriters = new FieldWriter[rowType.getFieldCount()]; + this.fieldNames = rowType.getFieldNames().toArray(new String[0]); + for (int i = 0; i < rowType.getFieldCount(); i++) { + this.filedWriters[i] = createWriter(rowType.getTypeAt(i)); + } + } + + /** + * It writes a record to Parquet. + * + * @param record Contains the record that is going to be written. + */ + public void write(final RowData record) { + recordConsumer.startMessage(); + for (int i = 0; i < filedWriters.length; i++) { + if (!record.isNullAt(i)) { + String fieldName = fieldNames[i]; + FieldWriter writer = filedWriters[i]; + + recordConsumer.startField(fieldName, i); + writer.write(record, i); + recordConsumer.endField(fieldName, i); + } + } + recordConsumer.endMessage(); + } + + private FieldWriter createWriter(LogicalType t) { + switch (t.getTypeRoot()) { + case CHAR: + case VARCHAR: + return new StringWriter(); + case BOOLEAN: + return new BooleanWriter(); + case BINARY: + case VARBINARY: + return new BinaryWriter(); + case DECIMAL: + DecimalType decimalType = (DecimalType) t; + return createDecimalWriter(decimalType.getPrecision(), decimalType.getScale()); + case TINYINT: + return new ByteWriter(); + case SMALLINT: + return new ShortWriter(); + case DATE: + case TIME_WITHOUT_TIME_ZONE: + case INTEGER: + return new IntWriter(); + case BIGINT: + return new LongWriter(); + case FLOAT: + return new FloatWriter(); + case DOUBLE: + return new DoubleWriter(); + case TIMESTAMP_WITHOUT_TIME_ZONE: + TimestampType timestampType = (TimestampType) t; + if (timestampType.getPrecision() == 3) { + return new Timestamp64Writer(); + } else { + return new Timestamp96Writer(timestampType.getPrecision()); + } + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + LocalZonedTimestampType localZonedTimestampType = (LocalZonedTimestampType) t; + if (localZonedTimestampType.getPrecision() == 3) { + return new Timestamp64Writer(); + } else { + return new Timestamp96Writer(localZonedTimestampType.getPrecision()); + } + case ARRAY: + ArrayType arrayType = (ArrayType) t; + LogicalType elementType = arrayType.getElementType(); + FieldWriter elementWriter = createWriter(elementType); + return new ArrayWriter(elementWriter); + case MAP: + MapType mapType = (MapType) t; + LogicalType keyType = mapType.getKeyType(); + LogicalType valueType = mapType.getValueType(); + FieldWriter keyWriter = createWriter(keyType); + FieldWriter valueWriter = createWriter(valueType); + return new MapWriter(keyWriter, valueWriter); + case ROW: + RowType rowType = (RowType) t; + FieldWriter[] fieldWriters = rowType.getFields().stream() + .map(RowType.RowField::getType).map(this::createWriter).toArray(FieldWriter[]::new); + String[] fieldNames = rowType.getFields().stream() + .map(RowType.RowField::getName).toArray(String[]::new); + return new RowWriter(fieldNames, fieldWriters); + default: + throw new UnsupportedOperationException("Unsupported type: " + t); + } + } + + private interface FieldWriter { + void write(RowData 
row, int ordinal); + + void write(ArrayData array, int ordinal); + } + + private class BooleanWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addBoolean(row.getBoolean(ordinal)); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addBoolean(array.getBoolean(ordinal)); + } + } + + private class ByteWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addInteger(row.getByte(ordinal)); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addInteger(array.getByte(ordinal)); + } + } + + private class ShortWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addInteger(row.getShort(ordinal)); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addInteger(array.getShort(ordinal)); + } + } + + private class LongWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addLong(row.getLong(ordinal)); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addLong(array.getLong(ordinal)); + } + } + + private class FloatWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addFloat(row.getFloat(ordinal)); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addFloat(array.getFloat(ordinal)); + } + } + + private class DoubleWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addDouble(row.getDouble(ordinal)); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addDouble(array.getDouble(ordinal)); + } + } + + private class StringWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addBinary(Binary.fromReusedByteArray(row.getString(ordinal).toBytes())); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addBinary(Binary.fromReusedByteArray(array.getString(ordinal).toBytes())); + } + } + + private class BinaryWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addBinary(Binary.fromReusedByteArray(row.getBinary(ordinal))); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addBinary(Binary.fromReusedByteArray(array.getBinary(ordinal))); + } + } + + private class IntWriter implements FieldWriter { + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addInteger(row.getInt(ordinal)); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addInteger(array.getInt(ordinal)); + } + } + + /** + * Timestamp of INT96 bytes, julianDay(4) + nanosOfDay(8). See + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp + * TIMESTAMP_MILLIS and TIMESTAMP_MICROS are the deprecated ConvertedType. 
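+ * <p>Note: this writer covers the millisecond-precision (precision 3) case and emits the timestamp as a single
+ * INT64 of epoch milliseconds; for example, with {@code utcTimestamp} enabled,
+ * {@code TimestampData.fromEpochMillis(1000L)} is written as the long {@code 1000}.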
+ */ + private class Timestamp64Writer implements FieldWriter { + private Timestamp64Writer() { + } + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addLong(timestampToInt64(row.getTimestamp(ordinal, 3))); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addLong(timestampToInt64(array.getTimestamp(ordinal, 3))); + } + } + + private long timestampToInt64(TimestampData timestampData) { + return utcTimestamp ? timestampData.getMillisecond() : timestampData.toTimestamp().getTime(); + } + + /** + * Timestamp of INT96 bytes, julianDay(4) + nanosOfDay(8). See + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp + * TIMESTAMP_MILLIS and TIMESTAMP_MICROS are the deprecated ConvertedType. + */ + private class Timestamp96Writer implements FieldWriter { + + private final int precision; + + private Timestamp96Writer(int precision) { + this.precision = precision; + } + + @Override + public void write(RowData row, int ordinal) { + recordConsumer.addBinary(timestampToInt96(row.getTimestamp(ordinal, precision))); + } + + @Override + public void write(ArrayData array, int ordinal) { + recordConsumer.addBinary(timestampToInt96(array.getTimestamp(ordinal, precision))); + } + } + + private Binary timestampToInt96(TimestampData timestampData) { + int julianDay; + long nanosOfDay; + if (utcTimestamp) { + long mills = timestampData.getMillisecond(); + julianDay = (int) ((mills / MILLIS_IN_DAY) + JULIAN_EPOCH_OFFSET_DAYS); + nanosOfDay = + (mills % MILLIS_IN_DAY) * NANOS_PER_MILLISECOND + + timestampData.getNanoOfMillisecond(); + } else { + Timestamp timestamp = timestampData.toTimestamp(); + long mills = timestamp.getTime(); + julianDay = (int) ((mills / MILLIS_IN_DAY) + JULIAN_EPOCH_OFFSET_DAYS); + nanosOfDay = ((mills % MILLIS_IN_DAY) / 1000) * NANOS_PER_SECOND + timestamp.getNanos(); + } + + ByteBuffer buf = ByteBuffer.allocate(12); + buf.order(ByteOrder.LITTLE_ENDIAN); + buf.putLong(nanosOfDay); + buf.putInt(julianDay); + buf.flip(); + return Binary.fromConstantByteBuffer(buf); + } + + private FieldWriter createDecimalWriter(int precision, int scale) { + Preconditions.checkArgument( + precision <= DecimalType.MAX_PRECISION, + "Decimal precision %s exceeds max precision %s", + precision, + DecimalType.MAX_PRECISION); + + /* + * This is optimizer for UnscaledBytesWriter. 
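+     * For precisions that fit in a long (precision <= 18) the unscaled value is read via toUnscaledLong()
+     * and serialized big-endian into the minimal number of bytes. Worked example with a hypothetical value:
+     * a DECIMAL(5, 2) holding 123.45 has the unscaled long 12345 and, since
+     * computeMinBytesForDecimalPrecision(5) == 3, is written as the three bytes 0x00 0x30 0x39.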
+ */ + class LongUnscaledBytesWriter implements FieldWriter { + private final int numBytes; + private final int initShift; + private final byte[] decimalBuffer; + + private LongUnscaledBytesWriter() { + this.numBytes = computeMinBytesForDecimalPrecision(precision); + this.initShift = 8 * (numBytes - 1); + this.decimalBuffer = new byte[numBytes]; + } + + @Override + public void write(RowData row, int ordinal) { + long unscaledLong = row.getDecimal(ordinal, precision, scale).toUnscaledLong(); + doWrite(unscaledLong); + } + + @Override + public void write(ArrayData array, int ordinal) { + long unscaledLong = array.getDecimal(ordinal, precision, scale).toUnscaledLong(); + doWrite(unscaledLong); + } + + private void doWrite(long unscaled) { + int i = 0; + int shift = initShift; + while (i < numBytes) { + decimalBuffer[i] = (byte) (unscaled >> shift); + i += 1; + shift -= 8; + } + + recordConsumer.addBinary(Binary.fromReusedByteArray(decimalBuffer, 0, numBytes)); + } + } + + class UnscaledBytesWriter implements FieldWriter { + private final int numBytes; + private final byte[] decimalBuffer; + + private UnscaledBytesWriter() { + this.numBytes = computeMinBytesForDecimalPrecision(precision); + this.decimalBuffer = new byte[numBytes]; + } + + @Override + public void write(RowData row, int ordinal) { + byte[] bytes = row.getDecimal(ordinal, precision, scale).toUnscaledBytes(); + doWrite(bytes); + } + + @Override + public void write(ArrayData array, int ordinal) { + byte[] bytes = array.getDecimal(ordinal, precision, scale).toUnscaledBytes(); + doWrite(bytes); + } + + private void doWrite(byte[] bytes) { + byte[] writtenBytes; + if (bytes.length == numBytes) { + // Avoid copy. + writtenBytes = bytes; + } else { + byte signByte = bytes[0] < 0 ? (byte) -1 : (byte) 0; + Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte); + System.arraycopy( + bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length); + writtenBytes = decimalBuffer; + } + recordConsumer.addBinary(Binary.fromReusedByteArray(writtenBytes, 0, numBytes)); + } + } + + // 1 <= precision <= 18, writes as FIXED_LEN_BYTE_ARRAY + // optimizer for UnscaledBytesWriter + if (DecimalDataUtils.is32BitDecimal(precision) + || DecimalDataUtils.is64BitDecimal(precision)) { + return new LongUnscaledBytesWriter(); + } + + // 19 <= precision <= 38, writes as FIXED_LEN_BYTE_ARRAY + return new UnscaledBytesWriter(); + } + + private class ArrayWriter implements FieldWriter { + private final FieldWriter elementWriter; + + private ArrayWriter(FieldWriter elementWriter) { + this.elementWriter = elementWriter; + } + + @Override + public void write(RowData row, int ordinal) { + ArrayData arrayData = row.getArray(ordinal); + doWrite(arrayData); + } + + @Override + public void write(ArrayData array, int ordinal) { + ArrayData arrayData = array.getArray(ordinal); + doWrite(arrayData); + } + + private void doWrite(ArrayData arrayData) { + recordConsumer.startGroup(); + if (arrayData.size() > 0) { + final String repeatedGroup = "list"; + final String elementField = "element"; + recordConsumer.startField(repeatedGroup, 0); + for (int i = 0; i < arrayData.size(); i++) { + recordConsumer.startGroup(); + if (!arrayData.isNullAt(i)) { + // Only creates the element field if the current array element is not null. 
+ recordConsumer.startField(elementField, 0); + elementWriter.write(arrayData, i); + recordConsumer.endField(elementField, 0); + } + recordConsumer.endGroup(); + } + recordConsumer.endField(repeatedGroup, 0); + } + recordConsumer.endGroup(); + } + } + + private class MapWriter implements FieldWriter { + private final FieldWriter keyWriter; + private final FieldWriter valueWriter; + + private MapWriter(FieldWriter keyWriter, FieldWriter valueWriter) { + this.keyWriter = keyWriter; + this.valueWriter = valueWriter; + } + + @Override + public void write(RowData row, int ordinal) { + MapData map = row.getMap(ordinal); + doWrite(map); + } + + @Override + public void write(ArrayData array, int ordinal) { + MapData map = array.getMap(ordinal); + doWrite(map); + } + + private void doWrite(MapData mapData) { + ArrayData keyArray = mapData.keyArray(); + ArrayData valueArray = mapData.valueArray(); + recordConsumer.startGroup(); + if (mapData.size() > 0) { + final String repeatedGroup = "key_value"; + final String kField = "key"; + final String vField = "value"; + recordConsumer.startField(repeatedGroup, 0); + for (int i = 0; i < mapData.size(); i++) { + recordConsumer.startGroup(); + // key + recordConsumer.startField(kField, 0); + this.keyWriter.write(keyArray, i); + recordConsumer.endField(kField, 0); + // value + if (!valueArray.isNullAt(i)) { + // Only creates the "value" field if the value if non-empty + recordConsumer.startField(vField, 1); + this.valueWriter.write(valueArray, i); + recordConsumer.endField(vField, 1); + } + recordConsumer.endGroup(); + } + recordConsumer.endField(repeatedGroup, 0); + } + recordConsumer.endGroup(); + } + } + + private class RowWriter implements FieldWriter { + private final String[] fieldNames; + private final FieldWriter[] fieldWriters; + + private RowWriter(String[] fieldNames, FieldWriter[] fieldWriters) { + this.fieldNames = fieldNames; + this.fieldWriters = fieldWriters; + } + + @Override + public void write(RowData row, int ordinal) { + RowData nested = row.getRow(ordinal, fieldWriters.length); + doWrite(nested); + } + + @Override + public void write(ArrayData array, int ordinal) { + RowData nested = array.getRow(ordinal, fieldWriters.length); + doWrite(nested); + } + + private void doWrite(RowData row) { + recordConsumer.startGroup(); + for (int i = 0; i < row.getArity(); i++) { + if (!row.isNullAt(i)) { + String fieldName = fieldNames[i]; + recordConsumer.startField(fieldName, i); + fieldWriters[i].write(row, i); + recordConsumer.endField(fieldName, i); + } + } + recordConsumer.endGroup(); + } + } +} + diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetSchemaConverter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetSchemaConverter.java new file mode 100644 index 0000000000000..90497aae370ea --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/io/storage/row/parquet/ParquetSchemaConverter.java @@ -0,0 +1,674 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row.parquet; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + * + *
<p>
    Reference org.apache.flink.formats.parquet.utils.ParquetSchemaConverter to support timestamp of INT64 8 bytes. + */ +public class ParquetSchemaConverter { + private static final Logger LOGGER = LoggerFactory.getLogger(ParquetSchemaConverter.class); + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + + /** + * Converts Parquet schema to Flink Internal Type. + * + * @param type Parquet schema + * @return Flink type information + */ + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** + * Converts Flink Internal Type to Parquet schema. + * + * @param typeInformation Flink type information + * @param legacyMode is standard LIST and MAP schema or back-compatible schema + * @return Parquet schema + */ + public static MessageType toParquetType( + TypeInformation typeInformation, boolean legacyMode) { + return (MessageType) + convertField(null, typeInformation, Type.Repetition.OPTIONAL, legacyMode); + } + + public static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertParquetTypeToTypeInfo(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } else { + LOGGER.error( + "Parquet field {} in schema type {} can not be converted to Flink Internal Type", + field.getName(), + field.getOriginalType().name()); + } + } + + return new RowTypeInfo( + types.toArray(new TypeInformation[0]), names.toArray(new String[0])); + } + + public static TypeInformation convertParquetTypeToTypeInfo(final Type fieldType) { + TypeInformation typeInfo; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; + break; + case UTF8: + case ENUM: + case JSON: + case BSON: + typeInfo = BasicTypeInfo.STRING_TYPE_INFO; + break; + default: + throw new UnsupportedOperationException( + "Unsupported original type : " + + originalType.name() + + " for primitive type BINARY"); + } + } else { + typeInfo = BasicTypeInfo.STRING_TYPE_INFO; + } + break; + case BOOLEAN: + typeInfo = BasicTypeInfo.BOOLEAN_TYPE_INFO; + break; + case INT32: + if (originalType != null) { + switch (originalType) { + case TIME_MICROS: + case TIME_MILLIS: + typeInfo = SqlTimeTypeInfo.TIME; + break; + case TIMESTAMP_MICROS: + case TIMESTAMP_MILLIS: + typeInfo = SqlTimeTypeInfo.TIMESTAMP; + break; + case DATE: + typeInfo = SqlTimeTypeInfo.DATE; + break; + case UINT_8: + case UINT_16: + case UINT_32: + typeInfo = BasicTypeInfo.INT_TYPE_INFO; + break; + case INT_8: + typeInfo = org.apache.flink.api.common.typeinfo.Types.BYTE; + break; + case INT_16: + typeInfo = org.apache.flink.api.common.typeinfo.Types.SHORT; + break; + case INT_32: + typeInfo = BasicTypeInfo.INT_TYPE_INFO; + break; + default: + throw new UnsupportedOperationException( + "Unsupported original type : " + + originalType.name() + + " for primitive type INT32"); + } + } else { + typeInfo = BasicTypeInfo.INT_TYPE_INFO; + } + break; + case INT64: + if (originalType != null) 
{ + switch (originalType) { + case TIME_MICROS: + typeInfo = SqlTimeTypeInfo.TIME; + break; + case TIMESTAMP_MICROS: + case TIMESTAMP_MILLIS: + typeInfo = SqlTimeTypeInfo.TIMESTAMP; + break; + case INT_64: + case DECIMAL: + typeInfo = BasicTypeInfo.LONG_TYPE_INFO; + break; + default: + throw new UnsupportedOperationException( + "Unsupported original type : " + + originalType.name() + + " for primitive type INT64"); + } + } else { + typeInfo = BasicTypeInfo.LONG_TYPE_INFO; + } + break; + case INT96: + // It stores a timestamp type data, we read it as millisecond + typeInfo = SqlTimeTypeInfo.TIMESTAMP; + break; + case FLOAT: + typeInfo = BasicTypeInfo.FLOAT_TYPE_INFO; + break; + case DOUBLE: + typeInfo = BasicTypeInfo.DOUBLE_TYPE_INFO; + break; + case FIXED_LEN_BYTE_ARRAY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; + break; + default: + throw new UnsupportedOperationException( + "Unsupported original type : " + + originalType.name() + + " for primitive type FIXED_LEN_BYTE_ARRAY"); + } + } else { + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; + } + break; + default: + throw new UnsupportedOperationException("Unsupported schema: " + fieldType); + } + } else { + GroupType parquetGroupType = fieldType.asGroupType(); + OriginalType originalType = parquetGroupType.getOriginalType(); + if (originalType != null) { + switch (originalType) { + case LIST: + if (parquetGroupType.getFieldCount() != 1) { + throw new UnsupportedOperationException( + "Invalid list type " + parquetGroupType); + } + Type repeatedType = parquetGroupType.getType(0); + if (!repeatedType.isRepetition(Type.Repetition.REPEATED)) { + throw new UnsupportedOperationException( + "Invalid list type " + parquetGroupType); + } + + if (repeatedType.isPrimitive()) { + typeInfo = convertParquetPrimitiveListToFlinkArray(repeatedType); + } else { + // Backward-compatibility element group name can be any string + // (element/array/other) + GroupType elementType = repeatedType.asGroupType(); + // If the repeated field is a group with multiple fields, then its type + // is the element + // type and elements are required. + if (elementType.getFieldCount() > 1) { + + for (Type type : elementType.getFields()) { + if (!type.isRepetition(Type.Repetition.REQUIRED)) { + throw new UnsupportedOperationException( + String.format( + "List field [%s] in List [%s] has to be required. 
", + type.toString(), fieldType.getName())); + } + } + typeInfo = + ObjectArrayTypeInfo.getInfoFor( + convertParquetTypeToTypeInfo(elementType)); + } else { + Type internalType = elementType.getType(0); + if (internalType.isPrimitive()) { + typeInfo = + convertParquetPrimitiveListToFlinkArray(internalType); + } else { + // No need to do special process for group named array and tuple + GroupType tupleGroup = internalType.asGroupType(); + if (tupleGroup.getFieldCount() == 1 + && tupleGroup + .getFields() + .get(0) + .isRepetition(Type.Repetition.REQUIRED)) { + typeInfo = + ObjectArrayTypeInfo.getInfoFor( + convertParquetTypeToTypeInfo(internalType)); + } else { + throw new UnsupportedOperationException( + String.format( + "Unrecgonized List schema [%s] according to Parquet" + + " standard", + parquetGroupType.toString())); + } + } + } + } + break; + + case MAP_KEY_VALUE: + case MAP: + // The outer-most level must be a group annotated with MAP + // that contains a single field named key_value + if (parquetGroupType.getFieldCount() != 1 + || parquetGroupType.getType(0).isPrimitive()) { + throw new UnsupportedOperationException( + "Invalid map type " + parquetGroupType); + } + + // The middle level must be a repeated group with a key field for map keys + // and, optionally, a value field for map values. But we can't enforce two + // strict condition here + // the schema generated by Parquet lib doesn't contain LogicalType + // ! mapKeyValType.getOriginalType().equals(OriginalType.MAP_KEY_VALUE) + GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); + if (!mapKeyValType.isRepetition(Type.Repetition.REPEATED) + || mapKeyValType.getFieldCount() != 2) { + throw new UnsupportedOperationException( + "The middle level of Map should be single field named key_value. Invalid map type " + + parquetGroupType); + } + + Type keyType = mapKeyValType.getType(0); + + // The key field encodes the map's key type. This field must have repetition + // required and + // must always be present. 
+ if (!keyType.isPrimitive() + || !keyType.isRepetition(Type.Repetition.REQUIRED) + || !keyType.asPrimitiveType() + .getPrimitiveTypeName() + .equals(PrimitiveType.PrimitiveTypeName.BINARY) + || !keyType.getOriginalType().equals(OriginalType.UTF8)) { + throw new IllegalArgumentException( + "Map key type must be required binary (UTF8): " + keyType); + } + + Type valueType = mapKeyValType.getType(1); + return new MapTypeInfo<>( + BasicTypeInfo.STRING_TYPE_INFO, + convertParquetTypeToTypeInfo(valueType)); + default: + throw new UnsupportedOperationException("Unsupported schema: " + fieldType); + } + } else { + // if no original type than it is a record + return convertFields(parquetGroupType.getFields()); + } + } + + return typeInfo; + } + + private static TypeInformation convertParquetPrimitiveListToFlinkArray(Type type) { + // Backward-compatibility element group doesn't exist also allowed + TypeInformation flinkType = convertParquetTypeToTypeInfo(type); + if (flinkType.isBasicType()) { + return BasicArrayTypeInfo.getInfoFor( + Array.newInstance(flinkType.getTypeClass(), 0).getClass()); + } else { + // flinkType here can be either SqlTimeTypeInfo or BasicTypeInfo.BIG_DEC_TYPE_INFO, + // So it should be converted to ObjectArrayTypeInfo + return ObjectArrayTypeInfo.getInfoFor(flinkType); + } + } + + private static Type convertField( + String fieldName, + TypeInformation typeInfo, + Type.Repetition inheritRepetition, + boolean legacyMode) { + Type fieldType = null; + + Type.Repetition repetition = + inheritRepetition == null ? Type.Repetition.OPTIONAL : inheritRepetition; + if (typeInfo instanceof BasicTypeInfo) { + BasicTypeInfo basicTypeInfo = (BasicTypeInfo) typeInfo; + if (basicTypeInfo.equals(BasicTypeInfo.BIG_DEC_TYPE_INFO) + || basicTypeInfo.equals(BasicTypeInfo.BIG_INT_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition) + .as(OriginalType.DECIMAL) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.INT_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(OriginalType.INT_32) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.DOUBLE_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, repetition) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.FLOAT_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.FLOAT, repetition) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.LONG_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition) + .as(OriginalType.INT_64) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.SHORT_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(OriginalType.INT_16) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.BYTE_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(OriginalType.INT_8) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.CHAR_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition) + .as(OriginalType.UTF8) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.BOOLEAN_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, repetition) + .named(fieldName); + } else if (basicTypeInfo.equals(BasicTypeInfo.DATE_TYPE_INFO) + || 
basicTypeInfo.equals(BasicTypeInfo.STRING_TYPE_INFO)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition) + .as(OriginalType.UTF8) + .named(fieldName); + } + } else if (typeInfo instanceof MapTypeInfo) { + MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; + + if (mapTypeInfo.getKeyTypeInfo().equals(BasicTypeInfo.STRING_TYPE_INFO)) { + fieldType = + Types.map(repetition) + .value( + convertField( + MAP_VALUE, + mapTypeInfo.getValueTypeInfo(), + Type.Repetition.OPTIONAL, + legacyMode)) + .named(fieldName); + } else { + throw new UnsupportedOperationException( + String.format( + "Can not convert Flink MapTypeInfo %s to Parquet" + + " Map type as key has to be String", + typeInfo)); + } + } else if (typeInfo instanceof ObjectArrayTypeInfo) { + ObjectArrayTypeInfo objectArrayTypeInfo = (ObjectArrayTypeInfo) typeInfo; + + // Get all required sub fields + GroupType componentGroup = + (GroupType) + convertField( + LIST_ELEMENT, + objectArrayTypeInfo.getComponentInfo(), + Type.Repetition.REQUIRED, + legacyMode); + + GroupType elementGroup = Types.repeatedGroup().named(LIST_ELEMENT); + elementGroup = elementGroup.withNewFields(componentGroup.getFields()); + fieldType = + Types.buildGroup(repetition) + .addField(elementGroup) + .as(OriginalType.LIST) + .named(fieldName); + } else if (typeInfo instanceof BasicArrayTypeInfo) { + BasicArrayTypeInfo basicArrayType = (BasicArrayTypeInfo) typeInfo; + + if (legacyMode) { + + // Add extra layer of Group according to Parquet's standard + Type listGroup = + Types.repeatedGroup() + .addField( + convertField( + LIST_ELEMENT, + basicArrayType.getComponentInfo(), + Type.Repetition.REQUIRED, + legacyMode)) + .named(LIST_GROUP_NAME); + + fieldType = + Types.buildGroup(repetition) + .addField(listGroup) + .as(OriginalType.LIST) + .named(fieldName); + } else { + PrimitiveType primitiveTyp = + convertField( + fieldName, + basicArrayType.getComponentInfo(), + Type.Repetition.REQUIRED, + legacyMode) + .asPrimitiveType(); + fieldType = + Types.buildGroup(repetition) + .repeated(primitiveTyp.getPrimitiveTypeName()) + .as(primitiveTyp.getOriginalType()) + .named(LIST_ARRAY_TYPE) + .as(OriginalType.LIST) + .named(fieldName); + } + } else if (typeInfo instanceof SqlTimeTypeInfo) { + if (typeInfo.equals(SqlTimeTypeInfo.DATE)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(OriginalType.DATE) + .named(fieldName); + } else if (typeInfo.equals(SqlTimeTypeInfo.TIME)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(OriginalType.TIME_MILLIS) + .named(fieldName); + } else if (typeInfo.equals(SqlTimeTypeInfo.TIMESTAMP)) { + fieldType = + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition) + .as(OriginalType.TIMESTAMP_MILLIS) + .named(fieldName); + } else { + throw new UnsupportedOperationException( + "Unsupported SqlTimeTypeInfo " + typeInfo.toString()); + } + + } else { + RowTypeInfo rowTypeInfo = (RowTypeInfo) typeInfo; + List types = new ArrayList<>(); + String[] fieldNames = rowTypeInfo.getFieldNames(); + TypeInformation[] fieldTypes = rowTypeInfo.getFieldTypes(); + for (int i = 0; i < rowTypeInfo.getArity(); i++) { + types.add(convertField(fieldNames[i], fieldTypes[i], repetition, legacyMode)); + } + + if (fieldName == null) { + fieldType = new MessageType(MESSAGE_ROOT, types); + } else { + fieldType = new GroupType(repetition, fieldName, types); + } + } + + return fieldType; + } + + public static MessageType convertToParquetMessageType(String 
name, RowType rowType) { + Type[] types = new Type[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); i++) { + types[i] = convertToParquetType(rowType.getFieldNames().get(i), rowType.getTypeAt(i)); + } + return new MessageType(name, types); + } + + private static Type convertToParquetType(String name, LogicalType type) { + return convertToParquetType(name, type, Type.Repetition.OPTIONAL); + } + + private static Type convertToParquetType( + String name, LogicalType type, Type.Repetition repetition) { + switch (type.getTypeRoot()) { + case CHAR: + case VARCHAR: + return Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition) + .as(OriginalType.UTF8) + .named(name); + case BOOLEAN: + return Types.primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, repetition) + .named(name); + case BINARY: + case VARBINARY: + return Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition) + .named(name); + case DECIMAL: + int precision = ((DecimalType) type).getPrecision(); + int scale = ((DecimalType) type).getScale(); + int numBytes = computeMinBytesForDecimalPrecision(precision); + return Types.primitive( + PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, repetition) + .as(LogicalTypeAnnotation.decimalType(scale, precision)) + .length(numBytes) + .named(name); + case TINYINT: + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(LogicalTypeAnnotation.intType(8, true)) + .named(name); + case SMALLINT: + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(LogicalTypeAnnotation.intType(16, true)) + .named(name); + case INTEGER: + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .named(name); + case BIGINT: + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition) + .named(name); + case FLOAT: + return Types.primitive(PrimitiveType.PrimitiveTypeName.FLOAT, repetition) + .named(name); + case DOUBLE: + return Types.primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, repetition) + .named(name); + case DATE: + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(LogicalTypeAnnotation.dateType()) + .named(name); + case TIME_WITHOUT_TIME_ZONE: + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(LogicalTypeAnnotation.timeType(true, TimeUnit.MILLIS)) + .named(name); + case TIMESTAMP_WITHOUT_TIME_ZONE: + TimestampType timestampType = (TimestampType) type; + if (timestampType.getPrecision() == 3) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition) + .as(LogicalTypeAnnotation.timestampType(true, TimeUnit.MILLIS)) + .named(name); + } else { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT96, repetition) + .named(name); + } + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + LocalZonedTimestampType localZonedTimestampType = (LocalZonedTimestampType) type; + if (localZonedTimestampType.getPrecision() == 3) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition) + .as(LogicalTypeAnnotation.timestampType(false, TimeUnit.MILLIS)) + .named(name); + } else { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT96, repetition) + .named(name); + } + case ARRAY: + // group (LIST) { + // repeated group list { + // element; + // } + // } + ArrayType arrayType = (ArrayType) type; + LogicalType elementType = arrayType.getElementType(); + return Types + .buildGroup(repetition).as(OriginalType.LIST) + .addField( + Types.repeatedGroup() + .addField(convertToParquetType("element", 
elementType, repetition)) + .named("list")) + .named(name); + case MAP: + // group (MAP) { + // repeated group key_value { + // required key; + // value; + // } + // } + MapType mapType = (MapType) type; + LogicalType keyType = mapType.getKeyType(); + LogicalType valueType = mapType.getValueType(); + return Types + .buildGroup(repetition).as(OriginalType.MAP) + .addField( + Types + .repeatedGroup() + .addField(convertToParquetType("key", keyType, repetition)) + .addField(convertToParquetType("value", valueType, repetition)) + .named("key_value")) + .named(name); + case ROW: + RowType rowType = (RowType) type; + Types.GroupBuilder builder = Types.buildGroup(repetition); + rowType.getFields().forEach(field -> builder.addField(convertToParquetType(field.getName(), field.getType(), repetition))); + return builder.named(name); + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + public static int computeMinBytesForDecimalPrecision(int precision) { + int numBytes = 1; + while (Math.pow(2.0, 8 * numBytes - 1) < Math.pow(10.0, precision)) { + numBytes += 1; + } + return numBytes; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java new file mode 100644 index 0000000000000..aa70f5835c8aa --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.metrics.Registry; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.exception.HoodieNotSupportedException; + +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.conf.Configuration; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * Flink hoodie backed table metadata writer. 
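+ * + * <p>Initializes the metadata table on demand and commits the prepared metadata records as delta commits + * through a {@link org.apache.hudi.client.HoodieFlinkWriteClient}.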
+ */ +public class FlinkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter { + + private static final Logger LOG = LogManager.getLogger(FlinkHoodieBackedTableMetadataWriter.class); + + public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, + HoodieEngineContext context) { + return create(conf, writeConfig, context, Option.empty()); + } + + public static HoodieTableMetadataWriter create(Configuration conf, + HoodieWriteConfig writeConfig, + HoodieEngineContext context, + Option actionMetadata) { + return new FlinkHoodieBackedTableMetadataWriter(conf, writeConfig, context, actionMetadata, Option.empty()); + } + + public static HoodieTableMetadataWriter create(Configuration conf, + HoodieWriteConfig writeConfig, + HoodieEngineContext context, + Option actionMetadata, + Option inFlightInstantTimestamp) { + return new FlinkHoodieBackedTableMetadataWriter(conf, writeConfig, context, actionMetadata, inFlightInstantTimestamp); + } + + FlinkHoodieBackedTableMetadataWriter(Configuration hadoopConf, + HoodieWriteConfig writeConfig, + HoodieEngineContext engineContext, + Option actionMetadata, + Option inFlightInstantTimestamp) { + super(hadoopConf, writeConfig, engineContext, actionMetadata, inFlightInstantTimestamp); + } + + @Override + protected void initRegistry() { + if (metadataWriteConfig.isMetricsOn()) { + // should support executor metrics + Registry registry = Registry.getRegistry("HoodieMetadata"); + this.metrics = Option.of(new HoodieMetadataMetrics(registry)); + } else { + this.metrics = Option.empty(); + } + } + + @Override + protected void initialize(HoodieEngineContext engineContext, + Option actionMetadata, + Option inflightInstantTimestamp) { + try { + if (enabled) { + initializeIfNeeded(dataMetaClient, actionMetadata, inflightInstantTimestamp); + } + } catch (IOException e) { + LOG.error("Failed to initialize metadata table. Disabling the writer.", e); + enabled = false; + } + } + + @Override + protected void commit(String instantTime, Map> partitionRecordsMap, + boolean canTriggerTableService) { + ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled"); + ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet."); + HoodieData preppedRecords = prepRecords(partitionRecordsMap); + List preppedRecordList = preppedRecords.collectAsList(); + + try (HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient(engineContext, metadataWriteConfig)) { + if (canTriggerTableService) { + // trigger compaction before doing the delta commit. 
this is to ensure, if this delta commit succeeds in metadata table, but failed in data table, + // we would have compacted metadata table and so could have included uncommitted data which will never be ignored while reading from metadata + // table (since reader will filter out only from delta commits) + compactIfNecessary(writeClient, instantTime); + } + + if (!metadataMetaClient.getActiveTimeline().containsInstant(instantTime)) { + // if this is a new commit being applied to metadata for the first time + writeClient.startCommitWithTime(instantTime); + metadataMetaClient.getActiveTimeline().transitionRequestedToInflight(HoodieActiveTimeline.DELTA_COMMIT_ACTION, instantTime); + } else { + Option alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant(); + if (alreadyCompletedInstant.isPresent()) { + // this code path refers to a re-attempted commit that got committed to metadata table, but failed in datatable. + // for eg, lets say compaction c1 on 1st attempt succeeded in metadata table and failed before committing to datatable. + // when retried again, data table will first rollback pending compaction. these will be applied to metadata table, but all changes + // are upserts to metadata table and so only a new delta commit will be created. + // once rollback is complete, compaction will be retried again, which will eventually hit this code block where the respective commit is + // already part of completed commit. So, we have to manually remove the completed instant and proceed. + // and it is for the same reason we enabled withAllowMultiWriteOnSameInstant for metadata table. + HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant.get()); + metadataMetaClient.reloadActiveTimeline(); + } + // If the alreadyCompletedInstant is empty, that means there is a requested or inflight + // instant with the same instant time. This happens for data table clean action which + // reuses the same instant time without rollback first. It is a no-op here as the + // clean plan is the same, so we don't need to delete the requested and inflight instant + // files in the active timeline. + + // The metadata writer uses LAZY cleaning strategy without auto commit, + // write client then checks the heartbeat expiration when committing the instant, + // sets up the heartbeat explicitly to make the check pass. + writeClient.getHeartbeatClient().start(instantTime); + } + + List statuses = preppedRecordList.size() > 0 + ? writeClient.upsertPreppedRecords(preppedRecordList, instantTime) + : Collections.emptyList(); + statuses.forEach(writeStatus -> { + if (writeStatus.hasErrors()) { + throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime); + } + }); + // flink does not support auto-commit yet, also the auto commit logic is not complete as BaseHoodieWriteClient now. 
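+ // so the delta commit is completed explicitly here with the collected write statuses.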
+ writeClient.commit(instantTime, statuses, Option.empty(), HoodieActiveTimeline.DELTA_COMMIT_ACTION, Collections.emptyMap()); + + // reload timeline + metadataMetaClient.reloadActiveTimeline(); + if (canTriggerTableService) { + cleanIfNecessary(writeClient, instantTime); + writeClient.archive(); + } + } + + // Update total size of the metadata and count of base/log files + metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata)); + } + + @Override + public void deletePartitions(String instantTime, List partitions) { + throw new HoodieNotSupportedException("Dropping metadata index not supported for Flink metadata table yet."); + } +} \ No newline at end of file diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/ExplicitWriteHandleTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/ExplicitWriteHandleTable.java new file mode 100644 index 0000000000000..b95894bed8d54 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/ExplicitWriteHandleTable.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.io.HoodieWriteHandle; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +/** + * HoodieTable that need to pass in the + * {@link org.apache.hudi.io.HoodieWriteHandle} explicitly. + */ +public interface ExplicitWriteHandleTable { + /** + * Upsert a batch of new records into Hoodie table at the supplied instantTime. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param records hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + HoodieWriteMetadata> upsert( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records); + + /** + * Insert a batch of new records into Hoodie table at the supplied instantTime. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param records hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + HoodieWriteMetadata> insert( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records); + + /** + * Deletes a list of {@link HoodieKey}s from the Hoodie table at the supplied instantTime. The {@link HoodieKey}s will be + * de-duped, and non-existent keys will be removed before deleting. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param keys {@link List} of {@link HoodieKey}s to be deleted + * @return HoodieWriteMetadata + */ + HoodieWriteMetadata> delete( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List keys); + + /** + * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. + * + *

    This implementation requires that the input records are already tagged, and de-duped if needed. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param preppedRecords hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + HoodieWriteMetadata> upsertPrepped( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> preppedRecords); + + /** + * Inserts the given prepared records into the Hoodie table, at the supplied instantTime. + * + *

    This implementation requires that the input records are already tagged, and de-duped if needed. + * + *

    Specifies the write handle explicitly in order to have fine grained control with + * the underneath file. + * + * @param context HoodieEngineContext + * @param instantTime Instant Time for the action + * @param preppedRecords hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + HoodieWriteMetadata> insertPrepped( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> preppedRecords); + + /** + * Replaces all the existing records and inserts the specified new records into Hoodie table at the supplied instantTime, + * for the partition paths contained in input records. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant time for the replace action + * @param records input records + * @return HoodieWriteMetadata + */ + HoodieWriteMetadata> insertOverwrite( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records); + + /** + * Deletes all the existing records of the Hoodie table and inserts the specified new records into Hoodie table at the supplied instantTime, + * for the partition paths contained in input records. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant time for the replace action + * @param records input records + * @return HoodieWriteMetadata + */ + HoodieWriteMetadata> insertOverwriteTable( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records); +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java index b8ae370f129d4..543751a041078 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java @@ -19,12 +19,20 @@ package org.apache.hudi.table; import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieIndexCommitMetadata; +import org.apache.hudi.avro.model.HoodieIndexPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -32,17 +40,38 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieNotSupportedException; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.io.HoodieCreateHandle; +import org.apache.hudi.io.HoodieMergeHandle; 
+import org.apache.hudi.io.HoodieSortedMergeHandle; +import org.apache.hudi.io.HoodieWriteHandle; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; +import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; -import org.apache.hudi.table.action.clean.FlinkCleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; +import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor; import org.apache.hudi.table.action.commit.FlinkDeleteCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkInsertCommitActionExecutor; +import org.apache.hudi.table.action.commit.FlinkInsertOverwriteCommitActionExecutor; +import org.apache.hudi.table.action.commit.FlinkInsertOverwriteTableCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkInsertPreppedCommitActionExecutor; +import org.apache.hudi.table.action.commit.FlinkMergeHelper; import org.apache.hudi.table.action.commit.FlinkUpsertCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkUpsertPreppedCommitActionExecutor; -import org.apache.hudi.table.action.rollback.FlinkCopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; @@ -54,61 +83,190 @@ *

    * UPDATES - Produce a new version of the file, just replacing the updated records with new values */ -public class HoodieFlinkCopyOnWriteTable extends HoodieFlinkTable { +public class HoodieFlinkCopyOnWriteTable + extends HoodieFlinkTable implements HoodieCompactionHandler { + + private static final Logger LOG = LoggerFactory.getLogger(HoodieFlinkCopyOnWriteTable.class); - protected HoodieFlinkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { + public HoodieFlinkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { super(config, context, metaClient); } + /** + * Upsert a batch of new records into Hoodie table at the supplied instantTime. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param records hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + public HoodieWriteMetadata> upsert( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records) { + return new FlinkUpsertCommitActionExecutor<>(context, writeHandle, config, this, instantTime, records).execute(); + } + + /** + * Insert a batch of new records into Hoodie table at the supplied instantTime. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param records hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + public HoodieWriteMetadata> insert( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records) { + return new FlinkInsertCommitActionExecutor<>(context, writeHandle, config, this, instantTime, records).execute(); + } + + /** + * Deletes a list of {@link HoodieKey}s from the Hoodie table at the supplied instantTime. The {@link HoodieKey}s will be + * de-duped, and non-existent keys will be removed before deleting. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param keys {@link List} of {@link HoodieKey}s to be deleted + * @return HoodieWriteMetadata + */ + public HoodieWriteMetadata> delete( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List keys) { + return new FlinkDeleteCommitActionExecutor<>(context, writeHandle, config, this, instantTime, keys).execute(); + } + + /** + * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. + * + *

    This implementation requires that the input records are already tagged, and de-duped if needed. + * + *

    Specifies the write handle explicitly in order to have fine-grained control over + * the underlying file. + * + * @param context HoodieEngineContext + * @param writeHandle The write handle + * @param instantTime Instant Time for the action + * @param preppedRecords hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + public HoodieWriteMetadata> upsertPrepped( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> preppedRecords) { + return new FlinkUpsertPreppedCommitActionExecutor<>(context, writeHandle, config, this, instantTime, preppedRecords).execute(); + } + + /** + * Inserts the given prepared records into the Hoodie table, at the supplied instantTime. + * + *

    This implementation requires that the input records are already tagged, and de-duped if needed. + * + *

    Specifies the write handle explicitly in order to have fine grained control with + * the underneath file. + * + * @param context HoodieEngineContext + * @param instantTime Instant Time for the action + * @param preppedRecords hoodieRecords to upsert + * @return HoodieWriteMetadata + */ + public HoodieWriteMetadata> insertPrepped( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> preppedRecords) { + return new FlinkInsertPreppedCommitActionExecutor<>(context, writeHandle, config, this, instantTime, preppedRecords).execute(); + } + + @Override + public HoodieWriteMetadata> insertOverwrite( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records) { + return new FlinkInsertOverwriteCommitActionExecutor(context, writeHandle, config, this, instantTime, records).execute(); + } + + @Override + public HoodieWriteMetadata> insertOverwriteTable( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> records) { + return new FlinkInsertOverwriteTableCommitActionExecutor(context, writeHandle, config, this, instantTime, records).execute(); + } + @Override public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, List> records) { - return new FlinkUpsertCommitActionExecutor<>(context, config, this, instantTime, records).execute(); + throw new HoodieNotSupportedException("This method should not be invoked"); } @Override public HoodieWriteMetadata> insert(HoodieEngineContext context, String instantTime, List> records) { - return new FlinkInsertCommitActionExecutor<>(context, config, this, instantTime, records).execute(); + throw new HoodieNotSupportedException("This method should not be invoked"); } @Override public HoodieWriteMetadata> bulkInsert(HoodieEngineContext context, String instantTime, List> records, - Option>>> bulkInsertPartitioner) { + Option bulkInsertPartitioner) { throw new HoodieNotSupportedException("BulkInsert is not supported yet"); } @Override public HoodieWriteMetadata> delete(HoodieEngineContext context, String instantTime, List keys) { - return new FlinkDeleteCommitActionExecutor<>(context, config, this, instantTime, keys).execute(); + throw new HoodieNotSupportedException("This method should not be invoked"); + } + + @Override + public HoodieWriteMetadata deletePartitions(HoodieEngineContext context, String instantTime, List partitions) { + throw new HoodieNotSupportedException("DeletePartitions is not supported yet"); } @Override public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, List> preppedRecords) { - return new FlinkUpsertPreppedCommitActionExecutor<>(context, config, this, instantTime, preppedRecords).execute(); + throw new HoodieNotSupportedException("This method should not be invoked"); } @Override public HoodieWriteMetadata> insertPrepped(HoodieEngineContext context, String instantTime, List> preppedRecords) { - return new FlinkInsertPreppedCommitActionExecutor<>(context, config, this, instantTime, preppedRecords).execute(); + throw new HoodieNotSupportedException("This method should not be invoked"); } @Override public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, String instantTime, List> preppedRecords, - Option>>> bulkInsertPartitioner) { + Option bulkInsertPartitioner) { throw new HoodieNotSupportedException("BulkInsertPrepped is not supported yet"); } @Override public HoodieWriteMetadata> insertOverwrite(HoodieEngineContext context, String 
instantTime, List> records) { - throw new HoodieNotSupportedException("InsertOverWrite is not supported yet"); + throw new HoodieNotSupportedException("This method should not be invoked"); } @Override public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineContext context, String instantTime, List> records) { - throw new HoodieNotSupportedException("insertOverwriteTable is not supported yet"); + throw new HoodieNotSupportedException("This method should not be invoked"); } @Override @@ -117,10 +275,21 @@ public Option scheduleCompaction(HoodieEngineContext conte } @Override - public HoodieWriteMetadata> compact(HoodieEngineContext context, String compactionInstantTime) { + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); } + @Override + public Option scheduleClustering(final HoodieEngineContext context, final String instantTime, final Option> extraMetadata) { + return new ClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); + } + + @Override + public HoodieWriteMetadata> cluster(final HoodieEngineContext context, final String clusteringInstantTime) { + throw new HoodieNotSupportedException("Clustering is not supported on a Flink CopyOnWrite table"); + } + @Override public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngineContext context, Option> extraMetadata) { throw new HoodieNotSupportedException("Bootstrap is not supported yet"); @@ -131,14 +300,43 @@ public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { throw new HoodieNotSupportedException("Bootstrap is not supported yet"); } + /** + * @param context HoodieEngineContext + * @param instantTime Instant Time for scheduling cleaning + * @param extraMetadata additional metadata to write into plan + * @return + */ @Override - public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime) { - return new FlinkCleanActionExecutor(context, config, this, cleanInstantTime).execute(); + public Option scheduleCleaning(HoodieEngineContext context, String instantTime, Option> extraMetadata) { + return new CleanPlanActionExecutor(context, config, this, instantTime, extraMetadata).execute(); } @Override - public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, boolean deleteInstants) { - return new FlinkCopyOnWriteRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + public Option scheduleRollback(HoodieEngineContext context, String instantTime, HoodieInstant instantToRollback, + boolean skipTimelinePublish, boolean shouldRollbackUsingMarkers) { + return new BaseRollbackPlanActionExecutor(context, config, this, instantTime, instantToRollback, skipTimelinePublish, + shouldRollbackUsingMarkers).execute(); + } + + @Override + public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime, boolean skipLocking) { + return new CleanActionExecutor(context, config, this, cleanInstantTime).execute(); + } + + @Override + public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, + boolean deleteInstants, boolean skipLocking) { + return new CopyOnWriteRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants, skipLocking).execute(); + } + + @Override + public Option 
scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex) { + throw new HoodieNotSupportedException("Metadata indexing is not supported for a Flink table yet."); + } + + @Override + public Option index(HoodieEngineContext context, String indexInstantTime) { + throw new HoodieNotSupportedException("Metadata indexing is not supported for a Flink table yet."); } @Override @@ -146,8 +344,73 @@ public HoodieSavepointMetadata savepoint(HoodieEngineContext context, String ins throw new HoodieNotSupportedException("Savepoint is not supported yet"); } + @Override + public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { + throw new HoodieNotSupportedException("Restore is not supported yet"); + } + @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { throw new HoodieNotSupportedException("Savepoint and restore is not supported yet"); } + + // ------------------------------------------------------------------------- + // Used for compaction + // ------------------------------------------------------------------------- + @Override + public Iterator> handleUpdate( + String instantTime, String partitionPath, String fileId, + Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException { + // these are updates + HoodieMergeHandle upsertHandle = getUpdateHandle(instantTime, partitionPath, fileId, keyToNewRecords, oldDataFile); + return handleUpdateInternal(upsertHandle, instantTime, fileId); + } + + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, + String fileId) throws IOException { + if (upsertHandle.getOldFilePath() == null) { + throw new HoodieUpsertException( + "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); + } else { + FlinkMergeHelper.newInstance().runMerge(this, upsertHandle); + } + + // TODO(vc): This needs to be revisited + if (upsertHandle.getPartitionPath() == null) { + LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " + + upsertHandle.writeStatuses()); + } + + return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); + } + + protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId, + Map> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) { + Option keyGeneratorOpt = Option.empty(); + if (!config.populateMetaFields()) { + try { + keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieAvroKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))); + } catch (IOException e) { + throw new HoodieIOException("Only BaseKeyGenerator (or any key generator that extends from BaseKeyGenerator) are supported when meta " + + "columns are disabled. 
Please choose the right key generator if you wish to disable meta fields.", e); + } + } + if (requireSortedRecords()) { + return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId, + dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt); + } else { + return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId, + dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt); + } + } + + @Override + public Iterator> handleInsert( + String instantTime, String partitionPath, String fileId, + Map> recordMap) { + HoodieCreateHandle createHandle = + new HoodieCreateHandle(config, instantTime, this, partitionPath, fileId, recordMap, taskContextSupplier); + createHandle.write(); + return Collections.singletonList(createHandle.close()).iterator(); + } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java index 51ce54a42da4f..9b7d3447177eb 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkMergeOnReadTable.java @@ -18,14 +18,115 @@ package org.apache.hudi.table; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.FlinkAppendHandle; +import org.apache.hudi.io.HoodieWriteHandle; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.delta.FlinkUpsertDeltaCommitActionExecutor; +import org.apache.hudi.table.action.commit.delta.FlinkUpsertPreppedDeltaCommitActionExecutor; +import org.apache.hudi.table.action.compact.HoodieFlinkMergeOnReadTableCompactor; +import org.apache.hudi.table.action.compact.RunCompactionActionExecutor; +import org.apache.hudi.table.action.compact.ScheduleCompactionActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor; -public class HoodieFlinkMergeOnReadTable extends HoodieFlinkCopyOnWriteTable { - protected HoodieFlinkMergeOnReadTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { +import java.util.List; +import java.util.Map; + +/** + * Flink MERGE_ON_READ table. + */ +public class HoodieFlinkMergeOnReadTable + extends HoodieFlinkCopyOnWriteTable { + + HoodieFlinkMergeOnReadTable( + HoodieWriteConfig config, + HoodieEngineContext context, + HoodieTableMetaClient metaClient) { super(config, context, metaClient); } - // TODO not support yet. 
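+ // The overrides below require a FlinkAppendHandle and run the delta-commit executors, so records are appended rather than merged.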
+ + @Override + public HoodieWriteMetadata> upsert( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> hoodieRecords) { + ValidationUtils.checkArgument(writeHandle instanceof FlinkAppendHandle, + "MOR write handle should always be a FlinkAppendHandle"); + FlinkAppendHandle appendHandle = (FlinkAppendHandle) writeHandle; + return new FlinkUpsertDeltaCommitActionExecutor<>(context, appendHandle, config, this, instantTime, hoodieRecords).execute(); + } + + @Override + public HoodieWriteMetadata> upsertPrepped( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> preppedRecords) { + ValidationUtils.checkArgument(writeHandle instanceof FlinkAppendHandle, + "MOR write handle should always be a FlinkAppendHandle"); + FlinkAppendHandle appendHandle = (FlinkAppendHandle) writeHandle; + return new FlinkUpsertPreppedDeltaCommitActionExecutor<>(context, appendHandle, config, this, instantTime, preppedRecords).execute(); + } + + @Override + public HoodieWriteMetadata> insert( + HoodieEngineContext context, + HoodieWriteHandle writeHandle, + String instantTime, + List> hoodieRecords) { + if (writeHandle instanceof FlinkAppendHandle) { + FlinkAppendHandle appendHandle = (FlinkAppendHandle) writeHandle; + return new FlinkUpsertDeltaCommitActionExecutor<>(context, appendHandle, config, this, instantTime, hoodieRecords).execute(); + } else { + return super.insert(context, writeHandle, instantTime, hoodieRecords); + } + } + + @Override + public Option scheduleCompaction( + HoodieEngineContext context, + String instantTime, + Option> extraMetadata) { + ScheduleCompactionActionExecutor scheduleCompactionExecutor = new ScheduleCompactionActionExecutor( + context, config, this, instantTime, extraMetadata, + new HoodieFlinkMergeOnReadTableCompactor()); + return scheduleCompactionExecutor.execute(); + } + + @Override + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { + RunCompactionActionExecutor compactionExecutor = new RunCompactionActionExecutor( + context, config, this, compactionInstantTime, new HoodieFlinkMergeOnReadTableCompactor(), + new HoodieFlinkCopyOnWriteTable(config, context, getMetaClient())); + return convertMetadata(compactionExecutor.execute()); + } + + @Override + public Option scheduleRollback(HoodieEngineContext context, String instantTime, HoodieInstant instantToRollback, + boolean skipTimelinePublish, boolean shouldRollbackUsingMarkers) { + return new BaseRollbackPlanActionExecutor(context, config, this, instantTime, instantToRollback, skipTimelinePublish, + shouldRollbackUsingMarkers).execute(); + } + + @Override + public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, + boolean deleteInstants, boolean skipLocking) { + return new MergeOnReadRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants, + skipLocking).execute(); + } } + diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java index 3c09b38e64faa..4e7dbe36c4374 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java @@ -19,8 +19,9 @@ package org.apache.hudi.table; import org.apache.hudi.client.WriteStatus; -import 
org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -29,44 +30,76 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.index.FlinkHoodieIndex; +import org.apache.hudi.index.FlinkHoodieIndexFactory; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.metadata.FlinkHoodieBackedTableMetadataWriter; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import org.apache.avro.specific.SpecificRecordBase; import java.util.List; +/** + * Impl of a flink hoodie table. + */ public abstract class HoodieFlinkTable - extends HoodieTable>, List, List> { + extends HoodieTable>, List, List> + implements ExplicitWriteHandleTable { + protected HoodieFlinkTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { super(config, context, metaClient); } public static HoodieFlinkTable create(HoodieWriteConfig config, HoodieFlinkEngineContext context) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient( - context.getHadoopConf().get(), - config.getBasePath(), - true, - config.getConsistencyGuardConfig(), - Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion())) - ); + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(config.getBasePath()) + .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()).build(); return HoodieFlinkTable.create(config, context, metaClient); } public static HoodieFlinkTable create(HoodieWriteConfig config, HoodieFlinkEngineContext context, HoodieTableMetaClient metaClient) { + final HoodieFlinkTable hoodieFlinkTable; switch (metaClient.getTableType()) { case COPY_ON_WRITE: - return new HoodieFlinkCopyOnWriteTable<>(config, context, metaClient); + hoodieFlinkTable = new HoodieFlinkCopyOnWriteTable<>(config, context, metaClient); + break; case MERGE_ON_READ: - throw new HoodieNotSupportedException("MERGE_ON_READ is not supported yet"); + hoodieFlinkTable = new HoodieFlinkMergeOnReadTable<>(config, context, metaClient); + break; default: throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } + return hoodieFlinkTable; + } + + public static HoodieWriteMetadata> convertMetadata( + HoodieWriteMetadata> metadata) { + return metadata.clone(metadata.getWriteStatuses().collectAsList()); } @Override - protected HoodieIndex>, List, List> getIndex(HoodieWriteConfig config, HoodieEngineContext context) { - return FlinkHoodieIndex.createIndex((HoodieFlinkEngineContext) context, config); + protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context) { + return FlinkHoodieIndexFactory.createIndex((HoodieFlinkEngineContext) context, config); + } + + /** + * Fetch instance of {@link HoodieTableMetadataWriter}. 
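+ * Returns an empty option when the metadata table is disabled by the write config.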
+ * + * @return instance of {@link HoodieTableMetadataWriter} + */ + @Override + public Option getMetadataWriter(String triggeringInstantTimestamp, + Option actionMetadata) { + if (config.isMetadataTableEnabled()) { + return Option.of(FlinkHoodieBackedTableMetadataWriter.create(context.getHadoopConf().get(), config, + context, actionMetadata, Option.of(triggeringInstantTimestamp))); + } else { + return Option.empty(); + } } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkCleanActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkCleanActionExecutor.java deleted file mode 100644 index 010e2a16af4a7..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/clean/FlinkCleanActionExecutor.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.clean; - -import org.apache.hudi.avro.model.HoodieActionInstant; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieCleanStat; -import org.apache.hudi.common.model.CleanFileInfo; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import scala.Tuple2; - -public class FlinkCleanActionExecutor extends - BaseCleanActionExecutor>, List, List> { - - private static final Logger LOG = LogManager.getLogger(FlinkCleanActionExecutor.class); - - public FlinkCleanActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime) { - super(context, config, table, instantTime); - } - - @Override - List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) { - - Iterator> filesToBeDeletedPerPartition = cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream() - .flatMap(x -> x.getValue().stream().map(y -> new Tuple2<>(x.getKey(), new CleanFileInfo(y.getFilePath(), 
y.getIsBootstrapBaseFile())))).iterator(); - - Stream> partitionCleanStats = - deleteFilesFunc(filesToBeDeletedPerPartition, table) - .collect(Collectors.groupingBy(Pair::getLeft)) - .entrySet().stream() - .map(x -> new Tuple2(x.getKey(), x.getValue().stream().map(y -> y.getRight()).reduce(PartitionCleanStat::merge).get())); - - Map partitionCleanStatsMap = partitionCleanStats - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); - - // Return PartitionCleanStat for each partition passed. - return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> { - PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath) - ? partitionCleanStatsMap.get(partitionPath) - : new PartitionCleanStat(partitionPath); - HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain(); - return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath) - .withEarliestCommitRetained(Option.ofNullable( - actionInstant != null - ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), - actionInstant.getAction(), actionInstant.getTimestamp()) - : null)) - .withDeletePathPattern(partitionCleanStat.deletePathPatterns()) - .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles()) - .withFailedDeletes(partitionCleanStat.failedDeleteFiles()) - .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns()) - .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles()) - .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles()) - .build(); - }).collect(Collectors.toList()); - } - - private static Stream> deleteFilesFunc(Iterator> iter, HoodieTable table) { - Map partitionCleanStatMap = new HashMap<>(); - FileSystem fs = table.getMetaClient().getFs(); - - while (iter.hasNext()) { - Tuple2 partitionDelFileTuple = iter.next(); - String partitionPath = partitionDelFileTuple._1(); - Path deletePath = new Path(partitionDelFileTuple._2().getFilePath()); - String deletePathStr = deletePath.toString(); - Boolean deletedFileResult = null; - try { - deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); - } catch (IOException e) { - LOG.error("Delete file failed"); - } - if (!partitionCleanStatMap.containsKey(partitionPath)) { - partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath)); - } - boolean isBootstrapBasePathFile = partitionDelFileTuple._2().isBootstrapBaseFile(); - PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); - if (isBootstrapBasePathFile) { - // For Bootstrap Base file deletions, store the full file path. 
- partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true); - partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true); - } else { - partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false); - partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false); - } - } - return partitionCleanStatMap.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java index 1d40b8e95a539..51138cd29daa6 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java @@ -19,119 +19,109 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.execution.FlinkLazyInsertIterable; -import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.io.ExplicitWriteHandleFactory; +import org.apache.hudi.io.HoodieCreateHandle; import org.apache.hudi.io.HoodieMergeHandle; -import org.apache.hudi.io.HoodieSortedMergeHandle; +import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.WorkloadProfile; -import org.apache.hudi.table.WorkloadStat; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.time.Duration; -import java.time.Instant; import java.util.Collections; -import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import scala.Tuple2; - +/** + * With {@code org.apache.hudi.operator.partitioner.BucketAssigner}, each hoodie record + * is tagged with a bucket ID (partition path + fileID) in streaming way. All the records consumed by this + * executor should be tagged with bucket IDs and belong to one data bucket. + * + *

    These bucket IDs make it possible to shuffle the records first by the bucket ID + * (see org.apache.hudi.operator.partitioner.BucketAssignerFunction), and this executor + * only needs to handle the data buffer that belongs to one data bucket once at a time. So there is no need to + * partition the buffer. + * + *
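+ * <p>Each call to {@code execute} therefore processes the records of exactly one data bucket and writes them through the single pre-created write handle. + * + *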

    Computing the records batch locations all at a time is a pressure to the engine, + * we should avoid that in streaming system. + */ public abstract class BaseFlinkCommitActionExecutor extends BaseCommitActionExecutor>, List, List, HoodieWriteMetadata> { private static final Logger LOG = LogManager.getLogger(BaseFlinkCommitActionExecutor.class); + protected HoodieWriteHandle writeHandle; + public BaseFlinkCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType) { - super(context, config, table, instantTime, operationType, Option.empty()); + this(context, writeHandle, config, table, instantTime, operationType, Option.empty()); } public BaseFlinkCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType, Option extraMetadata) { super(context, config, table, instantTime, operationType, extraMetadata); + this.writeHandle = writeHandle; } @Override public HoodieWriteMetadata> execute(List> inputRecords) { HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); - WorkloadProfile profile = null; - if (isWorkloadProfileNeeded()) { - profile = new WorkloadProfile(buildProfile(inputRecords)); - LOG.info("Workload profile :" + profile); - try { - saveWorkloadProfileMetadataToInflight(profile, instantTime); - } catch (Exception e) { - HoodieTableMetaClient metaClient = table.getMetaClient(); - HoodieInstant inflightInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, metaClient.getCommitActionType(), instantTime); - try { - if (!metaClient.getFs().exists(new Path(metaClient.getMetaPath(), inflightInstant.getFileName()))) { - throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", e); - } - } catch (IOException ex) { - LOG.error("Check file exists failed"); - throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", ex); - } - } - } - - final Partitioner partitioner = getPartitioner(profile); - Map>> partitionedRecords = partition(inputRecords, partitioner); - List writeStatuses = new LinkedList<>(); - partitionedRecords.forEach((partition, records) -> { - if (WriteOperationType.isChangingRecords(operationType)) { - handleUpsertPartition(instantTime, partition, records.iterator(), partitioner).forEachRemaining(writeStatuses::addAll); - } else { - handleInsertPartition(instantTime, partition, records.iterator(), partitioner).forEachRemaining(writeStatuses::addAll); - } - }); - updateIndex(writeStatuses, result); + final HoodieRecord record = inputRecords.get(0); + final String partitionPath = record.getPartitionPath(); + final String fileId = record.getCurrentLocation().getFileId(); + final BucketType bucketType = record.getCurrentLocation().getInstantTime().equals("I") + ? 
BucketType.INSERT + : BucketType.UPDATE; + handleUpsertPartition( + instantTime, + partitionPath, + fileId, + bucketType, + inputRecords.iterator()) + .forEachRemaining(writeStatuses::addAll); + setUpWriteMetadata(writeStatuses, result); return result; } - protected void updateIndex(List writeStatuses, HoodieWriteMetadata> result) { - Instant indexStartTime = Instant.now(); - // Update the index back - List statuses = table.getIndex().updateLocation(writeStatuses, context, table); - result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); + protected void setUpWriteMetadata( + List statuses, + HoodieWriteMetadata> result) { + // No need to update the index because the update happens before the write. result.setWriteStatuses(statuses); + result.setIndexUpdateDuration(Duration.ZERO); } @Override @@ -139,61 +129,17 @@ protected String getCommitActionType() { return table.getMetaClient().getCommitActionType(); } - private Partitioner getPartitioner(WorkloadProfile profile) { - if (WriteOperationType.isChangingRecords(operationType)) { - return getUpsertPartitioner(profile); - } else { - return getInsertPartitioner(profile); - } - } - - private Map>> partition(List> dedupedRecords, Partitioner partitioner) { - Map>, HoodieRecord>>> partitionedMidRecords = dedupedRecords - .stream() - .map(record -> new Tuple2<>(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)) - .collect(Collectors.groupingBy(x -> partitioner.getPartition(x._1))); - Map>> results = new LinkedHashMap<>(); - partitionedMidRecords.forEach((key, value) -> results.put(key, value.stream().map(x -> x._2).collect(Collectors.toList()))); - return results; - } - - protected Pair, WorkloadStat> buildProfile(List> inputRecords) { - HashMap partitionPathStatMap = new HashMap<>(); - WorkloadStat globalStat = new WorkloadStat(); - - Map>, Long> partitionLocationCounts = inputRecords - .stream() - .map(record -> Pair.of( - Pair.of(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record)) - .collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting())); - - for (Map.Entry>, Long> e : partitionLocationCounts.entrySet()) { - String partitionPath = e.getKey().getLeft(); - Long count = e.getValue(); - Option locOption = e.getKey().getRight(); - - if (!partitionPathStatMap.containsKey(partitionPath)) { - partitionPathStatMap.put(partitionPath, new WorkloadStat()); - } - - if (locOption.isPresent()) { - // update - partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count); - globalStat.addUpdates(locOption.get(), count); - } else { - // insert - partitionPathStatMap.get(partitionPath).addInserts(count); - globalStat.addInserts(count); - } - } - return Pair.of(partitionPathStatMap, globalStat); - } - @Override protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { commit(extraMetadata, result, result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList())); } + protected void setCommitMetadata(HoodieWriteMetadata> result) { + result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList()), + result.getPartitionToReplaceFileIds(), + extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()))); + } + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List writeStats) { String actionType = getCommitActionType(); LOG.info("Committing " + instantTime + ", action Type " 
+ actionType); @@ -201,12 +147,12 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta result.setWriteStats(writeStats); // Finalize write finalizeWrite(instantTime, writeStats, result); - try { LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType()); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), - extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + HoodieCommitMetadata metadata = result.getCommitMetadata().get(); + + writeTableMetadata(metadata, actionType); activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); @@ -218,41 +164,47 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta } } - protected Map> getPartitionToReplacedFileIds(List writeStatuses) { - return Collections.emptyMap(); - } - @Override protected boolean isWorkloadProfileNeeded() { return true; } @SuppressWarnings("unchecked") - protected Iterator> handleUpsertPartition(String instantTime, Integer partition, Iterator recordItr, - Partitioner partitioner) { - UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; - BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); - BucketType btype = binfo.bucketType; + protected Iterator> handleUpsertPartition( + String instantTime, + String partitionPath, + String fileIdHint, + BucketType bucketType, + Iterator recordItr) { try { - if (btype.equals(BucketType.INSERT)) { - return handleInsert(binfo.fileIdPrefix, recordItr); - } else if (btype.equals(BucketType.UPDATE)) { - return handleUpdate(binfo.partitionPath, binfo.fileIdPrefix, recordItr); + if (this.writeHandle instanceof HoodieCreateHandle) { + // During one checkpoint interval, an insert record could also be updated, + // for example, for an operation sequence of a record: + // I, U, | U, U + // - batch1 - | - batch2 - + // the first batch(batch1) operation triggers an INSERT bucket, + // the second batch batch2 tries to reuse the same bucket + // and append instead of UPDATE. 
+ return handleInsert(fileIdHint, recordItr); + } else if (this.writeHandle instanceof HoodieMergeHandle) { + return handleUpdate(partitionPath, fileIdHint, recordItr); } else { - throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition); + switch (bucketType) { + case INSERT: + return handleInsert(fileIdHint, recordItr); + case UPDATE: + return handleUpdate(partitionPath, fileIdHint, recordItr); + default: + throw new AssertionError(); + } } } catch (Throwable t) { - String msg = "Error upserting bucketType " + btype + " for partition :" + partition; + String msg = "Error upserting bucketType " + bucketType + " for partition :" + partitionPath; LOG.error(msg, t); throw new HoodieUpsertException(msg, t); } } - protected Iterator> handleInsertPartition(String instantTime, Integer partition, Iterator recordItr, - Partitioner partitioner) { - return handleUpsertPartition(instantTime, partition, recordItr, partitioner); - } - @Override public Iterator> handleUpdate(String partitionPath, String fileId, Iterator> recordItr) @@ -263,11 +215,11 @@ public Iterator> handleUpdate(String partitionPath, String fil return Collections.singletonList((List) Collections.EMPTY_LIST).iterator(); } // these are updates - HoodieMergeHandle upsertHandle = getUpdateHandle(partitionPath, fileId, recordItr); + HoodieMergeHandle upsertHandle = (HoodieMergeHandle) this.writeHandle; return handleUpdateInternal(upsertHandle, fileId); } - protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) throws IOException { if (upsertHandle.getOldFilePath() == null) { throw new HoodieUpsertException( @@ -277,26 +229,12 @@ protected Iterator> handleUpdateInternal(HoodieMergeHandle ups } // TODO(vc): This needs to be revisited - if (upsertHandle.getWriteStatus().getPartitionPath() == null) { + if (upsertHandle.getPartitionPath() == null) { LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.getWriteStatus()); - } - return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); - } - - protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator> recordItr) { - if (table.requireSortedRecords()) { - return new HoodieSortedMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier); - } else { - return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier); + + upsertHandle.writeStatuses()); } - } - protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, - Map> keyToNewRecords, - HoodieBaseFile dataFileToBeMerged) { - return new HoodieMergeHandle<>(config, instantTime, table, keyToNewRecords, - partitionPath, fileId, dataFileToBeMerged, taskContextSupplier); + return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); } @Override @@ -308,24 +246,6 @@ public Iterator> handleInsert(String idPfx, Iterator) Collections.EMPTY_LIST).iterator(); } return new FlinkLazyInsertIterable<>(recordItr, true, config, instantTime, table, idPfx, - taskContextSupplier, new CreateHandleFactory<>()); + taskContextSupplier, new ExplicitWriteHandleFactory<>(writeHandle)); } - - /** - * Provides a partitioner to perform the upsert operation, based on the workload profile.
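As the hunk above shows, the Flink executor now dispatches on the concrete write handle handed over by the pipeline: a HoodieCreateHandle forces the insert path (so an insert bucket keeps absorbing later updates within the same checkpoint interval, per the comment above), a HoodieMergeHandle forces the update path, and the bucket type is only a fallback. A minimal, self-contained sketch of that decision, using simplified stand-in types rather than Hudi's HoodieWriteHandle hierarchy:

    // Illustrative sketch only; CreateHandle/MergeHandle stand in for Hudi's handle classes.
    final class HandleDispatchSketch {
      enum BucketType { INSERT, UPDATE }

      interface WriteHandle {}
      static final class CreateHandle implements WriteHandle {}
      static final class MergeHandle implements WriteHandle {}

      /** Mirrors the handle-type-first, bucket-type-as-fallback decision above. */
      static String dispatch(WriteHandle writeHandle, BucketType bucketType) {
        if (writeHandle instanceof CreateHandle) {
          // The insert bucket is reused even when later records in the interval are updates.
          return "insert";
        } else if (writeHandle instanceof MergeHandle) {
          return "update";
        }
        switch (bucketType) {
          case INSERT:
            return "insert";
          case UPDATE:
            return "update";
          default:
            throw new AssertionError();
        }
      }

      public static void main(String[] args) {
        System.out.println(dispatch(new CreateHandle(), BucketType.UPDATE)); // insert
        System.out.println(dispatch(new MergeHandle(), BucketType.INSERT));  // update
      }
    }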
- */ - public Partitioner getUpsertPartitioner(WorkloadProfile profile) { - if (profile == null) { - throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); - } - return new UpsertPartitioner(profile, context, table, config); - } - - /** - * Provides a partitioner to perform the insert operation, based on the workload profile. - */ - public Partitioner getInsertPartitioner(WorkloadProfile profile) { - return getUpsertPartitioner(profile); - } - } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteCommitActionExecutor.java index a31ab4ee15d70..23e3c01eac171 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteCommitActionExecutor.java @@ -19,23 +19,30 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import java.util.List; +/** + * Flink delete commit action executor. + */ public class FlinkDeleteCommitActionExecutor> extends BaseFlinkCommitActionExecutor { private final List keys; public FlinkDeleteCommitActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, HoodieTable table, - String instantTime, List keys) { - super(context, config, table, instantTime, WriteOperationType.DELETE); + HoodieWriteHandle writeHandle, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List keys) { + super(context, writeHandle, config, table, instantTime, WriteOperationType.DELETE); this.keys = keys; } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java index 57a87c412fa2a..f6b172e125b5b 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java @@ -19,8 +19,10 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.collection.Pair; @@ -40,9 +42,12 @@ import java.util.List; import java.util.stream.Collectors; +/** + * Flink delete helper. 
+ */ @SuppressWarnings("checkstyle:LineLength") public class FlinkDeleteHelper extends - AbstractDeleteHelper>, List, List, R> { + BaseDeleteHelper>, List, List, R> { private FlinkDeleteHelper() { } @@ -92,11 +97,10 @@ public HoodieWriteMetadata> execute(String instantTime, } List> dedupedRecords = - dedupedKeys.stream().map(key -> new HoodieRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); + dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); Instant beginTag = Instant.now(); // perform index look up to get existing location of records - List> taggedRecords = - table.getIndex().tagLocation(dedupedRecords, context, table); + List> taggedRecords = table.getIndex().tagLocation(HoodieListData.eager(dedupedRecords), context, table).collectAsList(); Duration tagLocationDuration = Duration.between(beginTag, Instant.now()); // filter out non existent keys/records diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertCommitActionExecutor.java index 1752960321e0b..3ae6802e7ce7a 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertCommitActionExecutor.java @@ -19,32 +19,37 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import java.util.List; +/** + * Flink insert commit action executor. 
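FlinkDeleteHelper above models a delete as an upsert of EmptyHoodieRecordPayload tombstones: the incoming keys are deduplicated, tagged against the index, and any key that does not exist in the table is dropped before the delete executor runs. A toy sketch of that flow, with a plain map standing in for the index and the empty payload omitted (the types here are illustrative, not Hudi's):

    import java.util.List;
    import java.util.Map;
    import java.util.Optional;
    import java.util.stream.Collectors;

    final class DeleteAsUpsertSketch {
      record Key(String recordKey, String partitionPath) {}
      record TaggedRecord(Key key, Optional<String> fileId) {}

      /** Deduplicate keys, tag them against the index, and keep only keys that actually exist. */
      static List<TaggedRecord> prepareDeletes(List<Key> keys, Map<Key, String> index) {
        return keys.stream()
            .distinct()
            .map(k -> new TaggedRecord(k, Optional.ofNullable(index.get(k))))
            .filter(r -> r.fileId().isPresent()) // non-existent keys are silently skipped
            .collect(Collectors.toList());
      }

      public static void main(String[] args) {
        Map<Key, String> index = Map.of(new Key("k1", "2023/01/01"), "file-0");
        List<TaggedRecord> toDelete = prepareDeletes(
            List.of(new Key("k1", "2023/01/01"), new Key("missing", "2023/01/01")), index);
        System.out.println(toDelete.size()); // 1 -- only the key present in the table is deleted
      }
    }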
+ */ public class FlinkInsertCommitActionExecutor> extends BaseFlinkCommitActionExecutor { private List> inputRecords; public FlinkInsertCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, HoodieWriteConfig config, HoodieTable table, String instantTime, List> inputRecords) { - super(context, config, table, instantTime, WriteOperationType.INSERT); + super(context, writeHandle, config, table, instantTime, WriteOperationType.INSERT); this.inputRecords = inputRecords; } @Override public HoodieWriteMetadata> execute() { return FlinkWriteHelper.newInstance().write(instantTime, inputRecords, context, table, - config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, false); + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType); } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertOverwriteCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertOverwriteCommitActionExecutor.java new file mode 100644 index 0000000000000..5f6d06ea544da --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertOverwriteCommitActionExecutor.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieWriteHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +/** + * Flink INSERT OVERWRITE commit action executor. 
+ */ +public class FlinkInsertOverwriteCommitActionExecutor> + extends BaseFlinkCommitActionExecutor { + + protected List> inputRecords; + + public FlinkInsertOverwriteCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List> inputRecords) { + this(context, writeHandle, config, table, instantTime, inputRecords, WriteOperationType.INSERT_OVERWRITE); + } + + public FlinkInsertOverwriteCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List> inputRecords, + WriteOperationType writeOperationType) { + super(context, writeHandle, config, table, instantTime, writeOperationType); + this.inputRecords = inputRecords; + } + + @Override + protected String getCommitActionType() { + return HoodieTimeline.REPLACE_COMMIT_ACTION; + } + + @Override + public HoodieWriteMetadata> execute() { + return FlinkWriteHelper.newInstance().write(instantTime, inputRecords, context, table, + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertOverwriteTableCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertOverwriteTableCommitActionExecutor.java new file mode 100644 index 0000000000000..f52b2d9c98339 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertOverwriteTableCommitActionExecutor.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieWriteHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +/** + * Flink INSERT OVERWRITE TABLE commit action executor. 
+ */ +public class FlinkInsertOverwriteTableCommitActionExecutor> + extends FlinkInsertOverwriteCommitActionExecutor { + + public FlinkInsertOverwriteTableCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List> inputRecords) { + super(context, writeHandle, config, table, instantTime, inputRecords, WriteOperationType.INSERT_OVERWRITE_TABLE); + } + + @Override + public HoodieWriteMetadata> execute() { + return FlinkWriteHelper.newInstance().write(instantTime, inputRecords, context, table, + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertPreppedCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertPreppedCommitActionExecutor.java index 96fcd06a64ecc..240b04d7eb6fd 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertPreppedCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkInsertPreppedCommitActionExecutor.java @@ -19,24 +19,29 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import java.util.List; +/** + * Flink insert prepped commit action executor. 
+ */ public class FlinkInsertPreppedCommitActionExecutor> extends BaseFlinkCommitActionExecutor { private final List> preppedRecords; public FlinkInsertPreppedCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, HoodieWriteConfig config, HoodieTable table, String instantTime, List> preppedRecords) { - super(context, config, table, instantTime, WriteOperationType.INSERT_PREPPED); + super(context, writeHandle, config, table, instantTime, WriteOperationType.INSERT_PREPPED); this.preppedRecords = preppedRecords; } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java index d34aca22f049c..868290507f451 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java @@ -18,7 +18,10 @@ package org.apache.hudi.table.action.commit; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.utils.MergingIterator; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -42,10 +45,12 @@ import java.io.IOException; import java.util.Iterator; +import java.util.List; -import scala.collection.immutable.List; - -public class FlinkMergeHelper extends AbstractMergeHelper>, +/** + * Flink merge helper. + */ +public class FlinkMergeHelper extends BaseMergeHelper>, List, List> { private FlinkMergeHelper() { @@ -61,38 +66,52 @@ public static FlinkMergeHelper newInstance() { @Override public void runMerge(HoodieTable>, List, List> table, - HoodieMergeHandle>, List, List> upsertHandle) throws IOException { - final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation(); - Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf()); - HoodieMergeHandle>, List, List> mergeHandle = upsertHandle; - HoodieBaseFile baseFile = mergeHandle.baseFileForMerge(); - + HoodieMergeHandle>, List, List> mergeHandle) throws IOException { final GenericDatumWriter gWriter; final GenericDatumReader gReader; Schema readSchema; + + Configuration hadoopConf = new Configuration(table.getHadoopConf()); + HoodieFileReader baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf, mergeHandle.getOldFilePath()); + HoodieFileReader bootstrapFileReader = null; + + final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation(); + HoodieBaseFile baseFile = mergeHandle.baseFileForMerge(); if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) { - readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema(); + readSchema = baseFileReader.getSchema(); gWriter = new GenericDatumWriter<>(readSchema); - gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetafields()); + gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields()); } else { gReader = null; gWriter = null; - readSchema = mergeHandle.getWriterSchemaWithMetafields(); + readSchema = mergeHandle.getWriterSchemaWithMetaFields(); } BoundedInMemoryExecutor wrapper = null; - HoodieFileReader reader = 
HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); try { final Iterator readerIterator; if (baseFile.getBootstrapBaseFile().isPresent()) { - readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation); + Path bootstrapFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath()); + Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf()); + // NOTE: It's important for us to rely on writer's schema here + // - When records will be read by Parquet reader, if schema will be decoded from the + // file itself by taking its Parquet one and converting it to Avro. This will be problematic + // w/ schema validations of the records since Avro's schemas also validate corresponding + // qualified names of the structs, which could not be reconstructed when converting from + // Parquet to Avro (b/c Parquet doesn't bear these) + Schema bootstrapSchema = externalSchemaTransformation ? bootstrapFileReader.getSchema() : mergeHandle.getWriterSchema(); + bootstrapFileReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, bootstrapFilePath); + readerIterator = new MergingIterator<>( + baseFileReader.getRecordIterator(readSchema), + bootstrapFileReader.getRecordIterator(bootstrapSchema), + (inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetaFields())); } else { - readerIterator = reader.getRecordIterator(readSchema); + readerIterator = baseFileReader.getRecordIterator(readSchema); } ThreadLocal encoderCache = new ThreadLocal<>(); ThreadLocal decoderCache = new ThreadLocal<>(); - wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), + wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> { if (!externalSchemaTransformation) { return record; @@ -103,14 +122,17 @@ public void runMerge(HoodieTable>, List, List } catch (Exception e) { throw new HoodieException(e); } finally { - if (reader != null) { - reader.close(); + // HUDI-2875: mergeHandle is not thread safe, we should totally terminate record inputting + // and executor firstly and then close mergeHandle. 
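In the bootstrap branch above, the base-file iterator and the bootstrap-file iterator are consumed in lockstep and each pair of records is stitched back into a single row (MergingIterator plus HoodieAvroUtils.stitchRecords against the writer schema with meta fields). A rough, self-contained sketch of that zipping pattern, with generic element types instead of Avro GenericRecord:

    import java.util.Iterator;
    import java.util.List;
    import java.util.function.BiFunction;

    /** Zips two equally sized iterators and combines each pair into one output element. */
    final class ZippingIteratorSketch<L, R, O> implements Iterator<O> {
      private final Iterator<L> left;
      private final Iterator<R> right;
      private final BiFunction<L, R, O> combiner;

      ZippingIteratorSketch(Iterator<L> left, Iterator<R> right, BiFunction<L, R, O> combiner) {
        this.left = left;
        this.right = right;
        this.combiner = combiner;
      }

      @Override
      public boolean hasNext() {
        // Both sides are expected to yield the same number of records.
        return left.hasNext() && right.hasNext();
      }

      @Override
      public O next() {
        return combiner.apply(left.next(), right.next());
      }

      public static void main(String[] args) {
        Iterator<String> base = List.of("k1", "k2").iterator();
        Iterator<Integer> bootstrap = List.of(1, 2).iterator();
        new ZippingIteratorSketch<String, Integer, String>(base, bootstrap, (k, v) -> k + "=" + v)
            .forEachRemaining(System.out::println); // k1=1, k2=2
      }
    }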
+ baseFileReader.close(); + if (bootstrapFileReader != null) { + bootstrapFileReader.close(); } - mergeHandle.close(); if (null != wrapper) { wrapper.shutdownNow(); + wrapper.awaitTermination(); } + mergeHandle.close(); } } - } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertCommitActionExecutor.java index 24659c5fe40cd..1e7dbb85b1971 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertCommitActionExecutor.java @@ -19,32 +19,37 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import java.util.List; +/** + * Flink upsert commit action executor. + */ public class FlinkUpsertCommitActionExecutor> extends BaseFlinkCommitActionExecutor { private List> inputRecords; public FlinkUpsertCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, HoodieWriteConfig config, HoodieTable table, String instantTime, List> inputRecords) { - super(context, config, table, instantTime, WriteOperationType.UPSERT); + super(context, writeHandle, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecords = inputRecords; } @Override public HoodieWriteMetadata> execute() { return FlinkWriteHelper.newInstance().write(instantTime, inputRecords, context, table, - config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, true); + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, operationType); } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertPreppedCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertPreppedCommitActionExecutor.java index 89540804598af..8fb7bc6de2c8f 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertPreppedCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkUpsertPreppedCommitActionExecutor.java @@ -19,24 +19,29 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.HoodieWriteHandle; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import java.util.List; +/** + * Flink upsert prepped commit action executor. 
+ */ public class FlinkUpsertPreppedCommitActionExecutor> extends BaseFlinkCommitActionExecutor { private final List> preppedRecords; public FlinkUpsertPreppedCommitActionExecutor(HoodieEngineContext context, + HoodieWriteHandle writeHandle, HoodieWriteConfig config, HoodieTable table, String instantTime, List> preppedRecords) { - super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); + super(context, writeHandle, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); this.preppedRecords = preppedRecords; } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java index df106ce8d585c..ffce7fa4023c6 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java @@ -19,18 +19,37 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import java.time.Duration; +import java.time.Instant; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.stream.Collectors; -public class FlinkWriteHelper extends AbstractWriteHelper>, +/** + * Overrides the {@link #write} method to not look up index and partition the records, because + * with {@code org.apache.hudi.operator.partitioner.BucketAssigner}, each hoodie record + * is tagged with a bucket ID (partition path + fileID) in streaming way. The FlinkWriteHelper only hands over + * the records to the action executor {@link BaseCommitActionExecutor} to execute. + * + *

    Computing the records batch locations all at a time is a pressure to the engine, + * we should avoid that in streaming system. + */ +public class FlinkWriteHelper extends BaseWriteHelper>, List, List, R> { private FlinkWriteHelper() { @@ -45,24 +64,51 @@ public static FlinkWriteHelper newInstance() { } @Override - public List> deduplicateRecords(List> records, - HoodieIndex>, List, List> index, - int parallelism) { - boolean isIndexingGlobal = index.isGlobal(); - Map>>> keyedRecords = records.stream().map(record -> { - HoodieKey hoodieKey = record.getKey(); - // If index used is global, then records are expected to differ in their partitionPath - Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; - return Pair.of(key, record); - }).collect(Collectors.groupingBy(Pair::getLeft)); + public HoodieWriteMetadata> write(String instantTime, List> inputRecords, HoodieEngineContext context, + HoodieTable>, List, List> table, boolean shouldCombine, int shuffleParallelism, + BaseCommitActionExecutor>, List, List, R> executor, WriteOperationType operationType) { + try { + Instant lookupBegin = Instant.now(); + Duration indexLookupDuration = Duration.between(lookupBegin, Instant.now()); + + HoodieWriteMetadata> result = executor.execute(inputRecords); + result.setIndexLookupDuration(indexLookupDuration); + return result; + } catch (Throwable e) { + if (e instanceof HoodieUpsertException) { + throw (HoodieUpsertException) e; + } + throw new HoodieUpsertException("Failed to upsert for commit time " + instantTime, e); + } + } + + @Override + protected List> tag(List> dedupedRecords, HoodieEngineContext context, HoodieTable>, List, List> table) { + return table.getIndex().tagLocation(HoodieListData.eager(dedupedRecords), context, table).collectAsList(); + } + + @Override + public List> deduplicateRecords( + List> records, HoodieIndex index, int parallelism) { + // If index used is global, then records are expected to differ in their partitionPath + Map>> keyedRecords = records.stream() + .collect(Collectors.groupingBy(record -> record.getKey().getRecordKey())); + + return keyedRecords.values().stream().map(x -> x.stream().reduce((rec1, rec2) -> { + final T data1 = rec1.getData(); + final T data2 = rec2.getData(); - return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> { - @SuppressWarnings("unchecked") - T reducedData = (T) rec1.getData().preCombine(rec2.getData()); + @SuppressWarnings("unchecked") final T reducedData = (T) data2.preCombine(data1, CollectionUtils.emptyProps()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. - return new HoodieRecord(rec1.getKey(), reducedData); + boolean choosePrev = data1 == reducedData; + HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey(); + HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation(); + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(reducedKey, reducedData, operation); + // reuse the location from the first record. 
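deduplicateRecords above collapses all records that share a record key by reducing them with the payload's preCombine, carries over the key and operation of whichever record wins, and (as the next statement shows) reuses the bucket location of the first record so the fileID assigned by the bucket assigner is not lost. A simplified sketch of that reduction, where a plain record type and "larger ordering value wins" stand in for HoodieAvroRecord and preCombine:

    import java.util.List;
    import java.util.Map;
    import java.util.Objects;
    import java.util.stream.Collectors;

    final class DedupSketch {
      // Stand-in for a Hudi record: key, payload ordering value, assigned bucket/fileId.
      record Rec(String key, long orderingVal, String location) {}

      /** Keep one record per key; the winner's data, the first record's location. */
      static List<Rec> deduplicate(List<Rec> records) {
        Map<String, List<Rec>> byKey = records.stream().collect(Collectors.groupingBy(Rec::key));
        return byKey.values().stream()
            .map(group -> group.stream()
                .reduce((first, second) -> {
                  // preCombine stand-in: the later ordering value wins.
                  Rec winner = second.orderingVal() >= first.orderingVal() ? second : first;
                  // Mirror setCurrentLocation(rec1.getCurrentLocation()): keep the first location.
                  return new Rec(winner.key(), winner.orderingVal(), first.location());
                })
                .orElse(null))
            .filter(Objects::nonNull)
            .collect(Collectors.toList());
      }

      public static void main(String[] args) {
        List<Rec> out = deduplicate(List.of(
            new Rec("k1", 1, "bucket-0"),
            new Rec("k1", 5, "unassigned"),
            new Rec("k2", 2, "bucket-1")));
        System.out.println(out); // k1 keeps bucket-0 with orderingVal 5; k2 is untouched
      }
    }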
+ hoodieRecord.setCurrentLocation(rec1.getCurrentLocation()); + return hoodieRecord; }).orElse(null)).filter(Objects::nonNull).collect(Collectors.toList()); } } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java deleted file mode 100644 index 2bcd3b2a7189e..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.NumericUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.WorkloadProfile; -import org.apache.hudi.table.WorkloadStat; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import scala.Tuple2; - -/** - * Packs incoming records to be upserted, into buckets. - */ -public class UpsertPartitioner> implements Partitioner { - - private static final Logger LOG = LogManager.getLogger(UpsertPartitioner.class); - - /** - * List of all small files to be corrected. - */ - protected List smallFiles = new ArrayList<>(); - /** - * Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into. - */ - private int totalBuckets = 0; - /** - * Stat for the current workload. Helps in determining inserts, upserts etc. - */ - private WorkloadProfile profile; - /** - * Helps decide which bucket an incoming update should go to. - */ - private HashMap updateLocationToBucket; - /** - * Helps us pack inserts into 1 or more buckets depending on number of incoming records. - */ - private HashMap> partitionPathToInsertBucketInfos; - /** - * Remembers what type each bucket is for later. 
- */ - private HashMap bucketInfoMap; - - protected final HoodieTable table; - - protected final HoodieWriteConfig config; - - public UpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, HoodieTable table, - HoodieWriteConfig config) { - updateLocationToBucket = new HashMap<>(); - partitionPathToInsertBucketInfos = new HashMap<>(); - bucketInfoMap = new HashMap<>(); - this.profile = profile; - this.table = table; - this.config = config; - assignUpdates(profile); - assignInserts(profile, context); - - LOG.info("Total Buckets :" + totalBuckets + ", buckets info => " + bucketInfoMap + ", \n" - + "Partition to insert buckets => " + partitionPathToInsertBucketInfos + ", \n" - + "UpdateLocations mapped to buckets =>" + updateLocationToBucket); - } - - private void assignUpdates(WorkloadProfile profile) { - // each update location gets a partition - Set> partitionStatEntries = profile.getPartitionPathStatMap().entrySet(); - for (Map.Entry partitionStat : partitionStatEntries) { - for (Map.Entry> updateLocEntry : - partitionStat.getValue().getUpdateLocationToCount().entrySet()) { - addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey()); - } - } - } - - private int addUpdateBucket(String partitionPath, String fileIdHint) { - int bucket = totalBuckets; - updateLocationToBucket.put(fileIdHint, bucket); - BucketInfo bucketInfo = new BucketInfo(); - bucketInfo.bucketType = BucketType.UPDATE; - bucketInfo.fileIdPrefix = fileIdHint; - bucketInfo.partitionPath = partitionPath; - bucketInfoMap.put(totalBuckets, bucketInfo); - totalBuckets++; - return bucket; - } - - private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) { - // for new inserts, compute buckets depending on how many records we have for each partition - Set partitionPaths = profile.getPartitionPaths(); - long averageRecordSize = - averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(), - config); - LOG.info("AvgRecordSize => " + averageRecordSize); - - Map> partitionSmallFilesMap = - getSmallFilesForPartitions(new ArrayList(partitionPaths), context); - - for (String partitionPath : partitionPaths) { - WorkloadStat pStat = profile.getWorkloadStat(partitionPath); - if (pStat.getNumInserts() > 0) { - - List smallFiles = partitionSmallFilesMap.get(partitionPath); - this.smallFiles.addAll(smallFiles); - - LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles); - - long totalUnassignedInserts = pStat.getNumInserts(); - List bucketNumbers = new ArrayList<>(); - List recordsPerBucket = new ArrayList<>(); - - // first try packing this into one of the smallFiles - for (SmallFile smallFile : smallFiles) { - long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, - totalUnassignedInserts); - if (recordsToAppend > 0 && totalUnassignedInserts > 0) { - // create a new bucket or re-use an existing bucket - int bucket; - if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { - bucket = updateLocationToBucket.get(smallFile.location.getFileId()); - LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket); - } else { - bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId()); - LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); - } - bucketNumbers.add(bucket); - recordsPerBucket.add(recordsToAppend); - totalUnassignedInserts -= recordsToAppend; - } - } - - // if we have 
anything more, create new insert buckets, like normal - if (totalUnassignedInserts > 0) { - long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize(); - if (config.shouldAutoTuneInsertSplits()) { - insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize; - } - - int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket); - LOG.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts - + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket); - for (int b = 0; b < insertBuckets; b++) { - bucketNumbers.add(totalBuckets); - recordsPerBucket.add(totalUnassignedInserts / insertBuckets); - BucketInfo bucketInfo = new BucketInfo(); - bucketInfo.bucketType = BucketType.INSERT; - bucketInfo.partitionPath = partitionPath; - bucketInfo.fileIdPrefix = FSUtils.createNewFileIdPfx(); - bucketInfoMap.put(totalBuckets, bucketInfo); - totalBuckets++; - } - } - - // Go over all such buckets, and assign weights as per amount of incoming inserts. - List insertBuckets = new ArrayList<>(); - double curentCumulativeWeight = 0; - for (int i = 0; i < bucketNumbers.size(); i++) { - InsertBucket bkt = new InsertBucket(); - bkt.bucketNumber = bucketNumbers.get(i); - bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts(); - curentCumulativeWeight += bkt.weight; - insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, curentCumulativeWeight)); - } - LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); - partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets); - } - } - } - - private Map> getSmallFilesForPartitions(List partitionPaths, HoodieEngineContext context) { - Map> partitionSmallFilesMap = new HashMap<>(); - if (partitionPaths != null && partitionPaths.size() > 0) { - context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions"); - partitionSmallFilesMap = context.mapToPair(partitionPaths, partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath)), 0); - } - return partitionSmallFilesMap; - } - - /** - * Returns a list of small files in the given partition path. 
- */ - protected List getSmallFiles(String partitionPath) { - - // smallFiles only for partitionPath - List smallFileLocations = new ArrayList<>(); - - HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants(); - - if (!commitTimeline.empty()) { // if we have some commits - HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); - List allFiles = table.getBaseFileOnlyView() - .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList()); - - for (HoodieBaseFile file : allFiles) { - if (file.getFileSize() < config.getParquetSmallFileLimit()) { - String filename = file.getFileName(); - SmallFile sf = new SmallFile(); - sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)); - sf.sizeBytes = file.getFileSize(); - smallFileLocations.add(sf); - } - } - } - - return smallFileLocations; - } - - public BucketInfo getBucketInfo(int bucketNumber) { - return bucketInfoMap.get(bucketNumber); - } - - public List getInsertBuckets(String partitionPath) { - return partitionPathToInsertBucketInfos.get(partitionPath); - } - - @Override - public int getNumPartitions() { - return totalBuckets; - } - - @Override - public int getPartition(Object key) { - Tuple2> keyLocation = - (Tuple2>) key; - if (keyLocation._2().isPresent()) { - HoodieRecordLocation location = keyLocation._2().get(); - return updateLocationToBucket.get(location.getFileId()); - } else { - String partitionPath = keyLocation._1().getPartitionPath(); - List targetBuckets = partitionPathToInsertBucketInfos.get(partitionPath); - // pick the target bucket to use based on the weights. - final long totalInserts = Math.max(1, profile.getWorkloadStat(partitionPath).getNumInserts()); - final long hashOfKey = NumericUtils.getMessageDigestHash("MD5", keyLocation._1().getRecordKey()); - final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts; - - int index = Collections.binarySearch(targetBuckets, new InsertBucketCumulativeWeightPair(new InsertBucket(), r)); - - if (index >= 0) { - return targetBuckets.get(index).getKey().bucketNumber; - } - - if ((-1 * index - 1) < targetBuckets.size()) { - return targetBuckets.get((-1 * index - 1)).getKey().bucketNumber; - } - - // return first one, by default - return targetBuckets.get(0).getKey().bucketNumber; - } - } - - /** - * Obtains the average record size based on records written during previous commits. Used for estimating how many - * records pack into one file. - */ - protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, HoodieWriteConfig hoodieWriteConfig) { - long avgSize = hoodieWriteConfig.getCopyOnWriteRecordSizeEstimate(); - long fileSizeThreshold = (long) (hoodieWriteConfig.getRecordSizeEstimationThreshold() * hoodieWriteConfig.getParquetSmallFileLimit()); - try { - if (!commitTimeline.empty()) { - // Go over the reverse ordered commits to get a more recent estimate of average record size. 
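The removed UpsertPartitioner sized new insert buckets from averageBytesPerRecord, which walks completed commits newest-first and takes bytes/records from the first commit that wrote enough data to be trusted, falling back to the configured estimate otherwise. A compact sketch of that estimation loop over plain per-commit totals (no HoodieCommitMetadata):

    import java.util.List;

    final class RecordSizeEstimateSketch {
      // Stand-in for the totals read from each commit's metadata.
      record CommitTotals(long totalBytesWritten, long totalRecordsWritten) {}

      /** Newest-first scan; the first sufficiently large commit decides the average, else the fallback. */
      static long averageBytesPerRecord(List<CommitTotals> newestFirst, long sizeThreshold, long fallback) {
        for (CommitTotals c : newestFirst) {
          if (c.totalBytesWritten() > sizeThreshold && c.totalRecordsWritten() > 0) {
            return (long) Math.ceil((double) c.totalBytesWritten() / c.totalRecordsWritten());
          }
        }
        return fallback;
      }

      public static void main(String[] args) {
        List<CommitTotals> commits = List.of(
            new CommitTotals(10_000, 50),              // too small to trust, skipped
            new CommitTotals(200_000_000, 1_000_000)); // used: 200 bytes per record
        System.out.println(averageBytesPerRecord(commits, 100_000_000, 1024)); // 200
      }
    }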
- Iterator instants = commitTimeline.getReverseOrderedInstants().iterator(); - while (instants.hasNext()) { - HoodieInstant instant = instants.next(); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); - long totalBytesWritten = commitMetadata.fetchTotalBytesWritten(); - long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten(); - if (totalBytesWritten > fileSizeThreshold && totalRecordsWritten > 0) { - avgSize = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten); - break; - } - } - } - } catch (Throwable t) { - // make this fail safe. - LOG.error("Error trying to compute average bytes/record ", t); - } - return avgSize; - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/BaseFlinkDeltaCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/BaseFlinkDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..d8ea958266b82 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/BaseFlinkDeltaCommitActionExecutor.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit.delta; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.execution.FlinkLazyInsertIterable; +import org.apache.hudi.io.ExplicitWriteHandleFactory; +import org.apache.hudi.io.FlinkAppendHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.BaseFlinkCommitActionExecutor; + +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +/** + * Base flink delta commit action executor. 
+ */ +public abstract class BaseFlinkDeltaCommitActionExecutor> + extends BaseFlinkCommitActionExecutor { + + public BaseFlinkDeltaCommitActionExecutor(HoodieEngineContext context, + FlinkAppendHandle writeHandle, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + WriteOperationType operationType) { + super(context, writeHandle, config, table, instantTime, operationType); + } + + @Override + public Iterator> handleUpdate(String partitionPath, String fileId, Iterator> recordItr) { + FlinkAppendHandle appendHandle = (FlinkAppendHandle) writeHandle; + appendHandle.doAppend(); + List writeStatuses = appendHandle.close(); + return Collections.singletonList(writeStatuses).iterator(); + } + + @Override + public Iterator> handleInsert(String idPfx, Iterator> recordItr) { + return new FlinkLazyInsertIterable<>(recordItr, true, config, instantTime, table, + idPfx, taskContextSupplier, new ExplicitWriteHandleFactory(writeHandle)); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/FlinkUpsertDeltaCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/FlinkUpsertDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..c95a6c1c7b9f0 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/FlinkUpsertDeltaCommitActionExecutor.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit.delta; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.FlinkAppendHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.FlinkWriteHelper; + +import java.util.List; + +/** + * Flink upsert delta commit action executor. 
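BaseFlinkDeltaCommitActionExecutor above never opens a write handle itself: the pipeline hands it an already-opened FlinkAppendHandle, handleUpdate simply drives the append and closes it, and handleInsert feeds the same handle through ExplicitWriteHandleFactory. A simplified sketch of that explicit-handle pattern; AppendHandle here is an illustrative stand-in, not the FlinkAppendHandle API:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Iterator;
    import java.util.List;

    final class ExplicitHandleSketch {
      // Stand-in for a handle opened by the write task before the executor runs.
      interface AppendHandle {
        void append(String record);
        List<String> close(); // returns the write statuses this handle produced
      }

      /** The executor only drives the handle it was given; it never creates one. */
      static Iterator<List<String>> handleUpdate(AppendHandle handle, Iterator<String> records) {
        records.forEachRemaining(handle::append);
        return Collections.singletonList(handle.close()).iterator();
      }

      public static void main(String[] args) {
        List<String> buffered = new ArrayList<>();
        AppendHandle handle = new AppendHandle() {
          @Override public void append(String record) { buffered.add(record); }
          @Override public List<String> close() { return List.of("wrote " + buffered.size() + " records"); }
        };
        handleUpdate(handle, List.of("a", "b", "c").iterator())
            .forEachRemaining(System.out::println); // [wrote 3 records]
      }
    }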
+ */ +public class FlinkUpsertDeltaCommitActionExecutor> + extends BaseFlinkDeltaCommitActionExecutor { + private final List> inputRecords; + + public FlinkUpsertDeltaCommitActionExecutor(HoodieEngineContext context, + FlinkAppendHandle writeHandle, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List> inputRecords) { + super(context, writeHandle, config, table, instantTime, WriteOperationType.UPSERT); + this.inputRecords = inputRecords; + } + + @Override + public HoodieWriteMetadata execute() { + return FlinkWriteHelper.newInstance().write(instantTime, inputRecords, context, table, + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, operationType); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/FlinkUpsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/FlinkUpsertPreppedDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..94bcbc586c2e2 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/delta/FlinkUpsertPreppedDeltaCommitActionExecutor.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit.delta; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.FlinkAppendHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +/** + * Flink upsert prepped delta commit action executor. 
+ */ +public class FlinkUpsertPreppedDeltaCommitActionExecutor> + extends BaseFlinkDeltaCommitActionExecutor { + + private final List> preppedRecords; + + public FlinkUpsertPreppedDeltaCommitActionExecutor(HoodieEngineContext context, + FlinkAppendHandle writeHandle, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List> preppedRecords) { + super(context, writeHandle, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); + this.preppedRecords = preppedRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + return super.execute(preppedRecords); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/HoodieFlinkMergeOnReadTableCompactor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/HoodieFlinkMergeOnReadTableCompactor.java new file mode 100644 index 0000000000000..03b9f8e7ee090 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/compact/HoodieFlinkMergeOnReadTableCompactor.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.compact; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import java.util.List; + +/** + * Compacts a hoodie table with merge on read storage. Computes all possible compactions, + * passes it through a CompactionFilter and executes all the compactions and writes a new version of base files and make + * a normal commit. + * + *
    Note: the compaction logic is invoked through the flink pipeline. + */ +@SuppressWarnings("checkstyle:LineLength") +public class HoodieFlinkMergeOnReadTableCompactor + extends HoodieCompactor>, List, List> { + + @Override + public void preCompact( + HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime) { + HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); + if (pendingCompactionTimeline.containsInstant(inflightInstant)) { + table.rollbackInflightCompaction(inflightInstant); + table.getMetaClient().reloadActiveTimeline(); + } + } + + @Override + public void maybePersist(HoodieData writeStatus, HoodieWriteConfig config) { + // No OP + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkCopyOnWriteRollbackActionExecutor.java deleted file mode 100644 index 28b713b24b9ab..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkCopyOnWriteRollbackActionExecutor.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class FlinkCopyOnWriteRollbackActionExecutor extends - BaseCopyOnWriteRollbackActionExecutor>, List, List> { - public FlinkCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public FlinkCopyOnWriteRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new FlinkMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant instantToRollback) { - List rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning()); - return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMarkerBasedRollbackStrategy.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMarkerBasedRollbackStrategy.java deleted file mode 100644 index 8cf91a21e3382..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/FlinkMarkerBasedRollbackStrategy.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; - -import java.util.List; -import java.util.stream.Collectors; - -import scala.Tuple2; - -@SuppressWarnings("checkstyle:LineLength") -public class FlinkMarkerBasedRollbackStrategy extends AbstractMarkerBasedRollbackStrategy>, List, List> { - public FlinkMarkerBasedRollbackStrategy(HoodieTable>, List, List> table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) { - super(table, context, config, instantTime); - } - - @Override - public List execute(HoodieInstant instantToRollback) { - try { - MarkerFiles markerFiles = new MarkerFiles(table, instantToRollback.getTimestamp()); - List rollbackStats = context.map(markerFiles.allMarkerFilePaths(), markerFilePath -> { - String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1); - IOType type = IOType.valueOf(typeStr); - switch (type) { - case MERGE: - return undoMerge(MarkerFiles.stripMarkerSuffix(markerFilePath)); - case APPEND: - return undoAppend(MarkerFiles.stripMarkerSuffix(markerFilePath), instantToRollback); - case CREATE: - return undoCreate(MarkerFiles.stripMarkerSuffix(markerFilePath)); - default: - throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback); - } - }, 0); - - return rollbackStats.stream().map(rollbackStat -> new Tuple2<>(rollbackStat.getPartitionPath(), rollbackStat)) - .collect(Collectors.groupingBy(Tuple2::_1)) - .values() - .stream() - .map(x -> x.stream().map(y -> y._2).reduce(RollbackUtils::mergeRollbackStat).get()) - .collect(Collectors.toList()); - } catch (Exception e) { - throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e); - } - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java deleted file mode 100644 index 612635da871f8..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieRollbackException; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.PathFilter; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; - -import scala.Tuple2; - -/** - * Performs Rollback of Hoodie Tables. - */ -public class ListingBasedRollbackHelper implements Serializable { - - private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackHelper.class); - - private final HoodieTableMetaClient metaClient; - private final HoodieWriteConfig config; - - public ListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { - this.metaClient = metaClient; - this.config = config; - } - - /** - * Performs all rollback actions that we have collected in parallel. - */ - public List performRollback(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - Map partitionPathRollbackStatsPairs = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, true); - - Map>> collect = partitionPathRollbackStatsPairs.entrySet() - .stream() - .map(x -> Pair.of(x.getKey(), x.getValue())).collect(Collectors.groupingBy(Pair::getLeft)); - return collect.values().stream() - .map(pairs -> pairs.stream().map(Pair::getRight).reduce(RollbackUtils::mergeRollbackStat).orElse(null)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } - - /** - * Collect all file info that needs to be rollbacked. - */ - public List collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - Map partitionPathRollbackStatsPairs = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, false); - return new ArrayList<>(partitionPathRollbackStatsPairs.values()); - } - - /** - * May be delete interested files and collect stats or collect stats only. - * - * @param context instance of {@link HoodieEngineContext} to use. - * @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested. - * @param rollbackRequests List of {@link ListingBasedRollbackRequest} to be operated on. - * @param doDelete {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes. - * @return stats collected with or w/o actual deletions. 
- */ - Map maybeDeleteAndCollectStats(HoodieEngineContext context, - HoodieInstant instantToRollback, - List rollbackRequests, - boolean doDelete) { - return context.mapToPair(rollbackRequests, rollbackRequest -> { - switch (rollbackRequest.getType()) { - case DELETE_DATA_FILES_ONLY: { - final Map filesToDeletedStatus = deleteBaseFiles(metaClient, config, instantToRollback.getTimestamp(), - rollbackRequest.getPartitionPath(), doDelete); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case DELETE_DATA_AND_LOG_FILES: { - final Map filesToDeletedStatus = deleteBaseAndLogFiles(metaClient, config, instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath(), doDelete); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case APPEND_ROLLBACK_BLOCK: { - HoodieLogFormat.Writer writer = null; - try { - writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath())) - .withFileId(rollbackRequest.getFileId().get()) - .overBaseCommit(rollbackRequest.getLatestBaseInstant().get()).withFs(metaClient.getFs()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - - // generate metadata - if (doDelete) { - Map header = generateHeader(instantToRollback.getTimestamp()); - // if update belongs to an existing log file - writer = writer.appendBlock(new HoodieCommandBlock(header)); - } - } catch (IOException | InterruptedException io) { - throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io); - } finally { - try { - if (writer != null) { - writer.close(); - } - } catch (IOException io) { - throw new HoodieIOException("Error appending rollback block..", io); - } - } - - // This step is intentionally done after writer is closed. Guarantees that - // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in - // cloud-storage : HUDI-168 - Map filesToNumBlocksRollback = Collections.singletonMap( - metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), - 1L - ); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withRollbackBlockAppendResults(filesToNumBlocksRollback).build()); - } - default: - throw new IllegalStateException("Unknown Rollback action " + rollbackRequest); - } - }, 0); - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. 
- */ - private Map deleteBaseAndLogFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - LOG.info("Cleaning path " + partitionPath); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - SerializablePathFilter filter = (path) -> { - if (path.toString().endsWith(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } else if (FSUtils.isLogFile(path)) { - // Since the baseCommitTime is the only commit for new log files, it's okay here - String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); - return commit.equals(fileCommitTime); - } - return false; - }; - - final Map results = new HashMap<>(); - FileSystem fs = metaClient.getFs(); - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. - */ - private Map deleteBaseFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - final Map results = new HashMap<>(); - LOG.info("Cleaning path " + partitionPath); - FileSystem fs = metaClient.getFs(); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - PathFilter filter = (path) -> { - if (path.toString().contains(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } - return false; - }; - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - private Map generateHeader(String commit) { - // generate metadata - Map header = new HashMap<>(3); - header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); - header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit); - header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, - String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); - return header; - } - - public interface SerializablePathFilter extends PathFilter, Serializable { - - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngrade.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngrade.java deleted file mode 100644 index a96a14aa7800a..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngrade.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.HoodieTableVersion; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieUpgradeDowngradeException; - -import java.io.IOException; - -public class FlinkUpgradeDowngrade extends AbstractUpgradeDowngrade { - public FlinkUpgradeDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context) { - super(metaClient, config, context); - } - - @Override - public void run(HoodieTableMetaClient metaClient, HoodieTableVersion toVersion, HoodieWriteConfig config, - HoodieEngineContext context, String instantTime) { - try { - new FlinkUpgradeDowngrade(metaClient, config, context).run(toVersion, instantTime); - } catch (IOException e) { - throw new HoodieUpgradeDowngradeException("Error during upgrade/downgrade to version:" + toVersion, e); - } - } - - @Override - protected void upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) { - new ZeroToOneUpgradeHandler().upgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true); - } - } - - @Override - protected void downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.ZERO) { - new OneToZeroDowngradeHandler().downgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false); - } - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java new file mode 100644 index 0000000000000..69acce5627543 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.HoodieTable; + +/** + * Flink upgrade and downgrade helper. + */ +public class FlinkUpgradeDowngradeHelper implements SupportsUpgradeDowngrade { + + private static final FlinkUpgradeDowngradeHelper SINGLETON_INSTANCE = + new FlinkUpgradeDowngradeHelper(); + + private FlinkUpgradeDowngradeHelper() { + } + + public static FlinkUpgradeDowngradeHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { + return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); + } + + @Override + public String getPartitionColumns(HoodieWriteConfig config) { + return config.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()); + } +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java deleted file mode 100644 index 2d2e06e5979cb..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.table.MarkerFiles; - -import java.util.List; -import java.util.stream.Collectors; - -/** - * Downgrade handle to assist in downgrading hoodie table from version 1 to 0. 
- */ -public class OneToZeroDowngradeHandler implements DowngradeHandler { - - @Override - public void downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { - // fetch pending commit info - HoodieFlinkTable table = HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); - HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); - List commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList()); - for (HoodieInstant commitInstant : commits) { - // delete existing marker files - MarkerFiles markerFiles = new MarkerFiles(table, commitInstant.getTimestamp()); - markerFiles.quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); - } - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java deleted file mode 100644 index 1fa3ad0a32b7c..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieFlinkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackHelper; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackRequest; -import org.apache.hudi.table.action.rollback.RollbackUtils; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; - -import java.util.List; -import java.util.stream.Collectors; - -/** - * Upgrade handle to assist in upgrading hoodie table from version 0 to 1. 
- */ -public class ZeroToOneUpgradeHandler implements UpgradeHandler { - - @Override - public void upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { - // fetch pending commit info - HoodieFlinkTable table = HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context); - HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); - List commits = inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - if (commits.size() > 0 && instantTime != null) { - // ignore the latest inflight commit since a new commit would have been started and we need to fix any pending commits from previous launch - commits.remove(instantTime); - } - for (String commit : commits) { - // for every pending commit, delete old marker files and re-create marker files in new format - recreateMarkerFiles(commit, table, context, config.getMarkersDeleteParallelism()); - } - } - - /** - * Recreate marker files in new format. - * Step1: Delete existing marker files - * Step2: Collect all rollback file info. - * Step3: recreate marker files for all interested files. - * - * @param commitInstantTime instant of interest for which marker files need to be recreated. - * @param table instance of {@link HoodieFlinkTable} to use - * @param context instance of {@link HoodieEngineContext} to use - * @throws HoodieRollbackException on any exception during upgrade. - */ - private static void recreateMarkerFiles(final String commitInstantTime, - HoodieFlinkTable table, - HoodieEngineContext context, - int parallelism) throws HoodieRollbackException { - try { - // fetch hoodie instant - Option commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants() - .filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime)) - .findFirst()); - if (commitInstantOpt.isPresent()) { - // delete existing marker files - MarkerFiles markerFiles = new MarkerFiles(table, commitInstantTime); - markerFiles.quietDeleteMarkerDir(context, parallelism); - - // generate rollback stats - List rollbackRequests; - if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) { - rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), - table.getConfig().shouldAssumeDatePartitioning()); - } else { - rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context); - } - List rollbackStats = new ListingBasedRollbackHelper(table.getMetaClient(), table.getConfig()) - .collectRollbackStats(context, commitInstantOpt.get(), rollbackRequests); - - // recreate marker files adhering to marker based rollback - for (HoodieRollbackStat rollbackStat : rollbackStats) { - for (String path : rollbackStat.getSuccessDeleteFiles()) { - String dataFileName = path.substring(path.lastIndexOf("/") + 1); - // not feasible to differentiate MERGE from CREATE. hence creating with MERGE IOType for all base files. 
- markerFiles.create(rollbackStat.getPartitionPath(), dataFileName, IOType.MERGE); - } - for (FileStatus fileStatus : rollbackStat.getCommandBlocksCount().keySet()) { - markerFiles.create(rollbackStat.getPartitionPath(), getFileNameForMarkerFromLogFile(fileStatus.getPath().toString(), table), IOType.APPEND); - } - } - } - } catch (Exception e) { - throw new HoodieRollbackException("Exception thrown while upgrading Hoodie Table from version 0 to 1", e); - } - } - - /** - * Curates file name for marker from existing log file path. - * log file format : partitionpath/.fileid_baseInstant.log.writetoken - * marker file format : partitionpath/fileId_writetoken_baseinstant.basefileExtn.marker.APPEND - * - * @param logFilePath log file path for which marker file name needs to be generated. - * @return the marker file name thus curated. - */ - private static String getFileNameForMarkerFromLogFile(String logFilePath, HoodieTable table) { - Path logPath = new Path(table.getMetaClient().getBasePath(), logFilePath); - String fileId = FSUtils.getFileIdFromLogPath(logPath); - String baseInstant = FSUtils.getBaseCommitTimeFromLogPath(logPath); - String writeToken = FSUtils.getWriteTokenFromLogPath(logPath); - - return FSUtils.makeDataFileName(baseInstant, writeToken, fileId, table.getBaseFileFormat().getFileExtension()); - } -} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/util/FlinkClientUtil.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/util/FlinkClientUtil.java new file mode 100644 index 0000000000000..3850ec8ac8ec4 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/util/FlinkClientUtil.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.common.table.HoodieTableMetaClient; + +import org.apache.flink.api.java.hadoop.mapred.utils.HadoopUtils; +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.Path; + +import java.io.File; + +/** + * Utilities for Hoodie Flink client. + */ +public class FlinkClientUtil { + + /** + * Creates the meta client. + */ + public static HoodieTableMetaClient createMetaClient(String basePath) { + return HoodieTableMetaClient.builder().setBasePath(basePath).setConf(FlinkClientUtil.getHadoopConf()).build(); + } + + /** + * Returns the hadoop configuration with possible hadoop conf paths. + * E.G. the configurations under path $HADOOP_CONF_DIR and $HADOOP_HOME. + */ + public static org.apache.hadoop.conf.Configuration getHadoopConf() { + // create hadoop configuration with hadoop conf directory configured. 
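+    // Probe each candidate Hadoop conf directory reported by Flink's HadoopUtils
+    // (e.g. $HADOOP_CONF_DIR, $HADOOP_HOME/conf, $HADOOP_HOME/etc/hadoop) and keep the
+    // first one that exists on the local file system; if none is found, fall back to a
+    // default empty Configuration so callers always get a usable instance.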
+ org.apache.hadoop.conf.Configuration hadoopConf = null; + for (String possibleHadoopConfPath : HadoopUtils.possibleHadoopConfPaths(new Configuration())) { + hadoopConf = getHadoopConfiguration(possibleHadoopConfPath); + if (hadoopConf != null) { + break; + } + } + if (hadoopConf == null) { + hadoopConf = new org.apache.hadoop.conf.Configuration(); + } + return hadoopConf; + } + + /** + * Returns a new Hadoop Configuration object using the path to the hadoop conf configured. + * + * @param hadoopConfDir Hadoop conf directory path. + * @return A Hadoop configuration instance. + */ + private static org.apache.hadoop.conf.Configuration getHadoopConfiguration(String hadoopConfDir) { + if (new File(hadoopConfDir).exists()) { + org.apache.hadoop.conf.Configuration hadoopConfiguration = new org.apache.hadoop.conf.Configuration(); + File coreSite = new File(hadoopConfDir, "core-site.xml"); + if (coreSite.exists()) { + hadoopConfiguration.addResource(new Path(coreSite.getAbsolutePath())); + } + File hdfsSite = new File(hadoopConfDir, "hdfs-site.xml"); + if (hdfsSite.exists()) { + hadoopConfiguration.addResource(new Path(hdfsSite.getAbsolutePath())); + } + File yarnSite = new File(hadoopConfDir, "yarn-site.xml"); + if (yarnSite.exists()) { + hadoopConfiguration.addResource(new Path(yarnSite.getAbsolutePath())); + } + // Add mapred-site.xml. We need to read configurations like compression codec. + File mapredSite = new File(hadoopConfDir, "mapred-site.xml"); + if (mapredSite.exists()) { + hadoopConfiguration.addResource(new Path(mapredSite.getAbsolutePath())); + } + return hadoopConfiguration; + } + return null; + } +} diff --git a/hudi-client/hudi-flink-client/src/main/resources/log4j.properties b/hudi-client/hudi-flink-client/src/main/resources/log4j.properties deleted file mode 100644 index ff268faf6363c..0000000000000 --- a/hudi-client/hudi-flink-client/src/main/resources/log4j.properties +++ /dev/null @@ -1,23 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=INFO, A1 -# A1 is set to be a ConsoleAppender. -log4j.appender.A1=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. 
-log4j.appender.A1.layout=org.apache.log4j.PatternLayout -log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/client/common/TestHoodieFlinkEngineContext.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/client/common/TestHoodieFlinkEngineContext.java index 41a8b96199395..d0babe7663dbf 100644 --- a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/client/common/TestHoodieFlinkEngineContext.java +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/client/common/TestHoodieFlinkEngineContext.java @@ -19,6 +19,7 @@ package org.apache.hudi.client.common; import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ -30,8 +31,6 @@ import java.util.List; import java.util.Map; -import scala.Tuple2; - /** * Unit test against HoodieFlinkEngineContext. */ @@ -85,7 +84,7 @@ public void testMapToPair() { Map resultMap = context.mapToPair(mapList, x -> { String[] splits = x.split("_"); - return Tuple2.apply(splits[0], splits[1]); + return new ImmutablePair<>(splits[0], splits[1]); }, 2); Assertions.assertEquals(resultMap.get("spark"), resultMap.get("flink")); diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java new file mode 100644 index 0000000000000..d07eff01c272b --- /dev/null +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java @@ -0,0 +1,467 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.data.HoodieListPairData; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieFlinkClientTestHarness; +import org.apache.hudi.testutils.HoodieFlinkWriteableTestTable; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import static java.util.Arrays.asList; +import static java.util.UUID.randomUUID; +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Unit test against FlinkHoodieBloomIndex. + */ +//TODO merge code with Spark Bloom index tests. 
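+// Each parameterized test below is driven by three bloom index knobs (see configParams()
+// and makeConfig()): range pruning (bloomIndexPruneByRanges), tree-based filtering
+// (bloomIndexTreebasedFilter) and bucketized checking (bloomIndexBucketizedChecking).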
+public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness { + + private static final Schema SCHEMA = getSchemaFromResource(TestFlinkHoodieBloomIndex.class, "/exampleSchema.avsc", true); + private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with rangePruning={0}, treeFiltering={1}, bucketizedChecking={2}"; + + public static Stream configParams() { + Object[][] data = + new Object[][] {{true, true, true}, {false, true, true}, {true, true, false}, {true, false, true}}; + return Stream.of(data).map(Arguments::of); + } + + @BeforeEach + public void setUp() throws Exception { + initPath(); + initFileSystem(); + // We have some records to be tagged (two different partitions) + initMetaClient(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + private HoodieWriteConfig makeConfig(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder().bloomIndexPruneByRanges(rangePruning) + .bloomIndexTreebasedFilter(treeFiltering).bloomIndexBucketizedChecking(bucketizedChecking) + .bloomIndexKeysPerBucket(2).build()) + .build(); + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { + HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + HoodieBloomIndex index = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient); + HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA); + + // Create some partitions, and put some files + // "2016/01/21": 0 file + // "2016/04/01": 1 file (2_0_20160401010101.parquet) + // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet) + testTable.withPartitionMetaFiles("2016/01/21", "2016/04/01", "2015/03/12"); + + RawTripTestPayload rowChange1 = + new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record1 = + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + RawTripTestPayload rowChange2 = + new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record2 = + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + RawTripTestPayload rowChange3 = + new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record3 = + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + RawTripTestPayload rowChange4 = + new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord record4 = + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + + List partitions = asList("2016/01/21", "2016/04/01", "2015/03/12"); + List> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); + // Still 0, as no valid commit + assertEquals(0, filesList.size()); + + testTable.addCommit("20160401010101").withInserts("2016/04/01", "2"); + 
testTable.addCommit("20150312101010").withInserts("2015/03/12", "1") + .withInserts("2015/03/12", "3", record1) + .withInserts("2015/03/12", "4", record2, record3, record4); + metaClient.reloadActiveTimeline(); + + filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); + assertEquals(4, filesList.size()); + + if (rangePruning) { + // these files will not have the key ranges + assertNull(filesList.get(0).getRight().getMaxRecordKey()); + assertNull(filesList.get(0).getRight().getMinRecordKey()); + assertFalse(filesList.get(1).getRight().hasKeyRanges()); + assertNotNull(filesList.get(2).getRight().getMaxRecordKey()); + assertNotNull(filesList.get(2).getRight().getMinRecordKey()); + assertTrue(filesList.get(3).getRight().hasKeyRanges()); + + // no longer sorted, but should have same files. + + List> expected = + asList(Pair.of("2016/04/01", new BloomIndexFileInfo("2")), + Pair.of("2015/03/12", new BloomIndexFileInfo("1")), + Pair.of("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), + Pair.of("2015/03/12", new BloomIndexFileInfo("4", "001", "003"))); + assertEquals(expected, filesList); + } + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { + HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + HoodieBloomIndex index = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + + final Map> partitionToFileIndexInfo = new HashMap<>(); + partitionToFileIndexInfo.put("2017/10/22", + asList(new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f2", "000", "000"), + new BloomIndexFileInfo("f3", "001", "003"), new BloomIndexFileInfo("f4", "002", "007"), + new BloomIndexFileInfo("f5", "009", "010"))); + + Map> partitionRecordKeyMap = new HashMap<>(); + asList(Pair.of("2017/10/22", "003"), Pair.of("2017/10/22", "002"), + Pair.of("2017/10/22", "005"), Pair.of("2017/10/22", "004")) + .forEach(t -> { + List recordKeyList = partitionRecordKeyMap.getOrDefault(t.getLeft(), new ArrayList<>()); + recordKeyList.add(t.getRight()); + partitionRecordKeyMap.put(t.getLeft(), recordKeyList); + }); + + List> comparisonKeyList = index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, HoodieListPairData.lazy(partitionRecordKeyMap)).collectAsList(); + + assertEquals(10, comparisonKeyList.size()); + java.util.Map> recordKeyToFileComps = comparisonKeyList.stream() + .collect(java.util.stream.Collectors.groupingBy(t -> t.getRight().getRecordKey(), java.util.stream.Collectors.mapping(t -> t.getLeft(), java.util.stream.Collectors.toList()))); + + assertEquals(4, recordKeyToFileComps.size()); + assertEquals(new java.util.HashSet<>(asList("f1", "f3", "f4")), new java.util.HashSet<>(recordKeyToFileComps.get("002"))); + assertEquals(new java.util.HashSet<>(asList("f1", "f3", "f4")), new java.util.HashSet<>(recordKeyToFileComps.get("003"))); + assertEquals(new java.util.HashSet<>(asList("f1", "f4")), new java.util.HashSet<>(recordKeyToFileComps.get("004"))); + assertEquals(new java.util.HashSet<>(asList("f1", "f4")), new java.util.HashSet<>(recordKeyToFileComps.get("005"))); + } + + @Test + public void testCheckUUIDsAgainstOneFile() throws Exception { + final String partition = "2016/01/31"; + // Create some records to use + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = 
"{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + HoodieRecord record1 = + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + HoodieRecord record2 = + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + HoodieRecord record3 = + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); + HoodieRecord record4 = + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + + // We write record1, record2 to a base file, but the bloom filter contains (record1, + // record2, record3). + BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); + filter.add(record3.getRecordKey()); + HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter); + String fileId = testTable.addCommit("000").getFileIdWithInserts(partition, record1, record2); + String filename = testTable.getBaseFileNameById(fileId); + + // The bloom filter contains 3 records + assertTrue(filter.mightContain(record1.getRecordKey())); + assertTrue(filter.mightContain(record2.getRecordKey())); + assertTrue(filter.mightContain(record3.getRecordKey())); + assertFalse(filter.mightContain(record4.getRecordKey())); + + // Compare with file + List uuids = asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey()); + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient); + List results = HoodieIndexUtils.filterKeysFromFile( + new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf); + assertEquals(results.size(), 2); + assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); + assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")); + // TODO(vc): Need more coverage on actual filenames + // assertTrue(results.get(0)._2().equals(filename)); + // assertTrue(results.get(1)._2().equals(filename)); + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testTagLocationWithEmptyList(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { + // We have some records to be tagged (two different partitions) + List records = new ArrayList<>(); + // Also create the metadata and config + HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient); + + // 
Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + + assertDoesNotThrow(() -> { + tagLocation(bloomIndex, records, table); + }, "EmptyList should not result in IllegalArgumentException: Positive number of slices required"); + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { + // We have some records to be tagged (two different partitions) + String rowKey1 = randomUUID().toString(); + String rowKey2 = randomUUID().toString(); + String rowKey3 = randomUUID().toString(); + String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + // place same row key under a different partition. + String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + HoodieRecord record1 = + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + HoodieRecord record2 = + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + HoodieRecord record3 = + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); + HoodieRecord record4 = + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + List records = asList(record1, record2, record3, record4); + + // Also create the metadata and config + HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + HoodieFlinkTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient); + HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + List taggedRecords = tagLocation(bloomIndex, records, hoodieTable); + + // Should not find any files + for (HoodieRecord record : taggedRecords) { + assertFalse(record.isCurrentLocationKnown()); + } + + // We create three base file, each having one record. 
(two different partitions) + String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1); + String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2); + String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4); + + metaClient.reloadActiveTimeline(); + + // We do the tag again + taggedRecords = tagLocation(bloomIndex, records, HoodieFlinkTable.create(config, context, metaClient)); + + // Check results + for (HoodieRecord record : taggedRecords) { + if (record.getRecordKey().equals(rowKey1)) { + if (record.getPartitionPath().equals("2015/01/31")) { + assertEquals(record.getCurrentLocation().getFileId(), fileId3); + } else { + assertEquals(record.getCurrentLocation().getFileId(), fileId1); + } + } else if (record.getRecordKey().equals(rowKey2)) { + assertEquals(record.getCurrentLocation().getFileId(), fileId2); + } else if (record.getRecordKey().equals(rowKey3)) { + assertFalse(record.isCurrentLocationKnown()); + } + } + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { + // We have some records to be tagged (two different partitions) + + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + // record key same as recordStr2 + String recordStr4 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); + HoodieRecord record1 = new HoodieAvroRecord(key1, rowChange1); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()); + HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()); + RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); + HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()); + HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4); + List keys = asList(key1, key2, key3, key4); + + // Also create the metadata and config + HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + HoodieTable hoodieTable = HoodieFlinkTable.create(config, context, metaClient); + HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + List toTagRecords = new ArrayList<>(); + toTagRecords.add(new HoodieAvroRecord(record4.getKey(), null)); + List taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable); + Map>> recordLocations = new HashMap<>(); + for (HoodieRecord taggedRecord : taggedRecords) { + recordLocations.put(taggedRecord.getKey(), 
taggedRecord.isCurrentLocationKnown() + ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId())) + : Option.empty()); + } + // Should not find any files + for (Option> record : recordLocations.values()) { + assertTrue(!record.isPresent()); + } + + // We create three base file, each having one record. (two different partitions) + String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1); + String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2); + String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4); + + // We do the tag again + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieFlinkTable.create(config, context, metaClient); + List toTagRecords1 = new ArrayList<>(); + for (HoodieKey key : keys) { + taggedRecords.add(new HoodieAvroRecord(key, null)); + } + + taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable); + recordLocations.clear(); + for (HoodieRecord taggedRecord : taggedRecords) { + recordLocations.put(taggedRecord.getKey(), taggedRecord.isCurrentLocationKnown() + ? Option.of(Pair.of(taggedRecord.getPartitionPath(), taggedRecord.getCurrentLocation().getFileId())) + : Option.empty()); + } + + // Check results + for (Map.Entry>> record : recordLocations.entrySet()) { + if (record.getKey().getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getValue().isPresent()); + assertEquals(fileId1, record.getValue().get().getRight()); + } else if (record.getKey().getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getValue().isPresent()); + if (record.getKey().getPartitionPath().equals("2015/01/31")) { + assertEquals(fileId3, record.getValue().get().getRight()); + } else { + assertEquals(fileId2, record.getValue().get().getRight()); + } + } else if (record.getKey().getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertFalse(record.getValue().isPresent()); + } + } + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testBloomFilterFalseError(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { + // We have two hoodie records + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + + // We write record1 to a base file, using a bloom filter having both records + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + + BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); + filter.add(record2.getRecordKey()); + HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter); + String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1); + assertTrue(filter.mightContain(record1.getRecordKey())); + assertTrue(filter.mightContain(record2.getRecordKey())); + + // We do 
the tag + List records = asList(record1, record2); + HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieFlinkTable.create(config, context, metaClient); + + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + List taggedRecords = tagLocation(bloomIndex, records, table); + + // Check results + for (HoodieRecord record : taggedRecords) { + if (record.getKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertEquals(record.getCurrentLocation().getFileId(), fileId); + } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + assertFalse(record.isCurrentLocationKnown()); + } + } + } +} diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/io/storage/row/parquet/TestParquetSchemaConverter.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/io/storage/row/parquet/TestParquetSchemaConverter.java new file mode 100644 index 0000000000000..a1a07a65f9931 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/io/storage/row/parquet/TestParquetSchemaConverter.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row.parquet; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.junit.jupiter.api.Test; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for {@link ParquetSchemaConverter}. 
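+ *
+ * <p>The converter maps a Flink {@code RowType} to a Parquet {@code MessageType}: nested
+ * ARRAY/MAP/ROW fields become LIST/MAP/group types, and TIMESTAMP(3) maps to INT64
+ * (TIMESTAMP MILLIS) while higher precisions fall back to INT96, as asserted below.
+ * A minimal usage sketch, relying only on the imports of this test class:
+ * <pre>{@code
+ *   DataType dataType = DataTypes.ROW(DataTypes.FIELD("ts_3", DataTypes.TIMESTAMP(3)));
+ *   org.apache.parquet.schema.MessageType parquetType =
+ *       ParquetSchemaConverter.convertToParquetMessageType("converted", (RowType) dataType.getLogicalType());
+ * }</pre>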
+ */ +public class TestParquetSchemaConverter { + @Test + void testConvertComplexTypes() { + DataType dataType = DataTypes.ROW( + DataTypes.FIELD("f_array", + DataTypes.ARRAY(DataTypes.CHAR(10))), + DataTypes.FIELD("f_map", + DataTypes.MAP(DataTypes.INT(), DataTypes.VARCHAR(20))), + DataTypes.FIELD("f_row", + DataTypes.ROW( + DataTypes.FIELD("f_row_f0", DataTypes.INT()), + DataTypes.FIELD("f_row_f1", DataTypes.VARCHAR(10)), + DataTypes.FIELD("f_row_f2", + DataTypes.ROW( + DataTypes.FIELD("f_row_f2_f0", DataTypes.INT()), + DataTypes.FIELD("f_row_f2_f1", DataTypes.VARCHAR(10))))))); + org.apache.parquet.schema.MessageType messageType = + ParquetSchemaConverter.convertToParquetMessageType("converted", (RowType) dataType.getLogicalType()); + assertThat(messageType.getColumns().size(), is(7)); + final String expected = "message converted {\n" + + " optional group f_array (LIST) {\n" + + " repeated group list {\n" + + " optional binary element (STRING);\n" + + " }\n" + + " }\n" + + " optional group f_map (MAP) {\n" + + " repeated group key_value {\n" + + " optional int32 key;\n" + + " optional binary value (STRING);\n" + + " }\n" + + " }\n" + + " optional group f_row {\n" + + " optional int32 f_row_f0;\n" + + " optional binary f_row_f1 (STRING);\n" + + " optional group f_row_f2 {\n" + + " optional int32 f_row_f2_f0;\n" + + " optional binary f_row_f2_f1 (STRING);\n" + + " }\n" + + " }\n" + + "}\n"; + assertThat(messageType.toString(), is(expected)); + } + + @Test + void testConvertTimestampTypes() { + DataType dataType = DataTypes.ROW( + DataTypes.FIELD("ts_3", DataTypes.TIMESTAMP(3)), + DataTypes.FIELD("ts_6", DataTypes.TIMESTAMP(6)), + DataTypes.FIELD("ts_9", DataTypes.TIMESTAMP(9))); + org.apache.parquet.schema.MessageType messageType = + ParquetSchemaConverter.convertToParquetMessageType("converted", (RowType) dataType.getLogicalType()); + assertThat(messageType.getColumns().size(), is(3)); + final String expected = "message converted {\n" + + " optional int64 ts_3 (TIMESTAMP(MILLIS,true));\n" + + " optional int96 ts_6;\n" + + " optional int96 ts_9;\n" + + "}\n"; + assertThat(messageType.toString(), is(expected)); + } +} diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java new file mode 100644 index 0000000000000..2f4e9eeddc9e4 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.minicluster.HdfsTestService; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.bloom.TestFlinkHoodieBloomIndex; +import org.apache.hudi.table.HoodieTable; + +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.functions.sink.SinkFunction; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestInfo; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; + +/** + * The test harness for resource initialization and cleanup. + */ +public class HoodieFlinkClientTestHarness extends HoodieCommonTestHarness implements Serializable { + + protected static final Logger LOG = LogManager.getLogger(HoodieFlinkClientTestHarness.class); + private String testMethodName; + protected transient Configuration hadoopConf = null; + protected transient FileSystem fs; + protected transient MiniClusterWithClientResource flinkCluster = null; + protected transient HoodieFlinkEngineContext context = null; + protected transient ExecutorService executorService; + protected transient HoodieFlinkWriteClient writeClient; + protected transient HoodieTableFileSystemView tableView; + + protected final FlinkTaskContextSupplier supplier = new FlinkTaskContextSupplier(null); + + // dfs + protected transient HdfsTestService hdfsTestService; + protected transient MiniDFSCluster dfsCluster; + protected transient DistributedFileSystem dfs; + + @BeforeEach + public void setTestMethodName(TestInfo testInfo) { + if (testInfo.getTestMethod().isPresent()) { + testMethodName = testInfo.getTestMethod().get().getName(); + } else { + testMethodName = "Unknown"; + } + } + + protected void initFlinkMiniCluster() { + flinkCluster = new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberSlotsPerTaskManager(2) + .setNumberTaskManagers(1) + .build()); + } + + protected void initFileSystem() { + hadoopConf = new Configuration(); + initFileSystemWithConfiguration(hadoopConf); + context = new HoodieFlinkEngineContext(supplier); + } + + private void initFileSystemWithConfiguration(Configuration configuration) { + if (basePath == null) { + throw new IllegalStateException("The base path has not been initialized."); + } + fs = FSUtils.getFs(basePath, configuration); + if (fs instanceof 
LocalFileSystem) { + LocalFileSystem lfs = (LocalFileSystem) fs; + // With LocalFileSystem, with checksum disabled, fs.open() returns an inputStream which is FSInputStream + // This causes ClassCastExceptions in LogRecordScanner (and potentially other places) calling fs.open + // So, for the tests, we enforce checksum verification to circumvent the problem + lfs.setVerifyChecksum(true); + } + } + + /** + * Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by + * {@code getTableType()}. + * + * @throws IOException + */ + protected void initMetaClient() throws IOException { + initMetaClient(getTableType()); + } + + protected void initMetaClient(HoodieTableType tableType) throws IOException { + if (basePath == null) { + throw new IllegalStateException("The base path has not been initialized."); + } + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); + } + + protected List tagLocation( + HoodieIndex index, List records, HoodieTable table) { + return ((HoodieData) index.tagLocation(HoodieListData.eager(records), context, table)).collectAsList(); + } + + /** + * Cleanups file system. + * + * @throws IOException + */ + protected void cleanupFileSystem() throws IOException { + if (fs != null) { + LOG.warn("Closing file-system instance used in previous test-run"); + fs.close(); + fs = null; + } + } + + /** + * Cleanups resource group for the subclasses of {@link TestFlinkHoodieBloomIndex}. + */ + public void cleanupResources() throws java.io.IOException { + cleanupClients(); + cleanupFlinkContexts(); + cleanupTestDataGenerator(); + cleanupFileSystem(); + cleanupDFS(); + cleanupExecutorService(); + System.gc(); + } + + protected void cleanupFlinkMiniCluster() { + if (flinkCluster != null) { + flinkCluster.after(); + flinkCluster = null; + } + } + + /** + * Simple test sink function. + */ + public static class SimpleTestSinkFunction implements SinkFunction { + + // must be static + public static List valuesList = new ArrayList<>(); + + @Override + public synchronized void invoke(HoodieRecord value, Context context) throws Exception { + valuesList.add(value); + } + } + + /** + * Cleanups hoodie clients. + */ + protected void cleanupClients() throws java.io.IOException { + if (metaClient != null) { + metaClient = null; + } + if (writeClient != null) { + writeClient.close(); + writeClient = null; + } + if (tableView != null) { + tableView.close(); + tableView = null; + } + } + + /** + * Cleanups the distributed file system. + * + * @throws IOException + */ + protected void cleanupDFS() throws java.io.IOException { + if (hdfsTestService != null) { + hdfsTestService.stop(); + dfsCluster.shutdown(true, true); + hdfsTestService = null; + dfsCluster = null; + dfs = null; + } + // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the + // same JVM + FileSystem.closeAll(); + } + + /** + * Cleanups the executor service. + */ + protected void cleanupExecutorService() { + if (this.executorService != null) { + this.executorService.shutdownNow(); + this.executorService = null; + } + } + + /** + * Cleanups Flink contexts. 
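+ * Called from {@link #cleanupResources()} together with the client, file-system, DFS and
+ * executor-service cleanups. A concrete test class would normally trigger the whole chain
+ * from its teardown, e.g.:
+ * <pre>{@code
+ *   // typically invoked from a JUnit 5 @AfterEach method of the concrete test class
+ *   cleanupResources();
+ * }</pre>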
+ */ + protected void cleanupFlinkContexts() { + if (context != null) { + LOG.info("Closing flink engine context used in previous test-case"); + context = null; + } + } +} diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkWriteableTestTable.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkWriteableTestTable.java new file mode 100644 index 0000000000000..2a69e6fd671c0 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkWriteableTestTable.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Flink hoodie writable table. 
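+ *
+ * <p>Test-only helper for writing base files (and log files via {@code withLogAppends}) directly
+ * against a table's file system, without going through a write client. A minimal sketch, mirroring
+ * how the Flink bloom index tests above use it:
+ * <pre>{@code
+ *   HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(hoodieTable, SCHEMA);
+ *   String fileId = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1);
+ * }</pre>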
+ */ +public class HoodieFlinkWriteableTestTable extends HoodieWriteableTestTable { + private static final Logger LOG = LogManager.getLogger(HoodieFlinkWriteableTestTable.class); + + private HoodieFlinkWriteableTestTable(String basePath, org.apache.hadoop.fs.FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { + super(basePath, fs, metaClient, schema, filter); + } + + public static HoodieFlinkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { + return new HoodieFlinkWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, schema, filter); + } + + public static HoodieFlinkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema) { + BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); + return of(metaClient, schema, filter); + } + + public static HoodieFlinkWriteableTestTable of(HoodieTable hoodieTable, Schema schema) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + return of(metaClient, schema); + } + + public static HoodieFlinkWriteableTestTable of(HoodieTable hoodieTable, Schema schema, BloomFilter filter) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + return of(metaClient, schema, filter); + } + + @Override + public HoodieFlinkWriteableTestTable addCommit(String instantTime) throws Exception { + return (HoodieFlinkWriteableTestTable) super.addCommit(instantTime); + } + + @Override + public HoodieFlinkWriteableTestTable forCommit(String instantTime) { + return (HoodieFlinkWriteableTestTable) super.forCommit(instantTime); + } + + public String getFileIdWithInserts(String partition) throws Exception { + return getFileIdWithInserts(partition, new HoodieRecord[0]); + } + + public String getFileIdWithInserts(String partition, HoodieRecord... records) throws Exception { + return getFileIdWithInserts(partition, Arrays.asList(records)); + } + + public String getFileIdWithInserts(String partition, List records) throws Exception { + String fileId = java.util.UUID.randomUUID().toString(); + withInserts(partition, fileId, records); + return fileId; + } + + public HoodieFlinkWriteableTestTable withInserts(String partition, String fileId) throws Exception { + return withInserts(partition, fileId, new HoodieRecord[0]); + } + + public HoodieFlinkWriteableTestTable withInserts(String partition, String fileId, HoodieRecord... 
records) throws Exception { + return withInserts(partition, fileId, Arrays.asList(records)); + } + + public HoodieFlinkWriteableTestTable withInserts(String partition, String fileId, List records) throws Exception { + withInserts(partition, fileId, records, new org.apache.hudi.client.FlinkTaskContextSupplier(null)); + return this; + } + + public Map> withLogAppends(List records) throws Exception { + Map> partitionToLogfilesMap = new HashMap<>(); + for (List groupedRecords : records.stream().collect( + Collectors.groupingBy(HoodieRecord::getCurrentLocation)).values()) { + final Pair appendedLogFile = appendRecordsToLogFile(groupedRecords); + partitionToLogfilesMap.computeIfAbsent( + appendedLogFile.getKey(), k -> new ArrayList<>()).add(appendedLogFile.getValue()); + } + return partitionToLogfilesMap; + } + + private Pair appendRecordsToLogFile(List groupedRecords) throws Exception { + String partitionPath = groupedRecords.get(0).getPartitionPath(); + HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation(); + try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath)) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId()) + .overBaseCommit(location.getInstantTime()).withFs(fs).build()) { + Map header = new java.util.HashMap<>(); + header.put(HeaderMetadataType.INSTANT_TIME, location.getInstantTime()); + header.put(HeaderMetadataType.SCHEMA, schema.toString()); + logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> { + try { + GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get(); + HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), ""); + return (IndexedRecord) val; + } catch (IOException e) { + LOG.warn("Failed to convert record " + r.toString(), e); + return null; + } + }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD)); + return Pair.of(partitionPath, logWriter.getLogFile()); + } + } +} diff --git a/hudi-client/hudi-flink-client/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-flink-client/src/test/resources/log4j-surefire.properties deleted file mode 100644 index 32af462093ae5..0000000000000 --- a/hudi-client/hudi-flink-client/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,31 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache=INFO -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# A1 is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. 
-log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-client/hudi-java-client/pom.xml b/hudi-client/hudi-java-client/pom.xml index 6429adedc6e12..99371193be895 100644 --- a/hudi-client/hudi-java-client/pom.xml +++ b/hudi-client/hudi-java-client/pom.xml @@ -15,27 +15,38 @@ See the License for the specific language governing permissions and limitations under the License. --> - + hudi-client org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 hudi-java-client - ${parent.version} + 0.12.2-dt-SNAPSHOT hudi-java-client jar + + + org.apache.logging.log4j + log4j-1.2-api + + org.apache.hudi hudi-client-common - ${parent.version} + ${project.parent.version} + + + + + org.apache.parquet + parquet-avro @@ -55,8 +66,33 @@ test-jar test + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + test + + + ${hive.groupid} + hive-exec + ${hive.version} + test + ${hive.exec.classifier} + + + ${hive.groupid} + hive-metastore + ${hive.version} + test + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + org.junit.jupiter junit-jupiter-api @@ -97,6 +133,28 @@ junit-platform-commons test + + + org.apache.hadoop + hadoop-hdfs + tests + test + + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java new file mode 100644 index 0000000000000..b6951bc6b7874 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client; + +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieNotSupportedException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.JavaHoodieIndexFactory; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieJavaTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.upgrade.JavaUpgradeDowngradeHelper; + +import com.codahale.metrics.Timer; +import org.apache.hadoop.conf.Configuration; + +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class HoodieJavaWriteClient extends + BaseHoodieWriteClient>, List, List> { + + public HoodieJavaWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { + super(context, clientConfig, JavaUpgradeDowngradeHelper.getInstance()); + } + + public HoodieJavaWriteClient(HoodieEngineContext context, + HoodieWriteConfig writeConfig, + boolean rollbackPending, + Option timelineService) { + super(context, writeConfig, timelineService, JavaUpgradeDowngradeHelper.getInstance()); + } + + @Override + public List> filterExists(List> hoodieRecords) { + // Create a Hoodie table which encapsulated the commits and files visible + HoodieJavaTable table = HoodieJavaTable.create(config, (HoodieJavaEngineContext) context); + Timer.Context indexTimer = metrics.getIndexCtx(); + List> recordsWithLocation = getIndex().tagLocation(HoodieListData.eager(hoodieRecords), context, table).collectAsList(); + metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 
0L : indexTimer.stop())); + return recordsWithLocation.stream().filter(v1 -> !v1.isCurrentLocationKnown()).collect(Collectors.toList()); + } + + @Override + protected HoodieIndex createIndex(HoodieWriteConfig writeConfig) { + return JavaHoodieIndexFactory.createIndex(config); + } + + @Override + public boolean commit(String instantTime, + List writeStatuses, + Option> extraMetadata, + String commitActionType, + Map> partitionToReplacedFileIds) { + List writeStats = writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()); + return commitStats(instantTime, writeStats, extraMetadata, commitActionType, partitionToReplacedFileIds); + } + + @Override + protected HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf) { + return HoodieJavaTable.create(config, context); + } + + @Override + public List upsert(List> records, + String instantTime) { + HoodieTable>, List, List> table = + initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime)); + table.validateUpsertSchema(); + preWrite(instantTime, WriteOperationType.UPSERT, table.getMetaClient()); + HoodieWriteMetadata> result = table.upsert(context, instantTime, records); + if (result.getIndexLookupDuration().isPresent()) { + metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); + } + return postWrite(result, instantTime, table); + } + + @Override + public List upsertPreppedRecords(List> preppedRecords, + String instantTime) { + HoodieTable>, List, List> table = + initTable(WriteOperationType.UPSERT_PREPPED, Option.ofNullable(instantTime)); + table.validateUpsertSchema(); + preWrite(instantTime, WriteOperationType.UPSERT_PREPPED, table.getMetaClient()); + HoodieWriteMetadata> result = table.upsertPrepped(context,instantTime, preppedRecords); + return postWrite(result, instantTime, table); + } + + @Override + public List insert(List> records, String instantTime) { + HoodieTable>, List, List> table = + initTable(WriteOperationType.INSERT, Option.ofNullable(instantTime)); + table.validateUpsertSchema(); + preWrite(instantTime, WriteOperationType.INSERT, table.getMetaClient()); + HoodieWriteMetadata> result = table.insert(context, instantTime, records); + if (result.getIndexLookupDuration().isPresent()) { + metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); + } + return postWrite(result, instantTime, table); + } + + @Override + public List insertPreppedRecords(List> preppedRecords, + String instantTime) { + HoodieTable>, List, List> table = + initTable(WriteOperationType.INSERT_PREPPED, Option.ofNullable(instantTime)); + table.validateInsertSchema(); + preWrite(instantTime, WriteOperationType.INSERT_PREPPED, table.getMetaClient()); + HoodieWriteMetadata> result = table.insertPrepped(context,instantTime, preppedRecords); + return postWrite(result, instantTime, table); + } + + @Override + public List bulkInsert(List> records, + String instantTime) { + throw new HoodieNotSupportedException("BulkInsert is not supported in HoodieJavaClient"); + } + + @Override + public List bulkInsert(List> records, + String instantTime, + Option userDefinedBulkInsertPartitioner) { + throw new HoodieNotSupportedException("BulkInsert is not supported in HoodieJavaClient"); + } + + public void transitionInflight(String instantTime) { + HoodieTableMetaClient metaClient = createMetaClient(true); + metaClient.getActiveTimeline().transitionRequestedToInflight( + new HoodieInstant(HoodieInstant.State.REQUESTED, metaClient.getCommitActionType(), 
instantTime), + Option.empty(), config.shouldAllowMultiWriteOnSameInstant()); + } + + @Override + public List bulkInsertPreppedRecords(List> preppedRecords, + String instantTime, + Option bulkInsertPartitioner) { + HoodieTable>, List, List> table = + initTable(WriteOperationType.BULK_INSERT_PREPPED, Option.ofNullable(instantTime)); + table.validateInsertSchema(); + preWrite(instantTime, WriteOperationType.BULK_INSERT_PREPPED, table.getMetaClient()); + HoodieWriteMetadata> result = table.bulkInsertPrepped(context, instantTime, preppedRecords, bulkInsertPartitioner); + return postWrite(result, instantTime, table); + } + + @Override + public List delete(List keys, + String instantTime) { + HoodieTable>, List, List> table = + initTable(WriteOperationType.DELETE, Option.ofNullable(instantTime)); + preWrite(instantTime, WriteOperationType.DELETE, table.getMetaClient()); + HoodieWriteMetadata> result = table.delete(context,instantTime, keys); + return postWrite(result, instantTime, table); + } + + @Override + protected List postWrite(HoodieWriteMetadata> result, + String instantTime, + HoodieTable hoodieTable) { + if (result.getIndexLookupDuration().isPresent()) { + metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis()); + } + if (result.isCommitted()) { + // Perform post commit operations. + if (result.getFinalizeDuration().isPresent()) { + metrics.updateFinalizeWriteMetrics(result.getFinalizeDuration().get().toMillis(), + result.getWriteStats().get().size()); + } + + postCommit(hoodieTable, result.getCommitMetadata().get(), instantTime, Option.empty(), true); + + emitCommitMetrics(instantTime, result.getCommitMetadata().get(), hoodieTable.getMetaClient().getCommitActionType()); + } + return result.getWriteStatuses(); + } + + @Override + public void commitCompaction(String compactionInstantTime, + HoodieCommitMetadata metadata, + Option> extraMetadata) { + throw new HoodieNotSupportedException("CommitCompaction is not supported in HoodieJavaClient"); + } + + @Override + protected void completeCompaction(HoodieCommitMetadata metadata, + HoodieTable table, + String compactionCommitTime) { + throw new HoodieNotSupportedException("CompleteCompaction is not supported in HoodieJavaClient"); + } + + @Override + protected HoodieWriteMetadata> compact(String compactionInstantTime, + boolean shouldComplete) { + throw new HoodieNotSupportedException("Compact is not supported in HoodieJavaClient"); + } + + @Override + public HoodieWriteMetadata> cluster(final String clusteringInstant, final boolean shouldComplete) { + throw new HoodieNotSupportedException("Cluster is not supported in HoodieJavaClient"); + } + + @Override + protected HoodieTable doInitTable(HoodieTableMetaClient metaClient, Option instantTime, boolean initialMetadataTableIfNecessary) { + // new JavaUpgradeDowngrade(metaClient, config, context).run(metaClient, HoodieTableVersion.current(), config, context, instantTime); + + // Create a Hoodie table which encapsulated the commits and files visible + return HoodieJavaTable.create(config, (HoodieJavaEngineContext) context, metaClient); + } + +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/JavaSizeBasedClusteringPlanStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/JavaSizeBasedClusteringPlanStrategy.java new file mode 100644 index 0000000000000..ec7202f4d8622 --- /dev/null +++ 
b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/JavaSizeBasedClusteringPlanStrategy.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client.clustering.plan.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieJavaCopyOnWriteTable; +import org.apache.hudi.table.HoodieJavaMergeOnReadTable; +import org.apache.hudi.table.action.cluster.strategy.PartitionAwareClusteringPlanStrategy; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS; + +/** + * Clustering Strategy for Java engine based on following. + * 1) Creates clustering groups based on max size allowed per group. + * 2) Excludes files that are greater than 'small.file.limit' from clustering plan. + */ +public class JavaSizeBasedClusteringPlanStrategy> + extends PartitionAwareClusteringPlanStrategy>, List, List> { + private static final Logger LOG = LogManager.getLogger(JavaSizeBasedClusteringPlanStrategy.class); + + public JavaSizeBasedClusteringPlanStrategy(HoodieJavaCopyOnWriteTable table, + HoodieJavaEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + public JavaSizeBasedClusteringPlanStrategy(HoodieJavaMergeOnReadTable table, + HoodieJavaEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + protected Stream buildClusteringGroupsForPartition(String partitionPath, List fileSlices) { + List, Integer>> fileSliceGroups = new ArrayList<>(); + List currentGroup = new ArrayList<>(); + long totalSizeSoFar = 0; + HoodieWriteConfig writeConfig = getWriteConfig(); + for (FileSlice currentSlice : fileSlices) { + // assume each filegroup size is ~= parquet.max.file.size + totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? 
currentSlice.getBaseFile().get().getFileSize() : writeConfig.getParquetMaxFileSize(); + // check if max size is reached and create new group, if needed. + if (totalSizeSoFar >= writeConfig.getClusteringMaxBytesInGroup() && !currentGroup.isEmpty()) { + int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, writeConfig.getClusteringTargetFileMaxBytes()); + LOG.info("Adding one clustering group " + totalSizeSoFar + " max bytes: " + + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups); + fileSliceGroups.add(Pair.of(currentGroup, numOutputGroups)); + currentGroup = new ArrayList<>(); + totalSizeSoFar = 0; + } + currentGroup.add(currentSlice); + // totalSizeSoFar could be 0 when new group was created in the previous conditional block. + // reset to the size of current slice, otherwise the number of output file group will become 0 even though current slice is present. + if (totalSizeSoFar == 0) { + totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? currentSlice.getBaseFile().get().getFileSize() : writeConfig.getParquetMaxFileSize(); + } + } + if (!currentGroup.isEmpty()) { + int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, writeConfig.getClusteringTargetFileMaxBytes()); + LOG.info("Adding final clustering group " + totalSizeSoFar + " max bytes: " + + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups); + fileSliceGroups.add(Pair.of(currentGroup, numOutputGroups)); + } + + return fileSliceGroups.stream().map(fileSliceGroup -> HoodieClusteringGroup.newBuilder() + .setSlices(getFileSliceInfo(fileSliceGroup.getLeft())) + .setNumOutputFileGroups(fileSliceGroup.getRight()) + .setMetrics(buildMetrics(fileSliceGroup.getLeft())) + .build()); + } + + @Override + protected Map getStrategyParams() { + Map params = new HashMap<>(); + if (!StringUtils.isNullOrEmpty(getWriteConfig().getClusteringSortColumns())) { + params.put(PLAN_STRATEGY_SORT_COLUMNS.key(), getWriteConfig().getClusteringSortColumns()); + } + return params; + } + + @Override + protected Stream getFileSlicesEligibleForClustering(final String partition) { + return super.getFileSlicesEligibleForClustering(partition) + // Only files that have basefile size smaller than small file size are eligible. + .filter(slice -> slice.getBaseFile().map(HoodieBaseFile::getFileSize).orElse(0L) < getWriteConfig().getClusteringSmallFileLimit()); + } + + private int getNumberOfOutputFileGroups(long groupSize, long targetFileSize) { + return (int) Math.ceil(groupSize / (double) targetFileSize); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java new file mode 100644 index 0000000000000..c6f885fa9160a --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client.clustering.run.strategy; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.JavaTaskContextSupplier; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.RewriteAvroPayload; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory; +import org.apache.hudi.execution.bulkinsert.JavaCustomColumnsSortPartitioner; +import org.apache.hudi.io.IOUtils; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader; +import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS; + +/** + * Clustering strategy for Java engine. 
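+ *
+ * <p>{@code performClustering} walks every input group of the {@code HoodieClusteringPlan}, reads
+ * the group's records (merging log files when present), and delegates the actual write to the
+ * abstract {@code performClusteringWithRecordList}, which concrete strategies such as
+ * {@code JavaSortAndSizeExecutionStrategy} implement. A hedged usage sketch (the table, context,
+ * config, plan, schema and instant variables are placeholders; generics elided for brevity):
+ * <pre>{@code
+ *   JavaExecutionStrategy strategy =
+ *       new JavaSortAndSizeExecutionStrategy(table, engineContext, writeConfig);
+ *   HoodieWriteMetadata writeMetadata =
+ *       strategy.performClustering(clusteringPlan, schema, instantTime);
+ * }</pre>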
+ */ +public abstract class JavaExecutionStrategy> + extends ClusteringExecutionStrategy>, HoodieData, HoodieData> { + + private static final Logger LOG = LogManager.getLogger(JavaExecutionStrategy.class); + + public JavaExecutionStrategy( + HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + public HoodieWriteMetadata> performClustering( + HoodieClusteringPlan clusteringPlan, Schema schema, String instantTime) { + // execute clustering for each group and collect WriteStatus + List writeStatusList = new ArrayList<>(); + clusteringPlan.getInputGroups().forEach( + inputGroup -> writeStatusList.addAll(runClusteringForGroup( + inputGroup, clusteringPlan.getStrategy().getStrategyParams(), + Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false), + instantTime))); + HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); + writeMetadata.setWriteStatuses(HoodieListData.eager(writeStatusList)); + return writeMetadata; + } + + /** + * Execute clustering to write inputRecords into new files as defined by rules in strategy parameters. + * The number of new file groups created is bounded by numOutputGroups. + * Note that commit is not done as part of strategy. commit is callers responsibility. + * + * @param inputRecords List of {@link HoodieRecord}. + * @param numOutputGroups Number of output file groups. + * @param instantTime Clustering (replace commit) instant time. + * @param strategyParams Strategy parameters containing columns to sort the data by when clustering. + * @param schema Schema of the data including metadata fields. + * @param fileGroupIdList File group id corresponding to each out group. + * @param preserveHoodieMetadata Whether to preserve commit metadata while clustering. + * @return List of {@link WriteStatus}. + */ + public abstract List performClusteringWithRecordList( + final List> inputRecords, final int numOutputGroups, final String instantTime, + final Map strategyParams, final Schema schema, + final List fileGroupIdList, final boolean preserveHoodieMetadata); + + /** + * Create {@link BulkInsertPartitioner} based on strategy params. + * + * @param strategyParams Strategy parameters containing columns to sort the data by when clustering. + * @param schema Schema of the data including metadata fields. + * @return partitioner for the java engine + */ + protected BulkInsertPartitioner>> getPartitioner(Map strategyParams, Schema schema) { + if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { + return new JavaCustomColumnsSortPartitioner( + strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","), + HoodieAvroUtils.addMetadataFields(schema), + getWriteConfig().isConsistentLogicalTimestampEnabled()); + } else { + return JavaBulkInsertInternalPartitionerFactory.get(getWriteConfig().getBulkInsertSortMode()); + } + } + + /** + * Executes clustering for the group. 
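+ * Reads all records for the group (applying log-file updates when the slices have deltas),
+ * builds the reader schema with the Hudi metadata fields added, maps the slices to their
+ * {@code HoodieFileGroupId}s, and hands everything to {@code performClusteringWithRecordList}.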
+ */ + private List runClusteringForGroup( + HoodieClusteringGroup clusteringGroup, Map strategyParams, + boolean preserveHoodieMetadata, String instantTime) { + List> inputRecords = readRecordsForGroup(clusteringGroup, instantTime); + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(getWriteConfig().getSchema())); + List inputFileIds = clusteringGroup.getSlices().stream() + .map(info -> new HoodieFileGroupId(info.getPartitionPath(), info.getFileId())) + .collect(Collectors.toList()); + return performClusteringWithRecordList(inputRecords, clusteringGroup.getNumOutputFileGroups(), instantTime, strategyParams, readerSchema, inputFileIds, preserveHoodieMetadata); + } + + /** + * Get a list of all records for the group. This includes all records from file slice + * (Apply updates from log files, if any). + */ + private List> readRecordsForGroup(HoodieClusteringGroup clusteringGroup, String instantTime) { + List clusteringOps = clusteringGroup.getSlices().stream().map(ClusteringOperation::create).collect(Collectors.toList()); + boolean hasLogFiles = clusteringOps.stream().anyMatch(op -> op.getDeltaFilePaths().size() > 0); + if (hasLogFiles) { + // if there are log files, we read all records into memory for a file group and apply updates. + return readRecordsForGroupWithLogs(clusteringOps, instantTime); + } else { + // We want to optimize reading records for case there are no log files. + return readRecordsForGroupBaseFiles(clusteringOps); + } + } + + /** + * Read records from baseFiles and apply updates. + */ + private List> readRecordsForGroupWithLogs(List clusteringOps, + String instantTime) { + HoodieWriteConfig config = getWriteConfig(); + HoodieTable table = getHoodieTable(); + List> records = new ArrayList<>(); + + clusteringOps.forEach(clusteringOp -> { + long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new JavaTaskContextSupplier(), config); + LOG.info("MaxMemoryPerCompaction run as part of clustering => " + maxMemoryPerCompaction); + Option baseFileReader = Option.empty(); + HoodieMergedLogRecordScanner scanner = null; + try { + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); + scanner = HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(table.getMetaClient().getFs()) + .withBasePath(table.getMetaClient().getBasePath()) + .withLogFilePaths(clusteringOp.getDeltaFilePaths()) + .withReaderSchema(readerSchema) + .withLatestInstantTime(instantTime) + .withMaxMemorySizeInBytes(maxMemoryPerCompaction) + .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled()) + .withReverseReader(config.getCompactionReverseLogReadEnabled()) + .withBufferSize(config.getMaxDFSStreamBufferSize()) + .withSpillableMapBasePath(config.getSpillableMapBasePath()) + .withPartition(clusteringOp.getPartitionPath()) + .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .build(); + + baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) + ? Option.empty() + : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); + HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); + Iterator> fileSliceReader = getFileSliceReader(baseFileReader, scanner, readerSchema, + tableConfig.getPayloadClass(), + tableConfig.getPreCombineField(), + tableConfig.populateMetaFields() ? 
Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), + tableConfig.getPartitionFieldProp()))); + fileSliceReader.forEachRemaining(records::add); + } catch (IOException e) { + throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + + " and " + clusteringOp.getDeltaFilePaths(), e); + } finally { + if (scanner != null) { + scanner.close(); + } + if (baseFileReader.isPresent()) { + baseFileReader.get().close(); + } + } + }); + return records; + } + + /** + * Read records from baseFiles. + */ + private List> readRecordsForGroupBaseFiles(List clusteringOps) { + List> records = new ArrayList<>(); + clusteringOps.forEach(clusteringOp -> { + try (HoodieFileReader baseFileReader = HoodieFileReaderFactory.getFileReader(getHoodieTable().getHadoopConf(), new Path(clusteringOp.getDataFilePath()))) { + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(getWriteConfig().getSchema())); + Iterator recordIterator = baseFileReader.getRecordIterator(readerSchema); + recordIterator.forEachRemaining(record -> records.add(transform(record))); + } catch (IOException e) { + throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + + " and " + clusteringOp.getDeltaFilePaths(), e); + } + }); + return records; + } + + /** + * Transform IndexedRecord into HoodieRecord. + */ + private HoodieRecord transform(IndexedRecord indexedRecord) { + GenericRecord record = (GenericRecord) indexedRecord; + Option keyGeneratorOpt = Option.empty(); + String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt); + String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt); + HoodieKey hoodieKey = new HoodieKey(key, partition); + + HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); + return hoodieRecord; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaSortAndSizeExecutionStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaSortAndSizeExecutionStrategy.java new file mode 100644 index 0000000000000..d34673c2d9b9a --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaSortAndSizeExecutionStrategy.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.client.clustering.run.strategy; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.JavaBulkInsertHelper; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.List; +import java.util.Map; + +/** + * Clustering Strategy based on following. + * 1) Java execution engine. + * 2) Uses bulk_insert to write data into new files. + */ +public class JavaSortAndSizeExecutionStrategy> + extends JavaExecutionStrategy { + private static final Logger LOG = LogManager.getLogger(JavaSortAndSizeExecutionStrategy.class); + + public JavaSortAndSizeExecutionStrategy(HoodieTable table, + HoodieEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + public List performClusteringWithRecordList( + final List> inputRecords, final int numOutputGroups, + final String instantTime, final Map strategyParams, final Schema schema, + final List fileGroupIdList, final boolean preserveHoodieMetadata) { + LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime); + + HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder() + .withBulkInsertParallelism(numOutputGroups) + .withEngineType(EngineType.JAVA) + .withProps(getWriteConfig().getProps()).build(); + newConfig.setValue(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE, String.valueOf(getWriteConfig().getClusteringTargetFileMaxBytes())); + return (List) JavaBulkInsertHelper.newInstance().bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig, + false, getPartitioner(strategyParams, schema), true, numOutputGroups, new CreateHandleFactory(preserveHoodieMetadata)); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java index a04a18b190964..2211c8a1030ae 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/HoodieJavaEngineContext.java @@ -19,39 +19,102 @@ package org.apache.hudi.client.common; import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.client.common.function.SerializableConsumer; -import org.apache.hudi.client.common.function.SerializableFunction; -import org.apache.hudi.client.common.function.SerializablePairFunction; + import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieAtomicLongAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.TaskContextSupplier; +import 
org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableConsumer; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; +import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; -import scala.Tuple2; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.stream.Collectors; import java.util.stream.Stream; import static java.util.stream.Collectors.toList; -import static org.apache.hudi.client.common.function.FunctionWrapper.throwingFlatMapWrapper; -import static org.apache.hudi.client.common.function.FunctionWrapper.throwingForeachWrapper; -import static org.apache.hudi.client.common.function.FunctionWrapper.throwingMapToPairWrapper; -import static org.apache.hudi.client.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; /** * A java engine implementation of HoodieEngineContext. */ public class HoodieJavaEngineContext extends HoodieEngineContext { + public HoodieJavaEngineContext(Configuration conf) { + this(conf, new JavaTaskContextSupplier()); + } + public HoodieJavaEngineContext(Configuration conf, TaskContextSupplier taskContextSupplier) { super(new SerializableConfiguration(conf), taskContextSupplier); } + @Override + public HoodieAccumulator newAccumulator() { + return HoodieAtomicLongAccumulator.create(); + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieListData.eager(Collections.emptyList()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieListData.eager(data); + } + @Override public List map(List data, SerializableFunction func, int parallelism) { return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList()); } + @Override + public List mapToPairAndReduceByKey(List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).get()) + .collect(Collectors.toList()); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey(Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() + .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( + Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + .filter(Objects::nonNull); + } + + @Override + 
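+ // Groups the pairs by key in memory and folds each group's values with reduceFunc; the parallelism hint is not used by this list-based engine.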
public List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel() + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList()); @@ -65,7 +128,7 @@ public void foreach(List data, SerializableConsumer consumer, int para @Override public Map mapToPair(List data, SerializablePairFunction func, Integer parallelism) { return data.stream().map(throwingMapToPairWrapper(func)).collect( - Collectors.toMap(Tuple2::_1, Tuple2::_2, (oldVal, newVal) -> newVal) + Collectors.toMap(Pair::getLeft, Pair::getRight, (oldVal, newVal) -> newVal) ); } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/JavaTaskContextSupplier.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/JavaTaskContextSupplier.java new file mode 100644 index 0000000000000..628201ccc25ae --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/common/JavaTaskContextSupplier.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.common; + +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.util.Option; + +import java.util.function.Supplier; + +public class JavaTaskContextSupplier extends TaskContextSupplier { + @Override + public Supplier getPartitionIdSupplier() { + return () -> 0; + } + + @Override + public Supplier getStageIdSupplier() { + return () -> 0; + } + + @Override + public Supplier getAttemptIdSupplier() { + return () -> 0L; + } + + @Override + public Option getProperty(EngineProperty prop) { + return Option.empty(); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/JavaLazyInsertIterable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/JavaLazyInsertIterable.java new file mode 100644 index 0000000000000..9821aedc875cd --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/JavaLazyInsertIterable.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.WriteHandleFactory; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; + +import java.util.Iterator; +import java.util.List; + +public class JavaLazyInsertIterable extends HoodieLazyInsertIterable { + public JavaLazyInsertIterable(Iterator> recordItr, + boolean areRecordsSorted, + HoodieWriteConfig config, + String instantTime, + HoodieTable hoodieTable, + String idPrefix, + TaskContextSupplier taskContextSupplier) { + super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier); + } + + public JavaLazyInsertIterable(Iterator> recordItr, + boolean areRecordsSorted, + HoodieWriteConfig config, + String instantTime, + HoodieTable hoodieTable, + String idPrefix, + TaskContextSupplier taskContextSupplier, + WriteHandleFactory writeHandleFactory) { + super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier, writeHandleFactory); + } + + @Override + protected List computeNext() { + // Executor service used for launching writer thread. 
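+ // The executor wires a bounded in-memory queue between a producer that drains the incoming record iterator
+ // and the insert handler that consumes and writes the records; the queue is capped at the configured write buffer size,
+ // and records are converted by the transform function as they are enqueued.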
+ BoundedInMemoryExecutor, HoodieInsertValueGenResult, List> bufferedIteratorExecutor = + null; + try { + final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema()); + bufferedIteratorExecutor = + new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(inputItr), Option.of(getInsertHandler()), getTransformFunction(schema)); + final List result = bufferedIteratorExecutor.execute(); + assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining(); + return result; + } catch (Exception e) { + throw new HoodieException(e); + } finally { + if (null != bufferedIteratorExecutor) { + bufferedIteratorExecutor.shutdownNow(); + bufferedIteratorExecutor.awaitTermination(); + } + } + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaBulkInsertInternalPartitionerFactory.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaBulkInsertInternalPartitionerFactory.java new file mode 100644 index 0000000000000..62523d3399054 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaBulkInsertInternalPartitionerFactory.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.BulkInsertPartitioner; + +/** + * A factory to generate built-in partitioner to repartition input records into at least + * expected number of output spark partitions for bulk insert operation. + */ +public abstract class JavaBulkInsertInternalPartitionerFactory { + + public static BulkInsertPartitioner get(BulkInsertSortMode sortMode) { + switch (sortMode) { + case NONE: + return new JavaNonSortPartitioner(); + case GLOBAL_SORT: + return new JavaGlobalSortPartitioner(); + default: + throw new HoodieException("The bulk insert sort mode \"" + sortMode.name() + + "\" is not supported in java client."); + } + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaCustomColumnsSortPartitioner.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaCustomColumnsSortPartitioner.java new file mode 100644 index 0000000000000..b9e466485f209 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaCustomColumnsSortPartitioner.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.avro.Schema; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * A partitioner that does sorting based on specified column values for Java client. + * + * @param HoodieRecordPayload type + */ +public class JavaCustomColumnsSortPartitioner + implements BulkInsertPartitioner>> { + + private final String[] sortColumnNames; + private final Schema schema; + private final boolean consistentLogicalTimestampEnabled; + + public JavaCustomColumnsSortPartitioner(String[] columnNames, Schema schema, boolean consistentLogicalTimestampEnabled) { + this.sortColumnNames = columnNames; + this.schema = schema; + this.consistentLogicalTimestampEnabled = consistentLogicalTimestampEnabled; + } + + @Override + public List> repartitionRecords( + List> records, int outputPartitions) { + return records.stream().sorted((o1, o2) -> { + Object values1 = HoodieAvroUtils.getRecordColumnValues(o1, sortColumnNames, schema, consistentLogicalTimestampEnabled); + Object values2 = HoodieAvroUtils.getRecordColumnValues(o2, sortColumnNames, schema, consistentLogicalTimestampEnabled); + return values1.toString().compareTo(values2.toString()); + }).collect(Collectors.toList()); + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaGlobalSortPartitioner.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaGlobalSortPartitioner.java new file mode 100644 index 0000000000000..d272849a19f28 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaGlobalSortPartitioner.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.BulkInsertPartitioner; + +import java.util.Comparator; +import java.util.List; + +/** + * A built-in partitioner that does global sorting for the input records across partitions + * after repartition for bulk insert operation, corresponding to the + * {@code BulkInsertSortMode.GLOBAL_SORT} mode. + * + * @param HoodieRecordPayload type + */ +public class JavaGlobalSortPartitioner + implements BulkInsertPartitioner>> { + + @Override + public List> repartitionRecords(List> records, + int outputPartitions) { + // Now, sort the records and line them up nicely for loading. + records.sort(new Comparator() { + @Override + public int compare(Object o1, Object o2) { + HoodieRecord o11 = (HoodieRecord) o1; + HoodieRecord o22 = (HoodieRecord) o2; + String left = new StringBuilder() + .append(o11.getPartitionPath()) + .append("+") + .append(o11.getRecordKey()) + .toString(); + String right = new StringBuilder() + .append(o22.getPartitionPath()) + .append("+") + .append(o22.getRecordKey()) + .toString(); + return left.compareTo(right); + } + }); + return records; + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaNonSortPartitioner.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaNonSortPartitioner.java new file mode 100644 index 0000000000000..b40459d838444 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/execution/bulkinsert/JavaNonSortPartitioner.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.BulkInsertPartitioner; + +import java.util.List; + +/** + * A built-in partitioner that only does coalesce for input records for bulk insert operation, + * corresponding to the {@code BulkInsertSortMode.NONE} mode. 
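+ * In the list-based Java client this is effectively a pass-through: the input records are returned unchanged
+ * and no sorting is applied.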
+ * + * @param HoodieRecordPayload type + */ +public class JavaNonSortPartitioner + implements BulkInsertPartitioner>> { + + @Override + public List> repartitionRecords(List> records, + int outputPartitions) { + return records; + } + + @Override + public boolean arePartitionRecordsSorted() { + return false; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java new file mode 100644 index 0000000000000..dcc9d050dcbe5 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.index; + +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIMethod; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.table.HoodieTable; + +import java.util.List; +import java.util.stream.Collectors; + +public abstract class JavaHoodieIndex extends HoodieIndex>, List> { + protected JavaHoodieIndex(HoodieWriteConfig config) { + super(config); + } + + @Override + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + public abstract List updateLocation(List writeStatuses, + HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; + + @Override + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + public abstract List> tagLocation(List> records, + HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException; + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + List> hoodieRecords = tagLocation(records.map(record -> (HoodieRecord) record).collectAsList(), context, hoodieTable); + return HoodieListData.eager(hoodieRecords.stream().map(r -> (HoodieRecord) r).collect(Collectors.toList())); + } + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieListData.eager(updateLocation(writeStatuses.collectAsList(), context, hoodieTable)); + } +} diff --git 
a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java new file mode 100644 index 0000000000000..9f4adad8ecf8a --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index; + +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.ListBasedHoodieBloomIndexHelper; +import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex; + +/** + * A factory to generate Java {@link HoodieIndex}. + */ +public final class JavaHoodieIndexFactory { + + public static HoodieIndex createIndex(HoodieWriteConfig config) { + // first use index class config to create index. + if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { + Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); + if (!(instance instanceof HoodieIndex)) { + throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); + } + return (HoodieIndex) instance; + } + + // TODO more indexes to be added + switch (config.getIndexType()) { + case INMEMORY: + return new HoodieInMemoryHashIndex(config); + case BLOOM: + return new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); + default: + throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); + } + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java new file mode 100644 index 0000000000000..342c018e5a269 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java @@ -0,0 +1,314 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieIndexCommitMetadata; +import org.apache.hudi.avro.model.HoodieIndexPlan; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.avro.model.HoodieSavepointMetadata; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieNotSupportedException; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.io.HoodieCreateHandle; +import org.apache.hudi.io.HoodieMergeHandle; +import org.apache.hudi.io.HoodieSortedMergeHandle; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; +import org.apache.hudi.table.action.clean.CleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; +import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor; +import org.apache.hudi.table.action.cluster.JavaExecuteClusteringCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaBulkInsertCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaDeleteCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaInsertCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaInsertOverwriteCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaInsertOverwriteTableCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaInsertPreppedCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaMergeHelper; +import org.apache.hudi.table.action.commit.JavaUpsertCommitActionExecutor; +import org.apache.hudi.table.action.commit.JavaUpsertPreppedCommitActionExecutor; +import org.apache.hudi.table.action.index.RunIndexActionExecutor; +import org.apache.hudi.table.action.index.ScheduleIndexActionExecutor; +import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import 
org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.RestorePlanActionExecutor; +import org.apache.hudi.table.action.savepoint.SavepointActionExecutor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +public class HoodieJavaCopyOnWriteTable + extends HoodieJavaTable implements HoodieCompactionHandler { + + private static final Logger LOG = LoggerFactory.getLogger(HoodieJavaCopyOnWriteTable.class); + + protected HoodieJavaCopyOnWriteTable(HoodieWriteConfig config, + HoodieEngineContext context, + HoodieTableMetaClient metaClient) { + super(config, context, metaClient); + } + + @Override + public HoodieWriteMetadata> upsert(HoodieEngineContext context, + String instantTime, + List> records) { + return new JavaUpsertCommitActionExecutor<>(context, config, + this, instantTime, records).execute(); + } + + @Override + public HoodieWriteMetadata> insert(HoodieEngineContext context, + String instantTime, + List> records) { + return new JavaInsertCommitActionExecutor<>(context, config, + this, instantTime, records).execute(); + } + + @Override + public HoodieWriteMetadata> bulkInsert(HoodieEngineContext context, + String instantTime, + List> records, + Option bulkInsertPartitioner) { + return new JavaBulkInsertCommitActionExecutor((HoodieJavaEngineContext) context, config, + this, instantTime, records, bulkInsertPartitioner).execute(); + } + + @Override + public HoodieWriteMetadata> delete(HoodieEngineContext context, + String instantTime, + List keys) { + return new JavaDeleteCommitActionExecutor<>(context, config, this, instantTime, keys).execute(); + } + + @Override + public HoodieWriteMetadata deletePartitions(HoodieEngineContext context, String instantTime, List partitions) { + throw new HoodieNotSupportedException("Delete partitions is not supported yet"); + } + + @Override + public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, + String instantTime, + List> preppedRecords) { + return new JavaUpsertPreppedCommitActionExecutor<>((HoodieJavaEngineContext) context, config, + this, instantTime, preppedRecords).execute(); + + } + + @Override + public HoodieWriteMetadata> insertPrepped(HoodieEngineContext context, + String instantTime, + List> preppedRecords) { + return new JavaInsertPreppedCommitActionExecutor<>((HoodieJavaEngineContext) context, config, + this, instantTime, preppedRecords).execute(); + } + + @Override + public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, + String instantTime, + List> preppedRecords, + Option bulkInsertPartitioner) { + return new JavaBulkInsertPreppedCommitActionExecutor((HoodieJavaEngineContext) context, config, + this, instantTime, preppedRecords, bulkInsertPartitioner).execute(); + } + + @Override + public HoodieWriteMetadata> insertOverwrite(HoodieEngineContext context, + String instantTime, + List> records) { + return new JavaInsertOverwriteCommitActionExecutor( + context, config, this, instantTime, records).execute(); + } + + @Override + public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineContext context, + String instantTime, + List> records) { + return new JavaInsertOverwriteTableCommitActionExecutor( + context, config, this, instantTime, records).execute(); + } + + @Override + public Option scheduleCompaction(HoodieEngineContext context, + String instantTime, + Option> extraMetadata) { + throw 
new HoodieNotSupportedException("ScheduleCompaction is not supported on a CopyOnWrite table"); + } + + @Override + public HoodieWriteMetadata> compact(HoodieEngineContext context, + String compactionInstantTime) { + throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); + } + + @Override + public Option scheduleClustering(final HoodieEngineContext context, final String instantTime, final Option> extraMetadata) { + return new ClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); + } + + @Override + public HoodieWriteMetadata> cluster(final HoodieEngineContext context, final String clusteringInstantTime) { + return new JavaExecuteClusteringCommitActionExecutor<>(context, config, this, clusteringInstantTime).execute(); + } + + @Override + public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngineContext context, + Option> extraMetadata) { + throw new HoodieNotSupportedException("Bootstrap is not supported yet"); + } + + @Override + public void rollbackBootstrap(HoodieEngineContext context, + String instantTime) { + throw new HoodieNotSupportedException("RollbackBootstrap is not supported yet"); + } + + @Override + public Option scheduleRollback(HoodieEngineContext context, String instantTime, HoodieInstant instantToRollback, + boolean skipTimelinePublish, boolean shouldRollbackUsingMarkers) { + return new BaseRollbackPlanActionExecutor(context, config, this, instantTime, instantToRollback, skipTimelinePublish, + shouldRollbackUsingMarkers).execute(); + } + + @Override + public Option scheduleCleaning(HoodieEngineContext context, String instantTime, Option> extraMetadata) { + return new CleanPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); + } + + @Override + public HoodieCleanMetadata clean(HoodieEngineContext context, + String cleanInstantTime, boolean skipLocking) { + return new CleanActionExecutor(context, config, this, cleanInstantTime).execute(); + } + + @Override + public HoodieRollbackMetadata rollback(HoodieEngineContext context, + String rollbackInstantTime, + HoodieInstant commitInstant, + boolean deleteInstants, + boolean skipLocking) { + return new CopyOnWriteRollbackActionExecutor( + context, config, this, rollbackInstantTime, commitInstant, deleteInstants, skipLocking).execute(); + } + + @Override + public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex) { + return new ScheduleIndexActionExecutor<>(context, config, this, indexInstantTime, partitionsToIndex).execute(); + } + + @Override + public Option index(HoodieEngineContext context, String indexInstantTime) { + return new RunIndexActionExecutor<>(context, config, this, indexInstantTime).execute(); + } + + @Override + public HoodieSavepointMetadata savepoint(HoodieEngineContext context, + String instantToSavepoint, + String user, + String comment) { + return new SavepointActionExecutor( + context, config, this, instantToSavepoint, user, comment).execute(); + } + + @Override + public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { + return new RestorePlanActionExecutor(context, config, this, restoreInstantTime, instantToRestore).execute(); + } + + @Override + public HoodieRestoreMetadata restore(HoodieEngineContext context, + String restoreInstantTime, + String instantToRestore) { + return new CopyOnWriteRestoreActionExecutor( + context, config, this, restoreInstantTime, instantToRestore).execute(); + } + + 
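+ // Merges incoming records for a file group against the existing base file: a merge handle is created
+ // (sorted when the table requires sorted records) and the actual merge is delegated to JavaMergeHelper.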
@Override + public Iterator> handleUpdate( + String instantTime, String partitionPath, String fileId, + Map> keyToNewRecords, HoodieBaseFile oldDataFile) + throws IOException { + // these are updates + HoodieMergeHandle upsertHandle = getUpdateHandle(instantTime, partitionPath, fileId, keyToNewRecords, oldDataFile); + return handleUpdateInternal(upsertHandle, instantTime, fileId); + } + + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, + String fileId) throws IOException { + if (upsertHandle.getOldFilePath() == null) { + throw new HoodieUpsertException( + "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); + } else { + JavaMergeHelper.newInstance().runMerge(this, upsertHandle); + } + + // TODO(yihua): This needs to be revisited + if (upsertHandle.getPartitionPath() == null) { + LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " + + upsertHandle.writeStatuses()); + } + + return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); + } + + protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId, + Map> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) { + if (requireSortedRecords()) { + return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId, + dataFileToBeMerged, taskContextSupplier, Option.empty()); + } else { + return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId, + dataFileToBeMerged, taskContextSupplier, Option.empty()); + } + } + + @Override + public Iterator> handleInsert( + String instantTime, String partitionPath, String fileId, + Map> recordMap) { + HoodieCreateHandle createHandle = + new HoodieCreateHandle(config, instantTime, this, partitionPath, fileId, recordMap, taskContextSupplier); + createHandle.write(); + return Collections.singletonList(createHandle.close()).iterator(); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaMergeOnReadTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaMergeOnReadTable.java new file mode 100644 index 0000000000000..5af29502a95cd --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaMergeOnReadTable.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor; +import org.apache.hudi.table.action.compact.HoodieJavaMergeOnReadTableCompactor; +import org.apache.hudi.table.action.compact.RunCompactionActionExecutor; +import org.apache.hudi.table.action.compact.ScheduleCompactionActionExecutor; +import org.apache.hudi.table.action.deltacommit.JavaUpsertPreppedDeltaCommitActionExecutor; + +import java.util.List; +import java.util.Map; + +public class HoodieJavaMergeOnReadTable extends HoodieJavaCopyOnWriteTable { + protected HoodieJavaMergeOnReadTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { + super(config, context, metaClient); + } + + @Override + public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, + String instantTime, + List> preppedRecords) { + return new JavaUpsertPreppedDeltaCommitActionExecutor<>((HoodieJavaEngineContext) context, config, + this, instantTime, preppedRecords).execute(); + + } + + @Override + public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, + String instantTime, + List> preppedRecords, + Option bulkInsertPartitioner) { + return new JavaBulkInsertPreppedCommitActionExecutor((HoodieJavaEngineContext) context, config, + this, instantTime, preppedRecords, bulkInsertPartitioner).execute(); + } + + @Override + public Option scheduleCompaction(HoodieEngineContext context, String instantTime, Option> extraMetadata) { + ScheduleCompactionActionExecutor scheduleCompactionExecutor = new ScheduleCompactionActionExecutor( + context, config, this, instantTime, extraMetadata, + new HoodieJavaMergeOnReadTableCompactor()); + return scheduleCompactionExecutor.execute(); + } + + @Override + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { + RunCompactionActionExecutor compactionExecutor = new RunCompactionActionExecutor( + context, config, this, compactionInstantTime, new HoodieJavaMergeOnReadTableCompactor(), + new HoodieJavaCopyOnWriteTable(config, context, getMetaClient())); + return convertMetadata(compactionExecutor.execute()); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaTable.java new file mode 100644 index 0000000000000..3c878cbc14cf8 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaTable.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.JavaHoodieIndexFactory; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +public abstract class HoodieJavaTable + extends HoodieTable>, List, List> { + protected HoodieJavaTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { + super(config, context, metaClient); + } + + public static HoodieJavaTable create(HoodieWriteConfig config, HoodieEngineContext context) { + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(config.getBasePath()) + .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))).build(); + return HoodieJavaTable.create(config, (HoodieJavaEngineContext) context, metaClient); + } + + public static HoodieJavaTable create(HoodieWriteConfig config, + HoodieJavaEngineContext context, + HoodieTableMetaClient metaClient) { + switch (metaClient.getTableType()) { + case COPY_ON_WRITE: + return new HoodieJavaCopyOnWriteTable<>(config, context, metaClient); + case MERGE_ON_READ: + return new HoodieJavaMergeOnReadTable<>(config, context, metaClient); + default: + throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); + } + } + + public static HoodieWriteMetadata> convertMetadata( + HoodieWriteMetadata> metadata) { + return metadata.clone(metadata.getWriteStatuses().collectAsList()); + } + + @Override + protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context) { + return JavaHoodieIndexFactory.createIndex(config); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java new file mode 100644 index 0000000000000..168d558143bd3 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.cluster; + +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.BaseJavaCommitActionExecutor; + +import java.util.List; + +public class JavaExecuteClusteringCommitActionExecutor> + extends BaseJavaCommitActionExecutor { + + private final HoodieClusteringPlan clusteringPlan; + + public JavaExecuteClusteringCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime) { + super(context, config, table, instantTime, WriteOperationType.CLUSTER); + this.clusteringPlan = ClusteringUtils.getClusteringPlan( + table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime)) + .map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException( + "Unable to read clustering plan for instant: " + instantTime)); + } + + @Override + public HoodieWriteMetadata> execute() { + HoodieWriteMetadata> writeMetadata = executeClustering(clusteringPlan); + List transformedWriteStatuses = writeMetadata.getWriteStatuses().collectAsList(); + return writeMetadata.clone(transformedWriteStatuses); + } + + @Override + protected String getCommitActionType() { + return HoodieTimeline.REPLACE_COMMIT_ACTION; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java new file mode 100644 index 0000000000000..22c90eb8bb445 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.execution.JavaLazyInsertIterable; +import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.io.HoodieMergeHandle; +import org.apache.hudi.io.HoodieSortedMergeHandle; +import org.apache.hudi.io.HoodieConcatHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.WorkloadStat; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.time.Instant; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public abstract class BaseJavaCommitActionExecutor extends + BaseCommitActionExecutor>, List, List, HoodieWriteMetadata> { + + private static final Logger LOG = LogManager.getLogger(BaseJavaCommitActionExecutor.class); + + public BaseJavaCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + WriteOperationType operationType) { + super(context, config, table, instantTime, operationType, Option.empty()); + } + + public BaseJavaCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + WriteOperationType operationType, + Option extraMetadata) { + super(context, config, table, instantTime, operationType, extraMetadata); + } + + @Override + public HoodieWriteMetadata> execute(List> inputRecords) { + HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); + + WorkloadProfile workloadProfile = null; + if (isWorkloadProfileNeeded()) { + workloadProfile = new WorkloadProfile(buildProfile(inputRecords), 
table.getIndex().canIndexLogFiles()); + LOG.info("Input workload profile :" + workloadProfile); + } + + final Partitioner partitioner = getPartitioner(workloadProfile); + try { + saveWorkloadProfileMetadataToInflight(workloadProfile, instantTime); + } catch (Exception e) { + HoodieTableMetaClient metaClient = table.getMetaClient(); + HoodieInstant inflightInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, metaClient.getCommitActionType(), instantTime); + try { + if (!metaClient.getFs().exists(new Path(metaClient.getMetaPath(), inflightInstant.getFileName()))) { + throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", e); + } + } catch (IOException ex) { + LOG.error("Check file exists failed"); + throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", ex); + } + } + Map>> partitionedRecords = partition(inputRecords, partitioner); + + List writeStatuses = new LinkedList<>(); + partitionedRecords.forEach((partition, records) -> { + if (WriteOperationType.isChangingRecords(operationType)) { + handleUpsertPartition(instantTime, partition, records.iterator(), partitioner).forEachRemaining(writeStatuses::addAll); + } else { + handleInsertPartition(instantTime, partition, records.iterator(), partitioner).forEachRemaining(writeStatuses::addAll); + } + }); + updateIndex(writeStatuses, result); + updateIndexAndCommitIfNeeded(writeStatuses, result); + return result; + } + + protected List updateIndex(List writeStatuses, HoodieWriteMetadata> result) { + Instant indexStartTime = Instant.now(); + // Update the index back + List statuses = table.getIndex().updateLocation(HoodieListData.eager(writeStatuses), context, table).collectAsList(); + result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); + result.setWriteStatuses(statuses); + return statuses; + } + + @Override + protected String getCommitActionType() { + return table.getMetaClient().getCommitActionType(); + } + + private Partitioner getPartitioner(WorkloadProfile profile) { + if (WriteOperationType.isChangingRecords(operationType)) { + return getUpsertPartitioner(profile); + } else { + return getInsertPartitioner(profile); + } + } + + private Map>> partition(List> dedupedRecords, Partitioner partitioner) { + Map>, HoodieRecord>>> partitionedMidRecords = dedupedRecords + .stream() + .map(record -> Pair.of(Pair.of(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)) + .collect(Collectors.groupingBy(x -> partitioner.getPartition(x.getLeft()))); + Map>> results = new LinkedHashMap<>(); + partitionedMidRecords.forEach((key, value) -> results.put(key, value.stream().map(x -> x.getRight()).collect(Collectors.toList()))); + return results; + } + + protected Pair, WorkloadStat> buildProfile(List> inputRecords) { + HashMap partitionPathStatMap = new HashMap<>(); + WorkloadStat globalStat = new WorkloadStat(); + + Map>, Long> partitionLocationCounts = inputRecords + .stream() + .map(record -> Pair.of( + Pair.of(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record)) + .collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting())); + + for (Map.Entry>, Long> e : partitionLocationCounts.entrySet()) { + String partitionPath = e.getKey().getLeft(); + Long count = e.getValue(); + Option locOption = e.getKey().getRight(); + + if (!partitionPathStatMap.containsKey(partitionPath)) { + partitionPathStatMap.put(partitionPath, new WorkloadStat()); + } + + if (locOption.isPresent()) 
{ + // update + partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count); + globalStat.addUpdates(locOption.get(), count); + } else { + // insert + partitionPathStatMap.get(partitionPath).addInserts(count); + globalStat.addInserts(count); + } + } + return Pair.of(partitionPathStatMap, globalStat); + } + + @Override + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { + commit(extraMetadata, result, result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList())); + } + + protected void setCommitMetadata(HoodieWriteMetadata> result) { + result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList()), + result.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()))); + } + + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List writeStats) { + String actionType = getCommitActionType(); + LOG.info("Committing " + instantTime + ", action Type " + actionType); + result.setCommitted(true); + result.setWriteStats(writeStats); + // Finalize write + finalizeWrite(instantTime, writeStats, result); + try { + LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType()); + HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); + HoodieCommitMetadata metadata = result.getCommitMetadata().get(); + + writeTableMetadata(metadata, actionType); + + activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), + Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + LOG.info("Committed " + instantTime); + result.setCommitMetadata(Option.of(metadata)); + } catch (IOException e) { + throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime, + e); + } + } + + protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeMetadata) { + return Collections.emptyMap(); + } + + @Override + protected boolean isWorkloadProfileNeeded() { + return true; + } + + @SuppressWarnings("unchecked") + protected Iterator> handleUpsertPartition(String instantTime, Integer partition, Iterator recordItr, + Partitioner partitioner) { + JavaUpsertPartitioner javaUpsertPartitioner = (JavaUpsertPartitioner) partitioner; + BucketInfo binfo = javaUpsertPartitioner.getBucketInfo(partition); + BucketType btype = binfo.bucketType; + try { + if (btype.equals(BucketType.INSERT)) { + return handleInsert(binfo.fileIdPrefix, recordItr); + } else if (btype.equals(BucketType.UPDATE)) { + return handleUpdate(binfo.partitionPath, binfo.fileIdPrefix, recordItr); + } else { + throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition); + } + } catch (Throwable t) { + String msg = "Error upserting bucketType " + btype + " for partition :" + partition; + LOG.error(msg, t); + throw new HoodieUpsertException(msg, t); + } + } + + protected Iterator> handleInsertPartition(String instantTime, Integer partition, Iterator recordItr, + Partitioner partitioner) { + return handleUpsertPartition(instantTime, partition, recordItr, partitioner); + } + + @Override + public Iterator> handleUpdate(String partitionPath, String fileId, + Iterator> recordItr) + throws IOException { + // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records + if (!recordItr.hasNext()) { + LOG.info("Empty partition with fileId => " + 
fileId); + return Collections.singletonList((List) Collections.EMPTY_LIST).iterator(); + } + // these are updates + HoodieMergeHandle upsertHandle = getUpdateHandle(partitionPath, fileId, recordItr); + return handleUpdateInternal(upsertHandle, fileId); + } + + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) + throws IOException { + if (upsertHandle.getOldFilePath() == null) { + throw new HoodieUpsertException( + "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); + } else { + JavaMergeHelper.newInstance().runMerge(table, upsertHandle); + } + + List statuses = upsertHandle.writeStatuses(); + if (upsertHandle.getPartitionPath() == null) { + LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " + statuses); + } + return Collections.singletonList(statuses).iterator(); + } + + protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator> recordItr) { + if (table.requireSortedRecords()) { + return new HoodieSortedMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty()); + } else if (!WriteOperationType.isChangingRecords(operationType) && config.allowDuplicateInserts()) { + return new HoodieConcatHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty()); + } else { + return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty()); + } + } + + protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, + Map> keyToNewRecords, + HoodieBaseFile dataFileToBeMerged) { + return new HoodieMergeHandle<>(config, instantTime, table, keyToNewRecords, + partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, Option.empty()); + } + + @Override + public Iterator> handleInsert(String idPfx, Iterator> recordItr) { + // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records + if (!recordItr.hasNext()) { + LOG.info("Empty partition"); + return Collections.singletonList((List) Collections.EMPTY_LIST).iterator(); + } + return new JavaLazyInsertIterable<>(recordItr, true, config, instantTime, table, idPfx, + taskContextSupplier, new CreateHandleFactory<>()); + } + + /** + * Provides a partitioner to perform the upsert operation, based on the workload profile. + */ + public Partitioner getUpsertPartitioner(WorkloadProfile profile) { + if (profile == null) { + throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); + } + return new JavaUpsertPartitioner(profile, context, table, config); + } + + /** + * Provides a partitioner to perform the insert operation, based on the workload profile. 
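+   * For the Java engine this simply delegates to the upsert partitioner, so inserts and upserts share the same bucket assignment.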
+ */ + public Partitioner getInsertPartitioner(WorkloadProfile profile) { + return getUpsertPartitioner(profile); + } + + public void updateIndexAndCommitIfNeeded(List writeStatuses, HoodieWriteMetadata result) { + Instant indexStartTime = Instant.now(); + // Update the index back + List statuses = table.getIndex().updateLocation(HoodieListData.eager(writeStatuses), context, table).collectAsList(); + result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); + result.setWriteStatuses(statuses); + result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(result)); + commitOnAutoCommit(result); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertCommitActionExecutor.java new file mode 100644 index 0000000000000..d5c7a0b0b5dc7 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertCommitActionExecutor.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; +import java.util.Map; + +public class JavaBulkInsertCommitActionExecutor> extends BaseJavaCommitActionExecutor { + + private final List> inputRecords; + private final Option bulkInsertPartitioner; + + public JavaBulkInsertCommitActionExecutor(HoodieJavaEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, List> inputRecords, + Option bulkInsertPartitioner) { + this(context, config, table, instantTime, inputRecords, bulkInsertPartitioner, Option.empty()); + } + + public JavaBulkInsertCommitActionExecutor(HoodieJavaEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, List> inputRecords, + Option bulkInsertPartitioner, + Option> extraMetadata) { + super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata); + this.inputRecords = inputRecords; + this.bulkInsertPartitioner = bulkInsertPartitioner; + } + + @Override + public HoodieWriteMetadata> execute() { + try { + return JavaBulkInsertHelper.newInstance().bulkInsert(inputRecords, instantTime, table, config, + this, true, bulkInsertPartitioner); + } catch (HoodieInsertException ie) { + throw ie; + } catch (Throwable e) { + throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e); + } + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java new file mode 100644 index 0000000000000..e126372aa9068 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.execution.JavaLazyInsertIterable; +import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory; +import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.io.WriteHandleFactory; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.FileIdPrefixProvider; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.ArrayList; +import java.util.List; + +/** + * A java implementation of {@link BaseBulkInsertHelper}. + * + * @param + */ +@SuppressWarnings("checkstyle:LineLength") +public class JavaBulkInsertHelper extends BaseBulkInsertHelper>, + List, List, R> { + + private JavaBulkInsertHelper() { + } + + private static class BulkInsertHelperHolder { + private static final JavaBulkInsertHelper JAVA_BULK_INSERT_HELPER = new JavaBulkInsertHelper(); + } + + public static JavaBulkInsertHelper newInstance() { + return BulkInsertHelperHolder.JAVA_BULK_INSERT_HELPER; + } + + @Override + public HoodieWriteMetadata> bulkInsert(final List> inputRecords, + final String instantTime, + final HoodieTable>, List, List> table, + final HoodieWriteConfig config, + final BaseCommitActionExecutor>, List, List, R> executor, + final boolean performDedupe, + final Option userDefinedBulkInsertPartitioner) { + HoodieWriteMetadata result = new HoodieWriteMetadata(); + + // It's possible the transition to inflight could have already happened. 
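+     // Only transition the instant from REQUESTED to INFLIGHT if no inflight instant exists yet for this commit time.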
+ if (!table.getActiveTimeline().filterInflights().containsInstant(instantTime)) { + table.getActiveTimeline().transitionRequestedToInflight( + new HoodieInstant(HoodieInstant.State.REQUESTED, table.getMetaClient().getCommitActionType(), instantTime), + Option.empty(), + config.shouldAllowMultiWriteOnSameInstant()); + } + + BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.orElse(JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode())); + + // write new files + List writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, partitioner, false, + config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false)); + //update index + ((BaseJavaCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatuses, result); + return result; + } + + @Override + public List bulkInsert(List> inputRecords, + String instantTime, + HoodieTable>, List, List> table, + HoodieWriteConfig config, + boolean performDedupe, + BulkInsertPartitioner partitioner, + boolean useWriterSchema, + int parallelism, + WriteHandleFactory writeHandleFactory) { + + // De-dupe/merge if needed + List> dedupedRecords = inputRecords; + + if (performDedupe) { + dedupedRecords = (List>) JavaWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, + parallelism, table); + } + + final List> repartitionedRecords = (List>) partitioner.repartitionRecords(dedupedRecords, parallelism); + + FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass( + config.getFileIdPrefixProviderClassName(), + new TypedProperties(config.getProps())); + + List writeStatuses = new ArrayList<>(); + + new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, + config, instantTime, table, + fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(), + // Always get the first WriteHandleFactory, as there is only a single data partition for hudi java engine. + (WriteHandleFactory) partitioner.getWriteHandleFactory(0).orElse(writeHandleFactory)).forEachRemaining(writeStatuses::addAll); + + return writeStatuses; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertPreppedCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertPreppedCommitActionExecutor.java new file mode 100644 index 0000000000000..14c4c8a93e916 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertPreppedCommitActionExecutor.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +public class JavaBulkInsertPreppedCommitActionExecutor> + extends BaseJavaCommitActionExecutor { + + private final List> preppedInputRecord; + private final Option userDefinedBulkInsertPartitioner; + + public JavaBulkInsertPreppedCommitActionExecutor(HoodieJavaEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List> preppedInputRecord, + Option userDefinedBulkInsertPartitioner) { + super(context, config, table, instantTime, WriteOperationType.BULK_INSERT); + this.preppedInputRecord = preppedInputRecord; + this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner; + } + + @Override + public HoodieWriteMetadata> execute() { + try { + return JavaBulkInsertHelper.newInstance().bulkInsert(preppedInputRecord, instantTime, table, config, + this, false, userDefinedBulkInsertPartitioner); + } catch (Throwable e) { + if (e instanceof HoodieInsertException) { + throw e; + } + throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e); + } + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteCommitActionExecutor.java new file mode 100644 index 0000000000000..72c2332645cf3 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteCommitActionExecutor.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +public class JavaDeleteCommitActionExecutor> extends BaseJavaCommitActionExecutor { + private final List keys; + + public JavaDeleteCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List keys) { + super(context, config, table, instantTime, WriteOperationType.DELETE); + this.keys = keys; + } + + @Override + public HoodieWriteMetadata> execute() { + return JavaDeleteHelper.newInstance().execute(instantTime, keys, context, config, table, this); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java new file mode 100644 index 0000000000000..57d796c925298 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.WorkloadStat; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.time.Duration; +import java.time.Instant; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.stream.Collectors; + +@SuppressWarnings("checkstyle:LineLength") +public class JavaDeleteHelper extends + BaseDeleteHelper>, List, List, R> { + + private JavaDeleteHelper() { + } + + private static class DeleteHelperHolder { + private static final JavaDeleteHelper JAVA_DELETE_HELPER = new JavaDeleteHelper(); + } + + public static JavaDeleteHelper newInstance() { + return DeleteHelperHolder.JAVA_DELETE_HELPER; + } + + @Override + public List deduplicateKeys(List keys, + HoodieTable>, List, List> table, + int parallelism) { + boolean isIndexingGlobal = table.getIndex().isGlobal(); + if (isIndexingGlobal) { + HashSet recordKeys = keys.stream().map(HoodieKey::getRecordKey).collect(Collectors.toCollection(HashSet::new)); + List deduplicatedKeys = new LinkedList<>(); + keys.forEach(x -> { + if (recordKeys.contains(x.getRecordKey())) { + deduplicatedKeys.add(x); + } + }); + return deduplicatedKeys; + } else { + HashSet set = new HashSet<>(keys); + keys.clear(); + keys.addAll(set); + return keys; + } + } + + @Override + public HoodieWriteMetadata> execute(String instantTime, + List keys, + HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable>, List, List> table, + BaseCommitActionExecutor>, List, List, R> deleteExecutor) { + try { + HoodieWriteMetadata> result = null; + List dedupedKeys = keys; + final int parallelism = config.getDeleteShuffleParallelism(); + if (config.shouldCombineBeforeDelete()) { + // De-dupe/merge if needed + dedupedKeys = deduplicateKeys(keys, table, parallelism); + } + + List> dedupedRecords = + dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); + Instant beginTag = Instant.now(); + // perform index look up to get existing location of records + List> taggedRecords = table.getIndex().tagLocation(HoodieListData.eager(dedupedRecords), context, table).collectAsList(); + Duration tagLocationDuration = Duration.between(beginTag, Instant.now()); + + // filter out non existent keys/records + List> taggedValidRecords = taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).collect(Collectors.toList()); + if (!taggedValidRecords.isEmpty()) { + result = deleteExecutor.execute(taggedValidRecords); + result.setIndexLookupDuration(tagLocationDuration); + } else { + // if entire set of keys are non existent + deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime); + result = new HoodieWriteMetadata<>(); + 
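+         // None of the keys matched existing records, so record an empty write result and let auto-commit complete the instant.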
result.setWriteStatuses(Collections.EMPTY_LIST); + deleteExecutor.commitOnAutoCommit(result); + } + return result; + } catch (Throwable e) { + if (e instanceof HoodieUpsertException) { + throw (HoodieUpsertException) e; + } + throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e); + } + } + +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertCommitActionExecutor.java new file mode 100644 index 0000000000000..c1fae07a9d6c2 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertCommitActionExecutor.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +public class JavaInsertCommitActionExecutor> extends BaseJavaCommitActionExecutor { + + private List> inputRecords; + + public JavaInsertCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List> inputRecords) { + super(context, config, table, instantTime, WriteOperationType.INSERT); + this.inputRecords = inputRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + return JavaWriteHelper.newInstance().write(instantTime, inputRecords, context, table, + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteCommitActionExecutor.java new file mode 100644 index 0000000000000..a99485e67bb81 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteCommitActionExecutor.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class JavaInsertOverwriteCommitActionExecutor> + extends BaseJavaCommitActionExecutor { + + private final List> inputRecords; + + public JavaInsertOverwriteCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List> inputRecords) { + this(context, config, table, instantTime, inputRecords, WriteOperationType.INSERT_OVERWRITE); + } + + public JavaInsertOverwriteCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List> inputRecords, + WriteOperationType writeOperationType) { + super(context, config, table, instantTime, writeOperationType); + this.inputRecords = inputRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + return JavaWriteHelper.newInstance().write(instantTime, inputRecords, context, table, + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType); + } + + @Override + protected String getCommitActionType() { + return HoodieTimeline.REPLACE_COMMIT_ACTION; + } + + @Override + protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeResult) { + return context.mapToPair( + writeResult.getWriteStatuses().stream().map(status -> status.getStat().getPartitionPath()).distinct().collect(Collectors.toList()), + partitionPath -> + Pair.of(partitionPath, getAllExistingFileIds(partitionPath)), 1 + ); + } + + private List getAllExistingFileIds(String partitionPath) { + // because new commit is not complete. 
it is safe to mark all existing file Ids as old files + return table.getSliceView().getLatestFileSlices(partitionPath).map(fg -> fg.getFileId()).distinct().collect(Collectors.toList()); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java new file mode 100644 index 0000000000000..a52ab6e0f3d0c --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertOverwriteTableCommitActionExecutor.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class JavaInsertOverwriteTableCommitActionExecutor> + extends JavaInsertOverwriteCommitActionExecutor { + + public JavaInsertOverwriteTableCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List> inputRecords) { + super(context, config, table, instantTime, inputRecords, WriteOperationType.INSERT_OVERWRITE_TABLE); + } + + protected List getAllExistingFileIds(String partitionPath) { + return table.getSliceView().getLatestFileSlices(partitionPath) + .map(fg -> fg.getFileId()).distinct().collect(Collectors.toList()); + } + + @Override + protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeResult) { + Map> partitionToExistingFileIds = new HashMap<>(); + List partitionPaths = FSUtils.getAllPartitionPaths(context, + table.getMetaClient().getBasePath(), config.isMetadataTableEnabled(), config.shouldAssumeDatePartitioning()); + + if (partitionPaths != null && partitionPaths.size() > 0) { + partitionToExistingFileIds = context.mapToPair(partitionPaths, + partitionPath -> Pair.of(partitionPath, getAllExistingFileIds(partitionPath)), 1); + } + return partitionToExistingFileIds; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertPreppedCommitActionExecutor.java 
b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertPreppedCommitActionExecutor.java new file mode 100644 index 0000000000000..349cf69dcc30b --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaInsertPreppedCommitActionExecutor.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +public class JavaInsertPreppedCommitActionExecutor> + extends BaseJavaCommitActionExecutor { + + private final List> preppedRecords; + + public JavaInsertPreppedCommitActionExecutor(HoodieJavaEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List> preppedRecords) { + super(context, config, table, instantTime, WriteOperationType.INSERT_PREPPED); + this.preppedRecords = preppedRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + return super.execute(preppedRecords); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java new file mode 100644 index 0000000000000..d95daf896c4da --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.utils.MergingIterator; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.HoodieMergeHandle; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryDecoder; +import org.apache.avro.io.BinaryEncoder; +import org.apache.hadoop.conf.Configuration; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +public class JavaMergeHelper extends BaseMergeHelper>, + List, List> { + + private JavaMergeHelper() { + } + + private static class MergeHelperHolder { + private static final JavaMergeHelper JAVA_MERGE_HELPER = new JavaMergeHelper(); + } + + public static JavaMergeHelper newInstance() { + return JavaMergeHelper.MergeHelperHolder.JAVA_MERGE_HELPER; + } + + @Override + public void runMerge(HoodieTable>, List, List> table, + HoodieMergeHandle>, List, List> upsertHandle) throws IOException { + final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation(); + Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf()); + HoodieMergeHandle>, List, List> mergeHandle = upsertHandle; + HoodieBaseFile baseFile = mergeHandle.baseFileForMerge(); + + Configuration hadoopConf = new Configuration(table.getHadoopConf()); + HoodieFileReader baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf, mergeHandle.getOldFilePath()); + HoodieFileReader bootstrapFileReader = null; + + final GenericDatumWriter gWriter; + final GenericDatumReader gReader; + Schema readSchema; + if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) { + readSchema = baseFileReader.getSchema(); + gWriter = new GenericDatumWriter<>(readSchema); + gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields()); + } else { + gReader = null; + gWriter = null; + readSchema = mergeHandle.getWriterSchemaWithMetaFields(); + } + + BoundedInMemoryExecutor wrapper = null; + try { + final Iterator readerIterator; + if (baseFile.getBootstrapBaseFile().isPresent()) { + Path bootstrapFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath()); + Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf()); + bootstrapFileReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, bootstrapFilePath); + // NOTE: It's important for us to rely on writer's schema here + // - When records will be read by Parquet reader, if schema will be decoded from the + // file itself by taking its Parquet one and converting it to Avro. 
This will be problematic + // w/ schema validations of the records since Avro's schemas also validate corresponding + // qualified names of the structs, which could not be reconstructed when converting from + // Parquet to Avro (b/c Parquet doesn't bear these) + Schema bootstrapSchema = externalSchemaTransformation ? bootstrapFileReader.getSchema() : mergeHandle.getWriterSchema(); + readerIterator = new MergingIterator<>( + baseFileReader.getRecordIterator(readSchema), + bootstrapFileReader.getRecordIterator(bootstrapSchema), + (inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetaFields())); + } else { + readerIterator = baseFileReader.getRecordIterator(readSchema); + } + + ThreadLocal encoderCache = new ThreadLocal<>(); + ThreadLocal decoderCache = new ThreadLocal<>(); + wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), + Option.of(new UpdateHandler(mergeHandle)), record -> { + if (!externalSchemaTransformation) { + return record; + } + return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record); + }); + wrapper.execute(); + } catch (Exception e) { + throw new HoodieException(e); + } finally { + // HUDI-2875: mergeHandle is not thread safe, we should totally terminate record inputting + // and executor firstly and then close mergeHandle. + baseFileReader.close(); + if (bootstrapFileReader != null) { + bootstrapFileReader.close(); + } + if (null != wrapper) { + wrapper.shutdownNow(); + wrapper.awaitTermination(); + } + mergeHandle.close(); + } + } + +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertCommitActionExecutor.java new file mode 100644 index 0000000000000..ed0af4402869d --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertCommitActionExecutor.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +public class JavaUpsertCommitActionExecutor> extends BaseJavaCommitActionExecutor { + + private List> inputRecords; + + public JavaUpsertCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + List> inputRecords) { + super(context, config, table, instantTime, WriteOperationType.UPSERT); + this.inputRecords = inputRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + return JavaWriteHelper.newInstance().write(instantTime, inputRecords, context, table, + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, operationType); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPartitioner.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPartitioner.java new file mode 100644 index 0000000000000..fb19259b55591 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPartitioner.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.NumericUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.WorkloadStat; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Packs incoming records to be upserted, into buckets. + */ +public class JavaUpsertPartitioner> implements Partitioner { + + private static final Logger LOG = LogManager.getLogger(JavaUpsertPartitioner.class); + + /** + * List of all small files to be corrected. + */ + protected List smallFiles = new ArrayList<>(); + /** + * Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into. + */ + private int totalBuckets = 0; + /** + * Stat for the input and output workload. Describe the workload before and after being assigned buckets. + */ + private WorkloadProfile workloadProfile; + /** + * Helps decide which bucket an incoming update should go to. + */ + private HashMap updateLocationToBucket; + /** + * Helps us pack inserts into 1 or more buckets depending on number of incoming records. + */ + private HashMap> partitionPathToInsertBucketInfos; + /** + * Remembers what type each bucket is for later. 
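+   * Keyed by bucket number; each entry holds the BucketInfo (bucket type, file id prefix and partition path).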
+ */ + private HashMap bucketInfoMap; + + protected final HoodieTable table; + + protected final HoodieWriteConfig config; + + public JavaUpsertPartitioner(WorkloadProfile workloadProfile, HoodieEngineContext context, HoodieTable table, + HoodieWriteConfig config) { + updateLocationToBucket = new HashMap<>(); + partitionPathToInsertBucketInfos = new HashMap<>(); + bucketInfoMap = new HashMap<>(); + this.workloadProfile = workloadProfile; + this.table = table; + this.config = config; + assignUpdates(workloadProfile); + assignInserts(workloadProfile, context); + + LOG.info("Total Buckets :" + totalBuckets + ", buckets info => " + bucketInfoMap + ", \n" + + "Partition to insert buckets => " + partitionPathToInsertBucketInfos + ", \n" + + "UpdateLocations mapped to buckets =>" + updateLocationToBucket); + } + + private void assignUpdates(WorkloadProfile profile) { + // each update location gets a partition + Set> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet(); + for (Map.Entry partitionStat : partitionStatEntries) { + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat()); + for (Map.Entry> updateLocEntry : + partitionStat.getValue().getUpdateLocationToCount().entrySet()) { + addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey()); + if (profile.hasOutputWorkLoadStats()) { + HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey()); + outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue()); + } + } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats); + } + } + } + + private int addUpdateBucket(String partitionPath, String fileIdHint) { + int bucket = totalBuckets; + updateLocationToBucket.put(fileIdHint, bucket); + BucketInfo bucketInfo = new BucketInfo(BucketType.UPDATE, fileIdHint, partitionPath); + bucketInfoMap.put(totalBuckets, bucketInfo); + totalBuckets++; + return bucket; + } + + private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) { + // for new inserts, compute buckets depending on how many records we have for each partition + Set partitionPaths = profile.getPartitionPaths(); + long averageRecordSize = + averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(), + config); + LOG.info("AvgRecordSize => " + averageRecordSize); + + Map> partitionSmallFilesMap = + getSmallFilesForPartitions(new ArrayList(partitionPaths), context); + + for (String partitionPath : partitionPaths) { + WorkloadStat pStat = profile.getWorkloadStat(partitionPath); + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat()); + if (pStat.getNumInserts() > 0) { + + List smallFiles = partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()); + this.smallFiles.addAll(smallFiles); + + LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles); + + long totalUnassignedInserts = pStat.getNumInserts(); + List bucketNumbers = new ArrayList<>(); + List recordsPerBucket = new ArrayList<>(); + + // first try packing this into one of the smallFiles + for (SmallFile smallFile : smallFiles) { + long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, + totalUnassignedInserts); + if (recordsToAppend > 0) { + 
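+               // recordsToAppend is bounded by the space left in the small file: (max parquet file size - current size) / average record size.
+               // For example (illustrative numbers only): with a 120 MB target file size, a 90 MB small file and ~1 KB average records, roughly 30,000 inserts can be appended here.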
// create a new bucket or re-use an existing bucket + int bucket; + if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { + bucket = updateLocationToBucket.get(smallFile.location.getFileId()); + LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket); + } else { + bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId()); + LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); + } + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(smallFile.location, recordsToAppend); + } + bucketNumbers.add(bucket); + recordsPerBucket.add(recordsToAppend); + totalUnassignedInserts -= recordsToAppend; + } + } + + // if we have anything more, create new insert buckets, like normal + if (totalUnassignedInserts > 0) { + long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize(); + if (config.shouldAutoTuneInsertSplits()) { + insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize; + } + + int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket); + LOG.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts + + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket); + for (int b = 0; b < insertBuckets; b++) { + bucketNumbers.add(totalBuckets); + if (b < insertBuckets - 1) { + recordsPerBucket.add(insertRecordsPerBucket); + } else { + recordsPerBucket.add(totalUnassignedInserts - (insertBuckets - 1) * insertRecordsPerBucket); + } + BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath); + bucketInfoMap.put(totalBuckets, bucketInfo); + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1)); + } + totalBuckets++; + } + } + + // Go over all such buckets, and assign weights as per amount of incoming inserts. + List insertBuckets = new ArrayList<>(); + double currentCumulativeWeight = 0; + for (int i = 0; i < bucketNumbers.size(); i++) { + InsertBucket bkt = new InsertBucket(); + bkt.bucketNumber = bucketNumbers.get(i); + bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts(); + currentCumulativeWeight += bkt.weight; + insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, currentCumulativeWeight)); + } + LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); + partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets); + } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats); + } + } + } + + private Map> getSmallFilesForPartitions(List partitionPaths, HoodieEngineContext context) { + Map> partitionSmallFilesMap = new HashMap<>(); + + if (config.getParquetSmallFileLimit() <= 0) { + return partitionSmallFilesMap; + } + + if (partitionPaths != null && partitionPaths.size() > 0) { + context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions: " + config.getTableName()); + partitionSmallFilesMap = context.mapToPair(partitionPaths, + partitionPath -> new ImmutablePair<>(partitionPath, getSmallFiles(partitionPath)), 0); + } + return partitionSmallFilesMap; + } + + /** + * Returns a list of small files in the given partition path. 
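+   * A base file qualifies as small when its size is below the configured parquet small file limit.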
+ */ + protected List getSmallFiles(String partitionPath) { + + // smallFiles only for partitionPath + List smallFileLocations = new ArrayList<>(); + + HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants(); + + if (!commitTimeline.empty()) { // if we have some commits + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); + List allFiles = table.getBaseFileOnlyView() + .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList()); + + for (HoodieBaseFile file : allFiles) { + if (file.getFileSize() < config.getParquetSmallFileLimit()) { + String filename = file.getFileName(); + SmallFile sf = new SmallFile(); + sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)); + sf.sizeBytes = file.getFileSize(); + smallFileLocations.add(sf); + } + } + } + + return smallFileLocations; + } + + public BucketInfo getBucketInfo(int bucketNumber) { + return bucketInfoMap.get(bucketNumber); + } + + public List getInsertBuckets(String partitionPath) { + return partitionPathToInsertBucketInfos.get(partitionPath); + } + + @Override + public int getNumPartitions() { + return totalBuckets; + } + + @Override + public int getPartition(Object key) { + Pair> keyLocation = + (Pair>) key; + if (keyLocation.getRight().isPresent()) { + HoodieRecordLocation location = keyLocation.getRight().get(); + return updateLocationToBucket.get(location.getFileId()); + } else { + String partitionPath = keyLocation.getLeft().getPartitionPath(); + List targetBuckets = partitionPathToInsertBucketInfos.get(partitionPath); + // pick the target bucket to use based on the weights. + final long totalInserts = Math.max(1, workloadProfile.getWorkloadStat(partitionPath).getNumInserts()); + final long hashOfKey = NumericUtils.getMessageDigestHash("MD5", keyLocation.getLeft().getRecordKey()); + final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts; + + int index = Collections.binarySearch(targetBuckets, new InsertBucketCumulativeWeightPair(new InsertBucket(), r)); + + if (index >= 0) { + return targetBuckets.get(index).getKey().bucketNumber; + } + + if ((-1 * index - 1) < targetBuckets.size()) { + return targetBuckets.get((-1 * index - 1)).getKey().bucketNumber; + } + + // return first one, by default + return targetBuckets.get(0).getKey().bucketNumber; + } + } + + /** + * Obtains the average record size based on records written during previous commits. Used for estimating how many + * records pack into one file. + */ + protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, HoodieWriteConfig hoodieWriteConfig) { + long avgSize = hoodieWriteConfig.getCopyOnWriteRecordSizeEstimate(); + long fileSizeThreshold = (long) (hoodieWriteConfig.getRecordSizeEstimationThreshold() * hoodieWriteConfig.getParquetSmallFileLimit()); + try { + if (!commitTimeline.empty()) { + // Go over the reverse ordered commits to get a more recent estimate of average record size. 
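The getPartition routing above maps a record key to a value in [0, 1) and binary-searches the cumulative bucket weights. A minimal sketch of that lookup is below; the real partitioner hashes the key with an MD5-based digest, while this example swaps in a plain hashCode just to stay self-contained, and the weights are made up:

import java.util.Arrays;

// Sketch of routing a record key to an insert bucket via cumulative weights.
public class BucketRoutingSketch {
  public static void main(String[] args) {
    // Cumulative weights of three insert buckets (50%, 30%, 20% of the inserts).
    double[] cumulativeWeights = {0.5, 0.8, 1.0};
    long totalInserts = 1_000;

    String recordKey = "rider-42";
    long hash = Math.floorMod((long) recordKey.hashCode(), totalInserts);
    double r = (1.0 * hash) / totalInserts;   // roughly uniform value in [0, 1)

    // binarySearch returns (-insertionPoint - 1) when r falls between two weights,
    // so the insertion point is the first bucket whose cumulative weight exceeds r.
    int idx = Arrays.binarySearch(cumulativeWeights, r);
    int bucket = idx >= 0 ? idx : Math.min(-idx - 1, cumulativeWeights.length - 1);

    System.out.println("key=" + recordKey + " r=" + r + " -> bucket " + bucket);
  }
}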
+ Iterator instants = commitTimeline.getReverseOrderedInstants().iterator(); + while (instants.hasNext()) { + HoodieInstant instant = instants.next(); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); + long totalBytesWritten = commitMetadata.fetchTotalBytesWritten(); + long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten(); + if (totalBytesWritten > fileSizeThreshold && totalRecordsWritten > 0) { + avgSize = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten); + break; + } + } + } + } catch (Throwable t) { + // make this fail safe. + LOG.error("Error trying to compute average bytes/record ", t); + } + return avgSize; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPreppedCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPreppedCommitActionExecutor.java new file mode 100644 index 0000000000000..8eea5b5105826 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPreppedCommitActionExecutor.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
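The averageBytesPerRecord estimate above walks commits from newest to oldest and takes bytes/records from the first commit that wrote enough data to be a trustworthy sample, falling back to a configured default otherwise. A hedged, JDK-only sketch of that loop (commit stats and thresholds below are invented):

import java.util.Arrays;
import java.util.List;

// Sketch of estimating average record size from recent commit metadata.
public class AvgRecordSizeSketch {
  static final class CommitStat {
    final long bytesWritten;
    final long recordsWritten;
    CommitStat(long bytesWritten, long recordsWritten) {
      this.bytesWritten = bytesWritten;
      this.recordsWritten = recordsWritten;
    }
  }

  public static void main(String[] args) {
    long defaultEstimate = 1024;                     // fallback, like the config default
    long fileSizeThreshold = 100 * 1024 * 1024 / 10; // ignore commits that wrote too little

    // Newest commit first (the real code iterates the reverse-ordered timeline).
    List<CommitStat> newestFirst = Arrays.asList(
        new CommitStat(2 * 1024 * 1024, 1_500),      // too small, skipped
        new CommitStat(512 * 1024 * 1024, 400_000),  // first "big enough" commit wins
        new CommitStat(1024L * 1024 * 1024, 900_000));

    long avgSize = defaultEstimate;
    for (CommitStat stat : newestFirst) {
      if (stat.bytesWritten > fileSizeThreshold && stat.recordsWritten > 0) {
        avgSize = (long) Math.ceil((1.0 * stat.bytesWritten) / stat.recordsWritten);
        break;
      }
    }
    System.out.println("estimated bytes/record = " + avgSize); // 1343 with these numbers
  }
}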
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import java.util.List; + +public class JavaUpsertPreppedCommitActionExecutor> + extends BaseJavaCommitActionExecutor { + + private final List> preppedRecords; + + public JavaUpsertPreppedCommitActionExecutor(HoodieJavaEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List> preppedRecords) { + super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); + this.preppedRecords = preppedRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + return super.execute(preppedRecords); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java new file mode 100644 index 0000000000000..29725ad76f7c3 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieTable; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +public class JavaWriteHelper extends BaseWriteHelper>, + List, List, R> { + + private JavaWriteHelper() { + } + + private static class WriteHelperHolder { + private static final JavaWriteHelper JAVA_WRITE_HELPER = new JavaWriteHelper(); + } + + public static JavaWriteHelper newInstance() { + return WriteHelperHolder.JAVA_WRITE_HELPER; + } + + @Override + protected List> tag(List> dedupedRecords, HoodieEngineContext context, HoodieTable>, List, List> table) { + return table.getIndex().tagLocation(HoodieListData.eager(dedupedRecords), context, table).collectAsList(); + } + + @Override + public List> deduplicateRecords( + List> records, HoodieIndex index, int parallelism) { + boolean isIndexingGlobal = index.isGlobal(); + Map>>> keyedRecords = records.stream().map(record -> { + HoodieKey hoodieKey = record.getKey(); + // If index used is global, then records are expected to differ in their partitionPath + Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; + return Pair.of(key, record); + }).collect(Collectors.groupingBy(Pair::getLeft)); + + return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> { + @SuppressWarnings("unchecked") + T reducedData = (T) rec1.getData().preCombine(rec2.getData(), CollectionUtils.emptyProps()); + // we cannot allow the user to change the key or partitionPath, since that will affect + // everything + // so pick it from one of the records. + return new HoodieAvroRecord(rec1.getKey(), reducedData); + }).orElse(null)).filter(Objects::nonNull).collect(Collectors.toList()); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/compact/HoodieJavaMergeOnReadTableCompactor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/compact/HoodieJavaMergeOnReadTableCompactor.java new file mode 100644 index 0000000000000..30bdcda759ce0 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/compact/HoodieJavaMergeOnReadTableCompactor.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
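The deduplicateRecords logic above groups incoming records by key (record key alone when the index is global, otherwise key plus partition) and pairwise reduces each group through the payload's preCombine. The following sketch shows the same group-and-reduce shape with a toy record whose "ordering" field stands in for the preCombine field; names and values are invented:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

// Sketch of dedup-by-key followed by a pairwise "keep the latest" reduction.
public class DedupSketch {
  static final class Rec {
    final String key;      // record key (or key + partition for a non-global index)
    final long ordering;   // stands in for the payload's preCombine field
    final String value;
    Rec(String key, long ordering, String value) {
      this.key = key;
      this.ordering = ordering;
      this.value = value;
    }
    @Override public String toString() {
      return key + "@" + ordering + "=" + value;
    }
  }

  public static void main(String[] args) {
    List<Rec> incoming = Arrays.asList(
        new Rec("uuid-1", 10, "old"),
        new Rec("uuid-1", 20, "new"),   // wins the pairwise reduction
        new Rec("uuid-2", 5, "only"));

    Map<String, List<Rec>> byKey = incoming.stream()
        .collect(Collectors.groupingBy(r -> r.key));

    List<Rec> deduped = byKey.values().stream()
        .map(group -> group.stream()
            .reduce((r1, r2) -> r1.ordering >= r2.ordering ? r1 : r2)
            .orElse(null))
        .filter(Objects::nonNull)
        .collect(Collectors.toList());

    System.out.println(deduped); // one record per key, e.g. uuid-1@20=new and uuid-2@5=only
  }
}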
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.compact; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import java.util.List; + +/** + * Compacts a hoodie table with merge on read storage in Java engine. Computes all possible + * compactions, passes it through a CompactionFilter and executes all the compactions and + * writes a new version of base files and make a normal commit. + */ +public class HoodieJavaMergeOnReadTableCompactor + extends HoodieCompactor>, List, List> { + + @Override + public void preCompact( + HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime) { + HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); + if (pendingCompactionTimeline.containsInstant(inflightInstant)) { + table.rollbackInflightCompaction(inflightInstant); + table.getMetaClient().reloadActiveTimeline(); + } + } + + @Override + public void maybePersist(HoodieData writeStatus, HoodieWriteConfig config) { + // No OP + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseJavaDeltaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseJavaDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..0b4a654074408 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseJavaDeltaCommitActionExecutor.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.action.deltacommit; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.BaseJavaCommitActionExecutor; + +public abstract class BaseJavaDeltaCommitActionExecutor> extends BaseJavaCommitActionExecutor { + + public BaseJavaDeltaCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, WriteOperationType operationType) { + super(context, config, table, instantTime, operationType); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/JavaUpsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/JavaUpsertPreppedDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..f6faa28bbb1ef --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/deltacommit/JavaUpsertPreppedDeltaCommitActionExecutor.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.action.deltacommit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.io.HoodieAppendHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.JavaBulkInsertHelper; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; + +public class JavaUpsertPreppedDeltaCommitActionExecutor> extends BaseJavaDeltaCommitActionExecutor { + + private static final Logger LOG = LogManager.getLogger(JavaUpsertPreppedDeltaCommitActionExecutor.class); + + private final List> preppedInputRecords; + + public JavaUpsertPreppedDeltaCommitActionExecutor(HoodieJavaEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, List> preppedInputRecords) { + super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); + this.preppedInputRecords = preppedInputRecords; + } + + @Override + public HoodieWriteMetadata> execute() { + HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); + // First group by target file id. + HashMap, List>> recordsByFileId = new HashMap<>(); + List> insertedRecords = new LinkedList<>(); + + // Split records into inserts and updates. 
+ for (HoodieRecord record : preppedInputRecords) { + if (!record.isCurrentLocationKnown()) { + insertedRecords.add(record); + } else { + Pair fileIdPartitionPath = Pair.of(record.getCurrentLocation().getFileId(), record.getPartitionPath()); + if (!recordsByFileId.containsKey(fileIdPartitionPath)) { + recordsByFileId.put(fileIdPartitionPath, new LinkedList<>()); + } + recordsByFileId.get(fileIdPartitionPath).add(record); + } + } + LOG.info(String.format("Total update fileIDs %s, total inserts %s for commit %s", + recordsByFileId.size(), insertedRecords.size(), instantTime)); + + List allWriteStatuses = new ArrayList<>(); + try { + recordsByFileId.forEach((k, v) -> { + HoodieAppendHandle appendHandle = new HoodieAppendHandle(config, instantTime, table, + k.getRight(), k.getLeft(), v.iterator(), taskContextSupplier); + appendHandle.doAppend(); + allWriteStatuses.addAll(appendHandle.close()); + }); + + if (insertedRecords.size() > 0) { + HoodieWriteMetadata> insertResult = JavaBulkInsertHelper.newInstance() + .bulkInsert(insertedRecords, instantTime, table, config, this, false, Option.empty()); + allWriteStatuses.addAll(insertResult.getWriteStatuses()); + } + } catch (Throwable e) { + if (e instanceof HoodieUpsertException) { + throw e; + } + throw new HoodieUpsertException("Failed to upsert for commit time " + instantTime, e); + } + + updateIndex(allWriteStatuses, result); + return result; + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/upgrade/JavaUpgradeDowngradeHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/upgrade/JavaUpgradeDowngradeHelper.java new file mode 100644 index 0000000000000..e1c44d0913318 --- /dev/null +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/upgrade/JavaUpgradeDowngradeHelper.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
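The split performed above, where prepped records with a known location are grouped per (fileId, partitionPath) for append handles and the rest fall through to bulk insert, follows a simple pattern. A self-contained sketch with simplified types (the record class and field names below are stand-ins, not Hudi classes):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

// Sketch of splitting tagged records into per-file-group updates and plain inserts.
public class PreppedSplitSketch {
  static final class TaggedRecord {
    final String partitionPath;
    final Optional<String> currentFileId; // empty => brand new record
    TaggedRecord(String partitionPath, String currentFileId) {
      this.partitionPath = partitionPath;
      this.currentFileId = Optional.ofNullable(currentFileId);
    }
  }

  public static void main(String[] args) {
    List<TaggedRecord> prepped = Arrays.asList(
        new TaggedRecord("2021/09/11", "file-1"),
        new TaggedRecord("2021/09/11", "file-1"),
        new TaggedRecord("2021/09/12", "file-7"),
        new TaggedRecord("2021/09/12", null));

    Map<String, List<TaggedRecord>> updatesByFileGroup = new HashMap<>();
    List<TaggedRecord> inserts = new ArrayList<>();

    for (TaggedRecord r : prepped) {
      if (!r.currentFileId.isPresent()) {
        inserts.add(r);                                   // routed to bulk insert
      } else {
        // the real code keys on a Pair of (fileId, partitionPath); a joined string keeps this short
        String fileGroup = r.currentFileId.get() + "|" + r.partitionPath;
        updatesByFileGroup.computeIfAbsent(fileGroup, k -> new ArrayList<>()).add(r);
      }
    }

    System.out.println("update file groups: " + updatesByFileGroup.size()); // 2
    System.out.println("inserts: " + inserts.size());                       // 1
  }
}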
+ */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.table.HoodieJavaTable; +import org.apache.hudi.table.HoodieTable; + +/** + * Java upgrade and downgrade helper + */ +public class JavaUpgradeDowngradeHelper implements SupportsUpgradeDowngrade { + + private static final JavaUpgradeDowngradeHelper SINGLETON_INSTANCE = + new JavaUpgradeDowngradeHelper(); + + private JavaUpgradeDowngradeHelper() {} + + public static JavaUpgradeDowngradeHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { + return HoodieJavaTable.create(config, context); + } + + @Override + public String getPartitionColumns(HoodieWriteConfig config) { + return config.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()); + } +} diff --git a/hudi-client/hudi-java-client/src/main/resources/log4j.properties b/hudi-client/hudi-java-client/src/main/resources/log4j.properties deleted file mode 100644 index ff268faf6363c..0000000000000 --- a/hudi-client/hudi-java-client/src/main/resources/log4j.properties +++ /dev/null @@ -1,23 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=INFO, A1 -# A1 is set to be a ConsoleAppender. -log4j.appender.A1=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.A1.layout=org.apache.log4j.PatternLayout -log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestHoodieJavaWriteClientInsert.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestHoodieJavaWriteClientInsert.java new file mode 100644 index 0000000000000..ae73b0a65d795 --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestHoodieJavaWriteClientInsert.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.testutils.HoodieJavaClientTestHarness; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.AVRO_SCHEMA; +import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime; +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieJavaWriteClientInsert extends HoodieJavaClientTestHarness { + private static final Schema SCHEMA = getSchemaFromResource(TestHoodieJavaWriteClientInsert.class, "/exampleSchema.avsc"); + + private static HoodieWriteConfig.Builder makeHoodieClientConfigBuilder(String basePath) { + return makeHoodieClientConfigBuilder(basePath, SCHEMA); + } + + private static HoodieWriteConfig.Builder makeHoodieClientConfigBuilder(String basePath, Schema schema) { + return HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withPath(basePath) + .withSchema(schema.toString()); + } + + private FileStatus[] getIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull) + throws Exception { + // initialize parquet input format + HoodieParquetInputFormat hoodieInputFormat = new HoodieParquetInputFormat(); + JobConf jobConf = new JobConf(hadoopConf); + hoodieInputFormat.setConf(jobConf); + HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE); + setupIncremental(jobConf, startCommitTime, numCommitsToPull); + FileInputFormat.setInputPaths(jobConf, Paths.get(basePath, partitionPath).toString()); + return hoodieInputFormat.listStatus(jobConf); + } + + private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) { + String modePropertyName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); + + String startCommitTimestampName = + String.format(HoodieHiveUtils.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + 
jobConf.set(startCommitTimestampName, startCommit); + + String maxCommitPulls = + String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); + } + + @Test + public void testInsert() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder(basePath).withMergeAllowDuplicateOnInserts(true).build(); + + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + metaClient = HoodieTableMetaClient.reload(metaClient); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + // Get some records belong to the same partition (2021/09/11) + String insertRecordStr1 = "{\"_row_key\":\"1\"," + + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":1}"; + String insertRecordStr2 = "{\"_row_key\":\"2\"," + + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":2}"; + List records1 = new ArrayList<>(); + RawTripTestPayload insertRow1 = new RawTripTestPayload(insertRecordStr1); + RawTripTestPayload insertRow2 = new RawTripTestPayload(insertRecordStr2); + records1.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1)); + records1.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2)); + + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++, "%09d"); + // First insert + writeClient.startCommitWithTime(firstCommitTime); + writeClient.insert(records1, firstCommitTime); + + String partitionPath = "2021/09/11"; + FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + // Read out the bloom filter and make sure filter can answer record exist or not + Path filePath = allFiles[0].getPath(); + BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath); + for (HoodieRecord record : records1) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + + insertRecordStr1 = "{\"_row_key\":\"1\"," + + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":3}"; + insertRecordStr2 = "{\"_row_key\":\"2\"," + + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":4}"; + + List records2 = new ArrayList<>(); + insertRow1 = new RawTripTestPayload(insertRecordStr1); + insertRow2 = new RawTripTestPayload(insertRecordStr2); + // The recordKey of records2 and records1 are the same, but the values of other fields are different + records2.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1)); + records2.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2)); + + String newCommitTime = makeNewCommitTime(startInstant++, "%09d"); + writeClient.startCommitWithTime(newCommitTime); + // Second insert is the same as the _row_key of the first one,test allowDuplicateInserts + writeClient.insert(records2, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1); + assertEquals(1, allFiles.length); + // verify new incremental file group is same as the previous one + assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); + + filePath = allFiles[0].getPath(); + // The final result should be a collection of records1 and records2 + records1.addAll(records2); + + // Read the base file, check the record content + List fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + int index = 0; + for (GenericRecord record : 
fileRecords) { + assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString()); + assertEquals(index + 1, record.get("number")); + index++; + } + } + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testInsertWithDataGenerator(boolean mergeAllowDuplicateOnInsertsEnable) throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder(basePath, AVRO_SCHEMA) + .withMergeAllowDuplicateOnInserts(mergeAllowDuplicateOnInsertsEnable).build(); + + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + metaClient = HoodieTableMetaClient.reload(metaClient); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + String partitionPath = "2021/09/11"; + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{partitionPath}); + + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++, "%09d"); + List records1 = dataGenerator.generateInserts(firstCommitTime, 100); + + // First insert + writeClient.startCommitWithTime(firstCommitTime); + writeClient.insert(records1, firstCommitTime); + + FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + // Read out the bloom filter and make sure filter can answer record exist or not + Path filePath = allFiles[0].getPath(); + BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath); + for (HoodieRecord record : records1) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + + String newCommitTime = makeNewCommitTime(startInstant++, "%09d"); + List records2 = dataGenerator.generateUpdates(newCommitTime, 100); + writeClient.startCommitWithTime(newCommitTime); + // Second insert is the same as the _row_key of the first one,test allowDuplicateInserts + writeClient.insert(records2, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1); + assertEquals(1, allFiles.length); + // verify new incremental file group is same as the previous one + assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); + + filePath = allFiles[0].getPath(); + // If mergeAllowDuplicateOnInsertsEnable is true, the final result should be a collection of records1 and records2 + records1.addAll(records2); + + // Read the base file, check the record content + List fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + assertEquals(fileRecords.size(), mergeAllowDuplicateOnInsertsEnable ? 
records1.size() : records2.size()); + + int index = 0; + for (GenericRecord record : fileRecords) { + assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString()); + index++; + } + } +} diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/common/TestHoodieJavaEngineContext.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/common/TestHoodieJavaEngineContext.java index b81c11b710f76..e67e78c019669 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/common/TestHoodieJavaEngineContext.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/common/TestHoodieJavaEngineContext.java @@ -20,11 +20,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hudi.DummyTaskContextSupplier; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Assertions; -import scala.Tuple2; - import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -77,7 +76,7 @@ public void testMapToPair() { Map resultMap = context.mapToPair(mapList, x -> { String[] splits = x.split("_"); - return Tuple2.apply(splits[0], splits[1]); + return new ImmutablePair<>(splits[0], splits[1]); }, 2); Assertions.assertNotNull(resultMap.get("hudi")); diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestJavaBulkInsertInternalPartitioner.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestJavaBulkInsertInternalPartitioner.java new file mode 100644 index 0000000000000..16ee0f9953a77 --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestJavaBulkInsertInternalPartitioner.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.testutils.HoodieJavaClientTestHarness; + +import org.apache.avro.Schema; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestJavaBulkInsertInternalPartitioner extends HoodieJavaClientTestHarness { + private static final Comparator KEY_COMPARATOR = + Comparator.comparing(o -> (o.getPartitionPath() + "+" + o.getRecordKey())); + + public static List generateTestRecordsForBulkInsert(int numRecords) { + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + List records = dataGenerator.generateInserts("0", numRecords); + return records; + } + + public static Map generatePartitionNumRecords(List records) { + return records.stream().map(record -> record.getPartitionPath()) + .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())); + } + + @ParameterizedTest + @ValueSource(strings = {"rider", "rider,driver"}) + public void testCustomColumnSortPartitioner(String sortColumnString) throws Exception { + String[] sortColumns = sortColumnString.split(","); + Comparator columnComparator = + getCustomColumnComparator(HoodieTestDataGenerator.AVRO_SCHEMA, sortColumns); + + List records = generateTestRecordsForBulkInsert(1000); + testBulkInsertInternalPartitioner( + new JavaCustomColumnsSortPartitioner(sortColumns, HoodieTestDataGenerator.AVRO_SCHEMA, false), + records, true, generatePartitionNumRecords(records), Option.of(columnComparator)); + } + + private Comparator getCustomColumnComparator(Schema schema, String[] sortColumns) { + return Comparator.comparing( + record -> HoodieAvroUtils.getRecordColumnValues(record, sortColumns, schema, false).toString()); + } + + private void verifyRecordAscendingOrder(List records, + Option> comparator) { + List expectedRecords = new ArrayList<>(records); + Collections.sort(expectedRecords, comparator.orElse(KEY_COMPARATOR)); + assertEquals(expectedRecords, records); + } + + private void testBulkInsertInternalPartitioner(BulkInsertPartitioner partitioner, + List records, + boolean isSorted, + Map expectedPartitionNumRecords, + Option> comparator) { + List actualRecords = + (List) partitioner.repartitionRecords(records, 1); + if (isSorted) { + // Verify global order + verifyRecordAscendingOrder(actualRecords, comparator); + } + + // Verify number of records per partition path + assertEquals(expectedPartitionNumRecords, generatePartitionNumRecords(actualRecords)); + } +} diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java new file mode 100644 index 0000000000000..c6d83f9e94168 --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java @@ -0,0 +1,567 @@ +/* + * Licensed to the Apache 
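The custom-column sort verified above boils down to comparing records by the concatenated values of the chosen sort columns. A short sketch of that comparator, using a map-backed record as a stand-in for an Avro record (column names below match the test's, everything else is invented):

import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Sketch of a "custom column" sort key: selected column values joined into one string.
public class ColumnSortSketch {
  public static void main(String[] args) {
    String[] sortColumns = {"rider", "driver"};

    List<Map<String, String>> records = Arrays.asList(
        record("rider-3", "driver-9"),
        record("rider-1", "driver-2"),
        record("rider-1", "driver-1"));

    Comparator<Map<String, String>> byColumns = Comparator.comparing(r ->
        Arrays.stream(sortColumns).map(r::get).collect(Collectors.joining("+")));

    List<Map<String, String>> sorted =
        records.stream().sorted(byColumns).collect(Collectors.toList());
    sorted.forEach(System.out::println);
    // sorted order: rider-1/driver-1, rider-1/driver-2, rider-3/driver-9
  }

  private static Map<String, String> record(String rider, String driver) {
    Map<String, String> r = new HashMap<>();
    r.put("rider", rider);
    r.put("driver", driver);
    return r;
  }
}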
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.testutils.Transformations; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.io.HoodieCreateHandle; +import org.apache.hudi.table.HoodieJavaCopyOnWriteTable; +import org.apache.hudi.table.HoodieJavaTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieJavaClientTestHarness; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.parquet.hadoop.ParquetReader; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime; +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestJavaCopyOnWriteActionExecutor extends HoodieJavaClientTestHarness { + + private static final Logger LOG 
= LogManager.getLogger(TestJavaCopyOnWriteActionExecutor.class); + private static final Schema SCHEMA = getSchemaFromResource(TestJavaCopyOnWriteActionExecutor.class, "/exampleSchema.avsc"); + + @Test + public void testMakeNewPath() { + String fileName = UUID.randomUUID().toString(); + String partitionPath = "2016/05/04"; + + String instantTime = makeNewCommitTime(); + HoodieWriteConfig config = makeHoodieClientConfig(); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieJavaTable.create(config, context, metaClient); + + Pair newPathWithWriteToken = Arrays.asList(1).stream().map(x -> { + HoodieRecord record = mock(HoodieRecord.class); + when(record.getPartitionPath()).thenReturn(partitionPath); + String writeToken = FSUtils.makeWriteToken(context.getTaskContextSupplier().getPartitionIdSupplier().get(), + context.getTaskContextSupplier().getStageIdSupplier().get(), + context.getTaskContextSupplier().getAttemptIdSupplier().get()); + HoodieCreateHandle io = new HoodieCreateHandle(config, instantTime, table, partitionPath, fileName, + context.getTaskContextSupplier()); + return Pair.of(io.makeNewPath(record.getPartitionPath()), writeToken); + }).collect(Collectors.toList()).get(0); + + assertEquals(newPathWithWriteToken.getKey().toString(), Paths.get(this.basePath, partitionPath, + FSUtils.makeBaseFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString()); + } + + private HoodieWriteConfig makeHoodieClientConfig() { + return makeHoodieClientConfigBuilder().build(); + } + + private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() { + // Prepare the AvroParquetIO + return HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withPath(basePath) + .withSchema(SCHEMA.toString()); + } + + @Test + public void testUpdateRecords() throws Exception { + // Prepare the AvroParquetIO + HoodieWriteConfig config = makeHoodieClientConfig(); + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++, "%09d"); + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + writeClient.startCommitWithTime(firstCommitTime); + metaClient = HoodieTableMetaClient.reload(metaClient); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + String partitionPath = "2016/01/31"; + + // Get some records belong to the same partition (2016/01/31) + String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}"; + + List records = new ArrayList<>(); + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + + // 
Insert new records + writeClient.insert(records, firstCommitTime); + + FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + // Read out the bloom filter and make sure filter can answer record exist or not + Path filePath = allFiles[0].getPath(); + BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath); + for (HoodieRecord record : records) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + + // Read the base file, check the record content + List fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + GenericRecord newRecord; + int index = 0; + for (GenericRecord record : fileRecords) { + //System.out.println("Got :" + record.get("_row_key").toString() + ", Exp :" + records.get(index).getRecordKey()); + assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString()); + index++; + } + + // We update the 1st record & add a new record + String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1); + HoodieRecord updatedRecord1 = new HoodieAvroRecord( + new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1); + + RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); + HoodieRecord insertedRecord1 = + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + + List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); + + String newCommitTime = makeNewCommitTime(startInstant++, "%09d"); + metaClient = HoodieTableMetaClient.reload(metaClient); + writeClient.startCommitWithTime(newCommitTime); + List statuses = writeClient.upsert(updatedRecords, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1); + assertEquals(1, allFiles.length); + // verify new incremental file group is same as the previous one + assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); + + // Check whether the record has been updated + Path updatedFilePath = allFiles[0].getPath(); + BloomFilter updatedFilter = + fileUtils.readBloomFilterFromMetadata(hadoopConf, updatedFilePath); + for (HoodieRecord record : records) { + // No change to the _row_key + assertTrue(updatedFilter.mightContain(record.getRecordKey())); + } + + assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey())); + records.add(insertedRecord1);// add this so it can further check below + + ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build(); + index = 0; + while ((newRecord = (GenericRecord) updatedReader.read()) != null) { + assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey()); + if (index == 0) { + assertEquals("15", newRecord.get("number").toString()); + } + index++; + } + updatedReader.close(); + // Also check the numRecordsWritten + WriteStatus writeStatus = statuses.get(0); + assertEquals(1, statuses.size(), "Should be only one file generated"); + assertEquals(4, writeStatus.getStat().getNumWrites());// 3 rewritten records + 1 new record + } + + private FileStatus[] getIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull) + throws Exception { + // initialize parquet input format + HoodieParquetInputFormat hoodieInputFormat = new 
HoodieParquetInputFormat(); + JobConf jobConf = new JobConf(hadoopConf); + hoodieInputFormat.setConf(jobConf); + HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE); + setupIncremental(jobConf, startCommitTime, numCommitsToPull); + FileInputFormat.setInputPaths(jobConf, Paths.get(basePath, partitionPath).toString()); + return hoodieInputFormat.listStatus(jobConf); + } + + private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) { + String modePropertyName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); + + String startCommitTimestampName = + String.format(HoodieHiveUtils.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(startCommitTimestampName, startCommit); + + String maxCommitPulls = + String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); + } + + private List newHoodieRecords(int n, String time) throws Exception { + List records = new ArrayList<>(); + for (int i = 0; i < n; i++) { + String recordStr = + String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i); + RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + } + return records; + } + + // Check if record level metadata is aggregated properly at the end of write. + @Test + public void testMetadataAggregateFromWriteStatus() throws Exception { + // Prepare the AvroParquetIO + HoodieWriteConfig config = + makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build(); + String firstCommitTime = makeNewCommitTime(); + metaClient = HoodieTableMetaClient.reload(metaClient); + + HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient); + + // Get some records belong to the same partition (2016/01/31) + String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + + List records = new ArrayList<>(); + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + + // Insert new records + BaseJavaCommitActionExecutor actionExecutor = new JavaInsertCommitActionExecutor(context, config, table, + firstCommitTime, records); + List writeStatuses = new ArrayList<>(); + actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator()) + .forEachRemaining(x -> writeStatuses.addAll((List)x)); + + Map 
allWriteStatusMergedMetadataMap = + MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(writeStatuses); + assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000")); + // For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this + // should be 2 * 3 + assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000")); + } + + private void verifyStatusResult(List statuses, Map expectedPartitionNumRecords) { + Map actualPartitionNumRecords = new HashMap<>(); + + for (int i = 0; i < statuses.size(); i++) { + WriteStatus writeStatus = statuses.get(i); + String partitionPath = writeStatus.getPartitionPath(); + actualPartitionNumRecords.put( + partitionPath, + actualPartitionNumRecords.getOrDefault(partitionPath, 0L) + writeStatus.getTotalRecords()); + assertEquals(0, writeStatus.getFailedRecords().size()); + } + + assertEquals(expectedPartitionNumRecords, actualPartitionNumRecords); + } + + @Test + public void testInsertRecords() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfig(); + String instantTime = makeNewCommitTime(); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient); + + // Case 1: + // 10 records for partition 1, 1 record for partition 2. + List records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + + // Insert new records + final List recs2 = records; + BaseJavaCommitActionExecutor actionExecutor = new JavaInsertPreppedCommitActionExecutor(context, config, table, + instantTime, recs2); + + final List returnedStatuses = new ArrayList<>(); + actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs2.iterator()) + .forEachRemaining(x -> returnedStatuses.addAll((List)x)); + + assertEquals(2, returnedStatuses.size()); + Map expectedPartitionNumRecords = new HashMap<>(); + expectedPartitionNumRecords.put("2016/01/31", 10L); + expectedPartitionNumRecords.put("2016/02/01", 1L); + verifyStatusResult(returnedStatuses, expectedPartitionNumRecords); + + // Case 2: + // 1 record for partition 1, 5 record for partition 2, 1 records for partition 3. 
+ records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z")); + records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z")); + + // Insert new records + final List recs3 = records; + BaseJavaCommitActionExecutor newActionExecutor = new JavaUpsertPreppedCommitActionExecutor(context, config, table, + instantTime, recs3); + + final List returnedStatuses1 = new ArrayList<>(); + newActionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs3.iterator()) + .forEachRemaining(x -> returnedStatuses1.addAll((List)x)); + + assertEquals(3, returnedStatuses1.size()); + expectedPartitionNumRecords.clear(); + expectedPartitionNumRecords.put("2016/01/31", 1L); + expectedPartitionNumRecords.put("2016/02/01", 5L); + expectedPartitionNumRecords.put("2016/02/02", 1L); + verifyStatusResult(returnedStatuses1, expectedPartitionNumRecords); + } + + @Test + public void testFileSizeUpsertRecords() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(HoodieStorageConfig.newBuilder() + .parquetMaxFileSize(64 * 1024).hfileMaxFileSize(64 * 1024) + .parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024).build()).build(); + + String instantTime = makeNewCommitTime(); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient); + + List records = new ArrayList<>(); + // Approx 1150 records are written for block size of 64KB + for (int i = 0; i < 2050; i++) { + String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; + RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + } + + // Insert new records + BaseJavaCommitActionExecutor actionExecutor = new JavaUpsertCommitActionExecutor(context, config, table, + instantTime, records); + + Arrays.asList(1).stream() + .map(i -> actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator())) + .map(Transformations::flatten).collect(Collectors.toList()); + + // Check the updated file + int counts = 0; + for (File file : Paths.get(basePath, "2016/01/31").toFile().listFiles()) { + if (file.getName().endsWith(table.getBaseFileExtension()) && FSUtils.getCommitTime(file.getName()).equals(instantTime)) { + LOG.info(file.getName() + "-" + file.length()); + counts++; + } + } + // we check canWrite only once every 1000 records. and so 2 files with 1000 records and 3rd file with 50 records. 
+ assertEquals(3, counts, "If the number of records are more than 1150, then there should be a new file"); + } + + @Test + public void testInsertUpsertWithHoodieAvroPayload() throws Exception { + Schema schema = getSchemaFromResource(TestJavaCopyOnWriteActionExecutor.class, "/testDataGeneratorSchema.txt"); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withPath(basePath) + .withSchema(schema.toString()) + .withStorageConfig(HoodieStorageConfig.newBuilder() + .parquetMaxFileSize(1000 * 1024).hfileMaxFileSize(1000 * 1024).build()) + .build(); + metaClient = HoodieTableMetaClient.reload(metaClient); + final HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient); + String instantTime = "000"; + // Perform inserts of 100 records to test CreateHandle and BufferedExecutor + final List inserts = dataGen.generateInsertsWithHoodieAvroPayload(instantTime, 100); + BaseJavaCommitActionExecutor actionExecutor = new JavaInsertCommitActionExecutor(context, config, table, + instantTime, inserts); + + final List> ws = new ArrayList<>(); + actionExecutor.handleInsert(UUID.randomUUID().toString(), inserts.iterator()) + .forEachRemaining(x -> ws.add((List)x)); + + WriteStatus writeStatus = ws.get(0).get(0); + String fileId = writeStatus.getFileId(); + metaClient.getFs().create(new Path(Paths.get(basePath, ".hoodie", "000.commit").toString())).close(); + //TODO : Find race condition that causes the timeline sometime to reflect 000.commit and sometimes not + final HoodieJavaCopyOnWriteTable reloadedTable = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, HoodieTableMetaClient.reload(metaClient)); + + final List updates = dataGen.generateUpdatesWithHoodieAvroPayload(instantTime, inserts); + + String partitionPath = writeStatus.getPartitionPath(); + long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count(); + BaseJavaCommitActionExecutor newActionExecutor = new JavaUpsertCommitActionExecutor(context, config, reloadedTable, + instantTime, updates); + + taskContextSupplier.reset(); + final List> updateStatus = new ArrayList<>(); + newActionExecutor.handleUpdate(partitionPath, fileId, updates.iterator()) + .forEachRemaining(x -> updateStatus.add((List)x)); + assertEquals(updates.size() - numRecordsInPartition, updateStatus.get(0).get(0).getTotalErrorRecords()); + } + + public void testBulkInsertRecords(String bulkInsertMode) throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA) + .withBulkInsertParallelism(2).withBulkInsertSortMode(bulkInsertMode).build(); + String instantTime = makeNewCommitTime(); + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + writeClient.startCommitWithTime(instantTime); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient); + + // Insert new records + final List inputRecords = generateTestRecordsForBulkInsert(); + JavaBulkInsertCommitActionExecutor bulkInsertExecutor = new JavaBulkInsertCommitActionExecutor( + context, config, table, instantTime, inputRecords, Option.empty()); + List returnedStatuses = (List)bulkInsertExecutor.execute().getWriteStatuses(); + verifyStatusResult(returnedStatuses, generateExpectedPartitionNumRecords(inputRecords)); + } + + @Test + public void testDeleteRecords() 
throws Exception { + // Prepare the AvroParquetIO + HoodieWriteConfig config = makeHoodieClientConfig(); + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++, "%09d"); + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + writeClient.startCommitWithTime(firstCommitTime); + metaClient = HoodieTableMetaClient.reload(metaClient); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + String partitionPath = "2022/04/09"; + + // Get some records belong to the same partition (2016/01/31) + String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2022-04-09T03:16:41.415Z\",\"number\":1}"; + String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2022-04-09T03:20:41.415Z\",\"number\":2}"; + String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2022-04-09T03:16:41.415Z\",\"number\":3}"; + + List records = new ArrayList<>(); + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + + // Insert new records + writeClient.insert(records, firstCommitTime); + + FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + // Read out the bloom filter and make sure filter can answer record exist or not + Path filePath = allFiles[0].getPath(); + BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath); + for (HoodieRecord record : records) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + + // Read the base file, check the record content + List fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + int index = 0; + for (GenericRecord record : fileRecords) { + assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString()); + index++; + } + + String newCommitTime = makeNewCommitTime(startInstant++, "%09d"); + writeClient.startCommitWithTime(newCommitTime); + + // Test delete two records + List keysForDelete = new ArrayList(Arrays.asList(records.get(0).getKey(), records.get(2).getKey())); + writeClient.delete(keysForDelete, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + filePath = allFiles[0].getPath(); + // Read the base file, check the record content + fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + // Check that the two records are deleted successfully + assertEquals(1, fileRecords.size()); + assertEquals(records.get(1).getRecordKey(), fileRecords.get(0).get("_row_key").toString()); + + newCommitTime = makeNewCommitTime(startInstant++, "%09d"); + writeClient.startCommitWithTime(newCommitTime); + + // Test delete last record + keysForDelete = new ArrayList(Arrays.asList(records.get(1).getKey())); + writeClient.delete(keysForDelete, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + filePath = allFiles[0].getPath(); + // Read the base file, check the record content + 
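+    // Editor's note (not part of the patch): the bloom filter check earlier in this test relies
+    // on the fact that a bloom filter may return false positives but never false negatives, so
+    // filter.mightContain(recordKey) must be true for every key written to the file; the absence
+    // of a key cannot be asserted the same way. The record-level verification that follows
+    // therefore reads the base file contents directly to confirm which rows survive each delete.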
fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + // Check whether all records have been deleted + assertEquals(0, fileRecords.size()); + } + + public static Map generateExpectedPartitionNumRecords(List records) { + return records.stream().map(record -> Pair.of(record.getPartitionPath(), 1)) + .collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting())); + } + + public static List generateTestRecordsForBulkInsert() { + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + // RDD partition 1 + List records1 = dataGenerator.generateInserts("0", 100); + // RDD partition 2 + List records2 = dataGenerator.generateInserts("0", 150); + records1.addAll(records2); + return records1; + } +} diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java new file mode 100644 index 0000000000000..33cf88786e831 --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.testutils.HoodieJavaClientTestHarness; + +import org.apache.avro.Schema; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Collections; + +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Tests for schema evolution client api. 
+ */ +public class TestSchemaEvolutionClient extends HoodieJavaClientTestHarness { + + private static final Schema SCHEMA = getSchemaFromResource(TestSchemaEvolutionClient.class, "/exampleSchema.avsc"); + + @BeforeEach + public void setUpClient() throws IOException { + HoodieJavaWriteClient writeClient = getWriteClient(); + this.writeClient = writeClient; + prepareTable(writeClient); + } + + @AfterEach + public void closeClient() { + if (writeClient != null) { + writeClient.close(); + } + } + + @Test + public void testUpdateColumnType() { + writeClient.updateColumnType("number", Types.LongType.get()); + assertEquals(Types.LongType.get(), getFieldByName("number").type()); + } + + private HoodieJavaWriteClient getWriteClient() { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withPath(basePath) + .withSchema(SCHEMA.toString()) + .build(); + return new HoodieJavaWriteClient<>(context, config); + } + + private void prepareTable(HoodieJavaWriteClient writeClient) throws IOException { + String commitTime = "1"; + writeClient.startCommitWithTime(commitTime); + String jsonRow = "{\"_row_key\": \"1\", \"time\": \"2000-01-01T00:00:00.000Z\", \"number\": 1}"; + RawTripTestPayload payload = new RawTripTestPayload(jsonRow); + HoodieAvroRecord record = new HoodieAvroRecord<>( + new HoodieKey(payload.getRowKey(), payload.getPartitionPath()), payload); + writeClient.insert(Collections.singletonList(record), commitTime); + } + + private Types.Field getFieldByName(String fieldName) { + return new TableSchemaResolver(metaClient) + .getTableInternalSchemaFromCommitMetadata() + .get() + .findField(fieldName); + } +} diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/HoodieJavaClientTestHarness.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/HoodieJavaClientTestHarness.java new file mode 100644 index 0000000000000..3a60d98921e7c --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/HoodieJavaClientTestHarness.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +import java.io.IOException; +import java.util.concurrent.ExecutorService; +import java.util.function.Supplier; + +/** + * The test harness for resource initialization and cleanup. + */ +public abstract class HoodieJavaClientTestHarness extends HoodieCommonTestHarness { + + private static final Logger LOG = LogManager.getLogger(HoodieJavaClientTestHarness.class); + + protected Configuration hadoopConf; + protected HoodieJavaEngineContext context; + protected TestJavaTaskContextSupplier taskContextSupplier; + protected FileSystem fs; + protected ExecutorService executorService; + protected HoodieTableFileSystemView tableView; + protected HoodieJavaWriteClient writeClient; + + @BeforeEach + protected void initResources() throws IOException { + basePath = tempDir.resolve("java_client_tests" + System.currentTimeMillis()).toUri().getPath(); + hadoopConf = new Configuration(); + taskContextSupplier = new TestJavaTaskContextSupplier(); + context = new HoodieJavaEngineContext(hadoopConf, taskContextSupplier); + initFileSystem(basePath, hadoopConf); + initTestDataGenerator(); + initMetaClient(); + } + + public class TestJavaTaskContextSupplier extends TaskContextSupplier { + int partitionId = 0; + int stageId = 0; + long attemptId = 0; + + public void reset() { + stageId += 1; + } + + @Override + public Supplier getPartitionIdSupplier() { + return () -> partitionId; + } + + @Override + public Supplier getStageIdSupplier() { + return () -> stageId; + } + + @Override + public Supplier getAttemptIdSupplier() { + return () -> attemptId; + } + + @Override + public Option getProperty(EngineProperty prop) { + return Option.empty(); + } + } + + @AfterEach + protected void cleanupResources() throws IOException { + cleanupClients(); + cleanupTestDataGenerator(); + cleanupFileSystem(); + cleanupExecutorService(); + } + + protected void initFileSystem(String basePath, Configuration hadoopConf) { + if (basePath == null) { + throw new IllegalStateException("The base path has not been initialized."); + } + + fs = FSUtils.getFs(basePath, hadoopConf); + if (fs instanceof LocalFileSystem) { + LocalFileSystem lfs = (LocalFileSystem) fs; + // With LocalFileSystem, with checksum disabled, fs.open() returns an inputStream which is FSInputStream + // This causes ClassCastExceptions in LogRecordScanner (and potentially other places) calling fs.open + // So, for the tests, we enforce checksum verification to circumvent the problem + lfs.setVerifyChecksum(true); + } + } + + protected void cleanupFileSystem() throws IOException { + if (fs != null) { + LOG.warn("Closing file-system instance used 
in previous test-run"); + fs.close(); + fs = null; + } + } + + protected void initMetaClient() throws IOException { + initMetaClient(getTableType()); + } + + protected void initMetaClient(HoodieTableType tableType) throws IOException { + if (basePath == null) { + throw new IllegalStateException("The base path has not been initialized."); + } + + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); + } + + protected void cleanupClients() { + if (metaClient != null) { + metaClient = null; + } + if (writeClient != null) { + writeClient.close(); + writeClient = null; + } + if (tableView != null) { + tableView.close(); + tableView = null; + } + } + + protected void cleanupExecutorService() { + if (this.executorService != null) { + this.executorService.shutdownNow(); + this.executorService = null; + } + } + + protected HoodieJavaWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) { + if (null != writeClient) { + writeClient.close(); + writeClient = null; + } + writeClient = new HoodieJavaWriteClient(context, cfg); + return writeClient; + } +} diff --git a/hudi-client/hudi-java-client/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-java-client/src/test/resources/log4j-surefire.properties deleted file mode 100644 index 32af462093ae5..0000000000000 --- a/hudi-client/hudi-java-client/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,31 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache=INFO -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# A1 is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-client/hudi-java-client/src/test/resources/testDataGeneratorSchema.txt b/hudi-client/hudi-java-client/src/test/resources/testDataGeneratorSchema.txt new file mode 100644 index 0000000000000..c80365b76ea6d --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/resources/testDataGeneratorSchema.txt @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ + { + "name" : "timestamp", + "type" : "long" + }, { + "name" : "_row_key", + "type" : "string" + }, { + "name" : "partition_path", + "type" : ["null", "string"], + "default": null + }, { + "name" : "rider", + "type" : "string" + }, { + "name" : "driver", + "type" : "string" + }, { + "name" : "begin_lat", + "type" : "double" + }, { + "name" : "begin_lon", + "type" : "double" + }, { + "name" : "end_lat", + "type" : "double" + }, { + "name" : "end_lon", + "type" : "double" + }, { + "name" : "distance_in_meters", + "type" : "int" + }, { + "name" : "seconds_since_epoch", + "type" : "long" + }, { + "name" : "weight", + "type" : "float" + },{ + "name" : "nation", + "type" : "bytes" + },{ + "name" : "current_date", + "type" : { + "type" : "int", + "logicalType" : "date" + } + },{ + "name" : "current_ts", + "type" : { + "type" : "long" + } + },{ + "name" : "height", + "type" : { + "type" : "fixed", + "name" : "abc", + "size" : 5, + "logicalType" : "decimal", + "precision" : 10, + "scale": 6 + } + }, { + "name" :"city_to_state", + "type" : { + "type" : "map", + "values": "string" + } + }, + { + "name" : "fare", + "type" : { + "type" : "record", + "name" : "fare", + "fields" : [ + { + "name" : "amount", + "type" : "double" + }, + { + "name" : "currency", + "type" : "string" + } + ] + } + }, + { + "name" : "tip_history", + "type" : { + "type" : "array", + "items" : { + "type" : "record", + "name" : "tip_history", + "fields" : [ + { + "name" : "amount", + "type" : "double" + }, + { + "name" : "currency", + "type" : "string" + } + ] + } + } + }, + { + "name" : "_hoodie_is_deleted", + "type" : "boolean", + "default" : false + } ] +} diff --git a/hudi-client/hudi-spark-client/pom.xml b/hudi-client/hudi-spark-client/pom.xml index 5cc6ad6560b6c..f6497ca2bad13 100644 --- a/hudi-client/hudi-spark-client/pom.xml +++ b/hudi-client/hudi-spark-client/pom.xml @@ -15,17 +15,16 @@ See the License for the specific language governing permissions and limitations under the License. 
--> - + hudi-client org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 hudi-spark-client - ${parent.version} + 0.12.2-dt-SNAPSHOT hudi-spark-client jar @@ -38,11 +37,22 @@ ${scala.version} + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + + + + + org.apache.logging.log4j + log4j-1.2-api + + org.apache.hudi hudi-client-common - ${parent.version} + ${project.parent.version} @@ -54,11 +64,6 @@ org.apache.spark spark-sql_${scala.binary.version} - - org.apache.spark - spark-avro_${scala.binary.version} - provided - @@ -109,8 +114,38 @@ javax.xml.bind * + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + org.apache.zookeeper + zookeeper + ${zookeeper.version} + test + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + + + + + org.slf4j + jul-to-slf4j + @@ -128,6 +163,12 @@ + + org.apache.hudi + hudi-tests-common + ${project.version} + test + org.junit.jupiter junit-jupiter-api @@ -168,6 +209,12 @@ junit-platform-commons test + + + org.awaitility + awaitility + test + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncClusteringService.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncClusteringService.java new file mode 100644 index 0000000000000..dd2ac9193998f --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncClusteringService.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseClusterer; +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.client.HoodieSparkClusteringClient; +import org.apache.hudi.common.engine.HoodieEngineContext; + +/** + * Async clustering service for Spark datasource. 
+ */ +public class SparkAsyncClusteringService extends AsyncClusteringService { + + public SparkAsyncClusteringService(HoodieEngineContext engineContext, BaseHoodieWriteClient writeClient) { + super(engineContext, writeClient); + } + + @Override + protected BaseClusterer createClusteringClient(BaseHoodieWriteClient client) { + return new HoodieSparkClusteringClient(client); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java index 152a901a77f60..d54fe386bd06b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java @@ -18,19 +18,19 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractCompactor; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseCompactor; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.HoodieSparkCompactor; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; public class SparkAsyncCompactService extends AsyncCompactService { - public SparkAsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client) { + public SparkAsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client) { super(context, client); } @Override - protected AbstractCompactor createCompactor(AbstractHoodieWriteClient client) { - return new HoodieSparkCompactor(client); + protected BaseCompactor createCompactor(BaseHoodieWriteClient client) { + return new HoodieSparkCompactor(client, this.context); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java index 4fb9f221cbdd7..7277479f64ec0 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java @@ -18,191 +18,34 @@ package org.apache.hudi.client; -import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.types.StructType; - -import java.io.Serializable; -import 
java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import scala.Tuple2; /** * Provides an RDD based API for accessing/filtering Hoodie tables, based on keys. + * + * @deprecated This. Use {@link SparkRDDReadClient instead.} */ -public class HoodieReadClient implements Serializable { +@Deprecated +public class HoodieReadClient> extends SparkRDDReadClient { - private static final long serialVersionUID = 1L; - - /** - * TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple - * basepath pointing to the table. Until, then just always assume a BloomIndex - */ - private final transient HoodieIndex>, JavaRDD, JavaRDD> index; - private HoodieTable>, JavaRDD, JavaRDD> hoodieTable; - private transient Option sqlContextOpt; - private final transient HoodieSparkEngineContext context; - private final transient Configuration hadoopConf; - - /** - * @param basePath path to Hoodie table - */ public HoodieReadClient(HoodieSparkEngineContext context, String basePath) { - this(context, HoodieWriteConfig.newBuilder().withPath(basePath) - // by default we use HoodieBloomIndex - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build()); + super(context, basePath); } - /** - * @param context - * @param basePath - * @param sqlContext - */ public HoodieReadClient(HoodieSparkEngineContext context, String basePath, SQLContext sqlContext) { - this(context, basePath); - this.sqlContextOpt = Option.of(sqlContext); - } - - /** - * @param clientConfig instance of HoodieWriteConfig - */ - public HoodieReadClient(HoodieSparkEngineContext context, HoodieWriteConfig clientConfig) { - this.context = context; - this.hadoopConf = context.getHadoopConf().get(); - final String basePath = clientConfig.getBasePath(); - // Create a Hoodie table which encapsulated the commits and files visible - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath, true); - this.hoodieTable = HoodieSparkTable.create(clientConfig, context, metaClient); - this.index = SparkHoodieIndex.createIndex(clientConfig); - this.sqlContextOpt = Option.empty(); - } - - /** - * Adds support for accessing Hoodie built tables from SparkSQL, as you normally would. - * - * @return SparkConf object to be used to construct the SparkContext by caller - */ - public static SparkConf addHoodieSupport(SparkConf conf) { - conf.set("spark.sql.hive.convertMetastoreParquet", "false"); - return conf; - } - - private void assertSqlContext() { - if (!sqlContextOpt.isPresent()) { - throw new IllegalStateException("SQLContext must be set, when performing dataframe operations"); - } + super(context, basePath, sqlContext); } - private Option convertToDataFilePath(Option> partitionPathFileIDPair) { - if (partitionPathFileIDPair.isPresent()) { - HoodieBaseFile dataFile = hoodieTable.getBaseFileOnlyView() - .getLatestBaseFile(partitionPathFileIDPair.get().getLeft(), partitionPathFileIDPair.get().getRight()).get(); - return Option.of(dataFile.getPath()); - } else { - return Option.empty(); - } + public HoodieReadClient(HoodieSparkEngineContext context, String basePath, SQLContext sqlContext, HoodieIndex.IndexType indexType) { + super(context, basePath, sqlContext, indexType); } - /** - * Given a bunch of hoodie keys, fetches all the individual records out as a data frame. 
- * - * @return a dataframe - */ - public Dataset readROView(JavaRDD hoodieKeys, int parallelism) { - assertSqlContext(); - JavaPairRDD>> lookupResultRDD = checkExists(hoodieKeys); - JavaPairRDD> keyToFileRDD = - lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2))); - List paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent()) - .map(keyFileTuple -> keyFileTuple._2().get()).collect(); - - // record locations might be same for multiple keys, so need a unique list - Set uniquePaths = new HashSet<>(paths); - Dataset originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()])); - StructType schema = originalDF.schema(); - JavaPairRDD keyRowRDD = originalDF.javaRDD().mapToPair(row -> { - HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), - row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); - return new Tuple2<>(key, row); - }); - - // Now, we need to further filter out, for only rows that match the supplied hoodie keys - JavaRDD rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1()); - return sqlContextOpt.get().createDataFrame(rowRDD, schema); - } - - /** - * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]] If the optional - * FullFilePath value is not present, then the key is not found. If the FullFilePath value is present, it is the path - * component (without scheme) of the URI underlying file - */ - public JavaPairRDD>> checkExists(JavaRDD hoodieKeys) { - return index.tagLocation(hoodieKeys.map(k -> new HoodieRecord<>(k, null)), context, hoodieTable) - .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() - ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) - : Option.empty()) - ); - } - - /** - * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication. - * - * @param hoodieRecords Input RDD of Hoodie records. - * @return A subset of hoodieRecords RDD, with existing records filtered out. - */ - public JavaRDD> filterExists(JavaRDD> hoodieRecords) { - JavaRDD> recordsWithLocation = tagLocation(hoodieRecords); - return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); - } - - /** - * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually - * present). Input RDD should contain no duplicates if needed. - * - * @param hoodieRecords Input RDD of Hoodie records - * @return Tagged RDD of Hoodie records - */ - public JavaRDD> tagLocation(JavaRDD> hoodieRecords) throws HoodieIndexException { - return index.tagLocation(hoodieRecords, context, hoodieTable); - } - - /** - * Return all pending compactions with instant time for clients to decide what to compact next. 
- * - * @return - */ - public List> getPendingCompactions() { - HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(hadoopConf, hoodieTable.getMetaClient().getBasePath(), true); - return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream() - .map( - instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue())) - .collect(Collectors.toList()); + public HoodieReadClient(HoodieSparkEngineContext context, HoodieWriteConfig clientConfig) { + super(context, clientConfig); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java new file mode 100644 index 0000000000000..0812b366aadac --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.timeline.HoodieInstant; + +import org.apache.hudi.common.util.Option; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; + +import java.io.IOException; +import java.util.stream.Stream; + +/** + * Async clustering client for Spark datasource. + */ +public class HoodieSparkClusteringClient extends + BaseClusterer>, JavaRDD, JavaRDD> { + + private static final Logger LOG = LogManager.getLogger(HoodieSparkClusteringClient.class); + + public HoodieSparkClusteringClient( + BaseHoodieWriteClient>, JavaRDD, JavaRDD> clusteringClient) { + super(clusteringClient); + } + + @Override + public void cluster(HoodieInstant instant) throws IOException { + LOG.info("Executing clustering instance " + instant); + SparkRDDWriteClient writeClient = (SparkRDDWriteClient) clusteringClient; + Option commitMetadata = writeClient.cluster(instant.getTimestamp(), true).getCommitMetadata(); + Stream hoodieWriteStatStream = commitMetadata.get().getPartitionToWriteStats().entrySet().stream().flatMap(e -> + e.getValue().stream()); + long errorsCount = hoodieWriteStatStream.mapToLong(HoodieWriteStat::getTotalWriteErrors).sum(); + if (errorsCount > 0) { + // TODO: Should we treat this fatal and throw exception? 
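+      // Editor's note (not part of the patch): unlike HoodieSparkCompactor further down in this
+      // patch, which per its own comment treats even a single write error during compaction as
+      // fatal, clustering currently only logs the error count here. Callers needing stricter
+      // semantics would have to inspect the replace-commit metadata themselves.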
+ LOG.error("Clustering for instant (" + instant + ") failed with write errors"); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java index b81570de9ef10..b3dc27b6fc65b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java @@ -18,32 +18,40 @@ package org.apache.hudi.client; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; -import java.io.IOException; +import java.util.List; -public class HoodieSparkCompactor extends AbstractCompactor extends BaseCompactor>, JavaRDD, JavaRDD> { private static final Logger LOG = LogManager.getLogger(HoodieSparkCompactor.class); + private transient HoodieEngineContext context; - public HoodieSparkCompactor(AbstractHoodieWriteClient>, JavaRDD, JavaRDD> compactionClient) { + public HoodieSparkCompactor(BaseHoodieWriteClient>, JavaRDD, JavaRDD> compactionClient, + HoodieEngineContext context) { super(compactionClient); + this.context = context; } @Override - public void compact(HoodieInstant instant) throws IOException { + public void compact(HoodieInstant instant) { LOG.info("Compactor executing compaction " + instant); - SparkRDDWriteClient writeClient = (SparkRDDWriteClient)compactionClient; - JavaRDD res = writeClient.compact(instant.getTimestamp()); - long numWriteErrors = res.collect().stream().filter(WriteStatus::hasErrors).count(); + SparkRDDWriteClient writeClient = (SparkRDDWriteClient) compactionClient; + HoodieWriteMetadata> compactionMetadata = writeClient.compact(instant.getTimestamp()); + List writeStats = compactionMetadata.getCommitMetadata().get().getWriteStats(); + long numWriteErrors = writeStats.stream().mapToLong(HoodieWriteStat::getTotalWriteErrors).sum(); if (numWriteErrors != 0) { // We treat even a single error in compaction as fatal LOG.error("Compaction for instant (" + instant + ") failed with write errors. Errors :" + numWriteErrors); @@ -51,6 +59,6 @@ public void compact(HoodieInstant instant) throws IOException { "Compaction for instant (" + instant + ") failed with write errors. 
Errors :" + numWriteErrors); } // Commit compaction - writeClient.commitCompaction(instant.getTimestamp(), res, Option.empty()); + writeClient.commitCompaction(instant.getTimestamp(), compactionMetadata.getCommitMetadata().get(), Option.empty()); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDReadClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDReadClient.java new file mode 100644 index 0000000000000..adddabfdc0299 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDReadClient.java @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.conf.Configuration; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.StructType; + +import java.io.Serializable; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import scala.Tuple2; + +/** + * Provides an RDD based API for accessing/filtering Hoodie tables, based on keys. + */ +public class SparkRDDReadClient> implements Serializable { + + private static final long serialVersionUID = 1L; + + /** + * TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple + * base path pointing to the table. 
Until, then just always assume a BloomIndex + */ + private final transient HoodieIndex index; + private HoodieTable hoodieTable; + private transient Option sqlContextOpt; + private final transient HoodieSparkEngineContext context; + private final transient Configuration hadoopConf; + + /** + * @param basePath path to Hoodie table + */ + public SparkRDDReadClient(HoodieSparkEngineContext context, String basePath) { + this(context, HoodieWriteConfig.newBuilder().withPath(basePath) + // by default we use HoodieBloomIndex + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build()); + } + + /** + * @param context + * @param basePath + * @param sqlContext + */ + public SparkRDDReadClient(HoodieSparkEngineContext context, String basePath, SQLContext sqlContext) { + this(context, basePath); + this.sqlContextOpt = Option.of(sqlContext); + } + + /** + * Initializes the {@link HoodieReadClient} with engine context, base path, SQL context and index type. + * + * @param context Hudi Spark engine context + * @param basePath Base path of the table + * @param sqlContext {@link SQLContext} instance + * @param indexType Hudi index type + */ + public SparkRDDReadClient(HoodieSparkEngineContext context, String basePath, SQLContext sqlContext, HoodieIndex.IndexType indexType) { + this(context, HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()).build()); + this.sqlContextOpt = Option.of(sqlContext); + } + + /** + * @param clientConfig instance of HoodieWriteConfig + */ + public SparkRDDReadClient(HoodieSparkEngineContext context, HoodieWriteConfig clientConfig) { + this.context = context; + this.hadoopConf = context.getHadoopConf().get(); + final String basePath = clientConfig.getBasePath(); + // Create a Hoodie table which encapsulated the commits and files visible + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); + this.hoodieTable = HoodieSparkTable.create(clientConfig, context, metaClient); + this.index = SparkHoodieIndexFactory.createIndex(clientConfig); + this.sqlContextOpt = Option.empty(); + } + + /** + * Adds support for accessing Hoodie built tables from SparkSQL, as you normally would. + * + * @return SparkConf object to be used to construct the SparkContext by caller + */ + public static SparkConf addHoodieSupport(SparkConf conf) { + conf.set("spark.sql.hive.convertMetastoreParquet", "false"); + return conf; + } + + private void assertSqlContext() { + if (!sqlContextOpt.isPresent()) { + throw new IllegalStateException("SQLContext must be set, when performing dataframe operations"); + } + } + + private Option convertToDataFilePath(Option> partitionPathFileIDPair) { + if (partitionPathFileIDPair.isPresent()) { + HoodieBaseFile dataFile = hoodieTable.getBaseFileOnlyView() + .getLatestBaseFile(partitionPathFileIDPair.get().getLeft(), partitionPathFileIDPair.get().getRight()).get(); + return Option.of(dataFile.getPath()); + } else { + return Option.empty(); + } + } + + /** + * Given a bunch of hoodie keys, fetches all the individual records out as a data frame. 
+ * + * @return a dataframe + */ + public Dataset readROView(JavaRDD hoodieKeys, int parallelism) { + assertSqlContext(); + JavaPairRDD>> lookupResultRDD = checkExists(hoodieKeys); + JavaPairRDD> keyToFileRDD = + lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2))); + List paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent()) + .map(keyFileTuple -> keyFileTuple._2().get()).collect(); + + // record locations might be same for multiple keys, so need a unique list + Set uniquePaths = new HashSet<>(paths); + Dataset originalDF = null; + // read files based on the file extension name + if (paths.size() == 0 || paths.get(0).endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()])); + } else if (paths.get(0).endsWith(HoodieFileFormat.ORC.getFileExtension())) { + originalDF = sqlContextOpt.get().read().orc(uniquePaths.toArray(new String[uniquePaths.size()])); + } + StructType schema = originalDF.schema(); + JavaPairRDD keyRowRDD = originalDF.javaRDD().mapToPair(row -> { + HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), + row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); + return new Tuple2<>(key, row); + }); + + // Now, we need to further filter out, for only rows that match the supplied hoodie keys + JavaRDD rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1()); + return sqlContextOpt.get().createDataFrame(rowRDD, schema); + } + + /** + * Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]] If the optional + * FullFilePath value is not present, then the key is not found. If the FullFilePath value is present, it is the path + * component (without scheme) of the URI underlying file + */ + public JavaPairRDD>> checkExists(JavaRDD hoodieKeys) { + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(hoodieKeys.map(k -> new HoodieAvroRecord<>(k, null))), + context, hoodieTable)) + .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() + ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) + : Option.empty()) + ); + } + + /** + * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication. + * + * @param hoodieRecords Input RDD of Hoodie records. + * @return A subset of hoodieRecords RDD, with existing records filtered out. + */ + public JavaRDD> filterExists(JavaRDD> hoodieRecords) { + JavaRDD> recordsWithLocation = tagLocation(hoodieRecords); + return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); + } + + /** + * Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually + * present). Input RDD should contain no duplicates if needed. + * + * @param hoodieRecords Input RDD of Hoodie records + * @return Tagged RDD of Hoodie records + */ + public JavaRDD> tagLocation(JavaRDD> hoodieRecords) throws HoodieIndexException { + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(hoodieRecords), context, hoodieTable)); + } + + /** + * Return all pending compactions with instant time for clients to decide what to compact next. 
+ * + * @return + */ + public List> getPendingCompactions() { + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(hoodieTable.getMetaClient().getBasePath()).setLoadActiveTimelineOnLoad(true).build(); + return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream() + .map( + instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue())) + .collect(Collectors.toList()); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java index 10a55df9f882d..1f9fcf3ef9c29 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java @@ -18,31 +18,47 @@ package org.apache.hudi.client; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.client.utils.TransactionUtils; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.metrics.Registry; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.TableServiceType; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieWriteConflictException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.metrics.DistributedRegistry; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.compact.SparkCompactHelpers; -import org.apache.hudi.table.upgrade.SparkUpgradeDowngrade; +import org.apache.hudi.table.action.compact.CompactHelpers; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import 
org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper; import com.codahale.metrics.Timer; import org.apache.hadoop.conf.Configuration; @@ -50,29 +66,37 @@ import org.apache.log4j.Logger; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; -import java.io.IOException; -import java.text.ParseException; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; @SuppressWarnings("checkstyle:LineLength") public class SparkRDDWriteClient extends - AbstractHoodieWriteClient>, JavaRDD, JavaRDD> { + BaseHoodieWriteClient>, JavaRDD, JavaRDD> { private static final Logger LOG = LogManager.getLogger(SparkRDDWriteClient.class); public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { - super(context, clientConfig); + this(context, clientConfig, Option.empty()); } + @Deprecated public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending) { - super(context, writeConfig, rollbackPending); + this(context, writeConfig, Option.empty()); } + @Deprecated public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending, Option timelineService) { - super(context, writeConfig, rollbackPending, timelineService); + this(context, writeConfig, timelineService); + } + + public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, + Option timelineService) { + super(context, writeConfig, timelineService, SparkUpgradeDowngradeHelper.getInstance()); } /** @@ -87,8 +111,8 @@ public static SparkConf registerClasses(SparkConf conf) { } @Override - protected HoodieIndex>, JavaRDD, JavaRDD> createIndex(HoodieWriteConfig writeConfig) { - return SparkHoodieIndex.createIndex(config); + protected HoodieIndex createIndex(HoodieWriteConfig writeConfig) { + return SparkHoodieIndexFactory.createIndex(config); } /** @@ -97,13 +121,13 @@ protected HoodieIndex>, JavaRDD, JavaRDD writeStatuses, Option> extraMetadata, String commitActionType, Map> partitionToReplacedFileIds) { + context.setJobStatus(this.getClass().getSimpleName(), "Committing stats: " + config.getTableName()); List writeStats = writeStatuses.map(WriteStatus::getStat).collect(); return commitStats(instantTime, writeStats, extraMetadata, commitActionType, partitionToReplacedFileIds); } @Override - protected HoodieTable>, JavaRDD, JavaRDD> createTable(HoodieWriteConfig config, - Configuration hadoopConf) { + protected HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf) { return HoodieSparkTable.create(config, context); } @@ -112,7 +136,8 @@ public JavaRDD> filterExists(JavaRDD> hoodieReco // Create a Hoodie table which encapsulated the commits and files visible HoodieSparkTable table = HoodieSparkTable.create(config, context); Timer.Context indexTimer = metrics.getIndexCtx(); - JavaRDD> recordsWithLocation = getIndex().tagLocation(hoodieRecords, context, table); + JavaRDD> recordsWithLocation = HoodieJavaRDD.getJavaRDD( + getIndex().tagLocation(HoodieJavaRDD.of(hoodieRecords), context, table)); metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 
0L : indexTimer.stop())); return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); } @@ -122,57 +147,54 @@ public JavaRDD> filterExists(JavaRDD> hoodieReco */ @Override public void bootstrap(Option> extraMetadata) { - if (rollbackPending) { - rollBackInflightBootstrap(); - } - getTableAndInitCtx(WriteOperationType.UPSERT, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS).bootstrap(context, extraMetadata); + initTable(WriteOperationType.UPSERT, Option.ofNullable(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)).bootstrap(context, extraMetadata); } @Override public JavaRDD upsert(JavaRDD> records, String instantTime) { - HoodieTable>, JavaRDD, JavaRDD> table = - getTableAndInitCtx(WriteOperationType.UPSERT, instantTime); + HoodieTable>, HoodieData, HoodieData> table = + initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime)); table.validateUpsertSchema(); - setOperationType(WriteOperationType.UPSERT); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.upsert(context, instantTime, records); + preWrite(instantTime, WriteOperationType.UPSERT, table.getMetaClient()); + HoodieWriteMetadata> result = table.upsert(context, instantTime, HoodieJavaRDD.of(records)); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } - return postWrite(result, instantTime, table); + return postWrite(resultRDD, instantTime, table); } @Override public JavaRDD upsertPreppedRecords(JavaRDD> preppedRecords, String instantTime) { - HoodieTable>, JavaRDD, JavaRDD> table = - getTableAndInitCtx(WriteOperationType.UPSERT_PREPPED, instantTime); + HoodieTable>, HoodieData, HoodieData> table = + initTable(WriteOperationType.UPSERT_PREPPED, Option.ofNullable(instantTime)); table.validateUpsertSchema(); - setOperationType(WriteOperationType.UPSERT_PREPPED); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.upsertPrepped(context,instantTime, preppedRecords); - return postWrite(result, instantTime, table); + preWrite(instantTime, WriteOperationType.UPSERT_PREPPED, table.getMetaClient()); + HoodieWriteMetadata> result = table.upsertPrepped(context,instantTime, HoodieJavaRDD.of(preppedRecords)); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return postWrite(resultRDD, instantTime, table); } @Override public JavaRDD insert(JavaRDD> records, String instantTime) { - HoodieTable>, JavaRDD, JavaRDD> table = - getTableAndInitCtx(WriteOperationType.INSERT, instantTime); + HoodieTable>, HoodieData, HoodieData> table = + initTable(WriteOperationType.INSERT, Option.ofNullable(instantTime)); table.validateInsertSchema(); - setOperationType(WriteOperationType.INSERT); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.insert(context,instantTime, records); - return postWrite(result, instantTime, table); + preWrite(instantTime, WriteOperationType.INSERT, table.getMetaClient()); + HoodieWriteMetadata> result = table.insert(context, instantTime, HoodieJavaRDD.of(records)); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return postWrite(resultRDD, instantTime, table); } @Override 
public JavaRDD insertPreppedRecords(JavaRDD> preppedRecords, String instantTime) { - HoodieTable>, JavaRDD, JavaRDD> table = - getTableAndInitCtx(WriteOperationType.INSERT_PREPPED, instantTime); + HoodieTable>, HoodieData, HoodieData> table = + initTable(WriteOperationType.INSERT_PREPPED, Option.ofNullable(instantTime)); table.validateInsertSchema(); - setOperationType(WriteOperationType.INSERT_PREPPED); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.insertPrepped(context,instantTime, preppedRecords); - return postWrite(result, instantTime, table); + preWrite(instantTime, WriteOperationType.INSERT_PREPPED, table.getMetaClient()); + HoodieWriteMetadata> result = table.insertPrepped(context,instantTime, HoodieJavaRDD.of(preppedRecords)); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return postWrite(resultRDD, instantTime, table); } /** @@ -183,15 +205,14 @@ public JavaRDD insertPreppedRecords(JavaRDD> preppe * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ public HoodieWriteResult insertOverwrite(JavaRDD> records, final String instantTime) { - HoodieTable table = getTableAndInitCtx(WriteOperationType.INSERT_OVERWRITE, instantTime); + HoodieTable>, HoodieData, HoodieData> table = initTable(WriteOperationType.INSERT_OVERWRITE, Option.ofNullable(instantTime)); table.validateInsertSchema(); - setOperationType(WriteOperationType.INSERT_OVERWRITE); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata result = table.insertOverwrite(context, instantTime, records); - return new HoodieWriteResult(postWrite(result, instantTime, table), result.getPartitionToReplaceFileIds()); + preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE, table.getMetaClient()); + HoodieWriteMetadata> result = table.insertOverwrite(context, instantTime, HoodieJavaRDD.of(records)); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds()); } - /** * Removes all existing records of the Hoodie table and inserts the given HoodieRecords, into the table. 
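For illustration, the upsert/insert paths above all share one shape: wrap the engine-specific JavaRDD into the engine-agnostic HoodieData, run the table action, then clone the resulting metadata back onto a JavaRDD for the caller. The JDK-only sketch below shows that wrap/act/unwrap shape using hypothetical EngineData and WriteMetadata stand-ins; it is not the actual Hudi API, just the pattern.

import java.util.List;

// Hypothetical stand-ins: EngineData plays the role HoodieData plays in the patch,
// and WriteMetadata mirrors the clone(...) call on HoodieWriteMetadata.
final class EngineData<T> {
  private final List<T> items;
  private EngineData(List<T> items) { this.items = items; }
  static <T> EngineData<T> of(List<T> items) { return new EngineData<>(items); } // ~ HoodieJavaRDD.of(rdd)
  List<T> unwrap() { return items; }                                             // ~ HoodieJavaRDD.getJavaRDD(data)
}

final class WriteMetadata<O> {
  private final O statuses;
  WriteMetadata(O statuses) { this.statuses = statuses; }
  O statuses() { return statuses; }
  <P> WriteMetadata<P> withStatuses(P other) { return new WriteMetadata<>(other); } // ~ result.clone(...)
}

public class WritePathSketch {
  // Engine-agnostic "table action": it only sees the wrapper, never the concrete RDD or list.
  static WriteMetadata<EngineData<String>> upsert(EngineData<String> records) {
    return new WriteMetadata<>(records);
  }

  public static void main(String[] args) {
    List<String> engineSpecificRecords = List.of("r1", "r2");
    // wrap -> run the engine-agnostic action -> unwrap statuses back into the caller-facing type
    WriteMetadata<EngineData<String>> result = upsert(EngineData.of(engineSpecificRecords));
    WriteMetadata<List<String>> callerView = result.withStatuses(result.statuses().unwrap());
    System.out.println(callerView.statuses());
  }
}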
@@ -200,12 +221,12 @@ public HoodieWriteResult insertOverwrite(JavaRDD> records, final * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ public HoodieWriteResult insertOverwriteTable(JavaRDD> records, final String instantTime) { - HoodieTable table = getTableAndInitCtx(WriteOperationType.INSERT_OVERWRITE_TABLE, instantTime); + HoodieTable>, HoodieData, HoodieData> table = initTable(WriteOperationType.INSERT_OVERWRITE_TABLE, Option.ofNullable(instantTime)); table.validateInsertSchema(); - setOperationType(WriteOperationType.INSERT_OVERWRITE_TABLE); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata result = table.insertOverwriteTable(context, instantTime, records); - return new HoodieWriteResult(postWrite(result, instantTime, table), result.getPartitionToReplaceFileIds()); + preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE_TABLE, table.getMetaClient()); + HoodieWriteMetadata> result = table.insertOverwriteTable(context, instantTime, HoodieJavaRDD.of(records)); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds()); } @Override @@ -214,39 +235,48 @@ public JavaRDD bulkInsert(JavaRDD> records, String } @Override - public JavaRDD bulkInsert(JavaRDD> records, String instantTime, Option>>> userDefinedBulkInsertPartitioner) { - HoodieTable>, JavaRDD, JavaRDD> table = - getTableAndInitCtx(WriteOperationType.BULK_INSERT, instantTime); + public JavaRDD bulkInsert(JavaRDD> records, String instantTime, Option userDefinedBulkInsertPartitioner) { + HoodieTable>, HoodieData, HoodieData> table = + initTable(WriteOperationType.BULK_INSERT, Option.ofNullable(instantTime)); table.validateInsertSchema(); - setOperationType(WriteOperationType.BULK_INSERT); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.bulkInsert(context,instantTime, records, userDefinedBulkInsertPartitioner); - return postWrite(result, instantTime, table); + preWrite(instantTime, WriteOperationType.BULK_INSERT, table.getMetaClient()); + HoodieWriteMetadata> result = table.bulkInsert(context,instantTime, HoodieJavaRDD.of(records), userDefinedBulkInsertPartitioner); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return postWrite(resultRDD, instantTime, table); } @Override - public JavaRDD bulkInsertPreppedRecords(JavaRDD> preppedRecords, String instantTime, Option>>> bulkInsertPartitioner) { - HoodieTable>, JavaRDD, JavaRDD> table = - getTableAndInitCtx(WriteOperationType.BULK_INSERT_PREPPED, instantTime); + public JavaRDD bulkInsertPreppedRecords(JavaRDD> preppedRecords, String instantTime, Option bulkInsertPartitioner) { + HoodieTable>, HoodieData, HoodieData> table = + initTable(WriteOperationType.BULK_INSERT_PREPPED, Option.ofNullable(instantTime)); table.validateInsertSchema(); - setOperationType(WriteOperationType.BULK_INSERT_PREPPED); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.bulkInsertPrepped(context,instantTime, preppedRecords, bulkInsertPartitioner); - return postWrite(result, instantTime, table); + preWrite(instantTime, WriteOperationType.BULK_INSERT_PREPPED, table.getMetaClient()); + HoodieWriteMetadata> result = 
table.bulkInsertPrepped(context,instantTime, HoodieJavaRDD.of(preppedRecords), bulkInsertPartitioner); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return postWrite(resultRDD, instantTime, table); } @Override public JavaRDD delete(JavaRDD keys, String instantTime) { - HoodieTable>, JavaRDD, JavaRDD> table = getTableAndInitCtx(WriteOperationType.DELETE, instantTime); - setOperationType(WriteOperationType.DELETE); - HoodieWriteMetadata> result = table.delete(context,instantTime, keys); - return postWrite(result, instantTime, table); + HoodieTable>, HoodieData, HoodieData> table = initTable(WriteOperationType.DELETE, Option.ofNullable(instantTime)); + preWrite(instantTime, WriteOperationType.DELETE, table.getMetaClient()); + HoodieWriteMetadata> result = table.delete(context,instantTime, HoodieJavaRDD.of(keys)); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return postWrite(resultRDD, instantTime, table); + } + + public HoodieWriteResult deletePartitions(List partitions, String instantTime) { + HoodieTable>, HoodieData, HoodieData> table = initTable(WriteOperationType.DELETE_PARTITION, Option.ofNullable(instantTime)); + preWrite(instantTime, WriteOperationType.DELETE_PARTITION, table.getMetaClient()); + HoodieWriteMetadata> result = table.deletePartitions(context, instantTime, partitions); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds()); } @Override protected JavaRDD postWrite(HoodieWriteMetadata> result, String instantTime, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { + HoodieTable hoodieTable) { if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis()); } @@ -257,7 +287,7 @@ protected JavaRDD postWrite(HoodieWriteMetadata postWrite(HoodieWriteMetadata writeStatuses, Option> extraMetadata) throws IOException { + public void commitCompaction(String compactionInstantTime, HoodieCommitMetadata metadata, Option> extraMetadata) { HoodieSparkTable table = HoodieSparkTable.create(config, context); - HoodieCommitMetadata metadata = SparkCompactHelpers.newInstance().createCompactionMetadata( - table, compactionInstantTime, writeStatuses, config.getSchema()); extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata)); - completeCompaction(metadata, writeStatuses, table, compactionInstantTime); + completeCompaction(metadata, table, compactionInstantTime); } @Override - protected void completeCompaction(HoodieCommitMetadata metadata, JavaRDD writeStatuses, - HoodieTable>, JavaRDD, JavaRDD> table, + protected void completeCompaction(HoodieCommitMetadata metadata, + HoodieTable table, String compactionCommitTime) { - List writeStats = writeStatuses.map(WriteStatus::getStat).collect(); - finalizeWrite(table, compactionCommitTime, writeStats); - LOG.info("Committing Compaction " + compactionCommitTime + ". 
Finished with result " + metadata); - SparkCompactHelpers.newInstance().completeInflightCompaction(table, compactionCommitTime, metadata); - + this.context.setJobStatus(this.getClass().getSimpleName(), "Collect compaction write status and commit compaction: " + config.getTableName()); + List writeStats = metadata.getWriteStats(); + final HoodieInstant compactionInstant = HoodieTimeline.getCompactionInflightInstant(compactionCommitTime); + try { + this.txnManager.beginTransaction(Option.of(compactionInstant), Option.empty()); + finalizeWrite(table, compactionCommitTime, writeStats); + // commit to data table after committing to metadata table. + updateTableMetadata(table, metadata, compactionInstant); + LOG.info("Committing Compaction " + compactionCommitTime + ". Finished with result " + metadata); + CompactHelpers.getInstance().completeInflightCompaction(table, compactionCommitTime, metadata); + } finally { + this.txnManager.endTransaction(Option.of(compactionInstant)); + } + WriteMarkersFactory.get(config.getMarkersType(), table, compactionCommitTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); if (compactionTimer != null) { long durationInMs = metrics.getDurationInMs(compactionTimer.stop()); - try { - metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(), - durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION); - } catch (ParseException e) { - throw new HoodieCommitException("Commit time is not of valid format. Failed to commit compaction " - + config.getBasePath() + " at time " + compactionCommitTime, e); - } + HoodieActiveTimeline.parseDateFromInstantTimeSafely(compactionCommitTime).ifPresent(parsedInstant -> + metrics.updateCommitMetrics(parsedInstant.getTime(), durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION) + ); } LOG.info("Compacted successfully on commit " + compactionCommitTime); } @Override - protected JavaRDD compact(String compactionInstantTime, boolean shouldComplete) { + protected HoodieWriteMetadata> compact(String compactionInstantTime, boolean shouldComplete) { HoodieSparkTable table = HoodieSparkTable.create(config, context); + preWrite(compactionInstantTime, WriteOperationType.COMPACT, table.getMetaClient()); HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime); if (pendingCompactionTimeline.containsInstant(inflightInstant)) { - rollbackInflightCompaction(inflightInstant, table); + table.rollbackInflightCompaction(inflightInstant, commitToRollback -> getPendingRollbackInfo(table.getMetaClient(), commitToRollback, false)); table.getMetaClient().reloadActiveTimeline(); } compactionTimer = metrics.getCompactionCtx(); - HoodieWriteMetadata> compactionMetadata = table.compact(context, compactionInstantTime); - JavaRDD statuses = compactionMetadata.getWriteStatuses(); + HoodieWriteMetadata> writeMetadata = table.compact(context, compactionInstantTime); + HoodieWriteMetadata> compactionMetadata = writeMetadata.clone(HoodieJavaRDD.getJavaRDD(writeMetadata.getWriteStatuses())); if (shouldComplete && compactionMetadata.getCommitMetadata().isPresent()) { - completeCompaction(compactionMetadata.getCommitMetadata().get(), statuses, table, compactionInstantTime); + completeTableService(TableServiceType.COMPACT, compactionMetadata.getCommitMetadata().get(), table, compactionInstantTime); } - return statuses; + return 
compactionMetadata; } @Override - protected HoodieTable>, JavaRDD, JavaRDD> getTableAndInitCtx(WriteOperationType operationType, String instantTime) { - HoodieTableMetaClient metaClient = createMetaClient(true); - new SparkUpgradeDowngrade(metaClient, config, context).run(metaClient, HoodieTableVersion.current(), config, context, instantTime); - return getTableAndInitCtx(metaClient, operationType); + public HoodieWriteMetadata> cluster(String clusteringInstant, boolean shouldComplete) { + HoodieSparkTable table = HoodieSparkTable.create(config, context); + preWrite(clusteringInstant, WriteOperationType.CLUSTER, table.getMetaClient()); + HoodieTimeline pendingClusteringTimeline = table.getActiveTimeline().filterPendingReplaceTimeline(); + HoodieInstant inflightInstant = HoodieTimeline.getReplaceCommitInflightInstant(clusteringInstant); + if (pendingClusteringTimeline.containsInstant(inflightInstant)) { + table.rollbackInflightClustering(inflightInstant, commitToRollback -> getPendingRollbackInfo(table.getMetaClient(), commitToRollback, false)); + table.getMetaClient().reloadActiveTimeline(); + } + clusteringTimer = metrics.getClusteringCtx(); + LOG.info("Starting clustering at " + clusteringInstant); + HoodieWriteMetadata> writeMetadata = table.cluster(context, clusteringInstant); + HoodieWriteMetadata> clusteringMetadata = writeMetadata.clone(HoodieJavaRDD.getJavaRDD(writeMetadata.getWriteStatuses())); + // Validation has to be done after cloning. if not, it could result in dereferencing the write status twice which means clustering could get executed twice. + validateClusteringCommit(clusteringMetadata, clusteringInstant, table); + // TODO : Where is shouldComplete used ? + if (shouldComplete && clusteringMetadata.getCommitMetadata().isPresent()) { + completeTableService(TableServiceType.CLUSTER, clusteringMetadata.getCommitMetadata().get(), table, clusteringInstant); + } + return clusteringMetadata; + } + + private void completeClustering(HoodieReplaceCommitMetadata metadata, + HoodieTable table, + String clusteringCommitTime) { + List writeStats = metadata.getPartitionToWriteStats().entrySet().stream().flatMap(e -> + e.getValue().stream()).collect(Collectors.toList()); + + if (writeStats.stream().mapToLong(s -> s.getTotalWriteErrors()).sum() > 0) { + throw new HoodieClusteringException("Clustering failed to write to files:" + + writeStats.stream().filter(s -> s.getTotalWriteErrors() > 0L).map(s -> s.getFileId()).collect(Collectors.joining(","))); + } + + final HoodieInstant clusteringInstant = HoodieTimeline.getReplaceCommitInflightInstant(clusteringCommitTime); + try { + this.txnManager.beginTransaction(Option.of(clusteringInstant), Option.empty()); + + finalizeWrite(table, clusteringCommitTime, writeStats); + // Update table's metadata (table) + updateTableMetadata(table, metadata, clusteringInstant); + + LOG.info("Committing Clustering " + clusteringCommitTime + ". 
Finished with result " + metadata); + + table.getActiveTimeline().transitionReplaceInflightToComplete( + clusteringInstant, + Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + } catch (Exception e) { + throw new HoodieClusteringException("unable to transition clustering inflight to complete: " + clusteringCommitTime, e); + } finally { + this.txnManager.endTransaction(Option.of(clusteringInstant)); + } + WriteMarkersFactory.get(config.getMarkersType(), table, clusteringCommitTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + if (clusteringTimer != null) { + long durationInMs = metrics.getDurationInMs(clusteringTimer.stop()); + HoodieActiveTimeline.parseDateFromInstantTimeSafely(clusteringCommitTime).ifPresent(parsedInstant -> + metrics.updateCommitMetrics(parsedInstant.getTime(), durationInMs, metadata, HoodieActiveTimeline.REPLACE_COMMIT_ACTION) + ); + } + LOG.info("Clustering successfully on commit " + clusteringCommitTime); + } + + private void validateClusteringCommit(HoodieWriteMetadata> clusteringMetadata, String clusteringCommitTime, HoodieTable table) { + if (clusteringMetadata.getWriteStatuses().isEmpty()) { + HoodieClusteringPlan clusteringPlan = ClusteringUtils.getClusteringPlan( + table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(clusteringCommitTime)) + .map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException( + "Unable to read clustering plan for instant: " + clusteringCommitTime)); + throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + clusteringCommitTime + + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least " + + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum() + + " write statuses"); + } } - private HoodieTable>, JavaRDD, JavaRDD> getTableAndInitCtx(HoodieTableMetaClient metaClient, WriteOperationType operationType) { - if (operationType == WriteOperationType.DELETE) { - setWriteSchemaForDeletes(metaClient); + private void updateTableMetadata(HoodieTable table, HoodieCommitMetadata commitMetadata, + HoodieInstant hoodieInstant) { + boolean isTableServiceAction = table.isTableServiceAction(hoodieInstant.getAction(), hoodieInstant.getTimestamp()); + // Do not do any conflict resolution here as we do with regular writes. We take the lock here to ensure all writes to metadata table happens within a + // single lock (single writer). Because more than one write to metadata table will result in conflicts since all of them updates the same partition. 
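As a self-contained sketch of the validateClusteringCommit check above: a clustering run that produced no write statuses is rejected, and the error reports how many statuses the plan expected. The records below (ClusteringPlanSketch, ClusteringGroupSketch) are hypothetical stand-ins for the Avro plan model.

import java.util.List;

// Hypothetical plan model: each input group declares how many output file groups it expects.
record ClusteringGroupSketch(int numOutputFileGroups) {}
record ClusteringPlanSketch(List<ClusteringGroupSketch> inputGroups) {}

public class ValidateClusteringSketch {
  // An empty write-status result for a non-empty plan is an error; the message reports
  // the minimum number of write statuses implied by the plan, as in the patch above.
  static void validate(List<String> writeStatuses, ClusteringPlanSketch plan, String instant) {
    if (writeStatuses.isEmpty()) {
      int expected = plan.inputGroups().stream()
          .mapToInt(ClusteringGroupSketch::numOutputFileGroups).sum();
      throw new IllegalStateException("Clustering plan produced 0 WriteStatus for " + instant
          + " #groups: " + plan.inputGroups().size()
          + " expected at least " + expected + " write statuses");
    }
  }

  public static void main(String[] args) {
    ClusteringPlanSketch plan = new ClusteringPlanSketch(
        List.of(new ClusteringGroupSketch(2), new ClusteringGroupSketch(3)));
    validate(List.of("ws-1"), plan, "20220101000000"); // passes
    try {
      validate(List.of(), plan, "20220101000000");     // throws: expected at least 5
    } catch (IllegalStateException e) {
      System.out.println(e.getMessage());
    }
  }
}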
+ table.getMetadataWriter(hoodieInstant.getTimestamp()) + .ifPresent(writer -> ((HoodieTableMetadataWriter) writer).update(commitMetadata, hoodieInstant.getTimestamp(), isTableServiceAction)); + } + + @Override + protected HoodieTable doInitTable(HoodieTableMetaClient metaClient, Option instantTime, boolean initialMetadataTableIfNecessary) { + if (initialMetadataTableIfNecessary) { + // Initialize Metadata Table to make sure it's bootstrapped _before_ the operation, + // if it didn't exist before + // See https://issues.apache.org/jira/browse/HUDI-3343 for more details + initializeMetadataTable(instantTime); } + // Create a Hoodie table which encapsulated the commits and files visible - HoodieSparkTable table = HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient); - if (table.getMetaClient().getCommitActionType().equals(HoodieTimeline.COMMIT_ACTION)) { - writeTimer = metrics.getCommitCtx(); - } else { - writeTimer = metrics.getDeltaCommitCtx(); + return HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient); + } + + /** + * Initialize the metadata table if needed. Creating the metadata table writer + * will trigger the initial bootstrapping from the data table. + * + * @param inFlightInstantTimestamp - The in-flight action responsible for the metadata table initialization + */ + private void initializeMetadataTable(Option inFlightInstantTimestamp) { + if (config.isMetadataTableEnabled()) { + HoodieTableMetadataWriter writer = SparkHoodieBackedTableMetadataWriter.create(context.getHadoopConf().get(), config, + context, Option.empty(), inFlightInstantTimestamp); + try { + writer.close(); + } catch (Exception e) { + throw new HoodieException("Failed to instantiate Metadata table ", e); + } + } + } + + // TODO : To enforce priority between table service and ingestion writer, use transactions here and invoke strategy + private void completeTableService(TableServiceType tableServiceType, HoodieCommitMetadata metadata, + HoodieTable table, + String commitInstant) { + + switch (tableServiceType) { + case CLUSTER: + completeClustering((HoodieReplaceCommitMetadata) metadata, table, commitInstant); + break; + case COMPACT: + completeCompaction(metadata, table, commitInstant); + break; + default: + throw new IllegalArgumentException("This table service is not valid " + tableServiceType); + } + } + + @Override + protected void preCommit(HoodieInstant inflightInstant, HoodieCommitMetadata metadata) { + // Create a Hoodie table after startTxn which encapsulated the commits and files visible. 
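The completeTableService method above is a single dispatch point keyed on the table service type, so compaction and clustering share one completion entry path and any other value fails loudly. A minimal sketch of that dispatch, with a hypothetical enum and handler methods in place of the real ones:

// Hypothetical enum and handlers; the real method dispatches to completeClustering/completeCompaction.
enum TableServiceKind { CLUSTER, COMPACT }

public class CompleteTableServiceSketch {
  static void completeClustering(String instant) { System.out.println("complete clustering @ " + instant); }
  static void completeCompaction(String instant) { System.out.println("complete compaction @ " + instant); }

  // Single entry point: callers only name the service type; anything unrecognized is rejected.
  static void completeTableService(TableServiceKind kind, String commitInstant) {
    switch (kind) {
      case CLUSTER -> completeClustering(commitInstant);
      case COMPACT -> completeCompaction(commitInstant);
      default -> throw new IllegalArgumentException("This table service is not valid " + kind);
    }
  }

  public static void main(String[] args) {
    completeTableService(TableServiceKind.COMPACT, "20220101000000");
    completeTableService(TableServiceKind.CLUSTER, "20220101000001");
  }
}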
+ // Important to create this after the lock to ensure the latest commits show up in the timeline without need for reload + HoodieTable table = createTable(config, hadoopConf); + Timer.Context conflictResolutionTimer = metrics.getConflictResolutionCtx(); + try { + TransactionUtils.resolveWriteConflictIfAny(table, this.txnManager.getCurrentTransactionOwner(), + Option.of(metadata), config, txnManager.getLastCompletedTransactionOwner(), false, this.pendingInflightAndRequestedInstants); + metrics.emitConflictResolutionSuccessful(); + } catch (HoodieWriteConflictException e) { + metrics.emitConflictResolutionFailed(); + throw e; + } finally { + if (conflictResolutionTimer != null) { + conflictResolutionTimer.stop(); + } + } + } + + @Override + protected void initWrapperFSMetrics() { + if (config.isMetricsOn()) { + Registry registry; + Registry registryMeta; + JavaSparkContext jsc = ((HoodieSparkEngineContext) context).getJavaSparkContext(); + + if (config.isExecutorMetricsEnabled()) { + // Create a distributed registry for HoodieWrapperFileSystem + registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName(), + DistributedRegistry.class.getName()); + ((DistributedRegistry)registry).register(jsc); + registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder", + DistributedRegistry.class.getName()); + ((DistributedRegistry)registryMeta).register(jsc); + } else { + registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName()); + registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder"); + } + + HoodieWrapperFileSystem.setMetricsRegistry(registry, registryMeta); + } + } + + @Override + protected void releaseResources() { + // If we do not explicitly release the resource, spark will automatically manage the resource and clean it up automatically + // see: https://spark.apache.org/docs/latest/rdd-programming-guide.html#removing-data + if (config.areReleaseResourceEnabled()) { + ((HoodieSparkEngineContext) context).getJavaSparkContext().getPersistentRDDs().values() + .forEach(JavaRDD::unpersist); } - return table; } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkTaskContextSupplier.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkTaskContextSupplier.java index 563bc1a77f075..d118f0ead8d8e 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkTaskContextSupplier.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkTaskContextSupplier.java @@ -18,8 +18,8 @@ package org.apache.hudi.client; -import org.apache.hudi.client.common.EngineProperty; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; @@ -78,6 +78,14 @@ public Option getProperty(EngineProperty prop) { .get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION)); } return Option.empty(); + } else if (prop == EngineProperty.TOTAL_CORES_PER_EXECUTOR) { + final String DEFAULT_SPARK_EXECUTOR_CORES = "1"; + final String SPARK_EXECUTOR_EXECUTOR_CORES_PROP = "spark.executor.cores"; + if (SparkEnv.get() != null) { + return Option.ofNullable(SparkEnv.get().conf() + .get(SPARK_EXECUTOR_EXECUTOR_CORES_PROP, DEFAULT_SPARK_EXECUTOR_CORES)); + } + return Option.empty(); } throw new 
HoodieException("Unknown engine property :" + prop); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java index 30cde59febe11..b161182b83a36 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/bootstrap/HoodieSparkBootstrapSchemaProvider.java @@ -18,25 +18,33 @@ package org.apache.hudi.client.bootstrap; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.Path; +import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.bootstrap.FileStatusUtils; -import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.AvroOrcUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.parquet.schema.MessageType; -import org.apache.spark.sql.avro.SchemaConverters; -import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.StructType; +import java.io.IOException; import java.util.List; import java.util.Objects; +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + public class HoodieSparkBootstrapSchemaProvider extends HoodieBootstrapSchemaProvider { public HoodieSparkBootstrapSchemaProvider(HoodieWriteConfig writeConfig) { super(writeConfig); @@ -44,25 +52,53 @@ public HoodieSparkBootstrapSchemaProvider(HoodieWriteConfig writeConfig) { @Override protected Schema getBootstrapSourceSchema(HoodieEngineContext context, List>> partitions) { - MessageType parquetSchema = partitions.stream().flatMap(p -> p.getValue().stream()).map(fs -> { - try { - Path filePath = FileStatusUtils.toPath(fs.getPath()); - return ParquetUtils.readSchema(context.getHadoopConf().get(), filePath); - } catch (Exception ex) { - return null; - } - }).filter(Objects::nonNull).findAny() + Schema schema = partitions.stream().flatMap(p -> p.getValue().stream()).map(fs -> { + Path filePath = FileStatusUtils.toPath(fs.getPath()); + String extension = FSUtils.getFileExtension(filePath.getName()); + if (PARQUET.getFileExtension().equals(extension)) { + return getBootstrapSourceSchemaParquet(writeConfig, context, filePath); + } else if (ORC.getFileExtension().equals(extension)) { + return getBootstrapSourceSchemaOrc(writeConfig, context, filePath); + } else { + throw new HoodieException("Could not determine schema from the data files."); + } + } + ).filter(Objects::nonNull).findAny() .orElseThrow(() -> new HoodieException("Could not determine schema from the data files.")); + return schema; + } - - ParquetToSparkSchemaConverter converter = new 
ParquetToSparkSchemaConverter( - Boolean.parseBoolean(SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()), - Boolean.parseBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString())); - StructType sparkSchema = converter.convert(parquetSchema); + private static Schema getBootstrapSourceSchemaParquet(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) { + // NOTE: The type inference of partition column in the parquet table is turned off explicitly, + // to be consistent with the existing bootstrap behavior, where the partition column is String + // typed in Hudi table. + // TODO(HUDI-4932): add a config to allow type inference of partition column in bootstrap and + // support other types of partition column as well + ((HoodieSparkEngineContext) context).getSqlContext() + .setConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE(), false); + StructType parquetSchema = ((HoodieSparkEngineContext) context).getSqlContext().read() + .option("basePath", writeConfig.getBootstrapSourceBasePath()) + .parquet(filePath.toString()) + .schema(); String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName()); String structName = tableName + "_record"; String recordNamespace = "hoodie." + tableName; - return SchemaConverters.toAvroType(sparkSchema, false, structName, recordNamespace); + return AvroConversionUtils.convertStructTypeToAvroSchema(parquetSchema, structName, recordNamespace); + } + + private static Schema getBootstrapSourceSchemaOrc(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) { + Reader orcReader = null; + try { + orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(context.getHadoopConf().get())); + } catch (IOException e) { + throw new HoodieException("Could not determine schema from the data files."); + } + TypeDescription orcSchema = orcReader.getSchema(); + String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName()); + String structName = tableName + "_record"; + String recordNamespace = "hoodie." + tableName; + return AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, structName, recordNamespace, true); } + } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSingleFileSortPlanStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSingleFileSortPlanStrategy.java new file mode 100644 index 0000000000000..88c3057f2f0ad --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSingleFileSortPlanStrategy.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
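The reworked bootstrap schema provider above chooses a schema reader by file extension (Parquet via the Spark reader, ORC via the ORC reader) and otherwise fails with a single error message. Sketching just that dispatch with JDK types, where string placeholders stand in for the real schema objects and reader calls:

import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;

public class BootstrapSchemaDispatchSketch {
  // Hypothetical per-format readers; the real code delegates to Spark's Parquet reader and the ORC reader.
  static final Map<String, Function<String, String>> READERS = Map.of(
      ".parquet", path -> "schema-from-parquet:" + path,
      ".orc", path -> "schema-from-orc:" + path);

  // Inspect the extension, delegate to the matching reader, and fail with one clear
  // message when no file yields a schema, mirroring the dispatch in the patch.
  static String deriveSchema(List<String> dataFiles) {
    return dataFiles.stream()
        .map(path -> {
          String ext = path.substring(path.lastIndexOf('.'));
          Function<String, String> reader = READERS.get(ext);
          if (reader == null) {
            throw new IllegalStateException("Could not determine schema from the data files.");
          }
          return reader.apply(path);
        })
        .filter(Objects::nonNull)
        .findAny()
        .orElseThrow(() -> new IllegalStateException("Could not determine schema from the data files."));
  }

  public static void main(String[] args) {
    System.out.println(deriveSchema(List.of("/warehouse/part-00000.parquet")));
    System.out.println(deriveSchema(List.of("/warehouse/part-00000.orc")));
  }
}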
+ */ + +package org.apache.hudi.client.clustering.plan.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.apache.hudi.table.HoodieSparkMergeOnReadTable; + +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * In this strategy, clustering group for each partition is built in the same way as {@link SparkSizeBasedClusteringPlanStrategy}. + * The difference is that the output groups is 1 and file group id remains the same. + */ +public class SparkSingleFileSortPlanStrategy> + extends SparkSizeBasedClusteringPlanStrategy { + + public SparkSingleFileSortPlanStrategy(HoodieSparkCopyOnWriteTable table, HoodieSparkEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + public SparkSingleFileSortPlanStrategy(HoodieSparkMergeOnReadTable table, HoodieSparkEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + protected Stream buildClusteringGroupsForPartition(String partitionPath, List fileSlices) { + List, Integer>> fileSliceGroups = fileSlices.stream() + .map(fileSlice -> Pair.of(Collections.singletonList(fileSlice), 1)).collect(Collectors.toList()); + return fileSliceGroups.stream().map(fileSliceGroup -> HoodieClusteringGroup.newBuilder() + .setSlices(getFileSliceInfo(fileSliceGroup.getLeft())) + .setNumOutputFileGroups(fileSliceGroup.getRight()) + .setMetrics(buildMetrics(fileSliceGroup.getLeft())) + .build()); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java new file mode 100644 index 0000000000000..6629569d096b3 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
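SparkSingleFileSortPlanStrategy above emits one clustering group per file slice with exactly one output file group, so each file group is rewritten (sorted) in place rather than merged with its neighbours. A tiny sketch of that grouping rule, using hypothetical SliceSketch and GroupSketch records in place of FileSlice and HoodieClusteringGroup:

import java.util.List;
import java.util.stream.Stream;

// Hypothetical slice/group types standing in for FileSlice and HoodieClusteringGroup.
record SliceSketch(String fileId, long sizeBytes) {}
record GroupSketch(List<SliceSketch> slices, int numOutputFileGroups) {}

public class SingleFileSortPlanSketch {
  // One group per input slice, always one output file group: the file group id is preserved
  // and the slice is simply rewritten in sorted order.
  static Stream<GroupSketch> buildGroups(List<SliceSketch> fileSlices) {
    return fileSlices.stream().map(slice -> new GroupSketch(List.of(slice), 1));
  }

  public static void main(String[] args) {
    buildGroups(List.of(new SliceSketch("f1", 10_000_000L), new SliceSketch("f2", 42_000_000L)))
        .forEach(System.out::println);
  }
}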
+ */ + +package org.apache.hudi.client.clustering.plan.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.apache.hudi.table.HoodieSparkMergeOnReadTable; +import org.apache.hudi.table.action.cluster.strategy.PartitionAwareClusteringPlanStrategy; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS; + +/** + * Clustering Strategy based on following. + * 1) Creates clustering groups based on max size allowed per group. + * 2) Excludes files that are greater than 'small.file.limit' from clustering plan. + */ +public class SparkSizeBasedClusteringPlanStrategy> + extends PartitionAwareClusteringPlanStrategy>, JavaRDD, JavaRDD> { + private static final Logger LOG = LogManager.getLogger(SparkSizeBasedClusteringPlanStrategy.class); + + public SparkSizeBasedClusteringPlanStrategy(HoodieSparkCopyOnWriteTable table, + HoodieSparkEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + public SparkSizeBasedClusteringPlanStrategy(HoodieSparkMergeOnReadTable table, + HoodieSparkEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + protected Stream buildClusteringGroupsForPartition(String partitionPath, List fileSlices) { + HoodieWriteConfig writeConfig = getWriteConfig(); + + List, Integer>> fileSliceGroups = new ArrayList<>(); + List currentGroup = new ArrayList<>(); + long totalSizeSoFar = 0; + + for (FileSlice currentSlice : fileSlices) { + // check if max size is reached and create new group, if needed. + if (totalSizeSoFar >= writeConfig.getClusteringMaxBytesInGroup() && !currentGroup.isEmpty()) { + int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, writeConfig.getClusteringTargetFileMaxBytes()); + LOG.info("Adding one clustering group " + totalSizeSoFar + " max bytes: " + + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups); + fileSliceGroups.add(Pair.of(currentGroup, numOutputGroups)); + currentGroup = new ArrayList<>(); + totalSizeSoFar = 0; + } + + // Add to the current file-group + currentGroup.add(currentSlice); + // assume each file group size is ~= parquet.max.file.size + totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? 
currentSlice.getBaseFile().get().getFileSize() : writeConfig.getParquetMaxFileSize(); + } + + if (!currentGroup.isEmpty()) { + int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, writeConfig.getClusteringTargetFileMaxBytes()); + LOG.info("Adding final clustering group " + totalSizeSoFar + " max bytes: " + + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups); + fileSliceGroups.add(Pair.of(currentGroup, numOutputGroups)); + } + + return fileSliceGroups.stream().map(fileSliceGroup -> + HoodieClusteringGroup.newBuilder() + .setSlices(getFileSliceInfo(fileSliceGroup.getLeft())) + .setNumOutputFileGroups(fileSliceGroup.getRight()) + .setMetrics(buildMetrics(fileSliceGroup.getLeft())) + .build()); + } + + @Override + protected Map getStrategyParams() { + Map params = new HashMap<>(); + if (!StringUtils.isNullOrEmpty(getWriteConfig().getClusteringSortColumns())) { + params.put(PLAN_STRATEGY_SORT_COLUMNS.key(), getWriteConfig().getClusteringSortColumns()); + } + return params; + } + + @Override + protected Stream getFileSlicesEligibleForClustering(final String partition) { + return super.getFileSlicesEligibleForClustering(partition) + // Only files that have base file size smaller than small file size are eligible. + .filter(slice -> slice.getBaseFile().map(HoodieBaseFile::getFileSize).orElse(0L) < getWriteConfig().getClusteringSmallFileLimit()); + } + + private int getNumberOfOutputFileGroups(long groupSize, long targetFileSize) { + return (int) Math.ceil(groupSize / (double) targetFileSize); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java new file mode 100644 index 0000000000000..f26d3743c8fa6 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java @@ -0,0 +1,446 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
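The size-based plan strategy above greedily packs small file slices into groups bounded by the max-bytes-per-group budget, skips slices at or above the small-file limit, and sizes each group's output as ceil(groupSize / targetFileSize). A runnable, JDK-only sketch of that packing logic, with hypothetical constants in place of the write config:

import java.util.ArrayList;
import java.util.List;

public class SizeBasedPlanSketch {
  // Hypothetical knobs mirroring the max-bytes-per-group, target file size and small file limit configs.
  static final long MAX_BYTES_PER_GROUP = 2L * 1024 * 1024 * 1024;
  static final long TARGET_FILE_BYTES = 1L * 1024 * 1024 * 1024;
  static final long SMALL_FILE_LIMIT = 600L * 1024 * 1024;

  record Group(List<Long> fileSizes, int numOutputFileGroups) {}

  static int numOutputFileGroups(long groupSize, long targetFileSize) {
    return (int) Math.ceil(groupSize / (double) targetFileSize); // same ceil division as the patch
  }

  // Greedy bin packing: accumulate slices until the group budget is hit, then start a new group.
  static List<Group> buildGroups(List<Long> baseFileSizes) {
    List<Group> groups = new ArrayList<>();
    List<Long> current = new ArrayList<>();
    long sizeSoFar = 0;
    for (long size : baseFileSizes) {
      if (size >= SMALL_FILE_LIMIT) {
        continue; // only small files are eligible for clustering
      }
      if (sizeSoFar >= MAX_BYTES_PER_GROUP && !current.isEmpty()) {
        groups.add(new Group(current, numOutputFileGroups(sizeSoFar, TARGET_FILE_BYTES)));
        current = new ArrayList<>();
        sizeSoFar = 0;
      }
      current.add(size);
      sizeSoFar += size;
    }
    if (!current.isEmpty()) {
      groups.add(new Group(current, numOutputFileGroups(sizeSoFar, TARGET_FILE_BYTES)));
    }
    return groups;
  }

  public static void main(String[] args) {
    // Eight 500 MB slices pack into groups capped at ~2 GB, each rewritten into ceil(size / 1 GB) outputs.
    List<Long> sizes = List.of(500L << 20, 500L << 20, 500L << 20, 500L << 20,
        500L << 20, 500L << 20, 500L << 20, 500L << 20);
    buildGroups(sizes).forEach(g ->
        System.out.println(g.fileSizes().size() + " slices -> " + g.numOutputFileGroups() + " output file groups"));
  }
}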
+ */ + +package org.apache.hudi.client.clustering.run.strategy; + +import org.apache.hudi.SparkAdapterSupport$; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.SparkTaskContextSupplier; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.client.utils.ConcatenatingIterator; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.RewriteAvroPayload; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.FutureUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory; +import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerWithRowsFactory; +import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; +import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner; +import org.apache.hudi.execution.bulkinsert.RowCustomColumnsSortPartitioner; +import org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner; +import org.apache.hudi.io.IOUtils; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.sources.BaseRelation; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; +import java.util.stream.Stream; + 
+import static org.apache.hudi.common.config.HoodieCommonConfig.TIMESTAMP_AS_OF; +import static org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader; +import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS; + +/** + * Clustering strategy to submit multiple spark jobs and union the results. + */ +public abstract class MultipleSparkJobExecutionStrategy> + extends ClusteringExecutionStrategy>, HoodieData, HoodieData> { + private static final Logger LOG = LogManager.getLogger(MultipleSparkJobExecutionStrategy.class); + + public MultipleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + public HoodieWriteMetadata> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) { + JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext()); + boolean shouldPreserveMetadata = Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false); + // execute clustering for each group async and collect WriteStatus + Stream> writeStatusesStream = FutureUtils.allOf( + clusteringPlan.getInputGroups().stream() + .map(inputGroup -> { + if (getWriteConfig().getBooleanOrDefault("hoodie.datasource.write.row.writer.enable", false)) { + return runClusteringForGroupAsyncAsRow(inputGroup, + clusteringPlan.getStrategy().getStrategyParams(), + shouldPreserveMetadata, + instantTime); + } + return runClusteringForGroupAsync(inputGroup, + clusteringPlan.getStrategy().getStrategyParams(), + shouldPreserveMetadata, + instantTime); + }) + .collect(Collectors.toList())) + .join() + .stream(); + JavaRDD[] writeStatuses = convertStreamToArray(writeStatusesStream.map(HoodieJavaRDD::getJavaRDD)); + JavaRDD writeStatusRDD = engineContext.union(writeStatuses); + + HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); + writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD)); + return writeMetadata; + } + + /** + * Execute clustering to write inputRecords into new files based on strategyParams. + * Different from {@link MultipleSparkJobExecutionStrategy#performClusteringWithRecordsRDD}, this method take {@link Dataset} + * as inputs. + */ + public abstract HoodieData performClusteringWithRecordsAsRow(final Dataset inputRecords, + final int numOutputGroups, + final String instantTime, + final Map strategyParams, + final Schema schema, + final List fileGroupIdList, + final boolean shouldPreserveHoodieMetadata, + final Map extraMetadata); + + /** + * Execute clustering to write inputRecords into new files as defined by rules in strategy parameters. + * The number of new file groups created is bounded by numOutputGroups. + * Note that commit is not done as part of strategy. commit is callers responsibility. + * + * @param inputRecords RDD of {@link HoodieRecord}. + * @param numOutputGroups Number of output file groups. + * @param instantTime Clustering (replace commit) instant time. + * @param strategyParams Strategy parameters containing columns to sort the data by when clustering. + * @param schema Schema of the data including metadata fields. + * @param fileGroupIdList File group id corresponding to each out group. + * @param shouldPreserveHoodieMetadata Whether to preserve commit metadata while clustering. + * @return RDD of {@link WriteStatus}. 
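performClustering above fans out one asynchronous job per clustering input group, waits for all of them, and unions the resulting write statuses. The same fan-out/join/union shape, expressed with plain CompletableFuture and a hypothetical GroupSketch input type rather than the real Spark jobs:

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;

public class FanOutClusteringSketch {
  record GroupSketch(String id) {}

  // Hypothetical per-group work: the real code submits one Spark job per clustering group
  // and returns that job's write statuses.
  static CompletableFuture<List<String>> runGroupAsync(GroupSketch group) {
    return CompletableFuture.supplyAsync(() -> List.of(group.id() + "-status"));
  }

  // Fan out one future per group, wait for all, then flatten (union) the per-group results.
  static List<String> performClustering(List<GroupSketch> inputGroups) {
    List<CompletableFuture<List<String>>> futures = inputGroups.stream()
        .map(FanOutClusteringSketch::runGroupAsync)
        .collect(Collectors.toList());
    CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
    return futures.stream()
        .flatMap(f -> f.join().stream())
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    System.out.println(performClustering(List.of(new GroupSketch("g1"), new GroupSketch("g2"))));
  }
}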
+ */ + public abstract HoodieData performClusteringWithRecordsRDD(final HoodieData> inputRecords, + final int numOutputGroups, + final String instantTime, + final Map strategyParams, + final Schema schema, + final List fileGroupIdList, + final boolean shouldPreserveHoodieMetadata, + final Map extraMetadata); + + protected BulkInsertPartitioner> getRowPartitioner(Map strategyParams, + Schema schema) { + return getPartitioner(strategyParams, schema, true); + } + + protected BulkInsertPartitioner>> getRDDPartitioner(Map strategyParams, + Schema schema) { + return getPartitioner(strategyParams, schema, false); + } + + /** + * Create {@link BulkInsertPartitioner} based on strategy params. + * + * @param strategyParams Strategy parameters containing columns to sort the data by when clustering. + * @param schema Schema of the data including metadata fields. + */ + private BulkInsertPartitioner getPartitioner(Map strategyParams, + Schema schema, + boolean isRowPartitioner) { + Option orderByColumnsOpt = + Option.ofNullable(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key())) + .map(listStr -> listStr.split(",")); + + return orderByColumnsOpt.map(orderByColumns -> { + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy = getWriteConfig().getLayoutOptimizationStrategy(); + switch (layoutOptStrategy) { + case ZORDER: + case HILBERT: + return isRowPartitioner + ? new RowSpatialCurveSortPartitioner(getWriteConfig()) + : new RDDSpatialCurveSortPartitioner((HoodieSparkEngineContext) getEngineContext(), orderByColumns, layoutOptStrategy, + getWriteConfig().getLayoutOptimizationCurveBuildMethod(), HoodieAvroUtils.addMetadataFields(schema)); + case LINEAR: + return isRowPartitioner + ? new RowCustomColumnsSortPartitioner(orderByColumns) + : new RDDCustomColumnsSortPartitioner(orderByColumns, HoodieAvroUtils.addMetadataFields(schema), + getWriteConfig().isConsistentLogicalTimestampEnabled()); + default: + throw new UnsupportedOperationException(String.format("Layout optimization strategy '%s' is not supported", layoutOptStrategy)); + } + }).orElse(isRowPartitioner + ? BulkInsertInternalPartitionerWithRowsFactory.get( + getWriteConfig().getBulkInsertSortMode(), getHoodieTable().isPartitioned(), true) + : BulkInsertInternalPartitionerFactory.get( + getWriteConfig().getBulkInsertSortMode(), getHoodieTable().isPartitioned(), true)); + } + + /** + * Submit job to execute clustering for the group using Avro/HoodieRecord representation. + */ + private CompletableFuture> runClusteringForGroupAsync(HoodieClusteringGroup clusteringGroup, Map strategyParams, + boolean preserveHoodieMetadata, String instantTime) { + return CompletableFuture.supplyAsync(() -> { + JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(getEngineContext()); + HoodieData> inputRecords = readRecordsForGroup(jsc, clusteringGroup, instantTime); + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(getWriteConfig().getSchema())); + List inputFileIds = clusteringGroup.getSlices().stream() + .map(info -> new HoodieFileGroupId(info.getPartitionPath(), info.getFileId())) + .collect(Collectors.toList()); + return performClusteringWithRecordsRDD(inputRecords, clusteringGroup.getNumOutputFileGroups(), instantTime, strategyParams, readerSchema, inputFileIds, preserveHoodieMetadata, + clusteringGroup.getExtraMetadata()); + }); + } + + /** + * Submit job to execute clustering for the group, directly using the spark native Row representation. 
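The getPartitioner method above only installs a sort partitioner when sort columns are configured, switching on the layout optimization strategy (space-filling-curve sort for ZORDER/HILBERT, plain column sort for LINEAR) and otherwise falling back to the default bulk-insert partitioner. A compact sketch of that selection logic, where the returned strings are placeholders for the real partitioner classes:

import java.util.Optional;

public class PartitionerSelectionSketch {
  // Hypothetical stand-in for HoodieClusteringConfig.LayoutOptimizationStrategy.
  enum LayoutStrategy { ZORDER, HILBERT, LINEAR }

  static String choosePartitioner(Optional<String> sortColumnsCsv, LayoutStrategy strategy, boolean rowWriter) {
    return sortColumnsCsv
        .map(csv -> switch (strategy) {
          // curve-based layout strategies sort by a space-filling curve over the configured columns
          case ZORDER, HILBERT -> (rowWriter ? "RowSpatialCurveSort" : "RDDSpatialCurveSort") + "(" + csv + ")";
          // otherwise a plain lexicographic sort on the configured columns
          case LINEAR -> (rowWriter ? "RowCustomColumnsSort" : "RDDCustomColumnsSort") + "(" + csv + ")";
        })
        // no sort columns configured: fall back to the default bulk-insert partitioner
        .orElse(rowWriter ? "DefaultRowPartitioner" : "DefaultRDDPartitioner");
  }

  public static void main(String[] args) {
    System.out.println(choosePartitioner(Optional.of("city,ts"), LayoutStrategy.ZORDER, false));
    System.out.println(choosePartitioner(Optional.empty(), LayoutStrategy.LINEAR, true));
  }
}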
+ */ + private CompletableFuture> runClusteringForGroupAsyncAsRow(HoodieClusteringGroup clusteringGroup, + Map strategyParams, + boolean shouldPreserveHoodieMetadata, + String instantTime) { + return CompletableFuture.supplyAsync(() -> { + JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(getEngineContext()); + Dataset inputRecords = readRecordsForGroupAsRow(jsc, clusteringGroup, instantTime); + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(getWriteConfig().getSchema())); + List inputFileIds = clusteringGroup.getSlices().stream() + .map(info -> new HoodieFileGroupId(info.getPartitionPath(), info.getFileId())) + .collect(Collectors.toList()); + return performClusteringWithRecordsAsRow(inputRecords, clusteringGroup.getNumOutputFileGroups(), instantTime, strategyParams, readerSchema, inputFileIds, shouldPreserveHoodieMetadata, + clusteringGroup.getExtraMetadata()); + }); + } + + /** + * Get RDD of all records for the group. This includes all records from file slice (Apply updates from log files, if any). + */ + private HoodieData> readRecordsForGroup(JavaSparkContext jsc, HoodieClusteringGroup clusteringGroup, String instantTime) { + List clusteringOps = clusteringGroup.getSlices().stream().map(ClusteringOperation::create).collect(Collectors.toList()); + boolean hasLogFiles = clusteringOps.stream().anyMatch(op -> op.getDeltaFilePaths().size() > 0); + if (hasLogFiles) { + // if there are log files, we read all records into memory for a file group and apply updates. + return readRecordsForGroupWithLogs(jsc, clusteringOps, instantTime); + } else { + // We want to optimize reading records for case there are no log files. + return readRecordsForGroupBaseFiles(jsc, clusteringOps); + } + } + + /** + * Read records from baseFiles, apply updates and convert to RDD. + */ + private HoodieData> readRecordsForGroupWithLogs(JavaSparkContext jsc, + List clusteringOps, + String instantTime) { + HoodieWriteConfig config = getWriteConfig(); + HoodieTable table = getHoodieTable(); + return HoodieJavaRDD.of(jsc.parallelize(clusteringOps, clusteringOps.size()).mapPartitions(clusteringOpsPartition -> { + List>> recordIterators = new ArrayList<>(); + clusteringOpsPartition.forEachRemaining(clusteringOp -> { + long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new SparkTaskContextSupplier(), config); + LOG.info("MaxMemoryPerCompaction run as part of clustering => " + maxMemoryPerCompaction); + try { + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); + HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(table.getMetaClient().getFs()) + .withBasePath(table.getMetaClient().getBasePath()) + .withLogFilePaths(clusteringOp.getDeltaFilePaths()) + .withReaderSchema(readerSchema) + .withLatestInstantTime(instantTime) + .withMaxMemorySizeInBytes(maxMemoryPerCompaction) + .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled()) + .withReverseReader(config.getCompactionReverseLogReadEnabled()) + .withBufferSize(config.getMaxDFSStreamBufferSize()) + .withSpillableMapBasePath(config.getSpillableMapBasePath()) + .withPartition(clusteringOp.getPartitionPath()) + .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .build(); + + Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) + ? 
Option.empty() + : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); + HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); + recordIterators.add(getFileSliceReader(baseFileReader, scanner, readerSchema, + tableConfig.getPayloadClass(), + tableConfig.getPreCombineField(), + tableConfig.populateMetaFields() ? Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), + tableConfig.getPartitionFieldProp())))); + } catch (IOException e) { + throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + + " and " + clusteringOp.getDeltaFilePaths(), e); + } + }); + + return new ConcatenatingIterator<>(recordIterators); + })); + } + + /** + * Read records from baseFiles and convert to RDD. + */ + private HoodieData> readRecordsForGroupBaseFiles(JavaSparkContext jsc, + List clusteringOps) { + SerializableConfiguration hadoopConf = new SerializableConfiguration(getHoodieTable().getHadoopConf()); + HoodieWriteConfig writeConfig = getWriteConfig(); + + // NOTE: It's crucial to make sure that we don't capture whole "this" object into the + // closure, as this might lead to issues attempting to serialize its nested fields + return HoodieJavaRDD.of(jsc.parallelize(clusteringOps, clusteringOps.size()) + .mapPartitions(clusteringOpsPartition -> { + List> iteratorsForPartition = new ArrayList<>(); + clusteringOpsPartition.forEachRemaining(clusteringOp -> { + try { + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(writeConfig.getSchema())); + HoodieFileReader baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(clusteringOp.getDataFilePath())); + iteratorsForPartition.add(baseFileReader.getRecordIterator(readerSchema)); + } catch (IOException e) { + throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + + " and " + clusteringOp.getDeltaFilePaths(), e); + } + }); + + return new ConcatenatingIterator<>(iteratorsForPartition); + }) + .map(record -> transform(record, writeConfig))); + } + + /** + * Get dataset of all records for the group. This includes all records from file slice (Apply updates from log files, if any). 
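+ *
+ * @param jsc             Spark context used to obtain the {@code SQLContext} for the read
+ * @param clusteringGroup clustering group whose base and log file slices are to be read
+ * @param instantTime     instant used as the snapshot ("as of") timestamp for the read
+ * @return a dataset of rows covering all file slices of the group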
+ */ + private Dataset readRecordsForGroupAsRow(JavaSparkContext jsc, + HoodieClusteringGroup clusteringGroup, + String instantTime) { + List clusteringOps = clusteringGroup.getSlices().stream() + .map(ClusteringOperation::create).collect(Collectors.toList()); + boolean hasLogFiles = clusteringOps.stream().anyMatch(op -> op.getDeltaFilePaths().size() > 0); + SQLContext sqlContext = new SQLContext(jsc.sc()); + + Path[] baseFilePaths = clusteringOps + .stream() + .map(op -> { + ArrayList readPaths = new ArrayList<>(); + if (op.getBootstrapFilePath() != null) { + readPaths.add(op.getBootstrapFilePath()); + } + if (op.getDataFilePath() != null) { + readPaths.add(op.getDataFilePath()); + } + return readPaths; + }) + .flatMap(Collection::stream) + .filter(path -> !path.isEmpty()) + .map(Path::new) + .toArray(Path[]::new); + + HashMap params = new HashMap<>(); + params.put("hoodie.datasource.query.type", "snapshot"); + params.put(TIMESTAMP_AS_OF.key(), instantTime); + + Path[] paths; + if (hasLogFiles) { + String compactionFractor = Option.ofNullable(getWriteConfig().getString("compaction.memory.fraction")) + .orElse("0.75"); + params.put("compaction.memory.fraction", compactionFractor); + + Path[] deltaPaths = clusteringOps + .stream() + .filter(op -> !op.getDeltaFilePaths().isEmpty()) + .flatMap(op -> op.getDeltaFilePaths().stream()) + .map(Path::new) + .toArray(Path[]::new); + paths = CollectionUtils.combine(baseFilePaths, deltaPaths); + } else { + paths = baseFilePaths; + } + + String readPathString = String.join(",", Arrays.stream(paths).map(Path::toString).toArray(String[]::new)); + params.put("hoodie.datasource.read.paths", readPathString); + // Building HoodieFileIndex needs this param to decide query path + params.put("glob.paths", readPathString); + + // Let Hudi relations to fetch the schema from the table itself + BaseRelation relation = SparkAdapterSupport$.MODULE$.sparkAdapter() + .createRelation(sqlContext, getHoodieTable().getMetaClient(), null, paths, params); + return sqlContext.baseRelationToDataFrame(relation); + } + + /** + * Stream to array conversion with generic type is not straightforward. + * Implement a utility method to abstract high level logic. This needs to be improved in future + */ + private JavaRDD[] convertStreamToArray(Stream> writeStatusRDDStream) { + Object[] writeStatusObjects = writeStatusRDDStream.toArray(Object[]::new); + JavaRDD[] writeStatusRDDArray = new JavaRDD[writeStatusObjects.length]; + for (int i = 0; i < writeStatusObjects.length; i++) { + writeStatusRDDArray[i] = (JavaRDD) writeStatusObjects[i]; + } + return writeStatusRDDArray; + } + + /** + * Transform IndexedRecord into HoodieRecord. 
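+ *
+ * When meta fields are disabled on the write config, the record key and partition path are derived
+ * through a {@code BaseKeyGenerator} built from the write config props; otherwise they are read from
+ * the populated meta columns of the record. The resulting key is wrapped, together with a
+ * {@code RewriteAvroPayload} of the record, into a {@code HoodieAvroRecord}.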
+ */ + private static HoodieRecord transform(IndexedRecord indexedRecord, HoodieWriteConfig writeConfig) { + GenericRecord record = (GenericRecord) indexedRecord; + Option keyGeneratorOpt = Option.empty(); + if (!writeConfig.populateMetaFields()) { + try { + keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(writeConfig.getProps())); + } catch (IOException e) { + throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e); + } + } + String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt); + String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt); + HoodieKey hoodieKey = new HoodieKey(key, partition); + + HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); + return hoodieRecord; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java new file mode 100644 index 0000000000000..e55bac0b172a4 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.clustering.run.strategy; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.client.utils.ConcatenatingIterator; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.model.ClusteringGroupInfo; +import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.RewriteAvroPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.broadcast.Broadcast; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +/** + * Clustering strategy to submit single spark jobs. 
+ * MultipleSparkJobExecution strategy is not ideal for use cases that require large number of clustering groups + */ +public abstract class SingleSparkJobExecutionStrategy> + extends ClusteringExecutionStrategy>, HoodieData, HoodieData> { + private static final Logger LOG = LogManager.getLogger(SingleSparkJobExecutionStrategy.class); + + public SingleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + public HoodieWriteMetadata> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) { + JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext()); + final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier(); + final SerializableSchema serializableSchema = new SerializableSchema(schema); + final List clusteringGroupInfos = clusteringPlan.getInputGroups().stream().map(clusteringGroup -> + ClusteringGroupInfo.create(clusteringGroup)).collect(Collectors.toList()); + + String umask = engineContext.hadoopConfiguration().get("fs.permissions.umask-mode"); + Broadcast umaskBroadcastValue = engineContext.broadcast(umask); + + JavaRDD groupInfoJavaRDD = engineContext.parallelize(clusteringGroupInfos, clusteringGroupInfos.size()); + LOG.info("number of partitions for clustering " + groupInfoJavaRDD.getNumPartitions()); + JavaRDD writeStatusRDD = groupInfoJavaRDD + .mapPartitions(clusteringOps -> { + Configuration configuration = new Configuration(); + configuration.set("fs.permissions.umask-mode", umaskBroadcastValue.getValue()); + Iterable clusteringOpsIterable = () -> clusteringOps; + List groupsInPartition = StreamSupport.stream(clusteringOpsIterable.spliterator(), false).collect(Collectors.toList()); + return groupsInPartition.stream().flatMap(clusteringOp -> + runClusteringForGroup(clusteringOp, clusteringPlan.getStrategy().getStrategyParams(), + Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false), + serializableSchema, taskContextSupplier, instantTime) + ).iterator(); + }); + + HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); + writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD)); + return writeMetadata; + } + + /** + * Submit job to execute clustering for the group. + */ + private Stream runClusteringForGroup(ClusteringGroupInfo clusteringOps, Map strategyParams, + boolean preserveHoodieMetadata, SerializableSchema schema, + TaskContextSupplier taskContextSupplier, String instantTime) { + + List inputFileIds = clusteringOps.getOperations().stream() + .map(op -> new HoodieFileGroupId(op.getPartitionPath(), op.getFileId())) + .collect(Collectors.toList()); + + Iterator> inputRecords = readRecordsForGroupBaseFiles(clusteringOps.getOperations()); + Iterator> writeStatuses = performClusteringWithRecordsIterator(inputRecords, clusteringOps.getNumOutputGroups(), instantTime, + strategyParams, schema.get(), inputFileIds, preserveHoodieMetadata, taskContextSupplier); + + Iterable> writeStatusIterable = () -> writeStatuses; + return StreamSupport.stream(writeStatusIterable.spliterator(), false) + .flatMap(writeStatusList -> writeStatusList.stream()); + } + + /** + * Execute clustering to write inputRecords into new files as defined by rules in strategy parameters. + * The number of new file groups created is bounded by numOutputGroups. + * Note that commit is not done as part of strategy. 
Commit is callers responsibility. + */ + public abstract Iterator> performClusteringWithRecordsIterator(final Iterator> records, final int numOutputGroups, + final String instantTime, + final Map strategyParams, final Schema schema, + final List fileGroupIdList, final boolean preserveHoodieMetadata, + final TaskContextSupplier taskContextSupplier); + + /** + * Read records from baseFiles and get iterator. + */ + private Iterator> readRecordsForGroupBaseFiles(List clusteringOps) { + List>> iteratorsForPartition = clusteringOps.stream().map(clusteringOp -> { + + Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(getWriteConfig().getSchema())); + Iterable indexedRecords = () -> { + try { + return HoodieFileReaderFactory.getFileReader(getHoodieTable().getHadoopConf(), new Path(clusteringOp.getDataFilePath())).getRecordIterator(readerSchema); + } catch (IOException e) { + throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + + " and " + clusteringOp.getDeltaFilePaths(), e); + } + }; + + return StreamSupport.stream(indexedRecords.spliterator(), false).map(record -> transform(record)).iterator(); + }).collect(Collectors.toList()); + + return new ConcatenatingIterator<>(iteratorsForPartition); + } + + /** + * Transform IndexedRecord into HoodieRecord. + */ + private HoodieRecord transform(IndexedRecord indexedRecord) { + GenericRecord record = (GenericRecord) indexedRecord; + Option keyGeneratorOpt = Option.empty(); + if (!getWriteConfig().populateMetaFields()) { + try { + keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getWriteConfig().getProps()))); + } catch (IOException e) { + throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e); + } + } + String key = KeyGenUtils.getRecordKeyFromGenericRecord(record, keyGeneratorOpt); + String partition = KeyGenUtils.getPartitionPathFromGenericRecord(record, keyGeneratorOpt); + HoodieKey hoodieKey = new HoodieKey(key, partition); + + HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); + return hoodieRecord; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java new file mode 100644 index 0000000000000..f2ae9a922d811 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client.clustering.run.strategy; + +import org.apache.hudi.HoodieDatasetBulkInsertHelper; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.io.SingleFileHandleCreateFactory; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.SparkBulkInsertHelper; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import java.util.List; +import java.util.Map; + +/** + * This strategy is similar to {@link SparkSortAndSizeExecutionStrategy} with the difference being that + * there should be only one large file group per clustering group. + */ +public class SparkSingleFileSortExecutionStrategy> + extends MultipleSparkJobExecutionStrategy { + + private static final Logger LOG = LogManager.getLogger(SparkSingleFileSortExecutionStrategy.class); + + public SparkSingleFileSortExecutionStrategy(HoodieTable table, + HoodieEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + public HoodieData performClusteringWithRecordsAsRow(Dataset inputRecords, + int numOutputGroups, + String instantTime, + Map strategyParams, + Schema schema, + List fileGroupIdList, + boolean shouldPreserveHoodieMetadata, + Map extraMetadata) { + if (numOutputGroups != 1 || fileGroupIdList.size() != 1) { + throw new HoodieClusteringException("Expect only one file group for strategy: " + getClass().getName()); + } + LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime); + + HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder() + .withBulkInsertParallelism(numOutputGroups) + .withProps(getWriteConfig().getProps()).build(); + + // Since clustering will write to single file group using HoodieUnboundedCreateHandle, set max file size to a large value. 
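+ // (The same override is applied in the RDD-based path below; only the bulk-insert parallelism and the
+ // parquet max file size differ from the original write config, which is otherwise inherited via withProps(...).)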
+ newConfig.setValue(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE, String.valueOf(Long.MAX_VALUE)); + + return HoodieDatasetBulkInsertHelper.bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig, + getRowPartitioner(strategyParams, schema), numOutputGroups, shouldPreserveHoodieMetadata); + } + + @Override + public HoodieData performClusteringWithRecordsRDD(HoodieData> inputRecords, + int numOutputGroups, + String instantTime, + Map strategyParams, + Schema schema, + List fileGroupIdList, + boolean shouldPreserveHoodieMetadata, + Map extraMetadata) { + if (numOutputGroups != 1 || fileGroupIdList.size() != 1) { + throw new HoodieClusteringException("Expect only one file group for strategy: " + getClass().getName()); + } + LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime); + + HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder() + .withBulkInsertParallelism(numOutputGroups) + .withProps(getWriteConfig().getProps()).build(); + // Since clustering will write to single file group using HoodieUnboundedCreateHandle, set max file size to a large value. + newConfig.setValue(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE, String.valueOf(Long.MAX_VALUE)); + + return (HoodieData) SparkBulkInsertHelper.newInstance().bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig, + false, getRDDPartitioner(strategyParams, schema), true, numOutputGroups, new SingleFileHandleCreateFactory(fileGroupIdList.get(0).getFileId(), shouldPreserveHoodieMetadata)); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java new file mode 100644 index 0000000000000..35c8f288bc891 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.clustering.run.strategy; + +import org.apache.hudi.HoodieDatasetBulkInsertHelper; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.SparkBulkInsertHelper; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import java.util.List; +import java.util.Map; + +/** + * Clustering Strategy based on following. + * 1) Spark execution engine. + * 2) Uses bulk_insert to write data into new files. + */ +public class SparkSortAndSizeExecutionStrategy> + extends MultipleSparkJobExecutionStrategy { + private static final Logger LOG = LogManager.getLogger(SparkSortAndSizeExecutionStrategy.class); + + public SparkSortAndSizeExecutionStrategy(HoodieTable table, + HoodieEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + public HoodieData performClusteringWithRecordsAsRow(Dataset inputRecords, + int numOutputGroups, + String instantTime, Map strategyParams, + Schema schema, + List fileGroupIdList, + boolean shouldPreserveHoodieMetadata, + Map extraMetadata) { + LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime); + HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder() + .withBulkInsertParallelism(numOutputGroups) + .withProps(getWriteConfig().getProps()).build(); + + newConfig.setValue(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE, String.valueOf(getWriteConfig().getClusteringTargetFileMaxBytes())); + + return HoodieDatasetBulkInsertHelper.bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig, + getRowPartitioner(strategyParams, schema), numOutputGroups, shouldPreserveHoodieMetadata); + } + + @Override + public HoodieData performClusteringWithRecordsRDD(final HoodieData> inputRecords, + final int numOutputGroups, + final String instantTime, + final Map strategyParams, + final Schema schema, + final List fileGroupIdList, + final boolean shouldPreserveHoodieMetadata, + final Map extraMetadata) { + LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime); + + HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder() + .withBulkInsertParallelism(numOutputGroups) + .withProps(getWriteConfig().getProps()).build(); + newConfig.setValue(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE, String.valueOf(getWriteConfig().getClusteringTargetFileMaxBytes())); + return (HoodieData) SparkBulkInsertHelper.newInstance().bulkInsert(inputRecords, instantTime, getHoodieTable(), + newConfig, false, getRDDPartitioner(strategyParams, schema), true, numOutputGroups, new CreateHandleFactory(shouldPreserveHoodieMetadata)); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/BaseSparkUpdateStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/BaseSparkUpdateStrategy.java new file 
mode 100644 index 0000000000000..3eadba25bbcfb --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/BaseSparkUpdateStrategy.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.clustering.update.strategy; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy; + +import java.util.List; +import java.util.Set; + +/** + * Spark base update strategy, write records to the file groups which are in clustering + * need to check. Spark relate implementations should extend this base class. + */ +public abstract class BaseSparkUpdateStrategy> extends UpdateStrategy>> { + + public BaseSparkUpdateStrategy(HoodieSparkEngineContext engineContext, + Set fileGroupsInPendingClustering) { + super(engineContext, fileGroupsInPendingClustering); + } + + /** + * Get records matched file group ids. + * @param inputRecords the records to write, tagged with target file id + * @return the records matched file group ids + */ + protected List getGroupIdsWithUpdate(HoodieData> inputRecords) { + return inputRecords + .filter(record -> record.getCurrentLocation() != null) + .map(record -> new HoodieFileGroupId(record.getPartitionPath(), record.getCurrentLocation().getFileId())).distinct().collectAsList(); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkAllowUpdateStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkAllowUpdateStrategy.java new file mode 100644 index 0000000000000..59040629f718e --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkAllowUpdateStrategy.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.clustering.update.strategy; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Allow ingestion commits during clustering job. + */ +public class SparkAllowUpdateStrategy> extends BaseSparkUpdateStrategy { + + public SparkAllowUpdateStrategy(HoodieSparkEngineContext engineContext, + HashSet fileGroupsInPendingClustering) { + super(engineContext, fileGroupsInPendingClustering); + } + + @Override + public Pair>, Set> handleUpdate(HoodieData> taggedRecordsRDD) { + List fileGroupIdsWithRecordUpdate = getGroupIdsWithUpdate(taggedRecordsRDD); + Set fileGroupIdsWithUpdatesAndPendingClustering = fileGroupIdsWithRecordUpdate.stream() + .filter(f -> fileGroupsInPendingClustering.contains(f)) + .collect(Collectors.toSet()); + return Pair.of(taggedRecordsRDD, fileGroupIdsWithUpdatesAndPendingClustering); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkRejectUpdateStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkRejectUpdateStrategy.java new file mode 100644 index 0000000000000..d09422ee77242 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkRejectUpdateStrategy.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.clustering.update.strategy; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieClusteringUpdateException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Update strategy based on following. + * if some file groups have update record, throw exception + */ +public class SparkRejectUpdateStrategy> extends BaseSparkUpdateStrategy { + private static final Logger LOG = LogManager.getLogger(SparkRejectUpdateStrategy.class); + + public SparkRejectUpdateStrategy(HoodieSparkEngineContext engineContext, + HashSet fileGroupsInPendingClustering) { + super(engineContext, fileGroupsInPendingClustering); + } + + @Override + public Pair>, Set> handleUpdate(HoodieData> taggedRecordsRDD) { + List fileGroupIdsWithRecordUpdate = getGroupIdsWithUpdate(taggedRecordsRDD); + fileGroupIdsWithRecordUpdate.forEach(fileGroupIdWithRecordUpdate -> { + if (fileGroupsInPendingClustering.contains(fileGroupIdWithRecordUpdate)) { + String msg = String.format("Not allowed to update the clustering file group %s. " + + "For pending clustering operations, we are not going to support update for now.", + fileGroupIdWithRecordUpdate.toString()); + LOG.error(msg); + throw new HoodieClusteringUpdateException(msg); + } + }); + return Pair.of(taggedRecordsRDD, Collections.emptySet()); + } + +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java index 0f17511b56d99..d8281d1a10b73 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java @@ -20,18 +20,34 @@ import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.common.config.SerializableConfiguration; -import org.apache.hudi.client.common.function.SerializableConsumer; -import org.apache.hudi.client.common.function.SerializableFunction; -import org.apache.hudi.client.common.function.SerializablePairFunction; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.EngineProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableConsumer; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; +import org.apache.hudi.common.function.SerializablePairFunction; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.data.HoodieSparkLongAccumulator; import org.apache.hudi.exception.HoodieException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import 
org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.sql.SQLContext; +import scala.Tuple2; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.stream.Collectors; import java.util.stream.Stream; /** @@ -39,6 +55,7 @@ */ public class HoodieSparkEngineContext extends HoodieEngineContext { + private static final Logger LOG = LogManager.getLogger(HoodieSparkEngineContext.class); private final JavaSparkContext javaSparkContext; private SQLContext sqlContext; @@ -64,11 +81,57 @@ public static JavaSparkContext getSparkContext(HoodieEngineContext context) { return ((HoodieSparkEngineContext) context).getJavaSparkContext(); } + @Override + public HoodieAccumulator newAccumulator() { + HoodieSparkLongAccumulator accumulator = HoodieSparkLongAccumulator.create(); + javaSparkContext.sc().register(accumulator.getAccumulator()); + return accumulator; + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieJavaRDD.of(javaSparkContext.emptyRDD()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieJavaRDD.of(javaSparkContext.parallelize(data, parallelism)); + } + @Override public List map(List data, SerializableFunction func, int parallelism) { return javaSparkContext.parallelize(data, parallelism).map(func::apply).collect(); } + @Override + public List mapToPairAndReduceByKey(List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return javaSparkContext.parallelize(data, parallelism).mapToPair(input -> { + Pair pair = mapToPairFunc.call(input); + return new Tuple2<>(pair.getLeft(), pair.getRight()); + }).reduceByKey(reduceFunc::apply).map(Tuple2::_2).collect(); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return javaSparkContext.parallelize(data.collect(Collectors.toList()), parallelism) + .mapPartitionsToPair((PairFlatMapFunction, K, V>) iterator -> + flatMapToPairFunc.call(iterator).collect(Collectors.toList()).stream() + .map(e -> new Tuple2<>(e.getKey(), e.getValue())).iterator() + ) + .reduceByKey(reduceFunc::apply) + .map(e -> new ImmutablePair<>(e._1, e._2)) + .collect().stream(); + } + + @Override + public List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism) { + return javaSparkContext.parallelize(data, parallelism).mapToPair(pair -> new Tuple2(pair.getLeft(), pair.getRight())) + .reduceByKey(reduceFunc::apply).map(Tuple2::_2).collect(); + } + @Override public List flatMap(List data, SerializableFunction> func, int parallelism) { return javaSparkContext.parallelize(data, parallelism).flatMap(x -> func.apply(x).iterator()).collect(); @@ -82,9 +145,15 @@ public void foreach(List data, SerializableConsumer consumer, int para @Override public Map mapToPair(List data, SerializablePairFunction func, Integer parallelism) { if (Objects.nonNull(parallelism)) { - return javaSparkContext.parallelize(data, parallelism).mapToPair(func::call).collectAsMap(); + return javaSparkContext.parallelize(data, parallelism).mapToPair(input -> { + Pair pair = func.call(input); + return new Tuple2(pair.getLeft(), pair.getRight()); + }).collectAsMap(); } else { - return javaSparkContext.parallelize(data).mapToPair(func::call).collectAsMap(); + return 
javaSparkContext.parallelize(data).mapToPair(input -> { + Pair pair = func.call(input); + return new Tuple2(pair.getLeft(), pair.getRight()); + }).collectAsMap(); } } @@ -92,6 +161,8 @@ public Map mapToPair(List data, SerializablePairFunction + *

  • When meta-fields need to be prepended to the source {@link InternalRow}
  • When meta-fields need to be updated w/in the source {@link InternalRow}
    ({@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} currently does not allow in-place updates due to its memory layout)
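+ *
+ * A minimal construction sketch (assuming the meta-field values are already available as
+ * {@code UTF8String}s and the source row does not yet carry meta columns):
+ * <pre>{@code
+ *   InternalRow withMeta = new HoodieInternalRow(commitTime, commitSeqNumber, recordKey,
+ *       partitionPath, fileName, sourceRow, false);
+ * }</pre>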
  • + * */ public class HoodieInternalRow extends InternalRow { - private String commitTime; - private String commitSeqNumber; - private String recordKey; - private String partitionPath; - private String fileName; - private InternalRow row; - - public HoodieInternalRow(String commitTime, String commitSeqNumber, String recordKey, String partitionPath, - String fileName, InternalRow row) { - this.commitTime = commitTime; - this.commitSeqNumber = commitSeqNumber; - this.recordKey = recordKey; - this.partitionPath = partitionPath; - this.fileName = fileName; - this.row = row; + /** + * Collection of meta-fields as defined by {@link HoodieRecord#HOODIE_META_COLUMNS} + * + * NOTE: {@code HoodieInternalRow} *always* overlays its own meta-fields even in case + * when source row also contains them, to make sure these fields are mutable and + * can be updated (for ex, {@link UnsafeRow} doesn't support mutations due to + * its memory layout, as it persists field offsets) + */ + private final UTF8String[] metaFields; + private final InternalRow sourceRow; + + /** + * Specifies whether source {@link #sourceRow} contains meta-fields + */ + private final boolean sourceContainsMetaFields; + + public HoodieInternalRow(UTF8String commitTime, + UTF8String commitSeqNumber, + UTF8String recordKey, + UTF8String partitionPath, + UTF8String fileName, + InternalRow sourceRow, + boolean sourceContainsMetaFields) { + this.metaFields = new UTF8String[] { + commitTime, + commitSeqNumber, + recordKey, + partitionPath, + fileName + }; + + this.sourceRow = sourceRow; + this.sourceContainsMetaFields = sourceContainsMetaFields; + } + + private HoodieInternalRow(UTF8String[] metaFields, + InternalRow sourceRow, + boolean sourceContainsMetaFields) { + this.metaFields = metaFields; + this.sourceRow = sourceRow; + this.sourceContainsMetaFields = sourceContainsMetaFields; } @Override public int numFields() { - return row.numFields(); - } - - @Override - public void setNullAt(int i) { - if (i < HoodieRecord.HOODIE_META_COLUMNS.size()) { - switch (i) { - case 0: { - this.commitTime = null; - break; - } - case 1: { - this.commitSeqNumber = null; - break; - } - case 2: { - this.recordKey = null; - break; - } - case 3: { - this.partitionPath = null; - break; - } - case 4: { - this.fileName = null; - break; - } - default: throw new IllegalArgumentException("Not expected"); - } + if (sourceContainsMetaFields) { + return sourceRow.numFields(); } else { - row.setNullAt(i); + return sourceRow.numFields() + metaFields.length; } } @Override - public void update(int i, Object value) { - if (i < HoodieRecord.HOODIE_META_COLUMNS.size()) { - switch (i) { - case 0: { - this.commitTime = value.toString(); - break; - } - case 1: { - this.commitSeqNumber = value.toString(); - break; - } - case 2: { - this.recordKey = value.toString(); - break; - } - case 3: { - this.partitionPath = value.toString(); - break; - } - case 4: { - this.fileName = value.toString(); - break; - } - default: throw new IllegalArgumentException("Not expected"); - } + public void setNullAt(int ordinal) { + if (ordinal < metaFields.length) { + metaFields[ordinal] = null; } else { - row.update(i, value); + sourceRow.setNullAt(rebaseOrdinal(ordinal)); } } - private String getMetaColumnVal(int ordinal) { - switch (ordinal) { - case 0: { - return commitTime; - } - case 1: { - return commitSeqNumber; - } - case 2: { - return recordKey; - } - case 3: { - return partitionPath; - } - case 4: { - return fileName; + @Override + public void update(int ordinal, Object value) { + if 
(ordinal < metaFields.length) { + if (value instanceof UTF8String) { + metaFields[ordinal] = (UTF8String) value; + } else if (value instanceof String) { + metaFields[ordinal] = UTF8String.fromString((String) value); + } else { + throw new IllegalArgumentException( + String.format("Could not update the row at (%d) with value of type (%s), either UTF8String or String are expected", ordinal, value.getClass().getSimpleName())); } - default: throw new IllegalArgumentException("Not expected"); + } else { + sourceRow.update(rebaseOrdinal(ordinal), value); } } @Override public boolean isNullAt(int ordinal) { - if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) { - return null == getMetaColumnVal(ordinal); + if (ordinal < metaFields.length) { + return metaFields[ordinal] == null; + } + return sourceRow.isNullAt(rebaseOrdinal(ordinal)); + } + + @Override + public UTF8String getUTF8String(int ordinal) { + if (ordinal < metaFields.length) { + return metaFields[ordinal]; + } + return sourceRow.getUTF8String(rebaseOrdinal(ordinal)); + } + + @Override + public Object get(int ordinal, DataType dataType) { + if (ordinal < metaFields.length) { + validateMetaFieldDataType(dataType); + return metaFields[ordinal]; } - return row.isNullAt(ordinal); + return sourceRow.get(rebaseOrdinal(ordinal), dataType); } @Override public boolean getBoolean(int ordinal) { - return row.getBoolean(ordinal); + ruleOutMetaFieldsAccess(ordinal, Boolean.class); + return sourceRow.getBoolean(rebaseOrdinal(ordinal)); } @Override public byte getByte(int ordinal) { - return row.getByte(ordinal); + ruleOutMetaFieldsAccess(ordinal, Byte.class); + return sourceRow.getByte(rebaseOrdinal(ordinal)); } @Override public short getShort(int ordinal) { - return row.getShort(ordinal); + ruleOutMetaFieldsAccess(ordinal, Short.class); + return sourceRow.getShort(rebaseOrdinal(ordinal)); } @Override public int getInt(int ordinal) { - return row.getInt(ordinal); + ruleOutMetaFieldsAccess(ordinal, Integer.class); + return sourceRow.getInt(rebaseOrdinal(ordinal)); } @Override public long getLong(int ordinal) { - return row.getLong(ordinal); + ruleOutMetaFieldsAccess(ordinal, Long.class); + return sourceRow.getLong(rebaseOrdinal(ordinal)); } @Override public float getFloat(int ordinal) { - return row.getFloat(ordinal); + ruleOutMetaFieldsAccess(ordinal, Float.class); + return sourceRow.getFloat(rebaseOrdinal(ordinal)); } @Override public double getDouble(int ordinal) { - return row.getDouble(ordinal); + ruleOutMetaFieldsAccess(ordinal, Double.class); + return sourceRow.getDouble(rebaseOrdinal(ordinal)); } @Override public Decimal getDecimal(int ordinal, int precision, int scale) { - return row.getDecimal(ordinal, precision, scale); - } - - @Override - public UTF8String getUTF8String(int ordinal) { - if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) { - return UTF8String.fromBytes(getMetaColumnVal(ordinal).getBytes()); - } - return row.getUTF8String(ordinal); - } - - @Override - public String getString(int ordinal) { - if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) { - return new String(getMetaColumnVal(ordinal).getBytes()); - } - return row.getString(ordinal); + ruleOutMetaFieldsAccess(ordinal, Decimal.class); + return sourceRow.getDecimal(rebaseOrdinal(ordinal), precision, scale); } @Override public byte[] getBinary(int ordinal) { - return row.getBinary(ordinal); + ruleOutMetaFieldsAccess(ordinal, Byte[].class); + return sourceRow.getBinary(rebaseOrdinal(ordinal)); } @Override public CalendarInterval getInterval(int ordinal) { - return 
row.getInterval(ordinal); + ruleOutMetaFieldsAccess(ordinal, CalendarInterval.class); + return sourceRow.getInterval(rebaseOrdinal(ordinal)); } @Override public InternalRow getStruct(int ordinal, int numFields) { - return row.getStruct(ordinal, numFields); + ruleOutMetaFieldsAccess(ordinal, InternalRow.class); + return sourceRow.getStruct(rebaseOrdinal(ordinal), numFields); } @Override public ArrayData getArray(int ordinal) { - return row.getArray(ordinal); + ruleOutMetaFieldsAccess(ordinal, ArrayData.class); + return sourceRow.getArray(rebaseOrdinal(ordinal)); } @Override public MapData getMap(int ordinal) { - return row.getMap(ordinal); + ruleOutMetaFieldsAccess(ordinal, MapData.class); + return sourceRow.getMap(rebaseOrdinal(ordinal)); } @Override - public Object get(int ordinal, DataType dataType) { - if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) { - return UTF8String.fromBytes(getMetaColumnVal(ordinal).getBytes()); + public InternalRow copy() { + return new HoodieInternalRow(Arrays.copyOf(metaFields, metaFields.length), sourceRow.copy(), sourceContainsMetaFields); + } + + private int rebaseOrdinal(int ordinal) { + // NOTE: In cases when source row does not contain meta fields, we will have to + // rebase ordinal onto its indexes + return sourceContainsMetaFields ? ordinal : ordinal - metaFields.length; + } + + private void validateMetaFieldDataType(DataType dataType) { + if (!dataType.sameType(StringType$.MODULE$)) { + throw new ClassCastException(String.format("Can not cast meta-field of type UTF8String to %s", dataType.simpleString())); } - return row.get(ordinal, dataType); } - @Override - public InternalRow copy() { - return new HoodieInternalRow(commitTime, commitSeqNumber, recordKey, partitionPath, fileName, row.copy()); + private void ruleOutMetaFieldsAccess(int ordinal, Class expectedDataType) { + if (ordinal < metaFields.length) { + throw new ClassCastException(String.format("Can not cast meta-field of type UTF8String at (%d) as %s", ordinal, expectedDataType.getName())); + } } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java new file mode 100644 index 0000000000000..3b749bcf15bdd --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java @@ -0,0 +1,502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.utils; + +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.internal.schema.action.InternalSchemaMerger; +import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.ArrayType$; +import org.apache.spark.sql.types.BinaryType; +import org.apache.spark.sql.types.BinaryType$; +import org.apache.spark.sql.types.BooleanType; +import org.apache.spark.sql.types.BooleanType$; +import org.apache.spark.sql.types.ByteType; +import org.apache.spark.sql.types.CharType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; +import org.apache.spark.sql.types.DateType$; +import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.DecimalType$; +import org.apache.spark.sql.types.DoubleType; +import org.apache.spark.sql.types.DoubleType$; +import org.apache.spark.sql.types.FloatType; +import org.apache.spark.sql.types.FloatType$; +import org.apache.spark.sql.types.IntegerType; +import org.apache.spark.sql.types.IntegerType$; +import org.apache.spark.sql.types.LongType; +import org.apache.spark.sql.types.LongType$; +import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.MapType$; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.ShortType; +import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.StringType$; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.StructType$; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.sql.types.TimestampType$; +import org.apache.spark.sql.types.UserDefinedType; +import org.apache.spark.sql.types.VarcharType; + +import java.nio.charset.StandardCharsets; +import java.sql.Date; +import java.util.ArrayList; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +public class SparkInternalSchemaConverter { + private SparkInternalSchemaConverter() { + + } + + public static final String HOODIE_QUERY_SCHEMA = "hoodie.schema.internal.querySchema"; + public static final String HOODIE_TABLE_PATH = "hoodie.tablePath"; + public static final String HOODIE_VALID_COMMITS_LIST = "hoodie.valid.commits.list"; + + /** + * Convert a spark schema to an hudi internal schema. Fields without IDs are kept and assigned fallback IDs. + * + * @param sparkSchema a spark schema + * @return a matching internal schema for the provided spark schema + */ + public static InternalSchema convertStructTypeToInternalSchema(StructType sparkSchema) { + Type newType = buildTypeFromStructType(sparkSchema, true, new AtomicInteger(0)); + return new InternalSchema(((Types.RecordType)newType).fields()); + } + + public static Type buildTypeFromStructType(DataType sparkType, Boolean firstVisitRoot, AtomicInteger nextId) { + if (sparkType instanceof StructType) { + StructField[] fields = ((StructType) sparkType).fields(); + int nextAssignId = firstVisitRoot ? 
0 : nextId.get(); + nextId.set(nextAssignId + fields.length); + List newTypes = new ArrayList<>(); + for (StructField f : fields) { + newTypes.add(buildTypeFromStructType(f.dataType(), false, nextId)); + } + List newFields = new ArrayList<>(); + for (int i = 0; i < newTypes.size(); i++) { + StructField f = fields[i]; + newFields.add(Types.Field.get(nextAssignId + i, f.nullable(), f.name(), newTypes.get(i), + f.getComment().isDefined() ? f.getComment().get() : null)); + } + return Types.RecordType.get(newFields); + } else if (sparkType instanceof MapType) { + MapType map = (MapType) sparkType; + DataType keyType = map.keyType(); + DataType valueType = map.valueType(); + int keyId = nextId.get(); + int valueId = keyId + 1; + nextId.set(valueId + 1); + return Types.MapType.get(keyId, valueId, buildTypeFromStructType(keyType, false, nextId), + buildTypeFromStructType(valueType, false, nextId), map.valueContainsNull()); + } else if (sparkType instanceof ArrayType) { + ArrayType array = (ArrayType) sparkType; + DataType et = array.elementType(); + int elementId = nextId.get(); + nextId.set(elementId + 1); + return Types.ArrayType.get(elementId, array.containsNull(), buildTypeFromStructType(et, false, nextId)); + } else if (sparkType instanceof UserDefinedType) { + throw new UnsupportedOperationException("User-defined types are not supported"); + } else if (sparkType instanceof BooleanType) { + return Types.BooleanType.get(); + } else if (sparkType instanceof IntegerType + || sparkType instanceof ShortType + || sparkType instanceof ByteType) { + return Types.IntType.get(); + } else if (sparkType instanceof LongType) { + return Types.LongType.get(); + } else if (sparkType instanceof FloatType) { + return Types.FloatType.get(); + } else if (sparkType instanceof DoubleType) { + return Types.DoubleType.get(); + } else if (sparkType instanceof StringType + || sparkType instanceof CharType + || sparkType instanceof VarcharType) { + return Types.StringType.get(); + } else if (sparkType instanceof DateType) { + return Types.DateType.get(); + // TODO support spark 3.3.x as it supports TimeStampNTZ (SPARK-35662) + } else if (sparkType instanceof TimestampType) { + return Types.TimestampType.get(); + } else if (sparkType instanceof DecimalType) { + return Types.DecimalType.get( + ((DecimalType) sparkType).precision(), + ((DecimalType) sparkType).scale()); + } else if (sparkType instanceof BinaryType) { + return Types.BinaryType.get(); + } else { + throw new UnsupportedOperationException(String.format("Not a supported type: %s", sparkType.catalogString())); + } + } + + /** + * Convert Spark schema to Hudi internal schema, and prune fields. + * Fields without IDs are kept and assigned fallback IDs. + * + * @param sparkSchema a pruned spark schema + * @param originSchema a internal schema for hoodie table + * @return a pruned internal schema for the provided spark schema + */ + public static InternalSchema convertAndPruneStructTypeToInternalSchema(StructType sparkSchema, InternalSchema originSchema) { + List pruneNames = collectColNamesFromSparkStruct(sparkSchema); + return InternalSchemaUtils.pruneInternalSchema(originSchema, pruneNames); + } + + /** + * Collect all the leaf nodes names. + * + * @param sparkSchema a spark schema + * @return leaf nodes full names. 
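+ *
+ * For example (assuming dot-separated full names as produced by
+ * {@code InternalSchemaUtils.createFullName}), a schema such as
+ * {@code struct<id: int, address: struct<city: string>, tags: array<string>>}
+ * would yield the leaf names {@code id}, {@code address.city} and {@code tags.element}.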
+ */ + public static List collectColNamesFromSparkStruct(StructType sparkSchema) { + List result = new ArrayList<>(); + collectColNamesFromStructType(sparkSchema, new LinkedList<>(), result); + return result; + } + + private static void collectColNamesFromStructType(DataType sparkType, Deque fieldNames, List resultSet) { + if (sparkType instanceof StructType) { + StructField[] fields = ((StructType) sparkType).fields(); + for (StructField f : fields) { + fieldNames.push(f.name()); + collectColNamesFromStructType(f.dataType(), fieldNames, resultSet); + fieldNames.pop(); + addFullName(f.dataType(), f.name(), fieldNames, resultSet); + } + } else if (sparkType instanceof MapType) { + MapType map = (MapType) sparkType; + DataType keyType = map.keyType(); + DataType valueType = map.valueType(); + // key + fieldNames.push("key"); + collectColNamesFromStructType(keyType, fieldNames, resultSet); + fieldNames.pop(); + addFullName(keyType,"key", fieldNames, resultSet); + // value + fieldNames.push("value"); + collectColNamesFromStructType(valueType, fieldNames, resultSet); + fieldNames.poll(); + addFullName(valueType,"value", fieldNames, resultSet); + } else if (sparkType instanceof ArrayType) { + ArrayType array = (ArrayType) sparkType; + DataType et = array.elementType(); + fieldNames.push("element"); + collectColNamesFromStructType(et, fieldNames, resultSet); + fieldNames.pop(); + addFullName(et, "element", fieldNames, resultSet); + } else if (sparkType instanceof UserDefinedType) { + throw new UnsupportedOperationException("User-defined types are not supported"); + } else { + // do nothings + } + } + + private static void addFullName(DataType sparkType, String name, Deque fieldNames, List resultSet) { + if (!(sparkType instanceof StructType) && !(sparkType instanceof ArrayType) && !(sparkType instanceof MapType)) { + resultSet.add(InternalSchemaUtils.createFullName(name, fieldNames)); + } + } + + public static StructType mergeSchema(InternalSchema fileSchema, InternalSchema querySchema) { + InternalSchema schema = new InternalSchemaMerger(fileSchema, querySchema, true, true).mergeSchema(); + return constructSparkSchemaFromInternalSchema(schema); + } + + public static Map> collectTypeChangedCols(InternalSchema schema, InternalSchema other) { + return InternalSchemaUtils + .collectTypeChangedCols(schema, other) + .entrySet() + .stream() + .collect(Collectors.toMap(e -> e.getKey(), e -> Pair.of(constructSparkSchemaFromType(e.getValue().getLeft()), constructSparkSchemaFromType(e.getValue().getRight())))); + } + + public static StructType constructSparkSchemaFromInternalSchema(InternalSchema schema) { + return (StructType) constructSparkSchemaFromType(schema.getRecord()); + } + + private static DataType constructSparkSchemaFromType(Type type) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List fields = record.fields(); + List structFields = new ArrayList<>(); + for (Types.Field f : fields) { + DataType dataType = constructSparkSchemaFromType(f.type()); + StructField structField = StructField.apply(f.name(), dataType, f.isOptional(), Metadata.empty()); + structField = f.doc() == null ? 
structField : structField.withComment(f.doc()); + structFields.add(structField); + } + return StructType$.MODULE$.apply(structFields); + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + DataType elementType = constructSparkSchemaFromType(array.elementType()); + return ArrayType$.MODULE$.apply(elementType, array.isElementOptional()); + case MAP: + Types.MapType map = (Types.MapType) type; + DataType keyDataType = constructSparkSchemaFromType(map.keyType()); + DataType valueDataType = constructSparkSchemaFromType(map.valueType()); + return MapType$.MODULE$.apply(keyDataType, valueDataType, map.isValueOptional()); + case BOOLEAN: + return BooleanType$.MODULE$; + case INT: + return IntegerType$.MODULE$; + case LONG: + return LongType$.MODULE$; + case FLOAT: + return FloatType$.MODULE$; + case DOUBLE: + return DoubleType$.MODULE$; + case DATE: + return DateType$.MODULE$; + case TIME: + throw new UnsupportedOperationException(String.format("cannot convert %s type to Spark", type)); + case TIMESTAMP: + // todo support TimeStampNTZ + return TimestampType$.MODULE$; + case STRING: + return StringType$.MODULE$; + case UUID: + return StringType$.MODULE$; + case FIXED: + return BinaryType$.MODULE$; + case BINARY: + return BinaryType$.MODULE$; + case DECIMAL: + Types.DecimalType decimal = (Types.DecimalType) type; + return DecimalType$.MODULE$.apply(decimal.precision(), decimal.scale()); + default: + throw new UnsupportedOperationException(String.format("cannot convert unknown type: %s to Spark", type)); + } + } + + /** + * Convert Int/long type to other Type. + * Now only support int/long -> long/float/double/string + * TODO: support more types + */ + private static boolean convertIntLongType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { + boolean isInt = oldV.dataType() instanceof IntegerType; + if (newType instanceof LongType || newType instanceof FloatType + || newType instanceof DoubleType || newType instanceof StringType || newType instanceof DecimalType) { + for (int i = 0; i < len; i++) { + if (oldV.isNullAt(i)) { + newV.putNull(i); + continue; + } + // int/long -> long/float/double/string/decimal + if (newType instanceof LongType) { + newV.putLong(i, isInt ? oldV.getInt(i) : oldV.getLong(i)); + } else if (newType instanceof FloatType) { + newV.putFloat(i, isInt ? oldV.getInt(i) : oldV.getLong(i)); + } else if (newType instanceof DoubleType) { + newV.putDouble(i, isInt ? oldV.getInt(i) : oldV.getLong(i)); + } else if (newType instanceof StringType) { + newV.putByteArray(i, ((isInt ? oldV.getInt(i) : oldV.getLong(i)) + "").getBytes(StandardCharsets.UTF_8)); + } else if (newType instanceof DecimalType) { + Decimal oldDecimal = Decimal.apply(isInt ? oldV.getInt(i) : oldV.getLong(i)); + oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale()); + newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision()); + } + } + return true; + } + return false; + } + + /** + * Convert float type to other Type. 
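`convertIntLongType` above widens int/long column vectors to long/float/double/string/decimal when the reader schema has evolved, and the decimal branch leans on Spark's `Decimal.changePrecision`. A short standalone example of that API's behaviour using plain `Decimal` values rather than column vectors (the sample numbers are arbitrary):

```java
import org.apache.spark.sql.types.Decimal;

public class IntToDecimalWidening {
  public static void main(String[] args) {
    long raw = 1234L;

    // Decimal.apply(long) produces a scale-0 decimal; changePrecision re-scales it in place
    Decimal d = Decimal.apply(raw);
    boolean fits = d.changePrecision(10, 2);
    System.out.println(fits + " -> " + d);              // true -> 1234.00

    // When the target precision is too small, changePrecision signals failure instead of throwing
    Decimal tooSmall = Decimal.apply(raw);
    System.out.println(tooSmall.changePrecision(3, 0)); // false
  }
}
```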
+ * Now only support float -> double/String + * TODO: support more types + */ + private static boolean convertFloatType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { + if (newType instanceof DoubleType || newType instanceof StringType || newType instanceof DecimalType) { + for (int i = 0; i < len; i++) { + if (oldV.isNullAt(i)) { + newV.putNull(i); + continue; + } + // float -> double/string/decimal + if (newType instanceof DoubleType) { + newV.putDouble(i, Double.valueOf(oldV.getFloat(i) + "")); + } else if (newType instanceof StringType) { + newV.putByteArray(i, (oldV.getFloat(i) + "").getBytes(StandardCharsets.UTF_8)); + } else if (newType instanceof DecimalType) { + Decimal oldDecimal = Decimal.apply(oldV.getFloat(i)); + oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale()); + newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision()); + } + } + return true; + } + return false; + } + + /** + * Convert double type to other Type. + * Now only support Double -> Decimal/String + * TODO: support more types + */ + private static boolean convertDoubleType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { + if (newType instanceof DecimalType || newType instanceof StringType) { + for (int i = 0; i < len; i++) { + if (oldV.isNullAt(i)) { + newV.putNull(i); + continue; + } + // double -> decimal/string + if (newType instanceof DecimalType) { + Decimal oldDecimal = Decimal.apply(oldV.getDouble(i)); + oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale()); + newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision()); + } else if (newType instanceof StringType) { + newV.putByteArray(i, (oldV.getDouble(i) + "").getBytes(StandardCharsets.UTF_8)); + } + } + return true; + } + return false; + } + + /** + * Convert decimal type to other Type. + * Now only support Decimal -> Decimal/String + * TODO: support more types + */ + private static boolean convertDecimalType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { + DataType oldType = oldV.dataType(); + if (newType instanceof DecimalType || newType instanceof StringType) { + for (int i = 0; i < len; i++) { + if (oldV.isNullAt(i)) { + newV.putNull(i); + continue; + } + Decimal oldDecimal = oldV.getDecimal(i, ((DecimalType) oldType).precision(), ((DecimalType) oldType).scale()); + if (newType instanceof DecimalType) { + oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale()); + newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision()); + } else if (newType instanceof StringType) { + newV.putByteArray(i, oldDecimal.toString().getBytes(StandardCharsets.UTF_8)); + } + } + return true; + } + return false; + } + + /** + * Convert date type to other Type. + * Now only support Date -> String + * TODO: support more types + */ + private static boolean convertDateType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { + if (newType instanceof StringType) { + for (int i = 0; i < len; i++) { + if (oldV.isNullAt(i)) { + newV.putNull(i); + continue; + } + // to do support rebaseDate + String res = org.apache.spark.sql.catalyst.util.DateTimeUtils.toJavaDate(oldV.getInt(i)).toString(); + newV.putByteArray(i, res.getBytes(StandardCharsets.UTF_8)); + } + return true; + } + return false; + } + + /** + * Convert String type to other Type. + * Now only support String -> Decimal/Date. 
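`convertDateType` and `convertStringType` round-trip between Spark's internal day-count representation of `DateType` and `java.sql.Date` via `DateTimeUtils`, which is what makes date-to-string and string-to-date rewrites on column vectors possible. A small standalone illustration of that round trip (the sample date is arbitrary):

```java
import java.sql.Date;

import org.apache.spark.sql.catalyst.util.DateTimeUtils;

public class DateDayCountRoundTrip {
  public static void main(String[] args) {
    // Spark stores DateType internally as days since the Unix epoch (an int)
    int days = DateTimeUtils.fromJavaDate(Date.valueOf("2021-07-01"));
    System.out.println(days);                         // days since 1970-01-01

    // The reverse direction is what convertDateType uses to render a date as a string
    String rendered = DateTimeUtils.toJavaDate(days).toString();
    System.out.println(rendered);                     // 2021-07-01
  }
}
```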
+ * Notice: This convert maybe failed!!! + * TODO: support more types + */ + private static boolean convertStringType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { + if (newType instanceof DateType || newType instanceof DecimalType) { + for (int i = 0; i < len; i++) { + if (oldV.isNullAt(i)) { + newV.putNull(i); + continue; + } + // to do support rebaseDate + if (newType instanceof DateType) { + int days = org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaDate(Date.valueOf(oldV.getUTF8String(i).toString())); + newV.putInt(i, days); + } else if (newType instanceof DecimalType) { + DecimalType decimalType = (DecimalType) newType; + java.math.BigDecimal bigDecimal = new java.math.BigDecimal(oldV.getUTF8String(i).toString().trim()); + Decimal sparkDecimal = Decimal.apply(bigDecimal); + sparkDecimal.changePrecision(decimalType.precision(), decimalType.scale()); + newV.putDecimal(i, sparkDecimal, decimalType.precision()); + } + } + return true; + } + return false; + } + + public static boolean convertColumnVectorType(WritableColumnVector oldV, WritableColumnVector newV, int len) { + if (len == 0 || oldV == null || newV == null) { + return false; + } + DataType oldType = oldV.dataType(); // old colType eg: floatType + DataType newType = newV.dataType(); // new colType eg: doubleType + if (oldV != null && newType != null) { + if (oldType instanceof BooleanType) { + return false; + } else if (oldType instanceof ByteType) { + return false; + } else if (oldType instanceof ShortType) { + return false; + } else if (oldType instanceof IntegerType) { + return convertIntLongType(oldV, newV, newType, len); + } else if (oldType instanceof LongType) { + return convertIntLongType(oldV, newV, newType, len); + } else if (oldType instanceof FloatType) { + return convertFloatType(oldV, newV, newType, len); + } else if (oldType instanceof DoubleType) { + return convertDoubleType(oldV, newV, newType, len); + } else if (oldType instanceof StringType) { + return convertStringType(oldV, newV, newType, len); + } else if (oldType instanceof BinaryType) { + return false; + } else if (oldType instanceof DecimalType) { + return convertDecimalType(oldV, newV, newType, len); + } else if (oldType instanceof DateType) { + return convertDateType(oldV, newV, newType, len); + } else if (oldType instanceof TimestampType) { + return false; + } else { + throw new UnsupportedOperationException("Datatype not supported " + oldV); + } + } + return false; + } +} + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java index 0fa75dc61f9d9..9cb127f397b20 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkMemoryUtils.java @@ -18,27 +18,19 @@ package org.apache.hudi.client.utils; -import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.spark.storage.StorageLevel; import java.util.Properties; -import static org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE; /** * Spark config utils. 
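`convertColumnVectorType` is the dispatch entry point: it inspects the file-side vector's type and hands off to one of the per-type converters above, returning false when no rewrite applies. A usage sketch with Spark's `OnHeapColumnVector`; the enclosing class of these helpers is not visible in this hunk, so `SparkInternalSchemaConverter` below is an assumption and should be read as "the class that carries convertColumnVectorType in this patch":

```java
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;

public class ColumnVectorWideningSketch {
  public static void main(String[] args) {
    int rows = 3;
    OnHeapColumnVector ints = new OnHeapColumnVector(rows, DataTypes.IntegerType);
    OnHeapColumnVector longs = new OnHeapColumnVector(rows, DataTypes.LongType);

    ints.putInt(0, 7);
    ints.putNull(1);
    ints.putInt(2, 42);

    // Class name assumed (see lead-in); delegates to convertIntLongType internally
    boolean converted = SparkInternalSchemaConverter.convertColumnVectorType(ints, longs, rows);

    System.out.println(converted);          // true
    System.out.println(longs.getLong(0));   // 7
    System.out.println(longs.isNullAt(1));  // true
  }
}
```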
*/ public class SparkMemoryUtils { public static StorageLevel getWriteStatusStorageLevel(Properties properties) { - return StorageLevel.fromString(properties.getProperty(WRITE_STATUS_STORAGE_LEVEL)); - } - - public static StorageLevel getBloomIndexInputStorageLevel(Properties properties) { - return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL)); - } - - public static StorageLevel getSimpleIndexInputStorageLevel(Properties properties) { - return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL)); + return StorageLevel.fromString(new HoodieConfig(properties).getString(WRITE_STATUS_STORAGE_LEVEL_VALUE)); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowDeserializer.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowDeserializer.java deleted file mode 100644 index 66b8b78b56920..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowDeserializer.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.client.utils; - -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.InternalRow; - -import java.io.Serializable; - -public interface SparkRowDeserializer extends Serializable { - Row deserializeRow(InternalRow internalRow); -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowSerDe.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowSerDe.java new file mode 100644 index 0000000000000..dce2d2fb62f1f --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkRowSerDe.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
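The `SparkMemoryUtils` change above swaps a raw `Properties` lookup for `HoodieConfig.getString`, so the config's documented default applies when the property is absent. A plain-Java equivalent of that fallback behaviour; the property key and default value below are placeholders, not the real config constants:

```java
import java.util.Properties;

import org.apache.spark.storage.StorageLevel;

public class StorageLevelLookup {

  // Placeholder key/default; the patch resolves these through HoodieConfig instead.
  private static final String KEY = "hoodie.write.status.storage.level";
  private static final String DEFAULT = "MEMORY_AND_DISK_SER";

  static StorageLevel writeStatusStorageLevel(Properties props) {
    return StorageLevel.fromString(props.getProperty(KEY, DEFAULT));
  }

  public static void main(String[] args) {
    // With no property set, the default storage level is used
    System.out.println(writeStatusStorageLevel(new Properties()));
  }
}
```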
+ */ + +package org.apache.hudi.client.utils; + +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; + +import java.io.Serializable; + +public interface SparkRowSerDe extends Serializable { + Row deserializeRow(InternalRow internalRow); + + InternalRow serializeRow(Row row); +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkValidatorUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkValidatorUtils.java new file mode 100644 index 0000000000000..a6d03eae2b361 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkValidatorUtils.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.utils; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.client.validator.SparkPreCommitValidator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.table.view.HoodieTablePreCommitFileSystemView; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import scala.collection.JavaConverters; + +/** + * Spark validator utils to verify and run any pre-commit validators configured. + */ +public class SparkValidatorUtils { + private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class); + + /** + * Check configured pre-commit validators and run them. Note that this only works for COW tables + * + * Throw error if there are validation failures. 
+ */ + public static void runValidators(HoodieWriteConfig config, + HoodieWriteMetadata> writeMetadata, + HoodieEngineContext context, + HoodieTable table, + String instantTime) { + if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) { + LOG.info("no validators configured."); + } else { + if (!writeMetadata.getWriteStats().isPresent()) { + writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collectAsList()); + } + Set partitionsModified = writeMetadata.getWriteStats().get().stream().map(writeStats -> + writeStats.getPartitionPath()).collect(Collectors.toSet()); + SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context)); + // Refresh timeline to ensure validator sees the any other operations done on timeline (async operations such as other clustering/compaction/rollback) + table.getMetaClient().reloadActiveTimeline(); + Dataset beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table).cache(); + Dataset afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime).cache(); + + Stream validators = Arrays.stream(config.getPreCommitValidators().split(",")) + .map(validatorClass -> { + return ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass, + new Class[] {HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, + table, context, config)); + }); + + boolean allSuccess = validators.map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime)).map(CompletableFuture::join) + .reduce(true, Boolean::logicalAnd); + + if (allSuccess) { + LOG.info("All validations succeeded"); + } else { + LOG.error("At least one pre-commit validation failed"); + throw new HoodieValidationException("At least one pre-commit validation failed"); + } + } + } + + /** + * Run validators in a separate thread pool for parallelism. Each of validator can submit a distributed spark job if needed. + */ + private static CompletableFuture runValidatorAsync(SparkPreCommitValidator validator, HoodieWriteMetadata> writeMetadata, + Dataset beforeState, Dataset afterState, String instantTime) { + return CompletableFuture.supplyAsync(() -> { + try { + validator.validate(instantTime, writeMetadata, beforeState, afterState); + LOG.info("validation complete for " + validator.getClass().getName()); + return true; + } catch (HoodieValidationException e) { + LOG.error("validation failed for " + validator.getClass().getName()); + return false; + } + }); + } + + /** + * Get records from partitions modified as a dataset. + * Note that this only works for COW tables. + */ + public static Dataset getRecordsFromCommittedFiles(SQLContext sqlContext, + Set partitionsAffected, HoodieTable table) { + + List committedFiles = partitionsAffected.stream() + .flatMap(partition -> table.getBaseFileOnlyView().getLatestBaseFiles(partition).map(BaseFile::getPath)) + .collect(Collectors.toList()); + + if (committedFiles.isEmpty()) { + return sqlContext.emptyDataFrame(); + } + return readRecordsForBaseFiles(sqlContext, committedFiles); + } + + /** + * Get records from specified list of data files. + */ + public static Dataset readRecordsForBaseFiles(SQLContext sqlContext, List baseFilePaths) { + return sqlContext.read().parquet(JavaConverters.asScalaBufferConverter(baseFilePaths).asScala()); + } + + /** + * Get reads from partitions modified including any inflight commits. 
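`runValidators` above submits each configured validator through `CompletableFuture.supplyAsync`, joins the futures, and folds the booleans with `Boolean::logicalAnd`, so a failure is reported only after every validator has run. The same pattern in isolation, with the futures collected eagerly so they all start before any join blocks; the validator bodies are stand-ins:

```java
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.function.Supplier;
import java.util.stream.Collectors;

public class ValidatorFanOut {
  public static void main(String[] args) {
    List<Supplier<Boolean>> validators = Arrays.asList(
        () -> true,                                                            // stand-in: passed
        () -> { System.out.println("row-count check failed"); return false; }, // stand-in: failed
        () -> true);

    // Materialize the futures first so every validator starts before any join blocks.
    List<CompletableFuture<Boolean>> futures = validators.stream()
        .map(CompletableFuture::supplyAsync)
        .collect(Collectors.toList());

    boolean allSuccess = futures.stream()
        .map(CompletableFuture::join)
        .reduce(true, Boolean::logicalAnd);

    if (!allSuccess) {
      throw new IllegalStateException("At least one pre-commit validation failed");
    }
  }
}
```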
+ * Note that this only works for COW tables + */ + public static Dataset getRecordsFromPendingCommits(SQLContext sqlContext, + Set partitionsAffected, + HoodieWriteMetadata> writeMetadata, + HoodieTable table, + String instantTime) { + + // build file system view with pending commits + HoodieTablePreCommitFileSystemView fsView = new HoodieTablePreCommitFileSystemView(table.getMetaClient(), + table.getHoodieView(), + writeMetadata.getWriteStats().get(), + writeMetadata.getPartitionToReplaceFileIds(), + instantTime); + + List newFiles = partitionsAffected.stream() + .flatMap(partition -> fsView.getLatestBaseFiles(partition).map(BaseFile::getPath)) + .collect(Collectors.toList()); + + if (newFiles.isEmpty()) { + return sqlContext.emptyDataFrame(); + } + + return readRecordsForBaseFiles(sqlContext, newFiles); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SparkPreCommitValidator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SparkPreCommitValidator.java new file mode 100644 index 0000000000000..f08d11b571492 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SparkPreCommitValidator.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.validator; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Validator can be configured pre-commit. 
+ */ +public abstract class SparkPreCommitValidator> { + private static final Logger LOG = LogManager.getLogger(SparkPreCommitValidator.class); + + private HoodieSparkTable table; + private HoodieEngineContext engineContext; + private HoodieWriteConfig writeConfig; + + protected SparkPreCommitValidator(HoodieSparkTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { + this.table = table; + this.engineContext = engineContext; + this.writeConfig = writeConfig; + } + + protected Set getPartitionsModified(HoodieWriteMetadata writeResult) { + Set partitionsModified; + if (writeResult.getWriteStats().isPresent()) { + partitionsModified = writeResult.getWriteStats().get().stream().map(HoodieWriteStat::getPartitionPath).collect(Collectors.toSet()); + } else { + partitionsModified = new HashSet<>(writeResult.getWriteStatuses().map(WriteStatus::getPartitionPath).collectAsList()); + } + return partitionsModified; + } + + /** + * Verify the data written as part of specified instant. + * Throw HoodieValidationException if any unexpected data is written (Example: data files are not readable for some reason). + */ + public void validate(String instantTime, HoodieWriteMetadata writeResult, Dataset before, Dataset after) throws HoodieValidationException { + HoodieTimer timer = new HoodieTimer().startTimer(); + try { + validateRecordsBeforeAndAfter(before, after, getPartitionsModified(writeResult)); + } finally { + LOG.info(getClass() + " validator took " + timer.endTimer() + " ms"); + } + } + + /** + * Takes input of RDD 1) before clustering and 2) after clustering. Perform required validation + * and throw error if validation fails + */ + protected abstract void validateRecordsBeforeAndAfter(Dataset before, + Dataset after, + Set partitionsAffected); + + public HoodieTable getHoodieTable() { + return this.table; + } + + public HoodieEngineContext getEngineContext() { + return this.engineContext; + } + + public HoodieWriteConfig getWriteConfig() { + return this.writeConfig; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryEqualityPreCommitValidator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryEqualityPreCommitValidator.java new file mode 100644 index 0000000000000..2506d52b4a416 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryEqualityPreCommitValidator.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
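Concrete validators extend `SparkPreCommitValidator` and implement `validateRecordsBeforeAndAfter`, receiving the committed state, the inflight state, and the set of touched partitions. The generic parameters are elided in this flattened hunk, so the sketch below only shows the kind of body one might put in that method, not a full subclass declaration; the row-count rule itself is made up:

```java
import java.util.Set;

import org.apache.hudi.exception.HoodieValidationException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class RowCountValidationSketch {

  /**
   * Example body for validateRecordsBeforeAndAfter: a commit must never shrink the table.
   * The two datasets are the committed and inflight states built by SparkValidatorUtils.
   */
  static void validateRecordsBeforeAndAfter(Dataset<Row> before, Dataset<Row> after, Set<String> partitionsAffected) {
    long beforeCount = before.count();
    long afterCount = after.count();
    if (afterCount < beforeCount) {
      throw new HoodieValidationException(
          "Row count dropped from " + beforeCount + " to " + afterCount
              + " across partitions " + partitionsAffected);
    }
  }
}
```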
+ */ + +package org.apache.hudi.client.validator; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodiePreCommitValidatorConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.table.HoodieSparkTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +/** + * Validator to run sql query and compare table state + * 1) before new commit started. + * 2) current inflight commit (if successful). + * + * Expects both queries to return same result. + */ +public class SqlQueryEqualityPreCommitValidator> extends SqlQueryPreCommitValidator { + + private static final Logger LOG = LogManager.getLogger(SqlQueryEqualityPreCommitValidator.class); + + public SqlQueryEqualityPreCommitValidator(HoodieSparkTable table, HoodieEngineContext engineContext, HoodieWriteConfig config) { + super(table, engineContext, config); + } + + @Override + protected String getQueryConfigName() { + return HoodiePreCommitValidatorConfig.EQUALITY_SQL_QUERIES.key(); + } + + @Override + protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) { + String queryWithPrevSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot); + String queryWithNewSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot); + LOG.info("Running query on previous state: " + queryWithPrevSnapshot); + Dataset prevRows = sqlContext.sql(queryWithPrevSnapshot); + LOG.info("Running query on new state: " + queryWithNewSnapshot); + Dataset newRows = sqlContext.sql(queryWithNewSnapshot); + printAllRowsIfDebugEnabled(prevRows); + printAllRowsIfDebugEnabled(newRows); + boolean areDatasetsEqual = prevRows.intersect(newRows).count() == prevRows.count(); + LOG.info("Completed Equality Validation, datasets equal? " + areDatasetsEqual); + if (!areDatasetsEqual) { + LOG.error("query validation failed. See stdout for sample query results. Query: " + query); + System.out.println("Expected result (sample records only):"); + prevRows.show(); + System.out.println("Actual result (sample records only):"); + newRows.show(); + throw new HoodieValidationException("Query validation failed for '" + query + "'. See stdout for expected vs actual records"); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java new file mode 100644 index 0000000000000..8a25150651943 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
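The equality validator rewrites one query template against both snapshots (substituting the placeholder named by `VALIDATOR_TABLE_VARIABLE`) and treats the states as equal when the intersection has as many rows as the previous result. The same check against two ad-hoc temp views; the SparkSession setup, view names, and the literal `<TABLE_NAME>` placeholder string are illustrative:

```java
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SqlEqualityCheckSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[1]").appName("equality-check").getOrCreate();

    spark.createDataset(Arrays.asList(1L, 2L, 3L), Encoders.LONG())
        .toDF("id").createOrReplaceTempView("table_before");
    spark.createDataset(Arrays.asList(1L, 2L, 3L), Encoders.LONG())
        .toDF("id").createOrReplaceTempView("table_after");

    // Query template with a placeholder, as configured for the validator
    String template = "select count(*) as cnt from <TABLE_NAME> where id is not null";

    Dataset<Row> prevRows = spark.sql(template.replaceAll("<TABLE_NAME>", "table_before"));
    Dataset<Row> newRows  = spark.sql(template.replaceAll("<TABLE_NAME>", "table_after"));

    boolean equal = prevRows.intersect(newRows).count() == prevRows.count();
    System.out.println("datasets equal? " + equal);   // true for identical inputs
    spark.stop();
  }
}
```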
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.validator; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodiePreCommitValidatorConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.table.HoodieSparkTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +/** + * Validator to run sql query and compare table state + * 1) before new commit started. + * 2) current inflight commit (if successful). + *
    + * Expects query results do not match. + */ +public class SqlQueryInequalityPreCommitValidator> extends SqlQueryPreCommitValidator { + private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class); + + public SqlQueryInequalityPreCommitValidator(HoodieSparkTable table, HoodieEngineContext engineContext, HoodieWriteConfig config) { + super(table, engineContext, config); + } + + @Override + protected String getQueryConfigName() { + return HoodiePreCommitValidatorConfig.INEQUALITY_SQL_QUERIES.key(); + } + + @Override + protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) { + String queryWithPrevSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, prevTableSnapshot); + String queryWithNewSnapshot = query.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot); + LOG.info("Running query on previous state: " + queryWithPrevSnapshot); + Dataset prevRows = sqlContext.sql(queryWithPrevSnapshot); + LOG.info("Running query on new state: " + queryWithNewSnapshot); + Dataset newRows = sqlContext.sql(queryWithNewSnapshot); + printAllRowsIfDebugEnabled(prevRows); + printAllRowsIfDebugEnabled(newRows); + boolean areDatasetsEqual = prevRows.intersect(newRows).count() == prevRows.count(); + LOG.info("Completed Inequality Validation, datasets equal? " + areDatasetsEqual); + if (areDatasetsEqual) { + LOG.error("query validation failed. See stdout for sample query results. Query: " + query); + System.out.println("Expected query results to be different, but they are same. Result (sample records only):"); + prevRows.show(); + throw new HoodieValidationException("Query validation failed for '" + query + + "'. Expected " + prevRows.count() + " rows, Found " + newRows.count()); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryPreCommitValidator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryPreCommitValidator.java new file mode 100644 index 0000000000000..3a88d54d36d7b --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryPreCommitValidator.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.validator; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.table.HoodieSparkTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +import java.util.Arrays; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Validator framework to run sql queries and compare table state at different locations. + */ +public abstract class SqlQueryPreCommitValidator> extends SparkPreCommitValidator { + private static final Logger LOG = LogManager.getLogger(SqlQueryPreCommitValidator.class); + private static final AtomicInteger TABLE_COUNTER = new AtomicInteger(0); + + public SqlQueryPreCommitValidator(HoodieSparkTable table, HoodieEngineContext engineContext, HoodieWriteConfig config) { + super(table, engineContext, config); + } + + /** + * Takes input datasets 1) before commit started and 2) with inflight commit. Perform required validation + * and throw error if validation fails + */ + @Override + public void validateRecordsBeforeAndAfter(Dataset before, Dataset after, final Set partitionsAffected) { + String hoodieTableName = "staged_table_" + TABLE_COUNTER.incrementAndGet(); + String hoodieTableBeforeCurrentCommit = hoodieTableName + "_before"; + String hoodieTableWithInflightCommit = hoodieTableName + "_after"; + before.registerTempTable(hoodieTableBeforeCurrentCommit); + after.registerTempTable(hoodieTableWithInflightCommit); + JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(getEngineContext()); + SQLContext sqlContext = new SQLContext(jsc); + + String[] queries = getQueriesToRun(); + //TODO run this in a thread pool to improve parallelism + Arrays.stream(queries).forEach(query -> + validateUsingQuery(query, hoodieTableBeforeCurrentCommit, hoodieTableWithInflightCommit, sqlContext)); + } + + protected String[] getQueriesToRun() { + String sqlQueriesConfigured = getWriteConfig().getProps().getProperty(getQueryConfigName()); + if (StringUtils.isNullOrEmpty(sqlQueriesConfigured)) { + throw new HoodieValidationException("Sql validator configured incorrectly. expecting at least one query. 
Found 0 queries in " + + sqlQueriesConfigured); + } + return sqlQueriesConfigured.trim().split(";"); + } + + protected void printAllRowsIfDebugEnabled(Dataset dataset) { + if (LOG.isDebugEnabled()) { + dataset = dataset.cache(); + LOG.debug("Printing all rows from query validation:"); + dataset.show(Integer.MAX_VALUE,false); + } + } + + protected abstract String getQueryConfigName(); + + protected abstract void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext); +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java new file mode 100644 index 0000000000000..b1942244d3d39 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.validator; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodiePreCommitValidatorConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.table.HoodieSparkTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +import java.util.List; + +/** + * Validator to run sql queries on new table state and expects a single result. If the result does not match expected result, + * throw validation error. + *
    + * Example configuration: "query1#expectedResult1;query2#expectedResult2;" + */ +public class SqlQuerySingleResultPreCommitValidator> extends SqlQueryPreCommitValidator { + private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class); + + public SqlQuerySingleResultPreCommitValidator(HoodieSparkTable table, HoodieEngineContext engineContext, HoodieWriteConfig config) { + super(table, engineContext, config); + } + + @Override + protected String getQueryConfigName() { + return HoodiePreCommitValidatorConfig.SINGLE_VALUE_SQL_QUERIES.key(); + } + + @Override + protected void validateUsingQuery(String query, String prevTableSnapshot, String newTableSnapshot, SQLContext sqlContext) { + String[] queryWithExpectedResult = query.split("#"); + if (queryWithExpectedResult.length != 2) { + throw new HoodieValidationException("Invalid query format " + query); + } + + String queryToRun = queryWithExpectedResult[0]; + String expectedResult = queryWithExpectedResult[1]; + LOG.info("Running query on new state: " + queryToRun); + String queryWithNewSnapshot = queryToRun.replaceAll(HoodiePreCommitValidatorConfig.VALIDATOR_TABLE_VARIABLE, newTableSnapshot); + List newRows = sqlContext.sql(queryWithNewSnapshot).collectAsList(); + if (newRows.size() != 1 && newRows.get(0).size() != 1) { + throw new HoodieValidationException("Invalid query result. expect single value for '" + query + "'"); + } + Object result = newRows.get(0).apply(0); + if (result == null || !expectedResult.equals(result.toString())) { + LOG.error("Mismatch query result. Expected: " + expectedResult + " got " + result + "Query: " + query); + throw new HoodieValidationException("Query validation failed for '" + query + + "'. Expected " + expectedResult + " rows, Found " + result); + } else { + LOG.info("Query validation successful. Expected: " + expectedResult + " got " + result); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaPairRDD.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaPairRDD.java new file mode 100644 index 0000000000000..9ec3c4cf71592 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaPairRDD.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
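`SqlQuerySingleResultPreCommitValidator` expects each configured entry in the form `query#expectedResult`, with entries split from a `;`-separated list by the base class. A tiny parser sketch mirroring the length check above; the sample configuration string is made up:

```java
public class SingleResultQueryConfigSketch {
  public static void main(String[] args) {
    String configured = "select count(*) from <TABLE_NAME>#100;select max(id) from <TABLE_NAME>#42";

    for (String entry : configured.trim().split(";")) {
      String[] queryWithExpectedResult = entry.split("#");
      if (queryWithExpectedResult.length != 2) {
        throw new IllegalArgumentException("Invalid query format " + entry);
      }
      String queryToRun = queryWithExpectedResult[0];
      String expectedResult = queryWithExpectedResult[1];
      System.out.println("run: " + queryToRun + "  expect: " + expectedResult);
    }
  }
}
```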
+ */ + +package org.apache.hudi.data; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.storage.StorageLevel; + +import java.util.List; +import java.util.Map; + +import scala.Tuple2; + +/** + * Implementation of {@link HoodiePairData} using Spark {@link JavaPairRDD}. + * + * @param type of key. + * @param type of value. + */ +public class HoodieJavaPairRDD implements HoodiePairData { + + private final JavaPairRDD pairRDDData; + + private HoodieJavaPairRDD(JavaPairRDD pairRDDData) { + this.pairRDDData = pairRDDData; + } + + /** + * @param pairRDDData a {@link JavaPairRDD} of pairs. + * @param type of key. + * @param type of value. + * @return a new instance containing the {@link JavaPairRDD} reference. + */ + public static HoodieJavaPairRDD of(JavaPairRDD pairRDDData) { + return new HoodieJavaPairRDD<>(pairRDDData); + } + + /** + * @param hoodiePairData {@link HoodieJavaPairRDD } instance containing the {@link JavaPairRDD} of pairs. + * @param type of key. + * @param type of value. + * @return the {@link JavaPairRDD} of pairs. + */ + public static JavaPairRDD getJavaPairRDD(HoodiePairData hoodiePairData) { + return ((HoodieJavaPairRDD) hoodiePairData).get(); + } + + @Override + public JavaPairRDD get() { + return pairRDDData; + } + + @Override + public void persist(String storageLevel) { + pairRDDData.persist(StorageLevel.fromString(storageLevel)); + } + + @Override + public void unpersist() { + pairRDDData.unpersist(); + } + + @Override + public HoodieData keys() { + return HoodieJavaRDD.of(pairRDDData.keys()); + } + + @Override + public HoodieData values() { + return HoodieJavaRDD.of(pairRDDData.values()); + } + + @Override + public long count() { + return pairRDDData.count(); + } + + @Override + public Map countByKey() { + return pairRDDData.countByKey(); + } + + @Override + public HoodiePairData> groupByKey() { + return new HoodieJavaPairRDD<>(pairRDDData.groupByKey()); + } + + @Override + public HoodiePairData reduceByKey(SerializableBiFunction combiner, int parallelism) { + return HoodieJavaPairRDD.of(pairRDDData.reduceByKey(combiner::apply, parallelism)); + } + + @Override + public HoodieData map(SerializableFunction, O> func) { + return HoodieJavaRDD.of(pairRDDData.map( + tuple -> func.apply(new ImmutablePair<>(tuple._1, tuple._2)))); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction, L, W> mapToPairFunc) { + return HoodieJavaPairRDD.of(pairRDDData.mapToPair(pair -> { + Pair newPair = mapToPairFunc.call(new ImmutablePair<>(pair._1, pair._2)); + return new Tuple2<>(newPair.getLeft(), newPair.getRight()); + })); + } + + @Override + public HoodiePairData>> leftOuterJoin(HoodiePairData other) { + return HoodieJavaPairRDD.of(JavaPairRDD.fromJavaRDD( + pairRDDData.leftOuterJoin(HoodieJavaPairRDD.getJavaPairRDD(other)) + .map(tuple -> new Tuple2<>(tuple._1, + new ImmutablePair<>(tuple._2._1, Option.ofNullable(tuple._2._2.orElse(null))))))); + } + + @Override + public List> collectAsList() { + return pairRDDData.map(t -> Pair.of(t._1, t._2)).collect(); + } +} diff --git 
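`HoodieJavaPairRDD` is a thin adapter so engine-neutral code can work against `HoodiePairData` while Spark-specific code can still reach the underlying `JavaPairRDD`. A short usage sketch, assuming an existing `JavaSparkContext` named `jsc`; the sample partition-path keys are arbitrary:

```java
import java.util.Arrays;
import java.util.Map;

import org.apache.hudi.common.data.HoodiePairData;
import org.apache.hudi.data.HoodieJavaPairRDD;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class PairDataSketch {

  static void demo(JavaSparkContext jsc) {
    JavaPairRDD<String, Integer> raw = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<>("2021/07/01", 1),
        new Tuple2<>("2021/07/01", 1),
        new Tuple2<>("2021/07/02", 1)));

    // Wrap once; callers only see the engine-neutral interface afterwards.
    HoodiePairData<String, Integer> pairs = HoodieJavaPairRDD.of(raw);
    Map<String, Long> perPartition = pairs.countByKey();
    System.out.println(perPartition);       // e.g. {2021/07/01=2, 2021/07/02=1}

    // Spark-specific code can still unwrap when it needs the raw JavaPairRDD.
    JavaPairRDD<String, Integer> unwrapped = HoodieJavaPairRDD.getJavaPairRDD(pairs);
    System.out.println(unwrapped.keys().distinct().count());  // 2
  }
}
```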
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java new file mode 100644 index 0000000000000..ed9613bc15fe6 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.data; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.storage.StorageLevel; + +import java.util.Iterator; +import java.util.List; + +import scala.Tuple2; + +/** + * Holds a {@link JavaRDD} of objects. + * + * @param type of object. + */ +public class HoodieJavaRDD implements HoodieData { + + private final JavaRDD rddData; + + private HoodieJavaRDD(JavaRDD rddData) { + this.rddData = rddData; + } + + /** + * @param rddData a {@link JavaRDD} of objects in type T. + * @param type of object. + * @return a new instance containing the {@link JavaRDD} reference. + */ + public static HoodieJavaRDD of(JavaRDD rddData) { + return new HoodieJavaRDD<>(rddData); + } + + /** + * @param data a {@link List} of objects in type T. + * @param context {@link HoodieSparkEngineContext} to use. + * @param parallelism parallelism for the {@link JavaRDD}. + * @param type of object. + * @return a new instance containing the {@link JavaRDD} instance. + */ + public static HoodieJavaRDD of( + List data, HoodieSparkEngineContext context, int parallelism) { + return new HoodieJavaRDD<>(context.getJavaSparkContext().parallelize(data, parallelism)); + } + + /** + * @param hoodieData {@link HoodieJavaRDD } instance containing the {@link JavaRDD} of objects. + * @param type of object. + * @return the a {@link JavaRDD} of objects in type T. 
+ */ + public static JavaRDD getJavaRDD(HoodieData hoodieData) { + return ((HoodieJavaRDD) hoodieData).rddData; + } + + public static JavaPairRDD getJavaRDD(HoodiePairData hoodieData) { + return ((HoodieJavaPairRDD) hoodieData).get(); + } + + @Override + public void persist(String level) { + rddData.persist(StorageLevel.fromString(level)); + } + + @Override + public void unpersist() { + rddData.unpersist(); + } + + @Override + public boolean isEmpty() { + return rddData.isEmpty(); + } + + @Override + public long count() { + return rddData.count(); + } + + @Override + public int getNumPartitions() { + return rddData.getNumPartitions(); + } + + @Override + public HoodieData map(SerializableFunction func) { + return HoodieJavaRDD.of(rddData.map(func::apply)); + } + + @Override + public HoodieData mapPartitions(SerializableFunction, Iterator> func, boolean preservesPartitioning) { + return HoodieJavaRDD.of(rddData.mapPartitions(func::apply, preservesPartitioning)); + } + + @Override + public HoodieData flatMap(SerializableFunction> func) { + return HoodieJavaRDD.of(rddData.flatMap(e -> func.apply(e))); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction func) { + return HoodieJavaPairRDD.of(rddData.mapToPair(input -> { + Pair pair = func.call(input); + return new Tuple2<>(pair.getLeft(), pair.getRight()); + })); + } + + @Override + public HoodieData distinct() { + return HoodieJavaRDD.of(rddData.distinct()); + } + + @Override + public HoodieData distinct(int parallelism) { + return HoodieJavaRDD.of(rddData.distinct(parallelism)); + } + + @Override + public HoodieData filter(SerializableFunction filterFunc) { + return HoodieJavaRDD.of(rddData.filter(filterFunc::apply)); + } + + @Override + public HoodieData union(HoodieData other) { + return HoodieJavaRDD.of(rddData.union(((HoodieJavaRDD) other).rddData)); + } + + @Override + public List collectAsList() { + return rddData.collect(); + } + + @Override + public HoodieData repartition(int parallelism) { + return HoodieJavaRDD.of(rddData.repartition(parallelism)); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieSparkLongAccumulator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieSparkLongAccumulator.java new file mode 100644 index 0000000000000..10027a28258c0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieSparkLongAccumulator.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
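`HoodieJavaRDD` plays the same role for un-keyed data: `HoodieData` operations such as `map` and `filter` simply delegate to the wrapped `JavaRDD`. A sketch of a round trip through the abstraction, again assuming an existing `JavaSparkContext` named `jsc`:

```java
import java.util.Arrays;

import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.data.HoodieJavaRDD;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class HoodieDataSketch {

  static void demo(JavaSparkContext jsc) {
    HoodieData<Integer> data = HoodieJavaRDD.of(jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5)));

    // Engine-neutral transformations; each one forwards to the underlying JavaRDD.
    HoodieData<Integer> evensDoubled = data
        .filter(x -> x % 2 == 0)
        .map(x -> x * 2);

    System.out.println(evensDoubled.collectAsList());   // [4, 8]

    // Unwrap when a Spark-only API is needed downstream.
    JavaRDD<Integer> rdd = HoodieJavaRDD.getJavaRDD(evensDoubled);
    System.out.println(rdd.getNumPartitions());
  }
}
```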
+ */ + +package org.apache.hudi.data; + +import org.apache.hudi.common.data.HoodieAccumulator; + +import org.apache.spark.util.AccumulatorV2; +import org.apache.spark.util.LongAccumulator; + +/** + * An accumulator on counts based on Spark {@link AccumulatorV2} implementation. + */ +public class HoodieSparkLongAccumulator extends HoodieAccumulator { + + private final AccumulatorV2 accumulator; + + private HoodieSparkLongAccumulator() { + accumulator = new LongAccumulator(); + } + + public static HoodieSparkLongAccumulator create() { + return new HoodieSparkLongAccumulator(); + } + + @Override + public long value() { + return accumulator.value(); + } + + @Override + public void add(long increment) { + accumulator.add(increment); + } + + public AccumulatorV2 getAccumulator() { + return accumulator; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkBoundedInMemoryExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkBoundedInMemoryExecutor.java deleted file mode 100644 index d240c065d0834..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkBoundedInMemoryExecutor.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.execution; - -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; -import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; -import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; -import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer; -import org.apache.hudi.config.HoodieWriteConfig; - -import org.apache.spark.TaskContext; -import org.apache.spark.TaskContext$; - -import java.util.Iterator; -import java.util.function.Function; - -public class SparkBoundedInMemoryExecutor extends BoundedInMemoryExecutor { - - // Need to set current spark thread's TaskContext into newly launched thread so that new thread can access - // TaskContext properties. 
- final TaskContext sparkThreadTaskContext; - - public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator inputItr, - BoundedInMemoryQueueConsumer consumer, Function bufferedIteratorTransform) { - this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform); - } - - public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, BoundedInMemoryQueueProducer producer, - BoundedInMemoryQueueConsumer consumer, Function bufferedIteratorTransform) { - super(hoodieConfig.getWriteBufferLimitBytes(), producer, Option.of(consumer), bufferedIteratorTransform); - this.sparkThreadTaskContext = TaskContext.get(); - } - - @Override - public void preExecute() { - // Passing parent thread's TaskContext to newly launched thread for it to access original TaskContext properties. - TaskContext$.MODULE$.setTaskContext(sparkThreadTaskContext); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java index ec90ef88ed86f..df5bd2d3f458c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java @@ -18,9 +18,9 @@ package org.apache.hudi.execution; -import org.apache.avro.Schema; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.TaskContextSupplier; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; @@ -29,19 +29,36 @@ import org.apache.hudi.io.WriteHandleFactory; import org.apache.hudi.table.HoodieTable; +import org.apache.avro.Schema; + import java.util.Iterator; import java.util.List; public class SparkLazyInsertIterable extends HoodieLazyInsertIterable { + private boolean useWriterSchema; + public SparkLazyInsertIterable(Iterator> recordItr, boolean areRecordsSorted, HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, String idPrefix, - TaskContextSupplier taskContextSupplier) { + TaskContextSupplier taskContextSupplier, + boolean useWriterSchema) { super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier); + this.useWriterSchema = useWriterSchema; + } + + public SparkLazyInsertIterable(Iterator> recordItr, + boolean areRecordsSorted, + HoodieWriteConfig config, + String instantTime, + HoodieTable hoodieTable, + String idPrefix, + TaskContextSupplier taskContextSupplier, + WriteHandleFactory writeHandleFactory) { + this(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier, false, writeHandleFactory); } public SparkLazyInsertIterable(Iterator> recordItr, @@ -51,8 +68,10 @@ public SparkLazyInsertIterable(Iterator> recordItr, HoodieTable hoodieTable, String idPrefix, TaskContextSupplier taskContextSupplier, + boolean useWriterSchema, WriteHandleFactory writeHandleFactory) { super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier, writeHandleFactory); + this.useWriterSchema = useWriterSchema; } @Override @@ -61,9 +80,13 @@ protected List computeNext() { BoundedInMemoryExecutor, HoodieInsertValueGenResult, List> 
bufferedIteratorExecutor = null; try { - final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema()); + Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema()); + if (useWriterSchema) { + schema = HoodieAvroUtils.addMetadataFields(schema); + } bufferedIteratorExecutor = - new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, getInsertHandler(), getTransformFunction(schema)); + new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), inputItr, getInsertHandler(), + getTransformFunction(schema, hoodieConfig), hoodieTable.getPreExecuteRunnable()); final List result = bufferedIteratorExecutor.execute(); assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining(); return result; @@ -72,6 +95,7 @@ protected List computeNext() { } finally { if (null != bufferedIteratorExecutor) { bufferedIteratorExecutor.shutdownNow(); + bufferedIteratorExecutor.awaitTermination(); } } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertInternalPartitionerFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertInternalPartitionerFactory.java index aaa7b5b4bfab4..900d2729f105b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertInternalPartitionerFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertInternalPartitionerFactory.java @@ -18,8 +18,10 @@ package org.apache.hudi.execution.bulkinsert; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieTable; /** * A factory to generate built-in partitioner to repartition input records into at least @@ -27,14 +29,33 @@ */ public abstract class BulkInsertInternalPartitionerFactory { - public static BulkInsertPartitioner get(BulkInsertSortMode sortMode) { + public static BulkInsertPartitioner get(HoodieTable table, HoodieWriteConfig config) { + return get(config.getBulkInsertSortMode(), table.isPartitioned(), false); + } + + public static BulkInsertPartitioner get( + HoodieTable table, HoodieWriteConfig config, boolean enforceNumOutputPartitions) { + return get(config.getBulkInsertSortMode(), table.isPartitioned(), enforceNumOutputPartitions); + } + + public static BulkInsertPartitioner get(BulkInsertSortMode sortMode, boolean isTablePartitioned) { + return get(sortMode, isTablePartitioned, false); + } + + public static BulkInsertPartitioner get(BulkInsertSortMode sortMode, + boolean isTablePartitioned, + boolean enforceNumOutputPartitions) { switch (sortMode) { case NONE: - return new NonSortPartitioner(); + return new NonSortPartitioner(enforceNumOutputPartitions); case GLOBAL_SORT: return new GlobalSortPartitioner(); case PARTITION_SORT: return new RDDPartitionSortPartitioner(); + case PARTITION_PATH_REPARTITION: + return new PartitionPathRepartitionPartitioner(isTablePartitioned); + case PARTITION_PATH_REPARTITION_AND_SORT: + return new PartitionPathRepartitionAndSortPartitioner(isTablePartitioned); default: throw new HoodieException("The bulk insert sort mode \"" + sortMode.name() + "\" is not supported."); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertInternalPartitionerWithRowsFactory.java 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertInternalPartitionerWithRowsFactory.java new file mode 100644 index 0000000000000..218eae0dc94ca --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertInternalPartitionerWithRowsFactory.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +/** + * A factory to generate built-in partitioner to repartition input Rows into at least + * expected number of output spark partitions for bulk insert operation. + */ +public abstract class BulkInsertInternalPartitionerWithRowsFactory { + + public static BulkInsertPartitioner> get(BulkInsertSortMode sortMode, + boolean isTablePartitioned) { + return get(sortMode, isTablePartitioned, false); + } + + public static BulkInsertPartitioner> get( + BulkInsertSortMode sortMode, boolean isTablePartitioned, boolean enforceNumOutputPartitions) { + switch (sortMode) { + case NONE: + return new NonSortPartitionerWithRows(enforceNumOutputPartitions); + case GLOBAL_SORT: + return new GlobalSortPartitionerWithRows(); + case PARTITION_SORT: + return new PartitionSortPartitionerWithRows(); + case PARTITION_PATH_REPARTITION: + return new PartitionPathRepartitionPartitionerWithRows(isTablePartitioned); + case PARTITION_PATH_REPARTITION_AND_SORT: + return new PartitionPathRepartitionAndSortPartitionerWithRows(isTablePartitioned); + default: + throw new UnsupportedOperationException("The bulk insert sort mode \"" + sortMode.name() + "\" is not supported."); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java index db73a9c3e7e70..66c3bdddcb1ef 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java @@ -23,6 +23,8 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.execution.SparkLazyInsertIterable; +import org.apache.hudi.io.WriteHandleFactory; +import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.function.Function2; @@ -40,21 +42,27 @@ public class BulkInsertMapFunction private boolean areRecordsSorted; private HoodieWriteConfig config; private HoodieTable 
hoodieTable; - private List fileIDPrefixes; + private boolean useWriterSchema; + private BulkInsertPartitioner partitioner; + private WriteHandleFactory writeHandleFactory; public BulkInsertMapFunction(String instantTime, boolean areRecordsSorted, HoodieWriteConfig config, HoodieTable hoodieTable, - List fileIDPrefixes) { + boolean useWriterSchema, BulkInsertPartitioner partitioner, + WriteHandleFactory writeHandleFactory) { this.instantTime = instantTime; this.areRecordsSorted = areRecordsSorted; this.config = config; this.hoodieTable = hoodieTable; - this.fileIDPrefixes = fileIDPrefixes; + this.useWriterSchema = useWriterSchema; + this.writeHandleFactory = writeHandleFactory; + this.partitioner = partitioner; } @Override public Iterator> call(Integer partition, Iterator> recordItr) { return new SparkLazyInsertIterable<>(recordItr, areRecordsSorted, config, instantTime, hoodieTable, - fileIDPrefixes.get(partition), hoodieTable.getTaskContextSupplier()); + partitioner.getFileIdPfx(partition), hoodieTable.getTaskContextSupplier(), useWriterSchema, + (WriteHandleFactory) partitioner.getWriteHandleFactory(partition).orElse(this.writeHandleFactory)); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/GlobalSortPartitionerWithRows.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/GlobalSortPartitionerWithRows.java new file mode 100644 index 0000000000000..24bcc0aff0df3 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/GlobalSortPartitionerWithRows.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.functions; + +/** + * A built-in partitioner that does global sorting for the input Rows across partitions after repartition for bulk insert operation, corresponding to the {@code BulkInsertSortMode.GLOBAL_SORT} mode. + */ +public class GlobalSortPartitionerWithRows implements BulkInsertPartitioner> { + + @Override + public Dataset repartitionRecords(Dataset rows, int outputSparkPartitions) { + // Now, sort the records and line them up nicely for loading. + // Let's use "partitionPath + key" as the sort key. 
+ return rows.sort(functions.col(HoodieRecord.PARTITION_PATH_METADATA_FIELD), functions.col(HoodieRecord.RECORD_KEY_METADATA_FIELD)) + .coalesce(outputSparkPartitions); + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/NonSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/NonSortPartitioner.java index 19c90ecb1a012..67cd599731c13 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/NonSortPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/NonSortPartitioner.java @@ -25,18 +25,45 @@ import org.apache.spark.api.java.JavaRDD; /** - * A built-in partitioner that only does coalesce for input records for bulk insert operation, - * corresponding to the {@code BulkInsertSortMode.NONE} mode. + * A built-in partitioner that avoids expensive sorting for the input records for bulk insert + * operation, by doing either of the following: + *

    + * - If enforcing the outputSparkPartitions, only does coalesce for input records; + *

    + * - Otherwise, returns input records as is. + *

    + * Corresponds to the {@code BulkInsertSortMode.NONE} mode. * * @param HoodieRecordPayload type */ public class NonSortPartitioner implements BulkInsertPartitioner>> { + private final boolean enforceNumOutputPartitions; + + /** + * Default constructor without enforcing the number of output partitions. + */ + public NonSortPartitioner() { + this(false); + } + + /** + * Constructor with `enforceNumOutputPartitions` config. + * + * @param enforceNumOutputPartitions Whether to enforce the number of output partitions. + */ + public NonSortPartitioner(boolean enforceNumOutputPartitions) { + this.enforceNumOutputPartitions = enforceNumOutputPartitions; + } + @Override public JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions) { - return records.coalesce(outputSparkPartitions); + if (enforceNumOutputPartitions) { + return records.coalesce(outputSparkPartitions); + } + return records; } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/NonSortPartitionerWithRows.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/NonSortPartitionerWithRows.java new file mode 100644 index 0000000000000..10ec275064ffe --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/NonSortPartitionerWithRows.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +/** + * A built-in partitioner that avoids expensive sorting for the input Rows for bulk insert + * operation, by doing either of the following: + *

    + * - If enforcing the outputSparkPartitions, only does coalesce for input Rows; + *

    + * - Otherwise, returns input Rows as is. + *

    + * Corresponds to the {@code BulkInsertSortMode.NONE} mode. + */ +public class NonSortPartitionerWithRows implements BulkInsertPartitioner> { + + private final boolean enforceNumOutputPartitions; + + /** + * Default constructor without enforcing the number of output partitions. + */ + public NonSortPartitionerWithRows() { + this(false); + } + + /** + * Constructor with `enforceNumOutputPartitions` config. + * + * @param enforceNumOutputPartitions Whether to enforce the number of output partitions. + */ + public NonSortPartitionerWithRows(boolean enforceNumOutputPartitions) { + this.enforceNumOutputPartitions = enforceNumOutputPartitions; + } + + @Override + public Dataset repartitionRecords(Dataset rows, int outputSparkPartitions) { + if (enforceNumOutputPartitions) { + return rows.coalesce(outputSparkPartitions); + } + return rows; + } + + @Override + public boolean arePartitionRecordsSorted() { + return false; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRDDPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRDDPartitioner.java new file mode 100644 index 0000000000000..eb835b38c3498 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRDDPartitioner.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.function.SerializableFunctionUnchecked; + +import org.apache.spark.Partitioner; + +import java.io.Serializable; +import java.util.Objects; + +/** + * A Spark RDD partitioner implementation that determines the Spark partition + * based on the table partition path, generating targeted number of Spark partitions. 
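To make the hashing scheme of the partitioner described in the Javadoc above concrete, here is a small, self-contained sketch (hypothetical class name, not the PR's code) that routes keys to a bounded number of Spark partitions by hashing a partition-path string, the same hash-modulo scheme PartitionPathRDDPartitioner uses below.

```java
import java.util.Objects;

import org.apache.spark.Partitioner;

// Illustrative sketch only: assign a Spark partition by hashing the Hudi
// partition path and taking it modulo the target partition count.
public class PartitionPathHashSketch extends Partitioner {

  private final int numPartitions;

  public PartitionPathHashSketch(int numPartitions) {
    this.numPartitions = numPartitions;
  }

  @Override
  public int numPartitions() {
    return numPartitions;
  }

  @Override
  public int getPartition(Object key) {
    // All records sharing a partition path land in the same Spark partition;
    // e.g. "2021/03/15" and "2021/03/16" usually land in different ones.
    return Math.abs(Objects.hash((String) key)) % numPartitions;
  }
}
```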
+ */ +public class PartitionPathRDDPartitioner extends Partitioner implements Serializable { + private final SerializableFunctionUnchecked partitionPathExtractor; + private final int numPartitions; + + PartitionPathRDDPartitioner(SerializableFunctionUnchecked partitionPathExtractor, int numPartitions) { + this.partitionPathExtractor = partitionPathExtractor; + this.numPartitions = numPartitions; + } + + @Override + public int numPartitions() { + return numPartitions; + } + + @SuppressWarnings("unchecked") + @Override + public int getPartition(Object o) { + return Math.abs(Objects.hash(partitionPathExtractor.apply(o))) % numPartitions; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitioner.java new file mode 100644 index 0000000000000..e8e1e2072f5ff --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitioner.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.api.java.JavaRDD; + +import scala.Tuple2; + +/** + * A built-in partitioner that does the following for input records for bulk insert operation + *

    + * - For physically partitioned table, repartition the input records based on the partition path, + * and sort records within Spark partitions, limiting the shuffle parallelism to specified + * `outputSparkPartitions` + *

    + * - For physically non-partitioned table, simply does coalesce for the input records with + * `outputSparkPartitions` + *

    + * Corresponding to the {@code BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT} mode. + * + * @param HoodieRecordPayload type + */ +public class PartitionPathRepartitionAndSortPartitioner + implements BulkInsertPartitioner>> { + + private final boolean isTablePartitioned; + + public PartitionPathRepartitionAndSortPartitioner(boolean isTablePartitioned) { + this.isTablePartitioned = isTablePartitioned; + } + + @Override + public JavaRDD> repartitionRecords(JavaRDD> records, + int outputSparkPartitions) { + if (isTablePartitioned) { + PartitionPathRDDPartitioner partitioner = new PartitionPathRDDPartitioner( + (partitionPath) -> (String) partitionPath, outputSparkPartitions); + return records + .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record)) + .repartitionAndSortWithinPartitions(partitioner) + .values(); + } + return records.coalesce(outputSparkPartitions); + } + + @Override + public boolean arePartitionRecordsSorted() { + return isTablePartitioned; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitionerWithRows.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitionerWithRows.java new file mode 100644 index 0000000000000..cf3ff1acfa40b --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitionerWithRows.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +/** + * A built-in partitioner that does the following for input rows for bulk insert operation + *

    + * - For physically partitioned table, repartition the input rows based on the partition path, + * and sort rows within Spark partitions, limiting the shuffle parallelism to specified + * `outputSparkPartitions` + *

    + * - For physically non-partitioned table, simply does coalesce for the input rows with + * `outputSparkPartitions` + *

    + * Corresponding to the {@code BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT} mode. + */ +public class PartitionPathRepartitionAndSortPartitionerWithRows implements BulkInsertPartitioner> { + + private final boolean isTablePartitioned; + + public PartitionPathRepartitionAndSortPartitionerWithRows(boolean isTablePartitioned) { + this.isTablePartitioned = isTablePartitioned; + } + + @Override + public Dataset repartitionRecords(Dataset rows, int outputSparkPartitions) { + if (isTablePartitioned) { + return rows.repartition(outputSparkPartitions, new Column(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) + .sortWithinPartitions(new Column(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); + } + return rows.coalesce(outputSparkPartitions); + } + + @Override + public boolean arePartitionRecordsSorted() { + return isTablePartitioned; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionPartitioner.java new file mode 100644 index 0000000000000..5931b565757f5 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionPartitioner.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.api.java.JavaRDD; + +import scala.Tuple2; + +/** + * A built-in partitioner that does the following for input records for bulk insert operation + *

    + * - For physically partitioned table, repartition the input records based on the partition path, + * limiting the shuffle parallelism to specified `outputSparkPartitions` + *

    + * - For physically non-partitioned table, simply does coalesce for the input records with + * `outputSparkPartitions` + *

    + * Corresponding to the {@code BulkInsertSortMode.PARTITION_PATH_REPARTITION} mode. + * + * @param HoodieRecordPayload type + */ +public class PartitionPathRepartitionPartitioner + implements BulkInsertPartitioner>> { + + private final boolean isTablePartitioned; + + public PartitionPathRepartitionPartitioner(boolean isTablePartitioned) { + this.isTablePartitioned = isTablePartitioned; + } + + @Override + public JavaRDD> repartitionRecords(JavaRDD> records, + int outputSparkPartitions) { + if (isTablePartitioned) { + PartitionPathRDDPartitioner partitioner = new PartitionPathRDDPartitioner( + (partitionPath) -> (String) partitionPath, outputSparkPartitions); + return records + .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record)) + .partitionBy(partitioner) + .values(); + } + return records.coalesce(outputSparkPartitions); + } + + @Override + public boolean arePartitionRecordsSorted() { + return false; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionPartitionerWithRows.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionPartitionerWithRows.java new file mode 100644 index 0000000000000..62d9edbca844f --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionPartitionerWithRows.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +/** + * A built-in partitioner that does the following for input rows for bulk insert operation + *

    + * - For physically partitioned table, repartition the input rows based on the partition path, + * limiting the shuffle parallelism to specified `outputSparkPartitions` + *

    + * - For physically non-partitioned table, simply does coalesce for the input rows with + * `outputSparkPartitions` + *

    + * Corresponding to the {@code BulkInsertSortMode.PARTITION_PATH_REPARTITION} mode. + */ +public class PartitionPathRepartitionPartitionerWithRows implements BulkInsertPartitioner> { + + private final boolean isTablePartitioned; + + public PartitionPathRepartitionPartitionerWithRows(boolean isTablePartitioned) { + this.isTablePartitioned = isTablePartitioned; + } + + @Override + public Dataset repartitionRecords(Dataset rows, int outputSparkPartitions) { + if (isTablePartitioned) { + return rows.repartition(outputSparkPartitions, new Column(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); + } + return rows.coalesce(outputSparkPartitions); + } + + @Override + public boolean arePartitionRecordsSorted() { + return false; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionSortPartitionerWithRows.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionSortPartitionerWithRows.java new file mode 100644 index 0000000000000..b669c338f8668 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionSortPartitionerWithRows.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +/** + * A built-in partitioner that does local sorting for each spark partitions after coalesce for bulk insert operation, corresponding to the {@code BulkInsertSortMode.PARTITION_SORT} mode. + */ +public class PartitionSortPartitionerWithRows implements BulkInsertPartitioner> { + + @Override + public Dataset repartitionRecords(Dataset rows, int outputSparkPartitions) { + return rows.coalesce(outputSparkPartitions).sortWithinPartitions(HoodieRecord.PARTITION_PATH_METADATA_FIELD, HoodieRecord.RECORD_KEY_METADATA_FIELD); + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDCustomColumnsSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDCustomColumnsSortPartitioner.java new file mode 100644 index 0000000000000..dc80498c7a964 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDCustomColumnsSortPartitioner.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.avro.Schema; +import org.apache.spark.api.java.JavaRDD; + +import java.util.Arrays; + +/** + * A partitioner that does sorting based on specified column values for each RDD partition. + * + * @param HoodieRecordPayload type + */ +public class RDDCustomColumnsSortPartitioner + implements BulkInsertPartitioner>> { + + private final String[] sortColumnNames; + private final SerializableSchema serializableSchema; + private final boolean consistentLogicalTimestampEnabled; + + public RDDCustomColumnsSortPartitioner(HoodieWriteConfig config) { + this.serializableSchema = new SerializableSchema(new Schema.Parser().parse(config.getSchema())); + this.sortColumnNames = getSortColumnName(config); + this.consistentLogicalTimestampEnabled = config.isConsistentLogicalTimestampEnabled(); + } + + public RDDCustomColumnsSortPartitioner(String[] columnNames, Schema schema, boolean consistentLogicalTimestampEnabled) { + this.sortColumnNames = columnNames; + this.serializableSchema = new SerializableSchema(schema); + this.consistentLogicalTimestampEnabled = consistentLogicalTimestampEnabled; + } + + @Override + public JavaRDD> repartitionRecords(JavaRDD> records, + int outputSparkPartitions) { + final String[] sortColumns = this.sortColumnNames; + final SerializableSchema schema = this.serializableSchema; + final boolean consistentLogicalTimestampEnabled = this.consistentLogicalTimestampEnabled; + return records.sortBy( + record -> { + Object recordValue = HoodieAvroUtils.getRecordColumnValues(record, sortColumns, schema, consistentLogicalTimestampEnabled); + // null values are replaced with empty string for null_first order + if (recordValue == null) { + return StringUtils.EMPTY_STRING; + } else { + return StringUtils.objToString(recordValue); + } + }, + true, outputSparkPartitions); + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } + + private String[] getSortColumnName(HoodieWriteConfig config) { + return Arrays.stream(config.getUserDefinedBulkInsertPartitionerSortColumns().split(",")) + .map(String::trim).toArray(String[]::new); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java new file mode 100644 index 0000000000000..2ab9107fa54b5 --- /dev/null +++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.RewriteAvroPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieClusteringConfig; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +/** + * A partitioner that does spatial curve optimization sorting based on specified column values for each RDD partition. + * support z-curve optimization, hilbert will come soon. 
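Since the Javadoc above only names the technique, a tiny conceptual illustration of the space-filling-curve idea may help: a Z-order (Morton) value interleaves the bits of the sort columns so that rows close together in the multi-column space tend to stay close in the one-dimensional sort order. This is a sketch of the idea only, not Hudi's SpaceCurveSortingHelper implementation.

```java
// Conceptual sketch of a Z-order (Morton) index for two non-negative int columns;
// not Hudi's implementation, just the idea behind "spatial curve" sorting.
public final class ZOrderSketch {

  /** Interleave the lower 32 bits of x and y: x bits on even positions, y bits on odd. */
  public static long interleave(int x, int y) {
    long z = 0L;
    for (int i = 0; i < 32; i++) {
      z |= ((long) (x >>> i) & 1L) << (2 * i);
      z |= ((long) (y >>> i) & 1L) << (2 * i + 1);
    }
    return z;
  }

  public static void main(String[] args) {
    // Rows are sorted by this single value instead of lexicographically by (colA, colB),
    // which preserves locality in both columns and improves file-level min/max pruning.
    System.out.println(interleave(3, 5)); // prints 39 (binary 100111)
  }
}
```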
+ * @param HoodieRecordPayload type + */ +public class RDDSpatialCurveSortPartitioner + extends SpatialCurveSortPartitionerBase>> { + + private final transient HoodieSparkEngineContext sparkEngineContext; + private final SerializableSchema schema; + + public RDDSpatialCurveSortPartitioner(HoodieSparkEngineContext sparkEngineContext, + String[] orderByColumns, + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, + HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType, + Schema schema) { + super(orderByColumns, layoutOptStrategy, curveCompositionStrategyType); + this.sparkEngineContext = sparkEngineContext; + this.schema = new SerializableSchema(schema); + } + + @Override + public JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions) { + JavaRDD genericRecordsRDD = + records.map(f -> (GenericRecord) f.getData().getInsertValue(schema.get()).get()); + + Dataset sourceDataset = + AvroConversionUtils.createDataFrame( + genericRecordsRDD.rdd(), + schema.toString(), + sparkEngineContext.getSqlContext().sparkSession() + ); + + Dataset sortedDataset = reorder(sourceDataset, outputSparkPartitions); + + return HoodieSparkUtils.createRdd(sortedDataset, schema.get().getName(), schema.get().getNamespace(), false, Option.empty()) + .toJavaRDD() + .map(record -> { + String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieKey hoodieKey = new HoodieKey(key, partition); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, new RewriteAvroPayload(record)); + return hoodieRecord; + }); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RowCustomColumnsSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RowCustomColumnsSortPartitioner.java new file mode 100644 index 0000000000000..ceeb2b3fe8f00 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RowCustomColumnsSortPartitioner.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import java.util.Arrays; + +/** + * A partitioner that does sorting based on specified column values for each spark partitions. 
+ */ +public class RowCustomColumnsSortPartitioner implements BulkInsertPartitioner> { + + private final String[] sortColumnNames; + + public RowCustomColumnsSortPartitioner(HoodieWriteConfig config) { + this.sortColumnNames = getSortColumnName(config); + } + + public RowCustomColumnsSortPartitioner(String[] columnNames) { + this.sortColumnNames = columnNames; + } + + @Override + public Dataset repartitionRecords(Dataset records, int outputSparkPartitions) { + final String[] sortColumns = this.sortColumnNames; + return records.coalesce(outputSparkPartitions) + .sortWithinPartitions(HoodieRecord.PARTITION_PATH_METADATA_FIELD, sortColumns); + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } + + private String[] getSortColumnName(HoodieWriteConfig config) { + return Arrays.stream(config.getUserDefinedBulkInsertPartitionerSortColumns().split(",")) + .map(String::trim).toArray(String[]::new); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RowSpatialCurveSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RowSpatialCurveSortPartitioner.java new file mode 100644 index 0000000000000..1217477c9d817 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RowSpatialCurveSortPartitioner.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +public class RowSpatialCurveSortPartitioner extends SpatialCurveSortPartitionerBase> { + + public RowSpatialCurveSortPartitioner(HoodieWriteConfig config) { + super(config.getClusteringSortColumns(), config.getLayoutOptimizationStrategy(), config.getLayoutOptimizationCurveBuildMethod()); + } + + public RowSpatialCurveSortPartitioner(String[] orderByColumns, + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, + HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType) { + super(orderByColumns, layoutOptStrategy, curveCompositionStrategyType); + } + + @Override + public Dataset repartitionRecords(Dataset records, int outputPartitions) { + return reorder(records, outputPartitions); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/SpatialCurveSortPartitionerBase.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/SpatialCurveSortPartitionerBase.java new file mode 100644 index 0000000000000..96048f2782bc1 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/SpatialCurveSortPartitionerBase.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.sort.SpaceCurveSortingHelper; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import java.util.Arrays; +import java.util.List; + +public abstract class SpatialCurveSortPartitionerBase implements BulkInsertPartitioner { + + private final String[] orderByColumns; + private final HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy; + private final HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType; + + public SpatialCurveSortPartitionerBase(String orderByColumns, + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, + HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType) { + if (orderByColumns != null) { + this.orderByColumns = Arrays.stream(orderByColumns.split(",")) + .map(String::trim).toArray(String[]::new); + } else { + throw new IllegalArgumentException("The config " + + HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key() + " must be provided"); + } + this.layoutOptStrategy = layoutOptStrategy; + this.curveCompositionStrategyType = curveCompositionStrategyType; + } + + public SpatialCurveSortPartitionerBase(String[] orderByColumns, + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, + HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType) { + this.orderByColumns = orderByColumns; + this.layoutOptStrategy = layoutOptStrategy; + this.curveCompositionStrategyType = curveCompositionStrategyType; + } + + /** + * Mapping specified multi need-to-order columns to one dimension while preserving data locality. + */ + protected Dataset reorder(Dataset dataset, int numOutputGroups) { + if (orderByColumns.length == 0) { + // No-op + return dataset; + } + + List orderedCols = Arrays.asList(orderByColumns); + + switch (curveCompositionStrategyType) { + case DIRECT: + return SpaceCurveSortingHelper.orderDataFrameByMappingValues(dataset, layoutOptStrategy, orderedCols, numOutputGroups); + case SAMPLE: + return SpaceCurveSortingHelper.orderDataFrameBySamplingValues(dataset, layoutOptStrategy, orderedCols, numOutputGroups); + default: + throw new UnsupportedOperationException(String.format("Unsupported space-curve curve building strategy (%s)", curveCompositionStrategyType)); + } + } + + @Override + public boolean arePartitionRecordsSorted() { + return true; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java index dd73bf2e76568..aece86a3878ee 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java @@ -7,13 +7,14 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ package org.apache.hudi.index; @@ -21,65 +22,53 @@ import org.apache.hudi.ApiMaturityLevel; import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; -import org.apache.hudi.index.bloom.SparkHoodieGlobalBloomIndex; -import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; -import org.apache.hudi.index.simple.SparkHoodieGlobalSimpleIndex; -import org.apache.hudi.index.simple.SparkHoodieSimpleIndex; import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.JavaRDD; @SuppressWarnings("checkstyle:LineLength") -public abstract class SparkHoodieIndex extends HoodieIndex>, JavaRDD, JavaRDD> { +public abstract class SparkHoodieIndex> + extends HoodieIndex>, JavaRDD> { protected SparkHoodieIndex(HoodieWriteConfig config) { super(config); } - public static SparkHoodieIndex createIndex(HoodieWriteConfig config) { - // first use index class config to create index. 
- if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { - Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); - if (!(instance instanceof HoodieIndex)) { - throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); - } - return (SparkHoodieIndex) instance; - } - switch (config.getIndexType()) { - case HBASE: - return new SparkHoodieHBaseIndex<>(config); - case INMEMORY: - return new SparkInMemoryHashIndex(config); - case BLOOM: - return new SparkHoodieBloomIndex<>(config); - case GLOBAL_BLOOM: - return new SparkHoodieGlobalBloomIndex<>(config); - case SIMPLE: - return new SparkHoodieSimpleIndex(config); - case GLOBAL_SIMPLE: - return new SparkHoodieGlobalSimpleIndex(config); - default: - throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); - } - } - @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract JavaRDD> tagLocation(JavaRDD> records, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieJavaRDD.of(tagLocation( + HoodieJavaRDD.getJavaRDD(records.map(record -> (HoodieRecord) record)), context, hoodieTable) + .map(r -> (HoodieRecord) r)); + } + + @Override + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public HoodieData updateLocation( + HoodieData writeStatuses, HoodieEngineContext context, + HoodieTable hoodieTable) throws HoodieIndexException { + return HoodieJavaRDD.of(updateLocation(HoodieJavaRDD.getJavaRDD(writeStatuses), context, hoodieTable)); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java new file mode 100644 index 0000000000000..4525490c8d168 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.index; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex; +import org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper; +import org.apache.hudi.index.bucket.HoodieSimpleBucketIndex; +import org.apache.hudi.index.bucket.HoodieSparkConsistentBucketIndex; +import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; +import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex; +import org.apache.hudi.index.simple.HoodieGlobalSimpleIndex; +import org.apache.hudi.index.simple.HoodieSimpleIndex; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; + +import java.io.IOException; + +/** + * A factory to generate Spark {@link HoodieIndex}. + */ +public final class SparkHoodieIndexFactory { + public static HoodieIndex createIndex(HoodieWriteConfig config) { + // first use index class config to create index. + if (!StringUtils.isNullOrEmpty(config.getIndexClass())) { + Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config); + if (!(instance instanceof HoodieIndex)) { + throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex"); + } + return (HoodieIndex) instance; + } + switch (config.getIndexType()) { + case HBASE: + return new SparkHoodieHBaseIndex(config); + case INMEMORY: + return new HoodieInMemoryHashIndex(config); + case BLOOM: + return new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + case GLOBAL_BLOOM: + return new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + case SIMPLE: + return new HoodieSimpleIndex(config, getKeyGeneratorForSimpleIndex(config)); + case GLOBAL_SIMPLE: + return new HoodieGlobalSimpleIndex(config, getKeyGeneratorForSimpleIndex(config)); + case BUCKET: + switch (config.getBucketIndexEngineType()) { + case SIMPLE: + return new HoodieSimpleBucketIndex(config); + case CONSISTENT_HASHING: + return new HoodieSparkConsistentBucketIndex(config); + default: + throw new HoodieIndexException("Unknown bucket index engine type: " + config.getBucketIndexEngineType()); + } + default: + throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); + } + } + + /** + * Whether index is global or not. + * @param config HoodieWriteConfig to use. + * @return {@code true} if index is a global one. else {@code false}. + */ + public static boolean isGlobalIndex(HoodieWriteConfig config) { + switch (config.getIndexType()) { + case HBASE: + return true; + case INMEMORY: + return true; + case BLOOM: + return false; + case GLOBAL_BLOOM: + return true; + case SIMPLE: + return false; + case GLOBAL_SIMPLE: + return true; + case BUCKET: + return false; + default: + return createIndex(config).isGlobal(); + } + } + + private static Option getKeyGeneratorForSimpleIndex(HoodieWriteConfig config) { + try { + return config.populateMetaFields() ? 
Option.empty() + : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))); + } catch (IOException e) { + throw new HoodieIOException("KeyGenerator instantiation failed ", e); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkInMemoryHashIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkInMemoryHashIndex.java deleted file mode 100644 index 55ce8d2cc90c0..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkInMemoryHashIndex.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.index; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function2; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; - -/** - * Hoodie Index implementation backed by an in-memory Hash map. - *
<p>
    - * ONLY USE FOR LOCAL TESTING - */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkInMemoryHashIndex extends SparkHoodieIndex { - - private static ConcurrentMap recordLocationMap; - - public SparkInMemoryHashIndex(HoodieWriteConfig config) { - super(config); - synchronized (SparkInMemoryHashIndex.class) { - if (recordLocationMap == null) { - recordLocationMap = new ConcurrentHashMap<>(); - } - } - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return writeStatusRDD.map(writeStatus -> { - for (HoodieRecord record : writeStatus.getWrittenRecords()) { - if (!writeStatus.isErrored(record.getKey())) { - HoodieKey key = record.getKey(); - Option newLocation = record.getNewLocation(); - if (newLocation.isPresent()) { - recordLocationMap.put(key, newLocation.get()); - } else { - // Delete existing index for a deleted record - recordLocationMap.remove(key); - } - } - } - return writeStatus; - }); - } - - @Override - public boolean rollbackCommit(String instantTime) { - return true; - } - - /** - * Only looks up by recordKey. - */ - @Override - public boolean isGlobal() { - return true; - } - - /** - * Mapping is available in HBase already. - */ - @Override - public boolean canIndexLogFiles() { - return true; - } - - /** - * Index needs to be explicitly updated after storage write. - */ - @Override - public boolean isImplicitWithStorage() { - return false; - } - - /** - * Function that tags each HoodieRecord with an existing location, if known. - */ - class LocationTagFunction implements Function2>, Iterator>> { - - @Override - public Iterator> call(Integer partitionNum, Iterator> hoodieRecordIterator) { - List> taggedRecords = new ArrayList<>(); - while (hoodieRecordIterator.hasNext()) { - HoodieRecord rec = hoodieRecordIterator.next(); - if (recordLocationMap.containsKey(rec.getKey())) { - rec.unseal(); - rec.setCurrentLocation(recordLocationMap.get(rec.getKey())); - rec.seal(); - } - taggedRecords.add(rec); - } - return taggedRecords.iterator(); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java index a8fac5fa7753d..e19a429ea7234 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java @@ -25,7 +25,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.io.HoodieKeyLookupHandle; -import org.apache.hudi.io.HoodieKeyLookupHandle.KeyLookupResult; +import org.apache.hudi.io.HoodieKeyLookupResult; import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.function.Function2; @@ -40,7 +40,7 @@ * Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the actual files. 
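 * <p>
 * Illustrative usage (a sketch only, not part of this patch; it mirrors how the Spark bloom index helper in this
 * change drives the function, and the variable name fileComparisonsRDD is assumed for illustration):
 * <pre>
 *   JavaRDD<List<HoodieKeyLookupResult>> keyLookupResultRDD =
 *       fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true);
 * </pre>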
*/ public class HoodieBloomIndexCheckFunction - implements Function2>, Iterator>> { + implements Function2>, Iterator>> { private final HoodieTable hoodieTable; @@ -52,12 +52,12 @@ public HoodieBloomIndexCheckFunction(HoodieTable hoodieTable, HoodieWriteConfig } @Override - public Iterator> call(Integer partition, - Iterator> fileParitionRecordKeyTripletItr) { - return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr); + public Iterator> call(Integer partition, + Iterator> filePartitionRecordKeyTripletItr) { + return new LazyKeyCheckIterator(filePartitionRecordKeyTripletItr); } - class LazyKeyCheckIterator extends LazyIterableIterator, List> { + class LazyKeyCheckIterator extends LazyIterableIterator, List> { private HoodieKeyLookupHandle keyLookupHandle; @@ -70,9 +70,9 @@ protected void start() { } @Override - protected List computeNext() { + protected List computeNext() { - List ret = new ArrayList<>(); + List ret = new ArrayList<>(); try { // process one file in each go. while (inputItr.hasNext()) { @@ -88,7 +88,7 @@ protected List computeNext() { } // if continue on current file - if (keyLookupHandle.getPartitionPathFilePair().equals(partitionPathFilePair)) { + if (keyLookupHandle.getPartitionPathFileIDPair().equals(partitionPathFilePair)) { keyLookupHandle.addKey(recordKey); } else { // do the actual checking of file & break out diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieMetadataBloomIndexCheckFunction.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieMetadataBloomIndexCheckFunction.java new file mode 100644 index 0000000000000..8a2958eab9da8 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieMetadataBloomIndexCheckFunction.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.client.utils.LazyIterableIterator; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.HoodieKeyLookupResult; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.function.Function2; +import scala.Tuple2; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Spark Function2 implementation for checking bloom filters for the + * requested keys from the metadata table index. The bloom filter + * checking for keys and the actual file verification for the + * candidate keys is done in an iterative fashion. In each iteration, + * bloom filters are requested for a batch of partition files and the + * keys are checked against them. + */ +public class HoodieMetadataBloomIndexCheckFunction implements + Function2>, Iterator>> { + + private static final Logger LOG = LogManager.getLogger(HoodieMetadataBloomIndexCheckFunction.class); + + // Assuming each file bloom filter takes up 512K, sizing the max file count + // per batch so that the total fetched bloom filters would not cross 128 MB. 
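// Worked sizing example (illustrative note, not part of this patch): under the assumed ~512 KB per bloom
// filter, a full batch of 256 files fetches at most 256 * 512 KB = 128 MB of bloom filters in one call to
// the metadata table.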
+ private static final long BLOOM_FILTER_CHECK_MAX_FILE_COUNT_PER_BATCH = 256; + private final HoodieTable hoodieTable; + + public HoodieMetadataBloomIndexCheckFunction(HoodieTable hoodieTable) { + this.hoodieTable = hoodieTable; + } + + @Override + public Iterator> call(Integer integer, Iterator> tuple2Iterator) throws Exception { + return new BloomIndexLazyKeyCheckIterator(tuple2Iterator); + } + + private class BloomIndexLazyKeyCheckIterator extends LazyIterableIterator, List> { + public BloomIndexLazyKeyCheckIterator(Iterator> tuple2Iterator) { + super(tuple2Iterator); + } + + @Override + protected void start() { + } + + @Override + protected List computeNext() { + // Partition path and file name pair to list of keys + final Map, List> fileToKeysMap = new HashMap<>(); + final Map fileIDBaseFileMap = new HashMap<>(); + final List resultList = new ArrayList<>(); + + while (inputItr.hasNext()) { + Tuple2 entry = inputItr.next(); + final String partitionPath = entry._2.getPartitionPath(); + final String fileId = entry._1; + if (!fileIDBaseFileMap.containsKey(fileId)) { + Option baseFile = hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId); + if (!baseFile.isPresent()) { + throw new HoodieIndexException("Failed to find the base file for partition: " + partitionPath + + ", fileId: " + fileId); + } + fileIDBaseFileMap.put(fileId, baseFile.get()); + } + fileToKeysMap.computeIfAbsent(Pair.of(partitionPath, fileIDBaseFileMap.get(fileId).getFileName()), + k -> new ArrayList<>()).add(entry._2); + if (fileToKeysMap.size() > BLOOM_FILTER_CHECK_MAX_FILE_COUNT_PER_BATCH) { + break; + } + } + if (fileToKeysMap.isEmpty()) { + return Collections.emptyList(); + } + + List> partitionNameFileNameList = new ArrayList<>(fileToKeysMap.keySet()); + Map, BloomFilter> fileToBloomFilterMap = + hoodieTable.getMetadataTable().getBloomFilters(partitionNameFileNameList); + + final AtomicInteger totalKeys = new AtomicInteger(0); + fileToKeysMap.forEach((partitionPathFileNamePair, hoodieKeyList) -> { + final String partitionPath = partitionPathFileNamePair.getLeft(); + final String fileName = partitionPathFileNamePair.getRight(); + final String fileId = FSUtils.getFileId(fileName); + ValidationUtils.checkState(!fileId.isEmpty()); + + if (!fileToBloomFilterMap.containsKey(partitionPathFileNamePair)) { + throw new HoodieIndexException("Failed to get the bloom filter for " + partitionPathFileNamePair); + } + final BloomFilter fileBloomFilter = fileToBloomFilterMap.get(partitionPathFileNamePair); + + List candidateRecordKeys = new ArrayList<>(); + hoodieKeyList.forEach(hoodieKey -> { + totalKeys.incrementAndGet(); + if (fileBloomFilter.mightContain(hoodieKey.getRecordKey())) { + candidateRecordKeys.add(hoodieKey.getRecordKey()); + } + }); + + final HoodieBaseFile dataFile = fileIDBaseFileMap.get(fileId); + List matchingKeys = + HoodieIndexUtils.filterKeysFromFile(new Path(dataFile.getPath()), candidateRecordKeys, + hoodieTable.getHadoopConf()); + LOG.debug( + String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", + hoodieKeyList.size(), candidateRecordKeys.size(), + candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size())); + + resultList.add(new HoodieKeyLookupResult(fileId, partitionPath, dataFile.getCommitTime(), matchingKeys)); + }); + return resultList; + } + + @Override + protected void end() { + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndex.java 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndex.java deleted file mode 100644 index 894b41b51c6bf..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndex.java +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.index.bloom; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.MetadataNotFoundException; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.index.SparkHoodieIndex; -import org.apache.hudi.io.HoodieRangeInfoHandle; -import org.apache.hudi.table.HoodieTable; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.Partitioner; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.storage.StorageLevel; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import scala.Tuple2; - -import static java.util.stream.Collectors.groupingBy; -import static java.util.stream.Collectors.mapping; -import static java.util.stream.Collectors.toList; -import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; - -/** - * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata. 
- */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkHoodieBloomIndex extends SparkHoodieIndex { - - private static final Logger LOG = LogManager.getLogger(SparkHoodieBloomIndex.class); - - public SparkHoodieBloomIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - - // Step 0: cache the input record RDD - if (config.getBloomIndexUseCaching()) { - recordRDD.persist(SparkMemoryUtils.getBloomIndexInputStorageLevel(config.getProps())); - } - - // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) - JavaPairRDD partitionRecordKeyPairRDD = - recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); - - // Lookup indexes for all the partition/recordkey pair - JavaPairRDD keyFilenamePairRDD = - lookupIndex(partitionRecordKeyPairRDD, context, hoodieTable); - - // Cache the result, for subsequent stages. - if (config.getBloomIndexUseCaching()) { - keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); - } - if (LOG.isDebugEnabled()) { - long totalTaggedRecords = keyFilenamePairRDD.count(); - LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); - } - - // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys - // Cost: 4 sec. - JavaRDD> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD); - - if (config.getBloomIndexUseCaching()) { - recordRDD.unpersist(); // unpersist the input Record RDD - keyFilenamePairRDD.unpersist(); - } - return taggedRecordRDD; - } - - /** - * Lookup the location for each record key and return the pair for all record keys already - * present and drop the record keys if not present. - */ - private JavaPairRDD lookupIndex( - JavaPairRDD partitionRecordKeyPairRDD, final HoodieEngineContext context, - final HoodieTable hoodieTable) { - // Obtain records per partition, in the incoming records - Map recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); - List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); - - // Step 2: Load all involved files as pairs - List> fileInfoList = - loadInvolvedFiles(affectedPartitionPathList, context, hoodieTable); - final Map> partitionToFileInfo = - fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); - - // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, - // that contains it. - Map comparisonsPerFileGroup = - computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); - int inputParallelism = partitionRecordKeyPairRDD.partitions().size(); - int joinParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); - LOG.info("InputParallelism: ${" + inputParallelism + "}, IndexParallelism: ${" - + config.getBloomIndexParallelism() + "}"); - return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism, hoodieTable, - comparisonsPerFileGroup); - } - - /** - * Compute the estimated number of bloom filter comparisons to be performed on each file group. 
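 * <p>
 * Worked example (illustrative only, with range pruning disabled): a partition that receives 1,000 incoming
 * records and currently has 3 base files contributes an estimate of 1,000 comparisons to each of those 3 file
 * groups, because every incoming record in the partition must be checked against every one of its files.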
- */ - private Map computeComparisonsPerFileGroup(final Map recordsPerPartition, - final Map> partitionToFileInfo, - JavaPairRDD partitionRecordKeyPairRDD) { - - Map fileToComparisons; - if (config.getBloomIndexPruneByRanges()) { - // we will just try exploding the input and then count to determine comparisons - // FIX(vc): Only do sampling here and extrapolate? - fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD) - .mapToPair(t -> t).countByKey(); - } else { - fileToComparisons = new HashMap<>(); - partitionToFileInfo.forEach((key, value) -> { - for (BloomIndexFileInfo fileInfo : value) { - // each file needs to be compared against all the records coming into the partition - fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(key)); - } - }); - } - return fileToComparisons; - } - - /** - * Load all involved files as pair RDD. - */ - List> loadInvolvedFiles(List partitions, final HoodieEngineContext context, - final HoodieTable hoodieTable) { - - // Obtain the latest data files from all the partitions. - List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() - .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) - .collect(toList()); - - if (config.getBloomIndexPruneByRanges()) { - // also obtain file ranges, if range pruning is enabled - context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); - return context.map(partitionPathFileIDList, pf -> { - try { - HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); - String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); - return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); - } catch (MetadataNotFoundException me) { - LOG.warn("Unable to find range metadata in file :" + pf); - return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); - } - }, Math.max(partitionPathFileIDList.size(), 1)); - } else { - return partitionPathFileIDList.stream() - .map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); - } - } - - @Override - public boolean rollbackCommit(String instantTime) { - // Nope, don't need to do anything. - return true; - } - - /** - * This is not global, since we depend on the partitionPath to do the lookup. - */ - @Override - public boolean isGlobal() { - return false; - } - - /** - * No indexes into log files yet. - */ - @Override - public boolean canIndexLogFiles() { - return false; - } - - /** - * Bloom filters are stored, into the same data files. - */ - @Override - public boolean isImplicitWithStorage() { - return true; - } - - /** - * For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be - * checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files - * to be compared gets cut down a lot from range pruning. - *
<p>
    - * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on - * recordKey ranges in the index info. - */ - JavaRDD> explodeRecordRDDWithFileComparisons( - final Map> partitionToFileIndexInfo, - JavaPairRDD partitionRecordKeyPairRDD) { - IndexFileFilter indexFileFilter = - config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo) - : new ListBasedIndexFileFilter(partitionToFileIndexInfo); - - return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> { - String recordKey = partitionRecordKeyPair._2(); - String partitionPath = partitionRecordKeyPair._1(); - - return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() - .map(partitionFileIdPair -> new Tuple2<>(partitionFileIdPair.getRight(), - new HoodieKey(recordKey, partitionPath))) - .collect(Collectors.toList()); - }).flatMap(List::iterator); - } - - /** - * Find out pair. All workload grouped by file-level. - *
<p>
    - * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD - * partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey - *
<p>
    - * Make sure the parallelism is atleast the groupby parallelism for tagging location - */ - JavaPairRDD findMatchingFilesForRecordKeys( - final Map> partitionToFileIndexInfo, - JavaPairRDD partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable, - Map fileGroupToComparisons) { - JavaRDD> fileComparisonsRDD = - explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD); - - if (config.useBloomIndexBucketizedChecking()) { - Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons, - config.getBloomIndexKeysPerBucket()); - - fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) - .repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2); - } else { - fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism); - } - - return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true) - .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0) - .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream() - .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), - new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()))) - .collect(Collectors.toList()).iterator()); - } - - - /** - * Tag the back to the original HoodieRecord RDD. - */ - protected JavaRDD> tagLocationBacktoRecords( - JavaPairRDD keyFilenamePairRDD, JavaRDD> recordRDD) { - JavaPairRDD> keyRecordPairRDD = - recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record)); - // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), - // so we do left outer join. - return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values() - .map(v1 -> HoodieIndexUtils.getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull()))); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return writeStatusRDD; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java new file mode 100644 index 0000000000000..5736024dc2455 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodiePairData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.io.HoodieKeyLookupResult; +import org.apache.hudi.table.HoodieTable; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.Partitioner; +import org.apache.spark.api.java.JavaRDD; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import scala.Tuple2; + +import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS; + +/** + * Helper for {@link HoodieBloomIndex} containing Spark-specific logic. + */ +public class SparkHoodieBloomIndexHelper extends BaseHoodieBloomIndexHelper { + + private static final Logger LOG = LogManager.getLogger(SparkHoodieBloomIndexHelper.class); + + private static final SparkHoodieBloomIndexHelper SINGLETON_INSTANCE = + new SparkHoodieBloomIndexHelper(); + + private SparkHoodieBloomIndexHelper() { + } + + public static SparkHoodieBloomIndexHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodiePairData findMatchingFilesForRecordKeys( + HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, + HoodiePairData partitionRecordKeyPairs, + HoodieData> fileComparisonPairs, + Map> partitionToFileInfo, + Map recordsPerPartition) { + JavaRDD> fileComparisonsRDD = + HoodieJavaRDD.getJavaRDD(fileComparisonPairs) + .map(pair -> new Tuple2<>(pair.getLeft(), pair.getRight())); + + int inputParallelism = HoodieJavaPairRDD.getJavaPairRDD(partitionRecordKeyPairs).partitions().size(); + int joinParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); + LOG.info("InputParallelism: ${" + inputParallelism + "}, IndexParallelism: ${" + + config.getBloomIndexParallelism() + "}"); + + JavaRDD> keyLookupResultRDD; + if (config.getBloomIndexUseMetadata() + && hoodieTable.getMetaClient().getTableConfig().getMetadataPartitions() + .contains(BLOOM_FILTERS.getPartitionPath())) { + // Step 1: Sort by file id + JavaRDD> sortedFileIdAndKeyPairs = + fileComparisonsRDD.sortBy(Tuple2::_1, true, joinParallelism); + + // Step 2: Use bloom filter to filter and the actual log file to get the record location + keyLookupResultRDD = sortedFileIdAndKeyPairs.mapPartitionsWithIndex( + new HoodieMetadataBloomIndexCheckFunction(hoodieTable), true); + } else if (config.useBloomIndexBucketizedChecking()) { + Map comparisonsPerFileGroup = computeComparisonsPerFileGroup( + config, recordsPerPartition, partitionToFileInfo, fileComparisonsRDD, context); + Partitioner partitioner = new BucketizedBloomCheckPartitioner(joinParallelism, comparisonsPerFileGroup, + config.getBloomIndexKeysPerBucket()); + + keyLookupResultRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) + .repartitionAndSortWithinPartitions(partitioner) + .map(Tuple2::_2) + .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true); + } else { + keyLookupResultRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, joinParallelism) + .mapPartitionsWithIndex(new 
HoodieBloomIndexCheckFunction(hoodieTable, config), true); + } + + return HoodieJavaPairRDD.of(keyLookupResultRDD.flatMap(List::iterator) + .filter(lr -> lr.getMatchingRecordKeys().size() > 0) + .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream() + .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), + new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()))) + .collect(Collectors.toList()).iterator())); + } + + /** + * Compute the estimated number of bloom filter comparisons to be performed on each file group. + */ + private Map computeComparisonsPerFileGroup( + final HoodieWriteConfig config, + final Map recordsPerPartition, + final Map> partitionToFileInfo, + final JavaRDD> fileComparisonsRDD, + final HoodieEngineContext context) { + Map fileToComparisons; + if (config.getBloomIndexPruneByRanges()) { + // we will just try exploding the input and then count to determine comparisons + // FIX(vc): Only do sampling here and extrapolate? + context.setJobStatus(this.getClass().getSimpleName(), "Compute all comparisons needed between records and files: " + config.getTableName()); + fileToComparisons = fileComparisonsRDD.mapToPair(t -> t).countByKey(); + } else { + fileToComparisons = new HashMap<>(); + partitionToFileInfo.forEach((key, value) -> { + for (BloomIndexFileInfo fileInfo : value) { + // each file needs to be compared against all the records coming into the partition + fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(key)); + } + }); + } + return fileToComparisons; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieGlobalBloomIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieGlobalBloomIndex.java deleted file mode 100644 index 771c01ab875de..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieGlobalBloomIndex.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.index.bloom; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.EmptyHoodieRecordPayload; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.Optional; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import scala.Tuple2; - -/** - * This filter will only work with hoodie table since it will only load partitions with .hoodie_partition_metadata - * file in it. - */ -public class SparkHoodieGlobalBloomIndex extends SparkHoodieBloomIndex { - - public SparkHoodieGlobalBloomIndex(HoodieWriteConfig config) { - super(config); - } - - /** - * Load all involved files as pair RDD from all partitions in the table. - */ - @Override - List> loadInvolvedFiles(List partitions, final HoodieEngineContext context, - final HoodieTable hoodieTable) { - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - try { - List allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), - config.shouldAssumeDatePartitioning()); - return super.loadInvolvedFiles(allPartitionPaths, context, hoodieTable); - } catch (IOException e) { - throw new HoodieIOException("Failed to load all partitions", e); - } - } - - /** - * For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be - * checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files - * to be compared gets cut down a lot from range pruning. - *
<p>
    - * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on - * recordKey ranges in the index info. the partition path of the incoming record (partitionRecordKeyPairRDD._2()) will - * be ignored since the search scope should be bigger than that - */ - - @Override - JavaRDD> explodeRecordRDDWithFileComparisons( - final Map> partitionToFileIndexInfo, - JavaPairRDD partitionRecordKeyPairRDD) { - - IndexFileFilter indexFileFilter = - config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo) - : new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo); - - return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> { - String recordKey = partitionRecordKeyPair._2(); - String partitionPath = partitionRecordKeyPair._1(); - - return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() - .map(partitionFileIdPair -> new Tuple2<>(partitionFileIdPair.getRight(), - new HoodieKey(recordKey, partitionFileIdPair.getLeft()))) - .collect(Collectors.toList()); - }).flatMap(List::iterator); - } - - /** - * Tagging for global index should only consider the record key. - */ - @Override - protected JavaRDD> tagLocationBacktoRecords( - JavaPairRDD keyLocationPairRDD, JavaRDD> recordRDD) { - - JavaPairRDD> incomingRowKeyRecordPairRDD = - recordRDD.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); - - JavaPairRDD> existingRecordKeyToRecordLocationHoodieKeyMap = - keyLocationPairRDD.mapToPair(p -> new Tuple2<>(p._1.getRecordKey(), new Tuple2<>(p._2, p._1))); - - // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join. - return incomingRowKeyRecordPairRDD.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> { - final HoodieRecord hoodieRecord = record._1; - final Optional> recordLocationHoodieKeyPair = record._2; - if (recordLocationHoodieKeyPair.isPresent()) { - // Record key matched to file - if (config.getBloomIndexUpdatePartitionPath() - && !recordLocationHoodieKeyPair.get()._2.getPartitionPath().equals(hoodieRecord.getPartitionPath())) { - // Create an empty record to delete the record in the old partition - HoodieRecord deleteRecord = new HoodieRecord(recordLocationHoodieKeyPair.get()._2, - new EmptyHoodieRecordPayload()); - deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get()._1()); - deleteRecord.seal(); - // Tag the incoming record for inserting to the new partition - HoodieRecord insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty()); - return Arrays.asList(deleteRecord, insertRecord).iterator(); - } else { - // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not. - // When it differs, the record will still be updated at its old partition. 
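// Illustrative note (not part of this patch): the branch above handles the partition-change case under
// config.getBloomIndexUpdatePartitionPath() by emitting an EmptyHoodieRecordPayload delete against the old
// partition together with an untagged copy of the incoming record, so it is re-inserted into its new partition.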
- return Collections.singletonList( - (HoodieRecord) HoodieIndexUtils.getTaggedRecord(new HoodieRecord<>(recordLocationHoodieKeyPair.get()._2, hoodieRecord.getData()), - Option.ofNullable(recordLocationHoodieKeyPair.get()._1))).iterator(); - } - } else { - return Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator(); - } - }); - } - - @Override - public boolean isGlobal() { - return true; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bucket/HoodieSparkConsistentBucketIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bucket/HoodieSparkConsistentBucketIndex.java new file mode 100644 index 0000000000000..ca6bf0fc7d990 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bucket/HoodieSparkConsistentBucketIndex.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.model.ConsistentHashingNode; +import org.apache.hudi.common.model.HoodieConsistentHashingMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +/** + * Consistent hashing bucket index implementation, with auto-adjust bucket number. + * NOTE: bucket resizing is triggered by clustering. 
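 * <p>
 * Lookup sketch (illustrative only, not part of this patch; it mirrors the location mapper below, and the
 * variable names are assumed): each partition gets a ConsistentBucketIdentifier built from its hashing
 * metadata, and a record is routed as
 * <pre>
 *   ConsistentHashingNode node = identifier.getBucket(hoodieKey, indexKeyFields);
 *   HoodieRecordLocation location =
 *       new HoodieRecordLocation(null, FSUtils.createNewFileId(node.getFileIdPrefix(), 0));
 * </pre>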
+ */ +public class HoodieSparkConsistentBucketIndex extends HoodieBucketIndex { + + private static final Logger LOG = LogManager.getLogger(HoodieSparkConsistentBucketIndex.class); + + public HoodieSparkConsistentBucketIndex(HoodieWriteConfig config) { + super(config); + } + + @Override + public HoodieData updateLocation(HoodieData writeStatuses, + HoodieEngineContext context, + HoodieTable hoodieTable) + throws HoodieIndexException { + return writeStatuses; + } + + /** + * Do nothing. + * A failed write may create a hashing metadata for a partition. In this case, we still do nothing when rolling back + * the failed write. Because the hashing metadata created by a writer must have 00000000000000 timestamp and can be viewed + * as the initialization of a partition rather than as a part of the failed write. + */ + @Override + public boolean rollbackCommit(String instantTime) { + return true; + } + + @Override + protected BucketIndexLocationMapper getLocationMapper(HoodieTable table, List partitionPath) { + return new ConsistentBucketIndexLocationMapper(table, partitionPath); + } + + /** + * Load hashing metadata of the given partition, if it is not existed, create a new one (also persist it into storage) + * + * @param table hoodie table + * @param partition table partition + * @return Consistent hashing metadata + */ + public HoodieConsistentHashingMetadata loadOrCreateMetadata(HoodieTable table, String partition) { + HoodieConsistentHashingMetadata metadata = loadMetadata(table, partition); + if (metadata != null) { + return metadata; + } + + // There is no metadata, so try to create a new one and save it. + metadata = new HoodieConsistentHashingMetadata(partition, numBuckets); + if (saveMetadata(table, metadata, false)) { + return metadata; + } + + // The creation failed, so try load metadata again. Concurrent creation of metadata should have succeeded. 
+ // Note: the consistent problem of cloud storage is handled internal in the HoodieWrapperFileSystem, i.e., ConsistentGuard + metadata = loadMetadata(table, partition); + ValidationUtils.checkState(metadata != null, "Failed to load or create metadata, partition: " + partition); + return metadata; + } + + /** + * Load hashing metadata of the given partition, if it is not existed, return null + * + * @param table hoodie table + * @param partition table partition + * @return Consistent hashing metadata or null if it does not exist + */ + public static HoodieConsistentHashingMetadata loadMetadata(HoodieTable table, String partition) { + Path metadataPath = FSUtils.getPartitionPath(table.getMetaClient().getHashingMetadataPath(), partition); + + try { + if (!table.getMetaClient().getFs().exists(metadataPath)) { + return null; + } + FileStatus[] metaFiles = table.getMetaClient().getFs().listStatus(metadataPath); + final HoodieTimeline completedCommits = table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + Predicate metaFilePredicate = fileStatus -> { + String filename = fileStatus.getPath().getName(); + if (!filename.contains(HoodieConsistentHashingMetadata.HASHING_METADATA_FILE_SUFFIX)) { + return false; + } + String timestamp = HoodieConsistentHashingMetadata.getTimestampFromFile(filename); + return completedCommits.containsInstant(timestamp) || timestamp.equals(HoodieTimeline.INIT_INSTANT_TS); + }; + + // Get a valid hashing metadata with the largest (latest) timestamp + FileStatus metaFile = Arrays.stream(metaFiles).filter(metaFilePredicate) + .max(Comparator.comparing(a -> a.getPath().getName())).orElse(null); + + if (metaFile == null) { + return null; + } + + byte[] content = FileIOUtils.readAsByteArray(table.getMetaClient().getFs().open(metaFile.getPath())); + return HoodieConsistentHashingMetadata.fromBytes(content); + } catch (IOException e) { + LOG.error("Error when loading hashing metadata, partition: " + partition, e); + throw new HoodieIndexException("Error while loading hashing metadata", e); + } + } + + /** + * Save metadata into storage + * + * @param table hoodie table + * @param metadata hashing metadata to be saved + * @param overwrite whether to overwrite existing metadata + * @return true if the metadata is saved successfully + */ + private static boolean saveMetadata(HoodieTable table, HoodieConsistentHashingMetadata metadata, boolean overwrite) { + HoodieWrapperFileSystem fs = table.getMetaClient().getFs(); + Path dir = FSUtils.getPartitionPath(table.getMetaClient().getHashingMetadataPath(), metadata.getPartitionPath()); + Path fullPath = new Path(dir, metadata.getFilename()); + try (FSDataOutputStream fsOut = fs.create(fullPath, overwrite)) { + byte[] bytes = metadata.toBytes(); + fsOut.write(bytes); + fsOut.close(); + return true; + } catch (IOException e) { + LOG.warn("Failed to update bucket metadata: " + metadata, e); + } + return false; + } + + public class ConsistentBucketIndexLocationMapper implements BucketIndexLocationMapper { + + /** + * Mapping from partitionPath -> bucket identifier + */ + private final Map partitionToIdentifier; + + public ConsistentBucketIndexLocationMapper(HoodieTable table, List partitions) { + // TODO maybe parallel + partitionToIdentifier = partitions.stream().collect(Collectors.toMap(p -> p, p -> { + HoodieConsistentHashingMetadata metadata = loadOrCreateMetadata(table, p); + return new ConsistentBucketIdentifier(metadata); + })); + } + + @Override + public Option getRecordLocation(HoodieKey key, 
String partitionPath) { + ConsistentHashingNode node = partitionToIdentifier.get(partitionPath).getBucket(key, indexKeyFields); + if (!StringUtils.isNullOrEmpty(node.getFileIdPrefix())) { + /** + * Dynamic Bucket Index doesn't need the instant time of the latest file group. + * We add suffix 0 here to the file uuid, following the naming convention, i.e., fileId = [uuid]_[numWrites] + */ + return Option.of(new HoodieRecordLocation(null, FSUtils.createNewFileId(node.getFileIdPrefix(), 0))); + } + + LOG.error("Consistent hashing node has no file group, partition: " + partitionPath + ", meta: " + + partitionToIdentifier.get(partitionPath).getMetadata().getFilename() + ", record_key: " + key.toString()); + throw new HoodieIndexException("Failed to getBucket as hashing node has no file group"); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/RebalancedSparkHoodieHBaseIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/RebalancedSparkHoodieHBaseIndex.java new file mode 100644 index 0000000000000..0ee1bef98f8a0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/RebalancedSparkHoodieHBaseIndex.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.index.hbase; + +import org.apache.hudi.config.HoodieWriteConfig; + +/** + * Extends {@link SparkHoodieHBaseIndex}, add random prefix to key for avoiding data skew issue in hbase regions. 
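 * <p>
 * Worked example (illustrative only; the key and region count are made up): with 16 regions configured, the
 * bucket is abs(key.hashCode()) % 16 and is zero-padded to the width of "15" (2 digits), so a key such as
 * "uuid-123" whose hash maps to bucket 7 is written to HBase as "07uuid-123".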
+ */ +public class RebalancedSparkHoodieHBaseIndex extends SparkHoodieHBaseIndex { + + public RebalancedSparkHoodieHBaseIndex(HoodieWriteConfig config) { + super(config); + } + + @Override + protected String getHBaseKey(String originalKey) { + int bucket = Math.abs(originalKey.hashCode()) % config.getHBaseIndexRegionCount(); + String bucketStr = String.format("%0" + String.valueOf(config.getHBaseIndexRegionCount() - 1).length() + "d", bucket); + return bucketStr + originalKey; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java index 5b67f838509bd..f99bf876c93c8 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java @@ -19,26 +19,29 @@ package org.apache.hudi.index.hbase; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.client.utils.SparkMemoryUtils; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.RateLimiter; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.config.HoodieHBaseIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieDependentSystemUnavailableException; import org.apache.hudi.exception.HoodieIndexException; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HRegionLocation; @@ -53,11 +56,15 @@ import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.RegionLocator; import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.Partitioner; import org.apache.spark.SparkConf; +import org.apache.spark.SparkFiles; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -66,7 +73,9 @@ import java.io.IOException; import java.io.Serializable; +import java.security.PrivilegedExceptionAction; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; @@ -76,10 +85,19 @@ import scala.Tuple2; +import static 
org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION; +import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_QUORUM; +import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT; +import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_CLIENT_PORT; +import static org.apache.hadoop.hbase.security.SecurityConstants.MASTER_KRB_PRINCIPAL; +import static org.apache.hadoop.hbase.security.SecurityConstants.REGIONSERVER_KRB_PRINCIPAL; +import static org.apache.hadoop.hbase.security.User.HBASE_SECURITY_AUTHORIZATION_CONF_KEY; +import static org.apache.hadoop.hbase.security.User.HBASE_SECURITY_CONF_KEY; + /** * Hoodie Index implementation backed by HBase. */ -public class SparkHoodieHBaseIndex extends SparkHoodieIndex { +public class SparkHoodieHBaseIndex extends HoodieIndex { public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances"; public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled"; @@ -101,7 +119,7 @@ public class SparkHoodieHBaseIndex extends SparkH /** * multiPutBatchSize will be computed and re-set in updateLocation if - * {@link HoodieHBaseIndexConfig#HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP} is set to true. + * {@link HoodieHBaseIndexConfig#PUT_BATCH_SIZE_AUTO_COMPUTE} is set to true. */ private Integer multiPutBatchSize; private Integer numRegionServersForTable; @@ -136,18 +154,37 @@ public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConf private Connection getHBaseConnection() { Configuration hbaseConfig = HBaseConfiguration.create(); String quorum = config.getHbaseZkQuorum(); - hbaseConfig.set("hbase.zookeeper.quorum", quorum); + hbaseConfig.set(ZOOKEEPER_QUORUM, quorum); String zkZnodeParent = config.getHBaseZkZnodeParent(); if (zkZnodeParent != null) { - hbaseConfig.set("zookeeper.znode.parent", zkZnodeParent); + hbaseConfig.set(ZOOKEEPER_ZNODE_PARENT, zkZnodeParent); } String port = String.valueOf(config.getHbaseZkPort()); - hbaseConfig.set("hbase.zookeeper.property.clientPort", port); + hbaseConfig.set(ZOOKEEPER_CLIENT_PORT, port); + try { - return ConnectionFactory.createConnection(hbaseConfig); - } catch (IOException e) { + String authentication = config.getHBaseIndexSecurityAuthentication(); + if (authentication.equals("kerberos")) { + hbaseConfig.set(HBASE_SECURITY_CONF_KEY, "kerberos"); + hbaseConfig.set(HADOOP_SECURITY_AUTHENTICATION, "kerberos"); + hbaseConfig.set(HBASE_SECURITY_AUTHORIZATION_CONF_KEY, "true"); + hbaseConfig.set(REGIONSERVER_KRB_PRINCIPAL, config.getHBaseIndexRegionserverPrincipal()); + hbaseConfig.set(MASTER_KRB_PRINCIPAL, config.getHBaseIndexMasterPrincipal()); + + String principal = config.getHBaseIndexKerberosUserPrincipal(); + String keytab = SparkFiles.get(config.getHBaseIndexKerberosUserKeytab()); + + UserGroupInformation.setConfiguration(hbaseConfig); + UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab); + return ugi.doAs((PrivilegedExceptionAction) () -> + (Connection) ConnectionFactory.createConnection(hbaseConfig) + ); + } else { + return ConnectionFactory.createConnection(hbaseConfig); + } + } catch (IOException | InterruptedException e) { throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE, - quorum + ":" + port); + quorum + ":" + port, e); } } @@ -177,10 +214,18 @@ public void close() { } private Get generateStatement(String key) throws IOException { - return new 
Get(Bytes.toBytes(key)).setMaxVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN) + return new Get(Bytes.toBytes(getHBaseKey(key))).readVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN) .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN); } + private Get generateStatement(String key, long startTime, long endTime) throws IOException { + return generateStatement(key).setTimeRange(startTime, endTime); + } + + protected String getHBaseKey(String key) { + return key; + } + private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) { HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); // Check if the last commit ts for this row is 1) present in the timeline or @@ -192,15 +237,13 @@ private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String comm /** * Function that tags each HoodieRecord with an existing location, if known. */ - private Function2>, Iterator>> locationTagFunction( + private Function2>, Iterator>> locationTagFunction( HoodieTableMetaClient metaClient) { // `multiGetBatchSize` is intended to be a batch per 100ms. To create a rate limiter that measures // operations per second, we need to multiply `multiGetBatchSize` by 10. Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize(); - return (Function2>, Iterator>>) (partitionNum, - hoodieRecordIterator) -> { - + return (partitionNum, hoodieRecordIterator) -> { boolean updatePartitionPath = config.getHbaseIndexUpdatePartitionPath(); RateLimiter limiter = RateLimiter.create(multiGetBatchSize * 10, TimeUnit.SECONDS); // Grab the global HBase connection @@ -209,7 +252,7 @@ private Function2>, Iterator>> hbaseConnection = getHBaseConnection(); } } - List> taggedRecords = new ArrayList<>(); + List> taggedRecords = new ArrayList<>(); try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName))) { List statements = new ArrayList<>(); List currentBatchOfRecords = new LinkedList<>(); @@ -245,19 +288,19 @@ private Function2>, Iterator>> // check whether to do partition change processing if (updatePartitionPath && !partitionPath.equals(currentRecord.getPartitionPath())) { // delete partition old data record - HoodieRecord emptyRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), + HoodieRecord emptyRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload()); emptyRecord.unseal(); emptyRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); emptyRecord.seal(); // insert partition new data record - currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), currentRecord.getPartitionPath()), - currentRecord.getData()); + currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), currentRecord.getPartitionPath()), + (HoodieRecordPayload) currentRecord.getData()); taggedRecords.add(emptyRecord); taggedRecords.add(currentRecord); } else { - currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), - currentRecord.getData()); + currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), + (HoodieRecordPayload) currentRecord.getData()); currentRecord.unseal(); currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); currentRecord.seal(); @@ -269,6 +312,8 @@ private Function2>, Iterator>> } } catch (IOException e) { throw new 
HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e); + } finally { + limiter.stop(); } return taggedRecords.iterator(); }; @@ -283,15 +328,16 @@ private Result[] doGet(HTable hTable, List keys, RateLimiter limiter) throw } @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true); + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, + HoodieTable hoodieTable) { + return HoodieJavaRDD.of(HoodieJavaRDD.getJavaRDD(records) + .mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true)); } private Function2, Iterator> updateLocationFunction() { - return (Function2, Iterator>) (partition, statusIterator) -> { + return (partition, statusIterator) -> { List writeStatusList = new ArrayList<>(); // Grab the global HBase connection @@ -303,8 +349,8 @@ private Function2, Iterator> updateL final long startTimeForPutsTask = DateTime.now().getMillis(); LOG.info("startTimeForPutsTask for this task: " + startTimeForPutsTask); + final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS); try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) { - final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS); while (statusIterator.hasNext()) { WriteStatus writeStatus = statusIterator.next(); List mutations = new ArrayList<>(); @@ -323,14 +369,14 @@ private Function2, Iterator> updateL // This is an update, no need to update index continue; } - Put put = new Put(Bytes.toBytes(rec.getRecordKey())); + Put put = new Put(Bytes.toBytes(getHBaseKey(rec.getRecordKey()))); put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime())); put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId())); put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath())); mutations.add(put); } else { // Delete existing index for a deleted record - Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey())); + Delete delete = new Delete(Bytes.toBytes(getHBaseKey(rec.getRecordKey()))); mutations.add(delete); } } @@ -352,6 +398,8 @@ private Function2, Iterator> updateL LOG.info("hbase puts task time for this task: " + (endPutsTime - startTimeForPutsTask)); } catch (IOException e) { throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e); + } finally { + limiter.stop(); } return writeStatusList.iterator(); }; @@ -373,7 +421,7 @@ private void doMutations(BufferedMutator mutator, List mutations, Rate mutations.clear(); } - public Map mapFileWithInsertsToUniquePartition(JavaRDD writeStatusRDD) { + Map mapFileWithInsertsToUniquePartition(JavaRDD writeStatusRDD) { final Map fileIdPartitionMap = new HashMap<>(); int partitionIndex = 0; // Map each fileId that has inserts to a unique partition Id. 
This will be used while @@ -387,16 +435,17 @@ public Map mapFileWithInsertsToUniquePartition(JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, - JavaRDD> hoodieTable) { - final Option desiredQPSFraction = calculateQPSFraction(writeStatusRDD); + public HoodieData updateLocation( + HoodieData writeStatus, HoodieEngineContext context, + HoodieTable hoodieTable) { + JavaRDD writeStatusRDD = HoodieJavaRDD.getJavaRDD(writeStatus); + final Option desiredQPSFraction = calculateQPSFraction(writeStatusRDD); final Map fileIdPartitionMap = mapFileWithInsertsToUniquePartition(writeStatusRDD); JavaRDD partitionedRDD = this.numWriteStatusWithInserts == 0 ? writeStatusRDD : - writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) - .partitionBy(new WriteStatusPartitioner(fileIdPartitionMap, - this.numWriteStatusWithInserts)) - .map(w -> w._2()); + writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) + .partitionBy(new WriteStatusPartitioner(fileIdPartitionMap, + this.numWriteStatusWithInserts)) + .map(w -> w._2()); JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); acquireQPSResourcesAndSetBatchSize(desiredQPSFraction, jsc); JavaRDD writeStatusJavaRDD = partitionedRDD.mapPartitionsWithIndex(updateLocationFunction(), @@ -406,7 +455,7 @@ public JavaRDD updateLocation(JavaRDD writeStatusRDD, // force trigger update location(hbase puts) writeStatusJavaRDD.count(); this.hBaseIndexQPSResourceAllocator.releaseQPSResources(); - return writeStatusJavaRDD; + return HoodieJavaRDD.of(writeStatusJavaRDD); } private Option calculateQPSFraction(JavaRDD writeStatusRDD) { @@ -453,7 +502,7 @@ private void acquireQPSResourcesAndSetBatchSize(final Option desiredQPSFr } } - public Tuple2 getHBasePutAccessParallelism(final JavaRDD writeStatusRDD) { + Tuple2 getHBasePutAccessParallelism(final JavaRDD writeStatusRDD) { final JavaPairRDD insertOnlyWriteStatusRDD = writeStatusRDD .filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1)); return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2)); @@ -537,7 +586,73 @@ private Integer getNumRegionServersAliveForTable() { @Override public boolean rollbackCommit(String instantTime) { - // Rollback in HbaseIndex is managed via method {@link #checkIfValidCommit()} + int multiGetBatchSize = config.getHbaseIndexGetBatchSize(); + boolean rollbackSync = config.getHBaseIndexRollbackSync(); + + if (!config.getHBaseIndexRollbackSync()) { + // Default Rollback in HbaseIndex is managed via method {@link #checkIfValidCommit()} + return true; + } + + synchronized (SparkHoodieHBaseIndex.class) { + if (hbaseConnection == null || hbaseConnection.isClosed()) { + hbaseConnection = getHBaseConnection(); + } + } + final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS); + try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); + BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) { + Long rollbackTime = HoodieActiveTimeline.parseDateFromInstantTime(instantTime).getTime(); + Long currentTime = new Date().getTime(); + Scan scan = new Scan(); + scan.addFamily(SYSTEM_COLUMN_FAMILY); + scan.setTimeRange(rollbackTime, currentTime); + ResultScanner scanner = hTable.getScanner(scan); + Iterator scannerIterator = scanner.iterator(); + + List statements = new ArrayList<>(); + List currentVersionResults = new ArrayList(); + List 
mutations = new ArrayList<>(); + while (scannerIterator.hasNext()) { + Result result = scannerIterator.next(); + currentVersionResults.add(result); + statements.add(generateStatement(Bytes.toString(result.getRow()), 0L, rollbackTime - 1)); + + if (scannerIterator.hasNext() && statements.size() < multiGetBatchSize) { + continue; + } + Result[] lastVersionResults = hTable.get(statements); + for (int i = 0; i < lastVersionResults.length; i++) { + Result lastVersionResult = lastVersionResults[i]; + if (null == lastVersionResult.getRow() && rollbackSync) { + Result currentVersionResult = currentVersionResults.get(i); + Delete delete = new Delete(currentVersionResult.getRow()); + mutations.add(delete); + } + + if (null != lastVersionResult.getRow()) { + String oldPath = new String(lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); + String nowPath = new String(currentVersionResults.get(i).getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); + if (!oldPath.equals(nowPath) || rollbackSync) { + Put put = new Put(lastVersionResult.getRow()); + put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)); + put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); + put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); + mutations.add(put); + } + } + } + doMutations(mutator, mutations, limiter); + currentVersionResults.clear(); + statements.clear(); + mutations.clear(); + } + } catch (Exception e) { + LOG.error("hbase index roll back failed", e); + return false; + } finally { + limiter.stop(); + } return true; } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieGlobalSimpleIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieGlobalSimpleIndex.java deleted file mode 100644 index bdb4991cf76e3..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieGlobalSimpleIndex.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
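The new rollbackCommit() above restores the index by scanning rows written at or after the rolled-back instant and re-applying the previous version of each row, or deleting the row when no previous version exists. A simplified, unbatched sketch of that scan-and-restore pattern; the column family and qualifier names are placeholders, and the real implementation additionally batches the Gets, honours the rollback-sync flag and rate-limits the mutations:

```java
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class IndexRollbackSketch {

  // Placeholder column layout; the real index keeps commit ts, file id and partition path
  // under a single system column family.
  private static final byte[] CF = Bytes.toBytes("_s");
  private static final byte[] COMMIT_TS = Bytes.toBytes("commit_ts");
  private static final byte[] FILE_NAME = Bytes.toBytes("file_name");
  private static final byte[] PARTITION = Bytes.toBytes("partition_path");

  public static void rollback(Connection conn, String tableName, long rollbackTime) throws IOException {
    try (Table table = conn.getTable(TableName.valueOf(tableName));
         BufferedMutator mutator = conn.getBufferedMutator(TableName.valueOf(tableName))) {
      // 1. Find every index row that was written at or after the rolled-back instant.
      Scan scan = new Scan().addFamily(CF);
      scan.setTimeRange(rollbackTime, System.currentTimeMillis());
      try (ResultScanner scanner = table.getScanner(scan)) {
        List<Mutation> mutations = new ArrayList<>();
        for (Result current : scanner) {
          // 2. Look up the latest version older than the rollback point for the same row key.
          Get previousGet = new Get(current.getRow()).readVersions(1).setTimeRange(0L, rollbackTime);
          Result previous = table.get(previousGet);
          if (previous.isEmpty()) {
            // No older version: the record was first indexed by the rolled-back commit, so drop the row.
            mutations.add(new Delete(current.getRow()));
          } else {
            // Older version exists: restore its commit ts / file id / partition path
            // (assumes the previous version carries all three columns).
            Put restore = new Put(current.getRow());
            restore.addColumn(CF, COMMIT_TS, previous.getValue(CF, COMMIT_TS));
            restore.addColumn(CF, FILE_NAME, previous.getValue(CF, FILE_NAME));
            restore.addColumn(CF, PARTITION, previous.getValue(CF, PARTITION));
            mutations.add(restore);
          }
        }
        mutator.mutate(mutations);
        mutator.flush();
      }
    }
  }
}
```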
- */ - -package org.apache.hudi.index.simple; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.EmptyHoodieRecordPayload; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.table.HoodieTable; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import scala.Tuple2; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; - -/** - * A global simple index which reads interested fields(record key and partition path) from base files and - * joins with incoming records to find the tagged location. - * - * @param - */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkHoodieGlobalSimpleIndex extends SparkHoodieSimpleIndex { - - public SparkHoodieGlobalSimpleIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return tagLocationInternal(recordRDD, context, hoodieTable); - } - - /** - * Tags records location for incoming records. - * - * @param inputRecordRDD {@link JavaRDD} of incoming records - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} to use - * @return {@link JavaRDD} of records with record locations set - */ - @Override - protected JavaRDD> tagLocationInternal(JavaRDD> inputRecordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - - JavaPairRDD> keyedInputRecordRDD = inputRecordRDD.mapToPair(entry -> new Tuple2<>(entry.getRecordKey(), entry)); - JavaPairRDD allRecordLocationsInTable = fetchAllRecordLocations(context, hoodieTable, - config.getGlobalSimpleIndexParallelism()); - return getTaggedRecords(keyedInputRecordRDD, allRecordLocationsInTable); - } - - /** - * Fetch record locations for passed in {@link HoodieKey}s. - * - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} of interest - * @param parallelism parallelism to use - * @return {@link JavaPairRDD} of {@link HoodieKey} and {@link HoodieRecordLocation} - */ - protected JavaPairRDD fetchAllRecordLocations(HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - int parallelism) { - List> latestBaseFiles = getAllBaseFilesInTable(context, hoodieTable); - return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); - } - - /** - * Load all files for all partitions as pair RDD. 
- */ - protected List> getAllBaseFilesInTable(final HoodieEngineContext context, - final HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - try { - List allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning()); - // Obtain the latest data files from all the partitions. - return getLatestBaseFilesForAllPartitions(allPartitionPaths, context, hoodieTable); - } catch (IOException e) { - throw new HoodieIOException("Failed to load all partitions", e); - } - } - - /** - * Tag records with right {@link HoodieRecordLocation}. - * - * @param incomingRecords incoming {@link HoodieRecord}s - * @param existingRecords existing records with {@link HoodieRecordLocation}s - * @return {@link JavaRDD} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s - */ - private JavaRDD> getTaggedRecords(JavaPairRDD> incomingRecords, JavaPairRDD existingRecords) { - JavaPairRDD> existingRecordByRecordKey = existingRecords - .mapToPair(entry -> new Tuple2<>(entry._1.getRecordKey(), Pair.of(entry._1.getPartitionPath(), entry._2))); - - return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values() - .flatMap(entry -> { - HoodieRecord inputRecord = entry._1; - Option> partitionPathLocationPair = Option.ofNullable(entry._2.orNull()); - List> taggedRecords; - - if (partitionPathLocationPair.isPresent()) { - String partitionPath = partitionPathLocationPair.get().getKey(); - HoodieRecordLocation location = partitionPathLocationPair.get().getRight(); - if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) { - // Create an empty record to delete the record in the old partition - HoodieRecord deleteRecord = new HoodieRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload()); - deleteRecord.setCurrentLocation(location); - deleteRecord.seal(); - // Tag the incoming record for inserting to the new partition - HoodieRecord insertRecord = (HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()); - taggedRecords = Arrays.asList(deleteRecord, insertRecord); - } else { - // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not. - // When it differs, the record will still be updated at its old partition. - HoodieRecord newRecord = new HoodieRecord<>(new HoodieKey(inputRecord.getRecordKey(), partitionPath), inputRecord.getData()); - taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location))); - } - } else { - taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty())); - } - return taggedRecords.iterator(); - }); - } - - @Override - public boolean isGlobal() { - return true; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieSimpleIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieSimpleIndex.java deleted file mode 100644 index 3f167e2ebdd85..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/simple/SparkHoodieSimpleIndex.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.index.simple; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.index.HoodieIndexUtils; -import org.apache.hudi.index.SparkHoodieIndex; -import org.apache.hudi.io.HoodieKeyLocationFetchHandle; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.util.List; - -import scala.Tuple2; - -import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions; - -/** - * A simple index which reads interested fields(record key and partition path) from base files and - * joins with incoming records to find the tagged location. - * - * @param - */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkHoodieSimpleIndex extends SparkHoodieIndex { - - public SparkHoodieSimpleIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return writeStatusRDD; - } - - @Override - public boolean rollbackCommit(String commitTime) { - return true; - } - - @Override - public boolean isGlobal() { - return false; - } - - @Override - public boolean canIndexLogFiles() { - return false; - } - - @Override - public boolean isImplicitWithStorage() { - return true; - } - - @Override - public JavaRDD> tagLocation(JavaRDD> recordRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - return tagLocationInternal(recordRDD, context, hoodieTable); - } - - /** - * Tags records location for incoming records. 
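The SparkHoodieSimpleIndex and SparkHoodieGlobalSimpleIndex classes deleted above tag incoming records by joining them against record locations read from the latest base files. A deliberately non-Spark sketch of that tagging decision, including the global-index handling of a changed partition path; the Record, Location and Existing types are stand-ins, not Hudi classes:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class SimpleIndexTagSketch {

  // Minimal stand-ins for Hudi's record/location types, just to show the join semantics.
  static class Location { final String instantTime; final String fileId;
    Location(String t, String f) { this.instantTime = t; this.fileId = f; } }
  static class Record { final String key; final String partition; Location location; boolean delete;
    Record(String k, String p) { this.key = k; this.partition = p; } }
  static class Existing { final String partition; final Location location;
    Existing(String p, Location l) { this.partition = p; this.location = l; } }

  /** existing maps record key to the (partition, location) read from the latest base files. */
  static List<Record> tag(List<Record> incoming, Map<String, Existing> existing, boolean updatePartitionPath) {
    List<Record> tagged = new ArrayList<>();
    for (Record in : incoming) {
      Existing hit = existing.get(in.key);
      if (hit == null) {
        tagged.add(in);                                        // brand new key: leave untagged (insert)
      } else if (updatePartitionPath && !hit.partition.equals(in.partition)) {
        Record tombstone = new Record(in.key, hit.partition);  // delete from the old partition
        tombstone.location = hit.location;
        tombstone.delete = true;
        tagged.add(tombstone);
        tagged.add(in);                                        // and insert into the new partition, untagged
      } else {
        Record update = new Record(in.key, hit.partition);     // update in place at the existing partition
        update.location = hit.location;
        tagged.add(update);
      }
    }
    return tagged;
  }
}
```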
- * - * @param inputRecordRDD {@link JavaRDD} of incoming records - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} to use - * @return {@link JavaRDD} of records with record locations set - */ - protected JavaRDD> tagLocationInternal(JavaRDD> inputRecordRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) { - if (config.getSimpleIndexUseCaching()) { - inputRecordRDD.persist(SparkMemoryUtils.getSimpleIndexInputStorageLevel(config.getProps())); - } - - JavaPairRDD> keyedInputRecordRDD = inputRecordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record)); - JavaPairRDD existingLocationsOnTable = fetchRecordLocationsForAffectedPartitions(keyedInputRecordRDD.keys(), context, hoodieTable, - config.getSimpleIndexParallelism()); - - JavaRDD> taggedRecordRDD = keyedInputRecordRDD.leftOuterJoin(existingLocationsOnTable) - .map(entry -> { - final HoodieRecord untaggedRecord = entry._2._1; - final Option location = Option.ofNullable(entry._2._2.orNull()); - return HoodieIndexUtils.getTaggedRecord(untaggedRecord, location); - }); - - if (config.getSimpleIndexUseCaching()) { - inputRecordRDD.unpersist(); - } - return taggedRecordRDD; - } - - /** - * Fetch record locations for passed in {@link HoodieKey}s. - * - * @param hoodieKeys {@link JavaRDD} of {@link HoodieKey}s for which locations are fetched - * @param context instance of {@link HoodieEngineContext} to use - * @param hoodieTable instance of {@link HoodieTable} of interest - * @param parallelism parallelism to use - * @return {@link JavaPairRDD} of {@link HoodieKey} and {@link HoodieRecordLocation} - */ - protected JavaPairRDD fetchRecordLocationsForAffectedPartitions(JavaRDD hoodieKeys, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - int parallelism) { - List affectedPartitionPathList = hoodieKeys.map(HoodieKey::getPartitionPath).distinct().collect(); - List> latestBaseFiles = getLatestBaseFilesForAllPartitions(affectedPartitionPathList, context, hoodieTable); - return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles); - } - - protected JavaPairRDD fetchRecordLocations(HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - int parallelism, - List> baseFiles) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - int fetchParallelism = Math.max(1, Math.max(baseFiles.size(), parallelism)); - return jsc.parallelize(baseFiles, fetchParallelism) - .flatMapToPair(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile).locations()); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/HoodieRowCreateHandle.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/HoodieRowCreateHandle.java deleted file mode 100644 index fa160c6919458..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/HoodieRowCreateHandle.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io; - -import org.apache.hudi.client.HoodieInternalWriteStatus; -import org.apache.hudi.client.model.HoodieInternalRow; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodiePartitionMetadata; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.HoodieTableConfig; -import org.apache.hudi.common.util.HoodieTimer; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.io.storage.HoodieInternalRowFileWriter; -import org.apache.hudi.io.storage.HoodieInternalRowFileWriterFactory; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; - -import java.io.IOException; -import java.io.Serializable; -import java.util.concurrent.atomic.AtomicLong; - -/** - * Create handle with InternalRow for datasource implemention of bulk insert. - */ -public class HoodieRowCreateHandle implements Serializable { - - private static final long serialVersionUID = 1L; - private static final Logger LOG = LogManager.getLogger(HoodieRowCreateHandle.class); - private static final AtomicLong SEQGEN = new AtomicLong(1); - - private final String instantTime; - private final int taskPartitionId; - private final long taskId; - private final long taskEpochId; - private final HoodieTable table; - private final HoodieWriteConfig writeConfig; - private final HoodieInternalRowFileWriter fileWriter; - private final String partitionPath; - private final Path path; - private final String fileId; - private final FileSystem fs; - private final HoodieInternalWriteStatus writeStatus; - private final HoodieTimer currTimer; - - public HoodieRowCreateHandle(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId, - String instantTime, int taskPartitionId, long taskId, long taskEpochId, - StructType structType) { - this.partitionPath = partitionPath; - this.table = table; - this.writeConfig = writeConfig; - this.instantTime = instantTime; - this.taskPartitionId = taskPartitionId; - this.taskId = taskId; - this.taskEpochId = taskEpochId; - this.fileId = fileId; - this.currTimer = new HoodieTimer(); - this.currTimer.startTimer(); - this.fs = table.getMetaClient().getFs(); - this.path = makeNewPath(partitionPath); - this.writeStatus = new HoodieInternalWriteStatus(!table.getIndex().isImplicitWithStorage(), - writeConfig.getWriteStatusFailureFraction()); - writeStatus.setPartitionPath(partitionPath); - writeStatus.setFileId(fileId); - try { - HoodiePartitionMetadata partitionMetadata = - new HoodiePartitionMetadata( - fs, - instantTime, - new Path(writeConfig.getBasePath()), - 
FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath)); - partitionMetadata.trySave(taskPartitionId); - createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, getWriteToken(), this.fileId, table.getBaseFileExtension())); - this.fileWriter = createNewFileWriter(path, table, writeConfig, structType); - } catch (IOException e) { - throw new HoodieInsertException("Failed to initialize file writer for path " + path, e); - } - LOG.info("New handle created for partition :" + partitionPath + " with fileId " + fileId); - } - - /** - * Writes an {@link InternalRow} to the underlying HoodieInternalRowFileWriter. Before writing, value for meta columns are computed as required - * and wrapped in {@link HoodieInternalRow}. {@link HoodieInternalRow} is what gets written to HoodieInternalRowFileWriter. - * @param record instance of {@link InternalRow} that needs to be written to the fileWriter. - * @throws IOException - */ - public void write(InternalRow record) throws IOException { - try { - String partitionPath = record.getUTF8String(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get( - HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toString(); - String seqId = HoodieRecord.generateSequenceId(instantTime, taskPartitionId, SEQGEN.getAndIncrement()); - String recordKey = record.getUTF8String(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get( - HoodieRecord.RECORD_KEY_METADATA_FIELD)).toString(); - HoodieInternalRow internalRow = new HoodieInternalRow(instantTime, seqId, recordKey, partitionPath, path.getName(), - record); - try { - fileWriter.writeRow(recordKey, internalRow); - writeStatus.markSuccess(recordKey); - } catch (Throwable t) { - writeStatus.markFailure(recordKey, t); - } - } catch (Throwable ge) { - writeStatus.setGlobalError(ge); - throw ge; - } - } - - /** - * @returns {@code true} if this handle can take in more writes. else {@code false}. - */ - public boolean canWrite() { - return fileWriter.canWrite(); - } - - /** - * Closes the {@link HoodieRowCreateHandle} and returns an instance of {@link HoodieInternalWriteStatus} containing the stats and - * status of the writes to this handle. - * @return the {@link HoodieInternalWriteStatus} containing the stats and status of the writes to this handle. - * @throws IOException - */ - public HoodieInternalWriteStatus close() throws IOException { - fileWriter.close(); - HoodieWriteStat stat = new HoodieWriteStat(); - stat.setPartitionPath(partitionPath); - stat.setNumWrites(writeStatus.getTotalRecords()); - stat.setNumDeletes(0); - stat.setNumInserts(writeStatus.getTotalRecords()); - stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); - stat.setFileId(fileId); - stat.setPath(new Path(writeConfig.getBasePath()), path); - long fileSizeInBytes = FSUtils.getFileSize(table.getMetaClient().getFs(), path); - stat.setTotalWriteBytes(fileSizeInBytes); - stat.setFileSizeInBytes(fileSizeInBytes); - stat.setTotalWriteErrors(writeStatus.getFailedRowsSize()); - HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats(); - runtimeStats.setTotalCreateTime(currTimer.endTimer()); - stat.setRuntimeStats(runtimeStats); - writeStatus.setStat(stat); - return writeStatus; - } - - public String getFileName() { - return path.getName(); - } - - private Path makeNewPath(String partitionPath) { - Path path = FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath); - try { - fs.mkdirs(path); // create a new partition as needed. 
- } catch (IOException e) { - throw new HoodieIOException("Failed to make dir " + path, e); - } - HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); - return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, getWriteToken(), fileId, - tableConfig.getBaseFileFormat().getFileExtension())); - } - - /** - * Creates an empty marker file corresponding to storage writer path. - * - * @param partitionPath Partition path - */ - private void createMarkerFile(String partitionPath, String dataFileName) { - MarkerFiles markerFiles = new MarkerFiles(table, instantTime); - markerFiles.create(partitionPath, dataFileName, IOType.CREATE); - } - - private String getWriteToken() { - return taskPartitionId + "-" + taskId + "-" + taskEpochId; - } - - private HoodieInternalRowFileWriter createNewFileWriter( - Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema) - throws IOException { - return HoodieInternalRowFileWriterFactory.getInternalRowFileWriter( - path, hoodieTable, config, schema); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowFileWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowFileWriter.java deleted file mode 100644 index 6ab80b6987ba3..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowFileWriter.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.spark.sql.catalyst.InternalRow; - -import java.io.IOException; - -/** - * Abstraction to assist in writing {@link InternalRow}s to be used in datasource implementation. - */ -public interface HoodieInternalRowFileWriter { - - /** - * @returns {@code true} if this RowFileWriter can take in more writes. else {@code false}. - */ - boolean canWrite(); - - /** - * Writes an {@link InternalRow} to the HoodieInternalRowFileWriter. - * - * @throws IOException on any exception while writing. - */ - void writeRow(String key, InternalRow row) throws IOException; - - /** - * Closes the {@link HoodieInternalRowFileWriter} and may not take in any more writes. 
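The removed HoodieRowCreateHandle above derives a per-task write token from the Spark task's partition id, task id and epoch id, and a per-record sequence number from an AtomicLong. A small sketch of those two id schemes; the sequence-id separator is illustrative and may not match HoodieRecord.generateSequenceId exactly:

```java
import java.util.concurrent.atomic.AtomicLong;

public class RowHandleIdsSketch {

  // One monotonically increasing counter per handle, mirroring the SEQGEN field above.
  private static final AtomicLong SEQ = new AtomicLong(1);

  /** Write token identifying the Spark task that produced a base file: partitionId-taskId-epochId. */
  static String writeToken(int taskPartitionId, long taskId, long taskEpochId) {
    return taskPartitionId + "-" + taskId + "-" + taskEpochId;
  }

  /**
   * Per-record commit sequence number: combines the instant time, the task partition and a counter.
   * The exact formatting used by Hudi is not reproduced here.
   */
  static String nextSequenceId(String instantTime, int taskPartitionId) {
    return instantTime + "_" + taskPartitionId + "_" + SEQ.getAndIncrement();
  }
}
```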
- */ - void close() throws IOException; -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowFileWriterFactory.java deleted file mode 100644 index cb238bb31cada..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowFileWriterFactory.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.hadoop.fs.Path; -import org.apache.spark.sql.types.StructType; - -import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; - -import java.io.IOException; - -/** - * Factory to assist in instantiating a new {@link HoodieInternalRowFileWriter}. - */ -public class HoodieInternalRowFileWriterFactory { - - /** - * Factory method to assist in instantiating an instance of {@link HoodieInternalRowFileWriter}. - * @param path path of the RowFileWriter. - * @param hoodieTable instance of {@link HoodieTable} in use. - * @param config instance of {@link HoodieWriteConfig} to use. - * @param schema schema of the dataset in use. - * @return the instantiated {@link HoodieInternalRowFileWriter}. - * @throws IOException if format is not supported or if any exception during instantiating the RowFileWriter. 
- * - */ - public static HoodieInternalRowFileWriter getInternalRowFileWriter( - Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema) - throws IOException { - final String extension = FSUtils.getFileExtension(path.getName()); - if (PARQUET.getFileExtension().equals(extension)) { - return newParquetInternalRowFileWriter(path, config, schema, hoodieTable); - } - throw new UnsupportedOperationException(extension + " format not supported yet."); - } - - private static HoodieInternalRowFileWriter newParquetInternalRowFileWriter( - Path path, HoodieWriteConfig writeConfig, StructType structType, HoodieTable table) - throws IOException { - BloomFilter filter = BloomFilterFactory.createBloomFilter( - writeConfig.getBloomFilterNumEntries(), - writeConfig.getBloomFilterFPP(), - writeConfig.getDynamicBloomFilterMaxNumEntries(), - writeConfig.getBloomFilterType()); - HoodieRowParquetWriteSupport writeSupport = - new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter); - return new HoodieInternalRowParquetWriter( - path, new HoodieRowParquetConfig( - writeSupport, - writeConfig.getParquetCompressionCodec(), - writeConfig.getParquetBlockSize(), - writeConfig.getParquetPageSize(), - writeConfig.getParquetMaxFileSize(), - writeSupport.getHadoopConf(), - writeConfig.getParquetCompressionRatio())); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowParquetWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowParquetWriter.java deleted file mode 100644 index 8070c07bf7b2d..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieInternalRowParquetWriter.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.hadoop.fs.Path; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.fs.HoodieWrapperFileSystem; - -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.spark.sql.catalyst.InternalRow; - -import java.io.IOException; - -/** - * Parquet's impl of {@link HoodieInternalRowFileWriter} to write {@link InternalRow}s. 
- */ -public class HoodieInternalRowParquetWriter extends ParquetWriter - implements HoodieInternalRowFileWriter { - - private final Path file; - private final HoodieWrapperFileSystem fs; - private final long maxFileSize; - private final HoodieRowParquetWriteSupport writeSupport; - - public HoodieInternalRowParquetWriter(Path file, HoodieRowParquetConfig parquetConfig) - throws IOException { - super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), - ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(), - parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(), - DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED, - DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf())); - this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); - this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, - parquetConfig.getHadoopConf())); - this.maxFileSize = parquetConfig.getMaxFileSize() - + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio()); - this.writeSupport = parquetConfig.getWriteSupport(); - } - - @Override - public boolean canWrite() { - return fs.getBytesWritten(file) < maxFileSize; - } - - @Override - public void writeRow(String key, InternalRow row) throws IOException { - super.write(row); - writeSupport.add(key); - } - - @Override - public void close() throws IOException { - super.close(); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieRowParquetConfig.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieRowParquetConfig.java deleted file mode 100644 index d9930056fb93f..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieRowParquetConfig.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.hadoop.conf.Configuration; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; - -/** - * ParquetConfig for datasource implementation with {@link org.apache.hudi.client.model.HoodieInternalRow}. 
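The deleted HoodieInternalRowParquetWriter sizes its output by inflating the configured max file size with the configured compression ratio and comparing the result against the bytes written so far. A sketch of just that size-budget arithmetic; only the formula comes from the code above, and the comment about why the budget is inflated is an interpretation:

```java
public class ParquetSizeBudgetSketch {

  private final long sizeBudgetBytes;

  /**
   * Inflate the configured max file size by the compression ratio, mirroring the arithmetic
   * in the writer above (presumably to account for the gap between buffered and flushed bytes).
   */
  public ParquetSizeBudgetSketch(long maxFileSizeBytes, double compressionRatio) {
    this.sizeBudgetBytes = maxFileSizeBytes + Math.round(maxFileSizeBytes * compressionRatio);
  }

  /** Mirrors canWrite(): keep accepting rows while the bytes written so far stay under the budget. */
  public boolean canWrite(long bytesWrittenSoFar) {
    return bytesWrittenSoFar < sizeBudgetBytes;
  }
}
```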
- */ -public class HoodieRowParquetConfig extends HoodieBaseParquetConfig { - - public HoodieRowParquetConfig(HoodieRowParquetWriteSupport writeSupport, CompressionCodecName compressionCodecName, - int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf, - double compressionRatio) { - super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieRowParquetWriteSupport.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieRowParquetWriteSupport.java deleted file mode 100644 index f6cef204aae6f..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieRowParquetWriteSupport.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; -import org.apache.parquet.hadoop.api.WriteSupport; -import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; -import org.apache.spark.sql.types.StructType; - -import java.util.HashMap; - -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY; -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE; -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER; -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER; - -/** - * Hoodie Write Support for directly writing Row to Parquet. 
- */ -public class HoodieRowParquetWriteSupport extends ParquetWriteSupport { - - private Configuration hadoopConf; - private BloomFilter bloomFilter; - private String minRecordKey; - private String maxRecordKey; - - public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter) { - super(); - Configuration hadoopConf = new Configuration(conf); - hadoopConf.set("spark.sql.parquet.writeLegacyFormat", "false"); - hadoopConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MILLIS"); - this.hadoopConf = hadoopConf; - setSchema(structType, hadoopConf); - this.bloomFilter = bloomFilter; - } - - public Configuration getHadoopConf() { - return hadoopConf; - } - - @Override - public WriteSupport.FinalizedWriteContext finalizeWrite() { - HashMap extraMetaData = new HashMap<>(); - if (bloomFilter != null) { - extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); - if (minRecordKey != null && maxRecordKey != null) { - extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey); - extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey); - } - if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) { - extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name()); - } - } - return new WriteSupport.FinalizedWriteContext(extraMetaData); - } - - public void add(String recordKey) { - this.bloomFilter.add(recordKey); - if (minRecordKey != null) { - minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey; - } else { - minRecordKey = recordKey; - } - - if (maxRecordKey != null) { - maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey; - } else { - maxRecordKey = recordKey; - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriter.java new file mode 100644 index 0000000000000..9303223b62e31 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriter.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.unsafe.types.UTF8String; + +import java.io.IOException; + +/** + * Abstraction to assist in writing {@link InternalRow}s to be used in datasource implementation. + */ +public interface HoodieInternalRowFileWriter { + + /** + * @return {@code true} if this RowFileWriter can take in more writes. else {@code false}. 
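HoodieRowParquetWriteSupport#add() above maintains the running min and max record key, which finalizeWrite later emits into the parquet footer next to the serialized bloom filter. A stripped-down sketch of that key-range tracking, with the bloom filter left out:

```java
public class KeyRangeTrackerSketch {

  private String minRecordKey;
  private String maxRecordKey;

  /** Called once per written record key, mirroring the write support's add() above. */
  public void add(String recordKey) {
    // (the real write support also feeds the key into a bloom filter here)
    minRecordKey = (minRecordKey == null || recordKey.compareTo(minRecordKey) < 0) ? recordKey : minRecordKey;
    maxRecordKey = (maxRecordKey == null || recordKey.compareTo(maxRecordKey) > 0) ? recordKey : maxRecordKey;
  }

  public String getMinRecordKey() { return minRecordKey; }
  public String getMaxRecordKey() { return maxRecordKey; }
}
```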
+ */ + boolean canWrite(); + + /** + * Writes an {@link InternalRow} to the HoodieInternalRowFileWriter. Also takes in associated record key to be added to bloom filter if required. + * + * @throws IOException on any exception while writing. + */ + void writeRow(UTF8String key, InternalRow row) throws IOException; + + /** + * Writes an {@link InternalRow} to the HoodieInternalRowFileWriter. + * + * @throws IOException on any exception while writing. + */ + void writeRow(InternalRow row) throws IOException; + + /** + * Closes the {@link HoodieInternalRowFileWriter} and may not take in any more writes. + */ + void close() throws IOException; +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java new file mode 100644 index 0000000000000..e68873f92efdb --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.storage.HoodieParquetConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.spark.sql.types.StructType; + +import java.io.IOException; + +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + +/** + * Factory to assist in instantiating a new {@link HoodieInternalRowFileWriter}. + */ +public class HoodieInternalRowFileWriterFactory { + + /** + * Factory method to assist in instantiating an instance of {@link HoodieInternalRowFileWriter}. + * @param path path of the RowFileWriter. + * @param hoodieTable instance of {@link HoodieTable} in use. + * @param writeConfig instance of {@link HoodieWriteConfig} to use. + * @param schema schema of the dataset in use. + * @return the instantiated {@link HoodieInternalRowFileWriter}. + * @throws IOException if format is not supported or if any exception during instantiating the RowFileWriter. 
+ * + */ + public static HoodieInternalRowFileWriter getInternalRowFileWriter(Path path, + HoodieTable hoodieTable, + HoodieWriteConfig writeConfig, + StructType schema) + throws IOException { + final String extension = FSUtils.getFileExtension(path.getName()); + if (PARQUET.getFileExtension().equals(extension)) { + return newParquetInternalRowFileWriter(path, hoodieTable, writeConfig, schema, tryInstantiateBloomFilter(writeConfig)); + } + throw new UnsupportedOperationException(extension + " format not supported yet."); + } + + private static HoodieInternalRowFileWriter newParquetInternalRowFileWriter(Path path, + HoodieTable table, + HoodieWriteConfig writeConfig, + StructType structType, + Option bloomFilterOpt + ) + throws IOException { + HoodieRowParquetWriteSupport writeSupport = + new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, bloomFilterOpt, writeConfig); + + return new HoodieInternalRowParquetWriter( + path, + new HoodieParquetConfig<>( + writeSupport, + writeConfig.getParquetCompressionCodec(), + writeConfig.getParquetBlockSize(), + writeConfig.getParquetPageSize(), + writeConfig.getParquetMaxFileSize(), + writeSupport.getHadoopConf(), + writeConfig.getParquetCompressionRatio(), + writeConfig.parquetDictionaryEnabled() + )); + } + + private static Option tryInstantiateBloomFilter(HoodieWriteConfig writeConfig) { + // NOTE: Currently Bloom Filter is only going to be populated if meta-fields are populated + if (writeConfig.populateMetaFields()) { + BloomFilter bloomFilter = BloomFilterFactory.createBloomFilter( + writeConfig.getBloomFilterNumEntries(), + writeConfig.getBloomFilterFPP(), + writeConfig.getDynamicBloomFilterMaxNumEntries(), + writeConfig.getBloomFilterType()); + + return Option.of(bloomFilter); + } + + return Option.empty(); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowParquetWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowParquetWriter.java new file mode 100644 index 0000000000000..a7cacd055a63c --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowParquetWriter.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.io.storage.HoodieParquetConfig; +import org.apache.hudi.io.storage.HoodieBaseParquetWriter; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.unsafe.types.UTF8String; + +import java.io.IOException; + +/** + * Parquet's impl of {@link HoodieInternalRowFileWriter} to write {@link InternalRow}s. 
+ */ +public class HoodieInternalRowParquetWriter extends HoodieBaseParquetWriter + implements HoodieInternalRowFileWriter { + + private final HoodieRowParquetWriteSupport writeSupport; + + public HoodieInternalRowParquetWriter(Path file, HoodieParquetConfig parquetConfig) + throws IOException { + super(file, parquetConfig); + + this.writeSupport = parquetConfig.getWriteSupport(); + } + + @Override + public void writeRow(UTF8String key, InternalRow row) throws IOException { + super.write(row); + writeSupport.add(key); + } + + @Override + public void writeRow(InternalRow row) throws IOException { + super.write(row); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java new file mode 100644 index 0000000000000..9da04f72600b7 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hudi.client.HoodieInternalWriteStatus; +import org.apache.hudi.client.model.HoodieInternalRow; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; + +import java.io.IOException; +import java.io.Serializable; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; + +/** + * Create handle with InternalRow for datasource implementation of bulk insert. 
+ */ +public class HoodieRowCreateHandle implements Serializable { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LogManager.getLogger(HoodieRowCreateHandle.class); + private static final AtomicLong GLOBAL_SEQ_NO = new AtomicLong(1); + + private final HoodieTable table; + private final HoodieWriteConfig writeConfig; + + private final String partitionPath; + private final Path path; + private final String fileId; + + private final boolean populateMetaFields; + + private final UTF8String fileName; + private final UTF8String commitTime; + private final Function seqIdGenerator; + + private final boolean shouldPreserveHoodieMetadata; + + private final HoodieTimer currTimer; + + protected final HoodieInternalRowFileWriter fileWriter; + protected final HoodieInternalWriteStatus writeStatus; + + public HoodieRowCreateHandle(HoodieTable table, + HoodieWriteConfig writeConfig, + String partitionPath, + String fileId, + String instantTime, + int taskPartitionId, + long taskId, + long taskEpochId, + StructType structType) { + this(table, writeConfig, partitionPath, fileId, instantTime, taskPartitionId, taskId, taskEpochId, + structType, false); + } + + public HoodieRowCreateHandle(HoodieTable table, + HoodieWriteConfig writeConfig, + String partitionPath, + String fileId, + String instantTime, + int taskPartitionId, + long taskId, + long taskEpochId, + StructType structType, + boolean shouldPreserveHoodieMetadata) { + this.partitionPath = partitionPath; + this.table = table; + this.writeConfig = writeConfig; + this.fileId = fileId; + + this.currTimer = HoodieTimer.start(); + + FileSystem fs = table.getMetaClient().getFs(); + + String writeToken = getWriteToken(taskPartitionId, taskId, taskEpochId); + String fileName = FSUtils.makeBaseFileName(instantTime, writeToken, this.fileId, table.getBaseFileExtension()); + this.path = makeNewPath(fs, partitionPath, fileName, writeConfig); + + this.populateMetaFields = writeConfig.populateMetaFields(); + this.fileName = UTF8String.fromString(path.getName()); + this.commitTime = UTF8String.fromString(instantTime); + this.seqIdGenerator = (id) -> HoodieRecord.generateSequenceId(instantTime, taskPartitionId, id); + + this.writeStatus = new HoodieInternalWriteStatus(!table.getIndex().isImplicitWithStorage(), + writeConfig.getWriteStatusFailureFraction()); + this.shouldPreserveHoodieMetadata = shouldPreserveHoodieMetadata; + + writeStatus.setPartitionPath(partitionPath); + writeStatus.setFileId(fileId); + writeStatus.setStat(new HoodieWriteStat()); + try { + HoodiePartitionMetadata partitionMetadata = + new HoodiePartitionMetadata( + fs, + instantTime, + new Path(writeConfig.getBasePath()), + FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath), + table.getPartitionMetafileFormat()); + partitionMetadata.trySave(taskPartitionId); + + createMarkerFile(partitionPath, fileName, instantTime, table, writeConfig); + + this.fileWriter = HoodieInternalRowFileWriterFactory.getInternalRowFileWriter(path, table, writeConfig, structType); + } catch (IOException e) { + throw new HoodieInsertException("Failed to initialize file writer for path " + path, e); + } + + LOG.info("New handle created for partition: " + partitionPath + " with fileId " + fileId); + } + + /** + * Writes an {@link InternalRow} to the underlying HoodieInternalRowFileWriter. Before writing, value for meta columns are computed as required + * and wrapped in {@link HoodieInternalRow}. 
{@link HoodieInternalRow} is what gets written to HoodieInternalRowFileWriter. + * + * @param row instance of {@link InternalRow} that needs to be written to the fileWriter. + * @throws IOException + */ + public void write(InternalRow row) throws IOException { + if (populateMetaFields) { + writeRow(row); + } else { + writeRowNoMetaFields(row); + } + } + + private void writeRow(InternalRow row) { + try { + // NOTE: PLEASE READ THIS CAREFULLY BEFORE MODIFYING + // This code lays in the hot-path, and substantial caution should be + // exercised making changes to it to minimize amount of excessive: + // - Conversions b/w Spark internal types and JVM native ones (like [[UTF8String]] + // and [[String]]) + // - Repeated computations (for ex, converting file-path to [[UTF8String]] over and + // over again) + UTF8String recordKey = row.getUTF8String(HoodieRecord.RECORD_KEY_META_FIELD_ORD); + UTF8String partitionPath = row.getUTF8String(HoodieRecord.PARTITION_PATH_META_FIELD_ORD); + // This is the only meta-field that is generated dynamically, hence conversion b/w + // [[String]] and [[UTF8String]] is unavoidable if preserveHoodieMetadata is false + UTF8String seqId = shouldPreserveHoodieMetadata ? row.getUTF8String(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD_ORD) + : UTF8String.fromString(seqIdGenerator.apply(GLOBAL_SEQ_NO.getAndIncrement())); + UTF8String writeCommitTime = shouldPreserveHoodieMetadata ? row.getUTF8String(HoodieRecord.COMMIT_TIME_METADATA_FIELD_ORD) + : commitTime; + + InternalRow updatedRow = new HoodieInternalRow(writeCommitTime, seqId, recordKey, + partitionPath, fileName, row, true); + try { + fileWriter.writeRow(recordKey, updatedRow); + // NOTE: To avoid conversion on the hot-path we only convert [[UTF8String]] into [[String]] + // in cases when successful records' writes are being tracked + writeStatus.markSuccess(writeStatus.isTrackingSuccessfulWrites() ? recordKey.toString() : null); + } catch (Exception t) { + writeStatus.markFailure(recordKey.toString(), t); + } + } catch (Exception e) { + writeStatus.setGlobalError(e); + throw e; + } + } + + private void writeRowNoMetaFields(InternalRow row) { + try { + // TODO make sure writing w/ and w/o meta fields is consistent (currently writing w/o + // meta-fields would fail if any record will, while when writing w/ meta-fields it won't) + fileWriter.writeRow(row); + writeStatus.markSuccess(); + } catch (Exception e) { + writeStatus.setGlobalError(e); + throw new HoodieException("Exception thrown while writing spark InternalRows to file ", e); + } + } + + /** + * Returns {@code true} if this handle can take in more writes. else {@code false}. + */ + public boolean canWrite() { + return fileWriter.canWrite(); + } + + /** + * Closes the {@link HoodieRowCreateHandle} and returns an instance of {@link HoodieInternalWriteStatus} containing the stats and + * status of the writes to this handle. + * + * @return the {@link HoodieInternalWriteStatus} containing the stats and status of the writes to this handle. 
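As a purely hypothetical illustration of the meta-column wrapping described in writeRow(...) above: the incoming row is not copied, it is wrapped in a HoodieInternalRow together with per-handle UTF8String values (commit time, file name) and per-record values (record key, partition path, sequence id). All literals below are made-up placeholders.

import org.apache.hudi.client.model.HoodieInternalRow;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.unsafe.types.UTF8String;

// Illustrative only: prepending Hudi meta columns by wrapping the source row.
class MetaColumnWrappingExample {
  static InternalRow wrap(InternalRow sourceRow) {
    UTF8String commitTime    = UTF8String.fromString("20230101000000");      // placeholder instant time
    UTF8String seqId         = UTF8String.fromString("20230101000000_0_1");  // placeholder sequence id
    UTF8String recordKey     = UTF8String.fromString("key-1");               // placeholder record key
    UTF8String partitionPath = UTF8String.fromString("2023/01/01");          // placeholder partition path
    UTF8String fileName      = UTF8String.fromString("file-1.parquet");      // placeholder file name
    // Argument order mirrors the writeRow(...) call above; see HoodieInternalRow for the
    // exact semantics of the trailing flag.
    return new HoodieInternalRow(commitTime, seqId, recordKey, partitionPath, fileName, sourceRow, true);
  }
}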
+ */ + public HoodieInternalWriteStatus close() throws IOException { + fileWriter.close(); + HoodieWriteStat stat = writeStatus.getStat(); + stat.setPartitionPath(partitionPath); + stat.setNumWrites(writeStatus.getTotalRecords()); + stat.setNumDeletes(0); + stat.setNumInserts(writeStatus.getTotalRecords()); + stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); + stat.setFileId(fileId); + stat.setPath(new Path(writeConfig.getBasePath()), path); + long fileSizeInBytes = FSUtils.getFileSize(table.getMetaClient().getFs(), path); + stat.setTotalWriteBytes(fileSizeInBytes); + stat.setFileSizeInBytes(fileSizeInBytes); + stat.setTotalWriteErrors(writeStatus.getFailedRowsSize()); + HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats(); + runtimeStats.setTotalCreateTime(currTimer.endTimer()); + stat.setRuntimeStats(runtimeStats); + return writeStatus; + } + + public String getFileName() { + return path.getName(); + } + + private static Path makeNewPath(FileSystem fs, String partitionPath, String fileName, HoodieWriteConfig writeConfig) { + Path path = FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath); + try { + if (!fs.exists(path)) { + fs.mkdirs(path); // create a new partition as needed. + } + } catch (IOException e) { + throw new HoodieIOException("Failed to make dir " + path, e); + } + return new CachingPath(path.toString(), fileName); + } + + /** + * Creates an empty marker file corresponding to storage writer path. + * + * @param partitionPath Partition path + */ + private static void createMarkerFile(String partitionPath, + String dataFileName, + String instantTime, + HoodieTable table, + HoodieWriteConfig writeConfig) { + WriteMarkersFactory.get(writeConfig.getMarkersType(), table, instantTime) + .create(partitionPath, dataFileName, IOType.CREATE); + } + + // TODO extract to utils + private static String getWriteToken(int taskPartitionId, long taskId, long taskEpochId) { + return taskPartitionId + "-" + taskId + "-" + taskEpochId; + } + +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java new file mode 100644 index 0000000000000..bb4dd9c619425 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
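Tying the handle's pieces together, a hypothetical per-file driver would construct the handle, feed it rows while canWrite() holds (rolling over to a fresh handle/fileId in real code once it returns false), and collect the HoodieInternalWriteStatus from close(). Names and the iteration scheme below are illustrative only.

import org.apache.hudi.client.HoodieInternalWriteStatus;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.storage.row.HoodieRowCreateHandle;
import org.apache.hudi.table.HoodieTable;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.util.Iterator;

// Illustrative only: typical lifecycle of a HoodieRowCreateHandle.
class RowCreateHandleExample {
  static HoodieInternalWriteStatus writeFile(HoodieTable table,
                                             HoodieWriteConfig config,
                                             String partitionPath,
                                             String fileId,
                                             String instantTime,
                                             int taskPartitionId,
                                             long taskId,
                                             long taskEpochId,
                                             StructType schema,
                                             Iterator<InternalRow> rows) throws IOException {
    HoodieRowCreateHandle handle = new HoodieRowCreateHandle(
        table, config, partitionPath, fileId, instantTime, taskPartitionId, taskId, taskEpochId, schema);
    // canWrite() delegates to the underlying file writer (e.g. configured max file size).
    while (rows.hasNext() && handle.canWrite()) {
      handle.write(rows.next());
    }
    // close() finalizes the file and returns per-file stats (records, bytes, errors, timings).
    return handle.close();
  }
}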
+ */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.parquet.hadoop.api.WriteSupport; +import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; + +import java.util.Collections; +import java.util.Map; + +/** + * Hoodie Write Support for directly writing Row to Parquet. + */ +public class HoodieRowParquetWriteSupport extends ParquetWriteSupport { + + private final Configuration hadoopConf; + private final Option> bloomFilterWriteSupportOpt; + + public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, Option bloomFilterOpt, HoodieWriteConfig writeConfig) { + Configuration hadoopConf = new Configuration(conf); + hadoopConf.set("spark.sql.parquet.writeLegacyFormat", writeConfig.parquetWriteLegacyFormatEnabled()); + hadoopConf.set("spark.sql.parquet.outputTimestampType", writeConfig.parquetOutputTimestampType()); + hadoopConf.set("spark.sql.parquet.fieldId.write.enabled", writeConfig.parquetFieldIdWriteEnabled()); + setSchema(structType, hadoopConf); + + this.hadoopConf = hadoopConf; + this.bloomFilterWriteSupportOpt = bloomFilterOpt.map(HoodieBloomFilterRowWriteSupport::new); + } + + public Configuration getHadoopConf() { + return hadoopConf; + } + + @Override + public WriteSupport.FinalizedWriteContext finalizeWrite() { + Map extraMetadata = + bloomFilterWriteSupportOpt.map(HoodieBloomFilterWriteSupport::finalizeMetadata) + .orElse(Collections.emptyMap()); + + return new WriteSupport.FinalizedWriteContext(extraMetadata); + } + + public void add(UTF8String recordKey) { + this.bloomFilterWriteSupportOpt.ifPresent(bloomFilterWriteSupport -> + bloomFilterWriteSupport.addKey(recordKey)); + } + + private static class HoodieBloomFilterRowWriteSupport extends HoodieBloomFilterWriteSupport { + public HoodieBloomFilterRowWriteSupport(BloomFilter bloomFilter) { + super(bloomFilter); + } + + @Override + protected byte[] getUTF8Bytes(UTF8String key) { + return key.getBytes(); + } + + @Override + protected UTF8String dereference(UTF8String key) { + // NOTE: [[clone]] is performed here (rather than [[copy]]) to only copy underlying buffer in + // cases when [[UTF8String]] is pointing into a buffer storing the whole containing record, + // and simply do a pass over when it holds a (immutable) buffer holding just the string + return key.clone(); + } + } + +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java index a0c199138b1a7..ad71b17ce70ff 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java @@ -19,101 +19,559 @@ package org.apache.hudi.keygen; import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.ApiMaturityLevel; -import org.apache.hudi.AvroConversionHelper; -import org.apache.hudi.PublicAPIMethod; +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.client.utils.SparkRowSerDe; import org.apache.hudi.common.config.TypedProperties; +import 
org.apache.hudi.common.util.PartitionPathEncodeUtils; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieKeyException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.HoodieUnsafeRowUtils; +import org.apache.spark.sql.HoodieUnsafeRowUtils$; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.unsafe.types.UTF8String; import scala.Function1; -import java.util.Collections; -import java.util.HashMap; +import javax.annotation.concurrent.ThreadSafe; +import java.sql.Timestamp; +import java.time.Instant; +import java.time.LocalDate; import java.util.List; -import java.util.Map; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.function.Supplier; + +import static org.apache.hudi.common.util.CollectionUtils.tail; +import static org.apache.hudi.common.util.ValidationUtils.checkState; +import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR; +import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_RECORD_KEY_PARTS_SEPARATOR; +import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER; +import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; +import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER; /** - * Base class for the built-in key generators. Contains methods structured for - * code reuse amongst them. + * Base class for all built-in key generators. + * + * NOTE: By default it implements all the methods of {@link SparkKeyGeneratorInterface}, which + * by default however fallback to Avro implementation. For maximum performance (to avoid + * conversion from Spark's internal data-types to Avro) you should override these methods + * in your implementation. + * + * TODO rename to AvroFallbackBaseKeyGenerator */ +@ThreadSafe public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements SparkKeyGeneratorInterface { - private static final String STRUCT_NAME = "hoodieRowTopLevelField"; - private static final String NAMESPACE = "hoodieRow"; - private transient Function1 converterFn = null; - protected StructType structType; + private static final Logger LOG = LogManager.getLogger(BuiltinKeyGenerator.class); - protected Map> recordKeyPositions = new HashMap<>(); - protected Map> partitionPathPositions = new HashMap<>(); + private static final String COMPOSITE_KEY_FIELD_VALUE_INFIX = ":"; + + protected static final UTF8String HUDI_DEFAULT_PARTITION_PATH_UTF8 = UTF8String.fromString(HUDI_DEFAULT_PARTITION_PATH); + protected static final UTF8String NULL_RECORD_KEY_PLACEHOLDER_UTF8 = UTF8String.fromString(NULL_RECORDKEY_PLACEHOLDER); + protected static final UTF8String EMPTY_RECORD_KEY_PLACEHOLDER_UTF8 = UTF8String.fromString(EMPTY_RECORDKEY_PLACEHOLDER); + + protected transient volatile SparkRowConverter rowConverter; + protected transient volatile SparkRowAccessor rowAccessor; protected BuiltinKeyGenerator(TypedProperties config) { super(config); } - /** - * Fetch record key from {@link Row}. - * @param row instance of {@link Row} from which record key is requested. - * @return the record key of interest from {@link Row}. 
- */ @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) public String getRecordKey(Row row) { - if (null == converterFn) { - converterFn = AvroConversionHelper.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE); + tryInitRowConverter(row.schema()); + // NOTE: This implementation has considerable computational overhead and has to be overridden + // to provide for optimal performance on Spark. This implementation provided exclusively + // for compatibility reasons. + return getRecordKey(rowConverter.convertToAvro(row)); + } + + @Override + public UTF8String getRecordKey(InternalRow internalRow, StructType schema) { + tryInitRowConverter(schema); + // NOTE: This implementation has considerable computational overhead and has to be overridden + // to provide for optimal performance on Spark. This implementation provided exclusively + // for compatibility reasons. + return UTF8String.fromString(getRecordKey(rowConverter.convertToAvro(internalRow))); + } + + @Override + public String getPartitionPath(Row row) { + tryInitRowConverter(row.schema()); + // NOTE: This implementation has considerable computational overhead and has to be overridden + // to provide for optimal performance on Spark. This implementation provided exclusively + // for compatibility reasons. + return getPartitionPath(rowConverter.convertToAvro(row)); + } + + @Override + public UTF8String getPartitionPath(InternalRow internalRow, StructType schema) { + tryInitRowConverter(schema); + // NOTE: This implementation has considerable computational overhead and has to be overridden + // to provide for optimal performance on Spark. This implementation provided exclusively + // for compatibility reasons. + GenericRecord avroRecord = rowConverter.convertToAvro(internalRow); + return UTF8String.fromString(getPartitionPath(avroRecord)); + } + + protected void tryInitRowAccessor(StructType schema) { + if (this.rowAccessor == null) { + synchronized (this) { + if (this.rowAccessor == null) { + this.rowAccessor = new SparkRowAccessor(schema); + } + } } - GenericRecord genericRecord = (GenericRecord) converterFn.apply(row); - return getKey(genericRecord).getRecordKey(); } /** - * Fetch partition path from {@link Row}. - * @param row instance of {@link Row} from which partition path is requested - * @return the partition path of interest from {@link Row}. 
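To make the performance note in the new class javadoc concrete: a subclass that wants to avoid the Avro fallback shown in this hunk overrides the InternalRow-based methods directly, following the same pattern the built-in generators use further down (tryInitRowAccessor plus the combine*Unsafe helpers). The class and field names below are hypothetical; only the override pattern is taken from this patch.

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

import java.util.Collections;

// Hypothetical generator, shown only to illustrate overriding the fast (non-Avro) path.
class ExampleFastKeyGenerator extends BuiltinKeyGenerator {

  ExampleFastKeyGenerator(TypedProperties props) {
    super(props);
    this.recordKeyFields = Collections.singletonList("uuid");        // assumed record-key field
    this.partitionPathFields = Collections.singletonList("region");  // assumed partition-path field
  }

  @Override
  public String getRecordKey(GenericRecord record) {
    throw new UnsupportedOperationException("Avro path omitted in this sketch");
  }

  @Override
  public String getPartitionPath(GenericRecord record) {
    throw new UnsupportedOperationException("Avro path omitted in this sketch");
  }

  @Override
  public UTF8String getRecordKey(InternalRow row, StructType schema) {
    tryInitRowAccessor(schema);
    return combineRecordKeyUnsafe(rowAccessor.getRecordKeyParts(row));
  }

  @Override
  public UTF8String getPartitionPath(InternalRow row, StructType schema) {
    tryInitRowAccessor(schema);
    return combinePartitionPathUnsafe(rowAccessor.getRecordPartitionPathValues(row));
  }
}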
+ * NOTE: This method has to stay final (so that it's easier for JIT compiler to apply certain + * optimizations, like inlining) */ - @Override - @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) - public String getPartitionPath(Row row) { - if (null == converterFn) { - converterFn = AvroConversionHelper.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE); - } - GenericRecord genericRecord = (GenericRecord) converterFn.apply(row); - return getKey(genericRecord).getPartitionPath(); - } - - void buildFieldPositionMapIfNeeded(StructType structType) { - if (this.structType == null) { - // parse simple fields - getRecordKeyFields().stream() - .filter(f -> !(f.contains("."))) - .forEach(f -> { - if (structType.getFieldIndex(f).isDefined()) { - recordKeyPositions.put(f, Collections.singletonList((Integer) (structType.getFieldIndex(f).get()))); - } else { - throw new HoodieKeyException("recordKey value not found for field: \"" + f + "\""); - } - }); - // parse nested fields - getRecordKeyFields().stream() - .filter(f -> f.contains(".")) - .forEach(f -> recordKeyPositions.put(f, RowKeyGeneratorHelper.getNestedFieldIndices(structType, f, true))); - // parse simple fields - if (getPartitionPathFields() != null) { - getPartitionPathFields().stream().filter(f -> !f.isEmpty()).filter(f -> !(f.contains("."))) - .forEach(f -> { - if (structType.getFieldIndex(f).isDefined()) { - partitionPathPositions.put(f, - Collections.singletonList((Integer) (structType.getFieldIndex(f).get()))); - } else { - partitionPathPositions.put(f, Collections.singletonList(-1)); - } - }); - // parse nested fields - getPartitionPathFields().stream().filter(f -> !f.isEmpty()).filter(f -> f.contains(".")) - .forEach(f -> partitionPathPositions.put(f, - RowKeyGeneratorHelper.getNestedFieldIndices(structType, f, false))); + protected final String combinePartitionPath(Object... partitionPathParts) { + return combinePartitionPathInternal( + JavaStringBuilder::new, + BuiltinKeyGenerator::toString, + this::tryEncodePartitionPath, + BuiltinKeyGenerator::handleNullOrEmptyPartitionPathPart, + partitionPathParts + ); + } + + /** + * NOTE: This method has to stay final (so that it's easier for JIT compiler to apply certain + * optimizations, like inlining) + */ + protected final UTF8String combinePartitionPathUnsafe(Object... partitionPathParts) { + return combinePartitionPathInternal( + UTF8StringBuilder::new, + BuiltinKeyGenerator::toUTF8String, + this::tryEncodePartitionPathUTF8, + BuiltinKeyGenerator::handleNullOrEmptyPartitionPathPartUTF8, + partitionPathParts + ); + } + + /** + * NOTE: This method has to stay final (so that it's easier for JIT compiler to apply certain + * optimizations, like inlining) + */ + protected final String combineRecordKey(Object... recordKeyParts) { + return combineRecordKeyInternal( + JavaStringBuilder::new, + BuiltinKeyGenerator::toString, + BuiltinKeyGenerator::handleNullRecordKey, + recordKeyParts + ); + } + + /** + * NOTE: This method has to stay final (so that it's easier for JIT compiler to apply certain + * optimizations, like inlining) + */ + protected final UTF8String combineRecordKeyUnsafe(Object... recordKeyParts) { + return combineRecordKeyInternal( + UTF8StringBuilder::new, + BuiltinKeyGenerator::toUTF8String, + BuiltinKeyGenerator::handleNullRecordKey, + recordKeyParts + ); + } + + /** + * NOTE: This method has to stay final (so that it's easier for JIT compiler to apply certain + * optimizations, like inlining) + */ + protected final String combineCompositeRecordKey(Object... 
recordKeyParts) { + return combineCompositeRecordKeyInternal( + JavaStringBuilder::new, + BuiltinKeyGenerator::toString, + BuiltinKeyGenerator::handleNullOrEmptyCompositeKeyPart, + BuiltinKeyGenerator::isNullOrEmptyCompositeKeyPart, + recordKeyParts + ); + } + + /** + * NOTE: This method has to stay final (so that it's easier for JIT compiler to apply certain + * optimizations, like inlining) + */ + protected final UTF8String combineCompositeRecordKeyUnsafe(Object... recordKeyParts) { + return combineCompositeRecordKeyInternal( + UTF8StringBuilder::new, + BuiltinKeyGenerator::toUTF8String, + BuiltinKeyGenerator::handleNullOrEmptyCompositeKeyPartUTF8, + BuiltinKeyGenerator::isNullOrEmptyCompositeKeyPartUTF8, + recordKeyParts + ); + } + + private S combineRecordKeyInternal( + Supplier> builderFactory, + Function converter, + Function emptyKeyPartHandler, + Object... recordKeyParts + ) { + if (recordKeyParts.length == 1) { + return emptyKeyPartHandler.apply(converter.apply(recordKeyParts[0])); + } + + StringBuilder sb = builderFactory.get(); + for (int i = 0; i < recordKeyParts.length; ++i) { + // NOTE: If record-key part has already been a string [[toString]] will be a no-op + sb.append(emptyKeyPartHandler.apply(converter.apply(recordKeyParts[i]))); + + if (i < recordKeyParts.length - 1) { + sb.appendJava(DEFAULT_RECORD_KEY_PARTS_SEPARATOR); + } + } + + return sb.build(); + } + + private S combineCompositeRecordKeyInternal( + Supplier> builderFactory, + Function converter, + Function emptyKeyPartHandler, + Predicate isNullOrEmptyKeyPartPredicate, + Object... recordKeyParts + ) { + boolean hasNonNullNonEmptyPart = false; + + StringBuilder sb = builderFactory.get(); + for (int i = 0; i < recordKeyParts.length; ++i) { + // NOTE: If record-key part has already been a string [[toString]] will be a no-op + S convertedKeyPart = emptyKeyPartHandler.apply(converter.apply(recordKeyParts[i])); + + sb.appendJava(recordKeyFields.get(i)); + sb.appendJava(COMPOSITE_KEY_FIELD_VALUE_INFIX); + sb.append(convertedKeyPart); + // This check is to validate that overall composite-key has at least one non-null, non-empty + // segment + hasNonNullNonEmptyPart |= !isNullOrEmptyKeyPartPredicate.test(convertedKeyPart); + + if (i < recordKeyParts.length - 1) { + sb.appendJava(DEFAULT_RECORD_KEY_PARTS_SEPARATOR); + } + } + + if (hasNonNullNonEmptyPart) { + return sb.build(); + } else { + throw new HoodieKeyException(String.format("All of the values for (%s) were either null or empty", recordKeyFields)); + } + } + + private S combinePartitionPathInternal(Supplier> builderFactory, + Function converter, + Function encoder, + Function emptyHandler, + Object... 
partitionPathParts) { + checkState(partitionPathParts.length == partitionPathFields.size()); + // Avoid creating [[StringBuilder]] in case there's just one partition-path part, + // and Hive-style of partitioning is not required + if (!hiveStylePartitioning && partitionPathParts.length == 1) { + return emptyHandler.apply(converter.apply(partitionPathParts[0])); + } + + StringBuilder sb = builderFactory.get(); + for (int i = 0; i < partitionPathParts.length; ++i) { + S partitionPathPartStr = encoder.apply(emptyHandler.apply(converter.apply(partitionPathParts[i]))); + + if (hiveStylePartitioning) { + sb.appendJava(partitionPathFields.get(i)) + .appendJava("=") + .append(partitionPathPartStr); + } else { + sb.append(partitionPathPartStr); + } + + if (i < partitionPathParts.length - 1) { + sb.appendJava(DEFAULT_PARTITION_PATH_SEPARATOR); } - this.structType = structType; + } + + return sb.build(); + } + + private String tryEncodePartitionPath(String partitionPathPart) { + return encodePartitionPath ? PartitionPathEncodeUtils.escapePathName(partitionPathPart) : partitionPathPart; + } + + private UTF8String tryEncodePartitionPathUTF8(UTF8String partitionPathPart) { + // NOTE: This method avoids [[UTF8String]] to [[String]] conversion (and back) unless + // partition-path encoding is enabled + return encodePartitionPath ? UTF8String.fromString(PartitionPathEncodeUtils.escapePathName(partitionPathPart.toString())) : partitionPathPart; + } + + private void tryInitRowConverter(StructType structType) { + if (rowConverter == null) { + synchronized (this) { + if (rowConverter == null) { + rowConverter = new SparkRowConverter(structType); + } + } + } + } + + protected static String requireNonNullNonEmptyKey(String key) { + if (key != null && key.length() > 0) { + return key; + } else { + throw new HoodieKeyException("Record key has to be non-empty string!"); + } + } + + protected static UTF8String requireNonNullNonEmptyKey(UTF8String key) { + if (key != null && key.numChars() > 0) { + return key; + } else { + throw new HoodieKeyException("Record key has to be non-empty string!"); + } + } + + protected static S handleNullRecordKey(S s) { + if (s == null || s.toString().isEmpty()) { + throw new HoodieKeyException("Record key has to be non-null!"); + } + + return s; + } + + private static UTF8String toUTF8String(Object o) { + if (o == null) { + return null; + } else if (o instanceof UTF8String) { + return (UTF8String) o; + } else { + // NOTE: If object is a [[String]], [[toString]] would be a no-op + return UTF8String.fromString(o.toString()); + } + } + + private static String toString(Object o) { + return o == null ? null : o.toString(); + } + + private static String handleNullOrEmptyCompositeKeyPart(Object keyPart) { + if (keyPart == null) { + return NULL_RECORDKEY_PLACEHOLDER; + } else { + // NOTE: [[toString]] is a no-op if key-part was already a [[String]] + String keyPartStr = keyPart.toString(); + return !keyPartStr.isEmpty() ? 
keyPartStr : EMPTY_RECORDKEY_PLACEHOLDER; + } + } + + private static UTF8String handleNullOrEmptyCompositeKeyPartUTF8(UTF8String keyPart) { + if (keyPart == null) { + return NULL_RECORD_KEY_PLACEHOLDER_UTF8; + } else if (keyPart.numChars() == 0) { + return EMPTY_RECORD_KEY_PLACEHOLDER_UTF8; + } + + return keyPart; + } + + @SuppressWarnings("StringEquality") + private static boolean isNullOrEmptyCompositeKeyPart(String keyPart) { + // NOTE: Converted key-part is compared against null/empty stub using ref-equality + // for performance reasons (it relies on the fact that we're using internalized + // constants) + return keyPart == NULL_RECORDKEY_PLACEHOLDER || keyPart == EMPTY_RECORDKEY_PLACEHOLDER; + } + + private static boolean isNullOrEmptyCompositeKeyPartUTF8(UTF8String keyPart) { + // NOTE: Converted key-part is compared against null/empty stub using ref-equality + // for performance reasons (it relies on the fact that we're using internalized + // constants) + return keyPart == NULL_RECORD_KEY_PLACEHOLDER_UTF8 || keyPart == EMPTY_RECORD_KEY_PLACEHOLDER_UTF8; + } + + private static String handleNullOrEmptyPartitionPathPart(Object partitionPathPart) { + if (partitionPathPart == null) { + return HUDI_DEFAULT_PARTITION_PATH; + } else { + // NOTE: [[toString]] is a no-op if key-part was already a [[String]] + String keyPartStr = partitionPathPart.toString(); + return keyPartStr.isEmpty() ? HUDI_DEFAULT_PARTITION_PATH : keyPartStr; + } + } + + private static UTF8String handleNullOrEmptyPartitionPathPartUTF8(UTF8String keyPart) { + if (keyPart == null || keyPart.numChars() == 0) { + return HUDI_DEFAULT_PARTITION_PATH_UTF8; + } + + return keyPart; + } + + /** + * Converts provided (raw) value extracted from the {@link InternalRow} object into a deserialized, + * JVM native format (for ex, converting {@code Long} into {@link Instant}, + * {@code Integer} to {@link LocalDate}, etc) + * + * This method allows to avoid costly full-row deserialization sequence. Note, that this method + * should be maintained in sync w/ + * + *
+   *   1. {@code RowEncoder#deserializerFor}, as well as
+   *   2. {@code HoodieAvroUtils#convertValueForAvroLogicalTypes}
    + * + * @param dataType target data-type of the given value + * @param value target value to be converted + */ + private static Object convertToLogicalDataType(DataType dataType, Object value) { + if (value == null) { + return null; + } else if (dataType instanceof TimestampType) { + // Provided value have to be [[Long]] in this case, representing micros since epoch + return new Timestamp((Long) value / 1000); + } else if (dataType instanceof DateType) { + // Provided value have to be [[Int]] in this case + return LocalDate.ofEpochDay((Integer) value); + } + + return value; + } + + protected static class SparkRowConverter { + private static final String STRUCT_NAME = "hoodieRowTopLevelField"; + private static final String NAMESPACE = "hoodieRow"; + + private final Function1 avroConverter; + private final SparkRowSerDe rowSerDe; + + SparkRowConverter(StructType schema) { + this.rowSerDe = HoodieSparkUtils.getCatalystRowSerDe(schema); + this.avroConverter = AvroConversionUtils.createConverterToAvro(schema, STRUCT_NAME, NAMESPACE); + } + + GenericRecord convertToAvro(Row row) { + return avroConverter.apply(row); + } + + GenericRecord convertToAvro(InternalRow row) { + return avroConverter.apply(rowSerDe.deserializeRow(row)); + } + } + + protected class SparkRowAccessor { + private final HoodieUnsafeRowUtils.NestedFieldPath[] recordKeyFieldsPaths; + private final HoodieUnsafeRowUtils.NestedFieldPath[] partitionPathFieldsPaths; + + SparkRowAccessor(StructType schema) { + this.recordKeyFieldsPaths = resolveNestedFieldPaths(getRecordKeyFieldNames(), schema); + this.partitionPathFieldsPaths = resolveNestedFieldPaths(getPartitionPathFields(), schema); + } + + public Object[] getRecordKeyParts(Row row) { + return getNestedFieldValues(row, recordKeyFieldsPaths); + } + + public Object[] getRecordPartitionPathValues(Row row) { + return getNestedFieldValues(row, partitionPathFieldsPaths); + } + + public Object[] getRecordKeyParts(InternalRow row) { + return getNestedFieldValues(row, recordKeyFieldsPaths); + } + + public Object[] getRecordPartitionPathValues(InternalRow row) { + return getNestedFieldValues(row, partitionPathFieldsPaths); + } + + private Object[] getNestedFieldValues(Row row, HoodieUnsafeRowUtils.NestedFieldPath[] nestedFieldsPaths) { + Object[] nestedFieldValues = new Object[nestedFieldsPaths.length]; + for (int i = 0; i < nestedFieldsPaths.length; ++i) { + nestedFieldValues[i] = HoodieUnsafeRowUtils$.MODULE$.getNestedRowValue(row, nestedFieldsPaths[i]); + } + return nestedFieldValues; + } + + private Object[] getNestedFieldValues(InternalRow row, HoodieUnsafeRowUtils.NestedFieldPath[] nestedFieldsPaths) { + Object[] nestedFieldValues = new Object[nestedFieldsPaths.length]; + for (int i = 0; i < nestedFieldsPaths.length; ++i) { + Object rawValue = HoodieUnsafeRowUtils$.MODULE$.getNestedInternalRowValue(row, nestedFieldsPaths[i]); + DataType dataType = tail(nestedFieldsPaths[i].parts())._2.dataType(); + + nestedFieldValues[i] = convertToLogicalDataType(dataType, rawValue); + } + + return nestedFieldValues; + } + + private HoodieUnsafeRowUtils.NestedFieldPath[] resolveNestedFieldPaths(List fieldPaths, StructType schema) { + try { + return fieldPaths.stream() + .map(fieldPath -> HoodieUnsafeRowUtils$.MODULE$.composeNestedFieldPath(schema, fieldPath)) + .toArray(HoodieUnsafeRowUtils.NestedFieldPath[]::new); + } catch (Exception e) { + LOG.error(String.format("Failed to resolve nested field-paths (%s) in schema (%s)", fieldPaths, schema), e); + throw new HoodieException("Failed to resolve 
nested field-paths", e); + } + } + } + + /** + * This is a generic interface closing the gap and unifying the {@link java.lang.StringBuilder} with + * {@link org.apache.hudi.unsafe.UTF8StringBuilder} implementations, allowing us to avoid code-duplication by performing + * most of the key-generation in a generic and unified way + * + * @param target string type this builder is producing (could either be native {@link String} + * or alternatively {@link UTF8String} + */ + private interface StringBuilder { + default StringBuilder append(S s) { + return appendJava(s.toString()); + } + + StringBuilder appendJava(String s); + + S build(); + } + + private static class JavaStringBuilder implements StringBuilder { + private final java.lang.StringBuilder sb = new java.lang.StringBuilder(); + + @Override + public StringBuilder appendJava(String s) { + sb.append(s); + return this; + } + + @Override + public String build() { + return sb.toString(); + } + } + + private static class UTF8StringBuilder implements StringBuilder { + private final org.apache.hudi.unsafe.UTF8StringBuilder sb = new org.apache.hudi.unsafe.UTF8StringBuilder(); + + @Override + public StringBuilder appendJava(String s) { + sb.append(s); + return this; + } + + @Override + public StringBuilder append(UTF8String s) { + sb.append(s); + return this; + } + + @Override + public UTF8String build() { + return sb.build(); } } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java index 36c8345593539..1eac7579757c7 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java @@ -21,12 +21,19 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; import java.util.Arrays; import java.util.stream.Collectors; /** - * Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs. + * Key generator prefixing field names before corresponding record-key parts. + * + *
    + * For example, for the schema of {@code { "key": string, "value": bytes }}, and corresponding record + * {@code { "key": "foo" }}, record-key "key:foo" will be produced. */ public class ComplexKeyGenerator extends BuiltinKeyGenerator { @@ -34,11 +41,15 @@ public class ComplexKeyGenerator extends BuiltinKeyGenerator { public ComplexKeyGenerator(TypedProperties props) { super(props); - this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY) - .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); - this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY) - .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); - complexAvroKeyGenerator = new ComplexAvroKeyGenerator(props); + this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + this.complexAvroKeyGenerator = new ComplexAvroKeyGenerator(props); } @Override @@ -53,15 +64,25 @@ public String getPartitionPath(GenericRecord record) { @Override public String getRecordKey(Row row) { - buildFieldPositionMapIfNeeded(row.schema()); - return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeyPositions, true); + tryInitRowAccessor(row.schema()); + return combineCompositeRecordKey(rowAccessor.getRecordKeyParts(row)); + } + + @Override + public UTF8String getRecordKey(InternalRow internalRow, StructType schema) { + tryInitRowAccessor(schema); + return combineCompositeRecordKeyUnsafe(rowAccessor.getRecordKeyParts(internalRow)); } @Override public String getPartitionPath(Row row) { - buildFieldPositionMapIfNeeded(row.schema()); - return RowKeyGeneratorHelper.getPartitionPathFromRow(row, getPartitionPathFields(), - hiveStylePartitioning, partitionPathPositions); + tryInitRowAccessor(row.schema()); + return combinePartitionPath(rowAccessor.getRecordPartitionPathValues(row)); } + @Override + public UTF8String getPartitionPath(InternalRow row, StructType schema) { + tryInitRowAccessor(schema); + return combinePartitionPathUnsafe(rowAccessor.getRecordPartitionPathValues(row)); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/CustomKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/CustomKeyGenerator.java index 6727b79d78477..fcd94bb4f1550 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/CustomKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/CustomKeyGenerator.java @@ -18,16 +18,22 @@ package org.apache.hudi.keygen; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieKeyException; import org.apache.hudi.exception.HoodieKeyGeneratorException; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import org.apache.avro.generic.GenericRecord; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import 
org.apache.spark.unsafe.types.UTF8String; import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import java.util.stream.Collectors; /** @@ -42,16 +48,29 @@ * field in the partition path, use field1:simple 3. If you want your table to be non partitioned, simply leave it as blank. * * RecordKey is internally generated using either SimpleKeyGenerator or ComplexKeyGenerator. + * + * @deprecated */ +@Deprecated public class CustomKeyGenerator extends BuiltinKeyGenerator { private final CustomAvroKeyGenerator customAvroKeyGenerator; public CustomKeyGenerator(TypedProperties props) { - super(props); - this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY).split(",")).map(String::trim).collect(Collectors.toList()); - this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY).split(",")).map(String::trim).collect(Collectors.toList()); - customAvroKeyGenerator = new CustomAvroKeyGenerator(props); + // NOTE: We have to strip partition-path configuration, since it could only be interpreted by + // this key-gen + super(stripPartitionPathConfig(props)); + this.recordKeyFields = + Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")) + .map(String::trim) + .collect(Collectors.toList()); + String partitionPathFields = props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()); + this.partitionPathFields = partitionPathFields == null + ? Collections.emptyList() + : Arrays.stream(partitionPathFields.split(",")).map(String::trim).collect(Collectors.toList()); + this.customAvroKeyGenerator = new CustomAvroKeyGenerator(props); + + validateRecordKeyFields(); } @Override @@ -66,18 +85,22 @@ public String getPartitionPath(GenericRecord record) { @Override public String getRecordKey(Row row) { - validateRecordKeyFields(); - return getRecordKeyFields().size() == 1 - ? new SimpleKeyGenerator(config).getRecordKey(row) + return getRecordKeyFieldNames().size() == 1 + ? 
new SimpleKeyGenerator(config, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()), null).getRecordKey(row) : new ComplexKeyGenerator(config).getRecordKey(row); } @Override public String getPartitionPath(Row row) { - return getPartitionPath(Option.empty(), Option.of(row)); + return getPartitionPath(Option.empty(), Option.of(row), Option.empty()); + } + + @Override + public UTF8String getPartitionPath(InternalRow row, StructType schema) { + return UTF8String.fromString(getPartitionPath(Option.empty(), Option.empty(), Option.of(Pair.of(row, schema)))); } - private String getPartitionPath(Option record, Option row) { + private String getPartitionPath(Option record, Option row, Option> internalRowStructTypePair) { if (getPartitionPathFields() == null) { throw new HoodieKeyException("Unable to find field names for partition path in cfg"); } @@ -90,7 +113,7 @@ private String getPartitionPath(Option record, Option row) { return ""; } for (String field : getPartitionPathFields()) { - String[] fieldWithType = field.split(customAvroKeyGenerator.getSplitRegex()); + String[] fieldWithType = field.split(CustomAvroKeyGenerator.SPLIT_REGEX); if (fieldWithType.length != 2) { throw new HoodieKeyGeneratorException("Unable to find field names for partition path in proper format"); } @@ -101,19 +124,25 @@ private String getPartitionPath(Option record, Option row) { case SIMPLE: if (record.isPresent()) { partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(record.get())); - } else { + } else if (row.isPresent()) { partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(row.get())); + } else { + partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(internalRowStructTypePair.get().getKey(), + internalRowStructTypePair.get().getValue())); } break; case TIMESTAMP: try { if (record.isPresent()) { partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(record.get())); - } else { + } else if (row.isPresent()) { partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(row.get())); + } else { + partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(internalRowStructTypePair.get().getKey(), + internalRowStructTypePair.get().getValue())); } } catch (IOException ioe) { - throw new HoodieKeyGeneratorException("Unable to initialise TimestampBasedKeyGenerator class"); + throw new HoodieKeyGeneratorException("Unable to initialise TimestampBasedKeyGenerator class", ioe); } break; default: @@ -127,9 +156,18 @@ private String getPartitionPath(Option record, Option row) { } private void validateRecordKeyFields() { - if (getRecordKeyFields() == null || getRecordKeyFields().isEmpty()) { + if (getRecordKeyFieldNames() == null || getRecordKeyFieldNames().isEmpty()) { throw new HoodieKeyException("Unable to find field names for record key in cfg"); } } + + private static TypedProperties stripPartitionPathConfig(TypedProperties props) { + TypedProperties filtered = new TypedProperties(props); + // NOTE: We have to stub it out w/ empty string, since we properties are: + // - Expected to bear this config + // - Can't be stubbed out w/ null + filtered.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), ""); + return filtered; + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/GlobalDeleteKeyGenerator.java 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/GlobalDeleteKeyGenerator.java index 5c9a813a2c314..7fcc16094eadc 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/GlobalDeleteKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/GlobalDeleteKeyGenerator.java @@ -18,10 +18,14 @@ package org.apache.hudi.keygen; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import org.apache.avro.generic.GenericRecord; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; import java.util.ArrayList; import java.util.Arrays; @@ -36,8 +40,13 @@ public class GlobalDeleteKeyGenerator extends BuiltinKeyGenerator { private final GlobalAvroDeleteKeyGenerator globalAvroDeleteKeyGenerator; public GlobalDeleteKeyGenerator(TypedProperties config) { super(config); - this.recordKeyFields = Arrays.asList(config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY).split(",")); - globalAvroDeleteKeyGenerator = new GlobalAvroDeleteKeyGenerator(config); + this.recordKeyFields = Arrays.asList(config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")); + this.globalAvroDeleteKeyGenerator = new GlobalAvroDeleteKeyGenerator(config); + } + + @Override + public List getPartitionPathFields() { + return new ArrayList<>(); } @Override @@ -51,19 +60,25 @@ public String getPartitionPath(GenericRecord record) { } @Override - public List getPartitionPathFields() { - return new ArrayList<>(); + public String getRecordKey(Row row) { + tryInitRowAccessor(row.schema()); + return combineCompositeRecordKey(rowAccessor.getRecordKeyParts(row)); } @Override - public String getRecordKey(Row row) { - buildFieldPositionMapIfNeeded(row.schema()); - return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeyPositions, true); + public UTF8String getRecordKey(InternalRow internalRow, StructType schema) { + tryInitRowAccessor(schema); + return combineCompositeRecordKeyUnsafe(rowAccessor.getRecordKeyParts(internalRow)); } @Override public String getPartitionPath(Row row) { return globalAvroDeleteKeyGenerator.getEmptyPartition(); } + + @Override + public UTF8String getPartitionPath(InternalRow row, StructType schema) { + return UTF8String.EMPTY_UTF8; + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java index 543e1349e9f7c..ccd37dc9ce324 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java @@ -20,30 +20,59 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; +import java.util.Arrays; +import java.util.Collections; import java.util.List; +import java.util.stream.Collectors; /** - * Simple Key generator for unpartitioned Hive Tables. 
+ * Simple Key generator for non-partitioned Hive Tables. */ -public class NonpartitionedKeyGenerator extends SimpleKeyGenerator { +public class NonpartitionedKeyGenerator extends BuiltinKeyGenerator { private final NonpartitionedAvroKeyGenerator nonpartitionedAvroKeyGenerator; - public NonpartitionedKeyGenerator(TypedProperties config) { - super(config); - nonpartitionedAvroKeyGenerator = new NonpartitionedAvroKeyGenerator(config); + public NonpartitionedKeyGenerator(TypedProperties props) { + super(props); + this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()) + .split(",")) + .map(String::trim) + .collect(Collectors.toList()); + this.partitionPathFields = Collections.emptyList(); + this.nonpartitionedAvroKeyGenerator = new NonpartitionedAvroKeyGenerator(props); } @Override - public String getPartitionPath(GenericRecord record) { - return nonpartitionedAvroKeyGenerator.getPartitionPath(record); + public List getPartitionPathFields() { + return nonpartitionedAvroKeyGenerator.getPartitionPathFields(); } @Override - public List getPartitionPathFields() { - return nonpartitionedAvroKeyGenerator.getPartitionPathFields(); + public String getRecordKey(GenericRecord record) { + return nonpartitionedAvroKeyGenerator.getRecordKey(record); + } + + @Override + public String getRecordKey(Row row) { + tryInitRowAccessor(row.schema()); + return combineRecordKey(rowAccessor.getRecordKeyParts(row)); + } + + @Override + public UTF8String getRecordKey(InternalRow internalRow, StructType schema) { + tryInitRowAccessor(schema); + return combineRecordKeyUnsafe(rowAccessor.getRecordKeyParts(internalRow)); + } + + @Override + public String getPartitionPath(GenericRecord record) { + return nonpartitionedAvroKeyGenerator.getPartitionPath(record); } @Override @@ -51,5 +80,9 @@ public String getPartitionPath(Row row) { return nonpartitionedAvroKeyGenerator.getEmptyPartition(); } + @Override + public UTF8String getPartitionPath(InternalRow row, StructType schema) { + return UTF8String.EMPTY_UTF8; + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java deleted file mode 100644 index dd0d4c5c5318b..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.keygen; - -import org.apache.hudi.exception.HoodieKeyException; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import scala.Option; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH; -import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR; -import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER; -import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER; - -/** - * Helper class to fetch fields from Row. - */ -public class RowKeyGeneratorHelper { - - /** - * Generates record key for the corresponding {@link Row}. - * @param row instance of {@link Row} of interest - * @param recordKeyFields record key fields as a list - * @param recordKeyPositions record key positions for the corresponding record keys in {@code recordKeyFields} - * @param prefixFieldName {@code true} if field name need to be prefixed in the returned result. {@code false} otherwise. - * @return the record key thus generated - */ - public static String getRecordKeyFromRow(Row row, List recordKeyFields, Map> recordKeyPositions, boolean prefixFieldName) { - AtomicBoolean keyIsNullOrEmpty = new AtomicBoolean(true); - String toReturn = recordKeyFields.stream().map(field -> { - String val = null; - List fieldPositions = recordKeyPositions.get(field); - if (fieldPositions.size() == 1) { // simple field - Integer fieldPos = fieldPositions.get(0); - if (row.isNullAt(fieldPos)) { - val = NULL_RECORDKEY_PLACEHOLDER; - } else { - val = row.getAs(field).toString(); - if (val.isEmpty()) { - val = EMPTY_RECORDKEY_PLACEHOLDER; - } else { - keyIsNullOrEmpty.set(false); - } - } - } else { // nested fields - val = getNestedFieldVal(row, recordKeyPositions.get(field)).toString(); - if (!val.contains(NULL_RECORDKEY_PLACEHOLDER) && !val.contains(EMPTY_RECORDKEY_PLACEHOLDER)) { - keyIsNullOrEmpty.set(false); - } - } - return prefixFieldName ? (field + ":" + val) : val; - }).collect(Collectors.joining(",")); - if (keyIsNullOrEmpty.get()) { - throw new HoodieKeyException("recordKey value: \"" + toReturn + "\" for fields: \"" + Arrays.toString(recordKeyFields.toArray()) + "\" cannot be null or empty."); - } - return toReturn; - } - - /** - * Generates partition path for the corresponding {@link Row}. - * @param row instance of {@link Row} of interest - * @param partitionPathFields partition path fields as a list - * @param hiveStylePartitioning {@code true} if hive style partitioning is set. 
{@code false} otherwise - * @param partitionPathPositions partition path positions for the corresponding fields in {@code partitionPathFields} - * @return the generated partition path for the row - */ - public static String getPartitionPathFromRow(Row row, List partitionPathFields, boolean hiveStylePartitioning, Map> partitionPathPositions) { - return IntStream.range(0, partitionPathFields.size()).mapToObj(idx -> { - String field = partitionPathFields.get(idx); - String val = null; - List fieldPositions = partitionPathPositions.get(field); - if (fieldPositions.size() == 1) { // simple - Integer fieldPos = fieldPositions.get(0); - // for partition path, if field is not found, index will be set to -1 - if (fieldPos == -1 || row.isNullAt(fieldPos)) { - val = DEFAULT_PARTITION_PATH; - } else { - val = row.getAs(field).toString(); - if (val.isEmpty()) { - val = DEFAULT_PARTITION_PATH; - } - } - if (hiveStylePartitioning) { - val = field + "=" + val; - } - } else { // nested - Object nestedVal = getNestedFieldVal(row, partitionPathPositions.get(field)); - if (nestedVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER) || nestedVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) { - val = hiveStylePartitioning ? field + "=" + DEFAULT_PARTITION_PATH : DEFAULT_PARTITION_PATH; - } else { - val = hiveStylePartitioning ? field + "=" + nestedVal.toString() : nestedVal.toString(); - } - } - return val; - }).collect(Collectors.joining(DEFAULT_PARTITION_PATH_SEPARATOR)); - } - - /** - * Fetch the field value located at the positions requested for. - * @param row instance of {@link Row} of interest - * @param positions tree style positions where the leaf node need to be fetched and returned - * @return the field value as per the positions requested for. - */ - public static Object getNestedFieldVal(Row row, List positions) { - if (positions.size() == 1 && positions.get(0) == -1) { - return DEFAULT_PARTITION_PATH; - } - int index = 0; - int totalCount = positions.size(); - Row valueToProcess = row; - Object toReturn = null; - - while (index < totalCount) { - if (index < totalCount - 1) { - if (valueToProcess.isNullAt(positions.get(index))) { - toReturn = NULL_RECORDKEY_PLACEHOLDER; - break; - } - valueToProcess = (Row) valueToProcess.get(positions.get(index)); - } else { // last index - if (null != valueToProcess.getAs(positions.get(index)) && valueToProcess.getAs(positions.get(index)).toString().isEmpty()) { - toReturn = EMPTY_RECORDKEY_PLACEHOLDER; - break; - } - toReturn = valueToProcess.getAs(positions.get(index)); - } - index++; - } - return toReturn; - } - - /** - * Generate the tree style positions for the field requested for as per the defined struct type. - * @param structType schema of interest - * @param field field of interest for which the positions are requested for - * @param isRecordKey {@code true} if the field requested for is a record key. {@code false} incase of a partition path. - * @return the positions of the field as per the struct type. 
- */ - public static List getNestedFieldIndices(StructType structType, String field, boolean isRecordKey) { - String[] slices = field.split("\\."); - List positions = new ArrayList<>(); - int index = 0; - int totalCount = slices.length; - while (index < totalCount) { - String slice = slices[index]; - Option curIndexOpt = structType.getFieldIndex(slice); - if (curIndexOpt.isDefined()) { - int curIndex = (int) curIndexOpt.get(); - positions.add(curIndex); - final StructField nestedField = structType.fields()[curIndex]; - if (index < totalCount - 1) { - if (!(nestedField.dataType() instanceof StructType)) { - if (isRecordKey) { - throw new HoodieKeyException("Nested field should be of type StructType " + nestedField); - } else { - positions = Collections.singletonList(-1); - break; - } - } - structType = (StructType) nestedField.dataType(); - } - } else { - if (isRecordKey) { - throw new HoodieKeyException("Can't find " + slice + " in StructType for the field " + field); - } else { - positions = Collections.singletonList(-1); - break; - } - } - index++; - } - return positions; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java index 332686d378c0b..8c43e19baaf55 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java @@ -20,8 +20,12 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; import java.util.Collections; @@ -33,8 +37,8 @@ public class SimpleKeyGenerator extends BuiltinKeyGenerator { private final SimpleAvroKeyGenerator simpleAvroKeyGenerator; public SimpleKeyGenerator(TypedProperties props) { - this(props, props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY), - props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY)); + this(props, props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()), + props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); } SimpleKeyGenerator(TypedProperties props, String partitionPathField) { @@ -43,11 +47,15 @@ public SimpleKeyGenerator(TypedProperties props) { SimpleKeyGenerator(TypedProperties props, String recordKeyField, String partitionPathField) { super(props); - this.recordKeyFields = recordKeyField == null - ? Collections.emptyList() - : Collections.singletonList(recordKeyField); - this.partitionPathFields = Collections.singletonList(partitionPathField); - simpleAvroKeyGenerator = new SimpleAvroKeyGenerator(props, recordKeyField, partitionPathField); + // Make sure key-generator is configured properly + ValidationUtils.checkArgument(recordKeyField == null || !recordKeyField.isEmpty(), + "Record key field has to be non-empty!"); + ValidationUtils.checkArgument(partitionPathField == null || !partitionPathField.isEmpty(), + "Partition path field has to be non-empty!"); + + this.recordKeyFields = recordKeyField == null ? Collections.emptyList() : Collections.singletonList(recordKeyField); + this.partitionPathFields = partitionPathField == null ? 
Collections.emptyList() : Collections.singletonList(partitionPathField); + this.simpleAvroKeyGenerator = new SimpleAvroKeyGenerator(props, recordKeyField, partitionPathField); } @Override @@ -62,14 +70,43 @@ public String getPartitionPath(GenericRecord record) { @Override public String getRecordKey(Row row) { - buildFieldPositionMapIfNeeded(row.schema()); - return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeyPositions, false); + tryInitRowAccessor(row.schema()); + + Object[] recordKeys = rowAccessor.getRecordKeyParts(row); + // NOTE: [[SimpleKeyGenerator]] is restricted to allow only primitive (non-composite) + // record-key field + if (recordKeys[0] == null) { + return handleNullRecordKey(null); + } else { + return requireNonNullNonEmptyKey(recordKeys[0].toString()); + } + } + + @Override + public UTF8String getRecordKey(InternalRow internalRow, StructType schema) { + tryInitRowAccessor(schema); + + Object[] recordKeyValues = rowAccessor.getRecordKeyParts(internalRow); + // NOTE: [[SimpleKeyGenerator]] is restricted to allow only primitive (non-composite) + // record-key field + if (recordKeyValues[0] == null) { + return handleNullRecordKey(null); + } else if (recordKeyValues[0] instanceof UTF8String) { + return requireNonNullNonEmptyKey((UTF8String) recordKeyValues[0]); + } else { + return requireNonNullNonEmptyKey(UTF8String.fromString(recordKeyValues[0].toString())); + } } @Override public String getPartitionPath(Row row) { - buildFieldPositionMapIfNeeded(row.schema()); - return RowKeyGeneratorHelper.getPartitionPathFromRow(row, getPartitionPathFields(), - hiveStylePartitioning, partitionPathPositions); + tryInitRowAccessor(row.schema()); + return combinePartitionPath(rowAccessor.getRecordPartitionPathValues(row)); + } + + @Override + public UTF8String getPartitionPath(InternalRow row, StructType schema) { + tryInitRowAccessor(schema); + return combinePartitionPathUnsafe(rowAccessor.getRecordPartitionPathValues(row)); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SparkKeyGeneratorInterface.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SparkKeyGeneratorInterface.java index 77abf1514cfe9..977ff709bb1ab 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SparkKeyGeneratorInterface.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SparkKeyGeneratorInterface.java @@ -18,14 +18,65 @@ package org.apache.hudi.keygen; +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIMethod; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; /** - * Spark key generator interface. 
+ * Spark-specific {@link KeyGenerator} interface extension allowing implementation to + * specifically implement record-key, partition-path generation w/o the need for (expensive) + * conversion from Spark internal representation (for ex, to Avro) */ public interface SparkKeyGeneratorInterface extends KeyGeneratorInterface { + /** + * Extracts record key from Spark's {@link Row} + * + * @param row instance of {@link Row} from which record-key is extracted + * @return record's (primary) key + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) String getRecordKey(Row row); + /** + * Extracts record key from Spark's {@link InternalRow} + * + * NOTE: Difference b/w {@link Row} and {@link InternalRow} is that {@link InternalRow} could + * internally hold just a binary representation of the data, while {@link Row} has it + * deserialized into JVM-native representation (like {@code Integer}, {@code Long}, + * {@code String}, etc) + * + * @param row instance of {@link InternalRow} from which record-key is extracted + * @param schema schema {@link InternalRow} is adhering to + * @return record-key as instance of {@link UTF8String} + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + UTF8String getRecordKey(InternalRow row, StructType schema); + + /** + * Extracts partition-path from {@link Row} + * + * @param row instance of {@link Row} from which partition-path is extracted + * @return record's partition-path + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) String getPartitionPath(Row row); + + /** + * Extracts partition-path from Spark's {@link InternalRow} + * + * NOTE: Difference b/w {@link Row} and {@link InternalRow} is that {@link InternalRow} could + * internally hold just a binary representation of the data, while {@link Row} has it + * deserialized into JVM-native representation (like {@code Integer}, {@code Long}, + * {@code String}, etc) + * + * @param row instance of {@link InternalRow} from which record-key is extracted + * @param schema schema {@link InternalRow} is adhering to + * @return partition-path as instance of {@link UTF8String} + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + UTF8String getPartitionPath(InternalRow row, StructType schema); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java index 859269c751a80..f090320ccbcc3 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java @@ -23,12 +23,14 @@ import org.apache.hudi.exception.HoodieKeyGeneratorException; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; import java.io.IOException; +import java.util.Objects; -import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH; -import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER; -import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER; +import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; /** * Key generator, that relies on timestamps for partitioning field. Still picks record key by name. 
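The Row vs. InternalRow distinction spelled out in the interface javadoc above is easiest to see side by side. The hypothetical, self-contained accessor below (not a Hudi KeyGenerator; the class and field names are made up) reads the same logical key field once through the deserialized Row API and once through the binary InternalRow API, returning UTF8String so no intermediate Avro or String conversion is needed.

import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

public class RecordKeySketch {
  private final String keyField;

  public RecordKeySketch(String keyField) {
    this.keyField = keyField;
  }

  // Row path: values are already deserialized into JVM objects (String, Integer, ...).
  public String getRecordKey(Row row) {
    Object value = row.getAs(keyField);
    return value == null ? null : value.toString();
  }

  // InternalRow path: values stay in Spark's internal binary form; strings come back as UTF8String.
  public UTF8String getRecordKey(InternalRow row, StructType schema) {
    int ordinal = schema.fieldIndex(keyField);
    return row.isNullAt(ordinal) ? null : row.getUTF8String(ordinal);
  }
}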
@@ -38,8 +40,8 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator { private final TimestampBasedAvroKeyGenerator timestampBasedAvroKeyGenerator; public TimestampBasedKeyGenerator(TypedProperties config) throws IOException { - this(config, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY), - config.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY)); + this(config, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()), + config.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); } TimestampBasedKeyGenerator(TypedProperties config, String partitionPathField) throws IOException { @@ -58,25 +60,44 @@ public String getPartitionPath(GenericRecord record) { @Override public String getRecordKey(Row row) { - buildFieldPositionMapIfNeeded(row.schema()); - return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeyPositions, false); + tryInitRowAccessor(row.schema()); + return combineRecordKey(rowAccessor.getRecordKeyParts(row)); + } + + @Override + public UTF8String getRecordKey(InternalRow internalRow, StructType schema) { + tryInitRowAccessor(schema); + return combineRecordKeyUnsafe(rowAccessor.getRecordKeyParts(internalRow)); } @Override public String getPartitionPath(Row row) { - Object fieldVal = null; - buildFieldPositionMapIfNeeded(row.schema()); - Object partitionPathFieldVal = RowKeyGeneratorHelper.getNestedFieldVal(row, partitionPathPositions.get(getPartitionPathFields().get(0))); + tryInitRowAccessor(row.schema()); + Object[] partitionPathValues = rowAccessor.getRecordPartitionPathValues(row); + return getFormattedPartitionPath(partitionPathValues[0]); + } + + @Override + public UTF8String getPartitionPath(InternalRow row, StructType schema) { + tryInitRowAccessor(schema); + Object[] partitionPathValues = rowAccessor.getRecordPartitionPathValues(row); + return UTF8String.fromString(getFormattedPartitionPath(partitionPathValues[0])); + } + + private String getFormattedPartitionPath(Object partitionPathPart) { + Object fieldVal; + if (partitionPathPart == null || Objects.equals(partitionPathPart, HUDI_DEFAULT_PARTITION_PATH)) { + fieldVal = timestampBasedAvroKeyGenerator.getDefaultPartitionVal(); + } else if (partitionPathPart instanceof UTF8String) { + fieldVal = partitionPathPart.toString(); + } else { + fieldVal = partitionPathPart; + } + try { - if (partitionPathFieldVal == null || partitionPathFieldVal.toString().contains(DEFAULT_PARTITION_PATH) || partitionPathFieldVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER) - || partitionPathFieldVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) { - fieldVal = timestampBasedAvroKeyGenerator.getDefaultPartitionVal(); - } else { - fieldVal = partitionPathFieldVal; - } return timestampBasedAvroKeyGenerator.getPartitionPath(fieldVal); } catch (Exception e) { - throw new HoodieKeyGeneratorException("Unable to parse input partition field :" + fieldVal, e); + throw new HoodieKeyGeneratorException(String.format("Failed to properly format partition-path (%s)", fieldVal), e); } } } \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java new file mode 100644 index 0000000000000..165b27d6ce283 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java @@ -0,0 +1,122 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.keygen.factory; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieKeyGeneratorException; +import org.apache.hudi.keygen.ComplexKeyGenerator; +import org.apache.hudi.keygen.CustomKeyGenerator; +import org.apache.hudi.keygen.GlobalDeleteKeyGenerator; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.NonpartitionedKeyGenerator; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.keygen.TimestampBasedKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +/** + * Factory help to create {@link org.apache.hudi.keygen.KeyGenerator}. + *

    + * This factory will try {@link HoodieWriteConfig#KEYGENERATOR_CLASS_NAME} firstly, this ensures the class prop + * will not be overwritten by {@link KeyGeneratorType} + */ +public class HoodieSparkKeyGeneratorFactory { + + private static final Logger LOG = LoggerFactory.getLogger(HoodieSparkKeyGeneratorFactory.class); + + private static final Map COMMON_TO_SPARK_KEYGENERATOR = new HashMap<>(); + static { + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.ComplexAvroKeyGenerator", + "org.apache.hudi.keygen.ComplexKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.CustomAvroKeyGenerator", + "org.apache.hudi.keygen.CustomKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.GlobalAvroDeleteKeyGenerator", + "org.apache.hudi.keygen.GlobalDeleteKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator", + "org.apache.hudi.keygen.NonpartitionedKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.SimpleAvroKeyGenerator", + "org.apache.hudi.keygen.SimpleKeyGenerator"); + COMMON_TO_SPARK_KEYGENERATOR.put("org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator", + "org.apache.hudi.keygen.TimestampBasedKeyGenerator"); + } + + public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { + String keyGeneratorClass = getKeyGeneratorClassName(props); + try { + return (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props); + } catch (Throwable e) { + throw new IOException("Could not load key generator class " + keyGeneratorClass, e); + } + } + + public static String getKeyGeneratorClassName(TypedProperties props) { + String keyGeneratorClass = props.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), null); + + if (StringUtils.isNullOrEmpty(keyGeneratorClass)) { + String keyGeneratorType = props.getString(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), KeyGeneratorType.SIMPLE.name()); + LOG.info("The value of {} is empty, use SIMPLE", HoodieWriteConfig.KEYGENERATOR_TYPE.key()); + KeyGeneratorType keyGeneratorTypeEnum; + try { + keyGeneratorTypeEnum = KeyGeneratorType.valueOf(keyGeneratorType.toUpperCase(Locale.ROOT)); + } catch (IllegalArgumentException e) { + throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); + } + switch (keyGeneratorTypeEnum) { + case SIMPLE: + keyGeneratorClass = SimpleKeyGenerator.class.getName(); + break; + case COMPLEX: + keyGeneratorClass = ComplexKeyGenerator.class.getName(); + break; + case TIMESTAMP: + keyGeneratorClass = TimestampBasedKeyGenerator.class.getName(); + break; + case CUSTOM: + keyGeneratorClass = CustomKeyGenerator.class.getName(); + break; + case NON_PARTITION: + keyGeneratorClass = NonpartitionedKeyGenerator.class.getName(); + break; + case GLOBAL_DELETE: + keyGeneratorClass = GlobalDeleteKeyGenerator.class.getName(); + break; + default: + throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); + } + } + return keyGeneratorClass; + } + + /** + * Convert hoodie-common KeyGenerator to SparkKeyGeneratorInterface implement. 
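A short usage sketch of the resolution order implemented by getKeyGeneratorClassName above: an explicitly configured key-generator class wins, otherwise the class is derived from the key-generator type, and with neither set the factory falls back to SimpleKeyGenerator. The property values below are illustrative, and the empty TypedProperties construction is assumed for brevity.

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;

public class KeyGenFactorySketch {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();

    // Nothing configured -> resolves to SimpleKeyGenerator.
    System.out.println(HoodieSparkKeyGeneratorFactory.getKeyGeneratorClassName(props));

    // Only the type is configured -> the corresponding Spark key generator class is chosen.
    props.setProperty(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), "COMPLEX");
    System.out.println(HoodieSparkKeyGeneratorFactory.getKeyGeneratorClassName(props));

    // An explicit class name always takes precedence over the type property.
    props.setProperty(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), "org.apache.hudi.keygen.CustomKeyGenerator");
    System.out.println(HoodieSparkKeyGeneratorFactory.getKeyGeneratorClassName(props));
  }
}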
+ */ + public static String convertToSparkKeyGenerator(String keyGeneratorClassName) { + return COMMON_TO_SPARK_KEYGENERATOR.getOrDefault(keyGeneratorClassName, keyGeneratorClassName); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java new file mode 100644 index 0000000000000..272d3d479852a --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.metrics.Registry; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.metrics.DistributedRegistry; + +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.conf.Configuration; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter { + + private static final Logger LOG = LogManager.getLogger(SparkHoodieBackedTableMetadataWriter.class); + + /** + * Return a Spark based implementation of {@code HoodieTableMetadataWriter} which can be used to + * write to the metadata table. + *

    + * If the metadata table does not exist, an attempt is made to bootstrap it but there is no guaranteed that + * table will end up bootstrapping at this time. + * + * @param conf + * @param writeConfig + * @param context + * @param actionMetadata + * @param inflightInstantTimestamp Timestamp of an instant which is in-progress. This instant is ignored while + * attempting to bootstrap the table. + * @return An instance of the {@code HoodieTableMetadataWriter} + */ + public static HoodieTableMetadataWriter create(Configuration conf, + HoodieWriteConfig writeConfig, + HoodieEngineContext context, + Option actionMetadata, + Option inflightInstantTimestamp) { + return new SparkHoodieBackedTableMetadataWriter(conf, writeConfig, context, actionMetadata, + inflightInstantTimestamp); + } + + public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, + HoodieEngineContext context) { + return create(conf, writeConfig, context, Option.empty(), Option.empty()); + } + + SparkHoodieBackedTableMetadataWriter(Configuration hadoopConf, + HoodieWriteConfig writeConfig, + HoodieEngineContext engineContext, + Option actionMetadata, + Option inflightInstantTimestamp) { + super(hadoopConf, writeConfig, engineContext, actionMetadata, inflightInstantTimestamp); + } + + @Override + protected void initRegistry() { + if (metadataWriteConfig.isMetricsOn()) { + Registry registry; + if (metadataWriteConfig.isExecutorMetricsEnabled()) { + registry = Registry.getRegistry("HoodieMetadata", DistributedRegistry.class.getName()); + } else { + registry = Registry.getRegistry("HoodieMetadata"); + } + this.metrics = Option.of(new HoodieMetadataMetrics(registry)); + } else { + this.metrics = Option.empty(); + } + } + + @Override + protected void initialize(HoodieEngineContext engineContext, + Option actionMetadata, + Option inflightInstantTimestamp) { + try { + metrics.map(HoodieMetadataMetrics::registry).ifPresent(registry -> { + if (registry instanceof DistributedRegistry) { + HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext; + ((DistributedRegistry) registry).register(sparkEngineContext.getJavaSparkContext()); + } + }); + + if (enabled) { + initializeIfNeeded(dataMetaClient, actionMetadata, inflightInstantTimestamp); + } + } catch (IOException e) { + LOG.error("Failed to initialize metadata table. Disabling the writer.", e); + enabled = false; + } + } + + @Override + protected void commit(String instantTime, Map> partitionRecordsMap, boolean canTriggerTableService) { + ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet."); + ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled"); + HoodieData preppedRecords = prepRecords(partitionRecordsMap); + JavaRDD preppedRecordRDD = HoodieJavaRDD.getJavaRDD(preppedRecords); + + try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) { + if (canTriggerTableService) { + // trigger compaction before doing the delta commit. 
This ensures that if this delta commit succeeds in the metadata table but fails in the data table, + // we will not have compacted the metadata table with that uncommitted data included; once compacted into base files, such data can no longer be ignored while reading from the metadata + // table (since the reader filters out uncommitted data only from delta commits) + compactIfNecessary(writeClient, instantTime); + } + + if (!metadataMetaClient.getActiveTimeline().containsInstant(instantTime)) { + // this is a new commit being applied to the metadata table for the first time + writeClient.startCommitWithTime(instantTime); + } else { + Option alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant(); + if (alreadyCompletedInstant.isPresent()) { + // this code path refers to a re-attempted commit that was committed to the metadata table but failed in the data table. + // For example, say compaction c1 succeeded in the metadata table on the first attempt and failed before committing to the data table. + // When retried, the data table will first roll back the pending compaction; that rollback is applied to the metadata table, but since all changes + // to the metadata table are upserts, only a new delta commit is created. + // Once the rollback completes, the compaction is retried and eventually hits this code block, where the respective instant is + // already part of a completed commit. So we have to manually remove the completed instant and proceed, + // which is also why withAllowMultiWriteOnSameInstant is enabled for the metadata table. + HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant.get()); + metadataMetaClient.reloadActiveTimeline(); + } + // If the alreadyCompletedInstant is empty, that means there is a requested or inflight + // instant with the same instant time. This happens for data table clean action which + // reuses the same instant time without rollback first. It is a no-op here as the + // clean plan is the same, so we don't need to delete the requested and inflight instant + // files in the active timeline.
+ } + + List statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect(); + statuses.forEach(writeStatus -> { + if (writeStatus.hasErrors()) { + throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime); + } + }); + + // reload timeline + metadataMetaClient.reloadActiveTimeline(); + if (canTriggerTableService) { + cleanIfNecessary(writeClient, instantTime); + writeClient.archive(); + } + } + + // Update total size of the metadata and count of base/log files + metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata)); + } + + @Override + public void deletePartitions(String instantTime, List partitions) { + List partitionsToDrop = partitions.stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toList()); + LOG.info("Deleting Metadata Table partitions: " + partitionsToDrop); + + try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) { + String actionType = CommitUtils.getCommitActionType(WriteOperationType.DELETE_PARTITION, HoodieTableType.MERGE_ON_READ); + writeClient.startCommitWithTime(instantTime, actionType); + writeClient.deletePartitions(partitionsToDrop, instantTime); + } + closeInternal(); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metrics/DistributedRegistry.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metrics/DistributedRegistry.java new file mode 100644 index 0000000000000..60c32b34da2a8 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metrics/DistributedRegistry.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.hudi.common.metrics.Registry; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.util.AccumulatorV2; + +/** + * Lightweight Metrics Registry to track Hudi events. 
+ */ +public class DistributedRegistry extends AccumulatorV2, Map> + implements Registry, Serializable { + private String name; + ConcurrentHashMap counters = new ConcurrentHashMap<>(); + + public DistributedRegistry(String name) { + this.name = name; + } + + public void register(JavaSparkContext jsc) { + if (!isRegistered()) { + jsc.sc().register(this); + } + } + + @Override + public void clear() { + counters.clear(); + } + + @Override + public void increment(String name) { + counters.merge(name, 1L, (oldValue, newValue) -> oldValue + newValue); + } + + @Override + public void add(String name, long value) { + counters.merge(name, value, (oldValue, newValue) -> oldValue + newValue); + } + + @Override + public void set(String name, long value) { + counters.merge(name, value, (oldValue, newValue) -> newValue); + } + + /** + * Get all Counter type metrics. + */ + @Override + public Map getAllCounts(boolean prefixWithRegistryName) { + HashMap countersMap = new HashMap<>(); + counters.forEach((k, v) -> { + String key = prefixWithRegistryName ? name + "." + k : k; + countersMap.put(key, v); + }); + return countersMap; + } + + @Override + public void add(Map arg) { + arg.forEach((key, value) -> add(key, value)); + } + + @Override + public AccumulatorV2, Map> copy() { + DistributedRegistry registry = new DistributedRegistry(name); + counters.forEach((key, value) -> registry.add(key, value)); + return registry; + } + + @Override + public boolean isZero() { + return counters.isEmpty(); + } + + @Override + public void merge(AccumulatorV2, Map> acc) { + acc.value().forEach((key, value) -> add(key, value)); + } + + @Override + public void reset() { + counters.clear(); + } + + @Override + public Map value() { + return counters; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/sort/SpaceCurveSortingHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/sort/SpaceCurveSortingHelper.java new file mode 100644 index 0000000000000..496168e844276 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/sort/SpaceCurveSortingHelper.java @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
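A hypothetical driver-side sketch of how an accumulator-backed registry such as the one above is typically used: register it with the SparkContext once, let tasks bump counters inside actions, and read the merged totals back on the driver. The registry name, metric name, and local master setting are illustrative.

import java.util.Arrays;
import org.apache.hudi.metrics.DistributedRegistry;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class RegistrySketch {
  public static void main(String[] args) {
    JavaSparkContext jsc =
        new JavaSparkContext(new SparkConf().setMaster("local[2]").setAppName("registry-sketch"));

    DistributedRegistry registry = new DistributedRegistry("HoodieMetadata");
    registry.register(jsc);                                 // registers the AccumulatorV2 with the SparkContext

    jsc.parallelize(Arrays.asList(1, 2, 3, 4), 2)
       .foreach(i -> registry.add("records.read", 1L));     // counted inside executor tasks

    // Merged back on the driver after the action completes; prints {HoodieMetadata.records.read=4}
    System.out.println(registry.getAllCounts(true));
    jsc.stop();
  }
}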
+ */ + +package org.apache.hudi.sort; + +import org.apache.hudi.common.util.BinaryUtil; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.optimize.HilbertCurveUtils; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.hudi.execution.ByteArraySorting; +import org.apache.spark.sql.hudi.execution.RangeSampleSort$; +import org.apache.spark.sql.types.BinaryType; +import org.apache.spark.sql.types.BinaryType$; +import org.apache.spark.sql.types.BooleanType; +import org.apache.spark.sql.types.ByteType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.DoubleType; +import org.apache.spark.sql.types.FloatType; +import org.apache.spark.sql.types.IntegerType; +import org.apache.spark.sql.types.LongType; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.ShortType; +import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.StructType$; +import org.apache.spark.sql.types.TimestampType; +import org.davidmoten.hilbert.HilbertCurve; +import scala.collection.JavaConversions; +import scala.collection.mutable.WrappedArray; + +import javax.annotation.Nonnull; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +public class SpaceCurveSortingHelper { + + private static final Logger LOG = LogManager.getLogger(SpaceCurveSortingHelper.class); + + /** + * Orders provided {@link Dataset} by mapping values of the provided list of columns + * {@code orderByCols} onto a specified space curve (Z-curve, Hilbert, etc) + * + *

    + * NOTE: Only support base data-types: long,int,short,double,float,string,timestamp,decimal,date,byte. + * This method is more effective than {@link #orderDataFrameBySamplingValues} leveraging + * data sampling instead of direct mapping + * + * @param df Spark {@link Dataset} holding data to be ordered + * @param orderByCols list of columns to be ordered by + * @param targetPartitionCount target number of output partitions + * @param layoutOptStrategy target layout optimization strategy + * @return a {@link Dataset} holding data ordered by mapping tuple of values from provided columns + * onto a specified space-curve + */ + public static Dataset orderDataFrameByMappingValues( + Dataset df, + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, + List orderByCols, + int targetPartitionCount + ) { + Map columnsMap = + Arrays.stream(df.schema().fields()) + .collect(Collectors.toMap(StructField::name, Function.identity())); + + List checkCols = + orderByCols.stream() + .filter(columnsMap::containsKey) + .collect(Collectors.toList()); + + if (orderByCols.size() != checkCols.size()) { + LOG.error(String.format("Trying to ordering over a column(s) not present in the schema (%s); skipping", CollectionUtils.diff(orderByCols, checkCols))); + return df; + } + + // In case when there's just one column to be ordered by, we can skip space-curve + // ordering altogether (since it will match linear ordering anyway) + if (orderByCols.size() == 1) { + String orderByColName = orderByCols.get(0); + LOG.debug(String.format("Single column to order by (%s), skipping space-curve ordering", orderByColName)); + + // TODO validate if we need Spark to re-partition + return df.repartitionByRange(targetPartitionCount, new Column(orderByColName)); + } + + int fieldNum = df.schema().fields().length; + + Map fieldMap = + orderByCols.stream() + .collect( + Collectors.toMap(e -> Arrays.asList(df.schema().fields()).indexOf(columnsMap.get(e)), columnsMap::get)); + + JavaRDD sortedRDD; + switch (layoutOptStrategy) { + case ZORDER: + sortedRDD = createZCurveSortedRDD(df.toJavaRDD(), fieldMap, fieldNum, targetPartitionCount); + break; + case HILBERT: + sortedRDD = createHilbertSortedRDD(df.toJavaRDD(), fieldMap, fieldNum, targetPartitionCount); + break; + default: + throw new UnsupportedOperationException(String.format("Not supported layout-optimization strategy (%s)", layoutOptStrategy)); + } + + // Compose new {@code StructType} for ordered RDDs + StructType newStructType = composeOrderedRDDStructType(df.schema()); + + return df.sparkSession() + .createDataFrame(sortedRDD, newStructType) + .drop("Index"); + } + + private static StructType composeOrderedRDDStructType(StructType schema) { + return StructType$.MODULE$.apply( + CollectionUtils.combine( + Arrays.asList(schema.fields()), + Arrays.asList(new StructField("Index", BinaryType$.MODULE$, true, Metadata.empty())) + ) + ); + } + + private static JavaRDD createZCurveSortedRDD(JavaRDD originRDD, Map fieldMap, int fieldNum, int fileNum) { + return originRDD.map(row -> { + byte[][] zBytes = fieldMap.entrySet().stream() + .map(entry -> { + int index = entry.getKey(); + StructField field = entry.getValue(); + return mapColumnValueTo8Bytes(row, index, field.dataType()); + }) + .toArray(byte[][]::new); + + // Interleave received bytes to produce Z-curve ordinal + byte[] zOrdinalBytes = BinaryUtil.interleaving(zBytes, 8); + return appendToRow(row, zOrdinalBytes); + }) + .sortBy(f -> new ByteArraySorting((byte[]) f.get(fieldNum)), true, fileNum); + } + + private 
static JavaRDD createHilbertSortedRDD(JavaRDD originRDD, Map fieldMap, int fieldNum, int fileNum) { + // NOTE: Here {@code mapPartitions} is used to make sure Hilbert curve instance is initialized + // only once per partition + return originRDD.mapPartitions(rows -> { + HilbertCurve hilbertCurve = HilbertCurve.bits(63).dimensions(fieldMap.size()); + return new Iterator() { + + @Override + public boolean hasNext() { + return rows.hasNext(); + } + + @Override + public Row next() { + Row row = rows.next(); + long[] longs = fieldMap.entrySet().stream() + .mapToLong(entry -> { + int index = entry.getKey(); + StructField field = entry.getValue(); + return mapColumnValueToLong(row, index, field.dataType()); + }) + .toArray(); + + // Map N-dimensional coordinates into position on the Hilbert curve + byte[] hilbertCurvePosBytes = HilbertCurveUtils.indexBytes(hilbertCurve, longs, 63); + return appendToRow(row, hilbertCurvePosBytes); + } + }; + }) + .sortBy(f -> new ByteArraySorting((byte[]) f.get(fieldNum)), true, fileNum); + } + + private static Row appendToRow(Row row, Object value) { + // NOTE: This is an ugly hack to avoid array re-allocation -- + // Spark's {@code Row#toSeq} returns array of Objects + Object[] currentValues = (Object[]) ((WrappedArray) row.toSeq()).array(); + return RowFactory.create(CollectionUtils.append(currentValues, value)); + } + + @Nonnull + private static byte[] mapColumnValueTo8Bytes(Row row, int index, DataType dataType) { + if (dataType instanceof LongType) { + return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getLong(index)); + } else if (dataType instanceof DoubleType) { + return BinaryUtil.doubleTo8Byte(row.isNullAt(index) ? Double.MAX_VALUE : row.getDouble(index)); + } else if (dataType instanceof IntegerType) { + return BinaryUtil.intTo8Byte(row.isNullAt(index) ? Integer.MAX_VALUE : row.getInt(index)); + } else if (dataType instanceof FloatType) { + return BinaryUtil.doubleTo8Byte(row.isNullAt(index) ? Float.MAX_VALUE : row.getFloat(index)); + } else if (dataType instanceof StringType) { + return BinaryUtil.utf8To8Byte(row.isNullAt(index) ? "" : row.getString(index)); + } else if (dataType instanceof DateType) { + return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getDate(index).getTime()); + } else if (dataType instanceof TimestampType) { + return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getTimestamp(index).getTime()); + } else if (dataType instanceof ByteType) { + return BinaryUtil.byteTo8Byte(row.isNullAt(index) ? Byte.MAX_VALUE : row.getByte(index)); + } else if (dataType instanceof ShortType) { + return BinaryUtil.intTo8Byte(row.isNullAt(index) ? Short.MAX_VALUE : row.getShort(index)); + } else if (dataType instanceof DecimalType) { + return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : row.getDecimal(index).longValue()); + } else if (dataType instanceof BooleanType) { + boolean value = row.isNullAt(index) ? false : row.getBoolean(index); + return BinaryUtil.intTo8Byte(value ? 1 : 0); + } else if (dataType instanceof BinaryType) { + return BinaryUtil.paddingTo8Byte(row.isNullAt(index) ? new byte[] {0} : (byte[]) row.get(index)); + } + + throw new UnsupportedOperationException(String.format("Unsupported data-type (%s)", dataType.typeName())); + } + + private static long mapColumnValueToLong(Row row, int index, DataType dataType) { + if (dataType instanceof LongType) { + return row.isNullAt(index) ? 
Long.MAX_VALUE : row.getLong(index); + } else if (dataType instanceof DoubleType) { + return row.isNullAt(index) ? Long.MAX_VALUE : Double.doubleToLongBits(row.getDouble(index)); + } else if (dataType instanceof IntegerType) { + return row.isNullAt(index) ? Long.MAX_VALUE : (long) row.getInt(index); + } else if (dataType instanceof FloatType) { + return row.isNullAt(index) ? Long.MAX_VALUE : Double.doubleToLongBits((double) row.getFloat(index)); + } else if (dataType instanceof StringType) { + return row.isNullAt(index) ? Long.MAX_VALUE : BinaryUtil.convertStringToLong(row.getString(index)); + } else if (dataType instanceof DateType) { + return row.isNullAt(index) ? Long.MAX_VALUE : row.getDate(index).getTime(); + } else if (dataType instanceof TimestampType) { + return row.isNullAt(index) ? Long.MAX_VALUE : row.getTimestamp(index).getTime(); + } else if (dataType instanceof ByteType) { + return row.isNullAt(index) ? Long.MAX_VALUE : BinaryUtil.convertBytesToLong(new byte[] {row.getByte(index)}); + } else if (dataType instanceof ShortType) { + return row.isNullAt(index) ? Long.MAX_VALUE : (long) row.getShort(index); + } else if (dataType instanceof DecimalType) { + return row.isNullAt(index) ? Long.MAX_VALUE : row.getDecimal(index).longValue(); + } else if (dataType instanceof BooleanType) { + boolean value = row.isNullAt(index) ? false : row.getBoolean(index); + return value ? Long.MAX_VALUE : 0; + } else if (dataType instanceof BinaryType) { + return row.isNullAt(index) ? Long.MAX_VALUE : BinaryUtil.convertBytesToLong((byte[]) row.get(index)); + } + + throw new UnsupportedOperationException(String.format("Unsupported data-type (%s)", dataType.typeName())); + } + + public static Dataset orderDataFrameBySamplingValues( + Dataset df, + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, + List orderByCols, + int targetPartitionCount + ) { + return RangeSampleSort$.MODULE$.sortDataFrameBySample(df, layoutOptStrategy, JavaConversions.asScalaBuffer(orderByCols), targetPartitionCount); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java index 99a8f1f3c10c3..743aff51a1254 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java @@ -19,13 +19,21 @@ package org.apache.hudi.table; import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieIndexCommitMetadata; +import org.apache.hudi.avro.model.HoodieIndexPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieBaseFile; 
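The byte-mapping and interleaving performed by the helper above (each column widened to 8 bytes, then interleaved via BinaryUtil) is the standard Z-ordering trick. The self-contained sketch below, independent of Hudi's utilities and simplified to two int columns, shows the core idea: interleave the bits of the per-column values so rows that are close in both dimensions sort near each other.

public class ZOrderSketch {

  // Interleave the bits of two 32-bit values into one 64-bit Z-value (x in even bits, y in odd bits).
  static long zValue(int x, int y) {
    long z = 0L;
    for (int i = 0; i < 32; i++) {
      z |= ((long) (x >>> i) & 1L) << (2 * i);
      z |= ((long) (y >>> i) & 1L) << (2 * i + 1);
    }
    return z;
  }

  public static void main(String[] args) {
    System.out.println(Long.toBinaryString(zValue(1, 2)));    // 1001 (decimal 9)
    System.out.println(Long.toBinaryString(zValue(2, 1)));    // 110  (decimal 6), close to (1,2)
    System.out.println(Long.toBinaryString(zValue(1, 200)));  // a much larger Z-value, far away in the sort order
  }
}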
import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -35,32 +43,42 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.io.HoodieCreateHandle; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieSortedMergeHandle; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; +import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor; -import org.apache.hudi.table.action.clean.SparkCleanActionExecutor; -import org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor; -import org.apache.hudi.table.action.commit.SparkInsertOverwriteTableCommitActionExecutor; +import org.apache.hudi.table.action.clean.CleanActionExecutor; +import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; +import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor; +import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor; +import org.apache.hudi.table.action.commit.HoodieMergeHelper; import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkDeleteCommitActionExecutor; +import org.apache.hudi.table.action.commit.SparkDeletePartitionCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkInsertCommitActionExecutor; +import org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor; +import org.apache.hudi.table.action.commit.SparkInsertOverwriteTableCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkInsertPreppedCommitActionExecutor; -import org.apache.hudi.table.action.commit.SparkMergeHelper; import org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor; -import org.apache.hudi.table.action.restore.SparkCopyOnWriteRestoreActionExecutor; -import org.apache.hudi.table.action.rollback.SparkCopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.index.RunIndexActionExecutor; +import org.apache.hudi.table.action.index.ScheduleIndexActionExecutor; +import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.RestorePlanActionExecutor; import org.apache.hudi.table.action.savepoint.SavepointActionExecutor; - import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; import java.io.IOException; import java.util.Collections; @@ -76,7 +94,8 @@ *

    * UPDATES - Produce a new version of the file, just replacing the updated records with new values */ -public class HoodieSparkCopyOnWriteTable extends HoodieSparkTable { +public class HoodieSparkCopyOnWriteTable + extends HoodieSparkTable implements HoodieCompactionHandler { private static final Logger LOG = LogManager.getLogger(HoodieSparkCopyOnWriteTable.class); @@ -85,53 +104,58 @@ public HoodieSparkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext } @Override - public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, JavaRDD> records) { + public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkUpsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); } @Override - public HoodieWriteMetadata> insert(HoodieEngineContext context, String instantTime, JavaRDD> records) { + public HoodieWriteMetadata> insert(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkInsertCommitActionExecutor<>((HoodieSparkEngineContext)context, config, this, instantTime, records).execute(); } @Override - public HoodieWriteMetadata> bulkInsert(HoodieEngineContext context, String instantTime, JavaRDD> records, - Option>>> userDefinedBulkInsertPartitioner) { - return new SparkBulkInsertCommitActionExecutor((HoodieSparkEngineContext) context, config, + public HoodieWriteMetadata> bulkInsert(HoodieEngineContext context, String instantTime, HoodieData> records, + Option userDefinedBulkInsertPartitioner) { + return new SparkBulkInsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records, userDefinedBulkInsertPartitioner).execute(); } @Override - public HoodieWriteMetadata> delete(HoodieEngineContext context, String instantTime, JavaRDD keys) { + public HoodieWriteMetadata> delete(HoodieEngineContext context, String instantTime, HoodieData keys) { return new SparkDeleteCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, keys).execute(); } @Override - public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, - JavaRDD> preppedRecords) { + public HoodieWriteMetadata> deletePartitions(HoodieEngineContext context, String instantTime, List partitions) { + return new SparkDeletePartitionCommitActionExecutor<>(context, config, this, instantTime, partitions).execute(); + } + + @Override + public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, + HoodieData> preppedRecords) { return new SparkUpsertPreppedCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute(); } @Override - public HoodieWriteMetadata> insertPrepped(HoodieEngineContext context, String instantTime, - JavaRDD> preppedRecords) { + public HoodieWriteMetadata> insertPrepped(HoodieEngineContext context, String instantTime, + HoodieData> preppedRecords) { return new SparkInsertPreppedCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute(); } @Override - public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, String instantTime, - JavaRDD> preppedRecords, Option>>> userDefinedBulkInsertPartitioner) { + public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, String instantTime, + HoodieData> preppedRecords, Option userDefinedBulkInsertPartitioner) { return new 
SparkBulkInsertPreppedCommitActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords, userDefinedBulkInsertPartitioner).execute(); } @Override - public HoodieWriteMetadata insertOverwrite(HoodieEngineContext context, String instantTime, JavaRDD> records) { + public HoodieWriteMetadata insertOverwrite(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkInsertOverwriteCommitActionExecutor(context, config, this, instantTime, records).execute(); } @Override - public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineContext context, String instantTime, JavaRDD> records) { + public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkInsertOverwriteTableCommitActionExecutor(context, config, this, instantTime, records).execute(); } @@ -141,81 +165,139 @@ public Option scheduleCompaction(HoodieEngineContext conte } @Override - public HoodieWriteMetadata> compact(HoodieEngineContext context, String compactionInstantTime) { + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); } @Override - public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngineContext context, Option> extraMetadata) { + public Option scheduleClustering(HoodieEngineContext context, + String instantTime, + Option> extraMetadata) { + return new ClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute(); + } + + @Override + public HoodieWriteMetadata> cluster(HoodieEngineContext context, + String clusteringInstantTime) { + return new SparkExecuteClusteringCommitActionExecutor<>(context, config, this, clusteringInstantTime).execute(); + } + + @Override + public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngineContext context, Option> extraMetadata) { return new SparkBootstrapCommitActionExecutor((HoodieSparkEngineContext) context, config, this, extraMetadata).execute(); } @Override public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { - new SparkCopyOnWriteRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + new RestorePlanActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + new CopyOnWriteRestoreActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + } + + @Override + public Option scheduleCleaning(HoodieEngineContext context, String instantTime, Option> extraMetadata) { + return new CleanPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); + } + + @Override + public Option scheduleRollback(HoodieEngineContext context, + String instantTime, + HoodieInstant instantToRollback, boolean skipTimelinePublish, boolean shouldRollbackUsingMarkers) { + return new BaseRollbackPlanActionExecutor<>(context, config, this, instantTime, instantToRollback, skipTimelinePublish, + shouldRollbackUsingMarkers).execute(); } - public Iterator> handleUpdate(String instantTime, String partitionPath, String fileId, + @Override + public Iterator> handleUpdate( + String instantTime, String partitionPath, String fileId, Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException { // these are updates HoodieMergeHandle upsertHandle = getUpdateHandle(instantTime, partitionPath, fileId, keyToNewRecords, 
oldDataFile); return handleUpdateInternal(upsertHandle, instantTime, fileId); } - protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, - String fileId) throws IOException { + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, + String fileId) throws IOException { if (upsertHandle.getOldFilePath() == null) { throw new HoodieUpsertException( "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); } else { - SparkMergeHelper.newInstance().runMerge(this, upsertHandle); + HoodieMergeHelper.newInstance().runMerge(this, upsertHandle); } // TODO(vc): This needs to be revisited - if (upsertHandle.getWriteStatus().getPartitionPath() == null) { + if (upsertHandle.getPartitionPath() == null) { LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.getWriteStatus()); + + upsertHandle.writeStatuses()); } - return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); + + return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); } protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId, Map> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) { + Option keyGeneratorOpt = Option.empty(); + if (!config.populateMetaFields()) { + try { + keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))); + } catch (IOException e) { + throw new HoodieIOException("Only BaseKeyGenerator (or any key generator that extends from BaseKeyGenerator) are supported when meta " + + "columns are disabled. Please choose the right key generator if you wish to disable meta fields.", e); + } + } if (requireSortedRecords()) { return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId, - dataFileToBeMerged, taskContextSupplier); + dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt); } else { - return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId, - dataFileToBeMerged,taskContextSupplier); + return new HoodieMergeHandle(config, instantTime, this, keyToNewRecords, partitionPath, fileId, + dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt); } } - public Iterator> handleInsert(String instantTime, String partitionPath, String fileId, + @Override + public Iterator> handleInsert( + String instantTime, String partitionPath, String fileId, Map> recordMap) { - HoodieCreateHandle createHandle = + HoodieCreateHandle createHandle = new HoodieCreateHandle(config, instantTime, this, partitionPath, fileId, recordMap, taskContextSupplier); createHandle.write(); - return Collections.singletonList(Collections.singletonList(createHandle.close())).iterator(); + return Collections.singletonList(createHandle.close()).iterator(); } @Override - public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime) { - return new SparkCleanActionExecutor((HoodieSparkEngineContext)context, config, this, cleanInstantTime).execute(); + public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime, boolean skipLocking) { + return new CleanActionExecutor<>(context, config, this, cleanInstantTime, skipLocking).execute(); } @Override - public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, boolean deleteInstants) { 
- return new SparkCopyOnWriteRollbackActionExecutor((HoodieSparkEngineContext) context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, + boolean deleteInstants, boolean skipLocking) { + return new CopyOnWriteRollbackActionExecutor<>(context, config, this, rollbackInstantTime, commitInstant, + deleteInstants, skipLocking).execute(); + } + + @Override + public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex) { + return new ScheduleIndexActionExecutor<>(context, config, this, indexInstantTime, partitionsToIndex).execute(); + } + + @Override + public Option index(HoodieEngineContext context, String indexInstantTime) { + return new RunIndexActionExecutor<>(context, config, this, indexInstantTime).execute(); } @Override public HoodieSavepointMetadata savepoint(HoodieEngineContext context, String instantToSavepoint, String user, String comment) { - return new SavepointActionExecutor(context, config, this, instantToSavepoint, user, comment).execute(); + return new SavepointActionExecutor<>(context, config, this, instantToSavepoint, user, comment).execute(); } @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { - return new SparkCopyOnWriteRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, restoreInstantTime, instantToRestore).execute(); + return new CopyOnWriteRestoreActionExecutor<>(context, config, this, restoreInstantTime, instantToRestore).execute(); } + @Override + public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { + return new RestorePlanActionExecutor<>(context, config, this, restoreInstantTime, instantToRestore).execute(); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java index 0a60dcc50f032..c9d7424631916 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java @@ -21,25 +21,27 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; - import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.action.HoodieWriteMetadata; -import 
org.apache.hudi.table.action.bootstrap.SparkBootstrapDeltaCommitActionExecutor; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; -import org.apache.hudi.table.action.compact.SparkRunCompactionActionExecutor; -import org.apache.hudi.table.action.compact.SparkScheduleCompactionActionExecutor; +import org.apache.hudi.table.action.bootstrap.SparkBootstrapDeltaCommitActionExecutor; +import org.apache.hudi.table.action.compact.HoodieSparkMergeOnReadTableCompactor; +import org.apache.hudi.table.action.compact.RunCompactionActionExecutor; +import org.apache.hudi.table.action.compact.ScheduleCompactionActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkBulkInsertPreppedDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor; @@ -47,10 +49,10 @@ import org.apache.hudi.table.action.deltacommit.SparkInsertPreppedDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkUpsertPreppedDeltaCommitActionExecutor; -import org.apache.hudi.table.action.compact.BaseScheduleCompactionActionExecutor; -import org.apache.hudi.table.action.restore.SparkMergeOnReadRestoreActionExecutor; -import org.apache.hudi.table.action.rollback.SparkMergeOnReadRollbackActionExecutor; -import org.apache.spark.api.java.JavaRDD; +import org.apache.hudi.table.action.restore.MergeOnReadRestoreActionExecutor; +import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; +import org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.RestorePlanActionExecutor; import java.util.List; import java.util.Map; @@ -78,80 +80,94 @@ public class HoodieSparkMergeOnReadTable extends } @Override - public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, JavaRDD> records) { + public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkUpsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); } @Override - public HoodieWriteMetadata> insert(HoodieEngineContext context, String instantTime, JavaRDD> records) { + public HoodieWriteMetadata> insert(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkInsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); } @Override - public HoodieWriteMetadata> bulkInsert(HoodieEngineContext context, String instantTime, JavaRDD> records, - Option>>> userDefinedBulkInsertPartitioner) { - return new SparkBulkInsertDeltaCommitActionExecutor((HoodieSparkEngineContext) context, config, + public HoodieWriteMetadata> bulkInsert(HoodieEngineContext context, String instantTime, HoodieData> records, + Option userDefinedBulkInsertPartitioner) { + return new SparkBulkInsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records, userDefinedBulkInsertPartitioner).execute(); } @Override - public HoodieWriteMetadata> delete(HoodieEngineContext context, String instantTime, JavaRDD keys) { + public HoodieWriteMetadata> delete(HoodieEngineContext context, String instantTime, HoodieData keys) { return new SparkDeleteDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, 
config, this, instantTime, keys).execute(); } @Override - public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, - JavaRDD> preppedRecords) { + public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, + HoodieData> preppedRecords) { return new SparkUpsertPreppedDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute(); } @Override - public HoodieWriteMetadata> insertPrepped(HoodieEngineContext context, String instantTime, - JavaRDD> preppedRecords) { + public HoodieWriteMetadata> insertPrepped(HoodieEngineContext context, String instantTime, + HoodieData> preppedRecords) { return new SparkInsertPreppedDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute(); } @Override - public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, String instantTime, - JavaRDD> preppedRecords, Option>>> userDefinedBulkInsertPartitioner) { + public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, String instantTime, + HoodieData> preppedRecords, Option userDefinedBulkInsertPartitioner) { return new SparkBulkInsertPreppedDeltaCommitActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords, userDefinedBulkInsertPartitioner).execute(); } @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, Option> extraMetadata) { - BaseScheduleCompactionActionExecutor scheduleCompactionExecutor = new SparkScheduleCompactionActionExecutor( - context, config, this, instantTime, extraMetadata); + ScheduleCompactionActionExecutor scheduleCompactionExecutor = new ScheduleCompactionActionExecutor<>( + context, config, this, instantTime, extraMetadata, + new HoodieSparkMergeOnReadTableCompactor<>()); return scheduleCompactionExecutor.execute(); } @Override - public HoodieWriteMetadata> compact(HoodieEngineContext context, String compactionInstantTime) { - SparkRunCompactionActionExecutor compactionExecutor = new SparkRunCompactionActionExecutor((HoodieSparkEngineContext) context, config, this, compactionInstantTime); + public HoodieWriteMetadata> compact( + HoodieEngineContext context, String compactionInstantTime) { + RunCompactionActionExecutor compactionExecutor = new RunCompactionActionExecutor<>( + context, config, this, compactionInstantTime, new HoodieSparkMergeOnReadTableCompactor<>(), + new HoodieSparkCopyOnWriteTable<>(config, context, getMetaClient())); return compactionExecutor.execute(); } @Override - public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngineContext context, Option> extraMetadata) { - return new SparkBootstrapDeltaCommitActionExecutor((HoodieSparkEngineContext) context, config, this, extraMetadata).execute(); + public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngineContext context, Option> extraMetadata) { + return new SparkBootstrapDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, extraMetadata).execute(); } @Override public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { - new SparkMergeOnReadRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + new RestorePlanActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + new MergeOnReadRestoreActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); + } + 
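
For orientation, a minimal caller-side sketch of the plan-then-execute rollback flow introduced by the scheduleRollback/rollback overrides just below. The sketch class, variable names, and boolean flag values are illustrative, and the Option<HoodieRollbackPlan> return type is inferred from the new HoodieRollbackPlan import rather than stated explicitly in this hunk.

import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.avro.model.HoodieRollbackPlan;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.HoodieSparkTable;

class RollbackFlowSketch {
  // Plan the rollback first, then execute it; both calls mirror the table overrides in this diff.
  static Option<HoodieRollbackMetadata> rollbackInstant(HoodieSparkTable<?> table, HoodieEngineContext context,
                                                        String rollbackInstantTime, HoodieInstant commitInstant) {
    Option<HoodieRollbackPlan> plan = table.scheduleRollback(
        context, rollbackInstantTime, commitInstant,
        false /* skipTimelinePublish */, true /* shouldRollbackUsingMarkers (illustrative) */);
    if (!plan.isPresent()) {
      // Nothing to roll back for this instant.
      return Option.empty();
    }
    return Option.of(table.rollback(
        context, rollbackInstantTime, commitInstant, true /* deleteInstants */, false /* skipLocking */));
  }
}
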
+ @Override + public Option scheduleRollback(HoodieEngineContext context, + String instantTime, + HoodieInstant instantToRollback, boolean skipTimelinePublish, boolean shouldRollbackUsingMarkers) { + return new BaseRollbackPlanActionExecutor<>(context, config, this, instantTime, instantToRollback, skipTimelinePublish, + shouldRollbackUsingMarkers).execute(); } @Override public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, - boolean deleteInstants) { - return new SparkMergeOnReadRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute(); + boolean deleteInstants, + boolean skipLocking) { + return new MergeOnReadRollbackActionExecutor<>(context, config, this, rollbackInstantTime, commitInstant, deleteInstants, skipLocking).execute(); } @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { - return new SparkMergeOnReadRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, restoreInstantTime, instantToRestore).execute(); + return new MergeOnReadRestoreActionExecutor<>(context, config, this, restoreInstantTime, instantToRestore).execute(); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java index d5fb4ee018a69..66d51c91283f3 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java @@ -19,8 +19,9 @@ package org.apache.hudi.table; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -29,44 +30,97 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; -import org.apache.spark.api.java.JavaRDD; +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.fs.Path; +import org.apache.spark.TaskContext; +import org.apache.spark.TaskContext$; + +import java.io.IOException; public abstract class HoodieSparkTable - extends HoodieTable>, JavaRDD, JavaRDD> { + extends HoodieTable>, HoodieData, HoodieData> { + + private volatile boolean isMetadataTableExists = false; protected HoodieSparkTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { super(config, context, metaClient); } public static HoodieSparkTable create(HoodieWriteConfig config, HoodieEngineContext context) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient( - context.getHadoopConf().get(), - config.getBasePath(), - true, - 
config.getConsistencyGuardConfig(), - Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion())) - ); + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(config.getBasePath()) + .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()) + .setProperties(config.getProps()).build(); return HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient); } public static HoodieSparkTable create(HoodieWriteConfig config, HoodieSparkEngineContext context, HoodieTableMetaClient metaClient) { + HoodieSparkTable hoodieSparkTable; switch (metaClient.getTableType()) { case COPY_ON_WRITE: - return new HoodieSparkCopyOnWriteTable<>(config, context, metaClient); + hoodieSparkTable = new HoodieSparkCopyOnWriteTable<>(config, context, metaClient); + break; case MERGE_ON_READ: - return new HoodieSparkMergeOnReadTable<>(config, context, metaClient); + hoodieSparkTable = new HoodieSparkMergeOnReadTable<>(config, context, metaClient); + break; default: throw new HoodieException("Unsupported table type :" + metaClient.getTableType()); } + return hoodieSparkTable; + } + + @Override + protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context) { + return SparkHoodieIndexFactory.createIndex(config); + } + + /** + * Fetch instance of {@link HoodieTableMetadataWriter}. + * + * @return instance of {@link HoodieTableMetadataWriter} + */ + @Override + public Option getMetadataWriter(String triggeringInstantTimestamp, + Option actionMetadata) { + if (config.isMetadataTableEnabled()) { + // Create the metadata table writer. First time after the upgrade this creation might trigger + // metadata table bootstrapping. Bootstrapping process could fail and checking the table + // existence after the creation is needed. 
+ final HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create( + context.getHadoopConf().get(), config, context, actionMetadata, Option.of(triggeringInstantTimestamp)); + // even with metadata enabled, some index could have been disabled + // delete metadata partitions corresponding to such indexes + deleteMetadataIndexIfNecessary(); + try { + if (isMetadataTableExists || metaClient.getFs().exists(new Path( + HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath())))) { + isMetadataTableExists = true; + return Option.of(metadataWriter); + } + } catch (IOException e) { + throw new HoodieMetadataException("Checking existence of metadata table failed", e); + } + } else { + maybeDeleteMetadataTable(); + } + + return Option.empty(); } @Override - protected HoodieIndex>, JavaRDD, JavaRDD> getIndex(HoodieWriteConfig config, HoodieEngineContext context) { - return SparkHoodieIndex.createIndex(config); + public Runnable getPreExecuteRunnable() { + final TaskContext taskContext = TaskContext.get(); + return () -> TaskContext$.MODULE$.setTaskContext(taskContext); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java new file mode 100644 index 0000000000000..0bc15fa2106a5 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
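
As a standalone illustration of the builder API that HoodieSparkTable.create now uses in place of the old HoodieTableMetaClient constructor, here is a hedged sketch; the Configuration and base path are assumed to be supplied by the caller, and the optional setters shown in the diff (consistency guard, layout version, file-system retry config, properties) are omitted for brevity.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;

class MetaClientBuilderSketch {
  // Mirrors the builder call in HoodieSparkTable.create above; only the required pieces are shown.
  static HoodieTableMetaClient load(Configuration hadoopConf, String basePath) {
    return HoodieTableMetaClient.builder()
        .setConf(hadoopConf)
        .setBasePath(basePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
  }
}
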
+ */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.BootstrapWriteStatus; +import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BootstrapFileMapping; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.keygen.KeyGeneratorInterface; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroReadSupport; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +public abstract class BaseBootstrapMetadataHandler implements BootstrapMetadataHandler { + private static final Logger LOG = LogManager.getLogger(ParquetBootstrapMetadataHandler.class); + protected HoodieWriteConfig config; + protected HoodieTable table; + protected HoodieFileStatus srcFileStatus; + + public BaseBootstrapMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + this.config = config; + this.table = table; + this.srcFileStatus = srcFileStatus; + } + + public BootstrapWriteStatus runMetadataBootstrap(String srcPartitionPath, String partitionPath, KeyGeneratorInterface keyGenerator) { + Path sourceFilePath = FileStatusUtils.toPath(srcFileStatus.getPath()); + HoodieBootstrapHandle bootstrapHandle = new HoodieBootstrapHandle(config, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, + table, partitionPath, FSUtils.createNewFileIdPfx(), table.getTaskContextSupplier()); + try { + Schema avroSchema = getAvroSchema(sourceFilePath); + List recordKeyColumns = keyGenerator.getRecordKeyFieldNames().stream() + .map(HoodieAvroUtils::getRootLevelFieldName) + .collect(Collectors.toList()); + Schema recordKeySchema = HoodieAvroUtils.generateProjectionSchema(avroSchema, recordKeyColumns); + LOG.info("Schema to be used for reading record Keys :" + recordKeySchema); + AvroReadSupport.setAvroReadSchema(table.getHadoopConf(), recordKeySchema); + AvroReadSupport.setRequestedProjection(table.getHadoopConf(), recordKeySchema); + executeBootstrap(bootstrapHandle, sourceFilePath, keyGenerator, partitionPath, avroSchema); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + + BootstrapWriteStatus writeStatus = (BootstrapWriteStatus) bootstrapHandle.writeStatuses().get(0); + BootstrapFileMapping bootstrapFileMapping = new BootstrapFileMapping( + config.getBootstrapSourceBasePath(), srcPartitionPath, partitionPath, + srcFileStatus, writeStatus.getFileId()); + writeStatus.setBootstrapSourceFileMapping(bootstrapFileMapping); + return writeStatus; + } + + abstract Schema getAvroSchema(Path sourceFilePath) throws IOException; + + abstract void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, + Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception; +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java new file mode 100644 index 
0000000000000..237fe6cf84849
--- /dev/null
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.bootstrap;
+
+import org.apache.hudi.client.bootstrap.BootstrapWriteStatus;
+import org.apache.hudi.keygen.KeyGeneratorInterface;
+
+/**
+ * Bootstrap metadata handler to assist in bootstrapping only metadata.
+ */
+public interface BootstrapMetadataHandler {
+  /**
+   * Execute bootstrap with only metadata.
+   *
+   * @param srcPartitionPath source partition path.
+   * @param partitionPath destination partition path.
+   * @param keyGenerator key generator to use.
+   * @return the {@link BootstrapWriteStatus} which has the result of execution.
+   */
+  BootstrapWriteStatus runMetadataBootstrap(String srcPartitionPath, String partitionPath, KeyGeneratorInterface keyGenerator);
+}
+
+
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/MetadataBootstrapHandlerFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/MetadataBootstrapHandlerFactory.java
new file mode 100644
index 0000000000000..285467f9ff260
--- /dev/null
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/MetadataBootstrapHandlerFactory.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.avro.model.HoodieFileStatus; + +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; + +public class MetadataBootstrapHandlerFactory { + + public static BootstrapMetadataHandler getMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + Path sourceFilePath = FileStatusUtils.toPath(srcFileStatus.getPath()); + + String extension = FSUtils.getFileExtension(sourceFilePath.toString()); + BootstrapMetadataHandler bootstrapMetadataHandler; + if (ORC.getFileExtension().equals(extension)) { + return new OrcBootstrapMetadataHandler(config, table, srcFileStatus); + } else if (PARQUET.getFileExtension().equals(extension)) { + return new ParquetBootstrapMetadataHandler(config, table, srcFileStatus); + } else { + throw new HoodieIOException("Bootstrap Metadata Handler not implemented for base file format " + extension); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java new file mode 100644 index 0000000000000..96ac794dcbc82 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
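
Tying the new bootstrap classes together, a hedged sketch of how a caller might metadata-bootstrap a single source file: the factory picks an ORC or Parquet handler from the file extension, and the handler writes roughly the record keys and Hudi meta columns for that file. Only calls that appear in this patch are used; the sketch class, method, and partition-path variables are illustrative.

import java.io.IOException;

import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.BootstrapWriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.keygen.KeyGeneratorInterface;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.bootstrap.MetadataBootstrapHandlerFactory;

class MetadataBootstrapSketch {
  // Bootstrap only the metadata for one source file of a partition.
  static BootstrapWriteStatus bootstrapOneFile(HoodieWriteConfig config, HoodieTable table,
                                               HoodieFileStatus srcFileStatus,
                                               String srcPartitionPath, String partitionPath) throws IOException {
    // Key generator resolution mirrors runMetadataBootstrap in SparkBootstrapCommitActionExecutor.
    KeyGeneratorInterface keyGenerator =
        HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()));
    // The factory chooses the ORC or Parquet bootstrap handler from the source file extension.
    return MetadataBootstrapHandlerFactory.getMetadataHandler(config, table, srcFileStatus)
        .runMetadataBootstrap(srcPartitionPath, partitionPath, keyGenerator);
  }
}
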
+ */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.hudi.common.util.OrcReaderIterator; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.keygen.KeyGeneratorInterface; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; + +import java.io.IOException; + +class OrcBootstrapMetadataHandler extends BaseBootstrapMetadataHandler { + private static final Logger LOG = LogManager.getLogger(OrcBootstrapMetadataHandler.class); + + public OrcBootstrapMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + super(config, table, srcFileStatus); + } + + @Override + Schema getAvroSchema(Path sourceFilePath) throws IOException { + Reader orcReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf())); + TypeDescription orcSchema = orcReader.getSchema(); + return AvroOrcUtils.createAvroSchema(orcSchema); + } + + @Override + void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, + String partitionPath, Schema avroSchema) throws Exception { + BoundedInMemoryExecutor wrapper = null; + Reader orcReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf())); + TypeDescription orcSchema = orcReader.getSchema(); + try (RecordReader reader = orcReader.rows(new Reader.Options(table.getHadoopConf()).schema(orcSchema))) { + wrapper = new BoundedInMemoryExecutor(config.getWriteBufferLimitBytes(), + new OrcReaderIterator(reader, avroSchema, orcSchema), new BootstrapRecordConsumer(bootstrapHandle), inp -> { + String recKey = keyGenerator.getKey(inp).getRecordKey(); + GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); + gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); + BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); + HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload); + return rec; + }, table.getPreExecuteRunnable()); + wrapper.execute(); + } catch (Exception e) { + throw new HoodieException(e); + } finally { + if (null != wrapper) { + wrapper.shutdownNow(); + wrapper.awaitTermination(); + } + bootstrapHandle.close(); + } + } +} + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java new file mode 100644 index 0000000000000..5f45629ba8023 --- /dev/null +++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.bootstrap; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ParquetReaderIterator; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.keygen.KeyGeneratorInterface; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; + +import java.io.IOException; + +class ParquetBootstrapMetadataHandler extends BaseBootstrapMetadataHandler { + private static final Logger LOG = LogManager.getLogger(ParquetBootstrapMetadataHandler.class); + + public ParquetBootstrapMetadataHandler(HoodieWriteConfig config, HoodieTable table, HoodieFileStatus srcFileStatus) { + super(config, table, srcFileStatus); + } + + @Override + Schema getAvroSchema(Path sourceFilePath) throws IOException { + ParquetMetadata readFooter = ParquetFileReader.readFooter(table.getHadoopConf(), sourceFilePath, + ParquetMetadataConverter.NO_FILTER); + MessageType parquetSchema = readFooter.getFileMetaData().getSchema(); + return new AvroSchemaConverter().convert(parquetSchema); + } + + @Override + void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, + Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception { + BoundedInMemoryExecutor wrapper = null; + ParquetReader reader = + AvroParquetReader.builder(sourceFilePath).withConf(table.getHadoopConf()).build(); + try { + wrapper = new BoundedInMemoryExecutor(config.getWriteBufferLimitBytes(), + new ParquetReaderIterator(reader), 
new BootstrapRecordConsumer(bootstrapHandle), inp -> { + String recKey = keyGenerator.getKey(inp).getRecordKey(); + GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); + gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); + BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); + HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload); + return rec; + }, table.getPreExecuteRunnable()); + wrapper.execute(); + } catch (Exception e) { + throw new HoodieException(e); + } finally { + reader.close(); + if (null != wrapper) { + wrapper.shutdownNow(); + wrapper.awaitTermination(); + } + bootstrapHandle.close(); + } + } +} + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java index 2dd9fd039dbea..88f6a54e0da6c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java @@ -18,22 +18,21 @@ package org.apache.hudi.table.action.bootstrap; -import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider; import org.apache.hudi.client.bootstrap.BootstrapMode; -import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; import org.apache.hudi.client.bootstrap.BootstrapWriteStatus; import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; +import org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider; import org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider; import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector; +import org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector; import org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator; import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.client.utils.SparkValidatorUtils; import org.apache.hudi.common.bootstrap.index.BootstrapIndex; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.BootstrapFileMapping; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -48,56 +47,47 @@ import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ParquetReaderIterator; import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieCommitException; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; -import 
org.apache.hudi.io.HoodieBootstrapHandle; +import org.apache.hudi.exception.HoodieKeyGeneratorException; import org.apache.hudi.keygen.KeyGeneratorInterface; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; import org.apache.hudi.table.action.commit.BaseCommitActionExecutor; +import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor; +import org.apache.hudi.table.marker.WriteMarkersFactory; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.parquet.avro.AvroParquetReader; -import org.apache.parquet.avro.AvroReadSupport; -import org.apache.parquet.avro.AvroSchemaConverter; -import org.apache.parquet.format.converter.ParquetMetadataConverter; -import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.ParquetReader; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.schema.MessageType; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.time.Duration; import java.time.Instant; import java.util.Collection; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import static org.apache.hudi.client.bootstrap.BootstrapMode.FULL_RECORD; +import static org.apache.hudi.client.bootstrap.BootstrapMode.METADATA_ONLY; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE; +import static org.apache.hudi.table.action.bootstrap.MetadataBootstrapHandlerFactory.getMetadataHandler; + public class SparkBootstrapCommitActionExecutor> - extends BaseCommitActionExecutor>, JavaRDD, JavaRDD, HoodieBootstrapWriteMetadata> { + extends BaseCommitActionExecutor>, HoodieData, HoodieData, HoodieBootstrapWriteMetadata>> { private static final Logger LOG = LogManager.getLogger(SparkBootstrapCommitActionExecutor.class); protected String bootstrapSchema = null; @@ -105,41 +95,50 @@ public class SparkBootstrapCommitActionExecutor public SparkBootstrapCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, + HoodieTable table, Option> extraMetadata) { - super(context, new HoodieWriteConfig.Builder().withProps(config.getProps()) - .withAutoCommit(true).withWriteStatusClass(BootstrapWriteStatus.class) - .withBulkInsertParallelism(config.getBootstrapParallelism()) - .build(), table, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, WriteOperationType.BOOTSTRAP, + super( + context, + new HoodieWriteConfig.Builder() + .withProps(config.getProps()) + .withAutoCommit(true) + .withWriteStatusClass(BootstrapWriteStatus.class) + .withBulkInsertParallelism(config.getBootstrapParallelism()).build(), + table, + HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, + WriteOperationType.BOOTSTRAP, 
extraMetadata); bootstrapSourceFileSystem = FSUtils.getFs(config.getBootstrapSourceBasePath(), hadoopConf); } private void validate() { - ValidationUtils.checkArgument(config.getBootstrapSourceBasePath() != null, + checkArgument(config.getBootstrapSourceBasePath() != null, "Ensure Bootstrap Source Path is set"); - ValidationUtils.checkArgument(config.getBootstrapModeSelectorClass() != null, + checkArgument(config.getBootstrapModeSelectorClass() != null, "Ensure Bootstrap Partition Selector is set"); - ValidationUtils.checkArgument(config.getBootstrapKeyGeneratorClass() != null, - "Ensure bootstrap key generator class is set"); + if (METADATA_ONLY.name().equals(config.getBootstrapModeSelectorRegex())) { + checkArgument(!config.getBootstrapModeSelectorClass().equals(FullRecordBootstrapModeSelector.class.getCanonicalName()), + "FullRecordBootstrapModeSelector cannot be used with METADATA_ONLY bootstrap mode"); + } } @Override - public HoodieBootstrapWriteMetadata execute() { + public HoodieBootstrapWriteMetadata> execute() { validate(); try { HoodieTableMetaClient metaClient = table.getMetaClient(); - Option completetedInstant = + Option completedInstant = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); - ValidationUtils.checkArgument(!completetedInstant.isPresent(), + checkArgument(!completedInstant.isPresent(), "Active Timeline is expected to be empty for bootstrap to be performed. " + "If you want to re-bootstrap, please rollback bootstrap first !!"); Map>>> partitionSelections = listAndProcessSourcePartitions(); // First run metadata bootstrap which will auto commit - Option metadataResult = metadataBootstrap(partitionSelections.get(BootstrapMode.METADATA_ONLY)); + Option>> metadataResult = metadataBootstrap(partitionSelections.get(METADATA_ONLY)); // if there are full bootstrap to be performed, perform that too - Option fullBootstrapResult = fullBootstrap(partitionSelections.get(BootstrapMode.FULL_RECORD)); + Option>> fullBootstrapResult = fullBootstrap(partitionSelections.get(FULL_RECORD)); + return new HoodieBootstrapWriteMetadata(metadataResult, fullBootstrapResult); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); @@ -155,51 +154,60 @@ protected String getSchemaToStoreInCommit() { * Perform Metadata Bootstrap. 
* @param partitionFilesList List of partitions and files within that partitions */ - protected Option metadataBootstrap(List>> partitionFilesList) { + protected Option>> metadataBootstrap(List>> partitionFilesList) { if (null == partitionFilesList || partitionFilesList.isEmpty()) { return Option.empty(); } HoodieTableMetaClient metaClient = table.getMetaClient(); + String bootstrapInstantTime = HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS; metaClient.getActiveTimeline().createNewInstant( - new HoodieInstant(State.REQUESTED, metaClient.getCommitActionType(), - HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)); + new HoodieInstant(State.REQUESTED, metaClient.getCommitActionType(), bootstrapInstantTime)); table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, - metaClient.getCommitActionType(), HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS), Option.empty()); + metaClient.getCommitActionType(), bootstrapInstantTime), Option.empty()); - JavaRDD bootstrapWriteStatuses = runMetadataBootstrap(partitionFilesList); + HoodieData bootstrapWriteStatuses = runMetadataBootstrap(partitionFilesList); - HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); + HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); updateIndexAndCommitIfNeeded(bootstrapWriteStatuses.map(w -> w), result); + + // Delete the marker directory for the instant + WriteMarkersFactory.get(config.getMarkersType(), table, bootstrapInstantTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + return Option.of(result); } - private void updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, HoodieWriteMetadata> result) { + private void updateIndexAndCommitIfNeeded(HoodieData writeStatuses, HoodieWriteMetadata> result) { // cache writeStatusRDD before updating index, so that all actions before this are not triggered again for future // RDD actions that are performed after updating the index. - writeStatusRDD = writeStatusRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps())); + writeStatuses.persist(config.getString(WRITE_STATUS_STORAGE_LEVEL_VALUE)); Instant indexStartTime = Instant.now(); // Update the index back - JavaRDD statuses = table.getIndex().updateLocation(writeStatusRDD, context, - table); + HoodieData statuses = table.getIndex().updateLocation(writeStatuses, context, table); result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); result.setWriteStatuses(statuses); commitOnAutoCommit(result); } @Override - public HoodieWriteMetadata> execute(JavaRDD> inputRecords) { + public HoodieWriteMetadata> execute(HoodieData> inputRecords) { // NO_OP return null; } @Override - protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { + protected void setCommitMetadata(HoodieWriteMetadata> result) { + result.setCommitMetadata(Option.of(new HoodieCommitMetadata())); + } + + @Override + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { // Perform bootstrap index write and then commit. Make sure both record-key and bootstrap-index // is all done in a single job DAG. 
Map>> bootstrapSourceAndStats = - result.getWriteStatuses().collect().stream() + result.getWriteStatuses().collectAsList().stream() .map(w -> { BootstrapWriteStatus ws = (BootstrapWriteStatus) w; return Pair.of(ws.getBootstrapSourceFileMapping(), ws.getStat()); @@ -222,7 +230,7 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta LOG.info("Committing metadata bootstrap !!"); } - protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List stats) { + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List stats) { String actionType = table.getMetaClient().getCommitActionType(); LOG.info("Committing " + instantTime + ", action Type " + actionType); // Create a Hoodie table which encapsulated the commits and files visible @@ -237,7 +245,6 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta // Finalize write finalizeWrite(instantTime, stats, result); - // add in extra metadata if (extraMetadata.isPresent()) { extraMetadata.get().forEach(metadata::addMetadata); @@ -245,6 +252,8 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, getSchemaToStoreInCommit()); metadata.setOperationType(operationType); + writeTableMetadata(metadata, actionType); + try { activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); @@ -260,7 +269,7 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta * Perform Full Bootstrap. * @param partitionFilesList List of partitions and files within that partitions */ - protected Option fullBootstrap(List>> partitionFilesList) { + protected Option>> fullBootstrap(List>> partitionFilesList) { if (null == partitionFilesList || partitionFilesList.isEmpty()) { return Option.empty(); } @@ -273,70 +282,28 @@ protected Option fullBootstrap(List) inputProvider.generateInputRecords("bootstrap_source", config.getBootstrapSourceBasePath(), partitionFilesList); // Start Full Bootstrap - final HoodieInstant requested = new HoodieInstant(State.REQUESTED, table.getMetaClient().getCommitActionType(), - HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS); + String bootstrapInstantTime = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS; + final HoodieInstant requested = new HoodieInstant( + State.REQUESTED, table.getMetaClient().getCommitActionType(), bootstrapInstantTime); table.getActiveTimeline().createNewInstant(requested); // Setup correct schema and run bulk insert. 
- return Option.of(getBulkInsertActionExecutor(inputRecordsRDD).execute()); + Option>> writeMetadataOption = + Option.of(getBulkInsertActionExecutor(HoodieJavaRDD.of(inputRecordsRDD)).execute()); + + // Delete the marker directory for the instant + WriteMarkersFactory.get(config.getMarkersType(), table, bootstrapInstantTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + + return writeMetadataOption; } - protected BaseSparkCommitActionExecutor getBulkInsertActionExecutor(JavaRDD inputRecordsRDD) { + protected BaseSparkCommitActionExecutor getBulkInsertActionExecutor(HoodieData inputRecordsRDD) { return new SparkBulkInsertCommitActionExecutor((HoodieSparkEngineContext) context, new HoodieWriteConfig.Builder().withProps(config.getProps()) .withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, inputRecordsRDD, Option.empty(), extraMetadata); } - private BootstrapWriteStatus handleMetadataBootstrap(String srcPartitionPath, String partitionPath, - HoodieFileStatus srcFileStatus, KeyGeneratorInterface keyGenerator) { - - Path sourceFilePath = FileStatusUtils.toPath(srcFileStatus.getPath()); - HoodieBootstrapHandle bootstrapHandle = new HoodieBootstrapHandle(config, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, - table, partitionPath, FSUtils.createNewFileIdPfx(), table.getTaskContextSupplier()); - Schema avroSchema = null; - try { - ParquetMetadata readFooter = ParquetFileReader.readFooter(table.getHadoopConf(), sourceFilePath, - ParquetMetadataConverter.NO_FILTER); - MessageType parquetSchema = readFooter.getFileMetaData().getSchema(); - avroSchema = new AvroSchemaConverter().convert(parquetSchema); - Schema recordKeySchema = HoodieAvroUtils.generateProjectionSchema(avroSchema, - keyGenerator.getRecordKeyFieldNames()); - LOG.info("Schema to be used for reading record Keys :" + recordKeySchema); - AvroReadSupport.setAvroReadSchema(table.getHadoopConf(), recordKeySchema); - AvroReadSupport.setRequestedProjection(table.getHadoopConf(), recordKeySchema); - - BoundedInMemoryExecutor wrapper = null; - try (ParquetReader reader = - AvroParquetReader.builder(sourceFilePath).withConf(table.getHadoopConf()).build()) { - wrapper = new SparkBoundedInMemoryExecutor(config, - new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> { - String recKey = keyGenerator.getKey(inp).getRecordKey(); - GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); - gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); - BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); - HoodieRecord rec = new HoodieRecord(new HoodieKey(recKey, partitionPath), payload); - return rec; - }); - wrapper.execute(); - } catch (Exception e) { - throw new HoodieException(e); - } finally { - bootstrapHandle.close(); - if (null != wrapper) { - wrapper.shutdownNow(); - } - } - } catch (IOException e) { - throw new HoodieIOException(e.getMessage(), e); - } - BootstrapWriteStatus writeStatus = (BootstrapWriteStatus)bootstrapHandle.getWriteStatus(); - BootstrapFileMapping bootstrapFileMapping = new BootstrapFileMapping( - config.getBootstrapSourceBasePath(), srcPartitionPath, partitionPath, - srcFileStatus, writeStatus.getFileId()); - writeStatus.setBootstrapSourceFileMapping(bootstrapFileMapping); - return writeStatus; - } - /** * Return Bootstrap Mode selections for partitions listed and figure out bootstrap Schema. 
* @return @@ -354,12 +321,21 @@ private Map>>> listAndPr BootstrapModeSelector selector = (BootstrapModeSelector) ReflectionUtils.loadClass(config.getBootstrapModeSelectorClass(), config); - Map> result = selector.select(folders); + Map> result = new HashMap<>(); + // for FULL_RECORD mode, original record along with metadata fields are needed + if (FULL_RECORD.equals(config.getBootstrapModeForRegexMatch())) { + if (!(selector instanceof FullRecordBootstrapModeSelector)) { + FullRecordBootstrapModeSelector fullRecordBootstrapModeSelector = new FullRecordBootstrapModeSelector(config); + result.putAll(fullRecordBootstrapModeSelector.select(folders)); + } + } else { + result = selector.select(folders); + } Map> partitionToFiles = folders.stream().collect( Collectors.toMap(Pair::getKey, Pair::getValue)); // Ensure all partitions are accounted for - ValidationUtils.checkArgument(partitionToFiles.keySet().equals( + checkArgument(partitionToFiles.keySet().equals( result.values().stream().flatMap(Collection::stream).collect(Collectors.toSet()))); return result.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue().stream() @@ -367,16 +343,21 @@ private Map>>> listAndPr .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); } - private JavaRDD runMetadataBootstrap(List>> partitions) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); + private HoodieData runMetadataBootstrap(List>> partitions) { if (null == partitions || partitions.isEmpty()) { - return jsc.emptyRDD(); + return context.emptyHoodieData(); } TypedProperties properties = new TypedProperties(); properties.putAll(config.getProps()); - KeyGeneratorInterface keyGenerator = (KeyGeneratorInterface) ReflectionUtils.loadClass(config.getBootstrapKeyGeneratorClass(), - properties); + + KeyGeneratorInterface keyGenerator; + try { + keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(properties); + } catch (IOException e) { + throw new HoodieKeyGeneratorException("Init keyGenerator failed ", e); + } + BootstrapPartitionPathTranslator translator = (BootstrapPartitionPathTranslator) ReflectionUtils.loadClass( config.getBootstrapPartitionPathTranslatorClass(), properties); @@ -387,9 +368,10 @@ private JavaRDD runMetadataBootstrap(List handleMetadataBootstrap(partitionFsPair.getLeft(), partitionFsPair.getRight().getLeft(), - partitionFsPair.getRight().getRight(), keyGenerator)); + context.setJobStatus(this.getClass().getSimpleName(), "Run metadata-only bootstrap operation: " + config.getTableName()); + return context.parallelize(bootstrapPaths, config.getBootstrapParallelism()) + .map(partitionFsPair -> getMetadataHandler(config, table, partitionFsPair.getRight().getRight()).runMetadataBootstrap(partitionFsPair.getLeft(), + partitionFsPair.getRight().getLeft(), keyGenerator)); } @Override @@ -401,4 +383,9 @@ protected Iterator> handleInsert(String idPfx, Iterator> handleUpdate(String partitionPath, String fileId, Iterator> recordItr) { throw new UnsupportedOperationException("Should not called in bootstrap code path"); } + + @Override + protected void runPrecommitValidators(HoodieWriteMetadata> writeMetadata) { + SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapDeltaCommitActionExecutor.java index 
59f86662b7c0c..0d2ac6ceef896 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapDeltaCommitActionExecutor.java @@ -18,9 +18,8 @@ package org.apache.hudi.table.action.bootstrap; -import java.util.Map; - import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -29,7 +28,8 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor; -import org.apache.spark.api.java.JavaRDD; + +import java.util.Map; public class SparkBootstrapDeltaCommitActionExecutor> extends SparkBootstrapCommitActionExecutor { @@ -41,9 +41,15 @@ public SparkBootstrapDeltaCommitActionExecutor(HoodieSparkEngineContext context, } @Override - protected BaseSparkCommitActionExecutor getBulkInsertActionExecutor(JavaRDD inputRecordsRDD) { - return new SparkBulkInsertDeltaCommitActionExecutor((HoodieSparkEngineContext) context, new HoodieWriteConfig.Builder().withProps(config.getProps()) - .withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, - inputRecordsRDD, extraMetadata); + protected BaseSparkCommitActionExecutor getBulkInsertActionExecutor(HoodieData inputRecordsRDD) { + return new SparkBulkInsertDeltaCommitActionExecutor( + (HoodieSparkEngineContext) context, + new HoodieWriteConfig.Builder() + .withProps(config.getProps()) + .withSchema(bootstrapSchema).build(), + table, + HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, + inputRecordsRDD, + extraMetadata); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanActionExecutor.java deleted file mode 100644 index bbd5c1fb0e4cc..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/clean/SparkCleanActionExecutor.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.action.clean; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.avro.model.HoodieActionInstant; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.HoodieCleanStat; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.CleanFileInfo; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFlatMapFunction; -import scala.Tuple2; - -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkCleanActionExecutor extends - BaseCleanActionExecutor>, JavaRDD, JavaRDD> { - - private static final Logger LOG = LogManager.getLogger(SparkCleanActionExecutor.class); - - public SparkCleanActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime) { - super(context, config, table, instantTime); - } - - private static PairFlatMapFunction>, String, PartitionCleanStat> - deleteFilesFunc(HoodieTable table) { - return (PairFlatMapFunction>, String, PartitionCleanStat>) iter -> { - Map partitionCleanStatMap = new HashMap<>(); - FileSystem fs = table.getMetaClient().getFs(); - while (iter.hasNext()) { - Tuple2 partitionDelFileTuple = iter.next(); - String partitionPath = partitionDelFileTuple._1(); - Path deletePath = new Path(partitionDelFileTuple._2().getFilePath()); - String deletePathStr = deletePath.toString(); - Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); - if (!partitionCleanStatMap.containsKey(partitionPath)) { - partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath)); - } - boolean isBootstrapBasePathFile = partitionDelFileTuple._2().isBootstrapBaseFile(); - PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath); - if (isBootstrapBasePathFile) { - // For Bootstrap Base file deletions, store the full file path. 
- partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true); - partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true); - } else { - partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false); - partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false); - } - } - return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue())) - .collect(Collectors.toList()).iterator(); - }; - } - - @Override - List clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - int cleanerParallelism = Math.min( - (int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()), - config.getCleanerParallelism()); - LOG.info("Using cleanerParallelism: " + cleanerParallelism); - - context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions"); - List> partitionCleanStats = jsc - .parallelize(cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream() - .flatMap(x -> x.getValue().stream().map(y -> new Tuple2<>(x.getKey(), - new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile())))) - .collect(Collectors.toList()), cleanerParallelism) - .mapPartitionsToPair(deleteFilesFunc(table)) - .reduceByKey(PartitionCleanStat::merge).collect(); - - Map partitionCleanStatsMap = partitionCleanStats.stream() - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); - - // Return PartitionCleanStat for each partition passed. - return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> { - PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath) - ? partitionCleanStatsMap.get(partitionPath) - : new PartitionCleanStat(partitionPath); - HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain(); - return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath) - .withEarliestCommitRetained(Option.ofNullable( - actionInstant != null - ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), - actionInstant.getAction(), actionInstant.getTimestamp()) - : null)) - .withDeletePathPattern(partitionCleanStat.deletePathPatterns()) - .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles()) - .withFailedDeletes(partitionCleanStat.failedDeleteFiles()) - .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns()) - .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles()) - .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles()) - .build(); - }).collect(Collectors.toList()); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java new file mode 100644 index 0000000000000..7d2a4c0baabe3 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.cluster; + +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; + +public class SparkExecuteClusteringCommitActionExecutor> + extends BaseSparkCommitActionExecutor { + + private final HoodieClusteringPlan clusteringPlan; + + public SparkExecuteClusteringCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime) { + super(context, config, table, instantTime, WriteOperationType.CLUSTER); + this.clusteringPlan = ClusteringUtils.getClusteringPlan( + table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime)) + .map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException( + "Unable to read clustering plan for instant: " + instantTime)); + } + + @Override + public HoodieWriteMetadata> execute() { + return executeClustering(clusteringPlan); + } + + @Override + protected String getCommitActionType() { + return HoodieTimeline.REPLACE_COMMIT_ACTION; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java index ad62db9250eda..f8e4b31ff687e 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java @@ -19,10 +19,11 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.client.utils.SparkValidatorUtils; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ 
-33,26 +34,34 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.execution.SparkLazyInsertIterable; import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.io.HoodieConcatHandle; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieSortedMergeHandle; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; import org.apache.hudi.table.WorkloadStat; import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.Partitioner; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.storage.StorageLevel; -import scala.Tuple2; import java.io.IOException; import java.io.Serializable; @@ -65,18 +74,26 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import scala.Tuple2; + +import static org.apache.hudi.common.util.ClusteringUtils.getAllFileGroupsInPendingClusteringPlans; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE; public abstract class BaseSparkCommitActionExecutor extends - BaseCommitActionExecutor>, JavaRDD, JavaRDD, HoodieWriteMetadata> { + BaseCommitActionExecutor>, HoodieData, HoodieData, HoodieWriteMetadata>> { private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class); + protected final Option keyGeneratorOpt; public BaseSparkCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType) { - super(context, config, table, instantTime, operationType, Option.empty()); + this(context, config, table, instantTime, operationType, Option.empty()); } public BaseSparkCommitActionExecutor(HoodieEngineContext context, @@ -84,51 +101,88 @@ public BaseSparkCommitActionExecutor(HoodieEngineContext context, HoodieTable table, String instantTime, WriteOperationType operationType, - Option extraMetadata) { + Option> extraMetadata) { super(context, config, table, instantTime, operationType, extraMetadata); + try { + keyGeneratorOpt = config.populateMetaFields() + ? 
Option.empty() + : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(this.config.getProps())); + } catch (IOException e) { + throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e); + } + } + + private HoodieData> clusteringHandleUpdate(HoodieData> inputRecords, Set fileGroupsInPendingClustering) { + context.setJobStatus(this.getClass().getSimpleName(), "Handling updates which are under clustering: " + config.getTableName()); + UpdateStrategy>> updateStrategy = (UpdateStrategy>>) ReflectionUtils + .loadClass(config.getClusteringUpdatesStrategyClass(), this.context, fileGroupsInPendingClustering); + Pair>, Set> recordsAndPendingClusteringFileGroups = + updateStrategy.handleUpdate(inputRecords); + Set fileGroupsWithUpdatesAndPendingClustering = recordsAndPendingClusteringFileGroups.getRight(); + if (fileGroupsWithUpdatesAndPendingClustering.isEmpty()) { + return recordsAndPendingClusteringFileGroups.getLeft(); + } + // there are file groups pending clustering and receiving updates, so rollback the pending clustering instants + // there could be race condition, for example, if the clustering completes after instants are fetched but before rollback completed + if (config.isRollbackPendingClustering()) { + Set pendingClusteringInstantsToRollback = getAllFileGroupsInPendingClusteringPlans(table.getMetaClient()).entrySet().stream() + .filter(e -> fileGroupsWithUpdatesAndPendingClustering.contains(e.getKey())) + .map(Map.Entry::getValue) + .collect(Collectors.toSet()); + pendingClusteringInstantsToRollback.forEach(instant -> { + String commitTime = HoodieActiveTimeline.createNewInstantTime(); + table.scheduleRollback(context, commitTime, instant, false, config.shouldRollbackUsingMarkers()); + table.rollback(context, commitTime, instant, true, true); + }); + table.getMetaClient().reloadActiveTimeline(); + } + return recordsAndPendingClusteringFileGroups.getLeft(); } @Override - public HoodieWriteMetadata> execute(JavaRDD> inputRecordsRDD) { - HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); + public HoodieWriteMetadata> execute(HoodieData> inputRecords) { // Cache the tagged records, so we don't end up computing both // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling - if (inputRecordsRDD.getStorageLevel() == StorageLevel.NONE()) { - inputRecordsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); + JavaRDD> inputRDD = HoodieJavaRDD.getJavaRDD(inputRecords); + if (inputRDD.getStorageLevel() == StorageLevel.NONE()) { + inputRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); } else { - LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel()); + LOG.info("RDD PreppedRecords was persisted at: " + inputRDD.getStorageLevel()); } - WorkloadProfile profile = null; + WorkloadProfile workloadProfile = null; if (isWorkloadProfileNeeded()) { - profile = new WorkloadProfile(buildProfile(inputRecordsRDD)); - LOG.info("Workload profile :" + profile); - saveWorkloadProfileMetadataToInflight(profile, instantTime); + context.setJobStatus(this.getClass().getSimpleName(), "Building workload profile: " + config.getTableName()); + workloadProfile = new WorkloadProfile(buildProfile(inputRecords), operationType, table.getIndex().canIndexLogFiles()); + LOG.info("Input workload profile :" + workloadProfile); } // partition using the insert partitioner - final Partitioner partitioner = getPartitioner(profile); - JavaRDD> partitionedRecords = partition(inputRecordsRDD, 
partitioner); - JavaRDD writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> { - if (WriteOperationType.isChangingRecords(operationType)) { - return handleUpsertPartition(instantTime, partition, recordItr, partitioner); - } else { - return handleInsertPartition(instantTime, partition, recordItr, partitioner); - } - }, true).flatMap(List::iterator); + final Partitioner partitioner = getPartitioner(workloadProfile); + if (isWorkloadProfileNeeded()) { + saveWorkloadProfileMetadataToInflight(workloadProfile, instantTime); + } + + // handle records update with clustering + Set fileGroupsInPendingClustering = + table.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getKey).collect(Collectors.toSet()); + HoodieData> inputRecordsWithClusteringUpdate = fileGroupsInPendingClustering.isEmpty() ? inputRecords : clusteringHandleUpdate(inputRecords, fileGroupsInPendingClustering); - updateIndexAndCommitIfNeeded(writeStatusRDD, result); + context.setJobStatus(this.getClass().getSimpleName(), "Doing partition and writing data: " + config.getTableName()); + HoodieData writeStatuses = mapPartitionsAsRDD(inputRecordsWithClusteringUpdate, partitioner); + HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); + updateIndexAndCommitIfNeeded(writeStatuses, result); return result; } - private Pair, WorkloadStat> buildProfile(JavaRDD> inputRecordsRDD) { + private Pair, WorkloadStat> buildProfile(HoodieData> inputRecords) { HashMap partitionPathStatMap = new HashMap<>(); WorkloadStat globalStat = new WorkloadStat(); // group the records by partitionPath + currentLocation combination, count the number of // records in each partition - Map>, Long> partitionLocationCounts = inputRecordsRDD - .mapToPair(record -> new Tuple2<>( + Map>, Long> partitionLocationCounts = inputRecords + .mapToPair(record -> Pair.of( new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record)) .countByKey(); @@ -156,24 +210,27 @@ private Pair, WorkloadStat> buildProfile(JavaRDD layoutPartitionerClass = table.getStorageLayout().layoutPartitionerClass(); + if (layoutPartitionerClass.isPresent()) { + return getLayoutPartitioner(profile, layoutPartitionerClass.get()); + } else if (WriteOperationType.isChangingRecords(operationType)) { return getUpsertPartitioner(profile); } else { return getInsertPartitioner(profile); } } - private JavaRDD> partition(JavaRDD> dedupedRecords, Partitioner partitioner) { - JavaPairRDD> mappedRDD = dedupedRecords.mapToPair( - record -> new Tuple2<>(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record)); + private HoodieData mapPartitionsAsRDD(HoodieData> dedupedRecords, Partitioner partitioner) { + JavaPairRDD>, HoodieRecord> mappedRDD = HoodieJavaPairRDD.getJavaPairRDD( + dedupedRecords.mapToPair(record -> Pair.of(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record))); - JavaPairRDD> partitionedRDD; + JavaPairRDD>, HoodieRecord> partitionedRDD; if (table.requireSortedRecords()) { // Partition and sort within each partition as a single step. This is faster than partitioning first and then // applying a sort. 
- Comparator comparator = (Comparator & Serializable)(t1, t2) -> { - HoodieKey key1 = (HoodieKey) t1._1; - HoodieKey key2 = (HoodieKey) t2._1; + Comparator>> comparator = (Comparator>> & Serializable)(t1, t2) -> { + HoodieKey key1 = t1._1; + HoodieKey key2 = t2._1; return key1.getRecordKey().compareTo(key2.getRecordKey()); }; @@ -182,20 +239,30 @@ record -> new Tuple2<>(new Tuple2<>(record.getKey(), Option.ofNullable(record.ge // Partition only partitionedRDD = mappedRDD.partitionBy(partitioner); } - - return partitionedRDD.map(Tuple2::_2); + return HoodieJavaRDD.of(partitionedRDD.map(Tuple2::_2).mapPartitionsWithIndex((partition, recordItr) -> { + if (WriteOperationType.isChangingRecords(operationType)) { + return handleUpsertPartition(instantTime, partition, recordItr, partitioner); + } else { + return handleInsertPartition(instantTime, partition, recordItr, partitioner); + } + }, true).flatMap(List::iterator)); } - protected void updateIndexAndCommitIfNeeded(JavaRDD writeStatusRDD, HoodieWriteMetadata result) { + protected HoodieData updateIndex(HoodieData writeStatuses, HoodieWriteMetadata> result) { // cache writeStatusRDD before updating index, so that all actions before this are not triggered again for future // RDD actions that are performed after updating the index. - writeStatusRDD = writeStatusRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps())); + writeStatuses.persist(config.getString(WRITE_STATUS_STORAGE_LEVEL_VALUE)); Instant indexStartTime = Instant.now(); // Update the index back - JavaRDD statuses = table.getIndex().updateLocation(writeStatusRDD, context, table); + HoodieData statuses = table.getIndex().updateLocation(writeStatuses, context, table); result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); result.setWriteStatuses(statuses); - result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(statuses)); + return statuses; + } + + protected void updateIndexAndCommitIfNeeded(HoodieData writeStatusRDD, HoodieWriteMetadata> result) { + updateIndex(writeStatusRDD, result); + result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(result)); commitOnAutoCommit(result); } @@ -205,24 +272,29 @@ protected String getCommitActionType() { } @Override - protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { - commit(extraMetadata, result, result.getWriteStatuses().map(WriteStatus::getStat).collect()); + protected void setCommitMetadata(HoodieWriteMetadata> result) { + result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(result.getWriteStatuses().map(WriteStatus::getStat).collectAsList(), + result.getPartitionToReplaceFileIds(), + extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()))); + } + + @Override + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { + context.setJobStatus(this.getClass().getSimpleName(), "Commit write status collect: " + config.getTableName()); + commit(extraMetadata, result, result.getWriteStatuses().map(WriteStatus::getStat).collectAsList()); } - protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List writeStats) { + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List writeStats) { String actionType = getCommitActionType(); - LOG.info("Committing " + instantTime + ", action Type " + actionType); + LOG.info("Committing " + instantTime + ", action Type " + actionType + ", operation Type " + operationType); result.setCommitted(true); 
result.setWriteStats(writeStats); // Finalize write finalizeWrite(instantTime, writeStats, result); - try { - LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType()); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), - extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); - + HoodieCommitMetadata metadata = result.getCommitMetadata().get(); + writeTableMetadata(metadata, actionType); activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); LOG.info("Committed " + instantTime); @@ -233,14 +305,14 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta } } - protected Map> getPartitionToReplacedFileIds(JavaRDD writeStatuses) { + protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeStatuses) { return Collections.emptyMap(); } @SuppressWarnings("unchecked") protected Iterator> handleUpsertPartition(String instantTime, Integer partition, Iterator recordItr, Partitioner partitioner) { - UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; + SparkHoodiePartitioner upsertPartitioner = (SparkHoodiePartitioner) partitioner; BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); BucketType btype = binfo.bucketType; try { @@ -270,54 +342,51 @@ public Iterator> handleUpdate(String partitionPath, String fil // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records if (!recordItr.hasNext()) { LOG.info("Empty partition with fileId => " + fileId); - return Collections.singletonList((List) Collections.EMPTY_LIST).iterator(); + return Collections.emptyIterator(); } // these are updates HoodieMergeHandle upsertHandle = getUpdateHandle(partitionPath, fileId, recordItr); return handleUpdateInternal(upsertHandle, fileId); } - protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) + protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) throws IOException { if (upsertHandle.getOldFilePath() == null) { throw new HoodieUpsertException( "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); } else { - SparkMergeHelper.newInstance().runMerge(table, upsertHandle); + HoodieMergeHelper.newInstance().runMerge(table, upsertHandle); } // TODO(vc): This needs to be revisited - if (upsertHandle.getWriteStatus().getPartitionPath() == null) { + if (upsertHandle.getPartitionPath() == null) { LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.getWriteStatus()); + + upsertHandle.writeStatuses()); } - return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); + + return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); } protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator> recordItr) { if (table.requireSortedRecords()) { - return new HoodieSortedMergeHandle<>(config, instantTime, (HoodieSparkTable) table, recordItr, partitionPath, fileId, taskContextSupplier); + return new HoodieSortedMergeHandle<>(config, instantTime, (HoodieSparkTable) table, recordItr, partitionPath, fileId, taskContextSupplier, + keyGeneratorOpt); + } else if (!WriteOperationType.isChangingRecords(operationType) && 
config.allowDuplicateInserts()) { + return new HoodieConcatHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt); } else { - return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier); + return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, keyGeneratorOpt); } } - protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, - Map> keyToNewRecords, - HoodieBaseFile dataFileToBeMerged) { - return new HoodieMergeHandle<>(config, instantTime, table, keyToNewRecords, - partitionPath, fileId, dataFileToBeMerged, taskContextSupplier); - } - @Override public Iterator> handleInsert(String idPfx, Iterator> recordItr) throws Exception { // This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records if (!recordItr.hasNext()) { LOG.info("Empty partition"); - return Collections.singletonList((List) Collections.EMPTY_LIST).iterator(); + return Collections.emptyIterator(); } - return new SparkLazyInsertIterable(recordItr, true, config, instantTime, table, idPfx, + return new SparkLazyInsertIterable<>(recordItr, true, config, instantTime, table, idPfx, taskContextSupplier, new CreateHandleFactory<>()); } @@ -325,11 +394,21 @@ public Partitioner getUpsertPartitioner(WorkloadProfile profile) { if (profile == null) { throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); } - return new UpsertPartitioner(profile, context, table, config); + return new UpsertPartitioner<>(profile, context, table, config); } public Partitioner getInsertPartitioner(WorkloadProfile profile) { return getUpsertPartitioner(profile); } + public Partitioner getLayoutPartitioner(WorkloadProfile profile, String layoutPartitionerClass) { + return (Partitioner) ReflectionUtils.loadClass(layoutPartitionerClass, + new Class[] { WorkloadProfile.class, HoodieEngineContext.class, HoodieTable.class, HoodieWriteConfig.class }, + profile, context, table, config); + } + + @Override + protected void runPrecommitValidators(HoodieWriteMetadata> writeMetadata) { + SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertDataInternalWriterHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertDataInternalWriterHelper.java new file mode 100644 index 0000000000000..12e9dda81a5bc --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertDataInternalWriterHelper.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.HoodieInternalWriteStatus; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.row.HoodieRowCreateHandle; +import org.apache.hudi.keygen.BuiltinKeyGenerator; +import org.apache.hudi.keygen.NonpartitionedKeyGenerator; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.UUID; + +/** + * Helper class for HoodieBulkInsertDataInternalWriter used by Spark datasource v2. + */ +public class BulkInsertDataInternalWriterHelper { + + private static final Logger LOG = LogManager.getLogger(BulkInsertDataInternalWriterHelper.class); + + private final String instantTime; + private final int taskPartitionId; + private final long taskId; + private final long taskEpochId; + private final HoodieTable hoodieTable; + private final HoodieWriteConfig writeConfig; + private final StructType structType; + private final Boolean arePartitionRecordsSorted; + private final List writeStatusList = new ArrayList<>(); + private final String fileIdPrefix; + private final Map handles = new HashMap<>(); + private final boolean populateMetaFields; + private final boolean shouldPreserveHoodieMetadata; + private final Option keyGeneratorOpt; + private final boolean simpleKeyGen; + private final int simplePartitionFieldIndex; + private final DataType simplePartitionFieldDataType; + /** + * NOTE: This is stored as Catalyst's internal {@link UTF8String} to avoid + * conversion (deserialization) b/w {@link UTF8String} and {@link String} + */ + private UTF8String lastKnownPartitionPath = null; + private HoodieRowCreateHandle handle; + private int numFilesWritten = 0; + + public BulkInsertDataInternalWriterHelper(HoodieTable hoodieTable, HoodieWriteConfig writeConfig, + String instantTime, int taskPartitionId, long taskId, long taskEpochId, StructType structType, + boolean populateMetaFields, boolean arePartitionRecordsSorted) { + this(hoodieTable, writeConfig, instantTime, taskPartitionId, taskId, taskEpochId, structType, + populateMetaFields, arePartitionRecordsSorted, false); + } + + public BulkInsertDataInternalWriterHelper(HoodieTable hoodieTable, HoodieWriteConfig writeConfig, + String instantTime, int taskPartitionId, long taskId, long taskEpochId, StructType structType, + boolean populateMetaFields, boolean arePartitionRecordsSorted, boolean shouldPreserveHoodieMetadata) { + this.hoodieTable = hoodieTable; + this.writeConfig = writeConfig; + this.instantTime = instantTime; + this.taskPartitionId = taskPartitionId; + this.taskId = taskId; + this.taskEpochId = taskEpochId; + this.structType = structType; + this.populateMetaFields = 
populateMetaFields; + this.shouldPreserveHoodieMetadata = shouldPreserveHoodieMetadata; + this.arePartitionRecordsSorted = arePartitionRecordsSorted; + this.fileIdPrefix = UUID.randomUUID().toString(); + + if (!populateMetaFields) { + this.keyGeneratorOpt = getKeyGenerator(writeConfig.getProps()); + } else { + this.keyGeneratorOpt = Option.empty(); + } + + if (keyGeneratorOpt.isPresent() && keyGeneratorOpt.get() instanceof SimpleKeyGenerator) { + this.simpleKeyGen = true; + this.simplePartitionFieldIndex = (Integer) structType.getFieldIndex(keyGeneratorOpt.get().getPartitionPathFields().get(0)).get(); + this.simplePartitionFieldDataType = structType.fields()[simplePartitionFieldIndex].dataType(); + } else { + this.simpleKeyGen = false; + this.simplePartitionFieldIndex = -1; + this.simplePartitionFieldDataType = null; + } + } + + /** + * Instantiate {@link BuiltinKeyGenerator}. + * + * @param properties properties map. + * @return the key generator thus instantiated. + */ + private Option getKeyGenerator(Properties properties) { + TypedProperties typedProperties = new TypedProperties(); + typedProperties.putAll(properties); + if (Option.ofNullable(properties.get(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key())) + .map(v -> v.equals(NonpartitionedKeyGenerator.class.getName())).orElse(false)) { + return Option.empty(); // Do not instantiate NonPartitionKeyGen + } else { + try { + return Option.of((BuiltinKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(typedProperties)); + } catch (ClassCastException cce) { + throw new HoodieIOException("Only those key generators implementing BuiltInKeyGenerator interface is supported with virtual keys"); + } catch (IOException e) { + throw new HoodieIOException("Key generator instantiation failed ", e); + } + } + } + + public void write(InternalRow row) throws IOException { + try { + UTF8String partitionPath = extractPartitionPath(row); + if (lastKnownPartitionPath == null || !Objects.equals(lastKnownPartitionPath, partitionPath) || !handle.canWrite()) { + LOG.info("Creating new file for partition path " + partitionPath); + handle = getRowCreateHandle(partitionPath.toString()); + // NOTE: It's crucial to make a copy here, since [[UTF8String]] could be pointing into + // a mutable underlying buffer + lastKnownPartitionPath = partitionPath.clone(); + } + + handle.write(row); + } catch (Throwable t) { + LOG.error("Global error thrown while trying to write records in HoodieRowCreateHandle ", t); + throw t; + } + } + + public List getWriteStatuses() throws IOException { + close(); + return writeStatusList; + } + + public void abort() {} + + public void close() throws IOException { + for (HoodieRowCreateHandle rowCreateHandle : handles.values()) { + writeStatusList.add(rowCreateHandle.close()); + } + handles.clear(); + handle = null; + } + + private UTF8String extractPartitionPath(InternalRow row) { + if (populateMetaFields) { + // In case meta-fields are materialized w/in the table itself, we can just simply extract + // partition path from there + // + // NOTE: Helper keeps track of [[lastKnownPartitionPath]] as [[UTF8String]] to avoid + // conversion from Catalyst internal representation into a [[String]] + return row.getUTF8String(HoodieRecord.PARTITION_PATH_META_FIELD_ORD); + } else if (keyGeneratorOpt.isPresent()) { + return keyGeneratorOpt.get().getPartitionPath(row, structType); + } else { + return UTF8String.EMPTY_UTF8; + } + } + + private HoodieRowCreateHandle getRowCreateHandle(String partitionPath) throws IOException { + if 
(!handles.containsKey(partitionPath)) { // if there is no handle corresponding to the partition path + // if records are sorted, we can close all existing handles + if (arePartitionRecordsSorted) { + close(); + } + HoodieRowCreateHandle rowCreateHandle = createHandle(partitionPath); + handles.put(partitionPath, rowCreateHandle); + } else if (!handles.get(partitionPath).canWrite()) { + // even if there is a handle to the partition path, it could have reached its max size threshold. So, we close the handle here and + // create a new one. + writeStatusList.add(handles.remove(partitionPath).close()); + HoodieRowCreateHandle rowCreateHandle = createHandle(partitionPath); + handles.put(partitionPath, rowCreateHandle); + } + return handles.get(partitionPath); + } + + private HoodieRowCreateHandle createHandle(String partitionPath) { + return new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(), + instantTime, taskPartitionId, taskId, taskEpochId, structType, shouldPreserveHoodieMetadata); + } + + private String getNextFileId() { + return String.format("%s-%d", fileIdPrefix, numFilesWritten++); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBucketIndexPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBucketIndexPartitioner.java new file mode 100644 index 0000000000000..65a45e1c6a047 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBucketIndexPartitioner.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.hudi.index.bucket.BucketIdentifier; +import scala.Tuple2; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.index.bucket.HoodieBucketIndex; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.WorkloadStat; + +/** + * Packs incoming records to be inserted into buckets (1 bucket = 1 RDD partition). 
+ */ +public class SparkBucketIndexPartitioner> extends + SparkHoodiePartitioner { + + private final int numBuckets; + private final String indexKeyField; + private final int totalPartitionPaths; + private final List partitionPaths; + /** + * Helps get the RDD partition id, partition id is partition offset + bucket id. + * The partition offset is a multiple of the bucket num. + */ + private final Map partitionPathOffset; + + /** + * Partition path and file groups in it pair. Decide the file group an incoming update should go to. + */ + private Map> updatePartitionPathFileIds; + + public SparkBucketIndexPartitioner(WorkloadProfile profile, + HoodieEngineContext context, + HoodieTable table, + HoodieWriteConfig config) { + super(profile, table); + if (!(table.getIndex() instanceof HoodieBucketIndex)) { + throw new HoodieException( + " Bucket index partitioner should only be used by BucketIndex other than " + + table.getIndex().getClass().getSimpleName()); + } + this.numBuckets = ((HoodieBucketIndex) table.getIndex()).getNumBuckets(); + this.indexKeyField = config.getBucketIndexHashField(); + this.totalPartitionPaths = profile.getPartitionPaths().size(); + partitionPaths = new ArrayList<>(profile.getPartitionPaths()); + partitionPathOffset = new HashMap<>(); + int i = 0; + for (Object partitionPath : profile.getPartitionPaths()) { + partitionPathOffset.put(partitionPath.toString(), i); + i += numBuckets; + } + assignUpdates(profile); + } + + private void assignUpdates(WorkloadProfile profile) { + updatePartitionPathFileIds = new HashMap<>(); + // each update location gets a partition + Set> partitionStatEntries = profile.getInputPartitionPathStatMap() + .entrySet(); + for (Entry partitionStat : partitionStatEntries) { + if (!updatePartitionPathFileIds.containsKey(partitionStat.getKey())) { + updatePartitionPathFileIds.put(partitionStat.getKey(), new HashSet<>()); + } + for (Entry> updateLocEntry : + partitionStat.getValue().getUpdateLocationToCount().entrySet()) { + updatePartitionPathFileIds.get(partitionStat.getKey()).add(updateLocEntry.getKey()); + } + } + } + + @Override + public BucketInfo getBucketInfo(int bucketNumber) { + String partitionPath = partitionPaths.get(bucketNumber / numBuckets); + String bucketId = BucketIdentifier.bucketIdStr(bucketNumber % numBuckets); + Option fileIdOption = Option.fromJavaOptional(updatePartitionPathFileIds + .getOrDefault(partitionPath, Collections.emptySet()).stream() + .filter(e -> e.startsWith(bucketId)) + .findFirst()); + if (fileIdOption.isPresent()) { + return new BucketInfo(BucketType.UPDATE, fileIdOption.get(), partitionPath); + } else { + return new BucketInfo(BucketType.INSERT, BucketIdentifier.newBucketFileIdPrefix(bucketId), partitionPath); + } + } + + @Override + public int numPartitions() { + return totalPartitionPaths * numBuckets; + } + + @Override + public int getPartition(Object key) { + Tuple2> keyLocation = (Tuple2>) key; + String partitionPath = keyLocation._1.getPartitionPath(); + Option location = keyLocation._2; + int bucketId = location.isPresent() + ? 
BucketIdentifier.bucketIdFromFileId(location.get().getFileId()) + : BucketIdentifier.getBucketId(keyLocation._1, indexKeyField, numBuckets); + return partitionPathOffset.get(partitionPath) + bucketId; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java index fb8b5f9cd3e05..f4b01c887b068 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java @@ -18,36 +18,35 @@ package org.apache.hudi.table.action.commit; -import java.util.Map; - import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; + +import java.util.Map; public class SparkBulkInsertCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private final JavaRDD> inputRecordsRDD; - private final Option> bulkInsertPartitioner; + private final HoodieData> inputRecordsRDD; + private final Option bulkInsertPartitioner; public SparkBulkInsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner) { + String instantTime, HoodieData> inputRecordsRDD, + Option bulkInsertPartitioner) { this(context, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty()); } public SparkBulkInsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner, + String instantTime, HoodieData> inputRecordsRDD, + Option bulkInsertPartitioner, Option> extraMetadata) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata); this.inputRecordsRDD = inputRecordsRDD; @@ -55,7 +54,7 @@ public SparkBulkInsertCommitActionExecutor(HoodieSparkEngineContext context, Hoo } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { try { return SparkBulkInsertHelper.newInstance().bulkInsert(inputRecordsRDD, instantTime, table, config, this, true, bulkInsertPartitioner); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index 9ccd66b2cd265..9f1a7813e8c74 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -19,15 +19,18 @@ package 
org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory; import org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction; +import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.io.WriteHandleFactory; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -35,68 +38,79 @@ import org.apache.spark.api.java.JavaRDD; import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; /** - * A spark implementation of {@link AbstractBulkInsertHelper}. + * A spark implementation of {@link BaseBulkInsertHelper}. * * @param */ @SuppressWarnings("checkstyle:LineLength") -public class SparkBulkInsertHelper extends AbstractBulkInsertHelper>, - JavaRDD, JavaRDD, R> { +public class SparkBulkInsertHelper extends BaseBulkInsertHelper>, + HoodieData, HoodieData, R> { private SparkBulkInsertHelper() { } private static class BulkInsertHelperHolder { - private static final SparkBulkInsertHelper SPARK_BULK_INSERT_HELPER = new SparkBulkInsertHelper(); + private static final SparkBulkInsertHelper HOODIE_BULK_INSERT_HELPER = new SparkBulkInsertHelper<>(); } public static SparkBulkInsertHelper newInstance() { - return BulkInsertHelperHolder.SPARK_BULK_INSERT_HELPER; + return BulkInsertHelperHolder.HOODIE_BULK_INSERT_HELPER; } @Override - public HoodieWriteMetadata> bulkInsert(JavaRDD> inputRecords, - String instantTime, - HoodieTable>, JavaRDD, JavaRDD> table, - HoodieWriteConfig config, - BaseCommitActionExecutor>, JavaRDD, JavaRDD, R> executor, - boolean performDedupe, - Option> userDefinedBulkInsertPartitioner) { + public HoodieWriteMetadata> bulkInsert(final HoodieData> inputRecords, + final String instantTime, + final HoodieTable>, HoodieData, HoodieData> table, + final HoodieWriteConfig config, + final BaseCommitActionExecutor>, HoodieData, HoodieData, R> executor, + final boolean performDedupe, + final Option userDefinedBulkInsertPartitioner) { HoodieWriteMetadata result = new HoodieWriteMetadata(); + //transition bulk_insert state to inflight + table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(HoodieInstant.State.REQUESTED, + executor.getCommitActionType(), instantTime), Option.empty(), + config.shouldAllowMultiWriteOnSameInstant()); + + BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.orElse(BulkInsertInternalPartitionerFactory.get(table, config)); + + // write new files + HoodieData writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, partitioner, false, + config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false)); + //update index + ((BaseSparkCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatuses, result); + return result; + } + + @Override + public HoodieData bulkInsert(HoodieData> inputRecords, + String instantTime, + HoodieTable>, HoodieData, HoodieData> table, + HoodieWriteConfig config, + boolean performDedupe, 
+ BulkInsertPartitioner partitioner, + boolean useWriterSchema, + int parallelism, + WriteHandleFactory writeHandleFactory) { + // De-dupe/merge if needed - JavaRDD> dedupedRecords = inputRecords; + HoodieData> dedupedRecords = inputRecords; if (performDedupe) { - dedupedRecords = (JavaRDD>) SparkWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, - config.getBulkInsertShuffleParallelism(), table); + dedupedRecords = (HoodieData>) HoodieWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, + parallelism, table); } - final JavaRDD> repartitionedRecords; - final int parallelism = config.getBulkInsertShuffleParallelism(); - BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent() - ? userDefinedBulkInsertPartitioner.get() - : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode()); - repartitionedRecords = (JavaRDD>) partitioner.repartitionRecords(dedupedRecords, parallelism); + // only JavaRDD is supported for Spark partitioner, but it is not enforced by BulkInsertPartitioner API. To improve this, TODO HUDI-3463 + final HoodieData> repartitionedRecords = HoodieJavaRDD.of((JavaRDD>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism)); - // generate new file ID prefixes for each output partition - final List fileIDPrefixes = - IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList()); - - table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(HoodieInstant.State.REQUESTED, - table.getMetaClient().getCommitActionType(), instantTime), Option.empty(), - config.shouldAllowMultiWriteOnSameInstant()); - - JavaRDD writeStatusRDD = repartitionedRecords - .mapPartitionsWithIndex(new BulkInsertMapFunction(instantTime, - partitioner.arePartitionRecordsSorted(), config, table, fileIDPrefixes), true) + JavaRDD writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords) + .mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, + partitioner.arePartitionRecordsSorted(), config, table, useWriterSchema, partitioner, writeHandleFactory), true) .flatMap(List::iterator); - ((BaseSparkCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatusRDD, result); - return result; + return HoodieJavaRDD.of(writeStatusRDD); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java index e6b6809498e29..8862981c2a2b7 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java @@ -20,35 +20,34 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import 
org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkBulkInsertPreppedCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private final JavaRDD> preppedInputRecordRdd; - private final Option> userDefinedBulkInsertPartitioner; + private final HoodieData> preppedInputRecordRdd; + private final Option userDefinedBulkInsertPartitioner; public SparkBulkInsertPreppedCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> preppedInputRecordRdd, - Option> userDefinedBulkInsertPartitioner) { + String instantTime, HoodieData> preppedInputRecordRdd, + Option userDefinedBulkInsertPartitioner) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT); this.preppedInputRecordRdd = preppedInputRecordRdd; this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner; } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { try { return SparkBulkInsertHelper.newInstance().bulkInsert(preppedInputRecordRdd, instantTime, table, config, this, false, userDefinedBulkInsertPartitioner); @@ -60,4 +59,4 @@ public HoodieWriteMetadata> execute() { } } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteCommitActionExecutor.java index 997c7bf2376e3..a6fc996b71c31 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteCommitActionExecutor.java @@ -20,29 +20,28 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkDeleteCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private final JavaRDD keys; + private final HoodieData keys; public SparkDeleteCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD keys) { + String instantTime, HoodieData keys) { super(context, config, table, instantTime, WriteOperationType.DELETE); this.keys = keys; } @Override - public HoodieWriteMetadata> execute() { - return SparkDeleteHelper.newInstance().execute(instantTime, keys, context, config, table, this); + public HoodieWriteMetadata> execute() { + return HoodieDeleteHelper.newInstance().execute(instantTime, keys, context, config, table, this); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java deleted file mode 100644 index 01f9964b61bb2..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java +++ /dev/null @@ -1,123 +0,0 @@ -/* 
- * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.EmptyHoodieRecordPayload; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieUpsertException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.WorkloadProfile; -import org.apache.hudi.table.WorkloadStat; -import org.apache.hudi.table.action.HoodieWriteMetadata; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.time.Duration; -import java.time.Instant; -import java.util.HashMap; - -/** - * A spark implementation of {@link AbstractDeleteHelper}. 
- * - * @param - */ -@SuppressWarnings("checkstyle:LineLength") -public class SparkDeleteHelper extends - AbstractDeleteHelper>, JavaRDD, JavaRDD, R> { - private SparkDeleteHelper() { - } - - private static class DeleteHelperHolder { - private static final SparkDeleteHelper SPARK_DELETE_HELPER = new SparkDeleteHelper(); - } - - public static SparkDeleteHelper newInstance() { - return DeleteHelperHolder.SPARK_DELETE_HELPER; - } - - @Override - public JavaRDD deduplicateKeys(JavaRDD keys, HoodieTable>, JavaRDD, JavaRDD> table, int parallelism) { - boolean isIndexingGlobal = table.getIndex().isGlobal(); - if (isIndexingGlobal) { - return keys.keyBy(HoodieKey::getRecordKey) - .reduceByKey((key1, key2) -> key1, parallelism) - .values(); - } else { - return keys.distinct(parallelism); - } - } - - @Override - public HoodieWriteMetadata> execute(String instantTime, - JavaRDD keys, - HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - BaseCommitActionExecutor>, JavaRDD, JavaRDD, R> deleteExecutor) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - - try { - HoodieWriteMetadata result = null; - JavaRDD dedupedKeys = keys; - final int parallelism = config.getDeleteShuffleParallelism(); - if (config.shouldCombineBeforeDelete()) { - // De-dupe/merge if needed - dedupedKeys = deduplicateKeys(keys, table, parallelism); - } else if (!keys.partitions().isEmpty()) { - dedupedKeys = keys.repartition(parallelism); - } - - JavaRDD> dedupedRecords = - dedupedKeys.map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload())); - Instant beginTag = Instant.now(); - // perform index loop up to get existing location of records - JavaRDD> taggedRecords = - table.getIndex().tagLocation(dedupedRecords, context, table); - Duration tagLocationDuration = Duration.between(beginTag, Instant.now()); - - // filter out non existent keys/records - JavaRDD> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown); - if (!taggedValidRecords.isEmpty()) { - result = deleteExecutor.execute(taggedValidRecords); - result.setIndexLookupDuration(tagLocationDuration); - } else { - // if entire set of keys are non existent - deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime); - result = new HoodieWriteMetadata(); - result.setWriteStatuses(jsc.emptyRDD()); - deleteExecutor.commitOnAutoCommit(result); - } - return result; - } catch (Throwable e) { - if (e instanceof HoodieUpsertException) { - throw (HoodieUpsertException) e; - } - throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e); - } - } - -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeletePartitionCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeletePartitionCommitActionExecutor.java new file mode 100644 index 0000000000000..149aef03e238a --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeletePartitionCommitActionExecutor.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
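
The SparkDeleteHelper deleted above carried the delete-key de-duplication rule; the shared HoodieDeleteHelper now referenced by SparkDeleteCommitActionExecutor is expected to preserve the same semantics. A self-contained sketch of that rule, lifted from the removed code: with a global index only the record key identifies a row, otherwise the full HoodieKey must be distinct.

import org.apache.hudi.common.model.HoodieKey;
import org.apache.spark.api.java.JavaRDD;

final class DeleteKeyDedupSketch {
  // Mirrors SparkDeleteHelper#deduplicateKeys from the hunk above.
  static JavaRDD<HoodieKey> deduplicateKeys(JavaRDD<HoodieKey> keys, boolean isIndexingGlobal, int parallelism) {
    if (isIndexingGlobal) {
      // Global index: rows may only differ in partition path, so collapse on the record key.
      return keys.keyBy(HoodieKey::getRecordKey)
          .reduceByKey((key1, key2) -> key1, parallelism)
          .values();
    }
    // Non-global index: the (record key, partition path) pair is the identity.
    return keys.distinct(parallelism);
  }
}
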
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.exception.HoodieDeletePartitionException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.WorkloadStat; +import org.apache.hudi.table.action.HoodieWriteMetadata; + +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION; + +public class SparkDeletePartitionCommitActionExecutor> + extends SparkInsertOverwriteCommitActionExecutor { + + private List partitions; + public SparkDeletePartitionCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, HoodieTable table, + String instantTime, List partitions) { + super(context, config, table, instantTime,null, WriteOperationType.DELETE_PARTITION); + this.partitions = partitions; + } + + @Override + public HoodieWriteMetadata> execute() { + try { + HoodieTimer timer = new HoodieTimer().startTimer(); + context.setJobStatus(this.getClass().getSimpleName(), "Gather all file ids from all deleting partitions."); + Map> partitionToReplaceFileIds = + HoodieJavaPairRDD.getJavaPairRDD(context.parallelize(partitions).distinct() + .mapToPair(partitionPath -> Pair.of(partitionPath, getAllExistingFileIds(partitionPath)))).collectAsMap(); + HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); + result.setPartitionToReplaceFileIds(partitionToReplaceFileIds); + result.setIndexUpdateDuration(Duration.ofMillis(timer.endTimer())); + result.setWriteStatuses(context.emptyHoodieData()); + + // created requested + HoodieInstant dropPartitionsInstant = new HoodieInstant(REQUESTED, REPLACE_COMMIT_ACTION, instantTime); + if (!table.getMetaClient().getFs().exists(new Path(table.getMetaClient().getMetaPath(), + dropPartitionsInstant.getFileName()))) { + HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder() + .setOperationType(WriteOperationType.DELETE_PARTITION.name()) + .setExtraMetadata(extraMetadata.orElse(Collections.emptyMap())) + .build(); + 
table.getMetaClient().getActiveTimeline().saveToPendingReplaceCommit(dropPartitionsInstant, + TimelineMetadataUtils.serializeRequestedReplaceMetadata(requestedReplaceMetadata)); + } + + this.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), + instantTime); + this.commitOnAutoCommit(result); + return result; + } catch (Exception e) { + throw new HoodieDeletePartitionException("Failed to drop partitions for commit time " + instantTime, e); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkHoodiePartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkHoodiePartitioner.java new file mode 100644 index 0000000000000..4a5bff42153fb --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkHoodiePartitioner.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.spark.Partitioner; + +/** + * Packs incoming records to be inserted into buckets (1 bucket = 1 RDD partition). + */ +public abstract class SparkHoodiePartitioner> extends Partitioner + implements org.apache.hudi.table.action.commit.Partitioner { + + /** + * Stat for the current workload. Helps in determining inserts, upserts etc. 
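
SparkDeletePartitionCommitActionExecutor above implements a partition drop as a replacecommit: every live file id in the target partitions is recorded as replaced, no data is rewritten, and a requested replacecommit tagged DELETE_PARTITION is written to the timeline. A condensed sketch of the file-id gathering step, folding in the getAllExistingFileIds helper it inherits from SparkInsertOverwriteCommitActionExecutor (shown further below); names follow the hunks, but this is a simplification, not the full method:

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.data.HoodieJavaPairRDD;
import org.apache.hudi.table.HoodieTable;

final class DeletePartitionSketch {
  // Partition path -> latest file slice ids; these become the "replaced" files of the commit.
  static Map<String, List<String>> replacedFileIds(HoodieEngineContext context,
                                                   HoodieTable<?, ?, ?, ?> table,
                                                   List<String> partitions) {
    return HoodieJavaPairRDD.getJavaPairRDD(
        context.parallelize(partitions).distinct()
            .mapToPair(partitionPath -> Pair.of(partitionPath,
                table.getSliceView().getLatestFileSlices(partitionPath)
                    .map(FileSlice::getFileId).distinct().collect(Collectors.toList()))))
        .collectAsMap();
  }
}
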
+ */ + protected WorkloadProfile profile; + + protected final HoodieTable table; + + public SparkHoodiePartitioner(WorkloadProfile profile, HoodieTable table) { + this.profile = profile; + this.table = table; + } + + @Override + public int getNumPartitions() { + return numPartitions(); + } + + public abstract BucketInfo getBucketInfo(int bucketNumber); +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertCommitActionExecutor.java index 25891e05a4dd1..479b51322ff32 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertCommitActionExecutor.java @@ -20,30 +20,29 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkInsertCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private final JavaRDD> inputRecordsRDD; + private final HoodieData> inputRecordsRDD; public SparkInsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + String instantTime, HoodieData> inputRecordsRDD) { super(context, config, table, instantTime, WriteOperationType.INSERT); this.inputRecordsRDD = inputRecordsRDD; } @Override - public HoodieWriteMetadata> execute() { - return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, - config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, false); + public HoodieWriteMetadata> execute() { + return HoodieWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteCommitActionExecutor.java index 1e3822016a765..518063ed34186 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteCommitActionExecutor.java @@ -19,18 +19,21 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import 
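
SparkHoodiePartitioner (added above) bridges Spark's Partitioner with Hudi's own Partitioner contract and exposes bucket metadata through getBucketInfo. Illustrative only: the smallest conceivable subclass, routing every record into a single INSERT bucket, assuming the Hudi Partitioner interface requires nothing beyond getNumPartitions/getPartition and that the three-argument BucketInfo constructor used elsewhere in this patch is accessible. Real implementations such as UpsertPartitioner below derive their buckets from the workload profile.

package org.apache.hudi.table.action.commit;

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;

class SingleBucketPartitioner<T extends HoodieRecordPayload<T>> extends SparkHoodiePartitioner<T> {

  private final BucketInfo bucket;

  SingleBucketPartitioner(WorkloadProfile profile, HoodieTable table, String partitionPath) {
    super(profile, table);
    // One brand-new INSERT bucket for the caller-chosen partition path.
    this.bucket = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath);
  }

  @Override
  public int numPartitions() {
    return 1;
  }

  @Override
  public int getPartition(Object key) {
    return 0;
  }

  @Override
  public BucketInfo getBucketInfo(int bucketNumber) {
    return bucket;
  }
}
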
org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.spark.Partitioner; -import org.apache.spark.api.java.JavaRDD; -import scala.Tuple2; import java.util.List; import java.util.Map; @@ -39,31 +42,33 @@ public class SparkInsertOverwriteCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private final JavaRDD> inputRecordsRDD; + private final HoodieData> inputRecordsRDD; public SparkInsertOverwriteCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + String instantTime, HoodieData> inputRecordsRDD) { this(context, config, table, instantTime, inputRecordsRDD, WriteOperationType.INSERT_OVERWRITE); } public SparkInsertOverwriteCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, + String instantTime, HoodieData> inputRecordsRDD, WriteOperationType writeOperationType) { super(context, config, table, instantTime, writeOperationType); this.inputRecordsRDD = inputRecordsRDD; } @Override - public HoodieWriteMetadata> execute() { - return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, - config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, false); + public HoodieWriteMetadata> execute() { + return HoodieWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType); } @Override protected Partitioner getPartitioner(WorkloadProfile profile) { - return new SparkInsertOverwritePartitioner(profile, context, table, config); + return table.getStorageLayout().layoutPartitionerClass() + .map(c -> getLayoutPartitioner(profile, c)) + .orElse(new SparkInsertOverwritePartitioner(profile, context, table, config)); } @Override @@ -72,13 +77,13 @@ protected String getCommitActionType() { } @Override - protected Map> getPartitionToReplacedFileIds(JavaRDD writeStatuses) { - return writeStatuses.map(status -> status.getStat().getPartitionPath()).distinct().mapToPair(partitionPath -> - new Tuple2<>(partitionPath, getAllExistingFileIds(partitionPath))).collectAsMap(); + protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeMetadata) { + return HoodieJavaPairRDD.getJavaPairRDD(writeMetadata.getWriteStatuses().map(status -> status.getStat().getPartitionPath()).distinct().mapToPair(partitionPath -> + Pair.of(partitionPath, getAllExistingFileIds(partitionPath)))).collectAsMap(); } - private List getAllExistingFileIds(String partitionPath) { + protected List getAllExistingFileIds(String partitionPath) { // because new commit is not complete. 
it is safe to mark all existing file Ids as old files - return table.getSliceView().getLatestFileSlices(partitionPath).map(fg -> fg.getFileId()).distinct().collect(Collectors.toList()); + return table.getSliceView().getLatestFileSlices(partitionPath).map(FileSlice::getFileId).distinct().collect(Collectors.toList()); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java index 6f8be79f94e2c..dd545d5262846 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java @@ -18,10 +18,11 @@ package org.apache.hudi.table.action.commit; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -44,7 +45,7 @@ public SparkInsertOverwritePartitioner(WorkloadProfile profile, HoodieEngineCont * Returns a list of small files in the given partition path. */ protected List getSmallFiles(String partitionPath) { - // for overwrite, we ignore all existing files. So dont consider any file to be smallFiles + // for overwrite, we ignore all existing files. So do not consider any file to be smallFiles return Collections.emptyList(); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java index e349657b7e44e..93d0a8124c4ee 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwriteTableCommitActionExecutor.java @@ -19,56 +19,39 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.data.HoodieJavaPairRDD; import org.apache.hudi.table.HoodieTable; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import scala.Tuple2; +import org.apache.hudi.table.action.HoodieWriteMetadata; -import java.io.IOException; -import java.util.HashMap; +import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; public class SparkInsertOverwriteTableCommitActionExecutor> extends SparkInsertOverwriteCommitActionExecutor { public 
SparkInsertOverwriteTableCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + String instantTime, HoodieData> inputRecordsRDD) { super(context, config, table, instantTime, inputRecordsRDD, WriteOperationType.INSERT_OVERWRITE_TABLE); } - protected List getAllExistingFileIds(String partitionPath) { - return table.getSliceView().getLatestFileSlices(partitionPath) - .map(fg -> fg.getFileId()).distinct().collect(Collectors.toList()); - } - @Override - protected Map> getPartitionToReplacedFileIds(JavaRDD writeStatuses) { - Map> partitionToExistingFileIds = new HashMap<>(); - try { - List partitionPaths = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), - table.getMetaClient().getBasePath(), false); - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - if (partitionPaths != null && partitionPaths.size() > 0) { - context.setJobStatus(this.getClass().getSimpleName(), "Getting ExistingFileIds of all partitions"); - JavaRDD partitionPathRdd = jsc.parallelize(partitionPaths, partitionPaths.size()); - partitionToExistingFileIds = partitionPathRdd.mapToPair( - partitionPath -> new Tuple2<>(partitionPath, getAllExistingFileIds(partitionPath))).collectAsMap(); - } - } catch (IOException e) { - throw new HoodieCommitException("In InsertOverwriteTable action failed to get existing fileIds of all partition " - + config.getBasePath() + " at time " + instantTime, e); + protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeMetadata) { + List partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), table.getMetaClient().getBasePath()); + if (partitionPaths == null || partitionPaths.isEmpty()) { + return Collections.emptyMap(); } - return partitionToExistingFileIds; + context.setJobStatus(this.getClass().getSimpleName(), "Getting ExistingFileIds of all partitions"); + return HoodieJavaPairRDD.getJavaPairRDD(context.parallelize(partitionPaths, partitionPaths.size()).mapToPair( + partitionPath -> Pair.of(partitionPath, getAllExistingFileIds(partitionPath)))).collectAsMap(); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertPreppedCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertPreppedCommitActionExecutor.java index 400147bb8fe70..ff1a7e2b9beeb 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertPreppedCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertPreppedCommitActionExecutor.java @@ -20,29 +20,28 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkInsertPreppedCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private final JavaRDD> preppedRecords; + private final HoodieData> preppedRecords; public SparkInsertPreppedCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable 
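
The two overwrite executors above differ only in how wide the replacement is: INSERT_OVERWRITE replaces the latest file slices of the partitions that actually received writes, while INSERT_OVERWRITE_TABLE lists every partition of the table (via FSUtils and the metadata table config) and replaces them all. A condensed contrast, using only calls that appear in the hunks:

import java.util.List;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;

final class OverwriteScopeSketch {

  // INSERT_OVERWRITE: scope is derived from the write statuses of this very commit.
  static HoodieData<String> touchedPartitions(HoodieData<WriteStatus> writeStatuses) {
    return writeStatuses.map(status -> status.getStat().getPartitionPath()).distinct();
  }

  // INSERT_OVERWRITE_TABLE: scope is every partition of the table.
  static List<String> allPartitions(HoodieEngineContext context, HoodieWriteConfig config,
                                    HoodieTable<?, ?, ?, ?> table) {
    return FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), table.getMetaClient().getBasePath());
  }
}
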
table, - String instantTime, JavaRDD> preppedRecords) { + String instantTime, HoodieData> preppedRecords) { super(context, config, table, instantTime, WriteOperationType.INSERT_PREPPED); this.preppedRecords = preppedRecords; } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { return super.execute(preppedRecords); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkMergeHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkMergeHelper.java deleted file mode 100644 index 08d60b93da37a..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkMergeHelper.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; -import org.apache.hudi.io.HoodieMergeHandle; -import org.apache.hudi.io.storage.HoodieFileReader; -import org.apache.hudi.io.storage.HoodieFileReaderFactory; -import org.apache.hudi.table.HoodieTable; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.BinaryDecoder; -import org.apache.avro.io.BinaryEncoder; -import org.apache.hadoop.conf.Configuration; -import org.apache.spark.api.java.JavaRDD; - -import java.io.IOException; -import java.util.Iterator; - -public class SparkMergeHelper extends AbstractMergeHelper>, - JavaRDD, JavaRDD> { - - private SparkMergeHelper() { - } - - private static class MergeHelperHolder { - private static final SparkMergeHelper SPARK_MERGE_HELPER = new SparkMergeHelper(); - } - - public static SparkMergeHelper newInstance() { - return SparkMergeHelper.MergeHelperHolder.SPARK_MERGE_HELPER; - } - - @Override - public void runMerge(HoodieTable>, JavaRDD, JavaRDD> table, - HoodieMergeHandle>, JavaRDD, JavaRDD> upsertHandle) throws IOException { - final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation(); - Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf()); - HoodieMergeHandle>, JavaRDD, JavaRDD> mergeHandle = upsertHandle; - HoodieBaseFile baseFile = mergeHandle.baseFileForMerge(); - - final 
GenericDatumWriter gWriter; - final GenericDatumReader gReader; - Schema readSchema; - if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) { - readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema(); - gWriter = new GenericDatumWriter<>(readSchema); - gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetafields()); - } else { - gReader = null; - gWriter = null; - readSchema = mergeHandle.getWriterSchemaWithMetafields(); - } - - BoundedInMemoryExecutor wrapper = null; - HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath()); - try { - final Iterator readerIterator; - if (baseFile.getBootstrapBaseFile().isPresent()) { - readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation); - } else { - readerIterator = reader.getRecordIterator(readSchema); - } - - ThreadLocal encoderCache = new ThreadLocal<>(); - ThreadLocal decoderCache = new ThreadLocal<>(); - wrapper = new SparkBoundedInMemoryExecutor(table.getConfig(), readerIterator, - new UpdateHandler(mergeHandle), record -> { - if (!externalSchemaTransformation) { - return record; - } - return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record); - }); - wrapper.execute(); - } catch (Exception e) { - throw new HoodieException(e); - } finally { - if (reader != null) { - reader.close(); - } - mergeHandle.close(); - if (null != wrapper) { - wrapper.shutdownNow(); - } - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java index fe90212b0be15..ccee9cf5a7164 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java @@ -20,30 +20,29 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkUpsertCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private JavaRDD> inputRecordsRDD; + private final HoodieData> inputRecordsRDD; public SparkUpsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + String instantTime, HoodieData> inputRecordsRDD) { super(context, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @Override - public HoodieWriteMetadata> execute() { - return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, - config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, true); + public HoodieWriteMetadata> execute() { + return HoodieWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, 
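
The removed SparkMergeHelper above decided per file how merge reads are performed: when external schema transformation is enabled or the base file is bootstrapped, records are read with the old file's own schema and rewritten into the writer schema on the fly; otherwise they are read directly with the writer schema and passed through untouched. A small sketch of that decision, using only calls visible in the deleted code (raw generics kept for brevity):

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.table.HoodieTable;

final class MergeReadSchemaSketch {
  // Returns the schema the merge reader should use for the given base file.
  static Schema pickReadSchema(HoodieTable table, HoodieMergeHandle mergeHandle, HoodieBaseFile baseFile) throws IOException {
    boolean rewriteNeeded = table.getConfig().shouldUseExternalSchemaTransformation()
        || baseFile.getBootstrapBaseFile().isPresent();
    return rewriteNeeded
        ? HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema()
        : mergeHandle.getWriterSchemaWithMetafields();
  }
}
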
table, + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, operationType); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertPreppedCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertPreppedCommitActionExecutor.java index e36073fd17d6d..73d408593bd37 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertPreppedCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertPreppedCommitActionExecutor.java @@ -20,29 +20,28 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkUpsertPreppedCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private final JavaRDD> preppedRecords; + private final HoodieData> preppedRecords; public SparkUpsertPreppedCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> preppedRecords) { + String instantTime, HoodieData> preppedRecords) { super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); this.preppedRecords = preppedRecords; } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { return super.execute(preppedRecords); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java deleted file mode 100644 index a197c91da946b..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.index.HoodieIndex; - -import org.apache.spark.api.java.JavaRDD; - -import scala.Tuple2; - -/** - * A spark implementation of {@link AbstractWriteHelper}. 
- * - * @param - */ -public class SparkWriteHelper extends AbstractWriteHelper>, - JavaRDD, JavaRDD, R> { - private SparkWriteHelper() { - } - - private static class WriteHelperHolder { - private static final SparkWriteHelper SPARK_WRITE_HELPER = new SparkWriteHelper(); - } - - public static SparkWriteHelper newInstance() { - return WriteHelperHolder.SPARK_WRITE_HELPER; - } - - @Override - public JavaRDD> deduplicateRecords(JavaRDD> records, - HoodieIndex>, JavaRDD, JavaRDD> index, - int parallelism) { - boolean isIndexingGlobal = index.isGlobal(); - return records.mapToPair(record -> { - HoodieKey hoodieKey = record.getKey(); - // If index used is global, then records are expected to differ in their partitionPath - Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; - return new Tuple2<>(key, record); - }).reduceByKey((rec1, rec2) -> { - @SuppressWarnings("unchecked") - T reducedData = (T) rec1.getData().preCombine(rec2.getData()); - HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey(); - - return new HoodieRecord(reducedKey, reducedData); - }, parallelism).map(Tuple2::_2); - } - -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java index b28c89a536469..134cfd8d2c0b5 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java @@ -18,16 +18,18 @@ package org.apache.hudi.table.action.commit; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.NumericUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; @@ -35,9 +37,9 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; import org.apache.hudi.table.WorkloadStat; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.spark.Partitioner; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFunction; @@ -54,10 +56,12 @@ import scala.Tuple2; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; + /** * Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition). 
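
The deleted SparkWriteHelper above held the upsert de-duplication rule that the shared HoodieWriteHelper now wired into the upsert/insert executors is expected to preserve: records colliding on the key are merged through the payload's preCombine, and the surviving payload keeps the key of whichever input produced it. The rule, restated as a self-contained Spark sketch lifted from the removed code:

import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

final class RecordDedupSketch {
  static <T extends HoodieRecordPayload<T>> JavaRDD<HoodieRecord<T>> deduplicateRecords(
      JavaRDD<HoodieRecord<T>> records, boolean isIndexingGlobal, int parallelism) {
    return records.mapToPair(record -> {
      HoodieKey hoodieKey = record.getKey();
      // With a global index only the record key identifies a record; otherwise the full key does.
      Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
      return new Tuple2<>(key, record);
    }).reduceByKey((rec1, rec2) -> {
      @SuppressWarnings("unchecked")
      T reducedData = (T) rec1.getData().preCombine(rec2.getData());
      // Keep the key belonging to the payload that won the preCombine.
      HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
      return new HoodieRecord<>(reducedKey, reducedData);
    }, parallelism).map(Tuple2::_2);
  }
}
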
*/ -public class UpsertPartitioner> extends Partitioner { +public class UpsertPartitioner> extends SparkHoodiePartitioner { private static final Logger LOG = LogManager.getLogger(UpsertPartitioner.class); @@ -69,10 +73,6 @@ public class UpsertPartitioner> extends Partiti * Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into. */ private int totalBuckets = 0; - /** - * Stat for the current workload. Helps in determining inserts, upserts etc. - */ - private WorkloadProfile profile; /** * Helps decide which bucket an incoming update should go to. */ @@ -86,17 +86,14 @@ public class UpsertPartitioner> extends Partiti */ private HashMap bucketInfoMap; - protected final HoodieTable table; - protected final HoodieWriteConfig config; public UpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, HoodieTable table, HoodieWriteConfig config) { + super(profile, table); updateLocationToBucket = new HashMap<>(); partitionPathToInsertBucketInfos = new HashMap<>(); bucketInfoMap = new HashMap<>(); - this.profile = profile; - this.table = table; this.config = config; assignUpdates(profile); assignInserts(profile, context); @@ -108,11 +105,19 @@ public UpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, H private void assignUpdates(WorkloadProfile profile) { // each update location gets a partition - Set> partitionStatEntries = profile.getPartitionPathStatMap().entrySet(); + Set> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet(); for (Map.Entry partitionStat : partitionStatEntries) { + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat()); for (Map.Entry> updateLocEntry : partitionStat.getValue().getUpdateLocationToCount().entrySet()) { addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey()); + if (profile.hasOutputWorkLoadStats()) { + HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey()); + outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue()); + } + } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats); } } } @@ -120,31 +125,66 @@ private void assignUpdates(WorkloadProfile profile) { private int addUpdateBucket(String partitionPath, String fileIdHint) { int bucket = totalBuckets; updateLocationToBucket.put(fileIdHint, bucket); - BucketInfo bucketInfo = new BucketInfo(); - bucketInfo.bucketType = BucketType.UPDATE; - bucketInfo.fileIdPrefix = fileIdHint; - bucketInfo.partitionPath = partitionPath; + BucketInfo bucketInfo = new BucketInfo(BucketType.UPDATE, fileIdHint, partitionPath); bucketInfoMap.put(totalBuckets, bucketInfo); totalBuckets++; return bucket; } + /** + * Get the in pending clustering fileId for each partition path. 
+ * @return partition path to pending clustering file groups id + */ + private Map> getPartitionPathToPendingClusteringFileGroupsId() { + Map> partitionPathToInPendingClusteringFileId = + table.getFileSystemView().getFileGroupsInPendingClustering() + .map(fileGroupIdAndInstantPair -> + Pair.of(fileGroupIdAndInstantPair.getKey().getPartitionPath(), fileGroupIdAndInstantPair.getKey().getFileId())) + .collect(Collectors.groupingBy(Pair::getKey, Collectors.mapping(Pair::getValue, Collectors.toSet()))); + return partitionPathToInPendingClusteringFileId; + } + + /** + * Exclude small file handling for clustering since update path is not supported. + * @param pendingClusteringFileGroupsId pending clustering file groups id of partition + * @param smallFiles small files of partition + * @return smallFiles not in clustering + */ + private List filterSmallFilesInClustering(final Set pendingClusteringFileGroupsId, final List smallFiles) { + if (!pendingClusteringFileGroupsId.isEmpty()) { + return smallFiles.stream() + .filter(smallFile -> !pendingClusteringFileGroupsId.contains(smallFile.location.getFileId())).collect(Collectors.toList()); + } else { + return smallFiles; + } + } + private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) { // for new inserts, compute buckets depending on how many records we have for each partition Set partitionPaths = profile.getPartitionPaths(); - long averageRecordSize = - averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(), - config); + /* + * NOTE: we only use commit instants to calculate average record size because replacecommit can be + * created by clustering, which has smaller average record size, which affects assigning inserts and + * may result in OOM by making spark underestimate the actual input record sizes. 
+ */ + long averageRecordSize = averageBytesPerRecord(table.getMetaClient().getActiveTimeline() + .getTimelineOfActions(CollectionUtils.createSet(COMMIT_ACTION)).filterCompletedInstants(), config); LOG.info("AvgRecordSize => " + averageRecordSize); Map> partitionSmallFilesMap = - getSmallFilesForPartitions(new ArrayList(partitionPaths), context); + getSmallFilesForPartitions(new ArrayList<>(partitionPaths), context); + + Map> partitionPathToPendingClusteringFileGroupsId = getPartitionPathToPendingClusteringFileGroupsId(); for (String partitionPath : partitionPaths) { WorkloadStat pStat = profile.getWorkloadStat(partitionPath); + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat()); if (pStat.getNumInserts() > 0) { - List smallFiles = partitionSmallFilesMap.get(partitionPath); + List smallFiles = + filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()), + partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>())); + this.smallFiles.addAll(smallFiles); LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles); @@ -157,7 +197,7 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) for (SmallFile smallFile : smallFiles) { long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, totalUnassignedInserts); - if (recordsToAppend > 0 && totalUnassignedInserts > 0) { + if (recordsToAppend > 0) { // create a new bucket or re-use an existing bucket int bucket; if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { @@ -167,9 +207,16 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId()); LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); } + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(smallFile.location, recordsToAppend); + } bucketNumbers.add(bucket); recordsPerBucket.add(recordsToAppend); totalUnassignedInserts -= recordsToAppend; + if (totalUnassignedInserts <= 0) { + // stop the loop when all the inserts are assigned + break; + } } } @@ -185,37 +232,49 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket); for (int b = 0; b < insertBuckets; b++) { bucketNumbers.add(totalBuckets); - recordsPerBucket.add(totalUnassignedInserts / insertBuckets); - BucketInfo bucketInfo = new BucketInfo(); - bucketInfo.bucketType = BucketType.INSERT; - bucketInfo.partitionPath = partitionPath; - bucketInfo.fileIdPrefix = FSUtils.createNewFileIdPfx(); + if (b < insertBuckets - 1) { + recordsPerBucket.add(insertRecordsPerBucket); + } else { + recordsPerBucket.add(totalUnassignedInserts - (insertBuckets - 1) * insertRecordsPerBucket); + } + BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath); bucketInfoMap.put(totalBuckets, bucketInfo); + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1)); + } totalBuckets++; } } // Go over all such buckets, and assign weights as per amount of incoming inserts. 
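
assignInserts above packs new records into existing small files first, then spills the remainder into fresh INSERT buckets; the patch also stops the loop as soon as everything is assigned and skips file groups that are in pending clustering. A worked sketch of just the packing arithmetic (SmallFileStub is a stand-in for the real SmallFile class, of which only the size matters here):

import java.util.List;

final class SmallFilePackingSketch {

  // Stand-in for org.apache.hudi.table.action.commit.SmallFile; only the current size is needed.
  static final class SmallFileStub {
    long sizeBytes;
  }

  // Returns how many inserts are left over after topping up the small files;
  // the leftover is what UpsertPartitioner spreads across brand-new INSERT buckets.
  static long assignToSmallFiles(List<SmallFileStub> smallFiles, long totalUnassignedInserts,
                                 long maxFileSize, long averageRecordSize) {
    for (SmallFileStub smallFile : smallFiles) {
      // Each small file can absorb roughly (maxFileSize - currentSize) / averageRecordSize records.
      long recordsToAppend = Math.min((maxFileSize - smallFile.sizeBytes) / averageRecordSize, totalUnassignedInserts);
      if (recordsToAppend > 0) {
        totalUnassignedInserts -= recordsToAppend;
        if (totalUnassignedInserts <= 0) {
          break; // everything placed; remaining small files stay untouched
        }
      }
    }
    return totalUnassignedInserts;
  }
}
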
List insertBuckets = new ArrayList<>(); - double curentCumulativeWeight = 0; + double currentCumulativeWeight = 0; for (int i = 0; i < bucketNumbers.size(); i++) { InsertBucket bkt = new InsertBucket(); bkt.bucketNumber = bucketNumbers.get(i); bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts(); - curentCumulativeWeight += bkt.weight; - insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, curentCumulativeWeight)); + currentCumulativeWeight += bkt.weight; + insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, currentCumulativeWeight)); } LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets); } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats); + } } } private Map> getSmallFilesForPartitions(List partitionPaths, HoodieEngineContext context) { JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); Map> partitionSmallFilesMap = new HashMap<>(); + + if (config.getParquetSmallFileLimit() <= 0) { + return partitionSmallFilesMap; + } + if (partitionPaths != null && partitionPaths.size() > 0) { - context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions"); + context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions: " + config.getTableName()); JavaRDD partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size()); partitionSmallFilesMap = partitionPathRdds.mapToPair((PairFunction>) partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath))).collectAsMap(); @@ -253,6 +312,10 @@ protected List getSmallFiles(String partitionPath) { return smallFileLocations; } + public List getBucketInfos() { + return Collections.unmodifiableList(new ArrayList<>(bucketInfoMap.values())); + } + public BucketInfo getBucketInfo(int bucketNumber) { return bucketInfoMap.get(bucketNumber); } @@ -266,6 +329,11 @@ public int numPartitions() { return totalBuckets; } + @Override + public int getNumPartitions() { + return totalBuckets; + } + @Override public int getPartition(Object key) { Tuple2> keyLocation = diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java index 65cefc9b9923c..61cb1ffd27bd1 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/HoodieSparkMergeOnReadTableCompactor.java @@ -18,234 +18,39 @@ package org.apache.hudi.table.action.compact; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.model.HoodieCompactionOperation; -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.CompactionOperation; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieKey; 
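
Two smaller behavioural changes also land in this hunk: small-file lookup is skipped entirely when getParquetSmallFileLimit() is zero or below, and the planned buckets are now exposed read-only through getBucketInfos(). A usage sketch of the latter, assuming the partitioner is constructed the same way the Spark commit executors construct it:

package org.apache.hudi.table.action.commit;

import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

final class BucketInspectionSketch {

  private static final Logger LOG = LogManager.getLogger(BucketInspectionSketch.class);

  static <T extends HoodieRecordPayload<T>> void logPlannedBuckets(WorkloadProfile profile, HoodieEngineContext context,
                                                                   HoodieTable table, HoodieWriteConfig config) {
    UpsertPartitioner<T> partitioner = new UpsertPartitioner<>(profile, context, table, config);
    for (BucketInfo bucket : partitioner.getBucketInfos()) {
      // Each BucketInfo carries the bucket type (UPDATE/INSERT), file id prefix and partition path.
      LOG.info("planned bucket: " + bucket);
    }
  }
}
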
-import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.io.IOUtils; -import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.util.AccumulatorV2; -import org.apache.spark.util.LongAccumulator; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; - -import static java.util.stream.Collectors.toList; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE; /** * Compacts a hoodie table with merge on read storage. 
Computes all possible compactions, * passes it through a CompactionFilter and executes all the compactions and writes a new version of base files and make * a normal commit - * */ @SuppressWarnings("checkstyle:LineLength") -public class HoodieSparkMergeOnReadTableCompactor implements HoodieCompactor>, JavaRDD, JavaRDD> { - - private static final Logger LOG = LogManager.getLogger(HoodieSparkMergeOnReadTableCompactor.class); - // Accumulator to keep track of total log files for a table - private AccumulatorV2 totalLogFiles; - // Accumulator to keep track of total log file slices for a table - private AccumulatorV2 totalFileSlices; +public class HoodieSparkMergeOnReadTableCompactor + extends HoodieCompactor>, HoodieData, HoodieData> { @Override - public JavaRDD compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan, - HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - if (compactionPlan == null || (compactionPlan.getOperations() == null) - || (compactionPlan.getOperations().isEmpty())) { - return jsc.emptyRDD(); - } - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - // Compacting is very similar to applying updates to existing file - HoodieSparkCopyOnWriteTable table = new HoodieSparkCopyOnWriteTable(config, context, metaClient); - List operations = compactionPlan.getOperations().stream() - .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList()); - LOG.info("Compactor compacting " + operations + " files"); - - context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices"); - return jsc.parallelize(operations, operations.size()) - .map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator); - } - - private List compact(HoodieSparkCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient, - HoodieWriteConfig config, CompactionOperation operation, String instantTime) throws IOException { - FileSystem fs = metaClient.getFs(); - - Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); - LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames() - + " for commit " + instantTime); - // TODO - FIX THIS - // Reads the entire avro file. Always only specific blocks should be read from the avro file - // (failure recover). - // Load all the delta commits since the last compaction commit and get all the blocks to be - // loaded and load it using CompositeAvroLogReader - // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. 
- String maxInstantTime = metaClient - .getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, - HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION)) - .filterCompletedInstants().lastInstant().get().getTimestamp(); - long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new SparkTaskContextSupplier(), config.getProps()); - LOG.info("MaxMemoryPerCompaction => " + maxMemoryPerCompaction); - - List logFiles = operation.getDeltaFileNames().stream().map( - p -> new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), operation.getPartitionPath()), p).toString()) - .collect(toList()); - HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() - .withFileSystem(fs) - .withBasePath(metaClient.getBasePath()) - .withLogFilePaths(logFiles) - .withReaderSchema(readerSchema) - .withLatestInstantTime(maxInstantTime) - .withMaxMemorySizeInBytes(maxMemoryPerCompaction) - .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled()) - .withReverseReader(config.getCompactionReverseLogReadEnabled()) - .withBufferSize(config.getMaxDFSStreamBufferSize()) - .withSpillableMapBasePath(config.getSpillableMapBasePath()) - .build(); - if (!scanner.iterator().hasNext()) { - return new ArrayList<>(); - } - - Option oldDataFileOpt = - operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath()); - - // Compacting is very similar to applying updates to existing file - Iterator> result; - // If the dataFile is present, perform updates else perform inserts into a new base file. - if (oldDataFileOpt.isPresent()) { - result = hoodieCopyOnWriteTable.handleUpdate(instantTime, operation.getPartitionPath(), - operation.getFileId(), scanner.getRecords(), - oldDataFileOpt.get()); - } else { - result = hoodieCopyOnWriteTable.handleInsert(instantTime, operation.getPartitionPath(), operation.getFileId(), - scanner.getRecords()); + public void preCompact( + HoodieTable table, HoodieTimeline pendingCompactionTimeline, String compactionInstantTime) { + HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); + if (!pendingCompactionTimeline.containsInstant(instant)) { + throw new IllegalStateException( + "No Compaction request available at " + compactionInstantTime + " to run compaction"); } - Iterable> resultIterable = () -> result; - return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> { - s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog()); - s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles()); - s.getStat().setTotalLogRecords(scanner.getTotalLogRecords()); - s.getStat().setPartitionPath(operation.getPartitionPath()); - s.getStat() - .setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue()); - s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks()); - s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks()); - s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks()); - RuntimeStats runtimeStats = new RuntimeStats(); - runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks()); - s.getStat().setRuntimeStats(runtimeStats); - }).collect(toList()); } @Override - public HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable, - HoodieWriteConfig config, String compactionCommitTime, Set fgIdsInPendingCompactions) - throws IOException { - 
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - totalLogFiles = new LongAccumulator(); - totalFileSlices = new LongAccumulator(); - jsc.sc().register(totalLogFiles); - jsc.sc().register(totalFileSlices); - - ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, - "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " - + hoodieTable.getMetaClient().getTableType().name()); - - // TODO : check if maxMemory is not greater than JVM or spark.executor memory - // TODO - rollback any compactions in flight - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); - List partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), - config.shouldAssumeDatePartitioning()); - - // filter the partition paths if needed to reduce list status - partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths); - - if (partitionPaths.isEmpty()) { - // In case no partitions could be picked, return no compaction plan - return null; - } - - SliceView fileSystemView = hoodieTable.getSliceView(); - LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions"); - context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact"); - - List operations = context.flatMap(partitionPaths, partitionPath -> { - return fileSystemView - .getLatestFileSlices(partitionPath) - .filter(slice -> !fgIdsInPendingCompactions.contains(slice.getFileGroupId())) - .map(s -> { - List logFiles = - s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); - totalLogFiles.add((long) logFiles.size()); - totalFileSlices.add(1L); - // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO - // for spark Map operations and collecting them finally in Avro generated classes for storing - // into meta files. - Option dataFile = s.getBaseFile(); - return new CompactionOperation(dataFile, partitionPath, logFiles, - config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles)); - }) - .filter(c -> !c.getDeltaFileNames().isEmpty()); - }, partitionPaths.size()).stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList()); - - LOG.info("Total of " + operations.size() + " compactions are retrieved"); - LOG.info("Total number of latest files slices " + totalFileSlices.value()); - LOG.info("Total number of log files " + totalLogFiles.value()); - LOG.info("Total number of file slices " + totalFileSlices.value()); - // Filter the compactions with the passed in filter. This lets us choose most effective - // compactions only - HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, - CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList())); - ValidationUtils.checkArgument( - compactionPlan.getOperations().stream().noneMatch( - op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), - "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " - + "Please fix your strategy implementation. 
FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions - + ", Selected workload :" + compactionPlan); - if (compactionPlan.getOperations().isEmpty()) { - LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath()); - } - return compactionPlan; + public void maybePersist(HoodieData writeStatus, HoodieWriteConfig config) { + writeStatus.persist(config.getString(WRITE_STATUS_STORAGE_LEVEL_VALUE)); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkCompactHelpers.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkCompactHelpers.java deleted file mode 100644 index 107f533f27b44..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkCompactHelpers.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; - -import java.io.IOException; -import java.util.List; - -/** - * A spark implementation of {@link AbstractCompactHelpers}. 
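The two overrides added above are the only Spark-specific pieces left in the compactor: preCompact fails fast when no compaction was requested at the given instant, and maybePersist caches the HoodieData of write statuses at the configured storage level. A hedged caller-side sketch follows; the actual orchestration lives in the shared HoodieCompactor base class, which is not part of this hunk, so the call order shown here is an assumption:

    HoodieSparkMergeOnReadTableCompactor compactor = new HoodieSparkMergeOnReadTableCompactor();
    HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
    // Throws IllegalStateException unless a compaction.requested instant exists at compactionInstantTime.
    compactor.preCompact(table, pendingCompactionTimeline, compactionInstantTime);
    // ... run the compaction via the shared base-class code path ...
    // Cache the resulting write statuses at the level configured by WRITE_STATUS_STORAGE_LEVEL_VALUE.
    compactor.maybePersist(writeStatuses, config);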
- * - * @param - */ -public class SparkCompactHelpers extends - AbstractCompactHelpers>, JavaRDD, JavaRDD> { - - private SparkCompactHelpers() { - } - - private static class CompactHelperHolder { - private static final SparkCompactHelpers SPARK_COMPACT_HELPERS = new SparkCompactHelpers(); - } - - public static SparkCompactHelpers newInstance() { - return CompactHelperHolder.SPARK_COMPACT_HELPERS; - } - - @Override - public HoodieCommitMetadata createCompactionMetadata(HoodieTable>, JavaRDD, JavaRDD> table, - String compactionInstantTime, - JavaRDD writeStatuses, - String schema) throws IOException { - byte[] planBytes = table.getActiveTimeline().readCompactionPlanAsBytes( - HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get(); - HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planBytes); - List updateStatusMap = writeStatuses.map(WriteStatus::getStat).collect(); - org.apache.hudi.common.model.HoodieCommitMetadata metadata = new org.apache.hudi.common.model.HoodieCommitMetadata(true); - for (HoodieWriteStat stat : updateStatusMap) { - metadata.addWriteStat(stat.getPartitionPath(), stat); - } - metadata.addMetadata(org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY, schema); - if (compactionPlan.getExtraMetadata() != null) { - compactionPlan.getExtraMetadata().forEach(metadata::addMetadata); - } - return metadata; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkRunCompactionActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkRunCompactionActionExecutor.java deleted file mode 100644 index ebc3de5b804f2..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkRunCompactionActionExecutor.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
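For reference, the commit metadata that the deleted SparkCompactHelpers assembled for a finished compaction, restated as a compact sketch (table, writeStatuses, schema and compactionInstantTime assumed in scope; generics dropped by the diff are not reconstructed):

    byte[] planBytes = table.getActiveTimeline()
        .readCompactionPlanAsBytes(HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get();
    HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planBytes);
    List<HoodieWriteStat> stats = writeStatuses.map(WriteStatus::getStat).collect();
    HoodieCommitMetadata metadata = new HoodieCommitMetadata(true); // marks the metadata as compacted
    for (HoodieWriteStat stat : stats) {
      metadata.addWriteStat(stat.getPartitionPath(), stat); // one write stat per rewritten file, keyed by partition
    }
    metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schema);
    if (compactionPlan.getExtraMetadata() != null) {
      compactionPlan.getExtraMetadata().forEach(metadata::addMetadata); // carry plan-level extra metadata forward
    }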
- */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.CompactionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCompactionException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.BaseActionExecutor; -import org.apache.hudi.table.action.HoodieWriteMetadata; - -import org.apache.spark.api.java.JavaRDD; - -import java.io.IOException; -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkRunCompactionActionExecutor extends - BaseActionExecutor>, JavaRDD, JavaRDD, HoodieWriteMetadata>> { - - public SparkRunCompactionActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime) { - super(context, config, table, instantTime); - } - - @Override - public HoodieWriteMetadata> execute() { - HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(instantTime); - HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); - if (!pendingCompactionTimeline.containsInstant(instant)) { - throw new IllegalStateException( - "No Compaction request available at " + instantTime + " to run compaction"); - } - - HoodieWriteMetadata> compactionMetadata = new HoodieWriteMetadata<>(); - try { - HoodieActiveTimeline timeline = table.getActiveTimeline(); - HoodieCompactionPlan compactionPlan = - CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime); - // Mark instant as compaction inflight - timeline.transitionCompactionRequestedToInflight(instant); - table.getMetaClient().reloadActiveTimeline(); - - HoodieSparkMergeOnReadTableCompactor compactor = new HoodieSparkMergeOnReadTableCompactor(); - JavaRDD statuses = compactor.compact(context, compactionPlan, table, config, instantTime); - - statuses.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps())); - List updateStatusMap = statuses.map(WriteStatus::getStat).collect(); - HoodieCommitMetadata metadata = new HoodieCommitMetadata(true); - for (HoodieWriteStat stat : updateStatusMap) { - metadata.addWriteStat(stat.getPartitionPath(), stat); - } - metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema()); - - compactionMetadata.setWriteStatuses(statuses); - compactionMetadata.setCommitted(false); - compactionMetadata.setCommitMetadata(Option.of(metadata)); - } catch (IOException e) { - throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e); - } - - return compactionMetadata; - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkScheduleCompactionActionExecutor.java 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkScheduleCompactionActionExecutor.java deleted file mode 100644 index c5f6c1692c91e..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/compact/SparkScheduleCompactionActionExecutor.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.compact; - -import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.view.SyncableFileSystemView; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCompactionException; -import org.apache.hudi.table.HoodieTable; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; - -import java.io.IOException; -import java.util.Map; -import java.util.stream.Collectors; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkScheduleCompactionActionExecutor extends - BaseScheduleCompactionActionExecutor>, JavaRDD, JavaRDD> { - - private static final Logger LOG = LogManager.getLogger(SparkScheduleCompactionActionExecutor.class); - - public SparkScheduleCompactionActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - Option> extraMetadata) { - super(context, config, table, instantTime, extraMetadata); - } - - @Override - protected HoodieCompactionPlan scheduleCompaction() { - LOG.info("Checking if compaction needs to be run on " + config.getBasePath()); - Option lastCompaction = table.getActiveTimeline().getCommitTimeline() - .filterCompletedInstants().lastInstant(); - String lastCompactionTs = "0"; - if (lastCompaction.isPresent()) { - lastCompactionTs = lastCompaction.get().getTimestamp(); - } - - int deltaCommitsSinceLastCompaction = table.getActiveTimeline().getDeltaCommitTimeline() - .findInstantsAfter(lastCompactionTs, Integer.MAX_VALUE).countInstants(); - if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) { - LOG.info("Not scheduling compaction as only " + deltaCommitsSinceLastCompaction - + " delta commits was found since last compaction " + lastCompactionTs + ". 
Waiting for " - + config.getInlineCompactDeltaCommitMax()); - return new HoodieCompactionPlan(); - } - - LOG.info("Generating compaction plan for merge on read table " + config.getBasePath()); - HoodieSparkMergeOnReadTableCompactor compactor = new HoodieSparkMergeOnReadTableCompactor(); - try { - return compactor.generateCompactionPlan(context, table, config, instantTime, - ((SyncableFileSystemView) table.getSliceView()).getPendingCompactionOperations() - .map(instantTimeOpPair -> instantTimeOpPair.getValue().getFileGroupId()) - .collect(Collectors.toSet())); - - } catch (IOException e) { - throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e); - } - } - -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/AbstractSparkDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/AbstractSparkDeltaCommitActionExecutor.java deleted file mode 100644 index 64d4c9ce85779..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/AbstractSparkDeltaCommitActionExecutor.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
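The scheduling gate applied by the deleted SparkScheduleCompactionActionExecutor is worth restating cleanly, since the diff flattens it: a compaction plan is only generated once enough delta commits have accumulated since the last completed commit. A sketch, with table and config assumed in scope:

    Option<HoodieInstant> lastCompaction =
        table.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant();
    String lastCompactionTs = lastCompaction.isPresent() ? lastCompaction.get().getTimestamp() : "0";

    int deltaCommitsSinceLastCompaction = table.getActiveTimeline().getDeltaCommitTimeline()
        .findInstantsAfter(lastCompactionTs, Integer.MAX_VALUE).countInstants();

    if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
      return new HoodieCompactionPlan(); // empty plan: nothing gets scheduled yet
    }
    // Otherwise a plan is generated, excluding file groups already under pending compaction.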
- */ - -package org.apache.hudi.table.action.deltacommit; - -import java.util.Map; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieUpsertException; -import org.apache.hudi.execution.SparkLazyInsertIterable; -import org.apache.hudi.io.AppendHandleFactory; -import org.apache.hudi.io.HoodieAppendHandle; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.WorkloadProfile; -import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.Partitioner; - -import java.io.IOException; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; - -public abstract class AbstractSparkDeltaCommitActionExecutor> - extends BaseSparkCommitActionExecutor { - private static final Logger LOG = LogManager.getLogger(AbstractSparkDeltaCommitActionExecutor.class); - - // UpsertPartitioner for MergeOnRead table type - private SparkUpsertDeltaCommitPartitioner mergeOnReadUpsertPartitioner; - - public AbstractSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, WriteOperationType operationType) { - this(context, config, table, instantTime, operationType, Option.empty()); - } - - public AbstractSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, WriteOperationType operationType, - Option> extraMetadata) { - super(context, config, table, instantTime, operationType, extraMetadata); - } - - @Override - public Partitioner getUpsertPartitioner(WorkloadProfile profile) { - if (profile == null) { - throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); - } - mergeOnReadUpsertPartitioner = new SparkUpsertDeltaCommitPartitioner(profile, (HoodieSparkEngineContext) context, table, config); - return mergeOnReadUpsertPartitioner; - } - - @Override - public Iterator> handleUpdate(String partitionPath, String fileId, - Iterator> recordItr) throws IOException { - LOG.info("Merging updates for commit " + instantTime + " for file " + fileId); - - if (!table.getIndex().canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) { - LOG.info("Small file corrections for updates for commit " + instantTime + " for file " + fileId); - return super.handleUpdate(partitionPath, fileId, recordItr); - } else { - HoodieAppendHandle appendHandle = new HoodieAppendHandle<>(config, instantTime, table, - partitionPath, fileId, recordItr, taskContextSupplier); - appendHandle.doAppend(); - appendHandle.close(); - return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator(); - } - } - - @Override - public Iterator> handleInsert(String idPfx, Iterator> recordItr) - throws Exception { - // If canIndexLogFiles, write inserts to log files else write inserts to base files - if (table.getIndex().canIndexLogFiles()) { - return new SparkLazyInsertIterable<>(recordItr, true, config, instantTime, table, - idPfx, taskContextSupplier, new AppendHandleFactory<>()); - } else { - return 
super.handleInsert(idPfx, recordItr); - } - } - -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseSparkDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseSparkDeltaCommitActionExecutor.java new file mode 100644 index 0000000000000..61e6f25af9429 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseSparkDeltaCommitActionExecutor.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.deltacommit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.execution.SparkLazyInsertIterable; +import org.apache.hudi.io.AppendHandleFactory; +import org.apache.hudi.io.HoodieAppendHandle; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.Partitioner; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +public abstract class BaseSparkDeltaCommitActionExecutor> + extends BaseSparkCommitActionExecutor { + private static final Logger LOG = LogManager.getLogger(BaseSparkDeltaCommitActionExecutor.class); + + // UpsertPartitioner for MergeOnRead table type + private SparkUpsertDeltaCommitPartitioner mergeOnReadUpsertPartitioner; + + public BaseSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, WriteOperationType operationType) { + this(context, config, table, instantTime, operationType, Option.empty()); + } + + public BaseSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime, WriteOperationType operationType, + Option> extraMetadata) { + super(context, config, table, instantTime, operationType, extraMetadata); + } + + @Override + public Partitioner getUpsertPartitioner(WorkloadProfile profile) { + if (profile == null) { + throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); + } + mergeOnReadUpsertPartitioner = new 
SparkUpsertDeltaCommitPartitioner<>(profile, (HoodieSparkEngineContext) context, table, config); + return mergeOnReadUpsertPartitioner; + } + + @Override + public Iterator> handleUpdate(String partitionPath, String fileId, + Iterator> recordItr) throws IOException { + LOG.info("Merging updates for commit " + instantTime + " for file " + fileId); + if (!table.getIndex().canIndexLogFiles() && mergeOnReadUpsertPartitioner != null + && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) { + LOG.info("Small file corrections for updates for commit " + instantTime + " for file " + fileId); + return super.handleUpdate(partitionPath, fileId, recordItr); + } else { + HoodieAppendHandle appendHandle = new HoodieAppendHandle<>(config, instantTime, table, + partitionPath, fileId, recordItr, taskContextSupplier); + appendHandle.doAppend(); + return Collections.singletonList(appendHandle.close()).iterator(); + } + } + + @Override + public Iterator> handleInsert(String idPfx, Iterator> recordItr) + throws Exception { + // If canIndexLogFiles, write inserts to log files else write inserts to base files + if (table.getIndex().canIndexLogFiles()) { + return new SparkLazyInsertIterable<>(recordItr, true, config, instantTime, table, + idPfx, taskContextSupplier, new AppendHandleFactory<>()); + } else { + return super.handleInsert(idPfx, recordItr); + } + } + +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java index 281304d957620..190a714e44612 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java @@ -18,38 +18,37 @@ package org.apache.hudi.table.action.deltacommit; -import java.util.Map; - import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkBulkInsertHelper; -import org.apache.spark.api.java.JavaRDD; + +import java.util.Map; public class SparkBulkInsertDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { - private final JavaRDD> inputRecordsRDD; - private final Option> bulkInsertPartitioner; + private final HoodieData> inputRecordsRDD; + private final Option bulkInsertPartitioner; public SparkBulkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner) { + String instantTime, HoodieData> inputRecordsRDD, + Option bulkInsertPartitioner) { this(context, config, table, instantTime, inputRecordsRDD, 
bulkInsertPartitioner, Option.empty()); } public SparkBulkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner, + String instantTime, HoodieData> inputRecordsRDD, + Option bulkInsertPartitioner, Option> extraMetadata) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata); this.inputRecordsRDD = inputRecordsRDD; @@ -57,7 +56,7 @@ public SparkBulkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { try { return SparkBulkInsertHelper.newInstance().bulkInsert(inputRecordsRDD, instantTime, table, config, this, true, bulkInsertPartitioner); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java index 21fc013af69c9..c01bce2b9cf35 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java @@ -20,36 +20,35 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkBulkInsertHelper; -import org.apache.spark.api.java.JavaRDD; public class SparkBulkInsertPreppedDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { - private final JavaRDD> preppedInputRecordRdd; - private final Option> bulkInsertPartitioner; + private final HoodieData> preppedInputRecordRdd; + private final Option bulkInsertPartitioner; public SparkBulkInsertPreppedDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> preppedInputRecordRdd, - Option> bulkInsertPartitioner) { + String instantTime, HoodieData> preppedInputRecordRdd, + Option bulkInsertPartitioner) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT); this.preppedInputRecordRdd = preppedInputRecordRdd; this.bulkInsertPartitioner = bulkInsertPartitioner; } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { try { return SparkBulkInsertHelper.newInstance().bulkInsert(preppedInputRecordRdd, instantTime, table, config, this, false, bulkInsertPartitioner); @@ -61,4 +60,4 @@ public HoodieWriteMetadata> execute() { } } -} \ No newline at end of file +} diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java index 4fb6a90f90a41..9a5b08df288d6 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java @@ -20,30 +20,29 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.commit.SparkDeleteHelper; -import org.apache.spark.api.java.JavaRDD; +import org.apache.hudi.table.action.commit.HoodieDeleteHelper; public class SparkDeleteDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { - private final JavaRDD keys; + private final HoodieData keys; public SparkDeleteDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD keys) { + String instantTime, HoodieData keys) { super(context, config, table, instantTime, WriteOperationType.DELETE); this.keys = keys; } @Override - public HoodieWriteMetadata> execute() { - return SparkDeleteHelper.newInstance().execute(instantTime, keys, context, config, table, this); + public HoodieWriteMetadata> execute() { + return HoodieDeleteHelper.newInstance().execute(instantTime, keys, context, config, table, this); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java index fcaedee1127d7..4889460c467fa 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java @@ -20,31 +20,30 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.commit.SparkWriteHelper; -import org.apache.spark.api.java.JavaRDD; +import org.apache.hudi.table.action.commit.HoodieWriteHelper; public class SparkInsertDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { - private final JavaRDD> inputRecordsRDD; + private final HoodieData> inputRecordsRDD; public 
SparkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + String instantTime, HoodieData> inputRecordsRDD) { super(context, config, table, instantTime, WriteOperationType.INSERT); this.inputRecordsRDD = inputRecordsRDD; } @Override - public HoodieWriteMetadata> execute() { - return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, - config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(),this, false); + public HoodieWriteMetadata> execute() { + return HoodieWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, + config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(),this, operationType); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java index 1f1e0165b494a..dbf0cbc676118 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java @@ -20,28 +20,28 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkInsertPreppedDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { - private final JavaRDD> preppedRecords; + private final HoodieData> preppedRecords; public SparkInsertPreppedDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> preppedRecords) { + String instantTime, HoodieData> preppedRecords) { super(context, config, table, instantTime, WriteOperationType.INSERT_PREPPED); this.preppedRecords = preppedRecords; } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { return super.execute(preppedRecords); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java index 82aa081524050..67ecb9a8cbc06 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java @@ -18,32 +18,32 @@ package org.apache.hudi.table.action.deltacommit; +import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import 
org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.commit.SparkWriteHelper; -import org.apache.spark.api.java.JavaRDD; +import org.apache.hudi.table.action.commit.HoodieWriteHelper; public class SparkUpsertDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { - private JavaRDD> inputRecordsRDD; + private final HoodieData> inputRecordsRDD; public SparkUpsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + String instantTime, HoodieData> inputRecordsRDD) { super(context, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @Override - public HoodieWriteMetadata execute() { - return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, - config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),this, true); + public HoodieWriteMetadata> execute() { + return HoodieWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),this, operationType); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java index 48a0ff0822cda..e498019c415d8 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java @@ -26,15 +26,16 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; - import org.apache.hudi.table.action.commit.SmallFile; import org.apache.hudi.table.action.commit.UpsertPartitioner; +import javax.annotation.Nonnull; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; @@ -44,75 +45,79 @@ */ public class SparkUpsertDeltaCommitPartitioner> extends UpsertPartitioner { - SparkUpsertDeltaCommitPartitioner(WorkloadProfile profile, HoodieSparkEngineContext context, HoodieTable table, - HoodieWriteConfig config) { + public SparkUpsertDeltaCommitPartitioner(WorkloadProfile profile, HoodieSparkEngineContext context, HoodieTable table, + HoodieWriteConfig config) { super(profile, context, table, config); } @Override protected List getSmallFiles(String partitionPath) { - - // smallFiles only for partitionPath - List smallFileLocations = new ArrayList<>(); - // Init here since this class (and member variables) might not have been initialized HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); - // Find out all eligible small file slices - if (!commitTimeline.empty()) { - HoodieInstant 
latestCommitTime = commitTimeline.lastInstant().get(); - // find smallest file in partition and append to it - List allSmallFileSlices = new ArrayList<>(); - // If we cannot index log files, then we choose the smallest parquet file in the partition and add inserts to - // it. Doing this overtime for a partition, we ensure that we handle small file issues - if (!table.getIndex().canIndexLogFiles()) { - // TODO : choose last N small files since there can be multiple small files written to a single partition - // by different spark partitions in a single batch - Option smallFileSlice = Option.fromJavaOptional(table.getSliceView() - .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false) - .filter( - fileSlice -> fileSlice.getLogFiles().count() < 1 && fileSlice.getBaseFile().get().getFileSize() < config - .getParquetSmallFileLimit()) - .min((FileSlice left, FileSlice right) -> - left.getBaseFile().get().getFileSize() < right.getBaseFile().get().getFileSize() ? -1 : 1)); - if (smallFileSlice.isPresent()) { - allSmallFileSlices.add(smallFileSlice.get()); - } + if (commitTimeline.empty()) { + return Collections.emptyList(); + } + + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); + + // Find out all eligible small file slices, looking for + // smallest file in the partition to append to + List smallFileSlicesCandidates = getSmallFileCandidates(partitionPath, latestCommitTime); + List smallFileLocations = new ArrayList<>(); + + // Create SmallFiles from the eligible file slices + for (FileSlice smallFileSlice : smallFileSlicesCandidates) { + SmallFile sf = new SmallFile(); + if (smallFileSlice.getBaseFile().isPresent()) { + // TODO : Move logic of file name, file id, base commit time handling inside file slice + String filename = smallFileSlice.getBaseFile().get().getFileName(); + sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)); + sf.sizeBytes = getTotalFileSize(smallFileSlice); + smallFileLocations.add(sf); } else { - // If we can index log files, we can add more inserts to log files for fileIds including those under - // pending compaction. 
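Pulled out of the interleaved old/new hunks, the small-file selection introduced by this change (see getSmallFileCandidates in the new code) boils down to the sketch below; partitionPath and latestCommitInstant are assumed in scope and the stream generics dropped by the diff are restored as a best guess:

    if (table.getIndex().canIndexLogFiles()) {
      // The index can resolve records in log files: any small file slice can absorb more inserts.
      return table.getSliceView()
          .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitInstant.getTimestamp(), false)
          .filter(this::isSmallFile)
          .collect(Collectors.toList());
    }
    // Otherwise only slices without log files whose base file is under the configured parquet
    // small-file limit qualify; the smallest base files are preferred, capped by the
    // small-file group candidates limit.
    return table.getSliceView()
        .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitInstant.getTimestamp(), false)
        .filter(slice -> slice.getLogFiles().count() < 1
            && slice.getBaseFile().get().getFileSize() < config.getParquetSmallFileLimit())
        .sorted(Comparator.comparing(slice -> slice.getBaseFile().get().getFileSize()))
        .limit(config.getSmallFileGroupCandidatesLimit())
        .collect(Collectors.toList());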
- List allFileSlices = - table.getSliceView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true) - .collect(Collectors.toList()); - for (FileSlice fileSlice : allFileSlices) { - if (isSmallFile(fileSlice)) { - allSmallFileSlices.add(fileSlice); - } - } - } - // Create SmallFiles from the eligible file slices - for (FileSlice smallFileSlice : allSmallFileSlices) { - SmallFile sf = new SmallFile(); - if (smallFileSlice.getBaseFile().isPresent()) { - // TODO : Move logic of file name, file id, base commit time handling inside file slice - String filename = smallFileSlice.getBaseFile().get().getFileName(); - sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)); - sf.sizeBytes = getTotalFileSize(smallFileSlice); - smallFileLocations.add(sf); - } else { - HoodieLogFile logFile = smallFileSlice.getLogFiles().findFirst().get(); - sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()), - FSUtils.getFileIdFromLogPath(logFile.getPath())); - sf.sizeBytes = getTotalFileSize(smallFileSlice); - smallFileLocations.add(sf); - } + HoodieLogFile logFile = smallFileSlice.getLogFiles().findFirst().get(); + sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()), + FSUtils.getFileIdFromLogPath(logFile.getPath())); + sf.sizeBytes = getTotalFileSize(smallFileSlice); + smallFileLocations.add(sf); } } return smallFileLocations; } + @Nonnull + private List getSmallFileCandidates(String partitionPath, HoodieInstant latestCommitInstant) { + // If we can index log files, we can add more inserts to log files for fileIds NOT including those under + // pending compaction + if (table.getIndex().canIndexLogFiles()) { + return table.getSliceView() + .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitInstant.getTimestamp(), false) + .filter(this::isSmallFile) + .collect(Collectors.toList()); + } + + if (config.getParquetSmallFileLimit() <= 0) { + return Collections.emptyList(); + } + + // If we cannot index log files, then we choose the smallest parquet file in the partition and add inserts to + // it. 
Doing this overtime for a partition, we ensure that we handle small file issues + return table.getSliceView() + .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitInstant.getTimestamp(), false) + .filter( + fileSlice -> + // NOTE: We can not pad slices with existing log-files w/o compacting these, + // hence skipping + fileSlice.getLogFiles().count() < 1 + && fileSlice.getBaseFile().get().getFileSize() < config.getParquetSmallFileLimit()) + .sorted(Comparator.comparing(fileSlice -> fileSlice.getBaseFile().get().getFileSize())) + .limit(config.getSmallFileGroupCandidatesLimit()) + .collect(Collectors.toList()); + } + public List getSmallFileIds() { - return (List) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId()) + return smallFiles.stream().map(smallFile -> smallFile.location.getFileId()) .collect(Collectors.toList()); } @@ -132,8 +137,12 @@ private boolean isSmallFile(FileSlice fileSlice) { // TODO (NA) : Make this static part of utility public long convertLogFilesSizeToExpectedParquetSize(List hoodieLogFiles) { - long totalSizeOfLogFiles = hoodieLogFiles.stream().map(HoodieLogFile::getFileSize) - .filter(size -> size > 0).reduce(Long::sum).orElse(0L); + long totalSizeOfLogFiles = + hoodieLogFiles.stream() + .map(HoodieLogFile::getFileSize) + .filter(size -> size > 0) + .reduce(Long::sum) + .orElse(0L); // Here we assume that if there is no base parquet file, all log files contain only inserts. // We can then just get the parquet equivalent size of these log files, compare that with // {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java index 3509efa6bfa9f..9540030eba157 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java @@ -20,28 +20,28 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.spark.api.java.JavaRDD; public class SparkUpsertPreppedDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { - private final JavaRDD> preppedRecords; + private final HoodieData> preppedRecords; public SparkUpsertPreppedDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> preppedRecords) { + String instantTime, HoodieData> preppedRecords) { super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED); this.preppedRecords = preppedRecords; } @Override - public HoodieWriteMetadata> execute() { + public HoodieWriteMetadata> execute() { return super.execute(preppedRecords); } } diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkCopyOnWriteRestoreActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkCopyOnWriteRestoreActionExecutor.java deleted file mode 100644 index 101b3217da99e..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkCopyOnWriteRestoreActionExecutor.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.restore; - -import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.rollback.SparkCopyOnWriteRollbackActionExecutor; - -import org.apache.spark.api.java.JavaRDD; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkCopyOnWriteRestoreActionExecutor extends - BaseRestoreActionExecutor>, JavaRDD, JavaRDD> { - - public SparkCopyOnWriteRestoreActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - String restoreInstantTime) { - super(context, config, table, instantTime, restoreInstantTime); - } - - @Override - protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) { - table.getMetaClient().reloadActiveTimeline(); - SparkCopyOnWriteRollbackActionExecutor rollbackActionExecutor = new SparkCopyOnWriteRollbackActionExecutor( - (HoodieSparkEngineContext) context, - config, - table, - HoodieActiveTimeline.createNewInstantTime(), - instantToRollback, - true, - true, - false); - if (!instantToRollback.getAction().equals(HoodieTimeline.COMMIT_ACTION) - && !instantToRollback.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { - throw new HoodieRollbackException("Unsupported action in rollback instant:" + instantToRollback); - } - return rollbackActionExecutor.execute(); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkMergeOnReadRestoreActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkMergeOnReadRestoreActionExecutor.java deleted file mode 
100644 index c320579380b1d..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/restore/SparkMergeOnReadRestoreActionExecutor.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.restore; - -import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.rollback.SparkMergeOnReadRollbackActionExecutor; - -import org.apache.spark.api.java.JavaRDD; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkMergeOnReadRestoreActionExecutor extends - BaseRestoreActionExecutor>, JavaRDD, JavaRDD> { - - public SparkMergeOnReadRestoreActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable table, - String instantTime, - String restoreInstantTime) { - super(context, config, table, instantTime, restoreInstantTime); - } - - @Override - protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) { - table.getMetaClient().reloadActiveTimeline(); - SparkMergeOnReadRollbackActionExecutor rollbackActionExecutor = new SparkMergeOnReadRollbackActionExecutor( - context, - config, - table, - HoodieActiveTimeline.createNewInstantTime(), - instantToRollback, - true, - true, - false); - - switch (instantToRollback.getAction()) { - case HoodieTimeline.COMMIT_ACTION: - case HoodieTimeline.DELTA_COMMIT_ACTION: - case HoodieTimeline.COMPACTION_ACTION: - case HoodieTimeline.REPLACE_COMMIT_ACTION: - // TODO : Get file status and create a rollback stat and file - // TODO : Delete the .aux files along with the instant file, okay for now since the archival process will - // delete these files when it does not see a corresponding instant file under .hoodie - return rollbackActionExecutor.execute(); - default: - throw new IllegalArgumentException("invalid action name " + instantToRollback.getAction()); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java deleted file mode 100644 index 9cf2434bc22a3..0000000000000 --- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock.HoodieCommandBlockTypeEnum; -import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieRollbackException; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.PathFilter; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; - -import scala.Tuple2; - -/** - * Performs Rollback of Hoodie Tables. - */ -public class ListingBasedRollbackHelper implements Serializable { - - private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackHelper.class); - - private final HoodieTableMetaClient metaClient; - private final HoodieWriteConfig config; - - public ListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { - this.metaClient = metaClient; - this.config = config; - } - - /** - * Performs all rollback actions that we have collected in parallel. 
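The deleted ListingBasedRollbackHelper above fans rollback requests out over Spark and then merges the per-partition results. Below is a minimal, hypothetical Java sketch of that cap-the-parallelism / reduceByKey pattern only; it is not the Hudi implementation, and the per-partition "stat" here is just an integer stand-in.

    import org.apache.spark.api.java.JavaSparkContext;
    import scala.Tuple2;

    import java.util.List;

    public final class ParallelStatMergeSketch {

      // Stand-in for a per-partition rollback stat: here just a count of handled requests.
      public static List<Tuple2<String, Integer>> statsPerPartition(JavaSparkContext jsc,
                                                                    List<String> rollbackRequestPartitions,
                                                                    int configuredParallelism) {
        // Never ask Spark for more partitions than there are requests, but always at least one.
        int parallelism = Math.max(Math.min(rollbackRequestPartitions.size(), configuredParallelism), 1);
        return jsc.parallelize(rollbackRequestPartitions, parallelism)
            .mapToPair(partitionPath -> new Tuple2<>(partitionPath, 1)) // "process" one request
            .reduceByKey(Integer::sum)                                  // merge stats for the same partition
            .collect();
      }
    }
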
- */ - public List performRollback(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1); - context.setJobStatus(this.getClass().getSimpleName(), "Perform rollback actions"); - JavaPairRDD partitionPathRollbackStatsPairRDD = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, sparkPartitions, true); - return partitionPathRollbackStatsPairRDD.reduceByKey(RollbackUtils::mergeRollbackStat).map(Tuple2::_2).collect(); - } - - /** - * Collect all file info that needs to be rollbacked. - */ - public List collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests) { - int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1); - context.setJobStatus(this.getClass().getSimpleName(), "Collect rollback stats for upgrade/downgrade"); - JavaPairRDD partitionPathRollbackStatsPairRDD = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, sparkPartitions, false); - return partitionPathRollbackStatsPairRDD.map(Tuple2::_2).collect(); - } - - /** - * May be delete interested files and collect stats or collect stats only. - * - * @param context instance of {@link HoodieEngineContext} to use. - * @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested. - * @param rollbackRequests List of {@link ListingBasedRollbackRequest} to be operated on. - * @param sparkPartitions number of spark partitions to use for parallelism. - * @param doDelete {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes. - * @return stats collected with or w/o actual deletions. 
- */ - JavaPairRDD maybeDeleteAndCollectStats(HoodieEngineContext context, HoodieInstant instantToRollback, List rollbackRequests, - int sparkPartitions, boolean doDelete) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - return jsc.parallelize(rollbackRequests, sparkPartitions).mapToPair(rollbackRequest -> { - switch (rollbackRequest.getType()) { - case DELETE_DATA_FILES_ONLY: { - final Map filesToDeletedStatus = deleteBaseFiles(metaClient, config, instantToRollback.getTimestamp(), - rollbackRequest.getPartitionPath(), doDelete); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case DELETE_DATA_AND_LOG_FILES: { - final Map filesToDeletedStatus = deleteBaseAndLogFiles(metaClient, config, instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath(), doDelete); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withDeletedFileResults(filesToDeletedStatus).build()); - } - case APPEND_ROLLBACK_BLOCK: { - Writer writer = null; - try { - writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath())) - .withFileId(rollbackRequest.getFileId().get()) - .overBaseCommit(rollbackRequest.getLatestBaseInstant().get()).withFs(metaClient.getFs()) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); - - // generate metadata - if (doDelete) { - Map header = generateHeader(instantToRollback.getTimestamp()); - // if update belongs to an existing log file - writer = writer.appendBlock(new HoodieCommandBlock(header)); - } - } catch (IOException | InterruptedException io) { - throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io); - } finally { - try { - if (writer != null) { - writer.close(); - } - } catch (IOException io) { - throw new HoodieIOException("Error appending rollback block..", io); - } - } - - // This step is intentionally done after writer is closed. Guarantees that - // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in - // cloud-storage : HUDI-168 - Map filesToNumBlocksRollback = Collections.singletonMap( - metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), - 1L - ); - return new Tuple2<>(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withRollbackBlockAppendResults(filesToNumBlocksRollback).build()); - } - default: - throw new IllegalStateException("Unknown Rollback action " + rollbackRequest); - } - }); - } - - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. 
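The deleteBaseFiles / deleteBaseAndLogFiles methods that follow use a "maybe delete" pattern: the same listing code either deletes the matched files or only records them, depending on the doDelete flag. A rough, hypothetical sketch of that pattern with the Hadoop FileSystem API (the commit-time match here is a placeholder, not Hudi's FSUtils parsing):

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.PathFilter;

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;

    public final class MaybeDeleteSketch {

      /** Deletes (or, in dry-run mode, only lists) base files written by the given commit. */
      public static Map<FileStatus, Boolean> maybeDelete(FileSystem fs, Path partitionPath,
                                                         String commitTime, String baseFileExtension,
                                                         boolean doDelete) throws IOException {
        // Keep only base files whose name embeds the commit time being rolled back.
        PathFilter filter = path -> path.getName().endsWith(baseFileExtension)
            && path.getName().contains(commitTime); // placeholder for a real commit-time parser
        Map<FileStatus, Boolean> results = new HashMap<>();
        for (FileStatus file : fs.listStatus(partitionPath, filter)) {
          // Dry run: report the file as handled without touching it.
          results.put(file, !doDelete || fs.delete(file.getPath(), false));
        }
        return results;
      }
    }
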
- */ - private Map deleteBaseAndLogFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - LOG.info("Cleaning path " + partitionPath); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - SerializablePathFilter filter = (path) -> { - if (path.toString().endsWith(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } else if (FSUtils.isLogFile(path)) { - // Since the baseCommitTime is the only commit for new log files, it's okay here - String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); - return commit.equals(fileCommitTime); - } - return false; - }; - - final Map results = new HashMap<>(); - FileSystem fs = metaClient.getFs(); - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - /** - * Common method used for cleaning out base files under a partition path during rollback of a set of commits. - */ - private Map deleteBaseFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - String commit, String partitionPath, boolean doDelete) throws IOException { - final Map results = new HashMap<>(); - LOG.info("Cleaning path " + partitionPath); - FileSystem fs = metaClient.getFs(); - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); - PathFilter filter = (path) -> { - if (path.toString().contains(basefileExtension)) { - String fileCommitTime = FSUtils.getCommitTime(path.getName()); - return commit.equals(fileCommitTime); - } - return false; - }; - FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); - for (FileStatus file : toBeDeleted) { - if (doDelete) { - boolean success = fs.delete(file.getPath(), false); - results.put(file, success); - LOG.info("Delete file " + file.getPath() + "\t" + success); - } else { - results.put(file, true); - } - } - return results; - } - - private Map generateHeader(String commit) { - // generate metadata - Map header = new HashMap<>(3); - header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp()); - header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit); - header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE, - String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); - return header; - } - - public interface SerializablePathFilter extends PathFilter, Serializable { - - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkCopyOnWriteRollbackActionExecutor.java deleted file mode 100644 index 965d8055943a5..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkCopyOnWriteRollbackActionExecutor.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; - -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkCopyOnWriteRollbackActionExecutor extends - BaseCopyOnWriteRollbackActionExecutor>, JavaRDD, JavaRDD> { - public SparkCopyOnWriteRollbackActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public SparkCopyOnWriteRollbackActionExecutor(HoodieSparkEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new SparkMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant instantToRollback) { - List rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning()); - return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMarkerBasedRollbackStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMarkerBasedRollbackStrategy.java deleted file mode 100644 index 065b22d787511..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMarkerBasedRollbackStrategy.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
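For context on the marker-based strategy deleted just below: per the marker naming documented later in this diff, a marker file name ends with the IOType of the write it tracks (e.g. ".parquet.marker.APPEND"), and rollback dispatches on that suffix. A hypothetical, self-contained Java sketch of that dispatch; the enum is a stand-in for Hudi's IOType:

    public final class MarkerTypeSketch {

      // Stand-in for org.apache.hudi.common.model.IOType.
      enum IoType { CREATE, MERGE, APPEND }

      /** Extracts the IO type encoded as the last dot-separated token of a marker file path. */
      static IoType ioTypeOf(String markerFilePath) {
        String suffix = markerFilePath.substring(markerFilePath.lastIndexOf('.') + 1);
        return IoType.valueOf(suffix);
      }

      public static void main(String[] args) {
        // Hypothetical marker path, only to exercise the parsing above.
        System.out.println(ioTypeOf("2020/01/01/f1_w1_001.parquet.marker.APPEND")); // APPEND
      }
    }
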
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.util.List; - -import scala.Tuple2; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkMarkerBasedRollbackStrategy extends AbstractMarkerBasedRollbackStrategy>, JavaRDD, JavaRDD> { - public SparkMarkerBasedRollbackStrategy(HoodieTable>, JavaRDD, JavaRDD> table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) { - super(table, context, config, instantTime); - } - - @Override - public List execute(HoodieInstant instantToRollback) { - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - try { - MarkerFiles markerFiles = new MarkerFiles(table, instantToRollback.getTimestamp()); - List markerFilePaths = markerFiles.allMarkerFilePaths(); - int parallelism = Math.max(Math.min(markerFilePaths.size(), config.getRollbackParallelism()), 1); - return jsc.parallelize(markerFilePaths, parallelism) - .map(markerFilePath -> { - String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1); - IOType type = IOType.valueOf(typeStr); - switch (type) { - case MERGE: - return undoMerge(MarkerFiles.stripMarkerSuffix(markerFilePath)); - case APPEND: - return undoAppend(MarkerFiles.stripMarkerSuffix(markerFilePath), instantToRollback); - case CREATE: - return undoCreate(MarkerFiles.stripMarkerSuffix(markerFilePath)); - default: - throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback); - } - }) - .mapToPair(rollbackStat -> new Tuple2<>(rollbackStat.getPartitionPath(), rollbackStat)) - .reduceByKey(RollbackUtils::mergeRollbackStat) - .map(Tuple2::_2).collect(); - } catch (Exception e) { - throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMergeOnReadRollbackActionExecutor.java deleted 
file mode 100644 index 459ab128f0360..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/rollback/SparkMergeOnReadRollbackActionExecutor.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.table.HoodieTable; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import java.io.IOException; -import java.util.List; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkMergeOnReadRollbackActionExecutor extends - BaseMergeOnReadRollbackActionExecutor>, JavaRDD, JavaRDD> { - public SparkMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants) { - super(context, config, table, instantTime, commitInstant, deleteInstants); - } - - public SparkMergeOnReadRollbackActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - HoodieInstant commitInstant, - boolean deleteInstants, - boolean skipTimelinePublish, - boolean useMarkerBasedStrategy) { - super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); - } - - @Override - protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() { - if (useMarkerBasedStrategy) { - return new SparkMarkerBasedRollbackStrategy(table, context, config, instantTime); - } else { - return this::executeRollbackUsingFileListing; - } - } - - @Override - protected List executeRollbackUsingFileListing(HoodieInstant resolvedInstant) { - List rollbackRequests; - JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); - try { - rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(resolvedInstant, table, context); - } catch (IOException e) { - throw new HoodieIOException("Error generating rollback requests by file listing.", e); - } - return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, resolvedInstant, 
rollbackRequests); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java deleted file mode 100644 index 52849cb06f3d6..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.MarkerFiles; - -import java.util.List; -import java.util.stream.Collectors; - -/** - * Downgrade handle to assist in downgrading hoodie table from version 1 to 0. - */ -public class OneToZeroDowngradeHandler implements DowngradeHandler { - - @Override - public void downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { - // fetch pending commit info - HoodieSparkTable table = HoodieSparkTable.create(config, context); - HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); - List commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList()); - for (HoodieInstant commitInstant : commits) { - // delete existing marker files - MarkerFiles markerFiles = new MarkerFiles(table, commitInstant.getTimestamp()); - markerFiles.quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngrade.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngrade.java deleted file mode 100644 index 9c13c5a780ce8..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngrade.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.HoodieTableVersion; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieUpgradeDowngradeException; - -import java.io.IOException; - -public class SparkUpgradeDowngrade extends AbstractUpgradeDowngrade { - - public SparkUpgradeDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context) { - super(metaClient, config, context); - } - - @Override - public void run(HoodieTableMetaClient metaClient, - HoodieTableVersion toVersion, - HoodieWriteConfig config, - HoodieEngineContext context, - String instantTime) { - try { - new SparkUpgradeDowngrade(metaClient, config, context).run(toVersion, instantTime); - } catch (IOException e) { - throw new HoodieUpgradeDowngradeException("Error during upgrade/downgrade to version:" + toVersion, e); - } - - } - - @Override - protected void upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) { - new ZeroToOneUpgradeHandler().upgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true); - } - } - - @Override - protected void downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) { - if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.ZERO) { - new OneToZeroDowngradeHandler().downgrade(config, context, instantTime); - } else { - throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java new file mode 100644 index 0000000000000..ba7f9012701a5 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.util.SparkKeyGenUtils; + +/** + * Spark upgrade and downgrade helper. + */ +public class SparkUpgradeDowngradeHelper implements SupportsUpgradeDowngrade { + + private static final SparkUpgradeDowngradeHelper SINGLETON_INSTANCE = + new SparkUpgradeDowngradeHelper(); + + private SparkUpgradeDowngradeHelper() { + } + + public static SparkUpgradeDowngradeHelper getInstance() { + return SINGLETON_INSTANCE; + } + + @Override + public HoodieTable getTable(HoodieWriteConfig config, HoodieEngineContext context) { + return HoodieSparkTable.create(config, context); + } + + @Override + public String getPartitionColumns(HoodieWriteConfig config) { + return SparkKeyGenUtils.getPartitionColumns(config.getProps()); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java deleted file mode 100644 index 7e3faf32b6528..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.upgrade; - -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackHelper; -import org.apache.hudi.table.action.rollback.ListingBasedRollbackRequest; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.table.action.rollback.RollbackUtils; - -import java.util.List; -import java.util.stream.Collectors; - -/** - * Upgrade handle to assist in upgrading hoodie table from version 0 to 1. 
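The deleted SparkUpgradeDowngrade above, together with the Zero/One handlers around it, dispatches on a (fromVersion, toVersion) pair and fails fast on unsupported transitions. A minimal, hypothetical Java sketch of that dispatch, with stand-in types rather than Hudi's classes:

    public final class UpgradeDispatchSketch {

      enum TableVersion { ZERO, ONE }

      interface Handler { void apply(String instantTime); }

      static void run(TableVersion from, TableVersion to, String instantTime,
                      Handler zeroToOneUpgrade, Handler oneToZeroDowngrade) {
        if (from == TableVersion.ZERO && to == TableVersion.ONE) {
          zeroToOneUpgrade.apply(instantTime);    // e.g. recreate marker files in the new format
        } else if (from == TableVersion.ONE && to == TableVersion.ZERO) {
          oneToZeroDowngrade.apply(instantTime);  // e.g. delete marker directories of pending commits
        } else if (from != to) {
          throw new IllegalStateException("Unsupported table version change: " + from + " -> " + to);
        }
      }
    }
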
- */ -public class ZeroToOneUpgradeHandler implements UpgradeHandler { - - @Override - public void upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) { - // fetch pending commit info - HoodieSparkTable table = HoodieSparkTable.create(config, context); - HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); - List commits = inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - if (commits.size() > 0 && instantTime != null) { - // ignore the latest inflight commit since a new commit would have been started and we need to fix any pending commits from previous launch - commits.remove(instantTime); - } - for (String commit : commits) { - // for every pending commit, delete old marker files and re-create marker files in new format - recreateMarkerFiles(commit, table, context, config.getMarkersDeleteParallelism()); - } - } - - /** - * Recreate marker files in new format. - * Step1: Delete existing marker files - * Step2: Collect all rollback file info. - * Step3: recreate marker files for all interested files. - * - * @param commitInstantTime instant of interest for which marker files need to be recreated. - * @param table instance of {@link HoodieSparkTable} to use - * @param context instance of {@link HoodieEngineContext} to use - * @throws HoodieRollbackException on any exception during upgrade. - */ - private static void recreateMarkerFiles(final String commitInstantTime, - HoodieSparkTable table, - HoodieEngineContext context, - int parallelism) throws HoodieRollbackException { - try { - // fetch hoodie instant - Option commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants() - .filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime)) - .findFirst()); - if (commitInstantOpt.isPresent()) { - // delete existing marker files - MarkerFiles markerFiles = new MarkerFiles(table, commitInstantTime); - markerFiles.quietDeleteMarkerDir(context, parallelism); - - // generate rollback stats - List rollbackRequests; - if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) { - rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), - table.getConfig().shouldAssumeDatePartitioning()); - } else { - rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context); - } - List rollbackStats = new ListingBasedRollbackHelper(table.getMetaClient(), table.getConfig()) - .collectRollbackStats(context, commitInstantOpt.get(), rollbackRequests); - - // recreate marker files adhering to marker based rollback - for (HoodieRollbackStat rollbackStat : rollbackStats) { - for (String path : rollbackStat.getSuccessDeleteFiles()) { - String dataFileName = path.substring(path.lastIndexOf("/") + 1); - // not feasible to differentiate MERGE from CREATE. hence creating with MERGE IOType for all base files. 
- markerFiles.create(rollbackStat.getPartitionPath(), dataFileName, IOType.MERGE); - } - for (FileStatus fileStatus : rollbackStat.getCommandBlocksCount().keySet()) { - markerFiles.create(rollbackStat.getPartitionPath(), getFileNameForMarkerFromLogFile(fileStatus.getPath().toString(), table), IOType.APPEND); - } - } - } - } catch (Exception e) { - throw new HoodieRollbackException("Exception thrown while upgrading Hoodie Table from version 0 to 1", e); - } - } - - /** - * Curates file name for marker from existing log file path. - * log file format : partitionpath/.fileid_baseInstant.log.writetoken - * marker file format : partitionpath/fileId_writetoken_baseinstant.basefileExtn.marker.APPEND - * - * @param logFilePath log file path for which marker file name needs to be generated. - * @return the marker file name thus curated. - */ - private static String getFileNameForMarkerFromLogFile(String logFilePath, HoodieTable table) { - Path logPath = new Path(table.getMetaClient().getBasePath(), logFilePath); - String fileId = FSUtils.getFileIdFromLogPath(logPath); - String baseInstant = FSUtils.getBaseCommitTimeFromLogPath(logPath); - String writeToken = FSUtils.getWriteTokenFromLogPath(logPath); - - return FSUtils.makeDataFileName(baseInstant, writeToken, fileId, table.getBaseFileFormat().getFileExtension()); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/DataTypeUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/DataTypeUtils.java new file mode 100644 index 0000000000000..57b7a9d7bb3f3 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/DataTypeUtils.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
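The getFileNameForMarkerFromLogFile javadoc above spells out the two naming schemes involved. Below is a rough, string-handling-only illustration of deriving the marker name from a log file name under exactly those documented formats; the example names are hypothetical and Hudi's FSUtils does the real parsing:

    public final class MarkerNameSketch {

      /** ".{fileId}_{baseInstant}.log.{writeToken}" -> "{fileId}_{writeToken}_{baseInstant}{ext}.marker.APPEND" */
      static String markerNameForLogFile(String logFileName, String baseFileExtension) {
        String withoutDot = logFileName.substring(1);                  // drop the leading "."
        int logIdx = withoutDot.indexOf(".log.");
        String writeToken = withoutDot.substring(logIdx + ".log.".length());
        String idAndInstant = withoutDot.substring(0, logIdx);         // "{fileId}_{baseInstant}"
        int sep = idAndInstant.lastIndexOf('_');
        String fileId = idAndInstant.substring(0, sep);
        String baseInstant = idAndInstant.substring(sep + 1);
        return fileId + "_" + writeToken + "_" + baseInstant + baseFileExtension + ".marker.APPEND";
      }

      public static void main(String[] args) {
        System.out.println(markerNameForLogFile(".f1-0000_20200101120000.log.1-0-1", ".parquet"));
        // prints: f1-0000_1-0-1_20200101120000.parquet.marker.APPEND
      }
    }
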
+ */ + +package org.apache.hudi.util; + +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import java.util.Arrays; + +public class DataTypeUtils { + + /** + * Checks whether provided {@link DataType} contains {@link DecimalType} whose scale is less than + * {@link Decimal#MAX_LONG_DIGITS()} + */ + public static boolean hasSmallPrecisionDecimalType(DataType sparkType) { + if (sparkType instanceof StructType) { + StructField[] fields = ((StructType) sparkType).fields(); + return Arrays.stream(fields).anyMatch(f -> hasSmallPrecisionDecimalType(f.dataType())); + } else if (sparkType instanceof MapType) { + MapType map = (MapType) sparkType; + return hasSmallPrecisionDecimalType(map.keyType()) || hasSmallPrecisionDecimalType(map.valueType()); + } else if (sparkType instanceof ArrayType) { + return hasSmallPrecisionDecimalType(((ArrayType) sparkType).elementType()); + } else if (sparkType instanceof DecimalType) { + DecimalType decimalType = (DecimalType) sparkType; + return decimalType.precision() < Decimal.MAX_LONG_DIGITS(); + } + + return false; + } +} diff --git a/hudi-client/hudi-spark-client/src/main/resources/log4j.properties b/hudi-client/hudi-spark-client/src/main/resources/log4j.properties deleted file mode 100644 index ff268faf6363c..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/resources/log4j.properties +++ /dev/null @@ -1,23 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=INFO, A1 -# A1 is set to be a ConsoleAppender. -log4j.appender.A1=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.A1.layout=org.apache.log4j.PatternLayout -log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala deleted file mode 100644 index db1ca6f94c3f6..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
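The new DataTypeUtils added above walks a Spark schema recursively (structs, maps, arrays) and reports whether any DecimalType has precision below Decimal.MAX_LONG_DIGITS() (18), i.e. decimals Spark can keep as an unscaled long. A small usage sketch against the class as it appears in this diff:

    import org.apache.hudi.util.DataTypeUtils;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.Metadata;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    public final class DecimalCheckExample {
      public static void main(String[] args) {
        StructType schema = new StructType(new StructField[] {
            new StructField("id", DataTypes.LongType, false, Metadata.empty()),
            // precision 10 < 18, so this decimal fits in an unscaled long
            new StructField("amount", DataTypes.createDecimalType(10, 2), true, Metadata.empty())
        });
        System.out.println(DataTypeUtils.hasSmallPrecisionDecimalType(schema)); // true
      }
    }
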
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi - -import java.nio.ByteBuffer -import java.sql.{Date, Timestamp} -import java.util - -import org.apache.avro.Conversions.DecimalConversion -import org.apache.avro.LogicalTypes.{TimestampMicros, TimestampMillis} -import org.apache.avro.Schema.Type._ -import org.apache.avro.generic.GenericData.{Fixed, Record} -import org.apache.avro.generic.{GenericData, GenericFixed, GenericRecord} -import org.apache.avro.{LogicalTypes, Schema} -import org.apache.spark.sql.Row -import org.apache.spark.sql.avro.{IncompatibleSchemaException, SchemaConverters} -import org.apache.spark.sql.catalyst.expressions.GenericRow -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.types._ - -import scala.collection.JavaConverters._ - -object AvroConversionHelper { - - private def createDecimal(decimal: java.math.BigDecimal, precision: Int, scale: Int): Decimal = { - if (precision <= Decimal.MAX_LONG_DIGITS) { - // Constructs a `Decimal` with an unscaled `Long` value if possible. - Decimal(decimal.unscaledValue().longValue(), precision, scale) - } else { - // Otherwise, resorts to an unscaled `BigInteger` instead. - Decimal(decimal, precision, scale) - } - } - - /** - * - * Returns a converter function to convert row in avro format to GenericRow of catalyst. - * - * @param sourceAvroSchema Source schema before conversion inferred from avro file by passed in - * by user. - * @param targetSqlType Target catalyst sql type after the conversion. - * @return returns a converter function to convert row in avro format to GenericRow of catalyst. - */ - def createConverterToRow(sourceAvroSchema: Schema, - targetSqlType: DataType): AnyRef => AnyRef = { - - def createConverter(avroSchema: Schema, sqlType: DataType, path: List[String]): AnyRef => AnyRef = { - val avroType = avroSchema.getType - (sqlType, avroType) match { - // Avro strings are in Utf8, so we have to call toString on them - case (StringType, STRING) | (StringType, ENUM) => - (item: AnyRef) => if (item == null) null else item.toString - // Byte arrays are reused by avro, so we have to make a copy of them. 
- case (IntegerType, INT) | (BooleanType, BOOLEAN) | (DoubleType, DOUBLE) | - (FloatType, FLOAT) | (LongType, LONG) => - identity - case (BinaryType, FIXED) => - (item: AnyRef) => - if (item == null) { - null - } else { - item.asInstanceOf[Fixed].bytes().clone() - } - case (BinaryType, BYTES) => - (item: AnyRef) => - if (item == null) { - null - } else { - val byteBuffer = item.asInstanceOf[ByteBuffer] - val bytes = new Array[Byte](byteBuffer.remaining) - byteBuffer.get(bytes) - bytes - } - case (d: DecimalType, FIXED) => - (item: AnyRef) => - if (item == null) { - null - } else { - val decimalConversion = new DecimalConversion - val bigDecimal = decimalConversion.fromFixed(item.asInstanceOf[GenericFixed], avroSchema, - LogicalTypes.decimal(d.precision, d.scale)) - createDecimal(bigDecimal, d.precision, d.scale) - } - case (d: DecimalType, BYTES) => - (item: AnyRef) => - if (item == null) { - null - } else { - val decimalConversion = new DecimalConversion - val bigDecimal = decimalConversion.fromBytes(item.asInstanceOf[ByteBuffer], avroSchema, - LogicalTypes.decimal(d.precision, d.scale)) - createDecimal(bigDecimal, d.precision, d.scale) - } - case (DateType, INT) => - (item: AnyRef) => - if (item == null) { - null - } else { - item match { - case integer: Integer => DateTimeUtils.toJavaDate(integer) - case _ => new Date(item.asInstanceOf[Long]) - } - } - case (TimestampType, LONG) => - (item: AnyRef) => - if (item == null) { - null - } else { - avroSchema.getLogicalType match { - case _: TimestampMillis => - new Timestamp(item.asInstanceOf[Long]) - case _: TimestampMicros => - new Timestamp(item.asInstanceOf[Long] / 1000) - case null => - new Timestamp(item.asInstanceOf[Long]) - case other => - throw new IncompatibleSchemaException( - s"Cannot convert Avro logical type $other to Catalyst Timestamp type.") - } - } - case (struct: StructType, RECORD) => - val length = struct.fields.length - val converters = new Array[AnyRef => AnyRef](length) - val avroFieldIndexes = new Array[Int](length) - var i = 0 - while (i < length) { - val sqlField = struct.fields(i) - val avroField = avroSchema.getField(sqlField.name) - if (avroField != null) { - val converter = createConverter(avroField.schema(), sqlField.dataType, - path :+ sqlField.name) - converters(i) = converter - avroFieldIndexes(i) = avroField.pos() - } else if (!sqlField.nullable) { - throw new IncompatibleSchemaException( - s"Cannot find non-nullable field ${sqlField.name} at path ${path.mkString(".")} " + - "in Avro schema\n" + - s"Source Avro schema: $sourceAvroSchema.\n" + - s"Target Catalyst type: $targetSqlType") - } - i += 1 - } - - (item: AnyRef) => { - if (item == null) { - null - } else { - val record = item.asInstanceOf[GenericRecord] - - val result = new Array[Any](length) - var i = 0 - while (i < converters.length) { - if (converters(i) != null) { - val converter = converters(i) - result(i) = converter(record.get(avroFieldIndexes(i))) - } - i += 1 - } - new GenericRow(result) - } - } - case (arrayType: ArrayType, ARRAY) => - val elementConverter = createConverter(avroSchema.getElementType, arrayType.elementType, - path) - val allowsNull = arrayType.containsNull - (item: AnyRef) => { - if (item == null) { - null - } else { - item.asInstanceOf[java.lang.Iterable[AnyRef]].asScala.map { element => - if (element == null && !allowsNull) { - throw new RuntimeException(s"Array value at path ${path.mkString(".")} is not " + - "allowed to be null") - } else { - elementConverter(element) - } - } - } - } - case (mapType: MapType, MAP) if 
mapType.keyType == StringType => - val valueConverter = createConverter(avroSchema.getValueType, mapType.valueType, path) - val allowsNull = mapType.valueContainsNull - (item: AnyRef) => { - if (item == null) { - null - } else { - item.asInstanceOf[java.util.Map[AnyRef, AnyRef]].asScala.map { x => - if (x._2 == null && !allowsNull) { - throw new RuntimeException(s"Map value at path ${path.mkString(".")} is not " + - "allowed to be null") - } else { - (x._1.toString, valueConverter(x._2)) - } - }.toMap - } - } - case (sqlType, UNION) => - if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) { - val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL) - if (remainingUnionTypes.size == 1) { - createConverter(remainingUnionTypes.head, sqlType, path) - } else { - createConverter(Schema.createUnion(remainingUnionTypes.asJava), sqlType, path) - } - } else avroSchema.getTypes.asScala.map(_.getType) match { - case Seq(_) => createConverter(avroSchema.getTypes.get(0), sqlType, path) - case Seq(a, b) if Set(a, b) == Set(INT, LONG) && sqlType == LongType => - (item: AnyRef) => { - item match { - case null => null - case l: java.lang.Long => l - case i: java.lang.Integer => new java.lang.Long(i.longValue()) - } - } - case Seq(a, b) if Set(a, b) == Set(FLOAT, DOUBLE) && sqlType == DoubleType => - (item: AnyRef) => { - item match { - case null => null - case d: java.lang.Double => d - case f: java.lang.Float => new java.lang.Double(f.doubleValue()) - } - } - case other => - sqlType match { - case t: StructType if t.fields.length == avroSchema.getTypes.size => - val fieldConverters = t.fields.zip(avroSchema.getTypes.asScala).map { - case (field, schema) => - createConverter(schema, field.dataType, path :+ field.name) - } - - (item: AnyRef) => - if (item == null) { - null - } else { - val i = GenericData.get().resolveUnion(avroSchema, item) - val converted = new Array[Any](fieldConverters.length) - converted(i) = fieldConverters(i)(item) - new GenericRow(converted) - } - case _ => throw new IncompatibleSchemaException( - s"Cannot convert Avro schema to catalyst type because schema at path " + - s"${path.mkString(".")} is not compatible " + - s"(avroType = $other, sqlType = $sqlType). \n" + - s"Source Avro schema: $sourceAvroSchema.\n" + - s"Target Catalyst type: $targetSqlType") - } - } - case (left, right) => - throw new IncompatibleSchemaException( - s"Cannot convert Avro schema to catalyst type because schema at path " + - s"${path.mkString(".")} is not compatible (avroType = $left, sqlType = $right). 
\n" + - s"Source Avro schema: $sourceAvroSchema.\n" + - s"Target Catalyst type: $targetSqlType") - } - } - - createConverter(sourceAvroSchema, targetSqlType, List.empty[String]) - } - - def createConverterToAvro(dataType: DataType, - structName: String, - recordNamespace: String): Any => Any = { - dataType match { - case BinaryType => (item: Any) => - item match { - case null => null - case bytes: Array[Byte] => ByteBuffer.wrap(bytes) - } - case IntegerType | LongType | - FloatType | DoubleType | StringType | BooleanType => identity - case ByteType => (item: Any) => - if (item == null) null else item.asInstanceOf[Byte].intValue - case ShortType => (item: Any) => - if (item == null) null else item.asInstanceOf[Short].intValue - case dec: DecimalType => - val schema = SchemaConverters.toAvroType(dec, nullable = false, structName, recordNamespace) - (item: Any) => { - Option(item).map { _ => - val bigDecimalValue = item.asInstanceOf[java.math.BigDecimal] - val decimalConversions = new DecimalConversion() - decimalConversions.toFixed(bigDecimalValue, schema, LogicalTypes.decimal(dec.precision, dec.scale)) - }.orNull - } - case TimestampType => (item: Any) => - // Convert time to microseconds since spark-avro by default converts TimestampType to - // Avro Logical TimestampMicros - Option(item).map(_.asInstanceOf[Timestamp].getTime * 1000).orNull - case DateType => (item: Any) => - Option(item).map(_.asInstanceOf[Date].toLocalDate.toEpochDay.toInt).orNull - case ArrayType(elementType, _) => - val elementConverter = createConverterToAvro( - elementType, - structName, - recordNamespace) - (item: Any) => { - if (item == null) { - null - } else { - val sourceArray = item.asInstanceOf[Seq[Any]] - val sourceArraySize = sourceArray.size - val targetList = new util.ArrayList[Any](sourceArraySize) - var idx = 0 - while (idx < sourceArraySize) { - targetList.add(elementConverter(sourceArray(idx))) - idx += 1 - } - targetList - } - } - case MapType(StringType, valueType, _) => - val valueConverter = createConverterToAvro( - valueType, - structName, - recordNamespace) - (item: Any) => { - if (item == null) { - null - } else { - val javaMap = new util.HashMap[String, Any]() - item.asInstanceOf[Map[String, Any]].foreach { case (key, value) => - javaMap.put(key, valueConverter(value)) - } - javaMap - } - } - case structType: StructType => - val schema: Schema = SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace) - val childNameSpace = if (recordNamespace != "") s"$recordNamespace.$structName" else structName - val fieldConverters = structType.fields.map(field => - createConverterToAvro( - field.dataType, - field.name, - childNameSpace)) - (item: Any) => { - if (item == null) { - null - } else { - val record = new Record(schema) - val convertersIterator = fieldConverters.iterator - val fieldNamesIterator = dataType.asInstanceOf[StructType].fieldNames.iterator - val rowIterator = item.asInstanceOf[Row].toSeq.iterator - - while (convertersIterator.hasNext) { - val converter = convertersIterator.next() - record.put(fieldNamesIterator.next(), converter(rowIterator.next())) - } - record - } - } - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala index 88101265de297..1f445de38986d 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala +++ 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala @@ -18,18 +18,104 @@ package org.apache.hudi -import org.apache.avro.Schema -import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder, IndexedRecord} +import org.apache.avro.Schema.Type +import org.apache.avro.generic.GenericRecord +import org.apache.avro.{AvroRuntimeException, JsonProperties, Schema} +import org.apache.hudi.HoodieSparkUtils.sparkAdapter import org.apache.hudi.avro.HoodieAvroUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.avro.SchemaConverters -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType} import org.apache.spark.sql.{Dataset, Row, SparkSession} +import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ object AvroConversionUtils { + /** + * Check the nullability of the input Avro type and resolve it when it is nullable. The first + * return value is a [[Boolean]] indicating if the input Avro type is nullable. The second + * return value is either provided Avro type if it's not nullable, or its resolved non-nullable part + * in case it is + */ + def resolveAvroTypeNullability(avroType: Schema): (Boolean, Schema) = { + if (avroType.getType == Type.UNION) { + val fields = avroType.getTypes.asScala + val actualType = fields.filter(_.getType != Type.NULL) + if (fields.length != 2 || actualType.length != 1) { + throw new AvroRuntimeException( + s"Unsupported Avro UNION type $avroType: Only UNION of a null type and a non-null " + + "type is supported") + } + (true, actualType.head) + } else { + (false, avroType) + } + } + + /** + * Creates converter to transform Avro payload into Spark's Catalyst one + * + * @param rootAvroType Avro [[Schema]] to be transformed from + * @param rootCatalystType Catalyst [[StructType]] to be transformed into + * @return converter accepting Avro payload and transforming it into a Catalyst one (in the form of [[InternalRow]]) + */ + def createAvroToInternalRowConverter(rootAvroType: Schema, rootCatalystType: StructType): GenericRecord => Option[InternalRow] = { + val deserializer = sparkAdapter.createAvroDeserializer(rootAvroType, rootCatalystType) + record => deserializer + .deserialize(record) + .map(_.asInstanceOf[InternalRow]) + } + + /** + * Creates converter to transform Catalyst payload into Avro one + * + * @param rootCatalystType Catalyst [[StructType]] to be transformed from + * @param rootAvroType Avro [[Schema]] to be transformed into + * @param nullable whether Avro record is nullable + * @return converter accepting Catalyst payload (in the form of [[InternalRow]]) and transforming it into an Avro one + */ + def createInternalRowToAvroConverter(rootCatalystType: StructType, rootAvroType: Schema, nullable: Boolean): InternalRow => GenericRecord = { + val serializer = sparkAdapter.createAvroSerializer(rootCatalystType, rootAvroType, nullable) + row => serializer + .serialize(row) + .asInstanceOf[GenericRecord] + } + + /** + * @deprecated please use [[AvroConversionUtils.createAvroToInternalRowConverter]] + */ + @Deprecated + def createConverterToRow(sourceAvroSchema: Schema, + targetSqlType: StructType): GenericRecord => Row = { + val serde = sparkAdapter.createSparkRowSerDe(targetSqlType) + val converter = AvroConversionUtils.createAvroToInternalRowConverter(sourceAvroSchema, targetSqlType) + + avro => converter.apply(avro).map(serde.deserializeRow).get + } + + /** + * 
@deprecated please use [[AvroConversionUtils.createInternalRowToAvroConverter]] + */ + @Deprecated + def createConverterToAvro(sourceSqlType: StructType, + structName: String, + recordNamespace: String): Row => GenericRecord = { + val serde = sparkAdapter.createSparkRowSerDe(sourceSqlType) + val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(sourceSqlType, structName, recordNamespace) + val (nullable, _) = resolveAvroTypeNullability(avroSchema) + + val converter = AvroConversionUtils.createInternalRowToAvroConverter(sourceSqlType, avroSchema, nullable) + + row => converter.apply(serde.serializeRow(row)) + } + + /** + * Creates [[org.apache.spark.sql.DataFrame]] from the provided [[RDD]] of [[GenericRecord]]s + * + * TODO convert directly from GenericRecord into InternalRow instead + */ def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss: SparkSession): Dataset[Row] = { if (rdd.isEmpty()) { ss.emptyDataFrame @@ -39,32 +125,90 @@ object AvroConversionUtils { else { val schema = new Schema.Parser().parse(schemaStr) val dataType = convertAvroSchemaToStructType(schema) - val convertor = AvroConversionHelper.createConverterToRow(schema, dataType) - records.map { x => convertor(x).asInstanceOf[Row] } + val converter = createConverterToRow(schema, dataType) + records.map { r => converter(r) } } }, convertAvroSchemaToStructType(new Schema.Parser().parse(schemaStr))) } } - def convertStructTypeToAvroSchema(structType: StructType, + /** + * + * Returns avro schema from spark StructType. + * + * @param structType Dataframe Struct Type. + * @param structName Avro record name. + * @param recordNamespace Avro record namespace. + * @return Avro schema corresponding to given struct type. + */ + def convertStructTypeToAvroSchema(structType: DataType, structName: String, recordNamespace: String): Schema = { - SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace) + val schemaConverters = sparkAdapter.getAvroSchemaConverters + val avroSchema = schemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace) + getAvroSchemaWithDefaults(avroSchema, structType) } def convertAvroSchemaToStructType(avroSchema: Schema): StructType = { - SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType] + val schemaConverters = sparkAdapter.getAvroSchemaConverters + schemaConverters.toSqlType(avroSchema) match { + case (dataType, _) => dataType.asInstanceOf[StructType] + } } - def buildAvroRecordBySchema(record: IndexedRecord, - requiredSchema: Schema, - requiredPos: List[Int], - recordBuilder: GenericRecordBuilder): GenericRecord = { - val requiredFields = requiredSchema.getFields.asScala - assert(requiredFields.length == requiredPos.length) - val positionIterator = requiredPos.iterator - requiredFields.foreach(f => recordBuilder.set(f, record.get(positionIterator.next()))) - recordBuilder.build() + /** + * + * Method to add default value of null to nullable fields in given avro schema + * + * @param schema input avro schema + * @return Avro schema with null default set to nullable fields + */ + def getAvroSchemaWithDefaults(schema: Schema, dataType: DataType): Schema = { + + schema.getType match { + case Schema.Type.RECORD => { + val structType = dataType.asInstanceOf[StructType] + val structFields = structType.fields + val modifiedFields = schema.getFields.map(field => { + val i: Int = structType.fieldIndex(field.name()) + val comment: String = if (structFields(i).metadata.contains("comment")) { + 
structFields(i).metadata.getString("comment") + } else { + field.doc() + } + val newSchema = getAvroSchemaWithDefaults(field.schema(), structFields(i).dataType) + field.schema().getType match { + case Schema.Type.UNION => { + val innerFields = newSchema.getTypes + val containsNullSchema = innerFields.foldLeft(false)((nullFieldEncountered, schema) => nullFieldEncountered | schema.getType == Schema.Type.NULL) + if(containsNullSchema) { + // Need to re shuffle the fields in list because to set null as default, null schema must be head in union schema + val restructuredNewSchema = Schema.createUnion(List(Schema.create(Schema.Type.NULL)) ++ innerFields.filter(innerSchema => !(innerSchema.getType == Schema.Type.NULL))) + new Schema.Field(field.name(), restructuredNewSchema, comment, JsonProperties.NULL_VALUE) + } else { + new Schema.Field(field.name(), newSchema, comment, field.defaultVal()) + } + } + case _ => new Schema.Field(field.name(), newSchema, comment, field.defaultVal()) + } + }).toList + Schema.createRecord(schema.getName, schema.getDoc, schema.getNamespace, schema.isError, modifiedFields) + } + + case Schema.Type.UNION => { + Schema.createUnion(schema.getTypes.map(innerSchema => getAvroSchemaWithDefaults(innerSchema, dataType))) + } + + case Schema.Type.MAP => { + Schema.createMap(getAvroSchemaWithDefaults(schema.getValueType, dataType.asInstanceOf[MapType].valueType)) + } + + case Schema.Type.ARRAY => { + Schema.createArray(getAvroSchemaWithDefaults(schema.getElementType, dataType.asInstanceOf[ArrayType].elementType)) + } + + case _ => schema + } } def getAvroRecordNameAndNamespace(tableName: String): (String, String) = { diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieConversionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieConversionUtils.scala new file mode 100644 index 0000000000000..82c65705fbb00 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieConversionUtils.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hudi.common.config.TypedProperties + +import java.{util => ju} +import scala.collection.JavaConverters + +object HoodieConversionUtils { + + /** + * Converts Java's [[ju.Map]] into Scala's (immutable) [[Map]] (by default [[JavaConverters]] convert to + * a mutable one) + */ + def mapAsScalaImmutableMap[K, V](map: ju.Map[K, V]): Map[K, V] = { + // NOTE: We have to use deprecated [[JavaConversions]] to stay compatible w/ Scala 2.11 + import scala.collection.JavaConversions.mapAsScalaMap + map.toMap + } + + def toJavaOption[T](opt: Option[T]): org.apache.hudi.common.util.Option[T] = + if (opt.isDefined) org.apache.hudi.common.util.Option.of(opt.get) else org.apache.hudi.common.util.Option.empty() + + def toScalaOption[T](opt: org.apache.hudi.common.util.Option[T]): Option[T] = + if (opt.isPresent) Some(opt.get) else None + + def toProperties(params: Map[String, String]): TypedProperties = { + val props = new TypedProperties() + params.foreach(kv => props.setProperty(kv._1, kv._2)) + props + } + +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala new file mode 100644 index 0000000000000..79fa67acdb9d9 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.client.model.HoodieInternalRow +import org.apache.hudi.common.config.TypedProperties +import org.apache.hudi.common.data.HoodieData +import org.apache.hudi.common.engine.TaskContextSupplier +import org.apache.hudi.common.model.{HoodieRecord, HoodieRecordPayload} +import org.apache.hudi.common.util.ReflectionUtils +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.index.SparkHoodieIndexFactory +import org.apache.hudi.keygen.{BuiltinKeyGenerator, SparkKeyGeneratorInterface} +import org.apache.hudi.table.{BulkInsertPartitioner, HoodieTable} +import org.apache.hudi.table.action.commit.BulkInsertDataInternalWriterHelper +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.HoodieUnsafeRowUtils.{composeNestedFieldPath, getNestedInternalRowValue} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} +import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Dataset, HoodieUnsafeUtils, Row} +import org.apache.spark.unsafe.types.UTF8String + +import scala.collection.JavaConverters.{asScalaBufferConverter, seqAsJavaListConverter} + +object HoodieDatasetBulkInsertHelper extends Logging { + + /** + * Prepares [[DataFrame]] for bulk-insert into Hudi table, taking following steps: + * + *

+ *   1. Invokes the configured [[KeyGenerator]] to produce the record key, as well as the partition-path value
+ *   2. Prepends Hudi meta-fields to every row in the dataset
+ *   3. Dedupes rows (if necessary)
+ *   4. Partitions the dataset using the provided [[partitioner]]
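+ *
+ * Illustrative call site (a sketch only, assuming a [[HoodieWriteConfig]] and a
+ * [[BulkInsertPartitioner]] instance are already in scope; the variable names below are
+ * not taken from this codebase):
+ * {{{
+ *   val prepared: Dataset[Row] =
+ *     HoodieDatasetBulkInsertHelper.prepareForBulkInsert(
+ *       sourceDf,                // DataFrame to be written
+ *       writeConfig,             // HoodieWriteConfig carrying the key-generator class
+ *       bulkInsertPartitioner,   // BulkInsertPartitioner[Dataset[Row]]
+ *       shouldDropPartitionColumns = false)
+ *   // `prepared` now carries the five Hudi meta-columns prepended to the original columns
+ * }}}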
    + */ + def prepareForBulkInsert(df: DataFrame, + config: HoodieWriteConfig, + partitioner: BulkInsertPartitioner[Dataset[Row]], + shouldDropPartitionColumns: Boolean): Dataset[Row] = { + val populateMetaFields = config.populateMetaFields() + val schema = df.schema + + val metaFields = Seq( + StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, StringType), + StructField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, StringType), + StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, StringType), + StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, StringType), + StructField(HoodieRecord.FILENAME_METADATA_FIELD, StringType)) + + val updatedSchema = StructType(metaFields ++ schema.fields) + + val updatedDF = if (populateMetaFields) { + val keyGeneratorClassName = config.getStringOrThrow(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME, + "Key-generator class name is required") + + val prependedRdd: RDD[InternalRow] = + df.queryExecution.toRdd.mapPartitions { iter => + val keyGenerator = + ReflectionUtils.loadClass(keyGeneratorClassName, new TypedProperties(config.getProps)) + .asInstanceOf[SparkKeyGeneratorInterface] + + iter.map { row => + val recordKey = keyGenerator.getRecordKey(row, schema) + val partitionPath = keyGenerator.getPartitionPath(row, schema) + val commitTimestamp = UTF8String.EMPTY_UTF8 + val commitSeqNo = UTF8String.EMPTY_UTF8 + val filename = UTF8String.EMPTY_UTF8 + + // TODO use mutable row, avoid re-allocating + new HoodieInternalRow(commitTimestamp, commitSeqNo, recordKey, partitionPath, filename, row, false) + } + } + + val dedupedRdd = if (config.shouldCombineBeforeInsert) { + dedupeRows(prependedRdd, updatedSchema, config.getPreCombineField, SparkHoodieIndexFactory.isGlobalIndex(config)) + } else { + prependedRdd + } + + HoodieUnsafeUtils.createDataFrameFromRDD(df.sparkSession, dedupedRdd, updatedSchema) + } else { + // NOTE: In cases when we're not populating meta-fields we actually don't + // need access to the [[InternalRow]] and therefore can avoid the need + // to dereference [[DataFrame]] into [[RDD]] + val query = df.queryExecution.logical + val metaFieldsStubs = metaFields.map(f => Alias(Literal(UTF8String.EMPTY_UTF8, dataType = StringType), f.name)()) + val prependedQuery = Project(metaFieldsStubs ++ query.output, query) + + HoodieUnsafeUtils.createDataFrameFrom(df.sparkSession, prependedQuery) + } + + val trimmedDF = if (shouldDropPartitionColumns) { + dropPartitionColumns(updatedDF, config) + } else { + updatedDF + } + + partitioner.repartitionRecords(trimmedDF, config.getBulkInsertShuffleParallelism) + } + + /** + * Perform bulk insert for [[Dataset]], will not change timeline/index, return + * information about write files. 
+ */ + def bulkInsert(dataset: Dataset[Row], + instantTime: String, + table: HoodieTable[_ <: HoodieRecordPayload[_ <: HoodieRecordPayload[_ <: AnyRef]], _, _, _], + writeConfig: HoodieWriteConfig, + partitioner: BulkInsertPartitioner[Dataset[Row]], + parallelism: Int, + shouldPreserveHoodieMetadata: Boolean): HoodieData[WriteStatus] = { + val repartitionedDataset = partitioner.repartitionRecords(dataset, parallelism) + val arePartitionRecordsSorted = partitioner.arePartitionRecordsSorted + val schema = dataset.schema + val writeStatuses = repartitionedDataset.queryExecution.toRdd.mapPartitions(iter => { + val taskContextSupplier: TaskContextSupplier = table.getTaskContextSupplier + val taskPartitionId = taskContextSupplier.getPartitionIdSupplier.get + val taskId = taskContextSupplier.getStageIdSupplier.get.toLong + val taskEpochId = taskContextSupplier.getAttemptIdSupplier.get + val writer = new BulkInsertDataInternalWriterHelper( + table, + writeConfig, + instantTime, + taskPartitionId, + taskId, + taskEpochId, + schema, + writeConfig.populateMetaFields, + arePartitionRecordsSorted, + shouldPreserveHoodieMetadata) + + try { + iter.foreach(writer.write) + } catch { + case t: Throwable => + writer.abort() + throw t + } finally { + writer.close() + } + + writer.getWriteStatuses.asScala.map(_.toWriteStatus).iterator + }).collect() + table.getContext.parallelize(writeStatuses.toList.asJava) + } + + private def dedupeRows(rdd: RDD[InternalRow], schema: StructType, preCombineFieldRef: String, isGlobalIndex: Boolean): RDD[InternalRow] = { + val recordKeyMetaFieldOrd = schema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD) + val partitionPathMetaFieldOrd = schema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + // NOTE: Pre-combine field could be a nested field + val preCombineFieldPath = composeNestedFieldPath(schema, preCombineFieldRef) + + rdd.map { row => + val rowKey = if (isGlobalIndex) { + row.getString(recordKeyMetaFieldOrd) + } else { + val partitionPath = row.getString(partitionPathMetaFieldOrd) + val recordKey = row.getString(recordKeyMetaFieldOrd) + s"$partitionPath:$recordKey" + } + // NOTE: It's critical whenever we keep the reference to the row, to make a copy + // since Spark might be providing us with a mutable copy (updated during the iteration) + (rowKey, row.copy()) + } + .reduceByKey { + (oneRow, otherRow) => + val onePreCombineVal = getNestedInternalRowValue(oneRow, preCombineFieldPath).asInstanceOf[Comparable[AnyRef]] + val otherPreCombineVal = getNestedInternalRowValue(otherRow, preCombineFieldPath).asInstanceOf[Comparable[AnyRef]] + if (onePreCombineVal.compareTo(otherPreCombineVal.asInstanceOf[AnyRef]) >= 0) { + oneRow + } else { + otherRow + } + } + .values + } + + private def dropPartitionColumns(df: DataFrame, config: HoodieWriteConfig): DataFrame = { + val partitionPathFields = getPartitionPathFields(config).toSet + val nestedPartitionPathFields = partitionPathFields.filter(f => f.contains('.')) + if (nestedPartitionPathFields.nonEmpty) { + logWarning(s"Can not drop nested partition path fields: $nestedPartitionPathFields") + } + + val partitionPathCols = (partitionPathFields -- nestedPartitionPathFields).toSeq + + df.drop(partitionPathCols: _*) + } + + private def getPartitionPathFields(config: HoodieWriteConfig): Seq[String] = { + val keyGeneratorClassName = config.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME) + val keyGenerator = ReflectionUtils.loadClass(keyGeneratorClassName, new 
TypedProperties(config.getProps)).asInstanceOf[BuiltinKeyGenerator] + keyGenerator.getPartitionPathFields.asScala + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala new file mode 100644 index 0000000000000..df5e2777cbe05 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.client.utils.SparkRowSerDe +import org.apache.hudi.common.model.HoodieRecord +import org.apache.spark.SPARK_VERSION +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.execution.SQLConfInjectingRDD +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{StringType, StructField, StructType} + +import scala.collection.JavaConverters._ +import scala.reflect.ClassTag + +private[hudi] trait SparkVersionsSupport { + def getSparkVersion: String + + def isSpark2: Boolean = getSparkVersion.startsWith("2.") + def isSpark3: Boolean = getSparkVersion.startsWith("3.") + def isSpark3_0: Boolean = getSparkVersion.startsWith("3.0") + def isSpark3_1: Boolean = getSparkVersion.startsWith("3.1") + def isSpark3_2: Boolean = getSparkVersion.startsWith("3.2") + def isSpark3_3: Boolean = getSparkVersion.startsWith("3.3") + + def gteqSpark3_0: Boolean = getSparkVersion >= "3.0" + def gteqSpark3_1: Boolean = getSparkVersion >= "3.1" + def gteqSpark3_1_3: Boolean = getSparkVersion >= "3.1.3" + def gteqSpark3_2: Boolean = getSparkVersion >= "3.2" + def gteqSpark3_2_1: Boolean = getSparkVersion >= "3.2.1" + def gteqSpark3_2_2: Boolean = getSparkVersion >= "3.2.2" + def gteqSpark3_3: Boolean = getSparkVersion >= "3.3" +} + +object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport { + + override def getSparkVersion: String = SPARK_VERSION + + def getMetaSchema: StructType = { + StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => { + StructField(col, StringType, nullable = true) + })) + } + + /** + * @deprecated please use other overload [[createRdd]] + */ + def createRdd(df: DataFrame, structName: String, recordNamespace: String, reconcileToLatestSchema: Boolean, + latestTableSchema: org.apache.hudi.common.util.Option[Schema] = org.apache.hudi.common.util.Option.empty()): RDD[GenericRecord] = { + var latestTableSchemaConverted : Option[Schema] = None + + if (latestTableSchema.isPresent && reconcileToLatestSchema) { + latestTableSchemaConverted = 
Some(latestTableSchema.get()) + } else { + // cases when users want to use latestTableSchema but have not turned on reconcileToLatestSchema explicitly + // for example, when using a Transformer implementation to transform source RDD to target RDD + latestTableSchemaConverted = if (latestTableSchema.isPresent) Some(latestTableSchema.get()) else None + } + createRdd(df, structName, recordNamespace, latestTableSchemaConverted) + } + + def createRdd(df: DataFrame, structName: String, recordNamespace: String, readerAvroSchemaOpt: Option[Schema]): RDD[GenericRecord] = { + val writerSchema = df.schema + val writerAvroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(writerSchema, structName, recordNamespace) + val readerAvroSchema = readerAvroSchemaOpt.getOrElse(writerAvroSchema) + // We check whether passed in reader schema is identical to writer schema to avoid costly serde loop of + // making Spark deserialize its internal representation [[InternalRow]] into [[Row]] for subsequent conversion + // (and back) + val sameSchema = writerAvroSchema.equals(readerAvroSchema) + val (nullable, _) = AvroConversionUtils.resolveAvroTypeNullability(writerAvroSchema) + + // NOTE: We have to serialize Avro schema, and then subsequently parse it on the executor node, since Spark + // serializer is not able to digest it + val readerAvroSchemaStr = readerAvroSchema.toString + val writerAvroSchemaStr = writerAvroSchema.toString + + // NOTE: We're accessing toRdd here directly to avoid [[InternalRow]] to [[Row]] conversion + // Additionally, we have to explicitly wrap around resulting [[RDD]] into the one + // injecting [[SQLConf]], which by default isn't propgated by Spark to the executor(s). + // [[SQLConf]] is required by [[AvroSerializer]] + injectSQLConf(df.queryExecution.toRdd.mapPartitions { rows => + if (rows.isEmpty) { + Iterator.empty + } else { + val readerAvroSchema = new Schema.Parser().parse(readerAvroSchemaStr) + val transform: GenericRecord => GenericRecord = + if (sameSchema) identity + else { + HoodieAvroUtils.rewriteRecordDeep(_, readerAvroSchema) + } + + // Since caller might request to get records in a different ("evolved") schema, we will be rewriting from + // existing Writer's schema into Reader's (avro) schema + val writerAvroSchema = new Schema.Parser().parse(writerAvroSchemaStr) + val convert = AvroConversionUtils.createInternalRowToAvroConverter(writerSchema, writerAvroSchema, nullable = nullable) + + rows.map { ir => transform(convert(ir)) } + } + }, SQLConf.get) + } + + def getCatalystRowSerDe(structType: StructType) : SparkRowSerDe = { + sparkAdapter.createSparkRowSerDe(structType) + } + + private def injectSQLConf[T: ClassTag](rdd: RDD[T], conf: SQLConf): RDD[T] = + new SQLConfInjectingRDD(rdd, conf) +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieUnsafeRDD.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieUnsafeRDD.scala new file mode 100644 index 0000000000000..51b03a0024efc --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieUnsafeRDD.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.{Partition, SparkContext, TaskContext} + +/** + * !!! PLEASE READ CAREFULLY !!! + * + * Base class for all of the custom low-overhead RDD implementations for Hudi. + * + * To keep memory allocation footprint as low as possible, each inheritor of this RDD base class + * + *
    + *   1. Does NOT deserialize from [[InternalRow]] to [[Row]] (therefore only providing access to
    + *   Catalyst internal representations (often mutable) of the read row)
    + *
    + *   2. DOES NOT COPY UNDERLYING ROW OUT OF THE BOX, meaning that
    + *
    + *      a) access to this RDD is NOT thread-safe
    + *
+ *      b) when iterating over it, a reference to a _mutable_ underlying instance (of [[InternalRow]]) is
+ *      returned, meaning that after [[Iterator#next()]] is invoked on the provided iterator, the previous
+ *      reference becomes **invalid**. Therefore, you will have to copy the underlying mutable instance of
+ *      [[InternalRow]] if you plan to access it after [[Iterator#next()]] is invoked (at which point it is
+ *      re-filled with the next row's payload)
    + *
    + *      c) due to item b) above, no operation other than the iteration will produce meaningful
    + *      results on it and will likely fail [1]
    + * 
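+ * One safe way to materialize rows off such an RDD is to copy each [[InternalRow]] before the
+ * iterator advances, e.g. (a sketch; see also [[HoodieRDDUtils#collect]] referenced below):
+ * {{{
+ *   // collect() works here because mapPartitions returns a plain RDD, not a HoodieUnsafeRDD
+ *   val rows: Array[InternalRow] =
+ *     hoodieUnsafeRdd.mapPartitions(iter => iter.map(_.copy())).collect()
+ * }}}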
    + * + * [1] For example, [[RDD#collect]] method on this implementation would not work correctly, as it's + * simply using Scala's default [[Iterator#toArray]] method which will simply concat all the references onto + * the same underlying mutable object into [[Array]]. Instead each individual [[InternalRow]] _has to be copied_, + * before concatenating into the final output. Please refer to [[HoodieRDDUtils#collect]] for more details. + * + * NOTE: It enforces, for ex, that all of the RDDs implement [[compute]] method returning + * [[InternalRow]] to avoid superfluous ser/de + */ +trait HoodieUnsafeRDD extends RDD[InternalRow] { + override def collect(): Array[InternalRow] = + throw new UnsupportedOperationException( + "This method will not function correctly, please refer to scala-doc for HoodieUnsafeRDD" + ) +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkAdapterSupport.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkAdapterSupport.scala new file mode 100644 index 0000000000000..9fe67f9918d01 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkAdapterSupport.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.spark.sql.hudi.SparkAdapter + +/** + * Use the SparkAdapterSupport trait to get the SparkAdapter when we + * need to adapt the difference between spark2 and spark3. + */ +trait SparkAdapterSupport { + + lazy val sparkAdapter: SparkAdapter = SparkAdapterSupport.sparkAdapter + +} + +object SparkAdapterSupport { + + lazy val sparkAdapter: SparkAdapter = { + val adapterClass = if (HoodieSparkUtils.isSpark3_3) { + "org.apache.spark.sql.adapter.Spark3_3Adapter" + } else if (HoodieSparkUtils.isSpark3_2) { + "org.apache.spark.sql.adapter.Spark3_2Adapter" + } else if (HoodieSparkUtils.isSpark3_0 || HoodieSparkUtils.isSpark3_1) { + "org.apache.spark.sql.adapter.Spark3_1Adapter" + } else { + "org.apache.spark.sql.adapter.Spark2Adapter" + } + getClass.getClassLoader.loadClass(adapterClass) + .newInstance().asInstanceOf[SparkAdapter] + } +} \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/unsafe/UTF8StringBuilder.java b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/unsafe/UTF8StringBuilder.java new file mode 100644 index 0000000000000..3d9f060515a6d --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/unsafe/UTF8StringBuilder.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.unsafe; + +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.array.ByteArrayMethods; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A helper class to write {@link UTF8String}s to an internal buffer and build the concatenated + * {@link UTF8String} at the end. + */ +public class UTF8StringBuilder { + + private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; + + private byte[] buffer; + private int cursor = Platform.BYTE_ARRAY_OFFSET; + + public UTF8StringBuilder() { + // Since initial buffer size is 16 in `StringBuilder`, we set the same size here + this(16); + } + + public UTF8StringBuilder(int initialSize) { + if (initialSize < 0) { + throw new IllegalArgumentException("Size must be non-negative"); + } + if (initialSize > ARRAY_MAX) { + throw new IllegalArgumentException( + "Size " + initialSize + " exceeded maximum size of " + ARRAY_MAX); + } + this.buffer = new byte[initialSize]; + } + + // Grows the buffer by at least `neededSize` + private void grow(int neededSize) { + if (neededSize > ARRAY_MAX - totalSize()) { + throw new UnsupportedOperationException( + "Cannot grow internal buffer by size " + neededSize + " because the size after growing " + + "exceeds size limitation " + ARRAY_MAX); + } + final int length = totalSize() + neededSize; + if (buffer.length < length) { + int newLength = length < ARRAY_MAX / 2 ? length * 2 : ARRAY_MAX; + final byte[] tmp = new byte[newLength]; + Platform.copyMemory( + buffer, + Platform.BYTE_ARRAY_OFFSET, + tmp, + Platform.BYTE_ARRAY_OFFSET, + totalSize()); + buffer = tmp; + } + } + + private int totalSize() { + return cursor - Platform.BYTE_ARRAY_OFFSET; + } + + public void append(UTF8String value) { + grow(value.numBytes()); + value.writeToMemory(buffer, cursor); + cursor += value.numBytes(); + } + + public void append(String value) { + append(UTF8String.fromString(value)); + } + + public void appendBytes(Object base, long offset, int length) { + grow(length); + Platform.copyMemory( + base, + offset, + buffer, + cursor, + length); + cursor += length; + } + + public UTF8String build() { + return UTF8String.fromBytes(buffer, 0, totalSize()); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/JFunction.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/JFunction.scala new file mode 100644 index 0000000000000..8a612f4da2c64 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/JFunction.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util + +import org.apache.hudi.common.function.{SerializableFunction, SerializablePairFunction} +import org.apache.hudi.common.util.collection + +import scala.language.implicitConversions + +/** + * Utility allowing for seamless conversion b/w Java/Scala functional primitives + */ +object JFunction { + + //////////////////////////////////////////////////////////// + // From Java to Scala + //////////////////////////////////////////////////////////// + + implicit def toScala[T, R](f: java.util.function.Function[T, R]): T => R = + (t: T) => f.apply(t) + + //////////////////////////////////////////////////////////// + // From Scala to Java + //////////////////////////////////////////////////////////// + + implicit def toJavaFunction[T, R](f: Function[T, R]): java.util.function.Function[T, R] = + new java.util.function.Function[T, R] { + override def apply(t: T): R = f.apply(t) + } + + implicit def toJavaSerializableFunction[T, R](f: Function[T, R]): SerializableFunction[T, R] = + new SerializableFunction[T, R] { + override def apply(t: T): R = f.apply(t) + } + + implicit def toJavaSerializablePairFunction[T, K, V](f: Function[T, collection.Pair[K, V]]): SerializablePairFunction[T, K, V] = + new SerializablePairFunction[T, K, V] { + override def call(t: T): collection.Pair[K, V] = f.apply(t) + } + + implicit def toJava[T](f: T => Unit): java.util.function.Consumer[T] = + new java.util.function.Consumer[T] { + override def accept(t: T): Unit = f.apply(t) + } + +} \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/PathUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/PathUtils.scala new file mode 100644 index 0000000000000..4a96b542d58ab --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/PathUtils.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hudi.common.table.HoodieTableMetaClient + +/** + * TODO convert to Java, move to hudi-common + */ +object PathUtils { + + /** + * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]]. + * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally. 
+ */ + def isGlobPath(pattern: Path): Boolean = { + pattern.toString.exists("{}[]*?\\".toSet.contains) + } + + /** + * This method is inspired from [[org.apache.spark.deploy.SparkHadoopUtil]] with some modifications like + * skipping meta paths. + */ + def globPath(fs: FileSystem, pattern: Path): Seq[Path] = { + // find base path to assist in skipping meta paths + var basePath = pattern.getParent + while (basePath.getName.equals("*")) { + basePath = basePath.getParent + } + + Option(fs.globStatus(pattern)).map { statuses => { + val nonMetaStatuses = statuses.filterNot(entry => { + // skip all entries in meta path + var leafPath = entry.getPath + // walk through every parent until we reach base path. if .hoodie is found anywhere, path needs to be skipped + while (!leafPath.equals(basePath) && !leafPath.getName.equals(HoodieTableMetaClient.METAFOLDER_NAME)) { + leafPath = leafPath.getParent + } + leafPath.getName.equals(HoodieTableMetaClient.METAFOLDER_NAME) + }) + nonMetaStatuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq + } + }.getOrElse(Seq.empty[Path]) + } + + /** + * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]]. + * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally. + */ + def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = { + if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern) + } + + /** + * Checks to see whether input path contains a glob pattern and if yes, maps it to a list of absolute paths + * which match the glob pattern. Otherwise, returns original path + * + * @param paths List of absolute or globbed paths + * @param fs File system + * @return list of absolute file paths + */ + def checkAndGlobPathIfNecessary(paths: Seq[String], fs: FileSystem): Seq[Path] = { + paths.flatMap(path => { + val qualified = new Path(path).makeQualified(fs.getUri, fs.getWorkingDirectory) + globPathIfNecessary(fs, qualified) + }) + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/SparkKeyGenUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/SparkKeyGenUtils.scala new file mode 100644 index 0000000000000..4cdbbf7577abd --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/SparkKeyGenUtils.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.util + +import org.apache.hudi.common.config.TypedProperties +import org.apache.hudi.keygen.constant.KeyGeneratorOptions +import org.apache.hudi.keygen.{BaseKeyGenerator, CustomAvroKeyGenerator, CustomKeyGenerator, KeyGenerator} +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory + +import java.util.Properties +import scala.collection.JavaConverters._ + +object SparkKeyGenUtils { + + /** + * @param properties config properties + * @return partition columns + */ + def getPartitionColumns(properties: Properties): String = { + val props = new TypedProperties(properties) + val keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props) + getPartitionColumns(keyGenerator, props) + } + + /** + * @param keyGen key generator + * @return partition columns + */ + def getPartitionColumns(keyGen: KeyGenerator, typedProperties: TypedProperties): String = { + keyGen match { + // For CustomKeyGenerator and CustomAvroKeyGenerator, the partition path filed format + // is: "field_name: field_type", we extract the field_name from the partition path field. + case c: BaseKeyGenerator + if c.isInstanceOf[CustomKeyGenerator] || c.isInstanceOf[CustomAvroKeyGenerator] => + c.getPartitionPathFields.asScala.map(pathField => + pathField.split(CustomAvroKeyGenerator.SPLIT_REGEX) + .headOption.getOrElse(s"Illegal partition path field format: '$pathField' for ${c.getClass.getSimpleName}")) + .mkString(",") + + case b: BaseKeyGenerator => b.getPartitionPathFields.asScala.mkString(",") + case _ => typedProperties.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()) + } + } + +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalogUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalogUtils.scala new file mode 100644 index 0000000000000..c65957515546f --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalogUtils.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +/** + * NOTE: Since support for [[TableCatalog]] was only added in Spark 3, this trait + * is going to be an empty one simply serving as a placeholder (for compatibility w/ Spark 2) + */ +trait HoodieCatalogUtils {} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala new file mode 100644 index 0000000000000..f2d6f0381a471 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.hudi.SparkAdapterSupport.sparkAdapter +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction} +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Like, Literal, SubqueryExpression, UnsafeProjection} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types.{DataType, StructType} + +trait HoodieCatalystExpressionUtils { + + /** + * Matches an expression iff + * + *
+ *   1. It references exactly one [[AttributeReference]]
+ *   2. It contains only whitelisted transformations that preserve ordering of the source column [1]
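+ *
+ * For illustration (a sketch only; whether a particular transformation is whitelisted depends on
+ * the concrete, Spark-version-specific implementation of this trait, referred to below as `utils`):
+ * {{{
+ *   utils.tryMatchAttributeOrderingPreservingTransformation(col("ts").cast("long").expr)   // may return Some(ts)
+ *   utils.tryMatchAttributeOrderingPreservingTransformation((col("a") + col("b")).expr)    // None: two attributes
+ * }}}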
    + * + * [1] Preserving ordering is defined as following: transformation T is defined as ordering preserving in case + * values of the source column A values being ordered as a1, a2, a3 ..., will map into column B = T(A) which + * will keep the same ordering b1, b2, b3, ... with b1 = T(a1), b2 = T(a2), ... + */ + def tryMatchAttributeOrderingPreservingTransformation(expr: Expression): Option[AttributeReference] + + /** + * Verifies whether [[fromType]] can be up-casted to [[toType]] + */ + def canUpCast(fromType: DataType, toType: DataType): Boolean + + /** + * Un-applies [[Cast]] expression into + *
+ *   1. Casted [[Expression]]
+ *   2. Target [[DataType]]
+ *   3. (Optional) Timezone spec
+ *   4. Flag whether it's an ANSI cast or not
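+ *
+ * For example, via the [[MatchCast]] convenience extractor (defined in the companion object and
+ * backed by this method); a sketch only:
+ * {{{
+ *   expr match {
+ *     case MatchCast(child, targetType, timeZoneId, ansiEnabled) =>
+ *       // `child` is the expression being casted, `targetType` the target [[DataType]]
+ *     case _ =>
+ *       // not a cast
+ *   }
+ * }}}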
    + */ + def unapplyCastExpression(expr: Expression): Option[(Expression, DataType, Option[String], Boolean)] +} + +object HoodieCatalystExpressionUtils { + + /** + * Convenience extractor allowing to untuple [[Cast]] across Spark versions + */ + object MatchCast { + def unapply(expr: Expression): Option[(Expression, DataType, Option[String], Boolean)] = + sparkAdapter.getCatalystExpressionUtils.unapplyCastExpression(expr) + } + + /** + * Generates instance of [[UnsafeProjection]] projecting row of one [[StructType]] into another [[StructType]] + * + * NOTE: No safety checks are executed to validate that this projection is actually feasible, + * it's up to the caller to make sure that such projection is possible. + * + * NOTE: Projection of the row from [[StructType]] A to [[StructType]] B is only possible, if + * B is a subset of A + */ + def generateUnsafeProjection(from: StructType, to: StructType): UnsafeProjection = { + val attrs = from.toAttributes + val attrsMap = attrs.map(attr => (attr.name, attr)).toMap + val targetExprs = to.fields.map(f => attrsMap(f.name)) + + GenerateUnsafeProjection.generate(targetExprs, attrs) + } + + /** + * Split the given predicates into two sequence predicates: + * - predicates that references partition columns only(and involves no sub-query); + * - other predicates. + * + * @param sparkSession The spark session + * @param predicates The predicates to be split + * @param partitionColumns The partition columns + * @return (partitionFilters, dataFilters) + */ + def splitPartitionAndDataPredicates(sparkSession: SparkSession, + predicates: Array[Expression], + partitionColumns: Array[String]): (Array[Expression], Array[Expression]) = { + // Validates that the provided names both resolve to the same entity + val resolvedNameEquals = sparkSession.sessionState.analyzer.resolver + + predicates.partition(expr => { + // Checks whether given expression only references partition columns(and involves no sub-query) + expr.references.forall(r => partitionColumns.exists(resolvedNameEquals(r.name, _))) && + !SubqueryExpression.hasSubquery(expr) + }) + } + + /** + * Parses and resolves expression against the attributes of the given table schema. + * + * For example: + *
+ *   ts > 1000 and ts <= 1500
+ * will be resolved as
+ *   And(GreaterThan(ts#590L > 1000), LessThanOrEqual(ts#590L <= 1500))
+ *
+ * Where ts
    is a column of the provided [[tableSchema]] + * + * @param spark spark session + * @param exprString string representation of the expression to parse and resolve + * @param tableSchema table schema encompassing attributes to resolve against + * @return Resolved filter expression + */ + def resolveExpr(spark: SparkSession, exprString: String, tableSchema: StructType): Expression = { + val expr = spark.sessionState.sqlParser.parseExpression(exprString) + resolveExpr(spark, expr, tableSchema) + } + + /** + * Resolves provided expression (unless already resolved) against the attributes of the given table schema. + * + * For example: + *
+ *   ts > 1000 and ts <= 1500
+ * will be resolved as
+ *   And(GreaterThan(ts#590L > 1000), LessThanOrEqual(ts#590L <= 1500))
+ *
+ * Where ts
    is a column of the provided [[tableSchema]] + * + * @param spark spark session + * @param expr Catalyst expression to be resolved (if not yet) + * @param tableSchema table schema encompassing attributes to resolve against + * @return Resolved filter expression + */ + def resolveExpr(spark: SparkSession, expr: Expression, tableSchema: StructType): Expression = { + val analyzer = spark.sessionState.analyzer + val schemaFields = tableSchema.fields + + import org.apache.spark.sql.catalyst.plans.logical.Filter + val resolvedExpr = { + val plan: LogicalPlan = Filter(expr, LocalRelation(schemaFields.head, schemaFields.drop(1): _*)) + analyzer.execute(plan).asInstanceOf[Filter].condition + } + + if (!hasUnresolvedRefs(resolvedExpr)) { + resolvedExpr + } else { + throw new IllegalStateException("unresolved attribute") + } + } + + /** + * Converts [[Filter]] to Catalyst [[Expression]] + */ + def convertToCatalystExpression(filter: Filter, tableSchema: StructType): Option[Expression] = { + Option( + filter match { + case EqualTo(attribute, value) => + org.apache.spark.sql.catalyst.expressions.EqualTo(toAttribute(attribute, tableSchema), Literal.create(value)) + case EqualNullSafe(attribute, value) => + org.apache.spark.sql.catalyst.expressions.EqualNullSafe(toAttribute(attribute, tableSchema), Literal.create(value)) + case GreaterThan(attribute, value) => + org.apache.spark.sql.catalyst.expressions.GreaterThan(toAttribute(attribute, tableSchema), Literal.create(value)) + case GreaterThanOrEqual(attribute, value) => + org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual(toAttribute(attribute, tableSchema), Literal.create(value)) + case LessThan(attribute, value) => + org.apache.spark.sql.catalyst.expressions.LessThan(toAttribute(attribute, tableSchema), Literal.create(value)) + case LessThanOrEqual(attribute, value) => + org.apache.spark.sql.catalyst.expressions.LessThanOrEqual(toAttribute(attribute, tableSchema), Literal.create(value)) + case In(attribute, values) => + val attrExp = toAttribute(attribute, tableSchema) + val valuesExp = values.map(v => Literal.create(v)) + org.apache.spark.sql.catalyst.expressions.In(attrExp, valuesExp) + case IsNull(attribute) => + org.apache.spark.sql.catalyst.expressions.IsNull(toAttribute(attribute, tableSchema)) + case IsNotNull(attribute) => + org.apache.spark.sql.catalyst.expressions.IsNotNull(toAttribute(attribute, tableSchema)) + case And(left, right) => + val leftExp = convertToCatalystExpression(left, tableSchema) + val rightExp = convertToCatalystExpression(right, tableSchema) + if (leftExp.isEmpty || rightExp.isEmpty) { + null + } else { + org.apache.spark.sql.catalyst.expressions.And(leftExp.get, rightExp.get) + } + case Or(left, right) => + val leftExp = convertToCatalystExpression(left, tableSchema) + val rightExp = convertToCatalystExpression(right, tableSchema) + if (leftExp.isEmpty || rightExp.isEmpty) { + null + } else { + org.apache.spark.sql.catalyst.expressions.Or(leftExp.get, rightExp.get) + } + case Not(child) => + val childExp = convertToCatalystExpression(child, tableSchema) + if (childExp.isEmpty) { + null + } else { + org.apache.spark.sql.catalyst.expressions.Not(childExp.get) + } + case StringStartsWith(attribute, value) => + val leftExp = toAttribute(attribute, tableSchema) + val rightExp = Literal.create(s"$value%") + new Like(leftExp, rightExp) + case StringEndsWith(attribute, value) => + val leftExp = toAttribute(attribute, tableSchema) + val rightExp = Literal.create(s"%$value") + new Like(leftExp, rightExp) + case 
StringContains(attribute, value) => + val leftExp = toAttribute(attribute, tableSchema) + val rightExp = Literal.create(s"%$value%") + new Like(leftExp, rightExp) + case _ => null + } + ) + } + + private def hasUnresolvedRefs(resolvedExpr: Expression): Boolean = + resolvedExpr.collectFirst { + case _: UnresolvedAttribute | _: UnresolvedFunction => true + }.isDefined + + private def toAttribute(columnName: String, tableSchema: StructType): AttributeReference = { + val field = tableSchema.find(p => p.name == columnName) + assert(field.isDefined, s"Cannot find column: $columnName, Table Columns are: " + + s"${tableSchema.fieldNames.mkString(",")}") + AttributeReference(columnName, field.get.dataType, field.get.nullable)() + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystPlansUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystPlansUtils.scala new file mode 100644 index 0000000000000..e7e529b12545a --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystPlansUtils.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.plans.JoinType +import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan} +import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} +import org.apache.spark.sql.internal.SQLConf + +trait HoodieCatalystPlansUtils { + + /** + * Resolves output of the provided [[query]] against the [[expected]] list of [[Attribute]], + * and returns new (reshaped) instance of the [[LogicalPlan]] + * + * @param tableName used purely for more human-readable error output (if any) + * @param expected list of attributes output of the query has to adhere to + * @param query query whose output has to be reshaped + * @param byName whether the matching should occur by-name or positionally + * @param conf instance of [[SQLConf]] + * @return [[LogicalPlan]] which output is aligned to match to that of [[expected]] + */ + def resolveOutputColumns(tableName: String, + expected: Seq[Attribute], + query: LogicalPlan, + byName: Boolean, + conf: SQLConf): LogicalPlan + + /** + * Instantiates an [[Explain]] command + */ + def createExplainCommand(plan: LogicalPlan, extended: Boolean): LogicalPlan + + /** + * Convert a AliasIdentifier to TableIdentifier. + */ + def toTableIdentifier(aliasId: AliasIdentifier): TableIdentifier + + /** + * Convert a UnresolvedRelation to TableIdentifier. 
+ */ + def toTableIdentifier(relation: UnresolvedRelation): TableIdentifier + + /** + * Create Join logical plan. + */ + def createJoin(left: LogicalPlan, right: LogicalPlan, joinType: JoinType): Join + + /** + * Test if the logical plan is a Insert Into LogicalPlan. + */ + def isInsertInto(plan: LogicalPlan): Boolean + + /** + * Get the member of the Insert Into LogicalPlan. + */ + def getInsertIntoChildren(plan: LogicalPlan): + Option[(LogicalPlan, Map[String, Option[String]], LogicalPlan, Boolean, Boolean)] + + /** + * if the logical plan is a TimeTravelRelation LogicalPlan. + */ + def isRelationTimeTravel(plan: LogicalPlan): Boolean + + /** + * Get the member of the TimeTravelRelation LogicalPlan. + */ + def getRelationTimeTravel(plan: LogicalPlan): Option[(LogicalPlan, Option[Expression], Option[String])] + + /** + * Create a Insert Into LogicalPlan. + */ + def createInsertInto(table: LogicalPlan, partition: Map[String, Option[String]], + query: LogicalPlan, overwrite: Boolean, ifPartitionNotExists: Boolean): LogicalPlan + +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeRowUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeRowUtils.scala new file mode 100644 index 0000000000000..c105142de0f45 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeRowUtils.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.{StructField, StructType} + +import scala.collection.mutable.ArrayBuffer + +object HoodieUnsafeRowUtils { + + /** + * Fetches (nested) value w/in provided [[Row]] uniquely identified by the provided nested-field path + * previously composed by [[composeNestedFieldPath]] + */ + def getNestedRowValue(row: Row, nestedFieldPath: NestedFieldPath): Any = { + var curRow = row + for (idx <- nestedFieldPath.parts.indices) { + val (ord, f) = nestedFieldPath.parts(idx) + if (curRow.isNullAt(ord)) { + // scalastyle:off return + if (f.nullable) return null + else throw new IllegalArgumentException(s"Found null value for the field that is declared as non-nullable: $f") + // scalastyle:on return + } else if (idx == nestedFieldPath.parts.length - 1) { + // scalastyle:off return + return curRow.get(ord) + // scalastyle:on return + } else { + curRow = f.dataType match { + case _: StructType => + curRow.getStruct(ord) + case dt@_ => + throw new IllegalArgumentException(s"Invalid nested-field path: expected StructType, but was $dt") + } + } + } + } + + /** + * Fetches (nested) value w/in provided [[InternalRow]] uniquely identified by the provided nested-field path + * previously composed by [[composeNestedFieldPath]] + */ + def getNestedInternalRowValue(row: InternalRow, nestedFieldPath: NestedFieldPath): Any = { + if (nestedFieldPath.parts.length == 0) { + throw new IllegalArgumentException("Nested field-path could not be empty") + } + + var curRow = row + var idx = 0 + while (idx < nestedFieldPath.parts.length) { + val (ord, f) = nestedFieldPath.parts(idx) + if (curRow.isNullAt(ord)) { + // scalastyle:off return + if (f.nullable) return null + else throw new IllegalArgumentException(s"Found null value for the field that is declared as non-nullable: $f") + // scalastyle:on return + } else if (idx == nestedFieldPath.parts.length - 1) { + // scalastyle:off return + return curRow.get(ord, f.dataType) + // scalastyle:on return + } else { + curRow = f.dataType match { + case st: StructType => + curRow.getStruct(ord, st.fields.length) + case dt@_ => + throw new IllegalArgumentException(s"Invalid nested-field path: expected StructType, but was $dt") + } + } + idx += 1 + } + } + + /** + * For the provided [[nestedFieldRef]] (of the form "a.b.c") and [[schema]], produces nested-field path comprised + * of (ordinal, data-type) tuples of the respective fields w/in the provided schema. 
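+   *
+   * A minimal usage sketch (illustrative only; schema, field names and values are made up):
+   * {{{
+   *   val schema = StructType(Seq(StructField("a", StructType(Seq(StructField("b", IntegerType))))))
+   *   val path   = composeNestedFieldPath(schema, "a.b")
+   *   // path.parts is roughly Array((0, StructField("a", ...)), (0, StructField("b", IntegerType)))
+   *   getNestedRowValue(Row(Row(42)), path)   // yields 42
+   * }}}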
+ * + * This method produces nested-field path, that is subsequently used by [[getNestedInternalRowValue]], [[getNestedRowValue]] + */ + def composeNestedFieldPath(schema: StructType, nestedFieldRef: String): NestedFieldPath = { + val fieldRefParts = nestedFieldRef.split('.') + val ordSeq = ArrayBuffer[(Int, StructField)]() + var curSchema = schema + var idx = 0 + while (idx < fieldRefParts.length) { + val fieldRefPart = fieldRefParts(idx) + val ord = curSchema.fieldIndex(fieldRefPart) + val field = curSchema(ord) + // Append current field's (ordinal, data-type) + ordSeq.append((ord, field)) + // Update current schema, unless terminal field-ref part + if (idx < fieldRefParts.length - 1) { + curSchema = field.dataType match { + case st: StructType => st + case dt@_ => + throw new IllegalArgumentException(s"Invalid nested field reference ${fieldRefParts.drop(idx).mkString(".")} into $dt") + } + } + idx += 1 + } + + NestedFieldPath(ordSeq.toArray) + } + + case class NestedFieldPath(parts: Array[(Int, StructField)]) +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeUtils.scala new file mode 100644 index 0000000000000..c981cd8113ca3 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeUtils.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.hudi.HoodieUnsafeRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.MutablePair + +/** + * Suite of utilities helping in handling instances of [[HoodieUnsafeRDD]] + */ +object HoodieUnsafeUtils { + + /** + * Creates [[DataFrame]] from provided [[plan]] + * + * @param spark spark's session + * @param plan given plan to wrap into [[DataFrame]] + */ + def createDataFrameFrom(spark: SparkSession, plan: LogicalPlan): DataFrame = + Dataset.ofRows(spark, plan) + + /** + * Creates [[DataFrame]] from the in-memory [[Seq]] of [[Row]]s with provided [[schema]] + * + * NOTE: [[DataFrame]] is based on [[LocalRelation]], entailing that most computations with it + * will be executed by Spark locally + * + * @param spark spark's session + * @param rows collection of rows to base [[DataFrame]] on + * @param schema target [[DataFrame]]'s schema + */ + def createDataFrameFromRows(spark: SparkSession, rows: Seq[Row], schema: StructType): DataFrame = + Dataset.ofRows(spark, LocalRelation.fromExternalRows(schema.toAttributes, rows)) + + /** + * Creates [[DataFrame]] from the in-memory [[Seq]] of [[InternalRow]]s with provided [[schema]] + * + * NOTE: [[DataFrame]] is based on [[LocalRelation]], entailing that most computations with it + * will be executed by Spark locally + * + * @param spark spark's session + * @param rows collection of rows to base [[DataFrame]] on + * @param schema target [[DataFrame]]'s schema + */ + def createDataFrameFromInternalRows(spark: SparkSession, rows: Seq[InternalRow], schema: StructType): DataFrame = + Dataset.ofRows(spark, LocalRelation(schema.toAttributes, rows)) + + + /** + * Creates [[DataFrame]] from the [[RDD]] of [[Row]]s with provided [[schema]] + * + * @param spark spark's session + * @param rdd RDD w/ [[Row]]s to base [[DataFrame]] on + * @param schema target [[DataFrame]]'s schema + */ + def createDataFrameFromRDD(spark: SparkSession, rdd: RDD[InternalRow], schema: StructType): DataFrame = + spark.internalCreateDataFrame(rdd, schema) + + /** + * Canonical implementation of the [[RDD#collect]] for [[HoodieUnsafeRDD]], returning a properly + * copied [[Array]] of [[InternalRow]]s + */ + def collect(rdd: HoodieUnsafeRDD): Array[InternalRow] = { + rdd.mapPartitionsInternal { iter => + // NOTE: We're leveraging [[MutablePair]] here to avoid unnecessary allocations, since + // a) iteration is performed lazily and b) iteration is single-threaded (w/in partition) + val pair = new MutablePair[InternalRow, Null]() + iter.map(row => pair.update(row.copy(), null)) + } + .map(p => p._1) + .collect() + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializer.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializer.scala new file mode 100644 index 0000000000000..4c4ddb5bf016c --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializer.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +/** + * Deserializes Avro payload into Catalyst object + * + * NOTE: This is low-level component operating on Spark internal data-types (comprising [[InternalRow]]). + * If you're looking to convert Avro into "deserialized" [[Row]] (comprised of Java native types), + * please check [[AvroConversionUtils]] + */ +trait HoodieAvroDeserializer { + def deserialize(data: Any): Option[Any] +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSchemaConverters.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSchemaConverters.scala new file mode 100644 index 0000000000000..9b068afac83d2 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSchemaConverters.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Schema +import org.apache.spark.sql.types.DataType + +/** + * Allows to convert Avro schema into Spark's Catalyst one + */ +trait HoodieAvroSchemaConverters { + + def toSqlType(avroSchema: Schema): (DataType, Boolean) + + def toAvroType(catalystType: DataType, nullable: Boolean, recordName: String, nameSpace: String = ""): Schema + +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala new file mode 100644 index 0000000000000..84ba44b00fbbb --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +/** + * Serializes Catalyst payload into Avro object + * + * NOTE: This is low-level component operating on Spark internal data-types (comprising [[InternalRow]]). + * If you're looking to convert "deserialized" [[Row]] into Avro, please check [[AvroConversionUtils]] + */ +trait HoodieAvroSerializer { + def serialize(catalystData: Any): Any +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/SQLConfInjectingRDD.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/SQLConfInjectingRDD.scala new file mode 100644 index 0000000000000..1a44fd1af1e55 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/SQLConfInjectingRDD.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.{Partition, TaskContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.internal.SQLConf + +import scala.reflect.ClassTag + +/** + * NOTE: This is a generalized version of of Spark's [[SQLExecutionRDD]] + * + * It is just a wrapper over [[sqlRDD]] which sets and makes effective all the configs from the + * captured [[SQLConf]] + * + * @param sqlRDD the `RDD` generated by the SQL plan + * @param conf the `SQLConf` to apply to the execution of the SQL plan + */ +class SQLConfInjectingRDD[T: ClassTag](var sqlRDD: RDD[T], @transient conf: SQLConf) extends RDD[T](sqlRDD) { + private val sqlConfigs = conf.getAllConfs + private lazy val sqlConfExecutorSide = { + val newConf = new SQLConf() + sqlConfigs.foreach { case (k, v) => newConf.setConfString(k, v) } + newConf + } + + override val partitioner = firstParent[InternalRow].partitioner + + override def getPartitions: Array[Partition] = firstParent[InternalRow].partitions + + override def compute(split: Partition, context: TaskContext): Iterator[T] = { + // If we are in the context of a tracked SQL operation, `SQLExecution.EXECUTION_ID_KEY` is set + // and we have nothing to do here. Otherwise, we use the `SQLConf` captured at the creation of + // this RDD. 
+ if (context.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) == null) { + SQLConf.withExistingConf(sqlConfExecutorSide) { + firstParent[T].iterator(split, context) + } + } else { + firstParent[T].iterator(split, context) + } + } +} \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/datasources/SparkParsePartitionUtil.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/datasources/SparkParsePartitionUtil.scala new file mode 100644 index 0000000000000..2279e5a13f6f8 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/datasources/SparkParsePartitionUtil.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.util.TimeZone + +import org.apache.hadoop.fs.Path + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.DataType + +trait SparkParsePartitionUtil extends Serializable { + + def parsePartition(path: Path, + typeInference: Boolean, + basePaths: Set[Path], + userSpecifiedDataTypes: Map[String, DataType], + timeZone: TimeZone, + validatePartitionValues: Boolean = false): InternalRow +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala new file mode 100644 index 0000000000000..6f9616b669c47 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.avro.Schema +import org.apache.hadoop.fs.Path +import org.apache.hudi.client.utils.SparkRowSerDe +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSchemaConverters, HoodieAvroSerializer} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, InterpretedPredicate} +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.datasources.{FilePartition, FileScanRDD, LogicalRelation, PartitionedFile, SparkParsePartitionUtil} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.{HoodieCatalogUtils, HoodieCatalystExpressionUtils, HoodieCatalystPlansUtils, Row, SQLContext, SparkSession} +import org.apache.spark.storage.StorageLevel + +import java.util.Locale + +/** + * Interface adapting discrepancies and incompatibilities between different Spark versions + */ +trait SparkAdapter extends Serializable { + + /** + * Returns an instance of [[HoodieCatalogUtils]] providing for common utils operating on Spark's + * [[TableCatalog]]s + */ + def getCatalogUtils: HoodieCatalogUtils + + /** + * Returns an instance of [[HoodieCatalystExpressionUtils]] providing for common utils operating + * on Catalyst [[Expression]]s + */ + def getCatalystExpressionUtils: HoodieCatalystExpressionUtils + + /** + * Returns an instance of [[HoodieCatalystPlansUtils]] providing for common utils operating + * on Catalyst [[LogicalPlan]]s + */ + def getCatalystPlanUtils: HoodieCatalystPlansUtils + + /** + * Creates instance of [[HoodieAvroSerializer]] providing for ability to serialize + * Spark's [[InternalRow]] into Avro payloads + */ + def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializer + + /** + * Creates instance of [[HoodieAvroDeserializer]] providing for ability to deserialize + * Avro payloads into Spark's [[InternalRow]] + */ + def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializer + + /** + * Creates instance of [[HoodieAvroSchemaConverters]] allowing to convert b/w Avro and Catalyst schemas + */ + def getAvroSchemaConverters: HoodieAvroSchemaConverters + + /** + * Create the SparkRowSerDe. + */ + def createSparkRowSerDe(schema: StructType): SparkRowSerDe + + /** + * Create the hoodie's extended spark sql parser. + */ + def createExtendedSparkParser: Option[(SparkSession, ParserInterface) => ParserInterface] = None + + /** + * Create the SparkParsePartitionUtil. + */ + def getSparkParsePartitionUtil: SparkParsePartitionUtil + + /** + * ParserInterface#parseMultipartIdentifier is supported since spark3, for spark2 this should not be called. + */ + def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String] + + /** + * Combine [[PartitionedFile]] to [[FilePartition]] according to `maxSplitBytes`. 
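+   *
+   * For instance (illustrative numbers only): with splits of 40 MB, 60 MB and 200 MB and a
+   * `maxSplitBytes` of 128 MB, the two small splits would typically be packed into a single
+   * [[FilePartition]] while the 200 MB split lands in a partition of its own; the actual
+   * bin-packing is delegated to the underlying Spark version.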
+ */ + def getFilePartitions(sparkSession: SparkSession, partitionedFiles: Seq[PartitionedFile], + maxSplitBytes: Long): Seq[FilePartition] + + def isHoodieTable(table: LogicalPlan, spark: SparkSession): Boolean = { + unfoldSubqueryAliases(table) match { + case LogicalRelation(_, _, Some(table), _) => isHoodieTable(table) + case relation: UnresolvedRelation => + isHoodieTable(getCatalystPlanUtils.toTableIdentifier(relation), spark) + case _=> false + } + } + + def isHoodieTable(map: java.util.Map[String, String]): Boolean = { + map.getOrDefault("provider", "").equals("hudi") + } + + def isHoodieTable(table: CatalogTable): Boolean = { + table.provider.map(_.toLowerCase(Locale.ROOT)).orNull == "hudi" + } + + def isHoodieTable(tableId: TableIdentifier, spark: SparkSession): Boolean = { + val table = spark.sessionState.catalog.getTableMetadata(tableId) + isHoodieTable(table) + } + + protected def unfoldSubqueryAliases(plan: LogicalPlan): LogicalPlan = { + plan match { + case SubqueryAlias(_, relation: LogicalPlan) => + unfoldSubqueryAliases(relation) + case other => + other + } + } + + /** + * Create instance of [[ParquetFileFormat]] + */ + def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] + + /** + * Create instance of [[InterpretedPredicate]] + * + * TODO move to HoodieCatalystExpressionUtils + */ + def createInterpretedPredicate(e: Expression): InterpretedPredicate + + /** + * Create Hoodie relation based on globPaths, otherwise use tablePath if it's empty + */ + def createRelation(sqlContext: SQLContext, + metaClient: HoodieTableMetaClient, + schema: Schema, + globPaths: Array[Path], + parameters: java.util.Map[String, String]): BaseRelation + + /** + * Create instance of [[HoodieFileScanRDD]] + * SPARK-37273 FileScanRDD constructor changed in SPARK 3.3 + */ + def createHoodieFileScanRDD(sparkSession: SparkSession, + readFunction: PartitionedFile => Iterator[InternalRow], + filePartitions: Seq[FilePartition], + readDataSchema: StructType, + metadataColumns: Seq[AttributeReference] = Seq.empty): FileScanRDD + + /** + * Resolve [[DeleteFromTable]] + * SPARK-38626 condition is no longer Option in Spark 3.3 + */ + def resolveDeleteFromTable(deleteFromTable: Command, + resolveExpression: Expression => Expression): LogicalPlan + + /** + * Extract condition in [[DeleteFromTable]] + * SPARK-38626 condition is no longer Option in Spark 3.3 + */ + def extractDeleteCondition(deleteFromTable: Command): Expression + + /** + * Get parseQuery from ExtendedSqlParser, only for Spark 3.3+ + */ + def getQueryParserFromExtendedSqlParser(session: SparkSession, delegate: ParserInterface, + sqlText: String): LogicalPlan = { + // unsupported by default + throw new UnsupportedOperationException(s"Unsupported parseQuery method in Spark earlier than Spark 3.3.0") + } + + /** + * Converts instance of [[StorageLevel]] to a corresponding string + */ + def convertStorageLevelToString(level: StorageLevel): String +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala new file mode 100644 index 0000000000000..7c39ce2546f26 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala @@ -0,0 +1,539 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.execution + +import org.apache.hudi.common.util.BinaryUtil +import org.apache.hudi.config.HoodieClusteringConfig +import org.apache.hudi.config.HoodieClusteringConfig.LayoutOptimizationStrategy +import org.apache.hudi.optimize.HilbertCurveUtils +import org.apache.spark.rdd.{PartitionPruningRDD, RDD} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering +import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, BoundReference, SortOrder, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.util.MutablePair +import org.apache.spark.util.random.SamplingUtils +import org.davidmoten.hilbert.HilbertCurve + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.reflect.{ClassTag, classTag} +import scala.util.hashing.byteswap32 + +class RangeSample[K: ClassTag, V]( + zEncodeNum: Int, + rdd: RDD[_ <: Product2[K, V]], + private var ascend: Boolean = true, + val samplePointsPerPartitionHint: Int = 20) extends Serializable { + + // We allow zEncodeNum = 0, which happens when sorting an empty RDD under the default settings. + require(zEncodeNum >= 0, s"Number of zEncodeNum cannot be negative but found $zEncodeNum.") + require(samplePointsPerPartitionHint > 0, + s"Sample points per partition must be greater than 0 but found $samplePointsPerPartitionHint") + + def getRangeBounds(): ArrayBuffer[(K, Float)] = { + if (zEncodeNum <= 1) { + ArrayBuffer.empty[(K, Float)] + } else { + // This is the sample size we need to have roughly balanced output partitions, capped at 1M. + // Cast to double to avoid overflowing ints or longs + val sampleSize = math.min(samplePointsPerPartitionHint.toDouble * zEncodeNum, 1e6) + // Assume the input partitions are roughly balanced and over-sample a little bit. + val sampleSizePerPartition = math.ceil(3.0 * sampleSize / rdd.partitions.length).toInt + val (numItems, sketched) = sketch(rdd.map(_._1), sampleSizePerPartition) + if (numItems == 0L) { + ArrayBuffer.empty[(K, Float)] + } else { + // If a partition contains much more than the average number of items, we re-sample from it + // to ensure that enough items are collected from that partition. + val fraction = math.min(sampleSize / math.max(numItems, 1L), 1.0) + val candidates = ArrayBuffer.empty[(K, Float)] + val imbalancedPartitions = mutable.Set.empty[Int] + + sketched.foreach { case (idx, n, sample) => + if (fraction * n > sampleSizePerPartition) { + imbalancedPartitions += idx + } else { + // The weight is 1 over the sampling probability. 
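+            // e.g. if a partition held n = 10,000 items and reservoir sampling kept
+            // sample.length = 100 of them, every sampled key stands in for ~100 items,
+            // so weight = 100.0f (numbers are purely illustrative).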
+ val weight = (n.toDouble / sample.length).toFloat + for (key <- sample) { + candidates += ((key, weight)) + } + } + } + + if (imbalancedPartitions.nonEmpty) { + // Re-sample imbalanced partitions with the desired sampling probability. + val imbalanced = new PartitionPruningRDD(rdd.map(_._1), imbalancedPartitions.contains) + val seed = byteswap32(-rdd.id - 1) + val reSampled = imbalanced.sample(withReplacement = false, fraction, seed).collect() + val weight = (1.0 / fraction).toFloat + candidates ++= reSampled.map(x => (x, weight)) + } + candidates + } + } + } + + /** + * Determines the bounds for range partitioning from candidates with weights indicating how many + * items each represents. Usually this is 1 over the probability used to sample this candidate. + * + * @param candidates unordered candidates with weights + * @param partitions number of partitions + * @return selected bounds + */ + def determineBound[K : Ordering : ClassTag]( + candidates: ArrayBuffer[(K, Float)], + partitions: Int, ordering: Ordering[K]): Array[K] = { + val ordered = candidates.sortBy(_._1)(ordering) + val numCandidates = ordered.size + val sumWeights = ordered.map(_._2.toDouble).sum + val step = sumWeights / partitions + var cumWeight = 0.0 + var target = step + val bounds = ArrayBuffer.empty[K] + var i = 0 + var j = 0 + var previousBound = Option.empty[K] + while ((i < numCandidates) && (j < partitions - 1)) { + val (key, weight) = ordered(i) + cumWeight += weight + if (cumWeight >= target) { + // Skip duplicate values. + if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { + bounds += key + target += step + j += 1 + previousBound = Some(key) + } + } + i += 1 + } + bounds.toArray + } + + def determineRowBounds[K : Ordering : ClassTag]( + candidates: ArrayBuffer[(K, Float)], + partitions: Int, orderings: Seq[Ordering[K]], + attributes: Seq[Attribute]): Array[Array[UnsafeRow]] = { + + orderings.zipWithIndex.map { case (ordering, index) => + val ordered = candidates.sortBy(_._1)(ordering) + val numCandidates = ordered.size + val sumWeights = ordered.map(_._2.toDouble).sum + val step = sumWeights / partitions + var cumWeight = 0.0 + var target = step + val bounds = ArrayBuffer.empty[K] + var i = 0 + var j = 0 + var previousBound = Option.empty[K] + while ((i < numCandidates) && (j < partitions - 1)) { + val (key, weight) = ordered(i) + cumWeight += weight + if (cumWeight >= target) { + // Skip duplicate values. + if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { + bounds += key + target += step + j += 1 + previousBound = Some(key) + } + } + i += 1 + } + // build project + val project = UnsafeProjection.create(Seq(attributes(index)), attributes) + bounds.map { bound => + val row = bound.asInstanceOf[UnsafeRow] + project(row).copy() + }.toArray + }.toArray + } + + /** + * Sketches the input RDD via reservoir sampling on each partition. 
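+   * Each partition contributes at most `sampleSizePerPartition` keys together with that
+   * partition's total item count, which lets callers correct for skewed partitions later.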
+ * + * @param rdd the input RDD to sketch + * @param sampleSizePerPartition max sample size per partition + * @return (total number of items, an array of (partitionId, number of items, sample)) + */ + def sketch[K: ClassTag]( + rdd: RDD[K], + sampleSizePerPartition: Int): (Long, Array[(Int, Long, Array[K])]) = { + val shift = rdd.id + // val classTagK = classTag[K] // to avoid serializing the entire partitioner object + val sketched = rdd.mapPartitionsWithIndex { (idx, iter) => + val seed = byteswap32(idx ^ (shift << 16)) + val (sample, n) = SamplingUtils.reservoirSampleAndCount( + iter, sampleSizePerPartition, seed) + Iterator((idx, n, sample)) + }.collect() + val numItems = sketched.map(_._2).sum + (numItems, sketched) + } +} + +class RawDecisionBound[K : Ordering : ClassTag](ordering: Ordering[K]) extends Serializable { + + private var binarySearch: ((Array[K], K) => Int) = { + // For primitive keys, we can use the natural ordering. Otherwise, use the Ordering comparator. + classTag[K] match { + case ClassTag.Float => + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[Float]], x.asInstanceOf[Float]) + case ClassTag.Double => + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[Double]], x.asInstanceOf[Double]) + case ClassTag.Byte => + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[Byte]], x.asInstanceOf[Byte]) + case ClassTag.Char => + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[Char]], x.asInstanceOf[Char]) + case ClassTag.Short => + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[Short]], x.asInstanceOf[Short]) + case ClassTag.Int => + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[Int]], x.asInstanceOf[Int]) + case ClassTag.Long => + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[Long]], x.asInstanceOf[Long]) + case _ => + val comparator = ordering.asInstanceOf[java.util.Comparator[Any]] + (l, x) => java.util.Arrays.binarySearch(l.asInstanceOf[Array[AnyRef]], x, comparator) + } + } + + def getBound(key: Any, candidateBounds: Array[K]): Int = { + val k = key.asInstanceOf[K] + var bound = 0 + if (candidateBounds.length <= 128) { + while(bound < candidateBounds.length && ordering.gt(k, candidateBounds(bound))) { + bound += 1 + } + } else { + bound = binarySearch(candidateBounds, k) + if (bound < 0 ) { + bound = -bound - 1 + } + if (bound > candidateBounds.length) { + bound = candidateBounds.length + } + } + bound + } +} + +case class ByteArraySorting(b: Array[Byte]) extends Ordered[ByteArraySorting] with Serializable { + override def compare(that: ByteArraySorting): Int = { + val len = this.b.length + BinaryUtil.compareTo(this.b, 0, len, that.b, 0, len) + } +} + +object RangeSampleSort { + + /** + * create optimize DataFrame by sample + * first, sample origin data to get order-cols bounds, then apply sort to produce DataFrame + * support all type data. 
+ * this method need more resource and cost more time than createOptimizedDataFrameByMapValue + */ + def sortDataFrameBySample(df: DataFrame, layoutOptStrategy: LayoutOptimizationStrategy, orderByCols: Seq[String], targetPartitionsCount: Int): DataFrame = { + val spark = df.sparkSession + val columnsMap = df.schema.fields.map(item => (item.name, item)).toMap + val fieldNum = df.schema.fields.length + val checkCols = orderByCols.filter(col => columnsMap(col) != null) + + if (orderByCols.isEmpty || checkCols.isEmpty) { + df + } else { + val zFields = orderByCols.map { col => + val newCol = columnsMap(col) + if (newCol == null) { + (-1, null) + } else { + newCol.dataType match { + case LongType | DoubleType | FloatType | StringType | IntegerType | DateType | TimestampType | ShortType | ByteType => + (df.schema.fields.indexOf(newCol), newCol) + case d: DecimalType => + (df.schema.fields.indexOf(newCol), newCol) + case _ => + (-1, null) + } + } + }.filter(_._1 != -1) + // Complex type found, use createZIndexedDataFrameByRange + if (zFields.length != orderByCols.length) { + return sortDataFrameBySampleSupportAllTypes(df, orderByCols, targetPartitionsCount) + } + + val rawRdd = df.rdd + val sampleRdd = rawRdd.map { row => + val values = zFields.map { case (index, field) => + field.dataType match { + case LongType => + if (row.isNullAt(index)) Long.MaxValue else row.getLong(index) + case DoubleType => + if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getDouble(index)) + case IntegerType => + if (row.isNullAt(index)) Long.MaxValue else row.getInt(index).toLong + case FloatType => + if (row.isNullAt(index)) Long.MaxValue else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble) + case StringType => + if (row.isNullAt(index)) "" else row.getString(index) + case DateType => + if (row.isNullAt(index)) Long.MaxValue else row.getDate(index).getTime + case TimestampType => + if (row.isNullAt(index)) Long.MaxValue else row.getTimestamp(index).getTime + case ByteType => + if (row.isNullAt(index)) Long.MaxValue else row.getByte(index).toLong + case ShortType => + if (row.isNullAt(index)) Long.MaxValue else row.getShort(index).toLong + case d: DecimalType => + if (row.isNullAt(index)) Long.MaxValue else row.getDecimal(index).longValue() + case _ => + null + } + }.filter(v => v != null).toArray + (values, null) + } + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.key, + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.defaultValue.toString).toInt + val sample = new RangeSample(zOrderBounds, sampleRdd) + val rangeBounds = sample.getRangeBounds() + val sampleBounds = { + val candidateColNumber = rangeBounds.head._1.length + (0 to candidateColNumber - 1).map { i => + val colRangeBound = rangeBounds.map(x => (x._1(i), x._2)) + + if (colRangeBound.head._1.isInstanceOf[String]) { + sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(String, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[String]) + } else { + sample.determineBound(colRangeBound.asInstanceOf[ArrayBuffer[(Long, Float)]], math.min(zOrderBounds, rangeBounds.length), Ordering[Long]) + } + } + } + + // expand bounds. 
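+      // (illustrative numbers: if the largest per-column bound array holds 100 entries and another
+      // column only produced 25, that column's bounds get replicated with fillFactor = 4 so that
+      // all columns rank on a comparable scale)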
+ // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size + val newBound = new Array[Double](bound.length * fillFactor) + if (bound.isInstanceOf[Array[Long]] && fillFactor > 1) { + val longBound = bound.asInstanceOf[Array[Long]] + for (i <- 0 to bound.length - 1) { + for (j <- 0 to fillFactor - 1) { + // sample factor shoud not be too large, so it's ok to use 1 / fillfactor as slice + newBound(j + i*(fillFactor)) = longBound(i) + (j + 1) * (1 / fillFactor.toDouble) + } + } + (newBound, fillFactor) + } else { + (bound, 0) + } + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = rawRdd.mapPartitions { iter => + val expandBoundsWithFactor = boundBroadCast.value + val maxBoundNum = expandBoundsWithFactor.map(_._1.length).max + val longDecisionBound = new RawDecisionBound(Ordering[Long]) + val doubleDecisionBound = new RawDecisionBound(Ordering[Double]) + val stringDecisionBound = new RawDecisionBound(Ordering[String]) + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + + def getRank(rawIndex: Int, value: Long, isNull: Boolean): Int = { + val (expandBound, factor) = expandBoundsWithFactor(rawIndex) + if (isNull) { + expandBound.length + 1 + } else { + if (factor > 1) { + doubleDecisionBound.getBound(value + (threadLocalRandom.nextInt(factor) + 1)*(1 / factor.toDouble), expandBound.asInstanceOf[Array[Double]]) + } else { + longDecisionBound.getBound(value, expandBound.asInstanceOf[Array[Long]]) + } + } + } + + val hilbertCurve = if (layoutOptStrategy == LayoutOptimizationStrategy.HILBERT) + Some(HilbertCurve.bits(32).dimensions(zFields.length)) + else + None + + iter.map { row => + val values = zFields.zipWithIndex.map { case ((index, field), rawIndex) => + field.dataType match { + case LongType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getLong(index), isNull) + case DoubleType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getDouble(index)), isNull) + case IntegerType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getInt(index).toLong, isNull) + case FloatType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else java.lang.Double.doubleToLongBits(row.getFloat(index).toDouble), isNull) + case StringType => + val factor = maxBoundNum.toDouble / expandBoundsWithFactor(rawIndex)._1.length + if (row.isNullAt(index)) { + maxBoundNum + 1 + } else { + val currentRank = stringDecisionBound.getBound(row.getString(index), expandBoundsWithFactor(rawIndex)._1.asInstanceOf[Array[String]]) + if (factor > 1) { + (currentRank*factor).toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + currentRank + } + } + case DateType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getDate(index).getTime, isNull) + case TimestampType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getTimestamp(index).getTime, isNull) + case ByteType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getByte(index).toLong, isNull) + case ShortType => + val isNull = 
row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getShort(index).toLong, isNull) + case d: DecimalType => + val isNull = row.isNullAt(index) + getRank(rawIndex, if (isNull) 0 else row.getDecimal(index).longValue(), isNull) + case _ => + -1 + } + }.filter(v => v != -1) + + val mapValues = layoutOptStrategy match { + case LayoutOptimizationStrategy.HILBERT => + HilbertCurveUtils.indexBytes(hilbertCurve.get, values.map(_.toLong).toArray, 32) + case LayoutOptimizationStrategy.ZORDER => + BinaryUtil.interleaving(values.map(BinaryUtil.intTo8Byte(_)).toArray, 8) + } + + Row.fromSeq(row.toSeq ++ Seq(mapValues)) + } + }.sortBy(x => ByteArraySorting(x.getAs[Array[Byte]](fieldNum)), numPartitions = targetPartitionsCount) + val newDF = df.sparkSession.createDataFrame(indexRdd, StructType( + df.schema.fields ++ Seq( + StructField(s"index", + BinaryType, false)) + )) + newDF.drop("index") + } + } + + /** + * create z-order DataFrame by sample + * support all col types + */ + def sortDataFrameBySampleSupportAllTypes(df: DataFrame, zCols: Seq[String], fileNum: Int): DataFrame = { + val spark = df.sparkSession + val internalRdd = df.queryExecution.toRdd + val schema = df.schema + val outputAttributes = df.queryExecution.analyzed.output + val sortingExpressions = outputAttributes.filter(p => zCols.contains(p.name)) + if (sortingExpressions.length == 0 || sortingExpressions.length != zCols.size) { + df + } else { + val zOrderBounds = df.sparkSession.sessionState.conf.getConfString( + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.key, + HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.defaultValue.toString).toInt + + val sampleRdd = internalRdd.mapPartitionsInternal { iter => + val projection = UnsafeProjection.create(sortingExpressions, outputAttributes) + val mutablePair = new MutablePair[InternalRow, Null]() + // Internally, RangePartitioner runs a job on the RDD that samples keys to compute + // partition bounds. To get accurate samples, we need to copy the mutable keys. + iter.map(row => mutablePair.update(projection(row).copy(), null)) + } + + val orderings = sortingExpressions.map(SortOrder(_, Ascending)).zipWithIndex.map { case (ord, i) => + ord.copy(child = BoundReference(i, ord.dataType, ord.nullable)) + } + + val lazyGeneratedOrderings = orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + val sample = new RangeSample(zOrderBounds, sampleRdd) + + val rangeBounds = sample.getRangeBounds() + + implicit val ordering1 = lazyGeneratedOrderings(0) + + val sampleBounds = sample.determineRowBounds(rangeBounds, math.min(zOrderBounds, rangeBounds.length), lazyGeneratedOrderings, sortingExpressions) + + val origin_orderings = sortingExpressions.map(SortOrder(_, Ascending)).map { ord => + ord.copy(child = BoundReference(0, ord.dataType, ord.nullable)) + } + + val origin_lazyGeneratedOrderings = origin_orderings.map(ord => new LazilyGeneratedOrdering(Seq(ord))) + + // expand bounds. 
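+      // (here the fill factor is only recorded as a Double and applied later while ranking rows,
+      // rather than materializing expanded bound arrays as sortDataFrameBySample does)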
+ // maybe it's better to use the value of "spark.zorder.bounds.number" as maxLength, + // however this will lead to extra time costs when all zorder cols distinct count values are less then "spark.zorder.bounds.number" + val maxLength = sampleBounds.map(_.length).max + val expandSampleBoundsWithFactor = sampleBounds.map { bound => + val fillFactor = maxLength / bound.size.toDouble + (bound, fillFactor) + } + + val boundBroadCast = spark.sparkContext.broadcast(expandSampleBoundsWithFactor) + + val indexRdd = internalRdd.mapPartitionsInternal { iter => + val boundsWithFactor = boundBroadCast.value + import java.util.concurrent.ThreadLocalRandom + val threadLocalRandom = ThreadLocalRandom.current + val maxBoundNum = boundsWithFactor.map(_._1.length).max + val origin_Projections = sortingExpressions.map { se => + UnsafeProjection.create(Seq(se), outputAttributes) + } + + iter.map { unsafeRow => + val interleaveValues = origin_Projections.zip(origin_lazyGeneratedOrderings).zipWithIndex.map { case ((rowProject, lazyOrdering), index) => + val row = rowProject(unsafeRow) + val decisionBound = new RawDecisionBound(lazyOrdering) + if (row.isNullAt(0)) { + maxBoundNum + 1 + } else { + val (bound, factor) = boundsWithFactor(index) + if (factor > 1) { + val currentRank = decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + currentRank*factor.toInt + threadLocalRandom.nextInt(factor.toInt) + } else { + decisionBound.getBound(row, bound.asInstanceOf[Array[InternalRow]]) + } + } + }.toArray.map(BinaryUtil.intTo8Byte(_)) + val zValues = BinaryUtil.interleaving(interleaveValues, 8) + val mutablePair = new MutablePair[InternalRow, Array[Byte]]() + + mutablePair.update(unsafeRow, zValues) + } + }.sortBy(x => ByteArraySorting(x._2), numPartitions = fileNum).map(_._1) + spark.internalCreateDataFrame(indexRdd, schema) + } + } +} + diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java index d04a2df1b4eb4..92eeac85535c7 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java @@ -18,34 +18,55 @@ package org.apache.hudi.client; +import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.config.HoodieCompactionConfig; +import 
org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import java.util.stream.Stream; +import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -59,10 +80,10 @@ public class TestClientRollback extends HoodieClientTestBase { */ @Test public void testSavepointAndRollback() throws Exception { - HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder() + HoodieWriteConfig cfg = getConfigBuilder().withCleanConfig(HoodieCleanConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build()).build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { - HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); /** * Write 1 (only inserts) @@ -99,8 +120,9 @@ public void testSavepointAndRollback() throws Exception { statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); + HoodieWriteConfig config = getConfig(); List partitionPaths = - FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); + FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), cfg.getBasePath()); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieSparkTable table = HoodieSparkTable.create(getConfig(), context, metaClient); final BaseFileOnlyView view1 = table.getBaseFileOnlyView(); @@ -157,11 +179,109 @@ public void testSavepointAndRollback() throws Exception { } /** - * Test Cases for effects of rollbacking completed/inflight commits. + * Test case for rollback-savepoint with KEEP_LATEST_FILE_VERSIONS policy. 
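+   * Commits 001-004 are written, a savepoint is taken after 002, and restoring to that savepoint
+   * is expected to drop the base files of 003 and 004 while keeping those of 002, even though the
+   * cleaner only retains two file versions.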
+ */ + @Test + public void testSavepointAndRollbackWithKeepLatestFileVersionPolicy() throws Exception { + HoodieWriteConfig cfg = getConfigBuilder().withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()).build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); + + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + /** + * Write 2 (updates) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + client.savepoint("hoodie-unit-test", "test"); + + /** + * Write 3 (updates) + */ + newCommitTime = "003"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + HoodieWriteConfig config = getConfig(); + List partitionPaths = + FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), cfg.getBasePath()); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieSparkTable table = HoodieSparkTable.create(getConfig(), context, metaClient); + final BaseFileOnlyView view1 = table.getBaseFileOnlyView(); + + List dataFiles = partitionPaths.stream().flatMap(s -> { + return view1.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("003")); + }).collect(Collectors.toList()); + assertEquals(3, dataFiles.size(), "The data files for commit 003 should be present"); + + dataFiles = partitionPaths.stream().flatMap(s -> { + return view1.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("002")); + }).collect(Collectors.toList()); + assertEquals(3, dataFiles.size(), "The data files for commit 002 should be present"); + + /** + * Write 4 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + metaClient = HoodieTableMetaClient.reload(metaClient); + table = HoodieSparkTable.create(getConfig(), context, metaClient); + final BaseFileOnlyView view2 = table.getBaseFileOnlyView(); + + dataFiles = partitionPaths.stream().flatMap(s -> view2.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("004"))).collect(Collectors.toList()); + assertEquals(3, dataFiles.size(), "The data files for commit 004 should be present"); + + // rollback to savepoint 002 + HoodieInstant savepoint = table.getCompletedSavepointTimeline().getInstants().findFirst().get(); + client.restoreToSavepoint(savepoint.getTimestamp()); + + metaClient = HoodieTableMetaClient.reload(metaClient); + table = HoodieSparkTable.create(getConfig(), context, metaClient); + final BaseFileOnlyView view3 = table.getBaseFileOnlyView(); + dataFiles = 
partitionPaths.stream().flatMap(s -> view3.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("002"))).collect(Collectors.toList()); + assertEquals(3, dataFiles.size(), "The data files for commit 002 be available"); + + dataFiles = partitionPaths.stream().flatMap(s -> view3.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("003"))).collect(Collectors.toList()); + assertEquals(0, dataFiles.size(), "The data files for commit 003 should be rolled back"); + + dataFiles = partitionPaths.stream().flatMap(s -> view3.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("004"))).collect(Collectors.toList()); + assertEquals(0, dataFiles.size(), "The data files for commit 004 should be rolled back"); + } + } + + /** + * Test Cases for effects of rolling back completed/inflight commits. */ @Test public void testRollbackCommit() throws Exception { - // Let's create some commit files and parquet files + // Let's create some commit files and base files final String p1 = "2016/05/01"; final String p2 = "2016/05/02"; final String p3 = "2016/05/06"; @@ -189,24 +309,32 @@ public void testRollbackCommit() throws Exception { put(p3, "id33"); } }; - HoodieTestTable testTable = HoodieTestTable.of(metaClient) - .withPartitionMetaFiles(p1, p2, p3) - .addCommit(commitTime1) - .withBaseFilesInPartitions(partitionAndFileId1) - .addCommit(commitTime2) - .withBaseFilesInPartitions(partitionAndFileId2) - .addInflightCommit(commitTime3) - .withBaseFilesInPartitions(partitionAndFileId3); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withRollbackUsingMarkers(false) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build()) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); - try (SparkRDDWriteClient client = getHoodieWriteClient(config, false)) { + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); - // Rollback commit 1 (this should fail, since commit2 is still around) - assertThrows(HoodieRollbackException.class, () -> { - client.rollback(commitTime1); - }, "Should have thrown an exception "); + Map>> partitionToFilesNameLengthMap1 = new HashMap<>(); + partitionAndFileId1.forEach((k, v) -> partitionToFilesNameLengthMap1.put(k, Collections.singletonList(Pair.of(v, 100)))); + testTable.doWriteOperation(commitTime1, WriteOperationType.INSERT, Arrays.asList(p1, p2, p3), partitionToFilesNameLengthMap1, + false, false); + + Map>> partitionToFilesNameLengthMap2 = new HashMap<>(); + partitionAndFileId2.forEach((k, v) -> partitionToFilesNameLengthMap2.put(k, Collections.singletonList(Pair.of(v, 200)))); + testTable.doWriteOperation(commitTime2, WriteOperationType.INSERT, Collections.emptyList(), partitionToFilesNameLengthMap2, + false, false); + + Map>> partitionToFilesNameLengthMap3 = new HashMap<>(); + partitionAndFileId3.forEach((k, v) -> partitionToFilesNameLengthMap3.put(k, Collections.singletonList(Pair.of(v, 300)))); + testTable.doWriteOperation(commitTime3, WriteOperationType.INSERT, Collections.emptyList(), partitionToFilesNameLengthMap3, + false, true); + + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { // Rollback commit3 client.rollback(commitTime3); @@ -247,12 +375,20 @@ public void testRollbackCommit() throws Exception { } } + private static Stream 
testFailedRollbackCommitParams() { + return Arrays.stream(new Boolean[][] { + {true, true}, {true, false}, {false, true}, {false, false}, + }).map(Arguments::of); + } + /** - * Test auto-rollback of commits which are in flight. + * Test Cases for effects of rollbacking completed/inflight commits. */ - @Test - public void testAutoRollbackInflightCommit() throws Exception { - // Let's create some commit files and parquet files + @ParameterizedTest + @MethodSource("testFailedRollbackCommitParams") + public void testFailedRollbackCommit( + boolean enableMetadataTable, boolean instantToRollbackExists) throws Exception { + // Let's create some commit files and base files final String p1 = "2016/05/01"; final String p2 = "2016/05/02"; final String p3 = "2016/05/06"; @@ -280,21 +416,147 @@ public void testAutoRollbackInflightCommit() throws Exception { put(p3, "id33"); } }; - HoodieTestTable testTable = HoodieTestTable.of(metaClient) - .withPartitionMetaFiles(p1, p2, p3) + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withRollbackUsingMarkers(false) + .withMetadataConfig( + HoodieMetadataConfig.newBuilder() + // Column Stats Index is disabled, since these tests construct tables which are + // not valid (empty commit metadata, invalid parquet files) + .withMetadataIndexColumnStats(false) + .enable(enableMetadataTable) + .build() + ) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + + HoodieTestTable testTable = enableMetadataTable + ? HoodieMetadataTestTable.of(metaClient, SparkHoodieBackedTableMetadataWriter.create( + metaClient.getHadoopConf(), config, context)) + : HoodieTestTable.of(metaClient); + + testTable.withPartitionMetaFiles(p1, p2, p3) .addCommit(commitTime1) .withBaseFilesInPartitions(partitionAndFileId1) - .addInflightCommit(commitTime2) + .addCommit(commitTime2) .withBaseFilesInPartitions(partitionAndFileId2) .addInflightCommit(commitTime3) .withBaseFilesInPartitions(partitionAndFileId3); - // Turn auto rollback off + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + + // Rollback commit3 + client.rollback(commitTime3); + assertFalse(testTable.inflightCommitExists(commitTime3)); + assertFalse(testTable.baseFilesExist(partitionAndFileId3, commitTime3)); + assertTrue(testTable.baseFilesExist(partitionAndFileId2, commitTime2)); + + metaClient.reloadActiveTimeline(); + List rollbackInstants = metaClient.getActiveTimeline().getRollbackTimeline().getInstants().collect(Collectors.toList()); + assertEquals(rollbackInstants.size(), 1); + HoodieInstant rollbackInstant = rollbackInstants.get(0); + + // delete rollback completed meta file and retry rollback. + FileCreateUtils.deleteRollbackCommit(basePath, rollbackInstant.getTimestamp()); + + if (instantToRollbackExists) { + // recreate actual commit files if needed + testTable.addInflightCommit(commitTime3).withBaseFilesInPartitions(partitionAndFileId3); + } + + // retry rolling back the commit again. 
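+ // A retried rollback of the same instant is expected to be idempotent: the already
+ // scheduled rollback should be reused rather than a second rollback instant being
+ // created, which the single-rollback-instant assertions below verify.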
+ client.rollback(commitTime3); + + // verify there are no extra rollback instants + metaClient.reloadActiveTimeline(); + rollbackInstants = metaClient.getActiveTimeline().getRollbackTimeline().getInstants().collect(Collectors.toList()); + assertEquals(rollbackInstants.size(), 1); + assertEquals(rollbackInstants.get(0), rollbackInstant); + + final String commitTime4 = "20160507040601"; + final String commitTime5 = "20160507050611"; + + // add inflight compaction then rolls it back + testTable.addInflightCompaction(commitTime4, new HoodieCommitMetadata()); + HoodieRollbackPlan rollbackPlan = new HoodieRollbackPlan(); + rollbackPlan.setRollbackRequests(Collections.emptyList()); + rollbackPlan.setInstantToRollback(new HoodieInstantInfo(commitTime4, HoodieTimeline.COMPACTION_ACTION)); + testTable.addRequestedRollback(commitTime5, rollbackPlan); + + // the compaction instants should be excluded + metaClient.reloadActiveTimeline(); + assertEquals(0, client.getPendingRollbackInfos(metaClient).size()); + + // verify there is no extra rollback instants + client.rollback(commitTime4); + + metaClient.reloadActiveTimeline(); + rollbackInstants = metaClient.reloadActiveTimeline().getRollbackTimeline().getInstants().collect(Collectors.toList()); + assertEquals(2, rollbackInstants.size()); + } + } + + /** + * Test auto-rollback of commits which are in flight. + */ + @Test + public void testAutoRollbackInflightCommit() throws Exception { + // Let's create some commit files and base files + final String p1 = "2016/05/01"; + final String p2 = "2016/05/02"; + final String p3 = "2016/05/06"; + final String commitTime1 = "20160501010101"; + final String commitTime2 = "20160502020601"; + final String commitTime3 = "20160506030611"; + Map partitionAndFileId1 = new HashMap() { + { + put(p1, "id11"); + put(p2, "id12"); + put(p3, "id13"); + } + }; + Map partitionAndFileId2 = new HashMap() { + { + put(p1, "id21"); + put(p2, "id22"); + put(p3, "id23"); + } + }; + Map partitionAndFileId3 = new HashMap() { + { + put(p1, "id31"); + put(p2, "id32"); + put(p3, "id33"); + } + }; + + // Set Failed Writes rollback to LAZY HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build()).build(); + + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + + Map>> partitionToFilesNameLengthMap1 = new HashMap<>(); + partitionAndFileId1.forEach((k, v) -> partitionToFilesNameLengthMap1.put(k, Collections.singletonList(Pair.of(v, 100)))); + testTable.doWriteOperation(commitTime1, WriteOperationType.INSERT, Arrays.asList(p1, p2, p3), partitionToFilesNameLengthMap1, + false, false); + + Map>> partitionToFilesNameLengthMap2 = new HashMap<>(); + partitionAndFileId2.forEach((k, v) -> partitionToFilesNameLengthMap2.put(k, Collections.singletonList(Pair.of(v, 200)))); + testTable.doWriteOperation(commitTime2, WriteOperationType.INSERT, Collections.emptyList(), partitionToFilesNameLengthMap2, + false, true); + + Map>> partitionToFilesNameLengthMap3 = new HashMap<>(); + partitionAndFileId3.forEach((k, v) -> 
partitionToFilesNameLengthMap3.put(k, Collections.singletonList(Pair.of(v, 300)))); + testTable.doWriteOperation(commitTime3, WriteOperationType.INSERT, Collections.emptyList(), partitionToFilesNameLengthMap3, + false, true); final String commitTime4 = "20160506030621"; - try (SparkRDDWriteClient client = getHoodieWriteClient(config, false)) { + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { client.startCommitWithTime(commitTime4); // Check results, nothing changed assertTrue(testTable.commitExists(commitTime1)); @@ -305,9 +567,12 @@ public void testAutoRollbackInflightCommit() throws Exception { assertTrue(testTable.baseFilesExist(partitionAndFileId3, commitTime3)); } - // Turn auto rollback on + // Set Failed Writes rollback to EAGER + config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withRollbackUsingMarkers(false) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); final String commitTime5 = "20160506030631"; - try (SparkRDDWriteClient client = getHoodieWriteClient(config, true)) { + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { client.startCommitWithTime(commitTime5); assertTrue(testTable.commitExists(commitTime1)); assertFalse(testTable.inflightCommitExists(commitTime2)); @@ -317,4 +582,162 @@ public void testAutoRollbackInflightCommit() throws Exception { assertFalse(testTable.baseFilesExist(partitionAndFileId3, commitTime3)); } } + + private static Stream testRollbackWithRequestedRollbackPlanParams() { + return Arrays.stream(new Boolean[][] { + {true, true}, {true, false}, {false, true}, {false, false}, + }).map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("testRollbackWithRequestedRollbackPlanParams") + public void testRollbackWithRequestedRollbackPlan(boolean enableMetadataTable, boolean isRollbackPlanCorrupted) throws Exception { + // Let's create some commit files and base files + final String p1 = "2022/04/05"; + final String p2 = "2022/04/06"; + final String commitTime1 = "20220406010101002"; + final String commitTime2 = "20220406020601002"; + final String commitTime3 = "20220406030611002"; + final String rollbackInstantTime = "20220406040611002"; + Map partitionAndFileId1 = new HashMap() { + { + put(p1, "id11"); + put(p2, "id12"); + } + }; + Map partitionAndFileId2 = new HashMap() { + { + put(p1, "id21"); + put(p2, "id22"); + } + }; + Map partitionAndFileId3 = new HashMap() { + { + put(p1, "id31"); + put(p2, "id32"); + } + }; + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withRollbackUsingMarkers(false) + .withMetadataConfig( + HoodieMetadataConfig.newBuilder() + // Column Stats Index is disabled, since these tests construct tables which are + // not valid (empty commit metadata, invalid parquet files) + .withMetadataIndexColumnStats(false) + .enable(enableMetadataTable) + .build() + ) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + + HoodieTestTable testTable = enableMetadataTable + ? 
HoodieMetadataTestTable.of(metaClient, SparkHoodieBackedTableMetadataWriter.create( + metaClient.getHadoopConf(), config, context)) + : HoodieTestTable.of(metaClient); + + testTable.withPartitionMetaFiles(p1, p2) + .addCommit(commitTime1) + .withBaseFilesInPartitions(partitionAndFileId1) + .addCommit(commitTime2) + .withBaseFilesInPartitions(partitionAndFileId2) + .addInflightCommit(commitTime3) + .withBaseFilesInPartitions(partitionAndFileId3); + + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + if (isRollbackPlanCorrupted) { + // Add a corrupted requested rollback plan + FileCreateUtils.createRequestedRollbackFile(metaClient.getBasePath(), rollbackInstantTime, new byte[] {0, 1, 2}); + } else { + // Add a valid requested rollback plan to roll back commitTime3 + HoodieRollbackPlan rollbackPlan = new HoodieRollbackPlan(); + List rollbackRequestList = partitionAndFileId3.keySet().stream() + .map(partition -> new HoodieRollbackRequest(partition, EMPTY_STRING, EMPTY_STRING, + Collections.singletonList(metaClient.getBasePath() + "/" + partition + "/" + + FileCreateUtils.baseFileName(commitTime3, partitionAndFileId3.get(p1))), + Collections.emptyMap())) + .collect(Collectors.toList()); + rollbackPlan.setRollbackRequests(rollbackRequestList); + rollbackPlan.setInstantToRollback(new HoodieInstantInfo(commitTime3, HoodieTimeline.COMMIT_ACTION)); + FileCreateUtils.createRequestedRollbackFile(metaClient.getBasePath(), rollbackInstantTime, rollbackPlan); + } + + // Rollback commit3 + client.rollback(commitTime3); + assertFalse(testTable.inflightCommitExists(commitTime3)); + assertFalse(testTable.baseFilesExist(partitionAndFileId3, commitTime3)); + assertTrue(testTable.baseFilesExist(partitionAndFileId2, commitTime2)); + + metaClient.reloadActiveTimeline(); + List rollbackInstants = metaClient.getActiveTimeline().getRollbackTimeline().getInstants().collect(Collectors.toList()); + // Corrupted requested rollback plan should be deleted before scheduling a new one + assertEquals(rollbackInstants.size(), 1); + HoodieInstant rollbackInstant = rollbackInstants.get(0); + assertTrue(rollbackInstant.isCompleted()); + + if (isRollbackPlanCorrupted) { + // Should create a new rollback instant + assertNotEquals(rollbackInstantTime, rollbackInstant.getTimestamp()); + } else { + // Should reuse the rollback instant + assertEquals(rollbackInstantTime, rollbackInstant.getTimestamp()); + } + } + } + + @Test + public void testFallbackToListingBasedRollbackForCompletedInstant() throws Exception { + // Let's create some commit files and base files + final String p1 = "2016/05/01"; + final String p2 = "2016/05/02"; + final String p3 = "2016/05/06"; + final String commitTime1 = "20160501010101"; + final String commitTime2 = "20160502020601"; + final String commitTime3 = "20160506030611"; + Map partitionAndFileId1 = new HashMap() { + { + put(p1, "id11"); + put(p2, "id12"); + put(p3, "id13"); + } + }; + Map partitionAndFileId2 = new HashMap() { + { + put(p1, "id21"); + put(p2, "id22"); + put(p3, "id23"); + } + }; + Map partitionAndFileId3 = new HashMap() { + { + put(p1, "id31"); + put(p2, "id32"); + put(p3, "id33"); + } + }; + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withRollbackUsingMarkers(true) // rollback using markers to test fallback to listing based rollback for completed instant + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build()) + 
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + + // create test table with all commits completed + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, SparkHoodieBackedTableMetadataWriter.create(metaClient.getHadoopConf(), config, context)); + testTable.withPartitionMetaFiles(p1, p2, p3) + .addCommit(commitTime1) + .withBaseFilesInPartitions(partitionAndFileId1) + .addCommit(commitTime2) + .withBaseFilesInPartitions(partitionAndFileId2) + .addCommit(commitTime3) + .withBaseFilesInPartitions(partitionAndFileId3); + + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + client.rollback(commitTime3); + assertFalse(testTable.inflightCommitExists(commitTime3)); + assertFalse(testTable.baseFilesExist(partitionAndFileId3, commitTime3)); + assertTrue(testTable.baseFilesExist(partitionAndFileId2, commitTime2)); + } + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestCompactionAdminClient.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestCompactionAdminClient.java index 03328ddddc142..1e3de9ea386fa 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestCompactionAdminClient.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestCompactionAdminClient.java @@ -37,6 +37,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -70,18 +71,23 @@ public void setUp() throws Exception { client = new CompactionAdminClient(context, basePath); } + @AfterEach + public void cleanUp() throws Exception { + cleanupResources(); + } + @Test public void testUnscheduleCompactionPlan() throws Exception { int numEntriesPerInstant = 10; CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant); - // THere are delta-commits after compaction instant + // There are delta-commits after compaction instant validateUnSchedulePlan(client, "000", "001", numEntriesPerInstant, 2 * numEntriesPerInstant); - // THere are delta-commits after compaction instant + // There are delta-commits after compaction instant validateUnSchedulePlan(client, "002", "003", numEntriesPerInstant, 2 * numEntriesPerInstant); - // THere are no delta-commits after compaction instant + // There are no delta-commits after compaction instant validateUnSchedulePlan(client, "004", "005", numEntriesPerInstant, 0); - // THere are no delta-commits after compaction instant + // There are no delta-commits after compaction instant validateUnSchedulePlan(client, "006", "007", numEntriesPerInstant, 0); } @@ -100,13 +106,13 @@ public void testUnscheduleCompactionFileId() throws Exception { }).map(instantWithPlan -> instantWithPlan.getRight().getOperations().stream() .map(op -> Pair.of(instantWithPlan.getLeft(), CompactionOperation.convertFromAvroRecordInstance(op))) .findFirst().get()).collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); - // THere are delta-commits after compaction instant + // There are delta-commits after compaction instant validateUnScheduleFileId(client, "000", "001", instantsWithOp.get("001"), 2); - // THere are delta-commits after compaction instant + // There are delta-commits after compaction instant validateUnScheduleFileId(client, "002", "003", instantsWithOp.get("003"), 2); - // THere 
are no delta-commits after compaction instant + // There are no delta-commits after compaction instant validateUnScheduleFileId(client, "004", "005", instantsWithOp.get("005"), 0); - // THere are no delta-commits after compaction instant + // There are no delta-commits after compaction instant validateUnScheduleFileId(client, "006", "007", instantsWithOp.get("007"), 0); } @@ -115,13 +121,13 @@ public void testRepairCompactionPlan() throws Exception { int numEntriesPerInstant = 10; CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant); - // THere are delta-commits after compaction instant + // There are delta-commits after compaction instant validateRepair("000", "001", numEntriesPerInstant, 2 * numEntriesPerInstant); - // THere are delta-commits after compaction instant + // There are delta-commits after compaction instant validateRepair("002", "003", numEntriesPerInstant, 2 * numEntriesPerInstant); - // THere are no delta-commits after compaction instant + // There are no delta-commits after compaction instant validateRepair("004", "005", numEntriesPerInstant, 0); - // THere are no delta-commits after compaction instant + // There are no delta-commits after compaction instant validateRepair("006", "007", numEntriesPerInstant, 0); } @@ -129,7 +135,7 @@ private void validateRepair(String ingestionInstant, String compactionInstant, i int expNumRepairs) throws Exception { List> renameFiles = validateUnSchedulePlan(client, ingestionInstant, compactionInstant, numEntriesPerInstant, expNumRepairs, true); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); List result = client.validateCompactionPlan(metaClient, compactionInstant, 1); if (expNumRepairs > 0) { assertTrue(result.stream().anyMatch(r -> !r.isSuccess()), "Expect some failures in validation"); @@ -157,7 +163,7 @@ private void validateRepair(String ingestionInstant, String compactionInstant, i expRenameFiles.forEach((key, value) -> LOG.info("Key :" + key + " renamed to " + value + " rolled back to " + renameFilesFromUndo.get(key))); - assertEquals(expRenameFiles, renameFilesFromUndo, "Undo must completely rollback renames"); + assertEquals(expRenameFiles, renameFilesFromUndo, "Undo must completely rollback renamed files"); // Now expect validation to succeed result = client.validateCompactionPlan(metaClient, compactionInstant, 1); assertTrue(result.stream().allMatch(OperationResult::isSuccess), "Expect no failures in validation"); @@ -165,12 +171,12 @@ private void validateRepair(String ingestionInstant, String compactionInstant, i } /** - * Enssure compaction plan is valid. + * Ensure compaction plan is valid. 
* * @param compactionInstant Compaction Instant */ private void ensureValidCompactionPlan(String compactionInstant) throws Exception { - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); // Ensure compaction-plan is good to begin with List validationResults = client.validateCompactionPlan(metaClient, compactionInstant, 1); assertFalse(validationResults.stream().anyMatch(v -> !v.isSuccess()), @@ -193,8 +199,8 @@ private void validateRenameFiles(List> rename renameFiles.forEach(lfPair -> { HoodieLogFile oldLogFile = lfPair.getLeft(); HoodieLogFile newLogFile = lfPair.getValue(); - assertEquals(ingestionInstant, newLogFile.getBaseCommitTime(), "Base Commit time is expected"); - assertEquals(compactionInstant, oldLogFile.getBaseCommitTime(), "Base Commit time is expected"); + assertEquals(ingestionInstant, newLogFile.getBaseCommitTime(), "Base Commit time of ingestion instant is expected"); + assertEquals(compactionInstant, oldLogFile.getBaseCommitTime(), "Base Commit time of compaction instant is expected"); assertEquals(oldLogFile.getFileId(), newLogFile.getFileId(), "File Id is expected"); HoodieLogFile lastLogFileBeforeCompaction = fsView.getLatestMergedFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], ingestionInstant) @@ -228,7 +234,7 @@ private List> validateUnSchedulePlan(Compacti // Check suggested rename operations List> renameFiles = client.getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, 1, Option.empty(), false); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); // Log files belonging to file-slices created because of compaction request must be renamed @@ -264,10 +270,10 @@ private List> validateUnSchedulePlan(Compacti client.unscheduleCompactionPlan(compactionInstant, false, 1, false); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); final HoodieTableFileSystemView newFsView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); - // Expect all file-slice whose base-commit is same as compaction commit to contain no new Log files + // Expect each file-slice whose base-commit is same as compaction commit to contain no new Log files newFsView.getLatestFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant, true) .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)) .forEach(fs -> { @@ -285,7 +291,7 @@ private List> validateUnSchedulePlan(Compacti assertEquals(fileIdToCountsBeforeRenaming, fileIdToCountsAfterRenaming, "Each File Id has same number of log-files"); assertEquals(numEntriesPerInstant, fileIdToCountsAfterRenaming.size(), "Not Empty"); - assertEquals(expNumRenames, renameFiles.size(), "Expected number of renames"); + assertEquals(expNumRenames, renameFiles.size(), "Expected number of renamed files"); return renameFiles; } @@ -300,7 +306,7 @@ private void validateUnScheduleFileId(CompactionAdminClient client, String inges // Check suggested rename operations List> renameFiles = client 
.getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op, Option.empty(), false); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); // Log files belonging to file-slices created because of compaction request must be renamed @@ -325,7 +331,7 @@ private void validateUnScheduleFileId(CompactionAdminClient client, String inges // Call the main unschedule API client.unscheduleCompactionFileId(op.getFileGroupId(), false, false); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); final HoodieTableFileSystemView newFsView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline()); // Expect all file-slice whose base-commit is same as compaction commit to contain no new Log files @@ -348,6 +354,6 @@ private void validateUnScheduleFileId(CompactionAdminClient client, String inges assertEquals(fileIdToCountsBeforeRenaming, fileIdToCountsAfterRenaming, "Each File Id has same number of log-files"); assertEquals(1, fileIdToCountsAfterRenaming.size(), "Not Empty"); - assertEquals(expNumRenames, renameFiles.size(), "Expected number of renames"); + assertEquals(expNumRenames, renameFiles.size(), "Expected number of renamed files"); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestFileBasedLockProvider.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestFileBasedLockProvider.java new file mode 100644 index 0000000000000..208e9cd62e738 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestFileBasedLockProvider.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.client; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hudi.client.transaction.lock.FileSystemBasedLockProvider; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.testutils.minicluster.HdfsTestService; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieLockException; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Properties; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_EXPIRE_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY; + +public class TestFileBasedLockProvider { + private static HdfsTestService hdfsTestService; + private static MiniDFSCluster dfsCluster; + private static LockConfiguration lockConfiguration; + private static Configuration hadoopConf; + + @BeforeAll + public static void setup() throws IOException { + hdfsTestService = new HdfsTestService(); + dfsCluster = hdfsTestService.start(true); + hadoopConf = dfsCluster.getFileSystem().getConf(); + + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, "/tmp/"); + properties.setProperty(FILESYSTEM_LOCK_EXPIRE_PROP_KEY, "1"); + properties.setProperty(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "1000"); + properties.setProperty(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "1000"); + properties.setProperty(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY, "3"); + lockConfiguration = new LockConfiguration(properties); + } + + @AfterAll + public static void cleanUpAfterAll() throws IOException { + Path workDir = dfsCluster.getFileSystem().getWorkingDirectory(); + FileSystem fs = workDir.getFileSystem(hdfsTestService.getHadoopConf()); + fs.delete(new Path("/tmp"), true); + if (hdfsTestService != null) { + hdfsTestService.stop(); + hdfsTestService = null; + } + } + + @AfterEach + public void cleanUpAfterEach() throws IOException { + Path workDir = dfsCluster.getFileSystem().getWorkingDirectory(); + FileSystem fs = workDir.getFileSystem(hdfsTestService.getHadoopConf()); + fs.delete(new Path("/tmp/lock"), true); + } + + @Test + public void testAcquireLock() { + FileSystemBasedLockProvider fileBasedLockProvider = new FileSystemBasedLockProvider(lockConfiguration, hadoopConf); + Assertions.assertTrue(fileBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + fileBasedLockProvider.unlock(); + } + + @Test + public void testAcquireLockWithDefaultPath() { + lockConfiguration.getConfig().remove(FILESYSTEM_LOCK_PATH_PROP_KEY); + lockConfiguration.getConfig().setProperty(HoodieWriteConfig.BASE_PATH.key(), "/tmp/"); + FileSystemBasedLockProvider fileBasedLockProvider = new FileSystemBasedLockProvider(lockConfiguration, hadoopConf); + 
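+ // With FILESYSTEM_LOCK_PATH_PROP_KEY removed, the provider is expected to fall back to
+ // a lock location derived from the configured HoodieWriteConfig.BASE_PATH.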
Assertions.assertTrue(fileBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + fileBasedLockProvider.unlock(); + lockConfiguration.getConfig().setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, "/tmp/"); + } + + @Test + public void testUnLock() { + FileSystemBasedLockProvider fileBasedLockProvider = new FileSystemBasedLockProvider(lockConfiguration, hadoopConf); + Assertions.assertTrue(fileBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + fileBasedLockProvider.unlock(); + Assertions.assertTrue(fileBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + } + + @Test + public void testReentrantLock() { + FileSystemBasedLockProvider fileBasedLockProvider = new FileSystemBasedLockProvider(lockConfiguration, hadoopConf); + Assertions.assertTrue(fileBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + Assertions.assertFalse(fileBasedLockProvider.tryLock(lockConfiguration.getConfig() + .getLong(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY), TimeUnit.MILLISECONDS)); + fileBasedLockProvider.unlock(); + } + + @Test + public void testUnlockWithoutLock() { + try { + FileSystemBasedLockProvider fileBasedLockProvider = new FileSystemBasedLockProvider(lockConfiguration, hadoopConf); + fileBasedLockProvider.unlock(); + } catch (HoodieLockException e) { + Assertions.fail(); + } + } + +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java new file mode 100644 index 0000000000000..6ad8666a0fa20 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java @@ -0,0 +1,661 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client; + +import org.apache.hudi.client.transaction.lock.FileSystemBasedLockProvider; +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.TableServiceType; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieLockConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieWriteConflictException; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.testutils.HoodieClientTestBase; + +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.CyclicBarrier; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_EXPIRE_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; +import static org.apache.hudi.common.config.LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +public class TestHoodieClientMultiWriter extends HoodieClientTestBase { + + private Properties lockProperties = null; + + @BeforeEach + public void setup() throws IOException { + if (lockProperties == null) { + lockProperties = new Properties(); + 
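+ // Shared lock-provider settings for the multi-writer tests below: the lock file lives
+ // under the table's .hoodie/.locks directory, and the short acquire timeout and retry
+ // counts let lock contention surface quickly in tests.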
lockProperties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + lockProperties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + lockProperties.setProperty(FILESYSTEM_LOCK_EXPIRE_PROP_KEY, "1"); + lockProperties.setProperty(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "1000"); + lockProperties.setProperty(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "1000"); + lockProperties.setProperty(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY, "3"); + } + } + + public void setUpMORTestTable() throws IOException { + cleanupResources(); + initPath(); + initSparkContexts(); + initTestDataGenerator(); + initFileSystem(); + fs.mkdirs(new Path(basePath)); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ, HoodieFileFormat.PARQUET); + initTestDataGenerator(); + } + + @AfterEach + public void clean() throws IOException { + cleanupResources(); + } + + private static final List LOCK_PROVIDER_CLASSES = Arrays.asList( + InProcessLockProvider.class, + FileSystemBasedLockProvider.class); + + private static Iterable providerClassAndTableType() { + List opts = new ArrayList<>(); + for (Object providerClass : LOCK_PROVIDER_CLASSES) { + opts.add(new Object[] {HoodieTableType.COPY_ON_WRITE, providerClass}); + opts.add(new Object[] {HoodieTableType.MERGE_ON_READ, providerClass}); + } + return opts; + } + + @ParameterizedTest + @MethodSource("providerClassAndTableType") + public void testHoodieClientBasicMultiWriter(HoodieTableType tableType, Class providerClass) throws Exception { + if (tableType == HoodieTableType.MERGE_ON_READ) { + setUpMORTestTable(); + } + lockProperties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + + HoodieWriteConfig writeConfig = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .withAutoClean(false).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .withAutoArchive(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + // Timeline-server-based markers are not used for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(providerClass) + .build()).withAutoCommit(false).withProperties(lockProperties).build(); + + // Create the first commit + createCommitWithInserts(writeConfig, getHoodieWriteClient(writeConfig), "000", "001", 200, true); + + final int threadCount = 2; + final ExecutorService executors = Executors.newFixedThreadPool(2); + final SparkRDDWriteClient client1 = getHoodieWriteClient(writeConfig); + final SparkRDDWriteClient client2 = getHoodieWriteClient(writeConfig); + + final CyclicBarrier cyclicBarrier = new CyclicBarrier(threadCount); + final AtomicBoolean writer1Completed = new AtomicBoolean(false); + final AtomicBoolean writer2Completed = new AtomicBoolean(false); + + Future future1 = executors.submit(() -> { + try { + final String nextCommitTime = "002"; + final JavaRDD writeStatusList = startCommitForUpdate(writeConfig, client1, nextCommitTime, 100); + + // Wait for the 2nd writer to start the commit + cyclicBarrier.await(60, TimeUnit.SECONDS); + + // Commit the update before the 2nd writer + assertDoesNotThrow(() -> { + client1.commit(nextCommitTime, writeStatusList); + }); + + // Signal the 2nd writer to go ahead for his commit + cyclicBarrier.await(60, TimeUnit.SECONDS); + writer1Completed.set(true); + } catch (Exception e) { + 
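+ // Any failure in writer 1 (an unexpected commit conflict or a barrier timeout) is
+ // recorded here and surfaces through the completion flags asserted at the end.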
writer1Completed.set(false); + } + }); + + Future future2 = executors.submit(() -> { + try { + final String nextCommitTime = "003"; + + // Wait for the 1st writer to make progress with the commit + cyclicBarrier.await(60, TimeUnit.SECONDS); + final JavaRDD writeStatusList = startCommitForUpdate(writeConfig, client2, nextCommitTime, 100); + + // Wait for the 1st writer to complete the commit + cyclicBarrier.await(60, TimeUnit.SECONDS); + assertThrows(HoodieWriteConflictException.class, () -> { + client2.commit(nextCommitTime, writeStatusList); + }); + writer2Completed.set(true); + } catch (Exception e) { + writer2Completed.set(false); + } + }); + + future1.get(); + future2.get(); + + // Both writers should have completed successfully; the conflict for writer2 is already asserted via assertThrows above. + assertTrue(writer1Completed.get() && writer2Completed.get()); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class, names = {"COPY_ON_WRITE", "MERGE_ON_READ"}) + public void testMultiWriterWithInsertsToDistinctPartitions(HoodieTableType tableType) throws Exception { + if (tableType == HoodieTableType.MERGE_ON_READ) { + setUpMORTestTable(); + } + + lockProperties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + lockProperties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "3000"); + lockProperties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "20"); + + HoodieWriteConfig cfg = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(false) + .withMaxNumDeltaCommitsBeforeCompaction(2) + .build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder() + .withLockProvider(InProcessLockProvider.class) + .build()) + .withAutoCommit(false) + // Timeline-server-based markers are not used for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withProperties(lockProperties) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.REMOTE_FIRST) + .withSecondaryStorageType(FileSystemViewStorageType.MEMORY).build()) + .build(); + + // Create the first commit + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + createCommitWithInsertsForPartition(cfg, client, "000", "001", 100, "2016/03/01"); + + int numConcurrentWriters = 5; + ExecutorService executors = Executors.newFixedThreadPool(numConcurrentWriters); + + List> futures = new ArrayList<>(numConcurrentWriters); + for (int loop = 0; loop < numConcurrentWriters; loop++) { + String newCommitTime = "00" + (loop + 2); + String partition = "2016/03/0" + (loop + 2); + futures.add(executors.submit(() -> { + try { + SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg); + createCommitWithInsertsForPartition(cfg, writeClient, "001", newCommitTime, 100, partition); + } catch (Exception e) { + throw new RuntimeException(e); + } + })); + } + + futures.forEach(f -> { + try { + f.get(); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + /** + * Count down the latch and wait for all the needed threads to join.
+ * + * @param latch - Count down latch + * @param waitTimeMillis - Max wait time in millis for waiting + */ + private void latchCountDownAndWait(CountDownLatch latch, long waitTimeMillis) { + latch.countDown(); + try { + latch.await(waitTimeMillis, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // + } + } + + @ParameterizedTest + @MethodSource("providerClassAndTableType") + public void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType tableType, Class providerClass) throws Exception { + // create inserts X 1 + if (tableType == HoodieTableType.MERGE_ON_READ) { + setUpMORTestTable(); + } + // Disabling embedded timeline server, it doesn't work with multiwriter + HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withAutoClean(false) + .withAsyncClean(true) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(false) + .withMaxNumDeltaCommitsBeforeCompaction(2).build()) + .withEmbeddedTimelineServerEnabled(false) + // Timeline-server-based markers are not used for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder().withStorageType( + FileSystemViewStorageType.MEMORY).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(providerClass) + .build()).withAutoCommit(false).withProperties(lockProperties); + Set validInstants = new HashSet<>(); + // Create the first commit with inserts + HoodieWriteConfig cfg = writeConfigBuilder.build(); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + createCommitWithInserts(cfg, client, "000", "001", 200, true); + validInstants.add("001"); + // Create 2 commits with upserts + createCommitWithUpserts(cfg, client, "001", "000", "002", 100); + createCommitWithUpserts(cfg, client, "002", "000", "003", 100); + validInstants.add("002"); + validInstants.add("003"); + + // Three clients running actions in parallel + final int threadCount = 3; + final CountDownLatch scheduleCountDownLatch = new CountDownLatch(threadCount); + final ExecutorService executors = Executors.newFixedThreadPool(threadCount); + + // Write config with clustering enabled + final HoodieWriteConfig cfg2 = writeConfigBuilder + .withClusteringConfig(HoodieClusteringConfig.newBuilder() + .withInlineClustering(true) + .withInlineClusteringNumCommits(1) + .build()) + .build(); + final SparkRDDWriteClient client1 = getHoodieWriteClient(cfg2); + final SparkRDDWriteClient client2 = getHoodieWriteClient(cfg); + final SparkRDDWriteClient client3 = getHoodieWriteClient(cfg); + + // Create upserts, schedule cleaning, schedule compaction in parallel + Future future1 = executors.submit(() -> { + final String newCommitTime = "004"; + final int numRecords = 100; + final String commitTimeBetweenPrevAndNew = "002"; + + // We want the upsert to go through only after the compaction + // and cleaning schedule completion. So, waiting on latch here. 
+ latchCountDownAndWait(scheduleCountDownLatch, 30000); + if (tableType == HoodieTableType.MERGE_ON_READ) { + // Since the compaction already went in, this upsert has + // to fail + assertThrows(IllegalArgumentException.class, () -> { + createCommitWithUpserts(cfg, client1, "003", commitTimeBetweenPrevAndNew, newCommitTime, numRecords); + }); + } else { + // We don't have the compaction for COW and so this upsert + // has to pass + assertDoesNotThrow(() -> { + createCommitWithUpserts(cfg, client1, "003", commitTimeBetweenPrevAndNew, newCommitTime, numRecords); + }); + validInstants.add(newCommitTime); + } + }); + + Future future2 = executors.submit(() -> { + if (tableType == HoodieTableType.MERGE_ON_READ) { + assertDoesNotThrow(() -> { + client2.scheduleTableService("005", Option.empty(), TableServiceType.COMPACT); + }); + } + latchCountDownAndWait(scheduleCountDownLatch, 30000); + }); + + Future future3 = executors.submit(() -> { + assertDoesNotThrow(() -> { + latchCountDownAndWait(scheduleCountDownLatch, 30000); + client3.scheduleTableService("006", Option.empty(), TableServiceType.CLEAN); + }); + }); + future1.get(); + future2.get(); + future3.get(); + + CountDownLatch runCountDownLatch = new CountDownLatch(threadCount); + // Create inserts, run cleaning, run compaction in parallel + future1 = executors.submit(() -> { + final String newCommitTime = "007"; + final int numRecords = 100; + latchCountDownAndWait(runCountDownLatch, 30000); + assertDoesNotThrow(() -> { + createCommitWithInserts(cfg, client1, "003", newCommitTime, numRecords, true); + validInstants.add("007"); + }); + }); + + future2 = executors.submit(() -> { + latchCountDownAndWait(runCountDownLatch, 30000); + if (tableType == HoodieTableType.MERGE_ON_READ) { + assertDoesNotThrow(() -> { + HoodieWriteMetadata> compactionMetadata = client2.compact("005"); + client2.commitCompaction("005", compactionMetadata.getCommitMetadata().get(), Option.empty()); + validInstants.add("005"); + }); + } + }); + + future3 = executors.submit(() -> { + latchCountDownAndWait(runCountDownLatch, 30000); + assertDoesNotThrow(() -> { + client3.clean("006", false); + validInstants.add("006"); + }); + }); + future1.get(); + future2.get(); + future3.get(); + + validInstants.addAll( + metaClient.reloadActiveTimeline().getCompletedReplaceTimeline() + .filterCompletedInstants().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toSet())); + Set completedInstants = metaClient.reloadActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().getInstants().map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()); + assertTrue(validInstants.containsAll(completedInstants)); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class, names = {"COPY_ON_WRITE", "MERGE_ON_READ"}) + public void testHoodieClientMultiWriterWithClustering(HoodieTableType tableType) throws Exception { + if (tableType == HoodieTableType.MERGE_ON_READ) { + setUpMORTestTable(); + } + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + // Timeline-server-based markers are not used 
for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class) + .build()).withAutoCommit(false).withProperties(properties); + HoodieWriteConfig cfg = writeConfigBuilder.build(); + HoodieWriteConfig cfg2 = writeConfigBuilder.build(); + HoodieWriteConfig cfg3 = writeConfigBuilder + .withClusteringConfig(HoodieClusteringConfig.newBuilder().withInlineClustering(true).withInlineClusteringNumCommits(1).build()) + .build(); + + // Create the first commit + createCommitWithInserts(cfg, getHoodieWriteClient(cfg), "000", "001", 200, true); + // Start another inflight commit + String newCommitTime = "003"; + int numRecords = 100; + SparkRDDWriteClient client1 = getHoodieWriteClient(cfg); + String commitTimeBetweenPrevAndNew = "002"; + JavaRDD result1 = updateBatch(cfg, client1, newCommitTime, "001", + Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), "000", numRecords, SparkRDDWriteClient::upsert, false, false, + numRecords, 200, 2); + // Start and finish another commit while the previous writer for commit 003 is running + newCommitTime = "004"; + SparkRDDWriteClient client2 = getHoodieWriteClient(cfg); + JavaRDD result2 = updateBatch(cfg2, client2, newCommitTime, "001", + Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), "000", numRecords, SparkRDDWriteClient::upsert, false, false, + numRecords, 200, 2); + client2.commit(newCommitTime, result2); + // Schedule and run clustering while previous writer for commit 003 is running + SparkRDDWriteClient client3 = getHoodieWriteClient(cfg3); + // schedule clustering + Option clusterInstant = client3.scheduleTableService(Option.empty(), TableServiceType.CLUSTER); + assertTrue(clusterInstant.isPresent()); + // Attempt to commit the inflight commit 003 + try { + client1.commit("003", result1); + fail("Should have thrown a concurrent conflict exception"); + } catch (Exception e) { + // Expected + } + } + + @Test + public void testHoodieClientMultiWriterAutoCommitForConflict() throws Exception { + lockProperties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "100"); + HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + // Timeline-server-based markers are not used for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class) + .build()).withAutoCommit(true).withProperties(lockProperties); + HoodieWriteConfig cfg = writeConfigBuilder.build(); + HoodieWriteConfig cfg2 = writeConfigBuilder.build(); + + // Create the first commit + createCommitWithInserts(cfg, getHoodieWriteClient(cfg), "000", "001", 5000, false); + // Start another inflight commit + String newCommitTime1 = "003"; + String newCommitTime2 = "004"; + SparkRDDWriteClient client1 = getHoodieWriteClient(cfg); + SparkRDDWriteClient client2 = getHoodieWriteClient(cfg2); + + List updates1 = dataGen.generateUpdates(newCommitTime1, 5000); + List updates2 = dataGen.generateUpdates(newCommitTime2, 5000); + + JavaRDD writeRecords1 = jsc.parallelize(updates1, 4); + JavaRDD writeRecords2 = jsc.parallelize(updates2, 4); + + runConcurrentAndAssert(writeRecords1, writeRecords2, client1, client2, SparkRDDWriteClient::upsert, true); + } + + 
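+ /*
+  * A minimal sketch of the optimistic-concurrency-control writer setup these multi-writer
+  * tests exercise, kept as a commented reference rather than executable test code.
+  * getConfigBuilder() and getHoodieWriteClient() come from HoodieClientTestBase;
+  * FileSystemBasedLockProvider is one of the two providers the tests parameterize over,
+  * and the lock path mirrors the one set in setup().
+  *
+  *   Properties props = new Properties();
+  *   props.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
+  *   HoodieWriteConfig occConfig = getConfigBuilder()
+  *       .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL)
+  *       .withLockConfig(HoodieLockConfig.newBuilder()
+  *           .withLockProvider(FileSystemBasedLockProvider.class).build())
+  *       .withCleanConfig(HoodieCleanConfig.newBuilder()
+  *           .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build())
+  *       // direct markers, since timeline-server-based markers are not used for multi-writer tests
+  *       .withMarkersType(MarkerType.DIRECT.name())
+  *       .withAutoCommit(false)
+  *       .withProperties(props)
+  *       .build();
+  *   // each concurrent writer builds its own client from the same config and commits explicitly
+  *   SparkRDDWriteClient writer = getHoodieWriteClient(occConfig);
+  */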
private void runConcurrentAndAssert(JavaRDD writeRecords1, JavaRDD writeRecords2, + SparkRDDWriteClient client1, SparkRDDWriteClient client2, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean assertForConflict) throws ExecutionException, InterruptedException { + + CountDownLatch runCountDownLatch = new CountDownLatch(2); + final ExecutorService executors = Executors.newFixedThreadPool(2); + String newCommitTime1 = "003"; + String newCommitTime2 = "004"; + + AtomicBoolean client1Succeeded = new AtomicBoolean(true); + AtomicBoolean client2Succeeded = new AtomicBoolean(true); + + Future future1 = executors.submit(() -> { + try { + ingestBatch(writeFn, client1, newCommitTime1, writeRecords1, runCountDownLatch); + } catch (IOException e) { + LOG.error("IOException thrown " + e.getMessage()); + } catch (InterruptedException e) { + LOG.error("Interrupted Exception thrown " + e.getMessage()); + } catch (Exception e) { + client1Succeeded.set(false); + } + } + ); + + Future future2 = executors.submit(() -> { + try { + ingestBatch(writeFn, client2, newCommitTime2, writeRecords2, runCountDownLatch); + } catch (IOException e) { + LOG.error("IOException thrown " + e.getMessage()); + } catch (InterruptedException e) { + LOG.error("Interrupted Exception thrown " + e.getMessage()); + } catch (Exception e) { + client2Succeeded.set(false); + } + } + ); + + future1.get(); + future2.get(); + if (assertForConflict) { + assertFalse(client1Succeeded.get() && client2Succeeded.get()); + assertTrue(client1Succeeded.get() || client2Succeeded.get()); + } else { + assertTrue(client2Succeeded.get() && client1Succeeded.get()); + } + } + + @Test + public void testHoodieClientMultiWriterAutoCommitNonConflict() throws Exception { + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "100"); + HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + // Timeline-server-based markers are not used for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class) + .build()).withAutoCommit(true).withProperties(properties); + HoodieWriteConfig cfg = writeConfigBuilder.build(); + HoodieWriteConfig cfg2 = writeConfigBuilder.build(); + + // Create the first commit + createCommitWithInserts(cfg, getHoodieWriteClient(cfg), "000", "001", 200, false); + // Start another inflight commit + String newCommitTime1 = "003"; + String newCommitTime2 = "004"; + SparkRDDWriteClient client1 = getHoodieWriteClient(cfg); + SparkRDDWriteClient client2 = getHoodieWriteClient(cfg2); + + List updates1 = dataGen.generateInserts(newCommitTime1, 200); + List updates2 = dataGen.generateInserts(newCommitTime2, 200); + + JavaRDD writeRecords1 = jsc.parallelize(updates1, 1); + JavaRDD writeRecords2 = jsc.parallelize(updates2, 1); + + runConcurrentAndAssert(writeRecords1, writeRecords2, client1, client2, SparkRDDWriteClient::bulkInsert, false); + } + + private void ingestBatch(Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + SparkRDDWriteClient 
writeClient, String commitTime, JavaRDD records, + CountDownLatch countDownLatch) throws IOException, InterruptedException { + writeClient.startCommitWithTime(commitTime); + countDownLatch.countDown(); + countDownLatch.await(); + JavaRDD statusJavaRDD = writeFn.apply(writeClient, records, commitTime); + statusJavaRDD.collect(); + } + + private void createCommitWithInsertsForPartition(HoodieWriteConfig cfg, SparkRDDWriteClient client, + String prevCommitTime, String newCommitTime, int numRecords, + String partition) throws Exception { + JavaRDD result = insertBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, SparkRDDWriteClient::insert, + false, false, numRecords, numRecords, 1, Option.of(partition)); + assertTrue(client.commit(newCommitTime, result), "Commit should succeed"); + } + + private void createCommitWithInserts(HoodieWriteConfig cfg, SparkRDDWriteClient client, + String prevCommitTime, String newCommitTime, int numRecords, + boolean doCommit) throws Exception { + // Finish first base commit + JavaRDD result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, SparkRDDWriteClient::bulkInsert, + false, false, numRecords); + if (doCommit) { + assertTrue(client.commit(newCommitTime, result), "Commit should succeed"); + } + } + + private void createCommitWithUpserts(HoodieWriteConfig cfg, SparkRDDWriteClient client, String prevCommit, + String commitTimeBetweenPrevAndNew, String newCommitTime, int numRecords) + throws Exception { + JavaRDD result = updateBatch(cfg, client, newCommitTime, prevCommit, + Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), "000", numRecords, SparkRDDWriteClient::upsert, false, false, + numRecords, 200, 2); + client.commit(newCommitTime, result); + } + + /** + * Start the commit for an update operation with given number of records + * + * @param writeConfig - Write config + * @param writeClient - Write client for starting the commit + * @param newCommitTime - Commit time for the update + * @param numRecords - Number of records to update + * @return RDD of write status from the update + * @throws Exception + */ + private JavaRDD startCommitForUpdate(HoodieWriteConfig writeConfig, SparkRDDWriteClient writeClient, + String newCommitTime, int numRecords) throws Exception { + // Start the new commit + writeClient.startCommitWithTime(newCommitTime); + + // Prepare update records + final Function2, String, Integer> recordGenFunction = + generateWrapRecordsFn(false, writeConfig, dataGen::generateUniqueUpdates); + final List records = recordGenFunction.apply(newCommitTime, numRecords); + final JavaRDD writeRecords = jsc.parallelize(records, 1); + + // Write updates + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn = SparkRDDWriteClient::upsert; + JavaRDD result = writeFn.apply(writeClient, writeRecords, newCommitTime); + List statuses = result.collect(); + assertNoWriteErrors(statuses); + return result; + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java deleted file mode 100644 index d278b08f3fc14..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java +++ /dev/null @@ -1,1401 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.client; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.common.fs.ConsistencyGuardConfig; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; -import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.common.testutils.RawTripTestPayload; -import org.apache.hudi.common.util.FileIOUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ParquetUtils; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieCommitException; -import org.apache.hudi.exception.HoodieCorruptedDataException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieRollbackException; -import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.HoodieIndex.IndexType; -import org.apache.hudi.io.HoodieMergeHandle; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; -import org.apache.hudi.table.action.commit.SparkWriteHelper; -import org.apache.hudi.testutils.HoodieClientTestBase; -import org.apache.hudi.testutils.HoodieClientTestUtils; -import org.apache.hudi.testutils.HoodieWriteableTestTable; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; -import org.junit.jupiter.params.provider.ValueSource; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import 
java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; - -import static org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion.VERSION_0; -import static org.apache.hudi.common.testutils.FileCreateUtils.getBaseFileCountsForPaths; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.NULL_SCHEMA; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; -import static org.apache.hudi.common.testutils.Transformations.randomSelectAsHoodieKeys; -import static org.apache.hudi.common.testutils.Transformations.recordsToRecordKeySet; -import static org.apache.hudi.common.util.ParquetUtils.readRowKeysFromParquet; -import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -@SuppressWarnings("unchecked") -public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase { - - private static final Logger LOG = LogManager.getLogger(TestHoodieClientOnCopyOnWriteStorage.class); - private HoodieTestTable testTable; - - @BeforeEach - public void setUpTestTable() { - testTable = HoodieWriteableTestTable.of(metaClient); - } - - /** - * Test Auto Commit behavior for HoodieWriteClient insert API. - */ - @Test - public void testAutoCommitOnInsert() throws Exception { - testAutoCommit(SparkRDDWriteClient::insert, false); - } - - /** - * Test Auto Commit behavior for HoodieWriteClient insertPrepped API. - */ - @Test - public void testAutoCommitOnInsertPrepped() throws Exception { - testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true); - } - - /** - * Test Auto Commit behavior for HoodieWriteClient upsert API. - */ - @Test - public void testAutoCommitOnUpsert() throws Exception { - testAutoCommit(SparkRDDWriteClient::upsert, false); - } - - /** - * Test Auto Commit behavior for HoodieWriteClient upsert Prepped API. - */ - @Test - public void testAutoCommitOnUpsertPrepped() throws Exception { - testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true); - } - - /** - * Test Auto Commit behavior for HoodieWriteClient bulk-insert API. - */ - @Test - public void testAutoCommitOnBulkInsert() throws Exception { - testAutoCommit(SparkRDDWriteClient::bulkInsert, false); - } - - /** - * Test Auto Commit behavior for HoodieWriteClient bulk-insert prepped API. - */ - @Test - public void testAutoCommitOnBulkInsertPrepped() throws Exception { - testAutoCommit((writeClient, recordRDD, instantTime) -> writeClient.bulkInsertPreppedRecords(recordRDD, instantTime, - Option.empty()), true); - } - - /** - * Test auto-commit by applying write function. 
- * - * @param writeFn One of HoodieWriteClient Write API - * @throws Exception in case of failure - */ - private void testAutoCommit(Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, - boolean isPrepped) throws Exception { - // Set autoCommit false - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - - String prevCommitTime = "000"; - String newCommitTime = "001"; - int numRecords = 200; - JavaRDD result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, writeFn, - isPrepped, false, numRecords); - - assertFalse(testTable.commitExists(newCommitTime), - "If Autocommit is false, then commit should not be made automatically"); - assertTrue(client.commit(newCommitTime, result), "Commit should succeed"); - assertTrue(testTable.commitExists(newCommitTime), - "After explicit commit, commit file should be created"); - } - } - - /** - * Test De-duplication behavior for HoodieWriteClient insert API. - */ - @Test - public void testDeduplicationOnInsert() throws Exception { - testDeduplication(SparkRDDWriteClient::insert); - } - - /** - * Test De-duplication behavior for HoodieWriteClient bulk-insert API. - */ - @Test - public void testDeduplicationOnBulkInsert() throws Exception { - testDeduplication(SparkRDDWriteClient::bulkInsert); - } - - /** - * Test De-duplication behavior for HoodieWriteClient upsert API. - */ - @Test - public void testDeduplicationOnUpsert() throws Exception { - testDeduplication(SparkRDDWriteClient::upsert); - } - - /** - * Test Deduplication Logic for write function. - * - * @param writeFn One of HoddieWriteClient non-prepped write APIs - * @throws Exception in case of failure - */ - private void testDeduplication( - Function3, SparkRDDWriteClient, JavaRDD, String> writeFn) throws Exception { - String newCommitTime = "001"; - - String recordKey = UUID.randomUUID().toString(); - HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01"); - HoodieRecord recordOne = - new HoodieRecord(keyOne, dataGen.generateRandomValue(keyOne, newCommitTime)); - - HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01"); - HoodieRecord recordTwo = - new HoodieRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); - - // Same key and partition as keyTwo - HoodieRecord recordThree = - new HoodieRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); - - JavaRDD> records = - jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1); - - // Global dedup should be done based on recordKey only - HoodieIndex index = mock(HoodieIndex.class); - when(index.isGlobal()).thenReturn(true); - List> dedupedRecs = SparkWriteHelper.newInstance().deduplicateRecords(records, index, 1).collect(); - assertEquals(1, dedupedRecs.size()); - assertEquals(dedupedRecs.get(0).getPartitionPath(), recordThree.getPartitionPath()); - assertNodupesWithinPartition(dedupedRecs); - - // non-Global dedup should be done based on both recordKey and partitionPath - index = mock(HoodieIndex.class); - when(index.isGlobal()).thenReturn(false); - dedupedRecs = SparkWriteHelper.newInstance().deduplicateRecords(records, index, 1).collect(); - assertEquals(2, dedupedRecs.size()); - assertNodupesWithinPartition(dedupedRecs); - - // Perform write-action and check - JavaRDD recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1); - try (SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder().combineInput(true, true).build(), false);) { - 
client.startCommitWithTime(newCommitTime); - List statuses = writeFn.apply(client, recordList, newCommitTime).collect(); - assertNoWriteErrors(statuses); - assertEquals(2, statuses.size()); - assertNodupesInPartition(statuses.stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) - .collect(Collectors.toList())); - } - } - - /** - * Assert that there is no duplicate key at the partition level. - * - * @param records List of Hoodie records - */ - void assertNodupesInPartition(List records) { - Map> partitionToKeys = new HashMap<>(); - for (HoodieRecord r : records) { - String key = r.getRecordKey(); - String partitionPath = r.getPartitionPath(); - if (!partitionToKeys.containsKey(partitionPath)) { - partitionToKeys.put(partitionPath, new HashSet<>()); - } - assertFalse(partitionToKeys.get(partitionPath).contains(key), "key " + key + " is duplicate within partition " + partitionPath); - partitionToKeys.get(partitionPath).add(key); - } - } - - /** - * Test Upsert API. - */ - @Test - public void testUpserts() throws Exception { - testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsert, false); - } - - /** - * Test UpsertPrepped API. - */ - @Test - public void testUpsertsPrepped() throws Exception { - testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsertPreppedRecords, true); - } - - /** - * Test one of HoodieWriteClient upsert(Prepped) APIs. - * - * @param config Write Config - * @param writeFn One of Hoodie Write Function API - * @throws Exception in case of error - */ - private void testUpsertsInternal(HoodieWriteConfig config, - Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean isPrepped) - throws Exception { - // Force using older timeline layout - HoodieWriteConfig hoodieWriteConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion( - VERSION_0).build(); - HoodieTableMetaClient.initTableType(metaClient.getHadoopConf(), metaClient.getBasePath(), metaClient.getTableType(), - metaClient.getTableConfig().getTableName(), metaClient.getArchivePath(), - metaClient.getTableConfig().getPayloadClass(), VERSION_0); - SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig, false); - - // Write 1 (only inserts) - String newCommitTime = "001"; - String initCommitTime = "000"; - int numRecords = 200; - insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert, - isPrepped, true, numRecords); - - // Write 2 (updates) - String prevCommitTime = newCommitTime; - newCommitTime = "004"; - numRecords = 100; - String commitTimeBetweenPrevAndNew = "002"; - updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, - Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn, isPrepped, true, - numRecords, 200, 2); - - // Delete 1 - prevCommitTime = newCommitTime; - newCommitTime = "005"; - numRecords = 50; - - deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, - initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true, - 0, 150); - - // Now simulate an upgrade and perform a restore operation - HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion( - TimelineLayoutVersion.CURR_VERSION).build(); - client = getHoodieWriteClient(newConfig, false); - client.restoreToInstant("004"); - - // Check the entire dataset has all records still - String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; - for (int i = 0; i < 
fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); - } - assertEquals(200, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), - "Must contain " + 200 + " records"); - - // Perform Delete again on upgraded dataset. - prevCommitTime = newCommitTime; - newCommitTime = "006"; - numRecords = 50; - - deleteBatch(newConfig, client, newCommitTime, prevCommitTime, - initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true, - 0, 150); - - HoodieActiveTimeline activeTimeline = new HoodieActiveTimeline(metaClient, false); - List instants = activeTimeline.getCommitTimeline().getInstants().collect(Collectors.toList()); - assertEquals(5, instants.size()); - assertEquals(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "001"), - instants.get(0)); - assertEquals(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "004"), - instants.get(1)); - // New Format should have all states of instants - assertEquals(new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "006"), - instants.get(2)); - assertEquals(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "006"), - instants.get(3)); - assertEquals(new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "006"), - instants.get(4)); - - final HoodieWriteConfig cfg = hoodieWriteConfig; - final String instantTime = "007"; - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); - String basePathStr = basePath; - HoodieTable table = getHoodieTable(metaClient, cfg); - jsc.parallelize(Arrays.asList(1)).map(e -> { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(metaClient.getActiveTimeline().getInstantDetails( - metaClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get()).get(), - HoodieCommitMetadata.class); - String filePath = commitMetadata.getPartitionToWriteStats().values().stream() - .flatMap(w -> w.stream()).filter(s -> s.getPath().endsWith(".parquet")).findAny() - .map(ee -> ee.getPath()).orElse(null); - String partitionPath = commitMetadata.getPartitionToWriteStats().values().stream() - .flatMap(w -> w.stream()).filter(s -> s.getPath().endsWith(".parquet")).findAny() - .map(ee -> ee.getPartitionPath()).orElse(null); - Path parquetFilePath = new Path(basePathStr, filePath); - HoodieBaseFile baseFile = new HoodieBaseFile(parquetFilePath.toString()); - - try { - HoodieMergeHandle handle = new HoodieMergeHandle(cfg, instantTime, table, new HashMap<>(), - partitionPath, FSUtils.getFileId(parquetFilePath.getName()), baseFile, new SparkTaskContextSupplier()); - WriteStatus writeStatus = new WriteStatus(false, 0.0); - writeStatus.setStat(new HoodieWriteStat()); - writeStatus.getStat().setNumWrites(0); - handle.performMergeDataValidationCheck(writeStatus); - } catch (HoodieCorruptedDataException e1) { - fail("Exception not expected because merge validation check is disabled"); - } - - try { - final String newInstantTime = "006"; - cfg.getProps().setProperty("hoodie.merge.data.validation.enabled", "true"); - HoodieWriteConfig cfg2 = HoodieWriteConfig.newBuilder().withProps(cfg.getProps()).build(); - HoodieMergeHandle handle = new HoodieMergeHandle(cfg2, newInstantTime, table, new HashMap<>(), - partitionPath, FSUtils.getFileId(parquetFilePath.getName()), baseFile, new SparkTaskContextSupplier()); - WriteStatus writeStatus = new 
WriteStatus(false, 0.0); - writeStatus.setStat(new HoodieWriteStat()); - writeStatus.getStat().setNumWrites(0); - handle.performMergeDataValidationCheck(writeStatus); - fail("The above line should have thrown an exception"); - } catch (HoodieCorruptedDataException e2) { - // expected - } - return true; - }).collect(); - } - - /** - * Tesst deletion of records. - */ - @Test - public void testDeletes() throws Exception { - SparkRDDWriteClient client = getHoodieWriteClient(getConfig(), false); - - /** - * Write 1 (inserts and deletes) Write actual 200 insert records and ignore 100 delete records - */ - String initCommitTime = "000"; - String newCommitTime = "001"; - - final List recordsInFirstBatch = new ArrayList<>(); - Function2, String, Integer> recordGenFunction = - (String instantTime, Integer numRecordsInThisCommit) -> { - List fewRecordsForInsert = dataGen.generateInserts(instantTime, 200); - List fewRecordsForDelete = dataGen.generateDeletes(instantTime, 100); - - recordsInFirstBatch.addAll(fewRecordsForInsert); - recordsInFirstBatch.addAll(fewRecordsForDelete); - return recordsInFirstBatch; - }; - writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, - // unused as genFn uses hard-coded number of inserts/updates/deletes - -1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1); - - /** - * Write 2 (deletes+writes). - */ - String prevCommitTime = newCommitTime; - newCommitTime = "004"; - final List recordsInSecondBatch = new ArrayList<>(); - - recordGenFunction = (String instantTime, Integer numRecordsInThisCommit) -> { - List fewRecordsForDelete = recordsInFirstBatch.subList(0, 50); - List fewRecordsForUpdate = recordsInFirstBatch.subList(50, 100); - recordsInSecondBatch.addAll(dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete)); - recordsInSecondBatch.addAll(fewRecordsForUpdate); - return recordsInSecondBatch; - }; - writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, 100, recordGenFunction, - SparkRDDWriteClient::upsert, true, 50, 150, 2); - } - - /** - * When records getting inserted are deleted in the same write batch, hudi should have deleted those records and - * not be available in read path. - * @throws Exception - */ - @Test - public void testDeletesForInsertsInSameBatch() throws Exception { - SparkRDDWriteClient client = getHoodieWriteClient(getConfig(), false); - - /** - * Write 200 inserts and issue deletes to a subset(50) of inserts. - */ - String initCommitTime = "000"; - String newCommitTime = "001"; - - final List recordsInFirstBatch = new ArrayList<>(); - Function2, String, Integer> recordGenFunction = - (String instantTime, Integer numRecordsInThisCommit) -> { - List fewRecordsForInsert = dataGen.generateInserts(instantTime, 200); - List fewRecordsForDelete = fewRecordsForInsert.subList(40, 90); - - recordsInFirstBatch.addAll(fewRecordsForInsert); - recordsInFirstBatch.addAll(dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete)); - return recordsInFirstBatch; - }; - - writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, - -1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1); - } - - /** - * Test update of a record to different partition with Global Index. 
- */ - @ParameterizedTest - @EnumSource(value = IndexType.class, names = {"GLOBAL_BLOOM", "GLOBAL_SIMPLE"}) - public void testUpsertsUpdatePartitionPathGlobalBloom(IndexType indexType) throws Exception { - testUpsertsUpdatePartitionPath(indexType, getConfig(), SparkRDDWriteClient::upsert); - } - - /** - * This test ensures in a global bloom when update partition path is set to true in config, if an incoming record has mismatched partition - * compared to whats in storage, then appropriate actions are taken. i.e. old record is deleted in old partition and new one is inserted - * in the new partition. - * test structure: - * 1. insert 1 batch - * 2. insert 2nd batch with larger no of records so that a new file group is created for partitions - * 3. issue upserts to records from batch 1 with different partition path. This should ensure records from batch 1 are deleted and new - * records are upserted to the new partition - * - * @param indexType index type to be tested for - * @param config instance of {@link HoodieWriteConfig} to use - * @param writeFn write function to be used for testing - */ - private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConfig config, - Function3, SparkRDDWriteClient, JavaRDD, String> writeFn) - throws Exception { - // instantiate client - - HoodieWriteConfig hoodieWriteConfig = getConfigBuilder() - .withProps(config.getProps()) - .withCompactionConfig( - HoodieCompactionConfig.newBuilder().compactionSmallFileSize(10000).build()) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType) - .withBloomIndexUpdatePartitionPath(true) - .withGlobalSimpleIndexUpdatePartitionPath(true) - .build()).withTimelineLayoutVersion(VERSION_0).build(); - HoodieTableMetaClient.initTableType(metaClient.getHadoopConf(), metaClient.getBasePath(), - metaClient.getTableType(), metaClient.getTableConfig().getTableName(), metaClient.getArchivePath(), - metaClient.getTableConfig().getPayloadClass(), VERSION_0); - SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig, false); - - // Write 1 - String newCommitTime = "001"; - int numRecords = 10; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, numRecords); - Set> expectedPartitionPathRecKeyPairs = new HashSet<>(); - // populate expected partition path and record keys - for (HoodieRecord rec : records) { - expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey())); - } - JavaRDD writeRecords = jsc.parallelize(records, 1); - JavaRDD result = writeFn.apply(client, writeRecords, newCommitTime); - result.collect(); - - // Check the entire dataset has all records - String[] fullPartitionPaths = getFullPartitionPaths(); - assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths); - - // verify one basefile per partition - String[] fullExpectedPartitionPaths = getFullPartitionPaths(expectedPartitionPathRecKeyPairs.stream().map(Pair::getLeft).toArray(String[]::new)); - Map baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullExpectedPartitionPaths); - for (Map.Entry entry : baseFileCounts.entrySet()) { - assertEquals(1, entry.getValue()); - } - assertTrue(baseFileCounts.entrySet().stream().allMatch(entry -> entry.getValue() == 1)); - - // Write 2 - newCommitTime = "002"; - numRecords = 20; // so that a new file id is created - client.startCommitWithTime(newCommitTime); - - List recordsSecondBatch = dataGen.generateInserts(newCommitTime, numRecords); - // populate expected 
partition path and record keys - for (HoodieRecord rec : recordsSecondBatch) { - expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey())); - } - writeRecords = jsc.parallelize(recordsSecondBatch, 1); - result = writeFn.apply(client, writeRecords, newCommitTime); - result.collect(); - - // Check the entire dataset has all records - fullPartitionPaths = getFullPartitionPaths(); - assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths); - - // verify that there are more than 1 basefiles per partition - // we can't guarantee randomness in partitions where records are distributed. So, verify atleast one partition has more than 1 basefile. - baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullPartitionPaths); - assertTrue(baseFileCounts.entrySet().stream().filter(entry -> entry.getValue() > 1).count() >= 1, - "At least one partition should have more than 1 base file after 2nd batch of writes"); - - // Write 3 (upserts to records from batch 1 with diff partition path) - newCommitTime = "003"; - - // update to diff partition paths - List recordsToUpsert = new ArrayList<>(); - for (HoodieRecord rec : records) { - // remove older entry from expected partition path record key pairs - expectedPartitionPathRecKeyPairs - .remove(Pair.of(rec.getPartitionPath(), rec.getRecordKey())); - String partitionPath = rec.getPartitionPath(); - String newPartitionPath = null; - if (partitionPath.equalsIgnoreCase(DEFAULT_FIRST_PARTITION_PATH)) { - newPartitionPath = DEFAULT_SECOND_PARTITION_PATH; - } else if (partitionPath.equalsIgnoreCase(DEFAULT_SECOND_PARTITION_PATH)) { - newPartitionPath = DEFAULT_THIRD_PARTITION_PATH; - } else if (partitionPath.equalsIgnoreCase(DEFAULT_THIRD_PARTITION_PATH)) { - newPartitionPath = DEFAULT_FIRST_PARTITION_PATH; - } else { - throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath()); - } - recordsToUpsert.add( - new HoodieRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), - rec.getData())); - // populate expected partition path and record keys - expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey())); - } - - writeRecords = jsc.parallelize(recordsToUpsert, 1); - result = writeFn.apply(client, writeRecords, newCommitTime); - result.collect(); - - // Check the entire dataset has all records - fullPartitionPaths = getFullPartitionPaths(); - assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths); - } - - private void assertPartitionPathRecordKeys(Set> expectedPartitionPathRecKeyPairs, String[] fullPartitionPaths) { - Dataset rows = getAllRows(fullPartitionPaths); - List> actualPartitionPathRecKeyPairs = getActualPartitionPathAndRecordKeys(rows); - // verify all partitionpath, record key matches - assertActualAndExpectedPartitionPathRecordKeyMatches(expectedPartitionPathRecKeyPairs, actualPartitionPathRecKeyPairs); - } - - private List> getActualPartitionPathAndRecordKeys(Dataset rows) { - List> actualPartitionPathRecKeyPairs = new ArrayList<>(); - for (Row row : rows.collectAsList()) { - actualPartitionPathRecKeyPairs - .add(Pair.of(row.getAs("_hoodie_partition_path"), row.getAs("_row_key"))); - } - return actualPartitionPathRecKeyPairs; - } - - private Dataset getAllRows(String[] fullPartitionPaths) { - return HoodieClientTestUtils - .read(jsc, basePath, sqlContext, fs, fullPartitionPaths); - } - - private String[] getFullPartitionPaths() { - return getFullPartitionPaths(dataGen.getPartitionPaths()); - } - - private String[] 
getFullPartitionPaths(String[] relativePartitionPaths) { - String[] fullPartitionPaths = new String[relativePartitionPaths.length]; - for (int i = 0; i < fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = String.format("%s/%s/*", basePath, relativePartitionPaths[i]); - } - return fullPartitionPaths; - } - - private void assertActualAndExpectedPartitionPathRecordKeyMatches(Set> expectedPartitionPathRecKeyPairs, - List> actualPartitionPathRecKeyPairs) { - // verify all partitionpath, record key matches - assertEquals(expectedPartitionPathRecKeyPairs.size(), actualPartitionPathRecKeyPairs.size()); - for (Pair entry : actualPartitionPathRecKeyPairs) { - assertTrue(expectedPartitionPathRecKeyPairs.contains(entry)); - } - - for (Pair entry : expectedPartitionPathRecKeyPairs) { - assertTrue(actualPartitionPathRecKeyPairs.contains(entry)); - } - } - - /** - * Test scenario of new file-group getting added during upsert(). - */ - @Test - public void testSmallInsertHandlingForUpserts() throws Exception { - final String testPartitionPath = "2016/09/26"; - final int insertSplitLimit = 100; - // setup the small file handling params - HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); - - SparkRDDWriteClient client = getHoodieWriteClient(config, false); - - // Inserts => will write file1 - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - List inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb - Set keys1 = recordsToRecordKeySet(inserts1); - - JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); - List statuses = client.upsert(insertRecordsRDD1, commitTime1).collect(); - - assertNoWriteErrors(statuses); - - assertEquals(1, statuses.size(), "Just 1 file needs to be added."); - String file1 = statuses.get(0).getFileId(); - assertEquals(100, - readRowKeysFromParquet(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())) - .size(), "file should contain 100 records"); - - // Update + Inserts such that they just expand file1 - String commitTime2 = "002"; - client.startCommitWithTime(commitTime2); - List inserts2 = dataGen.generateInserts(commitTime2, 40); - Set keys2 = recordsToRecordKeySet(inserts2); - List insertsAndUpdates2 = new ArrayList<>(); - insertsAndUpdates2.addAll(inserts2); - insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1)); - - JavaRDD insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1); - statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect(); - assertNoWriteErrors(statuses); - - assertEquals(1, statuses.size(), "Just 1 file needs to be updated."); - assertEquals(file1, statuses.get(0).getFileId(), "Existing file should be expanded"); - assertEquals(commitTime1, statuses.get(0).getStat().getPrevCommit(), "Existing file should be expanded"); - Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); - assertEquals(140, readRowKeysFromParquet(hadoopConf, newFile).size(), - "file should contain 140 records"); - - List records = ParquetUtils.readAvroRecords(hadoopConf, newFile); - for (GenericRecord record : records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - assertEquals(commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit2"); - assertTrue(keys2.contains(recordKey) || keys1.contains(recordKey), "key expected to be part of commit2"); - } - 
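The assertions above (and several later tests in this removed file) repeat one pattern: read the raw Avro rows back out of a Parquet base file and compare the `_hoodie_record_key` / `_hoodie_commit_time` metadata columns against the expected batch. A minimal standalone sketch of that pattern, using the same `ParquetUtils.readAvroRecords` helper and `HoodieRecord` metadata field constants seen in the diff (the class and method names `InspectBaseFile` / `dumpMetadataColumns` are illustrative, not part of this change):

import java.util.List;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.ParquetUtils;

public class InspectBaseFile {
  // Prints the record key and commit time that Hudi stamps into every row of a base file.
  public static void dumpMetadataColumns(Configuration hadoopConf, Path baseFilePath) {
    List<GenericRecord> rows = ParquetUtils.readAvroRecords(hadoopConf, baseFilePath);
    for (GenericRecord row : rows) {
      String recordKey = row.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
      String commitTime = row.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
      System.out.println(recordKey + " was last written at " + commitTime);
    }
  }
}

Because every write rewrites the commit-time column of the touched rows, checking that column is how these tests distinguish records merged in the latest commit from untouched ones.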
- // update + inserts such that file1 is updated and expanded, a new file2 is created. - String commitTime3 = "003"; - client.startCommitWithTime(commitTime3); - List insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200); - Set keys3 = recordsToRecordKeySet(insertsAndUpdates3); - List updates3 = dataGen.generateUpdates(commitTime3, inserts2); - insertsAndUpdates3.addAll(updates3); - - JavaRDD insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1); - statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect(); - assertNoWriteErrors(statuses); - - assertEquals(2, statuses.size(), "2 files needs to be committed."); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(hadoopConf, basePath); - - HoodieTable table = getHoodieTable(metadata, config); - BaseFileOnlyView fileSystemView = table.getBaseFileOnlyView(); - List files = - fileSystemView.getLatestBaseFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList()); - int numTotalInsertsInCommit3 = 0; - int numTotalUpdatesInCommit3 = 0; - for (HoodieBaseFile file : files) { - if (file.getFileName().contains(file1)) { - assertEquals(commitTime3, file.getCommitTime(), "Existing file should be expanded"); - records = ParquetUtils.readAvroRecords(hadoopConf, new Path(file.getPath())); - for (GenericRecord record : records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); - if (recordCommitTime.equals(commitTime3)) { - if (keys2.contains(recordKey)) { - keys2.remove(recordKey); - numTotalUpdatesInCommit3++; - } else { - numTotalInsertsInCommit3++; - } - } - } - assertEquals(0, keys2.size(), "All keys added in commit 2 must be updated in commit3 correctly"); - } else { - assertEquals(commitTime3, file.getCommitTime(), "New file must be written for commit 3"); - records = ParquetUtils.readAvroRecords(hadoopConf, new Path(file.getPath())); - for (GenericRecord record : records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - assertEquals(commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), - "only expect commit3"); - assertTrue(keys3.contains(recordKey), "key expected to be part of commit3"); - } - numTotalInsertsInCommit3 += records.size(); - } - } - assertEquals(numTotalUpdatesInCommit3, inserts2.size(), "Total updates in commit3 must add up"); - assertEquals(numTotalInsertsInCommit3, keys3.size(), "Total inserts in commit3 must add up"); - } - - /** - * Test scenario of new file-group getting added during insert(). 
- */ - @Test - public void testSmallInsertHandlingForInserts() throws Exception { - - final String testPartitionPath = "2016/09/26"; - final int insertSplitLimit = 100; - // setup the small file handling params - HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); - SparkRDDWriteClient client = getHoodieWriteClient(config, false); - - // Inserts => will write file1 - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - List inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb - Set keys1 = recordsToRecordKeySet(inserts1); - JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); - List statuses = client.insert(insertRecordsRDD1, commitTime1).collect(); - - assertNoWriteErrors(statuses); - assertPartitionMetadata(new String[] {testPartitionPath}, fs); - - assertEquals(1, statuses.size(), "Just 1 file needs to be added."); - String file1 = statuses.get(0).getFileId(); - assertEquals(100, - readRowKeysFromParquet(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())) - .size(), "file should contain 100 records"); - - // Second, set of Inserts should just expand file1 - String commitTime2 = "002"; - client.startCommitWithTime(commitTime2); - List inserts2 = dataGen.generateInserts(commitTime2, 40); - Set keys2 = recordsToRecordKeySet(inserts2); - JavaRDD insertRecordsRDD2 = jsc.parallelize(inserts2, 1); - statuses = client.insert(insertRecordsRDD2, commitTime2).collect(); - assertNoWriteErrors(statuses); - - assertEquals(1, statuses.size(), "Just 1 file needs to be updated."); - assertEquals(file1, statuses.get(0).getFileId(), "Existing file should be expanded"); - assertEquals(commitTime1, statuses.get(0).getStat().getPrevCommit(), "Existing file should be expanded"); - Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); - assertEquals(140, readRowKeysFromParquet(hadoopConf, newFile).size(), - "file should contain 140 records"); - - List records = ParquetUtils.readAvroRecords(hadoopConf, newFile); - for (GenericRecord record : records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); - assertTrue(commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime), - "Record expected to be part of commit 1 or commit2"); - assertTrue(keys2.contains(recordKey) || keys1.contains(recordKey), - "key expected to be part of commit 1 or commit2"); - } - - // Lots of inserts such that file1 is updated and expanded, a new file2 is created. 
- String commitTime3 = "003"; - client.startCommitWithTime(commitTime3); - List insert3 = dataGen.generateInserts(commitTime3, 200); - JavaRDD insertRecordsRDD3 = jsc.parallelize(insert3, 1); - statuses = client.insert(insertRecordsRDD3, commitTime3).collect(); - assertNoWriteErrors(statuses); - assertEquals(2, statuses.size(), "2 files needs to be committed."); - - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); - HoodieTable table = getHoodieTable(metaClient, config); - List files = table.getBaseFileOnlyView() - .getLatestBaseFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList()); - assertEquals(2, files.size(), "Total of 2 valid data files"); - - int totalInserts = 0; - for (HoodieBaseFile file : files) { - assertEquals(commitTime3, file.getCommitTime(), "All files must be at commit 3"); - records = ParquetUtils.readAvroRecords(hadoopConf, new Path(file.getPath())); - totalInserts += records.size(); - } - assertEquals(totalInserts, inserts1.size() + inserts2.size() + insert3.size(), - "Total number of records must add up"); - } - - /** - * Test delete with delete api. - */ - @Test - public void testDeletesWithDeleteApi() throws Exception { - final String testPartitionPath = "2016/09/26"; - final int insertSplitLimit = 100; - // setup the small file handling params - HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); - - SparkRDDWriteClient client = getHoodieWriteClient(config, false); - - // Inserts => will write file1 - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - List inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb - Set keys1 = recordsToRecordKeySet(inserts1); - List keysSoFar = new ArrayList<>(keys1); - JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); - List statuses = client.upsert(insertRecordsRDD1, commitTime1).collect(); - - assertNoWriteErrors(statuses); - - assertEquals(1, statuses.size(), "Just 1 file needs to be added."); - String file1 = statuses.get(0).getFileId(); - assertEquals(100, - readRowKeysFromParquet(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())) - .size(), "file should contain 100 records"); - - // Delete 20 among 100 inserted - testDeletes(client, inserts1, 20, file1, "002", 80, keysSoFar); - - // Insert and update 40 records - Pair, List> updateBatch2 = testUpdates("003", client, 40, 120); - keysSoFar.addAll(updateBatch2.getLeft()); - - // Delete 10 records among 40 updated - testDeletes(client, updateBatch2.getRight(), 10, file1, "004", 110, keysSoFar); - - // do another batch of updates - Pair, List> updateBatch3 = testUpdates("005", client, 40, 150); - keysSoFar.addAll(updateBatch3.getLeft()); - - // delete non existent keys - String commitTime6 = "006"; - client.startCommitWithTime(commitTime6); - - List dummyInserts3 = dataGen.generateInserts(commitTime6, 20); - List hoodieKeysToDelete3 = randomSelectAsHoodieKeys(dummyInserts3, 20); - JavaRDD deleteKeys3 = jsc.parallelize(hoodieKeysToDelete3, 1); - statuses = client.delete(deleteKeys3, commitTime6).collect(); - assertNoWriteErrors(statuses); - assertEquals(0, statuses.size(), "Just 0 write status for delete."); - - // Check the entire dataset has all records still - String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; - for (int i = 0; i < fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = 
String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); - } - assertEquals(150, - HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), - "Must contain " + 150 + " records"); - - // delete another batch. previous delete commit should have persisted the schema. If not, - // this will throw exception - testDeletes(client, updateBatch3.getRight(), 10, file1, "007", 140, keysSoFar); - } - - /** - * Test scenario of writing more file groups than existing number of file groups in partition. - */ - @Test - public void testInsertOverwritePartitionHandlingWithMoreRecords() throws Exception { - verifyInsertOverwritePartitionHandling(1000, 3000); - } - - /** - * Test scenario of writing fewer file groups than existing number of file groups in partition. - */ - @Test - public void testInsertOverwritePartitionHandlingWithFewerRecords() throws Exception { - verifyInsertOverwritePartitionHandling(3000, 1000); - } - - /** - * Test scenario of writing similar number file groups in partition. - */ - @Test - public void testInsertOverwritePartitionHandlinWithSimilarNumberOfRecords() throws Exception { - verifyInsertOverwritePartitionHandling(3000, 3000); - } - - /** - * 1) Do write1 (upsert) with 'batch1RecordsCount' number of records. - * 2) Do write2 (insert overwrite) with 'batch2RecordsCount' number of records. - * - * Verify that all records in step1 are overwritten - */ - private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount) throws Exception { - final String testPartitionPath = "americas"; - HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false); - SparkRDDWriteClient client = getHoodieWriteClient(config, false); - dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); - - // Do Inserts - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - List inserts1 = dataGen.generateInserts(commitTime1, batch1RecordsCount); - JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 2); - List statuses = client.upsert(insertRecordsRDD1, commitTime1).collect(); - assertNoWriteErrors(statuses); - Set batch1Buckets = statuses.stream().map(s -> s.getFileId()).collect(Collectors.toSet()); - verifyRecordsWritten(commitTime1, inserts1, statuses); - - // Do Insert Overwrite - String commitTime2 = "002"; - client.startCommitWithTime(commitTime2, HoodieTimeline.REPLACE_COMMIT_ACTION); - List inserts2 = dataGen.generateInserts(commitTime2, batch2RecordsCount); - List insertsAndUpdates2 = new ArrayList<>(); - insertsAndUpdates2.addAll(inserts2); - JavaRDD insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 2); - HoodieWriteResult writeResult = client.insertOverwrite(insertAndUpdatesRDD2, commitTime2); - statuses = writeResult.getWriteStatuses().collect(); - assertNoWriteErrors(statuses); - - assertEquals(batch1Buckets, new HashSet<>(writeResult.getPartitionToReplaceFileIds().get(testPartitionPath))); - verifyRecordsWritten(commitTime2, inserts2, statuses); - } - - /** - * Verify data in parquet files matches expected records and commit time. 
- */ - private void verifyRecordsWritten(String commitTime, List expectedRecords, List allStatus) { - List records = new ArrayList<>(); - for (WriteStatus status : allStatus) { - Path filePath = new Path(basePath, status.getStat().getPath()); - records.addAll(ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), filePath)); - } - - Set expectedKeys = recordsToRecordKeySet(expectedRecords); - assertEquals(records.size(), expectedKeys.size()); - for (GenericRecord record : records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - assertEquals(commitTime, - record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); - assertTrue(expectedKeys.contains(recordKey)); - } - } - - private Pair, List> testUpdates(String instantTime, SparkRDDWriteClient client, - int sizeToInsertAndUpdate, int expectedTotalRecords) - throws IOException { - client.startCommitWithTime(instantTime); - List inserts = dataGen.generateInserts(instantTime, sizeToInsertAndUpdate); - Set keys = recordsToRecordKeySet(inserts); - List insertsAndUpdates = new ArrayList<>(); - insertsAndUpdates.addAll(inserts); - insertsAndUpdates.addAll(dataGen.generateUpdates(instantTime, inserts)); - - JavaRDD insertAndUpdatesRDD = jsc.parallelize(insertsAndUpdates, 1); - List statuses = client.upsert(insertAndUpdatesRDD, instantTime).collect(); - assertNoWriteErrors(statuses); - - // Check the entire dataset has all records still - String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; - for (int i = 0; i < fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); - } - assertEquals(expectedTotalRecords, - HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), - "Must contain " + expectedTotalRecords + " records"); - return Pair.of(keys, inserts); - } - - private void testDeletes(SparkRDDWriteClient client, List previousRecords, int sizeToDelete, - String existingFile, String instantTime, int exepctedRecords, List keys) { - client.startCommitWithTime(instantTime); - - List hoodieKeysToDelete = randomSelectAsHoodieKeys(previousRecords, sizeToDelete); - JavaRDD deleteKeys = jsc.parallelize(hoodieKeysToDelete, 1); - List statuses = client.delete(deleteKeys, instantTime).collect(); - - assertNoWriteErrors(statuses); - - assertEquals(1, statuses.size(), "Just 1 file needs to be added."); - assertEquals(existingFile, statuses.get(0).getFileId(), "Existing file should be expanded"); - - // Check the entire dataset has all records still - String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; - for (int i = 0; i < fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); - } - assertEquals(exepctedRecords, - HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), - "Must contain " + exepctedRecords + " records"); - - Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); - assertEquals(exepctedRecords, - readRowKeysFromParquet(hadoopConf, newFile).size(), - "file should contain 110 records"); - - List records = ParquetUtils.readAvroRecords(hadoopConf, newFile); - for (GenericRecord record : records) { - String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - assertTrue(keys.contains(recordKey), "key expected to be part of " + instantTime); - assertFalse(hoodieKeysToDelete.contains(recordKey), "Key deleted"); - } - } 
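The delete tests above all reduce to the same client flow: start a commit, hand the write client an RDD of `HoodieKey`s, and inspect the returned `WriteStatus`es. A minimal sketch of that flow outside the test harness, assuming a Hudi table already exists at `basePath` and a live `JavaSparkContext`; the class and method names (`DeleteByKeyExample`, `deleteKeys`) are illustrative, the config shown is the bare minimum (real setups would also configure schema, index, and table type), and import packages should be checked against the Hudi version in this diff:

import java.util.List;

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class DeleteByKeyExample {
  // Issues a hard delete for the given keys against an existing Hudi table.
  public static void deleteKeys(JavaSparkContext jsc, String basePath, String tableName,
                                List<HoodieKey> keysToDelete, String instantTime) {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .forTable(tableName)
        .build();
    try (SparkRDDWriteClient client = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config)) {
      client.startCommitWithTime(instantTime);
      JavaRDD<HoodieKey> keyRdd = jsc.parallelize(keysToDelete, 1);
      List<WriteStatus> statuses = client.delete(keyRdd, instantTime).collect();
      // Each WriteStatus carries per-file error info; no errors means the delete landed.
      for (WriteStatus status : statuses) {
        if (status.hasErrors()) {
          throw new IllegalStateException("Delete failed for file " + status.getFileId());
        }
      }
    }
  }
}

Note that, as the removed test asserts, deleting keys that do not exist in the table produces no write statuses at all rather than an error.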
- - /** - * Test delete with delete api. - */ - @Test - public void testDeletesWithoutInserts() { - final String testPartitionPath = "2016/09/26"; - final int insertSplitLimit = 100; - // setup the small file handling params - HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, true); // hold upto 200 records max - dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); - - SparkRDDWriteClient client = getHoodieWriteClient(config, false); - - // delete non existent keys - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - - List dummyInserts = dataGen.generateInserts(commitTime1, 20); - List hoodieKeysToDelete = randomSelectAsHoodieKeys(dummyInserts, 20); - JavaRDD deleteKeys = jsc.parallelize(hoodieKeysToDelete, 1); - assertThrows(HoodieIOException.class, () -> { - client.delete(deleteKeys, commitTime1).collect(); - }, "Should have thrown Exception"); - } - - /** - * Test to ensure commit metadata points to valid files. - */ - @Test - public void testCommitWritesRelativePaths() throws Exception { - - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); - HoodieSparkTable table = HoodieSparkTable.create(cfg, context, metaClient); - - String instantTime = "000"; - client.startCommitWithTime(instantTime); - - List records = dataGen.generateInserts(instantTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - JavaRDD result = client.bulkInsert(writeRecords, instantTime); - - assertTrue(client.commit(instantTime, result), "Commit should succeed"); - assertTrue(testTable.commitExists(instantTime), - "After explicit commit, commit file should be created"); - - // Get parquet file paths from commit metadata - String actionType = metaClient.getCommitActionType(); - HoodieInstant commitInstant = new HoodieInstant(false, actionType, instantTime); - HoodieTimeline commitTimeline = metaClient.getCommitTimeline().filterCompletedInstants(); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(commitInstant).get(), HoodieCommitMetadata.class); - String basePath = table.getMetaClient().getBasePath(); - Collection commitPathNames = commitMetadata.getFileIdAndFullPaths(basePath).values(); - - // Read from commit file - try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime))) { - String everything = FileIOUtils.readAsUTFString(inputStream); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class); - HashMap paths = metadata.getFileIdAndFullPaths(basePath); - // Compare values in both to make sure they are equal. - for (String pathName : paths.values()) { - assertTrue(commitPathNames.contains(pathName)); - } - } - } - } - - /** - * Test to ensure commit metadata points to valid files.10. 
- */ - @Test - public void testMetadataStatsOnCommit() throws Exception { - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); - SparkRDDWriteClient client = getHoodieWriteClient(cfg); - - String instantTime0 = "000"; - client.startCommitWithTime(instantTime0); - - List records0 = dataGen.generateInserts(instantTime0, 200); - JavaRDD writeRecords0 = jsc.parallelize(records0, 1); - JavaRDD result0 = client.bulkInsert(writeRecords0, instantTime0); - - assertTrue(client.commit(instantTime0, result0), "Commit should succeed"); - assertTrue(testTable.commitExists(instantTime0), - "After explicit commit, commit file should be created"); - - // Read from commit file - try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime0))) { - String everything = FileIOUtils.readAsUTFString(inputStream); - HoodieCommitMetadata metadata = - HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class); - int inserts = 0; - for (Map.Entry> pstat : metadata.getPartitionToWriteStats().entrySet()) { - for (HoodieWriteStat stat : pstat.getValue()) { - inserts += stat.getNumInserts(); - } - } - assertEquals(200, inserts); - } - - // Update + Inserts such that they just expand file1 - String instantTime1 = "001"; - client.startCommitWithTime(instantTime1); - - List records1 = dataGen.generateUpdates(instantTime1, records0); - JavaRDD writeRecords1 = jsc.parallelize(records1, 1); - JavaRDD result1 = client.upsert(writeRecords1, instantTime1); - - assertTrue(client.commit(instantTime1, result1), "Commit should succeed"); - assertTrue(testTable.commitExists(instantTime1), - "After explicit commit, commit file should be created"); - - // Read from commit file - try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime1))) { - String everything = FileIOUtils.readAsUTFString(inputStream); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class); - int inserts = 0; - int upserts = 0; - for (Map.Entry> pstat : metadata.getPartitionToWriteStats().entrySet()) { - for (HoodieWriteStat stat : pstat.getValue()) { - inserts += stat.getNumInserts(); - upserts += stat.getNumUpdateWrites(); - } - } - assertEquals(0, inserts); - assertEquals(200, upserts); - } - } - - /** - * Tests behavior of committing only when consistency is verified. 
- */ - @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testConsistencyCheckDuringFinalize(boolean enableOptimisticConsistencyGuard) throws Exception { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); - String instantTime = "000"; - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() - .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build(); - SparkRDDWriteClient client = getHoodieWriteClient(cfg); - Pair> result = testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard); - - // Delete orphan marker and commit should succeed - metaClient.getFs().delete(result.getKey(), false); - if (!enableOptimisticConsistencyGuard) { - assertTrue(client.commit(instantTime, result.getRight()), "Commit should succeed"); - assertTrue(testTable.commitExists(instantTime), - "After explicit commit, commit file should be created"); - // Marker directory must be removed - assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); - } else { - // with optimistic, first client.commit should have succeeded. - assertTrue(testTable.commitExists(instantTime), - "After explicit commit, commit file should be created"); - // Marker directory must be removed - assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); - } - } - - private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard) throws Exception { - String instantTime = "000"; - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); - HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false) - .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) - .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build() : - getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false) - .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() - .withConsistencyCheckEnabled(true) - .withOptimisticConsistencyGuardSleepTimeMs(1).build()).build(); - SparkRDDWriteClient client = getHoodieWriteClient(cfg); - testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard); - - if (!enableOptimisticConsistencyGuard) { - // Rollback of this commit should succeed with FailSafeCG - client.rollback(instantTime); - assertFalse(testTable.commitExists(instantTime), - "After explicit rollback, commit file should not be present"); - // Marker directory must be removed after rollback - assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); - } else { - // if optimistic CG is enabled, commit should have succeeded. - assertTrue(testTable.commitExists(instantTime), - "With optimistic CG, first commit should succeed. commit file should be present"); - // Marker directory must be removed after rollback - assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); - if (rollbackUsingMarkers) { - // rollback of a completed commit should fail if marked based rollback is used. 
- try { - client.rollback(instantTime); - fail("Rollback of completed commit should throw exception"); - } catch (HoodieRollbackException e) { - // ignore - } - } else { - // rollback of a completed commit should succeed if using list based rollback - client.rollback(instantTime); - assertFalse(testTable.commitExists(instantTime), - "After explicit rollback, commit file should not be present"); - } - } - } - - @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard) throws Exception { - testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard); - } - - @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard) throws Exception { - testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard); - } - - private Pair> testConsistencyCheck(HoodieTableMetaClient metaClient, String instantTime, boolean enableOptimisticConsistencyGuard) - throws Exception { - HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? (getConfigBuilder().withAutoCommit(false) - .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) - .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()) - .build()) : (getConfigBuilder().withAutoCommit(false) - .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) - .withOptimisticConsistencyGuardSleepTimeMs(1).build()) - .build()); - SparkRDDWriteClient client = getHoodieWriteClient(cfg); - - client.startCommitWithTime(instantTime); - JavaRDD writeRecords = jsc.parallelize(dataGen.generateInserts(instantTime, 200), 1); - JavaRDD result = client.bulkInsert(writeRecords, instantTime); - result.collect(); - - // Create a dummy marker file to simulate the case that a marker file was created without data file. 
- // This should fail the commit - String partitionPath = Arrays - .stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", metaClient.getMarkerFolderPath(instantTime))), - path -> path.toString().contains(HoodieTableMetaClient.MARKER_EXTN))) - .limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0); - - Path markerFilePath = new MarkerFiles(fs, basePath, metaClient.getMarkerFolderPath(instantTime), instantTime) - .create(partitionPath, - FSUtils.makeDataFileName(instantTime, "1-0-1", UUID.randomUUID().toString()), - IOType.MERGE); - LOG.info("Created a dummy marker path=" + markerFilePath); - - if (!enableOptimisticConsistencyGuard) { - Exception e = assertThrows(HoodieCommitException.class, () -> { - client.commit(instantTime, result); - }, "Commit should fail due to consistency check"); - assertTrue(e.getCause() instanceof HoodieIOException); - } else { - // with optimistic CG, commit should succeed - client.commit(instantTime, result); - } - return Pair.of(markerFilePath, result); - } - - @Test - public void testMultiOperationsPerCommit() throws IOException { - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false) - .withAllowMultiWriteOnSameInstant(true) - .build(); - SparkRDDWriteClient client = getHoodieWriteClient(cfg); - String firstInstantTime = "0000"; - client.startCommitWithTime(firstInstantTime); - int numRecords = 200; - JavaRDD writeRecords = jsc.parallelize(dataGen.generateInserts(firstInstantTime, numRecords), 1); - JavaRDD result = client.bulkInsert(writeRecords, firstInstantTime); - assertTrue(client.commit(firstInstantTime, result), "Commit should succeed"); - assertTrue(testTable.commitExists(firstInstantTime), - "After explicit commit, commit file should be created"); - - // Check the entire dataset has all records still - String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; - for (int i = 0; i < fullPartitionPaths.length; i++) { - fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); - } - assertEquals(numRecords, - HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), - "Must contain " + numRecords + " records"); - - String nextInstantTime = "0001"; - client.startCommitWithTime(nextInstantTime); - JavaRDD updateRecords = jsc.parallelize(dataGen.generateUpdates(nextInstantTime, numRecords), 1); - JavaRDD insertRecords = jsc.parallelize(dataGen.generateInserts(nextInstantTime, numRecords), 1); - JavaRDD inserts = client.bulkInsert(insertRecords, nextInstantTime); - JavaRDD upserts = client.upsert(updateRecords, nextInstantTime); - assertTrue(client.commit(nextInstantTime, inserts.union(upserts)), "Commit should succeed"); - assertTrue(testTable.commitExists(firstInstantTime), - "After explicit commit, commit file should be created"); - int totalRecords = 2 * numRecords; - assertEquals(totalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), - "Must contain " + totalRecords + " records"); - } - - /** - * Build Hoodie Write Config for small data file sizes. - */ - private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize) { - return getSmallInsertWriteConfig(insertSplitSize, false); - } - - /** - * Build Hoodie Write Config for small data file sizes. - */ - private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, boolean useNullSchema) { - HoodieWriteConfig.Builder builder = getConfigBuilder(useNullSchema ? 
NULL_SCHEMA : TRIP_EXAMPLE_SCHEMA); - return builder - .withCompactionConfig( - HoodieCompactionConfig.newBuilder() - .compactionSmallFileSize(dataGen.getEstimatedFileSizeInBytes(150)) - .insertSplitSize(insertSplitSize).build()) - .withStorageConfig( - HoodieStorageConfig.newBuilder() - .hfileMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)) - .parquetMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)).build()) - .build(); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java index 35ee557ceae88..bc1d6e03c04df 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java @@ -18,10 +18,12 @@ package org.apache.hudi.client; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.spark.api.java.JavaPairRDD; @@ -31,6 +33,7 @@ import org.apache.spark.sql.Row; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -41,12 +44,23 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; -@SuppressWarnings("unchecked") /** * Test-cases for covering HoodieReadClient APIs */ +@SuppressWarnings("unchecked") public class TestHoodieReadClient extends HoodieClientTestBase { + @Override + protected void initPath() { + try { + java.nio.file.Path basePath = tempDir.resolve("dataset"); + java.nio.file.Files.createDirectories(basePath); + this.basePath = basePath.toUri().toString(); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + } + /** * Test ReadFilter API after writing new records using HoodieWriteClient.insert. */ @@ -84,7 +98,7 @@ public void testReadFilterExistAfterBulkInsertPrepped() throws Exception { @Test public void testReadROViewFailsWithoutSqlContext() { - HoodieReadClient readClient = new HoodieReadClient(context, getConfig()); + SparkRDDReadClient readClient = new SparkRDDReadClient(context, getConfig()); JavaRDD recordsRDD = jsc.parallelize(new ArrayList<>(), 1); assertThrows(IllegalStateException.class, () -> { readClient.readROView(recordsRDD, 1); @@ -102,7 +116,7 @@ public void testReadROViewFailsWithoutSqlContext() { private void testReadFilterExist(HoodieWriteConfig config, Function3, SparkRDDWriteClient, JavaRDD, String> writeFn) throws Exception { try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { - HoodieReadClient readClient = getHoodieReadClient(config.getBasePath()); + SparkRDDReadClient readClient = getHoodieReadClient(config.getBasePath()); String newCommitTime = writeClient.startCommit(); List records = dataGen.generateInserts(newCommitTime, 100); JavaRDD recordsRDD = jsc.parallelize(records, 1); @@ -113,12 +127,12 @@ private void testReadFilterExist(HoodieWriteConfig config, assertEquals(100, filteredRDD.collect().size()); JavaRDD smallRecordsRDD = jsc.parallelize(records.subList(0, 75), 1); - // We create three parquet file, each having one record. 
(3 different partitions) + // We create three base file, each having one record. (3 different partitions) List statuses = writeFn.apply(writeClient, smallRecordsRDD, newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); - HoodieReadClient anotherReadClient = getHoodieReadClient(config.getBasePath()); + SparkRDDReadClient anotherReadClient = getHoodieReadClient(config.getBasePath()); filteredRDD = anotherReadClient.filterExists(recordsRDD); List result = filteredRDD.collect(); // Check results @@ -209,9 +223,9 @@ private void testTagLocation(HoodieWriteConfig hoodieWriteConfig, // since they have been modified in the DAG JavaRDD recordRDD = jsc.parallelize(result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) - .map(record -> new HoodieRecord(record.getKey(), null)).collect(Collectors.toList())); + .map(record -> new HoodieAvroRecord(record.getKey(), null)).collect(Collectors.toList())); // Should have 100 records in table (check using Index), all in locations marked at commit - HoodieReadClient readClient = getHoodieReadClient(hoodieWriteConfig.getBasePath()); + SparkRDDReadClient readClient = getHoodieReadClient(hoodieWriteConfig.getBasePath()); List taggedRecords = readClient.tagLocation(recordRDD).collect(); checkTaggedRecords(taggedRecords, newCommitTime); @@ -225,7 +239,7 @@ private void testTagLocation(HoodieWriteConfig hoodieWriteConfig, numRecords, 200, 2); recordRDD = jsc.parallelize(result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) - .map(record -> new HoodieRecord(record.getKey(), null)).collect(Collectors.toList())); + .map(record -> new HoodieAvroRecord(record.getKey(), null)).collect(Collectors.toList())); // Index should be able to locate all updates in correct locations. 
readClient = getHoodieReadClient(hoodieWriteConfig.getBasePath()); taggedRecords = readClient.tagLocation(recordRDD).collect(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java index 34daed76f1ba8..df0fed027cec1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java @@ -30,6 +30,7 @@ import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieClientTestUtils; @@ -69,19 +70,32 @@ public void tearDown() throws Exception { protected HoodieWriteConfig getHoodieWriteConfig(String basePath) { return HoodieWriteConfig.newBuilder().withPath(basePath).withEmbeddedTimelineServerEnabled(true) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); } @Test public void readLocalWriteHDFS() throws Exception { // Initialize table and filesystem - HoodieTableMetaClient.initTableType(hadoopConf, dfsBasePath, HoodieTableType.valueOf(tableType), - tableName, HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(tableName) + .setPayloadClass(HoodieAvroPayload.class) + .initTable(hadoopConf, dfsBasePath); // Create write client to write some records in HoodieWriteConfig cfg = getHoodieWriteConfig(dfsBasePath); HoodieWriteConfig localConfig = getHoodieWriteConfig(tablePath); + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(tableName) + .setPayloadClass(HoodieAvroPayload.class) + .setRecordKeyFields(localConfig.getProps().getProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())) + .setPartitionFields(localConfig.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) + .initTable(hadoopConf, tablePath); + + try (SparkRDDWriteClient hdfsWriteClient = getHoodieWriteClient(cfg); SparkRDDWriteClient localWriteClient = getHoodieWriteClient(localConfig)) { @@ -94,14 +108,17 @@ public void readLocalWriteHDFS() throws Exception { // Read from hdfs FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), dfsBasePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(dfsBasePath).build(); HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); Dataset readRecords = HoodieClientTestUtils.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime); assertEquals(readRecords.count(), records.size(), "Should contain 100 records"); // Write to local - HoodieTableMetaClient.initTableType(hadoopConf, tablePath, HoodieTableType.valueOf(tableType), - tableName, HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(tableName) + .setPayloadClass(HoodieAvroPayload.class) + 
.initTable(hadoopConf, tablePath); String writeCommitTime = localWriteClient.startCommit(); LOG.info("Starting write commit " + writeCommitTime); @@ -112,7 +129,7 @@ public void readLocalWriteHDFS() throws Exception { LOG.info("Reading from path: " + tablePath); fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf()); - metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath); + metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).build(); timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); Dataset localReadRecords = HoodieClientTestUtils.readCommit(tablePath, sqlContext, timeline, writeCommitTime); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java index 8ee0c163409a5..1cb7bcbfc4fcb 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java @@ -19,6 +19,7 @@ package org.apache.hudi.client; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -67,6 +68,10 @@ public class TestTableSchemaEvolution extends HoodieClientTestBase { public static final String EXTRA_FIELD_SCHEMA = "{\"name\": \"new_field\", \"type\": \"boolean\", \"default\": false},"; + public static final String EXTRA_FIELD_WITHOUT_DEFAULT_SCHEMA = + "{\"name\": \"new_field_without_default\", \"type\": \"boolean\"},"; + public static final String EXTRA_FIELD_NULLABLE_SCHEMA = + ",{\"name\": \"new_field_without_default\", \"type\": [\"boolean\", \"null\"]}"; // TRIP_EXAMPLE_SCHEMA with a new_field added public static final String TRIP_EXAMPLE_SCHEMA_EVOLVED = TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA @@ -107,10 +112,33 @@ public void testSchemaCompatibilityBasic() throws Exception { assertFalse(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, swappedFieldSchema), "Swapped fields are not compatible"); - String typeChangeSchema = TRIP_SCHEMA_PREFIX + MAP_TYPE_SCHEMA + FARE_NESTED_SCHEMA + String typeChangeSchemaDisallowed = TRIP_SCHEMA_PREFIX + MAP_TYPE_SCHEMA + FARE_NESTED_SCHEMA + TIP_NESTED_SCHEMA.replace("string", "boolean") + TRIP_SCHEMA_SUFFIX; - assertFalse(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, typeChangeSchema), - "Field type change is not compatible"); + assertFalse(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, typeChangeSchemaDisallowed), + "Incompatible field type change is not allowed"); + + // Array of allowed schema field type transitions + String[][] allowedFieldChanges = { + {"string", "bytes"}, {"bytes", "string"}, + {"int", "long"}, {"int", "float"}, {"long", "float"}, + {"int", "double"}, {"float", "double"}, {"long", "double"}}; + for (String[] fieldChange : allowedFieldChanges) { + String fromSchema = TRIP_SCHEMA_PREFIX + EXTRA_FIELD_SCHEMA.replace("string", fieldChange[0]) + TRIP_SCHEMA_SUFFIX; + String toSchema = TRIP_SCHEMA_PREFIX + EXTRA_FIELD_SCHEMA.replace("string", fieldChange[1]) + TRIP_SCHEMA_SUFFIX; + assertTrue(TableSchemaResolver.isSchemaCompatible(fromSchema, toSchema), + "Compatible field type change is not allowed"); + if (!fieldChange[0].equals("byte") && 
fieldChange[1].equals("byte")) { + assertFalse(TableSchemaResolver.isSchemaCompatible(toSchema, fromSchema), + "Incompatible field type change is allowed"); + } + } + + // Names and aliases should match + String fromSchema = TRIP_SCHEMA_PREFIX + EXTRA_FIELD_SCHEMA + TRIP_SCHEMA_SUFFIX; + String toSchema = TRIP_SCHEMA_PREFIX.replace("triprec", "new_triprec") + EXTRA_FIELD_SCHEMA + TRIP_SCHEMA_SUFFIX; + assertFalse(TableSchemaResolver.isSchemaCompatible(fromSchema, toSchema), "Field names should match"); + assertFalse(TableSchemaResolver.isSchemaCompatible(toSchema, fromSchema), "Field names should match"); + assertTrue(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, TRIP_EXAMPLE_SCHEMA_EVOLVED), "Added field with default is compatible (Evolved Schema)"); @@ -119,7 +147,17 @@ public void testSchemaCompatibilityBasic() throws Exception { + TIP_NESTED_SCHEMA + EXTRA_FIELD_SCHEMA + EXTRA_FIELD_SCHEMA.replace("new_field", "new_new_field") + TRIP_SCHEMA_SUFFIX; assertTrue(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, multipleAddedFieldSchema), - "Multiple added fields with defauls are compatible"); + "Multiple added fields with defaults are compatible"); + + assertFalse(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, + TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA + + FARE_NESTED_SCHEMA + TIP_NESTED_SCHEMA + EXTRA_FIELD_WITHOUT_DEFAULT_SCHEMA + TRIP_SCHEMA_SUFFIX), + "Added field without default and not nullable is not compatible (Evolved Schema)"); + + assertTrue(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, + TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA + + FARE_NESTED_SCHEMA + TIP_NESTED_SCHEMA + TRIP_SCHEMA_SUFFIX + EXTRA_FIELD_NULLABLE_SCHEMA), + "Added nullable field is compatible (Evolved Schema)"); } @Test @@ -127,12 +165,14 @@ public void testMORTable() throws Exception { tableType = HoodieTableType.MERGE_ON_READ; // Create the table - HoodieTableMetaClient.initTableType(metaClient.getHadoopConf(), metaClient.getBasePath(), - HoodieTableType.MERGE_ON_READ, metaClient.getTableConfig().getTableName(), - metaClient.getArchivePath(), metaClient.getTableConfig().getPayloadClass(), VERSION_1); + HoodieTableMetaClient.withPropertyBuilder() + .fromMetaClient(metaClient) + .setTableType(HoodieTableType.MERGE_ON_READ) + .setTimelineLayoutVersion(VERSION_1) + .initTable(metaClient.getHadoopConf(), metaClient.getBasePath()); HoodieWriteConfig hoodieWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA); - SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig, false); + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); // Initial inserts with TRIP_EXAMPLE_SCHEMA int numRecords = 10; @@ -161,13 +201,13 @@ public void testMORTable() throws Exception { // Insert with evolved schema is not allowed HoodieWriteConfig hoodieDevolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_DEVOLVED); - client = getHoodieWriteClient(hoodieDevolvedWriteConfig, false); + client = getHoodieWriteClient(hoodieDevolvedWriteConfig); final List failedRecords = generateInsertsWithSchema("004", numRecords, TRIP_EXAMPLE_SCHEMA_DEVOLVED); try { // We cannot use insertBatch directly here because we want to insert records - // with a devolved schema and insertBatch inserts records using the TRIP_EXMPLE_SCHEMA. + // with a devolved schema and insertBatch inserts records using the TRIP_EXAMPLE_SCHEMA. 
writeBatch(client, "005", "004", Option.empty(), "003", numRecords, - (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0); + (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0, false); fail("Insert with devolved scheme should fail"); } catch (HoodieInsertException ex) { // no new commit @@ -190,13 +230,13 @@ public void testMORTable() throws Exception { // Insert with an evolved scheme is allowed HoodieWriteConfig hoodieEvolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_EVOLVED); - client = getHoodieWriteClient(hoodieEvolvedWriteConfig, false); + client = getHoodieWriteClient(hoodieEvolvedWriteConfig); // We cannot use insertBatch directly here because we want to insert records - // with a evolved schemaand insertBatch inserts records using the TRIP_EXMPLE_SCHEMA. + // with an evolved schema and insertBatch inserts records using the TRIP_EXAMPLE_SCHEMA. final List evolvedRecords = generateInsertsWithSchema("005", numRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED); writeBatch(client, "005", "004", Option.empty(), initCommitTime, numRecords, - (String s, Integer a) -> evolvedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0); + (String s, Integer a) -> evolvedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0, false); // new commit checkLatestDeltaCommit("005"); @@ -205,14 +245,14 @@ public void testMORTable() throws Exception { // Updates with evolved schema is allowed final List updateRecords = generateUpdatesWithSchema("006", numUpdateRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED); writeBatch(client, "006", "005", Option.empty(), initCommitTime, - numUpdateRecords, (String s, Integer a) -> updateRecords, SparkRDDWriteClient::upsert, false, 0, 0, 0); + numUpdateRecords, (String s, Integer a) -> updateRecords, SparkRDDWriteClient::upsert, false, 0, 0, 0, false); // new commit checkLatestDeltaCommit("006"); checkReadRecords("000", 2 * numRecords); // Now even the original schema cannot be used for updates as it is devolved in relation to the // current schema of the dataset. 
- client = getHoodieWriteClient(hoodieWriteConfig, false); + client = getHoodieWriteClient(hoodieWriteConfig); try { updateBatch(hoodieWriteConfig, client, "007", "006", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, false, 0, 0, 0); @@ -233,7 +273,7 @@ public void testMORTable() throws Exception { failedRecords.clear(); failedRecords.addAll(dataGen.generateInserts("007", numRecords)); writeBatch(client, "007", "006", Option.empty(), initCommitTime, numRecords, - (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1); + (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1, false); fail("Insert with original scheme should fail"); } catch (HoodieInsertException ex) { // no new commit @@ -251,11 +291,11 @@ public void testMORTable() throws Exception { } // Rollback to the original schema - client.restoreToInstant("004"); + client.restoreToInstant("004", hoodieWriteConfig.isMetadataTableEnabled()); checkLatestDeltaCommit("004"); // Updates with original schema are now allowed - client = getHoodieWriteClient(hoodieWriteConfig, false); + client = getHoodieWriteClient(hoodieWriteConfig); updateBatch(hoodieWriteConfig, client, "008", "004", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, false, 0, 0, 0); // new commit @@ -264,7 +304,7 @@ public void testMORTable() throws Exception { // Insert with original schema is allowed now insertBatch(hoodieWriteConfig, client, "009", "008", numRecords, SparkRDDWriteClient::insert, - false, false, 0, 0, 0); + false, false, 0, 0, 0, Option.empty()); checkLatestDeltaCommit("009"); checkReadRecords("000", 3 * numRecords); } @@ -272,12 +312,13 @@ public void testMORTable() throws Exception { @Test public void testCopyOnWriteTable() throws Exception { // Create the table - HoodieTableMetaClient.initTableType(metaClient.getHadoopConf(), metaClient.getBasePath(), - HoodieTableType.COPY_ON_WRITE, metaClient.getTableConfig().getTableName(), - metaClient.getArchivePath(), metaClient.getTableConfig().getPayloadClass(), VERSION_1); + HoodieTableMetaClient.withPropertyBuilder() + .fromMetaClient(metaClient) + .setTimelineLayoutVersion(VERSION_1) + .initTable(metaClient.getHadoopConf(), metaClient.getBasePath()); - HoodieWriteConfig hoodieWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA); - SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig, false); + HoodieWriteConfig hoodieWriteConfig = getWriteConfigBuilder(TRIP_EXAMPLE_SCHEMA).withRollbackUsingMarkers(false).build(); + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); // Initial inserts with TRIP_EXAMPLE_SCHEMA int numRecords = 10; @@ -301,13 +342,13 @@ public void testCopyOnWriteTable() throws Exception { // Insert with devolved schema is not allowed HoodieWriteConfig hoodieDevolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_DEVOLVED); - client = getHoodieWriteClient(hoodieDevolvedWriteConfig, false); + client = getHoodieWriteClient(hoodieDevolvedWriteConfig); final List failedRecords = generateInsertsWithSchema("004", numRecords, TRIP_EXAMPLE_SCHEMA_DEVOLVED); try { // We cannot use insertBatch directly here because we want to insert records // with a devolved schema. 
writeBatch(client, "004", "003", Option.empty(), "003", numRecords, - (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1); + (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1, false); fail("Insert with devolved scheme should fail"); } catch (HoodieInsertException ex) { // no new commit @@ -331,12 +372,12 @@ public void testCopyOnWriteTable() throws Exception { // Insert with evolved scheme is allowed HoodieWriteConfig hoodieEvolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_EVOLVED); - client = getHoodieWriteClient(hoodieEvolvedWriteConfig, false); + client = getHoodieWriteClient(hoodieEvolvedWriteConfig); final List evolvedRecords = generateInsertsWithSchema("004", numRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED); // We cannot use insertBatch directly here because we want to insert records // with a evolved schema. writeBatch(client, "004", "003", Option.empty(), initCommitTime, numRecords, - (String s, Integer a) -> evolvedRecords, SparkRDDWriteClient::insert, true, numRecords, 2 * numRecords, 4); + (String s, Integer a) -> evolvedRecords, SparkRDDWriteClient::insert, true, numRecords, 2 * numRecords, 4, false); // new commit HoodieTimeline curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants(); assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("004")); @@ -345,12 +386,12 @@ public void testCopyOnWriteTable() throws Exception { // Updates with evolved schema is allowed final List updateRecords = generateUpdatesWithSchema("005", numUpdateRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED); writeBatch(client, "005", "004", Option.empty(), initCommitTime, - numUpdateRecords, (String s, Integer a) -> updateRecords, SparkRDDWriteClient::upsert, true, numUpdateRecords, 2 * numRecords, 5); + numUpdateRecords, (String s, Integer a) -> updateRecords, SparkRDDWriteClient::upsert, true, numUpdateRecords, 2 * numRecords, 5, false); checkReadRecords("000", 2 * numRecords); // Now even the original schema cannot be used for updates as it is devolved // in relation to the current schema of the dataset. - client = getHoodieWriteClient(hoodieWriteConfig, false); + client = getHoodieWriteClient(hoodieWriteConfig); try { updateBatch(hoodieWriteConfig, client, "006", "005", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, true, @@ -372,7 +413,7 @@ public void testCopyOnWriteTable() throws Exception { failedRecords.clear(); failedRecords.addAll(dataGen.generateInserts("006", numRecords)); writeBatch(client, "006", "005", Option.empty(), initCommitTime, numRecords, - (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1); + (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1, false); fail("Insert with original scheme should fail"); } catch (HoodieInsertException ex) { // no new commit @@ -391,14 +432,14 @@ public void testCopyOnWriteTable() throws Exception { // Revert to the older commit and ensure that the original schema can now // be used for inserts and inserts. 
- client.restoreToInstant("003"); + client.restoreToInstant("003", hoodieWriteConfig.isMetadataTableEnabled()); curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants(); assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("003")); checkReadRecords("000", numRecords); // Insert with original schema is allowed now insertBatch(hoodieWriteConfig, client, "007", "003", numRecords, SparkRDDWriteClient::insert, - false, true, numRecords, 2 * numRecords, 1); + false, true, numRecords, 2 * numRecords, 1, Option.empty()); checkReadRecords("000", 2 * numRecords); // Update with original schema is allowed now @@ -411,7 +452,7 @@ public void testCopyOnWriteTable() throws Exception { private void checkReadRecords(String instantTime, int numExpectedRecords) throws IOException { if (tableType == HoodieTableType.COPY_ON_WRITE) { HoodieTimeline timeline = metaClient.reloadActiveTimeline().getCommitTimeline(); - assertEquals(numExpectedRecords, HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, instantTime)); + assertEquals(numExpectedRecords, HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(instantTime))); } else { // TODO: This code fails to read records under the following conditions: // 1. No parquet files yet (i.e. no compaction done yet) @@ -457,9 +498,9 @@ private List convertToSchema(List records, String sc HoodieKey key = r.getKey(); GenericRecord payload; try { - payload = (GenericRecord)r.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get(); - GenericRecord newPayload = HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(payload, newSchema); - return new HoodieRecord(key, new RawTripTestPayload(newPayload.toString(), key.getRecordKey(), key.getPartitionPath(), schemaStr)); + payload = (GenericRecord) ((HoodieAvroRecord) r).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get(); + GenericRecord newPayload = HoodieAvroUtils.rewriteRecord(payload, newSchema); + return new HoodieAvroRecord(key, new RawTripTestPayload(newPayload.toString(), key.getRecordKey(), key.getPartitionPath(), schemaStr)); } catch (IOException e) { throw new RuntimeException("Conversion to new schema failed"); } @@ -467,13 +508,17 @@ private List convertToSchema(List records, String sc } private HoodieWriteConfig getWriteConfig(String schema) { + return getWriteConfigBuilder(schema).build(); + } + + private HoodieWriteConfig.Builder getWriteConfigBuilder(String schema) { return getConfigBuilder(schema) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.INMEMORY).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build()) - .withAvroSchemaValidate(true) - .build(); + .withAvroSchemaValidate(true); } + @Override protected HoodieTableType getTableType() { return tableType; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java index 9a8d7e0c88898..a5926196ea396 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java @@ -19,13 +19,17 @@ package org.apache.hudi.client; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import 
org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.RawTripTestPayload; -import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.io.HoodieCreateHandle; @@ -36,7 +40,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.Path; -import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.io.InvalidRecordException; import org.apache.parquet.io.ParquetDecodingException; import org.junit.jupiter.api.AfterEach; @@ -64,6 +67,7 @@ public void setUp() throws Exception { HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath); initSparkContexts("TestUpdateSchemaEvolution"); initFileSystem(); + initTimelineService(); } @AfterEach @@ -74,20 +78,21 @@ public void tearDown() throws IOException { private WriteStatus prepareFirstRecordCommit(List recordsStrs) throws IOException { // Create a bunch of records with an old version of schema final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.avsc"); + config.setValue(HoodieCompactionConfig.PRESERVE_COMMIT_METADATA, "false"); final HoodieSparkTable table = HoodieSparkTable.create(config, context); final List statuses = jsc.parallelize(Arrays.asList(1)).map(x -> { List insertRecords = new ArrayList<>(); for (String recordStr : recordsStrs) { RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); insertRecords - .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + .add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } Map insertRecordMap = insertRecords.stream() .collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity())); - HoodieCreateHandle createHandle = + HoodieCreateHandle createHandle = new HoodieCreateHandle(config, "100", table, insertRecords.get(0).getPartitionPath(), "f1-0", insertRecordMap, supplier); createHandle.write(); - return createHandle.close(); + return createHandle.close().get(0); }).collect(); final Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100")); @@ -122,10 +127,11 @@ private void assertSchemaEvolutionOnUpdateResult(WriteStatus insertResult, Hoodi jsc.parallelize(Arrays.asList(1)).map(x -> { Executable executable = () -> { HoodieMergeHandle mergeHandle = new HoodieMergeHandle(updateTable.getConfig(), "101", updateTable, - updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier); - AvroReadSupport.setAvroReadSchema(updateTable.getHadoopConf(), mergeHandle.getWriterSchemaWithMetafields()); - List oldRecords = ParquetUtils.readAvroRecords(updateTable.getHadoopConf(), - new Path(updateTable.getConfig().getBasePath() + "/" + insertResult.getStat().getPath())); + updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier, Option.empty()); + List oldRecords = 
BaseFileUtils.getInstance(updateTable.getBaseFileFormat()) + .readAvroRecords(updateTable.getHadoopConf(), + new Path(updateTable.getConfig().getBasePath() + "/" + insertResult.getStat().getPath()), + mergeHandle.getWriterSchemaWithMetaFields()); for (GenericRecord rec : oldRecords) { mergeHandle.write(rec); } @@ -144,7 +150,7 @@ private List buildUpdateRecords(String recordStr, String insertFil List updateRecords = new ArrayList<>(); RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); HoodieRecord record = - new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange); + new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange); record.setCurrentLocation(new HoodieRecordLocation("101", insertFileId)); record.seal(); updateRecords.add(record); @@ -227,6 +233,9 @@ public void testSchemaEvolutionOnUpdateMisMatchWithChangeColumnType() throws Exc private HoodieWriteConfig makeHoodieClientConfig(String name) { Schema schema = getSchemaFromResource(getClass(), name); - return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schema.toString()).build(); + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withSchema(schema.toString()).build(); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestWriteStatus.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestWriteStatus.java index 91878e1992cfe..78e711ed70129 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestWriteStatus.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestWriteStatus.java @@ -19,6 +19,7 @@ package org.apache.hudi.client; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; import org.junit.jupiter.api.Test; @@ -44,8 +45,8 @@ public void testSuccessRecordTracking() { WriteStatus status = new WriteStatus(false, 1.0); Throwable t = new Exception("some error in writing"); for (int i = 0; i < 1000; i++) { - status.markSuccess(mock(HoodieRecord.class), null); - status.markFailure(mock(HoodieRecord.class), t, null); + status.markSuccess(mock(HoodieRecord.class), Option.empty()); + status.markFailure(mock(HoodieRecord.class), t, Option.empty()); } assertEquals(1000, status.getFailedRecords().size()); assertTrue(status.hasErrors()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestConsistentBucketIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestConsistentBucketIndex.java new file mode 100644 index 0000000000000..05617301936eb --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestConsistentBucketIndex.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.RealtimeFileStatus; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.Properties; +import java.util.Random; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Test consistent hashing index + */ +@Tag("functional") +public class TestConsistentBucketIndex extends HoodieClientTestHarness { + + private final Random random = new Random(1); + private HoodieIndex index; + private HoodieWriteConfig config; + + private static Stream configParams() { + // preserveMetaField, partitioned + Object[][] data = new Object[][] { + {true, false}, + {false, false}, + {true, true}, + {false, true}, + }; + return Stream.of(data).map(Arguments::of); + } + + private void setUp(boolean populateMetaFields, boolean partitioned) throws Exception { + initPath(); + initSparkContexts(); + if (partitioned) { + initTestDataGenerator(); + } else { + initTestDataGenerator(new String[] {""}); + } + initFileSystem(); + Properties props = getPropertiesForKeyGen(populateMetaFields); + 
props.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "_row_key"); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ, props); + config = getConfigBuilder() + .withProperties(props) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .fromProperties(props) + .withIndexType(HoodieIndex.IndexType.BUCKET) + .withIndexKeyField("_row_key") + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.CONSISTENT_HASHING) + .build()) + .withAutoCommit(false) + .build(); + writeClient = getHoodieWriteClient(config); + index = writeClient.getIndex(); + } + + @AfterEach + public void tearDown() throws IOException { + cleanupResources(); + } + + /** + * Test bucket index tagging (always tag regardless of the write status) + * Test bucket index tagging consistency, two tagging result should be same + * + * @param populateMetaFields + * @param partitioned + * @throws Exception + */ + @ParameterizedTest + @MethodSource("configParams") + public void testTagLocation(boolean populateMetaFields, boolean partitioned) throws Exception { + setUp(populateMetaFields, partitioned); + String newCommitTime = "001"; + int totalRecords = 20 + random.nextInt(20); + List records = dataGen.generateInserts(newCommitTime, totalRecords); + JavaRDD writeRecords = jsc.parallelize(records, 2); + + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // The records should be tagged anyway, even though it is the first time doing tagging + List taggedRecord = tagLocation(index, writeRecords, hoodieTable).collect(); + Assertions.assertTrue(taggedRecord.stream().allMatch(r -> r.isCurrentLocationKnown())); + + // Tag again, the records should get the same location (hashing metadata has been persisted after the first tagging) + List taggedRecord2 = tagLocation(index, writeRecords, hoodieTable).collect(); + for (HoodieRecord ref : taggedRecord) { + for (HoodieRecord record : taggedRecord2) { + if (ref.getRecordKey().equals(record.getRecordKey())) { + Assertions.assertEquals(ref.getCurrentLocation(), record.getCurrentLocation()); + break; + } + } + } + } + + @ParameterizedTest + @MethodSource("configParams") + public void testWriteData(boolean populateMetaFields, boolean partitioned) throws Exception { + setUp(populateMetaFields, partitioned); + String newCommitTime = "001"; + int totalRecords = 20 + random.nextInt(20); + List records = dataGen.generateInserts(newCommitTime, totalRecords); + JavaRDD writeRecords = jsc.parallelize(records, 2); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + // Insert totalRecords records + writeClient.startCommitWithTime(newCommitTime); + List writeStatues = writeClient.upsert(writeRecords, newCommitTime).collect(); + org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatues); + boolean success = writeClient.commitStats(newCommitTime, writeStatues.stream() + .map(WriteStatus::getStat) + .collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType()); + Assertions.assertTrue(success); + metaClient = HoodieTableMetaClient.reload(metaClient); + // The number of distinct fileId should be the same as total log file numbers + Assertions.assertEquals(writeStatues.stream().map(WriteStatus::getFileId).distinct().count(), + Arrays.stream(dataGen.getPartitionPaths()).mapToInt(p -> Objects.requireNonNull(listStatus(p, true)).length).sum()); + Assertions.assertEquals(totalRecords, readRecords(dataGen.getPartitionPaths(), 
populateMetaFields).size()); + + // Upsert the same set of records, the number of records should be same + newCommitTime = "002"; + writeClient.startCommitWithTime(newCommitTime); + writeStatues = writeClient.upsert(writeRecords, newCommitTime).collect(); + org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatues); + success = writeClient.commitStats(newCommitTime, writeStatues.stream() + .map(WriteStatus::getStat) + .collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType()); + Assertions.assertTrue(success); + // The number of log file should double after this insertion + long numberOfLogFiles = Arrays.stream(dataGen.getPartitionPaths()) + .mapToInt(p -> { + return Arrays.stream(listStatus(p, true)).mapToInt(fs -> + fs instanceof RealtimeFileStatus ? ((RealtimeFileStatus) fs).getDeltaLogFiles().size() : 1).sum(); + }).sum(); + Assertions.assertEquals(writeStatues.stream().map(WriteStatus::getFileId).distinct().count() * 2, numberOfLogFiles); + // The record number should remain same because of deduplication + Assertions.assertEquals(totalRecords, readRecords(dataGen.getPartitionPaths(), populateMetaFields).size()); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + // Upsert new set of records, and validate the total number of records + newCommitTime = "003"; + records = dataGen.generateInserts(newCommitTime, totalRecords); + writeRecords = jsc.parallelize(records, 2); + writeClient.startCommitWithTime(newCommitTime); + writeStatues = writeClient.upsert(writeRecords, newCommitTime).collect(); + org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatues); + success = writeClient.commitStats(newCommitTime, writeStatues.stream().map(WriteStatus::getStat).collect(Collectors.toList()), + Option.empty(), metaClient.getCommitActionType()); + Assertions.assertTrue(success); + Assertions.assertEquals(totalRecords * 2, readRecords(dataGen.getPartitionPaths(), populateMetaFields).size()); + } + + private List readRecords(String[] partitions, boolean populateMetaFields) { + return HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, + Arrays.stream(partitions).map(p -> Paths.get(basePath, p).toString()).collect(Collectors.toList()), + basePath, new JobConf(hadoopConf), true, populateMetaFields); + } + + private FileStatus[] listStatus(String p, boolean realtime) { + JobConf jobConf = new JobConf(hadoopConf); + FileInputFormat.setInputPaths(jobConf, Paths.get(basePath, p).toString()); + FileInputFormat format = HoodieInputFormatUtils.getInputFormat(HoodieFileFormat.PARQUET, realtime, jobConf); + try { + if (realtime) { + return ((HoodieParquetRealtimeInputFormat) format).listStatus(jobConf); + } else { + return ((HoodieParquetInputFormat) format).listStatus(jobConf); + } + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + private HoodieWriteConfig.Builder getConfigBuilder() { + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) + .withWriteStatusClass(MetadataMergeWriteStatus.class) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + 
.forTable("test-trip-table") + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java new file mode 100644 index 0000000000000..8ea6c2adf895f --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java @@ -0,0 +1,2625 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.LockConfiguration; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.metrics.Registry; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import 
org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.common.util.hash.ColumnIndexID; +import org.apache.hudi.common.util.hash.PartitionIndexID; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieLockConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.hudi.metadata.FileSystemBackedTableMetadata; +import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; +import org.apache.hudi.metadata.HoodieMetadataMergedLogRecordReader; +import org.apache.hudi.metadata.HoodieMetadataMetrics; +import org.apache.hudi.metadata.HoodieMetadataPayload; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper; +import org.apache.hudi.table.upgrade.UpgradeDowngrade; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; +import org.apache.hadoop.util.Time; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.MessageType; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; +import 
org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static java.util.Collections.singletonList; +import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.apache.hudi.common.model.WriteOperationType.DELETE; +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS; +import static org.apache.hudi.metadata.MetadataPartitionType.COLUMN_STATS; +import static org.apache.hudi.metadata.MetadataPartitionType.FILES; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +@Tag("functional") +public class TestHoodieBackedMetadata extends TestHoodieMetadataBase { + + private static final Logger LOG = LogManager.getLogger(TestHoodieBackedMetadata.class); + + public static List tableTypeAndEnableOperationArgs() { + return asList( + Arguments.of(COPY_ON_WRITE, true), + Arguments.of(COPY_ON_WRITE, false), + Arguments.of(MERGE_ON_READ, true), + Arguments.of(MERGE_ON_READ, false) + ); + } + + public static List tableOperationsTestArgs() { + return asList( + Arguments.of(COPY_ON_WRITE, true, true), + Arguments.of(COPY_ON_WRITE, true, false), + Arguments.of(COPY_ON_WRITE, false, true), + Arguments.of(COPY_ON_WRITE, false, false), + Arguments.of(MERGE_ON_READ, true, true), + Arguments.of(MERGE_ON_READ, true, false), + Arguments.of(MERGE_ON_READ, false, true), + Arguments.of(MERGE_ON_READ, false, false) + ); + } + + /** + * Metadata Table bootstrap scenarios. 
+ */ + @ParameterizedTest + @MethodSource("tableTypeAndEnableOperationArgs") + public void testMetadataTableBootstrap(HoodieTableType tableType, boolean addRollback) throws Exception { + init(tableType, false); + // bootstrap with few commits + doPreBootstrapOperations(testTable); + + writeConfig = getWriteConfig(true, true); + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + doWriteInsertAndUpsert(testTable); + validateMetadata(testTable); + if (addRollback) { + // trigger an UPSERT that will be rolled back + doWriteOperationAndValidate(testTable, "0000003"); + + // rollback last commit + doRollbackAndValidate(testTable, "0000003", "0000004"); + } + + // trigger couple of upserts + doWriteOperation(testTable, "0000005"); + doWriteOperation(testTable, "0000006"); + doWriteOperation(testTable, "0000007"); + doCleanAndValidate(testTable, "0000008", Arrays.asList("0000007")); + validateMetadata(testTable, true); + } + + @Test + public void testTurnOffMetadataIndexAfterEnable() throws Exception { + initPath(); + HoodieWriteConfig cfg = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER) + .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) + .build(); + init(COPY_ON_WRITE); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + // metadata enabled with only FILES partition + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, cfg)) { + // Insert + String commitTime = "0000001"; + List records = dataGen.generateInserts(commitTime, 20); + client.startCommitWithTime(commitTime); + List writeStatuses = client.insert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + + // Upsert + commitTime = "0000002"; + client.startCommitWithTime(commitTime); + records = dataGen.generateUniqueUpdates(commitTime, 10); + writeStatuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + } + // check table config + HoodieTableMetaClient.reload(metaClient); + HoodieTableConfig tableConfig = metaClient.getTableConfig(); + assertFalse(tableConfig.getMetadataPartitions().isEmpty()); + assertTrue(tableConfig.getMetadataPartitions().contains(FILES.getPartitionPath())); + assertFalse(tableConfig.getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())); + assertFalse(tableConfig.getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); + + // enable column stats and run 1 upserts + HoodieWriteConfig cfgWithColStatsEnabled = HoodieWriteConfig.newBuilder() + .withProperties(cfg.getProps()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withProperties(cfg.getMetadataConfig().getProps()) + .withMetadataIndexColumnStats(true) + .build()) + .build(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, cfgWithColStatsEnabled)) { + // Upsert + String commitTime = "0000003"; + client.startCommitWithTime(commitTime); + List records = dataGen.generateUniqueUpdates(commitTime, 10); + List writeStatuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + } + // check 
table config + HoodieTableMetaClient.reload(metaClient); + tableConfig = metaClient.getTableConfig(); + assertFalse(tableConfig.getMetadataPartitions().isEmpty()); + assertTrue(tableConfig.getMetadataPartitions().contains(FILES.getPartitionPath())); + assertTrue(tableConfig.getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())); + assertFalse(tableConfig.getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); + + // disable column stats and run 1 upsert + HoodieWriteConfig cfgWithColStatsDisabled = HoodieWriteConfig.newBuilder() + .withProperties(cfg.getProps()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withProperties(cfg.getMetadataConfig().getProps()) + .withMetadataIndexColumnStats(false) + .build()) + .build(); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, cfgWithColStatsDisabled)) { + // Upsert + String commitTime = "0000004"; + client.startCommitWithTime(commitTime); + List records = dataGen.generateUniqueUpdates(commitTime, 10); + List writeStatuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + } + // check table config + HoodieTableMetaClient.reload(metaClient); + tableConfig = metaClient.getTableConfig(); + assertFalse(tableConfig.getMetadataPartitions().isEmpty()); + assertTrue(tableConfig.getMetadataPartitions().contains(FILES.getPartitionPath())); + assertFalse(tableConfig.getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())); + assertFalse(tableConfig.getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); + + // enable bloom filter as well as column stats and run 1 upsert + HoodieWriteConfig cfgWithBloomFilterEnabled = HoodieWriteConfig.newBuilder() + .withProperties(cfgWithColStatsEnabled.getProps()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withProperties(cfgWithColStatsEnabled.getMetadataConfig().getProps()) + .withMetadataIndexBloomFilter(true) + .build()) + .build(); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, cfgWithBloomFilterEnabled)) { + // Upsert + String commitTime = "0000005"; + client.startCommitWithTime(commitTime); + List records = dataGen.generateUniqueUpdates(commitTime, 10); + List writeStatuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + } + // check table config + HoodieTableMetaClient.reload(metaClient); + tableConfig = metaClient.getTableConfig(); + assertFalse(tableConfig.getMetadataPartitions().isEmpty()); + assertTrue(tableConfig.getMetadataPartitions().contains(FILES.getPartitionPath())); + assertTrue(tableConfig.getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())); + assertTrue(tableConfig.getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); + } + + @Test + public void testTurnOffMetadataTableAfterEnable() throws Exception { + init(COPY_ON_WRITE, true); + String instant1 = "0000001"; + HoodieCommitMetadata hoodieCommitMetadata = doWriteOperationWithMeta(testTable, instant1, INSERT); + + // Simulate the complete data directory including ".hoodie_partition_metadata" file + File metaForP1 = new File(metaClient.getBasePath() + "/p1", ".hoodie_partition_metadata"); + File metaForP2 = new File(metaClient.getBasePath() + "/p2", ".hoodie_partition_metadata"); + metaForP1.createNewFile(); + metaForP2.createNewFile(); + + // Sync to metadata table + metaClient.reloadActiveTimeline(); + HoodieTable table = 
HoodieSparkTable.create(writeConfig, context, metaClient); + Option metadataWriter = table.getMetadataWriter(instant1, Option.of(hoodieCommitMetadata)); + validateMetadata(testTable, true); + + assertTrue(metadataWriter.isPresent()); + HoodieTableConfig hoodieTableConfig = + new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig.getPayloadClass()); + assertFalse(hoodieTableConfig.getMetadataPartitions().isEmpty()); + + // Turn off metadata table + HoodieWriteConfig writeConfig2 = HoodieWriteConfig.newBuilder() + .withProperties(this.writeConfig.getProps()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .build(); + testTable = HoodieTestTable.of(metaClient); + String instant2 = "0000002"; + HoodieCommitMetadata hoodieCommitMetadata2 = doWriteOperationWithMeta(testTable, instant2, INSERT); + metaClient.reloadActiveTimeline(); + HoodieTable table2 = HoodieSparkTable.create(writeConfig2, context, metaClient); + Option metadataWriter2 = table2.getMetadataWriter(instant2, Option.of(hoodieCommitMetadata2)); + assertFalse(metadataWriter2.isPresent()); + + HoodieTableConfig hoodieTableConfig2 = + new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig2.getPayloadClass()); + assertEquals(Collections.emptySet(), hoodieTableConfig2.getMetadataPartitions()); + // Assert metadata table folder is deleted + assertFalse(metaClient.getFs().exists( + new Path(HoodieTableMetadata.getMetadataTableBasePath(writeConfig2.getBasePath())))); + + // Enable metadata table again and initialize metadata table through + // HoodieTable.getMetadataWriter() function + HoodieWriteConfig writeConfig3 = HoodieWriteConfig.newBuilder() + .withProperties(this.writeConfig.getProps()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) + .build(); + testTable = HoodieTestTable.of(metaClient); + metaClient.reloadActiveTimeline(); + String instant3 = "0000003"; + HoodieCommitMetadata hoodieCommitMetadata3 = doWriteOperationWithMeta(testTable, instant3, INSERT); + metaClient.reloadActiveTimeline(); + HoodieTable table3 = HoodieSparkTable.create(writeConfig3, context, metaClient); + Option metadataWriter3 = table3.getMetadataWriter(instant3, Option.of(hoodieCommitMetadata3)); + validateMetadata(testTable, true); + assertTrue(metadataWriter3.isPresent()); + HoodieTableConfig hoodieTableConfig3 = + new HoodieTableConfig(this.fs, metaClient.getMetaPath(), writeConfig.getPayloadClass()); + assertFalse(hoodieTableConfig3.getMetadataPartitions().isEmpty()); + } + + /** + * Only valid partition directories are added to the metadata. 
+ */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testOnlyValidPartitionsAdded(HoodieTableType tableType) throws Exception { + // This test requires local file system + init(tableType, false); + // Create an empty directory which is not a partition directory (lacks partition metadata) + final String nonPartitionDirectory = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-nonpartition"; + Files.createDirectories(Paths.get(basePath, nonPartitionDirectory)); + + // Three directories which are partitions but will be ignored due to filter + final String filterDirRegex = ".*-filterDir\\d|\\..*"; + final String filteredDirectoryOne = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir1"; + final String filteredDirectoryTwo = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir2"; + final String filteredDirectoryThree = ".backups"; + + // Create some commits + testTable.withPartitionMetaFiles("p1", "p2", filteredDirectoryOne, filteredDirectoryTwo, filteredDirectoryThree) + .addCommit("0000001").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10) + .addCommit("0000002").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10, 10); + + writeConfig = getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withDirectoryFilterRegex(filterDirRegex).build()).build(); + testTable.doWriteOperation("0000003", UPSERT, emptyList(), asList("p1", "p2"), 1, true); + syncTableMetadata(writeConfig); + + List partitions = metadataWriter(writeConfig).getTableMetadata().getAllPartitionPaths(); + assertFalse(partitions.contains(nonPartitionDirectory), + "Must not contain the non-partition " + nonPartitionDirectory); + assertTrue(partitions.contains("p1"), "Must contain partition p1"); + assertTrue(partitions.contains("p2"), "Must contain partition p2"); + + assertFalse(partitions.contains(filteredDirectoryOne), + "Must not contain the filtered directory " + filteredDirectoryOne); + assertFalse(partitions.contains(filteredDirectoryTwo), + "Must not contain the filtered directory " + filteredDirectoryTwo); + assertFalse(partitions.contains(filteredDirectoryThree), + "Must not contain the filtered directory " + filteredDirectoryThree); + + FileStatus[] statuses = metadata(writeConfig, context).getAllFilesInPartition(new Path(basePath, "p1")); + assertEquals(tableType == COPY_ON_WRITE ? 3 : 4, statuses.length); + statuses = metadata(writeConfig, context).getAllFilesInPartition(new Path(basePath, "p2")); + assertEquals(tableType == COPY_ON_WRITE ? 6 : 7, statuses.length); + Map partitionsToFilesMap = metadata(writeConfig, context).getAllFilesInPartitions(asList(basePath + "/p1", basePath + "/p2")); + assertEquals(2, partitionsToFilesMap.size()); + assertEquals(tableType == COPY_ON_WRITE ? 3 : 4, partitionsToFilesMap.get(basePath + "/p1").length); + assertEquals(tableType == COPY_ON_WRITE ? 6 : 7, partitionsToFilesMap.get(basePath + "/p2").length); + } + + /** + * Test various table operations sync to Metadata Table correctly. 
+ */ + @ParameterizedTest + @MethodSource("tableOperationsTestArgs") + public void testTableOperations(HoodieTableType tableType, boolean enableFullScan, boolean enableMetrics) throws Exception { + List commitTimeList = new ArrayList<>(); + commitTimeList.add(Long.parseLong(HoodieActiveTimeline.createNewInstantTime())); + for (int i = 0; i < 8; i++) { + long nextCommitTime = getNextCommitTime(commitTimeList.get(commitTimeList.size() - 1)); + commitTimeList.add(nextCommitTime); + } + init(tableType, true, enableFullScan, enableMetrics, false); + doWriteInsertAndUpsert(testTable, commitTimeList.get(0).toString(), commitTimeList.get(1).toString(), false); + + // trigger an upsert + doWriteOperationAndValidate(testTable, commitTimeList.get(2).toString()); + + // trigger compaction + if (MERGE_ON_READ.equals(tableType)) { + doCompactionAndValidate(testTable, commitTimeList.get(3).toString()); + } + + // trigger an upsert + doWriteOperation(testTable, commitTimeList.get(4).toString()); + + // trigger clean + doCleanAndValidate(testTable, commitTimeList.get(5).toString(), singletonList(commitTimeList.get(0).toString())); + + // trigger few upserts and validate + doWriteOperation(testTable, commitTimeList.get(6).toString()); + doWriteOperation(testTable, commitTimeList.get(7).toString()); + validateMetadata(testTable, emptyList(), true); + } + + @Test + public void testMetadataTableArchival() throws Exception { + init(COPY_ON_WRITE, false); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .enableFullScan(true) + .enableMetrics(false) + .withMaxNumDeltaCommitsBeforeCompaction(3) + .archiveCommitsWith(3, 4) + .retainCommits(1) + .build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .retainCommits(1) + .build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .archiveCommitsWith(2, 3) + .build()) + .build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + + AtomicInteger commitTime = new AtomicInteger(1); + // trigger 2 regular writes(1 bootstrap commit). just 1 before archival can get triggered. + for (int i = 1; i <= 2; i++) { + doWriteOperation(testTable, "000000" + (commitTime.getAndIncrement()), INSERT); + } + // expected num commits = 1 (bootstrap) + 2 (writes) + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + HoodieActiveTimeline metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(3, metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants()); + + // trigger an async table service, archival should not kick in, even though conditions are met. + doCluster(testTable, "000000" + commitTime.getAndIncrement()); + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(4, metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants()); + + // start the timeline server for MARKERS cleaning up + getHoodieWriteClient(writeConfig); + // trigger a regular write operation. data set timeline archival should kick in. + doWriteOperation(testTable, "000000" + (commitTime.getAndIncrement()), INSERT); + archiveDataTable(writeConfig, HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build()); + + // trigger a regular write operation. metadata timeline archival should kick in. 
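+ // (metadata timeline archival follows the archiveCommitsWith(3, 4) set on the metadata config above, independently of the data table's HoodieArchivalConfig(2, 3).)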
+ doWriteOperation(testTable, "000000" + (commitTime.getAndIncrement()), INSERT); + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(4, metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants()); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataInsertUpsertClean(HoodieTableType tableType) throws Exception { + init(tableType); + doWriteOperation(testTable, "0000001", INSERT); + doWriteOperation(testTable, "0000002"); + doCleanAndValidate(testTable, "0000003", Arrays.asList("0000001")); + if (tableType == MERGE_ON_READ) { + doCompaction(testTable, "0000004"); + } + doWriteOperation(testTable, "0000005"); + validateMetadata(testTable, emptyList(), true); + } + + @Test + public void testUpdationOfPopulateMetaFieldsForMetadataTable() throws Exception { + tableType = COPY_ON_WRITE; + init(tableType, false); + + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withPopulateMetaFields(true) + .build()) + .build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + doWriteOperation(testTable, "0000001", INSERT); + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(writeConfig.getBasePath() + "/.hoodie/metadata").setConf(hadoopConf).build(); + assertTrue(metaClient.getTableConfig().populateMetaFields()); + + // update populateMeta fields to false. + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withPopulateMetaFields(false) + .build()) + .build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + doWriteOperation(testTable, "0000002", INSERT); + metaClient = HoodieTableMetaClient.builder().setBasePath(writeConfig.getBasePath() + "/.hoodie/metadata").setConf(hadoopConf).build(); + assertFalse(metaClient.getTableConfig().populateMetaFields()); + } + + @Test + public void testMetadataInsertUpsertCleanNonPartitioned() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType); + doWriteOperationNonPartitioned(testTable, "0000001", INSERT); + doWriteOperationNonPartitioned(testTable, "0000002", UPSERT); + testTable.doCleanBasedOnCommits("0000003", Arrays.asList("0000001")); + validateMetadata(testTable, emptyList(), true); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testInsertUpsertCluster(HoodieTableType tableType) throws Exception { + init(tableType); + doWriteOperation(testTable, "0000001", INSERT); + doWriteOperation(testTable, "0000002"); + doClusterAndValidate(testTable, "0000003"); + if (tableType == MERGE_ON_READ) { + doCompaction(testTable, "0000004"); + } + doCleanAndValidate(testTable, "0000005", Arrays.asList("0000001")); + validateMetadata(testTable, emptyList(), true); + } + + /** + * Tests that table services in data table won't trigger table services in metadata table. + * + * @throws Exception + */ + @Test + public void testMetadataTableServices() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .enableFullScan(true) + .enableMetrics(false) + .withMaxNumDeltaCommitsBeforeCompaction(3) // after 3 delta commits for regular writer operations, compaction should kick in. 
+ .build()).build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + + doWriteOperation(testTable, "0000001", INSERT); + doCleanAndValidate(testTable, "0000003", Arrays.asList("0000001")); + + HoodieTableMetadata tableMetadata = metadata(writeConfig, context); + // since clean was the last commit, table servives should not get triggered in metadata table. + assertFalse(tableMetadata.getLatestCompactionTime().isPresent()); + + doWriteOperation(testTable, "0000004", UPSERT); + // this should have triggered compaction in metadata table + tableMetadata = metadata(writeConfig, context); + assertTrue(tableMetadata.getLatestCompactionTime().isPresent()); + assertEquals(tableMetadata.getLatestCompactionTime().get(), "0000003001"); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testTableOperationsWithMetadataIndex(HoodieTableType tableType) throws Exception { + initPath(); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .bloomIndexBucketizedChecking(false) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withMetadataIndexBloomFilter(true) + .withMetadataIndexBloomFilterFileGroups(4) + .withMetadataIndexColumnStats(true) + .withMetadataIndexBloomFilterFileGroups(2) + .build()) + .build(); + init(tableType, writeConfig); + testTableOperationsForMetaIndexImpl(writeConfig); + } + + private void testTableOperationsForMetaIndexImpl(final HoodieWriteConfig writeConfig) throws Exception { + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + testTableOperationsImpl(engineContext, writeConfig); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataTableDeletePartition(HoodieTableType tableType) throws IOException { + initPath(); + int maxCommits = 1; + HoodieWriteConfig cfg = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits) + .build()) + .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) + .build(); + init(tableType); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, cfg)) { + // Write 1 (Bulk insert) + String newCommitTime = "0000001"; + List records = dataGen.generateInserts(newCommitTime, 20); + client.startCommitWithTime(newCommitTime); + List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // Write 2 (upserts) + newCommitTime = "0000002"; + client.startCommitWithTime(newCommitTime); + validateMetadata(client); + + records = dataGen.generateInserts(newCommitTime, 10); + writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + + // metadata writer to delete column_stats partition + HoodieBackedTableMetadataWriter metadataWriter = metadataWriter(client); + assertNotNull(metadataWriter, "MetadataWriter should have been initialized"); + metadataWriter.deletePartitions("0000003", 
Arrays.asList(COLUMN_STATS)); + + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + List<String> metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, metadataMetaClient.getBasePath(), false, false); + // partition should be physically deleted + assertEquals(metadataWriter.getEnabledPartitionTypes().size(), metadataTablePartitions.size()); + assertFalse(metadataTablePartitions.contains(COLUMN_STATS.getPartitionPath())); + + Option<HoodieInstant> completedReplaceInstant = metadataMetaClient.reloadActiveTimeline().getCompletedReplaceTimeline().lastInstant(); + assertTrue(completedReplaceInstant.isPresent()); + assertEquals("0000003", completedReplaceInstant.get().getTimestamp()); + + final Map<String, MetadataPartitionType> metadataEnabledPartitionTypes = new HashMap<>(); + metadataWriter.getEnabledPartitionTypes().forEach(e -> metadataEnabledPartitionTypes.put(e.getPartitionPath(), e)); + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline()); + metadataTablePartitions.forEach(partition -> { + List<FileSlice> latestSlices = fsView.getLatestFileSlices(partition).collect(Collectors.toList()); + if (COLUMN_STATS.getPartitionPath().equals(partition)) { + // there should not be any file slice in column_stats partition + assertTrue(latestSlices.isEmpty()); + } else { + assertFalse(latestSlices.isEmpty()); + assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).count() + <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest base file per file group"); + assertTrue(latestSlices.size() + <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest file slice per file group"); + } + }); + } + } + + /** + * Tests that virtual key configs are honored in base files after compaction in metadata table. 
+ * + * @throws Exception + */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testVirtualKeysInBaseFiles(boolean populateMetaFields) throws Exception { + HoodieTableType tableType = MERGE_ON_READ; + init(tableType, false); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .enableFullScan(true) + .enableMetrics(false) + .withPopulateMetaFields(populateMetaFields) + .withMaxNumDeltaCommitsBeforeCompaction(2) + .build()).build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + + doWriteOperation(testTable, "0000001", INSERT); + doClean(testTable, "0000003", Arrays.asList("0000001")); + // this should have triggered compaction in metadata table + doWriteOperation(testTable, "0000004", UPSERT); + + HoodieTableMetadata tableMetadata = metadata(writeConfig, context); + assertTrue(tableMetadata.getLatestCompactionTime().isPresent()); + assertEquals(tableMetadata.getLatestCompactionTime().get(), "0000003001"); + + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig); + metadataMetaClient.reloadActiveTimeline(); + + HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient); + table.getHoodieView().sync(); + List fileSlices = table.getSliceView().getLatestFileSlices("files").collect(Collectors.toList()); + HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get(); + HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), new Path(baseFile.getPath()), + new CacheConfig(context.getHadoopConf().get())); + List records = HoodieHFileReader.readAllRecords(hoodieHFileReader); + records.forEach(entry -> { + if (populateMetaFields) { + assertNotNull(((GenericRecord) entry).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } else { + assertNull(((GenericRecord) entry).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } + }); + } + + /** + * Tests that virtual key configs are honored in base files after compaction in metadata table. + */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMetadataTableWithPendingCompaction(boolean simulateFailedCompaction) throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .enableFullScan(true) + .enableMetrics(false) + .withMaxNumDeltaCommitsBeforeCompaction(3) + .build()).build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + + doWriteOperation(testTable, "0000001", INSERT); + // create an inflight compaction in metadata table. + // not easy to create an inflight in metadata table directly, hence letting compaction succeed and then deleting the completed instant. 
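+ // (the completed compaction instant file is renamed to a temporary file further below, which makes the compaction appear inflight to subsequent readers of the metadata timeline.)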
+ // this new write is expected to trigger metadata table compaction + String commitInstant = "0000002"; + doWriteOperation(testTable, commitInstant, INSERT); + doWriteOperation(testTable, "0000003", INSERT); + + HoodieTableMetadata tableMetadata = metadata(writeConfig, context); + String metadataCompactionInstant = commitInstant + "001"; + assertTrue(tableMetadata.getLatestCompactionTime().isPresent()); + assertEquals(tableMetadata.getLatestCompactionTime().get(), metadataCompactionInstant); + + validateMetadata(testTable); + // Fetch compaction Commit file and rename to some other file. completed compaction meta file should have some serialized info that table interprets + // for future upserts. so, renaming the file here to some temp name and later renaming it back to same name. + java.nio.file.Path parentPath = Paths.get(metadataTableBasePath, HoodieTableMetaClient.METAFOLDER_NAME); + java.nio.file.Path metaFilePath = parentPath.resolve(metadataCompactionInstant + HoodieTimeline.COMMIT_EXTENSION); + java.nio.file.Path tempFilePath = FileCreateUtils.renameFileToTemp(metaFilePath, metadataCompactionInstant); + metaClient.reloadActiveTimeline(); + testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + // this validation will exercise the code path where a compaction is inflight in metadata table, but still metadata based file listing should match non + // metadata based file listing. + validateMetadata(testTable); + + if (simulateFailedCompaction) { + // this should retry the compaction in metadata table. + doWriteOperation(testTable, "0000004", INSERT); + } else { + // let the compaction succeed in metadata and validation should succeed. + FileCreateUtils.renameTempToMetaFile(tempFilePath, metaFilePath); + } + + validateMetadata(testTable); + + // add few more write and validate + doWriteOperation(testTable, "0000005", INSERT); + doWriteOperation(testTable, "0000006", UPSERT); + validateMetadata(testTable); + + if (simulateFailedCompaction) { + //trigger another compaction failure. + metadataCompactionInstant = "0000005001"; + tableMetadata = metadata(writeConfig, context); + assertTrue(tableMetadata.getLatestCompactionTime().isPresent()); + assertEquals(tableMetadata.getLatestCompactionTime().get(), metadataCompactionInstant); + + // Fetch compaction Commit file and rename to some other file. completed compaction meta file should have some serialized info that table interprets + // for future upserts. so, renaming the file here to some temp name and later renaming it back to same name. + parentPath = Paths.get(metadataTableBasePath, HoodieTableMetaClient.METAFOLDER_NAME); + metaFilePath = parentPath.resolve(metadataCompactionInstant + HoodieTimeline.COMMIT_EXTENSION); + tempFilePath = FileCreateUtils.renameFileToTemp(metaFilePath, metadataCompactionInstant); + + validateMetadata(testTable); + + // this should retry the failed compaction in metadata table. + doWriteOperation(testTable, "0000007", INSERT); + + validateMetadata(testTable); + + // add few more write and validate + doWriteOperation(testTable, "0000008", INSERT); + doWriteOperation(testTable, "0000009", UPSERT); + validateMetadata(testTable); + } + } + + /** + * Test arguments - Table type, populate meta fields, exclude key from payload. + */ + public static List testMetadataRecordKeyExcludeFromPayloadArgs() { + return asList( + Arguments.of(COPY_ON_WRITE, true), + Arguments.of(COPY_ON_WRITE, false), + Arguments.of(MERGE_ON_READ, true), + Arguments.of(MERGE_ON_READ, false) + ); + } + + /** + * 1. 
Verify metadata table records key deduplication feature. When record key + * deduplication is enabled, verify the metadata record payload on disk has empty key. + * Otherwise, verify the valid key. + * 2. Verify populate meta fields work irrespective of record key deduplication config. + * 3. Verify table services like compaction benefit from record key deduplication feature. + */ + @ParameterizedTest + @MethodSource("testMetadataRecordKeyExcludeFromPayloadArgs") + public void testMetadataRecordKeyExcludeFromPayload(final HoodieTableType tableType, final boolean enableMetaFields) throws Exception { + initPath(); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withPopulateMetaFields(enableMetaFields) + .withMaxNumDeltaCommitsBeforeCompaction(3) + .build()) + .build(); + init(tableType, writeConfig); + + // 2nd commit + doWriteOperation(testTable, "0000001", INSERT); + + final HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder() + .setConf(hadoopConf) + .setBasePath(metadataTableBasePath) + .build(); + HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig); + metadataMetaClient.reloadActiveTimeline(); + final HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient); + + // Compaction has not yet kicked in. Verify all the log files + // for the metadata records persisted on disk as per the config. + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000001", + enableMetaFields); + }, "Metadata table should have valid log files!"); + + // Verify no base file created yet. + assertThrows(IllegalStateException.class, () -> { + verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table, enableMetaFields); + }, "Metadata table should not have a base file yet!"); + + // 2 more commits + doWriteOperation(testTable, "0000002", UPSERT); + doWriteOperation(testTable, "0000004", UPSERT); + + // Compaction should be triggered by now. Let's verify the log files + // if any for the metadata records persisted on disk as per the config. + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000002", + enableMetaFields); + }, "Metadata table should have valid log files!"); + + // Verify the base file created by the just completed compaction. + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table, enableMetaFields); + }, "Metadata table should have a valid base file!"); + + // 2 more commits to trigger one more compaction, along with a clean + doWriteOperation(testTable, "0000005", UPSERT); + doClean(testTable, "0000006", Arrays.asList("0000004")); + doWriteOperation(testTable, "0000007", UPSERT); + + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "7", enableMetaFields); + }, "Metadata table should have valid log files!"); + + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table, enableMetaFields); + }, "Metadata table should have a valid base file!"); + + validateMetadata(testTable); + } + + /** + * Verify the metadata table log files for the record field correctness. On disk format + * should be based on meta fields and key deduplication config. And the in-memory merged + * records should all be materialized fully irrespective of the config. 
+ * + * @param table - Hoodie metadata test table + * @param metadataMetaClient - Metadata meta client + * @param latestCommitTimestamp - Latest commit timestamp + * @param enableMetaFields - Enable meta fields for the table records + * @throws IOException + */ + private void verifyMetadataRecordKeyExcludeFromPayloadLogFiles(HoodieTable table, HoodieTableMetaClient metadataMetaClient, + String latestCommitTimestamp, + boolean enableMetaFields) throws IOException { + table.getHoodieView().sync(); + + // Compaction should not be triggered yet. Let's verify no base file + // and few log files available. + List fileSlices = table.getSliceView() + .getLatestFileSlices(FILES.getPartitionPath()).collect(Collectors.toList()); + if (fileSlices.isEmpty()) { + throw new IllegalStateException("LogFile slices are not available!"); + } + + // Verify the log files honor the key deduplication and virtual keys config + List logFiles = fileSlices.get(0).getLogFiles().map(logFile -> { + return logFile; + }).collect(Collectors.toList()); + + List logFilePaths = logFiles.stream().map(logFile -> { + return logFile.getPath().toString(); + }).collect(Collectors.toList()); + + // Verify the on-disk raw records before they get materialized + verifyMetadataRawRecords(table, logFiles, enableMetaFields); + + // Verify the in-memory materialized and merged records + verifyMetadataMergedRecords(metadataMetaClient, logFilePaths, latestCommitTimestamp, enableMetaFields); + } + + /** + * Verify the metadata table on-disk raw records. When populate meta fields is enabled, + * these records should have additional meta fields in the payload. When key deduplication + * is enabled, these records on the disk should have key in the payload as empty string. + * + * @param table + * @param logFiles - Metadata table log files to be verified + * @param enableMetaFields - Enable meta fields for records + * @throws IOException + */ + private void verifyMetadataRawRecords(HoodieTable table, List logFiles, boolean enableMetaFields) throws IOException { + for (HoodieLogFile logFile : logFiles) { + FileStatus[] fsStatus = fs.listStatus(logFile.getPath()); + MessageType writerSchemaMsg = TableSchemaResolver.readSchemaFromLogFile(fs, logFile.getPath()); + if (writerSchemaMsg == null) { + // not a data block + continue; + } + + Schema writerSchema = new AvroSchemaConverter().convert(writerSchemaMsg); + try (HoodieLogFormat.Reader logFileReader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema)) { + while (logFileReader.hasNext()) { + HoodieLogBlock logBlock = logFileReader.next(); + if (logBlock instanceof HoodieDataBlock) { + try (ClosableIterator recordItr = ((HoodieDataBlock) logBlock).getRecordIterator()) { + recordItr.forEachRemaining(indexRecord -> { + final GenericRecord record = (GenericRecord) indexRecord; + if (enableMetaFields) { + // Metadata table records should have meta fields! + assertNotNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + assertNotNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)); + } else { + // Metadata table records should not have meta fields! 
+ assertNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)); + } + + final String key = String.valueOf(record.get(HoodieMetadataPayload.KEY_FIELD_NAME)); + assertFalse(key.isEmpty()); + if (enableMetaFields) { + assertTrue(key.equals(String.valueOf(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)))); + } + }); + } + } + } + } + } + } + + /** + * Verify the metadata table in-memory merged records. Irrespective of key deduplication + * config, the in-memory merged records should always have the key field in the record + * payload fully materialized. + * + * @param metadataMetaClient - Metadata table meta client + * @param logFilePaths - Metadata table log file paths + * @param latestCommitTimestamp + * @param enableMetaFields - Enable meta fields + */ + private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClient, List logFilePaths, + String latestCommitTimestamp, boolean enableMetaFields) { + Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema()); + if (enableMetaFields) { + schema = HoodieAvroUtils.addMetadataFields(schema); + } + HoodieMetadataMergedLogRecordReader logRecordReader = HoodieMetadataMergedLogRecordReader.newBuilder() + .withFileSystem(metadataMetaClient.getFs()) + .withBasePath(metadataMetaClient.getBasePath()) + .withLogFilePaths(logFilePaths) + .withLatestInstantTime(latestCommitTimestamp) + .withPartition(FILES.getPartitionPath()) + .withReaderSchema(schema) + .withMaxMemorySizeInBytes(100000L) + .withBufferSize(4096) + .withSpillableMapBasePath(tempDir.toString()) + .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK) + .build(); + + assertDoesNotThrow(() -> { + logRecordReader.scan(); + }, "Metadata log records materialization failed"); + + for (Map.Entry> entry : logRecordReader.getRecords().entrySet()) { + assertFalse(entry.getKey().isEmpty()); + assertFalse(entry.getValue().getRecordKey().isEmpty()); + assertEquals(entry.getKey(), entry.getValue().getRecordKey()); + } + } + + /** + * Verify metadata table base files for the records persisted based on the config. When + * the key deduplication is enabled, the records persisted on the disk in the base file + * should have key field in the payload as empty string. 
+ * + * @param table - Metadata table + * @param enableMetaFields - Enable meta fields + */ + private void verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(HoodieTable table, boolean enableMetaFields) throws IOException { + table.getHoodieView().sync(); + List fileSlices = table.getSliceView() + .getLatestFileSlices(FILES.getPartitionPath()).collect(Collectors.toList()); + if (!fileSlices.get(0).getBaseFile().isPresent()) { + throw new IllegalStateException("Base file not available!"); + } + final HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get(); + + HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), + new Path(baseFile.getPath()), + new CacheConfig(context.getHadoopConf().get())); + List records = HoodieHFileReader.readAllRecords(hoodieHFileReader); + records.forEach(entry -> { + if (enableMetaFields) { + assertNotNull(((GenericRecord) entry).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } else { + assertNull(((GenericRecord) entry).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } + + final String keyInPayload = (String) ((GenericRecord) entry) + .get(HoodieMetadataPayload.KEY_FIELD_NAME); + assertFalse(keyInPayload.isEmpty()); + }); + } + + /** + * Test rollback of various table operations sync to Metadata Table correctly. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testRollbackOperations(HoodieTableType tableType) throws Exception { + init(tableType); + doWriteInsertAndUpsert(testTable); + + // trigger an upsert + doWriteOperation(testTable, "0000003", UPSERT); + + // trigger a commit and rollback + doWriteOperation(testTable, "0000004"); + doRollbackAndValidate(testTable, "0000004", "0000005"); + + // trigger few upserts and validate + for (int i = 6; i < 10; i++) { + doWriteOperation(testTable, "000000" + i); + } + validateMetadata(testTable); + + doWriteOperation(testTable, "0000010"); + + // rollback last commit. and validate. 
+ doRollbackAndValidate(testTable, "0000010", "0000011"); + + // rollback of compaction + if (MERGE_ON_READ.equals(tableType)) { + doCompactionAndValidate(testTable, "0000012"); + doRollbackAndValidate(testTable, "0000012", "0000013"); + } + + // roll back of delete + doWriteOperation(testTable, "0000014", DELETE); + doRollbackAndValidate(testTable, "0000014", "0000015"); + + // rollback partial commit + writeConfig = getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build(); + doWriteOperation(testTable, "0000016"); + testTable.doRollback("0000016", "0000017"); + validateMetadata(testTable); + + // marker-based rollback of partial commit + writeConfig = getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(true).build(); + doWriteOperation(testTable, "0000018"); + testTable.doRollback("0000018", "0000019"); + validateMetadata(testTable, true); + } + + @Test + public void testRollbackOperationsNonPartitioned() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType); + doWriteInsertAndUpsertNonPartitioned(testTable); + + // trigger an upsert + doWriteOperationNonPartitioned(testTable, "0000003", UPSERT); + + // trigger a commit and rollback + doWriteOperationNonPartitioned(testTable, "0000004", UPSERT); + doRollback(testTable, "0000004", "0000005"); + validateMetadata(testTable); + + // trigger few upserts and validate + for (int i = 6; i < 10; i++) { + doWriteOperationNonPartitioned(testTable, "000000" + i, UPSERT); + } + validateMetadata(testTable); + } + + /** + * Test that manual rollbacks work correctly and enough timeline history is maintained on the metadata table + * timeline. + */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testManualRollbacks(final boolean populateMateFields) throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + // Setting to archive more aggressively on the Metadata Table than the Dataset + final int maxDeltaCommitsBeforeCompaction = 4; + final int minArchiveCommitsMetadata = 2; + final int minArchiveCommitsDataset = 4; + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true) + .archiveCommitsWith(minArchiveCommitsMetadata, minArchiveCommitsMetadata + 1).retainCommits(1) + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction) + .withPopulateMetaFields(populateMateFields) + .build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .retainCommits(1) + .retainFileVersions(1) + .withAutoClean(false) + .withAsyncClean(true) + .build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .archiveCommitsWith(minArchiveCommitsDataset, minArchiveCommitsDataset + 1) + .build()) + .build(); + + initWriteConfigAndMetatableWriter(writeConfig, true); + doWriteInsertAndUpsert(testTable, "000001", "000002", false); + + for (int i = 3; i < 10; i++) { + doWriteOperation(testTable, "00000" + i); + archiveDataTable(writeConfig, metaClient); + } + validateMetadata(testTable); + + // We can only rollback those commits whose deltacommit have not been archived yet. 
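+ // (the loop below walks the active timeline from the latest instant backwards; a HoodieMetadataException would indicate the corresponding metadata deltacommit was archived too early, which the assertion afterwards treats as a failure.)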
+ int numRollbacks = 0; + boolean exceptionRaised = false; + List allInstants = metaClient.reloadActiveTimeline().getCommitsTimeline().getReverseOrderedInstants().collect(Collectors.toList()); + for (HoodieInstant instantToRollback : allInstants) { + try { + testTable.doRollback(instantToRollback.getTimestamp(), String.valueOf(Time.now())); + validateMetadata(testTable); + ++numRollbacks; + } catch (HoodieMetadataException e) { + exceptionRaised = true; + break; + } + } + + assertFalse(exceptionRaised, "Metadata table should not archive instants that are in dataset active timeline"); + // Since each rollback also creates a deltacommit, we can only support rolling back of half of the original + // instants present before rollback started. + assertTrue(numRollbacks >= Math.max(minArchiveCommitsDataset, minArchiveCommitsMetadata) / 2, + "Rollbacks of non archived instants should work"); + } + + /** + * Test sync of table operations. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testSync(HoodieTableType tableType) throws Exception { + init(tableType, false); + // Initial commits without metadata table enabled + writeConfig = getWriteConfigBuilder(true, false, false).build(); + doPreBootstrapOperations(testTable, "00000001", "00000002"); + + // Enable metadata table so it initialized by listing from file system + writeConfig = getWriteConfigBuilder(true, true, false).build(); + + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + + doWriteOperation(testTable, "00000003", INSERT); + doWriteOperation(testTable, "00000004", UPSERT); + doWriteOperation(testTable, "00000005", UPSERT); + + // trigger compaction + if (MERGE_ON_READ.equals(tableType)) { + doCompactionAndValidate(testTable, "00000006"); + } + + // trigger an upsert + doWriteOperation(testTable, "00000008"); + // trigger delete + doWriteOperation(testTable, "00000009", DELETE); + // trigger clean + doCleanAndValidate(testTable, "00000010", asList("00000003", "00000004")); + // trigger another upsert + doWriteOperation(testTable, "00000011"); + // trigger clustering + doClusterAndValidate(testTable, "00000012"); + + // If there is an inflight operation, the Metadata Table is not updated beyond that operations but the + // in-memory merge should consider all the completed operations. + HoodieCommitMetadata inflightCommitMeta = testTable.doWriteOperation("00000007", UPSERT, emptyList(), + asList("p1", "p2"), 2, false, true); + // trigger upsert + doWriteOperation(testTable, "00000013"); + // testTable validation will fetch only files pertaining to completed commits. So, validateMetadata() will skip files for 007 + // while validating against actual metadata table. + validateMetadata(testTable, singletonList("00000007")); + + // Remove the inflight instance holding back table sync + testTable.moveInflightCommitToComplete("00000007", inflightCommitMeta); + validateMetadata(testTable); + // A regular commit should get synced + doWriteOperation(testTable, "00000014"); + validateMetadata(testTable, emptyList(), true); + } + + /** + * Fetches next commit time in seconds from current one. + * + * @param curCommitTime current commit time. + * @return the next valid commit time. 
+ */ + private Long getNextCommitTime(long curCommitTime) { + if ((curCommitTime + 1) % 1000000000000L >= 60) { // max seconds is 60 and hence + return Long.parseLong(HoodieActiveTimeline.createNewInstantTime()); + } else { + return curCommitTime + 1; + } + } + + @ParameterizedTest + @MethodSource("tableTypeAndEnableOperationArgs") + public void testMetadataBootstrapLargeCommitList(HoodieTableType tableType, boolean nonPartitionedDataset) throws Exception { + init(tableType, true, true, true, false); + long baseCommitTime = Long.parseLong(HoodieActiveTimeline.createNewInstantTime()); + for (int i = 1; i < 25; i += 7) { + long commitTime1 = getNextCommitTime(baseCommitTime); + long commitTime2 = getNextCommitTime(commitTime1); + long commitTime3 = getNextCommitTime(commitTime2); + long commitTime4 = getNextCommitTime(commitTime3); + long commitTime5 = getNextCommitTime(commitTime4); + long commitTime6 = getNextCommitTime(commitTime5); + long commitTime7 = getNextCommitTime(commitTime6); + baseCommitTime = commitTime7; + doWriteOperation(testTable, Long.toString(commitTime1), INSERT, nonPartitionedDataset); + doWriteOperation(testTable, Long.toString(commitTime2), UPSERT, nonPartitionedDataset); + doClean(testTable, Long.toString(commitTime3), Arrays.asList(Long.toString(commitTime1))); + doWriteOperation(testTable, Long.toString(commitTime4), UPSERT, nonPartitionedDataset); + if (tableType == MERGE_ON_READ) { + doCompaction(testTable, Long.toString(commitTime5), nonPartitionedDataset); + } + doWriteOperation(testTable, Long.toString(commitTime6), UPSERT, nonPartitionedDataset); + doRollback(testTable, Long.toString(commitTime6), Long.toString(commitTime7)); + } + validateMetadata(testTable, emptyList(), nonPartitionedDataset); + } + + // Some operations are not feasible with test table infra. hence using write client to test those cases. + + /** + * Rollback of the first commit should not trigger bootstrap errors at the metadata table. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testFirstCommitRollback(HoodieTableType tableType) throws Exception { + init(tableType); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, + getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build())) { + + // Write 1 + String commitTime = "0000001"; + List records = dataGen.generateInserts(commitTime, 20); + client.startCommitWithTime(commitTime); + List writeStatuses = client.insert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // Rollback the first commit + client.rollback(commitTime); + + // Write 2 + commitTime = "0000002"; + records = dataGen.generateInserts(commitTime, 10); + client.startCommitWithTime(commitTime); + writeStatuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + } + } + + /** + * Tests the metadata payload spurious deletes. + * Lets say a commit was applied to metadata table, and later was explicitly got rolledback. Due to spark task failures, there could be more files in rollback + * metadata when compared to the original commit metadata. When payload consistency check is enabled, it will throw exception. If not, it will succeed. 
+ * + * @throws Exception + */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMetadataPayloadSpuriousDeletes(boolean ignoreSpuriousDeletes) throws Exception { + tableType = COPY_ON_WRITE; + init(tableType, true, true, false, ignoreSpuriousDeletes); + doWriteInsertAndUpsert(testTable); + // trigger an upsert + doWriteOperationAndValidate(testTable, "0000003"); + + // trigger a commit and rollback + doWriteOperation(testTable, "0000004"); + // add extra files in rollback to check for payload consistency + Map> extraFiles = new HashMap<>(); + extraFiles.put("p1", Collections.singletonList("f10")); + extraFiles.put("p2", Collections.singletonList("f12")); + testTable.doRollbackWithExtraFiles("0000004", "0000005", extraFiles); + if (!ignoreSpuriousDeletes) { + assertThrows(HoodieMetadataException.class, () -> validateMetadata(testTable)); + } else { + validateMetadata(testTable); + } + } + + /** + * Test several table operations with restore. This test uses SparkRDDWriteClient. + * Once the restore support is ready in HoodieTestTable, then rewrite this test. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testTableOperationsWithRestore(HoodieTableType tableType) throws Exception { + init(tableType); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withRollbackUsingMarkers(false).build(); + testTableOperationsImpl(engineContext, writeConfig); + } + + @Test + public void testColStatsPrefixLookup() throws IOException { + this.tableType = COPY_ON_WRITE; + initPath(); + initSparkContexts("TestHoodieMetadata"); + initFileSystem(); + fs.mkdirs(new Path(basePath)); + initTimelineService(); + initMetaClient(tableType); + initTestDataGenerator(); + metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + // disable small file handling so that every insert goes to a new file group. 
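+ // (compactionSmallFileSize(0) in the compaction config below is what turns small-file handling off here.)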
+ HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withRollbackUsingMarkers(false) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .withAutoClean(false).retainCommits(1).retainFileVersions(1) + .build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(0) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withMetadataIndexColumnStats(true) + .enableFullScan(false) + .build()) + .build(); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + + String firstCommit = "0000001"; + List records = dataGen.generateInserts(firstCommit, 20); + + AtomicInteger counter = new AtomicInteger(); + List processedRecords = records.stream().map(entry -> + new HoodieAvroRecord(new HoodieKey("key1_" + counter.getAndIncrement(), entry.getPartitionPath()), (HoodieRecordPayload) entry.getData())) + .collect(Collectors.toList()); + + client.startCommitWithTime(firstCommit); + List writeStatuses = client.insert(jsc.parallelize(processedRecords, 1), firstCommit).collect(); + assertNoWriteErrors(writeStatuses); + + // Write 2 (inserts) + String secondCommit = "0000002"; + client.startCommitWithTime(secondCommit); + records = dataGen.generateInserts(secondCommit, 20); + AtomicInteger counter1 = new AtomicInteger(); + processedRecords = records.stream().map(entry -> + new HoodieAvroRecord(new HoodieKey("key2_" + counter1.getAndIncrement(), entry.getPartitionPath()), (HoodieRecordPayload) entry.getData())) + .collect(Collectors.toList()); + writeStatuses = client.insert(jsc.parallelize(processedRecords, 1), secondCommit).collect(); + assertNoWriteErrors(writeStatuses); + + Map>> commitToPartitionsToFiles = new HashMap<>(); + // populate commit -> partition -> file info to assist in validation and prefi + metaClient.getActiveTimeline().getInstants().forEach(entry -> { + try { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(entry).get(), HoodieCommitMetadata.class); + String commitTime = entry.getTimestamp(); + if (!commitToPartitionsToFiles.containsKey(commitTime)) { + commitToPartitionsToFiles.put(commitTime, new HashMap<>()); + } + commitMetadata.getPartitionToWriteStats().entrySet() + .stream() + .forEach(partitionWriteStat -> { + String partitionStatName = partitionWriteStat.getKey(); + List writeStats = partitionWriteStat.getValue(); + String partition = HoodieTableMetadataUtil.getPartitionIdentifier(partitionStatName); + if (!commitToPartitionsToFiles.get(commitTime).containsKey(partition)) { + commitToPartitionsToFiles.get(commitTime).put(partition, new ArrayList<>()); + } + writeStats.forEach(writeStat -> commitToPartitionsToFiles.get(commitTime).get(partition).add(writeStat.getPath())); + }); + } catch (IOException e) { + e.printStackTrace(); + } + }); + + HoodieTableMetadata tableMetadata = metadata(client); + // prefix search for column (_hoodie_record_key) + ColumnIndexID columnIndexID = new ColumnIndexID(HoodieRecord.RECORD_KEY_METADATA_FIELD); + List> result = tableMetadata.getRecordsByKeyPrefixes(Collections.singletonList(columnIndexID.asBase64EncodedString()), + MetadataPartitionType.COLUMN_STATS.getPartitionPath(), true).collectAsList(); + + // there are 3 partitions in total and 2 commits. total entries should be 6. 
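+ // (with small files disabled, each commit presumably writes one new file per default partition path, so the _hoodie_record_key column yields one column_stats entry per file: 3 partitions x 2 commits = 6.)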
+ assertEquals(result.size(), 6); + result.forEach(entry -> { + //LOG.warn("Prefix search entries just for record key col : " + entry.getRecordKey().toString() + " :: " + entry.getData().getColumnStatMetadata().get().toString()); + }); + + // prefix search for col(_hoodie_record_key) and first partition. only 2 files should be matched + PartitionIndexID partitionIndexID = new PartitionIndexID(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + result = tableMetadata.getRecordsByKeyPrefixes(Collections.singletonList(columnIndexID.asBase64EncodedString().concat(partitionIndexID.asBase64EncodedString())), + MetadataPartitionType.COLUMN_STATS.getPartitionPath(), true).collectAsList(); + // 1 partition and 2 commits. total entries should be 2. + assertEquals(result.size(), 2); + result.forEach(entry -> { + // LOG.warn("Prefix search entries for record key col and first partition : " + entry.getRecordKey().toString() + " :: " + entry.getData().getColumnStatMetadata().get().toString()); + HoodieMetadataColumnStats metadataColumnStats = entry.getData().getColumnStatMetadata().get(); + String fileName = metadataColumnStats.getFileName(); + if (fileName.contains(firstCommit)) { + assertTrue(commitToPartitionsToFiles.get(firstCommit).get(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + .contains(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "/" + fileName)); + } else { + assertTrue(commitToPartitionsToFiles.get(secondCommit).get(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + .contains(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "/" + fileName)); + } + }); + + // prefix search for column {commit time} and first partition + columnIndexID = new ColumnIndexID(HoodieRecord.COMMIT_TIME_METADATA_FIELD); + result = tableMetadata.getRecordsByKeyPrefixes(Collections.singletonList(columnIndexID.asBase64EncodedString().concat(partitionIndexID.asBase64EncodedString())), + MetadataPartitionType.COLUMN_STATS.getPartitionPath(), true).collectAsList(); + + // 1 partition and 2 commits. total entries should be 2. + assertEquals(result.size(), 2); + result.forEach(entry -> { + // LOG.warn("Prefix search entries for record key col and first partition : " + entry.getRecordKey().toString() + " :: " + entry.getData().getColumnStatMetadata().get().toString()); + HoodieMetadataColumnStats metadataColumnStats = entry.getData().getColumnStatMetadata().get(); + // for commit time column, min max should be the same since we disable small files, every commit will create a new file + assertEquals(metadataColumnStats.getMinValue(), metadataColumnStats.getMaxValue()); + String fileName = metadataColumnStats.getFileName(); + if (fileName.contains(firstCommit)) { + assertTrue(commitToPartitionsToFiles.get(firstCommit).get(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + .contains(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "/" + fileName)); + } else { + assertTrue(commitToPartitionsToFiles.get(secondCommit).get(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + .contains(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "/" + fileName)); + } + }); + } + } + + /** + * Test all major table operations with the given table, config and context. 
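+   * Covers bulk insert, insert, upsert, compaction (for MOR tables), delete, clean and restore, validating the
+   * metadata table after each step.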
+ * + * @param engineContext - Engine context + * @param writeConfig - Write config + * @throws IOException + */ + private void testTableOperationsImpl(HoodieSparkEngineContext engineContext, HoodieWriteConfig writeConfig) throws IOException { + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + // Write 1 (Bulk insert) + String newCommitTime = "0000001"; + List records = dataGen.generateInserts(newCommitTime, 20); + client.startCommitWithTime(newCommitTime); + List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // Write 2 (inserts) + newCommitTime = "0000002"; + client.startCommitWithTime(newCommitTime); + validateMetadata(client); + + records = dataGen.generateInserts(newCommitTime, 20); + writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // Write 3 (updates) + newCommitTime = "0000003"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUniqueUpdates(newCommitTime, 10); + writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + + // Write 4 (updates and inserts) + newCommitTime = "0000004"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, 10); + writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // Compaction + if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { + newCommitTime = "0000005"; + client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); + client.compact(newCommitTime); + validateMetadata(client); + } + + // Write 5 (updates and inserts) + newCommitTime = "0000006"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, 5); + writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + + // Compaction + if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) { + newCommitTime = "0000007"; + client.scheduleCompactionAtInstant(newCommitTime, Option.empty()); + client.compact(newCommitTime); + validateMetadata(client); + } + + // Deletes + newCommitTime = "0000009"; + records = dataGen.generateDeletes(newCommitTime, 10); + JavaRDD deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey()); + client.startCommitWithTime(newCommitTime); + client.delete(deleteKeys, newCommitTime); + + // Clean + newCommitTime = "0000009"; + client.clean(newCommitTime); + validateMetadata(client); + + // Restore + client.restoreToInstant("0000006", writeConfig.isMetadataTableEnabled()); + validateMetadata(client); + } + } + + /** + * Test multi-writer on metadata table with optimistic concurrency. 
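+   * Several writers insert into separate partitions concurrently under an in-process lock provider, and every
+   * commit must end up synced to the metadata table exactly once.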
+ */ + @Test + public void testMetadataMultiWriter() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "1000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "20"); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build()) + .withProperties(properties) + .build(); + + ExecutorService executors = Executors.newFixedThreadPool(dataGen.getPartitionPaths().length); + // Create clients in advance + SparkRDDWriteClient[] writeClients = new SparkRDDWriteClient[dataGen.getPartitionPaths().length]; + for (int i = 0; i < dataGen.getPartitionPaths().length; i++) { + writeClients[i] = new SparkRDDWriteClient(engineContext, writeConfig); + } + + // Parallel commits for separate partitions + List futures = new LinkedList<>(); + for (int i = 0; i < dataGen.getPartitionPaths().length; ++i) { + final int index = i; + String newCommitTime = "000000" + (index + 1); + Future future = executors.submit(() -> { + List records = dataGen.generateInsertsForPartition(newCommitTime, 100, dataGen.getPartitionPaths()[index]); + SparkRDDWriteClient writeClient = writeClients[index]; + writeClient.startCommitWithTime(newCommitTime); + List writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + }); + futures.add(future); + } + + // Wait for all commits to complete + for (Future future : futures) { + future.get(); + } + + // Ensure all commits were synced to the Metadata Table + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + assertEquals(metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().countInstants(), 4); + assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000001"))); + assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000002"))); + assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000003"))); + + // Compaction may occur if the commits completed in order + assertTrue(metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants() <= 1); + + // Validation + validateMetadata(writeClients[0]); + } + + /** + * Tests that when inline cleaning is enabled and with auto commit set to true, there is no double locking. + * bcoz, auto clean is triggered within post commit which is already happening within a lock. 
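+   * In other words, the inline clean runs inside the post-commit hook, which already holds the transaction lock,
+   * so it must not attempt to acquire the lock a second time.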
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testMultiWriterForDoubleLocking() throws Exception {
+    init(HoodieTableType.COPY_ON_WRITE);
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+
+    Properties properties = new Properties();
+    properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
+    properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
+
+    HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false)
+        .withCleanConfig(HoodieCleanConfig.newBuilder()
+            .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(true).retainCommits(4)
+            .build())
+        .withAutoCommit(false)
+        .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL)
+        .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build())
+        .withProperties(properties)
+        .build();
+
+    SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig);
+    String partitionPath = dataGen.getPartitionPaths()[0];
+    for (int j = 0; j < 6; j++) {
+      String newCommitTime = "000000" + j;
+      List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 100, partitionPath);
+      writeClient.startCommitWithTime(newCommitTime);
+      JavaRDD<WriteStatus> writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime);
+      writeClient.commit(newCommitTime, writeStatuses);
+    }
+
+    // Ensure all commits were synced to the Metadata Table
+    HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
+    LOG.warn("total commits in metadata table " + metadataMetaClient.getActiveTimeline().getCommitsTimeline().countInstants());
+
+    // 6 commits and 2 cleaner commits.
+    assertEquals(metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().countInstants(), 8);
+    assertTrue(metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants() <= 1);
+    // Validation
+    validateMetadata(writeClient);
+  }
+
+  /**
+   * Say a clustering commit succeeded in the metadata table, but failed before committing to the data table.
+   * The next time clustering kicks in, Hudi will roll back the pending clustering (in the data table) and re-attempt
+   * the clustering with the same instant time. So, this test ensures the 2nd attempt succeeds with metadata enabled.
+   * This is applicable to any table service where the instant time is fixed: however many times the operation fails,
+   * the re-attempt will be made with the same commit time.
+   * The test uses clustering to exercise the scenario.
+ */ + @Test + public void testReattemptOfFailedClusteringCommit() throws Exception { + tableType = HoodieTableType.COPY_ON_WRITE; + init(tableType); + context = new HoodieSparkEngineContext(jsc); + HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false); + SparkRDDWriteClient client = getHoodieWriteClient(config); + + // Write 1 (Bulk insert) + String newCommitTime = "0000001"; + List records = dataGen.generateInserts(newCommitTime, 20); + client.startCommitWithTime(newCommitTime); + List writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // Write 2 (inserts) + newCommitTime = "0000002"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 20); + writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // setup clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringSortColumns("_row_key").withInlineClustering(true) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build(); + + HoodieWriteConfig newWriteConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER) + .withAutoCommit(false) + .withClusteringConfig(clusteringConfig).build(); + + // trigger clustering + SparkRDDWriteClient newClient = getHoodieWriteClient(newWriteConfig); + String clusteringCommitTime = newClient.scheduleClustering(Option.empty()).get().toString(); + HoodieWriteMetadata> clusterMetadata = newClient.cluster(clusteringCommitTime, true); + + // collect replaceFileIds for validation later. + Set replacedFileIds = new HashSet<>(); + clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> + partitionFiles.getValue().stream().forEach(file -> + replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file)))); + + // trigger new write to mimic other writes succeeding before re-attempt. + newCommitTime = "0000003"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 20); + writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // manually remove clustering completed instant from .hoodie folder and to mimic succeeded clustering in metadata table, but failed in data table. 
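+    // Deleting only the replacecommit file from the data table timeline leaves the metadata table untouched,
+    // so the re-attempted cluster() call below must converge to the same set of replaced file groups.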
+    FileCreateUtils.deleteReplaceCommit(basePath, clusteringCommitTime);
+    HoodieWriteMetadata<JavaRDD<WriteStatus>> updatedClusterMetadata = newClient.cluster(clusteringCommitTime, true);
+
+    metaClient.reloadActiveTimeline();
+    Set<HoodieFileGroupId> updatedReplacedFileIds = new HashSet<>();
+    updatedClusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles ->
+        partitionFiles.getValue().stream().forEach(file ->
+            updatedReplacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
+    assertEquals(replacedFileIds, updatedReplacedFileIds);
+    validateMetadata(client);
+  }
+
+  @Test
+  public void testMetadataReadWithNoCompletedCommits() throws Exception {
+    init(HoodieTableType.COPY_ON_WRITE);
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+
+    List<HoodieRecord> records;
+    List<WriteStatus> writeStatuses;
+    String[] commitTimestamps = {HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime()};
+
+    try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+      records = dataGen.generateInserts(commitTimestamps[0], 5);
+      client.startCommitWithTime(commitTimestamps[0]);
+      writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamps[0]).collect();
+      assertNoWriteErrors(writeStatuses);
+
+      // Make all commits inflight in the metadata table. Reads should still go through; they just may not return any data.
+      FileCreateUtils.deleteDeltaCommit(basePath + "/.hoodie/metadata/", commitTimestamps[0]);
+      FileCreateUtils.deleteDeltaCommit(basePath + "/.hoodie/metadata/", HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP);
+      assertEquals(getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()).size(), 0);
+    }
+  }
+
+  /**
+   * Ensure that the reader only reads completed instants.
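+   * Files added by an instant should disappear from metadata-based listings while that instant's completed file
+   * is removed, and reappear once it is restored.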
+ * + * @throws IOException + */ + @Test + public void testReader() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + List records; + List writeStatuses; + String[] commitTimestamps = {HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime(), + HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime()}; + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { + for (int i = 0; i < commitTimestamps.length; ++i) { + records = dataGen.generateInserts(commitTimestamps[i], 5); + client.startCommitWithTime(commitTimestamps[i]); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamps[i]).collect(); + assertNoWriteErrors(writeStatuses); + } + + // Ensure we can see files from each commit + Set timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), commitTimestamps.length); + for (int i = 0; i < commitTimestamps.length; ++i) { + assertTrue(timelineTimestamps.contains(commitTimestamps[i])); + } + + // mark each commit as incomplete and ensure files are not seen + for (int i = 0; i < commitTimestamps.length; ++i) { + FileCreateUtils.deleteCommit(basePath, commitTimestamps[i]); + timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), commitTimestamps.length - 1); + for (int j = 0; j < commitTimestamps.length; ++j) { + assertTrue(j == i || timelineTimestamps.contains(commitTimestamps[j])); + } + FileCreateUtils.createCommit(basePath, commitTimestamps[i]); + } + + // Test multiple incomplete commits + FileCreateUtils.deleteCommit(basePath, commitTimestamps[0]); + FileCreateUtils.deleteCommit(basePath, commitTimestamps[2]); + timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), commitTimestamps.length - 2); + for (int j = 0; j < commitTimestamps.length; ++j) { + assertTrue(j == 0 || j == 2 || timelineTimestamps.contains(commitTimestamps[j])); + } + + // Test no completed commits + for (int i = 0; i < commitTimestamps.length; ++i) { + FileCreateUtils.deleteCommit(basePath, commitTimestamps[i]); + } + timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet()); + assertEquals(timelineTimestamps.size(), 0); + } + } + + /** + * Instants on Metadata Table should be archived as per config but we always keep atlest the number of instants + * as on the dataset. + *

    + * Metadata Table should be automatically compacted as per config. + */ + @Disabled + public void testCleaningArchivingAndCompaction() throws Exception { + init(HoodieTableType.COPY_ON_WRITE, false); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + final int maxDeltaCommitsBeforeCompaction = 3; + HoodieWriteConfig config = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true) + .archiveCommitsWith(40, 60).retainCommits(1) + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER) + .retainCommits(1).retainFileVersions(1).withAutoClean(true).withAsyncClean(false) + .build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .archiveCommitsWith(2, 4).build()) + .build(); + + List records; + String newCommitTime; + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, config)) { + // Some initial commits so compaction is not triggered. + // 1 deltacommit will be from bootstrap. So we can perform maxDeltaCommitsBeforeCompaction - 2 more commits before + // compaction will be attempted. + for (int i = 0; i < maxDeltaCommitsBeforeCompaction - 2; ++i) { + newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + records = dataGen.generateInserts(newCommitTime, 5); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + } + + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(config.getBasePath()).build(); + + // There should not be any compaction yet and we have not performed more than maxDeltaCommitsBeforeCompaction + // deltacommits (1 will be due to bootstrap) + HoodieActiveTimeline metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 0); + assertEquals(metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants(), maxDeltaCommitsBeforeCompaction - 1); + assertEquals(datasetMetaClient.getArchivedTimeline().reload().countInstants(), 0); + + // Next commit will initiate a compaction + newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + records = dataGen.generateInserts(newCommitTime, 5); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 1); + assertEquals(metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants(), maxDeltaCommitsBeforeCompaction + 1); + assertEquals(datasetMetaClient.getArchivedTimeline().reload().countInstants(), 0); + + // More than maxDeltaCommitsBeforeCompaction commits + String inflightCommitTime = newCommitTime; + for (int i = 0; i < maxDeltaCommitsBeforeCompaction + 1; ++i) { + newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + records = dataGen.generateInserts(newCommitTime, 5); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + if (i == 0) { + // Mark this commit inflight so compactions dont take 
place + FileCreateUtils.deleteCommit(basePath, newCommitTime); + FileCreateUtils.createInflightCommit(basePath, newCommitTime); + inflightCommitTime = newCommitTime; + } + } + + // Ensure no more compactions took place due to the leftover inflight commit + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 1); + assertEquals(metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants(), + ((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction /* clean from dataset */) + 1)/* clean in metadata table */); + + // Complete commit + FileCreateUtils.createCommit(basePath, inflightCommitTime); + + // Next commit should lead to compaction + newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + records = dataGen.generateInserts(newCommitTime, 5); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + + // Ensure compactions took place + metadataTimeline = metadataMetaClient.reloadActiveTimeline(); + assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 2); + assertEquals(metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants(), + ((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction + 1 /* clean from dataset */) + 2 /* clean in metadata table */)); + assertTrue(datasetMetaClient.getArchivedTimeline().reload().countInstants() > 0); + + validateMetadata(client); + } + } + + @Test + public void testUpgradeDowngrade() throws IOException { + init(HoodieTableType.COPY_ON_WRITE, false); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + // Perform a commit. This should bootstrap the metadata table with latest version. 
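+    // The table version is then manually set back to TWO in hoodie.properties, so the next commit exercises the
+    // automatic upgrade path, which deletes and re-bootstraps the metadata table.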
+ List records; + List writeStatuses; + String commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + HoodieWriteConfig writeConfig = getWriteConfig(true, true); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect(); + assertNoWriteErrors(writeStatuses); + } + + // Metadata table should have been bootstrapped + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus oldStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + + // set hoodie.table.version to 2 in hoodie.properties file + changeTableVersion(HoodieTableVersion.TWO); + + // With next commit the table should be deleted (as part of upgrade) and then re-bootstrapped automatically + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + metaClient.reloadActiveTimeline(); + FileStatus prevStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect(); + assertNoWriteErrors(writeStatuses); + } + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus currentStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + assertTrue(currentStatus.getModificationTime() > prevStatus.getModificationTime()); + + initMetaClient(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.FIVE.versionCode()); + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime()); + + // Test downgrade by running the downgrader + new UpgradeDowngrade(metaClient, writeConfig, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.TWO, null); + + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.TWO.versionCode()); + assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not exist"); + } + + /** + * When table needs to be upgraded and when multi writer is enabled, hudi rollsback partial commits. Upgrade itself is happening + * within a lock and hence rollback should not lock again. + * + * @throws IOException + * @throws InterruptedException + */ + @Test + public void testRollbackDuringUpgradeForDoubleLocking() throws IOException, InterruptedException { + init(HoodieTableType.COPY_ON_WRITE, false); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + // Perform a commit. This should bootstrap the metadata table with latest version. 
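+    // This writer runs with optimistic concurrency control and an in-process lock provider; the upgrade performed
+    // by a later writer happens under that lock, and the rollback of the partial commit must not try to lock again.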
+ List records; + JavaRDD writeStatuses; + String commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "3"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(false, true, false) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build()) + .withProperties(properties) + .build(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp); + client.commit(commitTimestamp, writeStatuses); + } + + // Metadata table should have been bootstrapped + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus oldStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + + // trigger partial commit + metaClient.reloadActiveTimeline(); + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp); + } + + // set hoodie.table.version to 2 in hoodie.properties file + changeTableVersion(HoodieTableVersion.TWO); + writeConfig = getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build()) + .withProperties(properties) + .withEmbeddedTimelineServerEnabled(false) + .build(); + + // With next commit the table should be re-bootstrapped and partial commit should be rolled back. + metaClient.reloadActiveTimeline(); + commitTimestamp = HoodieActiveTimeline.createNewInstantTime(); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { + records = dataGen.generateInserts(commitTimestamp, 5); + client.startCommitWithTime(commitTimestamp); + writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp); + assertNoWriteErrors(writeStatuses.collect()); + } + + initMetaClient(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.FIVE.versionCode()); + assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); + FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath)); + assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime()); + } + + /** + * Tests rollback of a commit which has new partitions which is not present in hudi table prior to the commit being rolledback. 
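+   * The metadata table must remain consistent with the data table after such a rollback.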
+ * + * @throws Exception + */ + @Test + public void testRollbackOfPartiallyFailedCommitWithNewPartitions() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, + getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, true, false, true, false, false).build(), + true)) { + String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInserts(newCommitTime, 10); + List upsertRecords = new ArrayList<>(); + for (HoodieRecord entry : records) { + if (entry.getPartitionPath().equals(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + || entry.getPartitionPath().equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)) { + upsertRecords.add(entry); + } + } + List writeStatuses = client.upsert(jsc.parallelize(upsertRecords, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + client.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 20); + writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // There is no way to simulate failed commit on the main dataset, hence we simply delete the completed + // instant so that only the inflight is left over. + String commitInstantFileName = HoodieTimeline.makeCommitFileName(newCommitTime); + assertTrue(fs.delete(new Path(basePath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME, + commitInstantFileName), false)); + } + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, + getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, true, false, true, false, false).build(), + true)) { + String newCommitTime = client.startCommit(); + // Next insert + List records = dataGen.generateInserts(newCommitTime, 20); + List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + } + } + + @Test + public void testDeletePartitions() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + + int maxCommits = 1; + HoodieWriteConfig cfg = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()) + .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) + .build(); + + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInserts(newCommitTime, 10); + List upsertRecords = new ArrayList<>(); + for (HoodieRecord entry : records) { + if (entry.getPartitionPath().equals(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) + || entry.getPartitionPath().equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)) { + upsertRecords.add(entry); + } + } + List 
writeStatuses = client.upsert(jsc.parallelize(upsertRecords, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // delete partitions + newCommitTime = HoodieActiveTimeline.createNewInstantTime(5000); + client.startCommitWithTime(newCommitTime); + client.deletePartitions(singletonList(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH), newCommitTime); + + // add 1 more commit + newCommitTime = HoodieActiveTimeline.createNewInstantTime(5000); + client.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 10); + upsertRecords = new ArrayList<>(); + for (HoodieRecord entry : records) { + if (entry.getPartitionPath().equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)) { + upsertRecords.add(entry); + } + } + writeStatuses = client.upsert(jsc.parallelize(upsertRecords, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + + // trigger clean which will actually trigger deletion of the partition + newCommitTime = HoodieActiveTimeline.createNewInstantTime(5000); + HoodieCleanMetadata cleanMetadata = client.clean(newCommitTime); + validateMetadata(client); + assertEquals(1, metadata(client).getAllPartitionPaths().size()); + } + } + + /** + * Test various error scenarios. + */ + @Test + public void testErrorCases() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + // TESTCASE: If commit on the metadata table succeeds but fails on the dataset, then on next init the metadata table + // should be rolled back to last valid commit. + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, + getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, true, false, true, false, false).build(), + true)) { + String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInserts(newCommitTime, 10); + List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + client.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 5); + writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + // There is no way to simulate failed commit on the main dataset, hence we simply delete the completed + // instant so that only the inflight is left over. 
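+      // With the EAGER failed-writes cleaning policy used here, the next writer treats the leftover inflight
+      // instant as a failed commit and rolls it back before proceeding.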
+ String commitInstantFileName = HoodieTimeline.makeCommitFileName(newCommitTime); + assertTrue(fs.delete(new Path(basePath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME, + commitInstantFileName), false)); + } + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, + getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, true, false, true, false, false).build(), + true)) { + String newCommitTime = client.startCommit(); + // Next insert + List records = dataGen.generateInserts(newCommitTime, 5); + List writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + + // Post rollback commit and metadata should be valid + validateMetadata(client); + } + } + + @Test + public void testNonPartitioned() throws Exception { + init(HoodieTableType.COPY_ON_WRITE, false); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + HoodieTestDataGenerator nonPartitionedGenerator = new HoodieTestDataGenerator(new String[] {""}); + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) { + // Write 1 (Bulk insert) + String newCommitTime = "0000001"; + List records = nonPartitionedGenerator.generateInserts(newCommitTime, 10); + client.startCommitWithTime(newCommitTime); + List writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect(); + validateMetadata(client); + + List metadataPartitions = metadata(client).getAllPartitionPaths(); + assertTrue(metadataPartitions.contains(""), "Must contain empty partition"); + } + } + + /** + * Test various metrics published by metadata table. + */ + @Test + public void testMetadataMetrics() throws Exception { + init(HoodieTableType.COPY_ON_WRITE, false); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true).build())) { + // Write + String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + List records = dataGen.generateInserts(newCommitTime, 20); + client.startCommitWithTime(newCommitTime); + List writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + assertNoWriteErrors(writeStatuses); + validateMetadata(client); + + Registry metricsRegistry = Registry.getRegistry("HoodieMetadata"); + assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".count")); + assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".totalDuration")); + assertTrue(metricsRegistry.getAllCounts().get(HoodieMetadataMetrics.INITIALIZE_STR + ".count") >= 1L); + final String prefix = FILES.getPartitionPath() + "."; + assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_BASE_FILES)); + assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_LOG_FILES)); + assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE)); + assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE)); + } + } + + private void doPreBootstrapOperations(HoodieTestTable testTable) throws Exception { + doPreBootstrapOperations(testTable, "0000001", "0000002"); + } + + private void doPreBootstrapOperations(HoodieTestTable testTable, String commit1, String commit2) throws Exception { + 
testTable.doWriteOperation(commit1, INSERT, asList("p1", "p2"), asList("p1", "p2"), + 2, true); + testTable.doWriteOperation(commit2, UPSERT, asList("p1", "p2"), + 2, true); + validateMetadata(testTable); + } + + private void doWriteInsertAndUpsertNonPartitioned(HoodieTestTable testTable) throws Exception { + doWriteInsertAndUpsert(testTable, "0000001", "0000002", true); + } + + private void doWriteInsertAndUpsert(HoodieTestTable testTable) throws Exception { + doWriteInsertAndUpsert(testTable, "0000001", "0000002", false); + } + + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts) { + HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER); + return builder.withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .compactionSmallFileSize(smallFileSize) + // Set rollback to LAZY so no inflights are deleted + .insertSplitSize(insertSplitSize).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).build()) + .withStorageConfig( + HoodieStorageConfig.newBuilder() + .hfileMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)) + .parquetMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)).build()) + .withMergeAllowDuplicateOnInserts(mergeAllowDuplicateInserts) + .build(); + } + + public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr, HoodieIndex.IndexType indexType, + HoodieFailedWritesCleaningPolicy cleaningPolicy) { + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr) + .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withWriteStatusClass(MetadataMergeWriteStatus.class) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder().withFailedWritesCleaningPolicy(cleaningPolicy).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).orcMaxFileSize(1024 * 1024).build()) + .forTable("test-trip-table") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()) + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withEnableBackupForRemoteFileSystemView(false) // Fail test if problem connecting to timeline-server + .withRemoteServerPort(timelineServicePort) + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); + } + + private void validateMetadata(SparkRDDWriteClient testClient) throws IOException { + HoodieWriteConfig config = testClient.getConfig(); + + SparkRDDWriteClient client; + if (config.isEmbeddedTimelineServerEnabled()) { + testClient.close(); + client = new SparkRDDWriteClient(testClient.getEngineContext(), testClient.getConfig()); + } else { + client = testClient; + } + + HoodieTableMetadata tableMetadata = metadata(client); + assertNotNull(tableMetadata, "MetadataReader should have been initialized"); + if (!config.isMetadataTableEnabled()) { + return; + } + + HoodieTimer timer = new HoodieTimer().startTimer(); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + // 
Partitions should match + FileSystemBackedTableMetadata fsBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext, + new SerializableConfiguration(hadoopConf), config.getBasePath(), config.shouldAssumeDatePartitioning()); + List fsPartitions = fsBackedTableMetadata.getAllPartitionPaths(); + List metadataPartitions = tableMetadata.getAllPartitionPaths(); + + Collections.sort(fsPartitions); + Collections.sort(metadataPartitions); + + assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match"); + assertTrue(fsPartitions.equals(metadataPartitions), "Partitions should match"); + + // Files within each partition should match + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieSparkTable.create(config, engineContext); + TableFileSystemView tableView = table.getHoodieView(); + List fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList()); + Map partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths); + assertEquals(fsPartitions.size(), partitionToFilesMap.size()); + + fsPartitions.forEach(partition -> { + try { + Path partitionPath; + if (partition.equals("")) { + // Should be the non-partitioned case + partitionPath = new Path(basePath); + } else { + partitionPath = new Path(basePath, partition); + } + FileStatus[] fsStatuses = FSUtils.getAllDataFilesInPartition(fs, partitionPath); + FileStatus[] metaStatuses = tableMetadata.getAllFilesInPartition(partitionPath); + List fsFileNames = Arrays.stream(fsStatuses) + .map(s -> s.getPath().getName()).collect(Collectors.toList()); + List metadataFilenames = Arrays.stream(metaStatuses) + .map(s -> s.getPath().getName()).collect(Collectors.toList()); + Collections.sort(fsFileNames); + Collections.sort(metadataFilenames); + + assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length); + + // File sizes should be valid + Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0)); + + if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) { + LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray())); + LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray())); + + for (String fileName : fsFileNames) { + if (!metadataFilenames.contains(fileName)) { + LOG.error(partition + "FsFilename " + fileName + " not found in Meta data"); + } + } + for (String fileName : metadataFilenames) { + if (!fsFileNames.contains(fileName)) { + LOG.error(partition + "Metadata file " + fileName + " not found in original FS"); + } + } + } + + // Block sizes should be valid + Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0)); + List fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList()); + Collections.sort(fsBlockSizes); + List metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList()); + Collections.sort(metadataBlockSizes); + assertEquals(fsBlockSizes, metadataBlockSizes); + + assertEquals(fsFileNames.size(), metadataFilenames.size(), "Files within partition " + partition + " should match"); + assertTrue(fsFileNames.equals(metadataFilenames), "Files within partition " + partition + " should match"); + + // FileSystemView should expose the same data + List fileGroups = tableView.getAllFileGroups(partition).collect(Collectors.toList()); + 
fileGroups.addAll(tableView.getAllReplacedFileGroups(partition).collect(Collectors.toList())); + + fileGroups.forEach(g -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(g)); + fileGroups.forEach(g -> g.getAllBaseFiles().forEach(b -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(b))); + fileGroups.forEach(g -> g.getAllFileSlices().forEach(s -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(s))); + + long numFiles = fileGroups.stream() + .mapToLong(g -> g.getAllBaseFiles().count() + g.getAllFileSlices().mapToLong(s -> s.getLogFiles().count()).sum()) + .sum(); + assertEquals(metadataFilenames.size(), numFiles); + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false, "Exception should not be raised: " + e); + } + }); + + HoodieBackedTableMetadataWriter metadataWriter = metadataWriter(client); + assertNotNull(metadataWriter, "MetadataWriter should have been initialized"); + + // Validate write config for metadata table + HoodieWriteConfig metadataWriteConfig = metadataWriter.getWriteConfig(); + assertFalse(metadataWriteConfig.isMetadataTableEnabled(), "No metadata table for metadata table"); + + // Metadata table should be in sync with the dataset + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + + // Metadata table is MOR + assertEquals(metadataMetaClient.getTableType(), HoodieTableType.MERGE_ON_READ, "Metadata Table should be MOR"); + + // Metadata table is HFile format + assertEquals(metadataMetaClient.getTableConfig().getBaseFileFormat(), HoodieFileFormat.HFILE, + "Metadata Table base file format should be HFile"); + + // Metadata table has a fixed number of partitions + // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory + // in the .hoodie folder. 
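+    // Instead, list partitions with FSUtils.getAllPartitionPaths on the metadata table base path and compare the
+    // count against the partition types the metadata writer has enabled.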
+ List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, HoodieTableMetadata.getMetadataTableBasePath(basePath), + false, false); + assertEquals(metadataWriter.getEnabledPartitionTypes().size(), metadataTablePartitions.size()); + + final Map metadataEnabledPartitionTypes = new HashMap<>(); + metadataWriter.getEnabledPartitionTypes().forEach(e -> metadataEnabledPartitionTypes.put(e.getPartitionPath(), e)); + + // Metadata table should automatically compact and clean + // versions are +1 as autoclean / compaction happens end of commits + int numFileVersions = metadataWriteConfig.getCleanerFileVersionsRetained() + 1; + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline()); + metadataTablePartitions.forEach(partition -> { + List latestSlices = fsView.getLatestFileSlices(partition).collect(Collectors.toList()); + assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).count() + <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest base file per file group"); + assertTrue(latestSlices.size() + <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest file slice per file group"); + assertTrue(latestSlices.size() + <= (numFileVersions * metadataEnabledPartitionTypes.get(partition).getFileGroupCount()), "Should limit file slice to " + + numFileVersions + " per file group, but was " + latestSlices.size()); + List logFiles = latestSlices.get(0).getLogFiles().collect(Collectors.toList()); + try { + if (FILES.getPartitionPath().equals(partition)) { + verifyMetadataRawRecords(table, logFiles, false); + } + if (COLUMN_STATS.getPartitionPath().equals(partition)) { + verifyMetadataColumnStatsRecords(logFiles); + } + } catch (IOException e) { + LOG.error("Metadata record validation failed", e); + fail("Metadata record validation failed"); + } + }); + + LOG.info("Validation time=" + timer.endTimer()); + } + + private void verifyMetadataColumnStatsRecords(List logFiles) throws IOException { + for (HoodieLogFile logFile : logFiles) { + FileStatus[] fsStatus = fs.listStatus(logFile.getPath()); + MessageType writerSchemaMsg = TableSchemaResolver.readSchemaFromLogFile(fs, logFile.getPath()); + if (writerSchemaMsg == null) { + // not a data block + continue; + } + + Schema writerSchema = new AvroSchemaConverter().convert(writerSchemaMsg); + try (HoodieLogFormat.Reader logFileReader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema)) { + while (logFileReader.hasNext()) { + HoodieLogBlock logBlock = logFileReader.next(); + if (logBlock instanceof HoodieDataBlock) { + try (ClosableIterator recordItr = ((HoodieDataBlock) logBlock).getRecordIterator()) { + recordItr.forEachRemaining(indexRecord -> { + final GenericRecord record = (GenericRecord) indexRecord; + final GenericRecord colStatsRecord = (GenericRecord) record.get(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS); + assertNotNull(colStatsRecord); + assertNotNull(colStatsRecord.get(HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME)); + assertNotNull(colStatsRecord.get(HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT)); + /** + * TODO: some types of field may have null min/max as these statistics are only supported for primitive types + * assertNotNull(colStatsRecord.get(HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE)); + * assertNotNull(colStatsRecord.get(HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE)); + */ + }); + } + } + } + } + 
} + } + + /** + * Returns the list of all files in the dataset by iterating over the metadata table. + * + * @throws IOException + * @throws IllegalArgumentException + */ + private List getAllFiles(HoodieTableMetadata metadata) throws Exception { + List allfiles = new LinkedList<>(); + for (String partition : metadata.getAllPartitionPaths()) { + for (FileStatus status : metadata.getAllFilesInPartition(new Path(basePath, partition))) { + allfiles.add(status.getPath()); + } + } + + return allfiles; + } + + private HoodieBackedTableMetadataWriter metadataWriter(SparkRDDWriteClient client) { + return (HoodieBackedTableMetadataWriter) SparkHoodieBackedTableMetadataWriter + .create(hadoopConf, client.getConfig(), new HoodieSparkEngineContext(jsc)); + } + + private HoodieTableMetadata metadata(SparkRDDWriteClient client) { + HoodieWriteConfig clientConfig = client.getConfig(); + return HoodieTableMetadata.create(client.getEngineContext(), clientConfig.getMetadataConfig(), clientConfig.getBasePath(), + clientConfig.getSpillableMapBasePath()); + } + + private void changeTableVersion(HoodieTableVersion version) throws IOException { + metaClient.getTableConfig().setTableVersion(version); + Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); + try (FSDataOutputStream os = metaClient.getFs().create(propertyFile)) { + metaClient.getTableConfig().getProps().store(os, ""); + } + } + + @Override + protected HoodieTableType getTableType() { + return tableType; + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java new file mode 100644 index 0000000000000..719f914816740 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.hudi.metadata.HoodieBackedTableMetadata; +import org.apache.hudi.metadata.HoodieMetadataMergedLogRecordReader; +import org.apache.hudi.metadata.HoodieMetadataPayload; +import org.apache.hudi.metadata.HoodieTableMetadataKeyGenerator; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.MessageType; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; + +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +public class TestHoodieBackedTableMetadata extends TestHoodieMetadataBase { + + private static final Logger LOG = LogManager.getLogger(TestHoodieBackedTableMetadata.class); + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public 
void testTableOperations(boolean reuseReaders) throws Exception { + HoodieTableType tableType = HoodieTableType.COPY_ON_WRITE; + init(tableType); + doWriteInsertAndUpsert(testTable); + + // trigger an upsert + doWriteOperation(testTable, "0000003"); + verifyBaseMetadataTable(reuseReaders); + } + + /** + * Create a COW table and call the getAllFilesInPartition API in parallel, which reads data files from the MDT. + * This UT guards that concurrent readers of the MDT#getAllFilesInPartition API are safe. + * @param reuse whether to reuse the metadata table readers + * @throws Exception + */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMultiReaderForHoodieBackedTableMetadata(boolean reuse) throws Exception { + final int taskNumber = 3; + HoodieTableType tableType = HoodieTableType.COPY_ON_WRITE; + init(tableType); + testTable.doWriteOperation("000001", INSERT, emptyList(), asList("p1"), 1); + HoodieBackedTableMetadata tableMetadata = new HoodieBackedTableMetadata(context, writeConfig.getMetadataConfig(), writeConfig.getBasePath(), writeConfig.getSpillableMapBasePath(), reuse); + assertTrue(tableMetadata.enabled()); + List metadataPartitions = tableMetadata.getAllPartitionPaths(); + String partition = metadataPartitions.get(0); + String finalPartition = basePath + "/" + partition; + ExecutorService executors = Executors.newFixedThreadPool(taskNumber); + AtomicBoolean flag = new AtomicBoolean(false); + CountDownLatch downLatch = new CountDownLatch(taskNumber); + AtomicInteger filesNumber = new AtomicInteger(0); + + // call the getAllFilesInPartition API on the metadata table in parallel + for (int i = 0; i < taskNumber; i++) { + executors.submit(new Runnable() { + @Override + public void run() { + try { + downLatch.countDown(); + downLatch.await(); + FileStatus[] files = tableMetadata.getAllFilesInPartition(new Path(finalPartition)); + if (files.length != 1) { + LOG.warn("Mismatched data file numbers."); + throw new RuntimeException("Mismatched data file numbers."); + } + filesNumber.addAndGet(files.length); + } catch (Exception e) { + LOG.warn("Caught exception while reading data files from MDT.", e); + flag.compareAndSet(false, true); + } + } + }); + } + executors.shutdown(); + executors.awaitTermination(5, TimeUnit.MINUTES); + assertFalse(flag.get()); + assertEquals(filesNumber.get(), taskNumber); + } + + private void doWriteInsertAndUpsert(HoodieTestTable testTable) throws Exception { + doWriteInsertAndUpsert(testTable, "0000001", "0000002", false); + } + + private void verifyBaseMetadataTable(boolean reuseMetadataReaders) throws IOException { + HoodieBackedTableMetadata tableMetadata = new HoodieBackedTableMetadata( + context, writeConfig.getMetadataConfig(), writeConfig.getBasePath(), + writeConfig.getSpillableMapBasePath(), reuseMetadataReaders); + assertTrue(tableMetadata.enabled()); + List fsPartitionPaths = testTable.getAllPartitionPaths(); + List fsPartitions = new ArrayList<>(); + fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString())); + List metadataPartitions = tableMetadata.getAllPartitionPaths(); + + Collections.sort(fsPartitions); + Collections.sort(metadataPartitions); + + assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match"); + assertEquals(fsPartitions, metadataPartitions, "Partitions should match"); + + // Files within each partition should match + HoodieTable table = HoodieSparkTable.create(writeConfig, context); + TableFileSystemView tableView = table.getHoodieView(); + List fullPartitionPaths = fsPartitions.stream().map(partition ->
basePath + "/" + partition).collect(Collectors.toList()); + Map partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths); + assertEquals(fsPartitions.size(), partitionToFilesMap.size()); + + fsPartitions.forEach(partition -> { + try { + validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition); + } catch (IOException e) { + fail("Exception should not be raised: " + e); + } + }); + } + + /** + * Verify if the Metadata table is constructed with table properties including + * the right key generator class name. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataTableKeyGenerator(final HoodieTableType tableType) throws Exception { + init(tableType); + + HoodieBackedTableMetadata tableMetadata = new HoodieBackedTableMetadata(context, + writeConfig.getMetadataConfig(), writeConfig.getBasePath(), writeConfig.getSpillableMapBasePath(), false); + + assertEquals(HoodieTableMetadataKeyGenerator.class.getCanonicalName(), + tableMetadata.getMetadataMetaClient().getTableConfig().getKeyGeneratorClassName()); + } + + /** + * [HUDI-2852] Table metadata returns empty for non-exist partition. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testNotExistPartition(final HoodieTableType tableType) throws Exception { + init(tableType); + HoodieBackedTableMetadata tableMetadata = new HoodieBackedTableMetadata(context, + writeConfig.getMetadataConfig(), writeConfig.getBasePath(), writeConfig.getSpillableMapBasePath(), false); + FileStatus[] allFilesInPartition = + tableMetadata.getAllFilesInPartition(new Path(writeConfig.getBasePath() + "dummy")); + assertEquals(allFilesInPartition.length, 0); + } + + /** + * 1. Verify metadata table records key deduplication feature. When record key + * deduplication is enabled, verify the metadata record payload on disk has empty key. + * Otherwise, verify the valid key. + * 2. Verify populate meta fields work irrespective of record key deduplication config. + * 3. Verify table services like compaction benefit from record key deduplication feature. + */ + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataRecordKeyExcludeFromPayload(final HoodieTableType tableType) throws Exception { + initPath(); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withPopulateMetaFields(false) + .withMaxNumDeltaCommitsBeforeCompaction(3) + .build()) + .build(); + init(tableType, writeConfig); + + // 2nd commit + doWriteOperation(testTable, "0000001", INSERT); + + final HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder() + .setConf(hadoopConf) + .setBasePath(metadataTableBasePath) + .build(); + HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig); + metadataMetaClient.reloadActiveTimeline(); + final HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient); + + // Compaction has not yet kicked in. Verify all the log files + // for the metadata records persisted on disk as per the config. + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000001"); + }, "Metadata table should have valid log files!"); + + // Verify no base file created yet. 
+ assertThrows(IllegalStateException.class, () -> { + verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table); + }, "Metadata table should not have a base file yet!"); + + // 2 more commits + doWriteOperation(testTable, "0000002", UPSERT); + doWriteOperation(testTable, "0000004", UPSERT); + + // Compaction should be triggered by now. Let's verify the log files + // if any for the metadata records persisted on disk as per the config. + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000002"); + }, "Metadata table should have valid log files!"); + + // Verify the base file created by the just completed compaction. + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table); + }, "Metadata table should have a valid base file!"); + + // 2 more commits to trigger one more compaction, along with a clean + doWriteOperation(testTable, "0000005", UPSERT); + doClean(testTable, "0000006", Arrays.asList("0000004")); + doWriteOperation(testTable, "0000007", UPSERT); + + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "7"); + }, "Metadata table should have valid log files!"); + + assertDoesNotThrow(() -> { + verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table); + }, "Metadata table should have a valid base file!"); + + validateMetadata(testTable); + } + + /** + * Verify the metadata table log files for the record field correctness. On disk format + * should be based on meta fields and key deduplication config. And the in-memory merged + * records should all be materialized fully irrespective of the config. + * + * @param table - Hoodie metadata test table + * @param metadataMetaClient - Metadata meta client + * @param latestCommitTimestamp - Latest commit timestamp + * @throws IOException + */ + private void verifyMetadataRecordKeyExcludeFromPayloadLogFiles(HoodieTable table, HoodieTableMetaClient metadataMetaClient, + String latestCommitTimestamp) throws IOException { + table.getHoodieView().sync(); + + // Compaction should not be triggered yet. Let's verify no base file + // and few log files available. + List fileSlices = table.getSliceView() + .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList()); + if (fileSlices.isEmpty()) { + throw new IllegalStateException("LogFile slices are not available!"); + } + + // Verify the log files honor the key deduplication and virtual keys config + List logFiles = fileSlices.get(0).getLogFiles().map(logFile -> { + return logFile; + }).collect(Collectors.toList()); + + List logFilePaths = logFiles.stream().map(logFile -> { + return logFile.getPath().toString(); + }).collect(Collectors.toList()); + + // Verify the on-disk raw records before they get materialized + verifyMetadataRawRecords(table, logFiles); + + // Verify the in-memory materialized and merged records + verifyMetadataMergedRecords(metadataMetaClient, logFilePaths, latestCommitTimestamp); + } + + /** + * Verify the metadata table on-disk raw records. When populate meta fields is enabled, + * these records should have additional meta fields in the payload. When key deduplication + * is enabled, these records on the disk should have key in the payload as empty string. 
+ * + * @param table + * @param logFiles - Metadata table log files to be verified + * @throws IOException + */ + private void verifyMetadataRawRecords(HoodieTable table, List logFiles) throws IOException { + for (HoodieLogFile logFile : logFiles) { + FileStatus[] fsStatus = fs.listStatus(logFile.getPath()); + MessageType writerSchemaMsg = TableSchemaResolver.readSchemaFromLogFile(fs, logFile.getPath()); + if (writerSchemaMsg == null) { + // not a data block + continue; + } + + Schema writerSchema = new AvroSchemaConverter().convert(writerSchemaMsg); + try (HoodieLogFormat.Reader logFileReader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema)) { + while (logFileReader.hasNext()) { + HoodieLogBlock logBlock = logFileReader.next(); + if (logBlock instanceof HoodieDataBlock) { + try (ClosableIterator recordItr = ((HoodieDataBlock) logBlock).getRecordIterator()) { + recordItr.forEachRemaining(indexRecord -> { + final GenericRecord record = (GenericRecord) indexRecord; + assertNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)); + final String key = String.valueOf(record.get(HoodieMetadataPayload.KEY_FIELD_NAME)); + assertFalse(key.isEmpty()); + }); + } + } + } + } + } + } + + /** + * Verify the metadata table in-memory merged records. Irrespective of key deduplication + * config, the in-memory merged records should always have the key field in the record + * payload fully materialized. + * + * @param metadataMetaClient - Metadata table meta client + * @param logFilePaths - Metadata table log file paths + * @param latestCommitTimestamp - Latest commit timestamp + */ + private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClient, List logFilePaths, String latestCommitTimestamp) { + Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema()); + HoodieMetadataMergedLogRecordReader logRecordReader = HoodieMetadataMergedLogRecordReader.newBuilder() + .withFileSystem(metadataMetaClient.getFs()) + .withBasePath(metadataMetaClient.getBasePath()) + .withLogFilePaths(logFilePaths) + .withLatestInstantTime(latestCommitTimestamp) + .withPartition(MetadataPartitionType.FILES.getPartitionPath()) + .withReaderSchema(schema) + .withMaxMemorySizeInBytes(100000L) + .withBufferSize(4096) + .withSpillableMapBasePath(tempDir.toString()) + .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK) + .build(); + + assertDoesNotThrow(() -> { + logRecordReader.scan(); + }, "Metadata log records materialization failed"); + + for (Map.Entry> entry : logRecordReader.getRecords().entrySet()) { + assertFalse(entry.getKey().isEmpty()); + assertFalse(entry.getValue().getRecordKey().isEmpty()); + assertEquals(entry.getKey(), entry.getValue().getRecordKey()); + } + } + + /** + * Verify metadata table base files for the records persisted based on the config. When + * the key deduplication is enabled, the records persisted on the disk in the base file + * should have key field in the payload as empty string. 
+ * + * @param table - Metadata table + */ + private void verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(HoodieTable table) throws IOException { + table.getHoodieView().sync(); + List fileSlices = table.getSliceView() + .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList()); + if (!fileSlices.get(0).getBaseFile().isPresent()) { + throw new IllegalStateException("Base file not available!"); + } + final HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get(); + + HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), + new Path(baseFile.getPath()), + new CacheConfig(context.getHadoopConf().get())); + List records = HoodieHFileReader.readAllRecords(hoodieHFileReader); + records.forEach(entry -> { + assertNull(((GenericRecord) entry).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + final String keyInPayload = (String) ((GenericRecord) entry) + .get(HoodieMetadataPayload.KEY_FIELD_NAME); + assertFalse(keyInPayload.isEmpty()); + }); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java new file mode 100644 index 0000000000000..081b717146a6e --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java @@ -0,0 +1,2816 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.client.HoodieWriteResult; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.SparkTaskContextSupplier; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.clustering.plan.strategy.SparkSingleFileSortPlanStrategy; +import org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy; +import org.apache.hudi.client.validator.SparkPreCommitValidator; +import org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator; +import org.apache.hudi.client.validator.SqlQuerySingleResultPreCommitValidator; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; +import org.apache.hudi.common.testutils.ClusteringTestUtils; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.MarkerUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import 
org.apache.hudi.config.HoodiePreCommitValidatorConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieCommitException; +import org.apache.hudi.exception.HoodieCorruptedDataException; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndex.IndexType; +import org.apache.hudi.io.HoodieMergeHandle; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.commit.HoodieWriteHelper; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.hudi.testutils.HoodieClientTestUtils; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy.EAGER; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.COMPLETED; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.INFLIGHT; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.CLEAN_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION; +import static 
org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion.VERSION_0; +import static org.apache.hudi.common.testutils.FileCreateUtils.getBaseFileCountsForPaths; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.NULL_SCHEMA; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.common.testutils.Transformations.randomSelectAsHoodieKeys; +import static org.apache.hudi.common.testutils.Transformations.recordsToRecordKeySet; +import static org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE; +import static org.apache.hudi.config.HoodieClusteringConfig.EXECUTION_STRATEGY_CLASS_NAME; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +@SuppressWarnings("unchecked") +@Tag("functional") +public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase { + + private static final Logger LOG = LogManager.getLogger(TestHoodieClientOnCopyOnWriteStorage.class); + private static final Map STRATEGY_PARAMS = new HashMap() { + { + put("sortColumn", "record_key"); + } + }; + + private static Stream smallInsertHandlingParams() { + return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of); + } + + private static Stream populateMetaFieldsParams() { + return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of); + } + + private static Stream populateMetaFieldsAndPreserveMetadataParams() { + return Arrays.stream(new Boolean[][] { + {true, true}, + {false, true}, + {true, false}, + {false, false} + }).map(Arguments::of); + } + + private static Stream rollbackFailedCommitsParams() { + return Stream.of( + Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, true), + Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, false), + Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, true), + Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, false) + ); + } + + private static Stream rollbackAfterConsistencyCheckFailureParams() { + return Stream.of( + Arguments.of(true, true), + Arguments.of(true, false), + Arguments.of(false, true), + Arguments.of(false, false) + ); + } + + private HoodieTestTable testTable; + + private static final String COUNT_SQL_QUERY_FOR_VALIDATION = "select count(*) from "; + + @BeforeEach + public void setUpTestTable() { + testTable = HoodieSparkWriteableTestTable.of(metaClient); + } + + /** + * Test Auto Commit behavior for HoodieWriteClient insert API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testAutoCommitOnInsert(boolean populateMetaFields) throws Exception { + testAutoCommit(SparkRDDWriteClient::insert, false, populateMetaFields); + } + + /** + * Test Auto Commit behavior for HoodieWriteClient insertPrepped API. 
+ */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testAutoCommitOnInsertPrepped(boolean populateMetaFields) throws Exception { + testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true, populateMetaFields); + } + + /** + * Test Auto Commit behavior for HoodieWriteClient upsert API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testAutoCommitOnUpsert(boolean populateMetaFields) throws Exception { + testAutoCommit(SparkRDDWriteClient::upsert, false, populateMetaFields); + } + + /** + * Test Auto Commit behavior for HoodieWriteClient upsert Prepped API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testAutoCommitOnUpsertPrepped(boolean populateMetaFields) throws Exception { + testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true, populateMetaFields); + } + + /** + * Test Auto Commit behavior for HoodieWriteClient bulk-insert API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testAutoCommitOnBulkInsert(boolean populateMetaFields) throws Exception { + testAutoCommit(SparkRDDWriteClient::bulkInsert, false, populateMetaFields); + } + + /** + * Test Auto Commit behavior for HoodieWriteClient bulk-insert prepped API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testAutoCommitOnBulkInsertPrepped(boolean populateMetaFields) throws Exception { + testAutoCommit((writeClient, recordRDD, instantTime) -> writeClient.bulkInsertPreppedRecords(recordRDD, instantTime, + Option.empty()), true, populateMetaFields); + } + + /** + * Test auto-commit by applying write function. + * + * @param writeFn One of HoodieWriteClient Write API + * @throws Exception in case of failure + */ + private void testAutoCommit(Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean isPrepped, boolean populateMetaFields) throws Exception { + // Set autoCommit false + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) { + + String prevCommitTime = "000"; + String newCommitTime = "001"; + int numRecords = 200; + JavaRDD result = insertFirstBatch(cfgBuilder.build(), client, newCommitTime, prevCommitTime, numRecords, writeFn, + isPrepped, false, numRecords); + + assertFalse(testTable.commitExists(newCommitTime), + "If Autocommit is false, then commit should not be made automatically"); + assertTrue(client.commit(newCommitTime, result), "Commit should succeed"); + assertTrue(testTable.commitExists(newCommitTime), + "After explicit commit, commit file should be created"); + } + } + + @Test + public void testPreCommitValidatorsOnInsert() throws Exception { + int numRecords = 200; + HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder() + .withPreCommitValidator(SqlQuerySingleResultPreCommitValidator.class.getName()) + .withPrecommitValidatorSingleResultSqlQueries(COUNT_SQL_QUERY_FOR_VALIDATION + "#" + numRecords) + .build(); + HoodieWriteConfig config = getConfigBuilder().withAutoCommit(true) + .withPreCommitValidatorConfig(validatorConfig).build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn = (writeClient, recordRDD, instantTime) -> + writeClient.bulkInsert(recordRDD, instantTime, Option.empty()); + String newCommitTime = 
HoodieActiveTimeline.createNewInstantTime(); + JavaRDD result = insertFirstBatch(config, client, newCommitTime, + "000", numRecords, writeFn, false, false, numRecords); + assertTrue(testTable.commitExists(newCommitTime)); + } + } + + @Test + public void testPreCommitValidationFailureOnInsert() throws Exception { + int numRecords = 200; + HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder() + .withPreCommitValidator(SqlQuerySingleResultPreCommitValidator.class.getName()) + //set wrong value for expected number of rows + .withPrecommitValidatorSingleResultSqlQueries(COUNT_SQL_QUERY_FOR_VALIDATION + "#" + 500) + .build(); + HoodieWriteConfig config = getConfigBuilder().withPreCommitValidatorConfig(validatorConfig).build(); + String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn = (writeClient, recordRDD, instantTime) -> + writeClient.bulkInsert(recordRDD, instantTime, Option.empty()); + JavaRDD result = insertFirstBatch(config, client, newCommitTime, + "000", numRecords, writeFn, false, false, numRecords); + fail("Expected validation to fail because we only insert 200 rows. Validation is configured to expect 500 rows"); + } catch (HoodieInsertException e) { + if (e.getCause() instanceof HoodieValidationException) { + // expected because wrong value passed + } else { + throw e; + } + } + + assertFalse(testTable.commitExists(newCommitTime)); + } + + @Test + public void testPreCommitValidationWithMultipleInflights() throws Exception { + int numRecords = 200; + HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder() + .withPreCommitValidator(SqlQuerySingleResultPreCommitValidator.class.getName()) + //set wrong value for expected number of rows + .withPrecommitValidatorSingleResultSqlQueries(COUNT_SQL_QUERY_FOR_VALIDATION + "#" + 500) + .build(); + HoodieWriteConfig config = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER).build()) + .withPreCommitValidatorConfig(validatorConfig) + .build(); + + String instant1 = HoodieActiveTimeline.createNewInstantTime(); + try { + insertWithConfig(config, numRecords, instant1); + fail("Expected validation to fail because we only insert 200 rows. Validation is configured to expect 500 rows"); + } catch (HoodieInsertException e) { + if (e.getCause() instanceof HoodieValidationException) { + // expected because wrong value passed + } else { + throw e; + } + } + + assertFalse(testTable.commitExists(instant1)); + assertTrue(testTable.inflightCommitExists(instant1)); + + numRecords = 300; + validatorConfig = HoodiePreCommitValidatorConfig.newBuilder() + .withPreCommitValidator(SqlQuerySingleResultPreCommitValidator.class.getName()) + //set wrong value for expected number of rows + .withPrecommitValidatorSingleResultSqlQueries(COUNT_SQL_QUERY_FOR_VALIDATION + "#" + numRecords) + .build(); + config = getConfigBuilder() + .withCleanConfig(HoodieCleanConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER).build()) + .withPreCommitValidatorConfig(validatorConfig) + .build(); + String instant2 = HoodieActiveTimeline.createNewInstantTime(); + // expect pre-commit validators to succeed. 
Note that validator is expected to exclude inflight instant1 + insertWithConfig(config, numRecords, instant2); + assertTrue(testTable.inflightCommitExists(instant1)); + assertTrue(testTable.commitExists(instant2)); + } + + private void insertWithConfig(HoodieWriteConfig config, int numRecords, String instant) throws Exception { + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn = (writeClient, recordRDD, instantTime) -> + writeClient.bulkInsert(recordRDD, instantTime, Option.empty()); + JavaRDD result = insertFirstBatch(config, client, instant, + "000", numRecords, writeFn, false, false, numRecords); + } + } + + /** + * Test De-duplication behavior for HoodieWriteClient insert API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testDeduplicationOnInsert(boolean populateMetaFields) throws Exception { + testDeduplication(SparkRDDWriteClient::insert, populateMetaFields); + } + + /** + * Test De-duplication behavior for HoodieWriteClient bulk-insert API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testDeduplicationOnBulkInsert(boolean populateMetaFields) throws Exception { + testDeduplication(SparkRDDWriteClient::bulkInsert, populateMetaFields); + } + + /** + * Test De-duplication behavior for HoodieWriteClient upsert API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testDeduplicationOnUpsert(boolean populateMetaFields) throws Exception { + testDeduplication(SparkRDDWriteClient::upsert, populateMetaFields); + } + + /** + * Test Deduplication Logic for write function. + * + * @param writeFn One of HoddieWriteClient non-prepped write APIs + * @throws Exception in case of failure + */ + private void testDeduplication( + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean populateMetaFields) throws Exception { + String newCommitTime = "001"; + + String recordKey = UUID.randomUUID().toString(); + HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01"); + HoodieRecord recordOne = + new HoodieAvroRecord(keyOne, dataGen.generateRandomValue(keyOne, newCommitTime)); + + HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01"); + HoodieRecord recordTwo = + new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); + + // Same key and partition as keyTwo + HoodieRecord recordThree = + new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); + + HoodieData> records = HoodieJavaRDD.of( + jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1)); + HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY) + .combineInput(true, true); + addConfigsForPopulateMetaFields(configBuilder, populateMetaFields); + + // Global dedup should be done based on recordKey only + HoodieIndex index = mock(HoodieIndex.class); + when(index.isGlobal()).thenReturn(true); + int dedupParallelism = records.getNumPartitions() + 100; + HoodieData> dedupedRecsRdd = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, dedupParallelism); + List> dedupedRecs = dedupedRecsRdd.collectAsList(); + assertEquals(records.getNumPartitions(), dedupedRecsRdd.getNumPartitions()); + assertEquals(1, dedupedRecs.size()); + assertEquals(dedupedRecs.get(0).getPartitionPath(), recordThree.getPartitionPath()); + assertNodupesWithinPartition(dedupedRecs); + + // non-Global dedup should be done based on both recordKey and partitionPath + 
index = mock(HoodieIndex.class); + when(index.isGlobal()).thenReturn(false); + dedupedRecsRdd = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, dedupParallelism); + dedupedRecs = dedupedRecsRdd.collectAsList(); + assertEquals(records.getNumPartitions(), dedupedRecsRdd.getNumPartitions()); + assertEquals(2, dedupedRecs.size()); + assertNodupesWithinPartition(dedupedRecs); + + // Perform write-action and check + JavaRDD recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1); + configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY) + .combineInput(true, true); + addConfigsForPopulateMetaFields(configBuilder, populateMetaFields); + + try (SparkRDDWriteClient client = getHoodieWriteClient(configBuilder.build());) { + client.startCommitWithTime(newCommitTime); + List statuses = writeFn.apply(client, recordList, newCommitTime).collect(); + assertNoWriteErrors(statuses); + assertEquals(2, statuses.size()); + assertNodupesInPartition(statuses.stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) + .collect(Collectors.toList())); + } + } + + /** + * Assert that there is no duplicate key at the partition level. + * + * @param records List of Hoodie records + */ + void assertNodupesInPartition(List records) { + Map> partitionToKeys = new HashMap<>(); + for (HoodieRecord r : records) { + String key = r.getRecordKey(); + String partitionPath = r.getPartitionPath(); + if (!partitionToKeys.containsKey(partitionPath)) { + partitionToKeys.put(partitionPath, new HashSet<>()); + } + assertFalse(partitionToKeys.get(partitionPath).contains(key), "key " + key + " is duplicate within partition " + partitionPath); + partitionToKeys.get(partitionPath).add(key); + } + } + + /** + * Test Upsert API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testUpserts(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withRollbackUsingMarkers(true); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsert, false); + } + + /** + * Test UpsertPrepped API. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testUpsertsPrepped(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withRollbackUsingMarkers(true); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsertPreppedRecords, true); + } + + /** + * Test one of HoodieWriteClient upsert(Prepped) APIs. 
+ * + * @param config Write Config + * @param writeFn One of Hoodie Write Function API + * @throws Exception in case of error + */ + private void testUpsertsInternal(HoodieWriteConfig config, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean isPrepped) + throws Exception { + // Force using older timeline layout + HoodieWriteConfig hoodieWriteConfig = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY) + .withRollbackUsingMarkers(true) + .withProps(config.getProps()).withTimelineLayoutVersion( + VERSION_0).build(); + + HoodieTableMetaClient.withPropertyBuilder() + .fromMetaClient(metaClient) + .setTimelineLayoutVersion(VERSION_0) + .setPopulateMetaFields(config.populateMetaFields()) + .initTable(metaClient.getHadoopConf(), metaClient.getBasePath()); + + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); + + // Write 1 (only inserts) + String newCommitTime = "001"; + String initCommitTime = "000"; + int numRecords = 200; + insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert, + isPrepped, true, numRecords, config.populateMetaFields()); + + // Write 2 (updates) + String prevCommitTime = newCommitTime; + newCommitTime = "004"; + numRecords = 100; + String commitTimeBetweenPrevAndNew = "002"; + updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, + Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn, isPrepped, true, + numRecords, 200, 2, config.populateMetaFields()); + + // Delete 1 + prevCommitTime = newCommitTime; + newCommitTime = "005"; + numRecords = 50; + + deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, + initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true, + 0, 150, config.populateMetaFields()); + + // Now simulate an upgrade and perform a restore operation + HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion( + TimelineLayoutVersion.CURR_VERSION).build(); + client = getHoodieWriteClient(newConfig); + + client.savepoint("004", "user1","comment1"); + + client.restoreToInstant("004", config.isMetadataTableEnabled()); + + assertFalse(metaClient.reloadActiveTimeline().getRollbackTimeline().lastInstant().isPresent()); + + client.deleteSavepoint("004"); + assertFalse(metaClient.reloadActiveTimeline().getSavePointTimeline().containsInstant("004")); + + // Check the entire dataset has all records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals(200, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), + "Must contain " + 200 + " records"); + + // Perform Delete again on upgraded dataset. 
+ prevCommitTime = newCommitTime; + newCommitTime = "006"; + numRecords = 50; + + deleteBatch(newConfig, client, newCommitTime, prevCommitTime, + initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true, + 0, 150); + + HoodieActiveTimeline activeTimeline = new HoodieActiveTimeline(metaClient, false); + List instants = activeTimeline.getCommitTimeline().getInstants().collect(Collectors.toList()); + assertEquals(5, instants.size()); + assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "001"), + instants.get(0)); + assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "004"), + instants.get(1)); + // New Format should have all states of instants + assertEquals(new HoodieInstant(REQUESTED, COMMIT_ACTION, "006"), + instants.get(2)); + assertEquals(new HoodieInstant(INFLIGHT, COMMIT_ACTION, "006"), + instants.get(3)); + assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "006"), + instants.get(4)); + + final HoodieWriteConfig cfg = hoodieWriteConfig; + final String instantTime = "007"; + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build(); + String basePathStr = basePath; + HoodieTable table = getHoodieTable(metaClient, cfg); + String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); + jsc.parallelize(Arrays.asList(1)).map(e -> { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails( + metaClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get()).get(), + HoodieCommitMetadata.class); + String filePath = commitMetadata.getPartitionToWriteStats().values().stream() + .flatMap(w -> w.stream()).filter(s -> s.getPath().endsWith(extension)).findAny() + .map(ee -> ee.getPath()).orElse(null); + String partitionPath = commitMetadata.getPartitionToWriteStats().values().stream() + .flatMap(w -> w.stream()).filter(s -> s.getPath().endsWith(extension)).findAny() + .map(ee -> ee.getPartitionPath()).orElse(null); + Path baseFilePath = new Path(basePathStr, filePath); + HoodieBaseFile baseFile = new HoodieBaseFile(baseFilePath.toString()); + + try { + HoodieMergeHandle handle = new HoodieMergeHandle(cfg, instantTime, table, new HashMap<>(), + partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(), + config.populateMetaFields() ? Option.empty() : + Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())))); + WriteStatus writeStatus = new WriteStatus(false, 0.0); + writeStatus.setStat(new HoodieWriteStat()); + writeStatus.getStat().setNumWrites(0); + handle.performMergeDataValidationCheck(writeStatus); + } catch (HoodieCorruptedDataException e1) { + fail("Exception not expected because merge validation check is disabled"); + } + + try { + final String newInstantTime = "006"; + cfg.getProps().setProperty("hoodie.merge.data.validation.enabled", "true"); + HoodieWriteConfig cfg2 = HoodieWriteConfig.newBuilder().withProps(cfg.getProps()).build(); + HoodieMergeHandle handle = new HoodieMergeHandle(cfg2, newInstantTime, table, new HashMap<>(), + partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(), + config.populateMetaFields() ? 
Option.empty() : + Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())))); + WriteStatus writeStatus = new WriteStatus(false, 0.0); + writeStatus.setStat(new HoodieWriteStat()); + writeStatus.getStat().setNumWrites(0); + handle.performMergeDataValidationCheck(writeStatus); + fail("The above line should have thrown an exception"); + } catch (HoodieCorruptedDataException e2) { + // expected + } + return true; + }).collect(); + } + + @Test + public void testRestoreWithSavepointBeyondArchival() throws Exception { + HoodieWriteConfig config = getConfigBuilder().withRollbackUsingMarkers(true).build(); + HoodieWriteConfig hoodieWriteConfig = getConfigBuilder(EAGER) + .withRollbackUsingMarkers(true) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().withArchiveBeyondSavepoint(true).build()) + .withProps(config.getProps()).withTimelineLayoutVersion( + VERSION_0).build(); + + HoodieTableMetaClient.withPropertyBuilder() + .fromMetaClient(metaClient) + .setTimelineLayoutVersion(VERSION_0) + .setPopulateMetaFields(config.populateMetaFields()) + .initTable(metaClient.getHadoopConf(), metaClient.getBasePath()); + + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); + + // Write 1 (only inserts) + String newCommitTime = "001"; + String initCommitTime = "000"; + int numRecords = 200; + insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert, + false, true, numRecords, config.populateMetaFields()); + + // Write 2 (updates) + String prevCommitTime = newCommitTime; + newCommitTime = "004"; + numRecords = 100; + String commitTimeBetweenPrevAndNew = "002"; + updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, + Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, SparkRDDWriteClient::upsert, false, true, + numRecords, 200, 2, config.populateMetaFields()); + + // Delete 1 + prevCommitTime = newCommitTime; + newCommitTime = "005"; + numRecords = 50; + + deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, + initCommitTime, numRecords, SparkRDDWriteClient::delete, false, true, + 0, 150, config.populateMetaFields()); + + HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion( + TimelineLayoutVersion.CURR_VERSION) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().withArchiveBeyondSavepoint(true).build()).build(); + client = getHoodieWriteClient(newConfig); + + client.savepoint("004", "user1", "comment1"); + + // verify that restore fails when "hoodie.archive.beyond.savepoint" is enabled. + SparkRDDWriteClient finalClient = client; + assertThrows(IllegalArgumentException.class, () -> { + finalClient.restoreToSavepoint("004"); + }, "Restore should not be supported when " + HoodieArchivalConfig.ARCHIVE_BEYOND_SAVEPOINT.key() + " is enabled"); + } + + /** + * Test Insert API for HoodieConcatHandle. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testInsertsWithHoodieConcatHandle(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + testHoodieConcatHandle(cfgBuilder.build(), false); + } + + /** + * Test InsertPrepped API for HoodieConcatHandle. 
+ */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testInsertsPreppedWithHoodieConcatHandle(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + testHoodieConcatHandle(cfgBuilder.build(), true); + } + + /** + * Test one of HoodieConcatHandle w/ {@link BaseHoodieWriteClient#insert(Object, String)} API. + * + * @param config Write Config + * @throws Exception in case of error + */ + private void testHoodieConcatHandle(HoodieWriteConfig config, boolean isPrepped) + throws Exception { + // Force using older timeline layout + HoodieWriteConfig hoodieWriteConfig = getConfigBuilder() + .withProps(config.getProps()).withMergeAllowDuplicateOnInserts(true).withTimelineLayoutVersion( + VERSION_0).build(); + HoodieTableMetaClient.withPropertyBuilder() + .fromMetaClient(metaClient) + .setTimelineLayoutVersion(VERSION_0) + .initTable(metaClient.getHadoopConf(), metaClient.getBasePath()); + + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); + + // Write 1 (only inserts) + String newCommitTime = "001"; + String initCommitTime = "000"; + int numRecords = 200; + insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert, + isPrepped, true, numRecords, config.populateMetaFields()); + + // Write 2 (updates) + String prevCommitTime = newCommitTime; + newCommitTime = "004"; + numRecords = 100; + String commitTimeBetweenPrevAndNew = "002"; + + final Function2, String, Integer> recordGenFunction = + generateWrapRecordsFn(isPrepped, hoodieWriteConfig, dataGen::generateUniqueUpdates); + + writeBatch(client, newCommitTime, prevCommitTime, Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, + numRecords, recordGenFunction, SparkRDDWriteClient::insert, true, numRecords, 300, + 2, false, config.populateMetaFields()); + } + + /** + * Test Insert API for HoodieConcatHandle when incoming entries contain duplicate keys. + */ + @Test + public void testInsertsWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(); + testHoodieConcatHandleOnDupInserts(cfgBuilder.build(), false); + } + + /** + * Test InsertPrepped API for HoodieConcatHandle when incoming entries contain duplicate keys. 
+ */ + @Test + public void testInsertsPreppedWithHoodieConcatHandleOnDuplicateIncomingKeys() throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(); + testHoodieConcatHandleOnDupInserts(cfgBuilder.build(), true); + } + + private void testHoodieConcatHandleOnDupInserts(HoodieWriteConfig config, boolean isPrepped) throws Exception { + HoodieWriteConfig hoodieWriteConfig = getConfigBuilder() + .withProps(config.getProps()) + .withMergeAllowDuplicateOnInserts(true) + .build(); + + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); + + // Write 1 (only inserts) + String initCommitTime = "000"; + String newCommitTime = "001"; + int firstInsertRecords = 50; + insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, firstInsertRecords, SparkRDDWriteClient::insert, + isPrepped, true, firstInsertRecords, config.populateMetaFields()); + + // Write 2 (updates with duplicates) + String prevCommitTime = newCommitTime; + newCommitTime = "004"; + int secondInsertRecords = 100; // needs to be larger than firstInsertRecords to guarantee duplicate keys + List commitTimesBetweenPrevAndNew = Arrays.asList("002", "003"); + + final Function2, String, Integer> recordGenFunction = + generateWrapRecordsFn(isPrepped, hoodieWriteConfig, dataGen::generateUpdates); + + writeBatch(client, newCommitTime, prevCommitTime, Option.of(commitTimesBetweenPrevAndNew), initCommitTime, + secondInsertRecords, recordGenFunction, SparkRDDWriteClient::insert, true, secondInsertRecords, + firstInsertRecords + secondInsertRecords, 2, false, config.populateMetaFields()); + } + + @Test + public void testBulkInsertWithCustomPartitioner() { + HoodieWriteConfig config = getConfigBuilder().withRollbackUsingMarkers(true).build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + final String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + List inserts1 = dataGen.generateInserts(commitTime1, 100); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 10); + BulkInsertPartitioner> partitioner = new RDDCustomColumnsSortPartitioner(new String[]{"rider"}, HoodieTestDataGenerator.AVRO_SCHEMA, false); + List statuses = client.bulkInsert(insertRecordsRDD1, commitTime1, Option.of(partitioner)).collect(); + assertNoWriteErrors(statuses); + } + } + + /** + * Tests deletion of records. 
+ */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testDeletes(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build()); + /** + * Write 1 (inserts and deletes): write 200 actual insert records; the 100 delete records are ignored since they target keys that are not in storage. + */ + String initCommitTime = "000"; + String newCommitTime = "001"; + + final List recordsInFirstBatch = new ArrayList<>(); + Function2, String, Integer> recordGenFunction = + (String instantTime, Integer numRecordsInThisCommit) -> { + List fewRecordsForInsert = dataGen.generateInserts(instantTime, 200); + List fewRecordsForDelete = dataGen.generateDeletes(instantTime, 100); + + recordsInFirstBatch.addAll(fewRecordsForInsert); + recordsInFirstBatch.addAll(fewRecordsForDelete); + return recordsInFirstBatch; + }; + writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, + // unused as genFn uses hard-coded number of inserts/updates/deletes + -1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false, + populateMetaFields); + + /** + * Write 2 (deletes+writes). + */ + String prevCommitTime = newCommitTime; + newCommitTime = "004"; + final List recordsInSecondBatch = new ArrayList<>(); + + recordGenFunction = (String instantTime, Integer numRecordsInThisCommit) -> { + List fewRecordsForDelete = recordsInFirstBatch.subList(0, 50); + List fewRecordsForUpdate = recordsInFirstBatch.subList(50, 100); + recordsInSecondBatch.addAll(dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete)); + recordsInSecondBatch.addAll(fewRecordsForUpdate); + return recordsInSecondBatch; + }; + writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, 100, recordGenFunction, + SparkRDDWriteClient::upsert, true, 50, 150, 2, false, + populateMetaFields); + } + + /** + * When records being inserted are also deleted in the same write batch, Hudi should delete those records so that + * they are not available in the read path. + * @throws Exception + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testDeletesForInsertsInSameBatch(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build()); + /** + * Write 200 inserts and issue deletes for a subset (50) of the inserts. + */ + String initCommitTime = "000"; + String newCommitTime = "001"; + + final List recordsInFirstBatch = new ArrayList<>(); + Function2, String, Integer> recordGenFunction = + (String instantTime, Integer numRecordsInThisCommit) -> { + List fewRecordsForInsert = dataGen.generateInserts(instantTime, 200); + List fewRecordsForDelete = fewRecordsForInsert.subList(40, 90); + + recordsInFirstBatch.addAll(fewRecordsForInsert); + recordsInFirstBatch.addAll(dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete)); + return recordsInFirstBatch; + }; + + writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, + -1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false, + populateMetaFields); + } + + /** + * Test update of a record to a different partition with Global Index.
+ */ + @ParameterizedTest + @EnumSource(value = IndexType.class, names = {"GLOBAL_BLOOM", "GLOBAL_SIMPLE"}) + public void testUpsertsUpdatePartitionPathGlobalBloom(IndexType indexType) throws Exception { + testUpsertsUpdatePartitionPath(indexType, getConfig(), SparkRDDWriteClient::upsert); + } + + /** + * This test ensures in a global bloom when update partition path is set to true in config, if an incoming record has mismatched partition + * compared to whats in storage, then appropriate actions are taken. i.e. old record is deleted in old partition and new one is inserted + * in the new partition. + * test structure: + * 1. insert 1 batch + * 2. insert 2nd batch with larger no of records so that a new file group is created for partitions + * 3. issue upserts to records from batch 1 with different partition path. This should ensure records from batch 1 are deleted and new + * records are upserted to the new partition + * + * @param indexType index type to be tested for + * @param config instance of {@link HoodieWriteConfig} to use + * @param writeFn write function to be used for testing + */ + private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConfig config, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn) + throws Exception { + // instantiate client + + HoodieWriteConfig hoodieWriteConfig = getConfigBuilder() + .withProps(config.getProps()) + .withCompactionConfig( + HoodieCompactionConfig.newBuilder().compactionSmallFileSize(10000).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType) + .withBloomIndexUpdatePartitionPath(true) + .withGlobalSimpleIndexUpdatePartitionPath(true) + .build()).withTimelineLayoutVersion(VERSION_0).build(); + + HoodieTableMetaClient.withPropertyBuilder() + .fromMetaClient(metaClient) + .setTimelineLayoutVersion(VERSION_0) + .initTable(metaClient.getHadoopConf(), metaClient.getBasePath()); + // Set rollback to LAZY so no inflights are deleted + hoodieWriteConfig.getProps().put(HoodieCleanConfig.FAILED_WRITES_CLEANER_POLICY.key(), + HoodieFailedWritesCleaningPolicy.LAZY.name()); + SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig); + + // Write 1 + String newCommitTime = "001"; + int numRecords = 10; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, numRecords); + Set> expectedPartitionPathRecKeyPairs = new HashSet<>(); + // populate expected partition path and record keys + for (HoodieRecord rec : records) { + expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey())); + } + JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD result = writeFn.apply(client, writeRecords, newCommitTime); + result.collect(); + + // Check the entire dataset has all records + String[] fullPartitionPaths = getFullPartitionPaths(); + assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths); + + // verify one basefile per partition + String[] fullExpectedPartitionPaths = getFullPartitionPaths(expectedPartitionPathRecKeyPairs.stream().map(Pair::getLeft).toArray(String[]::new)); + Map baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullExpectedPartitionPaths); + for (Map.Entry entry : baseFileCounts.entrySet()) { + assertEquals(1, entry.getValue()); + } + assertTrue(baseFileCounts.entrySet().stream().allMatch(entry -> entry.getValue() == 1)); + + // Write 2 + newCommitTime = "002"; + numRecords = 20; // so that a new file id is created + 
client.startCommitWithTime(newCommitTime); + + List recordsSecondBatch = dataGen.generateInserts(newCommitTime, numRecords); + // populate expected partition path and record keys + for (HoodieRecord rec : recordsSecondBatch) { + expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey())); + } + writeRecords = jsc.parallelize(recordsSecondBatch, 1); + result = writeFn.apply(client, writeRecords, newCommitTime); + result.collect(); + + // Check the entire dataset has all records + fullPartitionPaths = getFullPartitionPaths(); + assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths); + + // verify that there are more than 1 basefiles per partition + // we can't guarantee randomness in partitions where records are distributed. So, verify atleast one partition has more than 1 basefile. + baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullPartitionPaths); + assertTrue(baseFileCounts.entrySet().stream().filter(entry -> entry.getValue() > 1).count() >= 1, + "At least one partition should have more than 1 base file after 2nd batch of writes"); + + // Write 3 (upserts to records from batch 1 with diff partition path) + newCommitTime = "003"; + + // update to diff partition paths + List recordsToUpsert = new ArrayList<>(); + for (HoodieRecord rec : records) { + // remove older entry from expected partition path record key pairs + expectedPartitionPathRecKeyPairs + .remove(Pair.of(rec.getPartitionPath(), rec.getRecordKey())); + String partitionPath = rec.getPartitionPath(); + String newPartitionPath = null; + if (partitionPath.equalsIgnoreCase(DEFAULT_FIRST_PARTITION_PATH)) { + newPartitionPath = DEFAULT_SECOND_PARTITION_PATH; + } else if (partitionPath.equalsIgnoreCase(DEFAULT_SECOND_PARTITION_PATH)) { + newPartitionPath = DEFAULT_THIRD_PARTITION_PATH; + } else if (partitionPath.equalsIgnoreCase(DEFAULT_THIRD_PARTITION_PATH)) { + newPartitionPath = DEFAULT_FIRST_PARTITION_PATH; + } else { + throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath()); + } + recordsToUpsert.add( + new HoodieAvroRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), + (HoodieRecordPayload) rec.getData())); + // populate expected partition path and record keys + expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey())); + } + + writeRecords = jsc.parallelize(recordsToUpsert, 1); + result = writeFn.apply(client, writeRecords, newCommitTime); + result.collect(); + + // Check the entire dataset has all records + fullPartitionPaths = getFullPartitionPaths(); + assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths); + } + + private void assertPartitionPathRecordKeys(Set> expectedPartitionPathRecKeyPairs, String[] fullPartitionPaths) { + Dataset rows = getAllRows(fullPartitionPaths); + List> actualPartitionPathRecKeyPairs = getActualPartitionPathAndRecordKeys(rows); + // verify all partitionpath, record key matches + assertActualAndExpectedPartitionPathRecordKeyMatches(expectedPartitionPathRecKeyPairs, actualPartitionPathRecKeyPairs); + } + + private List> getActualPartitionPathAndRecordKeys(Dataset rows) { + List> actualPartitionPathRecKeyPairs = new ArrayList<>(); + for (Row row : rows.collectAsList()) { + actualPartitionPathRecKeyPairs + .add(Pair.of(row.getAs("_hoodie_partition_path"), row.getAs("_row_key"))); + } + return actualPartitionPathRecKeyPairs; + } + + private Dataset getAllRows(String[] fullPartitionPaths) { + return HoodieClientTestUtils + .read(jsc, basePath, 
sqlContext, fs, fullPartitionPaths); + } + + private String[] getFullPartitionPaths() { + return getFullPartitionPaths(dataGen.getPartitionPaths()); + } + + private String[] getFullPartitionPaths(String[] relativePartitionPaths) { + String[] fullPartitionPaths = new String[relativePartitionPaths.length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, relativePartitionPaths[i]); + } + return fullPartitionPaths; + } + + private void assertActualAndExpectedPartitionPathRecordKeyMatches(Set> expectedPartitionPathRecKeyPairs, + List> actualPartitionPathRecKeyPairs) { + // verify all partitionpath, record key matches + assertEquals(expectedPartitionPathRecKeyPairs.size(), actualPartitionPathRecKeyPairs.size()); + for (Pair entry : actualPartitionPathRecKeyPairs) { + assertTrue(expectedPartitionPathRecKeyPairs.contains(entry)); + } + + for (Pair entry : expectedPartitionPathRecKeyPairs) { + assertTrue(actualPartitionPathRecKeyPairs.contains(entry)); + } + } + + private Pair, List> insertBatchRecords(SparkRDDWriteClient client, String commitTime, + Integer recordNum, int expectStatueSize) { + client.startCommitWithTime(commitTime); + List inserts1 = dataGen.generateInserts(commitTime, recordNum); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); + List statuses = client.upsert(insertRecordsRDD1, commitTime).collect(); + assertNoWriteErrors(statuses); + assertEquals(expectStatueSize, statuses.size(), "check expect statue size."); + return Pair.of(statuses, inserts1); + } + + @Test + public void testUpdateRejectForClustering() throws IOException { + final String testPartitionPath = "2016/09/26"; + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + Properties props = new Properties(); + props.setProperty(ASYNC_CLUSTERING_ENABLE.key(), "true"); + HoodieWriteConfig config = getSmallInsertWriteConfig(100, + TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props); + SparkRDDWriteClient client = getHoodieWriteClient(config); + HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient); + + //1. insert to generate 2 file group + String commitTime1 = "001"; + Pair, List> upsertResult = insertBatchRecords(client, commitTime1, 600, 2); + List inserts1 = upsertResult.getValue(); + List fileGroupIds1 = table.getFileSystemView().getAllFileGroups(testPartitionPath) + .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList()); + assertEquals(2, fileGroupIds1.size()); + + // 2. generate clustering plan for fileGroupIds1 file groups + String commitTime2 = "002"; + List> firstInsertFileSlicesList = table.getFileSystemView().getAllFileGroups(testPartitionPath) + .map(fileGroup -> fileGroup.getAllFileSlices().collect(Collectors.toList())).collect(Collectors.toList()); + List[] fileSlices = (List[])firstInsertFileSlicesList.toArray(new List[firstInsertFileSlicesList.size()]); + createRequestedReplaceInstant(this.metaClient, commitTime2, fileSlices); + + // 3. insert one record with no updating reject exception, and not merge the small file, just generate a new file group + String commitTime3 = "003"; + insertBatchRecords(client, commitTime3, 1, 1); + List fileGroupIds2 = table.getFileSystemView().getAllFileGroups(testPartitionPath) + .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList()); + assertEquals(3, fileGroupIds2.size()); + + // 4. 
update one record for the clustering two file groups, throw reject update exception + String commitTime4 = "004"; + client.startCommitWithTime(commitTime4); + List insertsAndUpdates3 = new ArrayList<>(); + insertsAndUpdates3.addAll(dataGen.generateUpdates(commitTime4, inserts1)); + String assertMsg = String.format("Not allowed to update the clustering files in partition: %s " + + "For pending clustering operations, we are not going to support update for now.", testPartitionPath); + assertThrows(HoodieUpsertException.class, () -> { + writeClient.upsert(jsc.parallelize(insertsAndUpdates3, 1), commitTime4).collect(); }, assertMsg); + + // 5. insert one record with no updating reject exception, will merge the small file + String commitTime5 = "005"; + List statuses = insertBatchRecords(client, commitTime5, 1, 1).getKey(); + fileGroupIds2.removeAll(fileGroupIds1); + assertEquals(fileGroupIds2.get(0), statuses.get(0).getFileId()); + List firstInsertFileGroupIds4 = table.getFileSystemView().getAllFileGroups(testPartitionPath) + .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList()); + assertEquals(3, firstInsertFileGroupIds4.size()); + } + + /** + * Test scenario of new file-group getting added during upsert(). + */ + @Test + public void testSmallInsertHandlingForUpserts() throws Exception { + final String testPartitionPath = "2016/09/26"; + final int insertSplitLimit = 100; + // setup the small file handling params + // hold upto 200 records max + HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, + TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150)); + + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + SparkRDDWriteClient client = getHoodieWriteClient(config); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + // Inserts => will write file1 + String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + List inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb + Set keys1 = recordsToRecordKeySet(inserts1); + + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); + List statuses = client.upsert(insertRecordsRDD1, commitTime1).collect(); + + assertNoWriteErrors(statuses); + + assertEquals(1, statuses.size(), "Just 1 file needs to be added."); + String file1 = statuses.get(0).getFileId(); + assertEquals(100, + fileUtils.readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())) + .size(), "file should contain 100 records"); + + // Update + Inserts such that they just expand file1 + String commitTime2 = "002"; + client.startCommitWithTime(commitTime2); + List inserts2 = dataGen.generateInserts(commitTime2, 40); + Set keys2 = recordsToRecordKeySet(inserts2); + List insertsAndUpdates2 = new ArrayList<>(); + insertsAndUpdates2.addAll(inserts2); + insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1)); + + JavaRDD insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1); + statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect(); + assertNoWriteErrors(statuses); + + assertEquals(1, statuses.size(), "Just 1 file needs to be updated."); + assertEquals(file1, statuses.get(0).getFileId(), "Existing file should be expanded"); + assertEquals(commitTime1, statuses.get(0).getStat().getPrevCommit(), "Existing file should be expanded"); + Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); + assertEquals(140, fileUtils.readRowKeys(hadoopConf, newFile).size(), + "file should 
contain 140 records"); + + List records = fileUtils.readAvroRecords(hadoopConf, newFile); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertEquals(commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit2"); + assertTrue(keys2.contains(recordKey) || keys1.contains(recordKey), "key expected to be part of commit2"); + } + + // update + inserts such that file1 is updated and expanded, a new file2 is created. + String commitTime3 = "003"; + client.startCommitWithTime(commitTime3); + List insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200); + Set keys3 = recordsToRecordKeySet(insertsAndUpdates3); + List updates3 = dataGen.generateUpdates(commitTime3, inserts2); + insertsAndUpdates3.addAll(updates3); + + JavaRDD insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1); + statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect(); + assertNoWriteErrors(statuses); + + assertEquals(2, statuses.size(), "2 files needs to be committed."); + HoodieTableMetaClient metadata = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + + HoodieTable table = getHoodieTable(metadata, config); + BaseFileOnlyView fileSystemView = table.getBaseFileOnlyView(); + List files = + fileSystemView.getLatestBaseFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList()); + int numTotalInsertsInCommit3 = 0; + int numTotalUpdatesInCommit3 = 0; + for (HoodieBaseFile file : files) { + if (file.getFileName().contains(file1)) { + assertEquals(commitTime3, file.getCommitTime(), "Existing file should be expanded"); + records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath())); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + if (recordCommitTime.equals(commitTime3)) { + if (keys2.contains(recordKey)) { + keys2.remove(recordKey); + numTotalUpdatesInCommit3++; + } else { + numTotalInsertsInCommit3++; + } + } + } + assertEquals(0, keys2.size(), "All keys added in commit 2 must be updated in commit3 correctly"); + } else { + assertEquals(commitTime3, file.getCommitTime(), "New file must be written for commit 3"); + records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath())); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertEquals(commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), + "only expect commit3"); + assertTrue(keys3.contains(recordKey), "key expected to be part of commit3"); + } + numTotalInsertsInCommit3 += records.size(); + } + } + assertEquals(numTotalUpdatesInCommit3, inserts2.size(), "Total updates in commit3 must add up"); + assertEquals(numTotalInsertsInCommit3, keys3.size(), "Total inserts in commit3 must add up"); + } + + /** + * Test scenario of new file-group getting added during insert(). 
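+ * Parameterized on mergeAllowDuplicateInserts; the first two insert batches are expected to expand the same small file before a larger third batch adds a second file group.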
+ */ + @ParameterizedTest + @MethodSource("smallInsertHandlingParams") + public void testSmallInsertHandlingForInserts(boolean mergeAllowDuplicateInserts) throws Exception { + final String testPartitionPath = "2016/09/26"; + final int insertSplitLimit = 100; + // setup the small file handling params + + HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, false, mergeAllowDuplicateInserts); // hold upto 200 records max + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + SparkRDDWriteClient client = getHoodieWriteClient(config); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + // Inserts => will write file1 + String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + List inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb + Set keys1 = recordsToRecordKeySet(inserts1); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); + List statuses = client.insert(insertRecordsRDD1, commitTime1).collect(); + assertNoWriteErrors(statuses); + assertPartitionMetadata(basePath, new String[] {testPartitionPath}, fs); + assertEquals(1, statuses.size(), "Just 1 file needs to be added."); + String file1 = statuses.get(0).getFileId(); + assertEquals(100, + fileUtils.readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())) + .size(), "file should contain 100 records"); + + // Second, set of Inserts should just expand file1 + String commitTime2 = "002"; + client.startCommitWithTime(commitTime2); + List inserts2 = dataGen.generateInserts(commitTime2, 40); + Set keys2 = recordsToRecordKeySet(inserts2); + JavaRDD insertRecordsRDD2 = jsc.parallelize(inserts2, 1); + statuses = client.insert(insertRecordsRDD2, commitTime2).collect(); + assertNoWriteErrors(statuses); + assertEquals(1, statuses.size(), "Just 1 file needs to be updated."); + assertEquals(file1, statuses.get(0).getFileId(), "Existing file should be expanded"); + assertEquals(commitTime1, statuses.get(0).getStat().getPrevCommit(), "Existing file should be expanded"); + + Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); + assertEquals(140, fileUtils.readRowKeys(hadoopConf, newFile).size(), + "file should contain 140 records"); + List records = fileUtils.readAvroRecords(hadoopConf, newFile); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + assertTrue(commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime), + "Record expected to be part of commit 1 or commit2"); + assertTrue(keys2.contains(recordKey) || keys1.contains(recordKey), + "key expected to be part of commit 1 or commit2"); + } + + // Lots of inserts such that file1 is updated and expanded, a new file2 is created. 
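+ // the 200 inserts below exceed the configured small-file capacity (about 200 records), so a second file group is expected alongside the expanded file1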
+ String commitTime3 = "003"; + client.startCommitWithTime(commitTime3); + List inserts3 = dataGen.generateInserts(commitTime3, 200); + JavaRDD insertRecordsRDD3 = jsc.parallelize(inserts3, 1); + statuses = client.insert(insertRecordsRDD3, commitTime3).collect(); + assertNoWriteErrors(statuses); + assertEquals(2, statuses.size(), "2 files needs to be committed."); + assertEquals(340, + fileUtils.readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())).size() + + fileUtils.readRowKeys(hadoopConf, new Path(basePath, statuses.get(1).getStat().getPath())).size(), + "file should contain 340 records"); + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + HoodieTable table = getHoodieTable(metaClient, config); + List files = table.getBaseFileOnlyView() + .getLatestBaseFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList()); + assertEquals(2, files.size(), "Total of 2 valid data files"); + + int totalInserts = 0; + for (HoodieBaseFile file : files) { + assertEquals(commitTime3, file.getCommitTime(), "All files must be at commit 3"); + totalInserts += fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath())).size(); + } + assertEquals(totalInserts, inserts1.size() + inserts2.size() + inserts3.size(), "Total number of records must add up"); + } + + /** + * Test delete with delete api. + */ + @Test + public void testDeletesWithDeleteApi() throws Exception { + final String testPartitionPath = "2016/09/26"; + final int insertSplitLimit = 100; + // setup the small file handling params + // hold upto 200 records max + HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, + TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150)); + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + + SparkRDDWriteClient client = getHoodieWriteClient(config); + + // Inserts => will write file1 + String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + List inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb + Set keys1 = recordsToRecordKeySet(inserts1); + List keysSoFar = new ArrayList<>(keys1); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); + List statuses = client.upsert(insertRecordsRDD1, commitTime1).collect(); + + assertNoWriteErrors(statuses); + + assertEquals(1, statuses.size(), "Just 1 file needs to be added."); + String file1 = statuses.get(0).getFileId(); + assertEquals(100, + BaseFileUtils.getInstance(metaClient).readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())) + .size(), "file should contain 100 records"); + + // Delete 20 among 100 inserted + testDeletes(client, inserts1, 20, file1, "002", 80, keysSoFar); + + // Insert and update 40 records + Pair, List> updateBatch2 = testUpdates("003", client, 40, 120); + keysSoFar.addAll(updateBatch2.getLeft()); + + // Delete 10 records among 40 updated + testDeletes(client, updateBatch2.getRight(), 10, file1, "004", 110, keysSoFar); + + // do another batch of updates + Pair, List> updateBatch3 = testUpdates("005", client, 40, 150); + keysSoFar.addAll(updateBatch3.getLeft()); + + // delete non existent keys + String commitTime6 = "006"; + client.startCommitWithTime(commitTime6); + + List dummyInserts3 = dataGen.generateInserts(commitTime6, 20); + List hoodieKeysToDelete3 = randomSelectAsHoodieKeys(dummyInserts3, 20); + JavaRDD deleteKeys3 = jsc.parallelize(hoodieKeysToDelete3, 1); + statuses = 
client.delete(deleteKeys3, commitTime6).collect(); + assertNoWriteErrors(statuses); + assertEquals(0, statuses.size(), "Just 0 write status for delete."); + + // Check the entire dataset has all records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals(150, + HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), + "Must contain " + 150 + " records"); + + // delete another batch. The previous delete commit should have persisted the schema; if not, + // this will throw an exception + testDeletes(client, updateBatch3.getRight(), 10, file1, "007", 140, keysSoFar); + } + + @ParameterizedTest + @MethodSource("populateMetaFieldsAndPreserveMetadataParams") + public void testSimpleClustering(boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception { + // setup clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true) + .withPreserveHoodieCommitMetadata(preserveCommitMetadata).build(); + testInsertAndClustering(clusteringConfig, populateMetaFields, true, false, SqlQueryEqualityPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); + } + + @Test + public void testAndValidateClusteringOutputFiles() throws IOException { + String partitionPath = "2015/03/16"; + testInsertTwoBatches(true, partitionPath); + + // Trigger clustering + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withEmbeddedTimelineServerEnabled(false).withAutoCommit(false) + .withClusteringConfig(HoodieClusteringConfig.newBuilder().withInlineClustering(true).withInlineClusteringNumCommits(2).build()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build())) { + int numRecords = 200; + String newCommitTime = HoodieActiveTimeline.createNewInstantTime(); + List<HoodieRecord> records1 = dataGen.generateInserts(newCommitTime, numRecords); + client.startCommitWithTime(newCommitTime); + JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(records1, 2); + JavaRDD<WriteStatus> statuses = client.insert(insertRecordsRDD1, newCommitTime); + client.commit(newCommitTime, statuses); + List<WriteStatus> statusList = statuses.collect(); + assertNoWriteErrors(statusList); + + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieInstant replaceCommitInstant = metaClient.getActiveTimeline().getCompletedReplaceTimeline().firstInstant().get(); + HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(replaceCommitInstant).get(), HoodieReplaceCommitMetadata.class); + + List<String> filesFromReplaceCommit = new ArrayList<>(); + replaceCommitMetadata.getPartitionToWriteStats() + .forEach((k,v) -> v.forEach(entry -> filesFromReplaceCommit.add(entry.getPath()))); + + // find all parquet files created as part of clustering and verify they match what is found in the replace commit metadata.
+ FileStatus[] fileStatuses = fs.listStatus(new Path(basePath + "/" + partitionPath)); + List clusteredFiles = Arrays.stream(fileStatuses).filter(entry -> entry.getPath().getName().contains(replaceCommitInstant.getTimestamp())) + .map(fileStatus -> partitionPath + "/" + fileStatus.getPath().getName()).collect(Collectors.toList()); + assertEquals(clusteredFiles, filesFromReplaceCommit); + } + } + + @Test + public void testRolblackOfRegularCommitWithPendingReplaceCommitInTimeline() throws Exception { + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true) + .withPreserveHoodieCommitMetadata(true).build(); + // trigger clustering, but do not complete + testInsertAndClustering(clusteringConfig, true, false, false, SqlQueryEqualityPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); + + // trigger another partial commit, followed by valid commit. rollback of partial commit should succeed. + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false); + SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build()); + String commitTime1 = HoodieActiveTimeline.createNewInstantTime(); + List records1 = dataGen.generateInserts(commitTime1, 200); + client.startCommitWithTime(commitTime1); + JavaRDD insertRecordsRDD1 = jsc.parallelize(records1, 2); + JavaRDD statuses = client.upsert(insertRecordsRDD1, commitTime1); + List statusList = statuses.collect(); + assertNoWriteErrors(statusList); + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + assertEquals(2, metaClient.getActiveTimeline().getCommitsTimeline().filterInflightsAndRequested().countInstants()); + + // trigger another commit. this should rollback latest partial commit. + records1 = dataGen.generateInserts(commitTime1, 200); + client.startCommitWithTime(commitTime1); + insertRecordsRDD1 = jsc.parallelize(records1, 2); + statuses = client.upsert(insertRecordsRDD1, commitTime1); + statusList = statuses.collect(); + assertNoWriteErrors(statusList); + client.commit(commitTime1, statuses); + metaClient.reloadActiveTimeline(); + // rollback should have succeeded. Essentially, the pending clustering should not hinder the rollback of regular commits. + assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().filterInflightsAndRequested().countInstants()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInlineScheduleClustering(boolean scheduleInlineClustering) throws IOException { + testInsertTwoBatches(true); + + // setup clustering config. 
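+ // inline clustering execution is disabled here; only the scheduling of clustering is toggled by the test parameter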
+ HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(false).withScheduleInlineClustering(scheduleInlineClustering) + .withPreserveHoodieCommitMetadata(true).build(); + + HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(false) + .withClusteringConfig(clusteringConfig) + .withProps(getPropertiesForKeyGen()).build(); + SparkRDDWriteClient client = getHoodieWriteClient(config); + dataGen = new HoodieTestDataGenerator(new String[] {"2015/03/16"}); + String commitTime1 = HoodieActiveTimeline.createNewInstantTime(); + List records1 = dataGen.generateInserts(commitTime1, 200); + client.startCommitWithTime(commitTime1); + JavaRDD insertRecordsRDD1 = jsc.parallelize(records1, 2); + JavaRDD statuses = client.upsert(insertRecordsRDD1, commitTime1); + List statusList = statuses.collect(); + assertNoWriteErrors(statusList); + client.commit(commitTime1, statuses); + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + List> pendingClusteringPlans = + ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList()); + if (scheduleInlineClustering) { + assertEquals(1, pendingClusteringPlans.size()); + } else { + assertEquals(0, pendingClusteringPlans.size()); + } + } + + @ParameterizedTest + @MethodSource("populateMetaFieldsAndPreserveMetadataParams") + public void testClusteringWithSortColumns(boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception { + // setup clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringSortColumns(populateMetaFields ? "_hoodie_record_key" : "_row_key") + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true) + .withPreserveHoodieCommitMetadata(preserveCommitMetadata).build(); + testInsertAndClustering(clusteringConfig, populateMetaFields, true, false, SqlQueryEqualityPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); + } + + @ParameterizedTest + @MethodSource("populateMetaFieldsAndPreserveMetadataParams") + public void testClusteringWithSortOneFilePerGroup(boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception { + // setup clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringSortColumns("begin_lat,begin_lon") + .withClusteringPlanStrategyClass(SparkSingleFileSortPlanStrategy.class.getName()) + .withClusteringExecutionStrategyClass(SparkSingleFileSortExecutionStrategy.class.getName()) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1) + .withPreserveHoodieCommitMetadata(preserveCommitMetadata).build(); + // note that assertSameFileIds is true for this test because of the plan and execution strategy + testInsertAndClustering(clusteringConfig, populateMetaFields, true, true, SqlQueryEqualityPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); + } + + @Test + public void testPendingClusteringRollback() throws Exception { + boolean populateMetaFields = true; + // setup clustering config. 
+ HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true).build(); + + // start clustering, but don't commit + List<HoodieRecord> allRecords = testInsertAndClustering(clusteringConfig, populateMetaFields, false); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans = + ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList()); + assertEquals(1, pendingClusteringPlans.size()); + HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft(); + + // complete another commit after pending clustering + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(EAGER); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig config = cfgBuilder.build(); + SparkRDDWriteClient client = getHoodieWriteClient(config); + dataGen = new HoodieTestDataGenerator(); + String commitTime = HoodieActiveTimeline.createNewInstantTime(); + allRecords.addAll(dataGen.generateInserts(commitTime, 200)); + assertThrows(HoodieUpsertException.class, () -> writeAndVerifyBatch(client, allRecords, commitTime, populateMetaFields)); + // verify pending clustering can be rolled back (even though there is a completed commit greater than pending clustering) + client.rollback(pendingClusteringInstant.getTimestamp()); + metaClient.reloadActiveTimeline(); + // verify there are no pending clustering instants + assertEquals(0, ClusteringUtils.getAllPendingClusteringPlans(metaClient).count()); + + // delete the rollback.completed instant to mimic a failed rollback of clustering, then trigger rollback of clustering again; the same rollback instant should be used. + HoodieInstant rollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get(); + FileCreateUtils.deleteRollbackCommit(metaClient.getBasePath(), rollbackInstant.getTimestamp()); + metaClient.reloadActiveTimeline(); + + // create replace commit requested meta file so that rollback will not throw FileNotFoundException + // create file slice with instantTime 001 and build clustering plan including this created 001 file slice. + HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, pendingClusteringInstant.getTimestamp(), "1"); + // create requested replace commit + HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder() + .setClusteringPlan(clusteringPlan).setOperationType(WriteOperationType.CLUSTER.name()).build(); + + FileCreateUtils.createRequestedReplaceCommit(metaClient.getBasePath(), pendingClusteringInstant.getTimestamp(), Option.of(requestedReplaceMetadata)); + + // trigger clustering again. no new rollback instants should be generated. + try { + client.cluster(pendingClusteringInstant.getTimestamp(), false); + // the replace commit metadata generated here is a fake one, so clustering will fail; the intention of the test is to check for duplicate rollback instants. + } catch (Exception e) { + // ignore.
+ } + + metaClient.reloadActiveTimeline(); + // verify that there is no new rollback instant generated + HoodieInstant newRollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get(); + assertEquals(rollbackInstant.getTimestamp(), newRollbackInstant.getTimestamp()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInflightClusteringRollbackWhenUpdatesAllowed(boolean rollbackPendingClustering) throws Exception { + // setup clustering config with update strategy to allow updates during ingestion + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder() + .withClusteringMaxNumGroups(10).withClusteringTargetPartitions(0) + .withClusteringUpdatesStrategy("org.apache.hudi.client.clustering.update.strategy.SparkAllowUpdateStrategy") + .withRollbackPendingClustering(rollbackPendingClustering) + .withInlineClustering(true).withInlineClusteringNumCommits(1).build(); + + // start clustering, but don't commit keep it inflight + List allRecords = testInsertAndClustering(clusteringConfig, true, false); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + List> pendingClusteringPlans = + ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList()); + assertEquals(1, pendingClusteringPlans.size()); + HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft(); + assertEquals(pendingClusteringInstant.getState(), INFLIGHT); + + // make an update to a filegroup within the partition that is pending clustering + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(EAGER); + addConfigsForPopulateMetaFields(cfgBuilder, true); + cfgBuilder.withClusteringConfig(clusteringConfig); + HoodieWriteConfig config = cfgBuilder.build(); + SparkRDDWriteClient client = getHoodieWriteClient(config); + String commitTime = HoodieActiveTimeline.createNewInstantTime(); + allRecords.addAll(dataGen.generateUpdates(commitTime, 200)); + writeAndVerifyBatch(client, allRecords, commitTime, true); + + // verify inflight clustering was rolled back + metaClient.reloadActiveTimeline(); + pendingClusteringPlans = ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList()); + assertEquals(config.isRollbackPendingClustering() ? 0 : 1, pendingClusteringPlans.size()); + } + + @Test + public void testClusteringWithFailingValidator() throws Exception { + // setup clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringSortColumns("_hoodie_record_key").withInlineClustering(true) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build(); + try { + testInsertAndClustering(clusteringConfig, true, true, false, FailingPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); + fail("expected pre-commit clustering validation to fail"); + } catch (HoodieValidationException e) { + // expected + } + } + + @Test + public void testClusteringInvalidConfigForSqlQueryValidator() throws Exception { + // setup clustering config. 
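+ // the equality validator below is configured without any SQL query, so pre-commit validation is expected to fail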
+ HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true).build(); + try { + testInsertAndClustering(clusteringConfig, false, true, false, SqlQueryEqualityPreCommitValidator.class.getName(), "", ""); + fail("expected pre-commit clustering validation to fail because sql query is not configured"); + } catch (HoodieValidationException e) { + // expected + } + } + + @Test + public void testClusteringInvalidConfigForSqlQuerySingleResultValidator() throws Exception { + // setup clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true).build(); + + testInsertAndClustering(clusteringConfig, false, true, false, SqlQuerySingleResultPreCommitValidator.class.getName(), + "", COUNT_SQL_QUERY_FOR_VALIDATION + "#400"); + } + + @Test + public void testClusteringInvalidConfigForSqlQuerySingleResultValidatorFailure() throws Exception { + // setup clustering config. + HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true).build(); + + try { + testInsertAndClustering(clusteringConfig, false, true, false, SqlQuerySingleResultPreCommitValidator.class.getName(), + "", COUNT_SQL_QUERY_FOR_VALIDATION + "#802"); + fail("expected pre-commit clustering validation to fail because of count mismatch. expect 400 rows, not 802"); + } catch (HoodieValidationException e) { + // expected + } + } + + private List testInsertAndClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields, boolean completeClustering) throws Exception { + return testInsertAndClustering(clusteringConfig, populateMetaFields, completeClustering, false, "", "", ""); + } + + private List testInsertAndClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields, + boolean completeClustering, boolean assertSameFileIds, String validatorClasses, + String sqlQueryForEqualityValidation, String sqlQueryForSingleResultValidation) throws Exception { + Pair, List>, Set> allRecords = testInsertTwoBatches(populateMetaFields); + testClustering(clusteringConfig, populateMetaFields, completeClustering, assertSameFileIds, validatorClasses, sqlQueryForEqualityValidation, sqlQueryForSingleResultValidation, allRecords); + return allRecords.getLeft().getLeft(); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testFailWritesOnInlineTableServiceExceptions(boolean shouldFail) throws IOException { + try { + Properties properties = new Properties(); + properties.setProperty("hoodie.fail.writes.on.inline.table.service.exception", String.valueOf(shouldFail)); + properties.setProperty("hoodie.auto.commit", "false"); + properties.setProperty("hoodie.clustering.inline.max.commits", "1"); + properties.setProperty("hoodie.clustering.inline", "true"); + testInsertTwoBatches(true, "2015/03/16", properties, true); + assertFalse(shouldFail); + } catch (HoodieException e) { + assertEquals(CLUSTERING_FAILURE, e.getMessage()); + assertTrue(shouldFail); + } + } + + private Pair, List>, Set> testInsertTwoBatches(boolean populateMetaFields) throws IOException { + return testInsertTwoBatches(populateMetaFields, "2015/03/16"); + } + + private Pair, List>, Set> 
testInsertTwoBatches(boolean populateMetaFields, String partitionPath) throws IOException { + return testInsertTwoBatches(populateMetaFields, partitionPath, new Properties(), false); + } + + /** + * This method returns following three items: + * 1. List of all HoodieRecord written in the two batches of insert. + * 2. Commit instants of the two batches. + * 3. List of new file group ids that were written in the two batches. + */ + private Pair, List>, Set> testInsertTwoBatches(boolean populateMetaFields, String partitionPath, Properties props, + boolean failInlineClustering) throws IOException { + // create config to not update small files. + HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false, populateMetaFields, + populateMetaFields ? props : getPropertiesForKeyGen()); + SparkRDDWriteClient client; + if (failInlineClustering) { + if (null != writeClient) { + writeClient.close(); + writeClient = null; + } + client = new WriteClientBrokenClustering(context, config); + } else { + client = getHoodieWriteClient(config); + } + + dataGen = new HoodieTestDataGenerator(new String[] {partitionPath}); + String commitTime1 = HoodieActiveTimeline.createNewInstantTime(); + List records1 = dataGen.generateInserts(commitTime1, 200); + List statuses1 = writeAndVerifyBatch(client, records1, commitTime1, populateMetaFields, failInlineClustering); + Set fileIds1 = getFileGroupIdsFromWriteStatus(statuses1); + + String commitTime2 = HoodieActiveTimeline.createNewInstantTime(); + List records2 = dataGen.generateInserts(commitTime2, 200); + List statuses2 = writeAndVerifyBatch(client, records2, commitTime2, populateMetaFields, failInlineClustering); + Set fileIds2 = getFileGroupIdsFromWriteStatus(statuses2); + Set fileIdsUnion = new HashSet<>(fileIds1); + fileIdsUnion.addAll(fileIds2); + //verify new files are created for 2nd write + Set fileIdIntersection = new HashSet<>(fileIds1); + fileIdIntersection.retainAll(fileIds2); + assertEquals(0, fileIdIntersection.size()); + return Pair.of(Pair.of(Stream.concat(records1.stream(), records2.stream()).collect(Collectors.toList()), Arrays.asList(commitTime1, commitTime2)), fileIdsUnion); + } + + private void testClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields, boolean completeClustering, boolean assertSameFileIds, + String validatorClasses, String sqlQueryForEqualityValidation, String sqlQueryForSingleResultValidation, + Pair, List>, Set> allRecords) throws IOException { + + HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(false) + .withClusteringConfig(clusteringConfig) + .withProps(getPropertiesForKeyGen()).build(); + HoodieWriteMetadata> clusterMetadata = + performClustering(clusteringConfig, populateMetaFields, completeClustering, validatorClasses, sqlQueryForEqualityValidation, sqlQueryForSingleResultValidation, allRecords.getLeft()); + if (assertSameFileIds) { + Set replacedFileIds = clusterMetadata.getWriteStats().get().stream() + .map(s -> new HoodieFileGroupId(s.getPartitionPath(),s.getFileId())).collect(Collectors.toSet()); + Set insertedFileIds = allRecords.getRight(); + assertEquals(insertedFileIds, replacedFileIds); + } + if (completeClustering) { + String clusteringCommitTime = metaClient.reloadActiveTimeline().getCompletedReplaceTimeline() + .getReverseOrderedInstants().findFirst().get().getTimestamp(); + verifyRecordsWritten(clusteringCommitTime, populateMetaFields, allRecords.getLeft().getLeft(), 
clusterMetadata.getWriteStatuses().collect(), config); + } + } + + private HoodieWriteMetadata> performClustering(HoodieClusteringConfig clusteringConfig, + boolean populateMetaFields, + boolean completeClustering, + String validatorClasses, + String sqlQueryForEqualityValidation, String sqlQueryForSingleResultValidation, + Pair, List> allRecords) throws IOException { + HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder() + .withPreCommitValidator(StringUtils.nullToEmpty(validatorClasses)) + .withPrecommitValidatorEqualitySqlQueries(sqlQueryForEqualityValidation) + .withPrecommitValidatorSingleResultSqlQueries(sqlQueryForSingleResultValidation) + .build(); + + HoodieWriteConfig config = getConfigBuilder().withAutoCommit(false) + .withPreCommitValidatorConfig(validatorConfig) + .withProps(populateMetaFields ? new Properties() : getPropertiesForKeyGen()) + .withClusteringConfig(clusteringConfig).build(); + + // create client with new config. + SparkRDDWriteClient client = getHoodieWriteClient(config); + String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString(); + HoodieWriteMetadata> clusterMetadata = client.cluster(clusteringCommitTime, completeClustering); + if (config.isPreserveHoodieCommitMetadataForClustering() && config.populateMetaFields()) { + verifyRecordsWrittenWithPreservedMetadata(new HashSet<>(allRecords.getRight()), allRecords.getLeft(), clusterMetadata.getWriteStatuses().collect()); + } else { + verifyRecordsWritten(clusteringCommitTime, populateMetaFields, allRecords.getLeft(), clusterMetadata.getWriteStatuses().collect(), config); + } + + Set replacedFileIds = new HashSet<>(); + clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> + partitionFiles.getValue().stream().forEach(file -> + replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file)))); + return clusterMetadata; + } + + private Set getFileGroupIdsFromWriteStatus(List statuses) { + return statuses.stream().map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet()); + } + + /** + * Test scenario of writing more file groups than existing number of file groups in partition. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testInsertOverwritePartitionHandlingWithMoreRecords(boolean populateMetaFields) throws Exception { + verifyInsertOverwritePartitionHandling(1000, 3000, populateMetaFields); + } + + /** + * Test scenario of writing fewer file groups than existing number of file groups in partition. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testInsertOverwritePartitionHandlingWithFewerRecords(boolean populateMetaFields) throws Exception { + verifyInsertOverwritePartitionHandling(3000, 1000, populateMetaFields); + } + + /** + * Test scenario of writing similar number file groups in partition. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception { + verifyInsertOverwritePartitionHandling(3000, 3000, populateMetaFields); + } + + /** + * 1) Do write1 (upsert) with 'batch1RecordsCount' number of records. + * 2) Do write2 (insert overwrite) with 'batch2RecordsCount' number of records. 
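+ * Both batches target the same partition ("americas"), so the insert overwrite is expected to replace every file group created in step 1.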
+ * + * Verify that all records in step 1 are overwritten. + */ + private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount, boolean populateMetaFields) throws Exception { + final String testPartitionPath = "americas"; + HoodieWriteConfig config = getSmallInsertWriteConfig(2000, + TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields + ? new Properties() : getPropertiesForKeyGen()); + SparkRDDWriteClient client = getHoodieWriteClient(config); + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + + // Do Inserts + String commit1 = "001"; + List<WriteStatus> statuses = writeAndVerifyBatch(client, dataGen.generateInserts(commit1, batch1RecordsCount), commit1, populateMetaFields); + Set<String> batch1Buckets = getFileIdsFromWriteStatus(statuses); + + // Do Insert Overwrite + String commitTime2 = "002"; + client.startCommitWithTime(commitTime2, REPLACE_COMMIT_ACTION); + List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, batch2RecordsCount); + List<HoodieRecord> insertsAndUpdates2 = new ArrayList<>(); + insertsAndUpdates2.addAll(inserts2); + JavaRDD<HoodieRecord> insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 2); + HoodieWriteResult writeResult = client.insertOverwrite(insertAndUpdatesRDD2, commitTime2); + statuses = writeResult.getWriteStatuses().collect(); + assertNoWriteErrors(statuses); + + assertEquals(batch1Buckets, new HashSet<>(writeResult.getPartitionToReplaceFileIds().get(testPartitionPath))); + verifyRecordsWritten(commitTime2, populateMetaFields, inserts2, statuses, config); + } + + private Set<String> getFileIdsFromWriteStatus(List<WriteStatus> statuses) { + return statuses.stream().map(s -> s.getFileId()).collect(Collectors.toSet()); + } + + /** + * Test scenario of writing fewer file groups for the first partition than the second and third partitions. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition(boolean populateMetaFields) throws Exception { + verifyDeletePartitionsHandling(1000, 3000, 3000, populateMetaFields); + } + + /** + * Test scenario of writing a similar number of file groups in each partition. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception { + verifyDeletePartitionsHandling(3000, 3000, 3000, populateMetaFields); + } + + /** + * Test scenario of writing more file groups for the first partition than the second and third partitions.
+ */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition(boolean populateMetaFields) throws Exception { + verifyDeletePartitionsHandling(3000, 1000, 1000, populateMetaFields); + } + + private Set insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) throws IOException { + client.startCommitWithTime(commitTime1); + List inserts1 = dataGen.generateInsertsForPartition(commitTime1, recordsCount, partitionPath); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 2); + List statuses = client.upsert(insertRecordsRDD1, commitTime1).collect(); + assertNoWriteErrors(statuses); + Set batchBuckets = statuses.stream().map(s -> s.getFileId()).collect(Collectors.toSet()); + verifyRecordsWritten(commitTime1, true, inserts1, statuses, client.getConfig()); + return batchBuckets; + } + + private Set deletePartitionWithCommit(SparkRDDWriteClient client, String commitTime, List deletePartitionPath) { + client.startCommitWithTime(commitTime, REPLACE_COMMIT_ACTION); + HoodieWriteResult writeResult = client.deletePartitions(deletePartitionPath, commitTime); + Set deletePartitionReplaceFileIds = + writeResult.getPartitionToReplaceFileIds().entrySet() + .stream().flatMap(entry -> entry.getValue().stream()).collect(Collectors.toSet()); + return deletePartitionReplaceFileIds; + } + + /** + * 1) Do write1 (upsert) with 'batch1RecordsCount' number of records for first partition. + * 2) Do write2 (upsert) with 'batch2RecordsCount' number of records for second partition. + * 3) Do write3 (upsert) with 'batch3RecordsCount' number of records for third partition. + * 4) delete first partition and check result. + * 5) delete second and third partition and check result. + * + */ + private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount, + boolean populateMetaFields) throws Exception { + HoodieWriteConfig config = getSmallInsertWriteConfig(2000, + TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields + ? 
new Properties() : getPropertiesForKeyGen()); + SparkRDDWriteClient client = getHoodieWriteClient(config); + dataGen = new HoodieTestDataGenerator(); + + // Do Inserts for DEFAULT_FIRST_PARTITION_PATH + String commitTime1 = "001"; + Set batch1Buckets = + this.insertPartitionRecordsWithCommit(client, batch1RecordsCount, commitTime1, DEFAULT_FIRST_PARTITION_PATH); + + // Do Inserts for DEFAULT_SECOND_PARTITION_PATH + String commitTime2 = "002"; + Set batch2Buckets = + this.insertPartitionRecordsWithCommit(client, batch2RecordsCount, commitTime2, DEFAULT_SECOND_PARTITION_PATH); + + // Do Inserts for DEFAULT_THIRD_PARTITION_PATH + String commitTime3 = "003"; + Set batch3Buckets = + this.insertPartitionRecordsWithCommit(client, batch3RecordsCount, commitTime3, DEFAULT_THIRD_PARTITION_PATH); + + // delete DEFAULT_FIRST_PARTITION_PATH + String commitTime4 = "004"; + Set deletePartitionReplaceFileIds1 = + deletePartitionWithCommit(client, commitTime4, Arrays.asList(DEFAULT_FIRST_PARTITION_PATH)); + assertEquals(batch1Buckets, deletePartitionReplaceFileIds1); + List baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, + String.format("%s/%s/*", basePath, DEFAULT_FIRST_PARTITION_PATH)); + assertEquals(0, baseFiles.size()); + baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, + String.format("%s/%s/*", basePath, DEFAULT_SECOND_PARTITION_PATH)); + assertTrue(baseFiles.size() > 0); + baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, + String.format("%s/%s/*", basePath, DEFAULT_THIRD_PARTITION_PATH)); + assertTrue(baseFiles.size() > 0); + + // delete DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH + String commitTime5 = "005"; + Set deletePartitionReplaceFileIds2 = + deletePartitionWithCommit(client, commitTime5, Arrays.asList(DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH)); + Set expectedFileId = new HashSet<>(); + expectedFileId.addAll(batch2Buckets); + expectedFileId.addAll(batch3Buckets); + assertEquals(expectedFileId, deletePartitionReplaceFileIds2); + + baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, + String.format("%s/%s/*", basePath, DEFAULT_FIRST_PARTITION_PATH), + String.format("%s/%s/*", basePath, DEFAULT_SECOND_PARTITION_PATH), + String.format("%s/%s/*", basePath, DEFAULT_THIRD_PARTITION_PATH)); + assertEquals(0, baseFiles.size()); + } + + /** + * Verify data in base files matches expected records and commit time. 
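+ * When meta fields are not populated, record keys are re-derived with the configured KeyGenerator instead of being read from the _hoodie_record_key meta column.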
+ */ + private void verifyRecordsWritten(String commitTime, boolean populateMetadataField, + List expectedRecords, List allStatus, HoodieWriteConfig config) throws IOException { + List records = new ArrayList<>(); + Set expectedKeys = verifyRecordKeys(expectedRecords, allStatus, records); + if (config.populateMetaFields()) { + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertEquals(commitTime, + record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); + assertTrue(expectedKeys.contains(recordKey)); + } + } else { + KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps())); + for (GenericRecord record : records) { + String recordKey = keyGenerator.getKey(record).getRecordKey(); + if (!populateMetadataField) { + assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)); + } + assertTrue(expectedKeys.contains(recordKey)); + } + } + } + + @NotNull + private Set verifyRecordKeys(List expectedRecords, List allStatus, List records) { + for (WriteStatus status : allStatus) { + Path filePath = new Path(basePath, status.getStat().getPath()); + records.addAll(BaseFileUtils.getInstance(metaClient).readAvroRecords(jsc.hadoopConfiguration(), filePath)); + } + Set expectedKeys = recordsToRecordKeySet(expectedRecords); + assertEquals(records.size(), expectedKeys.size()); + return expectedKeys; + } + + private void verifyRecordsWrittenWithPreservedMetadata(Set commitTimes, List expectedRecords, List allStatus) { + List records = new ArrayList<>(); + Set expectedKeys = verifyRecordKeys(expectedRecords, allStatus, records); + Map> recordsByCommitTime = records.stream() + .collect(Collectors.groupingBy(r -> r.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString())); + assertTrue(commitTimes.containsAll(recordsByCommitTime.keySet())); + Set expectedFileIds = allStatus.stream().map(WriteStatus::getFileId).collect(Collectors.toSet()); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertTrue(expectedKeys.contains(recordKey)); + String fileName = record.get(HoodieRecord.FILENAME_METADATA_FIELD).toString(); + assertTrue(expectedFileIds.contains(FSUtils.getFileId(fileName))); + } + } + + private List writeAndVerifyBatch(SparkRDDWriteClient client, List inserts, String commitTime, boolean populateMetaFields) throws IOException { + return writeAndVerifyBatch(client, inserts, commitTime, populateMetaFields, false); + } + + private List writeAndVerifyBatch(SparkRDDWriteClient client, List inserts, String commitTime, boolean populateMetaFields, boolean autoCommitOff) throws IOException { + client.startCommitWithTime(commitTime); + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts, 2); + JavaRDD statusRDD = client.upsert(insertRecordsRDD1, commitTime); + if (autoCommitOff) { + client.commit(commitTime, statusRDD); + } + List statuses = statusRDD.collect(); + assertNoWriteErrors(statuses); + verifyRecordsWritten(commitTime, populateMetaFields, inserts, statuses, client.getConfig()); + + return statuses; + } + + private Pair, List> testUpdates(String instantTime, SparkRDDWriteClient client, + int sizeToInsertAndUpdate, int expectedTotalRecords) + throws IOException { + client.startCommitWithTime(instantTime); + List inserts = dataGen.generateInserts(instantTime, sizeToInsertAndUpdate); + Set keys = recordsToRecordKeySet(inserts); + List insertsAndUpdates = new ArrayList<>(); + 
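+ // Combine the fresh inserts with updates generated against them, so the single upsert below exercises both the insert and the update path.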
insertsAndUpdates.addAll(inserts); + insertsAndUpdates.addAll(dataGen.generateUpdates(instantTime, inserts)); + + JavaRDD insertAndUpdatesRDD = jsc.parallelize(insertsAndUpdates, 1); + List statuses = client.upsert(insertAndUpdatesRDD, instantTime).collect(); + assertNoWriteErrors(statuses); + + // Check the entire dataset has all records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals(expectedTotalRecords, + HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), + "Must contain " + expectedTotalRecords + " records"); + return Pair.of(keys, inserts); + } + + private void testDeletes(SparkRDDWriteClient client, List previousRecords, int sizeToDelete, + String existingFile, String instantTime, int expectedRecords, List keys) { + client.startCommitWithTime(instantTime); + + List hoodieKeysToDelete = randomSelectAsHoodieKeys(previousRecords, sizeToDelete); + JavaRDD deleteKeys = jsc.parallelize(hoodieKeysToDelete, 1); + List statuses = client.delete(deleteKeys, instantTime).collect(); + + assertNoWriteErrors(statuses); + + assertEquals(1, statuses.size(), "Just 1 file needs to be added."); + assertEquals(existingFile, statuses.get(0).getFileId(), "Existing file should be expanded"); + + // Check the entire dataset has all records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals(expectedRecords, + HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), + "Must contain " + expectedRecords + " records"); + + Path newFile = new Path(basePath, statuses.get(0).getStat().getPath()); + assertEquals(expectedRecords, + BaseFileUtils.getInstance(metaClient).readRowKeys(hadoopConf, newFile).size(), + "file should contain 110 records"); + + List records = BaseFileUtils.getInstance(metaClient).readAvroRecords(hadoopConf, newFile); + for (GenericRecord record : records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertTrue(keys.contains(recordKey), "key expected to be part of " + instantTime); + assertFalse(hoodieKeysToDelete.contains(recordKey), "Key deleted"); + } + } + + /** + * Test delete with delete api. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testDeletesWithoutInserts(boolean populateMetaFields) { + final String testPartitionPath = "2016/09/26"; + final int insertSplitLimit = 100; + // setup the small file handling params + HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, + TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields + ? 
new Properties() : getPropertiesForKeyGen()); + dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + SparkRDDWriteClient client = getHoodieWriteClient(config); + + // delete non-existent keys + String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + + List dummyInserts = dataGen.generateInserts(commitTime1, 20); + List hoodieKeysToDelete = randomSelectAsHoodieKeys(dummyInserts, 20); + JavaRDD deleteKeys = jsc.parallelize(hoodieKeysToDelete, 1); + assertThrows(HoodieIOException.class, () -> { + client.delete(deleteKeys, commitTime1).collect(); + }, "Should have thrown Exception"); + } + + /** + * Test to ensure commit metadata points to valid files. + */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testCommitWritesRelativePaths(boolean populateMetaFields) throws Exception { + + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) { + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + HoodieSparkTable table = HoodieSparkTable.create(cfgBuilder.build(), context, metaClient); + + String instantTime = "000"; + client.startCommitWithTime(instantTime); + + List records = dataGen.generateInserts(instantTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + JavaRDD result = client.bulkInsert(writeRecords, instantTime); + + assertTrue(client.commit(instantTime, result), "Commit should succeed"); + assertTrue(testTable.commitExists(instantTime), + "After explicit commit, commit file should be created"); + + // Get base file paths from commit metadata + String actionType = metaClient.getCommitActionType(); + HoodieInstant commitInstant = new HoodieInstant(false, actionType, instantTime); + HoodieTimeline commitTimeline = metaClient.getCommitTimeline().filterCompletedInstants(); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(commitTimeline.getInstantDetails(commitInstant).get(), HoodieCommitMetadata.class); + String basePath = table.getMetaClient().getBasePath(); + Collection commitPathNames = commitMetadata.getFileIdAndFullPaths(new Path(basePath)).values(); + + // Read from commit file + try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime))) { + String everything = FileIOUtils.readAsUTFString(inputStream); + HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class); + HashMap paths = metadata.getFileIdAndFullPaths(new Path(basePath)); + // Compare values in both to make sure they are equal. + for (String pathName : paths.values()) { + assertTrue(commitPathNames.contains(pathName)); + } + } + } + } + + /** + * Test to ensure commit metadata tracks write statistics (insert and update counts) correctly.
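+ * Instant 000 bulk-inserts 200 records and instant 001 updates them, so the HoodieWriteStat counts read back from the commit files should show 200 inserts followed by 200 updates.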
+ */ + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testMetadataStatsOnCommit(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + + String instantTime0 = "000"; + client.startCommitWithTime(instantTime0); + + List records0 = dataGen.generateInserts(instantTime0, 200); + JavaRDD writeRecords0 = jsc.parallelize(records0, 1); + JavaRDD result0 = client.bulkInsert(writeRecords0, instantTime0); + + assertTrue(client.commit(instantTime0, result0), "Commit should succeed"); + assertTrue(testTable.commitExists(instantTime0), + "After explicit commit, commit file should be created"); + + // Read from commit file + try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime0))) { + String everything = FileIOUtils.readAsUTFString(inputStream); + HoodieCommitMetadata metadata = + HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class); + int inserts = 0; + for (Map.Entry> pstat : metadata.getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat stat : pstat.getValue()) { + inserts += stat.getNumInserts(); + } + } + assertEquals(200, inserts); + } + + // Update + Inserts such that they just expand file1 + String instantTime1 = "001"; + client.startCommitWithTime(instantTime1); + + List records1 = dataGen.generateUpdates(instantTime1, records0); + JavaRDD writeRecords1 = jsc.parallelize(records1, 1); + JavaRDD result1 = client.upsert(writeRecords1, instantTime1); + + assertTrue(client.commit(instantTime1, result1), "Commit should succeed"); + assertTrue(testTable.commitExists(instantTime1), + "After explicit commit, commit file should be created"); + + // Read from commit file + try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime1))) { + String everything = FileIOUtils.readAsUTFString(inputStream); + HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class); + int inserts = 0; + int upserts = 0; + for (Map.Entry> pstat : metadata.getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat stat : pstat.getValue()) { + inserts += stat.getNumInserts(); + upserts += stat.getNumUpdateWrites(); + } + } + assertEquals(0, inserts); + assertEquals(200, upserts); + } + } + + /** + * Tests behavior of committing only when consistency is verified. 
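+ * A dummy marker file with no matching data file is left behind; the commit is expected to fail with the fail-safe guard and to succeed once the optimistic guard is enabled.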
+ */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testConsistencyCheckDuringFinalize(boolean enableOptimisticConsistencyGuard) throws Exception { + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + String instantTime = "000"; + HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() + .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build(); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + Pair> result = testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard); + + // Delete orphan marker and commit should succeed + metaClient.getFs().delete(result.getKey(), false); + if (!enableOptimisticConsistencyGuard) { + assertTrue(client.commit(instantTime, result.getRight()), "Commit should succeed"); + assertTrue(testTable.commitExists(instantTime), + "After explicit commit, commit file should be created"); + // Marker directory must be removed + assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); + } else { + // with optimistic, first client.commit should have succeeded. + assertTrue(testTable.commitExists(instantTime), + "After explicit commit, commit file should be created"); + // Marker directory must be removed + assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); + } + } + + private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard, + boolean populateMetaFields) throws Exception { + String instantTime = "00000000000010"; + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + + Properties properties = new Properties(); + if (!populateMetaFields) { + properties = getPropertiesForKeyGen(); + } + + HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) + .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build() : + getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() + .withConsistencyCheckEnabled(true) + .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard) + .withOptimisticConsistencyGuardSleepTimeMs(1).build()) + .withProperties(properties).build(); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard); + + if (!enableOptimisticConsistencyGuard) { + // Rollback of this commit should succeed with FailSafeCG + client.rollback(instantTime); + assertFalse(testTable.commitExists(instantTime), + "After explicit rollback, commit file should not be present"); + // Marker directory must be removed after rollback + assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); + } else { + // if optimistic CG is enabled, commit should have succeeded. + assertTrue(testTable.commitExists(instantTime), + "With optimistic CG, first commit should succeed. 
commit file should be present"); + // Marker directory must be removed after rollback + assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); + client.rollback(instantTime); + assertFalse(testTable.commitExists(instantTime), + "After explicit rollback, commit file should not be present"); + } + } + + @ParameterizedTest + @MethodSource("rollbackAfterConsistencyCheckFailureParams") + public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception { + testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard, populateMetCols); + } + + @ParameterizedTest + @MethodSource("rollbackAfterConsistencyCheckFailureParams") + public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception { + testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard, populateMetCols); + } + + @ParameterizedTest + @MethodSource("rollbackFailedCommitsParams") + public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) throws Exception { + HoodieTestUtils.init(hadoopConf, basePath); + SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + + // perform 1 successfull commit + writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, true); + + // Perform 2 failed writes to table + writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "100", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, false); + client.close(); + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, false); + client.close(); + // refresh data generator to delete records generated from failed commits + dataGen = new HoodieTestDataGenerator(); + // Perform 1 successful write + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, true); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + + assertTrue(metaClient.getActiveTimeline().getTimelineOfActions( + CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0); + assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2); + assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 2); + // Await till enough time passes such that the first 2 failed commits heartbeats are expired + boolean conditionMet = false; + while (!conditionMet) { + conditionMet = client.getHeartbeatClient().isHeartbeatExpired("300"); + Thread.sleep(2000); + } + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + // Perform 1 successful write + writeBatch(client, "500", "400", Option.of(Arrays.asList("500")), "500", + 100, 
dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, true); + client.clean(); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload(); + if (cleaningPolicy.isLazy()) { + assertTrue( + timeline + .getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)) + .countInstants() + == 2); + // Since we write rollbacks not clean, there should be no clean action on the timeline + assertTrue( + timeline + .getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)) + .countInstants() + == 0); + assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3); + } else if (cleaningPolicy.isNever()) { + assertTrue( + timeline + .getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)) + .countInstants() + == 0); + // There should be no clean or rollback action on the timeline + assertTrue( + timeline + .getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)) + .countInstants() + == 0); + assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3); + } + } + + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFields) throws Exception { + HoodieTestUtils.init(hadoopConf, basePath); + HoodieFailedWritesCleaningPolicy cleaningPolicy = EAGER; + SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + // Perform 1 successful writes to table + writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, true); + + // Perform 1 failed writes to table + writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, false); + client.close(); + // Toggle cleaning policy to LAZY + cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY; + // Perform 2 failed writes to table + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, false); + client.close(); + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, false); + client.close(); + // Await till enough time passes such that the 2 failed commits heartbeats are expired + boolean conditionMet = false; + while (!conditionMet) { + conditionMet = client.getHeartbeatClient().isHeartbeatExpired("400"); + Thread.sleep(2000); + } + client.clean(); + HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload(); + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 3); + // Perform 2 failed commits + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + writeBatch(client, "500", "400", Option.of(Arrays.asList("300")), "300", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, false); + client.close(); + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, 
populateMetaFields)); + writeBatch(client, "600", "500", Option.of(Arrays.asList("400")), "400", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, + 0, false); + client.close(); + // Toggle cleaning policy to EAGER + cleaningPolicy = EAGER; + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); + client.startCommit(); + timeline = metaClient.getActiveTimeline().reload(); + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 5); + assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 1); + } + + @Test + public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception { + HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY; + ExecutorService service = Executors.newFixedThreadPool(2); + HoodieTestUtils.init(hadoopConf, basePath); + SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)); + // perform 1 successfull write + writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, + 0, true); + + // Perform 2 failed writes to table + writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, + 0, false); + client.close(); + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)); + writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", + 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, + 0, false); + client.close(); + // refresh data generator to delete records generated from failed commits + dataGen = new HoodieTestDataGenerator(); + // Create a succesful commit + Future> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), + "400", "300", Option.of(Arrays.asList("400")), "300", 100, dataGen::generateInserts, + SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true)); + commit3.get(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + + assertTrue(metaClient.getActiveTimeline().getTimelineOfActions( + CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0); + assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2); + assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 2); + client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)); + // Await till enough time passes such that the first 2 failed commits heartbeats are expired + boolean conditionMet = false; + while (!conditionMet) { + conditionMet = client.getHeartbeatClient().isHeartbeatExpired("300"); + Thread.sleep(2000); + } + Future> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), + "500", "400", Option.of(Arrays.asList("500")), "500", 100, dataGen::generateInserts, + SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true)); + Future clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)).clean()); + commit4.get(); + clean1.get(); + HoodieActiveTimeline timeline = 
metaClient.getActiveTimeline().reload(); + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 2); + // Since we write rollbacks not clean, there should be no clean action on the timeline + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0); + assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3); + } + + private Pair> testConsistencyCheck(HoodieTableMetaClient metaClient, String instantTime, boolean enableOptimisticConsistencyGuard) + throws Exception { + HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? (getConfigBuilder().withAutoCommit(false) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) + .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()) + .build()) : (getConfigBuilder().withAutoCommit(false) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) + .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard) + .withOptimisticConsistencyGuardSleepTimeMs(1).build()) + .build()); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + + client.startCommitWithTime(instantTime); + JavaRDD writeRecords = jsc.parallelize(dataGen.generateInserts(instantTime, 200), 1); + JavaRDD result = client.bulkInsert(writeRecords, instantTime); + result.collect(); + + // Create a dummy marker file to simulate the case that a marker file was created without data file. + // This should fail the commit + String partitionPath; + String markerFolderPath = metaClient.getMarkerFolderPath(instantTime); + if (cfg.getMarkersType() == MarkerType.TIMELINE_SERVER_BASED) { + String markerName = MarkerUtils.readTimelineServerBasedMarkersFromFileSystem( + markerFolderPath, fs, context, 1).values().stream() + .flatMap(Collection::stream).findFirst().get(); + partitionPath = new Path(markerFolderPath, markerName).getParent().toString(); + } else { + partitionPath = Arrays + .stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", markerFolderPath)), + path -> path.toString().contains(HoodieTableMetaClient.MARKER_EXTN))) + .limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0); + } + + Option markerFilePath = WriteMarkersFactory.get( + cfg.getMarkersType(), getHoodieTable(metaClient, cfg), instantTime) + .create(partitionPath, + FSUtils.makeBaseFileName(instantTime, "1-0-1", UUID.randomUUID().toString()), + IOType.MERGE); + LOG.info("Created a dummy marker path=" + markerFilePath.get()); + + if (!enableOptimisticConsistencyGuard) { + Exception e = assertThrows(HoodieCommitException.class, () -> { + client.commit(instantTime, result); + }, "Commit should fail due to consistency check"); + assertTrue(e.getCause() instanceof HoodieIOException); + } else { + // with optimistic CG, commit should succeed + client.commit(instantTime, result); + } + return Pair.of(markerFilePath.get(), result); + } + + @ParameterizedTest + @MethodSource("populateMetaFieldsParams") + public void testMultiOperationsPerCommit(boolean populateMetaFields) throws IOException { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false) + .withAllowMultiWriteOnSameInstant(true); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); + SparkRDDWriteClient 
client = getHoodieWriteClient(cfg); + String firstInstantTime = "0000"; + client.startCommitWithTime(firstInstantTime); + int numRecords = 200; + JavaRDD writeRecords = jsc.parallelize(dataGen.generateInserts(firstInstantTime, numRecords), 1); + JavaRDD result = client.bulkInsert(writeRecords, firstInstantTime); + assertTrue(client.commit(firstInstantTime, result), "Commit should succeed"); + assertTrue(testTable.commitExists(firstInstantTime), + "After explicit commit, commit file should be created"); + + // Check the entire dataset has all records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals(numRecords, + HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), + "Must contain " + numRecords + " records"); + + String nextInstantTime = "0001"; + client.startCommitWithTime(nextInstantTime); + JavaRDD updateRecords = jsc.parallelize(dataGen.generateUpdates(nextInstantTime, numRecords), 1); + JavaRDD insertRecords = jsc.parallelize(dataGen.generateInserts(nextInstantTime, numRecords), 1); + JavaRDD inserts = client.bulkInsert(insertRecords, nextInstantTime); + JavaRDD upserts = client.upsert(updateRecords, nextInstantTime); + assertTrue(client.commit(nextInstantTime, inserts.union(upserts)), "Commit should succeed"); + assertTrue(testTable.commitExists(firstInstantTime), + "After explicit commit, commit file should be created"); + int totalRecords = 2 * numRecords; + assertEquals(totalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), + "Must contain " + totalRecords + " records"); + } + + /** + * Build Hoodie Write Config for small data file sizes. + */ + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize) { + return getSmallInsertWriteConfig(insertSplitSize, false); + } + + /** + * Build Hoodie Write Config for small data file sizes. + */ + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, boolean useNullSchema) { + return getSmallInsertWriteConfig(insertSplitSize, useNullSchema, false); + } + + /** + * Build Hoodie Write Config for small data file sizes. + */ + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, boolean useNullSchema, boolean mergeAllowDuplicateInserts) { + return getSmallInsertWriteConfig(insertSplitSize, useNullSchema, dataGen.getEstimatedFileSizeInBytes(150), mergeAllowDuplicateInserts); + } + + /** + * Build Hoodie Write Config for specified small file sizes. + */ + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, boolean useNullSchema, long smallFileSize) { + return getSmallInsertWriteConfig(insertSplitSize, useNullSchema, smallFileSize, false); + } + + /** + * Build Hoodie Write Config for specified small file sizes. + */ + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, boolean useNullSchema, long smallFileSize, boolean mergeAllowDuplicateInserts) { + String schemaStr = useNullSchema ? 
NULL_SCHEMA : TRIP_EXAMPLE_SCHEMA; + return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts); + } + + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize) { + return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false); + } + + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts) { + return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts, true, new Properties()); + } + + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean populateMetaFields, Properties props) { + return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false, populateMetaFields, props); + } + + private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts, + boolean populateMetaFields, Properties props) { + HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr); + if (!populateMetaFields) { + builder.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.SIMPLE).build()); + } + return builder.withCompactionConfig(HoodieCompactionConfig.newBuilder() + .compactionSmallFileSize(smallFileSize) + // Set rollback to LAZY so no inflights are deleted + .insertSplitSize(insertSplitSize).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .build()) + .withStorageConfig(HoodieStorageConfig.newBuilder() + .hfileMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)) + .parquetMaxFileSize(dataGen.getEstimatedFileSizeInBytes(200)).build()) + .withMergeAllowDuplicateOnInserts(mergeAllowDuplicateInserts) + .withProps(props) + .build(); + } + + protected HoodieInstant createRequestedReplaceInstant(HoodieTableMetaClient metaClient, String clusterTime, List[] fileSlices) throws IOException { + HoodieClusteringPlan clusteringPlan = + ClusteringUtils.createClusteringPlan(EXECUTION_STRATEGY_CLASS_NAME.defaultValue(), STRATEGY_PARAMS, fileSlices, Collections.emptyMap()); + + HoodieInstant clusteringInstant = new HoodieInstant(REQUESTED, REPLACE_COMMIT_ACTION, clusterTime); + HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder() + .setClusteringPlan(clusteringPlan).setOperationType(WriteOperationType.CLUSTER.name()).build(); + metaClient.getActiveTimeline().saveToPendingReplaceCommit(clusteringInstant, TimelineMetadataUtils.serializeRequestedReplaceMetadata(requestedReplaceMetadata)); + return clusteringInstant; + } + + private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) { + return getConfigBuilder() + .withEmbeddedTimelineServerEnabled(false) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(cleaningPolicy) + .withAutoClean(false).build()) + .withTimelineLayoutVersion(1) + .withHeartbeatIntervalInMs(3 * 1000) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withAutoCommit(false) + .withProperties(populateMetaFields ? 
new Properties() : getPropertiesForKeyGen()).build(); + } + + public static class FailingPreCommitValidator> extends SparkPreCommitValidator { + + public FailingPreCommitValidator(HoodieSparkTable table, HoodieEngineContext context, HoodieWriteConfig config) { + super(table, context, config); + } + + @Override + protected void validateRecordsBeforeAndAfter(final Dataset before, final Dataset after, final Set partitionsAffected) { + throw new HoodieValidationException("simulate failure"); + } + } + + public static class WriteClientBrokenClustering extends org.apache.hudi.client.SparkRDDWriteClient { + + public WriteClientBrokenClustering(HoodieEngineContext context, HoodieWriteConfig clientConfig) { + super(context, clientConfig); + } + + @Override + protected Option inlineClustering(Option> extraMetadata) { + throw new HoodieException(CLUSTERING_FAILURE); + } + + } + + public static String CLUSTERING_FAILURE = "CLUSTERING FAILURE"; +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java new file mode 100644 index 0000000000000..8c92f8189f762 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java @@ -0,0 +1,610 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieLayoutConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndex.IndexType; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner; +import org.apache.hudi.testutils.Assertions; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Random; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import scala.Tuple2; + +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.deleteMetadataPartition; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists; +import static org.apache.hudi.metadata.MetadataPartitionType.COLUMN_STATS; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +@Tag("functional") +public class TestHoodieIndex extends TestHoodieMetadataBase { + + private static Stream indexTypeParams() { + // IndexType, populateMetaFields, enableMetadataIndex + Object[][] data = new Object[][] { + {IndexType.BLOOM, true, true}, + {IndexType.BLOOM, true, false}, + {IndexType.GLOBAL_BLOOM, true, true}, + 
{IndexType.GLOBAL_BLOOM, true, false}, + {IndexType.SIMPLE, true, true}, + {IndexType.SIMPLE, true, false}, + {IndexType.SIMPLE, false, true}, + {IndexType.SIMPLE, false, false}, + {IndexType.GLOBAL_SIMPLE, true, true}, + {IndexType.GLOBAL_SIMPLE, false, true}, + {IndexType.GLOBAL_SIMPLE, false, false}, + {IndexType.BUCKET, false, true}, + {IndexType.BUCKET, false, false} + }; + return Stream.of(data).map(Arguments::of); + } + + private static final Schema SCHEMA = getSchemaFromResource(TestHoodieIndex.class, "/exampleSchema.avsc", true); + private final Random random = new Random(); + private IndexType indexType; + private HoodieIndex index; + private HoodieWriteConfig config; + + private void setUp(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception { + setUp(indexType, populateMetaFields, true, enableMetadataIndex); + } + + private void setUp(IndexType indexType, boolean populateMetaFields, boolean rollbackUsingMarkers, boolean enableMetadataIndex) throws Exception { + this.indexType = indexType; + initPath(); + initSparkContexts(); + initTestDataGenerator(); + initFileSystem(); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties() + : getPropertiesForKeyGen()); + HoodieIndexConfig.Builder indexBuilder = HoodieIndexConfig.newBuilder().withIndexType(indexType) + .fromProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen()) + .withIndexType(indexType); + if (indexType == IndexType.BUCKET) { + indexBuilder.withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE); + } + config = getConfigBuilder() + .withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen()) + .withRollbackUsingMarkers(rollbackUsingMarkers) + .withIndexConfig(indexBuilder.build()) + .withAutoCommit(false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withMetadataIndexBloomFilter(enableMetadataIndex) + .withMetadataIndexColumnStats(enableMetadataIndex) + .build()) + .withLayoutConfig(HoodieLayoutConfig.newBuilder().fromProperties(indexBuilder.build().getProps()) + .withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build()) + .build(); + writeClient = getHoodieWriteClient(config); + this.index = writeClient.getIndex(); + } + + @AfterEach + public void tearDown() throws IOException { + cleanupResources(); + } + + @ParameterizedTest + @MethodSource("indexTypeParams") + public void testSimpleTagLocationAndUpdate(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception { + setUp(indexType, populateMetaFields, enableMetadataIndex); + String newCommitTime = "001"; + int totalRecords = 10 + random.nextInt(20); + List records = dataGen.generateInserts(newCommitTime, totalRecords); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Test tagLocation without any entries in index + JavaRDD javaRDD = tagLocation(index, writeRecords, hoodieTable); + assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); + + // Insert totalRecords records + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + Assertions.assertNoWriteErrors(writeStatues.collect()); + + // Now tagLocation for these records, index should not tag them since it was a failed + // commit + 
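+ // (Data files were written by the upsert above, but the instant was never committed, so the index should still see these records as new.)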
javaRDD = tagLocation(index, writeRecords, hoodieTable); + assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); + // Now commit this & update location of records inserted and validate no errors + writeClient.commit(newCommitTime, writeStatues); + // Now tagLocation for these records, index should tag them correctly + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + javaRDD = tagLocation(index, writeRecords, hoodieTable); + Map recordKeyToPartitionPathMap = new HashMap(); + List hoodieRecords = writeRecords.collect(); + hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); + + assertEquals(totalRecords, javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size()); + assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch")); + + JavaRDD hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey()); + JavaPairRDD>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable); + List hoodieKeys = hoodieKeyJavaRDD.collect(); + assertEquals(totalRecords, recordLocations.collect().size()); + assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count()); + recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey")); + recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch")); + } + + @Test + public void testLookupIndexWithOrWithoutColumnStats() throws Exception { + setUp(IndexType.BLOOM, true, true); + String newCommitTime = "001"; + int totalRecords = 10 + random.nextInt(20); + List records = dataGen.generateInserts(newCommitTime, totalRecords); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Test tagLocation without any entries in index + JavaRDD javaRDD = tagLocation(index, writeRecords, hoodieTable); + assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); + + // Insert totalRecords records + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + Assertions.assertNoWriteErrors(writeStatues.collect()); + + // Now tagLocation for these records + javaRDD = tagLocation(index, writeRecords, hoodieTable); + assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); + // Now commit this & update location of records inserted + writeClient.commit(newCommitTime, writeStatues); + + // check column_stats partition exists + metaClient = HoodieTableMetaClient.reload(metaClient); + assertTrue(metadataPartitionExists(metaClient.getBasePath(), context, COLUMN_STATS)); + assertTrue(metaClient.getTableConfig().getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())); + + // delete the column_stats partition + deleteMetadataPartition(metaClient.getBasePath(), context, COLUMN_STATS); + + // Now 
tagLocation for these records; they should be tagged correctly even though column_stats is enabled but its partition is no longer present + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + javaRDD = tagLocation(index, writeRecords, hoodieTable); + Map recordKeyToPartitionPathMap = new HashMap(); + List hoodieRecords = writeRecords.collect(); + hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); + + assertEquals(totalRecords, javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size()); + assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch")); + + JavaRDD hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey()); + JavaPairRDD>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable); + List hoodieKeys = hoodieKeyJavaRDD.collect(); + assertEquals(totalRecords, recordLocations.collect().size()); + assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count()); + recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey")); + recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch")); + } + + @ParameterizedTest + @MethodSource("indexTypeParams") + public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception { + setUp(indexType, populateMetaFields, enableMetadataIndex); + String newCommitTime = "001"; + int totalRecords = 10 + random.nextInt(20); + List records = dataGen.generateInserts(newCommitTime, totalRecords); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + JavaRDD javaRDD1 = tagLocation(index, writeRecords, hoodieTable); + + // Duplicate upsert and ensure correctness is maintained + // This approximately imitates the case where the RDD is recomputed: Spark may re-run the RDD computation, but the + // driver-side code, including the timeline state transitions, is not re-executed. We delete the inflight instant so + // that the subsequent upsert does not run into conflicts.
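+ // Note: the result of the repeated upsert below is intentionally ignored; the commit that follows uses the write statuses collected from the first attempt.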
+ metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight")); + + writeClient.upsert(writeRecords, newCommitTime); + Assertions.assertNoWriteErrors(writeStatues.collect()); + + // Now commit this & update location of records inserted and validate no errors + writeClient.commit(newCommitTime, writeStatues); + // Now tagLocation for these records, hbaseIndex should tag them correctly + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + JavaRDD javaRDD = tagLocation(index, writeRecords, hoodieTable); + + Map recordKeyToPartitionPathMap = new HashMap(); + List hoodieRecords = writeRecords.collect(); + hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); + + assertEquals(totalRecords, javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size()); + assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch")); + + JavaRDD hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey()); + JavaPairRDD>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable); + List hoodieKeys = hoodieKeyJavaRDD.collect(); + assertEquals(totalRecords, recordLocations.collect().size()); + assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count()); + recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey")); + recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch")); + } + + @ParameterizedTest + @MethodSource("indexTypeParams") + public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception { + setUp(indexType, populateMetaFields, false, enableMetadataIndex); + String newCommitTime = writeClient.startCommit(); + int totalRecords = 20 + random.nextInt(20); + List records = dataGen.generateInserts(newCommitTime, totalRecords); + JavaRDD writeRecords = jsc.parallelize(records, 1); + metaClient = HoodieTableMetaClient.reload(metaClient); + + // Insert 200 records + JavaRDD writeStatusesRDD = writeClient.upsert(writeRecords, newCommitTime); + // NOTE: This will trigger an actual write + List writeStatuses = writeStatusesRDD.collect(); + Assertions.assertNoWriteErrors(writeStatuses); + // Commit + writeClient.commit(newCommitTime, jsc.parallelize(writeStatuses)); + + List fileIds = writeStatuses.stream().map(WriteStatus::getFileId).collect(Collectors.toList()); + + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Now tagLocation for these records, hbaseIndex should tag them + JavaRDD javaRDD = tagLocation(index, writeRecords, hoodieTable); + assertEquals(totalRecords, javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size()); + + // check tagged records are tagged with correct fileIds + assertEquals(0, javaRDD.filter(record -> record.getCurrentLocation().getFileId() == null).collect().size()); + List taggedFileIds = javaRDD.map(record -> 
record.getCurrentLocation().getFileId()).distinct().collect(); + + Map recordKeyToPartitionPathMap = new HashMap(); + List hoodieRecords = writeRecords.collect(); + hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); + + JavaRDD hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey()); + JavaPairRDD>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable); + List hoodieKeys = hoodieKeyJavaRDD.collect(); + assertEquals(totalRecords, recordLocations.collect().size()); + assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count()); + recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey")); + recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch")); + + // both lists should match + assertTrue(taggedFileIds.containsAll(fileIds) && fileIds.containsAll(taggedFileIds)); + // Rollback the last commit + writeClient.rollback(newCommitTime); + + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled + // back commit + javaRDD = tagLocation(index, writeRecords, hoodieTable); + assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0); + assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0); + } + + private static Stream regularIndexTypeParams() { + // IndexType, populateMetaFields, enableMetadataIndex + Object[][] data = new Object[][] { + // TODO (codope): Enabling metadata index is flaky. Both bloom_filter and col_stats get generated but loading column ranges from the index is failing. + // {IndexType.BLOOM, true, true}, + {IndexType.BLOOM, true, false}, + {IndexType.SIMPLE, true, true}, + {IndexType.SIMPLE, true, false} + }; + return Stream.of(data).map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("regularIndexTypeParams") + public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception { + setUp(indexType, populateMetaFields, enableMetadataIndex); + String p1 = "2016/01/31"; + String p2 = "2015/01/31"; + String rowKey1 = UUID.randomUUID().toString(); + String rowKey2 = UUID.randomUUID().toString(); + String rowKey3 = UUID.randomUUID().toString(); + String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + // place same row key under a different partition. 
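+ // recordStr4 reuses rowKey1 but carries a 2015 timestamp, so its partition path resolves to the second partition (p2).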
+ String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + HoodieRecord record1 = + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + HoodieRecord record2 = + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + HoodieRecord record3 = + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); + HoodieRecord record4 = + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); + String newCommitTime = writeClient.startCommit(); + metaClient = HoodieTableMetaClient.reload(metaClient); + writeClient.upsert(recordRDD, newCommitTime); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); + + // Should not find any files + for (HoodieRecord record : taggedRecordRDD.collect()) { + assertFalse(record.isCurrentLocationKnown()); + } + + // We create three parquet files, each having one record (two different partitions) + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); + final String fileId1 = "fileID1"; + final String fileId2 = "fileID2"; + final String fileId3 = "fileID3"; + + Map>> partitionToFilesNameLengthMap = new HashMap<>(); + Path baseFilePath = testTable.forCommit("0000001").withInserts(p1, fileId1, Collections.singletonList(record1)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation("0000001", WriteOperationType.UPSERT, Arrays.asList(p1, p2), + partitionToFilesNameLengthMap, false, false); + + partitionToFilesNameLengthMap.clear(); + baseFilePath = testTable.forCommit("0000002").withInserts(p1, fileId2, Collections.singletonList(record2)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(p1, k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation("0000002", WriteOperationType.UPSERT, Arrays.asList(p1, p2), + partitionToFilesNameLengthMap, false, false); + + partitionToFilesNameLengthMap.clear(); + baseFilePath = testTable.forCommit("0000003").withInserts(p2, fileId3, Collections.singletonList(record4)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(p2, k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation("0000003", WriteOperationType.UPSERT, Arrays.asList(p1, p2), + partitionToFilesNameLengthMap, false, false); + + // We do the tag again + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); + List records = taggedRecordRDD.collect(); + + // Check results + 
for (HoodieRecord record : records) { + if (record.getRecordKey().equals(rowKey1)) { + if (record.getPartitionPath().equals(p2)) { + assertEquals(record.getCurrentLocation().getFileId(), fileId3); + } else { + assertEquals(record.getCurrentLocation().getFileId(), fileId1); + } + } else if (record.getRecordKey().equals(rowKey2)) { + assertEquals(record.getCurrentLocation().getFileId(), fileId2); + } else if (record.getRecordKey().equals(rowKey3)) { + assertFalse(record.isCurrentLocationKnown()); + } + } + + JavaPairRDD>> recordLocations = getRecordLocations(recordRDD.map(HoodieRecord::getKey), hoodieTable); + for (Tuple2>> entry : recordLocations.collect()) { + if (entry._1.getRecordKey().equals(rowKey1)) { + assertTrue(entry._2.isPresent(), "Row1 should have been present "); + if (entry._1.getPartitionPath().equals(p2)) { + assertTrue(entry._2.isPresent(), "Row1 should have been present "); + assertEquals(entry._2.get().getRight(), fileId3); + } else { + assertEquals(entry._2.get().getRight(), fileId1); + } + } else if (entry._1.getRecordKey().equals(rowKey2)) { + assertTrue(entry._2.isPresent(), "Row2 should have been present "); + assertEquals(entry._2.get().getRight(), fileId2); + } else if (entry._1.getRecordKey().equals(rowKey3)) { + assertFalse(entry._2.isPresent(), "Row3 should have been absent "); + } + } + } + + @Test + public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() throws Exception { + setUp(IndexType.GLOBAL_SIMPLE, true, true); + config = getConfigBuilder() + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType) + .withGlobalSimpleIndexUpdatePartitionPath(true) + .withBloomIndexUpdatePartitionPath(true) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withMetadataIndexBloomFilter(true) + .withMetadataIndexColumnStats(true) + .build()) + .build(); + writeClient = getHoodieWriteClient(config); + index = writeClient.getIndex(); + + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create( + writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext()); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable.getMetaClient(), + SCHEMA, metadataWriter); + + final String p1 = "2016/01/31"; + final String p2 = "2016/02/28"; + + // Create the original partition, and put a record, along with the meta file + // "2016/01/31": 1 file (1_0_20160131101010.parquet) + // this record will be saved in table and will be tagged to an empty record + RawTripTestPayload originalPayload = + new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); + HoodieRecord originalRecord = + new HoodieAvroRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), + originalPayload); + + /* + This record has the same record key as originalRecord but different time so different partition + Because GLOBAL_BLOOM_INDEX_SHOULD_UPDATE_PARTITION_PATH = true, + globalBloomIndex should + - tag the original partition of the originalRecord to an empty record for deletion, and + - tag the new partition of the incomingRecord + */ + RawTripTestPayload incomingPayload = + new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}"); + HoodieRecord incomingRecord = + new HoodieAvroRecord(new HoodieKey(incomingPayload.getRowKey(), 
incomingPayload.getPartitionPath()), + incomingPayload); + /* + This record has the same record key as originalRecord and the same partition + Though GLOBAL_BLOOM_INDEX_SHOULD_UPDATE_PARTITION_PATH = true, + globalBloomIndex should just tag the original partition + */ + RawTripTestPayload incomingPayloadSamePartition = + new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}"); + HoodieRecord incomingRecordSamePartition = + new HoodieAvroRecord( + new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), + incomingPayloadSamePartition); + + final String file1P1C0 = UUID.randomUUID().toString(); + Map>> c1PartitionToFilesNameLengthMap = new HashMap<>(); + // We have some records to be tagged (two different partitions) + Path baseFilePath = testTable.forCommit("1000").withInserts(p1, file1P1C0, Collections.singletonList(originalRecord)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, Integer.valueOf((int) baseFileLength)))); + testTable.doWriteOperation("1000", WriteOperationType.INSERT, Arrays.asList(p1), + c1PartitionToFilesNameLengthMap, false, false); + + // We have some records to be tagged (two different partitions) + testTable.withInserts(p1, file1P1C0, originalRecord); + + // test against incoming record with a different partition + JavaRDD recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord)); + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); + + assertEquals(2, taggedRecordRDD.count()); + for (HoodieRecord record : taggedRecordRDD.collect()) { + switch (record.getPartitionPath()) { + case p1: + assertEquals("000", record.getRecordKey()); + assertTrue(record.getData() instanceof EmptyHoodieRecordPayload); + break; + case p2: + assertEquals("000", record.getRecordKey()); + assertEquals(incomingPayload.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData()); + break; + default: + fail(String.format("Should not get partition path: %s", record.getPartitionPath())); + } + } + + // test against incoming record with the same partition + JavaRDD recordRDDSamePartition = jsc + .parallelize(Collections.singletonList(incomingRecordSamePartition)); + JavaRDD taggedRecordRDDSamePartition = tagLocation(index, recordRDDSamePartition, hoodieTable); + + assertEquals(1, taggedRecordRDDSamePartition.count()); + HoodieRecord record = taggedRecordRDDSamePartition.first(); + assertEquals("000", record.getRecordKey()); + assertEquals(p1, record.getPartitionPath()); + assertEquals(incomingPayloadSamePartition.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData()); + } + + private HoodieWriteConfig.Builder getConfigBuilder() { + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) + .withWriteStatusClass(MetadataMergeWriteStatus.class) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + .forTable("test-trip-table") + 
.withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
+            .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build());
+  }
+
+  private JavaPairRDD<HoodieKey, Option<Pair<String, String>>> getRecordLocations(JavaRDD<HoodieKey> keyRDD, HoodieTable hoodieTable) {
+    JavaRDD<HoodieRecord> recordRDD = tagLocation(
+        index, keyRDD.map(k -> new HoodieAvroRecord(k, new EmptyHoodieRecordPayload())), hoodieTable);
+    return recordRDD.mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown()
+        ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId()))
+        : Option.empty())
+    );
+  }
+}
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java
new file mode 100644
index 0000000000000..29c653daee61a
--- /dev/null
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java
@@ -0,0 +1,458 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.client.HoodieTimelineArchiver; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsConfig; +import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.metadata.HoodieMetadataPayload; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataKeyGenerator; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieClientTestHarness; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterEach; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Properties; + +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.metadata.HoodieTableMetadata.METADATA_TABLE_NAME_SUFFIX; + +public class TestHoodieMetadataBase extends HoodieClientTestHarness { + + private static final Logger LOG = LogManager.getLogger(TestHoodieMetadataBase.class); + + protected static HoodieTestTable testTable; + protected String metadataTableBasePath; + protected HoodieTableType tableType; + protected HoodieWriteConfig writeConfig; + protected HoodieTableMetadataWriter metadataWriter; + + public void init(HoodieTableType tableType) throws IOException { + init(tableType, true); + } + + public void init(HoodieTableType tableType, HoodieWriteConfig writeConfig) throws IOException { + init(tableType, 
Option.of(writeConfig), true, false, false, false); + } + + public void init(HoodieTableType tableType, boolean enableMetadataTable) throws IOException { + init(tableType, enableMetadataTable, true, false, false); + } + + public void init(HoodieTableType tableType, boolean enableMetadataTable, boolean enableColumnStats) throws IOException { + init(tableType, enableMetadataTable, true, false, false); + } + + public void init(HoodieTableType tableType, boolean enableMetadataTable, boolean enableFullScan, boolean enableMetrics, boolean + validateMetadataPayloadStateConsistency) throws IOException { + init(tableType, Option.empty(), enableMetadataTable, enableFullScan, enableMetrics, + validateMetadataPayloadStateConsistency); + } + + public void init(HoodieTableType tableType, Option writeConfig, boolean enableMetadataTable, + boolean enableFullScan, boolean enableMetrics, boolean validateMetadataPayloadStateConsistency) throws IOException { + this.tableType = tableType; + initPath(); + initSparkContexts("TestHoodieMetadata"); + initFileSystem(); + fs.mkdirs(new Path(basePath)); + initTimelineService(); + initMetaClient(tableType); + initTestDataGenerator(); + metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + this.writeConfig = writeConfig.isPresent() + ? writeConfig.get() : getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, + enableMetadataTable, enableMetrics, enableFullScan, true, + validateMetadataPayloadStateConsistency) + .build(); + initWriteConfigAndMetatableWriter(this.writeConfig, enableMetadataTable); + } + + protected void initWriteConfigAndMetatableWriter(HoodieWriteConfig writeConfig, boolean enableMetadataTable) { + this.writeConfig = writeConfig; + if (enableMetadataTable) { + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context); + testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + } else { + testTable = HoodieTestTable.of(metaClient); + } + } + + @AfterEach + public void clean() throws Exception { + cleanupResources(); + } + + protected void doWriteInsertAndUpsert(HoodieTestTable testTable, String commit1, String commit2, boolean nonPartitioned) throws Exception { + testTable.doWriteOperation(commit1, INSERT, nonPartitioned ? asList("") : asList("p1", "p2"), nonPartitioned ? asList("") : asList("p1", "p2"), + 4, false); + testTable.doWriteOperation(commit2, UPSERT, nonPartitioned ? 
asList("") : asList("p1", "p2"), + 4, false); + validateMetadata(testTable); + } + + protected void doWriteOperationAndValidateMetadata(HoodieTestTable testTable, String commitTime) throws Exception { + doWriteOperation(testTable, commitTime); + validateMetadata(testTable); + } + + protected void doWriteOperation(HoodieTestTable testTable, String commitTime) throws Exception { + doWriteOperation(testTable, commitTime, UPSERT); + } + + protected void doWriteOperationAndValidate(HoodieTestTable testTable, String commitTime) throws Exception { + doWriteOperationAndValidate(testTable, commitTime, UPSERT); + } + + protected void doWriteOperationAndValidate(HoodieTestTable testTable, String commitTime, WriteOperationType operationType) throws Exception { + doWriteOperation(testTable, commitTime, operationType); + validateMetadata(testTable); + } + + protected void doWriteOperationNonPartitioned(HoodieTestTable testTable, String commitTime, WriteOperationType operationType) throws Exception { + testTable.doWriteOperation(commitTime, operationType, emptyList(), asList(""), 3); + } + + protected void doWriteOperation(HoodieTestTable testTable, String commitTime, WriteOperationType operationType, boolean nonPartitioned) throws Exception { + if (nonPartitioned) { + doWriteOperationNonPartitioned(testTable, commitTime, operationType); + } else { + doWriteOperation(testTable, commitTime, operationType); + } + } + + protected void doWriteOperation(HoodieTestTable testTable, String commitTime, WriteOperationType operationType) throws Exception { + testTable.doWriteOperation(commitTime, operationType, emptyList(), asList("p1", "p2"), 3); + } + + protected HoodieCommitMetadata doWriteOperationWithMeta(HoodieTestTable testTable, String commitTime, WriteOperationType operationType) throws Exception { + return testTable.doWriteOperation(commitTime, operationType, emptyList(), asList("p1", "p2"), 3); + } + + protected void doClean(HoodieTestTable testTable, String commitTime, List commitsToClean) throws IOException { + doCleanInternal(testTable, commitTime, commitsToClean, false); + } + + protected void doCleanAndValidate(HoodieTestTable testTable, String commitTime, List commitsToClean) throws IOException { + doCleanInternal(testTable, commitTime, commitsToClean, true); + } + + private void doCleanInternal(HoodieTestTable testTable, String commitTime, List commitsToClean, boolean validate) throws IOException { + testTable.doCleanBasedOnCommits(commitTime, commitsToClean); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doCompactionNonPartitioned(HoodieTestTable testTable, String commitTime) throws Exception { + doCompactionInternal(testTable, commitTime, false, true); + } + + protected void doCompaction(HoodieTestTable testTable, String commitTime, boolean nonPartitioned) throws Exception { + doCompactionInternal(testTable, commitTime, false, nonPartitioned); + } + + protected void doCompaction(HoodieTestTable testTable, String commitTime) throws Exception { + doCompactionInternal(testTable, commitTime, false, false); + } + + protected void doCompactionNonPartitionedAndValidate(HoodieTestTable testTable, String commitTime) throws Exception { + doCompactionInternal(testTable, commitTime, true, true); + } + + protected void doCompactionAndValidate(HoodieTestTable testTable, String commitTime) throws Exception { + doCompactionInternal(testTable, commitTime, true, false); + } + + private void doCompactionInternal(HoodieTestTable testTable, String commitTime, boolean validate, boolean 
nonPartitioned) throws Exception { + testTable.doCompaction(commitTime, nonPartitioned ? asList("") : asList("p1", "p2")); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doCluster(HoodieTestTable testTable, String commitTime) throws Exception { + doClusterInternal(testTable, commitTime, false); + } + + protected void doClusterAndValidate(HoodieTestTable testTable, String commitTime) throws Exception { + doClusterInternal(testTable, commitTime, true); + } + + protected void doClusterInternal(HoodieTestTable testTable, String commitTime, boolean validate) throws Exception { + testTable.doCluster(commitTime, new HashMap<>(), Arrays.asList("p1", "p2"), 2); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doRollback(HoodieTestTable testTable, String commitToRollback, String rollbackTime) throws Exception { + doRollbackInternal(testTable, commitToRollback, rollbackTime, false); + } + + protected void doRollbackAndValidate(HoodieTestTable testTable, String commitToRollback, String rollbackTime) throws Exception { + doRollbackInternal(testTable, commitToRollback, rollbackTime, true); + } + + private void doRollbackInternal(HoodieTestTable testTable, String commitToRollback, String rollbackTime, boolean validate) throws Exception { + testTable.doRollback(commitToRollback, rollbackTime); + if (validate) { + validateMetadata(testTable); + } + } + + protected void doPreBootstrapWriteOperation(HoodieTestTable testTable, String commitTime) throws Exception { + doPreBootstrapWriteOperation(testTable, UPSERT, commitTime); + } + + protected void doPreBootstrapWriteOperation(HoodieTestTable testTable, WriteOperationType writeOperationType, String commitTime) throws Exception { + doPreBootstrapWriteOperation(testTable, writeOperationType, commitTime, 2); + } + + protected void doPreBootstrapWriteOperation(HoodieTestTable testTable, WriteOperationType writeOperationType, String commitTime, int filesPerPartition) throws Exception { + testTable.doWriteOperation(commitTime, writeOperationType, asList("p1", "p2"), asList("p1", "p2"), + filesPerPartition, true); + } + + protected void doPreBootstrapClean(HoodieTestTable testTable, String commitTime, List commitsToClean) throws Exception { + testTable.doCleanBasedOnCommits(commitTime, commitsToClean); + } + + protected void doPreBootstrapRollback(HoodieTestTable testTable, String rollbackTime, String commitToRollback) throws Exception { + testTable.doRollback(commitToRollback, rollbackTime); + } + + protected void doPrebootstrapCompaction(HoodieTestTable testTable, String commitTime) throws Exception { + doPrebootstrapCompaction(testTable, commitTime, Arrays.asList("p1", "p2")); + } + + protected void doPrebootstrapCompaction(HoodieTestTable testTable, String commitTime, List partitions) throws Exception { + testTable.doCompaction(commitTime, partitions); + } + + protected void doPreBootstrapCluster(HoodieTestTable testTable, String commitTime) throws Exception { + testTable.doCluster(commitTime, new HashMap<>(), Arrays.asList("p1", "p2"), 2); + } + + protected void doPreBootstrapRestore(HoodieTestTable testTable, String restoreTime, String commitToRestore) throws Exception { + testTable.doRestore(commitToRestore, restoreTime); + } + + protected void archiveDataTable(HoodieWriteConfig writeConfig, HoodieTableMetaClient metaClient) throws IOException { + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, 
table); + archiver.archiveIfRequired(context); + } + + protected void validateMetadata(HoodieTestTable testTable) throws IOException { + validateMetadata(testTable, emptyList()); + } + + protected void validateMetadata(HoodieTestTable testTable, boolean doFullValidation) throws IOException { + validateMetadata(testTable, emptyList(), doFullValidation); + } + + protected void validateMetadata(HoodieTestTable testTable, List inflightCommits) throws IOException { + validateMetadata(testTable, inflightCommits, false); + } + + protected void validateMetadata(HoodieTestTable testTable, List inflightCommits, boolean doFullValidation) throws IOException { + validateMetadata(testTable, inflightCommits, writeConfig, metadataTableBasePath, doFullValidation); + } + + protected HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata) { + return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false).build(); + } + + protected HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) { + return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics); + } + + protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, + boolean enableMetrics) { + return getWriteConfigBuilder(policy, autoCommit, useFileListingMetadata, enableMetrics, true, true, false); + } + + protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, + boolean enableMetrics, boolean enableFullScan, boolean useRollbackUsingMarkers, + boolean validateMetadataPayloadConsistency) { + Properties properties = new Properties(); + properties.put(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key(), SimpleKeyGenerator.class.getName()); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2) + .withAutoCommit(autoCommit) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(policy) + .withAutoClean(false).retainCommits(1).retainFileVersions(1) + .build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024 * 1024).build()) + .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table") + .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() + .withEnableBackupForRemoteFileSystemView(false).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(useFileListingMetadata) + .enableFullScan(enableFullScan) + .enableMetrics(enableMetrics) + .withPopulateMetaFields(HoodieMetadataConfig.POPULATE_META_FIELDS.defaultValue()) + .ignoreSpuriousDeletes(validateMetadataPayloadConsistency) + .build()) + .withMetricsConfig(HoodieMetricsConfig.newBuilder().on(enableMetrics) + .withExecutorMetrics(true).build()) + .withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() + .usePrefix("unit-test").build()) + .withRollbackUsingMarkers(useRollbackUsingMarkers) + .withProperties(properties); + } + + /** 
+ * Fetching WriteConfig for metadata table from Data table's writeConfig is not trivial and + * the method is not public in source code. so, for now, using this method which mimics source code. + */ + protected HoodieWriteConfig getMetadataWriteConfig(HoodieWriteConfig writeConfig) { + int parallelism = writeConfig.getMetadataInsertParallelism(); + + int minCommitsToKeep = Math.max(writeConfig.getMetadataMinCommitsToKeep(), writeConfig.getMinCommitsToKeep()); + int maxCommitsToKeep = Math.max(writeConfig.getMetadataMaxCommitsToKeep(), writeConfig.getMaxCommitsToKeep()); + + // Create the write config for the metadata table by borrowing options from the main write config. + HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder() + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() + .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled()) + .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs()) + .withMaxConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getMaxConsistencyCheckIntervalMs()) + .withMaxConsistencyChecks(writeConfig.getConsistencyGuardConfig().getMaxConsistencyChecks()) + .build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.SINGLE_WRITER) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).withFileListingParallelism(writeConfig.getFileListingParallelism()).build()) + .withAutoCommit(true) + .withAvroSchemaValidate(true) + .withEmbeddedTimelineServerEnabled(false) + .withMarkersType(MarkerType.DIRECT.name()) + .withRollbackUsingMarkers(false) + .withPath(HoodieTableMetadata.getMetadataTableBasePath(writeConfig.getBasePath())) + .withSchema(HoodieMetadataRecord.getClassSchema().toString()) + .forTable(writeConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX) + // we will trigger cleaning manually, to control the instant times + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withAsyncClean(writeConfig.isMetadataAsyncClean()) + .withAutoClean(false) + .withCleanerParallelism(parallelism) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .retainCommits(writeConfig.getMetadataCleanerCommitsRetained()) + .build()) + // we will trigger archival manually, to control the instant times + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep).build()) + // we will trigger compaction manually, to control the instant times + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(false) + .withMaxNumDeltaCommitsBeforeCompaction(writeConfig.getMetadataCompactDeltaCommitMax()).build()) + .withParallelism(parallelism, parallelism) + .withDeleteParallelism(parallelism) + .withRollbackParallelism(parallelism) + .withFinalizeWriteParallelism(parallelism) + .withAllowMultiWriteOnSameInstant(true) + .withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName()) + .withPopulateMetaFields(writeConfig.getMetadataConfig().populateMetaFields()); + + // RecordKey properties are needed for the metadata table records + final Properties properties = new Properties(); + properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), HoodieMetadataPayload.KEY_FIELD_NAME); + properties.put("hoodie.datasource.write.recordkey.field", HoodieMetadataPayload.KEY_FIELD_NAME); + 
builder.withProperties(properties); + + if (writeConfig.isMetricsOn()) { + builder.withMetricsConfig(HoodieMetricsConfig.newBuilder() + .withReporterType(writeConfig.getMetricsReporterType().toString()) + .withExecutorMetrics(writeConfig.isExecutorMetricsEnabled()) + .on(true).build()); + switch (writeConfig.getMetricsReporterType()) { + case GRAPHITE: + builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() + .onGraphitePort(writeConfig.getGraphiteServerPort()) + .toGraphiteHost(writeConfig.getGraphiteServerHost()) + .usePrefix(writeConfig.getGraphiteMetricPrefix()).build()); + break; + case JMX: + builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder() + .onJmxPort(writeConfig.getJmxPort()) + .toJmxHost(writeConfig.getJmxHost()) + .build()); + break; + case DATADOG: + case PROMETHEUS: + case PROMETHEUS_PUSHGATEWAY: + case CONSOLE: + case INMEMORY: + case CLOUDWATCH: + break; + default: + throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType()); + } + } + return builder.build(); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java new file mode 100644 index 0000000000000..8531030a5cc24 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client.functional; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.UUID; + +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; + +@Tag("functional") +public class TestHoodieMetadataBootstrap extends TestHoodieMetadataBase { + + private static final Logger LOG = LogManager.getLogger(TestHoodieMetadataBootstrap.class); + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsert(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000003"); + } + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsertClean(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + doPreBootstrapClean(testTable, "0000003", Arrays.asList("0000001")); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000004"); + } + doPreBootstrapWriteOperation(testTable, "0000005"); + bootstrapAndVerify(); + } + + /** + * Validate that bootstrap considers only files part of completed commit and ignore any extra files. + */ + @Test + public void testMetadataBootstrapWithExtraFiles() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + doPreBootstrapClean(testTable, "0000003", Arrays.asList("0000001")); + doPreBootstrapWriteOperation(testTable, "0000005"); + // add few extra files to table. bootstrap should include those files. 
+ String fileName = UUID.randomUUID().toString(); + Path baseFilePath = FileCreateUtils.getBaseFilePath(basePath, "p1", "0000006", fileName); + FileCreateUtils.createBaseFile(basePath, "p1", "0000006", fileName, 100); + + writeConfig = getWriteConfig(true, true); + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + + // remove those files from table. and then validate. + Files.delete(baseFilePath); + + // validate + validateMetadata(testTable); + // after bootstrap do two writes and validate its still functional. + doWriteInsertAndUpsert(testTable); + validateMetadata(testTable); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsertRollback(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + doPreBootstrapRollback(testTable, "0000003", "0000002"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000004"); + } + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapInsertUpsertCluster(HoodieTableType tableType) throws Exception { + init(tableType, false); + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + doPreBootstrapCluster(testTable, "0000003"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000004"); + } + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapLargeCommitList(HoodieTableType tableType) throws Exception { + init(tableType, false); + for (int i = 1; i < 25; i += 7) { + String commitTime1 = ((i > 9) ? ("00000") : ("000000")) + i; + String commitTime2 = ((i > 9) ? ("00000") : ("000000")) + (i + 1); + String commitTime3 = ((i > 9) ? ("00000") : ("000000")) + (i + 2); + String commitTime4 = ((i > 9) ? ("00000") : ("000000")) + (i + 3); + String commitTime5 = ((i > 9) ? ("00000") : ("000000")) + (i + 4); + String commitTime6 = ((i > 9) ? ("00000") : ("000000")) + (i + 5); + String commitTime7 = ((i > 9) ? ("00000") : ("000000")) + (i + 6); + doPreBootstrapWriteOperation(testTable, INSERT, commitTime1); + doPreBootstrapWriteOperation(testTable, commitTime2); + doPreBootstrapClean(testTable, commitTime3, Arrays.asList(commitTime1)); + doPreBootstrapWriteOperation(testTable, commitTime4); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, commitTime5); + } + doPreBootstrapWriteOperation(testTable, commitTime6); + doPreBootstrapRollback(testTable, commitTime7, commitTime6); + } + bootstrapAndVerify(); + } + + @Test + public void testMetadataBootstrapInflightCommit() throws Exception { + HoodieTableType tableType = COPY_ON_WRITE; + init(tableType, false); + + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + // add an inflight commit + HoodieCommitMetadata inflightCommitMeta = testTable.doWriteOperation("00000007", UPSERT, emptyList(), + asList("p1", "p2"), 2, true, true); + // bootstrap and following validation should fail. bootstrap should not happen. + bootstrapAndVerifyFailure(); + + // once the commit is complete, metadata should get fully synced. 
+ // in prod code path, SparkHoodieBackedTableMetadataWriter.create() will be called for every commit, + // which may not be the case here if we directly call HoodieBackedTableMetadataWriter.update() + // hence lets first move the commit to complete and invoke sync directly + ((HoodieMetadataTestTable) testTable).moveInflightCommitToComplete("00000007", inflightCommitMeta, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapArchival(HoodieTableType tableType) throws Exception { + init(tableType, false); + writeConfig = getWriteConfig(2, 4); + for (int i = 1; i < 13; i += 7) { + String commitTime1 = ((i > 9) ? ("00000") : ("000000")) + i; + String commitTime2 = ((i > 9) ? ("00000") : ("000000")) + (i + 1); + String commitTime3 = ((i > 9) ? ("00000") : ("000000")) + (i + 2); + String commitTime4 = ((i > 9) ? ("00000") : ("000000")) + (i + 3); + String commitTime5 = ((i > 9) ? ("00000") : ("000000")) + (i + 4); + String commitTime6 = ((i > 9) ? ("00000") : ("000000")) + (i + 5); + String commitTime7 = ((i > 9) ? ("00000") : ("000000")) + (i + 6); + doPreBootstrapWriteOperation(testTable, INSERT, commitTime1); + doPreBootstrapWriteOperation(testTable, commitTime2); + doPreBootstrapClean(testTable, commitTime3, Arrays.asList(commitTime1)); + doPreBootstrapWriteOperation(testTable, commitTime4); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, commitTime5); + } + doPreBootstrapWriteOperation(testTable, commitTime6); + doPreBootstrapRollback(testTable, commitTime7, commitTime6); + } + // archive and then bootstrap + archiveDataTable(writeConfig, metaClient); + bootstrapAndVerify(); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapAfterRestore(HoodieTableType tableType) throws Exception { + init(tableType, false); + testRestore(false); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testMetadataBootstrapAfterRestoreAndUpserts(HoodieTableType tableType) throws Exception { + init(tableType, false); + testRestore(true); + } + + private void testRestore(boolean addUpsertsAfterRestore) throws Exception { + doPreBootstrapWriteOperation(testTable, INSERT, "0000001"); + doPreBootstrapWriteOperation(testTable, "0000002"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000003"); + } + doPreBootstrapWriteOperation(testTable, "0000004"); + doPreBootstrapWriteOperation(testTable, "0000005"); + doPreBootstrapWriteOperation(testTable, "0000006"); + doPreBootstrapRestore(testTable, "0000007", "0000004"); + + if (addUpsertsAfterRestore) { + doPreBootstrapWriteOperation(testTable, "0000008"); + doPreBootstrapWriteOperation(testTable, "0000009"); + if (tableType == MERGE_ON_READ) { + doPrebootstrapCompaction(testTable, "0000010"); + } + } + bootstrapAndVerify(); + } + + private void bootstrapAndVerify() throws Exception { + writeConfig = getWriteConfig(true, true); + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + validateMetadata(testTable); + // after bootstrap do two writes and validate its still functional. 
+ doWriteInsertAndUpsert(testTable); + validateMetadata(testTable); + } + + private void bootstrapAndVerifyFailure() throws Exception { + writeConfig = getWriteConfig(true, true); + initWriteConfigAndMetatableWriter(writeConfig, true); + syncTableMetadata(writeConfig); + try { + validateMetadata(testTable); + Assertions.fail("Should have failed"); + } catch (IllegalStateException e) { + // expected + } + } + + private void doWriteInsertAndUpsert(HoodieTestTable testTable) throws Exception { + doWriteInsertAndUpsert(testTable, "0000100", "0000101", false); + } + + private HoodieWriteConfig getWriteConfig(int minArchivalCommits, int maxArchivalCommits) throws Exception { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .retainCommits(1).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .archiveCommitsWith(minArchivalCommits, maxArchivalCommits).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) + .forTable("test-trip-table").build(); + } + + @Override + protected HoodieTableType getTableType() { + return tableType; + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/model/TestHoodieInternalRow.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/model/TestHoodieInternalRow.java index bfcb012c3748c..fde4c988ed278 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/model/TestHoodieInternalRow.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/model/TestHoodieInternalRow.java @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.unsafe.types.UTF8String; import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -64,7 +65,13 @@ public void testGet() { Object[] values = getRandomValue(true); InternalRow row = new GenericInternalRow(values); - HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row); + HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"), + UTF8String.fromString("commitSeqNo"), + UTF8String.fromString("recordKey"), + UTF8String.fromString("partitionPath"), + UTF8String.fromString("fileName"), + row, + true); assertValues(hoodieInternalRow, "commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", values, nullIndices); @@ -74,7 +81,13 @@ public void testGet() { public void testUpdate() { Object[] values = getRandomValue(true); InternalRow row = new GenericInternalRow(values); - HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row); + HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"), + UTF8String.fromString("commitSeqNo"), + UTF8String.fromString("recordKey"), + UTF8String.fromString("partitionPath"), + UTF8String.fromString("fileName"), + row, + true); hoodieInternalRow.update(0, "commitTime_updated"); hoodieInternalRow.update(1, "commitSeqNo_updated"); @@ -99,6 +112,28 @@ public void testUpdate() { "fileName_updated", values, nullIndices); } + 
@Test + public void testNumFields() { + Object[] values = getRandomValue(true); + InternalRow row = new GenericInternalRow(values); + HoodieInternalRow hoodieInternalRow1 = new HoodieInternalRow(UTF8String.fromString("commitTime"), + UTF8String.fromString("commitSeqNo"), + UTF8String.fromString("recordKey"), + UTF8String.fromString("partitionPath"), + UTF8String.fromString("fileName"), + row, + true); + HoodieInternalRow hoodieInternalRow2 = new HoodieInternalRow(UTF8String.fromString("commitTime"), + UTF8String.fromString("commitSeqNo"), + UTF8String.fromString("recordKey"), + UTF8String.fromString("partitionPath"), + UTF8String.fromString("fileName"), + row, + false); + assertEquals(row.numFields(), hoodieInternalRow1.numFields()); + assertEquals(row.numFields() + 5, hoodieInternalRow2.numFields()); + } + @Test public void testIsNullCheck() { @@ -106,7 +141,13 @@ public void testIsNullCheck() { Object[] values = getRandomValue(true); InternalRow row = new GenericInternalRow(values); - HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row); + HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"), + UTF8String.fromString("commitSeqNo"), + UTF8String.fromString("recordKey"), + UTF8String.fromString("partitionPath"), + UTF8String.fromString("fileName"), + row, + true); hoodieInternalRow.setNullAt(i); nullIndices.clear(); @@ -129,7 +170,13 @@ public void testIsNullCheck() { Object[] values = getRandomValue(true); InternalRow row = new GenericInternalRow(values); - HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row); + HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"), + UTF8String.fromString("commitSeqNo"), + UTF8String.fromString("recordKey"), + UTF8String.fromString("partitionPath"), + UTF8String.fromString("fileName"), + row, + true); nullIndices.clear(); @@ -173,7 +220,7 @@ private Object[] getRandomValue(boolean withStructType) { } private void assertValues(HoodieInternalRow hoodieInternalRow, String commitTime, String commitSeqNo, String recordKey, String partitionPath, String filename, Object[] values, - List nullIndexes) { + List nullIndexes) { for (Integer index : nullIndexes) { assertTrue(hoodieInternalRow.isNullAt(index)); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/data/TestHoodieJavaRDD.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/data/TestHoodieJavaRDD.java new file mode 100644 index 0000000000000..75958883048e5 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/data/TestHoodieJavaRDD.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.data;
+
+import org.apache.hudi.common.data.HoodieData;
+import org.apache.hudi.testutils.HoodieClientTestBase;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class TestHoodieJavaRDD extends HoodieClientTestBase {
+  @Test
+  public void testGetNumPartitions() {
+    int numPartitions = 6;
+    HoodieData<Integer> rddData = HoodieJavaRDD.of(jsc.parallelize(
+        IntStream.rangeClosed(0, 100).boxed().collect(Collectors.toList()), numPartitions));
+    assertEquals(numPartitions, rddData.getNumPartitions());
+  }
+}
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryExecutorInSpark.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryExecutorInSpark.java
new file mode 100644
index 0000000000000..a714d60d0033a
--- /dev/null
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryExecutorInSpark.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hudi.execution; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.testutils.HoodieClientTestHarness; + +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.spark.TaskContext; +import org.apache.spark.TaskContext$; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Iterator; +import java.util.List; + +import scala.Tuple2; + +import static org.apache.hudi.execution.HoodieLazyInsertIterable.getTransformFunction; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestBoundedInMemoryExecutorInSpark extends HoodieClientTestHarness { + + private final String instantTime = HoodieActiveTimeline.createNewInstantTime(); + + @BeforeEach + public void setUp() throws Exception { + initTestDataGenerator(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + private Runnable getPreExecuteRunnable() { + final TaskContext taskContext = TaskContext.get(); + return () -> TaskContext$.MODULE$.setTaskContext(taskContext); + } + + @Test + public void testExecutor() { + + final List hoodieRecords = dataGen.generateInserts(instantTime, 100); + + HoodieWriteConfig hoodieWriteConfig = mock(HoodieWriteConfig.class); + when(hoodieWriteConfig.getWriteBufferLimitBytes()).thenReturn(1024); + BoundedInMemoryQueueConsumer, Integer> consumer = + new BoundedInMemoryQueueConsumer, Integer>() { + + private int count = 0; + + @Override + protected void consumeOneRecord(HoodieLazyInsertIterable.HoodieInsertValueGenResult record) { + count++; + } + + @Override + protected void finish() { + } + + @Override + protected Integer getResult() { + return count; + } + }; + + BoundedInMemoryExecutor>, Integer> executor = null; + try { + executor = new BoundedInMemoryExecutor(hoodieWriteConfig.getWriteBufferLimitBytes(), hoodieRecords.iterator(), consumer, + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA), getPreExecuteRunnable()); + int result = executor.execute(); + // It should buffer and write 100 records + assertEquals(100, result); + // There should be no remaining records in the buffer + assertFalse(executor.isRemaining()); + } finally { + if (executor != null) { + executor.shutdownNow(); + executor.awaitTermination(); + } + } + } + + @Test + public void testInterruptExecutor() { + final List hoodieRecords = dataGen.generateInserts(instantTime, 100); + + HoodieWriteConfig hoodieWriteConfig = mock(HoodieWriteConfig.class); + when(hoodieWriteConfig.getWriteBufferLimitBytes()).thenReturn(1024); + BoundedInMemoryQueueConsumer, Integer> consumer = + new BoundedInMemoryQueueConsumer, Integer>() { + + @Override + protected void consumeOneRecord(HoodieLazyInsertIterable.HoodieInsertValueGenResult 
record) { + try { + while (true) { + Thread.sleep(1000); + } + } catch (InterruptedException ie) { + return; + } + } + + @Override + protected void finish() { + } + + @Override + protected Integer getResult() { + return 0; + } + }; + + BoundedInMemoryExecutor>, Integer> executor = null; + try { + executor = new BoundedInMemoryExecutor(hoodieWriteConfig.getWriteBufferLimitBytes(), hoodieRecords.iterator(), consumer, + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA), getPreExecuteRunnable()); + BoundedInMemoryExecutor>, Integer> finalExecutor = executor; + + Thread.currentThread().interrupt(); + + assertThrows(HoodieException.class, () -> finalExecutor.execute()); + assertTrue(Thread.interrupted()); + } finally { + if (executor != null) { + executor.shutdownNow(); + executor.awaitTermination(); + } + } + } + + @Test + public void testExecutorTermination() { + HoodieWriteConfig hoodieWriteConfig = mock(HoodieWriteConfig.class); + when(hoodieWriteConfig.getWriteBufferLimitBytes()).thenReturn(1024); + Iterator unboundedRecordIter = new Iterator() { + @Override + public boolean hasNext() { + return true; + } + + @Override + public GenericRecord next() { + return dataGen.generateGenericRecord(); + } + }; + + BoundedInMemoryQueueConsumer, Integer> consumer = + new BoundedInMemoryQueueConsumer, Integer>() { + @Override + protected void consumeOneRecord(HoodieLazyInsertIterable.HoodieInsertValueGenResult record) { + } + + @Override + protected void finish() { + } + + @Override + protected Integer getResult() { + return 0; + } + }; + + BoundedInMemoryExecutor>, Integer> executor = + new BoundedInMemoryExecutor(hoodieWriteConfig.getWriteBufferLimitBytes(), unboundedRecordIter, + consumer, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA), + getPreExecuteRunnable()); + executor.shutdownNow(); + boolean terminatedGracefully = executor.awaitTermination(); + assertTrue(terminatedGracefully); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java index c30635bb12f9e..4707a68072e9a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java @@ -18,6 +18,7 @@ package org.apache.hudi.execution; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -82,7 +83,7 @@ public void tearDown() throws Exception { public void testRecordReading() throws Exception { final int numRecords = 128; final List hoodieRecords = dataGen.generateInserts(instantTime, numRecords); - final BoundedInMemoryQueue> queue = + final BoundedInMemoryQueue queue = new BoundedInMemoryQueue(FileIOUtils.KB, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); // Produce Future resFuture = executorService.submit(() -> { @@ -93,7 +94,7 @@ public void testRecordReading() throws Exception { final Iterator originalRecordIterator = hoodieRecords.iterator(); int recordsRead = 0; while (queue.iterator().hasNext()) { - final HoodieRecord originalRecord = originalRecordIterator.next(); + final HoodieAvroRecord originalRecord = (HoodieAvroRecord) originalRecordIterator.next(); final Option originalInsertValue = 
originalRecord.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA); final HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = queue.iterator().next(); @@ -101,7 +102,7 @@ public void testRecordReading() throws Exception { assertEquals(originalRecord, payload.record); // cached insert value matches the expected insert value. assertEquals(originalInsertValue, - payload.record.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA)); + ((HoodieAvroRecord) payload.record).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA)); recordsRead++; } assertFalse(queue.iterator().hasNext() || originalRecordIterator.hasNext()); @@ -122,7 +123,7 @@ public void testCompositeProducerRecordReading() throws Exception { final int numProducers = 40; final List> recs = new ArrayList<>(); - final BoundedInMemoryQueue> queue = + final BoundedInMemoryQueue queue = new BoundedInMemoryQueue(FileIOUtils.KB, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); // Record Key to @@ -188,7 +189,7 @@ public void testCompositeProducerRecordReading() throws Exception { // Read recs and ensure we have covered all producer recs. while (queue.iterator().hasNext()) { - final HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = queue.iterator().next(); + final HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = queue.iterator().next(); final HoodieRecord rec = payload.record; Tuple2 producerPos = keyToProducerAndIndexMap.get(rec.getRecordKey()); Integer lastSeenPos = lastSeenMap.get(producerPos._1()); @@ -216,12 +217,12 @@ public void testMemoryLimitForBuffering() throws Exception { final List hoodieRecords = dataGen.generateInserts(instantTime, numRecords); // maximum number of records to keep in memory. final int recordLimit = 5; - final SizeEstimator> sizeEstimator = new DefaultSizeEstimator<>(); - HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = - getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply(hoodieRecords.get(0)); + final SizeEstimator sizeEstimator = new DefaultSizeEstimator<>(); + HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply((HoodieAvroRecord) hoodieRecords.get(0)); final long objSize = sizeEstimator.sizeEstimate(payload); final long memoryLimitInBytes = recordLimit * objSize; - final BoundedInMemoryQueue> queue = + final BoundedInMemoryQueue queue = new BoundedInMemoryQueue(memoryLimitInBytes, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); // Produce @@ -266,8 +267,8 @@ public void testException() throws Exception { final List hoodieRecords = dataGen.generateInserts(instantTime, numRecords); final SizeEstimator>> sizeEstimator = new DefaultSizeEstimator<>(); // queue memory limit - HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = - getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply(hoodieRecords.get(0)); + HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply((HoodieAvroRecord) hoodieRecords.get(0)); final long objSize = sizeEstimator.sizeEstimate(new Tuple2<>(payload.record, payload.insertValue)); final long memoryLimitInBytes = 4 * objSize; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestSparkBoundedInMemoryExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestSparkBoundedInMemoryExecutor.java deleted file mode 100644 index fd41a16802f9f..0000000000000 --- 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestSparkBoundedInMemoryExecutor.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.execution; - -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.testutils.HoodieClientTestHarness; - -import org.apache.avro.generic.IndexedRecord; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.util.List; - -import scala.Tuple2; - -import static org.apache.hudi.execution.HoodieLazyInsertIterable.getTransformFunction; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -public class TestSparkBoundedInMemoryExecutor extends HoodieClientTestHarness { - - private final String instantTime = HoodieActiveTimeline.createNewInstantTime(); - - @BeforeEach - public void setUp() throws Exception { - initTestDataGenerator(); - } - - @AfterEach - public void tearDown() throws Exception { - cleanupResources(); - } - - @Test - public void testExecutor() { - - final List hoodieRecords = dataGen.generateInserts(instantTime, 100); - - HoodieWriteConfig hoodieWriteConfig = mock(HoodieWriteConfig.class); - when(hoodieWriteConfig.getWriteBufferLimitBytes()).thenReturn(1024); - BoundedInMemoryQueueConsumer, Integer> consumer = - new BoundedInMemoryQueueConsumer, Integer>() { - - private int count = 0; - - @Override - protected void consumeOneRecord(HoodieLazyInsertIterable.HoodieInsertValueGenResult record) { - count++; - } - - @Override - protected void finish() { - } - - @Override - protected Integer getResult() { - return count; - } - }; - - SparkBoundedInMemoryExecutor>, Integer> executor = null; - try { - executor = new SparkBoundedInMemoryExecutor(hoodieWriteConfig, hoodieRecords.iterator(), consumer, - getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); - int result = executor.execute(); - // It should buffer and write 100 records - assertEquals(100, result); - // There should be no remaining records in the buffer - assertFalse(executor.isRemaining()); - } finally { - if (executor != null) { - executor.shutdownNow(); - } - } - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestBulkInsertInternalPartitioner.java 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestBulkInsertInternalPartitioner.java index 834229b683f1e..7bc64b5445763 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestBulkInsertInternalPartitioner.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestBulkInsertInternalPartitioner.java @@ -19,16 +19,25 @@ package org.apache.hudi.execution.bulkinsert; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import java.io.IOException; +import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -37,9 +46,12 @@ import java.util.Map; import java.util.stream.Stream; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestBulkInsertInternalPartitioner extends HoodieClientTestBase { +public class TestBulkInsertInternalPartitioner extends HoodieClientTestBase implements Serializable { + private static final Comparator> KEY_COMPARATOR = + Comparator.comparing(o -> (o.getPartitionPath() + "+" + o.getRecordKey())); public static JavaRDD generateTestRecordsForBulkInsert(JavaSparkContext jsc) { HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); @@ -50,48 +62,87 @@ public static JavaRDD generateTestRecordsForBulkInsert(JavaSparkCo return jsc.parallelize(records1, 1).union(jsc.parallelize(records2, 1)); } + public static JavaRDD generateTestRecordsForBulkInsert(JavaSparkContext jsc, int count) { + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + List records = dataGenerator.generateInserts("0", count); + return jsc.parallelize(records, 1); + } + public static Map generateExpectedPartitionNumRecords(JavaRDD records) { return records.map(record -> record.getPartitionPath()).countByValue(); } - private static JavaRDD generateTripleTestRecordsForBulkInsert(JavaSparkContext jsc) - throws Exception { + private static JavaRDD generateTripleTestRecordsForBulkInsert(JavaSparkContext jsc) { return generateTestRecordsForBulkInsert(jsc).union(generateTestRecordsForBulkInsert(jsc)) .union(generateTestRecordsForBulkInsert(jsc)); } private static Stream configParams() { + // Parameters: + // BulkInsertSortMode sortMode, + // boolean isTablePartitioned, + // boolean enforceNumOutputPartitions, + // boolean isGloballySorted, + // boolean isLocallySorted Object[][] data = new Object[][] { - {BulkInsertSortMode.GLOBAL_SORT, true, true}, - {BulkInsertSortMode.PARTITION_SORT, false, true}, - {BulkInsertSortMode.NONE, false, false} + {BulkInsertSortMode.GLOBAL_SORT, true, true, true, true}, + {BulkInsertSortMode.PARTITION_SORT, true, true, false, true}, + 
{BulkInsertSortMode.PARTITION_PATH_REPARTITION, true, true, false, false}, + {BulkInsertSortMode.PARTITION_PATH_REPARTITION, false, true, false, false}, + {BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT, true, true, false, false}, + {BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT, false, true, false, false}, + {BulkInsertSortMode.NONE, true, true, false, false}, + {BulkInsertSortMode.NONE, true, false, false, false} }; return Stream.of(data).map(Arguments::of); } - private void verifyRecordAscendingOrder(List records) { - List expectedRecords = new ArrayList<>(records); - Collections.sort(expectedRecords, Comparator.comparing(o -> (o.getPartitionPath() + "+" + o.getRecordKey()))); + private void verifyRecordAscendingOrder(List> records, + Option>> comparator) { + List> expectedRecords = new ArrayList<>(records); + Collections.sort(expectedRecords, comparator.orElse(KEY_COMPARATOR)); assertEquals(expectedRecords, records); } private void testBulkInsertInternalPartitioner(BulkInsertPartitioner partitioner, JavaRDD records, - boolean isGloballySorted, boolean isLocallySorted, + boolean enforceNumOutputPartitions, + boolean isGloballySorted, + boolean isLocallySorted, Map expectedPartitionNumRecords) { + testBulkInsertInternalPartitioner( + partitioner, + records, + enforceNumOutputPartitions, + isGloballySorted, + isLocallySorted, + expectedPartitionNumRecords, + Option.empty()); + } + + private void testBulkInsertInternalPartitioner(BulkInsertPartitioner partitioner, + JavaRDD records, + boolean enforceNumOutputPartitions, + boolean isGloballySorted, + boolean isLocallySorted, + Map expectedPartitionNumRecords, + Option>> comparator) { int numPartitions = 2; - JavaRDD actualRecords = (JavaRDD) partitioner.repartitionRecords(records, numPartitions); - assertEquals(numPartitions, actualRecords.getNumPartitions()); - List collectedActualRecords = actualRecords.collect(); + JavaRDD> actualRecords = + (JavaRDD>) partitioner.repartitionRecords(records, numPartitions); + assertEquals( + enforceNumOutputPartitions ? 
numPartitions : records.getNumPartitions(), + actualRecords.getNumPartitions()); + List> collectedActualRecords = actualRecords.collect(); if (isGloballySorted) { // Verify global order - verifyRecordAscendingOrder(collectedActualRecords); + verifyRecordAscendingOrder(collectedActualRecords, comparator); } else if (isLocallySorted) { // Verify local order actualRecords.mapPartitions(partition -> { - List partitionRecords = new ArrayList<>(); + List> partitionRecords = new ArrayList<>(); partition.forEachRemaining(partitionRecords::add); - verifyRecordAscendingOrder(partitionRecords); + verifyRecordAscendingOrder(partitionRecords, comparator); return Collections.emptyList().iterator(); }).collect(); } @@ -106,16 +157,75 @@ private void testBulkInsertInternalPartitioner(BulkInsertPartitioner partitioner assertEquals(expectedPartitionNumRecords, actualPartitionNumRecords); } - @ParameterizedTest(name = "[{index}] {0}") + @ParameterizedTest(name = "[{index}] {0} isTablePartitioned={1} enforceNumOutputPartitions={2}") @MethodSource("configParams") public void testBulkInsertInternalPartitioner(BulkInsertSortMode sortMode, - boolean isGloballySorted, boolean isLocallySorted) - throws Exception { + boolean isTablePartitioned, + boolean enforceNumOutputPartitions, + boolean isGloballySorted, + boolean isLocallySorted) { + JavaRDD records1 = generateTestRecordsForBulkInsert(jsc); + JavaRDD records2 = generateTripleTestRecordsForBulkInsert(jsc); + testBulkInsertInternalPartitioner( + BulkInsertInternalPartitionerFactory.get( + sortMode, isTablePartitioned, enforceNumOutputPartitions), + records1, + enforceNumOutputPartitions, + isGloballySorted, + isLocallySorted, + generateExpectedPartitionNumRecords(records1)); + testBulkInsertInternalPartitioner( + BulkInsertInternalPartitionerFactory.get( + sortMode, isTablePartitioned, enforceNumOutputPartitions), + records2, + enforceNumOutputPartitions, + isGloballySorted, + isLocallySorted, + generateExpectedPartitionNumRecords(records2)); + } + + @Test + public void testCustomColumnSortPartitioner() { + String sortColumnString = "rider"; + String[] sortColumns = sortColumnString.split(","); + Comparator> columnComparator = getCustomColumnComparator(HoodieTestDataGenerator.AVRO_SCHEMA, sortColumns); + JavaRDD records1 = generateTestRecordsForBulkInsert(jsc); JavaRDD records2 = generateTripleTestRecordsForBulkInsert(jsc); - testBulkInsertInternalPartitioner(BulkInsertInternalPartitionerFactory.get(sortMode), - records1, isGloballySorted, isLocallySorted, generateExpectedPartitionNumRecords(records1)); - testBulkInsertInternalPartitioner(BulkInsertInternalPartitionerFactory.get(sortMode), - records2, isGloballySorted, isLocallySorted, generateExpectedPartitionNumRecords(records2)); + testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(sortColumns, HoodieTestDataGenerator.AVRO_SCHEMA, false), + records1, true, true, true, generateExpectedPartitionNumRecords(records1), Option.of(columnComparator)); + testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(sortColumns, HoodieTestDataGenerator.AVRO_SCHEMA, false), + records2, true, true, true, generateExpectedPartitionNumRecords(records2), Option.of(columnComparator)); + + HoodieWriteConfig config = HoodieWriteConfig + .newBuilder() + .withPath("/") + .withSchema(TRIP_EXAMPLE_SCHEMA) + .withUserDefinedBulkInsertPartitionerClass(RDDCustomColumnsSortPartitioner.class.getName()) + .withUserDefinedBulkInsertPartitionerSortColumns(sortColumnString) + .build(); + 
testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(config), + records1, true, true, true, generateExpectedPartitionNumRecords(records1), Option.of(columnComparator)); + testBulkInsertInternalPartitioner(new RDDCustomColumnsSortPartitioner(config), + records2, true, true, true, generateExpectedPartitionNumRecords(records2), Option.of(columnComparator)); + + } + + private Comparator> getCustomColumnComparator(Schema schema, String[] sortColumns) { + Comparator> comparator = Comparator.comparing(record -> { + try { + GenericRecord genericRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); + StringBuilder sb = new StringBuilder(); + for (String col : sortColumns) { + sb.append(genericRecord.get(col)); + } + + return sb.toString(); + } catch (IOException e) { + throw new HoodieIOException("unable to read value for " + sortColumns); + } + }); + + return comparator; } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestBulkInsertInternalPartitionerForRows.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestBulkInsertInternalPartitionerForRows.java new file mode 100644 index 0000000000000..de827f7a450ce --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/bulkinsert/TestBulkInsertInternalPartitionerForRows.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.apache.hudi.testutils.SparkDatasetTestUtils; + +import org.apache.spark.api.java.function.MapPartitionsFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Unit tests {@link BulkInsertPartitioner}s with Rows. 
+ */ +public class TestBulkInsertInternalPartitionerForRows extends HoodieClientTestHarness { + + private static final Comparator KEY_COMPARATOR = + Comparator.comparing(o -> (o.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + "+" + o.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD))); + @BeforeEach + public void setUp() throws Exception { + initSparkContexts("TestBulkInsertInternalPartitionerForRows"); + initPath(); + initFileSystem(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + private static Stream configParams() { + // Parameters: + // BulkInsertSortMode sortMode, + // boolean isTablePartitioned, + // boolean enforceNumOutputPartitions, + // boolean isGloballySorted, + // boolean isLocallySorted + Object[][] data = new Object[][] { + {BulkInsertSortMode.GLOBAL_SORT, true, true, true, true}, + {BulkInsertSortMode.PARTITION_SORT, true, true, false, true}, + {BulkInsertSortMode.PARTITION_PATH_REPARTITION, true, true, false, false}, + {BulkInsertSortMode.PARTITION_PATH_REPARTITION, false, true, false, false}, + {BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT, true, true, false, false}, + {BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT, false, true, false, false}, + {BulkInsertSortMode.NONE, true, true, false, false}, + {BulkInsertSortMode.NONE, true, false, false, false} + }; + return Stream.of(data).map(Arguments::of); + } + + @ParameterizedTest(name = "[{index}] {0} isTablePartitioned={1} enforceNumOutputPartitions={2}") + @MethodSource("configParams") + public void testBulkInsertInternalPartitioner(BulkInsertSortMode sortMode, + boolean isTablePartitioned, + boolean enforceNumOutputPartitions, + boolean isGloballySorted, + boolean isLocallySorted) + throws Exception { + Dataset records1 = generateTestRecords(); + Dataset records2 = generateTestRecords(); + testBulkInsertInternalPartitioner( + BulkInsertInternalPartitionerWithRowsFactory.get( + sortMode, isTablePartitioned, enforceNumOutputPartitions), + records1, + enforceNumOutputPartitions, + isGloballySorted, + isLocallySorted, + generateExpectedPartitionNumRecords(records1), + Option.empty()); + testBulkInsertInternalPartitioner( + BulkInsertInternalPartitionerWithRowsFactory.get( + sortMode, isTablePartitioned, enforceNumOutputPartitions), + records2, + enforceNumOutputPartitions, + isGloballySorted, + isLocallySorted, + generateExpectedPartitionNumRecords(records2), + Option.empty()); + } + + @Test + public void testCustomColumnSortPartitionerWithRows() { + Dataset records1 = generateTestRecords(); + Dataset records2 = generateTestRecords(); + String sortColumnString = records1.columns()[5]; + String[] sortColumns = sortColumnString.split(","); + Comparator comparator = getCustomColumnComparator(sortColumns); + + testBulkInsertInternalPartitioner(new RowCustomColumnsSortPartitioner(sortColumns), + records1, true, false, true, generateExpectedPartitionNumRecords(records1), Option.of(comparator)); + testBulkInsertInternalPartitioner(new RowCustomColumnsSortPartitioner(sortColumns), + records2, true, false, true, generateExpectedPartitionNumRecords(records2), Option.of(comparator)); + + HoodieWriteConfig config = HoodieWriteConfig + .newBuilder() + .withPath("/") + .withUserDefinedBulkInsertPartitionerClass(RowCustomColumnsSortPartitioner.class.getName()) + .withUserDefinedBulkInsertPartitionerSortColumns(sortColumnString) + .build(); + testBulkInsertInternalPartitioner(new RowCustomColumnsSortPartitioner(config), + records1, true, false, true, 
generateExpectedPartitionNumRecords(records1), Option.of(comparator)); + testBulkInsertInternalPartitioner(new RowCustomColumnsSortPartitioner(config), + records2, true, false, true, generateExpectedPartitionNumRecords(records2), Option.of(comparator)); + } + + private void testBulkInsertInternalPartitioner(BulkInsertPartitioner partitioner, + Dataset rows, + boolean enforceNumOutputPartitions, + boolean isGloballySorted, + boolean isLocallySorted, + Map expectedPartitionNumRecords, + Option> comparator) { + int numPartitions = 2; + Dataset actualRecords = (Dataset) partitioner.repartitionRecords(rows, numPartitions); + assertEquals( + enforceNumOutputPartitions ? numPartitions : rows.rdd().getNumPartitions(), + actualRecords.rdd().getNumPartitions()); + + List collectedActualRecords = actualRecords.collectAsList(); + if (isGloballySorted) { + // Verify global order + verifyRowsAscendingOrder(collectedActualRecords, comparator); + } else if (isLocallySorted) { + // Verify local order + actualRecords.mapPartitions((MapPartitionsFunction) input -> { + List partitionRows = new ArrayList<>(); + while (input.hasNext()) { + partitionRows.add(input.next()); + } + verifyRowsAscendingOrder(partitionRows, comparator); + return Collections.emptyList().iterator(); + }, SparkDatasetTestUtils.ENCODER); + } + + // Verify number of records per partition path + Map actualPartitionNumRecords = new HashMap<>(); + for (Row record : collectedActualRecords) { + String partitionPath = record.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD); + actualPartitionNumRecords.put(partitionPath, + actualPartitionNumRecords.getOrDefault(partitionPath, 0L) + 1); + } + assertEquals(expectedPartitionNumRecords, actualPartitionNumRecords); + } + + public static Map generateExpectedPartitionNumRecords(Dataset rows) { + Dataset toReturn = rows.groupBy(HoodieRecord.PARTITION_PATH_METADATA_FIELD).count(); + List result = toReturn.collectAsList(); + Map returnMap = new HashMap<>(); + for (Row row : result) { + returnMap.put(row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD), (Long) row.getAs("count")); + } + return returnMap; + } + + public Dataset generateTestRecords() { + Dataset rowsPart1 = SparkDatasetTestUtils.getRandomRows(sqlContext, 100, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, false); + Dataset rowsPart2 = SparkDatasetTestUtils.getRandomRows(sqlContext, 150, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, false); + Dataset rowsPart3 = SparkDatasetTestUtils.getRandomRows(sqlContext, 200, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, false); + return rowsPart1.union(rowsPart2).union(rowsPart3); + } + + private void verifyRowsAscendingOrder(List records, Option> comparator) { + List expectedRecords = new ArrayList<>(records); + Collections.sort(expectedRecords,comparator.orElse(KEY_COMPARATOR)); + assertEquals(expectedRecords, records); + } + + private Comparator getCustomColumnComparator(String[] sortColumns) { + Comparator comparator = Comparator.comparing(row -> { + StringBuilder sb = new StringBuilder(); + for (String col : sortColumns) { + sb.append(row.getAs(col).toString()); + } + return sb.toString(); + }); + return comparator; + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/functional/SparkClientFunctionalTestSuite.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/functional/SparkClientFunctionalTestSuite.java new file mode 100644 index 0000000000000..5b20a51f5a2ed --- /dev/null +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/functional/SparkClientFunctionalTestSuite.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.functional;
+
+import org.junit.platform.runner.JUnitPlatform;
+import org.junit.platform.suite.api.IncludeTags;
+import org.junit.platform.suite.api.SelectPackages;
+import org.junit.runner.RunWith;
+
+@RunWith(JUnitPlatform.class)
+@SelectPackages({
+    "org.apache.hudi.client.functional",
+    "org.apache.hudi.table.functional",
+    "org.apache.hudi.index.hbase"})
+@IncludeTags("functional")
+public class SparkClientFunctionalTestSuite {
+
+}
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java
deleted file mode 100644
index f10e845f05b55..0000000000000
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java
+++ /dev/null
@@ -1,433 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -package org.apache.hudi.index; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.fs.ConsistencyGuardConfig; -import org.apache.hudi.common.model.EmptyHoodieRecordPayload; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; -import org.apache.hudi.common.table.view.FileSystemViewStorageType; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.testutils.RawTripTestPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.index.HoodieIndex.IndexType; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.testutils.Assertions; -import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; -import org.apache.hudi.testutils.MetadataMergeWriteStatus; - -import org.apache.avro.Schema; -import org.apache.hadoop.fs.Path; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.UUID; - -import scala.Tuple2; - -import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -public class TestHoodieIndex extends HoodieClientTestHarness { - - private static final Schema SCHEMA = getSchemaFromResource(TestHoodieIndex.class, "/exampleSchema.avsc", true); - private final Random random = new Random(); - private IndexType indexType; - private HoodieIndex index; - private HoodieWriteConfig config; - - private void setUp(IndexType indexType) throws Exception { - this.indexType = indexType; - initResources(); - config = getConfigBuilder() - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType) - .build()).withAutoCommit(false).build(); - writeClient = getHoodieWriteClient(config); - this.index = writeClient.getIndex(); - } - - @AfterEach - public void tearDown() throws IOException { - cleanupResources(); - } - - @ParameterizedTest - @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"}) - public void testSimpleTagLocationAndUpdate(IndexType indexType) throws Exception { - setUp(indexType); - String newCommitTime = "001"; - int totalRecords = 10 + random.nextInt(20); - List records = dataGen.generateInserts(newCommitTime, totalRecords); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - // Test tagLocation without any entries in 
index - JavaRDD javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); - assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); - - // Insert totalRecords records - writeClient.startCommitWithTime(newCommitTime); - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - Assertions.assertNoWriteErrors(writeStatues.collect()); - - // Now tagLocation for these records, index should not tag them since it was a failed - // commit - javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); - assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); - // Now commit this & update location of records inserted and validate no errors - writeClient.commit(newCommitTime, writeStatues); - // Now tagLocation for these records, index should tag them correctly - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); - Map recordKeyToPartitionPathMap = new HashMap(); - List hoodieRecords = writeRecords.collect(); - hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); - - assertEquals(totalRecords, javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size()); - assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count()); - assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null - && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); - javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch")); - - JavaRDD hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey()); - JavaPairRDD>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable); - List hoodieKeys = hoodieKeyJavaRDD.collect(); - assertEquals(totalRecords, recordLocations.collect().size()); - assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count()); - recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey")); - recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch")); - } - - @ParameterizedTest - @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"}) - public void testTagLocationAndDuplicateUpdate(IndexType indexType) throws Exception { - setUp(indexType); - String newCommitTime = "001"; - int totalRecords = 10 + random.nextInt(20); - List records = dataGen.generateInserts(newCommitTime, totalRecords); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - writeClient.startCommitWithTime(newCommitTime); - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - JavaRDD javaRDD1 = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); - - // Duplicate upsert and ensure correctness is maintained - // We are trying to approximately imitate the case when the RDD is recomputed. For RDD creating, driver code is not - // recomputed. This includes the state transitions. We need to delete the inflight instance so that subsequent - // upsert will not run into conflicts. 
- metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight")); - - writeClient.upsert(writeRecords, newCommitTime); - Assertions.assertNoWriteErrors(writeStatues.collect()); - - // Now commit this & update location of records inserted and validate no errors - writeClient.commit(newCommitTime, writeStatues); - // Now tagLocation for these records, hbaseIndex should tag them correctly - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - JavaRDD javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); - - Map recordKeyToPartitionPathMap = new HashMap(); - List hoodieRecords = writeRecords.collect(); - hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); - - assertEquals(totalRecords, javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size()); - assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count()); - assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null - && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); - javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch")); - - JavaRDD hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey()); - JavaPairRDD>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable); - List hoodieKeys = hoodieKeyJavaRDD.collect(); - assertEquals(totalRecords, recordLocations.collect().size()); - assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count()); - recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey")); - recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch")); - } - - @ParameterizedTest - @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"}) - public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType) throws Exception { - setUp(indexType); - String newCommitTime = writeClient.startCommit(); - int totalRecords = 20 + random.nextInt(20); - List records = dataGen.generateInserts(newCommitTime, totalRecords); - JavaRDD writeRecords = jsc.parallelize(records, 1); - metaClient = HoodieTableMetaClient.reload(metaClient); - - // Insert 200 records - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - Assertions.assertNoWriteErrors(writeStatues.collect()); - - // commit this upsert - writeClient.commit(newCommitTime, writeStatues); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - // Now tagLocation for these records, hbaseIndex should tag them - JavaRDD javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); - assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == totalRecords); - - // check tagged records are tagged with correct fileIds - List fileIds = writeStatues.map(WriteStatus::getFileId).collect(); - assert (javaRDD.filter(record -> record.getCurrentLocation().getFileId() == null).collect().size() == 0); - List taggedFileIds = javaRDD.map(record -> record.getCurrentLocation().getFileId()).distinct().collect(); - - Map recordKeyToPartitionPathMap = new HashMap(); - List hoodieRecords = writeRecords.collect(); - 
hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath())); - - JavaRDD hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey()); - JavaPairRDD>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable); - List hoodieKeys = hoodieKeyJavaRDD.collect(); - assertEquals(totalRecords, recordLocations.collect().size()); - assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count()); - recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey")); - recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch")); - - // both lists should match - assertTrue(taggedFileIds.containsAll(fileIds) && fileIds.containsAll(taggedFileIds)); - // Rollback the last commit - writeClient.rollback(newCommitTime); - - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled - // back commit - javaRDD = (JavaRDD) index.tagLocation(writeRecords, context, hoodieTable); - assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0); - assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0); - } - - @ParameterizedTest - @EnumSource(value = IndexType.class, names = {"BLOOM", "SIMPLE",}) - public void testTagLocationAndFetchRecordLocations(IndexType indexType) throws Exception { - setUp(indexType); - String p1 = "2016/01/31"; - String p2 = "2015/01/31"; - String rowKey1 = UUID.randomUUID().toString(); - String rowKey2 = UUID.randomUUID().toString(); - String rowKey3 = UUID.randomUUID().toString(); - String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; - String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; - String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; - // place same row key under a different partition. - String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; - RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); - RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); - RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); - RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); - HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); - - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - JavaRDD taggedRecordRDD = (JavaRDD) index.tagLocation(recordRDD, context, hoodieTable); - - // Should not find any files - for (HoodieRecord record : taggedRecordRDD.collect()) { - assertFalse(record.isCurrentLocationKnown()); - } - - // We create three parquet file, each having one record. 
(two different partitions) - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); - String fileId1 = testTable.addCommit("001").getFileIdWithInserts(p1, record1); - String fileId2 = testTable.addCommit("002").getFileIdWithInserts(p1, record2); - String fileId3 = testTable.addCommit("003").getFileIdWithInserts(p2, record4); - - // We do the tag again - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - taggedRecordRDD = (JavaRDD) index.tagLocation(recordRDD, context, hoodieTable); - - // Check results - for (HoodieRecord record : taggedRecordRDD.collect()) { - if (record.getRecordKey().equals(rowKey1)) { - if (record.getPartitionPath().equals(p2)) { - assertEquals(record.getCurrentLocation().getFileId(), fileId3); - } else { - assertEquals(record.getCurrentLocation().getFileId(), fileId1); - } - } else if (record.getRecordKey().equals(rowKey2)) { - assertEquals(record.getCurrentLocation().getFileId(), fileId2); - } else if (record.getRecordKey().equals(rowKey3)) { - assertFalse(record.isCurrentLocationKnown()); - } - } - - JavaPairRDD>> recordLocations = getRecordLocations(recordRDD.map(HoodieRecord::getKey), hoodieTable); - for (Tuple2>> entry : recordLocations.collect()) { - if (entry._1.getRecordKey().equals(rowKey1)) { - assertTrue(entry._2.isPresent(), "Row1 should have been present "); - if (entry._1.getPartitionPath().equals(p2)) { - assertTrue(entry._2.isPresent(), "Row1 should have been present "); - assertEquals(entry._2.get().getRight(), fileId3); - } else { - assertEquals(entry._2.get().getRight(), fileId1); - } - } else if (entry._1.getRecordKey().equals(rowKey2)) { - assertTrue(entry._2.isPresent(), "Row2 should have been present "); - assertEquals(entry._2.get().getRight(), fileId2); - } else if (entry._1.getRecordKey().equals(rowKey3)) { - assertFalse(entry._2.isPresent(), "Row3 should have been absent "); - } - } - } - - @ParameterizedTest - @EnumSource(value = IndexType.class, names = {"GLOBAL_SIMPLE"}) - public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath(IndexType indexType) throws Exception { - setUp(indexType); - config = getConfigBuilder() - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType) - .withGlobalSimpleIndexUpdatePartitionPath(true) - .withBloomIndexUpdatePartitionPath(true) - .build()).build(); - writeClient = getHoodieWriteClient(config); - index = writeClient.getIndex(); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); - final String p1 = "2016/01/31"; - final String p2 = "2016/02/28"; - - // Create the original partition, and put a record, along with the meta file - // "2016/01/31": 1 file (1_0_20160131101010.parquet) - // this record will be saved in table and will be tagged to an empty record - RawTripTestPayload originalPayload = - new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); - HoodieRecord originalRecord = - new HoodieRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), - originalPayload); - - /* - This record has the same record key as originalRecord but different time so different partition - Because GLOBAL_BLOOM_INDEX_SHOULD_UPDATE_PARTITION_PATH = true, - globalBloomIndex should - - tag the original partition of the originalRecord to an empty record for deletion, and - - tag the new 
partition of the incomingRecord - */ - RawTripTestPayload incomingPayload = - new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}"); - HoodieRecord incomingRecord = - new HoodieRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), - incomingPayload); - /* - This record has the same record key as originalRecord and the same partition - Though GLOBAL_BLOOM_INDEX_SHOULD_UPDATE_PARTITION_PATH = true, - globalBloomIndex should just tag the original partition - */ - RawTripTestPayload incomingPayloadSamePartition = - new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}"); - HoodieRecord incomingRecordSamePartition = - new HoodieRecord( - new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), - incomingPayloadSamePartition); - - // We have some records to be tagged (two different partitions) - testTable.addCommit("1000").getFileIdWithInserts(p1, originalRecord); - - // test against incoming record with a different partition - JavaRDD recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord)); - JavaRDD taggedRecordRDD = (JavaRDD) index.tagLocation(recordRDD, context, hoodieTable); - - assertEquals(2, taggedRecordRDD.count()); - for (HoodieRecord record : taggedRecordRDD.collect()) { - switch (record.getPartitionPath()) { - case p1: - assertEquals("000", record.getRecordKey()); - assertTrue(record.getData() instanceof EmptyHoodieRecordPayload); - break; - case p2: - assertEquals("000", record.getRecordKey()); - assertEquals(incomingPayload.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData()); - break; - default: - fail(String.format("Should not get partition path: %s", record.getPartitionPath())); - } - } - - // test against incoming record with the same partition - JavaRDD recordRDDSamePartition = jsc - .parallelize(Collections.singletonList(incomingRecordSamePartition)); - JavaRDD taggedRecordRDDSamePartition = (JavaRDD) index.tagLocation(recordRDDSamePartition, context, hoodieTable); - - assertEquals(1, taggedRecordRDDSamePartition.count()); - HoodieRecord record = taggedRecordRDDSamePartition.first(); - assertEquals("000", record.getRecordKey()); - assertEquals(p1, record.getPartitionPath()); - assertEquals(incomingPayloadSamePartition.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData()); - } - - private HoodieWriteConfig.Builder getConfigBuilder() { - return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) - .withWriteStatusClass(MetadataMergeWriteStatus.class) - .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) - .forTable("test-trip-table") - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()) - .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() - .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); - } - - private JavaPairRDD>> getRecordLocations(JavaRDD keyRDD, HoodieTable hoodieTable) { - JavaRDD recordRDD = (JavaRDD) 
index.tagLocation( - keyRDD.map(k -> new HoodieRecord(k, new EmptyHoodieRecordPayload())), context, hoodieTable); - return recordRDD.mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() - ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) - : Option.empty()) - ); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java index 9175ebde51333..b843546799479 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java @@ -19,24 +19,19 @@ package org.apache.hudi.index; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieHBaseIndexConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.index.HoodieIndex.IndexType; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; -import org.apache.hudi.index.bloom.SparkHoodieGlobalBloomIndex; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex; +import org.apache.hudi.index.bucket.HoodieSimpleBucketIndex; +import org.apache.hudi.index.bucket.HoodieSparkConsistentBucketIndex; import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; -import org.apache.hudi.index.simple.SparkHoodieSimpleIndex; -import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex; +import org.apache.hudi.index.simple.HoodieSimpleIndex; -import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -59,8 +54,8 @@ public void setUp(@TempDir Path tempDir) { } @ParameterizedTest - @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE", "HBASE"}) - public void testCreateIndex(IndexType indexType) throws Exception { + @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE", "HBASE", "BUCKET"}) + public void testCreateIndex(IndexType indexType) { HoodieWriteConfig config; HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); @@ -68,44 +63,47 @@ public void testCreateIndex(IndexType indexType) throws Exception { case INMEMORY: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkInMemoryHashIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieInMemoryHashIndex); break; case BLOOM: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieBloomIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) 
instanceof HoodieBloomIndex); break; case GLOBAL_BLOOM: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(IndexType.GLOBAL_BLOOM).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieGlobalBloomIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieGlobalBloomIndex); break; case SIMPLE: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(IndexType.SIMPLE).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieSimpleIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieSimpleIndex); break; case HBASE: config = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE) .withHBaseIndexConfig(new HoodieHBaseIndexConfig.Builder().build()).build()) .build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof SparkHoodieHBaseIndex); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof SparkHoodieHBaseIndex); + break; + case BUCKET: + config = clientConfigBuilder.withPath(basePath) + .withIndexConfig(indexConfigBuilder.withIndexType(IndexType.BUCKET) + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE).build()).build(); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieSimpleBucketIndex); + + config = clientConfigBuilder.withPath(basePath) + .withIndexConfig(indexConfigBuilder.withIndexType(IndexType.BUCKET) + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.CONSISTENT_HASHING).build()) + .build(); + assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieSparkConsistentBucketIndex); break; default: // no -op. 
just for checkstyle errors } } - @Test - public void testCreateDummyIndex() { - HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); - HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); - HoodieWriteConfig config = clientConfigBuilder.withPath(basePath) - .withIndexConfig(indexConfigBuilder.withIndexClass(DummyHoodieIndex.class.getName()).build()).build(); - assertTrue(SparkHoodieIndex.createIndex(config) instanceof DummyHoodieIndex); - } - @Test public void testCreateIndexWithException() { HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); @@ -113,59 +111,18 @@ public void testCreateIndexWithException() { final HoodieWriteConfig config1 = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexClass(IndexWithConstructor.class.getName()).build()).build(); final Throwable thrown1 = assertThrows(HoodieException.class, () -> { - SparkHoodieIndex.createIndex(config1); + SparkHoodieIndexFactory.createIndex(config1); }, "exception is expected"); assertTrue(thrown1.getMessage().contains("is not a subclass of HoodieIndex")); final HoodieWriteConfig config2 = clientConfigBuilder.withPath(basePath) .withIndexConfig(indexConfigBuilder.withIndexClass(IndexWithoutConstructor.class.getName()).build()).build(); final Throwable thrown2 = assertThrows(HoodieException.class, () -> { - SparkHoodieIndex.createIndex(config2); + SparkHoodieIndexFactory.createIndex(config2); }, "exception is expected"); assertTrue(thrown2.getMessage().contains("Unable to instantiate class")); } - public static class DummyHoodieIndex> extends SparkHoodieIndex { - - public DummyHoodieIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException { - return null; - } - - @Override - public JavaRDD> tagLocation(JavaRDD> records, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException { - return null; - } - - @Override - public boolean rollbackCommit(String instantTime) { - return false; - } - - @Override - public boolean isGlobal() { - return false; - } - - @Override - public boolean canIndexLogFiles() { - return false; - } - - @Override - public boolean isImplicitWithStorage() { - return false; - } - } - public static class IndexWithConstructor { public IndexWithConstructor(HoodieWriteConfig config) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java index 8446698305846..5be4e4ce624a3 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java @@ -18,22 +18,30 @@ package org.apache.hudi.index.bloom; +import org.apache.hudi.client.functional.TestHoodieMetadataBase; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; import 
org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.testutils.RawTripTestPayload; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.io.HoodieKeyLookupHandle; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.avro.Schema; import org.apache.hadoop.fs.Path; @@ -47,17 +55,21 @@ import org.junit.jupiter.params.provider.MethodSource; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; import scala.Tuple2; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.genPseudoRandomUUID; import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -66,14 +78,25 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestHoodieBloomIndex extends HoodieClientTestHarness { +public class TestHoodieBloomIndex extends TestHoodieMetadataBase { private static final Schema SCHEMA = getSchemaFromResource(TestHoodieBloomIndex.class, "/exampleSchema.avsc", true); - private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with rangePruning={0}, treeFiltering={1}, bucketizedChecking={2}"; + private static final String TEST_NAME_WITH_PARAMS = + "[{index}] Test with rangePruning={0}, treeFiltering={1}, bucketizedChecking={2}, useMetadataTable={3}"; + private static final Random RANDOM = new Random(0xDEED); public static Stream configParams() { - Object[][] data = - new Object[][] {{true, true, true}, {false, true, true}, {true, true, false}, {true, false, true}}; + // rangePruning, treeFiltering, bucketizedChecking, useMetadataTable + Object[][] data = new Object[][] { + {true, true, true, false}, + {false, true, true, false}, + {true, true, false, false}, + {true, false, true, false}, + {true, true, true, true}, + {false, true, true, true}, + {true, true, false, true}, + {true, false, true, true} + }; return Stream.of(data).map(Arguments::of); } @@ -84,6 +107,11 @@ public void setUp() throws Exception { initFileSystem(); // We have some records to be tagged (two different partitions) initMetaClient(); + HoodieIndexConfig.Builder indexBuilder = HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(indexBuilder.build()) + .build(); + writeClient = getHoodieWriteClient(config); } @AfterEach @@ -91,21 +119,40 @@ public void tearDown() 
throws Exception { cleanupResources(); } - private HoodieWriteConfig makeConfig(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { + private HoodieWriteConfig makeConfig( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, boolean useMetadataTable) { + // For the bloom index to use column stats and bloom filters from metadata table, + // the following configs must be set to true: + // "hoodie.bloom.index.use.metadata" + // "hoodie.metadata.enable" (by default is true) + // "hoodie.metadata.index.column.stats.enable" + // "hoodie.metadata.index.bloom.filter.enable" return HoodieWriteConfig.newBuilder().withPath(basePath) - .withIndexConfig(HoodieIndexConfig.newBuilder().bloomIndexPruneByRanges(rangePruning) - .bloomIndexTreebasedFilter(treeFiltering).bloomIndexBucketizedChecking(bucketizedChecking) - .bloomIndexKeysPerBucket(2).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .bloomIndexPruneByRanges(rangePruning) + .bloomIndexTreebasedFilter(treeFiltering) + .bloomIndexBucketizedChecking(bucketizedChecking) + .bloomIndexKeysPerBucket(2) + .bloomIndexUseMetadata(useMetadataTable) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withMetadataIndexBloomFilter(useMetadataTable) + .withMetadataIndexColumnStats(useMetadataTable) + .build()) .build(); } @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) @MethodSource("configParams") - public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { - HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); - SparkHoodieBloomIndex index = new SparkHoodieBloomIndex(config); + public void testLoadInvolvedFiles( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, + boolean useMetadataTable) throws Exception { + HoodieWriteConfig config = + makeConfig(rangePruning, treeFiltering, bucketizedChecking, useMetadataTable); + HoodieBloomIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); // Create some partitions, and put some files // "2016/01/21": 0 file @@ -116,58 +163,91 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record3 = - new HoodieRecord(new 
HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); - List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + List> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); // Still 0, as no valid commit assertEquals(0, filesList.size()); - testTable.addCommit("20160401010101").withInserts("2016/04/01", "2"); - testTable.addCommit("20150312101010").withInserts("2015/03/12", "1") - .withInserts("2015/03/12", "3", record1) - .withInserts("2015/03/12", "4", record2, record3, record4); - - filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + final String fileId1 = "1"; + final String fileId2 = "2"; + final String fileId3 = "3"; + final String fileId4 = "4"; + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); + + String commitTime = "20160401010101"; + Path baseFilePath = testTable.forCommit(commitTime).withInserts(partitions.get(1), fileId2, Collections.emptyList()); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(1), + k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Arrays.asList(partitions.get(1)), + partitionToFilesNameLengthMap, false, false); + + commitTime = "20150312101010"; + partitionToFilesNameLengthMap.clear(); + testTable.forCommit(commitTime); + baseFilePath = testTable.withInserts(partitions.get(2), fileId1, Collections.emptyList()); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(2), + k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + + baseFilePath = testTable.withInserts(partitions.get(2), fileId3, Collections.singletonList(record1)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(2), + k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength))); + + baseFilePath = testTable.withInserts(partitions.get(2), fileId4, Arrays.asList(record2, record3, record4)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partitions.get(2), + k -> new ArrayList<>()).add(Pair.of(fileId4, Integer.valueOf((int) baseFileLength))); + + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Arrays.asList(partitions.get(2)), + partitionToFilesNameLengthMap, false, false); + + filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); assertEquals(4, filesList.size()); if (rangePruning) { // these files will not have the key ranges - assertNull(filesList.get(0)._2().getMaxRecordKey()); - assertNull(filesList.get(0)._2().getMinRecordKey()); - assertFalse(filesList.get(1)._2().hasKeyRanges()); - assertNotNull(filesList.get(2)._2().getMaxRecordKey()); - 
assertNotNull(filesList.get(2)._2().getMinRecordKey()); - assertTrue(filesList.get(3)._2().hasKeyRanges()); + assertNull(filesList.get(0).getRight().getMaxRecordKey()); + assertNull(filesList.get(0).getRight().getMinRecordKey()); + assertFalse(filesList.get(1).getRight().hasKeyRanges()); + assertNotNull(filesList.get(2).getRight().getMaxRecordKey()); + assertNotNull(filesList.get(2).getRight().getMinRecordKey()); + assertTrue(filesList.get(3).getRight().hasKeyRanges()); // no longer sorted, but should have same files. - List> expected = - Arrays.asList(new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), - new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003"))); + List> expected = + Arrays.asList(new ImmutablePair<>("2016/04/01", new BloomIndexFileInfo("2")), + new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("1")), + new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")), + new ImmutablePair<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003"))); assertEquals(expected, filesList); } } @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) @MethodSource("configParams") - public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { - HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); - SparkHoodieBloomIndex index = new SparkHoodieBloomIndex(config); + public void testRangePruning( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, + boolean useMetadataTable) { + HoodieWriteConfig config = + makeConfig(rangePruning, treeFiltering, bucketizedChecking, useMetadataTable); + HoodieBloomIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); final Map> partitionToFileIndexInfo = new HashMap<>(); partitionToFileIndexInfo.put("2017/10/22", @@ -179,12 +259,12 @@ public void testRangePruning(boolean rangePruning, boolean treeFiltering, boolea jsc.parallelize(Arrays.asList(new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t); - List> comparisonKeyList = - index.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); + List> comparisonKeyList = HoodieJavaRDD.getJavaRDD( + index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, HoodieJavaPairRDD.of(partitionRecordKeyPairRDD))).collect(); assertEquals(10, comparisonKeyList.size()); Map> recordKeyToFileComps = comparisonKeyList.stream() - .collect(Collectors.groupingBy(t -> t._2.getRecordKey(), Collectors.mapping(t -> t._1, Collectors.toList()))); + .collect(Collectors.groupingBy(t -> t.getRight().getRecordKey(), Collectors.mapping(Pair::getLeft, Collectors.toList()))); assertEquals(4, recordKeyToFileComps.size()); assertEquals(new HashSet<>(Arrays.asList("f1", "f3", "f4")), new HashSet<>(recordKeyToFileComps.get("002"))); @@ -207,24 +287,35 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = 
new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // We write record1, record2 to a parquet file, but the bloom filter contains (record1, // record2, record3). BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); filter.add(record3.getRecordKey()); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(metaClient, SCHEMA, filter); - String fileId = testTable.addCommit("000").getFileIdWithInserts(partition, record1, record2); - String filename = testTable.getBaseFileNameById(fileId); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter, metadataWriter); + + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); + final String commitTime = "0000001"; + final String fileId = genRandomUUID(); + + Path baseFilePath = testTable.forCommit(commitTime) + .withInserts(partition, fileId, Arrays.asList(record1, record2)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partition, + k -> new ArrayList<>()).add(Pair.of(fileId, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition), + partitionToFilesNameLengthMap, false, false); + final String filename = testTable.getBaseFileNameById(fileId); // The bloom filter contains 3 records assertTrue(filter.mightContain(record1.getRecordKey())); @@ -238,9 +329,9 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); - HoodieKeyLookupHandle keyHandle = new HoodieKeyLookupHandle<>(config, table, Pair.of(partition, fileId)); - List results = keyHandle.checkCandidatesAgainstFile(hadoopConf, uuids, - new Path(Paths.get(basePath, partition, filename).toString())); + List results = HoodieIndexUtils.filterKeysFromFile( + new Path(Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf); + assertEquals(results.size(), 2); assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); @@ -253,29 +344,34 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) @MethodSource("configParams") - public void testTagLocationWithEmptyRDD(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) { + public void testTagLocationWithEmptyRDD( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, + boolean useMetadataTable) { // We have some records 
to be tagged (two different partitions) JavaRDD recordRDD = jsc.emptyRDD(); // Also create the metadata and config - HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + HoodieWriteConfig config = + makeConfig(rangePruning, treeFiltering, bucketizedChecking, useMetadataTable); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); // Let's tag - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); assertDoesNotThrow(() -> { - bloomIndex.tagLocation(recordRDD, context, table); + tagLocation(bloomIndex, recordRDD, table); }, "EmptyRDD should not result in IllegalArgumentException: Positive number of slices required"); } @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) @MethodSource("configParams") - public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { + public void testTagLocationOnPartitionedTable( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, + boolean useMetadataTable) throws Exception { // We have some records to be tagged (two different partitions) - String rowKey1 = UUID.randomUUID().toString(); - String rowKey2 = UUID.randomUUID().toString(); - String rowKey3 = UUID.randomUUID().toString(); + String rowKey1 = genRandomUUID(); + String rowKey2 = genRandomUUID(); + String rowKey3 = genRandomUUID(); String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; @@ -283,44 +379,75 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); // Also create the metadata and config - HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + HoodieWriteConfig config = 
makeConfig(rangePruning, treeFiltering, bucketizedChecking, useMetadataTable); HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); // Let's tag - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); - JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, context, hoodieTable); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD taggedRecordRDD = tagLocation(bloomIndex, recordRDD, hoodieTable); // Should not find any files for (HoodieRecord record : taggedRecordRDD.collect()) { assertFalse(record.isCurrentLocationKnown()); } + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); + final String partition1 = "2016/01/31"; + final String partition2 = "2015/01/31"; + // We create three parquet file, each having one record. (two different partitions) - String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1); - String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2); - String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4); + final String fileId1 = genRandomUUID(); + final String commit1 = "0000001"; + Path baseFilePath = testTable.forCommit(commit1).withInserts(partition1, fileId1, Collections.singletonList(record1)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partition1, + k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit1, WriteOperationType.UPSERT, Collections.singletonList(partition1), + partitionToFilesNameLengthMap, false, false); + + final String fileId2 = genRandomUUID(); + final String commit2 = "0000002"; + baseFilePath = testTable.forCommit(commit2).withInserts(partition1, fileId2, Collections.singletonList(record2)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(partition1, + k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit2, WriteOperationType.UPSERT, Collections.singletonList(partition1), + partitionToFilesNameLengthMap, false, false); + + final String fileId3 = genRandomUUID(); + final String commit3 = "0000003"; + baseFilePath = testTable.forCommit(commit3).withInserts(partition2, fileId3, Collections.singletonList(record4)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(partition2, + k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit3, WriteOperationType.UPSERT, Collections.singletonList(partition2), + partitionToFilesNameLengthMap, false, false); // We do the tag again - taggedRecordRDD = bloomIndex.tagLocation(recordRDD, context, HoodieSparkTable.create(config, context, metaClient)); + metaClient = HoodieTableMetaClient.reload(metaClient); + taggedRecordRDD = tagLocation(bloomIndex, recordRDD, HoodieSparkTable.create(config, context, metaClient)); // Check results for (HoodieRecord 
record : taggedRecordRDD.collect()) { if (record.getRecordKey().equals(rowKey1)) { - if (record.getPartitionPath().equals("2015/01/31")) { + if (record.getPartitionPath().equals(partition2)) { assertEquals(record.getCurrentLocation().getFileId(), fileId3); } else { assertEquals(record.getCurrentLocation().getFileId(), fileId1); @@ -335,7 +462,99 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) @MethodSource("configParams") - public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { + public void testTagLocationOnNonpartitionedTable( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, + boolean useMetadataTable) throws Exception { + // We have some records to be tagged (two different partitions) + String rowKey1 = genRandomUUID(); + String rowKey2 = genRandomUUID(); + String rowKey3 = genRandomUUID(); + String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + + String emptyPartitionPath = ""; + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + HoodieRecord record1 = + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), emptyPartitionPath), rowChange1); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + HoodieRecord record2 = + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), emptyPartitionPath), rowChange2); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + HoodieRecord record3 = + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), emptyPartitionPath), rowChange3); + + JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3)); + + // Also create the metadata and config + HoodieWriteConfig config = + makeConfig(rangePruning, treeFiltering, bucketizedChecking, useMetadataTable); + HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD taggedRecordRDD = tagLocation(bloomIndex, recordRDD, hoodieTable); + + // Should not find any files + for (HoodieRecord record : taggedRecordRDD.collect()) { + assertFalse(record.isCurrentLocationKnown()); + } + + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); + + // We create three parquet file, each having one record + final String fileId1 = genRandomUUID(); + final String commit1 = "0000001"; + Path baseFilePath = testTable.forCommit(commit1).withInserts(emptyPartitionPath, fileId1, Collections.singletonList(record1)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(emptyPartitionPath, + k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit1, WriteOperationType.UPSERT, Collections.singletonList(emptyPartitionPath), + partitionToFilesNameLengthMap, false, false); + + final String fileId2 = genRandomUUID(); + 
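// Illustrative sketch, not part of this patch: the steps above repeat the setup pattern used
// throughout these bloom-index tests — write one base file via forCommit(...).withInserts(...),
// record its length in partitionToFilesNameLengthMap, then register the commit with
// doWriteOperation(...) so the timeline (and the metadata table, when enabled) stays consistent
// with the data files being tagged. A hypothetical helper capturing that pattern, assuming the
// imports and test-harness fields (testTable, fs) already present in this test class:
private void commitSingleBaseFile(HoodieSparkWriteableTestTable testTable, String commitTime,
                                  String partition, String fileId,
                                  List<HoodieRecord> records) throws Exception {
  // Write the base file for this commit and capture its size; the test table bookkeeping
  // expects the file length per file id.
  Path baseFilePath = testTable.forCommit(commitTime).withInserts(partition, fileId, records);
  long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
  partitionToFilesNameLengthMap.computeIfAbsent(partition, k -> new ArrayList<>())
      .add(Pair.of(fileId, Integer.valueOf((int) baseFileLength)));
  // Register the commit so subsequent tagLocation(...) calls see the new file.
  testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT,
      Collections.singletonList(partition), partitionToFilesNameLengthMap, false, false);
}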
final String commit2 = "0000002"; + baseFilePath = testTable.forCommit(commit2).withInserts(emptyPartitionPath, fileId2, Collections.singletonList(record2)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(emptyPartitionPath, + k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit2, WriteOperationType.UPSERT, Collections.singletonList(emptyPartitionPath), + partitionToFilesNameLengthMap, false, false); + + final String fileId3 = UUID.randomUUID().toString(); + final String commit3 = "0000003"; + baseFilePath = testTable.forCommit(commit3).withInserts(emptyPartitionPath, fileId3, Collections.singletonList(record3)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(emptyPartitionPath, + k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit3, WriteOperationType.UPSERT, Collections.singletonList(emptyPartitionPath), + partitionToFilesNameLengthMap, false, false); + + // We do the tag again + metaClient = HoodieTableMetaClient.reload(metaClient); + taggedRecordRDD = tagLocation(bloomIndex, recordRDD, HoodieSparkTable.create(config, context, metaClient)); + + // Check results + for (HoodieRecord record : taggedRecordRDD.collect()) { + if (record.getRecordKey().equals(rowKey1)) { + assertEquals(record.getCurrentLocation().getFileId(), fileId1); + } else if (record.getRecordKey().equals(rowKey2)) { + assertEquals(record.getCurrentLocation().getFileId(), fileId2); + } else if (record.getRecordKey().equals(rowKey3)) { + assertEquals(record.getCurrentLocation().getFileId(), fileId3); + } + } + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testCheckExists( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, + boolean useMetadataTable) throws Exception { // We have some records to be tagged (two different partitions) String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," @@ -349,25 +568,28 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); - HoodieRecord record1 = new HoodieRecord(key1, rowChange1); + HoodieRecord record1 = new HoodieAvroRecord(key1, rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()); - HoodieRecord record2 = new HoodieRecord(key2, rowChange2); + HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()); - HoodieRecord record4 = new HoodieRecord(key4, rowChange4); + HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4); JavaRDD keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4)); // Also create the metadata and config - HoodieWriteConfig config = makeConfig(rangePruning, 
treeFiltering, bucketizedChecking); + HoodieWriteConfig config = + makeConfig(rangePruning, treeFiltering, bucketizedChecking, useMetadataTable); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); // Let's tag - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); - JavaRDD taggedRecords = bloomIndex.tagLocation(keysRDD.map(k -> new HoodieRecord(k, null)), context, hoodieTable); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD taggedRecords = tagLocation( + bloomIndex, keysRDD.map(k -> new HoodieAvroRecord(k, null)), hoodieTable); JavaPairRDD>> recordLocationsRDD = taggedRecords .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) @@ -379,15 +601,43 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean assertTrue(!record._2.isPresent()); } + final String partition1 = "2016/01/31"; + final String partition2 = "2015/01/31"; + final String fileId1 = genRandomUUID(); + final String fileId2 = genRandomUUID(); + final String fileId3 = genRandomUUID(); + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); // We create three parquet file, each having one record. (two different partitions) - String fileId1 = testTable.addCommit("001").getFileIdWithInserts("2016/01/31", record1); - String fileId2 = testTable.addCommit("002").getFileIdWithInserts("2016/01/31", record2); - String fileId3 = testTable.addCommit("003").getFileIdWithInserts("2015/01/31", record4); + final String commit1 = "0000001"; + Path baseFilePath = testTable.forCommit(commit1).withInserts(partition1, fileId1, Collections.singletonList(record1)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partition1, + k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit1, WriteOperationType.UPSERT, Collections.singletonList(partition1), + partitionToFilesNameLengthMap, false, false); + + final String commit2 = "0000002"; + partitionToFilesNameLengthMap.clear(); + baseFilePath = testTable.forCommit(commit2).withInserts(partition1, fileId2, Collections.singletonList(record2)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partition1, + k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit2, WriteOperationType.UPSERT, Collections.singletonList(partition1), + partitionToFilesNameLengthMap, false, false); + + final String commit3 = "0000003"; + partitionToFilesNameLengthMap.clear(); + baseFilePath = testTable.forCommit(commit3).withInserts(partition2, fileId3, Collections.singletonList(record4)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partition2, + k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commit3, WriteOperationType.UPSERT, Collections.singletonList(partition2), + partitionToFilesNameLengthMap, false, false); // We 
do the tag again metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - taggedRecords = bloomIndex.tagLocation(keysRDD.map(k -> new HoodieRecord(k, null)), context, hoodieTable); + taggedRecords = tagLocation(bloomIndex, keysRDD.map(k -> new HoodieAvroRecord(k, null)), hoodieTable); recordLocationsRDD = taggedRecords .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) @@ -401,7 +651,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean assertEquals(fileId1, record._2.get().getRight()); } else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { assertTrue(record._2.isPresent()); - if (record._1.getPartitionPath().equals("2015/01/31")) { + if (record._1.getPartitionPath().equals(partition2)) { assertEquals(fileId3, record._2.get().getRight()); } else { assertEquals(fileId2, record._2.get().getRight()); @@ -414,7 +664,9 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) @MethodSource("configParams") - public void testBloomFilterFalseError(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception { + public void testBloomFilterFalseError( + boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking, + boolean useMetadataTable) throws Exception { // We have two hoodie records String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; @@ -424,27 +676,28 @@ public void testBloomFilterFalseError(boolean rangePruning, boolean treeFilterin // We write record1 to a parquet file, using a bloom filter having both records RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); filter.add(record2.getRecordKey()); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(metaClient, SCHEMA, filter); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter); String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1); assertTrue(filter.mightContain(record1.getRecordKey())); assertTrue(filter.mightContain(record2.getRecordKey())); // We do the tag JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2)); - HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); + HoodieWriteConfig config = + makeConfig(rangePruning, treeFiltering, bucketizedChecking, useMetadataTable); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieSparkTable.create(config, context, metaClient); - SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); - JavaRDD taggedRecordRDD = 
bloomIndex.tagLocation(recordRDD, context, table); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD taggedRecordRDD = tagLocation(bloomIndex, recordRDD, table); // Check results for (HoodieRecord record : taggedRecordRDD.collect()) { @@ -455,4 +708,8 @@ public void testBloomFilterFalseError(boolean rangePruning, boolean treeFilterin } } } + + private static String genRandomUUID() { + return genPseudoRandomUUID(RANDOM).toString(); + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java index c18eeb1d770c8..3ad8952feea84 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java @@ -18,18 +18,25 @@ package org.apache.hudi.index.bloom; +import org.apache.hudi.client.functional.TestHoodieMetadataBase; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaPairRDD; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; @@ -37,12 +44,14 @@ import org.junit.jupiter.api.Test; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.stream.Collectors; import scala.Tuple2; @@ -55,7 +64,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness { +public class TestHoodieGlobalBloomIndex extends TestHoodieMetadataBase { private static final Schema SCHEMA = getSchemaFromResource(TestHoodieGlobalBloomIndex.class, "/exampleSchema.avsc", true); @@ -63,7 +72,13 @@ public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness { public void setUp() throws Exception { initSparkContexts(); initPath(); + initFileSystem(); initMetaClient(); + HoodieIndexConfig.Builder indexBuilder = HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.GLOBAL_BLOOM); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(indexBuilder.build()) + .build(); + writeClient = getHoodieWriteClient(config); } @AfterEach @@ -74,46 +89,79 @@ public void tearDown() throws IOException { @Test public void 
testLoadInvolvedFiles() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieGlobalBloomIndex index = + new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); // Create some partitions, and put some files, along with the meta file // "2016/01/21": 0 file // "2016/04/01": 1 file (2_0_20160401010101.parquet) // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet) - testTable.withPartitionMetaFiles("2016/01/21", "2016/04/01", "2015/03/12"); + final String p1 = "2016/01/21"; + final String p2 = "2016/04/01"; + final String p3 = "2015/03/12"; RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // intentionally missed the partition "2015/03/12" to see if the GlobalBloomIndex can pick it up - List partitions = Arrays.asList("2016/01/21", "2016/04/01"); + List partitions = Arrays.asList(p1, p2); // partitions will NOT be respected by this loadInvolvedFiles(...) 
call - List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + List> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); // Still 0, as no valid commit assertEquals(0, filesList.size()); - testTable.addCommit("20160401010101").withInserts("2016/04/01", "2"); - testTable.addCommit("20150312101010").withInserts("2015/03/12", "1") - .withInserts("2015/03/12", "3", record1) - .withInserts("2015/03/12", "4", record2, record3, record4); - - filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + final String fileId1 = "1"; + final String fileId2 = "2"; + final String fileId3 = "3"; + final String fileId4 = "4"; + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); + + final String c1 = "20160401010101"; + Path baseFilePath = testTable.forCommit(c1).withInserts(p2, fileId2, Collections.emptyList()); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(p2, + k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(c1, WriteOperationType.UPSERT, Collections.singletonList(p2), + partitionToFilesNameLengthMap, false, false); + + final String c2 = "20150312101010"; + testTable.forCommit(c2); + baseFilePath = testTable.withInserts(p3, fileId1, Collections.emptyList()); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(p3, + k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + + baseFilePath = testTable.withInserts(p3, fileId3, Collections.singletonList(record1)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(p3, + k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength))); + + baseFilePath = testTable.withInserts(p3, fileId4, Arrays.asList(record2, record3, record4)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(p3, + k -> new ArrayList<>()).add(Pair.of(fileId4, Integer.valueOf((int) baseFileLength))); + + testTable.doWriteOperation(c2, WriteOperationType.UPSERT, Collections.singletonList(p3), + partitionToFilesNameLengthMap, false, false); + + filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); assertEquals(4, filesList.size()); Map filesMap = toFileMap(filesList); @@ -138,7 +186,8 @@ public void testLoadInvolvedFiles() throws Exception { public void testExplodeRecordRDDWithFileComparisons() { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieGlobalBloomIndex index = + new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); final Map> partitionToFileIndexInfo = new HashMap<>(); partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"), @@ -152,8 +201,9 @@ public void testExplodeRecordRDDWithFileComparisons() { jsc.parallelize(Arrays.asList(new Tuple2<>("2017/10/21", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/23", "004"))).mapToPair(t -> t); - List> comparisonKeyList = - index.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); + List> comparisonKeyList = HoodieJavaRDD.getJavaRDD( + 
index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, + HoodieJavaPairRDD.of(partitionRecordKeyPairRDD))).collect(); /* * expecting: f4, HoodieKey { recordKey=003 partitionPath=2017/10/23} f1, HoodieKey { recordKey=003 @@ -166,7 +216,7 @@ public void testExplodeRecordRDDWithFileComparisons() { assertEquals(10, comparisonKeyList.size()); Map> recordKeyToFileComps = comparisonKeyList.stream() - .collect(Collectors.groupingBy(t -> t._2.getRecordKey(), Collectors.mapping(Tuple2::_1, Collectors.toList()))); + .collect(Collectors.groupingBy(t -> t.getRight().getRecordKey(), Collectors.mapping(Pair::getKey, Collectors.toList()))); assertEquals(4, recordKeyToFileComps.size()); assertEquals(new HashSet<>(Arrays.asList("f4", "f1", "f3")), new HashSet<>(recordKeyToFileComps.get("002"))); @@ -177,53 +227,95 @@ public void testExplodeRecordRDDWithFileComparisons() { @Test public void testTagLocation() throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .withIndexType(HoodieIndex.IndexType.GLOBAL_BLOOM) + .withBloomIndexUpdatePartitionPath(false) + .build()) + .build(); + HoodieGlobalBloomIndex index = new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); // Create some partitions, and put some files, along with the meta file // "2016/01/21": 0 file // "2016/04/01": 1 file (2_0_20160401010101.parquet) // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet) - testTable.withPartitionMetaFiles("2016/01/21", "2016/04/01", "2015/03/12"); + final String partition2 = "2016/04/01"; + final String partition3 = "2015/03/12"; RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); // this record will be saved in table and will be tagged to the incoming record5 RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record4 = - new HoodieRecord(new 
HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // this has the same record key as record4 but different time so different partition, but globalbloomIndex should // tag the original partition of the saved record4 RawTripTestPayload rowChange5 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-02-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record5 = - new HoodieRecord(new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), rowChange5); + new HoodieAvroRecord(new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), rowChange5); - JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record5)); + final String fileId1 = UUID.randomUUID().toString(); + final String fileId2 = UUID.randomUUID().toString(); + final String fileId3 = UUID.randomUUID().toString(); + final String fileId4 = UUID.randomUUID().toString(); + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); // intentionally missed the partition "2015/03/12" to see if the GlobalBloomIndex can pick it up - String fileId1 = testTable.addCommit("1000").getFileIdWithInserts("2016/04/01", record1); - String fileId2 = testTable.addCommit("2000").getFileIdWithInserts("2015/03/12"); - String fileId3 = testTable.addCommit("3000").getFileIdWithInserts("2015/03/12", record2); - String fileId4 = testTable.addCommit("4000").getFileIdWithInserts("2015/03/12", record4); + String commitTime = "0000001"; + Path baseFilePath = testTable.forCommit(commitTime).withInserts(partition2, fileId1, Collections.singletonList(record1)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(partition2, + k -> new ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition2), + partitionToFilesNameLengthMap, false, false); + + commitTime = "0000002"; + baseFilePath = testTable.forCommit(commitTime).withInserts(partition3, fileId2, Collections.emptyList()); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(partition3, + k -> new ArrayList<>()).add(Pair.of(fileId2, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition3), + partitionToFilesNameLengthMap, false, false); + + commitTime = "0000003"; + baseFilePath = testTable.forCommit(commitTime).withInserts(partition3, fileId3, Collections.singletonList(record2)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(partition3, + k -> new ArrayList<>()).add(Pair.of(fileId3, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition3), + partitionToFilesNameLengthMap, false, false); + + commitTime = "0000004"; + baseFilePath = testTable.forCommit(commitTime).withInserts(partition3, fileId4, Collections.singletonList(record4)); + baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.clear(); + partitionToFilesNameLengthMap.computeIfAbsent(partition3, + k -> new ArrayList<>()).add(Pair.of(fileId4, Integer.valueOf((int) 
baseFileLength))); + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition3), + partitionToFilesNameLengthMap, false, false); + + JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record5)); // partitions will NOT be respected by this loadInvolvedFiles(...) call - JavaRDD taggedRecordRDD = index.tagLocation(recordRDD, context, hoodieTable); + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); for (HoodieRecord record : taggedRecordRDD.collect()) { switch (record.getRecordKey()) { @@ -257,11 +349,15 @@ public void testTagLocation() throws Exception { public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder() .withPath(basePath) - .withIndexConfig(HoodieIndexConfig.newBuilder().withBloomIndexUpdatePartitionPath(true).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .withIndexType(HoodieIndex.IndexType.GLOBAL_BLOOM) + .withBloomIndexUpdatePartitionPath(true) + .build()) .build(); - SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); + HoodieGlobalBloomIndex index = + new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter); final String p1 = "2016/01/31"; final String p2 = "2016/02/28"; @@ -271,7 +367,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { RawTripTestPayload originalPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord originalRecord = - new HoodieRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), + new HoodieAvroRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), originalPayload); /* @@ -284,7 +380,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { RawTripTestPayload incomingPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}"); HoodieRecord incomingRecord = - new HoodieRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), + new HoodieAvroRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), incomingPayload); /* @@ -295,15 +391,24 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { RawTripTestPayload incomingPayloadSamePartition = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}"); HoodieRecord incomingRecordSamePartition = - new HoodieRecord( + new HoodieAvroRecord( new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), incomingPayloadSamePartition); - testTable.addCommit("1000").getFileIdWithInserts(p1, originalRecord); + final String fileId1 = UUID.randomUUID().toString(); + final Map>> partitionToFilesNameLengthMap = new HashMap<>(); + + final String commitTime = "0000001"; + Path baseFilePath = testTable.forCommit(commitTime).withInserts(p1, fileId1, Collections.singletonList(originalRecord)); + long baseFileLength = fs.getFileStatus(baseFilePath).getLen(); + partitionToFilesNameLengthMap.computeIfAbsent(p1, + k -> new 
ArrayList<>()).add(Pair.of(fileId1, Integer.valueOf((int) baseFileLength))); + testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Arrays.asList(p1), + partitionToFilesNameLengthMap, false, false); // test against incoming record with a different partition JavaRDD recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord)); - JavaRDD taggedRecordRDD = index.tagLocation(recordRDD, context, hoodieTable); + JavaRDD taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable); assertEquals(2, taggedRecordRDD.count()); for (HoodieRecord record : taggedRecordRDD.collect()) { @@ -324,7 +429,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { // test against incoming record with the same partition JavaRDD recordRDDSamePartition = jsc .parallelize(Collections.singletonList(incomingRecordSamePartition)); - JavaRDD taggedRecordRDDSamePartition = index.tagLocation(recordRDDSamePartition, context, hoodieTable); + JavaRDD taggedRecordRDDSamePartition = tagLocation(index, recordRDDSamePartition, hoodieTable); assertEquals(1, taggedRecordRDDSamePartition.count()); HoodieRecord record = taggedRecordRDDSamePartition.first(); @@ -334,10 +439,10 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { } // convert list to map to avoid sorting order dependencies - private static Map toFileMap(List> filesList) { + private static Map toFileMap(List> filesList) { Map filesMap = new HashMap<>(); - for (Tuple2 t : filesList) { - filesMap.put(t._1() + "/" + t._2().getFileId(), t._2()); + for (Pair t : filesList) { + filesMap.put(t.getKey() + "/" + t.getValue().getFileId(), t.getValue()); } return filesMap; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java index 10232ca81ec27..1c6973db746bc 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java @@ -77,10 +77,10 @@ public void testFileGroupLookUpManyEntriesWithSameStartValue() { } /** - * Tests for many duplicte entries in the tree. + * Tests for many duplicate entries in the tree. */ @Test - public void testFileGroupLookUpManyDulicateEntries() { + public void testFileGroupLookUpManyDuplicateEntries() { KeyRangeNode toInsert = new KeyRangeNode(Long.toString(1200), Long.toString(2000), UUID.randomUUID().toString()); updateExpectedMatchesToTest(toInsert); keyRangeLookupTree.insert(toInsert); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestHoodieSimpleBucketIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestHoodieSimpleBucketIndex.java new file mode 100644 index 0000000000000..616fc3d719b55 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestHoodieSimpleBucketIndex.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
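// Editorial sketch, not part of this change: the global bloom index tests above repeat the same
// sequence several times -- write a base file via the test table, record its (fileId, length) per
// partition, then commit through doWriteOperation. A hypothetical helper capturing that pattern
// (types such as HoodieSparkWriteableTestTable, WriteOperationType and Pair are assumed from the
// surrounding tests) could look like this:
private static void commitBaseFile(HoodieSparkWriteableTestTable testTable, FileSystem fs,
    String commitTime, String partition, String fileId, List<HoodieRecord> records) throws Exception {
  Path baseFilePath = testTable.forCommit(commitTime).withInserts(partition, fileId, records);
  long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
  Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
  partitionToFilesNameLengthMap.computeIfAbsent(partition, k -> new ArrayList<>())
      .add(Pair.of(fileId, (int) baseFileLength));
  testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT,
      Collections.singletonList(partition), partitionToFilesNameLengthMap, false, false);
}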
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.index.bucket; + +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.Arrays; +import java.util.Properties; +import java.util.UUID; + +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieSimpleBucketIndex extends HoodieClientTestHarness { + + private static final Logger LOG = LogManager.getLogger(TestHoodieSimpleBucketIndex.class); + private static final Schema SCHEMA = getSchemaFromResource(TestHoodieSimpleBucketIndex.class, "/exampleSchema.avsc", true); + private static final int NUM_BUCKET = 8; + + @BeforeEach + public void setUp() throws Exception { + initSparkContexts(); + initPath(); + initFileSystem(); + // We have some records to be tagged (two different partitions) + initMetaClient(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + @Test + public void testBucketIndexValidityCheck() { + Properties props = new Properties(); + props.setProperty(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key(), "_row_key"); + assertThrows(HoodieIndexException.class, () -> { + HoodieIndexConfig.newBuilder().fromProperties(props) + .withIndexType(HoodieIndex.IndexType.BUCKET) + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE) + .withBucketNum("8").build(); + }); + props.setProperty(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key(), "uuid"); + HoodieIndexConfig.newBuilder().fromProperties(props) + .withIndexType(HoodieIndex.IndexType.BUCKET) + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE) + .withBucketNum("8").build(); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testTagLocation(boolean isInsert) throws Exception { + String rowKey1 = UUID.randomUUID().toString(); + String rowKey2 = 
UUID.randomUUID().toString(); + String rowKey3 = UUID.randomUUID().toString(); + String recordStr1 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + HoodieRecord record1 = new HoodieAvroRecord( + new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + HoodieRecord record2 = new HoodieAvroRecord( + new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + HoodieRecord record3 = new HoodieAvroRecord( + new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); + HoodieRecord record4 = new HoodieAvroRecord( + new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + JavaRDD> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); + + HoodieWriteConfig config = makeConfig(); + HoodieTable table = HoodieSparkTable.create(config, context, metaClient); + HoodieSimpleBucketIndex bucketIndex = new HoodieSimpleBucketIndex(config); + HoodieData> taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, table); + assertFalse(taggedRecordRDD.collectAsList().stream().anyMatch(r -> r.isCurrentLocationKnown())); + + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(table, SCHEMA); + + if (isInsert) { + testTable.addCommit("001").withInserts("2016/01/31", getRecordFileId(record1), record1); + testTable.addCommit("002").withInserts("2016/01/31", getRecordFileId(record2), record2); + testTable.addCommit("003").withInserts("2016/01/31", getRecordFileId(record3), record3); + } else { + testTable.addCommit("001").withLogAppends("2016/01/31", getRecordFileId(record1), record1); + testTable.addCommit("002").withLogAppends("2016/01/31", getRecordFileId(record2), record2); + testTable.addCommit("003").withLogAppends("2016/01/31", getRecordFileId(record3), record3); + } + + taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, + HoodieSparkTable.create(config, context, metaClient)); + assertFalse(taggedRecordRDD.collectAsList().stream().filter(r -> r.isCurrentLocationKnown()) + .filter(r -> BucketIdentifier.bucketIdFromFileId(r.getCurrentLocation().getFileId()) + != getRecordBucketId(r)).findAny().isPresent()); + assertTrue(taggedRecordRDD.collectAsList().stream().filter(r -> r.getPartitionPath().equals("2015/01/31") + && !r.isCurrentLocationKnown()).count() == 1L); + assertTrue(taggedRecordRDD.collectAsList().stream().filter(r -> r.getPartitionPath().equals("2016/01/31") + && r.isCurrentLocationKnown()).count() == 3L); + } + + private HoodieWriteConfig makeConfig() { + Properties props = new Properties(); + props.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "_row_key"); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(SCHEMA.toString()) + .withIndexConfig(HoodieIndexConfig.newBuilder().fromProperties(props) + 
.withIndexType(HoodieIndex.IndexType.BUCKET) + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE) + .withIndexKeyField("_row_key") + .withBucketNum(String.valueOf(NUM_BUCKET)).build()).build(); + } + + private String getRecordFileId(HoodieRecord record) { + return BucketIdentifier.bucketIdStr( + BucketIdentifier.getBucketId(record, "_row_key", NUM_BUCKET)); + } + + private int getRecordBucketId(HoodieRecord record) { + return BucketIdentifier + .getBucketId(record, "_row_key", NUM_BUCKET); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseIndex.java deleted file mode 100644 index 2eb672a00bd4c..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseIndex.java +++ /dev/null @@ -1,629 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.index.hbase; - -import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.EmptyHoodieRecordPayload; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.testutils.HoodieTestUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieHBaseIndexConfig; -import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.testutils.FunctionalTestHarness; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.Connection; -import org.apache.hadoop.hbase.client.Get; -import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.spark.api.java.JavaRDD; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.MethodOrderer; -import 
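// Conceptual sketch only, not the actual BucketIdentifier implementation: the simple bucket index
// test above relies on each record hashing to one of NUM_BUCKET buckets on the configured index key
// field, with the bucket id encoded as a numeric prefix of the file id (the test derives the write
// file id from BucketIdentifier and later recovers the bucket id via bucketIdFromFileId). Roughly:
static int bucketIdOf(String indexKeyValue, int numBuckets) {
  // assumption: a stable hash of the key field, reduced modulo the bucket count
  return (indexKeyValue.hashCode() & Integer.MAX_VALUE) % numBuckets;
}

static String bucketIdPrefix(int bucketId) {
  // assumption: zero-padded prefix that can be parsed back out of the file id
  return String.format("%08d", bucketId);
}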
org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestMethodOrder; - -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.stream.Collectors; - -import scala.Tuple2; - -import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.atMost; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; - -/** - * Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown across tests, - * (see one problem here : https://issues.apache.org/jira/browse/HBASE-15835). Hence, the need to use - * {@link MethodOrderer.Alphanumeric} to make sure the tests run in order. Please alter the order of tests running carefully. - */ -@TestMethodOrder(MethodOrderer.Alphanumeric.class) -@Tag("functional") -public class TestHBaseIndex extends FunctionalTestHarness { - - private static final String TABLE_NAME = "test_table"; - private static HBaseTestingUtility utility; - private static Configuration hbaseConfig; - - private Configuration hadoopConf; - private HoodieTestDataGenerator dataGen; - private HoodieTableMetaClient metaClient; - - @AfterAll - public static void clean() throws Exception { - if (utility != null) { - utility.deleteTable(TABLE_NAME); - utility.shutdownMiniCluster(); - } - } - - @BeforeAll - public static void init() throws Exception { - // Initialize HbaseMiniCluster - hbaseConfig = HBaseConfiguration.create(); - hbaseConfig.set("zookeeper.znode.parent", "/hudi-hbase-test"); - - utility = new HBaseTestingUtility(hbaseConfig); - utility.startMiniCluster(); - hbaseConfig = utility.getConnection().getConfiguration(); - utility.createTable(TableName.valueOf(TABLE_NAME), Bytes.toBytes("_s")); - } - - @BeforeEach - public void setUp() throws Exception { - hadoopConf = jsc().hadoopConfiguration(); - hadoopConf.addResource(utility.getConfiguration()); - metaClient = getHoodieMetaClient(hadoopConf, basePath()); - dataGen = new HoodieTestDataGenerator(); - } - - @Test - public void testSimpleTagLocationAndUpdateCOW() throws Exception { - testSimpleTagLocationAndUpdate(HoodieTableType.COPY_ON_WRITE); - } - - @Test void testSimpleTagLocationAndUpdateMOR() throws Exception { - testSimpleTagLocationAndUpdate(HoodieTableType.MERGE_ON_READ); - } - - public void testSimpleTagLocationAndUpdate(HoodieTableType tableType) throws Exception { - metaClient = HoodieTestUtils.init(hadoopConf, basePath(), tableType); - - final String newCommitTime = "001"; - final int numRecords = 10; - List records = dataGen.generateInserts(newCommitTime, numRecords); - JavaRDD writeRecords = jsc().parallelize(records, 1); - - // Load to memory - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - // Test tagLocation without any entries in index - JavaRDD records1 = index.tagLocation(writeRecords, context(), hoodieTable); - assertEquals(0, records1.filter(record -> 
record.isCurrentLocationKnown()).count()); - - // Insert 200 records - writeClient.startCommitWithTime(newCommitTime); - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - assertNoWriteErrors(writeStatues.collect()); - - // Now tagLocation for these records, hbaseIndex should not tag them since commit never occurred - JavaRDD records2 = index.tagLocation(writeRecords, context(), hoodieTable); - assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count()); - - // Now commit this & update location of records inserted and validate no errors - writeClient.commit(newCommitTime, writeStatues); - // Now tagLocation for these records, hbaseIndex should tag them correctly - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); - assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); - assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); - assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null - && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); - } - } - - @Test - public void testTagLocationAndPartitionPathUpdate() throws Exception { - final String newCommitTime = "001"; - final int numRecords = 10; - final String oldPartitionPath = "1970/01/01"; - final String emptyHoodieRecordPayloadClasssName = EmptyHoodieRecordPayload.class.getName(); - - List newRecords = dataGen.generateInserts(newCommitTime, numRecords); - List oldRecords = new LinkedList(); - for (HoodieRecord newRecord: newRecords) { - HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath); - HoodieRecord hoodieRecord = new HoodieRecord(key, newRecord.getData()); - oldRecords.add(hoodieRecord); - } - - JavaRDD newWriteRecords = jsc().parallelize(newRecords, 1); - JavaRDD oldWriteRecords = jsc().parallelize(oldRecords, 1); - - HoodieWriteConfig config = getConfig(true); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(getConfig(true)); - - try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { - // allowed path change test - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - JavaRDD oldHoodieRecord = index.tagLocation(oldWriteRecords, context, hoodieTable); - assertEquals(0, oldHoodieRecord.filter(record -> record.isCurrentLocationKnown()).count()); - writeClient.startCommitWithTime(newCommitTime); - JavaRDD writeStatues = writeClient.upsert(oldWriteRecords, newCommitTime); - writeClient.commit(newCommitTime, writeStatues); - assertNoWriteErrors(writeStatues.collect()); - index.updateLocation(writeStatues, context, hoodieTable); - - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List taggedRecords = index.tagLocation(newWriteRecords, context, hoodieTable).collect(); - assertEquals(numRecords * 2L, taggedRecords.stream().count()); - // Verify the number of deleted records - assertEquals(numRecords, taggedRecords.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) - && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClasssName)).count()); - // Verify the number of inserted records - 
assertEquals(numRecords, taggedRecords.stream().filter(record -> !record.getKey().getPartitionPath().equals(oldPartitionPath)).count()); - - // not allowed path change test - index = new SparkHoodieHBaseIndex<>(getConfig(false)); - List notAllowPathChangeRecords = index.tagLocation(newWriteRecords, context, hoodieTable).collect(); - assertEquals(numRecords, notAllowPathChangeRecords.stream().count()); - assertEquals(numRecords, taggedRecords.stream().filter(hoodieRecord -> hoodieRecord.isCurrentLocationKnown() - && hoodieRecord.getKey().getPartitionPath().equals(oldPartitionPath)).count()); - } - } - - @Test - public void testTagLocationAndDuplicateUpdate() throws Exception { - final String newCommitTime = "001"; - final int numRecords = 10; - List records = dataGen.generateInserts(newCommitTime, numRecords); - JavaRDD writeRecords = jsc().parallelize(records, 1); - - // Load to memory - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - SparkRDDWriteClient writeClient = getHoodieWriteClient(config); - writeClient.startCommitWithTime(newCommitTime); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - index.tagLocation(writeRecords, context(), hoodieTable); - - // Duplicate upsert and ensure correctness is maintained - // We are trying to approximately imitate the case when the RDD is recomputed. For RDD creating, driver code is not - // recomputed. This includes the state transitions. We need to delete the inflight instance so that subsequent - // upsert will not run into conflicts. - metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight")); - - writeClient.upsert(writeRecords, newCommitTime); - assertNoWriteErrors(writeStatues.collect()); - - // Now commit this & update location of records inserted and validate no errors - writeClient.commit(newCommitTime, writeStatues); - // Now tagLocation for these records, hbaseIndex should tag them correctly - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List taggedRecords = index.tagLocation(writeRecords, context(), hoodieTable).collect(); - assertEquals(numRecords, taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); - assertEquals(numRecords, taggedRecords.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); - assertEquals(numRecords, taggedRecords.stream().filter(record -> (record.getCurrentLocation() != null - && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); - } - - @Test - public void testSimpleTagLocationAndUpdateWithRollback() throws Exception { - // Load to memory - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - SparkRDDWriteClient writeClient = getHoodieWriteClient(config); - - final String newCommitTime = writeClient.startCommit(); - final int numRecords = 10; - List records = dataGen.generateInserts(newCommitTime, numRecords); - JavaRDD writeRecords = jsc().parallelize(records, 1); - metaClient = HoodieTableMetaClient.reload(metaClient); - - // Insert 200 records - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - assertNoWriteErrors(writeStatues.collect()); - - // commit this upsert - writeClient.commit(newCommitTime, writeStatues); - 
HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - // Now tagLocation for these records, hbaseIndex should tag them - List records2 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); - assertEquals(numRecords, records2.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); - - // check tagged records are tagged with correct fileIds - List fileIds = writeStatues.map(WriteStatus::getFileId).collect(); - assertEquals(0, records2.stream().filter(record -> record.getCurrentLocation().getFileId() == null).count()); - List taggedFileIds = records2.stream().map(record -> record.getCurrentLocation().getFileId()).distinct().collect(Collectors.toList()); - - // both lists should match - assertTrue(taggedFileIds.containsAll(fileIds) && fileIds.containsAll(taggedFileIds)); - // Rollback the last commit - writeClient.rollback(newCommitTime); - - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled - // back commit - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); - assertEquals(0, records3.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); - assertEquals(0, records3.stream().filter(record -> record.getCurrentLocation() != null).count()); - } - - @Test - public void testTotalGetsBatching() throws Exception { - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - - // Mock hbaseConnection and related entities - Connection hbaseConnection = mock(Connection.class); - HTable table = mock(HTable.class); - when(hbaseConnection.getTable(TableName.valueOf(TABLE_NAME))).thenReturn(table); - when(table.get((List) any())).thenReturn(new Result[0]); - - // only for test, set the hbaseConnection to mocked object - index.setHbaseConnection(hbaseConnection); - - SparkRDDWriteClient writeClient = getHoodieWriteClient(config); - - // start a commit and generate test data - String newCommitTime = writeClient.startCommit(); - List records = dataGen.generateInserts(newCommitTime, 250); - JavaRDD writeRecords = jsc().parallelize(records, 1); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - // Insert 250 records - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - assertNoWriteErrors(writeStatues.collect()); - - // Now tagLocation for these records, hbaseIndex should tag them - index.tagLocation(writeRecords, context(), hoodieTable); - - // 3 batches should be executed given batchSize = 100 and parallelism = 1 - verify(table, times(3)).get((List) any()); - - } - - @Test - public void testTotalPutsBatching() throws Exception { - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - SparkRDDWriteClient writeClient = getHoodieWriteClient(config); - - // start a commit and generate test data - String newCommitTime = writeClient.startCommit(); - List records = dataGen.generateInserts(newCommitTime, 250); - JavaRDD writeRecords = jsc().parallelize(records, 1); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - // Insert 200 records - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - - // commit this upsert - writeClient.commit(newCommitTime, writeStatues); - - // Mock hbaseConnection 
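// Worked arithmetic behind the batching assertion below: with hbaseIndexGetBatchSize = 100 (see
// getConfigBuilder) and 250 records tagged with parallelism 1, the index issues
// ceil(250 / 100) = 3 HTable.get(List<Get>) calls, hence verify(table, times(3)).get(...).
// A hypothetical helper making that expectation explicit:
static int expectedGetBatches(int numRecords, int batchSize) {
  return (numRecords + batchSize - 1) / batchSize; // ceiling division: (250, 100) -> 3
}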
and related entities - Connection hbaseConnection = mock(Connection.class); - HTable table = mock(HTable.class); - when(hbaseConnection.getTable(TableName.valueOf(TABLE_NAME))).thenReturn(table); - when(table.get((List) any())).thenReturn(new Result[0]); - - // only for test, set the hbaseConnection to mocked object - index.setHbaseConnection(hbaseConnection); - - // Get all the files generated - int numberOfDataFileIds = (int) writeStatues.map(status -> status.getFileId()).distinct().count(); - - index.updateLocation(writeStatues, context(), hoodieTable); - // 3 batches should be executed given batchSize = 100 and <=numberOfDataFileIds getting updated, - // so each fileId ideally gets updates - verify(table, atMost(numberOfDataFileIds)).put((List) any()); - } - - @Test - public void testsHBasePutAccessParallelism() { - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - final JavaRDD writeStatusRDD = jsc().parallelize( - Arrays.asList( - getSampleWriteStatus(0, 2), - getSampleWriteStatus(2, 3), - getSampleWriteStatus(4, 3), - getSampleWriteStatus(6, 3), - getSampleWriteStatus(8, 0)), - 10); - final Tuple2 tuple = index.getHBasePutAccessParallelism(writeStatusRDD); - final int hbasePutAccessParallelism = Integer.parseInt(tuple._2.toString()); - final int hbaseNumPuts = Integer.parseInt(tuple._1.toString()); - assertEquals(10, writeStatusRDD.getNumPartitions()); - assertEquals(4, hbasePutAccessParallelism); - assertEquals(20, hbaseNumPuts); - } - - @Test - public void testsWriteStatusPartitioner() { - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - int parallelism = 4; - final JavaRDD writeStatusRDD = jsc().parallelize( - Arrays.asList( - getSampleWriteStatusWithFileId(0, 2), - getSampleWriteStatusWithFileId(2, 3), - getSampleWriteStatusWithFileId(4, 3), - getSampleWriteStatusWithFileId(0, 3), - getSampleWriteStatusWithFileId(11, 0)), parallelism); - - final Map fileIdPartitionMap = index.mapFileWithInsertsToUniquePartition(writeStatusRDD); - int numWriteStatusWithInserts = (int) index.getHBasePutAccessParallelism(writeStatusRDD)._2; - JavaRDD partitionedRDD = writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) - .partitionBy(new SparkHoodieHBaseIndex - .WriteStatusPartitioner(fileIdPartitionMap, - numWriteStatusWithInserts)).map(w -> w._2()); - assertEquals(numWriteStatusWithInserts, partitionedRDD.getNumPartitions()); - int[] partitionIndexesBeforeRepartition = writeStatusRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); - assertEquals(parallelism, partitionIndexesBeforeRepartition.length); - - int[] partitionIndexesAfterRepartition = partitionedRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); - // there should be 3 partitions after repartition, because only 3 writestatus has - // inserts (numWriteStatusWithInserts) - assertEquals(numWriteStatusWithInserts, partitionIndexesAfterRepartition.length); - - List[] writeStatuses = partitionedRDD.collectPartitions(partitionIndexesAfterRepartition); - for (List list : writeStatuses) { - int count = 0; - for (WriteStatus w: list) { - if (w.getStat().getNumInserts() > 0) { - count++; - } - } - assertEquals(1, count); - } - } - - @Test - public void testsWriteStatusPartitionerWithNoInserts() { - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - int parallelism = 3; - final JavaRDD writeStatusRDD = jsc().parallelize( - Arrays.asList( - 
getSampleWriteStatusWithFileId(0, 2), - getSampleWriteStatusWithFileId(0, 3), - getSampleWriteStatusWithFileId(0, 0)), parallelism); - - final Map fileIdPartitionMap = index.mapFileWithInsertsToUniquePartition(writeStatusRDD); - int numWriteStatusWithInserts = (int) index.getHBasePutAccessParallelism(writeStatusRDD)._2; - JavaRDD partitionedRDD = writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) - .partitionBy(new SparkHoodieHBaseIndex - .WriteStatusPartitioner(fileIdPartitionMap, - numWriteStatusWithInserts)).map(w -> w._2()); - assertEquals(numWriteStatusWithInserts, partitionedRDD.getNumPartitions()); - int[] partitionIndexesBeforeRepartition = writeStatusRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); - assertEquals(parallelism, partitionIndexesBeforeRepartition.length); - - int[] partitionIndexesAfterRepartition = partitionedRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); - // there should be 3 partitions after repartition, because only 3 writestatus has inserts - // (numWriteStatusWithInserts) - assertEquals(numWriteStatusWithInserts, partitionIndexesAfterRepartition.length); - assertEquals(partitionIndexesBeforeRepartition.length, parallelism); - - } - - private WriteStatus getSampleWriteStatusWithFileId(final int numInserts, final int numUpdateWrites) { - final WriteStatus writeStatus = new WriteStatus(false, 0.0); - HoodieWriteStat hoodieWriteStat = new HoodieWriteStat(); - hoodieWriteStat.setNumInserts(numInserts); - hoodieWriteStat.setNumUpdateWrites(numUpdateWrites); - writeStatus.setStat(hoodieWriteStat); - writeStatus.setFileId(UUID.randomUUID().toString()); - return writeStatus; - } - - @Test - public void testsHBasePutAccessParallelismWithNoInserts() { - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - final JavaRDD writeStatusRDD = - jsc().parallelize(Arrays.asList(getSampleWriteStatus(0, 2), getSampleWriteStatus(0, 1)), 10); - final Tuple2 tuple = index.getHBasePutAccessParallelism(writeStatusRDD); - final int hbasePutAccessParallelism = Integer.parseInt(tuple._2.toString()); - final int hbaseNumPuts = Integer.parseInt(tuple._1.toString()); - assertEquals(10, writeStatusRDD.getNumPartitions()); - assertEquals(0, hbasePutAccessParallelism); - assertEquals(0, hbaseNumPuts); - } - - @Test - public void testSmallBatchSize() throws Exception { - final String newCommitTime = "001"; - final int numRecords = 10; - List records = dataGen.generateInserts(newCommitTime, numRecords); - JavaRDD writeRecords = jsc().parallelize(records, 1); - - // Load to memory - HoodieWriteConfig config = getConfig(2); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - // Test tagLocation without any entries in index - JavaRDD records1 = index.tagLocation(writeRecords, context(), hoodieTable); - assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); - // Insert 200 records - writeClient.startCommitWithTime(newCommitTime); - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - assertNoWriteErrors(writeStatues.collect()); - - // Now tagLocation for these records, hbaseIndex should not tag them since it was a failed - // commit - JavaRDD records2 = index.tagLocation(writeRecords, context(), hoodieTable); - assertEquals(0, 
records2.filter(record -> record.isCurrentLocationKnown()).count()); - - // Now commit this & update location of records inserted and validate no errors - writeClient.commit(newCommitTime, writeStatues); - // Now tagLocation for these records, hbaseIndex should tag them correctly - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); - assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); - assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); - assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null - && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); - } - } - - @Test - public void testDelete() throws Exception { - final String newCommitTime = "001"; - final int numRecords = 10; - List records = dataGen.generateInserts(newCommitTime, numRecords); - JavaRDD writeRecords = jsc().parallelize(records, 1); - - // Load to memory - HoodieWriteConfig config = getConfig(); - SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); - try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - - // Test tagLocation without any entries in index - JavaRDD records1 = index.tagLocation(writeRecords, context(), hoodieTable); - assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); - - // Insert records - writeClient.startCommitWithTime(newCommitTime); - JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); - assertNoWriteErrors(writeStatues.collect()); - writeClient.commit(newCommitTime, writeStatues); - - // Now tagLocation for these records, hbaseIndex should tag them correctly - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(config, context, metaClient); - List records2 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); - assertEquals(numRecords, records2.stream().filter(record -> record.isCurrentLocationKnown()).count()); - assertEquals(numRecords, records2.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); - assertEquals(numRecords, records2.stream().filter(record -> (record.getCurrentLocation() != null - && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); - - // Delete all records. 
This has to be done directly as deleting index entries - // is not implemented via HoodieWriteClient - JavaRDD deleteWriteStatues = writeStatues.map(w -> { - WriteStatus newWriteStatus = new WriteStatus(true, 1.0); - w.getWrittenRecords().forEach(r -> newWriteStatus.markSuccess(new HoodieRecord(r.getKey(), null), Option.empty())); - assertEquals(w.getTotalRecords(), newWriteStatus.getTotalRecords()); - newWriteStatus.setStat(new HoodieWriteStat()); - return newWriteStatus; - }); - JavaRDD deleteStatus = index.updateLocation(deleteWriteStatues, context(), hoodieTable); - assertEquals(deleteStatus.count(), deleteWriteStatues.count()); - assertNoWriteErrors(deleteStatus.collect()); - - // Ensure no records can be tagged - List records3 = index.tagLocation(writeRecords, context(), hoodieTable).collect(); - assertEquals(0, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); - assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); - assertEquals(0, records3.stream().filter(record -> (record.getCurrentLocation() != null - && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); - } - } - - private WriteStatus getSampleWriteStatus(final int numInserts, final int numUpdateWrites) { - final WriteStatus writeStatus = new WriteStatus(false, 0.1); - HoodieWriteStat hoodieWriteStat = new HoodieWriteStat(); - hoodieWriteStat.setNumInserts(numInserts); - hoodieWriteStat.setNumUpdateWrites(numUpdateWrites); - writeStatus.setStat(hoodieWriteStat); - return writeStatus; - } - - private HoodieWriteConfig getConfig() { - return getConfigBuilder(100, false).build(); - } - - private HoodieWriteConfig getConfig(int hbaseIndexBatchSize) { - return getConfigBuilder(hbaseIndexBatchSize, false).build(); - } - - private HoodieWriteConfig getConfig(boolean updatePartitionPath) { - return getConfigBuilder(100, updatePartitionPath).build(); - } - - private HoodieWriteConfig.Builder getConfigBuilder(int hbaseIndexBatchSize, boolean updatePartitionPath) { - return HoodieWriteConfig.newBuilder().withPath(basePath()).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(1, 1).withDeleteParallelism(1) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) - .withInlineCompaction(false).build()) - .withAutoCommit(false).withStorageConfig(HoodieStorageConfig.newBuilder() - .hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) - .forTable("test-trip-table") - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE) - .withHBaseIndexConfig(new HoodieHBaseIndexConfig.Builder() - .hbaseZkPort(Integer.parseInt(hbaseConfig.get("hbase.zookeeper.property.clientPort"))) - .hbaseIndexPutBatchSizeAutoCompute(true) - .hbaseZkZnodeParent(hbaseConfig.get("zookeeper.znode.parent", "")) - .hbaseZkQuorum(hbaseConfig.get("hbase.zookeeper.quorum")).hbaseTableName(TABLE_NAME) - .hbaseIndexUpdatePartitionPath(updatePartitionPath) - .hbaseIndexGetBatchSize(hbaseIndexBatchSize).build()) - .build()); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java index e9f8b87a77c65..be80e9d378331 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBaseQPSResourceAllocator.java @@ -46,7 +46,7 @@ public void testsDefaultQPSResourceAllocator() { @Test public void testsExplicitDefaultQPSResourceAllocator() { - HoodieWriteConfig config = getConfig(Option.of(HoodieHBaseIndexConfig.DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS)); + HoodieWriteConfig config = getConfig(Option.of(HoodieHBaseIndexConfig.QPS_ALLOCATOR_CLASS_NAME.defaultValue())); SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = index.createQPSResourceAllocator(config); assertEquals(hBaseIndexQPSResourceAllocator.getClass().getName(), diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java new file mode 100644 index 0000000000000..f22a067ad81e8 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java @@ -0,0 +1,831 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
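// Editorial note with a sketch: the TestHBaseQPSResourceAllocator change above swaps the removed
// DEFAULT_HBASE_INDEX_QPS_ALLOCATOR_CLASS constant for QPS_ALLOCATOR_CLASS_NAME.defaultValue(),
// i.e. the default is now read from a config-property object exposing key() and defaultValue().
// A minimal, hypothetical model of that shape (not Hudi's actual ConfigProperty class):
final class ConfigOptionSketch<T> {
  private final String key;
  private final T defaultValue;
  ConfigOptionSketch(String key, T defaultValue) { this.key = key; this.defaultValue = defaultValue; }
  String key() { return key; }
  T defaultValue() { return defaultValue; }
}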
+ */ + +package org.apache.hudi.index.hbase; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieHBaseIndexConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; + +import scala.Tuple2; + +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.atMost; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_CLIENT_PORT; +import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT; +import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_QUORUM; + +/** + * Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown across tests, + * (see one problem here : https://issues.apache.org/jira/browse/HBASE-15835). 
Hence, the need to use + * {@link MethodOrderer.Alphanumeric} to make sure the tests run in order. Please alter the order of tests running carefully. + */ +@TestMethodOrder(MethodOrderer.Alphanumeric.class) +@Tag("functional") +public class TestSparkHoodieHBaseIndex extends SparkClientFunctionalTestHarness { + + private static final String TABLE_NAME = "test_table"; + private static HBaseTestingUtility utility; + private static Configuration hbaseConfig; + + private Configuration hadoopConf; + private HoodieTestDataGenerator dataGen; + private HoodieTableMetaClient metaClient; + private HoodieSparkEngineContext context; + private String basePath; + + @BeforeAll + public static void init() throws Exception { + // Initialize HbaseMiniCluster + hbaseConfig = HBaseConfiguration.create(); + hbaseConfig.set(ZOOKEEPER_ZNODE_PARENT, "/hudi-hbase-test"); + + utility = new HBaseTestingUtility(hbaseConfig); + utility.startMiniCluster(); + hbaseConfig = utility.getConnection().getConfiguration(); + utility.createTable(TableName.valueOf(TABLE_NAME), Bytes.toBytes("_s"),2); + } + + @AfterAll + public static void clean() throws Exception { + utility.shutdownMiniHBaseCluster(); + utility.shutdownMiniDFSCluster(); + utility.shutdownMiniMapReduceCluster(); + // skip shutdownZkCluster due to localhost connection refused issue + utility = null; + } + + @BeforeEach + public void setUp() throws Exception { + hadoopConf = jsc().hadoopConfiguration(); + hadoopConf.addResource(utility.getConfiguration()); + // reInit the context here to keep the hadoopConf the same with that in this class + context = new HoodieSparkEngineContext(jsc()); + basePath = utility.getDataTestDirOnTestFS(TABLE_NAME).toString(); + metaClient = getHoodieMetaClient(hadoopConf, basePath); + dataGen = new HoodieTestDataGenerator(); + } + + @AfterEach + public void cleanUpTableData() throws IOException { + utility.cleanupDataTestDirOnTestFS(TABLE_NAME); + } + + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testSimpleTagLocationAndUpdate(HoodieTableType tableType) throws Exception { + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); + + final String newCommitTime = "001"; + final int numRecords = 10; + List records = dataGen.generateInserts(newCommitTime, numRecords); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + // Load to memory + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Test tagLocation without any entries in index + JavaRDD records1 = tagLocation(index, writeRecords, hoodieTable); + assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); + + // Insert 200 records + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + assertNoWriteErrors(writeStatues.collect()); + + // Now tagLocation for these records, hbaseIndex should not tag them since commit never occurred + JavaRDD records2 = tagLocation(index, writeRecords, hoodieTable); + assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count()); + + // Now commit this & update location of records inserted and validate no errors + writeClient.commit(newCommitTime, writeStatues); + // Now tagLocation for these records, hbaseIndex should tag 
them correctly + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); + assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); + assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + } + } + + @Test + public void testTagLocationAndPartitionPathUpdate() throws Exception { + final String newCommitTime = "001"; + final int numRecords = 10; + final String oldPartitionPath = "1970/01/01"; + final String emptyHoodieRecordPayloadClassName = EmptyHoodieRecordPayload.class.getName(); + + List newRecords = dataGen.generateInserts(newCommitTime, numRecords); + List oldRecords = new LinkedList(); + for (HoodieRecord newRecord: newRecords) { + HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath); + HoodieRecord hoodieRecord = new HoodieAvroRecord(key, (HoodieRecordPayload) newRecord.getData()); + oldRecords.add(hoodieRecord); + } + + JavaRDD newWriteRecords = jsc().parallelize(newRecords, 1); + JavaRDD oldWriteRecords = jsc().parallelize(oldRecords, 1); + + HoodieWriteConfig config = getConfig(true, false); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(getConfig(true, false)); + + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { + // allowed path change test + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + JavaRDD oldHoodieRecord = tagLocation(index, oldWriteRecords, hoodieTable); + assertEquals(0, oldHoodieRecord.filter(record -> record.isCurrentLocationKnown()).count()); + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatues = writeClient.upsert(oldWriteRecords, newCommitTime); + writeClient.commit(newCommitTime, writeStatues); + assertNoWriteErrors(writeStatues.collect()); + updateLocation(index, writeStatues, hoodieTable); + + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + List taggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); + assertEquals(numRecords * 2L, taggedRecords.stream().count()); + // Verify the number of deleted records + assertEquals(numRecords, taggedRecords.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) + && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClassName)).count()); + // Verify the number of inserted records + assertEquals(numRecords, taggedRecords.stream().filter(record -> !record.getKey().getPartitionPath().equals(oldPartitionPath)).count()); + + // not allowed path change test + index = new SparkHoodieHBaseIndex(getConfig(false, false)); + List notAllowPathChangeRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); + assertEquals(numRecords, notAllowPathChangeRecords.stream().count()); + assertEquals(numRecords, taggedRecords.stream().filter(hoodieRecord -> hoodieRecord.isCurrentLocationKnown() + && hoodieRecord.getKey().getPartitionPath().equals(oldPartitionPath)).count()); + } + } + + @Test + public void testTagLocationAndDuplicateUpdate() throws Exception { + final String newCommitTime 
= "001"; + final int numRecords = 10; + List records = dataGen.generateInserts(newCommitTime, numRecords); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + // Load to memory + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + writeClient.startCommitWithTime(newCommitTime); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + tagLocation(index, writeRecords, hoodieTable); + + // Duplicate upsert and ensure correctness is maintained + // We are trying to approximately imitate the case when the RDD is recomputed. For RDD creating, driver code is not + // recomputed. This includes the state transitions. We need to delete the inflight instance so that subsequent + // upsert will not run into conflicts. + metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight")); + + writeClient.upsert(writeRecords, newCommitTime); + assertNoWriteErrors(writeStatues.collect()); + + // Now commit this & update location of records inserted and validate no errors + writeClient.commit(newCommitTime, writeStatues); + // Now tagLocation for these records, hbaseIndex should tag them correctly + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + List taggedRecords = tagLocation(index, writeRecords, hoodieTable).collect(); + assertEquals(numRecords, taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); + assertEquals(numRecords, taggedRecords.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(numRecords, taggedRecords.stream().filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + } + + @Test + public void testTagLocationAndPartitionPathUpdateWithExplicitRollback() throws Exception { + final int numRecords = 10; + final String oldPartitionPath = "1970/01/01"; + final String emptyHoodieRecordPayloadClasssName = EmptyHoodieRecordPayload.class.getName(); + HoodieWriteConfig config = getConfigBuilder(100, true, true).withRollbackUsingMarkers(false).build(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { + final String firstCommitTime = writeClient.startCommit(); + List newRecords = dataGen.generateInserts(firstCommitTime, numRecords); + List oldRecords = new LinkedList(); + for (HoodieRecord newRecord: newRecords) { + HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath); + HoodieRecord hoodieRecord = new HoodieAvroRecord(key, (HoodieRecordPayload) newRecord.getData()); + oldRecords.add(hoodieRecord); + } + JavaRDD newWriteRecords = jsc().parallelize(newRecords, 1); + JavaRDD oldWriteRecords = jsc().parallelize(oldRecords, 1); + // first commit old record + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + List beforeFirstTaggedRecords = tagLocation(index, oldWriteRecords, hoodieTable).collect(); + JavaRDD oldWriteStatues = writeClient.upsert(oldWriteRecords, firstCommitTime); + updateLocation(index, oldWriteStatues, hoodieTable); + writeClient.commit(firstCommitTime, 
oldWriteStatues); + List afterFirstTaggedRecords = tagLocation(index, oldWriteRecords, hoodieTable).collect(); + + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + final String secondCommitTime = writeClient.startCommit(); + List beforeSecondTaggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); + JavaRDD newWriteStatues = writeClient.upsert(newWriteRecords, secondCommitTime); + updateLocation(index, newWriteStatues, hoodieTable); + writeClient.commit(secondCommitTime, newWriteStatues); + List afterSecondTaggedRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); + writeClient.rollback(secondCommitTime); + List afterRollback = tagLocation(index, newWriteRecords, hoodieTable).collect(); + + // Verify the first commit + assertEquals(numRecords, beforeFirstTaggedRecords.stream().filter(record -> record.getCurrentLocation() == null).count()); + assertEquals(numRecords, afterFirstTaggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); + // Verify the second commit + assertEquals(numRecords, beforeSecondTaggedRecords.stream() + .filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) + && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClasssName)).count()); + assertEquals(numRecords * 2, beforeSecondTaggedRecords.stream().count()); + assertEquals(numRecords, afterSecondTaggedRecords.stream().count()); + assertEquals(numRecords, afterSecondTaggedRecords.stream().filter(record -> !record.getKey().getPartitionPath().equals(oldPartitionPath)).count()); + // Verify the rollback + // If an exception occurs after hbase writes the index and the index does not roll back, + // the currentLocation information will not be returned. 
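+ // Since this test builds its config with getConfigBuilder(100, true, true), i.e. hbaseIndexRollbackSync enabled,
+ // rolling back the second commit is expected to restore the index entries to the first commit's partition path
+ // and instant time, which the assertions below verify.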
+ assertEquals(numRecords, afterRollback.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) + && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClasssName)).count()); + assertEquals(numRecords * 2, beforeSecondTaggedRecords.stream().count()); + assertEquals(numRecords, afterRollback.stream().filter(HoodieRecord::isCurrentLocationKnown) + .filter(record -> record.getCurrentLocation().getInstantTime().equals(firstCommitTime)).count()); + } + } + + @Test + public void testSimpleTagLocationAndUpdateWithRollback() throws Exception { + // Load to memory + HoodieWriteConfig config = getConfigBuilder(100, false, false) + .withRollbackUsingMarkers(false).build(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + + final String newCommitTime = writeClient.startCommit(); + final int numRecords = 10; + List records = dataGen.generateInserts(newCommitTime, numRecords); + JavaRDD writeRecords = jsc().parallelize(records, 1); + metaClient = HoodieTableMetaClient.reload(metaClient); + + // Insert 200 records + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + assertNoWriteErrors(writeStatues.collect()); + + // commit this upsert + writeClient.commit(newCommitTime, writeStatues); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + // Now tagLocation for these records, hbaseIndex should tag them + List records2 = tagLocation(index, writeRecords, hoodieTable).collect(); + assertEquals(numRecords, records2.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); + + // check tagged records are tagged with correct fileIds + List fileIds = writeStatues.map(WriteStatus::getFileId).collect(); + assertEquals(0, records2.stream().filter(record -> record.getCurrentLocation().getFileId() == null).count()); + List taggedFileIds = records2.stream().map(record -> record.getCurrentLocation().getFileId()).distinct().collect(Collectors.toList()); + + // both lists should match + assertTrue(taggedFileIds.containsAll(fileIds) && fileIds.containsAll(taggedFileIds)); + // Rollback the last commit + writeClient.rollback(newCommitTime); + + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled + // back commit + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); + assertEquals(0, records3.stream().filter(HoodieRecord::isCurrentLocationKnown).count()); + assertEquals(0, records3.stream().filter(record -> record.getCurrentLocation() != null).count()); + } + + /* + * Test case to verify that for taglocation entries present in HBase, if the corresponding commit instant is missing + * in timeline and the commit is not archived, taglocation would reset the current record location to null. 
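+ * The flow below exercises this: a single record is committed, the commit is rolled back so only the stale HBase
+ * entry remains, and tagLocation is then expected to leave that record untagged while the valid records stay tagged.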
+ */ + @Test + public void testSimpleTagLocationWithInvalidCommit() throws Exception { + // Load to memory + HoodieWriteConfig config = getConfigBuilder(100, false, false).withRollbackUsingMarkers(false).build(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + + String newCommitTime = writeClient.startCommit(); + // make a commit with 199 records + JavaRDD writeRecords = generateAndCommitRecords(writeClient, 199, newCommitTime); + + // make a second commit with a single record + String invalidCommit = writeClient.startCommit(); + JavaRDD invalidWriteRecords = generateAndCommitRecords(writeClient, 1, invalidCommit); + + // verify location is tagged. + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + JavaRDD javaRDD0 = tagLocation(index, invalidWriteRecords, hoodieTable); + assert (javaRDD0.collect().size() == 1); // one record present + assert (javaRDD0.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 1); // it is tagged + assert (javaRDD0.collect().get(0).getCurrentLocation().getInstantTime().equals(invalidCommit)); + + // rollback the invalid commit, so that hbase will be left with a stale entry. + writeClient.rollback(invalidCommit); + + // Now tagLocation for the valid records, hbaseIndex should tag them + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + JavaRDD javaRDD1 = tagLocation(index, writeRecords, hoodieTable); + assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 199); + + // tagLocation for the invalid record - commit is not present in timeline due to rollback. + JavaRDD javaRDD2 = tagLocation(index, invalidWriteRecords, hoodieTable); + assert (javaRDD2.collect().size() == 1); // one record present + assert (javaRDD2.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0); // it is not tagged + } + + /* + * Test case to verify that taglocation() uses the commit timeline to validate the commitTS stored in hbase. + * When CheckIfValidCommit() in HbaseIndex uses the incorrect timeline filtering, this test would fail. + */ + @Test + public void testEnsureTagLocationUsesCommitTimeline() throws Exception { + // Load to memory + HoodieWriteConfig config = getConfigBuilder(100, false, false) + .withRollbackUsingMarkers(false).build(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + + String commitTime1 = writeClient.startCommit(); + JavaRDD writeRecords1 = generateAndCommitRecords(writeClient, 20, commitTime1); + + // rollback the commit - leaves a clean file in timeline. 
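+ // Note: even though commitTime1 is rolled back, tagLocation below is still expected to tag all 20 of its records,
+ // presumably because commitTime1 then falls before the start of the active commit timeline and is treated like an
+ // archived commit (see testHbaseTagLocationForArchivedCommits).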
+ writeClient.rollback(commitTime1); + + // create a second commit with 20 records + metaClient = HoodieTableMetaClient.reload(metaClient); + generateAndCommitRecords(writeClient, 20); + + // Now tagLocation for the first set of rolledback records, hbaseIndex should tag them + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + JavaRDD javaRDD1 = tagLocation(index, writeRecords1, hoodieTable); + assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 20); + } + + private JavaRDD generateAndCommitRecords(SparkRDDWriteClient writeClient, int numRecs) throws Exception { + String commitTime = writeClient.startCommit(); + return generateAndCommitRecords(writeClient, numRecs, commitTime); + } + + private JavaRDD generateAndCommitRecords(SparkRDDWriteClient writeClient, + int numRecs, String commitTime) throws Exception { + // first batch of records + List records = dataGen.generateInserts(commitTime, numRecs); + JavaRDD writeRecords = jsc().parallelize(records, 1); + metaClient = HoodieTableMetaClient.reload(metaClient); + + // Insert records + JavaRDD writeStatues = writeClient.upsert(writeRecords, commitTime); + assertNoWriteErrors(writeStatues.collect()); + + // commit this upsert + writeClient.commit(commitTime, writeStatues); + + return writeRecords; + } + + // Verify hbase is tagging records belonging to an archived commit as valid. + @Test + public void testHbaseTagLocationForArchivedCommits() throws Exception { + // Load to memory + Map params = new HashMap(); + params.put(HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key(), "1"); + params.put(HoodieArchivalConfig.MAX_COMMITS_TO_KEEP.key(), "3"); + params.put(HoodieArchivalConfig.MIN_COMMITS_TO_KEEP.key(), "2"); + HoodieWriteConfig config = getConfigBuilder(100, false, false).withProps(params).build(); + + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + + // make first commit with 20 records + JavaRDD writeRecords1 = generateAndCommitRecords(writeClient, 20); + + // Make 3 additional commits, so that first commit is archived + for (int nCommit = 0; nCommit < 3; nCommit++) { + generateAndCommitRecords(writeClient, 20); + } + + // tagLocation for the first set of records (for the archived commit), hbaseIndex should tag them as valid + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + JavaRDD javaRDD1 = tagLocation(index, writeRecords1, hoodieTable); + assertEquals(20, javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size()); + } + + @Test + public void testTotalGetsBatching() throws Exception { + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + + // Mock hbaseConnection and related entities + Connection hbaseConnection = mock(Connection.class); + HTable table = mock(HTable.class); + when(hbaseConnection.getTable(TableName.valueOf(TABLE_NAME))).thenReturn(table); + when(table.get((List) any())).thenReturn(new Result[0]); + + // only for test, set the hbaseConnection to mocked object + index.setHbaseConnection(hbaseConnection); + + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + + // start a commit and generate test data + String newCommitTime = writeClient.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 250); + JavaRDD writeRecords = 
jsc().parallelize(records, 1); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Insert 250 records + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + assertNoWriteErrors(writeStatues.collect()); + + // Now tagLocation for these records, hbaseIndex should tag them + tagLocation(index, writeRecords, hoodieTable); + + // 3 batches should be executed given batchSize = 100 and parallelism = 1 + verify(table, times(3)).get((List) any()); + + } + + @Test + public void testTotalPutsBatching() throws Exception { + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + + // start a commit and generate test data + String newCommitTime = writeClient.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 250); + JavaRDD writeRecords = jsc().parallelize(records, 1); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Insert 200 records + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + + // commit this upsert + writeClient.commit(newCommitTime, writeStatues); + + // Mock hbaseConnection and related entities + Connection hbaseConnection = mock(Connection.class); + HTable table = mock(HTable.class); + when(hbaseConnection.getTable(TableName.valueOf(TABLE_NAME))).thenReturn(table); + when(table.get((List) any())).thenReturn(new Result[0]); + + // only for test, set the hbaseConnection to mocked object + index.setHbaseConnection(hbaseConnection); + + // Get all the files generated + int numberOfDataFileIds = (int) writeStatues.map(status -> status.getFileId()).distinct().count(); + + updateLocation(index, writeStatues, hoodieTable); + // 3 batches should be executed given batchSize = 100 and <=numberOfDataFileIds getting updated, + // so each fileId ideally gets updates + verify(table, atMost(numberOfDataFileIds)).put((List) any()); + } + + @Test + public void testsHBasePutAccessParallelism() { + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + final JavaRDD writeStatusRDD = jsc().parallelize( + Arrays.asList( + getSampleWriteStatus(0, 2), + getSampleWriteStatus(2, 3), + getSampleWriteStatus(4, 3), + getSampleWriteStatus(6, 3), + getSampleWriteStatus(8, 0)), + 10); + final Tuple2 tuple = index.getHBasePutAccessParallelism(writeStatusRDD); + final int hbasePutAccessParallelism = Integer.parseInt(tuple._2.toString()); + final int hbaseNumPuts = Integer.parseInt(tuple._1.toString()); + assertEquals(10, writeStatusRDD.getNumPartitions()); + assertEquals(4, hbasePutAccessParallelism); + assertEquals(20, hbaseNumPuts); + } + + @Test + public void testsWriteStatusPartitioner() { + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + int parallelism = 4; + final JavaRDD writeStatusRDD = jsc().parallelize( + Arrays.asList( + getSampleWriteStatusWithFileId(0, 2), + getSampleWriteStatusWithFileId(2, 3), + getSampleWriteStatusWithFileId(4, 3), + getSampleWriteStatusWithFileId(0, 3), + getSampleWriteStatusWithFileId(11, 0)), parallelism); + + final Map fileIdPartitionMap = index.mapFileWithInsertsToUniquePartition(writeStatusRDD); + int numWriteStatusWithInserts = (int) index.getHBasePutAccessParallelism(writeStatusRDD)._2; + JavaRDD 
partitionedRDD = writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) + .partitionBy(new SparkHoodieHBaseIndex + .WriteStatusPartitioner(fileIdPartitionMap, + numWriteStatusWithInserts)).map(w -> w._2()); + assertEquals(numWriteStatusWithInserts, partitionedRDD.getNumPartitions()); + int[] partitionIndexesBeforeRepartition = writeStatusRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); + assertEquals(parallelism, partitionIndexesBeforeRepartition.length); + + int[] partitionIndexesAfterRepartition = partitionedRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); + // there should be 3 partitions after repartition, because only 3 writestatus has + // inserts (numWriteStatusWithInserts) + assertEquals(numWriteStatusWithInserts, partitionIndexesAfterRepartition.length); + + List[] writeStatuses = partitionedRDD.collectPartitions(partitionIndexesAfterRepartition); + for (List list : writeStatuses) { + int count = 0; + for (WriteStatus w: list) { + if (w.getStat().getNumInserts() > 0) { + count++; + } + } + assertEquals(1, count); + } + } + + @Test + public void testsWriteStatusPartitionerWithNoInserts() { + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + int parallelism = 3; + final JavaRDD writeStatusRDD = jsc().parallelize( + Arrays.asList( + getSampleWriteStatusWithFileId(0, 2), + getSampleWriteStatusWithFileId(0, 3), + getSampleWriteStatusWithFileId(0, 0)), parallelism); + + final Map fileIdPartitionMap = index.mapFileWithInsertsToUniquePartition(writeStatusRDD); + int numWriteStatusWithInserts = (int) index.getHBasePutAccessParallelism(writeStatusRDD)._2; + JavaRDD partitionedRDD = writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w)) + .partitionBy(new SparkHoodieHBaseIndex + .WriteStatusPartitioner(fileIdPartitionMap, + numWriteStatusWithInserts)).map(w -> w._2()); + assertEquals(numWriteStatusWithInserts, partitionedRDD.getNumPartitions()); + int[] partitionIndexesBeforeRepartition = writeStatusRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); + assertEquals(parallelism, partitionIndexesBeforeRepartition.length); + + int[] partitionIndexesAfterRepartition = partitionedRDD.partitions().stream().mapToInt(p -> p.index()).toArray(); + // there should be 3 partitions after repartition, because only 3 writestatus has inserts + // (numWriteStatusWithInserts) + assertEquals(numWriteStatusWithInserts, partitionIndexesAfterRepartition.length); + assertEquals(partitionIndexesBeforeRepartition.length, parallelism); + + } + + private WriteStatus getSampleWriteStatusWithFileId(final int numInserts, final int numUpdateWrites) { + final WriteStatus writeStatus = new WriteStatus(false, 0.0); + HoodieWriteStat hoodieWriteStat = new HoodieWriteStat(); + hoodieWriteStat.setNumInserts(numInserts); + hoodieWriteStat.setNumUpdateWrites(numUpdateWrites); + writeStatus.setStat(hoodieWriteStat); + writeStatus.setFileId(UUID.randomUUID().toString()); + return writeStatus; + } + + @Test + public void testsHBasePutAccessParallelismWithNoInserts() { + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + final JavaRDD writeStatusRDD = + jsc().parallelize(Arrays.asList(getSampleWriteStatus(0, 2), getSampleWriteStatus(0, 1)), 10); + final Tuple2 tuple = index.getHBasePutAccessParallelism(writeStatusRDD); + final int hbasePutAccessParallelism = Integer.parseInt(tuple._2.toString()); + final int hbaseNumPuts = Integer.parseInt(tuple._1.toString()); + 
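+ // Neither sample write status carries any inserts, so both the total number of puts and the put access
+ // parallelism are expected to be zero.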
assertEquals(10, writeStatusRDD.getNumPartitions()); + assertEquals(0, hbasePutAccessParallelism); + assertEquals(0, hbaseNumPuts); + } + + @Test + public void testSmallBatchSize() throws Exception { + final String newCommitTime = "001"; + final int numRecords = 10; + List records = dataGen.generateInserts(newCommitTime, numRecords); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + // Load to memory + HoodieWriteConfig config = getConfig(2); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Test tagLocation without any entries in index + JavaRDD records1 = tagLocation(index, writeRecords, hoodieTable); + assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); + // Insert 200 records + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + assertNoWriteErrors(writeStatues.collect()); + + // Now tagLocation for these records, hbaseIndex should not tag them since it was a failed + // commit + JavaRDD records2 = tagLocation(index, writeRecords, hoodieTable); + assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count()); + + // Now commit this & update location of records inserted and validate no errors + writeClient.commit(newCommitTime, writeStatues); + // Now tagLocation for these records, hbaseIndex should tag them correctly + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); + assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); + assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + } + } + + @Test + public void testDelete() throws Exception { + final String newCommitTime = "001"; + final int numRecords = 10; + List records = dataGen.generateInserts(newCommitTime, numRecords); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + // Load to memory + HoodieWriteConfig config = getConfig(); + SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); + + // Test tagLocation without any entries in index + JavaRDD records1 = tagLocation(index, writeRecords, hoodieTable); + assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count()); + + // Insert records + writeClient.startCommitWithTime(newCommitTime); + JavaRDD writeStatues = writeClient.upsert(writeRecords, newCommitTime); + assertNoWriteErrors(writeStatues.collect()); + writeClient.commit(newCommitTime, writeStatues); + + // Now tagLocation for these records, hbaseIndex should tag them correctly + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(config, context, metaClient); + List records2 = tagLocation(index, writeRecords, 
hoodieTable).collect(); + assertEquals(numRecords, records2.stream().filter(record -> record.isCurrentLocationKnown()).count()); + assertEquals(numRecords, records2.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(numRecords, records2.stream().filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + + // Delete all records. This has to be done directly as deleting index entries + // is not implemented via HoodieWriteClient + JavaRDD deleteWriteStatues = writeStatues.map(w -> { + WriteStatus newWriteStatus = new WriteStatus(true, 1.0); + w.getWrittenRecords().forEach(r -> newWriteStatus.markSuccess(new HoodieAvroRecord(r.getKey(), null), Option.empty())); + assertEquals(w.getTotalRecords(), newWriteStatus.getTotalRecords()); + newWriteStatus.setStat(new HoodieWriteStat()); + return newWriteStatus; + }); + // if not for this caching, due to RDD chaining/lineage, first time update is called again when subsequent update is called. + // So caching here to break the chain and so future update does not re-trigger update of older Rdd. + deleteWriteStatues.cache(); + JavaRDD deleteStatus = updateLocation(index, deleteWriteStatues, hoodieTable); + assertEquals(deleteStatus.count(), deleteWriteStatues.count()); + assertNoWriteErrors(deleteStatus.collect()); + + // Ensure no records can be tagged + List records3 = tagLocation(index, writeRecords, hoodieTable).collect(); + assertEquals(0, records3.stream().filter(record -> record.isCurrentLocationKnown()).count()); + assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count()); + assertEquals(0, records3.stream().filter(record -> (record.getCurrentLocation() != null + && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count()); + } + } + + private WriteStatus getSampleWriteStatus(final int numInserts, final int numUpdateWrites) { + final WriteStatus writeStatus = new WriteStatus(false, 0.1); + HoodieWriteStat hoodieWriteStat = new HoodieWriteStat(); + hoodieWriteStat.setNumInserts(numInserts); + hoodieWriteStat.setNumUpdateWrites(numUpdateWrites); + writeStatus.setStat(hoodieWriteStat); + return writeStatus; + } + + private HoodieWriteConfig getConfig() { + return getConfigBuilder(100, false, false).build(); + } + + private HoodieWriteConfig getConfig(int hbaseIndexBatchSize) { + return getConfigBuilder(hbaseIndexBatchSize, false, false).build(); + } + + private HoodieWriteConfig getConfig(boolean updatePartitionPath, boolean rollbackSync) { + return getConfigBuilder(100, updatePartitionPath, rollbackSync).build(); + } + + private HoodieWriteConfig.Builder getConfigBuilder(int hbaseIndexBatchSize, boolean updatePartitionPath, boolean rollbackSync) { + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(1, 1).withDeleteParallelism(1) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) + .withInlineCompaction(false).build()) + .withAutoCommit(false).withStorageConfig(HoodieStorageConfig.newBuilder() + .hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + .forTable("test-trip-table") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE) + .withHBaseIndexConfig(new HoodieHBaseIndexConfig.Builder() + 
.hbaseZkPort(Integer.parseInt(hbaseConfig.get(ZOOKEEPER_CLIENT_PORT))) + .hbaseIndexPutBatchSizeAutoCompute(true) + .hbaseZkZnodeParent(hbaseConfig.get(ZOOKEEPER_ZNODE_PARENT, "")) + .hbaseZkQuorum(hbaseConfig.get(ZOOKEEPER_QUORUM)).hbaseTableName(TABLE_NAME) + .hbaseIndexUpdatePartitionPath(updatePartitionPath) + .hbaseIndexRollbackSync(rollbackSync) + .hbaseIndexGetBatchSize(hbaseIndexBatchSize).build()) + .build()); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java index 3a7d468e770d8..e02e613642c1a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java @@ -18,29 +18,36 @@ package org.apache.hudi.io; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.hudi.testutils.MetadataMergeWriteStatus; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import java.io.IOException; import java.util.ArrayList; @@ -48,6 +55,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Properties; import scala.Tuple2; @@ -71,10 +79,6 @@ public void setUp() throws Exception { initPath(); initTestDataGenerator(); initFileSystem(); - initMetaClient(); - config = getConfigBuilder() - .withIndexConfig(HoodieIndexConfig.newBuilder() - .build()).build(); } @AfterEach @@ -82,28 +86,38 @@ public void tearDown() throws IOException { cleanupResources(); } - @Test - public void testFetchHandle() throws Exception { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testFetchHandle(boolean populateMetaFields) throws Exception { + metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? 
new Properties() : getPropertiesForKeyGen()); + config = getConfigBuilder() + .withProperties(getPropertiesForKeyGen()) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .build()).build(); + List records = dataGen.generateInserts(makeNewCommitTime(), 100); Map> partitionRecordsMap = recordsToPartitionRecordsMap(records); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, AVRO_SCHEMA_WITH_METADATA_FIELDS); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, AVRO_SCHEMA_WITH_METADATA_FIELDS); Map, List>> expectedList = writeToParquetAndGetExpectedRecordLocations(partitionRecordsMap, testTable); List> partitionPathFileIdPairs = loadAllFilesForPartitions(new ArrayList<>(partitionRecordsMap.keySet()), context, hoodieTable); + BaseKeyGenerator keyGenerator = (BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getPropertiesForKeyGen())); + for (Tuple2 entry : partitionPathFileIdPairs) { - HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2)); - Iterator> result = fetcherHandle.locations(); + HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2), + populateMetaFields ? Option.empty() : Option.of(keyGenerator)); + Iterator> result = fetcherHandle.locations().iterator(); List> actualList = new ArrayList<>(); - result.forEachRemaining(actualList::add); + result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight()))); assertEquals(expectedList.get(new Tuple2<>(entry._1, entry._2.getFileId())), actualList); } } private Map, List>> writeToParquetAndGetExpectedRecordLocations( - Map> partitionRecordsMap, HoodieWriteableTestTable testTable) throws Exception { + Map> partitionRecordsMap, HoodieSparkWriteableTestTable testTable) throws Exception { Map, List>> expectedList = new HashMap<>(); for (Map.Entry> entry : partitionRecordsMap.entrySet()) { int totalRecordsPerPartition = entry.getValue().size(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java index 95ed61aa4f1e9..72749160e6bd0 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java @@ -20,13 +20,17 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import 
org.apache.hudi.config.HoodieStorageConfig; @@ -34,23 +38,28 @@ import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieClientTestUtils; - -import org.apache.hadoop.fs.FileSystem; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.params.provider.Arguments.arguments; @SuppressWarnings("unchecked") public class TestHoodieMergeHandle extends HoodieClientTestHarness { @@ -69,16 +78,24 @@ public void tearDown() throws Exception { cleanupResources(); } - @Test - public void testUpsertsForMultipleRecordsInSameFile() throws Exception { + @ParameterizedTest + @MethodSource("testArguments") + public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled) throws Exception { // Create records in a single partition String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0]; dataGen = new HoodieTestDataGenerator(new String[] {partitionPath}); + // Build a common config with diff configs + Properties properties = new Properties(); + properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name()); + properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled)); + // Build a write config with bulkinsertparallelism set - HoodieWriteConfig cfg = getConfigBuilder().build(); + HoodieWriteConfig cfg = getConfigBuilder() + .withProperties(properties) + .build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - FileSystem fs = FSUtils.getFs(basePath, hadoopConf); /** * Write 1 (only inserts) This will do a bulk insert of 44 records of which there are 2 records repeated 21 times @@ -186,6 +203,7 @@ public void testUpsertsForMultipleRecordsInSameFile() throws Exception { // Check the entire dataset has 47 records still dataSet = getRecords(); assertEquals(47, dataSet.count(), "Must contain 47 records"); + Row[] rows = (Row[]) dataSet.collect(); int record1Count = 0; int record2Count = 0; @@ -212,13 +230,38 @@ public void testUpsertsForMultipleRecordsInSameFile() throws Exception { // Assert that id2 record count which has been updated to rider-004 and driver-004 is 21, which is the total // number of records with row_key id2 assertEquals(21, record2Count); + + // Validate that all the records only reference the _latest_ base files as part of the + // FILENAME_METADATA_FIELD payload (entailing that corresponding metadata is in-sync with + // the state of the table + HoodieTableFileSystemView tableView = + getHoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline(), HoodieTestTable.of(metaClient).listAllBaseFiles()); + + 
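+ // Collect the latest base file names from the file system view and compare them with the FILENAME_METADATA_FIELD
+ // values read back from the dataset; the two sets are expected to match exactly.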
Set latestBaseFileNames = tableView.getLatestBaseFiles() + .map(BaseFile::getFileName) + .collect(Collectors.toSet()); + + Set metadataFilenameFieldRefs = dataSet.collectAsList().stream() + .map(row -> row.getAs(HoodieRecord.FILENAME_METADATA_FIELD)) + .collect(Collectors.toSet()); + + assertEquals(latestBaseFileNames, metadataFilenameFieldRefs); } } - @Test - public void testHoodieMergeHandleWriteStatMetrics() throws Exception { + @ParameterizedTest + @MethodSource("testArguments") + public void testHoodieMergeHandleWriteStatMetrics(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled) throws Exception { // insert 100 records - HoodieWriteConfig config = getConfigBuilder().build(); + // Build a common config with diff configs + Properties properties = new Properties(); + properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name()); + properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled)); + + HoodieWriteConfig config = getConfigBuilder() + .withProperties(properties) + .build(); try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { String newCommitTime = "100"; writeClient.startCommitWithTime(newCommitTime); @@ -317,6 +360,16 @@ HoodieWriteConfig.Builder getConfigBuilder() { .withBulkInsertParallelism(2).withWriteStatusClass(TestWriteStatus.class); } + private static Stream testArguments() { + // Arg1: ExternalSpillableMap Type, Arg2: isDiskMapCompressionEnabled + return Stream.of( + arguments(ExternalSpillableMap.DiskMapType.BITCASK, false), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, true), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true) + ); + } + /** * Overridden so that we can capture and inspect all success records. */ diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieRowCreateHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieRowCreateHandle.java deleted file mode 100644 index edce77772d407..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieRowCreateHandle.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.io; - -import org.apache.hudi.client.HoodieInternalWriteStatus; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.testutils.HoodieClientTestHarness; - -import org.apache.hudi.testutils.SparkDatasetTestUtils; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Random; -import java.util.UUID; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -/** - * Unit tests {@link HoodieRowCreateHandle}. - */ -@SuppressWarnings("checkstyle:LineLength") -public class TestHoodieRowCreateHandle extends HoodieClientTestHarness { - - private static final Random RANDOM = new Random(); - - @BeforeEach - public void setUp() throws Exception { - initSparkContexts("TestHoodieRowCreateHandle"); - initPath(); - initFileSystem(); - initTestDataGenerator(); - initMetaClient(); - } - - @AfterEach - public void tearDown() throws Exception { - cleanupResources(); - } - - @Test - public void testRowCreateHandle() throws Exception { - // init config and table - HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build(); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - List fileNames = new ArrayList<>(); - List fileAbsPaths = new ArrayList<>(); - - Dataset totalInputRows = null; - // one round per partition - for (int i = 0; i < 5; i++) { - String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[i % 3]; - - // init some args - String fileId = UUID.randomUUID().toString(); - String instantTime = "000"; - - HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE); - int size = 10 + RANDOM.nextInt(1000); - // Generate inputs - Dataset inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size, partitionPath, false); - if (totalInputRows == null) { - totalInputRows = inputRows; - } else { - totalInputRows = totalInputRows.union(inputRows); - } - - // issue writes - HoodieInternalWriteStatus writeStatus = writeAndGetWriteStatus(inputRows, handle); - - fileAbsPaths.add(basePath + "/" + writeStatus.getStat().getPath()); - fileNames.add(handle.getFileName()); - // verify output - assertOutput(writeStatus, size, fileId, partitionPath, instantTime, totalInputRows, fileNames, fileAbsPaths); - } - } - - /** - * Issue some corrupted or wrong schematized InternalRow after few valid InternalRows so that global error is thrown. write batch 1 of valid records write batch 2 of invalid records Global Error - * should be thrown. 
- */ - @Test - public void testGlobalFailure() throws Exception { - // init config and table - HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build(); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0]; - - // init some args - String fileId = UUID.randomUUID().toString(); - String instantTime = "000"; - - HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE); - int size = 10 + RANDOM.nextInt(1000); - int totalFailures = 5; - // Generate first batch of valid rows - Dataset inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size / 2, partitionPath, false); - List internalRows = SparkDatasetTestUtils.toInternalRows(inputRows, SparkDatasetTestUtils.ENCODER); - - // generate some failures rows - for (int i = 0; i < totalFailures; i++) { - internalRows.add(SparkDatasetTestUtils.getInternalRowWithError(partitionPath)); - } - - // generate 2nd batch of valid rows - Dataset inputRows2 = SparkDatasetTestUtils.getRandomRows(sqlContext, size / 2, partitionPath, false); - internalRows.addAll(SparkDatasetTestUtils.toInternalRows(inputRows2, SparkDatasetTestUtils.ENCODER)); - - // issue writes - try { - for (InternalRow internalRow : internalRows) { - handle.write(internalRow); - } - fail("Should have failed"); - } catch (Throwable e) { - // expected - } - // close the create handle - HoodieInternalWriteStatus writeStatus = handle.close(); - - List fileNames = new ArrayList<>(); - fileNames.add(handle.getFileName()); - // verify write status - assertNotNull(writeStatus.getGlobalError()); - assertTrue(writeStatus.getGlobalError().getMessage().contains("java.lang.String cannot be cast to org.apache.spark.unsafe.types.UTF8String")); - assertEquals(writeStatus.getFileId(), fileId); - assertEquals(writeStatus.getPartitionPath(), partitionPath); - - // verify rows - Dataset result = sqlContext.read().parquet(basePath + "/" + partitionPath); - // passing only first batch of inputRows since after first batch global error would have been thrown - assertRows(inputRows, result, instantTime, fileNames); - } - - @Test - public void testInstantiationFailure() throws IOException { - // init config and table - HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).withPath("/dummypath/abc/").build(); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - - try { - new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE); - fail("Should have thrown exception"); - } catch (HoodieInsertException ioe) { - // expected - } - } - - private HoodieInternalWriteStatus writeAndGetWriteStatus(Dataset inputRows, HoodieRowCreateHandle handle) - throws Exception { - List internalRows = SparkDatasetTestUtils.toInternalRows(inputRows, SparkDatasetTestUtils.ENCODER); - // issue writes - for (InternalRow internalRow : internalRows) { - handle.write(internalRow); - } - // close the create handle - return handle.close(); - } - - private void assertOutput(HoodieInternalWriteStatus writeStatus, int size, String fileId, String partitionPath, String instantTime, Dataset inputRows, List filenames, - List fileAbsPaths) { - assertEquals(writeStatus.getPartitionPath(), partitionPath); - 
assertEquals(writeStatus.getTotalRecords(), size); - assertEquals(writeStatus.getFailedRowsSize(), 0); - assertEquals(writeStatus.getTotalErrorRecords(), 0); - assertFalse(writeStatus.hasErrors()); - assertNull(writeStatus.getGlobalError()); - assertEquals(writeStatus.getFileId(), fileId); - HoodieWriteStat writeStat = writeStatus.getStat(); - assertEquals(size, writeStat.getNumInserts()); - assertEquals(size, writeStat.getNumWrites()); - assertEquals(fileId, writeStat.getFileId()); - assertEquals(partitionPath, writeStat.getPartitionPath()); - assertEquals(0, writeStat.getNumDeletes()); - assertEquals(0, writeStat.getNumUpdateWrites()); - assertEquals(0, writeStat.getTotalWriteErrors()); - - // verify rows - Dataset result = sqlContext.read().parquet(fileAbsPaths.toArray(new String[0])); - assertRows(inputRows, result, instantTime, filenames); - } - - private void assertRows(Dataset expectedRows, Dataset actualRows, String instantTime, List filenames) { - // verify 3 meta fields that are filled in within create handle - actualRows.collectAsList().forEach(entry -> { - assertEquals(entry.get(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).toString(), instantTime); - assertTrue(filenames.contains(entry.get(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.FILENAME_METADATA_FIELD)).toString())); - assertFalse(entry.isNullAt(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD))); - }); - - // after trimming 2 of the meta fields, rest of the fields should match - Dataset trimmedExpected = expectedRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD); - Dataset trimmedActual = actualRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD); - assertEquals(0, trimmedActual.except(trimmedExpected).count()); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java deleted file mode 100644 index f2427cd9a2348..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.io; - -import org.apache.hudi.avro.model.HoodieActionInstant; -import org.apache.hudi.avro.model.HoodieCleanMetadata; -import org.apache.hudi.avro.model.HoodieCleanerPlan; -import org.apache.hudi.common.HoodieCleanStat; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.fs.HoodieWrapperFileSystem; -import org.apache.hudi.common.model.HoodieCleaningPolicy; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieInstant.State; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.common.testutils.HoodieTestUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; -import org.apache.hudi.testutils.HoodieClientTestHarness; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Random; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness { - - private Configuration hadoopConf; - private HoodieWrapperFileSystem wrapperFs; - - @BeforeEach - public void init() throws Exception { - initPath(); - initSparkContexts(); - initMetaClient(); - hadoopConf = context.getHadoopConf().get(); - metaClient.getFs().mkdirs(new Path(basePath)); - metaClient = HoodieTestUtils.init(hadoopConf, basePath); - wrapperFs = metaClient.getFs(); - hadoopConf.addResource(wrapperFs.getConf()); - } - - @AfterEach - public void clean() throws IOException { - cleanupResources(); - } - - @Test - public void testArchiveEmptyTable() throws IOException { - HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table").build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - } - - @Test - public void 
testArchiveTableWithArchival() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 4).build()) - .forTable("test-trip-table").build(); - HoodieTestUtils.init(hadoopConf, basePath); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "104"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "105"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "105"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - - assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - - createCleanMetadata("100", false); - 
createCleanMetadata("101", false); - createCleanMetadata("102", false); - createCleanMetadata("103", false); - createCleanMetadata("104", false); - createCleanMetadata("105", false); - createCleanMetadata("106", true); - createCleanMetadata("107", true); - - // reload the timeline and get all the commmits before archive - timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); - List originalCommits = timeline.getInstants().collect(Collectors.toList()); - - assertEquals(12, timeline.countInstants(), "Loaded 6 commits and the count should match"); - - // verify in-flight instants before archive - verifyInflightInstants(metaClient, 2); - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - assertTrue(archiveLog.archiveIfRequired(context)); - - // reload the timeline and remove the remaining commits - timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); - originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList())); - - // Check compaction instants - List instants = metaClient.scanHoodieInstantsFromFileSystem( - new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE, false); - assertEquals(4, instants.size(), "Should delete all compaction instants < 104"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100")), - "Requested Compaction must be absent for 100"); - assertFalse(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100")), - "Inflight Compaction must be absent for 100"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101")), - "Requested Compaction must be absent for 101"); - assertFalse(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101")), - "Inflight Compaction must be absent for 101"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102")), - "Requested Compaction must be absent for 102"); - assertFalse(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102")), - "Inflight Compaction must be absent for 102"); - assertFalse(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103")), - "Requested Compaction must be absent for 103"); - assertFalse(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103")), - "Inflight Compaction must be absent for 103"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104")), - "Requested Compaction must be present for 104"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "104")), - "Inflight Compaction must be present for 104"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "105")), - "Requested Compaction must be present for 105"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "105")), - "Inflight Compaction must be present for 105"); - - // read the file - HoodieArchivedTimeline archivedTimeline = new HoodieArchivedTimeline(metaClient); - assertEquals(24, 
archivedTimeline.countInstants(), - "Total archived records and total read records are the same count"); - - //make sure the archived commits are the same as the (originalcommits - commitsleft) - Set readCommits = - archivedTimeline.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); - assertEquals(originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()), readCommits, - "Read commits map should match the originalCommits - commitsLoadedFromArchival"); - - // verify in-flight instants after archive - verifyInflightInstants(metaClient, 2); - } - - @Test - public void testArchiveTableWithReplacedFiles() throws Exception { - HoodieTestUtils.init(hadoopConf, basePath); - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) - .build(); - - int numCommits = 4; - int commitInstant = 100; - for (int i = 0; i < numCommits; i++) { - createReplaceMetadata(String.valueOf(commitInstant)); - commitInstant += 100; - } - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - assertEquals(4, timeline.countInstants(), "Loaded 4 commits and the count should match"); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - - FileStatus[] allFiles = metaClient.getFs().listStatus(new Path(basePath + "/" + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)); - Set allFileIds = Arrays.stream(allFiles).map(fs -> FSUtils.getFileIdFromFilePath(fs.getPath())).collect(Collectors.toSet()); - - // verify 100-1,200-1 are deleted by archival - assertFalse(allFileIds.contains("file-100-1")); - assertFalse(allFileIds.contains("file-200-1")); - assertTrue(allFileIds.contains("file-100-2")); - assertTrue(allFileIds.contains("file-200-2")); - assertTrue(allFileIds.contains("file-300-1")); - assertTrue(allFileIds.contains("file-400-1")); - } - - @Test - public void testArchiveTableWithNoArchival() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, 
HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - // Requested Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - // Inflight Compaction - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - assertEquals(4, timeline.countInstants(), "Loaded 4 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); - assertEquals(4, timeline.countInstants(), "Should not archive commits when maxCommitsToKeep is 5"); - - List instants = metaClient.scanHoodieInstantsFromFileSystem( - new Path(metaClient.getMetaAuxiliaryPath()), HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE, false); - assertEquals(8, instants.size(), "Should not delete any aux compaction files when maxCommitsToKeep is 5"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100")), - "Requested Compaction must be present for 100"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "100")), - "Inflight Compaction must be present for 100"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101")), - "Requested Compaction must be present for 101"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "101")), - "Inflight Compaction must be present for 101"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "102")), - "Requested Compaction must be present for 102"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "102")), - "Inflight Compaction must be present for 102"); - assertTrue(instants.contains(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "103")), - "Requested Compaction must be present for 103"); - assertTrue(instants.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "103")), - "Inflight Compaction must be present for 103"); - } - - @Test - public void testArchiveCommitSafety() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - 
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); - - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); - assertTrue(timeline.containsOrBeforeTimelineStarts("100"), "Archived commits should always be safe"); - assertTrue(timeline.containsOrBeforeTimelineStarts("101"), "Archived commits should always be safe"); - assertTrue(timeline.containsOrBeforeTimelineStarts("102"), "Archived commits should always be safe"); - assertTrue(timeline.containsOrBeforeTimelineStarts("103"), "Archived commits should always be safe"); - } - - @Test - public void testArchiveCommitSavepointNoHole() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); - - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); - HoodieTestDataGenerator.createSavepointFile(basePath, "101", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); - HoodieTable table = HoodieSparkTable.create(cfg, context); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - assertTrue(archiveLog.archiveIfRequired(context)); - timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); - assertEquals(5, timeline.countInstants(), - "Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")), - "Archived commits should always be safe"); - assertTrue(timeline.containsInstant(new 
HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")), - "Archived commits should always be safe"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")), - "Archived commits should always be safe"); - } - - @Test - public void testArchiveCommitCompactionNoHole() throws IOException { - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); - HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionRequestedFile(basePath, "101", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionRequestedFile(basePath, "104", wrapperFs.getConf()); - HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104"), wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "106", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "107", wrapperFs.getConf()); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsAndCompactionTimeline(); - assertEquals(8, timeline.countInstants(), "Loaded 6 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - timeline = metaClient.getActiveTimeline().reload().getCommitsAndCompactionTimeline(); - assertFalse(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "100")), - "Instants before oldest pending compaction can be removed"); - assertEquals(7, timeline.countInstants(), - "Since we have a pending compaction at 101, we should never archive any commit " - + "after 101 (we only archive 100)"); - assertTrue(timeline.containsInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "101")), - "Requested Compaction must still be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "104")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "105")), - "Instants greater than oldest pending compaction must be present"); - assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "106")), - "Instants greater than oldest pending compaction must be present"); - 
assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "107")), - "Instants greater than oldest pending compaction must be present"); - } - - @Test - public void testArchiveCommitTimeline() throws IOException { - HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) - .withParallelism(2, 2).forTable("test-trip-table") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - - HoodieTestDataGenerator.createCommitFile(basePath, "1", wrapperFs.getConf()); - HoodieInstant instant1 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); - HoodieTestDataGenerator.createCommitFile(basePath, "2", wrapperFs.getConf()); - Path markerPath = new Path(metaClient.getMarkerFolderPath("2")); - wrapperFs.mkdirs(markerPath); - HoodieInstant instant2 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "2"); - HoodieTestDataGenerator.createCommitFile(basePath, "3", wrapperFs.getConf()); - HoodieInstant instant3 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); - - //add 2 more instants to pass filter criteria set in compaction config above - HoodieTestDataGenerator.createCommitFile(basePath, "4", wrapperFs.getConf()); - HoodieTestDataGenerator.createCommitFile(basePath, "5", wrapperFs.getConf()); - - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - boolean result = archiveLog.archiveIfRequired(context); - assertTrue(result); - HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); - List archivedInstants = Arrays.asList(instant1, instant2, instant3); - assertEquals(new HashSet<>(archivedInstants), archivedTimeline.getInstants().collect(Collectors.toSet())); - assertFalse(wrapperFs.exists(markerPath)); - } - - private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) { - HoodieTimeline timeline = metaClient.getActiveTimeline().reload() - .getTimelineOfActions(Collections.singleton(HoodieTimeline.CLEAN_ACTION)).filterInflights(); - assertEquals(expectedTotalInstants, timeline.countInstants(), - "Loaded inflight clean actions and the count should match"); - } - - @Test - public void testConvertCommitMetadata() { - HoodieCommitMetadata hoodieCommitMetadata = new HoodieCommitMetadata(); - hoodieCommitMetadata.setOperationType(WriteOperationType.INSERT); - - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) - .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-commitMetadata-converter") - .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) - .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - - org.apache.hudi.avro.model.HoodieCommitMetadata expectedCommitMetadata = archiveLog.convertCommitMetadata(hoodieCommitMetadata); - assertEquals(expectedCommitMetadata.getOperationType(), WriteOperationType.INSERT.toString()); - } - - private void createReplaceMetadata(String instantTime) throws Exception { - String fileId1 = "file-" + instantTime + "-1"; - String fileId2 = "file-" + instantTime + "-2"; - - // 
create replace instant to mark fileId1 as deleted - HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); - replaceMetadata.addReplaceFileId(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1); - replaceMetadata.setOperationType(WriteOperationType.INSERT_OVERWRITE); - HoodieTestTable.of(metaClient) - .addReplaceCommit(instantTime, replaceMetadata) - .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2); - } - - private void createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException { - HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(), - CleanPlanV2MigrationHandler.VERSION, new HashMap<>()); - if (inflightOnly) { - HoodieTestTable.of(metaClient).addInflightClean(instantTime, cleanerPlan); - } else { - HoodieCleanStat cleanStats = new HoodieCleanStat( - HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, - HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)], - Collections.emptyList(), - Collections.emptyList(), - Collections.emptyList(), - instantTime); - HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats)); - HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java new file mode 100644 index 0000000000000..bc7de2f175a44 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java @@ -0,0 +1,1487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io; + +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieSavepointMetadata; +import org.apache.hudi.client.HoodieTimelineArchiver; +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.client.utils.MetadataConversionUtils; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstant.State; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieLockConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieClientTestHarness; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import static 
org.apache.hudi.common.testutils.HoodieTestUtils.createCompactionCommitInMetadataTable; +import static org.apache.hudi.config.HoodieArchivalConfig.ARCHIVE_BEYOND_SAVEPOINT; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieTimelineArchiver extends HoodieClientTestHarness { + + private static final Logger LOG = LogManager.getLogger(TestHoodieTimelineArchiver.class); + + private Configuration hadoopConf; + private HoodieWrapperFileSystem wrapperFs; + private HoodieTableMetadataWriter metadataWriter; + private HoodieTestTable testTable; + + public void init() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + } + + public void init(HoodieTableType tableType) throws Exception { + initPath(); + initSparkContexts(); + initTimelineService(); + initMetaClient(); + hadoopConf = context.getHadoopConf().get(); + metaClient.getFs().mkdirs(new Path(basePath)); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); + wrapperFs = metaClient.getFs(); + hadoopConf.addResource(wrapperFs.getConf()); + } + + private void initWriteConfigAndMetatableWriter(HoodieWriteConfig writeConfig, boolean enableMetadataTable) { + if (enableMetadataTable) { + metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context); + testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + } else { + testTable = HoodieTestTable.of(metaClient); + } + } + + @AfterEach + public void clean() throws IOException { + cleanupResources(); + } + + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, int minArchivalCommits, int maxArchivalCommits, int maxDeltaCommitsMetadataTable) throws Exception { + return initTestTableAndGetWriteConfig(enableMetadata, minArchivalCommits, maxArchivalCommits, maxDeltaCommitsMetadataTable, HoodieTableType.COPY_ON_WRITE); + } + + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, + int minArchivalCommits, + int maxArchivalCommits, + int maxDeltaCommits, + int maxDeltaCommitsMetadataTable, + HoodieTableType tableType) throws Exception { + return initTestTableAndGetWriteConfig(enableMetadata, minArchivalCommits, maxArchivalCommits, + maxDeltaCommits, maxDeltaCommitsMetadataTable, tableType, false, 10, 209715200, + HoodieFailedWritesCleaningPolicy.EAGER, WriteConcurrencyMode.SINGLE_WRITER); + } + + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, + int minArchivalCommits, + int maxArchivalCommits, + int maxDeltaCommitsMetadataTable, + HoodieTableType tableType) throws Exception { + return initTestTableAndGetWriteConfig(enableMetadata, minArchivalCommits, maxArchivalCommits, + 5, maxDeltaCommitsMetadataTable, tableType, false, 10, 209715200, + HoodieFailedWritesCleaningPolicy.EAGER, WriteConcurrencyMode.SINGLE_WRITER); + } + + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, + int minArchivalCommits, + int maxArchivalCommits, + int maxDeltaCommitsMetadataTable, + boolean enableArchiveMerge, + int archiveFilesBatch, + long size) throws Exception { + return initTestTableAndGetWriteConfig(enableMetadata, minArchivalCommits, maxArchivalCommits, 5, + maxDeltaCommitsMetadataTable, HoodieTableType.COPY_ON_WRITE, enableArchiveMerge, archiveFilesBatch, size, + HoodieFailedWritesCleaningPolicy.EAGER, 
WriteConcurrencyMode.SINGLE_WRITER); + } + + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, + int minArchivalCommits, + int maxArchivalCommits, + int maxDeltaCommits, + int maxDeltaCommitsMetadataTable, + HoodieTableType tableType, + boolean enableArchiveMerge, + int archiveFilesBatch, + long size, + HoodieFailedWritesCleaningPolicy failedWritesCleaningPolicy, + WriteConcurrencyMode writeConcurrencyMode) throws Exception { + return initTestTableAndGetWriteConfig( + enableMetadata, + minArchivalCommits, + maxArchivalCommits, + maxDeltaCommits, + maxDeltaCommitsMetadataTable, + tableType, + enableArchiveMerge, + archiveFilesBatch, + size, + failedWritesCleaningPolicy, + writeConcurrencyMode, + ARCHIVE_BEYOND_SAVEPOINT.defaultValue()); + } + + private HoodieWriteConfig initTestTableAndGetWriteConfig(boolean enableMetadata, + int minArchivalCommits, + int maxArchivalCommits, + int maxDeltaCommits, + int maxDeltaCommitsMetadataTable, + HoodieTableType tableType, + boolean enableArchiveMerge, + int archiveFilesBatch, + long size, + HoodieFailedWritesCleaningPolicy failedWritesCleaningPolicy, + WriteConcurrencyMode writeConcurrencyMode, + boolean archiveProceedBeyondSavepoints) throws Exception { + init(tableType); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).withFailedWritesCleaningPolicy(failedWritesCleaningPolicy).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .withArchiveMergeEnable(enableArchiveMerge) + .withArchiveMergeFilesBatchSize(archiveFilesBatch) + .withArchiveMergeSmallFileLimit(size) + .archiveCommitsWith(minArchivalCommits, maxArchivalCommits) + .withArchiveBeyondSavepoint(archiveProceedBeyondSavepoints).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommits).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata) + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsMetadataTable).build()) + .withWriteConcurrencyMode(writeConcurrencyMode) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class) + .build()) + .forTable("test-trip-table").build(); + initWriteConfigAndMetatableWriter(writeConfig, enableMetadata); + return writeConfig; + } + + @Test + public void testArchiveEmptyTable() throws Exception { + init(); + HoodieWriteConfig cfg = + HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).forTable("test-trip-table").build(); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + boolean result = archiver.archiveIfRequired(context); + assertTrue(result); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveTableWithArchival(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2); + + // min archival commits is 2 and max archival commits is 4. and so, after 5th commit, 3 commits will be archived. 
+ // 1,2,3,4,5 : after archival -> 4,5 + // after 3 more commits, earliest 3 will be archived + // 4,5,6,7,8 : after archival -> 7, 8 + // after 9 no-op wrt archival. + for (int i = 1; i < 10; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + if (i < 5) { + assertEquals(originalCommits, commitsAfterArchival); + } else if (i == 5) { + // archival should have kicked in. + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003")), getActiveCommitInstants(Arrays.asList("00000004", "00000005")), commitsAfterArchival); + } else if (i < 8) { + assertEquals(originalCommits, commitsAfterArchival); + } else if (i == 8) { + // archival should have kicked in. + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004", "00000005", "00000006")), + getActiveCommitInstants(Arrays.asList("00000007", "00000008")), commitsAfterArchival); + } else { + assertEquals(originalCommits, commitsAfterArchival); + } + } + } + + @Test + public void testArchiveTableWithReplaceCommits() throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 4, 2); + for (int i = 1; i < 7; i++) { + if (i < 3) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), + Arrays.asList("p1", "p2"), 2); + } else { + testTable.doWriteOperation("0000000" + i, WriteOperationType.INSERT_OVERWRITE, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + if (i == 6) { + // after all rounds, only 3 should be left in active timeline. 4,5,6 + assertEquals(originalCommits, commitsAfterArchival); + assertEquals(3, originalCommits.size()); + } + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testSavepointWithArchival(boolean archiveBeyondSavepoint) throws Exception { + boolean enableMetadata = false; + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 5, 2, HoodieTableType.COPY_ON_WRITE, + false, 10, 209715200, HoodieFailedWritesCleaningPolicy.EAGER, WriteConcurrencyMode.SINGLE_WRITER, archiveBeyondSavepoint); + + // min archival commits is 2 and max archival commits is 4. and so, after 5th commit, 3 commits will be archived. + for (int i = 1; i < 5; i++) { + testTable.doWriteOperation(String.format("%08d", i), WriteOperationType.UPSERT, i == 1 ? 
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + + // savepoint 3rd commit + String commitToSavepoint = String.format("%08d", 3); + HoodieSavepointMetadata savepointMetadata = testTable.doSavepoint(commitToSavepoint); + testTable.addSavepoint(commitToSavepoint, savepointMetadata); + + for (int i = 5; i < 7; i++) { + testTable.doWriteOperation(String.format("%08d", i), WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + if (archiveBeyondSavepoint) { + // retains only 2 commits. C3 and C8. and savepointed commit for C3. + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000004", "00000005")), + Stream.concat(getActiveCommitInstants(Arrays.asList("00000003", "00000006")).stream(), getActiveSavepointedCommitInstants(Arrays.asList("00000003")).stream()) + .collect(Collectors.toList()), commitsAfterArchival); + } else { + // archives only C1 and C2. stops at first savepointed commit C3. + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002")), + Stream.concat(getActiveCommitInstants(Arrays.asList("00000003", "00000004", "00000005", "00000006")).stream(), + getActiveSavepointedCommitInstants(Arrays.asList("00000003")).stream()) + .collect(Collectors.toList()), commitsAfterArchival); + } + + for (int i = 7; i < 10; i++) { + testTable.doWriteOperation(String.format("%08d", i), WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + + // once savepoint is removed. C3 will be archived. + testTable.deleteSavepoint(commitToSavepoint); + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + + metaClient.reloadActiveTimeline(); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002","00000003", "00000004", "00000005", "00000006", "00000007")), + getActiveCommitInstants(Arrays.asList("00000008", "00000009")), commitsAfterArchival); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMergeSmallArchiveFilesRecoverFromBuildPlanFailed(boolean enableArchiveMerge) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); + + // do ingestion and trigger archive actions here. + for (int i = 1; i < 8; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + // build a merge small archive plan with dummy content + // this plan can not be deserialized. 
+ HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + FileStatus[] fsStatuses = metaClient.getFs().globStatus( + new Path(metaClient.getArchivePath() + "/.commits_.archive*")); + List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); + + archiver.reOpenWriter(); + Path plan = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME); + archiver.buildArchiveMergePlan(candidateFiles, plan, ".commits_.archive.3_1-0-1"); + String s = "Dummy Content"; + // stain the current merge plan file. + FileIOUtils.createFileInPath(metaClient.getFs(), plan, Option.of(s.getBytes())); + + // check that the damaged plan file will not block archived timeline loading. + HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload(); + assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants()); + + // trigger several archivals after leaving behind the damaged merge-small-archive-files plan. + for (int i = 1; i < 10; i++) { + testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + // loading the archived timeline and active timeline should succeed + HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload(); + + // check instant number + assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants()); + + // if there are damaged archive files in addition to the damaged plan, Hudi needs to throw an IOException while loading the archived timeline. + Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1"); + FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes())); + + assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMergeSmallArchiveFilesRecoverFromMergeFailed(boolean enableArchiveMerge) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); + + // do ingestion and trigger archive actions here. + for (int i = 1; i < 8; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ?
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + // do a single merge small archive files + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + FileStatus[] fsStatuses = metaClient.getFs().globStatus( + new Path(metaClient.getArchivePath() + "/.commits_.archive*")); + List candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); + archiver.reOpenWriter(); + + archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); + archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); + HoodieLogFormat.Writer writer = archiver.reOpenWriter(); + + // check loading archived and active timeline success + HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload(); + assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.reload().countInstants()); + + String s = "Dummy Content"; + // stain the current merged archive file. + FileIOUtils.createFileInPath(metaClient.getFs(), writer.getLogFile().getPath(), Option.of(s.getBytes())); + + // do another archive actions with merge small archive files. + for (int i = 1; i < 10; i++) { + testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + // check result. + // we need to load archived timeline successfully and ignore the parsing damage merged archive files exception. + HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload(); + + assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants()); + + // if there are a damaged merged archive files and other common damaged archive file. + // hoodie need throw ioe while loading archived timeline because of parsing the damaged archive file. + Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1"); + FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes())); + + assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMergeSmallArchiveFilesRecoverFromDeleteFailed(boolean enableArchiveMerge) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); + + // do ingestion and trigger archive actions here. + for (int i = 1; i < 8; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? 
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + // do a single merge small archive files + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + FileStatus[] fsStatuses = metaClient.getFs().globStatus( + new Path(metaClient.getArchivePath() + "/.commits_.archive*")); + List candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); + + archiver.reOpenWriter(); + + archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); + archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); + archiver.reOpenWriter(); + + // delete only one of the small archive file to simulate delete action failed. + metaClient.getFs().delete(fsStatuses[0].getPath()); + + // loading archived timeline and active timeline success + HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload(); + assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants()); + + // do another archive actions with merge small archive files. + for (int i = 1; i < 10; i++) { + testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + // check result. + HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload(); + + assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testLoadArchiveTimelineWithDamagedPlanFile(boolean enableArchiveMerge) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); + + // do ingestion and trigger archive actions here. + for (int i = 1; i < 8; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + Path plan = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME); + String s = "Dummy Content"; + // stain the current merge plan file. + FileIOUtils.createFileInPath(metaClient.getFs(), plan, Option.of(s.getBytes())); + + // check that damaged plan file will not block archived timeline loading. + HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload(); + assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants()); + + // if there are damaged archive files and damaged plan, hoodie need throw ioe while loading archived timeline. 
+ Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1"); + FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes())); + + assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchivalWithMultiWriters(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 5, 2, + HoodieTableType.COPY_ON_WRITE, false, 10, 209715200, + HoodieFailedWritesCleaningPolicy.LAZY, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL); + + final ExecutorService executors = Executors.newFixedThreadPool(2); + List> completableFutureList = new ArrayList<>(); + CountDownLatch countDownLatch = new CountDownLatch(1); + IntStream.range(0, 2).forEach(index -> { + completableFutureList.add(CompletableFuture.supplyAsync(() -> { + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + try { + // wait until 4 commits are available so that archival thread will have something to archive. + countDownLatch.await(30, TimeUnit.SECONDS); + } catch (InterruptedException e) { + throw new HoodieException("Should not have thrown InterruptedException ", e); + } + metaClient.reloadActiveTimeline(); + while (!metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant().get().getTimestamp().endsWith("29") + || metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() > 4) { + try { + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + archiver.archiveIfRequired(context, true); + // if not for below sleep, both archiving threads acquires lock in quick succession and does not give space for main thread + // to complete the write operation when metadata table is enabled. + if (enableMetadata) { + Thread.sleep(2); + } + } catch (IOException e) { + throw new HoodieException("IOException thrown while archiving ", e); + } catch (InterruptedException e) { + throw new HoodieException("Should not have thrown InterruptedException ", e); + } + table.getMetaClient().reloadActiveTimeline(); + } + return true; + }, executors)); + }); + + // do ingestion and trigger archive actions here. + for (int i = 1; i < 30; i++) { + testTable.doWriteOperation("0000000" + String.format("%02d", i), WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + if (i == 5) { + // start up archival threads only after 4 commits. + countDownLatch.countDown(); + } + } + + try { + CompletableFuture completableFuture = allOfTerminateOnFailure(completableFutureList); + completableFuture.get(); + } finally { + executors.shutdownNow(); + } + } + + public static CompletableFuture allOfTerminateOnFailure(List> futures) { + CompletableFuture failure = new CompletableFuture(); + AtomicBoolean jobFailed = new AtomicBoolean(false); + for (CompletableFuture f : futures) { + f.exceptionally(ex -> { + if (!jobFailed.getAndSet(true)) { + LOG.warn("One of the job failed. Cancelling all other futures. 
" + ex.getCause() + ", " + ex.getMessage()); + futures.forEach(future -> future.cancel(true)); + } + return null; + }); + } + return CompletableFuture.anyOf(failure, CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testLoadArchiveTimelineWithUncompletedMergeArchiveFile(boolean enableArchiveMerge) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); + for (int i = 1; i < 8; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + archiveAndGetCommitsList(writeConfig); + } + + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + FileStatus[] fsStatuses = metaClient.getFs().globStatus( + new Path(metaClient.getArchivePath() + "/.commits_.archive*")); + List candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); + + archiver.reOpenWriter(); + + archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); + archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); + HoodieLogFormat.Writer writer = archiver.reOpenWriter(); + + String s = "Dummy Content"; + // stain the current merged archive file. + FileIOUtils.createFileInPath(metaClient.getFs(), writer.getLogFile().getPath(), Option.of(s.getBytes())); + + // if there's only a damaged merged archive file, we need to ignore the exception while reading this damaged file. + HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false); + HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline(); + + assertEquals(7 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants()); + + // if there are a damaged merged archive files and other common damaged archive file. + // hoodie need throw ioe while loading archived timeline because of parsing the damaged archive file. + Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1"); + FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes())); + + assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testNoArchivalUntilMaxArchiveConfigWithExtraInflightCommits(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 5, 2); + + // when max archival commits is set to 5, until 6th commit there should not be any archival. + for (int i = 1; i < 6; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 2); + // archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + // add couple of inflight. no archival should kick in. 
+ testTable.doWriteOperation("00000006", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 2, false, true); + testTable.doWriteOperation("00000007", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 2, false, true); + + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + private static Stream archiveCommitSavepointNoHoleParams() { + return Arrays.stream(new Boolean[][] { + {true, true}, + {false, true}, + {true, false}, + {false, false} + }).map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("archiveCommitSavepointNoHoleParams") + public void testArchiveCommitSavepointNoHole(boolean enableMetadataTable, boolean archiveBeyondSavepoint) throws Exception { + init(); + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 5).withArchiveBeyondSavepoint(archiveBeyondSavepoint).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .build(); + + HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); + HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf()); + HoodieTestDataGenerator.createSavepointFile(basePath, "101", wrapperFs.getConf()); + HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf()); + HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf()); + HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf()); + HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); + HoodieTable table = HoodieSparkTable.create(cfg, context); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "105"); + } + + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); + assertTrue(archiver.archiveIfRequired(context)); + timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); + if (archiveBeyondSavepoint) { + // commits in active timeline = 101 and 105. 
+ assertEquals(2, timeline.countInstants(), + "Since archiveBeyondSavepoint config is enabled, we will archive commits 102, 103 "); + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")), + "Savepointed commits should always be safe"); + assertFalse(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")), + "102 expected to be archived"); + assertFalse(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")), + "103 expected to be archived"); + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "105")), + "104 expected to be archived"); + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "105")), + "105 expected to be in active timeline"); + } else { + assertEquals(5, timeline.countInstants(), + "Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)"); + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")), + "Archived commits should always be safe"); + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")), + "Archived commits should always be safe"); + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")), + "Archived commits should always be safe"); + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testPendingClusteringWillBlockArchival(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 5, 2); + HoodieTestDataGenerator.createPendingReplaceFile(basePath, "00000000", wrapperFs.getConf()); + for (int i = 1; i < 8; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 2); + // archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + HoodieTimeline timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); + assertEquals(7, timeline.countInstants(), + "Since we have a pending clustering instant at 00000000, we should never archive any commit after 00000000"); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveRollbacksTestTable(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 3, 2); + + for (int i = 1; i < 9; i += 2) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? 
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + testTable.doRollback("0000000" + i, "0000000" + (i + 1)); + + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + if (i != 7) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + // only time when archival will kick in + List expectedArchivedInstants = new ArrayList<>(); + expectedArchivedInstants.addAll(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000003"))); + expectedArchivedInstants.addAll(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000004"), HoodieTimeline.ROLLBACK_ACTION)); + List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000005", "00000007"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000006", "00000008"), HoodieTimeline.ROLLBACK_ACTION)); + verifyArchival(expectedArchivedInstants, expectedActiveInstants, commitsAfterArchival); + } + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testNoArchivalWithInflightCompactionInMiddle(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2, 2, + HoodieTableType.MERGE_ON_READ); + + // when max archival commits is set to 4, even after 7 commits, if there is an inflight compaction in the middle, archival should not kick in. + HoodieCommitMetadata inflightCompactionMetadata = null; + for (int i = 1; i < 8; i++) { + if (i == 2) { + inflightCompactionMetadata = testTable.doCompaction("0000000" + i, Arrays.asList("p1", "p2"), true); + } else { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + + // archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + if (enableMetadata) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + if (i != 6) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + // on 7th commit, archival will kick in. but will archive only one commit since 2nd compaction commit is inflight. + assertEquals(originalCommits.size() - commitsAfterArchival.size(), 1); + for (int j = 1; j <= 6; j++) { + if (j == 1) { + // first commit should be archived + assertFalse(commitsAfterArchival.contains(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j))); + } else if (j == 2) { + // 2nd compaction should not be archived + assertFalse(commitsAfterArchival.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "0000000" + j))); + } else { + // every other commit should not be archived + assertTrue(commitsAfterArchival.contains(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j))); + } + } + } + } + } + + // move inflight compaction to complete and add one regular write commit. archival should archive more commits. + // an extra one commit is required, bcoz compaction in data table will not trigger table services in metadata table. + // before this move, timeline : 2_inflight_compaction, 3,4,5,6,7. 
+ // after this move: 6,7,8 (2,3,4,5 will be archived) + testTable.moveInflightCompactionToComplete("00000002", inflightCompactionMetadata); + testTable.doWriteOperation("00000008", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), 2); + + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List commitsAfterArchival = commitsList.getValue(); + + List archivedInstants = getAllArchivedCommitInstants(Arrays.asList("00000001", "00000003", "00000004", "00000005", "00000006"), HoodieTimeline.DELTA_COMMIT_ACTION); + archivedInstants.add(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "00000002")); + archivedInstants.add(new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000002")); + verifyArchival(archivedInstants, getActiveCommitInstants(Arrays.asList("00000007", "00000008"), HoodieTimeline.DELTA_COMMIT_ACTION), commitsAfterArchival); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveCommitTimeline(boolean enableMetadataTable) throws Exception { + init(); + HoodieWriteConfig cfg = + HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).forTable("test-trip-table") + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 3).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .build(); + metaClient = HoodieTableMetaClient.reload(metaClient); + + HoodieTestDataGenerator.createCommitFile(basePath, "1", wrapperFs.getConf()); + HoodieInstant instant1 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); + HoodieTestDataGenerator.createCommitFile(basePath, "2", wrapperFs.getConf()); + Path markerPath = new Path(metaClient.getMarkerFolderPath("2")); + wrapperFs.mkdirs(markerPath); + HoodieInstant instant2 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "2"); + HoodieTestDataGenerator.createCommitFile(basePath, "3", wrapperFs.getConf()); + HoodieInstant instant3 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); + + //add 2 more instants to pass filter criteria set in compaction config above + HoodieTestDataGenerator.createCommitFile(basePath, "4", wrapperFs.getConf()); + HoodieTestDataGenerator.createCommitFile(basePath, "5", wrapperFs.getConf()); + + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "5"); + } + + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + boolean result = archiver.archiveIfRequired(context); + assertTrue(result); + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); + List archivedInstants = Arrays.asList(instant1, instant2, instant3); + assertEquals(new HashSet<>(archivedInstants), + archivedTimeline.filterCompletedInstants().getInstants().collect(Collectors.toSet())); + assertFalse(wrapperFs.exists(markerPath)); + } + + private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) { + HoodieTimeline timeline = metaClient.getActiveTimeline().reload() + 
.getTimelineOfActions(Collections.singleton(HoodieTimeline.CLEAN_ACTION)).filterInflights(); + assertEquals(expectedTotalInstants, timeline.countInstants(), + "Loaded inflight clean actions and the count should match"); + } + + @Test + public void testConvertCommitMetadata() throws Exception { + init(); + HoodieCommitMetadata hoodieCommitMetadata = new HoodieCommitMetadata(); + hoodieCommitMetadata.setOperationType(WriteOperationType.INSERT); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + org.apache.hudi.avro.model.HoodieCommitMetadata expectedCommitMetadata = MetadataConversionUtils + .convertCommitMetadata(hoodieCommitMetadata); + assertEquals(expectedCommitMetadata.getOperationType(), WriteOperationType.INSERT.toString()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveTableWithCleanCommits(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2); + + // min archival commits is 2 and max archival commits is 4 + // (either clean commits has to be > 4 or commits has to be greater than 4) + // and so, after 5th commit, 3 commits will be archived. + // 1,2,3,4,5,6 : after archival -> 1,5,6 (because, 2,3,4,5 and 6 are clean commits and are eligible for archival) + // after 7th and 8th commit no-op wrt archival. + Map cleanStats = new HashMap<>(); + cleanStats.put("p1", 1); + cleanStats.put("p2", 2); + for (int i = 1; i < 9; i++) { + if (i == 1) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 10); + } else if (i < 7) { + testTable.doClean("0000000" + i, cleanStats); + } else { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + } + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + if (i < 6) { + assertEquals(originalCommits, commitsAfterArchival); + } else if (i == 6) { + if (!enableMetadata) { + // 1,2,3,4,5,6 : after archival -> 1,5,6 (bcoz, 2,3,4,5 and 6 are clean commits and are eligible for archival) + List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000005", "00000006"), HoodieTimeline.CLEAN_ACTION)); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000003", "00000004"), HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival); + } else { + // with metadata enabled, archival in data table is fenced based on compaction in metadata table. Clean commits in data table will not trigger compaction in + // metadata table. 
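// --- Editorial aside (not part of this change) -------------------------------------
// The fencing described in the comment above can be made visible by loading the metadata
// table's own timeline and checking for a completed compaction, which shows up there as a
// commit instant (e.g. "00000007001" later in this file). This is a sketch assembled from
// builder calls that appear elsewhere in this diff, not the archiver's actual check.
HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder()
    .setConf(metaClient.getHadoopConf())
    .setBasePath(HoodieTableMetadata.getMetadataTableBasePath(basePath))
    .setLoadActiveTimelineOnLoad(true).build();
boolean metadataHasCompacted = metadataMetaClient.getActiveTimeline()
    .getCommitsTimeline().filterCompletedInstants().getInstants()
    .anyMatch(instant -> HoodieTimeline.COMMIT_ACTION.equals(instant.getAction()));
// While metadataHasCompacted stays false (clean actions alone do not advance metadata-table
// compaction), archival on the data table is expected to remain blocked, which is why the
// enableMetadata branch above sees no archived instants yet.
// -----------------------------------------------------------------------------------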
+ List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000002", "00000003", "00000004", "00000005", "00000006"), HoodieTimeline.CLEAN_ACTION)); + verifyArchival(getAllArchivedCommitInstants(Collections.emptyList(), HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival); + } + } else { + if (!enableMetadata) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + if (i == 7) { + // when i == 7 compaction in metadata table will be triggered and hence archival in datatable will kick in. + // 1,2,3,4,5,6 : after archival -> 1,5,6 (bcoz, 2,3,4,5 and 6 are clean commits and are eligible for archival) + List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001", "00000007"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000005", "00000006"), HoodieTimeline.CLEAN_ACTION)); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000003", "00000004"), HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival); + } else { + assertEquals(originalCommits, commitsAfterArchival); + } + } + } + } + } + + @Test + public void testArchiveRollbacksAndCleanTestTable() throws Exception { + int minArchiveCommits = 2; + int maxArchiveCommits = 9; + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, minArchiveCommits, maxArchiveCommits, 2); + + // trigger 1 commit to add lot of files so that future cleans can clean them up + testTable.doWriteOperation("00000001", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 20); + + Map partitionToFileDeleteCount = new HashMap<>(); + partitionToFileDeleteCount.put("p1", 1); + partitionToFileDeleteCount.put("p2", 1); + // we are triggering 10 clean commits. (1 is commit, 2 -> 11 is clean) + for (int i = 2; i <= (maxArchiveCommits + 2); i++) { + testTable.doClean((i > 9 ? ("000000") : ("0000000")) + i, partitionToFileDeleteCount); + } + + // we are triggering 7 commits and 7 rollbacks for the same + for (int i = 12; i <= (2 * maxArchiveCommits); i += 2) { + testTable.doWriteOperation("000000" + i, WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + testTable.doRollback("000000" + i, "000000" + (i + 1)); + } + + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + // out of 10 clean commits, 8 will be archived. 2 to 9. 10 and 11 will be active. + // wrt regular commits, there aren't 9 commits yet and so all of them will be active. 
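// --- Editorial aside (not part of this change) -------------------------------------
// A quick sanity check of the counts stated in the comment above, assuming that
// archiveCommitsWith(min, max) archives a given action down to `min` instants once more
// than `max` of them exist (the behaviour the tests in this class rely on). The variables
// are illustrative only.
int minToKeep = 2;
int maxToKeep = 9;
int cleanInstants = 10;    // cleans 00000002..00000011
int archivedCleans = cleanInstants > maxToKeep ? cleanInstants - minToKeep : 0;    // == 8 -> 02..09 archived, 10 and 11 stay
int commitInstants = 5;    // 00000001, 00000012, 00000014, 00000016, 00000018
int archivedCommits = commitInstants > maxToKeep ? commitInstants - minToKeep : 0; // == 0 -> all regular commits stay active
// -----------------------------------------------------------------------------------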
+ List expectedActiveInstants = new ArrayList<>(); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000010", "00000011"), HoodieTimeline.CLEAN_ACTION)); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000001", "00000012", "00000014", "00000016", "00000018"))); + expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000013", "00000015", "00000017", "00000019"), HoodieTimeline.ROLLBACK_ACTION)); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000003", "00000004", "00000005", "00000006", "00000007", "00000008", "00000009"), + HoodieTimeline.CLEAN_ACTION), expectedActiveInstants, commitsAfterArchival); + } + + @ParameterizedTest + @CsvSource({"true,true", "true,false", "false,true", "false,false"}) + public void testArchiveCompletedRollbackAndClean(boolean isEmpty, boolean enableMetadataTable) throws Exception { + init(); + int minInstantsToKeep = 2; + int maxInstantsToKeep = 10; + HoodieWriteConfig cfg = + HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).forTable("test-trip-table") + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(minInstantsToKeep, maxInstantsToKeep).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder().withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .build(); + metaClient = HoodieTableMetaClient.reload(metaClient); + + int startInstant = 1; + List expectedArchivedInstants = new ArrayList<>(); + for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant++) { + createCleanMetadata(startInstant + "", false, false, isEmpty || i % 2 == 0); + expectedArchivedInstants.add(new HoodieInstant(State.REQUESTED, HoodieTimeline.CLEAN_ACTION, startInstant + "")); + expectedArchivedInstants.add(new HoodieInstant(State.INFLIGHT, HoodieTimeline.CLEAN_ACTION, startInstant + "")); + expectedArchivedInstants.add(new HoodieInstant(State.COMPLETED, HoodieTimeline.CLEAN_ACTION, startInstant + "")); + } + + for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant += 2) { + createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false, isEmpty || i % 2 == 0); + expectedArchivedInstants.add(new HoodieInstant(State.REQUESTED, HoodieTimeline.ROLLBACK_ACTION, startInstant + "")); + expectedArchivedInstants.add(new HoodieInstant(State.INFLIGHT, HoodieTimeline.ROLLBACK_ACTION, startInstant + "")); + expectedArchivedInstants.add(new HoodieInstant(State.COMPLETED, HoodieTimeline.ROLLBACK_ACTION, startInstant + "")); + } + + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, Integer.toString(99)); + } + + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + + archiver.archiveIfRequired(context); + + Stream currentInstants = metaClient.getActiveTimeline().reload().getInstants(); + Map> actionInstantMap = currentInstants.collect(Collectors.groupingBy(HoodieInstant::getAction)); + + assertTrue(actionInstantMap.containsKey("clean"), "Clean Action key must be preset"); + assertEquals(minInstantsToKeep, actionInstantMap.get("clean").size(), "Should have min 
instant"); + + assertTrue(actionInstantMap.containsKey("rollback"), "Rollback Action key must be preset"); + assertEquals(minInstantsToKeep, actionInstantMap.get("rollback").size(), "Should have min instant"); + + // verify all expected instants are part of archived timeline + metaClient.getArchivedTimeline().loadCompletedInstantDetailsInMemory(); + HoodieInstant firstInstant = metaClient.reloadActiveTimeline().firstInstant().get(); + expectedArchivedInstants = expectedArchivedInstants.stream() + .filter(entry -> HoodieTimeline.compareTimestamps(entry.getTimestamp(), HoodieTimeline.LESSER_THAN, firstInstant.getTimestamp() + )).collect(Collectors.toList()); + expectedArchivedInstants.forEach(entry -> assertTrue(metaClient.getArchivedTimeline().containsInstant(entry))); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveInflightClean(boolean enableMetadataTable) throws Exception { + init(); + HoodieWriteConfig cfg = + HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).forTable("test-trip-table") + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 3).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .build(); + metaClient = HoodieTableMetaClient.reload(metaClient); + + createCleanMetadata("10", false); + createCleanMetadata("11", false); + HoodieInstant notArchivedInstant1 = createCleanMetadata("12", false); + HoodieInstant notArchivedInstant2 = createCleanMetadata("13", false); + HoodieInstant notArchivedInstant3 = createCleanMetadata("14", true); + + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "14"); + } + + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + + archiver.archiveIfRequired(context); + + List notArchivedInstants = metaClient.getActiveTimeline().reload().getInstants().collect(Collectors.toList()); + assertEquals(3, notArchivedInstants.size(), "Not archived instants should be 3"); + assertEquals(notArchivedInstants, Arrays.asList(notArchivedInstant1, notArchivedInstant2, notArchivedInstant3), ""); + } + + @Test + public void testArchiveTableWithMetadataTableCompaction() throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 4, 7); + + // min archival commits is 2 and max archival commits is 4. and so, after 5th commit, ideally archival should kick in. but max delta commits in metadata table is set to 6. and so + // archival will kick in only by 7th commit in datatable(1 commit for bootstrap + 6 commits from data table). + // and then 2nd compaction will take place + for (int i = 1; i < 6; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? 
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + // two more commits will trigger compaction in metadata table and will let archival move forward. + testTable.doWriteOperation("00000006", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + testTable.doWriteOperation("00000007", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + // before archival 1,2,3,4,5,6,7 + // after archival 6,7 + assertEquals(originalCommits.size() - commitsAfterArchival.size(), 5); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004", "00000005")), + getActiveCommitInstants(Arrays.asList("00000006", "00000007")), commitsAfterArchival); + + // 3 more commits, 6 and 7 will be archived. but will not move after 6 since compaction has to kick in metadata table. + testTable.doWriteOperation("00000008", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + testTable.doWriteOperation("00000009", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + + // ideally, this will archive commits 6, 7, 8 but since compaction in metadata is until 6, only 6 will get archived, + testTable.doWriteOperation("00000010", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits.size() - commitsAfterArchival.size(), 1); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004", "00000005", "00000006")), + getActiveCommitInstants(Arrays.asList("00000007", "00000008", "00000009", "00000010")), commitsAfterArchival); + + // and then 2nd compaction will take place at 12th commit + for (int i = 11; i < 14; i++) { + testTable.doWriteOperation("000000" + i, WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + assertEquals(originalCommits, commitsAfterArchival); + } + + // one more commit will trigger compaction in metadata table and will let archival move forward. 
+ testTable.doWriteOperation("00000014", WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // trigger archival + commitsList = archiveAndGetCommitsList(writeConfig); + originalCommits = commitsList.getKey(); + commitsAfterArchival = commitsList.getValue(); + // before archival 7,8,9,10,11,12,13,14 + // after archival 13,14 + assertEquals(originalCommits.size() - commitsAfterArchival.size(), 6); + verifyArchival(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000002", "00000003", "00000004", "00000005", "00000006", "00000007", "00000008", + "00000009", "00000010", "00000011", "00000012")), getActiveCommitInstants(Arrays.asList("00000013", "00000014")), commitsAfterArchival); + } + + @Test + public void testArchiveCommitsWithCompactionCommitInMetadataTableTimeline() throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 4, 20); + int startInstantTime = 100; + int numCommits = 15; + int numExpectedArchived = 6; // "100" till "105" should be archived in this case + + for (int i = startInstantTime; i < startInstantTime + numCommits; i++) { + HoodieTestDataGenerator.createCommitFile(basePath, Integer.toString(i), wrapperFs.getConf()); + } + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "105"); + + HoodieTable table = HoodieSparkTable.create(writeConfig, context); + HoodieTimelineArchiver archiveLog = new HoodieTimelineArchiver(writeConfig, table); + + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + assertEquals(numCommits, timeline.countInstants(), String.format("Loaded %d commits and the count should match", numCommits)); + assertTrue(archiveLog.archiveIfRequired(context)); + timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); + assertEquals(numCommits - numExpectedArchived, timeline.countInstants(), + "Since we have a compaction commit of 105 in metadata table timeline, we should never archive any commit after that"); + for (int i = startInstantTime + numExpectedArchived; i < startInstantTime + numCommits; i++) { + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, Integer.toString(i))), + String.format("Commit %d should not be archived", i)); + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchivalWithMaxDeltaCommitsGuaranteeForCompaction(boolean enableMetadata) throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig( + enableMetadata, 2, 4, 8, 1, HoodieTableType.MERGE_ON_READ); + + // When max archival commits is set to 4, even after 8 delta commits, since the number of delta + // commits is still smaller than 8, the archival should not kick in. + // The archival should only kick in after the 9th delta commit + // instant "00000001" to "00000009" + for (int i = 1; i < 10; i++) { + testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 + ? 
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + if (i <= 8) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + assertEquals(1, originalCommits.size() - commitsAfterArchival.size()); + assertFalse(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "00000001"))); + IntStream.range(2, 10).forEach(j -> + assertTrue(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j)))); + } + } + + testTable.doCompaction("00000010", Arrays.asList("p1", "p2")); + + // instant "00000011" to "00000019" + for (int i = 1; i < 10; i++) { + testTable.doWriteOperation("0000001" + i, WriteOperationType.UPSERT, i == 1 + ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // archival + Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List originalCommits = commitsList.getKey(); + List commitsAfterArchival = commitsList.getValue(); + + // first 9 delta commits before the completed compaction should be archived + IntStream.range(1, 10).forEach(j -> + assertFalse(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j)))); + + if (i == 1) { + assertEquals(8, originalCommits.size() - commitsAfterArchival.size()); + // instant from "00000011" should be in the active timeline + assertTrue(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000010"))); + assertTrue(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "00000011"))); + } else if (i < 8) { + assertEquals(originalCommits, commitsAfterArchival); + } else { + assertEquals(1, originalCommits.size() - commitsAfterArchival.size()); + assertFalse(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000010"))); + // i == 8 -> ["00000011", "00000018"] should be in the active timeline + // i == 9 -> ["00000012", "00000019"] should be in the active timeline + if (i == 9) { + assertFalse(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "00000011"))); + } + IntStream.range(i - 7, i + 1).forEach(j -> + assertTrue(commitsAfterArchival.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000001" + j)))); + } + } + } + + @Test + public void testArchivalAndCompactionInMetadataTable() throws Exception { + init(HoodieTableType.COPY_ON_WRITE); + // Test configs where metadata table has more aggressive archival configs than the compaction config + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 4).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true) + .withMaxNumDeltaCommitsBeforeCompaction(8) + .retainCommits(1).archiveCommitsWith(2, 4).build()) + 
.forTable("test-trip-table").build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + + HoodieTableMetaClient metadataTableMetaClient = HoodieTableMetaClient.builder() + .setConf(metaClient.getHadoopConf()) + .setBasePath(HoodieTableMetadata.getMetadataTableBasePath(basePath)) + .setLoadActiveTimelineOnLoad(true).build(); + + for (int i = 1; i <= 17; i++) { + testTable.doWriteOperation("000000" + String.format("%02d", i), WriteOperationType.UPSERT, + i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + // archival + archiveAndGetCommitsList(writeConfig); + + metadataTableMetaClient = HoodieTableMetaClient.reload(metadataTableMetaClient); + List metadataTableInstants = metadataTableMetaClient.getActiveTimeline() + .getCommitsTimeline().filterCompletedInstants().getInstants() + .collect(Collectors.toList()); + + if (i <= 7) { + // In the metadata table timeline, the first delta commit is "00000000000000" + // from metadata table init, delta commits "00000001" till "00000007" are added + // later on without archival or compaction + assertEquals(i + 1, metadataTableInstants.size()); + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "00000000000000"))); + IntStream.range(1, i + 1).forEach(j -> + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j)))); + } else if (i == 8) { + // i == 8 + // The instant "00000000000000" was archived since it's less than + // the earliest instant on the dataset active timeline, + // the dataset active timeline has instants of range [00000001 ~ 00000008] + // because when it does the archiving, no compaction instant on the + // metadata active timeline exists yet. 
+ assertEquals(9, metadataTableInstants.size()); + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000007001"))); + IntStream.range(1, i + 1).forEach(j -> + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j)))); + } else if (i <= 11) { + // In the metadata table timeline, the first delta commit is "00000007" + // because it equals with the earliest commit on the dataset timeline, after archival, + // delta commits "00000008" till "00000011" are added later on without archival or compaction + assertEquals(i - 5, metadataTableInstants.size()); + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000007001"))); + IntStream.range(7, i + 1).forEach(j -> + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, + "000000" + String.format("%02d", j))))); + } else if (i <= 14) { + // In the metadata table timeline, the first delta commit is "00000007001" + // from metadata table compaction, after archival, delta commits "00000008" + // till "00000014" are added later on without archival or compaction + assertEquals(i - 6, metadataTableInstants.size()); + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000007001"))); + IntStream.range(8, i + 1).forEach(j -> + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, + "000000" + String.format("%02d", j))))); + } else if (i == 15) { + // Only delta commits "00000008" till "00000015" are in the active timeline + assertEquals(8, metadataTableInstants.size()); + assertFalse(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000007001"))); + IntStream.range(8, 16).forEach(j -> + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, + "000000" + String.format("%02d", j))))); + } else if (i == 16) { + // i == 16 + // dataset timeline has commits "00000015" and "00000016", + // the metadata timeline has commits [00000008, 00000016] and "00000015001" + assertEquals(10, metadataTableInstants.size()); + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000015001"))); + IntStream.range(8, 17).forEach(j -> + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, + "000000" + String.format("%02d", j))))); + } else { + // i == 17 + // Only commits [00000015, 00000017] and "00000015001" are on the metadata timeline + assertEquals(4, metadataTableInstants.size()); + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000015001"))); + IntStream.range(15, 18).forEach(j -> + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, + "000000" + String.format("%02d", j))))); + } + } + } + + private Pair, List> archiveAndGetCommitsList(HoodieWriteConfig writeConfig) throws IOException { + metaClient.reloadActiveTimeline(); + HoodieTimeline timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); + List originalCommits = timeline.getInstants().collect(Collectors.toList()); + HoodieTable table = 
HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + archiver.archiveIfRequired(context); + timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); + List commitsAfterArchival = timeline.getInstants().collect(Collectors.toList()); + return Pair.of(originalCommits, commitsAfterArchival); + } + + private void verifyArchival(List expectedArchivedInstants, List expectedActiveInstants, List commitsAfterArchival) { + Collections.sort(expectedActiveInstants, Comparator.comparing(HoodieInstant::getTimestamp)); + Collections.sort(commitsAfterArchival, Comparator.comparing(HoodieInstant::getTimestamp)); + assertEquals(expectedActiveInstants, commitsAfterArchival); + expectedArchivedInstants.forEach(entry -> assertFalse(commitsAfterArchival.contains(entry))); + HoodieArchivedTimeline archivedTimeline = new HoodieArchivedTimeline(metaClient); + List actualArchivedInstants = archivedTimeline.getInstants().collect(Collectors.toList()); + Collections.sort(actualArchivedInstants, Comparator.comparing(HoodieInstant::getTimestamp)); + Collections.sort(expectedArchivedInstants, Comparator.comparing(HoodieInstant::getTimestamp)); + assertEquals(actualArchivedInstants, expectedArchivedInstants); + + HoodieTimeline timeline = metaClient.getActiveTimeline(); + expectedArchivedInstants.forEach(entry -> { + // check safety + if (!entry.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) { + assertTrue(timeline.containsOrBeforeTimelineStarts(entry.getTimestamp()), "Archived commits should always be safe"); + } + } + ); + } + + private List getArchivedInstants(HoodieInstant instant) { + List instants = new ArrayList<>(); + if (instant.getAction().equals(HoodieTimeline.COMMIT_ACTION) || instant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION) + || instant.getAction().equals(HoodieTimeline.CLEAN_ACTION) || instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) { + instants.add(new HoodieInstant(State.REQUESTED, instant.getAction(), instant.getTimestamp())); + } + instants.add(new HoodieInstant(State.INFLIGHT, instant.getAction(), instant.getTimestamp())); + instants.add(new HoodieInstant(State.COMPLETED, instant.getAction(), instant.getTimestamp())); + return instants; + } + + private List getAllArchivedCommitInstants(List commitTimes) { + return getAllArchivedCommitInstants(commitTimes, HoodieTimeline.COMMIT_ACTION); + } + + private List getAllArchivedCommitInstants(List commitTimes, String action) { + List allInstants = new ArrayList<>(); + commitTimes.forEach(entry -> allInstants.addAll(getArchivedInstants(new HoodieInstant(State.COMPLETED, action, entry)))); + return allInstants; + } + + private List getActiveCommitInstants(List commitTimes) { + return getActiveCommitInstants(commitTimes, HoodieTimeline.COMMIT_ACTION); + } + + private List getActiveSavepointedCommitInstants(List commitTimes) { + return getActiveCommitInstants(commitTimes, HoodieTimeline.SAVEPOINT_ACTION); + } + + private List getActiveCommitInstants(List commitTimes, String action) { + List allInstants = new ArrayList<>(); + commitTimes.forEach(entry -> allInstants.add(new HoodieInstant(State.COMPLETED, action, entry))); + return allInstants; + } + + private void createCommitAndRollbackFile(String commitToRollback, String rollbackTIme, boolean isRollbackInflight) throws IOException { + createCommitAndRollbackFile(commitToRollback, rollbackTIme, isRollbackInflight, false); + } + + private void 
createCommitAndRollbackFile(String commitToRollback, String rollbackTIme, boolean isRollbackInflight, boolean isEmpty) throws IOException { + HoodieTestDataGenerator.createCommitFile(basePath, commitToRollback, wrapperFs.getConf()); + createRollbackMetadata(rollbackTIme, commitToRollback, isRollbackInflight, isEmpty); + } + + private HoodieInstant createRollbackMetadata(String rollbackTime, String commitToRollback, boolean inflight, boolean isEmpty) throws IOException { + if (inflight) { + HoodieTestTable.of(metaClient).addInflightRollback(rollbackTime); + } else { + HoodieRollbackMetadata hoodieRollbackMetadata = HoodieRollbackMetadata.newBuilder() + .setVersion(1) + .setStartRollbackTime(rollbackTime) + .setTotalFilesDeleted(1) + .setTimeTakenInMillis(1000) + .setCommitsRollback(Collections.singletonList(commitToRollback)) + .setPartitionMetadata(Collections.emptyMap()) + .setInstantsRollback(Collections.emptyList()) + .build(); + HoodieTestTable.of(metaClient).addRollback(rollbackTime, hoodieRollbackMetadata, isEmpty); + } + return new HoodieInstant(inflight, "rollback", rollbackTime); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestSparkIOUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestSparkIOUtils.java index ffbf6d103b13b..7490a4d337a82 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestSparkIOUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestSparkIOUtils.java @@ -25,10 +25,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import static org.apache.hudi.config.HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION; -import static org.apache.hudi.config.HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE; -import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP; -import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_MERGE_PROP; +import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_COMPACTION; +import static org.apache.hudi.config.HoodieMemoryConfig.MAX_MEMORY_FRACTION_FOR_MERGE; import static org.junit.jupiter.api.Assertions.assertEquals; public class TestSparkIOUtils { @@ -47,8 +45,8 @@ public void testMaxMemoryPerPartitionMergeWithMaxSizeDefined() { HoodieMemoryConfig memoryConfig = HoodieMemoryConfig.newBuilder().withMaxMemoryMaxSize(mergeMaxSize, compactionMaxSize).build(); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(path).withMemoryConfig(memoryConfig).build(); - assertEquals(mergeMaxSize, IOUtils.getMaxMemoryPerPartitionMerge(contextSupplier, config.getProps())); - assertEquals(compactionMaxSize, IOUtils.getMaxMemoryPerCompaction(contextSupplier, config.getProps())); + assertEquals(mergeMaxSize, IOUtils.getMaxMemoryPerPartitionMerge(contextSupplier, config)); + assertEquals(compactionMaxSize, IOUtils.getMaxMemoryPerCompaction(contextSupplier, config)); } @Test @@ -57,13 +55,13 @@ public void testMaxMemoryPerPartitionMergeInDefault() { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(path).build(); - String compactionFraction = config.getProps().getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_COMPACTION); + String compactionFraction = config.getProps().getProperty(MAX_MEMORY_FRACTION_FOR_COMPACTION.key(), MAX_MEMORY_FRACTION_FOR_COMPACTION.defaultValue()); long compactionMaxSize = IOUtils.getMaxMemoryAllowedForMerge(contextSupplier, 
compactionFraction); - String mergeFraction = config.getProps().getProperty(MAX_MEMORY_FRACTION_FOR_MERGE_PROP, DEFAULT_MAX_MEMORY_FRACTION_FOR_MERGE); + String mergeFraction = config.getProps().getProperty(MAX_MEMORY_FRACTION_FOR_MERGE.key(), MAX_MEMORY_FRACTION_FOR_MERGE.defaultValue()); long mergeMaxSize = IOUtils.getMaxMemoryAllowedForMerge(contextSupplier, mergeFraction); - assertEquals(mergeMaxSize, IOUtils.getMaxMemoryPerPartitionMerge(contextSupplier, config.getProps())); - assertEquals(compactionMaxSize, IOUtils.getMaxMemoryPerCompaction(contextSupplier, config.getProps())); + assertEquals(mergeMaxSize, IOUtils.getMaxMemoryPerPartitionMerge(contextSupplier, config)); + assertEquals(compactionMaxSize, IOUtils.getMaxMemoryPerCompaction(contextSupplier, config)); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieFileWriterFactory.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieFileWriterFactory.java index 26f431a0e051e..66016305d7ad3 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieFileWriterFactory.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieFileWriterFactory.java @@ -49,13 +49,20 @@ public void testGetFileWriter() throws IOException { SparkTaskContextSupplier supplier = new SparkTaskContextSupplier(); HoodieFileWriter parquetWriter = HoodieFileWriterFactory.getFileWriter(instantTime, parquetPath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier); - assertTrue(parquetWriter instanceof HoodieParquetWriter); + assertTrue(parquetWriter instanceof HoodieAvroParquetWriter); + // hfile format. final Path hfilePath = new Path(basePath + "/partition/path/f1_1-0-1_000.hfile"); HoodieFileWriter hfileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, hfilePath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier); assertTrue(hfileWriter instanceof HoodieHFileWriter); + // orc file format. + final Path orcPath = new Path(basePath + "/partition/path/f1_1-0-1_000.orc"); + HoodieFileWriter orcFileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, + orcPath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier); + assertTrue(orcFileWriter instanceof HoodieOrcWriter); + // other file format exception. final Path logPath = new Path(basePath + "/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1"); final Throwable thrown = assertThrows(UnsupportedOperationException.class, () -> { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieInternalRowParquetWriter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieInternalRowParquetWriter.java deleted file mode 100644 index 2b344db7e1322..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/TestHoodieInternalRowParquetWriter.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.io.storage; - -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.testutils.HoodieClientTestHarness; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.testutils.SparkDatasetTestUtils; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.util.List; -import java.util.Random; -import java.util.UUID; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -/** - * Unit tests {@link HoodieInternalRowParquetWriter}. - */ -public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness { - - private static final Random RANDOM = new Random(); - - @BeforeEach - public void setUp() throws Exception { - initSparkContexts("TestHoodieInternalRowParquetWriter"); - initPath(); - initFileSystem(); - initTestDataGenerator(); - initMetaClient(); - } - - @AfterEach - public void tearDown() throws Exception { - cleanupResources(); - } - - @Test - public void endToEndTest() throws Exception { - HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build(); - for (int i = 0; i < 5; i++) { - // init write support and parquet config - HoodieRowParquetWriteSupport writeSupport = getWriteSupport(cfg, hadoopConf); - HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport, - CompressionCodecName.SNAPPY, cfg.getParquetBlockSize(), cfg.getParquetPageSize(), cfg.getParquetMaxFileSize(), - writeSupport.getHadoopConf(), cfg.getParquetCompressionRatio()); - - // prepare path - String fileId = UUID.randomUUID().toString(); - Path filePath = new Path(basePath + "/" + fileId); - String partitionPath = HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; - metaClient.getFs().mkdirs(new Path(basePath)); - - // init writer - HoodieInternalRowParquetWriter writer = new HoodieInternalRowParquetWriter(filePath, parquetConfig); - - // generate input - int size = 10 + RANDOM.nextInt(100); - // Generate inputs - Dataset inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size, partitionPath, false); - List internalRows = SparkDatasetTestUtils.toInternalRows(inputRows, SparkDatasetTestUtils.ENCODER); - - // issue writes - for (InternalRow internalRow : internalRows) { - writer.write(internalRow); - } - - // close the writer - writer.close(); - - // verify rows - Dataset result = sqlContext.read().parquet(basePath); - assertEquals(0, inputRows.except(result).count()); - } - } - - private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig writeConfig, Configuration hadoopConf) { - BloomFilter filter = BloomFilterFactory.createBloomFilter( - writeConfig.getBloomFilterNumEntries(), - 
writeConfig.getBloomFilterFPP(), - writeConfig.getDynamicBloomFilterMaxNumEntries(), - writeConfig.getBloomFilterType()); - return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, filter); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java new file mode 100644 index 0000000000000..dce0e2fad5910 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.storage.HoodieParquetConfig; +import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.apache.hudi.testutils.SparkDatasetTestUtils; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Unit tests {@link HoodieInternalRowParquetWriter}. 
+ */ +public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness { + + @BeforeEach + public void setUp() throws Exception { + initSparkContexts("TestHoodieInternalRowParquetWriter"); + initPath(); + initFileSystem(); + initTestDataGenerator(); + initMetaClient(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testProperWriting(boolean parquetWriteLegacyFormatEnabled) throws Exception { + // Generate inputs + Dataset inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, 100, + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, false); + StructType schema = inputRows.schema(); + + List rows = SparkDatasetTestUtils.toInternalRows(inputRows, SparkDatasetTestUtils.ENCODER); + + HoodieWriteConfig.Builder writeConfigBuilder = + SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort); + + HoodieRowParquetWriteSupport writeSupport = getWriteSupport(writeConfigBuilder, hadoopConf, parquetWriteLegacyFormatEnabled); + HoodieWriteConfig cfg = writeConfigBuilder.build(); + HoodieParquetConfig parquetConfig = new HoodieParquetConfig<>(writeSupport, + CompressionCodecName.SNAPPY, cfg.getParquetBlockSize(), cfg.getParquetPageSize(), cfg.getParquetMaxFileSize(), + writeSupport.getHadoopConf(), cfg.getParquetCompressionRatio(), cfg.parquetDictionaryEnabled()); + + Path filePath = new Path(basePath + "/internal_row_writer.parquet"); + + try (HoodieInternalRowParquetWriter writer = new HoodieInternalRowParquetWriter(filePath, parquetConfig)) { + for (InternalRow row : rows) { + writer.writeRow(row.getUTF8String(schema.fieldIndex("record_key")), row); + } + } + + // Step 1: Verify rows written correctly + Dataset result = sqlContext.read().parquet(basePath); + assertEquals(0, inputRows.except(result).count()); + + // Step 2: Assert Parquet metadata was written appropriately + List recordKeys = + rows.stream().map(r -> r.getString(schema.fieldIndex("record_key"))).collect(Collectors.toList()); + + String minKey = recordKeys.stream().min(Comparator.naturalOrder()).get(); + String maxKey = recordKeys.stream().max(Comparator.naturalOrder()).get(); + + FileMetaData parquetMetadata = ParquetUtils.readMetadata(hadoopConf, filePath).getFileMetaData(); + + Map extraMetadata = parquetMetadata.getKeyValueMetaData(); + + assertEquals(extraMetadata.get(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER), minKey); + assertEquals(extraMetadata.get(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER), maxKey); + assertEquals(extraMetadata.get(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE), BloomFilterTypeCode.DYNAMIC_V0.name()); + + // Step 3: Make sure Bloom Filter contains all the record keys + BloomFilter bloomFilter = new ParquetUtils().readBloomFilterFromMetadata(hadoopConf, filePath); + recordKeys.forEach(recordKey -> { + assertTrue(bloomFilter.mightContain(recordKey)); + }); + } + + private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig.Builder writeConfigBuilder, Configuration hadoopConf, boolean parquetWriteLegacyFormatEnabled) { + writeConfigBuilder.withStorageConfig(HoodieStorageConfig.newBuilder().parquetWriteLegacyFormat(String.valueOf(parquetWriteLegacyFormatEnabled)).build()); + HoodieWriteConfig writeConfig = writeConfigBuilder.build(); + BloomFilter filter = BloomFilterFactory.createBloomFilter( + writeConfig.getBloomFilterNumEntries(), + writeConfig.getBloomFilterFPP(), + 
writeConfig.getDynamicBloomFilterMaxNumEntries(), + writeConfig.getBloomFilterType()); + return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, Option.of(filter), writeConfig); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java new file mode 100644 index 0000000000000..ad73a256a6175 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage.row; + +import org.apache.hudi.client.HoodieInternalWriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.exception.TableNotFoundException; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.apache.hudi.testutils.SparkDatasetTestUtils; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +/** + * Unit tests {@link HoodieRowCreateHandle}. 
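+ * <p>The handle lifecycle exercised by these tests, in outline (a condensed sketch of the test code
+ * below; taskPartitionId, taskId and taskEpochId are illustrative placeholder names for the random
+ * values the tests pass in):
+ * <pre>{@code
+ * HoodieWriteConfig config = SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort).build();
+ * HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
+ * HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, config, partitionPath, fileId,
+ *     instantTime, taskPartitionId, taskId, taskEpochId, SparkDatasetTestUtils.STRUCT_TYPE);
+ * for (InternalRow row : rows) {
+ *   handle.write(row);                                      // write each row into the new file slice
+ * }
+ * HoodieInternalWriteStatus writeStatus = handle.close();   // flush the file and collect write stats
+ * }</pre>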
+ */ +@SuppressWarnings("checkstyle:LineLength") +public class TestHoodieRowCreateHandle extends HoodieClientTestHarness { + + private static final Random RANDOM = new Random(); + + @BeforeEach + public void setUp() throws Exception { + initSparkContexts("TestHoodieRowCreateHandle"); + initPath(); + initFileSystem(); + initTestDataGenerator(); + initMetaClient(); + initTimelineService(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + @ParameterizedTest + @ValueSource(booleans = { true, false }) + public void testRowCreateHandle(boolean populateMetaFields) throws Exception { + // init config and table + HoodieWriteConfig config = SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort) + .withPopulateMetaFields(populateMetaFields) + .build(); + + HoodieTable table = HoodieSparkTable.create(config, context, metaClient); + List fileNames = new ArrayList<>(); + List fileAbsPaths = new ArrayList<>(); + + Dataset totalInputRows = null; + // one round per partition + for (int i = 0; i < 5; i++) { + String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[i % 3]; + + // init some args + String fileId = UUID.randomUUID().toString(); + String instantTime = "000"; + + HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, config, partitionPath, fileId, instantTime, + RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE); + int size = 10 + RANDOM.nextInt(1000); + // Generate inputs + Dataset inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size, partitionPath, false); + if (totalInputRows == null) { + totalInputRows = inputRows; + } else { + totalInputRows = totalInputRows.union(inputRows); + } + + // issue writes + HoodieInternalWriteStatus writeStatus = writeAndGetWriteStatus(inputRows, handle); + + fileAbsPaths.add(basePath + "/" + writeStatus.getStat().getPath()); + fileNames.add(handle.getFileName()); + // verify output + assertOutput(writeStatus, size, fileId, partitionPath, instantTime, totalInputRows, fileNames, fileAbsPaths, populateMetaFields); + } + } + + /** + * Issue some corrupted or wrong schematized InternalRow after few valid InternalRows so that global error is thrown. write batch 1 of valid records write batch 2 of invalid records Global Error + * should be thrown. 
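+ * <p>In outline (a condensed, illustrative sketch of the test body that follows):
+ * <pre>{@code
+ * try {
+ *   for (InternalRow row : rows) {   // valid rows first, then the corrupted ones
+ *     handle.write(row);
+ *   }
+ *   fail("Should have failed");
+ * } catch (Throwable t) {
+ *   // expected: the corrupted row aborts the write
+ * }
+ * HoodieInternalWriteStatus writeStatus = handle.close();
+ * assertNotNull(writeStatus.getGlobalError());   // the failure surfaces as a global error on the status
+ * }</pre>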
+ */ + @Test + public void testGlobalFailure() throws Exception { + // init config and table + HoodieWriteConfig cfg = + SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort).build(); + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0]; + + // init some args + String fileId = UUID.randomUUID().toString(); + String instantTime = "000"; + + HoodieRowCreateHandle handle = + new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE); + int size = 10 + RANDOM.nextInt(1000); + int totalFailures = 5; + // Generate first batch of valid rows + Dataset inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size / 2, partitionPath, false); + List internalRows = SparkDatasetTestUtils.toInternalRows(inputRows, SparkDatasetTestUtils.ENCODER); + + // generate some failures rows + for (int i = 0; i < totalFailures; i++) { + internalRows.add(SparkDatasetTestUtils.getInternalRowWithError(partitionPath)); + } + + // generate 2nd batch of valid rows + Dataset inputRows2 = SparkDatasetTestUtils.getRandomRows(sqlContext, size / 2, partitionPath, false); + internalRows.addAll(SparkDatasetTestUtils.toInternalRows(inputRows2, SparkDatasetTestUtils.ENCODER)); + + // issue writes + try { + for (InternalRow internalRow : internalRows) { + handle.write(internalRow); + } + fail("Should have failed"); + } catch (Throwable e) { + // expected + } + // close the create handle + HoodieInternalWriteStatus writeStatus = handle.close(); + + List fileNames = new ArrayList<>(); + fileNames.add(handle.getFileName()); + // verify write status + assertNotNull(writeStatus.getGlobalError()); + assertTrue(writeStatus.getGlobalError().getMessage().contains("java.lang.String cannot be cast to org.apache.spark.unsafe.types.UTF8String")); + assertEquals(writeStatus.getFileId(), fileId); + assertEquals(writeStatus.getPartitionPath(), partitionPath); + + // verify rows + Dataset result = sqlContext.read().parquet(basePath + "/" + partitionPath); + // passing only first batch of inputRows since after first batch global error would have been thrown + assertRows(inputRows, result, instantTime, fileNames, true); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInstantiationFailure(boolean enableMetadataTable) { + // init config and table + HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort) + .withPath("/dummypath/abc/") + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .build(); + + try { + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE); + fail("Should have thrown exception"); + } catch (HoodieInsertException ioe) { + // expected without metadata table + if (enableMetadataTable) { + fail("Should have thrown TableNotFoundException"); + } + } catch (TableNotFoundException e) { + // expected with metadata table + if (!enableMetadataTable) { + fail("Should have thrown HoodieInsertException"); + } + } + } + + private HoodieInternalWriteStatus writeAndGetWriteStatus(Dataset inputRows, HoodieRowCreateHandle handle) + throws Exception { + List internalRows = 
SparkDatasetTestUtils.toInternalRows(inputRows, SparkDatasetTestUtils.ENCODER); + // issue writes + for (InternalRow internalRow : internalRows) { + handle.write(internalRow); + } + // close the create handle + return handle.close(); + } + + private void assertOutput(HoodieInternalWriteStatus writeStatus, int size, String fileId, String partitionPath, + String instantTime, Dataset inputRows, List filenames, List fileAbsPaths, boolean populateMetaFields) { + assertEquals(writeStatus.getPartitionPath(), partitionPath); + assertEquals(writeStatus.getTotalRecords(), size); + assertEquals(writeStatus.getFailedRowsSize(), 0); + assertEquals(writeStatus.getTotalErrorRecords(), 0); + assertFalse(writeStatus.hasErrors()); + assertNull(writeStatus.getGlobalError()); + assertEquals(writeStatus.getFileId(), fileId); + HoodieWriteStat writeStat = writeStatus.getStat(); + assertEquals(size, writeStat.getNumInserts()); + assertEquals(size, writeStat.getNumWrites()); + assertEquals(fileId, writeStat.getFileId()); + assertEquals(partitionPath, writeStat.getPartitionPath()); + assertEquals(0, writeStat.getNumDeletes()); + assertEquals(0, writeStat.getNumUpdateWrites()); + assertEquals(0, writeStat.getTotalWriteErrors()); + + // verify rows + Dataset result = sqlContext.read().parquet(fileAbsPaths.toArray(new String[0])); + assertRows(inputRows, result, instantTime, filenames, populateMetaFields); + } + + private void assertRows(Dataset expectedRows, Dataset actualRows, String instantTime, List filenames, boolean populateMetaFields) { + // verify 3 meta fields that are filled in within create handle + actualRows.collectAsList().forEach(entry -> { + String commitTime = entry.getString(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)); + String fileName = entry.getString(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.FILENAME_METADATA_FIELD)); + String seqId = entry.getString(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)); + + if (populateMetaFields) { + assertEquals(instantTime, commitTime); + assertFalse(StringUtils.isNullOrEmpty(seqId)); + assertTrue(filenames.contains(fileName)); + } else { + assertEquals("", commitTime); + assertEquals("", seqId); + assertEquals("", fileName); + } + }); + + // after trimming 2 of the meta fields, rest of the fields should match + Dataset trimmedExpected = expectedRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD); + Dataset trimmedActual = actualRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD); + assertEquals(0, trimmedActual.except(trimmedExpected).count()); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java deleted file mode 100644 index 54f4ffaef9dd2..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.keygen; - -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.exception.HoodieKeyException; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.hudi.testutils.KeyGeneratorTestUtilities; -import org.apache.spark.sql.Row; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import static junit.framework.TestCase.assertEquals; - -public class TestComplexKeyGenerator extends KeyGeneratorTestUtilities { - - private TypedProperties getCommonProps(boolean getComplexRecordKey) { - TypedProperties properties = new TypedProperties(); - if (getComplexRecordKey) { - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key, pii_col"); - } else { - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key"); - } - properties.put(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY, "true"); - return properties; - } - - private TypedProperties getPropertiesWithoutPartitionPathProp() { - return getCommonProps(false); - } - - private TypedProperties getPropertiesWithoutRecordKeyProp() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - return properties; - } - - private TypedProperties getWrongRecordKeyFieldProps() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_wrong_key"); - return properties; - } - - private TypedProperties getProps() { - TypedProperties properties = getCommonProps(true); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp,ts_ms"); - return properties; - } - - @Test - public void testNullPartitionPathFields() { - Assertions.assertThrows(IllegalArgumentException.class, () -> new ComplexKeyGenerator(getPropertiesWithoutPartitionPathProp())); - } - - @Test - public void testNullRecordKeyFields() { - Assertions.assertThrows(IllegalArgumentException.class, () -> new ComplexKeyGenerator(getPropertiesWithoutRecordKeyProp())); - } - - @Test - public void testWrongRecordKeyField() { - ComplexKeyGenerator keyGenerator = new ComplexKeyGenerator(getWrongRecordKeyFieldProps()); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.getRecordKey(getRecord())); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.buildFieldPositionMapIfNeeded(KeyGeneratorTestUtilities.structType)); - } - - @Test - public void testHappyFlow() { - ComplexKeyGenerator keyGenerator = new ComplexKeyGenerator(getProps()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(record); - Assertions.assertEquals(key.getRecordKey(), 
"_row_key:key1,pii_col:pi"); - Assertions.assertEquals(key.getPartitionPath(), "timestamp=4357686/ts_ms=2020-03-21"); - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "_row_key:key1,pii_col:pi"); - Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686/ts_ms=2020-03-21"); - } - - @Test - public void testSingleValueKeyGenerator() { - TypedProperties properties = new TypedProperties(); - properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key"); - properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - ComplexKeyGenerator compositeKeyGenerator = new ComplexKeyGenerator(properties); - assertEquals(compositeKeyGenerator.getRecordKeyFields().size(), 1); - assertEquals(compositeKeyGenerator.getPartitionPathFields().size(), 1); - HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); - GenericRecord record = dataGenerator.generateGenericRecords(1).get(0); - String rowKey = record.get("_row_key").toString(); - String partitionPath = record.get("timestamp").toString(); - HoodieKey hoodieKey = compositeKeyGenerator.getKey(record); - assertEquals("_row_key:" + rowKey, hoodieKey.getRecordKey()); - assertEquals(partitionPath, hoodieKey.getPartitionPath()); - } - - @Test - public void testMultipleValueKeyGenerator() { - TypedProperties properties = new TypedProperties(); - properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key,timestamp"); - properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "rider,driver"); - ComplexKeyGenerator compositeKeyGenerator = new ComplexKeyGenerator(properties); - assertEquals(compositeKeyGenerator.getRecordKeyFields().size(), 2); - assertEquals(compositeKeyGenerator.getPartitionPathFields().size(), 2); - HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); - GenericRecord record = dataGenerator.generateGenericRecords(1).get(0); - String rowKey = - "_row_key" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("_row_key").toString() + "," - + "timestamp" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("timestamp").toString(); - String partitionPath = record.get("rider").toString() + "/" + record.get("driver").toString(); - HoodieKey hoodieKey = compositeKeyGenerator.getKey(record); - assertEquals(rowKey, hoodieKey.getRecordKey()); - assertEquals(partitionPath, hoodieKey.getPartitionPath()); - } - - @Test - public void testMultipleValueKeyGeneratorNonPartitioned() { - TypedProperties properties = new TypedProperties(); - properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key,timestamp"); - properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, ""); - ComplexKeyGenerator compositeKeyGenerator = new ComplexKeyGenerator(properties); - assertEquals(compositeKeyGenerator.getRecordKeyFields().size(), 2); - assertEquals(compositeKeyGenerator.getPartitionPathFields().size(), 0); - HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); - GenericRecord record = dataGenerator.generateGenericRecords(1).get(0); - String rowKey = - "_row_key" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("_row_key").toString() + "," - + "timestamp" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("timestamp").toString(); - String partitionPath = ""; - HoodieKey hoodieKey = compositeKeyGenerator.getKey(record); - assertEquals(rowKey, hoodieKey.getRecordKey()); - 
assertEquals(partitionPath, hoodieKey.getPartitionPath()); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java deleted file mode 100644 index dc30b932e9d2b..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.keygen; - -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.config.TypedProperties; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.hudi.testutils.KeyGeneratorTestUtilities; -import org.apache.spark.sql.Row; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -public class TestCustomKeyGenerator extends KeyGeneratorTestUtilities { - - private TypedProperties getCommonProps(boolean getComplexRecordKey) { - TypedProperties properties = new TypedProperties(); - if (getComplexRecordKey) { - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key, pii_col"); - } else { - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key"); - } - properties.put(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY, "true"); - return properties; - } - - private TypedProperties getPropertiesForSimpleKeyGen() { - TypedProperties properties = getCommonProps(false); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp:simple"); - return properties; - } - - private TypedProperties getImproperPartitionFieldFormatProp() { - TypedProperties properties = getCommonProps(false); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - return properties; - } - - private TypedProperties getInvalidPartitionKeyTypeProps() { - TypedProperties properties = getCommonProps(false); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp:dummy"); - return properties; - } - - private TypedProperties getComplexRecordKeyWithSimplePartitionProps() { - TypedProperties properties = getCommonProps(true); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp:simple"); - return properties; - } - - private TypedProperties getComplexRecordKeyAndPartitionPathProps() { - TypedProperties properties = getCommonProps(true); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp:simple,ts_ms:timestamp"); - populateNecessaryPropsForTimestampBasedKeyGen(properties); - return properties; - } - - private TypedProperties getPropsWithoutRecordKeyFieldProps() { - TypedProperties properties = new 
TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp:simple"); - return properties; - } - - private void populateNecessaryPropsForTimestampBasedKeyGen(TypedProperties properties) { - properties.put("hoodie.deltastreamer.keygen.timebased.timestamp.type", "DATE_STRING"); - properties.put("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd"); - properties.put("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyyMMdd"); - } - - private TypedProperties getPropertiesForTimestampBasedKeyGen() { - TypedProperties properties = getCommonProps(false); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "ts_ms:timestamp"); - populateNecessaryPropsForTimestampBasedKeyGen(properties); - return properties; - } - - private TypedProperties getPropertiesForNonPartitionedKeyGen() { - TypedProperties properties = getCommonProps(false); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, ""); - return properties; - } - - @Test - public void testSimpleKeyGenerator() { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getPropertiesForSimpleKeyGen()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(record); - Assertions.assertEquals(key.getRecordKey(), "key1"); - Assertions.assertEquals(key.getPartitionPath(), "timestamp=4357686"); - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1"); - Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686"); - } - - @Test - public void testTimestampBasedKeyGenerator() { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getPropertiesForTimestampBasedKeyGen()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(record); - Assertions.assertEquals(key.getRecordKey(), "key1"); - Assertions.assertEquals(key.getPartitionPath(), "ts_ms=20200321"); - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1"); - Assertions.assertEquals(keyGenerator.getPartitionPath(row), "ts_ms=20200321"); - } - - @Test - public void testNonPartitionedKeyGenerator() { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getPropertiesForNonPartitionedKeyGen()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(record); - Assertions.assertEquals(key.getRecordKey(), "key1"); - Assertions.assertTrue(key.getPartitionPath().isEmpty()); - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1"); - Assertions.assertTrue(keyGenerator.getPartitionPath(row).isEmpty()); - } - - @Test - public void testInvalidPartitionKeyType() { - try { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getInvalidPartitionKeyTypeProps()); - keyGenerator.getKey(getRecord()); - Assertions.fail("should fail when invalid PartitionKeyType is provided!"); - } catch (Exception e) { - Assertions.assertTrue(e.getMessage().contains("No enum constant org.apache.hudi.keygen.CustomAvroKeyGenerator.PartitionKeyType.DUMMY")); - } - - try { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getInvalidPartitionKeyTypeProps()); - GenericRecord record = getRecord(); - Row row = KeyGeneratorTestUtilities.getRow(record); - keyGenerator.getPartitionPath(row); - Assertions.fail("should fail when invalid PartitionKeyType is provided!"); - } catch (Exception e) { - Assertions.assertTrue(e.getMessage().contains("No enum constant 
org.apache.hudi.keygen.CustomAvroKeyGenerator.PartitionKeyType.DUMMY")); - } - } - - @Test - public void testNoRecordKeyFieldProp() { - try { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getPropsWithoutRecordKeyFieldProps()); - keyGenerator.getKey(getRecord()); - Assertions.fail("should fail when record key field is not provided!"); - } catch (Exception e) { - Assertions.assertTrue(e.getMessage().contains("Property hoodie.datasource.write.recordkey.field not found")); - } - - try { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getPropsWithoutRecordKeyFieldProps()); - GenericRecord record = getRecord(); - Row row = KeyGeneratorTestUtilities.getRow(record); - keyGenerator.getRecordKey(row); - Assertions.fail("should fail when record key field is not provided!"); - } catch (Exception e) { - Assertions.assertTrue(e.getMessage().contains("Property hoodie.datasource.write.recordkey.field not found")); - } - } - - @Test - public void testPartitionFieldsInImproperFormat() { - try { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getImproperPartitionFieldFormatProp()); - keyGenerator.getKey(getRecord()); - Assertions.fail("should fail when partition key field is provided in improper format!"); - } catch (Exception e) { - Assertions.assertTrue(e.getMessage().contains("Unable to find field names for partition path in proper format")); - } - - try { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getImproperPartitionFieldFormatProp()); - GenericRecord record = getRecord(); - Row row = KeyGeneratorTestUtilities.getRow(record); - keyGenerator.getPartitionPath(row); - Assertions.fail("should fail when partition key field is provided in improper format!"); - } catch (Exception e) { - Assertions.assertTrue(e.getMessage().contains("Unable to find field names for partition path in proper format")); - } - } - - @Test - public void testComplexRecordKeyWithSimplePartitionPath() { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getComplexRecordKeyWithSimplePartitionProps()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(record); - Assertions.assertEquals(key.getRecordKey(), "_row_key:key1,pii_col:pi"); - Assertions.assertEquals(key.getPartitionPath(), "timestamp=4357686"); - - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "_row_key:key1,pii_col:pi"); - Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686"); - } - - @Test - public void testComplexRecordKeysWithComplexPartitionPath() { - BuiltinKeyGenerator keyGenerator = new CustomKeyGenerator(getComplexRecordKeyAndPartitionPathProps()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(record); - Assertions.assertEquals(key.getRecordKey(), "_row_key:key1,pii_col:pi"); - Assertions.assertEquals(key.getPartitionPath(), "timestamp=4357686/ts_ms=20200321"); - - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "_row_key:key1,pii_col:pi"); - Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686/ts_ms=20200321"); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java deleted file mode 100644 index 078101b4a6317..0000000000000 --- 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.keygen; - -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.exception.HoodieKeyException; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.hudi.testutils.KeyGeneratorTestUtilities; -import org.apache.spark.sql.Row; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -public class TestGlobalDeleteKeyGenerator extends KeyGeneratorTestUtilities { - - private TypedProperties getCommonProps(boolean getComplexRecordKey) { - TypedProperties properties = new TypedProperties(); - if (getComplexRecordKey) { - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key,pii_col"); - } else { - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key"); - } - properties.put(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY, "true"); - return properties; - } - - private TypedProperties getPropertiesWithoutRecordKeyProp() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - return properties; - } - - private TypedProperties getWrongRecordKeyFieldProps() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_wrong_key"); - return properties; - } - - private TypedProperties getProps() { - TypedProperties properties = getCommonProps(true); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp,ts_ms"); - return properties; - } - - @Test - public void testNullRecordKeyFields() { - Assertions.assertThrows(IllegalArgumentException.class, () -> new GlobalDeleteKeyGenerator(getPropertiesWithoutRecordKeyProp())); - } - - @Test - public void testWrongRecordKeyField() { - GlobalDeleteKeyGenerator keyGenerator = new GlobalDeleteKeyGenerator(getWrongRecordKeyFieldProps()); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.getRecordKey(getRecord())); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.buildFieldPositionMapIfNeeded(KeyGeneratorTestUtilities.structType)); - } - - @Test - public void testHappyFlow() { - GlobalDeleteKeyGenerator keyGenerator = new GlobalDeleteKeyGenerator(getProps()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(record); - Assertions.assertEquals(key.getRecordKey(), "_row_key:key1,pii_col:pi"); - Assertions.assertEquals(key.getPartitionPath(), ""); - 
keyGenerator.buildFieldPositionMapIfNeeded(KeyGeneratorTestUtilities.structType); - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "_row_key:key1,pii_col:pi"); - Assertions.assertEquals(keyGenerator.getPartitionPath(row), ""); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java deleted file mode 100644 index 80b85d8ee1046..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.keygen; - -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.exception.HoodieKeyException; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.hudi.testutils.KeyGeneratorTestUtilities; -import org.apache.spark.sql.Row; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -public class TestSimpleKeyGenerator extends KeyGeneratorTestUtilities { - - private TypedProperties getCommonProps() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key"); - properties.put(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY, "true"); - return properties; - } - - private TypedProperties getPropertiesWithoutPartitionPathProp() { - return getCommonProps(); - } - - private TypedProperties getPropertiesWithoutRecordKeyProp() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - return properties; - } - - private TypedProperties getWrongRecordKeyFieldProps() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_wrong_key"); - return properties; - } - - private TypedProperties getWrongPartitionPathFieldProps() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "_wrong_partition_path"); - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key"); - return properties; - } - - private TypedProperties getComplexRecordKeyProp() { - TypedProperties properties = new TypedProperties(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - properties.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key,pii_col"); - 
return properties; - } - - private TypedProperties getProps() { - TypedProperties properties = getCommonProps(); - properties.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp"); - return properties; - } - - @Test - public void testNullPartitionPathFields() { - Assertions.assertThrows(IllegalArgumentException.class, () -> new SimpleKeyGenerator(getPropertiesWithoutPartitionPathProp())); - } - - @Test - public void testNullRecordKeyFields() { - Assertions.assertThrows(IllegalArgumentException.class, () -> new SimpleKeyGenerator(getPropertiesWithoutRecordKeyProp())); - } - - @Test - public void testWrongRecordKeyField() { - SimpleKeyGenerator keyGenerator = new SimpleKeyGenerator(getWrongRecordKeyFieldProps()); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.getRecordKey(getRecord())); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.buildFieldPositionMapIfNeeded(KeyGeneratorTestUtilities.structType)); - } - - @Test - public void testWrongPartitionPathField() { - SimpleKeyGenerator keyGenerator = new SimpleKeyGenerator(getWrongPartitionPathFieldProps()); - GenericRecord record = getRecord(); - Assertions.assertEquals(keyGenerator.getPartitionPath(record), KeyGenUtils.DEFAULT_PARTITION_PATH); - Assertions.assertEquals(keyGenerator.getPartitionPath(KeyGeneratorTestUtilities.getRow(record)), - KeyGenUtils.DEFAULT_PARTITION_PATH); - } - - @Test - public void testComplexRecordKeyField() { - SimpleKeyGenerator keyGenerator = new SimpleKeyGenerator(getComplexRecordKeyProp()); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.getRecordKey(getRecord())); - Assertions.assertThrows(HoodieKeyException.class, () -> keyGenerator.buildFieldPositionMapIfNeeded(KeyGeneratorTestUtilities.structType)); - } - - @Test - public void testHappyFlow() { - SimpleKeyGenerator keyGenerator = new SimpleKeyGenerator(getProps()); - GenericRecord record = getRecord(); - HoodieKey key = keyGenerator.getKey(getRecord()); - Assertions.assertEquals(key.getRecordKey(), "key1"); - Assertions.assertEquals(key.getPartitionPath(), "timestamp=4357686"); - - Row row = KeyGeneratorTestUtilities.getRow(record); - Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1"); - Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686"); - } - -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java deleted file mode 100644 index 98a8f67d6119c..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.keygen; - -import org.apache.hudi.AvroConversionHelper; -import org.apache.hudi.AvroConversionUtils; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.testutils.SchemaTestUtil; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.exception.HoodieKeyGeneratorException; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; -import org.apache.spark.sql.types.StructType; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; - -import scala.Function1; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -public class TestTimestampBasedKeyGenerator { - - private GenericRecord baseRecord; - private TypedProperties properties = new TypedProperties(); - - private Schema schema; - private StructType structType; - private Row baseRow; - - @BeforeEach - public void initialize() throws IOException { - schema = SchemaTestUtil.getTimestampEvolvedSchema(); - structType = AvroConversionUtils.convertAvroSchemaToStructType(schema); - baseRecord = SchemaTestUtil - .generateAvroRecordFromJson(schema, 1, "001", "f1"); - baseRow = genericRecordToRow(baseRecord); - - properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "field1"); - properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "createTime"); - properties.setProperty(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY, "false"); - } - - private TypedProperties getBaseKeyConfig(String timestampType, String dateFormat, String timezone, String scalarType) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, timestampType); - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, dateFormat); - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, timezone); - - if (scalarType != null) { - properties.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit", scalarType); - } - - return properties; - } - - private Row genericRecordToRow(GenericRecord baseRecord) { - Function1 convertor = AvroConversionHelper.createConverterToRow(schema, structType); - Row row = (Row) convertor.apply(baseRecord); - int fieldCount = structType.fieldNames().length; - Object[] values = new Object[fieldCount]; - for (int i = 0; i < fieldCount; i++) { - values[i] = row.get(i); - } - return new GenericRowWithSchema(values, structType); - } - - private TypedProperties getBaseKeyConfig(String timestampType, String inputFormatList, String inputFormatDelimiterRegex, String inputTimezone, String outputFormat, String outputTimezone) { - if (timestampType != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, timestampType); - } - if (inputFormatList != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, inputFormatList); - } - if (inputFormatDelimiterRegex != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, inputFormatDelimiterRegex); - } - if (inputTimezone 
!= null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, inputTimezone); - } - if (outputFormat != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, outputFormat); - } - if (outputTimezone != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, outputTimezone); - } - return properties; - } - - @Test - public void testTimestampBasedKeyGenerator() throws IOException { - // timezone is GMT+8:00 - baseRecord.put("createTime", 1578283932000L); - properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); - TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - assertEquals("2020-01-06 12", hk1.getPartitionPath()); - - // test w/ Row - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); - - // timezone is GMT - properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT", null); - keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk2 = keyGen.getKey(baseRecord); - assertEquals("2020-01-06 04", hk2.getPartitionPath()); - - // test w/ Row - assertEquals("2020-01-06 04", keyGen.getPartitionPath(baseRow)); - - // timestamp is DATE_STRING, timezone is GMT+8:00 - baseRecord.put("createTime", "2020-01-06 12:12:12"); - properties = getBaseKeyConfig("DATE_STRING", "yyyy-MM-dd hh", "GMT+8:00", null); - properties.setProperty("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd hh:mm:ss"); - keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk3 = keyGen.getKey(baseRecord); - assertEquals("2020-01-06 12", hk3.getPartitionPath()); - - // test w/ Row - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); - - // timezone is GMT - properties = getBaseKeyConfig("DATE_STRING", "yyyy-MM-dd hh", "GMT", null); - keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk4 = keyGen.getKey(baseRecord); - assertEquals("2020-01-06 12", hk4.getPartitionPath()); - - // test w/ Row - assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); - - // timezone is GMT+8:00, createTime is null - baseRecord.put("createTime", null); - properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); - keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk5 = keyGen.getKey(baseRecord); - assertEquals("1970-01-01 08", hk5.getPartitionPath()); - - // test w/ Row - baseRow = genericRecordToRow(baseRecord); - assertEquals("1970-01-01 08", keyGen.getPartitionPath(baseRow)); - - // timestamp is DATE_STRING, timezone is GMT, createTime is null - baseRecord.put("createTime", null); - properties = getBaseKeyConfig("DATE_STRING", "yyyy-MM-dd hh:mm:ss", "GMT", null); - properties.setProperty("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd hh:mm:ss"); - keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk6 = keyGen.getKey(baseRecord); - assertEquals("1970-01-01 12:00:00", hk6.getPartitionPath()); - - // test w/ Row - baseRow = genericRecordToRow(baseRecord); - assertEquals("1970-01-01 12:00:00", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void testScalar() throws IOException { - // timezone is GMT+8:00 - baseRecord.put("createTime", 20000L); - - // timezone is GMT - properties = getBaseKeyConfig("SCALAR", 
"yyyy-MM-dd hh", "GMT", "days"); - TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - assertEquals(hk1.getPartitionPath(), "2024-10-04 12"); - - // test w/ Row - baseRow = genericRecordToRow(baseRecord); - assertEquals("2024-10-04 12", keyGen.getPartitionPath(baseRow)); - - // timezone is GMT, createTime is null - baseRecord.put("createTime", null); - properties = getBaseKeyConfig("SCALAR", "yyyy-MM-dd hh", "GMT", "days"); - keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk2 = keyGen.getKey(baseRecord); - assertEquals("1970-01-02 12", hk2.getPartitionPath()); - - // test w/ Row - baseRow = genericRecordToRow(baseRecord); - assertEquals("1970-01-02 12", keyGen.getPartitionPath(baseRow)); - - } - - @Test - public void test_ExpectsMatch_SingleInputFormat_ISO8601WithMsZ_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.428Z"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - "GMT"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("2020040113", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020040113", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_ExpectsMatch_SingleInputFormats_ISO8601WithMsZ_OutputTimezoneAsInputDateTimeZone() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.428Z"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - ""); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("2020040113", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020040113", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsZ_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.428Z"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - "UTC"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("2020040113", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020040113", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_ExpectsMatch_MultipleInputFormats_ISO8601NoMsZ_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33Z"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - "UTC"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("2020040113", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020040113", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_ExpectsMatch_MultipleInputFormats_ISO8601NoMsWithOffset_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33-05:00"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - 
"yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - "UTC"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("2020040118", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020040118", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsWithOffset_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.123-05:00"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - "UTC"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("2020040118", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020040118", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsZ_OutputTimezoneAsEST() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.123Z"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - "EST"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("2020040109", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("2020040109", keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_Throws_MultipleInputFormats_InputDateNotMatchingFormats() throws IOException { - baseRecord.put("createTime", "2020-04-01 13:01:33.123-05:00"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", - "", - "", - "yyyyMMddHH", - "UTC"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - Assertions.assertThrows(HoodieKeyGeneratorException.class, () -> keyGen.getKey(baseRecord)); - - baseRow = genericRecordToRow(baseRecord); - Assertions.assertThrows(HoodieKeyGeneratorException.class, () -> keyGen.getPartitionPath(baseRow)); - } - - @Test - public void test_ExpectsMatch_MultipleInputFormats_ShortDate_OutputCustomDate() throws IOException { - baseRecord.put("createTime", "20200401"); - properties = this.getBaseKeyConfig( - "DATE_STRING", - "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ,yyyyMMdd", - "", - "UTC", - "MM/dd/yyyy", - "UTC"); - BuiltinKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); - HoodieKey hk1 = keyGen.getKey(baseRecord); - Assertions.assertEquals("04/01/2020", hk1.getPartitionPath()); - - baseRow = genericRecordToRow(baseRecord); - assertEquals("04/01/2020", keyGen.getPartitionPath(baseRow)); - } -} \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java index 00f1ea00ea94b..7577ba8c833a3 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java @@ -22,25 +22,31 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanPartitionMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; -import 
org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieClusteringStrategy; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieSliceInfo; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.HoodieCleanStat; import org.apache.hudi.common.bootstrap.TestBootstrapIndex; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.BootstrapFileMapping; import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieFileGroup; -import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -50,60 +56,60 @@ import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator; import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator; import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler; -import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.CleanerUtils; import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.CompactionUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.action.clean.CleanPlanner; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; import 
org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.TreeSet; import java.util.UUID; +import java.util.concurrent.TimeUnit; import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; import scala.Tuple3; -import static org.apache.hudi.common.testutils.HoodieTestTable.makeIncrementalCommitTimes; +import static org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename; import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime; import static org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS; import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.awaitility.Awaitility.await; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -113,21 +119,24 @@ public class TestCleaner extends HoodieClientTestBase { private static final int BIG_BATCH_INSERT_SIZE = 500; - private static final Logger LOG = LogManager.getLogger(TestCleaner.class); + private static final int PARALLELISM = 10; /** * Helper method to do first batch of insert for clean by versions/commits tests. 
* - * @param cfg Hoodie Write Config + * @param context Spark engine context + * @param metaClient Hoodie table meta client * @param client Hoodie Client * @param recordGenFunction Function to generate records for insertion * @param insertFn Insertion API for testing * @throws Exception in case of error */ - private void insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, SparkRDDWriteClient client, + public static Pair> insertFirstBigBatchForClientCleanerTest( + HoodieSparkEngineContext context, + HoodieTableMetaClient metaClient, + SparkRDDWriteClient client, Function2, String, Integer> recordGenFunction, - Function3, SparkRDDWriteClient, JavaRDD, String> insertFn, - HoodieCleaningPolicy cleaningPolicy) throws Exception { + Function3, SparkRDDWriteClient, JavaRDD, String> insertFn) throws Exception { /* * do a big insert (this is basically same as insert part of upsert, just adding it here so we can catch breakages @@ -136,12 +145,11 @@ private void insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, Spar String newCommitTime = client.startCommit(); List records = recordGenFunction.apply(newCommitTime, BIG_BATCH_INSERT_SIZE); - JavaRDD writeRecords = jsc.parallelize(records, 5); + JavaRDD writeRecords = context.getJavaSparkContext().parallelize(records, PARALLELISM); - List statuses = insertFn.apply(client, writeRecords, newCommitTime).collect(); + JavaRDD statuses = insertFn.apply(client, writeRecords, newCommitTime); // Verify there are no errors - assertNoWriteErrors(statuses); - + assertNoWriteErrors(statuses.collect()); // verify that there is a commit metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); @@ -149,227 +157,218 @@ private void insertFirstBigBatchForClientCleanerTest(HoodieWriteConfig cfg, Spar // Should have 100 records in table (check using Index), all in locations marked at commit HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient); - assertFalse(table.getCompletedCommitsTimeline().empty()); + if (client.getConfig().shouldAutoCommit()) { + assertFalse(table.getCompletedCommitsTimeline().empty()); + } // We no longer write empty cleaner plans when there is nothing to be cleaned. assertTrue(table.getCompletedCleanTimeline().empty()); - HoodieIndex index = SparkHoodieIndex.createIndex(cfg); - List taggedRecords = ((JavaRDD) index.tagLocation(jsc.parallelize(records, 1), context, table)).collect(); - checkTaggedRecords(taggedRecords, newCommitTime); + if (client.getConfig().shouldAutoCommit()) { + HoodieIndex index = SparkHoodieIndexFactory.createIndex(client.getConfig()); + List taggedRecords = tagLocation(index, context, context.getJavaSparkContext().parallelize(records, PARALLELISM), table).collect(); + checkTaggedRecords(taggedRecords, newCommitTime); + } + return Pair.of(newCommitTime, statuses); } /** - * Test Clean-By-Versions using insert/upsert API. + * Helper method to do first batch of insert for clean by versions/commits tests. 
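Because insertFirstBigBatchForClientCleanerTest is now a static helper that no longer auto-commits on the caller's behalf, it hands back both the new commit time and the raw write statuses. A minimal caller sketch, assuming the usual test fixtures (context, metaClient, client, and a record generator) are in scope as they are in the tests below:

    // Write one big batch without committing it yet.
    Pair<String, JavaRDD<WriteStatus>> result = insertFirstBigBatchForClientCleanerTest(
        context, metaClient, client, recordGenFunction, SparkRDDWriteClient::insert);
    // With auto-commit disabled the caller owns the commit; deliberately skipping this
    // call is how the failed-write variants below simulate an abandoned writer.
    client.commit(result.getLeft(), result.getRight());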
+ * + * @param context Spark engine context + * @param client Hoodie Client + * @param recordGenFunction Function to generate records for insertion + * @param insertFn Insertion API for testing + * @throws Exception in case of error */ - @Test - public void testInsertAndCleanByVersions() throws Exception { - testInsertAndCleanByVersions(SparkRDDWriteClient::insert, SparkRDDWriteClient::upsert, false); - } + public static Pair> insertFirstFailedBigBatchForClientCleanerTest( + HoodieSparkEngineContext context, + SparkRDDWriteClient client, + Function2, String, Integer> recordGenFunction, + Function3, SparkRDDWriteClient, JavaRDD, String> insertFn) throws Exception { - /** - * Test Clean-By-Versions using prepped versions of insert/upsert API. - */ - @Test - public void testInsertPreppedAndCleanByVersions() throws Exception { - testInsertAndCleanByVersions(SparkRDDWriteClient::insertPreppedRecords, SparkRDDWriteClient::upsertPreppedRecords, - true); - } + /* + * do a big insert (this is basically same as insert part of upsert, just adding it here so we can catch breakages + * in insert(), if the implementation diverges.) + */ + String newCommitTime = client.startCommit(); - /** - * Test Clean-By-Versions using bulk-insert/upsert API. - */ - @Test - public void testBulkInsertAndCleanByVersions() throws Exception { - testInsertAndCleanByVersions(SparkRDDWriteClient::bulkInsert, SparkRDDWriteClient::upsert, false); + List records = recordGenFunction.apply(newCommitTime, BIG_BATCH_INSERT_SIZE); + JavaRDD writeRecords = context.getJavaSparkContext().parallelize(records, 5); + + JavaRDD statuses = insertFn.apply(client, writeRecords, newCommitTime); + // Verify there are no errors + assertNoWriteErrors(statuses.collect()); + // Don't invoke commit to simulate failed write + client.getHeartbeatClient().stop(newCommitTime); + return Pair.of(newCommitTime, statuses); } /** - * Test Clean-By-Versions using prepped versions of bulk-insert/upsert API. + * Test Clean-Failed-Writes when Cleaning policy is by VERSIONS using insert/upsert API. */ @Test - public void testBulkInsertPreppedAndCleanByVersions() throws Exception { - testInsertAndCleanByVersions( - (client, recordRDD, instantTime) -> client.bulkInsertPreppedRecords(recordRDD, instantTime, Option.empty()), - SparkRDDWriteClient::upsertPreppedRecords, true); + public void testInsertAndCleanFailedWritesByVersions() throws Exception { + testInsertAndCleanFailedWritesByVersions(SparkRDDWriteClient::insert, false); } /** - * Test Helper for Cleaning by versions logic from HoodieWriteClient API perspective. + * Test Helper for cleaning failed writes by versions logic from HoodieWriteClient API perspective. * - * @param insertFn Insert API to be tested - * @param upsertFn Upsert API to be tested + * @param insertFn Insert API to be tested * @param isPreppedAPI Flag to indicate if a prepped-version is used. 
If true, a wrapper function will be used during - * record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs) + * record generation to also tag the regards (de-dupe is implicit as we use unique record-gen APIs) * @throws Exception in case of errors */ - private void testInsertAndCleanByVersions( - Function3, SparkRDDWriteClient, JavaRDD, String> insertFn, - Function3, SparkRDDWriteClient, JavaRDD, String> upsertFn, boolean isPreppedAPI) + private void testInsertAndCleanFailedWritesByVersions( + Function3, SparkRDDWriteClient, JavaRDD, String> insertFn, boolean isPreppedAPI) throws Exception { - int maxVersions = 2; // keep upto 2 versions for each file + int maxVersions = 3; // keep upto 3 versions for each file HoodieWriteConfig cfg = getConfigBuilder() - .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withAutoCommit(false) + .withHeartbeatIntervalInMs(3000) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build()) .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) .build(); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { final Function2, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts); - final Function2, String, Integer> recordUpsertGenWrappedFunction = - generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates); - - insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, - HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS); - - Map compactionFileIdToLatestFileSlice = new HashMap<>(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(getConfig(), context, metaClient); - for (String partitionPath : dataGen.getPartitionPaths()) { - TableFileSystemView fsView = table.getFileSystemView(); - Option added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> { - fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs)); - return true; - })); - if (added.isPresent()) { - // Select only one file-group for compaction - break; - } - } + Pair> result = insertFirstBigBatchForClientCleanerTest(context, metaClient, client, recordInsertGenWrappedFunction, insertFn); - // Create workload with selected file-slices - List> partitionFileSlicePairs = compactionFileIdToLatestFileSlice.entrySet().stream() - .map(e -> Pair.of(e.getKey().getPartitionPath(), e.getValue())).collect(Collectors.toList()); - HoodieCompactionPlan compactionPlan = - CompactionUtils.buildFromFileSlices(partitionFileSlicePairs, Option.empty(), Option.empty()); - List instantTimes = makeIncrementalCommitTimes(9); - String compactionTime = instantTimes.get(0); - table.getActiveTimeline().saveToCompactionRequested( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime), - TimelineMetadataUtils.serializeCompactionPlan(compactionPlan)); - - instantTimes = instantTimes.subList(1, instantTimes.size()); - // Keep doing some writes and clean inline. Make sure we have expected number of files - // remaining. 
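The write config built above is what the new failed-writes tests hinge on: auto-commit is off, the heartbeat interval is shortened so uncommitted writes expire quickly, and the cleaner handles failed writes lazily, i.e. they are only rolled back once their heartbeats have expired and a clean actually runs. Condensed to just the relevant builder calls, with the values used in the test:

    HoodieWriteConfig cfg = getConfigBuilder()
        .withAutoCommit(false)                  // the test controls whether a batch commits
        .withHeartbeatIntervalInMs(3000)        // short heartbeat so failed writes expire fast
        .withCleanConfig(HoodieCleanConfig.newBuilder()
            .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
            .retainFileVersions(3)              // maxVersions in the test
            .build())
        .build();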
- for (String newInstantTime : instantTimes) { - try { - client.startCommitWithTime(newInstantTime); - List records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100); - - List statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = HoodieTableMetaClient.reload(metaClient); - table = HoodieSparkTable.create(getConfig(), context, metaClient); - HoodieTimeline timeline = table.getMetaClient().getCommitsTimeline(); - - TableFileSystemView fsView = table.getFileSystemView(); - // Need to ensure the following - for (String partitionPath : dataGen.getPartitionPaths()) { - // compute all the versions of all files, from time 0 - HashMap> fileIdToVersions = new HashMap<>(); - for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) { - HoodieCommitMetadata commitMetadata = - HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class); - - for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { - if (!fileIdToVersions.containsKey(wstat.getFileId())) { - fileIdToVersions.put(wstat.getFileId(), new TreeSet<>()); - } - fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName())); - } - } - - List fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); - - for (HoodieFileGroup fileGroup : fileGroups) { - if (compactionFileIdToLatestFileSlice.containsKey(fileGroup.getFileGroupId())) { - // Ensure latest file-slice selected for compaction is retained - Option dataFileForCompactionPresent = - Option.fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> { - return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime() - .equals(df.getCommitTime()); - }).findAny()); - assertTrue(dataFileForCompactionPresent.isPresent(), - "Data File selected for compaction is retained"); - } else { - // file has no more than max versions - String fileId = fileGroup.getFileGroupId().getFileId(); - List dataFiles = fileGroup.getAllBaseFiles().collect(Collectors.toList()); - - assertTrue(dataFiles.size() <= maxVersions, - "fileId " + fileId + " has more than " + maxVersions + " versions"); - - // Each file, has the latest N versions (i.e cleaning gets rid of older versions) - List commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId)); - for (int i = 0; i < dataFiles.size(); i++) { - assertEquals((dataFiles.get(i)).getCommitTime(), - commitedVersions.get(commitedVersions.size() - 1 - i), - "File " + fileId + " does not have latest versions on commits" + commitedVersions); - } - } - } - } - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - } - } - } + client.commit(result.getLeft(), result.getRight()); - /** - * Test Clean-By-Versions using insert/upsert API. - */ - @Test - public void testInsertAndCleanByCommits() throws Exception { - testInsertAndCleanByCommits(SparkRDDWriteClient::insert, SparkRDDWriteClient::upsert, false); - } + HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient); - /** - * Test Clean-By-Versions using prepped version of insert/upsert API. 
- */ - @Test - public void testInsertPreppedAndCleanByCommits() throws Exception { - testInsertAndCleanByCommits(SparkRDDWriteClient::insertPreppedRecords, SparkRDDWriteClient::upsertPreppedRecords, true); + assertTrue(table.getCompletedCleanTimeline().empty()); + + insertFirstFailedBigBatchForClientCleanerTest(context, client, recordInsertGenWrappedFunction, insertFn); + + insertFirstFailedBigBatchForClientCleanerTest(context, client, recordInsertGenWrappedFunction, insertFn); + + Pair> ret = + insertFirstFailedBigBatchForClientCleanerTest(context, client, recordInsertGenWrappedFunction, insertFn); + + // Await till enough time passes such that the last failed commits heartbeats are expired + await().atMost(10, TimeUnit.SECONDS).until(() -> client.getHeartbeatClient() + .isHeartbeatExpired(ret.getLeft())); + + List cleanStats = runCleaner(cfg); + assertEquals(0, cleanStats.size(), "Must not clean any files"); + HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline(); + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().countInstants() == 3); + Option rollBackInstantForFailedCommit = timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant(); + HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata( + timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class); + // Rollback of one of the failed writes should have deleted 3 files + assertEquals(3, rollbackMetadata.getTotalFilesDeleted()); + } } /** - * Test Clean-By-Versions using prepped versions of bulk-insert/upsert API. + * Tests no more than 1 clean is scheduled if hoodie.clean.allow.multiple config is set to false. 
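Both failed-write tests synchronize on heartbeat expiry before invoking the cleaner, because with the LAZY policy a failed write only becomes eligible for rollback once its heartbeat is considered dead. The wait is a plain Awaitility poll; a sketch, where failedCommit stands for the instant returned by the failed-batch helper and cfg for the write config above:

    // requires: import static org.awaitility.Awaitility.await; import java.util.concurrent.TimeUnit;
    // Block (up to 10s) until the uncommitted instant's heartbeat has expired, then clean.
    await().atMost(10, TimeUnit.SECONDS)
        .until(() -> client.getHeartbeatClient().isHeartbeatExpired(failedCommit));
    List<HoodieCleanStat> cleanStats = runCleaner(cfg);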
*/ @Test - public void testBulkInsertPreppedAndCleanByCommits() throws Exception { - testInsertAndCleanByCommits( - (client, recordRDD, instantTime) -> client.bulkInsertPreppedRecords(recordRDD, instantTime, Option.empty()), - SparkRDDWriteClient::upsertPreppedRecords, true); + public void testMultiClean() { + HoodieWriteConfig writeConfig = getConfigBuilder() + .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() + .withEnableBackupForRemoteFileSystemView(false).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .allowMultipleCleans(false) + .withAutoClean(false).retainCommits(1).retainFileVersions(1) + .build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1) + .build()) + .withEmbeddedTimelineServerEnabled(false).build(); + + int index = 0; + String cleanInstantTime; + final String partition = "2015/03/16"; + try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) { + // Three writes so we can initiate a clean + for (; index < 3; ++index) { + String newCommitTime = "00" + index; + List records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + } + } + + // mimic failed/leftover clean by scheduling a clean but not performing it + cleanInstantTime = "00" + index++; + HoodieTable table = HoodieSparkTable.create(writeConfig, context); + Option cleanPlan = table.scheduleCleaning(context, cleanInstantTime, Option.empty()); + assertEquals(cleanPlan.get().getFilePathsToBeDeletedPerPartition().get(partition).size(), 1); + assertEquals(metaClient.reloadActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().countInstants(), 1); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) { + // Next commit. This is required so that there is an additional file version to clean. + String newCommitTime = "00" + index++; + List records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + + // Try to schedule another clean + String newCleanInstantTime = "00" + index++; + HoodieCleanMetadata cleanMetadata = client.clean(newCleanInstantTime); + // When hoodie.clean.allow.multiple is set to false, a new clean action should not be scheduled. + // The existing requested clean should complete execution. 
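The behaviour exercised here: with hoodie.clean.allow.multiple set to false, client.clean() first executes a clean that is already requested or inflight instead of stacking a second one on top of it. The switch lives on the clean config; only the relevant knobs, with the values this test uses:

    HoodieCleanConfig cleanConfig = HoodieCleanConfig.newBuilder()
        .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER)
        .allowMultipleCleans(false)   // hoodie.clean.allow.multiple
        .withAutoClean(false)         // cleans are triggered explicitly via client.clean(...)
        .retainCommits(1)
        .retainFileVersions(1)
        .build();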
+ assertNotNull(cleanMetadata); + assertTrue(metaClient.reloadActiveTimeline().getCleanerTimeline() + .filterCompletedInstants().containsInstant(cleanInstantTime)); + assertFalse(metaClient.getActiveTimeline().getCleanerTimeline() + .containsInstant(newCleanInstantTime)); + + // 1 file cleaned + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getSuccessDeleteFiles().size(), 1); + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getFailedDeleteFiles().size(), 0); + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getDeletePathPatterns().size(), 1); + + // Now that there is no requested or inflight clean instant, a new clean action can be scheduled + cleanMetadata = client.clean(newCleanInstantTime); + assertNotNull(cleanMetadata); + assertTrue(metaClient.reloadActiveTimeline().getCleanerTimeline() + .containsInstant(newCleanInstantTime)); + + // 1 file cleaned + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getSuccessDeleteFiles().size(), 1); + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getFailedDeleteFiles().size(), 0); + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getDeletePathPatterns().size(), 1); + } } /** - * Test Clean-By-Versions using bulk-insert/upsert API. + * Test Clean-By-Commits using insert/upsert API. */ @Test - public void testBulkInsertAndCleanByCommits() throws Exception { - testInsertAndCleanByCommits(SparkRDDWriteClient::bulkInsert, SparkRDDWriteClient::upsert, false); + public void testFailedInsertAndCleanByCommits() throws Exception { + testFailedInsertAndCleanByCommits(SparkRDDWriteClient::insert, false); } /** - * Test Helper for Cleaning by versions logic from HoodieWriteClient API perspective. + * Test Helper for Cleaning failed commits by commits logic from HoodieWriteClient API perspective. * * @param insertFn Insert API to be tested - * @param upsertFn Upsert API to be tested * @param isPreppedAPI Flag to indicate if a prepped-version is used. 
If true, a wrapper function will be used during * record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs) * @throws Exception in case of errors */ - private void testInsertAndCleanByCommits( - Function3, SparkRDDWriteClient, JavaRDD, String> insertFn, - Function3, SparkRDDWriteClient, JavaRDD, String> upsertFn, boolean isPreppedAPI) + private void testFailedInsertAndCleanByCommits( + Function3, SparkRDDWriteClient, JavaRDD, String> insertFn, boolean isPreppedAPI) throws Exception { int maxCommits = 3; // keep upto 3 commits from the past HoodieWriteConfig cfg = getConfigBuilder() - .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withAutoCommit(false) + .withHeartbeatIntervalInMs(3000) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()) .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) @@ -379,54 +378,32 @@ private void testInsertAndCleanByCommits( final Function2, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts); - final Function2, String, Integer> recordUpsertGenWrappedFunction = - generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates); + Pair> result = insertFirstBigBatchForClientCleanerTest(context, metaClient, client, recordInsertGenWrappedFunction, insertFn); + client.commit(result.getLeft(), result.getRight()); - insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, - HoodieCleaningPolicy.KEEP_LATEST_COMMITS); + HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient); + assertTrue(table.getCompletedCleanTimeline().empty()); - // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. - makeIncrementalCommitTimes(8).forEach(newCommitTime -> { - try { - client.startCommitWithTime(newCommitTime); - List records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100); - - List statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table1 = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimeline activeTimeline = table1.getCompletedCommitsTimeline(); - // NOTE: See CleanPlanner#getFilesToCleanKeepingLatestCommits. 
We explicitly keep one commit before earliest - // commit - Option earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits); - Set acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet()); - if (earliestRetainedCommit.isPresent()) { - acceptableCommits - .removeAll(activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()) - .getInstants().collect(Collectors.toSet())); - acceptableCommits.add(earliestRetainedCommit.get()); - } - - TableFileSystemView fsView = table1.getFileSystemView(); - // Need to ensure the following - for (String partitionPath : dataGen.getPartitionPaths()) { - List fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); - for (HoodieFileGroup fileGroup : fileGroups) { - Set commitTimes = new HashSet<>(); - fileGroup.getAllBaseFiles().forEach(value -> { - LOG.debug("Data File - " + value); - commitTimes.add(value.getCommitTime()); - }); - assertEquals(acceptableCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()), commitTimes, - "Only contain acceptable versions of file should be present"); - } - } - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); + insertFirstFailedBigBatchForClientCleanerTest(context, client, recordInsertGenWrappedFunction, insertFn); + + insertFirstFailedBigBatchForClientCleanerTest(context, client, recordInsertGenWrappedFunction, insertFn); + + Pair> ret = + insertFirstFailedBigBatchForClientCleanerTest(context, client, recordInsertGenWrappedFunction, insertFn); + // Await till enough time passes such that the last failed commits heartbeats are expired + await().atMost(10, TimeUnit.SECONDS).until(() -> client.getHeartbeatClient() + .isHeartbeatExpired(ret.getLeft())); + List cleanStats = runCleaner(cfg); + assertEquals(0, cleanStats.size(), "Must not clean any files"); + HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline(); + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().countInstants() == 3); + Option rollBackInstantForFailedCommit = timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant(); + HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata( + timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class); + // Rollback of one of the failed writes should have deleted 3 files + assertEquals(3, rollbackMetadata.getTotalFilesDeleted()); } /** @@ -434,16 +411,25 @@ private void testInsertAndCleanByCommits( * * @param config HoodieWriteConfig */ - private List runCleaner(HoodieWriteConfig config) throws IOException { - return runCleaner(config, false, 1); + protected List runCleaner(HoodieWriteConfig config) throws IOException { + return runCleaner(config, false, false, 1, false); + } + + protected List runCleanerWithInstantFormat(HoodieWriteConfig config, boolean needInstantInHudiFormat) throws IOException { + return runCleaner(config, false, false, 1, needInstantInHudiFormat); + } + + protected List runCleaner(HoodieWriteConfig config, int firstCommitSequence, boolean needInstantInHudiFormat) throws IOException { + return runCleaner(config, false, false, firstCommitSequence, needInstantInHudiFormat); } - private List runCleaner(HoodieWriteConfig config, int firstCommitSequence) throws IOException { - return runCleaner(config, false, firstCommitSequence); + protected List 
runCleaner(HoodieWriteConfig config, boolean simulateRetryFailure) throws IOException { + return runCleaner(config, simulateRetryFailure, false, 1, false); } - private List runCleaner(HoodieWriteConfig config, boolean simulateRetryFailure) throws IOException { - return runCleaner(config, simulateRetryFailure, 1); + protected List runCleaner( + HoodieWriteConfig config, boolean simulateRetryFailure, boolean simulateMetadataFailure) throws IOException { + return runCleaner(config, simulateRetryFailure, simulateMetadataFailure, 1, false); } /** @@ -451,9 +437,11 @@ private List runCleaner(HoodieWriteConfig config, boolean simul * * @param config HoodieWriteConfig */ - private List runCleaner(HoodieWriteConfig config, boolean simulateRetryFailure, int firstCommitSequence) throws IOException { + protected List runCleaner( + HoodieWriteConfig config, boolean simulateRetryFailure, boolean simulateMetadataFailure, + Integer firstCommitSequence, boolean needInstantInHudiFormat) throws IOException { SparkRDDWriteClient writeClient = getHoodieWriteClient(config); - String cleanInstantTs = makeNewCommitTime(firstCommitSequence); + String cleanInstantTs = needInstantInHudiFormat ? makeNewCommitTime(firstCommitSequence, "%014d") : makeNewCommitTime(firstCommitSequence, "%09d"); HoodieCleanMetadata cleanMetadata1 = writeClient.clean(cleanInstantTs); if (null == cleanMetadata1) { @@ -467,20 +455,26 @@ private List runCleaner(HoodieWriteConfig config, boolean simul String dirPath = metaClient.getBasePath() + "/" + p.getPartitionPath(); p.getSuccessDeleteFiles().forEach(p2 -> { try { - metaClient.getFs().create(new Path(dirPath, p2), true); + metaClient.getFs().create(new Path(dirPath, p2), true).close(); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } }); }); metaClient.reloadActiveTimeline().revertToInflight(completedCleanInstant); - HoodieCleanMetadata newCleanMetadata = writeClient.clean(makeNewCommitTime(firstCommitSequence + 1)); - // No new clean metadata would be created. 
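To exercise clean retry, the helper makes a completed clean look interrupted: it re-creates every file the clean reported as successfully deleted and then flips the completed clean instant back to inflight, so the next clean() call must re-execute the same plan rather than schedule a new one. The failure-injection step in outline, using the helper's own variables (dirPath, the per-partition success-delete file names, completedCleanInstant, writeClient):

    // Re-create a file the clean claimed to have deleted (one call per success-delete entry).
    metaClient.getFs().create(new Path(dirPath, deletedFileName), true).close();
    // Push the completed clean back to inflight so it is picked up again...
    metaClient.reloadActiveTimeline().revertToInflight(completedCleanInstant);
    // ...and retry: this should replay the pending clean, not plan a fresh one.
    writeClient.clean();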
Only the previous one will be retried - assertNull(newCleanMetadata); - HoodieCleanMetadata cleanMetadata2 = CleanerUtils.getCleanerMetadata(metaClient, completedCleanInstant); - assertEquals(cleanMetadata1.getEarliestCommitToRetain(), cleanMetadata2.getEarliestCommitToRetain()); - assertEquals(cleanMetadata1.getTotalFilesDeleted(), cleanMetadata2.getTotalFilesDeleted()); - assertEquals(cleanMetadata1.getPartitionMetadata().keySet(), cleanMetadata2.getPartitionMetadata().keySet()); + + if (config.isMetadataTableEnabled() && simulateMetadataFailure) { + // Simulate the failure of corresponding instant in the metadata table + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder() + .setBasePath(HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath())) + .setConf(metaClient.getHadoopConf()) + .build(); + HoodieInstant deltaCommit = new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, cleanInstantTs); + metadataMetaClient.reloadActiveTimeline().revertToInflight(deltaCommit); + } + + // retry clean operation again + writeClient.clean(); final HoodieCleanMetadata retriedCleanMetadata = CleanerUtils.getCleanerMetadata(HoodieTableMetaClient.reload(metaClient), completedCleanInstant); cleanMetadata1.getPartitionMetadata().keySet().forEach(k -> { HoodieCleanPartitionMetadata p1 = cleanMetadata1.getPartitionMetadata().get(k); @@ -516,132 +510,192 @@ private List runCleaner(HoodieWriteConfig config, boolean simul return new ArrayList<>(cleanStatMap.values()); } - /** - * Test HoodieTable.clean() Cleaning by versions logic. - */ - @ParameterizedTest - @ValueSource(booleans = {false, true}) - public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception { + @Test + public void testCleanEmptyInstants() throws Exception { HoodieWriteConfig config = - HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean) - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) - .build(); - HoodieTestTable testTable = HoodieTestTable.of(metaClient); + HoodieWriteConfig.newBuilder() + .withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).build()) + .build(); + metaClient = HoodieTableMetaClient.reload(metaClient); + + int commitCount = 20; + int cleanCount = 20; + + int startInstant = 1; + + for (int i = 0; i < cleanCount; i++, startInstant++) { + String commitTime = makeNewCommitTime(startInstant, "%09d"); + createEmptyCleanMetadata(commitTime + "", false); + } + + int instantClean = startInstant; + + for (int i = 0; i < commitCount; i++, startInstant++) { + String commitTime = makeNewCommitTime(startInstant, "%09d"); + HoodieTestTable.of(metaClient).addCommit(commitTime); + } + + List cleanStats = runCleaner(config); + HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline(); + + assertEquals(0, cleanStats.size(), "Must not clean any files"); + assertEquals(1, timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().countInstants()); + assertEquals(0, timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflights().countInstants()); + assertEquals(--cleanCount, 
timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants().countInstants()); + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().containsInstant(makeNewCommitTime(--instantClean, "%09d"))); + + cleanStats = runCleaner(config); + timeline = metaClient.reloadActiveTimeline(); + + assertEquals(0, cleanStats.size(), "Must not clean any files"); + assertEquals(1, timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().countInstants()); + assertEquals(0, timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflights().countInstants()); + assertEquals(--cleanCount, timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants().countInstants()); + assertTrue(timeline.getTimelineOfActions( + CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).filterInflightsAndRequested().containsInstant(makeNewCommitTime(--instantClean, "%09d"))); + } + + @Test + public void testCleanWithReplaceCommits() throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1) + .withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(2).build()) + .build(); + + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); String p0 = "2020/01/01"; String p1 = "2020/01/02"; - Map> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null; // make 1 commit, with 1 file per partition - String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() - : UUID.randomUUID().toString(); - String file1P1C0 = enableBootstrapSourceClean ? 
bootstrapMapping.get(p1).get(0).getFileId() - : UUID.randomUUID().toString(); - testTable.addCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + String file1P0C0 = UUID.randomUUID().toString(); + String file1P1C0 = UUID.randomUUID().toString(); + testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); - List hoodieCleanStatsOne = runCleaner(config); - assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files"); + HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001", + Collections.unmodifiableMap(new HashMap>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0)); + put(p1, CollectionUtils.createImmutableList(file1P1C0)); + } + }) + ); + metadataWriter.update(commitMetadata, "00000000000001", false); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + List hoodieCleanStatsOne = runCleanerWithInstantFormat(config, true); + assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files"); assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); - // make next commit, with 1 insert & 1 update per partition - Map partitionAndFileId002 = testTable.addCommit("00000000000002") - .withBaseFilesInPartition(p0, file1P0C0) - .withBaseFilesInPartition(p1, file1P1C0) - .getFileIdsWithBaseFilesInPartitions(p0, p1); - - List hoodieCleanStatsTwo = runCleaner(config, 1); - // enableBootstrapSourceClean would delete the bootstrap base file as the same time - HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0); - assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() - + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 - : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); - if (enableBootstrapSourceClean) { - HoodieFileStatus fstatus = - bootstrapMapping.get(p0).get(0).getBootstrapFileStatus(); - // This ensures full path is recorded in metadata. - assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), - "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() - + " but did not contain " + fstatus.getPath().getUri()); - assertFalse(Files.exists(Paths.get(bootstrapMapping.get( - p0).get(0).getBootstrapFileStatus().getPath().getUri()))); - } - cleanStat = getCleanStat(hoodieCleanStatsTwo, p1); + // make next replacecommit, with 1 clustering operation. logically delete p0. 
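Each of these synthetic clustering steps needs two artifacts: a requested-replace plan naming the file slice being clustered, and a completed replace-commit metadata recording the replaced file id plus a write stat for the new base file. Both come from the generateReplaceCommitMetadata helper added at the bottom of this class; the call pattern, annotated (the middle Option.empty() is read here as the inflight payload, which these tests leave empty):

    // Logically replace file1P0C0 with file2P0C1 in partition p0 via a clustering replacecommit.
    Pair<HoodieRequestedReplaceMetadata, HoodieReplaceCommitMetadata> replaceMetadata =
        generateReplaceCommitMetadata("00000000000002", p0, file1P0C0, file2P0C1);
    testTable.addReplaceCommit("00000000000002",
        Option.of(replaceMetadata.getKey()),   // requested plan: clustering input group + slice info
        Option.empty(),                        // inflight metadata: not needed for these tests
        replaceMetadata.getValue());           // completed metadata: replaced file id + new write stat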
No change to p1 + // notice that clustering generates empty inflight commit files + Map partitionAndFileId002 = testTable.forReplaceCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0); String file2P0C1 = partitionAndFileId002.get(p0); - String file2P1C1 = partitionAndFileId002.get(p1); + Pair replaceMetadata = + generateReplaceCommitMetadata("00000000000002", p0, file1P0C0, file2P0C1); + testTable.addReplaceCommit("00000000000002", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); + + // run cleaner + List hoodieCleanStatsTwo = runCleanerWithInstantFormat(config, true); + assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions and clean any files"); assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); - assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1)); - assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); - assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); - assertEquals(enableBootstrapSourceClean ? 2 : 1, cleanStat.getSuccessDeleteFiles().size() - + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 - : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); - if (enableBootstrapSourceClean) { - HoodieFileStatus fstatus = - bootstrapMapping.get(p1).get(0).getBootstrapFileStatus(); - // This ensures full path is recorded in metadata. - assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), - "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() - + " but did not contain " + fstatus.getPath().getUri()); - assertFalse(Files.exists(Paths.get(bootstrapMapping.get( - p1).get(0).getBootstrapFileStatus().getPath().getUri()))); - } + assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); - // make next commit, with 2 updates to existing files, and 1 insert - String file3P0C2 = testTable.addCommit("00000000000003") - .withBaseFilesInPartition(p0, file1P0C0, file2P0C1) - .getFileIdsWithBaseFilesInPartitions(p0).get(p0); - List hoodieCleanStatsThree = runCleaner(config, 3); - assertEquals(2, - getCleanStat(hoodieCleanStatsThree, p0) - .getSuccessDeleteFiles().size(), "Must clean two files"); - assertFalse(testTable.baseFileExists(p0, "00000000000002", file1P0C0)); - assertFalse(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); - assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); - - // No cleaning on partially written file, with no commit. - testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2); - List hoodieCleanStatsFour = runCleaner(config); - assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files"); - assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); - } + // make next replacecommit, with 1 clustering operation. Replace data in p1. 
No change to p0 + // notice that clustering generates empty inflight commit files + Map partitionAndFileId003 = testTable.forReplaceCommit("00000000000003").getFileIdsWithBaseFilesInPartitions(p1); + String file3P1C2 = partitionAndFileId003.get(p1); + replaceMetadata = generateReplaceCommitMetadata("00000000000003", p1, file1P1C0, file3P1C2); + testTable.addReplaceCommit("00000000000003", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); - /** - * Test HoodieTable.clean() Cleaning by versions logic for MOR table with Log files. - */ - @Test - public void testKeepLatestFileVersionsMOR() throws Exception { + // run cleaner + List hoodieCleanStatsThree = runCleanerWithInstantFormat(config, true); + assertEquals(0, hoodieCleanStatsThree.size(), "Must not scan any partitions and clean any files"); + assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2)); + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); - HoodieWriteConfig config = - HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) - .build(); + // make next replacecommit, with 1 clustering operation. Replace data in p0 again + // notice that clustering generates empty inflight commit files + Map partitionAndFileId004 = testTable.forReplaceCommit("00000000000004").getFileIdsWithBaseFilesInPartitions(p0); + String file4P0C3 = partitionAndFileId004.get(p0); + replaceMetadata = generateReplaceCommitMetadata("00000000000004", p0, file2P0C1, file4P0C3); + testTable.addReplaceCommit("00000000000004", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); - HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); - HoodieTestTable testTable = HoodieTestTable.of(metaClient); - String p0 = "2020/01/01"; + // run cleaner + List hoodieCleanStatsFour = runCleaner(config, 5, true); + assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3)); + assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2)); + assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + //file1P1C0 still stays because its not replaced until 3 and its the only version available + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + + // make next replacecommit, with 1 clustering operation. Replace all data in p1. 
no new files created + // notice that clustering generates empty inflight commit files + Map partitionAndFileId005 = testTable.forReplaceCommit("00000000000006").getFileIdsWithBaseFilesInPartitions(p1); + String file4P1C4 = partitionAndFileId005.get(p1); + replaceMetadata = generateReplaceCommitMetadata("00000000000006", p0, file3P1C2, file4P1C4); + testTable.addReplaceCommit("00000000000006", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); - // Make 3 files, one base file and 2 log files associated with base file - String file1P0 = testTable.addDeltaCommit("000").getFileIdsWithBaseFilesInPartitions(p0).get(p0); - testTable.forDeltaCommit("000") - .withLogFile(p0, file1P0, 1) - .withLogFile(p0, file1P0, 2); - - // Make 2 files, one base file and 1 log files associated with base file - testTable.addDeltaCommit("001") - .withBaseFilesInPartition(p0, file1P0) - .withLogFile(p0, file1P0, 3); - - List hoodieCleanStats = runCleaner(config); - assertEquals(3, - getCleanStat(hoodieCleanStats, p0).getSuccessDeleteFiles() - .size(), "Must clean three files, one parquet and 2 log files"); - assertFalse(testTable.baseFileExists(p0, "000", file1P0)); - assertFalse(testTable.logFilesExist(p0, "000", file1P0, 1, 2)); - assertTrue(testTable.baseFileExists(p0, "001", file1P0)); - assertTrue(testTable.logFileExists(p0, "001", file1P0, 3)); + List hoodieCleanStatsFive = runCleaner(config, 7, true); + assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3)); + assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2)); + assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + } + + private Pair generateReplaceCommitMetadata( + String instantTime, String partition, String replacedFileId, String newFileId) { + HoodieRequestedReplaceMetadata requestedReplaceMetadata = new HoodieRequestedReplaceMetadata(); + requestedReplaceMetadata.setOperationType(WriteOperationType.CLUSTER.toString()); + requestedReplaceMetadata.setVersion(1); + HoodieSliceInfo sliceInfo = HoodieSliceInfo.newBuilder().setFileId(replacedFileId).build(); + List clusteringGroups = new ArrayList<>(); + clusteringGroups.add(HoodieClusteringGroup.newBuilder() + .setVersion(1).setNumOutputFileGroups(1).setMetrics(Collections.emptyMap()) + .setSlices(Collections.singletonList(sliceInfo)).build()); + requestedReplaceMetadata.setExtraMetadata(Collections.emptyMap()); + requestedReplaceMetadata.setClusteringPlan(HoodieClusteringPlan.newBuilder() + .setVersion(1).setExtraMetadata(Collections.emptyMap()) + .setStrategy(HoodieClusteringStrategy.newBuilder().setStrategyClassName("").setVersion(1).build()) + .setInputGroups(clusteringGroups).build()); + + HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); + replaceMetadata.addReplaceFileId(partition, replacedFileId); + replaceMetadata.setOperationType(WriteOperationType.CLUSTER); + if (!StringUtils.isNullOrEmpty(newFileId)) { + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setPartitionPath(partition); + writeStat.setPath(partition + "/" + getBaseFilename(instantTime, newFileId)); + writeStat.setFileId(newFileId); + writeStat.setTotalWriteBytes(1); + writeStat.setFileSizeInBytes(1); + replaceMetadata.addWriteStat(partition, writeStat); + } + return Pair.of(requestedReplaceMetadata, replaceMetadata); } @Test @@ -651,8 +705,9 @@ public void 
testCleanMetadataUpgradeDowngrade() { String partition1 = DEFAULT_PARTITION_PATHS[0]; String partition2 = DEFAULT_PARTITION_PATHS[1]; - String fileName1 = "data1_1_000.parquet"; - String fileName2 = "data2_1_000.parquet"; + String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); + String fileName1 = "data1_1_000" + extension; + String fileName2 = "data2_1_000" + extension; String filePath1 = metaClient.getBasePath() + "/" + partition1 + "/" + fileName1; String filePath2 = metaClient.getBasePath() + "/" + partition1 + "/" + fileName2; @@ -664,7 +719,7 @@ public void testCleanMetadataUpgradeDowngrade() { // create partition1 clean stat. HoodieCleanStat cleanStat1 = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, partition1, deletePathPatterns1, successDeleteFiles1, - failedDeleteFiles1, instantTime); + failedDeleteFiles1, instantTime, ""); List deletePathPatterns2 = new ArrayList<>(); List successDeleteFiles2 = new ArrayList<>(); @@ -673,7 +728,7 @@ public void testCleanMetadataUpgradeDowngrade() { // create partition2 empty clean stat. HoodieCleanStat cleanStat2 = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_COMMITS, partition2, deletePathPatterns2, successDeleteFiles2, - failedDeleteFiles2, instantTime); + failedDeleteFiles2, instantTime, ""); // map with absolute file path. Map oldExpected = new HashMap<>(); @@ -741,8 +796,9 @@ public void testCleanPlanUpgradeDowngrade() { String partition1 = DEFAULT_PARTITION_PATHS[0]; String partition2 = DEFAULT_PARTITION_PATHS[1]; - String fileName1 = "data1_1_000.parquet"; - String fileName2 = "data2_1_000.parquet"; + String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); + String fileName1 = "data1_1_000" + extension; + String fileName2 = "data2_1_000" + extension; Map> filesToBeCleanedPerPartition = new HashMap<>(); filesToBeCleanedPerPartition.put(partition1, Arrays.asList(fileName1)); @@ -810,157 +866,19 @@ private static void assertCleanMetadataPathEquals(Map expected, } } - private static Stream argumentsForTestKeepLatestCommits() { - return Stream.of( - Arguments.of(false, false, false), - Arguments.of(true, false, false), - Arguments.of(false, true, false), - Arguments.of(false, false, true) - ); - } - - /** - * Test HoodieTable.clean() Cleaning by commit logic for COW table. - */ - @ParameterizedTest - @MethodSource("argumentsForTestKeepLatestCommits") - public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withIncrementalCleaningMode(enableIncrementalClean) - .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean) - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) - .build(); - - HoodieTestTable testTable = HoodieTestTable.of(metaClient); - String p0 = "2020/01/01"; - String p1 = "2020/01/02"; - Map> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null; - - // make 1 commit, with 1 file per partition - String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() - : UUID.randomUUID().toString(); - String file1P1C0 = enableBootstrapSourceClean ? 
bootstrapMapping.get(p1).get(0).getFileId() - : UUID.randomUUID().toString(); - testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); - - HoodieCommitMetadata commitMetadata = generateCommitMetadata( - Collections.unmodifiableMap(new HashMap>() { - { - put(p0, CollectionUtils.createImmutableList(file1P0C0)); - put(p1, CollectionUtils.createImmutableList(file1P1C0)); - } - }) - ); - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), - Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - - metaClient = HoodieTableMetaClient.reload(metaClient); - - List hoodieCleanStatsOne = runCleaner(config, simulateFailureRetry); - assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files"); - assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); - assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); - - // make next commit, with 1 insert & 1 update per partition - Map partitionAndFileId002 = testTable.addInflightCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0, p1); - String file2P0C1 = partitionAndFileId002.get(p0); - String file2P1C1 = partitionAndFileId002.get(p1); - testTable.forCommit("00000000000002").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); - commitMetadata = generateCommitMetadata(new HashMap>() { - { - put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1)); - put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1)); - } - }); - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000002"), - Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - List hoodieCleanStatsTwo = runCleaner(config, simulateFailureRetry); - assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions and clean any files"); - assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); - assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1)); - assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); - assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); - - // make next commit, with 2 updates to existing files, and 1 insert - String file3P0C2 = testTable.addInflightCommit("00000000000003") - .withBaseFilesInPartition(p0, file1P0C0) - .withBaseFilesInPartition(p0, file2P0C1) - .getFileIdsWithBaseFilesInPartitions(p0).get(p0); - commitMetadata = generateCommitMetadata(CollectionUtils - .createImmutableMap(p0, - CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file3P0C2))); - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000003"), - Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - - List hoodieCleanStatsThree = runCleaner(config, simulateFailureRetry); - assertEquals(0, hoodieCleanStatsThree.size(), - "Must not clean any file. 
We have to keep 1 version before the latest commit time to keep"); - assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); - - // make next commit, with 2 updates to existing files, and 1 insert - String file4P0C3 = testTable.addInflightCommit("00000000000004") - .withBaseFilesInPartition(p0, file1P0C0) - .withBaseFilesInPartition(p0, file2P0C1) - .getFileIdsWithBaseFilesInPartitions(p0).get(p0); - commitMetadata = generateCommitMetadata(CollectionUtils.createImmutableMap( - p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file4P0C3))); - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000004"), - Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - - List hoodieCleanStatsFour = runCleaner(config, simulateFailureRetry); - // enableBootstrapSourceClean would delete the bootstrap base file as the same time - HoodieCleanStat partitionCleanStat = getCleanStat(hoodieCleanStatsFour, p0); - - assertEquals(enableBootstrapSourceClean ? 2 : 1, partitionCleanStat.getSuccessDeleteFiles().size() - + (partitionCleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 - : partitionCleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least one old file"); - assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); - assertTrue(testTable.baseFileExists(p0, "00000000000002", file1P0C0)); - assertTrue(testTable.baseFileExists(p0, "00000000000003", file1P0C0)); - assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); - assertTrue(testTable.baseFileExists(p0, "00000000000003", file2P0C1)); - assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); - assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3)); - if (enableBootstrapSourceClean) { - assertFalse(Files.exists(Paths.get(bootstrapMapping.get( - p0).get(0).getBootstrapFileStatus().getPath().getUri()))); - } - - // No cleaning on partially written file, with no commit. - testTable.forCommit("00000000000005").withBaseFilesInPartition(p0, file3P0C2); - commitMetadata = generateCommitMetadata(CollectionUtils.createImmutableMap(p0, - CollectionUtils.createImmutableList(file3P0C2))); - metaClient.getActiveTimeline().createNewInstant( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000005")); - metaClient.getActiveTimeline().transitionRequestedToInflight( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000005"), - Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); - List hoodieCleanStatsFive = runCleaner(config, simulateFailureRetry); - HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsFive, p0); - assertNull(cleanStat, "Must not clean any files"); - assertTrue(testTable.baseFileExists(p0, "00000000000002", file1P0C0)); - assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); - assertTrue(testTable.baseFileExists(p0, "00000000000005", file3P0C2)); - } - /** * Generate Bootstrap index, bootstrap base file and corresponding metaClient. * @return Partition to BootstrapFileMapping Map * @throws IOException */ - private Map> generateBootstrapIndexAndSourceData(String... partitions) throws IOException { + protected Map> generateBootstrapIndexAndSourceData(String... 
partitions) throws IOException { // create bootstrap source data path java.nio.file.Path sourcePath = tempDir.resolve("data"); java.nio.file.Files.createDirectories(sourcePath); assertTrue(new File(sourcePath.toString()).exists()); // recreate metaClient with Bootstrap base path - metaClient = HoodieTestUtils.init(basePath, getTableType(), sourcePath.toString()); + metaClient = HoodieTestUtils.init(basePath, getTableType(), sourcePath.toString(), true); // generate bootstrap index Map> bootstrapMapping = TestBootstrapIndex.generateBootstrapIndex(metaClient, sourcePath.toString(), @@ -979,18 +897,23 @@ private Map> generateBootstrapIndexAndSourceD @Test public void testCleanMarkerDataFilesOnRollback() throws Exception { HoodieTestTable testTable = HoodieTestTable.of(metaClient) - .addRequestedCommit("000") + .addRequestedCommit("001") .withMarkerFiles("default", 10, IOType.MERGE); final int numTempFilesBefore = testTable.listAllFilesInTempFolder().length; assertEquals(10, numTempFilesBefore, "Some marker files are created."); - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withPath(basePath).build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieSparkTable.create(config, context, metaClient); table.getActiveTimeline().transitionRequestedToInflight( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "000"), Option.empty()); + new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "001"), Option.empty()); metaClient.reloadActiveTimeline(); - table.rollback(context, "001", new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "000"), true); + HoodieInstant rollbackInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001"); + table.scheduleRollback(context, "002", rollbackInstant, false, config.shouldRollbackUsingMarkers()); + table.rollback(context, "002", rollbackInstant, true, false); final int numTempFilesAfter = testTable.listAllFilesInTempFolder().length; assertEquals(0, numTempFilesAfter, "All temp files are deleted."); } @@ -1000,15 +923,18 @@ public void testCleanMarkerDataFilesOnRollback() throws Exception { */ @Test public void testCleaningWithZeroPartitionPaths() throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) .build(); // Make a commit, although there are no partitionPaths. // Example use-case of this is when a client wants to create a table // with just some commit metadata, but no data/partitionPaths. 
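The marker-file rollback test above now goes through a two-phase flow: the rollback is first scheduled under its own instant time and only then executed, and the write config points the file-system view at the embedded timeline service port. A minimal sketch of that flow, assuming only the calls visible in the + lines of this hunk (the argument names and the meaning of the trailing boolean flags are inferred, not confirmed against the full API):

    // the inflight commit "001" is the instant being rolled back
    HoodieInstant rollbackInstant =
        new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001");
    // first schedule the rollback under a new instant time ("002"), then execute it
    table.scheduleRollback(context, "002", rollbackInstant, false, config.shouldRollbackUsingMarkers());
    table.rollback(context, "002", rollbackInstant, true, false);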
- HoodieTestTable.of(metaClient).addCommit("000"); + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + testTable.doWriteOperation("001", WriteOperationType.INSERT, Collections.emptyList(), 1); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -1019,14 +945,16 @@ public void testCleaningWithZeroPartitionPaths() throws Exception { /** * Test Keep Latest Commits when there are pending compactions. */ - @Test - public void testKeepLatestCommitsWithPendingCompactions() throws Exception { - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testKeepLatestCommitsWithPendingCompactions(boolean isAsync) throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).withAsyncClean(isAsync).retainCommits(2).build()) .build(); // Deletions: - // . FileId Parquet Logs Total Retained Commits + // . FileId Base Logs Total Retained Commits // FileId7 5 10 15 009, 011 // FileId6 5 10 15 009 // FileId5 3 6 9 005 @@ -1044,12 +972,13 @@ public void testKeepLatestCommitsWithPendingCompactions() throws Exception { @ValueSource(booleans = {false, true}) public void testKeepLatestVersionsWithPendingCompactions(boolean retryFailure) throws Exception { HoodieWriteConfig config = - HoodieWriteConfig.newBuilder().withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() + HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()) .build(); // Deletions: - // . FileId Parquet Logs Total Retained Commits + // . 
FileId Base Logs Total Retained Commits // FileId7 5 10 15 009, 011 // FileId6 4 8 12 007, 009 // FileId5 2 4 6 003 005 @@ -1067,12 +996,13 @@ public void testKeepLatestVersionsWithPendingCompactions(boolean retryFailure) t public void testCleanPreviousCorruptedCleanFiles() throws IOException { HoodieWriteConfig config = HoodieWriteConfig.newBuilder() - .withPath(basePath).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) + .withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) .build(); - String commitTime = makeNewCommitTime(1); + String commitTime = makeNewCommitTime(1, "%09d"); List cleanerFileNames = Arrays.asList( HoodieTimeline.makeRequestedCleanerFileName(commitTime), HoodieTimeline.makeInflightCleanerFileName(commitTime)); @@ -1090,6 +1020,74 @@ public void testCleanPreviousCorruptedCleanFiles() throws IOException { assertEquals(0, cleanStats.size(), "Must not clean any files"); } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testRerunFailedClean(boolean simulateMetadataFailure) throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1) + .withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) + .build(); + + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + String p0 = "2020/01/01"; + String p1 = "2020/01/02"; + + // make 1 commit, with 1 file per partition + String file1P0C0 = UUID.randomUUID().toString(); + String file1P1C0 = UUID.randomUUID().toString(); + testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + + HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001", + Collections.unmodifiableMap(new HashMap>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0)); + put(p1, CollectionUtils.createImmutableList(file1P1C0)); + } + }) + ); + metadataWriter.update(commitMetadata, "00000000000001", false); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + // make next replacecommit, with 1 clustering operation. logically delete p0. No change to p1 + // notice that clustering generates empty inflight commit files + Map partitionAndFileId002 = testTable.forReplaceCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0); + String file2P0C1 = partitionAndFileId002.get(p0); + Pair replaceMetadata = + generateReplaceCommitMetadata("00000000000002", p0, file1P0C0, file2P0C1); + testTable.addReplaceCommit("00000000000002", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); + + // make next replacecommit, with 1 clustering operation. 
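Several of the cleaner tests above are rebuilt around the same configuration pattern: the date-partitioning assumption moves under HoodieMetadataConfig, and the cleaner policy and retention settings move from HoodieCompactionConfig into a dedicated HoodieCleanConfig. A minimal sketch of that builder chain, mirroring the + lines and otherwise not verified against the wider API:

    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        // the partition-layout assumption now lives in the metadata config
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
        // cleaning policy and retention now live in the dedicated clean config
        .withCleanConfig(HoodieCleanConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
            .retainCommits(2)
            .build())
        .build();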
Replace data in p1. No change to p0 + // notice that clustering generates empty inflight commit files + Map partitionAndFileId003 = testTable.forReplaceCommit("00000000000003").getFileIdsWithBaseFilesInPartitions(p1); + String file3P1C2 = partitionAndFileId003.get(p1); + replaceMetadata = generateReplaceCommitMetadata("00000000000003", p1, file1P1C0, file3P1C2); + testTable.addReplaceCommit("00000000000003", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); + + // make next replacecommit, with 1 clustering operation. Replace data in p0 again + // notice that clustering generates empty inflight commit files + Map partitionAndFileId004 = testTable.forReplaceCommit("00000000000004").getFileIdsWithBaseFilesInPartitions(p0); + String file4P0C3 = partitionAndFileId004.get(p0); + replaceMetadata = generateReplaceCommitMetadata("00000000000004", p0, file2P0C1, file4P0C3); + testTable.addReplaceCommit("00000000000004", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); + + // run cleaner with failures + List hoodieCleanStats = runCleaner(config, true, simulateMetadataFailure, 5, true); + assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3)); + assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2)); + assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + //file1P1C0 still stays because its not replaced until 3 and its the only version available + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + } + /** * Common test method for validating pending compactions. * @@ -1231,14 +1229,18 @@ private Stream> convertPathToFileIdWithCommitTime(final Hoo return Stream.concat(stream1, stream2); } - private static HoodieCommitMetadata generateCommitMetadata(Map> partitionToFilePaths) { + protected static HoodieCommitMetadata generateCommitMetadata( + String instantTime, Map> partitionToFilePaths) { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); - partitionToFilePaths.forEach((key, value) -> value.forEach(f -> { + metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, HoodieTestTable.PHONY_TABLE_SCHEMA); + partitionToFilePaths.forEach((partitionPath, fileList) -> fileList.forEach(f -> { HoodieWriteStat writeStat = new HoodieWriteStat(); - writeStat.setPartitionPath(key); - writeStat.setPath(f); + writeStat.setPartitionPath(partitionPath); + writeStat.setPath(partitionPath + "/" + getBaseFilename(instantTime, f)); writeStat.setFileId(f); - metadata.addWriteStat(key, writeStat); + writeStat.setTotalWriteBytes(1); + writeStat.setFileSizeInBytes(1); + metadata.addWriteStat(partitionPath, writeStat); })); return metadata; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java index 8b19ac1c102c6..22fafe4a58747 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FailSafeConsistencyGuard; import org.apache.hudi.common.fs.OptimisticConsistencyGuard; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.testutils.FileCreateUtils; import 
org.apache.hudi.testutils.HoodieClientTestHarness; @@ -44,6 +45,8 @@ */ public class TestConsistencyGuard extends HoodieClientTestHarness { + private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); + // multiple parameters, uses Collection public static List consistencyGuardType() { return Arrays.asList( @@ -73,17 +76,19 @@ public void testCheckPassingAppearAndDisAppear(String consistencyGuardType) thro ConsistencyGuardConfig config = getConsistencyGuardConfig(1, 1000, 1000); ConsistencyGuard passing = consistencyGuardType.equals(FailSafeConsistencyGuard.class.getName()) ? new FailSafeConsistencyGuard(fs, config) : new OptimisticConsistencyGuard(fs, config); - passing.waitTillFileAppears(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet")); - passing.waitTillFileAppears(new Path(basePath + "/partition/path/f2_1-0-1_000.parquet")); + passing.waitTillFileAppears(new Path(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION)); + passing.waitTillFileAppears(new Path(basePath + "/partition/path/f2_1-0-1_000" + BASE_FILE_EXTENSION)); passing.waitTillAllFilesAppear(basePath + "/partition/path", Arrays - .asList(basePath + "/partition/path/f1_1-0-1_000.parquet", basePath + "/partition/path/f2_1-0-1_000.parquet")); + .asList(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION, + basePath + "/partition/path/f2_1-0-1_000" + BASE_FILE_EXTENSION)); - fs.delete(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet"), false); - fs.delete(new Path(basePath + "/partition/path/f2_1-0-1_000.parquet"), false); - passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet")); - passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f2_1-0-1_000.parquet")); + fs.delete(new Path(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION), false); + fs.delete(new Path(basePath + "/partition/path/f2_1-0-1_000" + BASE_FILE_EXTENSION), false); + passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION)); + passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f2_1-0-1_000" + BASE_FILE_EXTENSION)); passing.waitTillAllFilesDisappear(basePath + "/partition/path", Arrays - .asList(basePath + "/partition/path/f1_1-0-1_000.parquet", basePath + "/partition/path/f2_1-0-1_000.parquet")); + .asList(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION, + basePath + "/partition/path/f2_1-0-1_000" + BASE_FILE_EXTENSION)); } @Test @@ -92,7 +97,8 @@ public void testCheckFailingAppearFailSafe() throws Exception { ConsistencyGuard passing = new FailSafeConsistencyGuard(fs, getConsistencyGuardConfig()); assertThrows(TimeoutException.class, () -> { passing.waitTillAllFilesAppear(basePath + "/partition/path", Arrays - .asList(basePath + "/partition/path/f1_1-0-2_000.parquet", basePath + "/partition/path/f2_1-0-2_000.parquet")); + .asList(basePath + "/partition/path/f1_1-0-2_000" + BASE_FILE_EXTENSION, + basePath + "/partition/path/f2_1-0-2_000" + BASE_FILE_EXTENSION)); }); } @@ -101,7 +107,8 @@ public void testCheckFailingAppearTimedWait() throws Exception { FileCreateUtils.createBaseFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new OptimisticConsistencyGuard(fs, getConsistencyGuardConfig()); passing.waitTillAllFilesAppear(basePath + "/partition/path", Arrays - .asList(basePath + "/partition/path/f1_1-0-2_000.parquet", basePath + "/partition/path/f2_1-0-2_000.parquet")); + .asList(basePath + 
"/partition/path/f1_1-0-2_000" + BASE_FILE_EXTENSION, + basePath + "/partition/path/f2_1-0-2_000" + BASE_FILE_EXTENSION)); } @Test @@ -109,7 +116,7 @@ public void testCheckFailingAppearsFailSafe() throws Exception { FileCreateUtils.createBaseFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new FailSafeConsistencyGuard(fs, getConsistencyGuardConfig()); assertThrows(TimeoutException.class, () -> { - passing.waitTillFileAppears(new Path(basePath + "/partition/path/f1_1-0-2_000.parquet")); + passing.waitTillFileAppears(new Path(basePath + "/partition/path/f1_1-0-2_000" + BASE_FILE_EXTENSION)); }); } @@ -117,7 +124,7 @@ public void testCheckFailingAppearsFailSafe() throws Exception { public void testCheckFailingAppearsTimedWait() throws Exception { FileCreateUtils.createBaseFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new OptimisticConsistencyGuard(fs, getConsistencyGuardConfig()); - passing.waitTillFileAppears(new Path(basePath + "/partition/path/f1_1-0-2_000.parquet")); + passing.waitTillFileAppears(new Path(basePath + "/partition/path/f1_1-0-2_000" + BASE_FILE_EXTENSION)); } @Test @@ -126,7 +133,8 @@ public void testCheckFailingDisappearFailSafe() throws Exception { ConsistencyGuard passing = new FailSafeConsistencyGuard(fs, getConsistencyGuardConfig()); assertThrows(TimeoutException.class, () -> { passing.waitTillAllFilesDisappear(basePath + "/partition/path", Arrays - .asList(basePath + "/partition/path/f1_1-0-1_000.parquet", basePath + "/partition/path/f2_1-0-2_000.parquet")); + .asList(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION, + basePath + "/partition/path/f2_1-0-2_000" + BASE_FILE_EXTENSION)); }); } @@ -135,7 +143,8 @@ public void testCheckFailingDisappearTimedWait() throws Exception { FileCreateUtils.createBaseFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new OptimisticConsistencyGuard(fs, getConsistencyGuardConfig()); passing.waitTillAllFilesDisappear(basePath + "/partition/path", Arrays - .asList(basePath + "/partition/path/f1_1-0-1_000.parquet", basePath + "/partition/path/f2_1-0-2_000.parquet")); + .asList(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION, + basePath + "/partition/path/f2_1-0-2_000" + BASE_FILE_EXTENSION)); } @Test @@ -144,7 +153,7 @@ public void testCheckFailingDisappearsFailSafe() throws Exception { FileCreateUtils.createBaseFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new FailSafeConsistencyGuard(fs, getConsistencyGuardConfig()); assertThrows(TimeoutException.class, () -> { - passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet")); + passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION)); }); } @@ -153,16 +162,16 @@ public void testCheckFailingDisappearsTimedWait() throws Exception { FileCreateUtils.createBaseFile(basePath, "partition/path", "000", "f1"); FileCreateUtils.createBaseFile(basePath, "partition/path", "000", "f1"); ConsistencyGuard passing = new OptimisticConsistencyGuard(fs, getConsistencyGuardConfig()); - passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f1_1-0-1_000.parquet")); + passing.waitTillFileDisappears(new Path(basePath + "/partition/path/f1_1-0-1_000" + BASE_FILE_EXTENSION)); } private ConsistencyGuardConfig getConsistencyGuardConfig() { return getConsistencyGuardConfig(3, 10, 10); } - private ConsistencyGuardConfig getConsistencyGuardConfig(int maxChecks, int initalSleep, int maxSleep) { + 
private ConsistencyGuardConfig getConsistencyGuardConfig(int maxChecks, int initialSleep, int maxSleep) { return ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) - .withInitialConsistencyCheckIntervalMs(initalSleep).withMaxConsistencyCheckIntervalMs(maxSleep) + .withInitialConsistencyCheckIntervalMs(initialSleep).withMaxConsistencyCheckIntervalMs(maxSleep) .withMaxConsistencyChecks(maxChecks).build(); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java index 8b47fa3d49457..18f764c1fa25f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java @@ -18,325 +18,94 @@ package org.apache.hudi.table; -import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.HoodieFileGroup; -import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; -import org.apache.hudi.common.table.view.HoodieTableFileSystemView; -import org.apache.hudi.common.table.view.SyncableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; -import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; -import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.Transformations; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.HoodieHFileInputFormat; -import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.realtime.HoodieHFileRealtimeInputFormat; -import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; -import org.apache.hudi.hadoop.utils.HoodieHiveUtils; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.index.HoodieIndex; import 
org.apache.hudi.index.HoodieIndex.IndexType; -import org.apache.hudi.table.action.deltacommit.AbstractSparkDeltaCommitActionExecutor; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.deltacommit.BaseSparkDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor; -import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.hudi.testutils.MetadataMergeWriteStatus; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.spark.api.java.JavaRDD; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.storage.StorageLevel; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; -import java.io.File; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.apache.hudi.testutils.HoodieClientTestHarness.buildProfile; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { - private JobConf roSnapshotJobConf; - private JobConf roJobConf; - private JobConf rtJobConf; - - @TempDir - public java.nio.file.Path tempFolder; - - private HoodieFileFormat baseFileFormat; - - public void init(HoodieFileFormat baseFileFormat) throws IOException { - this.baseFileFormat = baseFileFormat; - initDFS(); - initSparkContexts("TestHoodieMergeOnReadTable"); - hadoopConf.addResource(dfs.getConf()); - jsc.hadoopConfiguration().addResource(dfs.getConf()); - context = new HoodieSparkEngineContext(jsc); - initPath(); - dfs.mkdirs(new Path(basePath)); - metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ, baseFileFormat); - initTestDataGenerator(); - - roSnapshotJobConf = new JobConf(hadoopConf); - roJobConf = new JobConf(hadoopConf); - rtJobConf = new JobConf(hadoopConf); - } +public class TestHoodieMergeOnReadTable extends SparkClientFunctionalTestHarness { - @BeforeEach - public void init() throws IOException { - init(HoodieFileFormat.PARQUET); - } + private HoodieTableMetaClient metaClient; + private HoodieTestDataGenerator dataGen; - @AfterEach - 
public void clean() throws IOException { - cleanupResources(); - } - - @Test - public void testSimpleInsertAndUpdate() throws Exception { - HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 200); - insertAndGetFilePaths(records, client, cfg, newCommitTime); - - /** - * Write 2 (updates) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUpdates(newCommitTime, 100); - updateAndGetFilePaths(records, client, cfg, newCommitTime); - - String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString(); - client.compact(compactionCommitTime); - - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - Stream dataFilesToRead = tableView.getLatestBaseFiles(); - assertTrue(dataFilesToRead.findAny().isPresent()); - - // verify that there is a commit - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants(); - assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), - "Expecting a single commit."); - String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); - assertTrue(HoodieTimeline.compareTimestamps("000", HoodieTimeline.LESSER_THAN, latestCompactionCommitTime)); - - assertEquals(200, HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, "000"), - "Must contain 200 records"); - } - } - - @Test - public void testSimpleInsertAndUpdateHFile() throws Exception { - clean(); - init(HoodieFileFormat.HFILE); - - HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 200); - insertAndGetFilePaths(records, client, cfg, newCommitTime); - - /** - * Write 2 (updates) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - records = dataGen.generateUpdates(newCommitTime, 100); - updateAndGetFilePaths(records, client, cfg, newCommitTime); - - String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString(); - client.compact(compactionCommitTime); - - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - Stream dataFilesToRead = tableView.getLatestBaseFiles(); - assertTrue(dataFilesToRead.findAny().isPresent()); - - // verify that there is a commit - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants(); - assertEquals(1, timeline.findInstantsAfter("000", 
Integer.MAX_VALUE).countInstants(), - "Expecting a single commit."); - String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); - assertTrue(HoodieTimeline.compareTimestamps("000", HoodieTimeline.LESSER_THAN, latestCompactionCommitTime)); - - assertEquals(200, HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, "000"), - "Must contain 200 records"); - } - } - - // test incremental read does not go past compaction instant for RO views - // For RT views, incremental read can go past compaction - @Test - public void testIncrementalReadsWithCompaction() throws Exception { - String partitionPath = "2020/02/20"; // use only one partition for this test - dataGen = new HoodieTestDataGenerator(new String[] { partitionPath }); - HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - - /** - * Write 1 (only inserts) - */ - String commitTime1 = "001"; - client.startCommitWithTime(commitTime1); - - List records001 = dataGen.generateInserts(commitTime1, 200); - insertAndGetFilePaths(records001, client, cfg, commitTime1); - - // verify only one base file shows up with commit time 001 - FileStatus[] snapshotROFiles = getROSnapshotFiles(partitionPath); - validateFiles(partitionPath, 1, snapshotROFiles, false, roSnapshotJobConf, 200, commitTime1); - - FileStatus[] incrementalROFiles = getROIncrementalFiles(partitionPath, true); - validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1); - Path firstFilePath = incrementalROFiles[0].getPath(); - - FileStatus[] incrementalRTFiles = getRTIncrementalFiles(partitionPath); - validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf,200, commitTime1); - - assertEquals(firstFilePath, incrementalRTFiles[0].getPath()); - - /** - * Write 2 (updates) - */ - String updateTime = "004"; - client.startCommitWithTime(updateTime); - List records004 = dataGen.generateUpdates(updateTime, 100); - updateAndGetFilePaths(records004, client, cfg, updateTime); - - // verify RO incremental reads - only one parquet file shows up because updates to into log files - incrementalROFiles = getROIncrementalFiles(partitionPath, false); - validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1); - assertEquals(firstFilePath, incrementalROFiles[0].getPath()); - - // verify RT incremental reads includes updates also - incrementalRTFiles = getRTIncrementalFiles(partitionPath); - validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime); - - // request compaction, but do not perform compaction - String compactionCommitTime = "005"; - client.scheduleCompactionAtInstant("005", Option.empty()); - - // verify RO incremental reads - only one parquet file shows up because updates go into log files - incrementalROFiles = getROIncrementalFiles(partitionPath, true); - validateFiles(partitionPath,1, incrementalROFiles, false, roJobConf, 200, commitTime1); - - // verify RT incremental reads includes updates also - incrementalRTFiles = getRTIncrementalFiles(partitionPath); - validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime); - - // write 3 - more inserts - String insertsTime = "006"; - List records006 = dataGen.generateInserts(insertsTime, 200); - client.startCommitWithTime(insertsTime); - insertAndGetFilePaths(records006, client, cfg, insertsTime); - - // verify new write shows up in snapshot mode even though there is pending compaction - 
snapshotROFiles = getROSnapshotFiles(partitionPath); - validateFiles(partitionPath, 2, snapshotROFiles, false, roSnapshotJobConf,400, commitTime1, insertsTime); - - incrementalROFiles = getROIncrementalFiles(partitionPath, true); - assertEquals(firstFilePath, incrementalROFiles[0].getPath()); - // verify 006 does not show up in RO mode because of pending compaction - - validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1); - - // verify that if stopAtCompaction is disabled, inserts from "insertsTime" show up - incrementalROFiles = getROIncrementalFiles(partitionPath, false); - validateFiles(partitionPath,2, incrementalROFiles, false, roJobConf, 400, commitTime1, insertsTime); - - // verify 006 shows up in RT views - incrementalRTFiles = getRTIncrementalFiles(partitionPath); - validateFiles(partitionPath, 2, incrementalRTFiles, true, rtJobConf, 400, commitTime1, updateTime, insertsTime); - - // perform the scheduled compaction - client.compact(compactionCommitTime); - - // verify new write shows up in snapshot mode after compaction is complete - snapshotROFiles = getROSnapshotFiles(partitionPath); - validateFiles(partitionPath,2, snapshotROFiles, false, roSnapshotJobConf,400, commitTime1, compactionCommitTime, - insertsTime); - - incrementalROFiles = getROIncrementalFiles(partitionPath, "002", -1, true); - assertTrue(incrementalROFiles.length == 2); - // verify 006 shows up because of pending compaction - validateFiles(partitionPath, 2, incrementalROFiles, false, roJobConf, 400, commitTime1, compactionCommitTime, - insertsTime); - } + void setUp(Properties props) throws IOException { + Properties properties = CollectionUtils.copy(props); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + dataGen = new HoodieTestDataGenerator(); } // Check if record level metadata is aggregated properly at the end of write. 
@Test public void testMetadataAggregateFromWriteStatus() throws Exception { HoodieWriteConfig cfg = getConfigBuilder(false).withWriteStatusClass(MetadataMergeWriteStatus.class).build(); + + setUp(cfg.getProps()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { String newCommitTime = "001"; List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD writeRecords = jsc().parallelize(records, 1); client.startCommitWithTime(newCommitTime); @@ -352,490 +121,30 @@ public void testMetadataAggregateFromWriteStatus() throws Exception { } } - @Test - public void testSimpleInsertUpdateAndDelete() throws Exception { - HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - - /** - * Write 1 (only inserts, written as parquet file) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 20); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - - Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001"); - - Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); - Stream dataFilesToRead = tableView.getLatestBaseFiles(); - assertFalse(dataFilesToRead.findAny().isPresent()); - - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - dataFilesToRead = tableView.getLatestBaseFiles(); - assertTrue(dataFilesToRead.findAny().isPresent(), - "should list the parquet files we wrote in the delta commit"); - - /** - * Write 2 (only updates, written to .log file) - */ - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testUpsertPartitioner(boolean populateMetaFields) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); - records = dataGen.generateUpdates(newCommitTime, records); - writeRecords = jsc.parallelize(records, 1); - statuses = client.upsert(writeRecords, newCommitTime).collect(); - assertNoWriteErrors(statuses); - - /** - * Write 2 (only deletes, written to .log file) - */ - newCommitTime = "004"; - client.startCommitWithTime(newCommitTime); - - List fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records); - - statuses = client.upsert(jsc.parallelize(fewRecordsForDelete, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = HoodieTableMetaClient.reload(metaClient); - deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("004", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 
004"); - - commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - dataFilesToRead = tableView.getLatestBaseFiles(); - assertTrue(dataFilesToRead.findAny().isPresent()); - - List dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath); - // Wrote 20 records and deleted 20 records, so remaining 20-20 = 0 - assertEquals(0, recordsRead.size(), "Must contain 0 records"); - } - } - - private void testCOWToMORConvertedTableRollback(Boolean rollbackUsingMarkers) throws Exception { - // Set TableType to COW - HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE); - - HoodieWriteConfig cfg = getConfig(false, rollbackUsingMarkers); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.upsert(writeRecords, newCommitTime).collect(); - // verify there are no errors - assertNoWriteErrors(statuses); - client.commit(newCommitTime, jsc.parallelize(statuses)); - - metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); - Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertTrue(commit.isPresent()); - assertEquals("001", commit.get().getTimestamp(), "commit should be 001"); - - /** - * Write 2 (updates) - */ - newCommitTime = "002"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - - statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - // Set TableType to MOR - HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); - - // rollback a COW commit when TableType is MOR - client.rollback(newCommitTime); - - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - - final String absentCommit = newCommitTime; - assertFalse(tableView.getLatestBaseFiles().anyMatch(file -> absentCommit.equals(file.getCommitTime()))); - } - } - - @Test - public void testCOWToMORConvertedTableRollbackUsingFileList() throws Exception { - testCOWToMORConvertedTableRollback(false); - } - - @Test - public void testCOWToMORConvertedTableRollbackUsingMarkers() throws Exception { - testCOWToMORConvertedTableRollback(true); - } - - private void testRollbackWithDeltaAndCompactionCommit(Boolean rollbackUsingMarkers) throws Exception { - HoodieWriteConfig cfg = getConfig(false, rollbackUsingMarkers); + setUp(cfg.getProps()); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - // Test delta commit rollback - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = 
jsc.parallelize(records, 1); - - JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - client.commit(newCommitTime, writeStatusJavaRDD); - List statuses = writeStatusJavaRDD.collect(); - assertNoWriteErrors(statuses); - - metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - - Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001"); - - Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = - getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); - Stream dataFilesToRead = tableView.getLatestBaseFiles(); - assertTrue(!dataFilesToRead.findAny().isPresent()); - - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - dataFilesToRead = tableView.getLatestBaseFiles(); - assertTrue(dataFilesToRead.findAny().isPresent(), - "should list the parquet files we wrote in the delta commit"); - /** - * Write 2 (inserts + updates - testing failed delta commit) - */ - final String commitTime1 = "002"; - // WriteClient with custom config (disable small file handling) - try (SparkRDDWriteClient secondClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff());) { - secondClient.startCommitWithTime(commitTime1); - - List copyOfRecords = new ArrayList<>(records); - copyOfRecords = dataGen.generateUpdates(commitTime1, copyOfRecords); - copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200)); - - List dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, - basePath); - assertEquals(recordsRead.size(), 200); - - statuses = secondClient.upsert(jsc.parallelize(copyOfRecords, 1), commitTime1).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - // Test failed delta commit rollback - secondClient.rollback(commitTime1); - allFiles = listAllBaseFilesInPath(hoodieTable); - // After rollback, there should be no base file with the failed commit time - List remainingFiles = Arrays.stream(allFiles).filter(file -> file.getPath().getName() - .contains(commitTime1)).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList()); - assertEquals(0, remainingFiles.size(), "There files should have been rolled-back " - + "when rolling back commit " + commitTime1 + " but are still remaining. 
Files: " + remainingFiles); - dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath); - assertEquals(200, recordsRead.size()); - } - - /** - * Write 3 (inserts + updates - testing successful delta commit) - */ - final String commitTime2 = "002"; - try (SparkRDDWriteClient thirdClient = getHoodieWriteClient(cfg);) { - thirdClient.startCommitWithTime(commitTime2); - - List copyOfRecords = new ArrayList<>(records); - copyOfRecords = dataGen.generateUpdates(commitTime2, copyOfRecords); - copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200)); - - List dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, - basePath); - assertEquals(200, recordsRead.size()); - - writeRecords = jsc.parallelize(copyOfRecords, 1); - writeStatusJavaRDD = thirdClient.upsert(writeRecords, commitTime2); - statuses = writeStatusJavaRDD.collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - // Test successful delta commit rollback - thirdClient.rollback(commitTime2); - allFiles = listAllBaseFilesInPath(hoodieTable); - // After rollback, there should be no parquet file with the failed commit time - assertEquals(0, Arrays.stream(allFiles) - .filter(file -> file.getPath().getName().contains(commitTime2)).count()); - - metaClient = HoodieTableMetaClient.reload(metaClient); - hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath); - // check that the number of records read is still correct after rollback operation - assertEquals(200, recordsRead.size()); - - // Test compaction commit rollback - /** - * Write 4 (updates) - */ - newCommitTime = "003"; - thirdClient.startCommitWithTime(newCommitTime); - - writeStatusJavaRDD = thirdClient.upsert(writeRecords, newCommitTime); - thirdClient.commit(newCommitTime, writeStatusJavaRDD); - statuses = writeStatusJavaRDD.collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = HoodieTableMetaClient.reload(metaClient); - - String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString(); - thirdClient.compact(compactionInstantTime); - - allFiles = listAllBaseFilesInPath(hoodieTable); - metaClient = HoodieTableMetaClient.reload(metaClient); - tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); - - final String compactedCommitTime = metaClient.getActiveTimeline().reload().lastInstant().get().getTimestamp(); - assertTrue(Arrays.stream(listAllBaseFilesInPath(hoodieTable)) - .anyMatch(file -> compactedCommitTime.equals(new HoodieBaseFile(file).getCommitTime()))); - thirdClient.rollbackInflightCompaction(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime), - hoodieTable); - allFiles = listAllBaseFilesInPath(hoodieTable); - metaClient = HoodieTableMetaClient.reload(metaClient); - tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); - - 
assertFalse(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime()))); - } - } - } - - @Test - public void testRollbackWithDeltaAndCompactionCommitUsingFileList() throws Exception { - testRollbackWithDeltaAndCompactionCommit(false); - } - - @Test - public void testRollbackWithDeltaAndCompactionCommitUsingMarkers() throws Exception { - testRollbackWithDeltaAndCompactionCommit(true); - } - - @Test - public void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { - HoodieWriteConfig cfg = getConfig(false); - try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - /** - * Write 1 (only inserts) - */ - String newCommitTime = "001"; - client.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); - - JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - client.commit(newCommitTime, writeStatusJavaRDD); - List statuses = writeStatusJavaRDD.collect(); - assertNoWriteErrors(statuses); - - metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - - Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001"); - - Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - - FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); - Stream dataFilesToRead = tableView.getLatestBaseFiles(); - assertFalse(dataFilesToRead.findAny().isPresent()); - - tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - dataFilesToRead = tableView.getLatestBaseFiles(); - assertTrue(dataFilesToRead.findAny().isPresent(), - "Should list the parquet files we wrote in the delta commit"); - - /** - * Write 2 (inserts + updates) - */ - newCommitTime = "002"; - // WriteClient with custom config (disable small file handling) - SparkRDDWriteClient nClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff()); - nClient.startCommitWithTime(newCommitTime); - - List copyOfRecords = new ArrayList<>(records); - copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords); - copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200)); - - List dataFiles = tableView.getLatestBaseFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, - basePath); - assertEquals(200, recordsRead.size()); - - statuses = nClient.upsert(jsc.parallelize(copyOfRecords, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - nClient.commit(newCommitTime, writeStatusJavaRDD); - copyOfRecords.clear(); - - // Schedule a compaction - /** - * Write 3 (inserts + updates) - */ - newCommitTime = "003"; - client.startCommitWithTime(newCommitTime); - - List newInserts = dataGen.generateInserts(newCommitTime, 100); - records = dataGen.generateUpdates(newCommitTime, records); - records.addAll(newInserts); - writeRecords = jsc.parallelize(records, 1); - - writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - 
client.commit(newCommitTime, writeStatusJavaRDD); - statuses = writeStatusJavaRDD.collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = HoodieTableMetaClient.reload(metaClient); - - String compactionInstantTime = "004"; - client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); - - // Compaction commit - /** - * Write 4 (updates) - */ - newCommitTime = "005"; - client.startCommitWithTime(newCommitTime); - - records = dataGen.generateUpdates(newCommitTime, records); - writeRecords = jsc.parallelize(records, 1); - - writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - client.commit(newCommitTime, writeStatusJavaRDD); - statuses = writeStatusJavaRDD.collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - - metaClient = HoodieTableMetaClient.reload(metaClient); - - compactionInstantTime = "006"; - client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); - JavaRDD ws = (JavaRDD) client.compact(compactionInstantTime); - client.commitCompaction(compactionInstantTime, ws, Option.empty()); - - allFiles = listAllBaseFilesInPath(hoodieTable); - metaClient = HoodieTableMetaClient.reload(metaClient); - tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); - - final String compactedCommitTime = - metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get().getTimestamp(); - - assertTrue(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime()))); - - /** - * Write 5 (updates) - */ - newCommitTime = "007"; - client.startCommitWithTime(newCommitTime); - copyOfRecords = new ArrayList<>(records); - copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords); - copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200)); - - statuses = client.upsert(jsc.parallelize(copyOfRecords, 1), newCommitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - client.commit(newCommitTime, writeStatusJavaRDD); - copyOfRecords.clear(); - - // Rollback latest commit first - client.restoreToInstant("000"); - - metaClient = HoodieTableMetaClient.reload(metaClient); - allFiles = listAllBaseFilesInPath(hoodieTable); - tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); - dataFilesToRead = tableView.getLatestBaseFiles(); - assertFalse(dataFilesToRead.findAny().isPresent()); - SliceView rtView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); - List fileGroups = - ((HoodieTableFileSystemView) rtView).getAllFileGroups().collect(Collectors.toList()); - assertTrue(fileGroups.isEmpty()); - - // make sure there are no log files remaining - assertEquals(0L, ((HoodieTableFileSystemView) rtView).getAllFileGroups() - .filter(fileGroup -> fileGroup.getAllRawFileSlices().noneMatch(f -> f.getLogFiles().count() == 0)) - .count()); - - } - } - - protected HoodieWriteConfig getHoodieWriteConfigWithSmallFileHandlingOff() { - return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withDeleteParallelism(2) - .withAutoCommit(false).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024) - .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) - .withEmbeddedTimelineServerEnabled(true) - 
.withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024).parquetMaxFileSize(1024).build()).forTable("test-trip-table") - .build(); - } - - @Test - public void testUpsertPartitioner() throws Exception { - HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - - /** - * Write 1 (only inserts, written as parquet file) + * Write 1 (only inserts, written as base file) */ String newCommitTime = "001"; client.startCommitWithTime(newCommitTime); List records = dataGen.generateInserts(newCommitTime, 20); - JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD writeRecords = jsc().parallelize(records, 1); List statuses = client.upsert(writeRecords, newCommitTime).collect(); assertNoWriteErrors(statuses); - metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); assertTrue(deltaCommit.isPresent()); @@ -848,17 +157,17 @@ public void testUpsertPartitioner() throws Exception { BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles); Stream dataFilesToRead = roView.getLatestBaseFiles(); - Map parquetFileIdToSize = + Map fileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize)); roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); dataFilesToRead = roView.getLatestBaseFiles(); List dataFilesList = dataFilesToRead.collect(Collectors.toList()); assertTrue(dataFilesList.size() > 0, - "Should list the parquet files we wrote in the delta commit"); + "Should list the base files we wrote in the delta commit"); /** - * Write 2 (only updates + inserts, written to .log file + correction of existing parquet file size) + * Write 2 (only updates + inserts, written to .log file + correction of existing base file size) */ newCommitTime = "002"; client.startCommitWithTime(newCommitTime); @@ -866,7 +175,7 @@ public void testUpsertPartitioner() throws Exception { List newRecords = dataGen.generateUpdates(newCommitTime, records); newRecords.addAll(dataGen.generateInserts(newCommitTime, 20)); - statuses = client.upsert(jsc.parallelize(newRecords), newCommitTime).collect(); + statuses = client.upsert(jsc().parallelize(newRecords), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); @@ -883,73 +192,85 @@ public void testUpsertPartitioner() throws Exception { hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles); dataFilesToRead = roView.getLatestBaseFiles(); List newDataFilesList = dataFilesToRead.collect(Collectors.toList()); - Map parquetFileIdToNewSize = + Map fileIdToNewSize = newDataFilesList.stream().collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize)); - assertTrue(parquetFileIdToNewSize.entrySet().stream().anyMatch(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue())); + assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> fileIdToSize.get(entry.getKey()) < entry.getValue())); - List dataFiles = roView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, 
dataFiles, - basePath); + List inputPaths = roView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, + basePath(), new JobConf(hadoopConf()), true, false); // Wrote 20 records in 2 batches assertEquals(40, recordsRead.size(), "Must contain 40 records"); } } - @Test - public void testLogFileCountsAfterCompaction() throws Exception { + // TODO: Enable metadata virtual keys in this test once the feature HUDI-2593 is completed + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws Exception { + boolean populateMetaFields = true; // insert 100 records - HoodieWriteConfig config = getConfig(true); + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true, false, HoodieIndex.IndexType.BLOOM, + 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), preserveCommitMeta); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig config = cfgBuilder.build(); + + setUp(config.getProps()); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { String newCommitTime = "100"; writeClient.startCommitWithTime(newCommitTime); List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD recordsRDD = jsc.parallelize(records, 1); + JavaRDD recordsRDD = jsc().parallelize(records, 1); writeClient.insert(recordsRDD, newCommitTime).collect(); // Update all the 100 records - metaClient = getHoodieMetaClient(hadoopConf, basePath); - newCommitTime = "101"; - writeClient.startCommitWithTime(newCommitTime); - List updatedRecords = dataGen.generateUpdates(newCommitTime, records); - JavaRDD updatedRecordsRDD = jsc.parallelize(updatedRecords, 1); + JavaRDD updatedRecordsRDD = jsc().parallelize(updatedRecords, 1); + + SparkRDDReadClient readClient = new SparkRDDReadClient(context(), config); + JavaRDD updatedTaggedRecordsRDD = readClient.tagLocation(updatedRecordsRDD); - HoodieReadClient readClient = new HoodieReadClient(context, config); - updatedRecords = readClient.tagLocation(updatedRecordsRDD).collect(); + writeClient.startCommitWithTime(newCommitTime); + writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect(); // Write them to corresponding avro logfiles metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS) - .withLogAppends(updatedRecords); - // In writeRecordsToLogFiles, no commit files are getting added, so resetting file-system view state - ((SyncableFileSystemView) (table.getSliceView())).reset(); + + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create( + writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext()); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable + .of(metaClient, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, metadataWriter); + + Set allPartitions = updatedRecords.stream() + .map(record -> record.getPartitionPath()) + .collect(Collectors.groupingBy(partitionPath -> partitionPath)) + .keySet(); + assertEquals(allPartitions.size(), testTable.listAllBaseFiles().length); // Verify that all data file has one log file + HoodieTable table = HoodieSparkTable.create(config, 
context(), metaClient); for (String partitionPath : dataGen.getPartitionPaths()) { List groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList()); for (FileSlice fileSlice : groupedLogFiles) { - assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for every data file"); + assertEquals(1, fileSlice.getLogFiles().count(), + "There should be 1 log file written for the latest data file - " + fileSlice); } } - // Mark 2nd delta-instant as completed - metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(State.INFLIGHT, - HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime)); - metaClient.getActiveTimeline().saveAsComplete( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime), Option.empty()); - // Do a compaction String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); - JavaRDD result = (JavaRDD) writeClient.compact(compactionInstantTime); + HoodieWriteMetadata> result = writeClient.compact(compactionInstantTime); // Verify that recently written compacted data file has no log file metaClient = HoodieTableMetaClient.reload(metaClient); - table = HoodieSparkTable.create(config, context, metaClient); + table = HoodieSparkTable.create(config, context(), metaClient); HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); assertTrue(HoodieTimeline @@ -962,216 +283,36 @@ public void testLogFileCountsAfterCompaction() throws Exception { for (FileSlice slice : groupedLogFiles) { assertEquals(0, slice.getLogFiles().count(), "After compaction there should be no log files visible on a full view"); } - List writeStatuses = result.collect(); - assertTrue(writeStatuses.stream().anyMatch(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))); - } - } - } - - @Test - public void testSimpleInsertsGeneratedIntoLogFiles() throws Exception { - // insert 100 records - // Setting IndexType to be InMemory to simulate Global Index nature - HoodieWriteConfig config = getConfigBuilder(false, IndexType.INMEMORY).build(); - try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { - String newCommitTime = "100"; - writeClient.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD recordsRDD = jsc.parallelize(records, 1); - JavaRDD statuses = writeClient.insert(recordsRDD, newCommitTime); - writeClient.commit(newCommitTime, statuses); - - HoodieTable table = - HoodieSparkTable.create(config, context, getHoodieMetaClient(hadoopConf, basePath)); - SliceView tableRTFileSystemView = table.getSliceView(); - - long numLogFiles = 0; - for (String partitionPath : dataGen.getPartitionPaths()) { - assertEquals(0, tableRTFileSystemView.getLatestFileSlices(partitionPath) - .filter(fileSlice -> fileSlice.getBaseFile().isPresent()).count()); - assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); - numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) - .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); - } - - assertTrue(numLogFiles > 0); - // Do a compaction - String instantTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); - statuses = (JavaRDD) writeClient.compact(instantTime); - assertEquals(statuses.map(status -> status.getStat().getPath().contains("parquet")).count(), numLogFiles); - assertEquals(statuses.count(), numLogFiles); - 
writeClient.commitCompaction(instantTime, statuses, Option.empty()); - } - } - - private void testInsertsGeneratedIntoLogFilesRollback(Boolean rollbackUsingMarkers) throws Exception { - // insert 100 records - // Setting IndexType to be InMemory to simulate Global Index nature - HoodieWriteConfig config = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY).build(); - try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) { - String newCommitTime = "100"; - writeClient.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD recordsRDD = jsc.parallelize(records, 1); - JavaRDD statuses = writeClient.insert(recordsRDD, newCommitTime); - // trigger an action - List writeStatuses = statuses.collect(); - - // Ensure that inserts are written to only log files - assertEquals(0, - writeStatuses.stream().filter(writeStatus -> !writeStatus.getStat().getPath().contains("log")).count()); - assertTrue( - writeStatuses.stream().anyMatch(writeStatus -> writeStatus.getStat().getPath().contains("log"))); - - // rollback a failed commit - boolean rollback = writeClient.rollback(newCommitTime); - assertTrue(rollback); - - // insert 100 records - newCommitTime = "101"; - writeClient.startCommitWithTime(newCommitTime); - records = dataGen.generateInserts(newCommitTime, 100); - recordsRDD = jsc.parallelize(records, 1); - writeClient.insert(recordsRDD, newCommitTime).collect(); - - // Sleep for small interval (at least 1 second) to force a new rollback start time. - Thread.sleep(1000); - - // We will test HUDI-204 here. We will simulate rollback happening twice by copying the commit file to local fs - // and calling rollback twice - final String lastCommitTime = newCommitTime; - metaClient = getHoodieMetaClient(hadoopConf, basePath); - - // Save the .commit file to local directory. - // Rollback will be called twice to test the case where rollback failed first time and retried. 
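As context for the compaction steps exercised above (testLogFileCountsAfterCompaction and the removed testSimpleInsertsGeneratedIntoLogFiles), here is a minimal sketch of the schedule/execute/commit sequence. The generic type parameters are reconstructed by hand because this diff rendering drops them, and `client`, `jsc` and `dataGen` are assumed to be the harness objects used throughout these tests.

```java
// Sketch only; imports and harness fields as in the surrounding test class.
String commitTime = "100";
client.startCommitWithTime(commitTime);
client.upsert(jsc.parallelize(dataGen.generateInserts(commitTime, 100), 1), commitTime).collect();

// Schedule a compaction, then execute it. The newer API returns write metadata
// instead of a bare JavaRDD of write statuses.
String compactionInstantTime = client.scheduleCompaction(Option.empty()).get().toString();
HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(compactionInstantTime);

// Completing the compaction takes the commit metadata rather than the status RDD.
client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty());
```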
- // We got the "BaseCommitTime cannot be null" exception before the fix - Map fileNameMap = new HashMap<>(); - for (State state : Arrays.asList(State.REQUESTED, State.INFLIGHT)) { - HoodieInstant toCopy = new HoodieInstant(state, HoodieTimeline.DELTA_COMMIT_ACTION, lastCommitTime); - File file = Files.createTempFile(tempFolder, null, null).toFile(); - metaClient.getFs().copyToLocalFile(new Path(metaClient.getMetaPath(), toCopy.getFileName()), - new Path(file.getAbsolutePath())); - fileNameMap.put(file.getAbsolutePath(), toCopy.getFileName()); - } - Path markerDir = new Path(Files.createTempDirectory(tempFolder,null).toAbsolutePath().toString()); - if (rollbackUsingMarkers) { - metaClient.getFs().copyToLocalFile(new Path(metaClient.getMarkerFolderPath(lastCommitTime)), - markerDir); - } - - writeClient.rollback(newCommitTime); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieSparkTable.create(config, context); - SliceView tableRTFileSystemView = table.getSliceView(); - - long numLogFiles = 0; - for (String partitionPath : dataGen.getPartitionPaths()) { - assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); - assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); - numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) - .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); - } - assertEquals(0, numLogFiles); - fileNameMap.forEach((key, value) -> { - try { - metaClient.getFs().copyFromLocalFile(new Path(key), - new Path(metaClient.getMetaPath(), value)); - } catch (IOException e) { - throw new HoodieIOException("Error copying state from local disk.", e); - } - }); - if (rollbackUsingMarkers) { - metaClient.getFs().copyFromLocalFile(markerDir, - new Path(metaClient.getMarkerFolderPath(lastCommitTime))); + assertTrue(result.getCommitMetadata().get().getWritePartitionPaths().stream().anyMatch(part -> part.contentEquals(partitionPath))); } - Thread.sleep(1000); - // Rollback again to pretend the first rollback failed partially. 
This should not error out - writeClient.rollback(newCommitTime); - } - } - - @Test - public void testInsertsGeneratedIntoLogFilesRollbackUsingFileList() throws Exception { - testInsertsGeneratedIntoLogFilesRollback(false); - } - - @Test - public void testInsertsGeneratedIntoLogFilesRollbackUsingMarkers() throws Exception { - testInsertsGeneratedIntoLogFilesRollback(true); - } - - private void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(Boolean rollbackUsingMarkers) throws Exception { - // insert 100 records - // Setting IndexType to be InMemory to simulate Global Index nature - HoodieWriteConfig config = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY).build(); - try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { - String newCommitTime = "100"; - writeClient.startCommitWithTime(newCommitTime); - - List records = dataGen.generateInserts(newCommitTime, 100); - JavaRDD recordsRDD = jsc.parallelize(records, 1); - JavaRDD statuses = writeClient.insert(recordsRDD, newCommitTime); - writeClient.commit(newCommitTime, statuses); - // trigger an action - statuses.collect(); - - HoodieTable table = HoodieSparkTable.create(config, context, getHoodieMetaClient(hadoopConf, basePath)); - SliceView tableRTFileSystemView = table.getSliceView(); - long numLogFiles = 0; - for (String partitionPath : dataGen.getPartitionPaths()) { - assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); - assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); - numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) - .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); + // Check the entire dataset has all records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i = 0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath(), dataGen.getPartitionPaths()[i]); } - - assertTrue(numLogFiles > 0); - // Do a compaction - newCommitTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); - statuses = (JavaRDD) writeClient.compact(newCommitTime); - // Ensure all log files have been compacted into parquet files - assertEquals(statuses.map(status -> status.getStat().getPath().contains("parquet")).count(), numLogFiles); - assertEquals(statuses.count(), numLogFiles); - //writeClient.commitCompaction(newCommitTime, statuses, Option.empty()); - // Trigger a rollback of compaction - table.getActiveTimeline().reload(); - writeClient.rollbackInflightCompaction(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, newCommitTime), table); - - table = HoodieSparkTable.create(config, context, getHoodieMetaClient(hadoopConf, basePath)); - tableRTFileSystemView = table.getSliceView(); - ((SyncableFileSystemView) tableRTFileSystemView).reset(); - - for (String partitionPath : dataGen.getPartitionPaths()) { - List fileSlices = getFileSystemViewWithUnCommittedSlices(getHoodieMetaClient(hadoopConf, basePath)) - .getAllFileSlices(partitionPath).filter(fs -> fs.getBaseInstantTime().equals("100")).collect(Collectors.toList()); - assertTrue(fileSlices.stream().noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); - assertTrue(fileSlices.stream().anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); + Dataset actual = HoodieClientTestUtils.read(jsc(), basePath(), sqlContext(), fs(), 
fullPartitionPaths); + List rows = actual.collectAsList(); + assertEquals(updatedRecords.size(), rows.size()); + for (Row row: rows) { + assertEquals(row.getAs(HoodieRecord.COMMIT_TIME_METADATA_FIELD), preserveCommitMeta ? newCommitTime : compactionInstantTime); } } } - @Test - public void testInsertsGeneratedIntoLogFilesRollbackAfterCompactionUsingFileList() throws Exception { - testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(false); - } - - @Test - public void testInsertsGeneratedIntoLogFilesRollbackAfterCompactionUsingMarkers() throws Exception { - testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(true); - } - /** * Test to ensure metadata stats are correctly written to metadata file. */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) public void testMetadataStatsOnCommit(Boolean rollbackUsingMarkers) throws Exception { HoodieWriteConfig cfg = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY) .withAutoCommit(false).build(); + + setUp(cfg.getProps()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - metaClient = getHoodieMetaClient(hadoopConf, basePath); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient); // Create a commit without metadata stats in metadata to test backwards compatibility HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); @@ -1186,13 +327,13 @@ public void testMetadataStatsOnCommit(Boolean rollbackUsingMarkers) throws Excep client.startCommitWithTime(instantTime); List records = dataGen.generateInserts(instantTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD writeRecords = jsc().parallelize(records, 1); JavaRDD statuses = client.insert(writeRecords, instantTime); assertTrue(client.commit(instantTime, statuses), "Commit should succeed"); // Read from commit file - table = HoodieSparkTable.create(cfg, context); + table = HoodieSparkTable.create(cfg, context()); HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes( table.getActiveTimeline().getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class); @@ -1207,7 +348,7 @@ public void testMetadataStatsOnCommit(Boolean rollbackUsingMarkers) throws Excep instantTime = "002"; client.startCommitWithTime(instantTime); records = dataGen.generateUpdates(instantTime, records); - writeRecords = jsc.parallelize(records, 1); + writeRecords = jsc().parallelize(records, 1); statuses = client.upsert(writeRecords, instantTime); //assertTrue(client.commit(instantTime, statuses), "Commit should succeed"); inserts = 0; @@ -1225,7 +366,7 @@ public void testMetadataStatsOnCommit(Boolean rollbackUsingMarkers) throws Excep client.rollback(instantTime); // Read from commit file - table = HoodieSparkTable.create(cfg, context); + table = HoodieSparkTable.create(cfg, context()); metadata = HoodieCommitMetadata.fromBytes( table.getActiveTimeline() .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), @@ -1243,25 +384,15 @@ public void testMetadataStatsOnCommit(Boolean rollbackUsingMarkers) throws Excep } } - /** - * Test to ensure rolling stats are correctly written to metadata file. 
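testMetadataStatsOnCommit above verifies the stats by reading the freshly written delta commit back off the active timeline. A condensed sketch of that read path follows, under the same harness assumptions (`cfg`, `context()`, `metaClient`) and with generics reconstructed by hand:

```java
// Sketch only; harness objects as in the tests above.
HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient);
HoodieInstant lastDeltaCommit = table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get();

// Deserialize the metadata persisted for that instant.
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
    table.getActiveTimeline().getInstantDetails(lastDeltaCommit).get(), HoodieCommitMetadata.class);

// Per-file write stats are then available for assertions, e.g. total records written.
long totalWrites = commitMetadata.getWriteStats().stream().mapToLong(HoodieWriteStat::getNumWrites).sum();
```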
- */ - @Test - public void testMetadataStatsOnCommitUsingFileList() throws Exception { - testMetadataStatsOnCommit(false); - } - - @Test - public void testMetadataStatsOnCommitUsingMarkers() throws Exception { - testMetadataStatsOnCommit(true); - } - /** * Test to ensure rolling stats are correctly written to the metadata file, identifies small files and corrects them. */ @Test public void testRollingStatsWithSmallFileHandling() throws Exception { HoodieWriteConfig cfg = getConfigBuilder(false, IndexType.INMEMORY).withAutoCommit(false).build(); + + setUp(cfg.getProps()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { Map fileIdToInsertsMap = new HashMap<>(); Map fileIdToUpsertsMap = new HashMap<>(); @@ -1270,13 +401,13 @@ public void testRollingStatsWithSmallFileHandling() throws Exception { client.startCommitWithTime(instantTime); List records = dataGen.generateInserts(instantTime, 200); - JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD writeRecords = jsc().parallelize(records, 1); JavaRDD statuses = client.insert(writeRecords, instantTime); assertTrue(client.commit(instantTime, statuses), "Commit should succeed"); // Read from commit file - HoodieTable table = HoodieSparkTable.create(cfg, context); + HoodieTable table = HoodieSparkTable.create(cfg, context()); HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes( table.getActiveTimeline() .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), @@ -1296,12 +427,12 @@ public void testRollingStatsWithSmallFileHandling() throws Exception { // generate updates + inserts. inserts should be handled into small files records = dataGen.generateUpdates(instantTime, records); records.addAll(dataGen.generateInserts(instantTime, 200)); - writeRecords = jsc.parallelize(records, 1); + writeRecords = jsc().parallelize(records, 1); statuses = client.upsert(writeRecords, instantTime); assertTrue(client.commit(instantTime, statuses), "Commit should succeed"); // Read from commit file - table = HoodieSparkTable.create(cfg, context); + table = HoodieSparkTable.create(cfg, context()); metadata = HoodieCommitMetadata.fromBytes( table.getActiveTimeline() .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), @@ -1323,11 +454,12 @@ public void testRollingStatsWithSmallFileHandling() throws Exception { // Test small file handling after compaction instantTime = "002"; client.scheduleCompactionAtInstant(instantTime, Option.of(metadata.getExtraMetadata())); - statuses = (JavaRDD) client.compact(instantTime); - client.commitCompaction(instantTime, statuses, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client.compact(instantTime); + statuses = compactionMetadata.getWriteStatuses(); + client.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); // Read from commit file - table = HoodieSparkTable.create(cfg, context); + table = HoodieSparkTable.create(cfg, context()); HoodieCommitMetadata metadata1 = HoodieCommitMetadata.fromBytes( table.getActiveTimeline() .getInstantDetails(table.getActiveTimeline().getCommitsTimeline().lastInstant().get()).get(), @@ -1346,12 +478,12 @@ public void testRollingStatsWithSmallFileHandling() throws Exception { // generate updates + inserts. 
inserts should be handled into small files records = dataGen.generateUpdates(instantTime, records); records.addAll(dataGen.generateInserts(instantTime, 200)); - writeRecords = jsc.parallelize(records, 1); + writeRecords = jsc().parallelize(records, 1); statuses = client.upsert(writeRecords, instantTime); assertTrue(client.commit(instantTime, statuses), "Commit should succeed"); // Read from commit file - table = HoodieSparkTable.create(cfg, context); + table = HoodieSparkTable.create(cfg, context()); metadata = HoodieCommitMetadata.fromBytes( table.getActiveTimeline() .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), @@ -1377,22 +509,24 @@ public void testRollingStatsWithSmallFileHandling() throws Exception { @Test public void testHandleUpdateWithMultiplePartitions() throws Exception { HoodieWriteConfig cfg = getConfig(true); + + setUp(cfg.getProps()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { /** - * Write 1 (only inserts, written as parquet file) + * Write 1 (only inserts, written as base file) */ String newCommitTime = "001"; client.startCommitWithTime(newCommitTime); List records = dataGen.generateInserts(newCommitTime, 20); - JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD writeRecords = jsc().parallelize(records, 1); List statuses = client.upsert(writeRecords, newCommitTime).collect(); assertNoWriteErrors(statuses); - metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); - HoodieSparkMergeOnReadTable hoodieTable = (HoodieSparkMergeOnReadTable) HoodieSparkTable.create(cfg, context, metaClient); + HoodieSparkMergeOnReadTable hoodieTable = (HoodieSparkMergeOnReadTable) HoodieSparkTable.create(cfg, context(), metaClient); Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); assertTrue(deltaCommit.isPresent()); @@ -1410,7 +544,7 @@ public void testHandleUpdateWithMultiplePartitions() throws Exception { roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); dataFilesToRead = roView.getLatestBaseFiles(); assertTrue(dataFilesToRead.findAny().isPresent(), - "should list the parquet files we wrote in the delta commit"); + "should list the base files we wrote in the delta commit"); /** * Write 2 (only updates, written to .log file) @@ -1419,7 +553,7 @@ public void testHandleUpdateWithMultiplePartitions() throws Exception { client.startCommitWithTime(newCommitTime); metaClient.reloadActiveTimeline(); records = dataGen.generateUpdates(newCommitTime, records); - writeRecords = jsc.parallelize(records, 1); + writeRecords = jsc().parallelize(records, 1); statuses = client.upsert(writeRecords, newCommitTime).collect(); assertNoWriteErrors(statuses); @@ -1433,13 +567,14 @@ public void testHandleUpdateWithMultiplePartitions() throws Exception { metaClient.reloadActiveTimeline(); List fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records); - JavaRDD deleteRDD = jsc.parallelize(fewRecordsForDelete, 1); + JavaRDD deleteRDD = jsc().parallelize(fewRecordsForDelete, 1); // initialize partitioner - AbstractSparkDeltaCommitActionExecutor actionExecutor = new SparkDeleteDeltaCommitActionExecutor(context, cfg, hoodieTable, - newDeleteTime, deleteRDD); + hoodieTable.getHoodieView().sync(); + BaseSparkDeltaCommitActionExecutor actionExecutor = new SparkDeleteDeltaCommitActionExecutor(context(), cfg, hoodieTable, + newDeleteTime, HoodieJavaRDD.of(deleteRDD)); actionExecutor.getUpsertPartitioner(new 
WorkloadProfile(buildProfile(deleteRDD))); - final List> deleteStatus = jsc.parallelize(Arrays.asList(1)).map(x -> { + final List> deleteStatus = jsc().parallelize(Arrays.asList(1)).map(x -> { return actionExecutor.handleUpdate(partitionPath, fileId, fewRecordsForDelete.iterator()); }).map(Transformations::flatten).collect(); @@ -1448,181 +583,55 @@ public void testHandleUpdateWithMultiplePartitions() throws Exception { WriteStatus status = deleteStatus.get(0).get(0); assertTrue(status.hasErrors()); long numRecordsInPartition = fewRecordsForDelete.stream().filter(u -> - u.getPartitionPath().equals(partitionPath)).count(); + u.getPartitionPath().equals(partitionPath)).count(); assertEquals(fewRecordsForDelete.size() - numRecordsInPartition, status.getTotalErrorRecords()); } } - private HoodieWriteConfig getConfig(Boolean autoCommit) { - return getConfigBuilder(autoCommit).build(); - } - - private HoodieWriteConfig getConfig(Boolean autoCommit, Boolean rollbackUsingMarkers) { - return getConfigBuilder(autoCommit, rollbackUsingMarkers, IndexType.BLOOM).build(); - } - - protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { - return getConfigBuilder(autoCommit, IndexType.BLOOM); - } - - protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, HoodieIndex.IndexType indexType) { - return getConfigBuilder(autoCommit, false, indexType); - } - - protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, Boolean rollbackUsingMarkers, HoodieIndex.IndexType indexType) { - return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .withDeleteParallelism(2) - .withAutoCommit(autoCommit).withAssumeDatePartitioning(true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) - .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024 * 1024).parquetMaxFileSize(1024 * 1024 * 1024).build()) - .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table") - .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() - .withEnableBackupForRemoteFileSystemView(false).build()) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()) - .withRollbackUsingMarkers(rollbackUsingMarkers); - } - - private FileStatus[] insertAndGetFilePaths(List records, SparkRDDWriteClient client, - HoodieWriteConfig cfg, String commitTime) throws IOException { - JavaRDD writeRecords = jsc.parallelize(records, 1); - - List statuses = client.insert(writeRecords, commitTime).collect(); - assertNoWriteErrors(statuses); - - metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - - Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals(commitTime, deltaCommit.get().getTimestamp(), "Delta commit should be specified value"); + @Test + public void testReleaseResource() throws Exception { + HoodieWriteConfig.Builder builder = getConfigBuilder(true); + builder.withReleaseResourceEnabled(true); + builder.withAutoCommit(false); - Option commit = metaClient.getActiveTimeline().getCommitTimeline().lastInstant(); - assertFalse(commit.isPresent()); + setUp(builder.build().getProps()); - FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); - BaseFileOnlyView roView = - 
getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); - Stream dataFilesToRead = roView.getLatestBaseFiles(); - assertTrue(!dataFilesToRead.findAny().isPresent()); + /** + * Write 1 (test when RELEASE_RESOURCE_ENABLE is true) + */ + try (SparkRDDWriteClient client = getHoodieWriteClient(builder.build())) { - roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - dataFilesToRead = roView.getLatestBaseFiles(); - assertTrue(dataFilesToRead.findAny().isPresent(), - "should list the parquet files we wrote in the delta commit"); - return allFiles; - } + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); - private FileStatus[] updateAndGetFilePaths(List records, SparkRDDWriteClient client, - HoodieWriteConfig cfg, String commitTime) throws IOException { - Map recordsMap = new HashMap<>(); - for (HoodieRecord rec : records) { - if (!recordsMap.containsKey(rec.getKey())) { - recordsMap.put(rec.getKey(), rec); - } + List records = dataGen.generateInserts(newCommitTime, 20); + JavaRDD writeRecords = jsc().parallelize(records, 1); + writeRecords.persist(StorageLevel.MEMORY_AND_DISK()); + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + client.commitStats(newCommitTime, statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType()); + assertEquals(spark().sparkContext().persistentRdds().size(), 0); } - List statuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect(); - // Verify there are no errors - assertNoWriteErrors(statuses); - metaClient = HoodieTableMetaClient.reload(metaClient); - Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals(commitTime, deltaCommit.get().getTimestamp(), - "Latest Delta commit should match specified time"); - - Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); - assertFalse(commit.isPresent()); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - return listAllBaseFilesInPath(hoodieTable); - } - - private FileStatus[] getROSnapshotFiles(String partitionPath) - throws Exception { - FileInputFormat.setInputPaths(roSnapshotJobConf, basePath + "/" + partitionPath); - return listStatus(roSnapshotJobConf, false); - } - - private FileStatus[] getROIncrementalFiles(String partitionPath, boolean stopAtCompaction) - throws Exception { - return getROIncrementalFiles(partitionPath, "000", -1, stopAtCompaction); - } - - private FileStatus[] getROIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull, boolean stopAtCompaction) - throws Exception { - setupIncremental(roJobConf, startCommitTime, numCommitsToPull, stopAtCompaction); - FileInputFormat.setInputPaths(roJobConf, Paths.get(basePath, partitionPath).toString()); - return listStatus(roJobConf, false); - } - - private FileStatus[] getRTIncrementalFiles(String partitionPath) - throws Exception { - return getRTIncrementalFiles(partitionPath, "000", -1); - } - - private FileStatus[] getRTIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull) - throws Exception { - setupIncremental(rtJobConf, startCommitTime, numCommitsToPull, false); - FileInputFormat.setInputPaths(rtJobConf, Paths.get(basePath, partitionPath).toString()); - return listStatus(rtJobConf, true); - } 
- - private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, boolean stopAtCompaction) { - String modePropertyName = - String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); - jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); + builder.withReleaseResourceEnabled(false); - String startCommitTimestampName = - String.format(HoodieHiveUtils.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); - jobConf.set(startCommitTimestampName, startCommit); - - String maxCommitPulls = - String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); - jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); - - String stopAtCompactionPropName = - String.format(HoodieHiveUtils.HOODIE_STOP_AT_COMPACTION_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); - jobConf.setBoolean(stopAtCompactionPropName, stopAtCompaction); - } - - private void validateFiles(String partitionPath, int expectedNumFiles, - FileStatus[] files, boolean realtime, JobConf jobConf, - int expectedRecords, String... expectedCommits) { - - assertEquals(expectedNumFiles, files.length); - Set expectedCommitsSet = Arrays.stream(expectedCommits).collect(Collectors.toSet()); - List records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, - Collections.singletonList(Paths.get(basePath, partitionPath).toString()), basePath, jobConf, realtime); - assertEquals(expectedRecords, records.size()); - Set actualCommits = records.stream().map(r -> - r.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()).collect(Collectors.toSet()); - assertEquals(expectedCommitsSet, actualCommits); - } + /** + * Write 2 (test when RELEASE_RESOURCE_ENABLE is false) + */ + try (SparkRDDWriteClient client = getHoodieWriteClient(builder.build())) { + String newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); - private FileStatus[] listAllBaseFilesInPath(HoodieTable table) throws IOException { - return HoodieTestTable.of(table.getMetaClient()).listAllBaseFiles(table.getBaseFileExtension()); - } + List records = dataGen.generateInserts(newCommitTime, 20); + JavaRDD writeRecords = jsc().parallelize(records, 1); - private FileStatus[] listStatus(JobConf jobConf, boolean realtime) throws IOException { - // This is required as Hoodie InputFormats do not extend a common base class and FileInputFormat's - // listStatus() is protected. 
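The new testReleaseResource (interleaved with the deletions above) checks that Hudi unpersists any RDDs it cached once a commit completes, by counting Spark's persistent RDDs. A condensed restatement of that check, assuming the same harness members (`getHoodieWriteClient`, `jsc()`, `spark()`, `dataGen`, `metaClient`) and hand-reconstructed generics:

```java
// Sketch only; harness objects as in the surrounding tests.
HoodieWriteConfig config = getConfigBuilder(true)
    .withReleaseResourceEnabled(true)   // ask Hudi to clean up cached RDDs after commit
    .withAutoCommit(false)
    .build();

try (SparkRDDWriteClient client = getHoodieWriteClient(config)) {
  String commitTime = "001";
  client.startCommitWithTime(commitTime);

  JavaRDD<HoodieRecord> input = jsc().parallelize(dataGen.generateInserts(commitTime, 20), 1);
  input.persist(StorageLevel.MEMORY_AND_DISK());

  List<WriteStatus> statuses = client.upsert(input, commitTime).collect();
  client.commitStats(commitTime,
      statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()),
      Option.empty(), metaClient.getCommitActionType());

  // With release-resource enabled, nothing should remain cached after the commit.
  assertEquals(0, spark().sparkContext().persistentRdds().size());
}
```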
- FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(baseFileFormat, realtime, jobConf); - switch (baseFileFormat) { - case PARQUET: - if (realtime) { - return ((HoodieParquetRealtimeInputFormat)inputFormat).listStatus(jobConf); - } else { - return ((HoodieParquetInputFormat)inputFormat).listStatus(jobConf); - } - case HFILE: - if (realtime) { - return ((HoodieHFileRealtimeInputFormat)inputFormat).listStatus(jobConf); - } else { - return ((HoodieHFileInputFormat)inputFormat).listStatus(jobConf); - } - default: - throw new HoodieIOException("Hoodie InputFormat not implemented for base file format " + baseFileFormat); + writeRecords.persist(StorageLevel.MEMORY_AND_DISK()); + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + client.commitStats(newCommitTime, statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType()); + assertTrue(spark().sparkContext().persistentRdds().size() > 0); } + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestMarkerFiles.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestMarkerFiles.java deleted file mode 100644 index b25427baf8279..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestMarkerFiles.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table; - -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.testutils.FileSystemTestUtils; -import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.testutils.HoodieClientTestUtils; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.api.java.JavaSparkContext; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertIterableEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class TestMarkerFiles extends HoodieCommonTestHarness { - - private MarkerFiles markerFiles; - private FileSystem fs; - private Path markerFolderPath; - private JavaSparkContext jsc; - private HoodieSparkEngineContext context; - - @BeforeEach - public void setup() throws IOException { - initPath(); - initMetaClient(); - this.jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(TestMarkerFiles.class.getName())); - this.context = new HoodieSparkEngineContext(jsc); - this.fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf()); - this.markerFolderPath = new Path(metaClient.getMarkerFolderPath("000")); - this.markerFiles = new MarkerFiles(fs, metaClient.getBasePath(), markerFolderPath.toString(), "000"); - } - - @AfterEach - public void cleanup() { - jsc.stop(); - context = null; - } - - private void createSomeMarkerFiles() { - markerFiles.create("2020/06/01", "file1", IOType.MERGE); - markerFiles.create("2020/06/02", "file2", IOType.APPEND); - markerFiles.create("2020/06/03", "file3", IOType.CREATE); - } - - private void createInvalidFile(String partitionPath, String invalidFileName) { - Path path = FSUtils.getPartitionPath(markerFolderPath.toString(), partitionPath); - Path invalidFilePath = new Path(path, invalidFileName); - try { - fs.create(invalidFilePath, false).close(); - } catch (IOException e) { - throw new HoodieException("Failed to create invalid file " + invalidFilePath, e); - } - } - - @Test - public void testCreation() throws Exception { - // when - createSomeMarkerFiles(); - - // then - assertTrue(fs.exists(markerFolderPath)); - List markerFiles = FileSystemTestUtils.listRecursive(fs, markerFolderPath) - .stream().filter(status -> status.getPath().getName().contains(".marker")) - .sorted().collect(Collectors.toList()); - assertEquals(3, markerFiles.size()); - assertIterableEquals(CollectionUtils.createImmutableList( - "file:" + markerFolderPath.toString() + "/2020/06/01/file1.marker.MERGE", - "file:" + markerFolderPath.toString() + "/2020/06/02/file2.marker.APPEND", - "file:" + markerFolderPath.toString() + "/2020/06/03/file3.marker.CREATE"), - markerFiles.stream().map(m -> m.getPath().toString()).collect(Collectors.toList()) - ); - } - - @Test - public void testDeletionWhenMarkerDirExists() throws IOException { - //when - markerFiles.create("2020/06/01", "file1", IOType.MERGE); - - // then - assertTrue(markerFiles.doesMarkerDirExist()); 
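The TestMarkerFiles suite being deleted here is a compact reference for the marker API that the marker-based rollback tests above rely on: each written data file gets a marker named `<file>.marker.<IOType>` under the instant's marker folder. A compressed sketch of that lifecycle, using the fields from the removed setup (`fs`, `metaClient`, `context`) as assumed context:

```java
// Sketch only, following the removed TestMarkerFiles: one MarkerFiles instance per write instant.
String instantTime = "000";
MarkerFiles markerFiles = new MarkerFiles(
    fs, metaClient.getBasePath(), metaClient.getMarkerFolderPath(instantTime), instantTime);

// Writers drop one marker per data file, tagged with how that file is produced.
markerFiles.create("2020/06/01", "file1", IOType.MERGE);   // base file rewritten by an update
markerFiles.create("2020/06/02", "file2", IOType.APPEND);  // log file appended to
markerFiles.create("2020/06/03", "file3", IOType.CREATE);  // brand new base file

// Rollback and finalize reconcile against markers instead of listing the whole table.
markerFiles.createdAndMergedDataPaths(context, 2).forEach(System.out::println); // CREATE + MERGE paths
markerFiles.deleteMarkerDir(context, 2);                                        // parallelism = 2
```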
- assertTrue(markerFiles.deleteMarkerDir(context, 2)); - assertFalse(markerFiles.doesMarkerDirExist()); - } - - @Test - public void testDeletionWhenMarkerDirNotExists() throws IOException { - // then - assertFalse(markerFiles.doesMarkerDirExist()); - assertFalse(markerFiles.deleteMarkerDir(context, 2)); - } - - @Test - public void testDataPathsWhenCreatingOrMerging() throws IOException { - // add markfiles - createSomeMarkerFiles(); - // add invalid file - createInvalidFile("2020/06/01", "invalid_file3"); - int fileSize = FileSystemTestUtils.listRecursive(fs, markerFolderPath).size(); - assertEquals(fileSize,4); - - // then - assertIterableEquals(CollectionUtils.createImmutableList( - "2020/06/01/file1", "2020/06/03/file3"), - markerFiles.createdAndMergedDataPaths(context, 2).stream().sorted().collect(Collectors.toList()) - ); - } - - @Test - public void testAllMarkerPaths() throws IOException { - // given - createSomeMarkerFiles(); - - // then - assertIterableEquals(CollectionUtils.createImmutableList("2020/06/01/file1.marker.MERGE", - "2020/06/02/file2.marker.APPEND", "2020/06/03/file3.marker.CREATE"), - markerFiles.allMarkerFilePaths().stream().sorted().collect(Collectors.toList()) - ); - } - - @Test - public void testStripMarkerSuffix() { - // Given - final String pathPrefix = "file://" + metaClient.getMetaPath() + "/file"; - final String markerFilePath = pathPrefix + ".marker.APPEND"; - - // when-then - assertEquals(pathPrefix, MarkerFiles.stripMarkerSuffix(markerFilePath)); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/bootstrap/TestBootstrapUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/bootstrap/TestBootstrapUtils.java index b57e1c52759f6..83a6caecd19d5 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/bootstrap/TestBootstrapUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/bootstrap/TestBootstrapUtils.java @@ -18,6 +18,8 @@ package org.apache.hudi.table.action.bootstrap; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; @@ -47,13 +49,15 @@ public void testAllLeafFoldersWithFiles() throws IOException { }); // Files inside partitions and marker directories - List files = Arrays.asList( - "2016/04/15/1_1-0-1_20190528120000.parquet", - "2016/04/15/2_1-0-1_20190528120000.parquet", - "2016/05/16/3_1-0-1_20190528120000.parquet", - "2016/05/16/4_1-0-1_20190528120000.parquet", - "2016/04/17/5_1-0-1_20190528120000.parquet", - "2016/04/17/6_1-0-1_20190528120000.parquet"); + List files = Stream.of( + "2016/04/15/1_1-0-1_20190528120000", + "2016/04/15/2_1-0-1_20190528120000", + "2016/05/16/3_1-0-1_20190528120000", + "2016/05/16/4_1-0-1_20190528120000", + "2016/04/17/5_1-0-1_20190528120000", + "2016/04/17/6_1-0-1_20190528120000") + .map(file -> file + metaClient.getTableConfig().getBaseFileFormat().getFileExtension()) + .collect(Collectors.toList()); files.forEach(f -> { try { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/clean/TestCleanerInsertAndCleanByCommits.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/clean/TestCleanerInsertAndCleanByCommits.java new file mode 100644 index 0000000000000..4874fe7bb9ffe --- /dev/null +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/clean/TestCleanerInsertAndCleanByCommits.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.clean; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieLockConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.model.HoodieCleaningPolicy.KEEP_LATEST_COMMITS; +import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime; +import static org.apache.hudi.table.TestCleaner.insertFirstBigBatchForClientCleanerTest; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.apache.hudi.testutils.HoodieClientTestBase.Function2; +import static org.apache.hudi.testutils.HoodieClientTestBase.Function3; +import static org.apache.hudi.testutils.HoodieClientTestBase.wrapRecordsGenFunctionForPreppedCalls; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestCleanerInsertAndCleanByCommits extends SparkClientFunctionalTestHarness { + + private static final Logger LOG = LogManager.getLogger(TestCleanerInsertAndCleanByCommits.class); + private static 
final int BATCH_SIZE = 100; + private static final int PARALLELISM = 2; + + /** + * Test Clean-By-Commits using insert/upsert API. + */ + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInsertAndCleanByCommits(boolean isAsync) throws Exception { + testInsertAndCleanByCommits(SparkRDDWriteClient::insert, SparkRDDWriteClient::upsert, false, isAsync); + } + + /** + * Test Clean-By-Commits using prepped version of insert/upsert API. + */ + @Test + public void testInsertPreppedAndCleanByCommits() throws Exception { + testInsertAndCleanByCommits(SparkRDDWriteClient::insertPreppedRecords, SparkRDDWriteClient::upsertPreppedRecords, + true, false); + } + + /** + * Test Clean-By-Commits using prepped versions of bulk-insert/upsert API. + */ + @Test + public void testBulkInsertPreppedAndCleanByCommits() throws Exception { + testInsertAndCleanByCommits( + (client, recordRDD, instantTime) -> client.bulkInsertPreppedRecords(recordRDD, instantTime, Option.empty()), + SparkRDDWriteClient::upsertPreppedRecords, true, false); + } + + /** + * Test Clean-By-Commits using bulk-insert/upsert API. + */ + @Test + public void testBulkInsertAndCleanByCommits() throws Exception { + testInsertAndCleanByCommits(SparkRDDWriteClient::bulkInsert, SparkRDDWriteClient::upsert, false, false); + } + + /** + * Test Helper for Cleaning by versions logic from HoodieWriteClient API perspective. + * + * @param insertFn Insert API to be tested + * @param upsertFn Upsert API to be tested + * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during + * record generation to also tag the regards (de-dupe is implicit as we use unique record-gen APIs) + * @throws Exception in case of errors + */ + private void testInsertAndCleanByCommits( + Function3, SparkRDDWriteClient, JavaRDD, String> insertFn, + Function3, SparkRDDWriteClient, JavaRDD, String> upsertFn, boolean isPreppedAPI, boolean isAsync) + throws Exception { + int maxCommits = 3; // keep upto 3 commits from the past + HoodieWriteConfig cfg = getConfigBuilder(true) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(KEEP_LATEST_COMMITS) + .withAsyncClean(isAsync).retainCommits(maxCommits).build()) + .withParallelism(PARALLELISM, PARALLELISM) + .withBulkInsertParallelism(PARALLELISM) + .withFinalizeWriteParallelism(PARALLELISM) + .withDeleteParallelism(PARALLELISM) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build()) + .build(); + try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + final HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(System.nanoTime()); + final Function2, String, Integer> recordInsertGenWrappedFunction = isPreppedAPI + ? wrapRecordsGenFunctionForPreppedCalls(basePath(), hadoopConf(), context(), cfg, dataGen::generateInserts) + : dataGen::generateInserts; + final Function2, String, Integer> recordUpsertGenWrappedFunction = isPreppedAPI + ? 
wrapRecordsGenFunctionForPreppedCalls(basePath(), hadoopConf(), context(), cfg, dataGen::generateUniqueUpdates) + : dataGen::generateUniqueUpdates; + + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.COPY_ON_WRITE); + insertFirstBigBatchForClientCleanerTest(context(), metaClient, client, recordInsertGenWrappedFunction, insertFn); + + Map> commitWriteStatsMap = new HashMap<>(); + // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. + for (int i = 0; i < 8; i++) { + String newCommitTime = makeNewCommitTime(); + client.startCommitWithTime(newCommitTime); + List records = recordUpsertGenWrappedFunction.apply(newCommitTime, BATCH_SIZE); + + List statuses = upsertFn.apply(client, jsc().parallelize(records, PARALLELISM), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + commitWriteStatsMap.put( + newCommitTime, + statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList())); + + metaClient = HoodieTableMetaClient.reload(metaClient); + validateFilesAfterCleaning( + HoodieSparkTable.create(cfg, context(), metaClient), + commitWriteStatsMap, + dataGen.getPartitionPaths()); + } + } + } + + /** + * Validates the data files in a Hudi table based on the `KEEP_LATEST_COMMITS` cleaner policy. + * + * @param table {@link HoodieTable} instance. + * @param commitWriteStatsMap The cache for the list of write stats of each commit. + * @param partitionPaths List of partitions to validate. + */ + private void validateFilesAfterCleaning( + HoodieTable table, + Map> commitWriteStatsMap, + String[] partitionPaths) { + assertEquals(KEEP_LATEST_COMMITS, table.getConfig().getCleanerPolicy()); + boolean isAsyncClean = table.getConfig().isAsyncClean(); + int maxCommitsToRetain = table.getConfig().getCleanerCommitsRetained(); + HoodieTimeline commitsTimeline = table.getCompletedCommitsTimeline(); + HoodieInstant lastInstant = commitsTimeline.lastInstant().get(); + + if (isAsyncClean) { + commitsTimeline = commitsTimeline.findInstantsBefore(lastInstant.getTimestamp()); + } + // This corresponds to the `earliestCommitToRetain` in {@code CleanPlanner::getFilesToCleanKeepingLatestCommits} + Option earliestRetainedCommit = commitsTimeline.nthFromLastInstant(maxCommitsToRetain - 1); + // A final timeline to be used in Lambda function + HoodieTimeline timeline = commitsTimeline; + // Mapping of to expected set of instant timestamps + Map, Set> expectedInstantTimeMap = new HashMap<>(); + TableFileSystemView fsView = table.getFileSystemView(); + // Remaining file groups to figure out the one version before earliestRetainedCommit + Set> remainingFileGroupSet = new HashSet<>(); + + for (String partitionPath : partitionPaths) { + remainingFileGroupSet.addAll( + fsView.getAllFileGroups(partitionPath) + .map(fileGroup -> Pair.of(partitionPath, fileGroup.getFileGroupId().getFileId())) + .collect(Collectors.toList())); + } + // With KEEP_LATEST_COMMITS cleaner policy, for each file group, we need to figure out + // the latest version before earliestCommitToRetain, which is also kept from cleaning. + // The timeline of commits is traversed in reverse order to achieve this. 
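The comment above states the retention rule this helper validates: under KEEP_LATEST_COMMITS, each file group keeps the versions written by the retained commits plus the single latest version older than the earliest retained commit, so queries as of that commit still resolve. For reference, the cleaning side of the write config used by testInsertAndCleanByCommits boils down to the builder calls already visible in this diff; the concrete commit times in the trailing comment are only an illustration.

```java
// Sketch only: clean-by-commits settings as used by the test above (synchronous clean shown).
HoodieWriteConfig cfg = getConfigBuilder(true)
    .withCleanConfig(HoodieCleanConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
        .withAsyncClean(false)
        .retainCommits(3)          // keep data needed by the last 3 commits
        .build())
    .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build())
    .build();

// Illustration: commits 001..008 on one file group, retainCommits = 3.
// earliestCommitToRetain = 006, so versions written at 006, 007, 008 survive,
// plus the latest version written before 006 (here, the one from 005).
```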
+ for (HoodieInstant instant : commitsTimeline.getReverseOrderedInstants().collect(Collectors.toList())) { + List hoodieWriteStatList = commitWriteStatsMap.computeIfAbsent(instant.getTimestamp(), newInstant -> { + try { + return HoodieCommitMetadata.fromBytes( + timeline.getInstantDetails( + timeline.filter(inst -> inst.getTimestamp().equals(newInstant)) + .firstInstant().get()).get(), + HoodieCommitMetadata.class) + .getWriteStats(); + } catch (IOException e) { + return Collections.EMPTY_LIST; + } + }); + hoodieWriteStatList.forEach(writeStat -> { + Pair partitionFileIdPair = Pair.of(writeStat.getPartitionPath(), writeStat.getFileId()); + if (remainingFileGroupSet.contains(partitionFileIdPair)) { + if (earliestRetainedCommit.isPresent() + && HoodieTimeline.compareTimestamps( + instant.getTimestamp(), HoodieTimeline.LESSER_THAN, earliestRetainedCommit.get().getTimestamp())) { + remainingFileGroupSet.remove(partitionFileIdPair); + } + expectedInstantTimeMap.computeIfAbsent(partitionFileIdPair, k -> new HashSet<>()) + .add(instant.getTimestamp()); + } + }); + if (remainingFileGroupSet.isEmpty()) { + break; + } + } + + // Need to ensure the following + for (String partitionPath : partitionPaths) { + List fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); + for (HoodieFileGroup fileGroup : fileGroups) { + Set commitTimes = new HashSet<>(); + fileGroup.getAllBaseFiles().forEach(value -> { + LOG.debug("Data File - " + value); + commitTimes.add(value.getCommitTime()); + }); + if (isAsyncClean) { + commitTimes.remove(lastInstant.getTimestamp()); + } + + assertEquals( + expectedInstantTimeMap.get( + Pair.of(partitionPath, fileGroup.getFileGroupId().getFileId())), + commitTimes, + "Only contain acceptable versions of file should be present"); + } + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/clean/TestCleanerInsertAndCleanByVersions.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/clean/TestCleanerInsertAndCleanByVersions.java new file mode 100644 index 0000000000000..e9c74936f3cec --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/clean/TestCleanerInsertAndCleanByVersions.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.action.clean; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieClientTestBase.Function2; +import org.apache.hudi.testutils.HoodieClientTestBase.Function3; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.HoodieTestTable.makeIncrementalCommitTimes; +import static org.apache.hudi.table.TestCleaner.insertFirstBigBatchForClientCleanerTest; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.apache.hudi.testutils.HoodieClientTestBase.wrapRecordsGenFunctionForPreppedCalls; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestCleanerInsertAndCleanByVersions extends SparkClientFunctionalTestHarness { + + private static final int BATCH_SIZE = 100; + private static final int PARALLELISM = 2; + + /** + * Test Clean-By-Versions using insert/upsert API. + */ + @Test + public void testInsertAndCleanByVersions() throws Exception { + testInsertAndCleanByVersions(SparkRDDWriteClient::insert, SparkRDDWriteClient::upsert, false); + } + + /** + * Test Clean-By-Versions using prepped versions of insert/upsert API. + */ + @Test + public void testInsertPreppedAndCleanByVersions() throws Exception { + testInsertAndCleanByVersions(SparkRDDWriteClient::insertPreppedRecords, SparkRDDWriteClient::upsertPreppedRecords, + true); + } + + /** + * Test Clean-By-Versions using bulk-insert/upsert API. + */ + @Test + public void testBulkInsertAndCleanByVersions() throws Exception { + testInsertAndCleanByVersions(SparkRDDWriteClient::bulkInsert, SparkRDDWriteClient::upsert, false); + } + + /** + * Test Clean-By-Versions using prepped versions of bulk-insert/upsert API. 
+ */ + @Test + public void testBulkInsertPreppedAndCleanByVersions() throws Exception { + testInsertAndCleanByVersions( + (client, recordRDD, instantTime) -> client.bulkInsertPreppedRecords(recordRDD, instantTime, Option.empty()), + SparkRDDWriteClient::upsertPreppedRecords, true); + } + + /** + * Test Helper for Cleaning by versions logic from HoodieWriteClient API perspective. + * + * @param insertFn Insert API to be tested + * @param upsertFn Upsert API to be tested + * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during + * record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs) + * @throws Exception in case of errors + */ + private void testInsertAndCleanByVersions( + Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, + Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI) + throws Exception { + int maxVersions = 2; // keep up to 2 versions for each file + HoodieWriteConfig cfg = getConfigBuilder(true) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + .retainFileVersions(maxVersions).build()) + .withParallelism(PARALLELISM, PARALLELISM) + .withBulkInsertParallelism(PARALLELISM) + .withFinalizeWriteParallelism(PARALLELISM) + .withDeleteParallelism(PARALLELISM) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .build(); + try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + final HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(System.nanoTime()); + final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = isPreppedAPI + ? wrapRecordsGenFunctionForPreppedCalls(basePath(), hadoopConf(), context(), cfg, dataGen::generateInserts) + : dataGen::generateInserts; + final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction = isPreppedAPI + ?
wrapRecordsGenFunctionForPreppedCalls(basePath(), hadoopConf(), context(), cfg, dataGen::generateUniqueUpdates) + : dataGen::generateUniqueUpdates; + + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.COPY_ON_WRITE); + insertFirstBigBatchForClientCleanerTest(context(), metaClient, client, recordInsertGenWrappedFunction, insertFn); + + Map compactionFileIdToLatestFileSlice = new HashMap<>(); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient); + for (String partitionPath : dataGen.getPartitionPaths()) { + TableFileSystemView fsView = table.getFileSystemView(); + Option added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> { + fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs)); + return true; + })); + if (added.isPresent()) { + // Select only one file-group for compaction + break; + } + } + + // Create workload with selected file-slices + List> partitionFileSlicePairs = compactionFileIdToLatestFileSlice.entrySet().stream() + .map(e -> Pair.of(e.getKey().getPartitionPath(), e.getValue())).collect(Collectors.toList()); + HoodieCompactionPlan compactionPlan = + CompactionUtils.buildFromFileSlices(partitionFileSlicePairs, Option.empty(), Option.empty()); + List instantTimes = makeIncrementalCommitTimes(9, 1, 10); + String compactionTime = instantTimes.get(0); + table.getActiveTimeline().saveToCompactionRequested( + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime), + TimelineMetadataUtils.serializeCompactionPlan(compactionPlan)); + + instantTimes = instantTimes.subList(1, instantTimes.size()); + // Keep doing some writes and clean inline. Make sure we have expected number of files + // remaining. 
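Before the write-and-validate loop that follows, a small standalone sketch (hypothetical names, not part of this patch) of the invariant that the KEEP_LATEST_FILE_VERSIONS assertions below check for file groups not selected for compaction: after each clean, only the newest retainFileVersions base files of a file group remain.

import java.util.List;

class KeepLatestFileVersionsSketch {
  // committedVersions: every commit time that produced a base file for one file id, ascending
  // maxVersions: the cleaner's retainFileVersions setting
  static List<String> expectedSurvivingVersions(List<String> committedVersions, int maxVersions) {
    int keep = Math.min(maxVersions, committedVersions.size());
    // only the newest `keep` versions survive; older base files are deleted by the cleaner
    return committedVersions.subList(committedVersions.size() - keep, committedVersions.size());
  }

  public static void main(String[] args) {
    // a file id rewritten at five commits with retainFileVersions(2) prints [004, 005]
    System.out.println(expectedSurvivingVersions(List.of("001", "002", "003", "004", "005"), 2));
  }
}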
+ for (String newInstantTime : instantTimes) { + client.startCommitWithTime(newInstantTime); + List records = recordUpsertGenWrappedFunction.apply(newInstantTime, BATCH_SIZE); + + List statuses = upsertFn.apply(client, jsc().parallelize(records, PARALLELISM), newInstantTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + metaClient = HoodieTableMetaClient.reload(metaClient); + table = HoodieSparkTable.create(cfg, context(), metaClient); + HoodieTimeline timeline = table.getMetaClient().getCommitsTimeline(); + + TableFileSystemView fsView = table.getFileSystemView(); + // Need to ensure the following + for (String partitionPath : dataGen.getPartitionPaths()) { + // compute all the versions of all files, from time 0 + HashMap> fileIdToVersions = new HashMap<>(); + for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) { + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class); + + for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) { + if (!fileIdToVersions.containsKey(wstat.getFileId())) { + fileIdToVersions.put(wstat.getFileId(), new TreeSet<>()); + } + fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName())); + } + } + + List fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList()); + + for (HoodieFileGroup fileGroup : fileGroups) { + if (compactionFileIdToLatestFileSlice.containsKey(fileGroup.getFileGroupId())) { + // Ensure latest file-slice selected for compaction is retained + Option dataFileForCompactionPresent = + Option.fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> { + return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime() + .equals(df.getCommitTime()); + }).findAny()); + assertTrue(dataFileForCompactionPresent.isPresent(), + "Data File selected for compaction is retained"); + } else { + // file has no more than max versions + String fileId = fileGroup.getFileGroupId().getFileId(); + List dataFiles = fileGroup.getAllBaseFiles().collect(Collectors.toList()); + + assertTrue(dataFiles.size() <= maxVersions, + "fileId " + fileId + " has more than " + maxVersions + " versions"); + + // Each file, has the latest N versions (i.e cleaning gets rid of older versions) + List commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId)); + for (int i = 0; i < dataFiles.size(); i++) { + assertEquals((dataFiles.get(i)).getCommitTime(), + commitedVersions.get(commitedVersions.size() - 1 - i), + "File " + fileId + " does not have latest versions on commits" + commitedVersions); + } + } + } + } + } + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkClusteringPlanPartitionFilter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkClusteringPlanPartitionFilter.java new file mode 100644 index 0000000000000..a68a9e33601ee --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkClusteringPlanPartitionFilter.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.cluster.strategy; + +import org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +public class TestSparkClusteringPlanPartitionFilter { + @Mock + HoodieSparkCopyOnWriteTable table; + @Mock + HoodieSparkEngineContext context; + HoodieWriteConfig.Builder hoodieWriteConfigBuilder; + + @BeforeEach + public void setUp() { + this.hoodieWriteConfigBuilder = HoodieWriteConfig + .newBuilder() + .withPath("Fake_Table_Path"); + } + + @Test + public void testFilterPartitionNoFilter() { + HoodieWriteConfig config = hoodieWriteConfigBuilder.withClusteringConfig(HoodieClusteringConfig.newBuilder() + .withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode.NONE) + .build()) + .build(); + + PartitionAwareClusteringPlanStrategy sg = new SparkSizeBasedClusteringPlanStrategy(table, context, config); + ArrayList fakeTimeBasedPartitionsPath = new ArrayList<>(); + fakeTimeBasedPartitionsPath.add("20210718"); + fakeTimeBasedPartitionsPath.add("20210716"); + fakeTimeBasedPartitionsPath.add("20210719"); + List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath); + assertEquals(3, list.size()); + } + + @Test + public void testFilterPartitionRecentDays() { + HoodieWriteConfig config = hoodieWriteConfigBuilder.withClusteringConfig(HoodieClusteringConfig.newBuilder() + .withClusteringSkipPartitionsFromLatest(1) + .withClusteringTargetPartitions(1) + .withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode.RECENT_DAYS) + .build()) + .build(); + + PartitionAwareClusteringPlanStrategy sg = new SparkSizeBasedClusteringPlanStrategy(table, context, config); + ArrayList fakeTimeBasedPartitionsPath = new ArrayList<>(); + fakeTimeBasedPartitionsPath.add("20210718"); + fakeTimeBasedPartitionsPath.add("20210716"); + fakeTimeBasedPartitionsPath.add("20210719"); + List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath); + assertEquals(1, list.size()); + assertSame("20210718", list.get(0)); + } + + @Test + public void testFilterPartitionSelectedPartitions() { + HoodieWriteConfig config = hoodieWriteConfigBuilder.withClusteringConfig(HoodieClusteringConfig.newBuilder() + .withClusteringPartitionFilterBeginPartition("20211222") + .withClusteringPartitionFilterEndPartition("20211223") + 
.withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS) + .build()) + .build(); + + PartitionAwareClusteringPlanStrategy sg = new SparkSizeBasedClusteringPlanStrategy(table, context, config); + ArrayList fakeTimeBasedPartitionsPath = new ArrayList<>(); + fakeTimeBasedPartitionsPath.add("20211220"); + fakeTimeBasedPartitionsPath.add("20211221"); + fakeTimeBasedPartitionsPath.add("20211222"); + fakeTimeBasedPartitionsPath.add("20211224"); + List list = sg.filterPartitionPaths(fakeTimeBasedPartitionsPath); + assertEquals(1, list.size()); + assertSame("20211222", list.get(0)); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java index c054bc4602f85..a359146b54009 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java @@ -21,25 +21,37 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.RawTripTestPayload; import org.apache.hudi.common.testutils.Transformations; +import org.apache.hudi.common.util.BaseFileUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieLayoutConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.io.HoodieCreateHandle; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.storage.HoodieStorageLayout; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hudi.testutils.MetadataMergeWriteStatus; @@ -57,6 +69,8 @@ import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; import java.io.File; @@ -66,7 +80,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import 
java.util.Properties; import java.util.UUID; +import java.util.stream.Stream; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime; @@ -74,6 +90,7 @@ import static org.apache.hudi.execution.bulkinsert.TestBulkInsertInternalPartitioner.generateExpectedPartitionNumRecords; import static org.apache.hudi.execution.bulkinsert.TestBulkInsertInternalPartitioner.generateTestRecordsForBulkInsert; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -82,6 +99,13 @@ public class TestCopyOnWriteActionExecutor extends HoodieClientTestBase { private static final Logger LOG = LogManager.getLogger(TestCopyOnWriteActionExecutor.class); private static final Schema SCHEMA = getSchemaFromResource(TestCopyOnWriteActionExecutor.class, "/exampleSchema.avsc"); + private static final Stream indexType() { + HoodieIndex.IndexType[] data = new HoodieIndex.IndexType[] { + HoodieIndex.IndexType.BLOOM, + HoodieIndex.IndexType.BUCKET + }; + return Stream.of(data).map(Arguments::of); + } @Test public void testMakeNewPath() { @@ -103,7 +127,7 @@ public void testMakeNewPath() { }).collect().get(0); assertEquals(newPathWithWriteToken.getKey().toString(), Paths.get(this.basePath, partitionPath, - FSUtils.makeDataFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString()); + FSUtils.makeBaseFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString()); } private HoodieWriteConfig makeHoodieClientConfig() { @@ -112,14 +136,37 @@ private HoodieWriteConfig makeHoodieClientConfig() { private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() { // Prepare the AvroParquetIO - return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(SCHEMA.toString()); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(SCHEMA.toString()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()); + } + + private Properties makeIndexConfig(HoodieIndex.IndexType indexType) { + Properties props = new Properties(); + HoodieIndexConfig.Builder indexConfig = HoodieIndexConfig.newBuilder() + .withIndexType(indexType); + props.putAll(indexConfig.build().getProps()); + if (indexType.equals(HoodieIndex.IndexType.BUCKET)) { + props.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "_row_key"); + indexConfig.fromProperties(props) + .withIndexKeyField("_row_key") + .withBucketNum("1") + .withBucketIndexEngineType(HoodieIndex.BucketIndexEngineType.SIMPLE); + props.putAll(indexConfig.build().getProps()); + props.putAll(HoodieLayoutConfig.newBuilder().fromProperties(props) + .withLayoutType(HoodieStorageLayout.LayoutType.BUCKET.name()) + .withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build().getProps()); + } + return props; } // TODO (weiy): Add testcases for crossing file writing. 
- @Test - public void testUpdateRecords() throws Exception { + @ParameterizedTest + @MethodSource("indexType") + public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception { // Prepare the AvroParquetIO - HoodieWriteConfig config = makeHoodieClientConfig(); + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withProps(makeIndexConfig(indexType)).build(); String firstCommitTime = makeNewCommitTime(); SparkRDDWriteClient writeClient = getHoodieWriteClient(config); writeClient.startCommitWithTime(firstCommitTime); @@ -140,11 +187,11 @@ public void testUpdateRecords() throws Exception { List records = new ArrayList<>(); RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); // Insert new records final HoodieSparkCopyOnWriteTable cowTable = table; @@ -154,18 +201,17 @@ public void testUpdateRecords() throws Exception { assertEquals(1, allFiles.length); // Read out the bloom filter and make sure filter can answer record exist or not - Path parquetFilePath = allFiles[0].getPath(); - BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(hadoopConf, parquetFilePath); + Path filePath = allFiles[0].getPath(); + BloomFilter filter = BaseFileUtils.getInstance(table.getBaseFileFormat()).readBloomFilterFromMetadata(hadoopConf, filePath); for (HoodieRecord record : records) { assertTrue(filter.mightContain(record.getRecordKey())); } - // Read the parquet file, check the record content - List fileRecords = ParquetUtils.readAvroRecords(hadoopConf, parquetFilePath); + // Read the base file, check the record content + List fileRecords = BaseFileUtils.getInstance(table.getBaseFileFormat()).readAvroRecords(hadoopConf, filePath); GenericRecord newRecord; int index = 0; for (GenericRecord record : fileRecords) { - //System.out.println("Got :" + record.get("_row_key").toString() + ", Exp :" + records.get(index).getRecordKey()); assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString()); index++; } @@ -174,12 +220,12 @@ public void testUpdateRecords() throws Exception { String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1); - HoodieRecord updatedRecord1 = new HoodieRecord( + HoodieRecord updatedRecord1 = new HoodieAvroRecord( new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord insertedRecord1 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new 
HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); @@ -192,12 +238,12 @@ public void testUpdateRecords() throws Exception { allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1); assertEquals(1, allFiles.length); // verify new incremental file group is same as the previous one - assertEquals(FSUtils.getFileId(parquetFilePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); + assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); // Check whether the record has been updated - Path updatedParquetFilePath = allFiles[0].getPath(); + Path updatedFilePath = allFiles[0].getPath(); BloomFilter updatedFilter = - ParquetUtils.readBloomFilterFromParquetMetadata(hadoopConf, updatedParquetFilePath); + BaseFileUtils.getInstance(metaClient).readBloomFilterFromMetadata(hadoopConf, updatedFilePath); for (HoodieRecord record : records) { // No change to the _row_key assertTrue(updatedFilter.mightContain(record.getRecordKey())); @@ -206,7 +252,7 @@ public void testUpdateRecords() throws Exception { assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey())); records.add(insertedRecord1);// add this so it can further check below - ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build(); + ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build(); index = 0; while ((newRecord = (GenericRecord) updatedReader.read()) != null) { assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey()); @@ -254,7 +300,7 @@ private List newHoodieRecords(int n, String time) throws Exception String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i); RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); - records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } return records; } @@ -280,15 +326,15 @@ public void testMetadataAggregateFromWriteStatus() throws Exception { List records = new ArrayList<>(); RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); // Insert new records BaseSparkCommitActionExecutor actionExecutor = new SparkInsertCommitActionExecutor(context, config, table, - firstCommitTime, jsc.parallelize(records)); + firstCommitTime, context.parallelize(records)); List writeStatuses = 
jsc.parallelize(Arrays.asList(1)).map(x -> { return actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator()); }).flatMap(Transformations::flattenAsIterator).collect(); @@ -331,7 +377,7 @@ public void testInsertRecords() throws Exception { // Insert new records final List recs2 = records; BaseSparkCommitActionExecutor actionExecutor = new SparkInsertPreppedCommitActionExecutor(context, config, table, - instantTime, jsc.parallelize(recs2)); + instantTime, context.parallelize(recs2)); List returnedStatuses = jsc.parallelize(Arrays.asList(1)).map(x -> { return actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs2.iterator()); }).flatMap(Transformations::flattenAsIterator).collect(); @@ -352,7 +398,7 @@ public void testInsertRecords() throws Exception { // Insert new records final List recs3 = records; BaseSparkCommitActionExecutor newActionExecutor = new SparkUpsertPreppedCommitActionExecutor(context, config, table, - instantTime, jsc.parallelize(recs3)); + instantTime, context.parallelize(recs3)); returnedStatuses = jsc.parallelize(Arrays.asList(1)).map(x -> { return newActionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs3.iterator()); }).flatMap(Transformations::flattenAsIterator).collect(); @@ -376,16 +422,16 @@ public void testFileSizeUpsertRecords() throws Exception { List records = new ArrayList<>(); // Approx 1150 records are written for block size of 64KB - for (int i = 0; i < 2000; i++) { + for (int i = 0; i < 2050; i++) { String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); - records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } // Insert new records BaseSparkCommitActionExecutor actionExecutor = new SparkUpsertCommitActionExecutor(context, config, table, - instantTime, jsc.parallelize(records)); + instantTime, context.parallelize(records)); jsc.parallelize(Arrays.asList(1)) .map(i -> actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator())) .map(Transformations::flatten).collect(); @@ -393,11 +439,12 @@ public void testFileSizeUpsertRecords() throws Exception { // Check the updated file int counts = 0; for (File file : Paths.get(basePath, "2016/01/31").toFile().listFiles()) { - if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(instantTime)) { + if (file.getName().endsWith(table.getBaseFileExtension()) && FSUtils.getCommitTime(file.getName()).equals(instantTime)) { LOG.info(file.getName() + "-" + file.length()); counts++; } } + // we check canWrite only once every 1000 records. and so 2 files with 1000 records and 3rd file with 50 records. 
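The assertion that follows expects three files. A quick arithmetic sketch of that expectation, taking the 1000-record check interval from the comment above and assuming the size limit is already exceeded at the first check (hypothetical helper, not Hudi code):

class FileRolloverArithmeticSketch {
  // records: total records written; checkInterval: how often canWrite() is consulted
  static int expectedFileCount(int records, int checkInterval) {
    // one file per full interval, plus one more file for any remainder
    return records / checkInterval + (records % checkInterval == 0 ? 0 : 1);
  }

  public static void main(String[] args) {
    // 2050 records checked every 1000 records -> files of 1000, 1000 and 50 records
    System.out.println(expectedFileCount(2050, 1000)); // 3
  }
}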
assertEquals(3, counts, "If the number of records are more than 1150, then there should be a new file"); } @@ -405,15 +452,17 @@ public void testFileSizeUpsertRecords() throws Exception { public void testInsertUpsertWithHoodieAvroPayload() throws Exception { Schema schema = getSchemaFromResource(TestCopyOnWriteActionExecutor.class, "/testDataGeneratorSchema.txt"); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schema.toString()) - .withStorageConfig(HoodieStorageConfig.newBuilder() - .parquetMaxFileSize(1000 * 1024).hfileMaxFileSize(1000 * 1024).build()).build(); + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder() + .parquetMaxFileSize(1000 * 1024).hfileMaxFileSize(1000 * 1024).build()).build(); metaClient = HoodieTableMetaClient.reload(metaClient); - final HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient); + HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient); String instantTime = "000"; // Perform inserts of 100 records to test CreateHandle and BufferedExecutor final List inserts = dataGen.generateInsertsWithHoodieAvroPayload(instantTime, 100); BaseSparkCommitActionExecutor actionExecutor = new SparkInsertCommitActionExecutor(context, config, table, - instantTime, jsc.parallelize(inserts)); + instantTime, context.parallelize(inserts)); final List> ws = jsc.parallelize(Arrays.asList(1)).map(x -> { return actionExecutor.handleInsert(UUID.randomUUID().toString(), inserts.iterator()); }).map(Transformations::flatten).collect(); @@ -425,15 +474,16 @@ public void testInsertUpsertWithHoodieAvroPayload() throws Exception { String partitionPath = writeStatus.getPartitionPath(); long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count(); + table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, HoodieTableMetaClient.reload(metaClient)); BaseSparkCommitActionExecutor newActionExecutor = new SparkUpsertCommitActionExecutor(context, config, table, - instantTime, jsc.parallelize(updates)); + instantTime, context.parallelize(updates)); final List> updateStatus = jsc.parallelize(Arrays.asList(1)).map(x -> { return newActionExecutor.handleUpdate(partitionPath, fileId, updates.iterator()); }).map(Transformations::flatten).collect(); assertEquals(updates.size() - numRecordsInPartition, updateStatus.get(0).get(0).getTotalErrorRecords()); } - public void testBulkInsertRecords(String bulkInsertMode) throws Exception { + private void testBulkInsertRecords(String bulkInsertMode) { HoodieWriteConfig config = HoodieWriteConfig.newBuilder() .withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA) .withBulkInsertParallelism(2).withBulkInsertSortMode(bulkInsertMode).build(); @@ -446,8 +496,8 @@ public void testBulkInsertRecords(String bulkInsertMode) throws Exception { // Insert new records final JavaRDD inputRecords = generateTestRecordsForBulkInsert(jsc); SparkBulkInsertCommitActionExecutor bulkInsertExecutor = new SparkBulkInsertCommitActionExecutor( - context, config, table, instantTime, inputRecords, Option.empty()); - List returnedStatuses = ((JavaRDD)bulkInsertExecutor.execute().getWriteStatuses()).collect(); + context, config, table, instantTime, HoodieJavaRDD.of(inputRecords), Option.empty()); + List returnedStatuses = ((HoodieData) 
bulkInsertExecutor.execute().getWriteStatuses()).collectAsList(); verifyStatusResult(returnedStatuses, generateExpectedPartitionNumRecords(inputRecords)); } @@ -456,4 +506,52 @@ public void testBulkInsertRecords(String bulkInsertMode) throws Exception { public void testBulkInsertRecordsWithGlobalSort(String bulkInsertMode) throws Exception { testBulkInsertRecords(bulkInsertMode); } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testPartitionMetafileFormat(boolean partitionMetafileUseBaseFormat) throws Exception { + // By default there is no format specified for partition metafile + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).build(); + HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient); + assertFalse(table.getPartitionMetafileFormat().isPresent()); + + if (partitionMetafileUseBaseFormat) { + // Add the setting to use datafile format + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(), "true"); + initMetaClient(HoodieTableType.COPY_ON_WRITE, properties); + metaClient = HoodieTableMetaClient.reload(metaClient); + assertTrue(metaClient.getTableConfig().getPartitionMetafileFormat().isPresent()); + table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient); + assertTrue(table.getPartitionMetafileFormat().isPresent()); + } + + String instantTime = makeNewCommitTime(); + SparkRDDWriteClient writeClient = getHoodieWriteClient(config); + writeClient.startCommitWithTime(instantTime); + + // Insert new records + final JavaRDD inputRecords = generateTestRecordsForBulkInsert(jsc, 50); + writeClient.bulkInsert(inputRecords, instantTime); + + // Partition metafile should be created + Path partitionPath = new Path(basePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, partitionPath)); + Option metafilePath = HoodiePartitionMetadata.getPartitionMetafilePath(fs, partitionPath); + if (partitionMetafileUseBaseFormat) { + // Extension should be the same as the data file format of the table + assertTrue(metafilePath.get().toString().endsWith(table.getBaseFileFormat().getFileExtension())); + } else { + // No extension as it is in properties file format + assertTrue(metafilePath.get().toString().endsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)); + } + + // Validate contents of the partition metafile + HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, partitionPath); + partitionMetadata.readFromFS(); + assertTrue(partitionMetadata.getPartitionDepth() == 3); + assertTrue(partitionMetadata.readPartitionCreatedCommitTime().get().equals(instantTime)); + } + } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java deleted file mode 100644 index d9dc6ac978d92..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.commit; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.model.EmptyHoodieRecordPayload; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; -import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.action.HoodieWriteMetadata; - -import org.apache.spark.Partition; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; - -import java.util.Collections; -import java.util.List; - -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyInt; -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.doNothing; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; - -@ExtendWith(MockitoExtension.class) -public class TestDeleteHelper { - - private enum CombineTestMode { - None, GlobalIndex, NoneGlobalIndex; - } - - private static final String BASE_PATH = "/tmp/"; - private static final boolean WITH_COMBINE = true; - private static final boolean WITHOUT_COMBINE = false; - private static final int DELETE_PARALLELISM = 200; - - @Mock private SparkHoodieBloomIndex index; - @Mock private HoodieTable, JavaRDD, JavaRDD> table; - @Mock private BaseSparkCommitActionExecutor executor; - @Mock private HoodieWriteMetadata metadata; - @Mock private JavaPairRDD keyPairs; - @Mock private JavaSparkContext jsc; - @Mock private HoodieSparkEngineContext context; - - private JavaRDD rddToDelete; - private HoodieWriteConfig config; - - @BeforeEach - public void setUp() { - when(table.getIndex()).thenReturn(index); - when(context.getJavaSparkContext()).thenReturn(jsc); - } - - @Test - public void deleteWithEmptyRDDShouldNotExecute() { - rddToDelete = mockEmptyHoodieKeyRdd(); - config = newWriteConfig(WITHOUT_COMBINE); - - SparkDeleteHelper.newInstance().execute("test-time", rddToDelete, context, config, table, executor); - - verify(rddToDelete, never()).repartition(DELETE_PARALLELISM); - verifyNoDeleteExecution(); - } - - @Test - public void deleteWithoutCombineShouldRepartitionForNonEmptyRdd() { - rddToDelete = newHoodieKeysRddMock(2, CombineTestMode.None); - config = newWriteConfig(WITHOUT_COMBINE); - - 
SparkDeleteHelper.newInstance().execute("test-time", rddToDelete, context, config, table, executor); - - verify(rddToDelete, times(1)).repartition(DELETE_PARALLELISM); - verifyDeleteExecution(); - } - - @Test - public void deleteWithCombineShouldRepartitionForNonEmptyRddAndNonGlobalIndex() { - rddToDelete = newHoodieKeysRddMock(2, CombineTestMode.NoneGlobalIndex); - config = newWriteConfig(WITH_COMBINE); - - SparkDeleteHelper.newInstance().execute("test-time", rddToDelete, context, config, table, executor); - - verify(rddToDelete, times(1)).distinct(DELETE_PARALLELISM); - verifyDeleteExecution(); - } - - @Test - public void deleteWithCombineShouldRepartitionForNonEmptyRddAndGlobalIndex() { - rddToDelete = newHoodieKeysRddMock(2, CombineTestMode.GlobalIndex); - config = newWriteConfig(WITH_COMBINE); - when(index.isGlobal()).thenReturn(true); - - SparkDeleteHelper.newInstance().execute("test-time", rddToDelete, context, config, table, executor); - - verify(keyPairs, times(1)).reduceByKey(any(), eq(DELETE_PARALLELISM)); - verifyDeleteExecution(); - } - - private void verifyDeleteExecution() { - verify(executor, times(1)).execute(any()); - verify(metadata, times(1)).setIndexLookupDuration(any()); - } - - private void verifyNoDeleteExecution() { - verify(executor, never()).execute(any()); - } - - private HoodieWriteConfig newWriteConfig(boolean combine) { - return HoodieWriteConfig.newBuilder() - .combineDeleteInput(combine) - .withPath(BASE_PATH) - .withDeleteParallelism(DELETE_PARALLELISM) - .build(); - } - - private JavaRDD newHoodieKeysRddMock(int howMany, CombineTestMode combineMode) { - JavaRDD keysToDelete = mock(JavaRDD.class); - - JavaRDD recordsRdd = mock(JavaRDD.class); - when(recordsRdd.filter(any())).thenReturn(recordsRdd); - when(recordsRdd.isEmpty()).thenReturn(howMany <= 0); - when(index.tagLocation(any(), any(), any())).thenReturn(recordsRdd); - - if (combineMode == CombineTestMode.GlobalIndex) { - when(keyPairs.reduceByKey(any(), anyInt())).thenReturn(keyPairs); - when(keyPairs.values()).thenReturn(keysToDelete); - when(keysToDelete.keyBy(any())).thenReturn(keyPairs); - } else if (combineMode == CombineTestMode.NoneGlobalIndex) { - when(keysToDelete.distinct(anyInt())).thenReturn(keysToDelete); - } else if (combineMode == CombineTestMode.None) { - List parts = mock(List.class); - when(parts.isEmpty()).thenReturn(howMany <= 0); - when(keysToDelete.repartition(anyInt())).thenReturn(keysToDelete); - when(keysToDelete.partitions()).thenReturn(parts); - } - - when(keysToDelete.map(any())).thenReturn(recordsRdd); - when(executor.execute(any())).thenReturn(metadata); - return keysToDelete; - } - - private JavaRDD mockEmptyHoodieKeyRdd() { - JavaRDD emptyRdd = mock(JavaRDD.class); - doReturn(true).when(emptyRdd).isEmpty(); - doReturn(Collections.emptyList()).when(emptyRdd).partitions(); - doReturn(emptyRdd).when(emptyRdd).map(any()); - - doReturn(emptyRdd).when(index).tagLocation(any(), any(), any()); - doReturn(emptyRdd).when(emptyRdd).filter(any()); - - doNothing().when(executor).saveWorkloadProfileMetadataToInflight(any(), anyString()); - doReturn(emptyRdd).when(jsc).emptyRDD(); - return emptyRdd; - } - -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java index c19427c7f809a..3039eb3bd9b5f 100644 --- 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java @@ -18,22 +18,33 @@ package org.apache.hudi.table.action.commit; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.ClusteringTestUtils; +import org.apache.hudi.common.testutils.CompactionTestUtils; import org.apache.hudi.common.testutils.FileCreateUtils; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieHBaseIndexConfig; +import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.WorkloadProfile; +import org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.avro.Schema; @@ -44,6 +55,7 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; @@ -52,6 +64,7 @@ import scala.Tuple2; +import static org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS; import static org.apache.hudi.common.testutils.HoodieTestUtils.generateFakeHoodieWriteStat; import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; import static org.apache.hudi.table.action.commit.UpsertPartitioner.averageBytesPerRecord; @@ -71,7 +84,7 @@ private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts HoodieWriteConfig config = makeHoodieClientConfigBuilder() .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize) .insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1000 * 1024).parquetMaxFileSize(1000 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1000 * 1024).parquetMaxFileSize(1000 * 1024).orcMaxFileSize(1000 * 1024).build()) .build(); FileCreateUtils.createCommit(basePath, "001"); @@ -185,6 +198,21 @@ public void testUpsertPartitioner() throws Exception { assertEquals(2, insertBuckets.size(), "Total of 2 insert buckets"); } + @Test + public void testUpsertPartitionerWithRecordsPerBucket() throws Exception { + final String testPartitionPath = "2016/09/26"; + // Inserts + Updates... 
Check all updates go together & inserts subsplit + UpsertPartitioner partitioner = getUpsertPartitioner(0, 250, 100, 1024, testPartitionPath, false); + List insertBuckets = partitioner.getInsertBuckets(testPartitionPath); + int insertSplitSize = partitioner.config.getCopyOnWriteInsertSplitSize(); + int remainedInsertSize = 250 - 2 * insertSplitSize; + // will assigned 3 insertBuckets. 100, 100, 50 each + assertEquals(3, insertBuckets.size(), "Total of 3 insert buckets"); + assertEquals(0.4, insertBuckets.get(0).getLeft().weight, "insert " + insertSplitSize + " records"); + assertEquals(0.4, insertBuckets.get(1).getLeft().weight, "insert " + insertSplitSize + " records"); + assertEquals(0.2, insertBuckets.get(2).getLeft().weight, "insert " + remainedInsertSize + " records"); + } + @Test public void testPartitionWeight() throws Exception { final String testPartitionPath = "2016/09/26"; @@ -286,11 +314,168 @@ public void testUpsertPartitionerWithSmallInsertHandling() throws Exception { "Bucket 3 is INSERT"); assertEquals(4, insertBuckets.size(), "Total of 4 insert buckets"); - weights = new Double[] { 0.08, 0.31, 0.31, 0.31}; - cumulativeWeights = new Double[] { 0.08, 0.39, 0.69, 1.0}; + weights = new Double[] { 0.08, 0.42, 0.42, 0.08}; + cumulativeWeights = new Double[] { 0.08, 0.5, 0.92, 1.0}; assertInsertBuckets(weights, cumulativeWeights, insertBuckets); } + @Test + public void testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles() throws Exception { + // Note this is used because it is same partition path used in CompactionTestUtils.createCompactionPlan() + final String testPartitionPath = DEFAULT_PARTITION_PATHS[0]; + + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .withIndexType(HoodieIndex.IndexType.HBASE) + .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build()) + .build()) + .build(); + + // This will generate initial commits and create a compaction plan which includes file groups created as part of this + HoodieCompactionPlan plan = CompactionTestUtils.createCompactionPlan(metaClient, "001", "002", 1, true, false); + FileCreateUtils.createRequestedCompactionCommit(basePath, "002", plan); + // Simulate one more commit so that inflight compaction is considered when building file groups in file system view + FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "2", 1); + FileCreateUtils.createCommit(basePath, "003"); + + // Partitioner will attempt to assign inserts to file groups including base file created by inflight compaction + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + List insertRecords = dataGenerator.generateInserts("004", 100); + WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords))); + + HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); + SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config); + + assertEquals(1, partitioner.numPartitions(), "Should have 1 partitions"); + assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, + "Bucket 0 is UPDATE"); + assertEquals("2", partitioner.getBucketInfo(0).fileIdPrefix, + "Should be assigned to only file id not pending compaction which is 2"); + } + + 
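As a side note on the weight assertions in testUpsertPartitionerWithRecordsPerBucket above, this hypothetical sketch (an illustration, not the partitioner's code) shows how bucket weights fall out of the insert split size: 250 records with a split size of 100 yield buckets of 100, 100 and 50 records, i.e. weights 0.4, 0.4 and 0.2.

import java.util.ArrayList;
import java.util.List;

class InsertBucketWeightSketch {
  // totalInserts: records routed to one partition; insertSplitSize: copy-on-write insert split size
  static List<Double> bucketWeights(int totalInserts, int insertSplitSize) {
    List<Double> weights = new ArrayList<>();
    int remaining = totalInserts;
    while (remaining > 0) {
      int bucketRecords = Math.min(insertSplitSize, remaining);
      weights.add((double) bucketRecords / totalInserts);
      remaining -= bucketRecords;
    }
    return weights;
  }

  public static void main(String[] args) {
    System.out.println(bucketWeights(250, 100)); // [0.4, 0.4, 0.2]
  }
}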
@Test + public void testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan() throws Exception { + final String testPartitionPath = DEFAULT_PARTITION_PATHS[0]; + + // create HoodieWriteConfig and set inline and async clustering disable here. + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().build()) + .withClusteringConfig(HoodieClusteringConfig.newBuilder().withInlineClustering(false).withAsyncClustering(false).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1000 * 1024).parquetMaxFileSize(1000 * 1024).build()) + .build(); + + // create file slice with instantTime 001 and build clustering plan including this created 001 file slice. + HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, "001", "1"); + // create requested replace commit + HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder() + .setClusteringPlan(clusteringPlan).setOperationType(WriteOperationType.CLUSTER.name()).build(); + FileCreateUtils.createRequestedReplaceCommit(basePath,"002", Option.of(requestedReplaceMetadata)); + + // create file slice 003 + FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "3", 1); + FileCreateUtils.createCommit(basePath, "003"); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + // generate new data to be ingested + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + List insertRecords = dataGenerator.generateInserts("004", 100); + WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords))); + + HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); + // create UpsertPartitioner + UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config); + + // for now we have file slice1 and file slice3 and file slice1 is contained in pending clustering plan + // So that only file slice3 can be used for ingestion. 
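The assertion that follows expects exactly one small file. Restated as a tiny hypothetical sketch (not the UpsertPartitioner implementation): small-file candidates whose file groups appear in a pending clustering plan are excluded from ingestion, so only the file slice created at 003 remains assignable.

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

class PendingClusteringSmallFileSketch {
  // candidateFileIds: base files below the small-file threshold in a partition
  // pendingClusteringFileIds: file ids referenced by requested or inflight clustering plans
  static List<String> assignableSmallFiles(List<String> candidateFileIds, Set<String> pendingClusteringFileIds) {
    return candidateFileIds.stream()
        .filter(fileId -> !pendingClusteringFileIds.contains(fileId)) // never route inserts into pending clustering groups
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    // mirrors the scenario above: file id "1" is in the clustering plan, file id "3" is not
    System.out.println(assignableSmallFiles(List.of("1", "3"), Set.of("1"))); // [3]
  }
}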
+ assertEquals(1, partitioner.smallFiles.size(), "Should have 1 small file to be ingested."); + } + + @Test + public void testUpsertPartitionerWithSmallFileHandlingWithCanIndexLogFiles() throws Exception { + // Note this is used because it is same partition path used in CompactionTestUtils.createCompactionPlan() + final String testPartitionPath = DEFAULT_PARTITION_PATHS[0]; + + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(1024).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .withIndexType(HoodieIndex.IndexType.HBASE) + .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build()) + .build()) + .build(); + + // Create file group with only one log file + FileCreateUtils.createLogFile(basePath, testPartitionPath, "001", "fg1", 1); + FileCreateUtils.createDeltaCommit(basePath, "001"); + // Create another file group size set to max parquet file size so should not be considered during small file sizing + FileCreateUtils.createBaseFile(basePath, testPartitionPath, "002", "fg2", 1024); + FileCreateUtils.createCommit(basePath, "002"); + FileCreateUtils.createLogFile(basePath, testPartitionPath, "003", "fg2", 1); + FileCreateUtils.createDeltaCommit(basePath, "003"); + + // Partitioner will attempt to assign inserts to file groups including base file created by inflight compaction + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath}); + // Default estimated record size will be 1024 based on last file group created. Only 1 record can be added to small file + List insertRecords = dataGenerator.generateInserts("004", 1); + WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords))); + + HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); + SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config); + + assertEquals(1, partitioner.numPartitions(), "Should have 1 partitions"); + assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, + "Bucket 0 should be UPDATE"); + assertEquals("fg1", partitioner.getBucketInfo(0).fileIdPrefix, + "Insert should be assigned to fg1"); + } + + @Test + public void testUpsertPartitionerWithSmallFileHandlingPickingMultipleCandidates() throws Exception { + final String partitionPath = DEFAULT_PARTITION_PATHS[0]; + + HoodieWriteConfig config = + makeHoodieClientConfigBuilder() + .withMergeSmallFileGroupCandidatesLimit(3) + .withStorageConfig( + HoodieStorageConfig.newBuilder() + .parquetMaxFileSize(2048) + .build() + ) + .build(); + + // Bootstrap base files ("small-file targets") + FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-1", 1024); + FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-2", 1024); + FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-3", 1024); + + FileCreateUtils.createCommit(basePath, "002"); + + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {partitionPath}); + // Default estimated record size will be 1024 based on last file group created. 
+ // Only 1 record can be added to small file + WorkloadProfile profile = + new WorkloadProfile(buildProfile(jsc.parallelize(dataGenerator.generateInserts("003", 3)))); + + HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(this.metaClient); + + HoodieSparkTable table = HoodieSparkTable.create(config, context, reloadedMetaClient); + + SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner<>(profile, context, table, config); + + assertEquals(3, partitioner.numPartitions()); + assertEquals( + Arrays.asList( + new BucketInfo(BucketType.UPDATE, "fg-1", partitionPath), + new BucketInfo(BucketType.UPDATE, "fg-2", partitionPath), + new BucketInfo(BucketType.UPDATE, "fg-3", partitionPath) + ), + partitioner.getBucketInfos()); + } + private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() { // Prepare the AvroParquetIO return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(SCHEMA.toString()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java index 094c0b39069a3..a571a6f4732ea 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java @@ -20,9 +20,10 @@ import org.apache.hudi.avro.model.HoodieCompactionOperation; import org.apache.hudi.avro.model.HoodieCompactionPlan; -import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroupId; @@ -45,6 +46,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hudi.testutils.HoodieClientTestUtils; @@ -70,11 +72,12 @@ protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { return HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(TRIP_EXAMPLE_SCHEMA) .withParallelism(2, 2) - .withAutoCommit(autoCommit).withAssumeDatePartitioning(true) + .withAutoCommit(autoCommit) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) .withStorageConfig(HoodieStorageConfig.newBuilder() - .hfileMaxFileSize(1024 * 1024 * 1024).parquetMaxFileSize(1024 * 1024 * 1024).build()) + .hfileMaxFileSize(1024 * 1024 * 1024).parquetMaxFileSize(1024 * 1024 * 1024).orcMaxFileSize(1024 * 1024 * 1024).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() @@ -86,7 +89,7 @@ protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { **/ protected void validateDeltaCommit(String 
latestDeltaCommit, final Map> fgIdToCompactionOperation, HoodieWriteConfig cfg) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable table = getHoodieTable(metaClient, cfg); List fileSliceList = getCurrentLatestFileSlices(table); fileSliceList.forEach(fileSlice -> { @@ -103,11 +106,11 @@ protected void validateDeltaCommit(String latestDeltaCommit, final Map runNextDeltaCommits(SparkRDDWriteClient client, final HoodieReadClient readClient, List deltaInstants, + protected List runNextDeltaCommits(SparkRDDWriteClient client, final SparkRDDReadClient readClient, List deltaInstants, List records, HoodieWriteConfig cfg, boolean insertFirst, List expPendingCompactionInstants) throws Exception { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); List> pendingCompactions = readClient.getPendingCompactions(); List gotPendingCompactionInstants = pendingCompactions.stream().map(pc -> pc.getKey()).sorted().collect(Collectors.toList()); @@ -129,18 +132,18 @@ protected List runNextDeltaCommits(SparkRDDWriteClient client, fin client.commit(firstInstant, statuses); } assertNoWriteErrors(statusList); - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable hoodieTable = getHoodieTable(metaClient, cfg); List dataFilesToRead = getCurrentLatestBaseFiles(hoodieTable); assertTrue(dataFilesToRead.stream().findAny().isPresent(), - "should list the parquet files we wrote in the delta commit"); + "should list the base files we wrote in the delta commit"); validateDeltaCommit(firstInstant, fgIdToCompactionOperation, cfg); } int numRecords = records.size(); for (String instantTime : deltaInstants) { records = dataGen.generateUpdates(instantTime, numRecords); - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); createNextDeltaCommit(instantTime, records, client, metaClient, cfg, false); validateDeltaCommit(instantTime, fgIdToCompactionOperation, cfg); } @@ -148,7 +151,7 @@ protected List runNextDeltaCommits(SparkRDDWriteClient client, fin } protected void moveCompactionFromRequestedToInflight(String compactionInstantTime, HoodieWriteConfig cfg) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieInstant compactionInstant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(compactionInstant); HoodieInstant instant = metaClient.getActiveTimeline().reload().filterPendingCompactionTimeline().getInstants() @@ -158,7 +161,7 @@ protected void moveCompactionFromRequestedToInflight(String compactionInstantTim protected void scheduleCompaction(String compactionInstantTime, SparkRDDWriteClient client, HoodieWriteConfig cfg) { client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); - HoodieTableMetaClient metaClient = new 
HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieInstant instant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().get(); assertEquals(compactionInstantTime, instant.getTimestamp(), "Last compaction instant must be the one set"); } @@ -173,6 +176,7 @@ protected void executeCompaction(String compactionInstantTime, SparkRDDWriteClie HoodieWriteConfig cfg, int expectedNumRecs, boolean hasDeltaCommitAfterPendingCompaction) throws IOException { client.compact(compactionInstantTime); + assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, compactionInstantTime).doesMarkerDirExist()); List fileSliceList = getCurrentLatestFileSlices(table); assertTrue(fileSliceList.stream().findAny().isPresent(), "Ensure latest file-slices are not empty"); assertFalse(fileSliceList.stream() @@ -190,13 +194,13 @@ protected void executeCompaction(String compactionInstantTime, SparkRDDWriteClie } // verify that there is a commit - table = getHoodieTable(new HoodieTableMetaClient(hadoopConf, cfg.getBasePath(), true), cfg); + table = getHoodieTable(HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).setLoadActiveTimelineOnLoad(true).build(), cfg); HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants(); String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); assertEquals(latestCompactionCommitTime, compactionInstantTime, "Expect compaction instant time to be the latest commit time"); assertEquals(expectedNumRecs, - HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, "000"), + HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of("000")), "Must contain expected records"); } @@ -212,7 +216,7 @@ protected void executeCompactionWithReplacedFiles(String compactionInstantTime, "Compacted files should not show up in latest slices"); // verify that there is a commit - table = getHoodieTable(new HoodieTableMetaClient(hadoopConf, cfg.getBasePath(), true), cfg); + table = getHoodieTable(HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).setLoadActiveTimelineOnLoad(true).build(), cfg); HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants(); // verify compaction commit is visible in timeline assertTrue(timeline.filterCompletedInstants().getInstants() @@ -260,7 +264,7 @@ protected List getCurrentLatestBaseFiles(HoodieTable table) thro protected List getCurrentLatestFileSlices(HoodieTable table) { HoodieTableFileSystemView view = new HoodieTableFileSystemView(table.getMetaClient(), - table.getMetaClient().getActiveTimeline().reload().getCommitsAndCompactionTimeline()); + table.getMetaClient().getActiveTimeline().reload().getWriteTimeline()); return Arrays.stream(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS) .flatMap(view::getLatestFileSlices).collect(Collectors.toList()); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java index fd6bd839cf8f6..f673872804aff 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java @@ -18,9 +18,7 @@ package org.apache.hudi.table.action.compact; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieRecord; @@ -31,6 +29,9 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.Test; @@ -50,15 +51,16 @@ public class TestAsyncCompaction extends CompactionTestBase { private HoodieWriteConfig getConfig(Boolean autoCommit) { - return getConfigBuilder(autoCommit).build(); + return getConfigBuilder(autoCommit) + .build(); } @Test public void testRollbackForInflightCompaction() throws Exception { // Rollback inflight compaction HoodieWriteConfig cfg = getConfig(false); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg, true);) { - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); String firstInstantTime = "001"; String secondInstantTime = "004"; String compactionInstantTime = "005"; @@ -72,7 +74,7 @@ public void testRollbackForInflightCompaction() throws Exception { // Schedule compaction but do not run them scheduleCompaction(compactionInstantTime, client, cfg); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieInstant pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get(); @@ -83,14 +85,12 @@ public void testRollbackForInflightCompaction() throws Exception { moveCompactionFromRequestedToInflight(compactionInstantTime, cfg); // Reload and rollback inflight compaction - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient); - // hoodieTable.rollback(jsc, - // new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, compactionInstantTime), false); - client.rollbackInflightCompaction( - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionInstantTime), hoodieTable); - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + hoodieTable.rollbackInflightCompaction( + new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionInstantTime)); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); pendingCompactionInstant = metaClient.getCommitsAndCompactionTimeline().filterPendingCompactionTimeline() .getInstants().findFirst().get(); assertEquals("compaction", pendingCompactionInstant.getAction()); @@ -119,8 +119,8 @@ public void testRollbackInflightIngestionWithPendingCompaction() throws Exceptio int numRecs = 2000; - try (SparkRDDWriteClient client = 
getHoodieWriteClient(cfg, true);) { - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); List records = dataGen.generateInserts(firstInstantTime, numRecs); records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>()); @@ -128,10 +128,10 @@ public void testRollbackInflightIngestionWithPendingCompaction() throws Exceptio // Schedule compaction but do not run them scheduleCompaction(compactionInstantTime, client, cfg); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true); - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieInstant pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get(); assertEquals(compactionInstantTime, pendingCompactionInstant.getTimestamp(), @@ -144,7 +144,7 @@ public void testRollbackInflightIngestionWithPendingCompaction() throws Exceptio client.startCommitWithTime(nextInflightInstantTime); // Validate - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); inflightInstant = metaClient.getActiveTimeline().filterPendingExcludingCompaction().firstInstant().get(); assertEquals(inflightInstant.getTimestamp(), nextInflightInstantTime, "inflight instant has expected instant time"); assertEquals(1, metaClient.getActiveTimeline() @@ -161,8 +161,8 @@ public void testRollbackInflightIngestionWithPendingCompaction() throws Exceptio public void testInflightCompaction() throws Exception { // There is inflight compaction. Subsequent compaction run must work correctly HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg, true);) { - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); String firstInstantTime = "001"; String secondInstantTime = "004"; String compactionInstantTime = "005"; @@ -176,7 +176,7 @@ public void testInflightCompaction() throws Exception { new ArrayList<>()); // Schedule and mark compaction instant as inflight - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable hoodieTable = getHoodieTable(metaClient, cfg); scheduleCompaction(compactionInstantTime, client, cfg); moveCompactionFromRequestedToInflight(compactionInstantTime, cfg); @@ -194,8 +194,8 @@ public void testInflightCompaction() throws Exception { public void testScheduleIngestionBeforePendingCompaction() throws Exception { // Case: Failure case. 
Latest pending compaction instant time must be earlier than this instant time HoodieWriteConfig cfg = getConfig(false); - SparkRDDWriteClient client = getHoodieWriteClient(cfg, true); - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); String firstInstantTime = "001"; String secondInstantTime = "004"; @@ -203,13 +203,13 @@ public void testScheduleIngestionBeforePendingCompaction() throws Exception { String compactionInstantTime = "006"; int numRecs = 2000; - final List initalRecords = dataGen.generateInserts(firstInstantTime, numRecs); - final List records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), initalRecords, cfg, true, + final List initialRecords = dataGen.generateInserts(firstInstantTime, numRecs); + final List records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), initialRecords, cfg, true, new ArrayList<>()); // Schedule compaction but do not run them scheduleCompaction(compactionInstantTime, client, cfg); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieInstant pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get(); assertEquals(compactionInstantTime, pendingCompactionInstant.getTimestamp(), "Pending Compaction instant has expected instant time"); @@ -225,8 +225,8 @@ public void testScheduleCompactionAfterPendingIngestion() throws Exception { // Case: Failure case. Earliest ingestion inflight instant time must be later than compaction time HoodieWriteConfig cfg = getConfig(false); - SparkRDDWriteClient client = getHoodieWriteClient(cfg, true); - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); String firstInstantTime = "001"; String secondInstantTime = "004"; @@ -238,10 +238,10 @@ public void testScheduleCompactionAfterPendingIngestion() throws Exception { records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true); - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieInstant inflightInstant = metaClient.getActiveTimeline().filterPendingExcludingCompaction().firstInstant().get(); assertEquals(inflightInstantTime, inflightInstant.getTimestamp(), "inflight instant has expected instant time"); @@ -257,8 +257,8 @@ public void testScheduleCompactionWithOlderOrSameTimestamp() throws Exception { // Case: Failure case. 
Earliest ingestion inflight instant time must be later than compaction time HoodieWriteConfig cfg = getConfig(false); - SparkRDDWriteClient client = getHoodieWriteClient(cfg, true); - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); final String firstInstantTime = "001"; final String secondInstantTime = "004"; @@ -292,8 +292,8 @@ public void testScheduleCompactionWithOlderOrSameTimestamp() throws Exception { public void testCompactionAfterTwoDeltaCommits() throws Exception { // No Delta Commits after compaction request HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg, true);) { - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); String firstInstantTime = "001"; String secondInstantTime = "004"; String compactionInstantTime = "005"; @@ -303,7 +303,7 @@ public void testCompactionAfterTwoDeltaCommits() throws Exception { runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable hoodieTable = getHoodieTable(metaClient, cfg); scheduleAndExecuteCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, false); } @@ -313,8 +313,8 @@ public void testCompactionAfterTwoDeltaCommits() throws Exception { public void testInterleavedCompaction() throws Exception { // Case: Two delta commits before and after compaction schedule HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg, true);) { - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); String firstInstantTime = "001"; String secondInstantTime = "004"; String compactionInstantTime = "005"; @@ -327,7 +327,7 @@ public void testInterleavedCompaction() throws Exception { records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable hoodieTable = getHoodieTable(metaClient, cfg); scheduleCompaction(compactionInstantTime, client, cfg); @@ -341,8 +341,8 @@ public void testInterleavedCompaction() throws Exception { public void testCompactionOnReplacedFiles() throws Exception { // Schedule a compaction. Replace those file groups and ensure compaction completes successfully. 
HoodieWriteConfig cfg = getConfig(true); - try (SparkRDDWriteClient client = getHoodieWriteClient(cfg, true);) { - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); String firstInstantTime = "001"; String secondInstantTime = "004"; String compactionInstantTime = "005"; @@ -355,7 +355,7 @@ public void testCompactionOnReplacedFiles() throws Exception { runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); HoodieTable hoodieTable = getHoodieTable(metaClient, cfg); scheduleCompaction(compactionInstantTime, client, cfg); metaClient.reloadActiveTimeline(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java index 2e6cea70ad921..59174a9371a58 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java @@ -21,6 +21,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieRecord; @@ -40,11 +41,11 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.bloom.SparkHoodieBloomIndex; +import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; import org.apache.hadoop.conf.Configuration; import org.apache.spark.api.java.JavaRDD; @@ -55,9 +56,6 @@ import java.util.List; import java.util.stream.Collectors; -import static org.apache.hudi.common.testutils.FileCreateUtils.createDeltaCommit; -import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightDeltaCommit; -import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedDeltaCommit; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -97,7 +95,7 @@ private HoodieWriteConfig.Builder getConfigBuilder() { .withParallelism(2, 2) .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024) .withInlineCompaction(false).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).orcMaxFileSize(1024 * 1024).build()) 
.withMemoryConfig(HoodieMemoryConfig.newBuilder().withMaxDFSStreamBufferSize(1 * 1024 * 1024).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); @@ -132,10 +130,36 @@ public void testCompactionEmpty() throws Exception { } } + @Test + public void testScheduleCompactionWithInflightInstant() { + HoodieWriteConfig config = getConfig(); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) { + // insert 100 records. + String newCommitTime = "100"; + writeClient.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc.parallelize(records, 1); + writeClient.insert(recordsRDD, newCommitTime).collect(); + + // create one inflight instant. + newCommitTime = "102"; + writeClient.startCommitWithTime(newCommitTime); + metaClient.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, + HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime), Option.empty()); + + // create one compaction instant before the existing inflight instant. + String compactionTime = "101"; + writeClient.scheduleCompactionAtInstant(compactionTime, Option.empty()); + } + } + + @Test public void testWriteStatusContentsAfterCompaction() throws Exception { // insert 100 records - HoodieWriteConfig config = getConfig(); + HoodieWriteConfig config = getConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .build(); try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) { String newCommitTime = "100"; writeClient.startCommitWithTime(newCommitTime); @@ -147,19 +171,14 @@ public void testWriteStatusContentsAfterCompaction() throws Exception { // Update all the 100 records HoodieTable table = HoodieSparkTable.create(config, context); newCommitTime = "101"; - writeClient.startCommitWithTime(newCommitTime); List updatedRecords = dataGen.generateUpdates(newCommitTime, records); JavaRDD updatedRecordsRDD = jsc.parallelize(updatedRecords, 1); - HoodieIndex index = new SparkHoodieBloomIndex<>(config); - updatedRecords = ((JavaRDD)index.tagLocation(updatedRecordsRDD, context, table)).collect(); + HoodieIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); + JavaRDD updatedTaggedRecordsRDD = tagLocation(index, updatedRecordsRDD, table); - // Write them to corresponding avro logfiles. Also, set the state transition properly. 
- HoodieWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS) - .withLogAppends(updatedRecords); - metaClient.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, - HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime), Option.empty()); - writeClient.commit(newCommitTime, jsc.emptyRDD(), Option.empty()); + writeClient.startCommitWithTime(newCommitTime); + writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect(); metaClient.reloadActiveTimeline(); // Verify that all data file has one log file @@ -171,20 +190,18 @@ public void testWriteStatusContentsAfterCompaction() throws Exception { assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for every data file"); } } - createDeltaCommit(basePath, newCommitTime); - createRequestedDeltaCommit(basePath, newCommitTime); - createInflightDeltaCommit(basePath, newCommitTime); // Do a compaction table = HoodieSparkTable.create(config, context); String compactionInstantTime = "102"; table.scheduleCompaction(context, compactionInstantTime, Option.empty()); table.getMetaClient().reloadActiveTimeline(); - JavaRDD result = (JavaRDD) table.compact(context, compactionInstantTime).getWriteStatuses(); + HoodieData result = (HoodieData) table.compact( + context, compactionInstantTime).getWriteStatuses(); // Verify that all partition paths are present in the WriteStatus result for (String partitionPath : dataGen.getPartitionPaths()) { - List writeStatuses = result.collect(); + List writeStatuses = result.collectAsList(); assertTrue(writeStatuses.stream() .filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath)).count() > 0); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java index 066a9656f2a50..1797f61ed2a28 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java @@ -18,15 +18,20 @@ package org.apache.hudi.table.action.compact; -import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.marker.WriteMarkersFactory; + +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -36,81 +41,310 @@ import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; public class TestInlineCompaction extends CompactionTestBase { - private HoodieWriteConfig getConfigForInlineCompaction(int maxDeltaCommits) { + private HoodieWriteConfig 
getConfigForInlineCompaction(int maxDeltaCommits, int maxDeltaTime, CompactionTriggerStrategy inlineCompactionType) { + return getConfigBuilder(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(true) + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommits) + .withMaxDeltaSecondsBeforeCompaction(maxDeltaTime) + .withInlineCompactionTriggerStrategy(inlineCompactionType).build()) + .build(); + } + + private HoodieWriteConfig getConfigDisableComapction(int maxDeltaCommits, int maxDeltaTime, CompactionTriggerStrategy inlineCompactionType) { return getConfigBuilder(false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withInlineCompaction(true).withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommits).build()) + .withInlineCompaction(false) + .withScheduleInlineCompaction(false) + .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommits) + .withMaxDeltaSecondsBeforeCompaction(maxDeltaTime) + .withInlineCompactionTriggerStrategy(inlineCompactionType).build()) .build(); } @Test public void testCompactionIsNotScheduledEarly() throws Exception { // Given: make two commits - HoodieWriteConfig cfg = getConfigForInlineCompaction(3); + HoodieWriteConfig cfg = getConfigForInlineCompaction(3, 60, CompactionTriggerStrategy.NUM_COMMITS); try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { - List records = dataGen.generateInserts("000", 100); - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); - runNextDeltaCommits(writeClient, readClient, Arrays.asList("000", "001"), records, cfg, true, new ArrayList<>()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + List records = dataGen.generateInserts(HoodieActiveTimeline.createNewInstantTime(), 100); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + List instants = IntStream.range(0, 2).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); - // Then: ensure no compaction is executedm since there are only 2 delta commits - assertEquals(2, metaClient.getActiveTimeline().getCommitsAndCompactionTimeline().countInstants()); + // Then: ensure no compaction is executed since there are only 2 delta commits + assertEquals(2, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); } } @Test - public void testSuccessfulCompaction() throws Exception { + public void testSuccessfulCompactionBasedOnNumCommits() throws Exception { // Given: make three commits - HoodieWriteConfig cfg = getConfigForInlineCompaction(3); + HoodieWriteConfig cfg = getConfigForInlineCompaction(3, 60, CompactionTriggerStrategy.NUM_COMMITS); List instants = IntStream.range(0, 2).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { List records = dataGen.generateInserts(instants.get(0), 100); - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); // third commit, that will trigger compaction - HoodieTableMetaClient 
metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); String finalInstant = HoodieActiveTimeline.createNewInstantTime(); createNextDeltaCommit(finalInstant, dataGen.generateUpdates(finalInstant, 100), writeClient, metaClient, cfg, false); // Then: ensure the file slices are compacted as per policy - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); - assertEquals(4, metaClient.getActiveTimeline().getCommitsAndCompactionTimeline().countInstants()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(4, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + assertEquals(HoodieTimeline.COMMIT_ACTION, metaClient.getActiveTimeline().lastInstant().get().getAction()); + String compactionTime = metaClient.getActiveTimeline().lastInstant().get().getTimestamp(); + assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), HoodieSparkTable.create(cfg, context), compactionTime).doesMarkerDirExist()); + } + } + + @Test + public void testSuccessfulCompactionBasedOnNumAfterCompactionRequest() throws Exception { + // Given: make 4 commits + HoodieWriteConfig cfg = getConfigDisableComapction(4, 60, CompactionTriggerStrategy.NUM_COMMITS_AFTER_LAST_REQUEST); + // turn off the compaction table service to mimic the compaction service being down or very slow + List instants = IntStream.range(0, 4).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { + List records = dataGen.generateInserts(instants.get(0), 100); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + + // step 1: create and complete 4 delta commits, then create 1 compaction request after them + runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); + + String requestInstant = HoodieActiveTimeline.createNewInstantTime(); + scheduleCompaction(requestInstant, writeClient, cfg); + + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(metaClient.getActiveTimeline().getInstants() + .filter(hoodieInstant -> hoodieInstant.getAction().equals(HoodieTimeline.COMPACTION_ACTION) + && hoodieInstant.getState() == HoodieInstant.State.REQUESTED).count(), 1); + + // step 2: try to schedule another one; this should fail because of the NUM_COMMITS_AFTER_LAST_REQUEST strategy + // and will throw an AssertionError, since scheduleCompaction checks whether the last instant is a compaction request + requestInstant = HoodieActiveTimeline.createNewInstantTime(); + try { + scheduleCompaction(requestInstant, writeClient, cfg); + Assertions.fail(); + } catch (AssertionError error) { + // expected + } + + // step 3: complete another 4 delta commits; there should be 2 compaction requests after this + instants = IntStream.range(0, 4).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + records = dataGen.generateInsertsForPartition(instants.get(0), 100, "2022/03/15"); + for (String instant : instants) { + createNextDeltaCommit(instant, records, writeClient, metaClient, cfg, false); + } + // runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, gotPendingCompactionInstants); + requestInstant = HoodieActiveTimeline.createNewInstantTime(); + 
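+ // (With four more delta commits completed since the last compaction request, this second schedule attempt is expected to satisfy the NUM_COMMITS_AFTER_LAST_REQUEST threshold of 4 and succeed.)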
scheduleCompaction(requestInstant, writeClient, cfg); + + // step 4: restore the table service, complete the last commit, and this commit will trigger all compaction requests + cfg = getConfigForInlineCompaction(4, 60, CompactionTriggerStrategy.NUM_COMMITS_AFTER_LAST_REQUEST); + try (SparkRDDWriteClient newWriteClient = getHoodieWriteClient(cfg)) { + String finalInstant = HoodieActiveTimeline.createNewInstantTime(); + createNextDeltaCommit(finalInstant, dataGen.generateUpdates(finalInstant, 100), newWriteClient, metaClient, cfg, false); + } + + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + // step 5: there should be only 2 .commit, and no pending compaction. + // the last instant should be delta commit since the compaction request is earlier. + assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().filter(instant -> instant.getAction().equals(HoodieTimeline.COMMIT_ACTION)) + .countInstants(), 2); + assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().filterPendingCompactionTimeline().countInstants(), 0); + assertEquals(HoodieTimeline.DELTA_COMMIT_ACTION, metaClient.getActiveTimeline().lastInstant().get().getAction()); + } + } + + @Test + public void testSuccessfulCompactionBasedOnTime() throws Exception { + // Given: make one commit + HoodieWriteConfig cfg = getConfigForInlineCompaction(5, 10, CompactionTriggerStrategy.TIME_ELAPSED); + + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + List records = dataGen.generateInserts(instantTime, 10); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + runNextDeltaCommits(writeClient, readClient, Arrays.asList(instantTime), records, cfg, true, new ArrayList<>()); + + // after 10s, that will trigger compaction + String finalInstant = HoodieActiveTimeline.createNewInstantTime(10000); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + createNextDeltaCommit(finalInstant, dataGen.generateUpdates(finalInstant, 100), writeClient, metaClient, cfg, false); + + // Then: ensure the file slices are compacted as per policy + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(3, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); assertEquals(HoodieTimeline.COMMIT_ACTION, metaClient.getActiveTimeline().lastInstant().get().getAction()); } } @Test - public void testCompactionRetryOnFailure() throws Exception { + public void testSuccessfulCompactionBasedOnNumOrTime() throws Exception { + // Given: make three commits + HoodieWriteConfig cfg = getConfigForInlineCompaction(3, 60, CompactionTriggerStrategy.NUM_OR_TIME); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { + List records = dataGen.generateInserts(HoodieActiveTimeline.createNewInstantTime(), 10); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + List instants = IntStream.range(0, 2).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); + // Then: trigger the compaction because reach 3 commits. 
+ String finalInstant = HoodieActiveTimeline.createNewInstantTime(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + createNextDeltaCommit(finalInstant, dataGen.generateUpdates(finalInstant, 10), writeClient, metaClient, cfg, false); + + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(4, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + // 4th commit, that will trigger compaction because reach the time elapsed + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + finalInstant = HoodieActiveTimeline.createNewInstantTime(60000); + createNextDeltaCommit(finalInstant, dataGen.generateUpdates(finalInstant, 10), writeClient, metaClient, cfg, false); + + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(6, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + } + } + + @Test + public void testSuccessfulCompactionBasedOnNumAndTime() throws Exception { + // Given: make three commits + HoodieWriteConfig cfg = getConfigForInlineCompaction(3, 20, CompactionTriggerStrategy.NUM_AND_TIME); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { + List records = dataGen.generateInserts(HoodieActiveTimeline.createNewInstantTime(), 10); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + List instants = IntStream.range(0, 3).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + + // Then: ensure no compaction is executed since there are only 3 delta commits + assertEquals(3, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + // 4th commit, that will trigger compaction + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + String finalInstant = HoodieActiveTimeline.createNewInstantTime(20000); + createNextDeltaCommit(finalInstant, dataGen.generateUpdates(finalInstant, 10), writeClient, metaClient, cfg, false); + + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(5, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + } + } + + @Test + public void testCompactionRetryOnFailureBasedOnNumCommits() throws Exception { // Given: two commits, schedule compaction and its failed/in-flight HoodieWriteConfig cfg = getConfigBuilder(false) .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withInlineCompaction(false) + .withMaxNumDeltaCommitsBeforeCompaction(1).build()) .build(); - List instants = CollectionUtils.createImmutableList("000", "001"); + List instants = IntStream.range(0, 2).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + String instantTime2; try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { List records = dataGen.generateInserts(instants.get(0), 100); - HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + 
runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); + // Schedule compaction instant2, make it in-flight (simulates inline compaction failing) + instantTime2 = HoodieActiveTimeline.createNewInstantTime(); + scheduleCompaction(instantTime2, writeClient, cfg); + moveCompactionFromRequestedToInflight(instantTime2, cfg); + } + + // When: a third commit happens + HoodieWriteConfig inlineCfg = getConfigForInlineCompaction(2, 60, CompactionTriggerStrategy.NUM_COMMITS); + String instantTime3 = HoodieActiveTimeline.createNewInstantTime(); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(inlineCfg)) { + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + createNextDeltaCommit(instantTime3, dataGen.generateUpdates(instantTime3, 100), writeClient, metaClient, inlineCfg, false); + } + + // Then: 1 delta commit is done, the failed compaction is retried + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(4, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + assertEquals(instantTime2, metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().firstInstant().get().getTimestamp()); + } + + @Test + public void testCompactionRetryOnFailureBasedOnTime() throws Exception { + // Given: two commits; schedule a compaction and leave it failed/in-flight + HoodieWriteConfig cfg = getConfigBuilder(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(false) + .withMaxDeltaSecondsBeforeCompaction(5) + .withInlineCompactionTriggerStrategy(CompactionTriggerStrategy.TIME_ELAPSED).build()) + .build(); + String instantTime; + List instants = IntStream.range(0, 2).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { + List records = dataGen.generateInserts(instants.get(0), 100); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); + runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); + // Schedule compaction instantTime, make it in-flight (simulates inline compaction failing) + instantTime = HoodieActiveTimeline.createNewInstantTime(10000); + scheduleCompaction(instantTime, writeClient, cfg); + moveCompactionFromRequestedToInflight(instantTime, cfg); + } + + // When: a commit happens after 1000s. The assumption is that there won't be any new compaction scheduled within 100s, but the previous failed one will be + // rolled back and retried to move it to completion. + HoodieWriteConfig inlineCfg = getConfigForInlineCompaction(5, 1000, CompactionTriggerStrategy.TIME_ELAPSED); + String instantTime2; + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(inlineCfg)) { + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + instantTime2 = HoodieActiveTimeline.createNewInstantTime(); + createNextDeltaCommit(instantTime2, dataGen.generateUpdates(instantTime2, 10), writeClient, metaClient, inlineCfg, false); + } + + // Then: 1 delta commit is done, the failed compaction is retried + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + // 2 delta commits at the beginning. 1 compaction, 1 delta commit following it. 
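+ // (Expected write timeline, per the setup above: two initial delta commits, the retried compaction commit at instantTime, and the final delta commit at instantTime2, hence 4 instants.)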
+ assertEquals(4, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + assertEquals(instantTime, metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().firstInstant().get().getTimestamp()); + } + + @Test + public void testCompactionRetryOnFailureBasedOnNumAndTime() throws Exception { + // Given: two commits, schedule compaction and its failed/in-flight + HoodieWriteConfig cfg = getConfigBuilder(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(false) + .withMaxDeltaSecondsBeforeCompaction(1) + .withMaxNumDeltaCommitsBeforeCompaction(1) + .withInlineCompactionTriggerStrategy(CompactionTriggerStrategy.NUM_AND_TIME).build()) + .build(); + String instantTime; + List instants = IntStream.range(0, 2).mapToObj(i -> HoodieActiveTimeline.createNewInstantTime()).collect(Collectors.toList()); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) { + List records = dataGen.generateInserts(instants.get(0), 10); + SparkRDDReadClient readClient = getHoodieReadClient(cfg.getBasePath()); runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); - // Schedule compaction 002, make it in-flight (simulates inline compaction failing) - scheduleCompaction("002", writeClient, cfg); - moveCompactionFromRequestedToInflight("002", cfg); + // Schedule compaction instantTime, make it in-flight (simulates inline compaction failing) + instantTime = HoodieActiveTimeline.createNewInstantTime(); + scheduleCompaction(instantTime, writeClient, cfg); + moveCompactionFromRequestedToInflight(instantTime, cfg); } // When: a third commit happens - HoodieWriteConfig inlineCfg = getConfigForInlineCompaction(2); + HoodieWriteConfig inlineCfg = getConfigForInlineCompaction(3, 20, CompactionTriggerStrategy.NUM_OR_TIME); + String instantTime2; try (SparkRDDWriteClient writeClient = getHoodieWriteClient(inlineCfg)) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); - createNextDeltaCommit("003", dataGen.generateUpdates("003", 100), writeClient, metaClient, inlineCfg, false); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + instantTime2 = HoodieActiveTimeline.createNewInstantTime(); + createNextDeltaCommit(instantTime2, dataGen.generateUpdates(instantTime2, 10), writeClient, metaClient, inlineCfg, false); } // Then: 1 delta commit is done, the failed compaction is retried - metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath()); - assertEquals(4, metaClient.getActiveTimeline().getCommitsAndCompactionTimeline().countInstants()); - assertEquals("002", metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().firstInstant().get().getTimestamp()); + metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); + assertEquals(4, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); + assertEquals(instantTime, metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().firstInstant().get().getTimestamp()); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java index d140b1183aacf..319d6ea031eb1 100644 --- 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java @@ -20,14 +20,15 @@ import org.apache.hudi.avro.model.HoodieCompactionOperation; import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.Test; import java.text.SimpleDateFormat; @@ -48,8 +49,8 @@ public class TestHoodieCompactionStrategy { private static final long MB = 1024 * 1024L; - private String[] partitionPaths = {"2017/01/01", "2017/01/02", "2017/01/03"}; private static final Random RANDOM = new Random(); + private String[] partitionPaths = {"2017/01/01", "2017/01/02", "2017/01/03"}; @Test public void testUnBounded() { @@ -75,7 +76,7 @@ public void testBoundedIOSimple() { sizesMap.put(90 * MB, Collections.singletonList(1024 * MB)); BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy(); HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build()) + HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build()) .build(); List operations = createCompactionOperations(writeConfig, sizesMap); List returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>()); @@ -98,19 +99,20 @@ public void testLogFileSizeCompactionSimple() { sizesMap.put(90 * MB, Collections.singletonList(1024 * MB)); LogFileSizeBasedCompactionStrategy strategy = new LogFileSizeBasedCompactionStrategy(); HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig( - HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build()) + HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(1205) + .withLogFileSizeThresholdBasedCompaction(100 * 1024 * 1024).build()) .build(); List operations = createCompactionOperations(writeConfig, sizesMap); List returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>()); assertTrue(returned.size() < operations.size(), "LogFileSizeBasedCompactionStrategy should have resulted in fewer compactions"); - assertEquals(1, returned.size(), "LogFileSizeBasedCompactionStrategy should have resulted in 1 compaction"); + assertEquals(2, returned.size(), "LogFileSizeBasedCompactionStrategy should have resulted in 2 compaction"); // Total size of all the log files Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)) .map(Double::longValue).reduce(Long::sum).orElse(0L); - assertEquals(1204, (long) returnedSize, - "Should chose the first 2 compactions which should result in a total IO of 690 MB"); + assertEquals(1594, (long) returnedSize, + "Should chose the first 2 compactions which should result in a total 
IO of 1594 MB"); } @Test @@ -121,7 +123,7 @@ public void testDayBasedCompactionSimple() { sizesMap.put(100 * MB, Collections.singletonList(MB)); sizesMap.put(90 * MB, Collections.singletonList(1024 * MB)); - Map keyToPartitionMap = Collections.unmodifiableMap(new HashMap() { + Map keyToPartitionMap = Collections.unmodifiableMap(new HashMap() { { put(120 * MB, partitionPaths[2]); put(110 * MB, partitionPaths[2]); @@ -141,10 +143,10 @@ public void testDayBasedCompactionSimple() { "DayBasedCompactionStrategy should have resulted in fewer compactions"); assertEquals(2, returned.size(), "DayBasedCompactionStrategy should have resulted in fewer compactions"); - int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), + int comparison = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), returned.get(0).getPartitionPath()); // Either the partition paths are sorted in descending order or they are equal - assertTrue(comparision >= 0, "DayBasedCompactionStrategy should sort partitions in descending order"); + assertTrue(comparison >= 0, "DayBasedCompactionStrategy should sort partitions in descending order"); } @Test @@ -167,7 +169,7 @@ public void testBoundedPartitionAwareCompactionSimple() { String currentDayPlus1 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(1)); String currentDayPlus5 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(5)); - Map keyToPartitionMap = Collections.unmodifiableMap(new HashMap() { + Map keyToPartitionMap = Collections.unmodifiableMap(new HashMap() { { put(120 * MB, currentDay); put(110 * MB, currentDayMinus1); @@ -190,10 +192,10 @@ public void testBoundedPartitionAwareCompactionSimple() { assertEquals(5, returned.size(), "BoundedPartitionAwareCompactionStrategy should have resulted in fewer compactions"); - int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), + int comparison = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), returned.get(0).getPartitionPath()); // Either the partition paths are sorted in descending order or they are equal - assertTrue(comparision >= 0, "BoundedPartitionAwareCompactionStrategy should sort partitions in descending order"); + assertTrue(comparison >= 0, "BoundedPartitionAwareCompactionStrategy should sort partitions in descending order"); } @Test @@ -216,7 +218,7 @@ public void testUnboundedPartitionAwareCompactionSimple() { String currentDayPlus1 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(1)); String currentDayPlus5 = format.format(BoundedPartitionAwareCompactionStrategy.getDateAtOffsetFromToday(5)); - Map keyToPartitionMap = Collections.unmodifiableMap(new HashMap() { + Map keyToPartitionMap = Collections.unmodifiableMap(new HashMap() { { put(120 * MB, currentDay); put(110 * MB, currentDayMinus1); @@ -241,8 +243,44 @@ public void testUnboundedPartitionAwareCompactionSimple() { "BoundedPartitionAwareCompactionStrategy should have resulted in 1 compaction"); } + @Test + public void testLogFileLengthBasedCompactionStrategy() { + Map> sizesMap = new HashMap<>(); + sizesMap.put(120 * MB, Arrays.asList(60 * MB, 10 * MB, 80 * MB)); + sizesMap.put(110 * MB, new ArrayList<>()); + sizesMap.put(100 * MB, Collections.singletonList(2048 * MB)); + sizesMap.put(90 * MB, Arrays.asList(512 * MB, 512 * MB)); + LogFileNumBasedCompactionStrategy strategy = new 
LogFileNumBasedCompactionStrategy(); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig( + HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(1024) + .withCompactionLogFileNumThreshold(2).build()) + .build(); + List operations = createCompactionOperations(writeConfig, sizesMap); + List returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>()); + + assertTrue(returned.size() < operations.size(), + "LogFileLengthBasedCompactionStrategy should have resulted in fewer compactions"); + assertEquals(2, returned.size(), "LogFileLengthBasedCompactionStrategy should have resulted in 2 compactions"); + + // Delta log file count + Integer allFileLength = returned.stream().map(s -> s.getDeltaFilePaths().size()) + .reduce(Integer::sum).orElse(0); + + assertEquals(5, allFileLength); + assertEquals(3, returned.get(0).getDeltaFilePaths().size()); + assertEquals(2, returned.get(1).getDeltaFilePaths().size()); + // Total size of all the log files + Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)) + .map(Double::longValue).reduce(Long::sum).orElse(0L); + // TOTAL_IO_MB: ( 120 + 90 ) * 2 + 512 + 512 + 60 + 10 + 80 + assertEquals(1594, (long) returnedSize, + "Should choose the first 2 compactions which should result in a total IO of 1594 MB"); + + + } + private List createCompactionOperations(HoodieWriteConfig config, - Map> sizesMap) { + Map> sizesMap) { Map keyToPartitionMap = sizesMap.keySet().stream() .map(e -> Pair.of(e, partitionPaths[RANDOM.nextInt(partitionPaths.length - 1)])) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); @@ -250,17 +288,20 @@ private List createCompactionOperations(HoodieWriteCo } private List createCompactionOperations(HoodieWriteConfig config, - Map> sizesMap, Map keyToPartitionMap) { + Map> sizesMap, Map keyToPartitionMap) { List operations = new ArrayList<>(sizesMap.size()); sizesMap.forEach((k, v) -> { HoodieBaseFile df = TestHoodieBaseFile.newDataFile(k); String partitionPath = keyToPartitionMap.get(k); List logFiles = v.stream().map(TestHoodieLogFile::newLogFile).collect(Collectors.toList()); + FileSlice slice = new FileSlice(new HoodieFileGroupId(partitionPath, df.getFileId()), df.getCommitTime()); + slice.setBaseFile(df); + logFiles.stream().forEach(f -> slice.addLogFile(f)); operations.add(new HoodieCompactionOperation(df.getCommitTime(), logFiles.stream().map(s -> s.getPath().toString()).collect(Collectors.toList()), df.getPath(), df.getFileId(), partitionPath, - config.getCompactionStrategy().captureMetrics(config, Option.of(df), partitionPath, logFiles), + config.getCompactionStrategy().captureMetrics(config, slice), df.getBootstrapBaseFile().map(BaseFile::getPath).orElse(null)) ); }); @@ -272,7 +313,7 @@ public static class TestHoodieBaseFile extends HoodieBaseFile { private final long size; public TestHoodieBaseFile(long size) { - super("/tmp/XYXYXYXYXYYX_11_20180918020003.parquet"); + super("/tmp/XYXYXYXYXYYX_11_20180918020003" + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()); this.size = size; } @@ -303,10 +344,11 @@ public long getFileSize() { public static class TestHoodieLogFile extends HoodieLogFile { + private static int version = 0; private final long size; public TestHoodieLogFile(long size) { - super("/tmp/.ce481ee7-9e53-4a2e-9992-f9e295fa79c0_20180919184844.log.1"); + super("/tmp/.ce481ee7-9e53-4a2e-999-f9e295fa79c0_20180919184844.log." 
+ version++); this.size = size; } @@ -314,11 +356,6 @@ public static HoodieLogFile newLogFile(long size) { return new TestHoodieLogFile(size); } - @Override - public Path getPath() { - return new Path("/tmp/test-log"); - } - @Override public long getFileSize() { return size; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java index eb0e8711a484a..33a1c58a3a991 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java @@ -18,14 +18,17 @@ package org.apache.hudi.table.action.rollback; +import org.apache.hudi.client.HoodieWriteResult; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.SyncableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.Assertions; @@ -35,6 +38,7 @@ import java.io.IOException; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; @@ -49,7 +53,7 @@ protected void twoUpsertCommitDataWithTwoPartitions(List firstPartiti //just generate two partitions dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); //1. prepare data - HoodieTestDataGenerator.writePartitionMetadata(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); SparkRDDWriteClient client = getHoodieWriteClient(cfg); /** * Write 1 (only inserts) @@ -75,18 +79,18 @@ protected void twoUpsertCommitDataWithTwoPartitions(List firstPartiti } - //2. assert filegroup and get the first partition fileslice + //2. assert file group and get the first partition file slice HoodieTable table = this.getHoodieTable(metaClient, cfg); SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient()); List firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); assertEquals(1, firstPartitionCommit2FileGroups.size()); firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); - //3. assert filegroup and get the second partition fileslice + //3. 
assert file group and get the second partition file slice List secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); assertEquals(1, secondPartitionCommit2FileGroups.size()); secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); - //4. assert fileslice + //4. assert file slice HoodieTableType tableType = this.getTableType(); if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) { assertEquals(2, firstPartitionCommit2FileSlices.size()); @@ -96,4 +100,61 @@ protected void twoUpsertCommitDataWithTwoPartitions(List firstPartiti assertEquals(1, secondPartitionCommit2FileSlices.size()); } } + + protected void insertOverwriteCommitDataWithTwoPartitions(List firstPartitionCommit2FileSlices, + List secondPartitionCommit2FileSlices, + HoodieWriteConfig cfg, + boolean commitSecondInsertOverwrite) throws IOException { + //just generate two partitions + dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + /** + * Write 1 (upsert) + */ + String newCommitTime = "001"; + List records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2); + JavaRDD writeRecords = jsc.parallelize(records, 1); + client.startCommitWithTime(newCommitTime); + JavaRDD statuses = client.upsert(writeRecords, newCommitTime); + Assertions.assertNoWriteErrors(statuses.collect()); + client.commit(newCommitTime, statuses); + + // get fileIds written + HoodieTable table = this.getHoodieTable(metaClient, cfg); + SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient()); + List firstPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, firstPartitionCommit1FileGroups.size()); + Set partition1Commit1FileIds = firstPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet()); + List secondPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, secondPartitionCommit1FileGroups.size()); + Set partition2Commit1FileIds = secondPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet()); + + /** + * Write 2 (one insert_overwrite) + */ + String commitActionType = HoodieTimeline.REPLACE_COMMIT_ACTION; + newCommitTime = "002"; + records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2); + writeRecords = jsc.parallelize(records, 1); + client.startCommitWithTime(newCommitTime, commitActionType); + HoodieWriteResult result = client.insertOverwrite(writeRecords, newCommitTime); + statuses = result.getWriteStatuses(); + Assertions.assertNoWriteErrors(statuses.collect()); + if (commitSecondInsertOverwrite) { + client.commit(newCommitTime, statuses, Option.empty(), commitActionType, result.getPartitionToReplaceFileIds()); + } + metaClient.reloadActiveTimeline(); + // get new fileIds written as part of insert_overwrite + fsView = getFileSystemViewWithUnCommittedSlices(metaClient); + List firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH) + .filter(fg -> 
!partition1Commit1FileIds.contains(fg.getFileGroupId().getFileId())).collect(Collectors.toList()); + firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); + List secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH) + .filter(fg -> !partition2Commit1FileIds.contains(fg.getFileGroupId().getFileId())).collect(Collectors.toList()); + secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); + + assertEquals(1, firstPartitionCommit2FileSlices.size()); + assertEquals(1, secondPartitionCommit2FileSlices.size()); + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java index e14dbf9c66142..237f06917824c 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java @@ -19,24 +19,38 @@ package org.apache.hudi.table.action.rollback; import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.HoodieRollbackStat; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.testutils.Assertions; +import org.apache.hadoop.fs.FileSystem; +import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; +import org.mockito.Mockito; +import org.mockito.MockitoAnnotations; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -44,10 +58,12 @@ import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import static 
org.mockito.ArgumentMatchers.any; public class TestCopyOnWriteRollbackActionExecutor extends HoodieClientRollbackTestBase { @BeforeEach @@ -68,7 +84,7 @@ public void testCopyOnWriteRollbackActionExecutorForFileListingAsGenerateFile() final String p1 = "2015/03/16"; final String p2 = "2015/03/17"; final String p3 = "2016/03/15"; - // Let's create some commit files and parquet files + // Let's create some commit files and base files HoodieTestTable testTable = HoodieTestTable.of(metaClient) .withPartitionMetaFiles(p1, p2, p3) .addCommit("001") @@ -79,13 +95,18 @@ public void testCopyOnWriteRollbackActionExecutorForFileListingAsGenerateFile() .withBaseFilesInPartition(p1, "id21") .withBaseFilesInPartition(p2, "id22"); - HoodieTable table = this.getHoodieTable(metaClient, getConfig()); + HoodieWriteConfig writeConfig = getConfigBuilder().withRollbackUsingMarkers(false).build(); + HoodieTable table = this.getHoodieTable(metaClient, writeConfig); HoodieInstant needRollBackInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "002"); - + String rollbackInstant = "003"; // execute CopyOnWriteRollbackActionExecutor with filelisting mode - SparkCopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new SparkCopyOnWriteRollbackActionExecutor(context, table.getConfig(), table, "003", needRollBackInstant, true); - assertFalse(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - List hoodieRollbackStats = copyOnWriteRollbackActionExecutor.executeRollback(); + BaseRollbackPlanActionExecutor copyOnWriteRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, table.getConfig(), table, rollbackInstant, needRollBackInstant, false, + table.getConfig().shouldRollbackUsingMarkers()); + HoodieRollbackPlan rollbackPlan = (HoodieRollbackPlan) copyOnWriteRollbackPlanActionExecutor.execute().get(); + CopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new CopyOnWriteRollbackActionExecutor(context, table.getConfig(), table, "003", needRollBackInstant, true, + false); + List hoodieRollbackStats = copyOnWriteRollbackActionExecutor.executeRollback(rollbackPlan); // assert hoodieRollbackStats assertEquals(hoodieRollbackStats.size(), 3); @@ -96,14 +117,14 @@ public void testCopyOnWriteRollbackActionExecutorForFileListingAsGenerateFile() assertEquals(0, stat.getFailedDeleteFiles().size()); assertEquals(Collections.EMPTY_MAP, stat.getCommandBlocksCount()); assertEquals(testTable.forCommit("002").getBaseFilePath(p1, "id21").toString(), - stat.getSuccessDeleteFiles().get(0)); + this.fs.getScheme() + ":" + stat.getSuccessDeleteFiles().get(0)); break; case p2: assertEquals(1, stat.getSuccessDeleteFiles().size()); assertEquals(0, stat.getFailedDeleteFiles().size()); assertEquals(Collections.EMPTY_MAP, stat.getCommandBlocksCount()); assertEquals(testTable.forCommit("002").getBaseFilePath(p2, "id22").toString(), - stat.getSuccessDeleteFiles().get(0)); + this.fs.getScheme() + ":" + stat.getSuccessDeleteFiles().get(0)); break; case p3: assertEquals(0, stat.getSuccessDeleteFiles().size()); @@ -119,12 +140,79 @@ public void testCopyOnWriteRollbackActionExecutorForFileListingAsGenerateFile() assertTrue(testTable.commitExists("001")); assertTrue(testTable.baseFileExists(p1, "001", "id11")); assertTrue(testTable.baseFileExists(p2, "001", "id12")); - assertFalse(testTable.inflightCommitExists("002")); + // Note that executeRollback() does not delete inflight instant files + // The deletion is done in finishRollback() called by 
runRollback() + assertTrue(testTable.inflightCommitExists("002")); assertFalse(testTable.commitExists("002")); assertFalse(testTable.baseFileExists(p1, "002", "id21")); assertFalse(testTable.baseFileExists(p2, "002", "id22")); } + @Test + public void testListBasedRollbackStrategy() throws Exception { + //just generate two partitions + dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH}); + HoodieWriteConfig cfg = getConfigBuilder().withRollbackUsingMarkers(false).build(); + // 1. prepare data + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 3); + JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD statuses = client.upsert(writeRecords, newCommitTime); + Assertions.assertNoWriteErrors(statuses.collect()); + + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime); + Assertions.assertNoWriteErrors(statuses.collect()); + + context = new HoodieSparkEngineContext(jsc); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = this.getHoodieTable(metaClient, cfg); + HoodieInstant needRollBackInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "002"); + String rollbackInstant = "003"; + + ListingBasedRollbackStrategy rollbackStrategy = new ListingBasedRollbackStrategy(table, context, table.getConfig(), rollbackInstant); + List rollBackRequests = rollbackStrategy.getRollbackRequests(needRollBackInstant); + rollBackRequests.forEach(entry -> System.out.println(" " + entry.getPartitionPath() + ", " + entry.getFileId() + " " + + Arrays.toString(entry.getFilesToBeDeleted().toArray()))); + + HoodieRollbackRequest rollbackRequest = rollBackRequests.stream().filter(entry -> entry.getPartitionPath().equals(DEFAULT_FIRST_PARTITION_PATH)).findFirst().get(); + + FileSystem fs = Mockito.mock(FileSystem.class); + MockitoAnnotations.initMocks(this); + + // mock to throw exception when fs.exists() is invoked + System.out.println("Fs.exists() call for " + rollbackRequest.getFilesToBeDeleted().get(0).toString()); + Mockito.when(fs.exists(any())) + .thenThrow(new IOException("Failing exists call for " + rollbackRequest.getFilesToBeDeleted().get(0))); + + rollbackStrategy = new ListingBasedRollbackStrategy(table, context, cfg, rollbackInstant); + List rollBackRequestsUpdated = rollbackStrategy.getRollbackRequests(needRollBackInstant); + rollBackRequestsUpdated.forEach(entry -> System.out.println(" " + entry.getPartitionPath() + ", " + entry.getFileId() + " " + + Arrays.toString(entry.getFilesToBeDeleted().toArray()))); + + assertEquals(rollBackRequests, rollBackRequestsUpdated); + } + + + // Verify that rollback works with replacecommit + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testCopyOnWriteRollbackWithReplaceCommits(boolean isUsingMarkers) throws IOException { + //1. 
prepare data and assert data result + List firstPartitionCommit2FileSlices = new ArrayList<>(); + List secondPartitionCommit2FileSlices = new ArrayList<>(); + HoodieWriteConfig cfg = getConfigBuilder().withRollbackUsingMarkers(isUsingMarkers).withAutoCommit(false).build(); + this.insertOverwriteCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, !isUsingMarkers); + HoodieTable table = this.getHoodieTable(metaClient, cfg); + performRollbackAndValidate(isUsingMarkers, cfg, table, firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices); + } + @ParameterizedTest @ValueSource(booleans = {true, false}) public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws IOException { @@ -133,8 +221,14 @@ public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws List secondPartitionCommit2FileSlices = new ArrayList<>(); HoodieWriteConfig cfg = getConfigBuilder().withRollbackUsingMarkers(isUsingMarkers).withAutoCommit(false).build(); this.twoUpsertCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, !isUsingMarkers); + metaClient.reloadActiveTimeline(); HoodieTable table = this.getHoodieTable(metaClient, cfg); + performRollbackAndValidate(isUsingMarkers, cfg, table, firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices); + } + private void performRollbackAndValidate(boolean isUsingMarkers, HoodieWriteConfig cfg, HoodieTable table, + List firstPartitionCommit2FileSlices, + List secondPartitionCommit2FileSlices) throws IOException { //2. rollback HoodieInstant commitInstant; if (isUsingMarkers) { @@ -143,12 +237,12 @@ public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws commitInstant = table.getCompletedCommitTimeline().lastInstant().get(); } - SparkCopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new SparkCopyOnWriteRollbackActionExecutor(context, cfg, table, "003", commitInstant, false); - if (!isUsingMarkers) { - assertFalse(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } else { - assertTrue(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } + BaseRollbackPlanActionExecutor copyOnWriteRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, table.getConfig(), table, "003", commitInstant, false, + table.getConfig().shouldRollbackUsingMarkers()); + HoodieRollbackPlan hoodieRollbackPlan = (HoodieRollbackPlan) copyOnWriteRollbackPlanActionExecutor.execute().get(); + CopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new CopyOnWriteRollbackActionExecutor(context, cfg, table, "003", commitInstant, false, + false); Map rollbackMetadata = copyOnWriteRollbackActionExecutor.execute().getPartitionMetadata(); //3. assert the rollback stat @@ -156,9 +250,9 @@ public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws for (Map.Entry entry : rollbackMetadata.entrySet()) { HoodieRollbackPartitionMetadata meta = entry.getValue(); assertTrue(meta.getFailedDeleteFiles() == null - || meta.getFailedDeleteFiles().size() == 0); + || meta.getFailedDeleteFiles().size() == 0); assertTrue(meta.getSuccessDeleteFiles() == null - || meta.getSuccessDeleteFiles().size() == 1); + || meta.getSuccessDeleteFiles().size() == 1); } //4. 
assert filegroup after rollback, and compare to the rollbackstat @@ -168,15 +262,11 @@ public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws List firstPartitionRollBack1FileSlices = firstPartitionRollBack1FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()); assertEquals(1, firstPartitionRollBack1FileSlices.size()); - if (!isUsingMarkers) { - firstPartitionCommit2FileSlices.removeAll(firstPartitionRollBack1FileSlices); - assertEquals(1, firstPartitionCommit2FileSlices.size()); - assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0)); - } else { - assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - String.format("%s:%s/%s", this.fs.getScheme(), basePath, rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0))); - } + firstPartitionCommit2FileSlices.removeAll(firstPartitionRollBack1FileSlices); + assertEquals(1, firstPartitionCommit2FileSlices.size()); + assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + this.fs.getScheme() + ":" + rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0)); + // assert the second partition file group and file slice List secondPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); @@ -185,16 +275,11 @@ public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws assertEquals(1, secondPartitionRollBack1FileSlices.size()); // assert the second partition rollback file is equals rollBack1SecondPartitionStat - if (!isUsingMarkers) { - secondPartitionCommit2FileSlices.removeAll(secondPartitionRollBack1FileSlices); - assertEquals(1, secondPartitionCommit2FileSlices.size()); - assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0)); - } else { - assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), - String.format("%s:%s/%s", this.fs.getScheme(), basePath, rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0))); - } + secondPartitionCommit2FileSlices.removeAll(secondPartitionRollBack1FileSlices); + assertEquals(1, secondPartitionCommit2FileSlices.size()); + assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + this.fs.getScheme() + ":" + rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0)); - assertFalse(new MarkerFiles(table, commitInstant.getTimestamp()).doesMarkerDirExist()); + assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, commitInstant.getTimestamp()).doesMarkerDirExist()); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMarkerBasedRollbackStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMarkerBasedRollbackStrategy.java deleted file mode 100644 index 7acff79ef50fb..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMarkerBasedRollbackStrategy.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.rollback; - -import org.apache.hudi.common.HoodieRollbackStat; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.IOType; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.testutils.HoodieClientTestBase; - -import org.apache.hadoop.fs.FileStatus; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.util.List; -import java.util.stream.Stream; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class TestMarkerBasedRollbackStrategy extends HoodieClientTestBase { - - @BeforeEach - public void setUp() throws Exception { - initPath(); - initSparkContexts(); - initFileSystem(); - initMetaClient(); - initDFS(); - } - - @AfterEach - public void tearDown() throws Exception { - cleanupResources(); - } - - @Test - public void testCopyOnWriteRollback() throws Exception { - // given: wrote some base files and corresponding markers - HoodieTestTable testTable = HoodieTestTable.of(metaClient); - String f0 = testTable.addRequestedCommit("000") - .getFileIdsWithBaseFilesInPartitions("partA").get("partA"); - String f1 = testTable.addCommit("001") - .withBaseFilesInPartition("partA", f0) - .getFileIdsWithBaseFilesInPartitions("partB").get("partB"); - String f2 = "f2"; - testTable.forCommit("001") - .withMarkerFile("partA", f0, IOType.MERGE) - .withMarkerFile("partB", f1, IOType.CREATE) - .withMarkerFile("partA", f2, IOType.CREATE); - - // when - List stats = new SparkMarkerBasedRollbackStrategy(HoodieSparkTable.create(getConfig(), context, metaClient), context, getConfig(), "002") - .execute(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001")); - - // then: ensure files are deleted correctly, non-existent files reported as failed deletes - assertEquals(2, stats.size()); - - FileStatus[] partAFiles = testTable.listAllFilesInPartition("partA"); - FileStatus[] partBFiles = testTable.listAllFilesInPartition("partB"); - - assertEquals(0, partBFiles.length); - assertEquals(1, partAFiles.length); - assertEquals(2, stats.stream().mapToInt(r -> r.getSuccessDeleteFiles().size()).sum()); - assertEquals(1, stats.stream().mapToInt(r -> r.getFailedDeleteFiles().size()).sum()); - } - - @Test - public void testMergeOnReadRollback() throws Exception { - // given: wrote some base + log files and corresponding markers - HoodieTestTable testTable = HoodieTestTable.of(metaClient); - String f2 = testTable.addRequestedDeltaCommit("000") - 
.getFileIdsWithBaseFilesInPartitions("partA").get("partA"); - String f1 = testTable.addDeltaCommit("001") - .withLogFile("partA", f2) - .getFileIdsWithBaseFilesInPartitions("partB").get("partB"); - String f3 = "f3"; - String f4 = "f4"; - testTable.forDeltaCommit("001") - .withMarkerFile("partB", f1, IOType.CREATE) - .withMarkerFile("partA", f3, IOType.CREATE) - .withMarkerFile("partA", f2, IOType.APPEND) - .withMarkerFile("partB", f4, IOType.APPEND); - - // when - List stats = new SparkMarkerBasedRollbackStrategy(HoodieSparkTable.create(getConfig(), context, metaClient), context, getConfig(), "002") - .execute(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "001")); - - // then: ensure files are deleted, rollback block is appended (even if append does not exist) - assertEquals(2, stats.size()); - // will have the log file - FileStatus[] partBFiles = testTable.listAllFilesInPartition("partB"); - assertEquals(1, partBFiles.length); - assertTrue(partBFiles[0].getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())); - assertTrue(partBFiles[0].getLen() > 0); - - FileStatus[] partAFiles = testTable.listAllFilesInPartition("partA"); - assertEquals(3, partAFiles.length); - assertEquals(2, Stream.of(partAFiles).filter(s -> s.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())).count()); - assertEquals(1, Stream.of(partAFiles).filter(s -> s.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())).filter(f -> f.getLen() > 0).count()); - - // only partB/f1_001 will be deleted - assertEquals(1, stats.stream().mapToInt(r -> r.getSuccessDeleteFiles().size()).sum()); - // partA/f3_001 is non existent - assertEquals(1, stats.stream().mapToInt(r -> r.getFailedDeleteFiles().size()).sum()); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java index f64ce8be7544a..1c4de34e5ee38 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java @@ -19,19 +19,35 @@ package org.apache.hudi.table.action.rollback; import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.table.view.SyncableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import 
org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; +import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -39,9 +55,11 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; @@ -60,7 +78,7 @@ public void setUp() throws Exception { initPath(); initSparkContexts(); //just generate tow partitions - dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); + dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); initFileSystem(); initMetaClient(); } @@ -71,7 +89,7 @@ public void tearDown() throws Exception { } @ParameterizedTest - @ValueSource(booleans = {true, false}) + @ValueSource(booleans = {true}) public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws IOException { //1. prepare data and assert data result List firstPartitionCommit2FileSlices = new ArrayList<>(); @@ -88,31 +106,29 @@ public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws //2. rollback HoodieInstant rollBackInstant = new HoodieInstant(isUsingMarkers, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); - SparkMergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new SparkMergeOnReadRollbackActionExecutor( + BaseRollbackPlanActionExecutor mergeOnReadRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, cfg, table, "003", rollBackInstant, false, + cfg.shouldRollbackUsingMarkers()); + mergeOnReadRollbackPlanActionExecutor.execute().get(); + MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor( context, cfg, table, "003", rollBackInstant, - true); - // assert is filelist mode - if (!isUsingMarkers) { - assertFalse(mergeOnReadRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } else { - assertTrue(mergeOnReadRollbackActionExecutor.getRollbackStrategy() instanceof SparkMarkerBasedRollbackStrategy); - } - + true, + false); //3. 
assert the rollback stat Map rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata(); assertEquals(2, rollbackMetadata.size()); for (Map.Entry entry : rollbackMetadata.entrySet()) { HoodieRollbackPartitionMetadata meta = entry.getValue(); - assertTrue(meta.getFailedDeleteFiles() == null || meta.getFailedDeleteFiles().size() == 0); - assertTrue(meta.getSuccessDeleteFiles() == null || meta.getSuccessDeleteFiles().size() == 0); + assertEquals(0, meta.getFailedDeleteFiles().size()); + assertEquals(0, meta.getSuccessDeleteFiles().size()); } - //4. assert filegroup after rollback, and compare to the rollbackstat + //4. assert file group after rollback, and compare to the rollbackstat // assert the first partition data and log file size List firstPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); assertEquals(1, firstPartitionRollBack1FileGroups.size()); @@ -137,22 +153,156 @@ public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws secondPartitionRollBackLogFiles.removeAll(secondPartitionCommit2LogFiles); assertEquals(1, secondPartitionRollBackLogFiles.size()); - assertFalse(new MarkerFiles(table, "002").doesMarkerDirExist()); + assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, "002").doesMarkerDirExist()); } @Test - public void testFailForCompletedInstants() { - Assertions.assertThrows(IllegalArgumentException.class, () -> { - HoodieInstant rollBackInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); - new SparkMergeOnReadRollbackActionExecutor( - context, - getConfigBuilder().build(), - getHoodieTable(metaClient, getConfigBuilder().build()), - "003", - rollBackInstant, - true, - true, - true); + public void testRollbackForCanIndexLogFile() throws IOException { + cleanupResources(); + setUpDFS(); + //1. prepare data and assert data result + //just generate one partitions + dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH}); + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withWriteStatusClass(MetadataMergeWriteStatus.class) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + .forTable("test-trip-table") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()) + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withEnableBackupForRemoteFileSystemView(false) // Fail test if problem connecting to timeline-server + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()).withRollbackUsingMarkers(false).withAutoCommit(false).build(); + + //1. 
prepare data + new HoodieTestDataGenerator().writePartitionMetadata(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH}, basePath); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + // Write 1 (only inserts) + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH); + JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD statuses = client.upsert(writeRecords, newCommitTime); + org.apache.hudi.testutils.Assertions.assertNoWriteErrors(statuses.collect()); + client.commit(newCommitTime, statuses); + + // check fileSlice + HoodieTable table = this.getHoodieTable(metaClient, cfg); + SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient()); + List firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, firstPartitionCommit2FileGroups.size()); + assertEquals(1, (int) firstPartitionCommit2FileGroups.get(0).getAllFileSlices().count()); + assertFalse(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getBaseFile().isPresent()); + assertEquals(1, firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getLogFiles().count()); + String generatedFileID = firstPartitionCommit2FileGroups.get(0).getFileGroupId().getFileId(); + + // check hoodieCommitMeta + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + table.getMetaClient().getCommitTimeline() + .getInstantDetails(new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "001")) + .get(), + HoodieCommitMetadata.class); + List firstPartitionWriteStat = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH); + assertEquals(2, firstPartitionWriteStat.size()); + // we have an empty writeStat for all partition + assert firstPartitionWriteStat.stream().anyMatch(wStat -> StringUtils.isNullOrEmpty(wStat.getFileId())); + // we have one non-empty writeStat which must contains update or insert + assertEquals(1, firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).count()); + firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).forEach(wStat -> { + assert wStat.getNumInserts() > 0; }); + + // Write 2 (inserts) + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + List updateRecords = Collections.singletonList(dataGen.generateUpdateRecord(records.get(0).getKey(), newCommitTime)); + List insertRecordsInSamePartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH); + List insertRecordsInOtherPartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_SECOND_PARTITION_PATH); + List recordsToBeWrite = Stream.concat(Stream.concat(updateRecords.stream(), insertRecordsInSamePartition.stream()), insertRecordsInOtherPartition.stream()) + .collect(Collectors.toList()); + writeRecords = jsc.parallelize(recordsToBeWrite, 1); + statuses = client.upsert(writeRecords, newCommitTime); + client.commit(newCommitTime, statuses); + table = this.getHoodieTable(metaClient, cfg); + commitMetadata = HoodieCommitMetadata.fromBytes( + table.getMetaClient().getCommitTimeline() + .getInstantDetails(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime)) + .get(), + HoodieCommitMetadata.class); + assert commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_FIRST_PARTITION_PATH); 
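The rollback performed a few lines below follows the two-step pattern this change introduces throughout these rollback tests: a BaseRollbackPlanActionExecutor first schedules a HoodieRollbackPlan, and only then does the rollback action executor carry it out. The sketch below is a minimal illustration of that call sequence; the constructor arguments mirror the calls visible in this diff, while the wrapper class and method are hypothetical, and context, config and table are assumed to be prepared by the test harness.

import java.util.Map;

import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata;
import org.apache.hudi.avro.model.HoodieRollbackPlan;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor;
import org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor;

class RollbackFlowSketch {

  // Sketch only: roll back an inflight delta commit the way the updated MOR tests do it.
  static Map<String, HoodieRollbackPartitionMetadata> rollbackDeltaCommit(
      HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
      String rollbackInstantTime, String commitToRollback) {
    HoodieInstant instantToRollback =
        new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, commitToRollback);

    // Step 1: schedule the rollback; execute() yields the HoodieRollbackPlan describing,
    // per file group, which files should be deleted or have rollback blocks appended.
    BaseRollbackPlanActionExecutor planExecutor = new BaseRollbackPlanActionExecutor(
        context, config, table, rollbackInstantTime, instantToRollback, false,
        config.shouldRollbackUsingMarkers());
    HoodieRollbackPlan plan = (HoodieRollbackPlan) planExecutor.execute().get();
    // (The COW test in this diff passes this plan straight into executeRollback(plan).)

    // Step 2: run the rollback itself and return the per-partition rollback metadata
    // (success/failed deletes, appended rollback log files) that the tests assert on.
    MergeOnReadRollbackActionExecutor rollbackExecutor = new MergeOnReadRollbackActionExecutor(
        context, config, table, rollbackInstantTime, instantToRollback, true, false);
    return rollbackExecutor.execute().getPartitionMetadata();
  }
}

In the MOR tests the plan returned in step 1 is discarded; the executor apparently picks up the scheduled plan on its own, whereas the COW variant feeds it directly to executeRollback.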
+ assert commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_SECOND_PARTITION_PATH); + List hoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH); + // Both update and insert record should enter same existing fileGroup due to small file handling + assertEquals(1, hoodieWriteStatOptionList.size()); + assertEquals(generatedFileID, hoodieWriteStatOptionList.get(0).getFileId()); + // check insert and update numbers + assertEquals(2, hoodieWriteStatOptionList.get(0).getNumInserts()); + assertEquals(1, hoodieWriteStatOptionList.get(0).getNumUpdateWrites()); + + List secondHoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_SECOND_PARTITION_PATH); + // All insert should enter one fileGroup + assertEquals(1, secondHoodieWriteStatOptionList.size()); + String fileIdInPartitionTwo = secondHoodieWriteStatOptionList.get(0).getFileId(); + assertEquals(2, hoodieWriteStatOptionList.get(0).getNumInserts()); + + // Rollback + HoodieInstant rollBackInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); + BaseRollbackPlanActionExecutor mergeOnReadRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, cfg, table, "003", rollBackInstant, false, + cfg.shouldRollbackUsingMarkers()); + mergeOnReadRollbackPlanActionExecutor.execute().get(); + MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor( + context, + cfg, + table, + "003", + rollBackInstant, + true, + false); + + //3. assert the rollback stat + Map rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata(); + assertEquals(2, rollbackMetadata.size()); + + //4. assert filegroup after rollback, and compare to the rollbackstat + // assert the first partition data and log file size + HoodieRollbackPartitionMetadata partitionMetadata = rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH); + assertTrue(partitionMetadata.getSuccessDeleteFiles().isEmpty()); + assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty()); + assertEquals(1, partitionMetadata.getRollbackLogFiles().size()); + + // assert the second partition data and log file size + partitionMetadata = rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH); + assertEquals(1, partitionMetadata.getSuccessDeleteFiles().size()); + assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty()); + assertTrue(partitionMetadata.getRollbackLogFiles().isEmpty()); + assertEquals(1, partitionMetadata.getSuccessDeleteFiles().size()); + } + + /** + * Test Cases for rolling back when there is no base file. 
+ */ + @Test + public void testRollbackWhenFirstCommitFail() throws Exception { + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withRollbackUsingMarkers(false) + .withPath(basePath).build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { + client.startCommitWithTime("001"); + client.insert(jsc.emptyRDD(), "001"); + client.rollback("001"); + } + } + + private void setUpDFS() throws IOException { + initDFS(); + initSparkContexts(); + //just generate two partitions + dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); + initFileSystem(); + initDFSMetaClient(); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestRollbackUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestRollbackUtils.java index 8db2069b07339..f03d9f3967df2 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestRollbackUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestRollbackUtils.java @@ -18,14 +18,16 @@ package org.apache.hudi.table.action.rollback; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.CollectionUtils; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; import org.junit.jupiter.api.Test; import java.util.HashMap; @@ -33,10 +35,11 @@ import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertIterableEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; public class TestRollbackUtils { + private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); private FileStatus generateFileStatus(String filePath) { Path dataFile1Path = new Path(filePath); @@ -62,15 +65,15 @@ public void testMergeRollbackStat() { String partitionPath2 = "/partitionPath2/"; //prepare HoodieRollbackStat for different partition Map dataFilesOnlyStat1Files = new HashMap<>(); - dataFilesOnlyStat1Files.put(generateFileStatus(partitionPath1 + "dataFile1.parquet"), true); - dataFilesOnlyStat1Files.put(generateFileStatus(partitionPath1 + "dataFile2.parquet"), true); + dataFilesOnlyStat1Files.put(generateFileStatus(partitionPath1 + "dataFile1" + BASE_FILE_EXTENSION), true); + dataFilesOnlyStat1Files.put(generateFileStatus(partitionPath1 + "dataFile2" + BASE_FILE_EXTENSION), true); HoodieRollbackStat dataFilesOnlyStat1 = HoodieRollbackStat.newBuilder() .withPartitionPath(partitionPath1) .withDeletedFileResults(dataFilesOnlyStat1Files).build(); Map dataFilesOnlyStat2Files = new HashMap<>(); - dataFilesOnlyStat2Files.put(generateFileStatus(partitionPath2 + "dataFile1.parquet"), true); - dataFilesOnlyStat2Files.put(generateFileStatus(partitionPath2 + "dataFile2.parquet"), true); + dataFilesOnlyStat2Files.put(generateFileStatus(partitionPath2 + "dataFile1" + 
BASE_FILE_EXTENSION), true); + dataFilesOnlyStat2Files.put(generateFileStatus(partitionPath2 + "dataFile2" + BASE_FILE_EXTENSION), true); HoodieRollbackStat dataFilesOnlyStat2 = HoodieRollbackStat.newBuilder() .withPartitionPath(partitionPath2) .withDeletedFileResults(dataFilesOnlyStat1Files).build(); @@ -83,7 +86,7 @@ public void testMergeRollbackStat() { //prepare HoodieRollbackStat for failed and block append Map dataFilesOnlyStat3Files = new HashMap<>(); dataFilesOnlyStat3Files.put(generateFileStatus(partitionPath1 + "dataFile1.log"), true); - dataFilesOnlyStat3Files.put(generateFileStatus(partitionPath1 + "dataFile3.parquet"), false); + dataFilesOnlyStat3Files.put(generateFileStatus(partitionPath1 + "dataFile3" + BASE_FILE_EXTENSION), false); HoodieRollbackStat dataFilesOnlyStat3 = HoodieRollbackStat.newBuilder() .withPartitionPath(partitionPath1) .withDeletedFileResults(dataFilesOnlyStat3Files).build(); @@ -98,10 +101,10 @@ public void testMergeRollbackStat() { HoodieRollbackStat dataFilesOnlyStatMerge1 = RollbackUtils.mergeRollbackStat(dataFilesOnlyStat1, dataFilesOnlyStat3); assertEquals(partitionPath1, dataFilesOnlyStatMerge1.getPartitionPath()); - assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile3.parquet"), + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile3" + BASE_FILE_EXTENSION), dataFilesOnlyStatMerge1.getFailedDeleteFiles()); - assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile1.parquet", - partitionPath1 + "dataFile2.parquet", partitionPath1 + "dataFile1.log").stream().sorted().collect(Collectors.toList()), + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile1" + BASE_FILE_EXTENSION, + partitionPath1 + "dataFile2" + BASE_FILE_EXTENSION, partitionPath1 + "dataFile1.log").stream().sorted().collect(Collectors.toList()), dataFilesOnlyStatMerge1.getSuccessDeleteFiles().stream().sorted().collect(Collectors.toList())); assertEquals(0, dataFilesOnlyStatMerge1.getCommandBlocksCount().size()); @@ -109,10 +112,10 @@ public void testMergeRollbackStat() { HoodieRollbackStat dataFilesOnlyStatMerge2 = RollbackUtils.mergeRollbackStat(dataFilesOnlyStatMerge1, dataFilesOnlyStat4); assertEquals(partitionPath1, dataFilesOnlyStatMerge1.getPartitionPath()); - assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile3.parquet").stream().sorted().collect(Collectors.toList()), + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile3" + BASE_FILE_EXTENSION).stream().sorted().collect(Collectors.toList()), dataFilesOnlyStatMerge2.getFailedDeleteFiles().stream().sorted().collect(Collectors.toList())); - assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile1.parquet", - partitionPath1 + "dataFile2.parquet", partitionPath1 + "dataFile1.log").stream().sorted().collect(Collectors.toList()), + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile1" + BASE_FILE_EXTENSION, + partitionPath1 + "dataFile2" + BASE_FILE_EXTENSION, partitionPath1 + "dataFile1.log").stream().sorted().collect(Collectors.toList()), dataFilesOnlyStatMerge2.getSuccessDeleteFiles().stream().sorted().collect(Collectors.toList())); assertEquals(CollectionUtils.createImmutableMap(generateFileStatus(partitionPath1 + "dataFile1.log"), 10L), dataFilesOnlyStatMerge2.getCommandBlocksCount()); diff --git 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanPlanExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanPlanExecutor.java new file mode 100644 index 0000000000000..9fcac64c002f1 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanPlanExecutor.java @@ -0,0 +1,614 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.functional; + +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.BootstrapFileMapping; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieMetadataTestTable; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.TestCleaner; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.util.Arrays; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static 
org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests covering different clean plan policies/strategies. + */ +public class TestCleanPlanExecutor extends TestCleaner { + + @Test + public void testInvalidCleaningTriggerStrategy() { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withIncrementalCleaningMode(true) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .withCleanBootstrapBaseFileEnabled(true) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2) + .withCleaningTriggerStrategy("invalid_strategy").build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()).build(); + Exception e = assertThrows(IllegalArgumentException.class, () -> runCleaner(config, true), "should fail when invalid trigger strategy is provided!"); + assertTrue(e.getMessage().contains("No enum constant org.apache.hudi.table.action.clean.CleaningTriggerStrategy.invalid_strategy")); + } + + private static Stream argumentsForTestKeepLatestCommits() { + return Stream.of( + Arguments.of(false, false, false, false), + Arguments.of(true, false, false, false), + Arguments.of(true, true, false, false), + Arguments.of(false, false, true, false), + Arguments.of(false, false, false, true) + ); + } + + /** + * Test HoodieTable.clean() Cleaning by commit logic for COW table. + */ + @ParameterizedTest + @MethodSource("argumentsForTestKeepLatestCommits") + public void testKeepLatestCommits( + boolean simulateFailureRetry, boolean simulateMetadataFailure, + boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withIncrementalCleaningMode(enableIncrementalClean) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(2) + .withMaxCommitsBeforeCleaning(2) + .build()).build(); + + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String p0 = "2020/01/01"; + String p1 = "2020/01/02"; + Map> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null; + + // make 1 commit, with 1 file per partition + String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() + : UUID.randomUUID().toString(); + String file1P1C0 = enableBootstrapSourceClean ? 
bootstrapMapping.get(p1).get(0).getFileId() + : UUID.randomUUID().toString(); + testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + + HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001", + Collections.unmodifiableMap(new HashMap>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0)); + put(p1, CollectionUtils.createImmutableList(file1P1C0)); + } + }) + ); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + List hoodieCleanStatsOne = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure, 2, true); + assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files"); + assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + Map partitionAndFileId002 = testTable.addInflightCommit("00000000000003").getFileIdsWithBaseFilesInPartitions(p0, p1); + String file2P0C1 = partitionAndFileId002.get(p0); + String file2P1C1 = partitionAndFileId002.get(p1); + testTable.forCommit("00000000000003").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + commitMetadata = generateCommitMetadata("00000000000003", new HashMap>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1)); + put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1)); + } + }); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000003"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + List hoodieCleanStatsTwo = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure, 4, true); + assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions and clean any files"); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file2P0C1)); + assertTrue(testTable.baseFileExists(p1, "00000000000003", file2P1C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + String file3P0C2 = testTable.addInflightCommit("00000000000005") + .withBaseFilesInPartition(p0, file1P0C0) + .withBaseFilesInPartition(p0, file2P0C1) + .getFileIdsWithBaseFilesInPartitions(p0).get(p0); + commitMetadata = generateCommitMetadata("00000000000003", + CollectionUtils.createImmutableMap( + p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file3P0C2))); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000005"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + List hoodieCleanStatsThree = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure, 6, true); + assertEquals(0, hoodieCleanStatsThree.size(), + "Must not clean any file. 
We have to keep 1 version before the latest commit time to keep"); + assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + String file4P0C3 = testTable.addInflightCommit("00000000000007") + .withBaseFilesInPartition(p0, file1P0C0) + .withBaseFilesInPartition(p0, file2P0C1) + .getFileIdsWithBaseFilesInPartitions(p0).get(p0); + commitMetadata = generateCommitMetadata("00000000000004", + CollectionUtils.createImmutableMap( + p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file4P0C3))); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000007"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + List hoodieCleanStatsFour = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure, 8, true); + // enableBootstrapSourceClean would delete the bootstrap base file as the same time + HoodieCleanStat partitionCleanStat = getCleanStat(hoodieCleanStatsFour, p0); + + assertEquals(enableBootstrapSourceClean ? 2 : 1, partitionCleanStat.getSuccessDeleteFiles().size() + + (partitionCleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 + : partitionCleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least one old file"); + assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file1P0C0)); + assertTrue(testTable.baseFileExists(p0, "00000000000005", file1P0C0)); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file2P0C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000005", file2P0C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000005", file3P0C2)); + assertTrue(testTable.baseFileExists(p0, "00000000000007", file4P0C3)); + if (enableBootstrapSourceClean) { + assertFalse(Files.exists(Paths.get(bootstrapMapping.get( + p0).get(0).getBootstrapFileStatus().getPath().getUri()))); + } + + metaClient = HoodieTableMetaClient.reload(metaClient); + + String file5P0C4 = testTable.addInflightCommit("00000000000009") + .withBaseFilesInPartition(p0, file1P0C0) + .withBaseFilesInPartition(p0, file2P0C1) + .getFileIdsWithBaseFilesInPartitions(p0).get(p0); + commitMetadata = generateCommitMetadata("00000000000009", CollectionUtils.createImmutableMap( + p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file5P0C4))); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000009"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + List hoodieCleanStatsFive = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure, 10, true); + + assertEquals(0, hoodieCleanStatsFive.size(), "Must not clean any files since at least 2 commits are needed from last clean operation before " + + "clean can be scheduled again"); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file1P0C0)); + assertTrue(testTable.baseFileExists(p0, "00000000000005", file1P0C0)); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file2P0C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000005", file2P0C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000005", file3P0C2)); + assertTrue(testTable.baseFileExists(p0, "00000000000007", file4P0C3)); + + // No cleaning on partially written file, with no commit. 
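+ // The instant "00000000000011" below is only created as REQUESTED and transitioned to INFLIGHT,
+ // never completed, so the cleaner is expected to ignore the partially written base file and
+ // report no clean stat for this partition.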
+ testTable.forCommit("00000000000011").withBaseFilesInPartition(p0, file3P0C2); + commitMetadata = generateCommitMetadata("00000000000011", CollectionUtils.createImmutableMap(p0, + CollectionUtils.createImmutableList(file3P0C2))); + metaClient.getActiveTimeline().createNewInstant( + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000011")); + metaClient.getActiveTimeline().transitionRequestedToInflight( + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000011"), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + List hoodieCleanStatsFive2 = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure, 12, true); + HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsFive2, p0); + assertNull(cleanStat, "Must not clean any files"); + assertTrue(testTable.baseFileExists(p0, "00000000000005", file3P0C2)); + assertTrue(testTable.baseFileExists(p0, "00000000000007", file4P0C3)); + } + + /** + * Test Hudi COW Table Cleaner - Keep the latest file versions policy. + */ + @Test + public void testKeepLatestFileVersions() throws Exception { + HoodieWriteConfig config = + HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) + .build(); + + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + + final String p0 = "2020/01/01"; + final String p1 = "2020/01/02"; + + // make 1 commit, with 1 file per partition + final String file1P0C0 = UUID.randomUUID().toString(); + final String file1P1C0 = UUID.randomUUID().toString(); + + Map>> c1PartitionToFilesNameLengthMap = new HashMap<>(); + c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100))); + c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200))); + testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1), + c1PartitionToFilesNameLengthMap, false, false); + + List hoodieCleanStatsOne = runCleanerWithInstantFormat(config, true); + assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files"); + assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + final String file2P0C1 = UUID.randomUUID().toString(); + final String file2P1C1 = UUID.randomUUID().toString(); + Map>> c2PartitionToFilesNameLengthMap = new HashMap<>(); + c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100))); + c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200))); + testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(), + c2PartitionToFilesNameLengthMap, false, false); + + // enableBootstrapSourceClean would delete the bootstrap base file at the same time + List hoodieCleanStatsTwo = runCleaner(config, 1, true); + HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0); + assertEquals(1, cleanStat.getSuccessDeleteFiles().size() + + 
(cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 + : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); + + cleanStat = getCleanStat(hoodieCleanStatsTwo, p1); + assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1)); + assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + assertEquals(1, cleanStat.getSuccessDeleteFiles().size() + + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 + : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); + + // make next commit, with 2 updates to existing files, and 1 insert + final String file3P0C2 = UUID.randomUUID().toString(); + Map>> c3PartitionToFilesNameLengthMap = new HashMap<>(); + c3PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 102), Pair.of(file2P0C1, 101), + Pair.of(file3P0C2, 100))); + testTable.doWriteOperation("00000000000003", WriteOperationType.UPSERT, Collections.emptyList(), + c3PartitionToFilesNameLengthMap, false, false); + + List hoodieCleanStatsThree = runCleaner(config, 3, true); + assertEquals(2, + getCleanStat(hoodieCleanStatsThree, p0) + .getSuccessDeleteFiles().size(), "Must clean two files"); + assertFalse(testTable.baseFileExists(p0, "00000000000002", file1P0C0)); + assertFalse(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); + + // No cleaning on partially written file, with no commit. + testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2); + + List hoodieCleanStatsFour = runCleaner(config); + assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files"); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); + } + + @Test + public void testKeepLatestFileVersionsWithBootstrapFileClean() throws Exception { + HoodieWriteConfig config = + HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanBootstrapBaseFileEnabled(true) + .withCleanerParallelism(1) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) + .build(); + + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); + + final String p0 = "2020/01/01"; + final String p1 = "2020/01/02"; + final Map> bootstrapMapping = generateBootstrapIndexAndSourceData(p0, p1); + + // make 1 commit, with 1 file per partition + final String file1P0C0 = bootstrapMapping.get(p0).get(0).getFileId(); + final String file1P1C0 = bootstrapMapping.get(p1).get(0).getFileId(); + + Map>> c1PartitionToFilesNameLengthMap = new HashMap<>(); + c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100))); + c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200))); + testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1), + c1PartitionToFilesNameLengthMap, false, false); + + List hoodieCleanStatsOne = runCleanerWithInstantFormat(config, true); + assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files"); + 
assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + final String file2P0C1 = UUID.randomUUID().toString(); + final String file2P1C1 = UUID.randomUUID().toString(); + Map>> c2PartitionToFilesNameLengthMap = new HashMap<>(); + c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100))); + c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200))); + testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(), + c2PartitionToFilesNameLengthMap, false, false); + + // should delete the bootstrap base file at the same time + List hoodieCleanStatsTwo = runCleaner(config, 1, true); + HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0); + assertEquals(2, cleanStat.getSuccessDeleteFiles().size() + + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 + : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); + + HoodieFileStatus fstatus = + bootstrapMapping.get(p0).get(0).getBootstrapFileStatus(); + // This ensures full path is recorded in metadata. + assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), + "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + + " but did not contain " + fstatus.getPath().getUri()); + assertFalse(Files.exists(Paths.get(bootstrapMapping.get( + p0).get(0).getBootstrapFileStatus().getPath().getUri()))); + + cleanStat = getCleanStat(hoodieCleanStatsTwo, p1); + assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p1, "00000000000002", file2P1C1)); + assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0)); + assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); + assertEquals(2, cleanStat.getSuccessDeleteFiles().size() + + (cleanStat.getSuccessDeleteBootstrapBaseFiles() == null ? 0 + : cleanStat.getSuccessDeleteBootstrapBaseFiles().size()), "Must clean at least 1 file"); + + fstatus = bootstrapMapping.get(p1).get(0).getBootstrapFileStatus(); + // This ensures full path is recorded in metadata. 
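+ // Repeat the same verification for the second partition: the clean stat should record the full
+ // URI of p1's bootstrap source file, and that source file should no longer exist on disk.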
+ assertTrue(cleanStat.getSuccessDeleteBootstrapBaseFiles().contains(fstatus.getPath().getUri()), + "Successful delete files were " + cleanStat.getSuccessDeleteBootstrapBaseFiles() + + " but did not contain " + fstatus.getPath().getUri()); + assertFalse(Files.exists(Paths.get(bootstrapMapping.get( + p1).get(0).getBootstrapFileStatus().getPath().getUri()))); + + // make next commit, with 2 updates to existing files, and 1 insert + final String file3P0C2 = UUID.randomUUID().toString(); + Map>> c3PartitionToFilesNameLengthMap = new HashMap<>(); + c3PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 102), Pair.of(file2P0C1, 101), + Pair.of(file3P0C2, 100))); + testTable.doWriteOperation("00000000000003", WriteOperationType.UPSERT, Collections.emptyList(), + c3PartitionToFilesNameLengthMap, false, false); + + List hoodieCleanStatsThree = runCleaner(config, 3, true); + assertEquals(2, + getCleanStat(hoodieCleanStatsThree, p0) + .getSuccessDeleteFiles().size(), "Must clean two files"); + assertFalse(testTable.baseFileExists(p0, "00000000000002", file1P0C0)); + assertFalse(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); + + // No cleaning on partially written file, with no commit. + testTable.forCommit("00000000000004").withBaseFilesInPartition(p0, file3P0C2); + + List hoodieCleanStatsFour = runCleaner(config); + assertEquals(0, hoodieCleanStatsFour.size(), "Must not clean any files"); + assertTrue(testTable.baseFileExists(p0, "00000000000003", file3P0C2)); + } + + /** + * Test HoodieTable.clean() Cleaning by versions logic for MOR table with Log files. + */ + @Test + public void testKeepLatestFileVersionsMOR() throws Exception { + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true) + // Column Stats Index is disabled, since these tests construct tables which are + // not valid (empty commit metadata, invalid parquet files) + .withMetadataIndexColumnStats(false) + .build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1) + .build()).build(); + + HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String p0 = "2020/01/01"; + + // Make 3 files, one base file and 2 log files associated with base file + String file1P0 = testTable.addDeltaCommit("000").getFileIdsWithBaseFilesInPartitions(p0).get(p0); + testTable.forDeltaCommit("000") + .withLogFile(p0, file1P0, 1) + .withLogFile(p0, file1P0, 2); + + // Make 2 files, one base file and 1 log files associated with base file + testTable.addDeltaCommit("001") + .withBaseFilesInPartition(p0, file1P0) + .withLogFile(p0, file1P0, 3); + + List hoodieCleanStats = runCleaner(config); + assertEquals(3, + getCleanStat(hoodieCleanStats, p0).getSuccessDeleteFiles() + .size(), "Must clean three files, one base and 2 log files"); + assertFalse(testTable.baseFileExists(p0, "000", file1P0)); + assertFalse(testTable.logFilesExist(p0, "000", file1P0, 1, 2)); + assertTrue(testTable.baseFileExists(p0, "001", file1P0)); + assertTrue(testTable.logFileExists(p0, "001", file1P0, 3)); + } + + /** + * Test HoodieTable.clean() Cleaning by commit logic for MOR table with Log files. 
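+ * <p>Illustrative sketch of the clean config this test builds (mirrors the builder calls below):
+ * <pre>{@code
+ * HoodieCleanConfig.newBuilder()
+ *     .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
+ *     .retainCommits(1) // keep the file slices needed to read back the latest retained commit
+ *     .build();
+ * }</pre>
+ * With three delta commits ("000", "001", "002") on a single file group, only the slice written at
+ * "000" (one base file plus its two log files) falls outside the retention window and should be cleaned.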
+ */ + @Test + public void testKeepLatestCommitsMOR() throws Exception { + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true) + // Column Stats Index is disabled, since these tests construct tables which are + // not valid (empty commit metadata, invalid parquet files) + .withMetadataIndexColumnStats(false).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build()) + .build(); + + HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String p0 = "2020/01/01"; + + // Make 3 files, one base file and 2 log files associated with base file + String file1P0 = testTable.addDeltaCommit("000").getFileIdsWithBaseFilesInPartitions(p0).get(p0); + testTable.forDeltaCommit("000") + .withLogFile(p0, file1P0, 1) + .withLogFile(p0, file1P0, 2); + + // Make 2 files, one base file and 1 log files associated with base file + testTable.addDeltaCommit("001") + .withBaseFilesInPartition(p0, file1P0) + .withLogFile(p0, file1P0, 3); + + // Make 2 files, one base file and 1 log files associated with base file + testTable.addDeltaCommit("002") + .withBaseFilesInPartition(p0, file1P0) + .withLogFile(p0, file1P0, 4); + + List hoodieCleanStats = runCleaner(config); + assertEquals(3, + getCleanStat(hoodieCleanStats, p0).getSuccessDeleteFiles() + .size(), "Must clean three files, one base and 2 log files"); + assertFalse(testTable.baseFileExists(p0, "000", file1P0)); + assertFalse(testTable.logFilesExist(p0, "000", file1P0, 1, 2)); + assertTrue(testTable.baseFileExists(p0, "001", file1P0)); + assertTrue(testTable.logFileExists(p0, "001", file1P0, 3)); + assertTrue(testTable.baseFileExists(p0, "002", file1P0)); + assertTrue(testTable.logFileExists(p0, "002", file1P0, 4)); + } + + /** + * Tests cleaning service based on number of hours retained. + */ + @ParameterizedTest + @MethodSource("argumentsForTestKeepLatestCommits") + public void testKeepXHoursWithCleaning( + boolean simulateFailureRetry, boolean simulateMetadataFailure, + boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withIncrementalCleaningMode(enableIncrementalClean) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS).cleanerNumHoursRetained(2) + .build()) + .build(); + + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String p0 = "2020/01/01"; + String p1 = "2020/01/02"; + Map> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null; + + String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() + : UUID.randomUUID().toString(); + String file1P1C0 = enableBootstrapSourceClean ? 
bootstrapMapping.get(p1).get(0).getFileId() + : UUID.randomUUID().toString(); + Instant instant = Instant.now(); + ZonedDateTime commitDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault()); + int minutesForFirstCommit = 150; + String firstCommitTs = HoodieActiveTimeline.formatDate(Date.from(commitDateTime.minusMinutes(minutesForFirstCommit).toInstant())); + testTable.addInflightCommit(firstCommitTs).withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + + HoodieCommitMetadata commitMetadata = generateCommitMetadata(firstCommitTs, + Collections.unmodifiableMap(new HashMap>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0)); + put(p1, CollectionUtils.createImmutableList(file1P1C0)); + } + }) + ); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, firstCommitTs), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + List hoodieCleanStatsOne = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure); + assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files"); + assertTrue(testTable.baseFileExists(p0, firstCommitTs, file1P0C0)); + assertTrue(testTable.baseFileExists(p1, firstCommitTs, file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + int minutesForSecondCommit = 90; + String secondCommitTs = HoodieActiveTimeline.formatDate(Date.from(commitDateTime.minusMinutes(minutesForSecondCommit).toInstant())); + Map partitionAndFileId002 = testTable.addInflightCommit(secondCommitTs).getFileIdsWithBaseFilesInPartitions(p0, p1); + String file2P0C1 = partitionAndFileId002.get(p0); + String file2P1C1 = partitionAndFileId002.get(p1); + testTable.forCommit(secondCommitTs).withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + commitMetadata = generateCommitMetadata(secondCommitTs, new HashMap>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1)); + put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1)); + } + }); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, secondCommitTs), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + List hoodieCleanStatsTwo = + runCleaner(config, simulateFailureRetry, simulateMetadataFailure); + assertEquals(2, hoodieCleanStatsTwo.size(), "Should clean one file each from both the partitions"); + assertTrue(testTable.baseFileExists(p0, secondCommitTs, file2P0C1)); + assertTrue(testTable.baseFileExists(p1, secondCommitTs, file2P1C1)); + assertTrue(testTable.baseFileExists(p0, secondCommitTs, file1P0C0)); + assertTrue(testTable.baseFileExists(p1, secondCommitTs, file1P1C0)); + assertFalse(testTable.baseFileExists(p0, firstCommitTs, file1P0C0)); + assertFalse(testTable.baseFileExists(p1, firstCommitTs, file1P1C0)); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkCopyOnWriteTableArchiveWithReplace.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkCopyOnWriteTableArchiveWithReplace.java new file mode 100644 index 0000000000000..baff4ebac8752 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkCopyOnWriteTableArchiveWithReplace.java @@ -0,0 +1,105 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.functional; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.Arrays; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH; +import static org.apache.hudi.testutils.HoodieClientTestUtils.countRecordsOptionallySince; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Tag("functional") +public class TestHoodieSparkCopyOnWriteTableArchiveWithReplace extends SparkClientFunctionalTestHarness { + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testDeletePartitionAndArchive(boolean metadataEnabled) throws IOException { + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.COPY_ON_WRITE); + HoodieWriteConfig writeConfig = getConfigBuilder(true) + .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(2, 3).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(metadataEnabled).build()) + .build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(writeConfig); + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(DEFAULT_PARTITION_PATHS)) { + + // 1st write batch; 3 commits for 3 partitions + String instantTime1 = HoodieActiveTimeline.createNewInstantTime(1000); + client.startCommitWithTime(instantTime1); + client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime1, 10, DEFAULT_FIRST_PARTITION_PATH), 1), instantTime1); + String 
instantTime2 = HoodieActiveTimeline.createNewInstantTime(2000); + client.startCommitWithTime(instantTime2); + client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime2, 10, DEFAULT_SECOND_PARTITION_PATH), 1), instantTime2); + String instantTime3 = HoodieActiveTimeline.createNewInstantTime(3000); + client.startCommitWithTime(instantTime3); + client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime3, 1, DEFAULT_THIRD_PARTITION_PATH), 1), instantTime3); + + final HoodieTimeline timeline1 = metaClient.getCommitsTimeline().filterCompletedInstants(); + assertEquals(21, countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline1, Option.empty())); + + // delete the 1st and the 2nd partition; 1 replace commit + final String instantTime4 = HoodieActiveTimeline.createNewInstantTime(4000); + client.startCommitWithTime(instantTime4, HoodieActiveTimeline.REPLACE_COMMIT_ACTION); + client.deletePartitions(Arrays.asList(DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH), instantTime4); + + // 2nd write batch; 4 commits for the 4th partition; the 4th commit to trigger archiving the replace commit + for (int i = 5; i < 9; i++) { + String instantTime = HoodieActiveTimeline.createNewInstantTime(i * 1000); + client.startCommitWithTime(instantTime); + client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime, 1, DEFAULT_THIRD_PARTITION_PATH), 1), instantTime); + } + + // verify archived timeline + metaClient = HoodieTableMetaClient.reload(metaClient); + final HoodieTimeline archivedTimeline = metaClient.getArchivedTimeline(); + assertTrue(archivedTimeline.containsInstant(instantTime1)); + assertTrue(archivedTimeline.containsInstant(instantTime2)); + assertTrue(archivedTimeline.containsInstant(instantTime3)); + assertTrue(archivedTimeline.containsInstant(instantTime4), "should contain the replace commit."); + + // verify records + final HoodieTimeline timeline2 = metaClient.getCommitTimeline().filterCompletedInstants(); + assertEquals(5, countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline2, Option.empty()), + "should only have the 5 records from the 3rd partition."); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java new file mode 100644 index 0000000000000..f959a8f0d9526 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.functional; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieLayoutConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner; +import org.apache.hudi.table.storage.HoodieStorageLayout; +import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.config.HoodieWriteConfig.AUTO_COMMIT_ENABLE; + +@Tag("functional") +public class TestHoodieSparkMergeOnReadTableCompaction extends SparkClientFunctionalTestHarness { + + private static Stream writeLogTest() { + // enable metadata table, enable embedded time line server + Object[][] data = new Object[][] { + {true, true}, + {true, false}, + {false, true}, + {false, false} + }; + return Stream.of(data).map(Arguments::of); + } + + private HoodieTestDataGenerator dataGen; + private SparkRDDWriteClient client; + private HoodieTableMetaClient metaClient; + + @BeforeEach + public void setup() { + dataGen = new HoodieTestDataGenerator(); + } + + @Test + public void testWriteDuringCompaction() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .forTable("test-trip-table") + .withPath(basePath()) + .withSchema(TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2) + .withAutoCommit(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder() + .parquetMaxFileSize(1024).build()) + .withLayoutConfig(HoodieLayoutConfig.newBuilder() + .withLayoutType(HoodieStorageLayout.LayoutType.BUCKET.name()) + .withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("1").build()) + .build(); + + Properties props = getPropertiesForKeyGen(true); + props.putAll(config.getProps()); + + metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props); + client = 
getHoodieWriteClient(config); + + // write data and commit + writeData(HoodieActiveTimeline.createNewInstantTime(), 100, true); + // write data again, and in the case of bucket index, all records will go into log files (we use a small max_file_size) + writeData(HoodieActiveTimeline.createNewInstantTime(), 100, true); + Assertions.assertEquals(200, readTableTotalRecordsNum()); + // schedule compaction + String compactionTime = (String) client.scheduleCompaction(Option.empty()).get(); + // write data, and do not commit. those records should not visible to reader + String insertTime = HoodieActiveTimeline.createNewInstantTime(); + List writeStatuses = writeData(insertTime, 100, false); + Assertions.assertEquals(200, readTableTotalRecordsNum()); + // commit the write. The records should be visible now even though the compaction does not complete. + client.commitStats(insertTime, writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType()); + Assertions.assertEquals(300, readTableTotalRecordsNum()); + // after the compaction, total records should remain the same + config.setValue(AUTO_COMMIT_ENABLE, "true"); + client.compact(compactionTime); + Assertions.assertEquals(300, readTableTotalRecordsNum()); + } + + @ParameterizedTest + @MethodSource("writeLogTest") + public void testWriteLogDuringCompaction(boolean enableMetadataTable, boolean enableTimelineServer) throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .forTable("test-trip-table") + .withPath(basePath()) + .withSchema(TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2) + .withAutoCommit(true) + .withEmbeddedTimelineServerEnabled(enableTimelineServer) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withLayoutConfig(HoodieLayoutConfig.newBuilder() + .withLayoutType(HoodieStorageLayout.LayoutType.BUCKET.name()) + .withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("1").build()) + .build(); + + Properties props = getPropertiesForKeyGen(true); + props.putAll(config.getProps()); + + metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props); + client = getHoodieWriteClient(config); + + final List records = dataGen.generateInserts("001", 100); + JavaRDD writeRecords = jsc().parallelize(records, 2); + + // initialize 100 records + client.upsert(writeRecords, client.startCommit()); + // update 100 records + client.upsert(writeRecords, client.startCommit()); + // schedule compaction + client.scheduleCompaction(Option.empty()); + // delete 50 records + List toBeDeleted = records.stream().map(HoodieRecord::getKey).limit(50).collect(Collectors.toList()); + JavaRDD deleteRecords = jsc().parallelize(toBeDeleted, 2); + client.delete(deleteRecords, client.startCommit()); + // insert the same 100 records again + client.upsert(writeRecords, client.startCommit()); + Assertions.assertEquals(100, readTableTotalRecordsNum()); + } + + private long readTableTotalRecordsNum() { + return HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), + Arrays.stream(dataGen.getPartitionPaths()).map(p -> Paths.get(basePath(), p).toString()).collect(Collectors.toList()), basePath()).size(); + } + + private List writeData(String instant, int numRecords, boolean doCommit) 
{ + metaClient = HoodieTableMetaClient.reload(metaClient); + JavaRDD records = jsc().parallelize(dataGen.generateInserts(instant, numRecords), 2); + metaClient = HoodieTableMetaClient.reload(metaClient); + client.startCommitWithTime(instant); + List writeStatuses = client.upsert(records, instant).collect(); + org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatuses); + if (doCommit) { + List writeStats = writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()); + boolean committed = client.commitStats(instant, writeStats, Option.empty(), metaClient.getCommitActionType()); + Assertions.assertTrue(committed); + } + metaClient = HoodieTableMetaClient.reload(metaClient); + return writeStatuses; + } +} \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java new file mode 100644 index 0000000000000..275fd32ca7d8b --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.functional; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.HoodieHFileInputFormat; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.realtime.HoodieHFileRealtimeInputFormat; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Tag("functional") +public class TestHoodieSparkMergeOnReadTableIncrementalRead extends SparkClientFunctionalTestHarness { + + private JobConf roSnapshotJobConf; + private JobConf roJobConf; + private JobConf rtJobConf; + + @BeforeEach + void setUp() { + roSnapshotJobConf = new JobConf(hadoopConf()); + roJobConf = new JobConf(hadoopConf()); + rtJobConf = new JobConf(hadoopConf()); + } + + // test incremental read does not go past compaction instant for RO views + // For RT views, incremental read can go past compaction + @Test + public void testIncrementalReadsWithCompaction() throws Exception { + final String partitionPath = "2020/02/20"; // use only one partition for this test + final HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] { partitionPath }); + Properties props = getPropertiesForKeyGen(true); + props.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieFileFormat.PARQUET.toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props); + HoodieWriteConfig cfg = getConfigBuilder(true).build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + /* + * Write 1 (only inserts) + */ + String commitTime1 = "001"; + client.startCommitWithTime(commitTime1); + + List records001 = dataGen.generateInserts(commitTime1, 200); + Stream dataFiles = insertRecordsToMORTable(metaClient, records001, client, cfg, commitTime1); + assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit"); + + // verify only one base file shows up with commit time 001 + FileStatus[] 
snapshotROFiles = getROSnapshotFiles(partitionPath); + validateFiles(partitionPath, 1, snapshotROFiles, false, roSnapshotJobConf, 200, commitTime1); + + FileStatus[] incrementalROFiles = getROIncrementalFiles(partitionPath, true); + validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1); + Path firstFilePath = incrementalROFiles[0].getPath(); + + FileStatus[] incrementalRTFiles = getRTIncrementalFiles(partitionPath); + validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf,200, commitTime1); + + assertEquals(firstFilePath, incrementalRTFiles[0].getPath()); + + /* + * Write 2 (updates) + */ + String updateTime = "004"; + client.startCommitWithTime(updateTime); + List records004 = dataGen.generateUpdates(updateTime, 100); + updateRecordsInMORTable(metaClient, records004, client, cfg, updateTime, false); + + // verify RO incremental reads - only one base file shows up because updates to into log files + incrementalROFiles = getROIncrementalFiles(partitionPath, false); + validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1); + assertEquals(firstFilePath, incrementalROFiles[0].getPath()); + + // verify RT incremental reads includes updates also + incrementalRTFiles = getRTIncrementalFiles(partitionPath); + validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime); + + // request compaction, but do not perform compaction + String compactionCommitTime = "005"; + client.scheduleCompactionAtInstant("005", Option.empty()); + + // verify RO incremental reads - only one base file shows up because updates go into log files + incrementalROFiles = getROIncrementalFiles(partitionPath, true); + validateFiles(partitionPath,1, incrementalROFiles, false, roJobConf, 200, commitTime1); + + // verify RT incremental reads includes updates also + incrementalRTFiles = getRTIncrementalFiles(partitionPath); + validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime); + + // write 3 - more inserts + String insertsTime = "006"; + List records006 = dataGen.generateInserts(insertsTime, 200); + client.startCommitWithTime(insertsTime); + dataFiles = insertRecordsToMORTable(metaClient, records006, client, cfg, insertsTime); + assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit"); + + // verify new write shows up in snapshot mode even though there is pending compaction + snapshotROFiles = getROSnapshotFiles(partitionPath); + validateFiles(partitionPath, 2, snapshotROFiles, false, roSnapshotJobConf,400, commitTime1, insertsTime); + + incrementalROFiles = getROIncrementalFiles(partitionPath, true); + assertEquals(firstFilePath, incrementalROFiles[0].getPath()); + // verify 006 does not show up in RO mode because of pending compaction + + validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1); + + // verify that if stopAtCompaction is disabled, inserts from "insertsTime" show up + incrementalROFiles = getROIncrementalFiles(partitionPath, false); + validateFiles(partitionPath,2, incrementalROFiles, false, roJobConf, 400, commitTime1, insertsTime); + + // verify 006 shows up in RT views + incrementalRTFiles = getRTIncrementalFiles(partitionPath); + validateFiles(partitionPath, 2, incrementalRTFiles, true, rtJobConf, 400, commitTime1, updateTime, insertsTime); + + // perform the scheduled compaction + client.compact(compactionCommitTime); + + // verify new write shows up in snapshot mode 
after compaction is complete + snapshotROFiles = getROSnapshotFiles(partitionPath); + validateFiles(partitionPath,2, snapshotROFiles, false, roSnapshotJobConf,400, commitTime1, compactionCommitTime, + insertsTime); + + incrementalROFiles = getROIncrementalFiles(partitionPath, "002", -1, true); + assertTrue(incrementalROFiles.length == 2); + // verify 006 shows up because of pending compaction + validateFiles(partitionPath, 2, incrementalROFiles, false, roJobConf, 400, commitTime1, compactionCommitTime, + insertsTime); + } + } + + private FileStatus[] getROSnapshotFiles(String partitionPath) + throws Exception { + FileInputFormat.setInputPaths(roSnapshotJobConf, Paths.get(basePath(), partitionPath).toString()); + return listStatus(HoodieTableConfig.BASE_FILE_FORMAT.defaultValue(), roSnapshotJobConf, false); + } + + private FileStatus[] getROIncrementalFiles(String partitionPath, boolean stopAtCompaction) + throws Exception { + return getROIncrementalFiles(partitionPath, "000", -1, stopAtCompaction); + } + + private FileStatus[] getROIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull, boolean stopAtCompaction) + throws Exception { + setupIncremental(roJobConf, startCommitTime, numCommitsToPull, stopAtCompaction); + FileInputFormat.setInputPaths(roJobConf, Paths.get(basePath(), partitionPath).toString()); + return listStatus(HoodieTableConfig.BASE_FILE_FORMAT.defaultValue(), roJobConf, false); + } + + private FileStatus[] getRTIncrementalFiles(String partitionPath) + throws Exception { + return getRTIncrementalFiles(partitionPath, "000", -1); + } + + private FileStatus[] getRTIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull) + throws Exception { + setupIncremental(rtJobConf, startCommitTime, numCommitsToPull, false); + FileInputFormat.setInputPaths(rtJobConf, Paths.get(basePath(), partitionPath).toString()); + return listStatus(HoodieTableConfig.BASE_FILE_FORMAT.defaultValue(), rtJobConf, true); + } + + private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, boolean stopAtCompaction) { + String modePropertyName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); + + String startCommitTimestampName = + String.format(HoodieHiveUtils.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(startCommitTimestampName, startCommit); + + String maxCommitPulls = + String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); + + String stopAtCompactionPropName = + String.format(HoodieHiveUtils.HOODIE_STOP_AT_COMPACTION_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setBoolean(stopAtCompactionPropName, stopAtCompaction); + } + + private void validateFiles(String partitionPath, int expectedNumFiles, + FileStatus[] files, boolean realtime, JobConf jobConf, + int expectedRecords, String... 
expectedCommits) { + + assertEquals(expectedNumFiles, files.length); + Set expectedCommitsSet = Arrays.stream(expectedCommits).collect(Collectors.toSet()); + List records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), + Collections.singletonList(Paths.get(basePath(), partitionPath).toString()), basePath(), jobConf, realtime); + assertEquals(expectedRecords, records.size()); + Set actualCommits = records.stream().map(r -> + r.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()).collect(Collectors.toSet()); + assertEquals(expectedCommitsSet, actualCommits); + } + + private FileStatus[] listStatus(HoodieFileFormat baseFileFormat, JobConf jobConf, boolean realtime) throws IOException { + // This is required as Hoodie InputFormats do not extend a common base class and FileInputFormat's + // listStatus() is protected. + FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(baseFileFormat, realtime, jobConf); + switch (baseFileFormat) { + case PARQUET: + if (realtime) { + return ((HoodieParquetRealtimeInputFormat)inputFormat).listStatus(jobConf); + } else { + return ((HoodieParquetInputFormat)inputFormat).listStatus(jobConf); + } + case HFILE: + if (realtime) { + return ((HoodieHFileRealtimeInputFormat)inputFormat).listStatus(jobConf); + } else { + return ((HoodieHFileInputFormat)inputFormat).listStatus(jobConf); + } + default: + throw new HoodieIOException("Hoodie InputFormat not implemented for base file format " + baseFileFormat); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java new file mode 100644 index 0000000000000..73b1da95648e2 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java @@ -0,0 +1,377 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.table.functional; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.testutils.HoodieClientTestUtils; +import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.mapred.JobConf; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.Collection; +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Tag("functional") +public class TestHoodieSparkMergeOnReadTableInsertUpdateDelete extends SparkClientFunctionalTestHarness { + + private static Stream testSimpleInsertAndUpdate() { + return Stream.of( + Arguments.of(HoodieFileFormat.PARQUET, true), + Arguments.of(HoodieFileFormat.PARQUET, false), + Arguments.of(HoodieFileFormat.HFILE, true) + ); + } + + @ParameterizedTest + @MethodSource + public void testSimpleInsertAndUpdate(HoodieFileFormat fileFormat, boolean populateMetaFields) throws Exception { + Properties properties = populateMetaFields ? 
new Properties() : getPropertiesForKeyGen(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + /* + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + Stream dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime); + assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit"); + + /* + * Write 2 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, 100); + updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false); + + String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString(); + client.compact(compactionCommitTime); + + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + hoodieTable.getHoodieView().sync(); + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + Stream dataFilesToRead = tableView.getLatestBaseFiles(); + assertTrue(dataFilesToRead.findAny().isPresent()); + + // verify that there is a commit + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants(); + assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), + "Expecting a single commit."); + String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp(); + assertTrue(HoodieTimeline.compareTimestamps("000", HoodieTimeline.LESSER_THAN, latestCompactionCommitTime)); + + if (cfg.populateMetaFields()) { + assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.of("000")), + "Must contain 200 records"); + } else { + assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.empty())); + } + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInlineScheduleCompaction(boolean scheduleInlineCompaction) throws Exception { + HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET; + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + HoodieWriteConfig cfg = getConfigBuilder(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(2).withPreserveCommitMetadata(true).withScheduleInlineCompaction(scheduleInlineCompaction).build()) + .build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + /* + * Write 1 (only inserts) + */ + 
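+      // with inline compaction disabled and max delta commits before compaction set to 2 (see the config above),
+      // the two delta commits below should get a compaction *scheduled* inline only when scheduleInlineCompaction is true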
String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + Stream dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true); + assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit"); + + /* + * Write 2 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, 100); + updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true); + + // verify that there is a commit + if (scheduleInlineCompaction) { + assertEquals(metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants(), 1); + } else { + assertEquals(metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants(), 0); + } + } + } + + @Test + public void testRepeatedRollbackOfCompaction() throws Exception { + boolean scheduleInlineCompaction = false; + HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET; + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + HoodieWriteConfig cfg = getConfigBuilder(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(2).withPreserveCommitMetadata(true).withScheduleInlineCompaction(scheduleInlineCompaction).build()) + .build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + /* + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + Stream dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true); + assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit"); + + /* + * Write 2 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, 100); + updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true); + + Option compactionInstant = client.scheduleCompaction(Option.empty()); + client.compact(compactionInstant.get()); + + // trigger compaction again. + client.compact(compactionInstant.get()); + + metaClient.reloadActiveTimeline(); + // verify that there is no new rollback instant generated + HoodieInstant rollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get(); + FileCreateUtils.deleteRollbackCommit(metaClient.getBasePath().substring(metaClient.getBasePath().indexOf(":") + 1), + rollbackInstant.getTimestamp()); + metaClient.reloadActiveTimeline(); + SparkRDDWriteClient client1 = getHoodieWriteClient(cfg); + // trigger compaction again. 
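+      // the completed rollback meta file was deleted above, so this retry has to re-attempt the rollback;
+      // the assertion below expects it to reuse the earlier rollback instant rather than creating a new one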
+ client1.compact(compactionInstant.get()); + metaClient.reloadActiveTimeline(); + // verify that there is no new rollback instant generated + HoodieInstant newRollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get(); + assertEquals(rollbackInstant.getTimestamp(), newRollbackInstant.getTimestamp()); + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testSimpleInsertUpdateAndDelete(boolean populateMetaFields) throws Exception { + Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + /* + * Write 1 (only inserts, written as base file) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 20); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + + Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001"); + + Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + Stream dataFilesToRead = tableView.getLatestBaseFiles(); + assertFalse(dataFilesToRead.findAny().isPresent()); + + tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + dataFilesToRead = tableView.getLatestBaseFiles(); + assertTrue(dataFilesToRead.findAny().isPresent(), + "should list the base files we wrote in the delta commit"); + + /* + * Write 2 (only updates, written to .log file) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + writeRecords = jsc().parallelize(records, 1); + statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + /* + * Write 2 (only deletes, written to .log file) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + + List fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records); + + statuses = client.upsert(jsc().parallelize(fewRecordsForDelete, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + metaClient = HoodieTableMetaClient.reload(metaClient); + deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("004", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 004"); + + commit = 
metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + allFiles = listAllBaseFilesInPath(hoodieTable); + tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + dataFilesToRead = tableView.getLatestBaseFiles(); + assertTrue(dataFilesToRead.findAny().isPresent()); + + List inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = + HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, populateMetaFields); + // Wrote 20 records and deleted 20 records, so remaining 20-20 = 0 + assertEquals(0, recordsRead.size(), "Must contain 0 records"); + } + } + + @Test + public void testSimpleInsertsGeneratedIntoLogFiles() throws Exception { + // insert 100 records + // Setting IndexType to be InMemory to simulate Global Index nature + HoodieWriteConfig config = getConfigBuilder(false, HoodieIndex.IndexType.INMEMORY).build(); + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) { + String newCommitTime = "100"; + writeClient.startCommitWithTime(newCommitTime); + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc().parallelize(records, 1); + JavaRDD statuses = writeClient.insert(recordsRDD, newCommitTime); + writeClient.commit(newCommitTime, statuses); + + HoodieTable table = HoodieSparkTable.create(config, context(), metaClient); + table.getHoodieView().sync(); + TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView(); + + long numLogFiles = 0; + for (String partitionPath : dataGen.getPartitionPaths()) { + List allSlices = tableRTFileSystemView.getLatestFileSlices(partitionPath).collect(Collectors.toList()); + assertEquals(0, allSlices.stream().filter(fileSlice -> fileSlice.getBaseFile().isPresent()).count()); + assertTrue(allSlices.stream().anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); + long logFileCount = allSlices.stream().filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); + if (logFileCount > 0) { + // check the log versions start from the base version + assertTrue(allSlices.stream().map(slice -> slice.getLogFiles().findFirst().get().getLogVersion()) + .allMatch(version -> version.equals(HoodieLogFile.LOGFILE_BASE_VERSION))); + } + numLogFiles += logFileCount; + } + + assertTrue(numLogFiles > 0); + // Do a compaction + String instantTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); + HoodieWriteMetadata> compactionMetadata = writeClient.compact(instantTime); + String extension = table.getBaseFileExtension(); + Collection> stats = compactionMetadata.getCommitMetadata().get().getPartitionToWriteStats().values(); + assertEquals(numLogFiles, stats.stream().flatMap(Collection::stream).filter(state -> state.getPath().contains(extension)).count()); + assertEquals(numLogFiles, stats.stream().mapToLong(Collection::size).sum()); + writeClient.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); + } + } +} diff --git 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java new file mode 100644 index 0000000000000..35d7b6329e262 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java @@ -0,0 +1,1008 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.functional; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.view.SyncableFileSystemView; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.Tag; 
+import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Tag("functional") +public class TestHoodieSparkMergeOnReadTableRollback extends SparkClientFunctionalTestHarness { + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testCOWToMORConvertedTableRollback(boolean rollbackUsingMarkers) throws Exception { + // Set TableType to COW + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.COPY_ON_WRITE); + + HoodieWriteConfig cfg = getConfig(false, rollbackUsingMarkers); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + /* + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + // verify there are no errors + assertNoWriteErrors(statuses); + client.commit(newCommitTime, jsc().parallelize(statuses)); + + metaClient = HoodieTableMetaClient.reload(metaClient); + Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertTrue(commit.isPresent()); + assertEquals("001", commit.get().getTimestamp(), "commit should be 001"); + + /* + * Write 2 (updates) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + + statuses = client.upsert(jsc().parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // Set TableType to MOR + metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ); + + // rollback a COW commit when TableType is MOR + client.rollback(newCommitTime); + + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + + final String absentCommit = newCommitTime; + assertAll(tableView.getLatestBaseFiles().map(file -> () -> assertNotEquals(absentCommit, file.getCommitTime()))); + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) throws Exception { + // NOTE: First writer will have Metadata table DISABLED + HoodieWriteConfig.Builder cfgBuilder = + getConfigBuilder(false, rollbackUsingMarkers, 
HoodieIndex.IndexType.SIMPLE); + + addConfigsForPopulateMetaFields(cfgBuilder, true); + HoodieWriteConfig cfg = cfgBuilder.build(); + + Properties properties = CollectionUtils.copy(cfg.getProps()); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + // Test delta commit rollback + /* + * Write 1 (only inserts) + */ + String newCommitTime = "000000001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); + + List statuses = writeStatusJavaRDD.collect(); + assertNoWriteErrors(statuses); + + client.commit(newCommitTime, jsc().parallelize(statuses)); + + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + + Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals("000000001", deltaCommit.get().getTimestamp(), "Delta commit should be 000000001"); + + Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + Stream dataFilesToRead = tableView.getLatestBaseFiles(); + assertFalse(dataFilesToRead.findAny().isPresent()); + + tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + dataFilesToRead = tableView.getLatestBaseFiles(); + assertTrue(dataFilesToRead.findAny().isPresent(), + "should list the base files we wrote in the delta commit"); + + /* + * Write 2 (inserts + updates - testing failed delta commit) + */ + final String commitTime1 = "000000002"; + // WriteClient with custom config (disable small file handling) + // NOTE: Second writer will have Metadata table ENABLED + try (SparkRDDWriteClient secondClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true));) { + secondClient.startCommitWithTime(commitTime1); + + List copyOfRecords = new ArrayList<>(records); + copyOfRecords = dataGen.generateUpdates(commitTime1, copyOfRecords); + copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200)); + + List inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, + basePath()); + assertEquals(200, recordsRead.size()); + + statuses = secondClient.upsert(jsc().parallelize(copyOfRecords, 1), commitTime1).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // Test failed delta commit rollback + secondClient.rollback(commitTime1); + allFiles = listAllBaseFilesInPath(hoodieTable); + // After rollback, there should be no base file with the failed commit time + List remainingFiles = Arrays.stream(allFiles).filter(file -> file.getPath().getName() + .contains("_" + commitTime1)).map(fileStatus -> 
fileStatus.getPath().toString()).collect(Collectors.toList()); + assertEquals(0, remainingFiles.size(), "These files should have been rolled-back " + + "when rolling back commit " + commitTime1 + " but are still remaining. Files: " + remainingFiles); + inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath()); + assertEquals(200, recordsRead.size()); + } + + /* + * Write 3 (inserts + updates - testing successful delta commit) + */ + final String commitTime2 = "000000003"; + try (SparkRDDWriteClient thirdClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true));) { + thirdClient.startCommitWithTime(commitTime2); + + List copyOfRecords = new ArrayList<>(records); + copyOfRecords = dataGen.generateUpdates(commitTime2, copyOfRecords); + copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200)); + + List inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, + basePath()); + assertEquals(200, recordsRead.size()); + + writeRecords = jsc().parallelize(copyOfRecords, 1); + writeStatusJavaRDD = thirdClient.upsert(writeRecords, commitTime2); + statuses = writeStatusJavaRDD.collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // Test successful delta commit rollback + thirdClient.rollback(commitTime2); + allFiles = listAllBaseFilesInPath(hoodieTable); + // After rollback, there should be no base file with the failed commit time + List remainingFiles = Arrays.stream(allFiles).filter(file -> file.getPath().getName() + .contains("_" + commitTime2)).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList()); + assertEquals(0, remainingFiles.size(), "These files should have been rolled-back " + + "when rolling back commit " + commitTime2 + " but are still remaining. 
Files: " + remainingFiles); + + metaClient = HoodieTableMetaClient.reload(metaClient); + hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath()); + // check that the number of records read is still correct after rollback operation + assertEquals(200, recordsRead.size()); + + // Test compaction commit rollback + /* + * Write 4 (updates) + */ + newCommitTime = "000000004"; + thirdClient.startCommitWithTime(newCommitTime); + + writeStatusJavaRDD = thirdClient.upsert(writeRecords, newCommitTime); + + statuses = writeStatusJavaRDD.collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + thirdClient.commit(newCommitTime, jsc().parallelize(statuses)); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString(); + thirdClient.compact(compactionInstantTime); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + final String compactedCommitTime = metaClient.getActiveTimeline().reload().lastInstant().get().getTimestamp(); + assertTrue(Arrays.stream(listAllBaseFilesInPath(hoodieTable)) + .anyMatch(file -> compactedCommitTime.equals(new HoodieBaseFile(file).getCommitTime()))); + hoodieTable.rollbackInflightCompaction(new HoodieInstant( + HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime)); + allFiles = listAllBaseFilesInPath(hoodieTable); + metaClient = HoodieTableMetaClient.reload(metaClient); + tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); + + assertFalse(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime()))); + assertAll(tableView.getLatestBaseFiles().map(file -> () -> assertNotEquals(compactedCommitTime, file.getCommitTime()))); + } + } + } + + @Test + void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { + boolean populateMetaFields = true; + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false) + // Timeline-server-based markers are not used for multi-rollback tests + .withMarkersType(MarkerType.DIRECT.name()); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); + + Properties properties = getPropertiesForKeyGen(populateMetaFields); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + /* + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); + + List statuses = writeStatusJavaRDD.collect(); + assertNoWriteErrors(statuses); + + client.commit(newCommitTime, jsc().parallelize(statuses)); + client.close(); + + Option> 
instantCommitMetadataPairOpt = + metaClient.getActiveTimeline().getLastCommitMetadataWithValidData(); + + assertTrue(instantCommitMetadataPairOpt.isPresent()); + + HoodieInstant commitInstant = instantCommitMetadataPairOpt.get().getKey(); + + assertEquals("001", commitInstant.getTimestamp()); + assertEquals(HoodieTimeline.DELTA_COMMIT_ACTION, commitInstant.getAction()); + assertEquals(200, getTotalRecordsWritten(instantCommitMetadataPairOpt.get().getValue())); + + Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + Stream dataFilesToRead = tableView.getLatestBaseFiles(); + assertFalse(dataFilesToRead.findAny().isPresent()); + + tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + dataFilesToRead = tableView.getLatestBaseFiles(); + assertTrue(dataFilesToRead.findAny().isPresent(), + "Should list the base files we wrote in the delta commit"); + + /* + * Write 2 (inserts + updates) + */ + newCommitTime = "002"; + // WriteClient with custom config (disable small file handling) + HoodieWriteConfig smallFileWriteConfig = getHoodieWriteConfigWithSmallFileHandlingOffBuilder(populateMetaFields) + // Timeline-server-based markers are not used for multi-rollback tests + .withMarkersType(MarkerType.DIRECT.name()).build(); + try (SparkRDDWriteClient nClient = getHoodieWriteClient(smallFileWriteConfig)) { + nClient.startCommitWithTime(newCommitTime); + + List copyOfRecords = new ArrayList<>(records); + copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords); + copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200)); + + List dataFiles = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, + basePath()); + assertEquals(200, recordsRead.size()); + + statuses = nClient.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + nClient.commit(newCommitTime, jsc().parallelize(statuses)); + + copyOfRecords.clear(); + } + + // Schedule a compaction + /* + * Write 3 (inserts + updates) + */ + newCommitTime = "003"; + client.startCommitWithTime(newCommitTime); + + List newInserts = dataGen.generateInserts(newCommitTime, 100); + records = dataGen.generateUpdates(newCommitTime, records); + records.addAll(newInserts); + writeRecords = jsc().parallelize(records, 1); + + writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); + statuses = writeStatusJavaRDD.collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + client.commit(newCommitTime, jsc().parallelize(statuses)); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + String compactionInstantTime = "004"; + client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); + + // Compaction commit + /* + * Write 4 (updates) + */ + newCommitTime = "005"; + client.startCommitWithTime(newCommitTime); + + records = dataGen.generateUpdates(newCommitTime, records); + writeRecords = jsc().parallelize(records, 1); + + 
writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); + statuses = writeStatusJavaRDD.collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + client.commit(newCommitTime, jsc().parallelize(statuses)); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + compactionInstantTime = "006"; + client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client.compact(compactionInstantTime); + client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); + + allFiles = listAllBaseFilesInPath(hoodieTable); + metaClient = HoodieTableMetaClient.reload(metaClient); + tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); + + final String compactedCommitTime = + metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get().getTimestamp(); + + assertTrue(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime()))); + + /* + * Write 5 (updates) + */ + newCommitTime = "007"; + client.startCommitWithTime(newCommitTime); + List copyOfRecords = new ArrayList<>(records); + copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords); + copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200)); + + statuses = client.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + client.commit(newCommitTime, jsc().parallelize(statuses)); + + copyOfRecords.clear(); + + // Rollback latest commit first + client.restoreToInstant("000", cfg.isMetadataTableEnabled()); + + metaClient = HoodieTableMetaClient.reload(metaClient); + allFiles = listAllBaseFilesInPath(hoodieTable); + tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + dataFilesToRead = tableView.getLatestBaseFiles(); + assertFalse(dataFilesToRead.findAny().isPresent()); + TableFileSystemView.SliceView rtView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + List fileGroups = + ((HoodieTableFileSystemView) rtView).getAllFileGroups().collect(Collectors.toList()); + assertTrue(fileGroups.isEmpty()); + + // make sure there are no log files remaining + assertEquals(0L, ((HoodieTableFileSystemView) rtView).getAllFileGroups() + .filter(fileGroup -> fileGroup.getAllRawFileSlices().noneMatch(f -> f.getLogFiles().count() == 0)) + .count()); + + } + } + + @Test + void testRestoreWithCleanedUpCommits() throws Exception { + boolean populateMetaFields = true; + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false) + // Timeline-server-based markers are not used for multi-rollback tests + .withMarkersType(MarkerType.DIRECT.name()); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); + + Properties properties = populateMetaFields ? 
new Properties() : getPropertiesForKeyGen();
+    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
+    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
+
+    try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
+      HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
+
+      /*
+       * Write 1 (only inserts)
+       */
+      String newCommitTime = "001";
+      client.startCommitWithTime(newCommitTime);
+      List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
+      JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
+      JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
+      List<WriteStatus> statuses = writeStatusJavaRDD.collect();
+      assertNoWriteErrors(statuses);
+      client.commit(newCommitTime, jsc().parallelize(statuses));
+
+      upsertRecords(client, "002", records, dataGen);
+
+      client.savepoint("002", "user1", "comment1");
+
+      upsertRecords(client, "003", records, dataGen);
+      upsertRecords(client, "004", records, dataGen);
+
+      // Compaction commit
+      String compactionInstantTime = "006";
+      client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
+      HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(compactionInstantTime);
+      client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty());
+
+      upsertRecords(client, "007", records, dataGen);
+      upsertRecords(client, "008", records, dataGen);
+
+      // Compaction commit
+      String compactionInstantTime1 = "009";
+      client.scheduleCompactionAtInstant(compactionInstantTime1, Option.empty());
+      HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata1 = client.compact(compactionInstantTime1);
+      client.commitCompaction(compactionInstantTime1, compactionMetadata1.getCommitMetadata().get(), Option.empty());
+
+      upsertRecords(client, "010", records, dataGen);
+
+      // Trigger clean: create a new client with aggressive cleaner configs so that the clean kicks in immediately.
+      cfgBuilder = getConfigBuilder(false)
+          .withCleanConfig(HoodieCleanConfig.newBuilder().retainCommits(1).build())
+          // Timeline-server-based markers are not used for multi-rollback tests
+          .withMarkersType(MarkerType.DIRECT.name());
+      addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
+      HoodieWriteConfig cfg1 = cfgBuilder.build();
+      final SparkRDDWriteClient client1 = getHoodieWriteClient(cfg1);
+      client1.clean();
+      client1.close();
+
+      metaClient = HoodieTableMetaClient.reload(metaClient);
+      upsertRecords(client, "011", records, dataGen);
+
+      // Restore to 002
+      client.restoreToInstant("002", cfg.isMetadataTableEnabled());
+
+      // verify that no files are present after 002.
every data file should have been cleaned up + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + Stream dataFilesToRead = tableView.getLatestBaseFiles(); + assertFalse(dataFilesToRead.anyMatch(file -> HoodieTimeline.compareTimestamps("002", HoodieTimeline.GREATER_THAN, file.getCommitTime()))); + + client.deleteSavepoint("002"); + assertFalse(metaClient.reloadActiveTimeline().getSavePointTimeline().containsInstant("002")); + } + } + + private void upsertRecords(SparkRDDWriteClient client, String commitTime, List records, HoodieTestDataGenerator dataGen) throws IOException { + client.startCommitWithTime(commitTime); + List copyOfRecords = new ArrayList<>(records); + copyOfRecords = dataGen.generateUpdates(commitTime, copyOfRecords); + List statuses = client.upsert(jsc().parallelize(copyOfRecords, 1), commitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + client.commit(commitTime, jsc().parallelize(statuses)); + } + + private long getTotalRecordsWritten(HoodieCommitMetadata commitMetadata) { + return commitMetadata.getPartitionToWriteStats().values().stream() + .flatMap(Collection::stream) + .map(stat -> stat.getNumWrites() + stat.getNumUpdateWrites()) + .reduce(0L, Long::sum); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testMORTableRestore(boolean restoreAfterCompaction) throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false) + // Timeline-server-based markers are not used for multi-rollback tests + .withMarkersType(MarkerType.DIRECT.name()); + HoodieWriteConfig cfg = cfgBuilder.build(); + + Properties properties = getPropertiesForKeyGen(true); + properties.putAll(cfg.getProps()); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + List records = insertAndGetRecords("001", client, dataGen, 200); + List updates1 = updateAndGetRecords("002", client, dataGen, records); + List updates2 = updateAndGetRecords("003", client, dataGen, records); + List updates3 = updateAndGetRecords("004", client, dataGen, records); + validateRecords(cfg, metaClient, updates3); + + if (!restoreAfterCompaction) { + // restore to 002 and validate records. + client.restoreToInstant("002", cfg.isMetadataTableEnabled()); + validateRecords(cfg, metaClient, updates1); + } else { + // trigger compaction and then trigger couple of upserts followed by restore. 
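+        // restoring to "003" (before the compaction at "005") is expected to undo the compaction and the later upserts,
+        // so the records read back should match what commit "003" produced (updates2)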
+ metaClient = HoodieTableMetaClient.reload(metaClient); + String compactionInstantTime = "005"; + client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client.compact(compactionInstantTime); + client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); + + validateRecords(cfg, metaClient, updates3); + List updates4 = updateAndGetRecords("006", client, dataGen, records); + List updates5 = updateAndGetRecords("007", client, dataGen, records); + validateRecords(cfg, metaClient, updates5); + + // restore to 003 and validate records. + client.restoreToInstant("003", cfg.isMetadataTableEnabled()); + validateRecords(cfg, metaClient, updates2); + } + } + } + + private List insertAndGetRecords(String newCommitTime, SparkRDDWriteClient client, HoodieTestDataGenerator dataGen, int count) { + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInserts(newCommitTime, count); + JavaRDD writeRecords = jsc().parallelize(records, 1); + JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); + client.commit(newCommitTime, writeStatusJavaRDD); + return records; + } + + private List updateAndGetRecords(String newCommitTime, SparkRDDWriteClient client, HoodieTestDataGenerator dataGen, List records) throws IOException { + client.startCommitWithTime(newCommitTime); + List updates = dataGen.generateUpdates(newCommitTime, records); + JavaRDD writeStatusJavaRDD = client.upsert(jsc().parallelize(updates, 1), newCommitTime); + client.commit(newCommitTime, writeStatusJavaRDD); + return updates; + } + + private void validateRecords(HoodieWriteConfig cfg, HoodieTableMetaClient metaClient, List expectedRecords) throws IOException { + + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + List inputPaths = tableView.getLatestBaseFiles() + .map(hf -> new Path(hf.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, + basePath()); + assertRecords(expectedRecords, recordsRead); + } + + private void assertRecords(List inputRecords, List recordsRead) { + assertEquals(recordsRead.size(), inputRecords.size()); + Map expectedRecords = new HashMap<>(); + inputRecords.forEach(entry -> { + try { + expectedRecords.put(entry.getRecordKey(), (GenericRecord) ((HoodieRecordPayload) entry.getData()).getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get()); + } catch (IOException e) { + e.printStackTrace(); + } + }); + + Map actualRecords = new HashMap<>(); + recordsRead.forEach(entry -> actualRecords.put(String.valueOf(entry.get("_row_key")), entry)); + for (Map.Entry entry : expectedRecords.entrySet()) { + assertEquals(String.valueOf(entry.getValue().get("driver")), String.valueOf(actualRecords.get(entry.getKey()).get("driver"))); + } + } + + private HoodieWriteConfig getHoodieWriteConfigWithSmallFileHandlingOff(boolean populateMetaFields) { + return getHoodieWriteConfigWithSmallFileHandlingOffBuilder(populateMetaFields).build(); + } + + private HoodieWriteConfig.Builder getHoodieWriteConfigWithSmallFileHandlingOffBuilder(boolean populateMetaFields) { + HoodieWriteConfig.Builder cfgBuilder = 
HoodieWriteConfig.newBuilder().withPath(basePath()).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withDeleteParallelism(2) + .withAutoCommit(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .withEmbeddedTimelineServerEnabled(true) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024).parquetMaxFileSize(1024).build()).forTable("test-trip-table"); + + if (!populateMetaFields) { + addConfigsForPopulateMetaFields(cfgBuilder, false); + } + return cfgBuilder; + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testInsertsGeneratedIntoLogFilesRollback(boolean rollbackUsingMarkers) throws Exception { + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + // insert 100 records + // Setting IndexType to be InMemory to simulate Global Index nature + HoodieWriteConfig config = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.INMEMORY).build(); + + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) { + String newCommitTime = "100"; + writeClient.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc().parallelize(records, 1); + // trigger an action + List writeStatuses = ((JavaRDD) writeClient.insert(recordsRDD, newCommitTime)).collect(); + + // Ensure that inserts are written to only log files + assertEquals(0, + writeStatuses.stream().filter(writeStatus -> !writeStatus.getStat().getPath().contains("log")).count()); + assertTrue( + writeStatuses.stream().anyMatch(writeStatus -> writeStatus.getStat().getPath().contains("log"))); + + // rollback a failed commit + boolean rollback = writeClient.rollback(newCommitTime); + assertTrue(rollback); + + // insert 100 records + newCommitTime = "101"; + writeClient.startCommitWithTime(newCommitTime); + records = dataGen.generateInserts(newCommitTime, 100); + recordsRDD = jsc().parallelize(records, 1); + writeClient.insert(recordsRDD, newCommitTime).collect(); + + // Sleep for small interval (at least 1 second) to force a new rollback start time. + Thread.sleep(1000); + + // We will test HUDI-204 here. We will simulate rollback happening twice by copying the commit file to local fs + // and calling rollback twice + final String lastCommitTime = newCommitTime; + + // Save the .commit file to local directory. + // Rollback will be called twice to test the case where rollback failed first time and retried. 
+ // We got the "BaseCommitTime cannot be null" exception before the fix + java.nio.file.Path tempFolder = Files.createTempDirectory(this.getClass().getCanonicalName()); + Map fileNameMap = new HashMap<>(); + for (HoodieInstant.State state : Arrays.asList(HoodieInstant.State.REQUESTED, HoodieInstant.State.INFLIGHT)) { + HoodieInstant toCopy = new HoodieInstant(state, HoodieTimeline.DELTA_COMMIT_ACTION, lastCommitTime); + File file = Files.createTempFile(tempFolder, null, null).toFile(); + metaClient.getFs().copyToLocalFile(new Path(metaClient.getMetaPath(), toCopy.getFileName()), + new Path(file.getAbsolutePath())); + fileNameMap.put(file.getAbsolutePath(), toCopy.getFileName()); + } + Path markerDir = new Path(Files.createTempDirectory(tempFolder, null).toAbsolutePath().toString()); + if (rollbackUsingMarkers) { + metaClient.getFs().copyToLocalFile(new Path(metaClient.getMarkerFolderPath(lastCommitTime)), + markerDir); + } + + writeClient.rollback(newCommitTime); + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieSparkTable.create(config, context()); + TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView(); + + long numLogFiles = 0; + for (String partitionPath : dataGen.getPartitionPaths()) { + assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); + assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); + numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); + } + assertEquals(0, numLogFiles); + for (Map.Entry entry : fileNameMap.entrySet()) { + try { + metaClient.getFs().copyFromLocalFile(new Path(entry.getKey()), + new Path(metaClient.getMetaPath(), entry.getValue())); + } catch (IOException e) { + throw new HoodieIOException("Error copying state from local disk.", e); + } + } + if (rollbackUsingMarkers) { + metaClient.getFs().copyFromLocalFile(new Path(markerDir, lastCommitTime), + new Path(metaClient.getMarkerFolderPath(lastCommitTime))); + } + Thread.sleep(1000); + // Rollback again to pretend the first rollback failed partially. 
This should not error out + writeClient.rollback(newCommitTime); + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(boolean rollbackUsingMarkers) throws Exception { + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + // insert 100 records + // Setting IndexType to be InMemory to simulate Global Index nature + HoodieWriteConfig config = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.INMEMORY).build(); + try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) { + String newCommitTime = "100"; + writeClient.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc().parallelize(records, 1); + JavaRDD statuses = writeClient.insert(recordsRDD, newCommitTime); + writeClient.commit(newCommitTime, statuses); + + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieSparkTable.create(config, context(), metaClient); + table.getHoodieView().sync(); + TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView(); + + long numLogFiles = 0; + for (String partitionPath : dataGen.getPartitionPaths()) { + assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); + assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); + numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath) + .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); + } + + assertTrue(numLogFiles > 0); + // Do a compaction + newCommitTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); + HoodieWriteMetadata> compactionMetadata = writeClient.compact(newCommitTime); + statuses = compactionMetadata.getWriteStatuses(); + // Ensure all log files have been compacted into base files + String extension = table.getBaseFileExtension(); + Collection> stats = compactionMetadata.getCommitMetadata().get().getPartitionToWriteStats().values(); + assertEquals(numLogFiles, stats.stream().flatMap(Collection::stream).filter(state -> state.getPath().contains(extension)).count()); + assertEquals(numLogFiles, stats.stream().mapToLong(Collection::size).sum()); + + //writeClient.commitCompaction(newCommitTime, statuses, Option.empty()); + // Trigger a rollback of compaction + table.getActiveTimeline().reload(); + table.rollbackInflightCompaction(new HoodieInstant( + HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, newCommitTime)); + + metaClient = HoodieTableMetaClient.reload(metaClient); + table = HoodieSparkTable.create(config, context(), metaClient); + tableRTFileSystemView = table.getSliceView(); + ((SyncableFileSystemView) tableRTFileSystemView).reset(); + + for (String partitionPath : dataGen.getPartitionPaths()) { + List fileSlices = getFileSystemViewWithUnCommittedSlices(metaClient) + .getAllFileSlices(partitionPath).filter(fs -> fs.getBaseInstantTime().equals("100")).collect(Collectors.toList()); + assertTrue(fileSlices.stream().noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); + assertTrue(fileSlices.stream().anyMatch(fileSlice 
-> fileSlice.getLogFiles().count() > 0)); + } + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testLazyRollbackOfFailedCommit(boolean rollbackUsingMarkers) throws Exception { + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + HoodieWriteConfig cfg = getWriteConfig(true, rollbackUsingMarkers); + HoodieWriteConfig autoCommitFalseCfg = getWriteConfig(false, rollbackUsingMarkers); + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + // commit 1 + List records = insertRecords(client, dataGen, "001"); + // commit 2 to create log files + List updates1 = updateRecords(client, dataGen, "002", records, metaClient, cfg, true); + + // trigger a inflight commit 3 which will be later be rolled back explicitly. + SparkRDDWriteClient autoCommitFalseClient = getHoodieWriteClient(autoCommitFalseCfg); + List updates2 = updateRecords(autoCommitFalseClient, dataGen, "003", records, metaClient, autoCommitFalseCfg, false); + + // commit 4 successful (mimic multi-writer scenario) + List updates3 = updateRecords(client, dataGen, "004", records, metaClient, cfg, false); + + // trigger compaction + long numLogFiles = getNumLogFilesInLatestFileSlice(metaClient, cfg, dataGen); + doCompaction(autoCommitFalseClient, metaClient, cfg, numLogFiles); + long numLogFilesAfterCompaction = getNumLogFilesInLatestFileSlice(metaClient, cfg, dataGen); + assertNotEquals(numLogFiles, numLogFilesAfterCompaction); + + // rollback 3rd commit. + client.rollback("003"); + long numLogFilesAfterRollback = getNumLogFilesInLatestFileSlice(metaClient, cfg, dataGen); + // lazy rollback should have added the rollback block to previous file slice and not the latest. And so the latest slice's log file count should + // remain the same. 
+    assertEquals(numLogFilesAfterRollback, numLogFilesAfterCompaction);
+  }
+
+  private List<HoodieRecord> insertRecords(SparkRDDWriteClient client, HoodieTestDataGenerator dataGen, String commitTime) {
+    /*
+     * Write 1 (only inserts, written as base file)
+     */
+    client.startCommitWithTime(commitTime);
+
+    List<HoodieRecord> records = dataGen.generateInserts(commitTime, 20);
+    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
+
+    List<WriteStatus> statuses = client.upsert(writeRecords, commitTime).collect();
+    assertNoWriteErrors(statuses);
+    return records;
+  }
+
+  private List<HoodieRecord> updateRecords(SparkRDDWriteClient client, HoodieTestDataGenerator dataGen, String commitTime,
+                                           List<HoodieRecord> records, HoodieTableMetaClient metaClient, HoodieWriteConfig cfg,
+                                           boolean assertLogFiles) throws IOException {
+    client.startCommitWithTime(commitTime);
+
+    records = dataGen.generateUpdates(commitTime, records);
+    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
+    List<WriteStatus> statuses = client.upsert(writeRecords, commitTime).collect();
+    assertNoWriteErrors(statuses);
+
+    if (assertLogFiles) {
+      HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient);
+      table.getHoodieView().sync();
+      TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView();
+
+      long numLogFiles = 0;
+      for (String partitionPath : dataGen.getPartitionPaths()) {
+        List<FileSlice> allSlices = tableRTFileSystemView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
+        assertEquals(1, allSlices.stream().filter(fileSlice -> fileSlice.getBaseFile().isPresent()).count());
+        assertTrue(allSlices.stream().anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0));
+        numLogFiles += allSlices.stream().filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count();
+      }
+      assertTrue(numLogFiles > 0);
+    }
+    return records;
+  }
+
+  private long doCompaction(SparkRDDWriteClient client, HoodieTableMetaClient metaClient, HoodieWriteConfig cfg, long numLogFiles) throws IOException {
+    // Do a compaction
+    String instantTime = client.scheduleCompaction(Option.empty()).get().toString();
+    HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instantTime);
+
+    metaClient.reloadActiveTimeline();
+    HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient);
+    String extension = table.getBaseFileExtension();
+    Collection<List<HoodieWriteStat>> stats = compactionMetadata.getCommitMetadata().get().getPartitionToWriteStats().values();
+    assertEquals(numLogFiles, stats.stream().flatMap(Collection::stream).filter(state -> state.getPath().contains(extension)).count());
+    assertEquals(numLogFiles, stats.stream().mapToLong(Collection::size).sum());
+    client.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty());
+    return numLogFiles;
+  }
+
+  private long getNumLogFilesInLatestFileSlice(HoodieTableMetaClient metaClient, HoodieWriteConfig cfg, HoodieTestDataGenerator dataGen) {
+    metaClient.reloadActiveTimeline();
+    HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient);
+    table.getHoodieView().sync();
+    TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView();
+
+    long numLogFiles = 0;
+    for (String partitionPath : dataGen.getPartitionPaths()) {
+      List<FileSlice> allSlices = tableRTFileSystemView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
+      numLogFiles += allSlices.stream().filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count();
+    }
+    return numLogFiles;
+  }
+
+  private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean rollbackUsingMarkers) {
+    HoodieWriteConfig.Builder cfgBuilder =
getConfigBuilder(autoCommit).withRollbackUsingMarkers(rollbackUsingMarkers) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withAutoClean(false) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024L) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(3) + .build()); + return cfgBuilder.build(); + } + + private SyncableFileSystemView getFileSystemViewWithUnCommittedSlices(HoodieTableMetaClient metaClient) { + try { + return new HoodieTableFileSystemView(metaClient, + metaClient.getActiveTimeline(), + HoodieTestTable.of(metaClient).listAllBaseAndLogFiles() + ); + } catch (IOException ioe) { + throw new HoodieIOException("Error getting file system view", ioe); + } + } + +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java new file mode 100644 index 0000000000000..927f8f3c24b82 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.functional; + +import org.apache.hudi.avro.model.HoodieRollbackRequest; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.rollback.BaseRollbackHelper; +import org.apache.hudi.table.action.rollback.MarkerBasedRollbackStrategy; +import org.apache.hudi.table.marker.DirectWriteMarkers; +import org.apache.hudi.testutils.HoodieClientTestBase; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.mockito.MockitoAnnotations.initMocks; + +@Tag("functional") +public class TestMarkerBasedRollbackStrategy extends HoodieClientTestBase { + + private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with listing metadata enable={0}"; + + public static Stream configParams() { + return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of); + } + + private HoodieTableType tableType = HoodieTableType.COPY_ON_WRITE; + + @BeforeEach + public void setUp() throws Exception { + initPath(); + initSparkContexts(); + initFileSystem(); + initMetaClient(tableType); + initTestDataGenerator(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + @Test + public void testMarkerBasedRollbackAppend() throws Exception { + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String f0 = testTable.addRequestedCommit("000") + .getFileIdsWithBaseFilesInPartitions("partA").get("partA"); + testTable.forCommit("001") + .withMarkerFile("partA", f0, IOType.APPEND); + + HoodieTable hoodieTable = HoodieSparkTable.create(getConfig(), context, metaClient); + List rollbackRequests = new MarkerBasedRollbackStrategy(hoodieTable, context, getConfig(), + "002").getRollbackRequests(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001")); + assertEquals(1, rollbackRequests.size()); + } + + @Test + public void testCopyOnWriteRollbackWithTestTable() throws Exception { + // given: wrote some base files and corresponding markers + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String f0 = testTable.addRequestedCommit("000") + 
.getFileIdsWithBaseFilesInPartitions("partA").get("partA"); + String f1 = testTable.addCommit("001") + .withBaseFilesInPartition("partA", f0) + .getFileIdsWithBaseFilesInPartitions("partB").get("partB"); + String f2 = "f2"; + testTable.forCommit("001") + .withMarkerFile("partA", f0, IOType.MERGE) + .withMarkerFile("partB", f1, IOType.CREATE) + .withMarkerFile("partA", f2, IOType.CREATE); + + // when + HoodieTable hoodieTable = HoodieSparkTable.create(getConfig(), context, metaClient); + List rollbackRequests = new MarkerBasedRollbackStrategy(hoodieTable, context, getConfig(), + "002").getRollbackRequests(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001")); + + List stats = new BaseRollbackHelper(hoodieTable.getMetaClient(), getConfig()).performRollback(context, + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001"), + rollbackRequests); + + // then: ensure files are deleted correctly, non-existent files reported as failed deletes + assertEquals(2, stats.size()); + + FileStatus[] partAFiles = testTable.listAllFilesInPartition("partA"); + FileStatus[] partBFiles = testTable.listAllFilesInPartition("partB"); + + assertEquals(0, partBFiles.length); + assertEquals(1, partAFiles.length); + assertEquals(2, stats.stream().mapToInt(r -> r.getSuccessDeleteFiles().size()).sum()); + assertEquals(1, stats.stream().mapToInt(r -> r.getFailedDeleteFiles().size()).sum()); + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testCopyOnWriteRollback(boolean useFileListingMetadata) throws Exception { + HoodieWriteConfig writeConfig = getConfigBuilder().withRollbackUsingMarkers(true).withAutoCommit(false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(useFileListingMetadata).build()) + .withPath(basePath).build(); + + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig)) { + // rollback 2nd commit and ensure stats reflect the info. + List stats = testRun(useFileListingMetadata, writeConfig, writeClient); + + assertEquals(3, stats.size()); + for (HoodieRollbackStat stat : stats) { + assertEquals(1, stat.getSuccessDeleteFiles().size()); + assertEquals(0, stat.getFailedDeleteFiles().size()); + assertEquals(0, stat.getCommandBlocksCount().size()); + } + } + } + + @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) + @MethodSource("configParams") + public void testMergeOnReadRollback(boolean useFileListingMetadata) throws Exception { + // init MERGE_ON_READ_TABLE + tearDown(); + tableType = HoodieTableType.MERGE_ON_READ; + setUp(); + + HoodieWriteConfig writeConfig = getConfigBuilder().withRollbackUsingMarkers(true).withAutoCommit(false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(useFileListingMetadata).build()) + .withPath(basePath).build(); + + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig)) { + + // rollback 2nd commit and ensure stats reflect the info. 
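An aside before the stats assertions that follow: what the marker-based strategy is being checked for here is, at its core, a mapping from a marker's IO type to a rollback action. The sketch below captures that idea only; the class, enum, and method names are invented, and this is not the real MarkerBasedRollbackStrategy API.

    // Illustrative only: derive a rollback action from a marker path such as "partA/file1.marker.MERGE".
    // CREATE and MERGE markers point at base files that can simply be deleted; APPEND markers point at
    // log files, which merge-on-read tables roll back by appending a rollback block instead.
    final class MarkerRollbackSketch {

      enum Action { DELETE_BASE_FILE, APPEND_ROLLBACK_BLOCK }

      static Action planFor(String markerPath) {
        int idx = markerPath.lastIndexOf(".marker.");
        if (idx < 0) {
          throw new IllegalArgumentException("Not a marker path: " + markerPath);
        }
        String ioType = markerPath.substring(idx + ".marker.".length());
        switch (ioType) {
          case "CREATE":
          case "MERGE":
            return Action.DELETE_BASE_FILE;
          case "APPEND":
            return Action.APPEND_ROLLBACK_BLOCK;
          default:
            throw new IllegalArgumentException("Unknown IO type in marker: " + ioType);
        }
      }

      private MarkerRollbackSketch() {
      }
    }

This is why the copy-on-write assertions below count deleted files, while the merge-on-read test later counts rollback command blocks on log files.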
+ List stats = testRun(useFileListingMetadata, writeConfig, writeClient); + + assertEquals(3, stats.size()); + for (HoodieRollbackStat stat : stats) { + assertEquals(0, stat.getSuccessDeleteFiles().size()); + assertEquals(0, stat.getFailedDeleteFiles().size()); + assertEquals(1, stat.getCommandBlocksCount().size()); + stat.getCommandBlocksCount().forEach((fileStatus, len) -> assertTrue(fileStatus.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension()))); + } + } + } + + private List testRun(boolean useFileListingMetadata, HoodieWriteConfig writeConfig, SparkRDDWriteClient writeClient) { + String newCommitTime = "001"; + writeClient.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime); + writeClient.commit(newCommitTime, writeStatuses); + + // Updates + newCommitTime = "002"; + writeClient.startCommitWithTime(newCommitTime); + records = dataGen.generateUniqueUpdates(newCommitTime, 50); + writeStatuses = writeClient.upsert(jsc.parallelize(records, 1), newCommitTime); + writeStatuses.collect(); + + HoodieTable hoodieTable = HoodieSparkTable.create(getConfig(), context, metaClient); + List rollbackRequests = new MarkerBasedRollbackStrategy(hoodieTable, context, getConfig(), + "003").getRollbackRequests(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002")); + + // rollback 2nd commit and ensure stats reflect the info. + return new BaseRollbackHelper(hoodieTable.getMetaClient(), getConfig()).performRollback(context, + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002"), + rollbackRequests); + } + + @Test + public void testMarkerBasedRollbackFallbackToTimelineServerWhenDirectMarkerFails() throws Exception { + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String f0 = testTable.addRequestedCommit("000") + .getFileIdsWithBaseFilesInPartitions("partA").get("partA"); + testTable.forCommit("001") + .withMarkerFile("partA", f0, IOType.APPEND); + + HoodieTable hoodieTable = HoodieSparkTable.create(getConfig(), context, metaClient); + + DirectWriteMarkers writeMarkers = mock(DirectWriteMarkers.class); + initMocks(this); + when(writeMarkers.allMarkerFilePaths()).thenThrow(new IOException("Markers.type file not present")); + MarkerBasedRollbackStrategy rollbackStrategy = new MarkerBasedRollbackStrategy(hoodieTable, context, getConfig(), "002"); + List rollbackRequests = rollbackStrategy.getRollbackRequests(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001")); + assertEquals(1, rollbackRequests.size()); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestDirectWriteMarkers.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestDirectWriteMarkers.java new file mode 100644 index 0000000000000..fa6df3ba73dff --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestDirectWriteMarkers.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.testutils.FileSystemTestUtils; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.testutils.HoodieClientTestUtils; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; + +public class TestDirectWriteMarkers extends TestWriteMarkersBase { + + @BeforeEach + public void setup() throws IOException { + initPath(); + initMetaClient(); + this.jsc = new JavaSparkContext( + HoodieClientTestUtils.getSparkConfForTest(TestDirectWriteMarkers.class.getName())); + this.context = new HoodieSparkEngineContext(jsc); + this.fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf()); + this.markerFolderPath = new Path(metaClient.getMarkerFolderPath("000")); + this.writeMarkers = new DirectWriteMarkers( + fs, metaClient.getBasePath(), markerFolderPath.toString(), "000"); + } + + @AfterEach + public void cleanup() { + jsc.stop(); + context = null; + } + + @Override + void verifyMarkersInFileSystem(boolean isTablePartitioned) throws IOException { + List markerFiles = FileSystemTestUtils.listRecursive(fs, markerFolderPath) + .stream().filter(status -> status.getPath().getName().contains(".marker")) + .sorted().collect(Collectors.toList()); + assertEquals(3, markerFiles.size()); + assertIterableEquals(CollectionUtils.createImmutableList( + "file:" + markerFolderPath.toString() + + (isTablePartitioned ? "/2020/06/01" : "") + "/file1.marker.MERGE", + "file:" + markerFolderPath.toString() + + (isTablePartitioned ? "/2020/06/02" : "") + "/file2.marker.APPEND", + "file:" + markerFolderPath.toString() + + (isTablePartitioned ? "/2020/06/03" : "") + "/file3.marker.CREATE"), + markerFiles.stream().map(m -> m.getPath().toString()).collect(Collectors.toList()) + ); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestTimelineServerBasedWriteMarkers.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestTimelineServerBasedWriteMarkers.java new file mode 100644 index 0000000000000..61ee844b19171 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestTimelineServerBasedWriteMarkers.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.MarkerUtils; +import org.apache.hudi.testutils.HoodieClientTestUtils; +import org.apache.hudi.timeline.service.TimelineService; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestTimelineServerBasedWriteMarkers extends TestWriteMarkersBase { + TimelineService timelineService; + + @BeforeEach + public void setup() throws IOException { + initPath(); + initMetaClient(); + this.jsc = new JavaSparkContext( + HoodieClientTestUtils.getSparkConfForTest(TestTimelineServerBasedWriteMarkers.class.getName())); + this.context = new HoodieSparkEngineContext(jsc); + this.fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf()); + this.markerFolderPath = new Path(metaClient.getMarkerFolderPath("000")); + + FileSystemViewStorageConfig storageConf = + FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).build(); + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().build(); + HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); + + try { + timelineService = new TimelineService(localEngineContext, new Configuration(), + TimelineService.Config.builder().serverPort(0).enableMarkerRequests(true).build(), + FileSystem.get(new Configuration()), + FileSystemViewManager.createViewManager( + localEngineContext, metadataConfig, storageConf, HoodieCommonConfig.newBuilder().build())); + timelineService.startService(); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + this.writeMarkers = new TimelineServerBasedWriteMarkers( + metaClient.getBasePath(), markerFolderPath.toString(), "000", "localhost", timelineService.getServerPort(), 300); + } + + @AfterEach + public void cleanup() { + if (timelineService != null) { + 
timelineService.close(); + } + jsc.stop(); + context = null; + } + + @Override + void verifyMarkersInFileSystem(boolean isTablePartitioned) throws IOException { + // Verifies the markers + List allMarkers = MarkerUtils.readTimelineServerBasedMarkersFromFileSystem( + markerFolderPath.toString(), fs, context, 1) + .values().stream().flatMap(Collection::stream).sorted() + .collect(Collectors.toList()); + assertEquals(3, allMarkers.size()); + List expectedMarkers = isTablePartitioned + ? CollectionUtils.createImmutableList( + "2020/06/01/file1.marker.MERGE", "2020/06/02/file2.marker.APPEND", + "2020/06/03/file3.marker.CREATE") + : CollectionUtils.createImmutableList( + "file1.marker.MERGE", "file2.marker.APPEND", "file3.marker.CREATE"); + assertIterableEquals(expectedMarkers, allMarkers); + // Verifies the marker type file + Path markerTypeFilePath = new Path(markerFolderPath, MarkerUtils.MARKER_TYPE_FILENAME); + assertTrue(MarkerUtils.doesMarkerTypeFileExist(fs, markerFolderPath.toString())); + FSDataInputStream fsDataInputStream = fs.open(markerTypeFilePath); + assertEquals(MarkerType.TIMELINE_SERVER_BASED.toString(), + FileIOUtils.readAsUTFString(fsDataInputStream)); + closeQuietly(fsDataInputStream); + } + + /** + * Closes {@code Closeable} quietly. + * + * @param closeable {@code Closeable} to close + */ + private void closeQuietly(Closeable closeable) { + if (closeable == null) { + return; + } + try { + closeable.close(); + } catch (IOException e) { + // Ignore + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersBase.java new file mode 100644 index 0000000000000..6ba783c749ffb --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersBase.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
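A note on the naming convention both marker implementations above are verified against: markers live under the instant's temp folder and are named "<data file name>.marker.<IO type>". The suffix handling is easy to show in isolation; the snippet below uses invented names and is not the WriteMarkers implementation.

    // Illustrative only: build and strip marker names of the form "<file>.marker.<IOType>".
    final class MarkerNameSketch {

      private static final String MARKER_INFIX = ".marker.";

      // e.g. markerName("file1", "MERGE") -> "file1.marker.MERGE"
      static String markerName(String dataFileName, String ioType) {
        return dataFileName + MARKER_INFIX + ioType;
      }

      // e.g. stripSuffix("2020/06/01/file1.marker.MERGE") -> "2020/06/01/file1"
      static String stripSuffix(String markerPath) {
        int idx = markerPath.indexOf(MARKER_INFIX);
        return idx < 0 ? markerPath : markerPath.substring(0, idx);
      }

      private MarkerNameSketch() {
      }
    }

The base class that follows exercises exactly these two directions: marker creation (testCreation, testAllMarkerPaths) and suffix stripping (testStripMarkerSuffix).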
+ */ + +package org.apache.hudi.table.marker; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.testutils.FileSystemTestUtils; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.MarkerUtils; +import org.apache.hudi.exception.HoodieException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public abstract class TestWriteMarkersBase extends HoodieCommonTestHarness { + + protected WriteMarkers writeMarkers; + protected FileSystem fs; + protected Path markerFolderPath; + protected JavaSparkContext jsc; + protected HoodieSparkEngineContext context; + + private void createSomeMarkers(boolean isTablePartitioned) { + writeMarkers.create(isTablePartitioned ? "2020/06/01" : "", "file1", IOType.MERGE); + writeMarkers.create(isTablePartitioned ? "2020/06/02" : "", "file2", IOType.APPEND); + writeMarkers.create(isTablePartitioned ? "2020/06/03" : "", "file3", IOType.CREATE); + } + + private void createInvalidFile(String partitionPath, String invalidFileName) { + Path path = FSUtils.getPartitionPath(markerFolderPath.toString(), partitionPath); + Path invalidFilePath = new Path(path, invalidFileName); + try { + fs.create(invalidFilePath, false).close(); + } catch (IOException e) { + throw new HoodieException("Failed to create invalid file " + invalidFilePath, e); + } + } + + abstract void verifyMarkersInFileSystem(boolean isTablePartitioned) throws IOException; + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testCreation(boolean isTablePartitioned) throws Exception { + // when + createSomeMarkers(isTablePartitioned); + + // then + assertTrue(fs.exists(markerFolderPath)); + verifyMarkersInFileSystem(isTablePartitioned); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testDeletionWhenMarkerDirExists(boolean isTablePartitioned) throws IOException { + //when + writeMarkers.create(isTablePartitioned ? "2020/06/01" : "", "file1", IOType.MERGE); + + // then + assertTrue(writeMarkers.doesMarkerDirExist()); + assertTrue(writeMarkers.deleteMarkerDir(context, 2)); + assertFalse(writeMarkers.doesMarkerDirExist()); + } + + @Test + public void testDeletionWhenMarkerDirNotExists() throws IOException { + // then + assertFalse(writeMarkers.doesMarkerDirExist()); + assertTrue(writeMarkers.allMarkerFilePaths().isEmpty()); + assertFalse(writeMarkers.deleteMarkerDir(context, 2)); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testDataPathsWhenCreatingOrMerging(boolean isTablePartitioned) throws IOException { + // add marker files + createSomeMarkers(isTablePartitioned); + // add invalid file + createInvalidFile(isTablePartitioned ? 
"2020/06/01" : "", "invalid_file3"); + long fileSize = FileSystemTestUtils.listRecursive(fs, markerFolderPath).stream() + .filter(fileStatus -> !fileStatus.getPath().getName().contains(MarkerUtils.MARKER_TYPE_FILENAME)) + .count(); + assertEquals(fileSize, 4); + + List expectedPaths = isTablePartitioned + ? CollectionUtils.createImmutableList("2020/06/01/file1", "2020/06/03/file3") + : CollectionUtils.createImmutableList("file1", "file3"); + // then + assertIterableEquals(expectedPaths, + writeMarkers.createdAndMergedDataPaths(context, 2).stream().sorted().collect(Collectors.toList()) + ); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testAllMarkerPaths(boolean isTablePartitioned) throws IOException { + // given + createSomeMarkers(isTablePartitioned); + + List expectedPaths = isTablePartitioned + ? CollectionUtils.createImmutableList("2020/06/01/file1.marker.MERGE", + "2020/06/02/file2.marker.APPEND", "2020/06/03/file3.marker.CREATE") + : CollectionUtils.createImmutableList( + "file1.marker.MERGE", "file2.marker.APPEND", "file3.marker.CREATE"); + // then + assertIterableEquals(expectedPaths, + writeMarkers.allMarkerFilePaths().stream() + .filter(path -> !path.contains(MarkerUtils.MARKER_TYPE_FILENAME)) + .sorted().collect(Collectors.toList()) + ); + } + + @Test + public void testStripMarkerSuffix() { + // Given + final String pathPrefix = "file://" + metaClient.getMetaPath() + "/file"; + final String markerFilePath = pathPrefix + ".marker.APPEND"; + + // when-then + assertEquals(pathPrefix, WriteMarkers.stripMarkerSuffix(markerFilePath)); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java index b8e02b905b8cb..39dbacabac6cc 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java @@ -20,65 +20,114 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.SyncableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.keygen.TimestampBasedKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.MarkerFiles; +import 
org.apache.hudi.table.marker.WriteMarkers; +import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.hudi.testutils.Assertions; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.EnumSource; import org.junit.jupiter.params.provider.MethodSource; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; +import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME; +import static org.apache.hudi.common.table.HoodieTableConfig.BASE_FILE_FORMAT; +import static org.apache.hudi.common.table.HoodieTableConfig.TYPE; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; +import static org.apache.hudi.common.util.MarkerUtils.MARKERS_FILENAME_PREFIX; +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEPRECATED_DEFAULT_PARTITION_PATH; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; /** - * Unit tests {@link SparkUpgradeDowngrade}. + * Unit tests {@link UpgradeDowngrade}. 
*/ public class TestUpgradeDowngrade extends HoodieClientTestBase { private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with deletePartialMarkerFiles={0} and TableType = {1}"; + private static final String TEST_NAME_WITH_DOWNGRADE_PARAMS = "[{index}] Test with deletePartialMarkerFiles={0} and TableType = {1} and " + + "From version = {2}"; public static Stream configParams() { Object[][] data = new Object[][] { - {true, HoodieTableType.COPY_ON_WRITE}, {false, HoodieTableType.COPY_ON_WRITE}, - {true, HoodieTableType.MERGE_ON_READ}, {false, HoodieTableType.MERGE_ON_READ} + {true, HoodieTableType.COPY_ON_WRITE}, + {false, HoodieTableType.COPY_ON_WRITE}, + {true, HoodieTableType.MERGE_ON_READ}, + {false, HoodieTableType.MERGE_ON_READ} + }; + return Stream.of(data).map(Arguments::of); + } + + public static Stream downGradeConfigParams() { + Object[][] data = new Object[][] { + {true, HoodieTableType.COPY_ON_WRITE, HoodieTableVersion.TWO}, + {false, HoodieTableType.COPY_ON_WRITE, HoodieTableVersion.TWO}, + {true, HoodieTableType.MERGE_ON_READ, HoodieTableVersion.TWO}, + {false, HoodieTableType.MERGE_ON_READ, HoodieTableVersion.TWO}, + {true, HoodieTableType.COPY_ON_WRITE, HoodieTableVersion.ONE}, + {false, HoodieTableType.COPY_ON_WRITE, HoodieTableVersion.ONE}, + {true, HoodieTableType.MERGE_ON_READ, HoodieTableVersion.ONE}, + {false, HoodieTableType.MERGE_ON_READ, HoodieTableVersion.ONE} + }; + return Stream.of(data).map(Arguments::of); + } + + public static Stream twoToThreeUpgradeConfigParams() { + Object[][] data = new Object[][] { + {HoodieTableType.COPY_ON_WRITE, Option.empty()}, + {HoodieTableType.COPY_ON_WRITE, Option.of(TimestampBasedKeyGenerator.class.getName())}, + {HoodieTableType.MERGE_ON_READ, Option.empty()}, + {HoodieTableType.MERGE_ON_READ, Option.of(TimestampBasedKeyGenerator.class.getName())} }; return Stream.of(data).map(Arguments::of); } @@ -86,28 +135,33 @@ public static Stream configParams() { @BeforeEach public void setUp() throws Exception { initSparkContexts(); - initDFS(); + initPath(); initTestDataGenerator(); - initDFSMetaClient(); + initMetaClient(); + } + + @AfterEach + public void cleanUp() throws Exception { + cleanupResources(); } @Test public void testLeftOverUpdatedPropFileCleanup() throws IOException { - testUpgradeInternal(true, true, HoodieTableType.MERGE_ON_READ); + testUpgradeZeroToOneInternal(true, true, HoodieTableType.MERGE_ON_READ); } @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) @MethodSource("configParams") - public void testUpgrade(boolean deletePartialMarkerFiles, HoodieTableType tableType) throws IOException { - testUpgradeInternal(false, deletePartialMarkerFiles, tableType); + public void testUpgradeZeroToOne(boolean deletePartialMarkerFiles, HoodieTableType tableType) throws IOException { + testUpgradeZeroToOneInternal(false, deletePartialMarkerFiles, tableType); } - public void testUpgradeInternal(boolean induceResiduesFromPrevUpgrade, boolean deletePartialMarkerFiles, HoodieTableType tableType) throws IOException { + public void testUpgradeZeroToOneInternal(boolean induceResiduesFromPrevUpgrade, boolean deletePartialMarkerFiles, HoodieTableType tableType) throws IOException { // init config, table and client. 
Map params = new HashMap<>(); if (tableType == HoodieTableType.MERGE_ON_READ) { - params.put(HOODIE_TABLE_TYPE_PROP_NAME, HoodieTableType.MERGE_ON_READ.name()); - metaClient = HoodieTestUtils.init(dfs.getConf(), dfsBasePath, HoodieTableType.MERGE_ON_READ); + params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name()); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); } HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params).build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); @@ -121,8 +175,9 @@ public void testUpgradeInternal(boolean induceResiduesFromPrevUpgrade, boolean d HoodieInstant commitInstant = table.getPendingCommitTimeline().lastInstant().get(); // delete one of the marker files in 2nd commit if need be. - MarkerFiles markerFiles = new MarkerFiles(table, commitInstant.getTimestamp()); - List markerPaths = markerFiles.allMarkerFilePaths(); + WriteMarkers writeMarkers = + WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp()); + List markerPaths = new ArrayList<>(writeMarkers.allMarkerFilePaths()); if (deletePartialMarkerFiles) { String toDeleteMarkerFile = markerPaths.get(0); table.getMetaClient().getFs().delete(new Path(table.getMetaClient().getTempFolderPath() + "/" + commitInstant.getTimestamp() + "/" + toDeleteMarkerFile)); @@ -137,36 +192,306 @@ public void testUpgradeInternal(boolean induceResiduesFromPrevUpgrade, boolean d } // should re-create marker files for 2nd commit since its pending. - new SparkUpgradeDowngrade(metaClient, cfg, context).run(metaClient, HoodieTableVersion.ONE, cfg, context, null); + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.ONE, null); // assert marker files assertMarkerFilesForUpgrade(table, commitInstant, firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices); // verify hoodie.table.version got upgraded + metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build(); assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.ONE.versionCode()); assertTableVersionFromPropertyFile(HoodieTableVersion.ONE); // trigger 3rd commit with marker based rollback enabled. + /* HUDI-2310 List thirdBatch = triggerCommit("003", tableType, true); // Check the entire dataset has all records only from 1st commit and 3rd commit since 2nd is expected to be rolledback. assertRows(inputRecords.getKey(), thirdBatch); if (induceResiduesFromPrevUpgrade) { assertFalse(dfs.exists(new Path(metaClient.getMetaPath(), SparkUpgradeDowngrade.HOODIE_UPDATED_PROPERTY_FILE))); + }*/ + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + public void testUpgradeOneToTwo(HoodieTableType tableType) throws IOException { + // init config, table and client. 
+ Map params = new HashMap<>(); + addNewTableParamsToProps(params); + if (tableType == HoodieTableType.MERGE_ON_READ) { + params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name()); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); } + HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params).build(); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + // Write inserts + doInsert(client); + + // downgrade table props + downgradeTableConfigsFromTwoToOne(cfg); + + // perform upgrade + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.TWO, null); + + // verify hoodie.table.version got upgraded + metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.TWO.versionCode()); + assertTableVersionFromPropertyFile(HoodieTableVersion.TWO); + + // verify table props + assertTableProps(cfg); } - @ParameterizedTest(name = TEST_NAME_WITH_PARAMS) - @MethodSource("configParams") - public void testDowngrade(boolean deletePartialMarkerFiles, HoodieTableType tableType) throws IOException { + @ParameterizedTest + @MethodSource("twoToThreeUpgradeConfigParams") + public void testUpgradeTwoToThree( + HoodieTableType tableType, Option keyGeneratorClass) throws IOException { // init config, table and client. Map params = new HashMap<>(); + addNewTableParamsToProps(params); if (tableType == HoodieTableType.MERGE_ON_READ) { - params.put(HOODIE_TABLE_TYPE_PROP_NAME, HoodieTableType.MERGE_ON_READ.name()); + params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name()); metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); } + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder() + .withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params); + if (keyGeneratorClass.isPresent()) { + cfgBuilder.withKeyGenerator(keyGeneratorClass.get()); + } + HoodieWriteConfig cfg = cfgBuilder.build(); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + // Write inserts + doInsert(client); + + // downgrade table props + downgradeTableConfigsFromThreeToTwo(cfg); + + // perform upgrade + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.THREE, null); + + // verify hoodie.table.version got upgraded + metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.THREE.versionCode()); + assertTableVersionFromPropertyFile(HoodieTableVersion.THREE); + + // verify table props + HoodieTableConfig tableConfig = metaClient.getTableConfig(); + Properties originalProps = cfg.getProps(); + assertEquals(tableConfig.getUrlEncodePartitioning(), + cfg.getStringOrDefault(HoodieTableConfig.URL_ENCODE_PARTITIONING)); + assertEquals(tableConfig.getHiveStylePartitioningEnable(), + cfg.getStringOrDefault(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)); + assertEquals(tableConfig.getKeyGeneratorClassName(), originalProps.getOrDefault( + 
HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), SimpleKeyGenerator.class.getName())); + } + + @Test + public void testUpgradeDowngradeBetweenThreeAndCurrentVersion() throws IOException { + // init config, table and client. + Map params = new HashMap<>(); + addNewTableParamsToProps(params); HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params).build(); + + // write inserts + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + doInsert(client); + + // current version should have TABLE_CHECKSUM key + assertEquals(HoodieTableVersion.current(), metaClient.getTableConfig().getTableVersion()); + assertTableVersionFromPropertyFile(HoodieTableVersion.current()); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + String checksum = metaClient.getTableConfig().getProps().getString(HoodieTableConfig.TABLE_CHECKSUM.key()); + + // downgrade to version 3 and check TABLE_CHECKSUM is still present + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.THREE, null); + assertEquals(HoodieTableVersion.THREE.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode()); + assertTableVersionFromPropertyFile(HoodieTableVersion.THREE); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + assertEquals(checksum, metaClient.getTableConfig().getProps().getString(HoodieTableConfig.TABLE_CHECKSUM.key())); + + // remove TABLE_CHECKSUM and upgrade to current version + metaClient.getTableConfig().getProps().remove(HoodieTableConfig.TABLE_CHECKSUM.key()); + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.current(), null); + + // verify upgrade and TABLE_CHECKSUM + metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build(); + assertEquals(HoodieTableVersion.current().versionCode(), metaClient.getTableConfig().getTableVersion().versionCode()); + assertTableVersionFromPropertyFile(HoodieTableVersion.current()); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + assertEquals(checksum, metaClient.getTableConfig().getProps().getString(HoodieTableConfig.TABLE_CHECKSUM.key())); + } + + @Test + public void testUpgradeFourtoFive() throws Exception { + testUpgradeFourToFiveInternal(false, false); + } + + @Test + public void testUpgradeFourtoFiveWithDefaultPartition() throws Exception { + testUpgradeFourToFiveInternal(true, false); + } + + @Test + public void testUpgradeFourtoFiveWithDefaultPartitionWithSkipValidation() throws Exception { + testUpgradeFourToFiveInternal(true, true); + } + + private void testUpgradeFourToFiveInternal(boolean assertDefaultPartition, boolean skipDefaultPartitionValidation) throws Exception { + String tableName = metaClient.getTableConfig().getTableName(); + // clean up and re instantiate meta client w/ right table props + cleanUp(); + initSparkContexts(); + initPath(); + initTestDataGenerator(); + + Map params = new HashMap<>(); + addNewTableParamsToProps(params, tableName); + Properties properties = new Properties(); + params.forEach((k,v) -> properties.setProperty(k, v)); + + initMetaClient(getTableType(), properties); + // init config, table and client. 
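The behaviour this test pins down is that the four-to-five upgrade refuses to proceed while the deprecated "default" partition is still present, unless validation is explicitly skipped. Reduced to its essence, that is a single guard; the sketch below is a simplification with invented names, assuming the deprecated partition literal is "default", and is not the actual upgrade handler.

    import java.util.List;

    // Illustrative only: mirror the assertThrows / skipDefaultPartitionValidation branches in the test.
    final class DefaultPartitionCheckSketch {

      private static final String DEPRECATED_DEFAULT_PARTITION = "default";

      static void validate(List<String> partitionPaths, boolean skipValidation) {
        if (skipValidation) {
          return; // caller explicitly accepts the risk of keeping the old partition name
        }
        if (partitionPaths.contains(DEPRECATED_DEFAULT_PARTITION)) {
          throw new IllegalStateException(
              "Upgrade to table version five is expected to fail while the deprecated \"default\" partition is present.");
        }
      }

      private DefaultPartitionCheckSketch() {
      }
    }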
+ HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false) + .doSkipDefaultPartitionValidation(skipDefaultPartitionValidation).withProps(params).build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); + // Write inserts + doInsert(client); + + if (assertDefaultPartition) { + doInsertWithDefaultPartition(client); + } + + // downgrade table props + downgradeTableConfigsFromFiveToFour(cfg); + + // perform upgrade + if (assertDefaultPartition && !skipDefaultPartitionValidation) { + // if "default" partition is present, upgrade should fail + assertThrows(HoodieException.class, () -> new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.FIVE, null), "Upgrade from 4 to 5 is expected to fail if \"default\" partition is present."); + } else { + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(HoodieTableVersion.FIVE, null); + + // verify hoodie.table.version got upgraded + metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()).build(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.FIVE.versionCode()); + assertTableVersionFromPropertyFile(HoodieTableVersion.FIVE); + + // verify table props + assertTableProps(cfg); + } + } + + private void addNewTableParamsToProps(Map params) { + addNewTableParamsToProps(params, metaClient.getTableConfig().getTableName()); + } + + private void addNewTableParamsToProps(Map params, String tableName) { + params.put(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid"); + params.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), "uuid"); + params.put(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path"); + params.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partition_path"); + params.put(HoodieTableConfig.NAME.key(), tableName); + params.put(BASE_FILE_FORMAT.key(), BASE_FILE_FORMAT.defaultValue().name()); + } + + private void doInsert(SparkRDDWriteClient client) { + // Write 1 (only inserts) + String commit1 = "000"; + client.startCommitWithTime(commit1); + List records = dataGen.generateInserts(commit1, 100); + JavaRDD writeRecords = jsc.parallelize(records, 1); + client.insert(writeRecords, commit1).collect(); + } + + private void doInsertWithDefaultPartition(SparkRDDWriteClient client) { + // Write 1 (only inserts) + dataGen = new HoodieTestDataGenerator(new String[]{DEPRECATED_DEFAULT_PARTITION_PATH}); + String commit1 = "005"; + client.startCommitWithTime(commit1); + List records = dataGen.generateInserts(commit1, 100); + JavaRDD writeRecords = jsc.parallelize(records, 1); + client.insert(writeRecords, commit1).collect(); + } + + private void downgradeTableConfigsFromTwoToOne(HoodieWriteConfig cfg) throws IOException { + Properties properties = new Properties(cfg.getProps()); + properties.remove(HoodieTableConfig.RECORDKEY_FIELDS.key()); + properties.remove(HoodieTableConfig.PARTITION_FIELDS.key()); + properties.remove(HoodieTableConfig.NAME.key()); + properties.remove(BASE_FILE_FORMAT.key()); + properties.setProperty(HoodieTableConfig.VERSION.key(), "1"); + + metaClient = HoodieTestUtils.init(hadoopConf, basePath, getTableType(), properties); + // set hoodie.table.version to 1 in hoodie.properties file + metaClient.getTableConfig().setTableVersion(HoodieTableVersion.ONE); + } + + private void downgradeTableConfigsFromThreeToTwo(HoodieWriteConfig cfg) throws IOException { + Properties 
properties = new Properties(cfg.getProps()); + properties.remove(HoodieTableConfig.URL_ENCODE_PARTITIONING.key()); + properties.remove(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key()); + properties.remove(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key()); + properties.remove(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key()); + properties.setProperty(HoodieTableConfig.VERSION.key(), "2"); + + metaClient = HoodieTestUtils.init(hadoopConf, basePath, getTableType(), properties); + // set hoodie.table.version to 2 in hoodie.properties file + metaClient.getTableConfig().setTableVersion(HoodieTableVersion.TWO); + } + + private void downgradeTableConfigsFromFiveToFour(HoodieWriteConfig cfg) throws IOException { + Properties properties = new Properties(); + cfg.getProps().forEach((k,v) -> properties.setProperty((String) k, (String) v)); + properties.setProperty(HoodieTableConfig.VERSION.key(), "4"); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, getTableType(), properties); + // set hoodie.table.version to 4 in hoodie.properties file + metaClient.getTableConfig().setTableVersion(HoodieTableVersion.FOUR); + } + + private void assertTableProps(HoodieWriteConfig cfg) { + HoodieTableConfig tableConfig = metaClient.getTableConfig(); + Properties originalProps = cfg.getProps(); + assertEquals(tableConfig.getPartitionFieldProp(), originalProps.getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); + assertEquals(tableConfig.getRecordKeyFieldProp(), originalProps.getProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())); + assertEquals(tableConfig.getTableName(), cfg.getTableName()); + assertEquals(tableConfig.getBaseFileFormat().name(), originalProps.getProperty(BASE_FILE_FORMAT.key())); + } + + @ParameterizedTest(name = TEST_NAME_WITH_DOWNGRADE_PARAMS) + @MethodSource("downGradeConfigParams") + public void testDowngrade(boolean deletePartialMarkerFiles, HoodieTableType tableType, HoodieTableVersion fromVersion) throws IOException { + MarkerType markerType = fromVersion == HoodieTableVersion.TWO ? MarkerType.TIMELINE_SERVER_BASED : MarkerType.DIRECT; + // init config, table and client. + Map params = new HashMap<>(); + if (fromVersion == HoodieTableVersion.TWO) { + addNewTableParamsToProps(params); + } + if (tableType == HoodieTableType.MERGE_ON_READ) { + params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name()); + metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ); + } + HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(true) + .withMarkersType(markerType.name()).withProps(params).build(); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + + if (fromVersion == HoodieTableVersion.TWO) { + // set table configs + HoodieTableConfig tableConfig = metaClient.getTableConfig(); + tableConfig.setValue(HoodieTableConfig.NAME, cfg.getTableName()); + tableConfig.setValue(HoodieTableConfig.PARTITION_FIELDS, cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); + tableConfig.setValue(HoodieTableConfig.RECORDKEY_FIELDS, cfg.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())); + tableConfig.setValue(BASE_FILE_FORMAT, cfg.getString(BASE_FILE_FORMAT)); + } // prepare data. Make 2 commits, in which 2nd is not committed. 
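The downgrade helpers above all reduce to the same trick: rewrite hoodie.properties with an older hoodie.table.version and re-initialize the meta client. Before the two-commit setup that follows, here is a small illustrative round trip of that file (the class name is invented and the package of HoodieConfig is assumed to be org.apache.hudi.common.config):

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.HoodieTableVersion;

import java.io.IOException;

/** Illustrative helper: force an older hoodie.table.version into hoodie.properties and read it back. */
final class TablePropertiesSketch {

  static void rewriteVersion(HoodieTableMetaClient metaClient, HoodieTableVersion olderVersion) throws IOException {
    metaClient.getTableConfig().setTableVersion(olderVersion);
    Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE);
    // Persist the in-memory table config, now carrying the older version, back to storage.
    try (FSDataOutputStream os = metaClient.getFs().create(propertyFile)) {
      metaClient.getTableConfig().getProps().store(os, "");
    }
  }

  static String readVersion(HoodieTableMetaClient metaClient) throws IOException {
    Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE);
    // HoodieConfig understands the properties layout, so the version comes back as a plain string.
    try (FSDataInputStream in = metaClient.getFs().open(propertyFile)) {
      return HoodieConfig.create(in).getString(HoodieTableConfig.VERSION);
    }
  }
}

Rewriting this one key is what the helpers above rely on to make UpgradeDowngrade treat the table as outdated on its next run.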
List firstPartitionCommit2FileSlices = new ArrayList<>(); @@ -177,51 +502,79 @@ public void testDowngrade(boolean deletePartialMarkerFiles, HoodieTableType tabl HoodieInstant commitInstant = table.getPendingCommitTimeline().lastInstant().get(); // delete one of the marker files in 2nd commit if need be. - MarkerFiles markerFiles = new MarkerFiles(table, commitInstant.getTimestamp()); - List markerPaths = markerFiles.allMarkerFilePaths(); + WriteMarkers writeMarkers = WriteMarkersFactory.get(markerType, table, commitInstant.getTimestamp()); + List markerPaths = new ArrayList<>(writeMarkers.allMarkerFilePaths()); if (deletePartialMarkerFiles) { String toDeleteMarkerFile = markerPaths.get(0); table.getMetaClient().getFs().delete(new Path(table.getMetaClient().getTempFolderPath() + "/" + commitInstant.getTimestamp() + "/" + toDeleteMarkerFile)); markerPaths.remove(toDeleteMarkerFile); } - // set hoodie.table.version to 1 in hoodie.properties file - prepForDowngrade(); + // set hoodie.table.version to fromVersion in hoodie.properties file + HoodieTableVersion toVersion = HoodieTableVersion.ZERO; + if (fromVersion == HoodieTableVersion.TWO) { + prepForDowngradeFromTwoToOne(); + toVersion = HoodieTableVersion.ONE; + } else { + prepForDowngradeFromOneToZero(); + } // downgrade should be performed. all marker files should be deleted - new SparkUpgradeDowngrade(metaClient, cfg, context).run(metaClient, HoodieTableVersion.ZERO, cfg, context, null); - - // assert marker files - assertMarkerFilesForDowngrade(table, commitInstant); + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()) + .run(toVersion, null); + if (fromVersion == HoodieTableVersion.TWO) { + // assert marker files + assertMarkerFilesForDowngrade(table, commitInstant, toVersion == HoodieTableVersion.ONE); + } + // verify hoodie.table.version got downgraded - assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.ZERO.versionCode()); - assertTableVersionFromPropertyFile(HoodieTableVersion.ZERO); + metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build(); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), toVersion.versionCode()); + assertTableVersionFromPropertyFile(toVersion); // trigger 3rd commit with marker based rollback disabled. + /* HUDI-2310 List thirdBatch = triggerCommit("003", tableType, false); // Check the entire dataset has all records only from 1st commit and 3rd commit since 2nd is expected to be rolledback. 
assertRows(inputRecords.getKey(), thirdBatch); + */ } - private void assertMarkerFilesForDowngrade(HoodieTable table, HoodieInstant commitInstant) throws IOException { + private void assertMarkerFilesForDowngrade(HoodieTable table, HoodieInstant commitInstant, boolean assertExists) throws IOException { // Verify recreated marker files are as expected - MarkerFiles markerFiles = new MarkerFiles(table, commitInstant.getTimestamp()); - assertFalse(markerFiles.doesMarkerDirExist()); + WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp()); + if (assertExists) { + assertTrue(writeMarkers.doesMarkerDirExist()); + assertEquals(0, getTimelineServerBasedMarkerFileCount(table.getMetaClient().getMarkerFolderPath(commitInstant.getTimestamp()), + table.getMetaClient().getFs())); + } else { + assertFalse(writeMarkers.doesMarkerDirExist()); + } + } + + private long getTimelineServerBasedMarkerFileCount(String markerDir, FileSystem fileSystem) throws IOException { + FileStatus[] fileStatuses = fileSystem.listStatus(new Path(markerDir)); + Predicate prefixFilter = pathStr -> pathStr.contains(MARKERS_FILENAME_PREFIX); + return Arrays.stream(fileStatuses) + .map(fileStatus -> fileStatus.getPath().toString()) + .filter(prefixFilter) + .collect(Collectors.toList()).stream().count(); } private void assertMarkerFilesForUpgrade(HoodieTable table, HoodieInstant commitInstant, List firstPartitionCommit2FileSlices, List secondPartitionCommit2FileSlices) throws IOException { // Verify recreated marker files are as expected - MarkerFiles markerFiles = new MarkerFiles(table, commitInstant.getTimestamp()); - assertTrue(markerFiles.doesMarkerDirExist()); - List files = markerFiles.allMarkerFilePaths(); + WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp()); + assertTrue(writeMarkers.doesMarkerDirExist()); + Set files = writeMarkers.allMarkerFilePaths(); assertEquals(2, files.size()); List actualFiles = new ArrayList<>(); for (String file : files) { - String fileName = MarkerFiles.stripMarkerSuffix(file); + String fileName = WriteMarkers.stripMarkerSuffix(file); actualFiles.add(fileName); } @@ -282,10 +635,10 @@ private void assertMarkerFilesForUpgrade(HoodieTable table, HoodieInstant commit private List triggerCommit(String newCommitTime, HoodieTableType tableType, boolean enableMarkedBasedRollback) { Map params = new HashMap<>(); if (tableType == HoodieTableType.MERGE_ON_READ) { - params.put(HOODIE_TABLE_TYPE_PROP_NAME, HoodieTableType.MERGE_ON_READ.name()); + params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name()); } HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(enableMarkedBasedRollback).withProps(params).build(); - SparkRDDWriteClient client = getHoodieWriteClient(cfg, true); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); client.startCommitWithTime(newCommitTime); @@ -341,7 +694,7 @@ private Pair, List> twoUpsertCommitDataWithTwoP //just generate two partitions dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); //1. 
prepare data - HoodieTestDataGenerator.writePartitionMetadata(dfs, new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, dfsBasePath); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(metaClient.getFs(), new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); /** * Write 1 (only inserts) */ @@ -388,30 +741,38 @@ private Pair, List> twoUpsertCommitDataWithTwoP return Pair.of(records, records2); } - private void prepForDowngrade() throws IOException { + private void prepForDowngradeFromOneToZero() throws IOException { metaClient.getTableConfig().setTableVersion(HoodieTableVersion.ONE); Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); try (FSDataOutputStream os = metaClient.getFs().create(propertyFile)) { - metaClient.getTableConfig().getProperties().store(os, ""); + metaClient.getTableConfig().getProps().store(os, ""); + } + } + + private void prepForDowngradeFromTwoToOne() throws IOException { + metaClient.getTableConfig().setTableVersion(HoodieTableVersion.TWO); + Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); + try (FSDataOutputStream os = metaClient.getFs().create(propertyFile)) { + metaClient.getTableConfig().getProps().store(os, ""); } } private void createResidualFile() throws IOException { Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); - Path updatedPropertyFile = new Path(metaClient.getMetaPath() + "/" + SparkUpgradeDowngrade.HOODIE_UPDATED_PROPERTY_FILE); + Path updatedPropertyFile = new Path(metaClient.getMetaPath() + "/" + UpgradeDowngrade.HOODIE_UPDATED_PROPERTY_FILE); // Step1: Copy hoodie.properties to hoodie.properties.orig FileUtil.copy(metaClient.getFs(), propertyFile, metaClient.getFs(), updatedPropertyFile, - false, metaClient.getHadoopConf()); + false, hadoopConf); } private void assertTableVersionFromPropertyFile(HoodieTableVersion expectedVersion) throws IOException { Path propertyFile = new Path(metaClient.getMetaPath() + "/" + HoodieTableConfig.HOODIE_PROPERTIES_FILE); // Load the properties and verify FSDataInputStream fsDataInputStream = metaClient.getFs().open(propertyFile); - Properties prop = new Properties(); - prop.load(fsDataInputStream); + HoodieConfig hoodieConfig = HoodieConfig.create(fsDataInputStream); fsDataInputStream.close(); - assertEquals(Integer.toString(expectedVersion.versionCode()), prop.getProperty(HoodieTableConfig.HOODIE_TABLE_VERSION_PROP_NAME)); + assertEquals(Integer.toString(expectedVersion.versionCode()), hoodieConfig + .getString(HoodieTableConfig.VERSION)); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/Assertions.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/Assertions.java deleted file mode 100644 index ad6561025a51d..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/Assertions.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.hudi.testutils; - -import org.apache.hudi.client.WriteStatus; - -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertAll; -import static org.junit.jupiter.api.Assertions.assertFalse; - -/** - * Commonly used assertion functions. - */ -public class Assertions { - - /** - * Assert no failures in writing hoodie files. - */ - public static void assertNoWriteErrors(List statuses) { - assertAll(statuses.stream().map(status -> () -> - assertFalse(status.hasErrors(), "Errors found in write of " + status.getFileId()))); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/FunctionalTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/FunctionalTestHarness.java index 1020e932b684c..9d28577059404 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/FunctionalTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/FunctionalTestHarness.java @@ -19,12 +19,11 @@ package org.apache.hudi.testutils; -import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieAvroPayload; -import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; import org.apache.hudi.config.HoodieWriteConfig; @@ -44,19 +43,22 @@ import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.io.TempDir; import java.io.IOException; import java.util.Properties; -import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; import static org.apache.hudi.common.testutils.HoodieTestUtils.RAW_TRIPS_TEST_NAME; +/** + * @deprecated Deprecated. Use {@link SparkClientFunctionalTestHarness} instead. 
+ */ public class FunctionalTestHarness implements SparkProvider, DFSProvider, HoodieMetaClientProvider, HoodieWriteClientProvider { - private static transient SparkSession spark; + protected static transient SparkSession spark; private static transient SQLContext sqlContext; private static transient JavaSparkContext jsc; protected static transient HoodieSparkEngineContext context; @@ -117,16 +119,18 @@ public HoodieTableMetaClient getHoodieMetaClient(Configuration hadoopConf, Strin @Override public HoodieTableMetaClient getHoodieMetaClient(Configuration hadoopConf, String basePath, Properties props) throws IOException { - props.putIfAbsent(HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP_NAME, PARQUET.toString()); - props.putIfAbsent(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME); - props.putIfAbsent(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, COPY_ON_WRITE.name()); - props.putIfAbsent(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, HoodieAvroPayload.class.getName()); + props = HoodieTableMetaClient.withPropertyBuilder() + .setTableName(RAW_TRIPS_TEST_NAME) + .setTableType(COPY_ON_WRITE) + .setPayloadClass(HoodieAvroPayload.class) + .fromProperties(props) + .build(); return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, props); } @Override public SparkRDDWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) throws IOException { - return new SparkRDDWriteClient(context(), cfg, false); + return new SparkRDDWriteClient(context(), cfg); } @BeforeEach @@ -135,7 +139,7 @@ public synchronized void runBeforeEach() throws Exception { if (!initialized) { SparkConf sparkConf = conf(); SparkRDDWriteClient.registerClasses(sparkConf); - HoodieReadClient.addHoodieSupport(sparkConf); + SparkRDDReadClient.addHoodieSupport(sparkConf); spark = SparkSession.builder().config(sparkConf).getOrCreate(); sqlContext = spark.sqlContext(); jsc = new JavaSparkContext(spark.sparkContext()); @@ -150,12 +154,22 @@ public synchronized void runBeforeEach() throws Exception { hdfsTestService.stop(); hdfsTestService = null; + jsc.close(); + jsc = null; spark.stop(); spark = null; })); } } + @AfterEach + public synchronized void tearDown() throws Exception { + if (spark != null) { + spark.stop(); + spark = null; + } + } + @AfterAll public static synchronized void cleanUpAfterAll() throws IOException { Path workDir = dfs.getWorkingDirectory(); @@ -164,5 +178,19 @@ public static synchronized void cleanUpAfterAll() throws IOException { for (FileStatus f : fileStatuses) { fs.delete(f.getPath(), true); } + if (hdfsTestService != null) { + hdfsTestService.stop(); + hdfsTestService = null; + } + if (spark != null) { + spark.stop(); + spark = null; + } + if (jsc != null) { + jsc.close(); + jsc = null; + } + sqlContext = null; + context = null; } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java index 1caf9c04ff011..899bfbfbce2da 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java @@ -20,9 +20,12 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.HoodieCleanStat; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import 
org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; @@ -36,14 +39,17 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.RawTripTestPayload; import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex.IndexType; -import org.apache.hudi.index.SparkHoodieIndex; +import org.apache.hudi.index.SparkHoodieIndexFactory; import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; @@ -105,17 +111,30 @@ public HoodieWriteConfig.Builder getConfigBuilder() { return getConfigBuilder(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA); } + /** + * Get Config builder with default configs set. + * + * @return Config Builder + */ + public HoodieWriteConfig.Builder getConfigBuilder(HoodieFailedWritesCleaningPolicy cleaningPolicy) { + return getConfigBuilder(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA, IndexType.BLOOM, cleaningPolicy); + } + /** * Get Config builder with default configs set. * * @return Config Builder */ public HoodieWriteConfig.Builder getConfigBuilder(IndexType indexType) { - return getConfigBuilder(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA, indexType); + return getConfigBuilder(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA, indexType, HoodieFailedWritesCleaningPolicy.EAGER); } public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr) { - return getConfigBuilder(schemaStr, IndexType.BLOOM); + return getConfigBuilder(schemaStr, IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER); + } + + public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr, IndexType indexType) { + return getConfigBuilder(schemaStr, indexType, HoodieFailedWritesCleaningPolicy.EAGER); } /** @@ -123,18 +142,21 @@ public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr) { * * @return Config Builder */ - public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr, IndexType indexType) { + public HoodieWriteConfig.Builder getConfigBuilder(String schemaStr, IndexType indexType, + HoodieFailedWritesCleaningPolicy cleaningPolicy) { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr) .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) .withWriteStatusClass(MetadataMergeWriteStatus.class) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder().withFailedWritesCleaningPolicy(cleaningPolicy).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) - .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + 
.withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).orcMaxFileSize(1024 * 1024).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()) .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() .withEnableBackupForRemoteFileSystemView(false) // Fail test if problem connecting to timeline-server + .withRemoteServerPort(timelineServicePort) .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); } @@ -144,18 +166,18 @@ public HoodieSparkTable getHoodieTable(HoodieTableMetaClient metaClient, HoodieW return table; } - public void assertPartitionMetadataForRecords(List inputRecords, FileSystem fs) throws IOException { + public void assertPartitionMetadataForRecords(String basePath, List inputRecords, FileSystem fs) throws IOException { Set partitionPathSet = inputRecords.stream() .map(HoodieRecord::getPartitionPath) .collect(Collectors.toSet()); - assertPartitionMetadata(partitionPathSet.stream().toArray(String[]::new), fs); + assertPartitionMetadata(basePath, partitionPathSet.stream().toArray(String[]::new), fs); } - public void assertPartitionMetadataForKeys(List inputKeys, FileSystem fs) throws IOException { + public void assertPartitionMetadataForKeys(String basePath, List inputKeys, FileSystem fs) throws IOException { Set partitionPathSet = inputKeys.stream() .map(HoodieKey::getPartitionPath) .collect(Collectors.toSet()); - assertPartitionMetadata(partitionPathSet.stream().toArray(String[]::new), fs); + assertPartitionMetadata(basePath, partitionPathSet.stream().toArray(String[]::new), fs); } /** @@ -165,7 +187,7 @@ public void assertPartitionMetadataForKeys(List inputKeys, FileSystem * @param fs File System * @throws IOException in case of error */ - public void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException { + public static void assertPartitionMetadata(String basePath, String[] partitionPaths, FileSystem fs) throws IOException { for (String partitionPath : partitionPaths) { assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath))); HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath)); @@ -180,7 +202,7 @@ public void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) thro * @param taggedRecords Tagged Records * @param instantTime Commit Timestamp */ - public void checkTaggedRecords(List taggedRecords, String instantTime) { + public static void checkTaggedRecords(List taggedRecords, String instantTime) { for (HoodieRecord rec : taggedRecords) { assertTrue(rec.isCurrentLocationKnown(), "Record " + rec + " found with no location."); assertEquals(rec.getCurrentLocation().getInstantTime(), instantTime, @@ -193,7 +215,7 @@ public void checkTaggedRecords(List taggedRecords, String instantT * * @param records List of Hoodie records */ - public void assertNodupesWithinPartition(List> records) { + public static void assertNodupesWithinPartition(List> records) { Map> partitionToKeys = new HashMap<>(); for (HoodieRecord r : records) { String key = r.getRecordKey(); @@ -212,17 +234,46 @@ public void assertNodupesWithinPartition(List> * guaranteed by record-generation function itself. 
* * @param writeConfig Hoodie Write Config - * @param recordGenFunction Records Generation function + * @param recordsGenFunction Records Generation function * @return Wrapped function */ - private Function2, String, Integer> wrapRecordsGenFunctionForPreppedCalls( - final HoodieWriteConfig writeConfig, final Function2, String, Integer> recordGenFunction) { + public static Function2, String, Integer> wrapRecordsGenFunctionForPreppedCalls( + final String basePath, + final Configuration hadoopConf, + final HoodieSparkEngineContext context, + final HoodieWriteConfig writeConfig, + final Function2, String, Integer> recordsGenFunction) { return (commit, numRecords) -> { - final SparkHoodieIndex index = SparkHoodieIndex.createIndex(writeConfig); - List records = recordGenFunction.apply(commit, numRecords); - final HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath, true); + final HoodieIndex index = SparkHoodieIndexFactory.createIndex(writeConfig); + List records = recordsGenFunction.apply(commit, numRecords); + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); + HoodieSparkTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + JavaRDD taggedRecords = tagLocation(index, context, context.getJavaSparkContext().parallelize(records, 1), table); + return taggedRecords.collect(); + }; + } + + /** + * Helper to generate records generation function for testing Prepped version of API. Prepped APIs expect the records + * to be already de-duped and have location set. This wrapper takes care of record-location setting. Uniqueness is + * guaranteed by record-generation function itself. + * + * @param writeConfig Hoodie Write Config + * @param recordsGenFunction Records Generation function (for partition) + * @return Wrapped function + */ + public static Function3, String, Integer, String> wrapPartitionRecordsGenFunctionForPreppedCalls( + final String basePath, + final Configuration hadoopConf, + final HoodieSparkEngineContext context, + final HoodieWriteConfig writeConfig, + final Function3, String, Integer, String> recordsGenFunction) { + return (commit, numRecords, partition) -> { + final HoodieIndex index = SparkHoodieIndexFactory.createIndex(writeConfig); + List records = recordsGenFunction.apply(commit, numRecords, partition); + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); HoodieSparkTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - JavaRDD taggedRecords = index.tagLocation(jsc.parallelize(records, 1), context, table); + JavaRDD taggedRecords = tagLocation(index, context, context.getJavaSparkContext().parallelize(records, 1), table); return taggedRecords.collect(); }; } @@ -236,16 +287,20 @@ private Function2, String, Integer> wrapRecordsGenFunctionFor * @param keyGenFunction Keys Generation function * @return Wrapped function */ - private Function> wrapDeleteKeysGenFunctionForPreppedCalls( - final HoodieWriteConfig writeConfig, final Function> keyGenFunction) { + public static Function> wrapDeleteKeysGenFunctionForPreppedCalls( + final String basePath, + final Configuration hadoopConf, + final HoodieSparkEngineContext context, + final HoodieWriteConfig writeConfig, + final Function> keyGenFunction) { return (numRecords) -> { - final SparkHoodieIndex index = SparkHoodieIndex.createIndex(writeConfig); + final 
HoodieIndex index = SparkHoodieIndexFactory.createIndex(writeConfig); List records = keyGenFunction.apply(numRecords); - final HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath, true); + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); HoodieSparkTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - JavaRDD recordsToDelete = jsc.parallelize(records, 1) - .map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload())); - JavaRDD taggedRecords = index.tagLocation(recordsToDelete, context, table); + JavaRDD recordsToDelete = context.getJavaSparkContext().parallelize(records, 1) + .map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload())); + JavaRDD taggedRecords = tagLocation(index, context, recordsToDelete, table); return taggedRecords.map(record -> record.getKey()).collect(); }; } @@ -262,7 +317,24 @@ public Function2, String, Integer> generateWrapRecordsFn(bool HoodieWriteConfig writeConfig, Function2, String, Integer> wrapped) { if (isPreppedAPI) { - return wrapRecordsGenFunctionForPreppedCalls(writeConfig, wrapped); + return wrapRecordsGenFunctionForPreppedCalls(basePath, hadoopConf, context, writeConfig, wrapped); + } else { + return wrapped; + } + } + + /** + * Generate wrapper for record generation function for testing Prepped APIs. + * + * @param isPreppedAPI Flag to indicate if this is for testing prepped-version of APIs + * @param writeConfig Hoodie Write Config + * @param wrapped Actual Records Generation function (for partition) + * @return Wrapped Function + */ + public Function3, String, Integer, String> generateWrapRecordsForPartitionFn(boolean isPreppedAPI, + HoodieWriteConfig writeConfig, Function3, String, Integer, String> wrapped) { + if (isPreppedAPI) { + return wrapPartitionRecordsGenFunctionForPreppedCalls(basePath, hadoopConf, context, writeConfig, wrapped); } else { return wrapped; } @@ -279,12 +351,19 @@ public Function2, String, Integer> generateWrapRecordsFn(bool public Function> generateWrapDeleteKeysFn(boolean isPreppedAPI, HoodieWriteConfig writeConfig, Function> wrapped) { if (isPreppedAPI) { - return wrapDeleteKeysGenFunctionForPreppedCalls(writeConfig, wrapped); + return wrapDeleteKeysGenFunctionForPreppedCalls(basePath, hadoopConf, context, writeConfig, wrapped); } else { return wrapped; } } + public JavaRDD insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime, + String initCommitTime, int numRecordsInThisCommit, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean isPreppedAPI, + boolean assertForCommit, int expRecordsInThisCommit) throws Exception { + return insertFirstBatch(writeConfig, client, newCommitTime, initCommitTime, numRecordsInThisCommit, writeFn, isPreppedAPI, assertForCommit, expRecordsInThisCommit, true); + } + /** * Helper to insert first batch of records and do regular assertions on the state after successful completion. 
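The prepped-API wrappers above hinge on tagging each generated record with its current file location through the index before the write is issued. A compact sketch of that tagging step, built only from calls visible in this change (the class name is illustrative and generics are kept loose):

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.SparkHoodieIndexFactory;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.spark.api.java.JavaRDD;

import java.util.List;

/** Illustrative helper: tag generated records with their file locations before calling a prepped API. */
final class PreppedRecordsSketch {

  static List<HoodieRecord> tagForPreppedWrite(List<HoodieRecord> records, String basePath, Configuration hadoopConf,
                                               HoodieSparkEngineContext context, HoodieWriteConfig writeConfig) {
    // The index resolves where each key currently lives; prepped APIs expect that to be done up front.
    HoodieIndex index = SparkHoodieIndexFactory.createIndex(writeConfig);
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
    HoodieSparkTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
    JavaRDD<HoodieRecord> recordsRdd = context.getJavaSparkContext().parallelize(records, 1);
    JavaRDD<HoodieRecord> tagged = HoodieJavaRDD.getJavaRDD(
        index.tagLocation(HoodieJavaRDD.of(recordsRdd), context, table));
    return tagged.collect();
  }
}

The harness wrappers above perform exactly this step inside the generated functions so that tests can call the prepped write, update, and delete APIs directly.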
* @@ -303,12 +382,12 @@ public Function> generateWrapDeleteKeysFn(boolean isPre public JavaRDD insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime, String initCommitTime, int numRecordsInThisCommit, Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean isPreppedAPI, - boolean assertForCommit, int expRecordsInThisCommit) throws Exception { + boolean assertForCommit, int expRecordsInThisCommit, boolean filterForCommitTimeWithAssert) throws Exception { final Function2, String, Integer> recordGenFunction = generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts); return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit, - recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1); + recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1, false, filterForCommitTimeWithAssert); } /** @@ -331,12 +410,31 @@ public JavaRDD insertFirstBatch(HoodieWriteConfig writeConfig, Spar public JavaRDD insertBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime, String initCommitTime, int numRecordsInThisCommit, Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean isPreppedAPI, - boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception { - final Function2, String, Integer> recordGenFunction = - generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts); + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, Option partition) throws Exception { - return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit, - recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits); + if (partition.isPresent()) { + final Function3, String, Integer, String> recordGenFunction = + generateWrapRecordsForPartitionFn(isPreppedAPI, writeConfig, dataGen::generateInsertsForPartition); + + return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit, + recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, false, + partition.get()); + } else { + final Function2, String, Integer> recordGenFunction = + generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts); + + return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit, + recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, false); + } + } + + public JavaRDD updateBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime, + String prevCommitTime, Option> commitTimesBetweenPrevAndNew, String initCommitTime, + int numRecordsInThisCommit, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean isPreppedAPI, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception { + return updateBatch(writeConfig, client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, writeFn, + isPreppedAPI, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, true); } /** @@ -362,13 +460,23 @@ public JavaRDD updateBatch(HoodieWriteConfig writeConfig, SparkRDDW String prevCommitTime, 
Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, boolean isPreppedAPI, - boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception { + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, + boolean filterForCommitTimeWithAssert) throws Exception { final Function2, String, Integer> recordGenFunction = generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateUniqueUpdates); return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, - expTotalCommits); + expTotalCommits, false, filterForCommitTimeWithAssert); + } + + public JavaRDD deleteBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime, + String prevCommitTime, String initCommitTime, + int numRecordsInThisCommit, + Function3, SparkRDDWriteClient, JavaRDD, String> deleteFn, boolean isPreppedAPI, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception { + return deleteBatch(writeConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit, deleteFn, isPreppedAPI, + assertForCommit, expRecordsInThisCommit, expTotalRecords, true); } /** @@ -392,13 +500,32 @@ public JavaRDD deleteBatch(HoodieWriteConfig writeConfig, SparkRDDW String prevCommitTime, String initCommitTime, int numRecordsInThisCommit, Function3, SparkRDDWriteClient, JavaRDD, String> deleteFn, boolean isPreppedAPI, - boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception { + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filterForCommitTimeWithAssert) throws Exception { final Function> keyGenFunction = generateWrapDeleteKeysFn(isPreppedAPI, writeConfig, dataGen::generateUniqueDeletes); return deleteBatch(client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit, keyGenFunction, - deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords); + deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, filterForCommitTimeWithAssert); + } + + public JavaRDD writeBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime, + Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, + Function2, String, Integer> recordGenFunction, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit) throws Exception { + return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, recordGenFunction, + writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, doCommit, true); + } + + public JavaRDD writeBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime, + Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, + Function3, String, Integer, String> recordGenFunction, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, + boolean doCommit, String partition) throws Exception { + return writeBatch(client, newCommitTime, prevCommitTime, 
commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, recordGenFunction, + writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, doCommit, true, partition); } /** @@ -416,29 +543,59 @@ public JavaRDD deleteBatch(HoodieWriteConfig writeConfig, SparkRDDW * @param expRecordsInThisCommit Expected number of records in this commit * @param expTotalRecords Expected number of records when scanned * @param expTotalCommits Expected number of commits (including this commit) + * @param doCommit * @throws Exception in case of error */ public JavaRDD writeBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime, - Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, - Function2, String, Integer> recordGenFunction, - Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, - boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception { + Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, + Function2, String, Integer> recordGenFunction, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit, + boolean filterForCommitTimeWithAssert) throws Exception { + List records = recordGenFunction.apply(newCommitTime, numRecordsInThisCommit); + return writeBatchHelper(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, + numRecordsInThisCommit, records, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, + expTotalCommits, doCommit, filterForCommitTimeWithAssert); + } + + public JavaRDD writeBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime, + Option> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit, + Function3, String, Integer, String> recordGenFunction, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit, + boolean filterForCommitTimeWithAssert, + String partition) throws Exception { + + List records = recordGenFunction.apply(newCommitTime, numRecordsInThisCommit, partition); + return writeBatchHelper(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, + numRecordsInThisCommit, records, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, + expTotalCommits, doCommit, filterForCommitTimeWithAssert); + } + + private JavaRDD writeBatchHelper(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime, + Option> commitTimesBetweenPrevAndNew, String initCommitTime, + int numRecordsInThisCommit, List records, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, + int expTotalCommits, boolean doCommit, boolean filterForCommitTimeWithAssert) throws IOException { // Write 1 (only inserts) client.startCommitWithTime(newCommitTime); - List records = recordGenFunction.apply(newCommitTime, numRecordsInThisCommit); JavaRDD writeRecords = jsc.parallelize(records, 1); JavaRDD result = writeFn.apply(client, writeRecords, newCommitTime); List statuses = result.collect(); assertNoWriteErrors(statuses); + if (doCommit) { + client.commit(newCommitTime, result); + } // check the partition metadata is written out - assertPartitionMetadataForRecords(records, fs); + 
assertPartitionMetadataForRecords(basePath, records, fs); // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); if (assertForCommit) { @@ -446,8 +603,10 @@ public JavaRDD writeBatch(SparkRDDWriteClient client, String newCom "Expecting " + expTotalCommits + " commits."); assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be " + newCommitTime); - assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), - "Must contain " + expRecordsInThisCommit + " records"); + if (filterForCommitTimeWithAssert) { // when meta cols are disabled, we can't really do per commit assertion. + assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), + "Must contain " + expRecordsInThisCommit + " records"); + } // Check the entire dataset has all records still String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; @@ -457,16 +616,18 @@ public JavaRDD writeBatch(SparkRDDWriteClient client, String newCom assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + expTotalRecords + " records"); - // Check that the incremental consumption from prevCommitTime - assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), - HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime), - "Incremental consumption from " + prevCommitTime + " should give all records in latest commit"); - if (commitTimesBetweenPrevAndNew.isPresent()) { - commitTimesBetweenPrevAndNew.get().forEach(ct -> { - assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), - HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, ct), - "Incremental consumption from " + ct + " should give all records in latest commit"); - }); + if (filterForCommitTimeWithAssert) { + // Check that the incremental consumption from prevCommitTime + assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), + HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(prevCommitTime)), + "Incremental consumption from " + prevCommitTime + " should give all records in latest commit"); + if (commitTimesBetweenPrevAndNew.isPresent()) { + commitTimesBetweenPrevAndNew.get().forEach(ct -> { + assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), + HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(ct)), + "Incremental consumption from " + ct + " should give all records in latest commit"); + }); + } } } return result; @@ -490,7 +651,7 @@ public JavaRDD deleteBatch(SparkRDDWriteClient client, String newCo String initCommitTime, int numRecordsInThisCommit, Function> keyGenFunction, Function3, SparkRDDWriteClient, JavaRDD, String> deleteFn, - boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception { + boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean 
filerForCommitTimeWithAssert) throws Exception { // Delete 1 (only deletes) client.startCommitWithTime(newCommitTime); @@ -503,10 +664,10 @@ public JavaRDD deleteBatch(SparkRDDWriteClient client, String newCo assertNoWriteErrors(statuses); // check the partition metadata is written out - assertPartitionMetadataForKeys(keysToDelete, fs); + assertPartitionMetadataForKeys(basePath, keysToDelete, fs); // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); if (assertForCommit) { @@ -514,8 +675,10 @@ public JavaRDD deleteBatch(SparkRDDWriteClient client, String newCo "Expecting 3 commits."); assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be " + newCommitTime); - assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), - "Must contain " + expRecordsInThisCommit + " records"); + if (filerForCommitTimeWithAssert) { // if meta cols are disabled, we can't do assertion based on assertion time + assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), + "Must contain " + expRecordsInThisCommit + " records"); + } // Check the entire dataset has all records still String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; @@ -525,11 +688,13 @@ public JavaRDD deleteBatch(SparkRDDWriteClient client, String newCo assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + expTotalRecords + " records"); - // Check that the incremental consumption from prevCommitTime - assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), - HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime), - "Incremental consumption from " + prevCommitTime + " should give no records in latest commit," - + " since it is a delete operation"); + if (filerForCommitTimeWithAssert) { + // Check that the incremental consumption from prevCommitTime + assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), + HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(prevCommitTime)), + "Incremental consumption from " + prevCommitTime + " should give no records in latest commit," + + " since it is a delete operation"); + } } return result; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java index b10781e3b8ec9..a0c093be16b8a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java @@ -17,23 +17,57 @@ package org.apache.hudi.testutils; -import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.HoodieConversionUtils; +import org.apache.hudi.avro.model.HoodieActionInstant; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; +import 
org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; +import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.metadata.FileSystemBackedTableMetadata; +import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.metadata.MetadataPartitionType; +import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.WorkloadStat; +import org.apache.hudi.timeline.service.TimelineService; +import org.apache.hudi.util.JFunction; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -42,22 +76,44 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hudi.table.WorkloadStat; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.SparkSessionExtensions; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestInfo; -import scala.Tuple2; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import 
java.util.Arrays; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.Random; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Collectors; + +import scala.Tuple2; + +import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertLinesMatch; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; /** * The test harness for resource initialization and cleanup. @@ -65,19 +121,22 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness implements Serializable { private static final Logger LOG = LogManager.getLogger(HoodieClientTestHarness.class); - + + protected static int timelineServicePort = + FileSystemViewStorageConfig.REMOTE_PORT_NUM.defaultValue(); private String testMethodName; protected transient JavaSparkContext jsc = null; protected transient HoodieSparkEngineContext context = null; + protected transient SparkSession sparkSession = null; protected transient Configuration hadoopConf = null; protected transient SQLContext sqlContext; protected transient FileSystem fs; - protected transient HoodieTestDataGenerator dataGen = null; protected transient ExecutorService executorService; protected transient HoodieTableMetaClient metaClient; protected transient SparkRDDWriteClient writeClient; - protected transient HoodieReadClient readClient; + protected transient SparkRDDReadClient readClient; protected transient HoodieTableFileSystemView tableView; + protected transient TimelineService timelineService; protected final SparkTaskContextSupplier supplier = new SparkTaskContextSupplier(); @@ -87,6 +146,15 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im protected transient MiniDFSCluster dfsCluster; protected transient DistributedFileSystem dfs; + @AfterAll + public static void tearDownAll() throws IOException { + FileSystem.closeAll(); + } + + protected Option> getSparkSessionExtensionsInjector() { + return Option.empty(); + } + @BeforeEach public void setTestMethodName(TestInfo testInfo) { if (testInfo.getTestMethod().isPresent()) { @@ -105,12 +173,14 @@ public void initResources() throws IOException { initTestDataGenerator(); initFileSystem(); initMetaClient(); + initTimelineService(); } /** * Cleanups resource group for the subclasses of {@link HoodieClientTestBase}. */ public void cleanupResources() throws IOException { + cleanupTimelineService(); cleanupClients(); cleanupSparkContexts(); cleanupTestDataGenerator(); @@ -126,19 +196,36 @@ public void cleanupResources() throws IOException { * @param appName The specified application name. 
*/ protected void initSparkContexts(String appName) { + Option> sparkSessionExtensionsInjector = + getSparkSessionExtensionsInjector(); + + if (sparkSessionExtensionsInjector.isPresent()) { + // In case we need to inject extensions into Spark Session, we have + // to stop any session that might still be active, since Spark will try + // to re-use it + HoodieConversionUtils.toJavaOption(SparkSession.getActiveSession()) + .ifPresent(SparkSession::stop); + } + // Initialize a local spark env jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName + "#" + testMethodName)); jsc.setLogLevel("ERROR"); - hadoopConf = jsc.hadoopConfiguration(); - // SQLContext stuff - sqlContext = new SQLContext(jsc); + hadoopConf = jsc.hadoopConfiguration(); context = new HoodieSparkEngineContext(jsc); - hadoopConf = context.getHadoopConf().get(); + + sparkSession = SparkSession.builder() + .withExtensions(JFunction.toScala(sparkSessionExtensions -> { + sparkSessionExtensionsInjector.ifPresent(injector -> injector.accept(sparkSessionExtensions)); + return null; + })) + .config(jsc.getConf()) + .getOrCreate(); + sqlContext = new SQLContext(sparkSession); } /** - * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) + * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) * with a default name matching the name of the class. */ protected void initSparkContexts() { @@ -205,7 +292,20 @@ protected void cleanupFileSystem() throws IOException { * * @throws IOException */ + @Override protected void initMetaClient() throws IOException { + initMetaClient(getTableType()); + } + + protected void initMetaClient(Properties properties) throws IOException { + initMetaClient(getTableType(), properties); + } + + protected void initMetaClient(HoodieTableType tableType) throws IOException { + initMetaClient(tableType, new Properties()); + } + + protected void initMetaClient(HoodieTableType tableType, Properties properties) throws IOException { if (basePath == null) { throw new IllegalStateException("The base path has not been initialized."); } @@ -214,7 +314,67 @@ protected void initMetaClient() throws IOException { throw new IllegalStateException("The Spark context has not been initialized."); } - metaClient = HoodieTestUtils.init(context.getHadoopConf().get(), basePath, getTableType()); + if (tableName != null && !tableName.isEmpty()) { + properties.put(HoodieTableConfig.NAME.key(), tableName); + } + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType, properties); + } + + /** + * Initializes timeline service based on the write config. 
+ */ + protected void initTimelineService() { + timelineService = HoodieClientTestUtils.initTimelineService( + context, basePath, incrementTimelineServicePortToUse()); + timelineServicePort = timelineService.getServerPort(); + } + + protected void cleanupTimelineService() { + if (timelineService != null) { + timelineService.close(); + } + } + + protected int incrementTimelineServicePortToUse() { + // Increment the timeline service port for each individual test + // to avoid port reuse causing failures + timelineServicePort = (timelineServicePort + 1 - 1024) % (65536 - 1024) + 1024; + return timelineServicePort; + } + + protected Properties getPropertiesForKeyGen() { + return getPropertiesForKeyGen(false); + } + + protected Properties getPropertiesForKeyGen(boolean populateMetaFields) { + Properties properties = new Properties(); + properties.put(HoodieTableConfig.POPULATE_META_FIELDS.key(), String.valueOf(populateMetaFields)); + properties.put("hoodie.datasource.write.recordkey.field", "_row_key"); + properties.put("hoodie.datasource.write.partitionpath.field", "partition_path"); + properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), "_row_key"); + properties.put(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path"); + properties.put(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key(), SimpleKeyGenerator.class.getName()); + return properties; + } + + protected Properties getPropertiesForMetadataTable() { + Properties properties = new Properties(); + properties.put(HoodieTableConfig.POPULATE_META_FIELDS.key(), "false"); + properties.put("hoodie.datasource.write.recordkey.field", "key"); + properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), "key"); + return properties; + } + + protected void addConfigsForPopulateMetaFields(HoodieWriteConfig.Builder configBuilder, boolean populateMetaFields, + boolean isMetadataTable) { + if (!populateMetaFields) { + configBuilder.withProperties((isMetadataTable ? getPropertiesForMetadataTable() : getPropertiesForKeyGen())) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.SIMPLE).build()); + } + } + + protected void addConfigsForPopulateMetaFields(HoodieWriteConfig.Builder configBuilder, boolean populateMetaFields) { + addConfigsForPopulateMetaFields(configBuilder, populateMetaFields, false); } /** @@ -237,31 +397,12 @@ protected void cleanupClients() throws IOException { } } - /** - * Initializes a test data generator which used to generate test datas. - * - */ - protected void initTestDataGenerator() { - dataGen = new HoodieTestDataGenerator(); - } - - /** - * Cleanups test data generator. - * - */ - protected void cleanupTestDataGenerator() { - if (dataGen != null) { - dataGen = null; - } - } - /** * Initializes a distributed file system and base directory. 
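+ * <p>Starts an {@link HdfsTestService} together with its backing {@link MiniDFSCluster};
+ * {@link #cleanupDFS()} stops both again after the test.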
* * @throws IOException */ protected void initDFS() throws IOException { - FileSystem.closeAll(); hdfsTestService = new HdfsTestService(); dfsCluster = hdfsTestService.start(true); @@ -298,7 +439,7 @@ protected void initDFSMetaClient() throws IOException { protected void cleanupDFS() throws IOException { if (hdfsTestService != null) { hdfsTestService.stop(); - dfsCluster.shutdown(); + dfsCluster.shutdown(true, true); hdfsTestService = null; dfsCluster = null; dfs = null; @@ -342,40 +483,51 @@ private void initFileSystemWithConfiguration(Configuration configuration) { } } - public SparkRDDWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) { - return getHoodieWriteClient(cfg, false); - } - - public HoodieReadClient getHoodieReadClient(String basePath) { - readClient = new HoodieReadClient(context, basePath, SQLContext.getOrCreate(jsc.sc())); + public SparkRDDReadClient getHoodieReadClient(String basePath) { + readClient = new SparkRDDReadClient(context, basePath, SQLContext.getOrCreate(jsc.sc())); return readClient; } - public SparkRDDWriteClient getHoodieWriteClient(HoodieWriteConfig cfg, boolean rollbackInflightCommit) { + public SparkRDDWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) { if (null != writeClient) { writeClient.close(); writeClient = null; } - writeClient = new SparkRDDWriteClient(context, cfg, rollbackInflightCommit); + writeClient = new SparkRDDWriteClient(context, cfg); return writeClient; } public HoodieTableMetaClient getHoodieMetaClient(Configuration conf, String basePath) { - metaClient = new HoodieTableMetaClient(conf, basePath); + metaClient = HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build(); return metaClient; } public HoodieTableFileSystemView getHoodieTableFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, - FileStatus[] fileStatuses) { + FileStatus[] fileStatuses) { if (tableView == null) { - tableView = new HoodieTableFileSystemView(metaClient, visibleActiveTimeline, fileStatuses); + tableView = new HoodieTableFileSystemView(metaClient, visibleActiveTimeline, fileStatuses); } else { tableView.init(metaClient, visibleActiveTimeline, fileStatuses); } return tableView; } - protected Pair, WorkloadStat> buildProfile(JavaRDD inputRecordsRDD) { + /** + * @deprecated Use {@link #tagLocation(HoodieIndex, HoodieEngineContext, JavaRDD, HoodieTable)} instead. + */ + @Deprecated + public JavaRDD tagLocation( + HoodieIndex index, JavaRDD records, HoodieTable table) { + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(records), context, table)); + } + + public static JavaRDD tagLocation( + HoodieIndex index, HoodieEngineContext context, JavaRDD records, HoodieTable table) { + return HoodieJavaRDD.getJavaRDD(index.tagLocation(HoodieJavaRDD.of(records), context, table)); + } + + public static Pair, WorkloadStat> buildProfile(JavaRDD inputRecordsRDD) { HashMap partitionPathStatMap = new HashMap<>(); WorkloadStat globalStat = new WorkloadStat(); @@ -408,4 +560,205 @@ protected Pair, WorkloadStat> buildProfile(JavaRDD } return Pair.of(partitionPathStatMap, globalStat); } + + /** + * Validate the metadata tables contents to ensure it matches what is on the file system. 
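+ * <p>This is a no-op when the metadata table is disabled in the write config. Otherwise it
+ * compares the partition list and the per-partition file listings returned by the metadata table
+ * against what the {@link HoodieTestTable} and the table's file-system view report, and, when
+ * {@code doFullValidation} is set, additionally checks the metadata table itself (MOR table type,
+ * HFile base files, bounded number of latest file slices per partition).
+ * <p>A minimal illustrative call from a subclass test could look like:
+ * <pre>{@code
+ *   validateMetadata(testTable, Collections.emptyList(), writeConfig,
+ *       HoodieTableMetadata.getMetadataTableBasePath(basePath), false);
+ * }</pre>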
+ */ + public void validateMetadata(HoodieTestTable testTable, List inflightCommits, HoodieWriteConfig writeConfig, + String metadataTableBasePath, boolean doFullValidation) throws IOException { + HoodieTableMetadata tableMetadata = metadata(writeConfig, context); + assertNotNull(tableMetadata, "MetadataReader should have been initialized"); + if (!writeConfig.isMetadataTableEnabled()) { + return; + } + + if (!tableMetadata.getSyncedInstantTime().isPresent() || tableMetadata instanceof FileSystemBackedTableMetadata) { + throw new IllegalStateException("Metadata should have synced some commits or tableMetadata should not be an instance " + + "of FileSystemBackedTableMetadata"); + } + assertEquals(inflightCommits, testTable.inflightCommits()); + + HoodieTimer timer = new HoodieTimer().startTimer(); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + + // Partitions should match + List fsPartitionPaths = testTable.getAllPartitionPaths(); + List fsPartitions = new ArrayList<>(); + fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString())); + if (fsPartitions.isEmpty()) { + fsPartitions.add(""); + } + List metadataPartitions = tableMetadata.getAllPartitionPaths(); + + Collections.sort(fsPartitions); + Collections.sort(metadataPartitions); + + assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match"); + assertEquals(fsPartitions, metadataPartitions, "Partitions should match"); + + // Files within each partition should match + metaClient = HoodieTableMetaClient.reload(metaClient); + HoodieTable table = HoodieSparkTable.create(writeConfig, engineContext); + TableFileSystemView tableView = table.getHoodieView(); + List fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList()); + Map partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths); + assertEquals(fsPartitions.size(), partitionToFilesMap.size()); + + fsPartitions.forEach(partition -> { + try { + validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition); + } catch (IOException e) { + fail("Exception should not be raised: " + e); + } + }); + if (doFullValidation) { + runFullValidation(table.getConfig().getMetadataConfig(), writeConfig, metadataTableBasePath, engineContext); + } + + LOG.info("Validation time=" + timer.endTimer()); + } + + public void syncTableMetadata(HoodieWriteConfig writeConfig) { + if (!writeConfig.getMetadataConfig().enabled()) { + return; + } + // Open up the metadata table again, for syncing + try (HoodieTableMetadataWriter writer = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context)) { + LOG.info("Successfully synced to metadata table"); + } catch (Exception e) { + throw new HoodieMetadataException("Error syncing to metadata table.", e); + } + } + + public HoodieBackedTableMetadataWriter metadataWriter(HoodieWriteConfig clientConfig) { + return (HoodieBackedTableMetadataWriter) SparkHoodieBackedTableMetadataWriter + .create(hadoopConf, clientConfig, new HoodieSparkEngineContext(jsc)); + } + + public HoodieTableMetadata metadata(HoodieWriteConfig clientConfig, HoodieEngineContext hoodieEngineContext) { + return HoodieTableMetadata.create(hoodieEngineContext, clientConfig.getMetadataConfig(), clientConfig.getBasePath(), + clientConfig.getSpillableMapBasePath()); + } + + protected void validateFilesPerPartition(HoodieTestTable testTable, HoodieTableMetadata tableMetadata, TableFileSystemView 
tableView, + Map partitionToFilesMap, String partition) throws IOException { + Path partitionPath; + if (partition.equals("")) { + // Should be the non-partitioned case + partitionPath = new Path(basePath); + } else { + partitionPath = new Path(basePath, partition); + } + + FileStatus[] fsStatuses = testTable.listAllFilesInPartition(partition); + FileStatus[] metaStatuses = tableMetadata.getAllFilesInPartition(partitionPath); + List fsFileNames = Arrays.stream(fsStatuses) + .map(s -> s.getPath().getName()).collect(Collectors.toList()); + List metadataFilenames = Arrays.stream(metaStatuses) + .map(s -> s.getPath().getName()).collect(Collectors.toList()); + Collections.sort(fsFileNames); + Collections.sort(metadataFilenames); + + assertLinesMatch(fsFileNames, metadataFilenames); + assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length); + + // Block sizes should be valid + Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0)); + List fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList()); + List metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList()); + assertEquals(fsBlockSizes, metadataBlockSizes); + + assertEquals(fsFileNames.size(), metadataFilenames.size(), "Files within partition " + partition + " should match"); + assertEquals(fsFileNames, metadataFilenames, "Files within partition " + partition + " should match"); + + // FileSystemView should expose the same data + List fileGroups = tableView.getAllFileGroups(partition).collect(Collectors.toList()); + fileGroups.addAll(tableView.getAllReplacedFileGroups(partition).collect(Collectors.toList())); + + fileGroups.forEach(g -> LogManager.getLogger(getClass()).info(g)); + fileGroups.forEach(g -> g.getAllBaseFiles().forEach(b -> LogManager.getLogger(getClass()).info(b))); + fileGroups.forEach(g -> g.getAllFileSlices().forEach(s -> LogManager.getLogger(getClass()).info(s))); + + long numFiles = fileGroups.stream() + .mapToLong(g -> g.getAllBaseFiles().count() + g.getAllFileSlices().mapToLong(s -> s.getLogFiles().count()).sum()) + .sum(); + assertEquals(metadataFilenames.size(), numFiles); + } + + private void runFullValidation(HoodieMetadataConfig metadataConfig, + HoodieWriteConfig writeConfig, + String metadataTableBasePath, + HoodieSparkEngineContext engineContext) { + HoodieBackedTableMetadataWriter metadataWriter = metadataWriter(writeConfig); + assertNotNull(metadataWriter, "MetadataWriter should have been initialized"); + + // Validate write config for metadata table + HoodieWriteConfig metadataWriteConfig = metadataWriter.getWriteConfig(); + assertFalse(metadataWriteConfig.isMetadataTableEnabled(), "No metadata table for metadata table"); + + HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build(); + + // Metadata table is MOR + assertEquals(metadataMetaClient.getTableType(), HoodieTableType.MERGE_ON_READ, "Metadata Table should be MOR"); + + // Metadata table is HFile format + assertEquals(metadataMetaClient.getTableConfig().getBaseFileFormat(), HoodieFileFormat.HFILE, + "Metadata Table base file format should be HFile"); + + // Metadata table has a fixed number of partitions + // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory + // in the .hoodie folder. 
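+ // The partitions listed under the metadata table base path must line up one-to-one with the
+ // partition types enabled on the metadata writer; the size assertion below verifies that.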
+ List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, HoodieTableMetadata.getMetadataTableBasePath(basePath), + false, false); + + List enabledPartitionTypes = metadataWriter.getEnabledPartitionTypes(); + + Assertions.assertEquals(enabledPartitionTypes.size(), metadataTablePartitions.size()); + + Map partitionTypeMap = enabledPartitionTypes.stream() + .collect(Collectors.toMap(MetadataPartitionType::getPartitionPath, Function.identity())); + + // Metadata table should automatically compact and clean + // versions are +1 as autoClean / compaction happens end of commits + int numFileVersions = metadataWriteConfig.getCleanerFileVersionsRetained() + 1; + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline()); + metadataTablePartitions.forEach(partition -> { + MetadataPartitionType partitionType = partitionTypeMap.get(partition); + + List latestSlices = fsView.getLatestFileSlices(partition).collect(Collectors.toList()); + + assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).filter(Objects::nonNull).count() <= partitionType.getFileGroupCount(), "Should have a single latest base file"); + assertTrue(latestSlices.size() <= partitionType.getFileGroupCount(), "Should have a single latest file slice"); + assertTrue(latestSlices.size() <= numFileVersions, "Should limit file slice to " + + numFileVersions + " but was " + latestSlices.size()); + }); + } + + public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException { + return createCleanMetadata(instantTime, inflightOnly, false, false); + } + + public HoodieInstant createEmptyCleanMetadata(String instantTime, boolean inflightOnly) throws IOException { + return createCleanMetadata(instantTime, inflightOnly, true, true); + } + + public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly, boolean isEmptyForAll, boolean isEmptyCompleted) throws IOException { + HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", "", + new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>(), new ArrayList<>()); + if (inflightOnly) { + HoodieTestTable.of(metaClient).addInflightClean(instantTime, cleanerPlan); + } else { + HoodieCleanStat cleanStats = new HoodieCleanStat( + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, + HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)], + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList(), + instantTime, + ""); + HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats)); + HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata, isEmptyForAll, isEmptyCompleted); + } + return new HoodieInstant(inflightOnly, "clean", instantTime); + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java index 307e0686756a8..458af3ad9e60b 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java @@ -19,22 +19,31 @@ package org.apache.hudi.testutils; import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.client.HoodieReadClient; +import 
org.apache.hudi.client.SparkRDDReadClient; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.storage.HoodieHFileUtils; +import org.apache.hudi.timeline.service.TimelineService; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Cell; @@ -58,6 +67,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.io.storage.HoodieHFileReader.SCHEMA_KEY; + /** * Utility methods to aid testing inside the HoodieClient module. */ @@ -89,7 +100,7 @@ public static SparkConf getSparkConfForTest(String appName) { sparkConf.set("spark.eventLog.dir", evlogDir); } - return HoodieReadClient.addHoodieSupport(sparkConf); + return SparkRDDReadClient.addHoodieSupport(sparkConf); } private static HashMap getLatestFileIDsToFullPath(String basePath, HoodieTimeline commitTimeline, @@ -98,13 +109,18 @@ private static HashMap getLatestFileIDsToFullPath(String basePat for (HoodieInstant commit : commitsToReturn) { HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class); - fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath)); + fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(new Path(basePath))); } return fileIdToFullPath; } public static Dataset readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline, String instantTime) { + return readCommit(basePath, sqlContext, commitTimeline, instantTime, true); + } + + public static Dataset readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline, + String instantTime, boolean filterByCommitTime) { HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime); if (!commitTimeline.containsInstant(commitInstant)) { throw new HoodieException("No commit exists at " + instantTime); @@ -113,57 +129,107 @@ public static Dataset readCommit(String basePath, SQLContext sqlContext, Ho HashMap paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant)); LOG.info("Path :" + paths.values()); - return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()])) - .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime)); + Dataset unFilteredRows = null; + if 
(HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.PARQUET)) { + unFilteredRows = sqlContext.read().parquet(paths.values().toArray(new String[paths.size()])); + } else if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.ORC)) { + unFilteredRows = sqlContext.read().orc(paths.values().toArray(new String[paths.size()])); + } + if (unFilteredRows != null) { + if (filterByCommitTime) { + return unFilteredRows.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime)); + } else { + return unFilteredRows; + } + } else { + return sqlContext.emptyDataFrame(); + } } catch (Exception e) { throw new HoodieException("Error reading commit " + instantTime, e); } } /** - * Obtain all new data written into the Hoodie table since the given timestamp. + * Obtain all new data written into the Hoodie table with an optional from timestamp. */ - public static long countRecordsSince(JavaSparkContext jsc, String basePath, SQLContext sqlContext, - HoodieTimeline commitTimeline, String lastCommitTime) { + public static long countRecordsOptionallySince(JavaSparkContext jsc, String basePath, SQLContext sqlContext, + HoodieTimeline commitTimeline, Option lastCommitTimeOpt) { List commitsToReturn = - commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList()); + lastCommitTimeOpt.isPresent() ? commitTimeline.findInstantsAfter(lastCommitTimeOpt.get(), Integer.MAX_VALUE).getInstants().collect(Collectors.toList()) : + commitTimeline.getInstants().collect(Collectors.toList()); try { // Go over the commit metadata, and obtain the new files that need to be read. HashMap fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn); String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]); if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { - return sqlContext.read().parquet(paths) - .filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime)) - .count(); + Dataset rows = sqlContext.read().parquet(paths); + if (lastCommitTimeOpt.isPresent()) { + return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTimeOpt.get())) + .count(); + } else { + return rows.count(); + } } else if (paths[0].endsWith(HoodieFileFormat.HFILE.getFileExtension())) { - return readHFile(jsc, paths) - .filter(gr -> HoodieTimeline.compareTimestamps(lastCommitTime, HoodieActiveTimeline.LESSER_THAN, - gr.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString())) - .count(); + Stream genericRecordStream = readHFile(jsc, paths); + if (lastCommitTimeOpt.isPresent()) { + return genericRecordStream.filter(gr -> HoodieTimeline.compareTimestamps(lastCommitTimeOpt.get(), HoodieActiveTimeline.LESSER_THAN, + gr.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString())) + .count(); + } else { + return genericRecordStream.count(); + } + } else if (paths[0].endsWith(HoodieFileFormat.ORC.getFileExtension())) { + Dataset rows = sqlContext.read().orc(paths); + if (lastCommitTimeOpt.isPresent()) { + return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTimeOpt.get())) + .count(); + } else { + return rows.count(); + } } throw new HoodieException("Unsupported base file format for file :" + paths[0]); } catch (IOException e) { - throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e); + throw new HoodieException("Error 
pulling data incrementally from commitTimestamp :" + lastCommitTimeOpt.get(), e); + } + } + + public static List getLatestBaseFiles(String basePath, FileSystem fs, + String... paths) { + List latestFiles = new ArrayList<>(); + try { + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); + for (String path : paths) { + BaseFileOnlyView fileSystemView = new HoodieTableFileSystemView(metaClient, + metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new Path(path))); + latestFiles.addAll(fileSystemView.getLatestBaseFiles().collect(Collectors.toList())); + } + } catch (Exception e) { + throw new HoodieException("Error reading hoodie table as a dataframe", e); } + return latestFiles; } /** - * Reads the paths under the a hoodie table out as a DataFrame. + * Reads the paths under the hoodie table out as a DataFrame. */ public static Dataset read(JavaSparkContext jsc, String basePath, SQLContext sqlContext, FileSystem fs, String... paths) { List filteredPaths = new ArrayList<>(); try { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true); - for (String path : paths) { - BaseFileOnlyView fileSystemView = new HoodieTableFileSystemView(metaClient, - metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new Path(path))); - List latestFiles = fileSystemView.getLatestBaseFiles().collect(Collectors.toList()); - for (HoodieBaseFile file : latestFiles) { - filteredPaths.add(file.getPath()); - } + List latestFiles = getLatestBaseFiles(basePath, fs, paths); + for (HoodieBaseFile file : latestFiles) { + filteredPaths.add(file.getPath()); + } + if (filteredPaths.isEmpty()) { + return sqlContext.emptyDataFrame(); } - return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()])); + String[] filteredPathsToRead = filteredPaths.toArray(new String[filteredPaths.size()]); + if (filteredPathsToRead[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return sqlContext.read().parquet(filteredPathsToRead); + } else if (filteredPathsToRead[0].endsWith(HoodieFileFormat.ORC.getFileExtension())) { + return sqlContext.read().orc(filteredPathsToRead); + } + return sqlContext.emptyDataFrame(); } catch (Exception e) { throw new HoodieException("Error reading hoodie table as a dataframe", e); } @@ -178,9 +244,10 @@ public static Stream readHFile(JavaSparkContext jsc, String[] pat Schema schema = null; for (String path : paths) { try { - HFile.Reader reader = HFile.createReader(fs, new Path(path), cacheConfig, fs.getConf()); + HFile.Reader reader = + HoodieHFileUtils.createHFileReader(fs, new Path(path), cacheConfig, fs.getConf()); if (schema == null) { - schema = new Schema.Parser().parse(new String(reader.loadFileInfo().get("schema".getBytes()))); + schema = new Schema.Parser().parse(new String(reader.getHFileInfo().get(SCHEMA_KEY.getBytes()))); } HFileScanner scanner = reader.getScanner(false, false); if (!scanner.seekTo()) { @@ -189,7 +256,7 @@ public static Stream readHFile(JavaSparkContext jsc, String[] pat } do { - Cell c = scanner.getKeyValue(); + Cell c = scanner.getCell(); byte[] value = Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength()); valuesAsList.add(HoodieAvroUtils.bytesToAvro(value, schema)); } while (scanner.next()); @@ -200,4 +267,52 @@ public static Stream readHFile(JavaSparkContext jsc, String[] pat return valuesAsList.stream(); } + /** + * 
Initializes timeline service based on the write config. + * + * @param context {@link HoodieEngineContext} instance to use. + * @param basePath Base path of the table. + * @param timelineServicePort Port number to use for timeline service. + * @return started {@link TimelineService} instance. + */ + public static TimelineService initTimelineService( + HoodieEngineContext context, String basePath, int timelineServicePort) { + try { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .withPath(basePath) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) + .build(); + TimelineService timelineService = new TimelineService(context, new Configuration(), + TimelineService.Config.builder().enableMarkerRequests(true) + .serverPort(config.getViewStorageConfig().getRemoteViewServerPort()).build(), + FileSystem.get(new Configuration()), + FileSystemViewManager.createViewManager(context, config.getMetadataConfig(), + config.getViewStorageConfig(), config.getCommonConfig())); + timelineService.startService(); + LOG.info("Timeline service server port: " + timelineServicePort); + return timelineService; + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + + public static Option getCommitMetadataForLatestInstant(HoodieTableMetaClient metaClient) { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + if (timeline.lastInstant().isPresent()) { + return getCommitMetadataForInstant(metaClient, timeline.lastInstant().get()); + } else { + return Option.empty(); + } + } + + private static Option getCommitMetadataForInstant(HoodieTableMetaClient metaClient, HoodieInstant instant) { + try { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + byte[] data = timeline.getInstantDetails(instant).get(); + return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class)); + } catch (Exception e) { + throw new HoodieException("Failed to read schema from commit metadata", e); + } + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java index 56335511201fa..931714fd2fcf6 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java @@ -58,20 +58,39 @@ public static List getRecordsUsingInputFormat(Configuration conf, public static List getRecordsUsingInputFormat(Configuration conf, List inputPaths, String basePath, JobConf jobConf, boolean realtime) { + return getRecordsUsingInputFormat(conf, inputPaths, basePath, jobConf, realtime, true); + } + + public static List getRecordsUsingInputFormat(Configuration conf, List inputPaths, + String basePath, JobConf jobConf, boolean realtime, boolean populateMetaFields) { Schema schema = new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA); return getRecordsUsingInputFormat(conf, inputPaths, basePath, jobConf, realtime, schema, - HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>()); + HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>(), populateMetaFields); } public static List getRecordsUsingInputFormat(Configuration conf, List inputPaths, String basePath, JobConf jobConf, 
boolean realtime, Schema rawSchema, String rawHiveColumnTypes, boolean projectCols, List projectedColumns) { + return getRecordsUsingInputFormat(conf, inputPaths, basePath, jobConf, realtime, rawSchema, rawHiveColumnTypes, projectCols, projectedColumns, true); + } + + public static List getRecordsUsingInputFormat(Configuration conf, List inputPaths, String basePath, JobConf jobConf, boolean realtime, Schema rawSchema, + String rawHiveColumnTypes, boolean projectCols, List projectedColumns, boolean populateMetaFields) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(conf, basePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(conf).setBasePath(basePath).build(); FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(metaClient.getTableConfig().getBaseFileFormat(), realtime, jobConf); - Schema schema = HoodieAvroUtils.addMetadataFields(rawSchema); - String hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(rawHiveColumnTypes); - setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes, projectCols, projectedColumns); + Schema schema; + String hiveColumnTypes; + + if (populateMetaFields) { + schema = HoodieAvroUtils.addMetadataFields(rawSchema); + hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(rawHiveColumnTypes); + } else { + schema = rawSchema; + hiveColumnTypes = rawHiveColumnTypes; + } + + setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes, projectCols, projectedColumns, populateMetaFields); final List fields; if (projectCols) { fields = schema.getFields().stream().filter(f -> projectedColumns.contains(f.name())) @@ -112,6 +131,11 @@ public static List getRecordsUsingInputFormat(Configuration conf, } private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema, String hiveColumnTypes, boolean projectCols, List projectedCols) { + setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes, projectCols, projectedCols, true); + } + + private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema, String hiveColumnTypes, boolean projectCols, List projectedCols, + boolean populateMetaFieldsConfigValue) { List fields = schema.getFields(); final List projectedColNames; if (!projectCols) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkWriteableTestTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkWriteableTestTable.java new file mode 100644 index 0000000000000..3b50d1b29b04f --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkWriteableTestTable.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.client.SparkTaskContextSupplier; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +public class HoodieSparkWriteableTestTable extends HoodieWriteableTestTable { + private static final Logger LOG = LogManager.getLogger(HoodieSparkWriteableTestTable.class); + + private HoodieSparkWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, + BloomFilter filter, HoodieTableMetadataWriter metadataWriter) { + super(basePath, fs, metaClient, schema, filter, metadataWriter); + } + + public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { + return new HoodieSparkWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(), + metaClient, schema, filter, null); + } + + public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter, + HoodieTableMetadataWriter metadataWriter) { + return new HoodieSparkWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(), + metaClient, schema, filter, metadataWriter); + } + + public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema) { + BloomFilter filter = BloomFilterFactory + .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); + return of(metaClient, schema, filter); + } + + public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, + HoodieTableMetadataWriter metadataWriter) { + BloomFilter filter = BloomFilterFactory + .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.DYNAMIC_V0.name()); + return of(metaClient, schema, filter, metadataWriter); + } + + public static HoodieSparkWriteableTestTable of(HoodieTable hoodieTable, Schema schema) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + return of(metaClient, schema); + } + + public static HoodieSparkWriteableTestTable of(HoodieTable hoodieTable, Schema schema, BloomFilter filter) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + return of(metaClient, schema, filter); + } + + @Override + public HoodieSparkWriteableTestTable addCommit(String instantTime) throws Exception { + return (HoodieSparkWriteableTestTable) super.addCommit(instantTime); + } + + @Override + public HoodieSparkWriteableTestTable forCommit(String instantTime) { + return (HoodieSparkWriteableTestTable) super.forCommit(instantTime); + } + + public String getFileIdWithInserts(String partition) throws Exception { + return getFileIdWithInserts(partition, new HoodieRecord[0]); + } + + public String getFileIdWithInserts(String partition, HoodieRecord... 
records) throws Exception { + return getFileIdWithInserts(partition, Arrays.asList(records)); + } + + public String getFileIdWithInserts(String partition, List records) throws Exception { + String fileId = UUID.randomUUID().toString(); + withInserts(partition, fileId, records); + return fileId; + } + + public HoodieSparkWriteableTestTable withInserts(String partition, String fileId) throws Exception { + return withInserts(partition, fileId, new HoodieRecord[0]); + } + + public HoodieSparkWriteableTestTable withInserts(String partition, String fileId, HoodieRecord... records) throws Exception { + withInserts(partition, fileId, Arrays.asList(records)); + return this; + } + + public Path withInserts(String partition, String fileId, List records) throws Exception { + return super.withInserts(partition, fileId, records, new SparkTaskContextSupplier()); + } + + public HoodieSparkWriteableTestTable withLogAppends(String partition, String fileId, HoodieRecord... records) throws Exception { + withLogAppends(partition, fileId, Arrays.asList(records)); + return this; + } + + public Map> withLogAppends(String partition, String fileId, List records) throws Exception { + return super.withLogAppends(partition, fileId, records); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java deleted file mode 100644 index e167a0f4b6650..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.hudi.testutils; - -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.HoodieAvroWriteSupport; -import org.apache.hudi.client.SparkTaskContextSupplier; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.bloom.BloomFilterTypeCode; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.testutils.FileCreateUtils; -import org.apache.hudi.common.testutils.HoodieTestTable; -import org.apache.hudi.config.HoodieStorageConfig; -import org.apache.hudi.io.storage.HoodieAvroParquetConfig; -import org.apache.hudi.io.storage.HoodieParquetWriter; -import org.apache.hudi.table.HoodieTable; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.parquet.avro.AvroSchemaConverter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; - -import java.io.IOException; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.stream.Collectors; - -import static org.apache.hudi.common.testutils.FileCreateUtils.baseFileName; - -public class HoodieWriteableTestTable extends HoodieTestTable { - private static final Logger LOG = LogManager.getLogger(HoodieWriteableTestTable.class); - - private final Schema schema; - private final BloomFilter filter; - - private HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { - super(basePath, fs, metaClient); - this.schema = schema; - this.filter = filter; - } - - public static HoodieWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { - return new HoodieWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, schema, filter); - } - - public static HoodieWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema) { - BloomFilter filter = BloomFilterFactory - .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); - return of(metaClient, schema, filter); - } - - public static HoodieWriteableTestTable of(HoodieTable hoodieTable, Schema schema) { - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - return of(metaClient, schema); - } - - public static HoodieWriteableTestTable of(HoodieTable hoodieTable, Schema schema, BloomFilter filter) { - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - return of(metaClient, schema, filter); - } - - @Override - public HoodieWriteableTestTable addCommit(String instantTime) throws Exception { - return (HoodieWriteableTestTable) super.addCommit(instantTime); - } - - @Override - public HoodieWriteableTestTable forCommit(String instantTime) { - return (HoodieWriteableTestTable) 
super.forCommit(instantTime); - } - - public String getFileIdWithInserts(String partition) throws Exception { - return getFileIdWithInserts(partition, new HoodieRecord[0]); - } - - public String getFileIdWithInserts(String partition, HoodieRecord... records) throws Exception { - return getFileIdWithInserts(partition, Arrays.asList(records)); - } - - public String getFileIdWithInserts(String partition, List records) throws Exception { - String fileId = UUID.randomUUID().toString(); - withInserts(partition, fileId, records); - return fileId; - } - - public HoodieWriteableTestTable withInserts(String partition, String fileId) throws Exception { - return withInserts(partition, fileId, new HoodieRecord[0]); - } - - public HoodieWriteableTestTable withInserts(String partition, String fileId, HoodieRecord... records) throws Exception { - return withInserts(partition, fileId, Arrays.asList(records)); - } - - public HoodieWriteableTestTable withInserts(String partition, String fileId, List records) throws Exception { - FileCreateUtils.createPartitionMetaFile(basePath, partition); - String fileName = baseFileName(currentInstantTime, fileId); - - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( - new AvroSchemaConverter().convert(schema), schema, filter); - HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP, - ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, - new Configuration(), Double.parseDouble(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO)); - try (HoodieParquetWriter writer = new HoodieParquetWriter( - currentInstantTime, - new Path(Paths.get(basePath, partition, fileName).toString()), - config, schema, new SparkTaskContextSupplier())) { - int seqId = 1; - for (HoodieRecord record : records) { - GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); - HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++)); - HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName); - writer.writeAvro(record.getRecordKey(), avroRecord); - filter.add(record.getRecordKey()); - } - } - - return this; - } - - public HoodieWriteableTestTable withLogAppends(HoodieRecord... 
records) throws Exception { - return withLogAppends(Arrays.asList(records)); - } - - public HoodieWriteableTestTable withLogAppends(List records) throws Exception { - for (List groupedRecords: records.stream() - .collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation)).values()) { - appendRecordsToLogFile(groupedRecords); - } - return this; - } - - private void appendRecordsToLogFile(List groupedRecords) throws Exception { - String partitionPath = groupedRecords.get(0).getPartitionPath(); - HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation(); - try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath)) - .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId()) - .overBaseCommit(location.getInstantTime()).withFs(fs).build()) { - Map header = new HashMap<>(); - header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getInstantTime()); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> { - try { - GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get(); - HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), ""); - return (IndexedRecord) val; - } catch (IOException e) { - LOG.warn("Failed to convert record " + r.toString(), e); - return null; - } - }).collect(Collectors.toList()), header)); - } - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/KeyGeneratorTestUtilities.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/KeyGeneratorTestUtilities.java deleted file mode 100644 index 53b2abfd9b6ea..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/KeyGeneratorTestUtilities.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.testutils; - -import org.apache.hudi.AvroConversionHelper; -import org.apache.hudi.AvroConversionUtils; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; -import org.apache.spark.sql.types.StructType; - -import scala.Function1; - -public class KeyGeneratorTestUtilities { - - public static String exampleSchema = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ " - + "{\"name\": \"timestamp\",\"type\": \"long\"},{\"name\": \"_row_key\", \"type\": \"string\"}," - + "{\"name\": \"ts_ms\", \"type\": \"string\"}," - + "{\"name\": \"pii_col\", \"type\": \"string\"}]}"; - - public static final String TEST_STRUCTNAME = "test_struct_name"; - public static final String TEST_RECORD_NAMESPACE = "test_record_namespace"; - public static Schema schema = new Schema.Parser().parse(exampleSchema); - public static StructType structType = AvroConversionUtils.convertAvroSchemaToStructType(schema); - - public GenericRecord getRecord() { - GenericRecord record = new GenericData.Record(new Schema.Parser().parse(exampleSchema)); - record.put("timestamp", 4357686); - record.put("_row_key", "key1"); - record.put("ts_ms", "2020-03-21"); - record.put("pii_col", "pi"); - return record; - } - - public static Row getRow(GenericRecord record) { - return getRow(record, schema, structType); - } - - public static Row getRow(GenericRecord record, Schema schema, StructType structType) { - Function1 converterFn = AvroConversionHelper.createConverterToRow(schema, structType); - Row row = (Row) converterFn.apply(record); - int fieldCount = structType.fieldNames().length; - Object[] values = new Object[fieldCount]; - for (int i = 0; i < fieldCount; i++) { - values[i] = row.get(i); - } - return new GenericRowWithSchema(values, structType); - } -} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java new file mode 100644 index 0000000000000..cb7b2e6b3c43a --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.testutils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.client.SparkRDDReadClient; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.providers.HoodieMetaClientProvider; +import org.apache.hudi.testutils.providers.HoodieWriteClientProvider; +import org.apache.hudi.testutils.providers.SparkProvider; +import org.apache.hudi.timeline.service.TimelineService; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.common.testutils.HoodieTestUtils.RAW_TRIPS_TEST_NAME; +import static org.apache.hudi.testutils.Assertions.assertFileSizesEqual; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class SparkClientFunctionalTestHarness implements SparkProvider, HoodieMetaClientProvider, HoodieWriteClientProvider { + + protected static int timelineServicePort = + 
FileSystemViewStorageConfig.REMOTE_PORT_NUM.defaultValue(); + private static transient SparkSession spark; + private static transient SQLContext sqlContext; + private static transient JavaSparkContext jsc; + private static transient HoodieSparkEngineContext context; + private static transient TimelineService timelineService; + private FileSystem fileSystem; + + /** + * An indicator of the initialization status. + */ + protected boolean initialized = false; + @TempDir + protected java.nio.file.Path tempDir; + + public String basePath() { + return tempDir.toAbsolutePath().toUri().toString(); + } + + @Override + public SparkSession spark() { + return spark; + } + + @Override + public SQLContext sqlContext() { + return sqlContext; + } + + @Override + public JavaSparkContext jsc() { + return jsc; + } + + public Configuration hadoopConf() { + return jsc.hadoopConfiguration(); + } + + public FileSystem fs() { + if (fileSystem == null) { + fileSystem = FSUtils.getFs(basePath(), hadoopConf()); + } + return fileSystem; + } + + @Override + public HoodieSparkEngineContext context() { + return context; + } + + public HoodieTableMetaClient getHoodieMetaClient(HoodieTableType tableType) throws IOException { + return getHoodieMetaClient(tableType, new Properties()); + } + + public HoodieTableMetaClient getHoodieMetaClient(HoodieTableType tableType, Properties props) throws IOException { + return getHoodieMetaClient(hadoopConf(), basePath(), tableType, props); + } + + public HoodieTableMetaClient getHoodieMetaClient(Configuration hadoopConf, String basePath, HoodieTableType tableType, Properties props) throws IOException { + props = HoodieTableMetaClient.withPropertyBuilder() + .setTableName(RAW_TRIPS_TEST_NAME) + .setTableType(tableType) + .setPayloadClass(HoodieAvroPayload.class) + .fromProperties(props) + .build(); + return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, props); + } + + public HoodieTableMetaClient getHoodieMetaClient(Configuration hadoopConf, String basePath) throws IOException { + return getHoodieMetaClient(hadoopConf, basePath, getPropertiesForKeyGen(true)); + } + + @Override + public HoodieTableMetaClient getHoodieMetaClient(Configuration hadoopConf, String basePath, Properties props) throws IOException { + props = HoodieTableMetaClient.withPropertyBuilder() + .setTableName(RAW_TRIPS_TEST_NAME) + .setTableType(COPY_ON_WRITE) + .setPayloadClass(HoodieAvroPayload.class) + .fromProperties(props) + .build(); + return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, props); + } + + @Override + public SparkRDDWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) throws IOException { + return new SparkRDDWriteClient(context(), cfg); + } + + @BeforeEach + public synchronized void runBeforeEach() { + initialized = spark != null; + if (!initialized) { + SparkConf sparkConf = conf(); + SparkRDDWriteClient.registerClasses(sparkConf); + SparkRDDReadClient.addHoodieSupport(sparkConf); + spark = SparkSession.builder().config(sparkConf).getOrCreate(); + sqlContext = spark.sqlContext(); + jsc = new JavaSparkContext(spark.sparkContext()); + context = new HoodieSparkEngineContext(jsc); + timelineService = HoodieClientTestUtils.initTimelineService( + context, basePath(), incrementTimelineServicePortToUse()); + timelineServicePort = timelineService.getServerPort(); + } + } + + /** + * To clean up Spark resources after all testcases have run in functional tests. + * + * Spark session and contexts were reused for testcases in the same test class. 
Some + * testcase may invoke this specifically to clean up in case of repeated test runs. + */ + @AfterAll + public static synchronized void resetSpark() { + if (spark != null) { + spark.close(); + spark = null; + } + if (timelineService != null) { + timelineService.close(); + } + } + + @AfterEach + public void closeFileSystem() throws IOException { + if (fileSystem != null) { + fileSystem.close(); + fileSystem = null; + } + } + + protected JavaRDD tagLocation( + HoodieIndex index, JavaRDD records, HoodieTable table) { + return HoodieJavaRDD.getJavaRDD( + index.tagLocation(HoodieJavaRDD.of(records), context, table)); + } + + protected JavaRDD updateLocation( + HoodieIndex index, JavaRDD writeStatus, HoodieTable table) { + return HoodieJavaRDD.getJavaRDD( + index.updateLocation(HoodieJavaRDD.of(writeStatus), context, table)); + } + + protected Stream insertRecordsToMORTable(HoodieTableMetaClient metaClient, List records, + SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime) throws IOException { + return insertRecordsToMORTable(metaClient, records, client, cfg, commitTime, false); + } + + protected Stream insertRecordsToMORTable(HoodieTableMetaClient metaClient, List records, + SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime, + boolean doExplicitCommit) throws IOException { + HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(metaClient); + + JavaRDD writeRecords = jsc().parallelize(records, 1); + JavaRDD statusesRdd = client.insert(writeRecords, commitTime); + List statuses = statusesRdd.collect(); + assertNoWriteErrors(statuses); + if (doExplicitCommit) { + client.commit(commitTime, statusesRdd); + } + assertFileSizesEqual(statuses, status -> FSUtils.getFileSize(reloadedMetaClient.getFs(), new Path(reloadedMetaClient.getBasePath(), status.getStat().getPath()))); + + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), reloadedMetaClient); + + Option deltaCommit = reloadedMetaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals(commitTime, deltaCommit.get().getTimestamp(), "Delta commit should be specified value"); + + Option commit = reloadedMetaClient.getActiveTimeline().getCommitTimeline().lastInstant(); + assertFalse(commit.isPresent()); + + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + TableFileSystemView.BaseFileOnlyView roView = + getHoodieTableFileSystemView(reloadedMetaClient, reloadedMetaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + Stream dataFilesToRead = roView.getLatestBaseFiles(); + assertTrue(!dataFilesToRead.findAny().isPresent()); + + roView = getHoodieTableFileSystemView(reloadedMetaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); + dataFilesToRead = roView.getLatestBaseFiles(); + return dataFilesToRead; + } + + protected void updateRecordsInMORTable(HoodieTableMetaClient metaClient, List records, SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime) throws IOException { + updateRecordsInMORTable(metaClient, records, client, cfg, commitTime, true); + } + + protected void updateRecordsInMORTable(HoodieTableMetaClient metaClient, List records, SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime, + boolean doExplicitCommit) throws IOException { + HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(metaClient); + + Map recordsMap = new HashMap<>(); + for (HoodieRecord rec : records) { + if (!recordsMap.containsKey(rec.getKey())) { + 
recordsMap.put(rec.getKey(), rec); + } + } + + JavaRDD statusesRdd = client.upsert(jsc().parallelize(records, 1), commitTime); + List statuses = statusesRdd.collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + if (doExplicitCommit) { + client.commit(commitTime, statusesRdd); + } + assertFileSizesEqual(statuses, status -> FSUtils.getFileSize(reloadedMetaClient.getFs(), new Path(reloadedMetaClient.getBasePath(), status.getStat().getPath()))); + + Option deltaCommit = reloadedMetaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); + assertTrue(deltaCommit.isPresent()); + assertEquals(commitTime, deltaCommit.get().getTimestamp(), + "Latest Delta commit should match specified time"); + + Option commit = reloadedMetaClient.getActiveTimeline().getCommitTimeline().firstInstant(); + assertFalse(commit.isPresent()); + } + + protected FileStatus[] listAllBaseFilesInPath(HoodieTable table) throws IOException { + return HoodieTestTable.of(table.getMetaClient()).listAllBaseFiles(table.getBaseFileExtension()); + } + + protected Properties getPropertiesForKeyGen() { + return getPropertiesForKeyGen(false); + } + + protected Properties getPropertiesForKeyGen(boolean populateMetaFields) { + Properties properties = new Properties(); + properties.put(HoodieTableConfig.POPULATE_META_FIELDS.key(), String.valueOf(populateMetaFields)); + properties.put("hoodie.datasource.write.recordkey.field", "_row_key"); + properties.put("hoodie.datasource.write.partitionpath.field", "partition_path"); + properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), "_row_key"); + properties.put(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path"); + properties.put(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key(), SimpleKeyGenerator.class.getName()); + return properties; + } + + protected void addConfigsForPopulateMetaFields(HoodieWriteConfig.Builder configBuilder, boolean populateMetaFields) { + configBuilder.withProperties(getPropertiesForKeyGen(populateMetaFields)); + if (!populateMetaFields) { + configBuilder.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.SIMPLE).build()); + } + } + + protected HoodieWriteConfig getConfig(Boolean autoCommit) { + return getConfigBuilder(autoCommit).build(); + } + + protected HoodieWriteConfig getConfig(Boolean autoCommit, Boolean rollbackUsingMarkers) { + return getConfigBuilder(autoCommit, rollbackUsingMarkers, HoodieIndex.IndexType.BLOOM).build(); + } + + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { + return getConfigBuilder(autoCommit, HoodieIndex.IndexType.BLOOM); + } + + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, HoodieIndex.IndexType indexType) { + return getConfigBuilder(autoCommit, false, indexType); + } + + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, long compactionSmallFileSize, HoodieClusteringConfig clusteringConfig) { + return getConfigBuilder(autoCommit, false, HoodieIndex.IndexType.BLOOM, compactionSmallFileSize, clusteringConfig, false); + } + + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, Boolean rollbackUsingMarkers, HoodieIndex.IndexType indexType) { + return getConfigBuilder(autoCommit, rollbackUsingMarkers, indexType, 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), false); + } + + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, Boolean rollbackUsingMarkers, HoodieIndex.IndexType indexType, + long compactionSmallFileSize, 
HoodieClusteringConfig clusteringConfig, boolean preserveCommitMetaForCompaction) { + return HoodieWriteConfig.newBuilder().withPath(basePath()).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withDeleteParallelism(2) + .withAutoCommit(autoCommit) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(compactionSmallFileSize) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).withPreserveCommitMetadata(preserveCommitMetaForCompaction).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024 * 1024).parquetMaxFileSize(1024 * 1024 * 1024).build()) + .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table") + .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() + .withRemoteServerPort(timelineServicePort) + .withEnableBackupForRemoteFileSystemView(false).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()) + .withClusteringConfig(clusteringConfig) + .withRollbackUsingMarkers(rollbackUsingMarkers); + } + + protected Dataset toDataset(List records, Schema schema) { + List avroRecords = records.stream() + .map(r -> { + HoodieRecordPayload payload = (HoodieRecordPayload) r.getData(); + try { + return (GenericRecord) payload.getInsertValue(schema).get(); + } catch (IOException e) { + throw new HoodieIOException("Failed to extract Avro payload", e); + } + }) + .collect(Collectors.toList()); + JavaRDD jrdd = jsc.parallelize(avroRecords, 2); + return AvroConversionUtils.createDataFrame(jrdd.rdd(), schema.toString(), spark); + } + + protected int incrementTimelineServicePortToUse() { + // Increment the timeline service port for each individual test + // to avoid port reuse causing failures + timelineServicePort = (timelineServicePort + 1 - 1024) % (65536 - 1024) + 1024; + return timelineServicePort; + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java index 3d2019dbdcef9..918462ac0a08f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkDatasetTestUtils.java @@ -19,6 +19,7 @@ package org.apache.hudi.testutils; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; @@ -59,12 +60,17 @@ */ public class SparkDatasetTestUtils { + public static final String RECORD_KEY_FIELD_NAME = "record_key"; + public static final String PARTITION_PATH_FIELD_NAME = "partition_path"; + public static final StructType STRUCT_TYPE = new StructType(new StructField[] { new StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), new StructField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), new StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), new StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), new StructField(HoodieRecord.FILENAME_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), + new StructField(RECORD_KEY_FIELD_NAME, 
DataTypes.StringType, false, Metadata.empty()), + new StructField(PARTITION_PATH_FIELD_NAME, DataTypes.StringType, false, Metadata.empty()), new StructField("randomInt", DataTypes.IntegerType, false, Metadata.empty()), new StructField("randomLong", DataTypes.LongType, false, Metadata.empty())}); @@ -74,6 +80,8 @@ public class SparkDatasetTestUtils { new StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), new StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), new StructField(HoodieRecord.FILENAME_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()), + new StructField(RECORD_KEY_FIELD_NAME, DataTypes.StringType, false, Metadata.empty()), + new StructField(PARTITION_PATH_FIELD_NAME, DataTypes.StringType, false, Metadata.empty()), new StructField("randomInt", DataTypes.IntegerType, false, Metadata.empty()), new StructField("randomStr", DataTypes.StringType, false, Metadata.empty())}); @@ -117,7 +125,7 @@ public static Dataset getRandomRows(SQLContext sqlContext, int count, Strin */ public static Row getRandomValue(String partitionPath, boolean isError) { // order commit time, seq no, record key, partition path, file name - Object[] values = new Object[7]; + Object[] values = new Object[9]; values[0] = ""; //commit time if (!isError) { values[1] = ""; // commit seq no @@ -127,11 +135,13 @@ public static Row getRandomValue(String partitionPath, boolean isError) { values[2] = UUID.randomUUID().toString(); values[3] = partitionPath; values[4] = ""; // filename - values[5] = RANDOM.nextInt(); + values[5] = UUID.randomUUID().toString(); + values[6] = partitionPath; + values[7] = RANDOM.nextInt(); if (!isError) { - values[6] = RANDOM.nextLong(); + values[8] = RANDOM.nextLong(); } else { - values[6] = UUID.randomUUID().toString(); + values[8] = UUID.randomUUID().toString(); } return new GenericRow(values); } @@ -154,23 +164,28 @@ public static List toInternalRows(Dataset rows, ExpressionEnco public static InternalRow getInternalRowWithError(String partitionPath) { // order commit time, seq no, record key, partition path, file name String recordKey = UUID.randomUUID().toString(); - Object[] values = new Object[7]; + Object[] values = new Object[9]; values[0] = ""; values[1] = ""; values[2] = recordKey; values[3] = partitionPath; values[4] = ""; - values[5] = RANDOM.nextInt(); - values[6] = RANDOM.nextBoolean(); + values[5] = recordKey; + values[6] = partitionPath; + values[7] = RANDOM.nextInt(); + values[8] = RANDOM.nextBoolean(); return new GenericInternalRow(values); } - public static HoodieWriteConfig.Builder getConfigBuilder(String basePath) { + public static HoodieWriteConfig.Builder getConfigBuilder(String basePath, int timelineServicePort) { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withPopulateMetaFields(true) .withParallelism(2, 2) .withDeleteParallelism(2) .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withRemoteServerPort(timelineServicePort).build()) .forTable("test-trip-table") .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) .withBulkInsertParallelism(2); diff --git 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java index be15dc85d0560..92b1f76ac4024 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java @@ -39,6 +39,8 @@ default SparkConf conf(Map overwritingConfigs) { SparkConf sparkConf = new SparkConf(); sparkConf.set("spark.app.name", getClass().getName()); sparkConf.set("spark.master", "local[*]"); + sparkConf.set("spark.default.parallelism", "4"); + sparkConf.set("spark.sql.shuffle.partitions", "4"); sparkConf.set("spark.driver.maxResultSize", "2g"); sparkConf.set("spark.hadoop.mapred.output.compress", "true"); sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); @@ -52,4 +54,4 @@ default SparkConf conf(Map overwritingConfigs) { default SparkConf conf() { return conf(Collections.emptyMap()); } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire-quiet.properties b/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire-quiet.properties deleted file mode 100644 index 2b94ea2903067..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire-quiet.properties +++ /dev/null @@ -1,30 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# CONSOLE is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# CONSOLE uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire.properties deleted file mode 100644 index 32af462093ae5..0000000000000 --- a/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,31 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache=INFO -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# A1 is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-client/hudi-spark-client/src/test/resources/testDataGeneratorSchema.txt b/hudi-client/hudi-spark-client/src/test/resources/testDataGeneratorSchema.txt index ada01b3530ff5..4cecd7974d5ad 100644 --- a/hudi-client/hudi-spark-client/src/test/resources/testDataGeneratorSchema.txt +++ b/hudi-client/hudi-spark-client/src/test/resources/testDataGeneratorSchema.txt @@ -25,6 +25,10 @@ }, { "name" : "_row_key", "type" : "string" + }, { + "name" : "partition_path", + "type" : ["null", "string"], + "default": null }, { "name" : "rider", "type" : "string" diff --git a/hudi-client/hudi-spark-client/src/test/scala/org/apache/spark/sql/TestHoodieUnsafeRowUtils.scala b/hudi-client/hudi-spark-client/src/test/scala/org/apache/spark/sql/TestHoodieUnsafeRowUtils.scala new file mode 100644 index 0000000000000..c23bbab99b4f0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/scala/org/apache/spark/sql/TestHoodieUnsafeRowUtils.scala @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.spark.sql.HoodieUnsafeRowUtils.{composeNestedFieldPath, getNestedInternalRowValue, getNestedRowValue} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types._ +import org.junit.jupiter.api.Assertions.{assertEquals, fail} +import org.junit.jupiter.api.Test + +class TestHoodieUnsafeRowUtils { + + @Test + def testComposeNestedFieldPath(): Unit = { + val schema = StructType(Seq( + StructField("foo", StringType), + StructField( + name = "bar", + dataType = StructType(Seq( + StructField("baz", DateType), + StructField("bor", LongType) + )) + ) + )) + + assertEquals( + Seq((1, schema(1)), (0, schema(1).dataType.asInstanceOf[StructType](0))), + composeNestedFieldPath(schema, "bar.baz").parts.toSeq) + + assertThrows(classOf[IllegalArgumentException]) { () => + composeNestedFieldPath(schema, "foo.baz") + } + } + + @Test + def testGetNestedInternalRowValue(): Unit = { + val schema = StructType(Seq( + StructField("foo", StringType, nullable = false), + StructField( + name = "bar", + dataType = StructType(Seq( + StructField("baz", DateType), + StructField("bor", LongType) + )) + ) + )) + + val row = InternalRow("str", InternalRow(123, 456L)) + + assertEquals( + 123, + getNestedInternalRowValue(row, composeNestedFieldPath(schema, "bar.baz")) + ) + assertEquals( + 456L, + getNestedInternalRowValue(row, composeNestedFieldPath(schema, "bar.bor")) + ) + assertEquals( + "str", + getNestedInternalRowValue(row, composeNestedFieldPath(schema, "foo")) + ) + assertEquals( + row.getStruct(1, 2), + getNestedInternalRowValue(row, composeNestedFieldPath(schema, "bar")) + ) + + val rowProperNullable = InternalRow("str", null) + + assertEquals( + null, + getNestedInternalRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar.baz")) + ) + assertEquals( + null, + getNestedInternalRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar")) + ) + + val rowInvalidNullable = InternalRow(null, InternalRow(123, 456L)) + + assertThrows(classOf[IllegalArgumentException]) { () => + getNestedInternalRowValue(rowInvalidNullable, composeNestedFieldPath(schema, "foo")) + } + } + + @Test + def testGetNestedRowValue(): Unit = { + val schema = StructType(Seq( + StructField("foo", StringType, nullable = false), + StructField( + name = "bar", + dataType = StructType(Seq( + StructField("baz", DateType), + StructField("bor", LongType) + )) + ) + )) + + val row = Row("str", Row(123, 456L)) + + assertEquals( + 123, + getNestedRowValue(row, composeNestedFieldPath(schema, "bar.baz")) + ) + assertEquals( + 456L, + getNestedRowValue(row, composeNestedFieldPath(schema, "bar.bor")) + ) + assertEquals( + "str", + getNestedRowValue(row, composeNestedFieldPath(schema, "foo")) + ) + assertEquals( + row.getStruct(1), + getNestedRowValue(row, composeNestedFieldPath(schema, "bar")) + ) + + val rowProperNullable = Row("str", null) + + assertEquals( + null, + getNestedRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar.baz")) + ) + assertEquals( + null, + getNestedRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar")) + ) + + val rowInvalidNullable = Row(null, Row(123, 456L)) + + assertThrows(classOf[IllegalArgumentException]) { () => + getNestedRowValue(rowInvalidNullable, composeNestedFieldPath(schema, "foo")) + } + } + + // TODO rebase on ScalaAssertionSupport + private def assertThrows[T <: Throwable](expectedExceptionClass: Class[T])(f: () => Unit): T = { + try { + f.apply() + } catch { + case t: Throwable if 
expectedExceptionClass.isAssignableFrom(t.getClass) => + // scalastyle:off return + return t.asInstanceOf[T] + // scalastyle:on return + case ot @ _ => + fail(s"Expected exception of class $expectedExceptionClass, but ${ot.getClass} has been thrown") + } + + fail(s"Expected exception of class $expectedExceptionClass, but nothing has been thrown") + } + +} diff --git a/hudi-client/pom.xml b/hudi-client/pom.xml index cb838390e07ab..dff531c605ccd 100644 --- a/hudi-client/pom.xml +++ b/hudi-client/pom.xml @@ -19,7 +19,7 @@ hudi org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index 4fbc63da7c644..13f86da6322ed 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -19,7 +19,7 @@ hudi org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 @@ -35,11 +35,8 @@ src/main/resources + - - org.jacoco - jacoco-maven-plugin - org.apache.maven.plugins maven-jar-plugin @@ -60,6 +57,10 @@ org.apache.rat apache-rat-plugin + + org.jacoco + jacoco-maven-plugin + org.apache.avro avro-maven-plugin @@ -77,7 +78,6 @@ ${basedir}/src/main/avro/HoodieRollbackMetadata.avsc ${basedir}/src/main/avro/HoodieRestoreMetadata.avsc ${basedir}/src/main/avro/HoodieReplaceCommitMetadata.avsc - ${basedir}/src/main/avro/HoodieArchivedMetaEntry.avsc ${basedir}/src/main/avro/HoodiePath.avsc ${basedir}/src/main/avro/HoodieFSPermission.avsc ${basedir}/src/main/avro/HoodieFileStatus.avsc @@ -89,6 +89,11 @@ ${basedir}/src/main/avro/HoodieClusteringStrategy.avsc ${basedir}/src/main/avro/HoodieClusteringPlan.avsc ${basedir}/src/main/avro/HoodieRequestedReplaceMetadata.avsc + ${basedir}/src/main/avro/HoodieMetadata.avsc + ${basedir}/src/main/avro/HoodieIndexPartitionInfo.avsc + ${basedir}/src/main/avro/HoodieIndexPlan.avsc + ${basedir}/src/main/avro/HoodieIndexCommitMetadata.avsc + ${basedir}/src/main/avro/HoodieArchivedMetaEntry.avsc @@ -96,6 +101,17 @@ + + org.openjdk.jol + jol-core + + + + + org.apache.logging.log4j + log4j-1.2-api + + com.fasterxml.jackson.core @@ -112,12 +128,26 @@ avro + + + com.github.ben-manes.caffeine + caffeine + + org.apache.parquet parquet-avro + + + org.apache.orc + orc-core + ${orc.version} + nohive + + org.apache.httpcomponents @@ -143,6 +173,7 @@ * + provided org.apache.hadoop @@ -153,6 +184,7 @@ org.apache.hadoop hadoop-hdfs + provided org.apache.hadoop @@ -161,6 +193,13 @@ test + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + org.junit.jupiter junit-jupiter-api @@ -209,14 +248,23 @@ org.apache.hbase hbase-client ${hbase.version} - test + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + - + org.apache.hbase hbase-server ${hbase.version} - + compile @@ -237,5 +285,18 @@ + + + + org.lz4 + lz4-java + 1.8.0 + + + + joda-time + joda-time + test + diff --git a/hudi-common/src/main/avro/HoodieArchivedMetaEntry.avsc b/hudi-common/src/main/avro/HoodieArchivedMetaEntry.avsc index c68ef879e7551..81bcaf745e5b8 100644 --- a/hudi-common/src/main/avro/HoodieArchivedMetaEntry.avsc +++ b/hudi-common/src/main/avro/HoodieArchivedMetaEntry.avsc @@ -104,6 +104,30 @@ "HoodieReplaceCommitMetadata" ], "default": null + }, + { + "name":"hoodieRequestedReplaceMetadata", + "type":[ + "null", + "HoodieRequestedReplaceMetadata" + ], + "default": null + }, + { + "name":"HoodieInflightReplaceMetadata", + "type":[ + "null", + "HoodieCommitMetadata" + ], + "default": null + }, + { + "name":"hoodieIndexCommitMetadata", + "type":[ + "null", + "HoodieIndexCommitMetadata" + ], + "default": null } ] } diff --git 
a/hudi-common/src/main/avro/HoodieCleanMetadata.avsc b/hudi-common/src/main/avro/HoodieCleanMetadata.avsc index c26b5a693b1c1..e51ecd0300cb0 100644 --- a/hudi-common/src/main/avro/HoodieCleanMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieCleanMetadata.avsc @@ -23,6 +23,7 @@ {"name": "timeTakenInMillis", "type": "long"}, {"name": "totalFilesDeleted", "type": "int"}, {"name": "earliestCommitToRetain", "type": "string"}, + {"name": "lastCompletedCommitTimestamp", "type": "string", "default" : ""}, {"name": "partitionMetadata", "type": { "type" : "map", "values" : "HoodieCleanPartitionMetadata" } diff --git a/hudi-common/src/main/avro/HoodieCleanPartitionMetadata.avsc b/hudi-common/src/main/avro/HoodieCleanPartitionMetadata.avsc index 877b7259188f7..3cb096d48bd7a 100644 --- a/hudi-common/src/main/avro/HoodieCleanPartitionMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieCleanPartitionMetadata.avsc @@ -24,6 +24,7 @@ {"name": "policy", "type": "string"}, {"name": "deletePathPatterns", "type": {"type": "array", "items": "string"}}, {"name": "successDeleteFiles", "type": {"type": "array", "items": "string"}}, - {"name": "failedDeleteFiles", "type": {"type": "array", "items": "string"}} + {"name": "failedDeleteFiles", "type": {"type": "array", "items": "string"}}, + {"name": "isPartitionDeleted", "type":["null", "boolean"], "default": null } ] } diff --git a/hudi-common/src/main/avro/HoodieCleanerPlan.avsc b/hudi-common/src/main/avro/HoodieCleanerPlan.avsc index c4481c2cd804c..42842c8be29e9 100644 --- a/hudi-common/src/main/avro/HoodieCleanerPlan.avsc +++ b/hudi-common/src/main/avro/HoodieCleanerPlan.avsc @@ -42,6 +42,11 @@ }], "default" : null }, + { + "name": "lastCompletedCommitTimestamp", + "type": "string", + "default" : "" + }, { "name": "policy", "type": "string" @@ -92,6 +97,14 @@ } }}], "default" : null + }, + { + "name": "partitionsToBeDeleted", + "doc": "partitions to be deleted", + "type":["null", + { "type":"array", "items":"string"} + ], + "default": null } ] } diff --git a/hudi-common/src/main/avro/HoodieClusteringGroup.avsc b/hudi-common/src/main/avro/HoodieClusteringGroup.avsc index fb41f6ef55855..b3a85cf9c0ee3 100644 --- a/hudi-common/src/main/avro/HoodieClusteringGroup.avsc +++ b/hudi-common/src/main/avro/HoodieClusteringGroup.avsc @@ -19,7 +19,6 @@ "namespace":"org.apache.hudi.avro.model", "type":"record", "name":"HoodieClusteringGroup", - "type":"record", "fields":[ { /* Group of files that needs to merged. All the slices in a group will belong to same partition initially. 
@@ -40,6 +39,21 @@ }], "default": null }, + { + "name":"numOutputFileGroups", + "type":["int", "null"], + "default": 1 + }, + { + /* Used to track extra metadata to facilitate clustering execution + */ + "name":"extraMetadata", + "type":["null", { + "type":"map", + "values":"string" + }], + "default": null + }, { "name":"version", "type":["int", "null"], diff --git a/hudi-common/src/main/avro/HoodieClusteringPlan.avsc b/hudi-common/src/main/avro/HoodieClusteringPlan.avsc index 709a0eb72df80..87486267d1ce5 100644 --- a/hudi-common/src/main/avro/HoodieClusteringPlan.avsc +++ b/hudi-common/src/main/avro/HoodieClusteringPlan.avsc @@ -30,7 +30,7 @@ }, { "name":"strategy", - "type":["HoodieClusteringStrategy", "null"], + "type":["null", "HoodieClusteringStrategy"], "default": null }, { @@ -45,6 +45,11 @@ "name":"version", "type":["int", "null"], "default": 1 + }, + { + "name":"preserveHoodieMetadata", + "type":["null", "boolean"], + "default": null } ] } diff --git a/hudi-common/src/main/avro/HoodieCommitMetadata.avsc b/hudi-common/src/main/avro/HoodieCommitMetadata.avsc index b7e7369456728..0f2a563d9199b 100644 --- a/hudi-common/src/main/avro/HoodieCommitMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieCommitMetadata.avsc @@ -125,7 +125,8 @@ "name":"extraMetadata", "type":["null", { "type":"map", - "values":"string" + "values":"string", + "default": null }], "default": null }, diff --git a/hudi-common/src/main/avro/HoodieFSPermission.avsc b/hudi-common/src/main/avro/HoodieFSPermission.avsc index e5893352b3099..b64b16c5d0f7c 100644 --- a/hudi-common/src/main/avro/HoodieFSPermission.avsc +++ b/hudi-common/src/main/avro/HoodieFSPermission.avsc @@ -28,22 +28,22 @@ { "name":"userAction", "type":[ "null", "string" ], - "default": "null" + "default": null }, { "name":"groupAction", "type":[ "null", "string" ], - "default": "null" + "default": null }, { "name":"otherAction", "type":[ "null", "string" ], - "default": "null" + "default": null }, { "name":"stickyBit", "type":[ "null", "boolean" ], - "default": "null" + "default": null } ] } diff --git a/hudi-common/src/main/avro/HoodieIndexCommitMetadata.avsc b/hudi-common/src/main/avro/HoodieIndexCommitMetadata.avsc new file mode 100644 index 0000000000000..098a8c88e7328 --- /dev/null +++ b/hudi-common/src/main/avro/HoodieIndexCommitMetadata.avsc @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "HoodieIndexCommitMetadata", + "fields": [ + { + "name": "version", + "doc": "This field replaces the field filesToBeDeletedPerPartition", + "type": [ + "int", + "null" + ], + "default": 1 + }, + { + "name": "operationType", + "doc": "This field replaces the field filesToBeDeletedPerPartition", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "indexPartitionInfos", + "doc": "This field contains the info for each partition that got indexed", + "type": [ + "null", + { + "type": "array", + "items": "HoodieIndexPartitionInfo" + } + ], + "default": null + } + ] +} diff --git a/hudi-common/src/main/avro/HoodieIndexPartitionInfo.avsc b/hudi-common/src/main/avro/HoodieIndexPartitionInfo.avsc new file mode 100644 index 0000000000000..52ed1e96aa3b3 --- /dev/null +++ b/hudi-common/src/main/avro/HoodieIndexPartitionInfo.avsc @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "HoodieIndexPartitionInfo", + "fields": [ + { + "name": "version", + "type": [ + "int", + "null" + ], + "default": 1 + }, + { + "name": "metadataPartitionPath", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "indexUptoInstant", + "type": [ + "null", + "string" + ], + "default": null + } + ] +} diff --git a/hudi-common/src/main/avro/HoodieIndexPlan.avsc b/hudi-common/src/main/avro/HoodieIndexPlan.avsc new file mode 100644 index 0000000000000..9fb7ec311e34a --- /dev/null +++ b/hudi-common/src/main/avro/HoodieIndexPlan.avsc @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "HoodieIndexPlan", + "fields": [ + { + "name": "version", + "type": [ + "int", + "null" + ], + "default": 1 + }, + { + "name": "indexPartitionInfos", + "type": [ + "null", + { + "type": "array", + "items": "HoodieIndexPartitionInfo" + } + ], + "default": null + } + ] +} diff --git a/hudi-common/src/main/avro/HoodieMergeArchiveFilePlan.avsc b/hudi-common/src/main/avro/HoodieMergeArchiveFilePlan.avsc new file mode 100644 index 0000000000000..2284109f7cd45 --- /dev/null +++ b/hudi-common/src/main/avro/HoodieMergeArchiveFilePlan.avsc @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "namespace":"org.apache.hudi.avro.model", + "type":"record", + "name":"HoodieMergeArchiveFilePlan", + "fields":[ + { + "name":"version", + "type":["int", "null"], + "default": 1 + }, + { + "name":"candidate", + "type":["null", { + "type":"array", + "items": "string" + }], + "default": null + }, + { + "name":"mergedArchiveFileName", + "type":["null", "string"], + "default": null + } + ] +} diff --git a/hudi-common/src/main/avro/HoodieMetadata.avsc b/hudi-common/src/main/avro/HoodieMetadata.avsc new file mode 100644 index 0000000000000..a47cbf3784f52 --- /dev/null +++ b/hudi-common/src/main/avro/HoodieMetadata.avsc @@ -0,0 +1,363 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "HoodieMetadataRecord", + "doc": "A record saved within the Metadata Table", + "fields": [ + { + "name": "key", + "type": "string" + }, + { + "name": "type", + "doc": "Type of the metadata record", + "type": "int" + }, + { + "doc": "Contains information about partitions and files within the dataset", + "name": "filesystemMetadata", + "type": [ + "null", + { + "type": "map", + "values": { + "type": "record", + "name": "HoodieMetadataFileInfo", + "fields": [ + { + "name": "size", + "type": "long", + "doc": "Size of the file" + }, + { + "name": "isDeleted", + "type": "boolean", + "doc": "True if this file has been deleted" + } + ] + } + } + ] + }, + { + "doc": "Metadata Index of bloom filters for all data files in the user table", + "name": "BloomFilterMetadata", + "type": [ + "null", + { + "doc": "Data file bloom filter details", + "name": "HoodieMetadataBloomFilter", + "type": "record", + "fields": [ + { + "doc": "Bloom filter type code", + "name": "type", + "type": "string" + }, + { + "doc": "Instant timestamp when this metadata was created/updated", + "name": "timestamp", + "type": "string" + }, + { + "doc": "Bloom filter binary byte array", + "name": "bloomFilter", + "type": "bytes" + }, + { + "doc": "Bloom filter entry valid/deleted flag", + "name": "isDeleted", + "type": "boolean" + } + ] + } + ], + "default" : null + }, + { + "doc": "Metadata Index of column statistics for all data files in the user table", + "name": "ColumnStatsMetadata", + "type": [ + "null", + { + "doc": "Data file column statistics", + "name": "HoodieMetadataColumnStats", + "type": "record", + "fields": [ + { + "doc": "File name for which this column statistics applies", + "name": "fileName", + "type": [ + "null", + "string" + ], + "default" : null + }, + { + "doc": "Column name for which this column statistics applies", + "name": "columnName", + "type": [ + "null", + "string" + ], + "default" : null + }, + { + "doc": "Minimum value in the range. 
Based on user data table schema, we can convert this to appropriate type", + "name": "minValue", + "type": [ + // Those types should be aligned with Parquet `Statistics` impl + // making sure that we implement semantic consistent across file formats + // + // NOTE: Other logical types (decimal, date, timestamp, etc) will be converted + // into one of the following types, making sure that their corresponding + // ordering is preserved + "null", + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "BooleanWrapper", + "doc": "A record wrapping boolean type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": "boolean", + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "IntWrapper", + "doc": "A record wrapping int type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": "int", + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "LongWrapper", + "doc": "A record wrapping long type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": "long", + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "FloatWrapper", + "doc": "A record wrapping float type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": "float", + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "DoubleWrapper", + "doc": "A record wrapping double type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": "double", + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "BytesWrapper", + "doc": "A record wrapping bytes type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": "bytes", + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "StringWrapper", + "doc": "A record wrapping string type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": "string", + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "DateWrapper", + "doc": "A record wrapping Date logical type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": { + "type": "int" + // NOTE: Due to breaking changes in code-gen b/w Avro 1.8.2 and 1.10, we can't + // rely on logical types to do proper encoding of the native Java types, + // and hereby have to encode statistic manually + //"logicalType": "date" + }, + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "DecimalWrapper", + "doc": "A record wrapping Decimal logical type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": { + "type": "bytes", + "logicalType": "decimal", + // NOTE: This is equivalent to Spark's [[DoubleDecimal]] and should + // be enough for almost any possible use-cases + "precision": 30, + "scale": 15 + }, + "name": "value" + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "TimeMicrosWrapper", + "doc": "A record wrapping Time-micros logical type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": { + "type": "long", + "logicalType": "time-micros" + }, + "name": "value" + + } + ] + }, + { + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "TimestampMicrosWrapper", 
+ "doc": "A record wrapping Timestamp-micros logical type to be able to be used it w/in Avro's Union", + "fields": [ + { + "type": { + "type": "long" + // NOTE: Due to breaking changes in code-gen b/w Avro 1.8.2 and 1.10, we can't + // rely on logical types to do proper encoding of the native Java types, + // and hereby have to encode statistic manually + //"logicalType": "timestamp-micros" + }, + "name": "value" + } + ] + } + ], + "default": null + }, + { + "doc": "Maximum value in the range. Based on user data table schema, we can convert it to appropriate type", + "name": "maxValue", + "type": [ + // Those types should be aligned with Parquet `Statistics` impl + // making sure that we implement semantic consistent across file formats + // + // NOTE: Other logical types (decimal, date, timestamp, etc) will be converted + // into one of the following types, making sure that their corresponding + // ordering is preserved + "null", + "org.apache.hudi.avro.model.BooleanWrapper", + "org.apache.hudi.avro.model.IntWrapper", + "org.apache.hudi.avro.model.LongWrapper", + "org.apache.hudi.avro.model.FloatWrapper", + "org.apache.hudi.avro.model.DoubleWrapper", + "org.apache.hudi.avro.model.BytesWrapper", + "org.apache.hudi.avro.model.StringWrapper", + "org.apache.hudi.avro.model.DateWrapper", + "org.apache.hudi.avro.model.DecimalWrapper", + "org.apache.hudi.avro.model.TimeMicrosWrapper", + "org.apache.hudi.avro.model.TimestampMicrosWrapper" + ], + "default": null + }, + { + "doc": "Total count of values", + "name": "valueCount", + "type": [ + "null", + "long" + ], + "default": null + }, + { + "doc": "Total count of null values", + "name": "nullCount", + "type": [ + "null", + "long" + ], + "default": null + }, + { + "doc": "Total storage size on disk", + "name": "totalSize", + "type": [ + "null", + "long" + ], + "default": null + }, + { + "doc": "Total uncompressed storage size on disk", + "name": "totalUncompressedSize", + "type": [ + "null", + "long" + ], + "default": null + }, + { + "doc": "Column range entry valid/deleted flag", + "name": "isDeleted", + "type": "boolean" + } + ] + } + ], + "default" : null + } + ] +} diff --git a/hudi-common/src/main/avro/HoodieRequestedReplaceMetadata.avsc b/hudi-common/src/main/avro/HoodieRequestedReplaceMetadata.avsc index f98f42410f340..bc06780e3b9b1 100644 --- a/hudi-common/src/main/avro/HoodieRequestedReplaceMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieRequestedReplaceMetadata.avsc @@ -23,11 +23,11 @@ { "name":"operationType", "type":["null", "string"], - "default": "" + "default": null }, { "name":"clusteringPlan", /* only set if operationType == clustering" */ - "type":["HoodieClusteringPlan", "null"], + "type":["null", "HoodieClusteringPlan"], "default": null }, { diff --git a/hudi-common/src/main/avro/HoodieRestoreMetadata.avsc b/hudi-common/src/main/avro/HoodieRestoreMetadata.avsc index 6c6f98ff1ee59..f9795d334846c 100644 --- a/hudi-common/src/main/avro/HoodieRestoreMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieRestoreMetadata.avsc @@ -38,7 +38,7 @@ /* overlaps with 'instantsToRollback' field. Adding this to track action type for all the instants being rolled back. 
*/ { "name": "restoreInstantInfo", - "default": null, + "default": [], "type": { "type": "array", "default": null, diff --git a/hudi-common/src/main/avro/HoodieRestorePlan.avsc b/hudi-common/src/main/avro/HoodieRestorePlan.avsc new file mode 100644 index 0000000000000..1ad9e6a4b9c80 --- /dev/null +++ b/hudi-common/src/main/avro/HoodieRestorePlan.avsc @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "namespace":"org.apache.hudi.avro.model", + "type":"record", + "name":"HoodieRestorePlan", + "fields":[ + { + "name": "instantsToRollback", + "default": [], + "type": { + "type": "array", + "default": null, + "items": "HoodieInstantInfo" + } + }, + { + "name":"version", + "type":["int", "null"], + "default": 1 + }] +} \ No newline at end of file diff --git a/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc b/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc index a6bd4c20ef278..5a300cda9e638 100644 --- a/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc @@ -30,11 +30,17 @@ "fields": [ {"name": "partitionPath", "type": "string"}, {"name": "successDeleteFiles", "type": {"type": "array", "items": "string"}}, - {"name": "failedDeleteFiles", "type": {"type": "array", "items": "string"}} + {"name": "failedDeleteFiles", "type": {"type": "array", "items": "string"}}, + {"name": "rollbackLogFiles", "type": ["null", { + "type": "map", + "doc": "Files to which append blocks were written to capture rollback commit", + "values": { + "type": "long", + "doc": "Size of this file in bytes" + } + }], "default":null } ] - } - } - }, + }}}, { "name":"version", "type":["int", "null"], @@ -43,10 +49,10 @@ /* overlaps with 'commitsRollback' field. Adding this to track action type for all the instants being rolled back. */ { "name": "instantsRollback", - "default": null, + "default": [], "type": { "type": "array", - "default": null, + "default": [], "items": "HoodieInstantInfo" } } diff --git a/hudi-common/src/main/avro/HoodieRollbackPlan.avsc b/hudi-common/src/main/avro/HoodieRollbackPlan.avsc new file mode 100644 index 0000000000000..99e0755bd52ce --- /dev/null +++ b/hudi-common/src/main/avro/HoodieRollbackPlan.avsc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "namespace": "org.apache.hudi.avro.model", + "type": "record", + "name": "HoodieRollbackPlan", + "fields": [ + { + "name": "instantToRollback", + "doc": "Hoodie instant that needs to be rolled back", + "type": ["null", "HoodieInstantInfo"], + "default": null + }, + { + "name": "RollbackRequests", + "type":["null", { + "type":"array", + "items":{ + "type": "record", + "name": "HoodieRollbackRequest", + "fields": [ + {"name": "partitionPath", "type": "string"}, + {"name": "fileId", + "type":["null", "string"], + "default": null + }, + {"name": "latestBaseInstant", + "type":["null", "string"], + "default": null + }, + {"name": "filesToBeDeleted", + "default": [], + "type": { + "type": "array", + "default": [], + "items": "string" + } + }, + {"name": "logBlocksToBeDeleted", + "type": ["null", { + "type": "map", + "doc": "Log blocks that need to be deleted as part of the rollback", + "values": { + "type": "long", + "doc": "Size of this file/block in bytes" + } + }], + "default":null + } + ] + } + }], + "default" : null + }, + { + "name":"version", + "type":["int", "null"], + "default": 1 + } + ] +} diff --git a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java new file mode 100644 index 0000000000000..ef627c67f2e3e --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java @@ -0,0 +1,410 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieTableQueryType; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.apache.hudi.hadoop.CachingPath.createPathUnsafe; +import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; +import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE; + +/** + * Common (engine-agnostic) File Index implementation enabling individual query engines to + * list Hudi Table contents based on the following: + * + *
+ * <ul> + *   <li>Table type (MOR, COW)</li> + *   <li>Query type (snapshot, read_optimized, incremental)</li> + *   <li>Query instant/range</li> + * </ul>
    + */ +public abstract class BaseHoodieTableFileIndex implements AutoCloseable { + private static final Logger LOG = LogManager.getLogger(BaseHoodieTableFileIndex.class); + + private final String[] partitionColumns; + + protected final HoodieMetadataConfig metadataConfig; + + private final HoodieTableQueryType queryType; + private final Option specifiedQueryInstant; + protected final List queryPaths; + + private final boolean shouldIncludePendingCommits; + private final boolean shouldValidateInstant; + + private final HoodieTableType tableType; + protected final Path basePath; + + private final HoodieTableMetaClient metaClient; + private final HoodieEngineContext engineContext; + + private final transient FileStatusCache fileStatusCache; + + protected transient volatile long cachedFileSize = 0L; + protected transient volatile Map> cachedAllInputFileSlices; + + protected volatile boolean queryAsNonePartitionedTable = false; + + private transient volatile HoodieTableFileSystemView fileSystemView = null; + + private transient HoodieTableMetadata tableMetadata = null; + + /** + * @param engineContext Hudi engine-specific context + * @param metaClient Hudi table's meta-client + * @param configProperties unifying configuration (in the form of generic properties) + * @param queryType target query type + * @param queryPaths target DFS paths being queried + * @param specifiedQueryInstant instant as of which table is being queried + * @param shouldIncludePendingCommits flags whether file-index should exclude any pending operations + * @param shouldValidateInstant flags to validate whether query instant is present in the timeline + * @param fileStatusCache transient cache of fetched [[FileStatus]]es + */ + public BaseHoodieTableFileIndex(HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, + TypedProperties configProperties, + HoodieTableQueryType queryType, + List queryPaths, + Option specifiedQueryInstant, + boolean shouldIncludePendingCommits, + boolean shouldValidateInstant, + FileStatusCache fileStatusCache) { + this.partitionColumns = metaClient.getTableConfig().getPartitionFields() + .orElse(new String[0]); + + this.metadataConfig = HoodieMetadataConfig.newBuilder() + .fromProperties(configProperties) + .enable(configProperties.getBoolean(ENABLE.key(), DEFAULT_METADATA_ENABLE_FOR_READERS) + && HoodieTableMetadataUtil.isFilesPartitionAvailable(metaClient)) + .build(); + + this.queryType = queryType; + this.queryPaths = queryPaths; + this.specifiedQueryInstant = specifiedQueryInstant; + this.shouldIncludePendingCommits = shouldIncludePendingCommits; + this.shouldValidateInstant = shouldValidateInstant; + + this.tableType = metaClient.getTableType(); + this.basePath = metaClient.getBasePathV2(); + + this.metaClient = metaClient; + this.engineContext = engineContext; + this.fileStatusCache = fileStatusCache; + + doRefresh(); + } + + protected abstract Object[] parsePartitionColumnValues(String[] partitionColumns, String partitionPath); + + /** + * Returns latest completed instant as seen by this instance of the file-index + */ + public Option getLatestCompletedInstant() { + return getActiveTimeline().filterCompletedInstants().lastInstant(); + } + + /** + * Returns table's base-path + */ + public String getBasePath() { + return basePath.toString(); + } + + /** + * Fetch list of latest base files and log files per partition. 
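A minimal usage sketch of this method (the fileIndex instance stands for a hypothetical concrete, engine-specific subclass of BaseHoodieTableFileIndex):

    // Types involved: java.util.Map, java.util.List, org.apache.hudi.common.model.FileSlice
    Map<String, List<FileSlice>> slicesByPartition = fileIndex.listFileSlices();
    int totalSlices = slicesByPartition.values().stream().mapToInt(List::size).sum();
    // totalSlices is equivalent to fileIndex.getFileSlicesCount(), which sums over the same cached slices
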
+ * + * @return mapping from string partition paths to its base/log files + */ + public Map> listFileSlices() { + return cachedAllInputFileSlices.entrySet() + .stream() + .collect(Collectors.toMap(e -> e.getKey().path, Map.Entry::getValue)); + } + + public int getFileSlicesCount() { + return cachedAllInputFileSlices.values().stream() + .mapToInt(List::size).sum(); + } + + @Override + public void close() throws Exception { + resetTableMetadata(null); + } + + protected List getAllQueryPartitionPaths() { + List queryRelativePartitionPaths = queryPaths.stream() + .map(path -> FSUtils.getRelativePartitionPath(basePath, path)) + .collect(Collectors.toList()); + + // Load all the partition path from the basePath, and filter by the query partition path. + // TODO load files from the queryRelativePartitionPaths directly. + List matchedPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, metadataConfig, basePath.toString()) + .stream() + .filter(path -> queryRelativePartitionPaths.stream().anyMatch(path::startsWith)) + .collect(Collectors.toList()); + + // Convert partition's path into partition descriptor + return matchedPartitionPaths.stream() + .map(partitionPath -> { + Object[] partitionColumnValues = parsePartitionColumnValues(partitionColumns, partitionPath); + return new PartitionPath(partitionPath, partitionColumnValues); + }) + .collect(Collectors.toList()); + } + + protected void refresh() { + fileStatusCache.invalidate(); + doRefresh(); + } + + protected HoodieTimeline getActiveTimeline() { + // NOTE: We have to use commits and compactions timeline, to make sure that we're properly + // handling the following case: when records are inserted into the new log-file w/in the file-group + // that is under the pending compaction process, new log-file will bear the compaction's instant (on the + // timeline) in its name, as opposed to the base-file's commit instant. To make sure we're not filtering + // such log-file we have to _always_ include pending compaction instants into consideration + // TODO(HUDI-3302) re-evaluate whether we should filter any commits in here + HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline(); + if (shouldIncludePendingCommits) { + return timeline; + } else { + return timeline.filterCompletedAndCompactionInstants(); + } + } + + /** + * Load all partition paths and it's files under the query table path. 
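A hedged consumer-side sketch of the listing API defined above; fileIndex stands in for any concrete engine-specific subclass instance and is assumed to exist.

    import java.util.List;
    import java.util.Map;
    import org.apache.hudi.common.model.FileSlice;

    // Latest base/log files grouped by relative partition path, as exposed by listFileSlices().
    Map<String, List<FileSlice>> slicesByPartition = fileIndex.listFileSlices();
    // Equivalent to summing the sizes of the per-partition lists above.
    int totalSlices = fileIndex.getFileSlicesCount();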
+ */ + private Map loadPartitionPathFiles() { + // List files in all partition paths + List pathToFetch = new ArrayList<>(); + Map cachedPartitionToFiles = new HashMap<>(); + + // Fetch from the FileStatusCache + List partitionPaths = getAllQueryPartitionPaths(); + partitionPaths.forEach(partitionPath -> { + Option filesInPartition = fileStatusCache.get(partitionPath.fullPartitionPath(basePath)); + if (filesInPartition.isPresent()) { + cachedPartitionToFiles.put(partitionPath, filesInPartition.get()); + } else { + pathToFetch.add(partitionPath); + } + }); + + Map fetchedPartitionToFiles; + + if (pathToFetch.isEmpty()) { + fetchedPartitionToFiles = Collections.emptyMap(); + } else { + Map fullPartitionPathsMapToFetch = pathToFetch.stream() + .collect(Collectors.toMap( + partitionPath -> partitionPath.fullPartitionPath(basePath).toString(), + Function.identity()) + ); + + fetchedPartitionToFiles = + getAllFilesInPartitionsUnchecked(fullPartitionPathsMapToFetch.keySet()) + .entrySet() + .stream() + .collect(Collectors.toMap(e -> fullPartitionPathsMapToFetch.get(e.getKey()), e -> e.getValue())); + + } + + // Update the fileStatusCache + fetchedPartitionToFiles.forEach((partitionPath, filesInPartition) -> { + fileStatusCache.put(partitionPath.fullPartitionPath(basePath), filesInPartition); + }); + + return CollectionUtils.combine(cachedPartitionToFiles, fetchedPartitionToFiles); + } + + private void doRefresh() { + long startTime = System.currentTimeMillis(); + + HoodieTableMetadata newTableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePath.toString(), + FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue(), true); + + resetTableMetadata(newTableMetadata); + + Map partitionFiles = loadPartitionPathFiles(); + FileStatus[] allFiles = partitionFiles.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new); + + metaClient.reloadActiveTimeline(); + + HoodieTimeline activeTimeline = getActiveTimeline(); + Option latestInstant = activeTimeline.lastInstant(); + + // TODO we can optimize the flow by: + // - First fetch list of files from instants of interest + // - Load FileStatus's + this.fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles); + + Option queryInstant = + specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp)); + + validate(activeTimeline, queryInstant); + + // NOTE: For MOR table, when the compaction is inflight, we need to not only fetch the + // latest slices, but also include the base and log files of the second-last version of + // the file slice in the same file group as the latest file slice that is under compaction. + // This logic is realized by `AbstractTableFileSystemView::getLatestMergedFileSlicesBeforeOrOn` + // API. Note that for COW table, the merging logic of two slices does not happen as there + // is no compaction, thus there is no performance impact. + cachedAllInputFileSlices = partitionFiles.keySet().stream() + .collect(Collectors.toMap( + Function.identity(), + partitionPath -> + queryInstant.map(instant -> + fileSystemView.getLatestMergedFileSlicesBeforeOrOn(partitionPath.path, queryInstant.get()) + ) + .orElse(fileSystemView.getLatestFileSlices(partitionPath.path)) + .collect(Collectors.toList()) + ) + ); + + cachedFileSize = cachedAllInputFileSlices.values().stream() + .flatMap(Collection::stream) + .mapToLong(BaseHoodieTableFileIndex::fileSliceSize) + .sum(); + + // If the partition value contains InternalRow.empty, we query it as a non-partitioned table. 
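For context, a minimal sketch of the FileStatusCache hook consumed by the partition-listing logic above; the class name and its placement in the org.apache.hudi package (so the protected nested interface is visible) are illustrative assumptions.

    package org.apache.hudi;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;
    import org.apache.hudi.common.util.Option;

    // A no-op cache: every lookup misses, so loadPartitionPathFiles() always re-lists partitions
    // through the table metadata (or the file system) instead of serving cached statuses.
    class NoOpFileStatusCache implements BaseHoodieTableFileIndex.FileStatusCache {
      @Override
      public Option<FileStatus[]> get(Path path) {
        return Option.empty();
      }

      @Override
      public void put(Path path, FileStatus[] leafFiles) {
        // nothing is retained between refreshes
      }

      @Override
      public void invalidate() {
        // nothing to invalidate
      }
    }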
+ queryAsNonePartitionedTable = partitionFiles.keySet().stream().anyMatch(p -> p.values.length == 0); + + long duration = System.currentTimeMillis() - startTime; + + LOG.info(String.format("Refresh table %s, spent: %d ms", metaClient.getTableConfig().getTableName(), duration)); + } + + private Map getAllFilesInPartitionsUnchecked(Collection fullPartitionPathsMapToFetch) { + try { + return tableMetadata.getAllFilesInPartitions(new ArrayList<>(fullPartitionPathsMapToFetch)); + } catch (IOException e) { + throw new HoodieIOException("Failed to list partition paths for a table", e); + } + } + + private void validate(HoodieTimeline activeTimeline, Option queryInstant) { + if (shouldValidateInstant) { + if (queryInstant.isPresent() && !activeTimeline.containsInstant(queryInstant.get())) { + throw new HoodieIOException(String.format("Query instant (%s) not found in the timeline", queryInstant.get())); + } + } + } + + private static long fileSliceSize(FileSlice fileSlice) { + long logFileSize = fileSlice.getLogFiles().map(HoodieLogFile::getFileSize) + .filter(s -> s > 0) + .reduce(0L, Long::sum); + + return fileSlice.getBaseFile().map(BaseFile::getFileLen).orElse(0L) + logFileSize; + } + + private void resetTableMetadata(HoodieTableMetadata newTableMetadata) { + if (tableMetadata != null) { + try { + tableMetadata.close(); + } catch (Exception e) { + throw new HoodieException("Failed to close HoodieTableMetadata instance", e); + } + } + tableMetadata = newTableMetadata; + } + + public static final class PartitionPath { + + final String path; + final Object[] values; + + public PartitionPath(String path, Object[] values) { + this.path = path; + this.values = values; + } + + public String getPath() { + return path; + } + + Path fullPartitionPath(Path basePath) { + if (!path.isEmpty()) { + // NOTE: Since we now that the path is a proper relative path that doesn't require + // normalization we create Hadoop's Path using more performant unsafe variant + return new CachingPath(basePath, createPathUnsafe(path)); + } + + return basePath; + } + + @Override + public boolean equals(Object other) { + return other instanceof PartitionPath + && Objects.equals(path, ((PartitionPath) other).path) + && Arrays.equals(values, ((PartitionPath) other).values); + } + + @Override + public int hashCode() { + return path.hashCode() * 1103 + Arrays.hashCode(values); + } + } + + protected interface FileStatusCache { + Option get(Path path); + + void put(Path path, FileStatus[] leafFiles); + + void invalidate(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java new file mode 100644 index 0000000000000..db523a2911706 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.avro; + +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.Schema; + +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; + +public class AvroSchemaUtils { + + private AvroSchemaUtils() {} + + /** + * Appends provided new fields at the end of the given schema + * + * NOTE: No deduplication is made, this method simply appends fields at the end of the list + * of the source schema as is + */ + public static Schema appendFieldsToSchema(Schema schema, List newFields) { + List fields = schema.getFields().stream() + .map(field -> new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal())) + .collect(Collectors.toList()); + fields.addAll(newFields); + + Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError()); + newSchema.setFields(fields); + return newSchema; + } + + /** + * Passed in {@code Union} schema and will try to resolve the field with the {@code fieldSchemaFullName} + * w/in the union returning its corresponding schema + * + * @param schema target schema to be inspected + * @param fieldSchemaFullName target field-name to be looked up w/in the union + * @return schema of the field w/in the union identified by the {@code fieldSchemaFullName} + */ + public static Schema resolveUnionSchema(Schema schema, String fieldSchemaFullName) { + if (schema.getType() != Schema.Type.UNION) { + return schema; + } + + List innerTypes = schema.getTypes(); + Schema nonNullType = + innerTypes.stream() + .filter(it -> it.getType() != Schema.Type.NULL && Objects.equals(it.getFullName(), fieldSchemaFullName)) + .findFirst() + .orElse(null); + + if (nonNullType == null) { + throw new AvroRuntimeException( + String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); + } + + return nonNullType; + } + + /** + * Returns true in case provided {@link Schema} is nullable (ie accepting null values), + * returns false otherwise + */ + public static boolean isNullable(Schema schema) { + if (schema.getType() != Schema.Type.UNION) { + return false; + } + + List innerTypes = schema.getTypes(); + return innerTypes.size() > 1 && innerTypes.stream().anyMatch(it -> it.getType() == Schema.Type.NULL); + } + + /** + * Resolves typical Avro's nullable schema definition: {@code Union(Schema.Type.NULL, )}, + * decomposing union and returning the target non-null type + */ + public static Schema resolveNullableSchema(Schema schema) { + if (schema.getType() != Schema.Type.UNION) { + return schema; + } + + List innerTypes = schema.getTypes(); + Schema nonNullType = + innerTypes.stream() + .filter(it -> it.getType() != Schema.Type.NULL) + .findFirst() + .orElse(null); + + if (innerTypes.size() != 2 || nonNullType == null) { + throw new AvroRuntimeException( + String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); + } + + return nonNullType; + } + + /** + * Creates schema following Avro's typical nullable schema definition: {@code Union(Schema.Type.NULL, )}, + * wrapping around provided target non-null type + */ + public static Schema createNullableSchema(Schema.Type avroType) { + checkState(avroType != Schema.Type.NULL); + return Schema.createUnion(Schema.create(Schema.Type.NULL), 
Schema.create(avroType)); + } + + /** + * Returns true in case when schema contains the field w/ provided name + */ + public static boolean containsFieldInSchema(Schema schema, String fieldName) { + try { + Schema.Field field = schema.getField(fieldName); + return field != null; + } catch (Exception e) { + return false; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/ConvertingGenericData.java b/hudi-common/src/main/java/org/apache/hudi/avro/ConvertingGenericData.java new file mode 100644 index 0000000000000..9d36e214fb852 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/avro/ConvertingGenericData.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.avro; + +import org.apache.avro.Conversions; +import org.apache.avro.Schema; +import org.apache.avro.UnresolvedUnionException; +import org.apache.avro.data.TimeConversions; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericFixed; + +import java.util.Map; + +/** + * Custom instance of the {@link GenericData} model incorporating conversions from the + * common Avro logical types like "decimal", "uuid", "date", "time-micros", "timestamp-micros" + * + * NOTE: Given that this code has to be interoperable w/ Spark 2 (which relies on Avro 1.8.2) + * this model can't support newer conversion introduced in Avro 1.10 at the moment + */ +public class ConvertingGenericData extends GenericData { + + private static final Conversions.DecimalConversion DECIMAL_CONVERSION = new Conversions.DecimalConversion(); + private static final Conversions.UUIDConversion UUID_CONVERSION = new Conversions.UUIDConversion(); + private static final TimeConversions.DateConversion DATE_CONVERSION = new TimeConversions.DateConversion(); + private static final TimeConversions.TimeMicrosConversion TIME_MICROS_CONVERSION = new TimeConversions.TimeMicrosConversion(); + private static final TimeConversions.TimestampMicrosConversion TIMESTAMP_MICROS_CONVERSION = new TimeConversions.TimestampMicrosConversion(); + + // NOTE: Those are not supported in Avro 1.8.2 + // TODO re-enable upon upgrading to 1.10 + // private static final TimeConversions.TimestampMillisConversion TIMESTAMP_MILLIS_CONVERSION = new TimeConversions.TimestampMillisConversion(); + // private static final TimeConversions.TimeMillisConversion TIME_MILLIS_CONVERSION = new TimeConversions.TimeMillisConversion(); + // private static final TimeConversions.LocalTimestampMillisConversion LOCAL_TIMESTAMP_MILLIS_CONVERSION = new TimeConversions.LocalTimestampMillisConversion(); + // private static final TimeConversions.LocalTimestampMicrosConversion LOCAL_TIMESTAMP_MICROS_CONVERSION = new TimeConversions.LocalTimestampMicrosConversion(); + + public static final 
GenericData INSTANCE = new ConvertingGenericData(); + + private ConvertingGenericData() { + addLogicalTypeConversion(DECIMAL_CONVERSION); + addLogicalTypeConversion(UUID_CONVERSION); + addLogicalTypeConversion(DATE_CONVERSION); + addLogicalTypeConversion(TIME_MICROS_CONVERSION); + addLogicalTypeConversion(TIMESTAMP_MICROS_CONVERSION); + // NOTE: Those are not supported in Avro 1.8.2 + // TODO re-enable upon upgrading to 1.10 + // addLogicalTypeConversion(TIME_MILLIS_CONVERSION); + // addLogicalTypeConversion(TIMESTAMP_MILLIS_CONVERSION); + // addLogicalTypeConversion(LOCAL_TIMESTAMP_MILLIS_CONVERSION); + // addLogicalTypeConversion(LOCAL_TIMESTAMP_MICROS_CONVERSION); + } + + @Override + public boolean validate(Schema schema, Object datum) { + switch (schema.getType()) { + case RECORD: + if (!isRecord(datum)) { + return false; + } + for (Schema.Field f : schema.getFields()) { + if (!validate(f.schema(), getField(datum, f.name(), f.pos()))) { + return false; + } + } + return true; + case ENUM: + if (!isEnum(datum)) { + return false; + } + return schema.getEnumSymbols().contains(datum.toString()); + case ARRAY: + if (!(isArray(datum))) { + return false; + } + for (Object element : getArrayAsCollection(datum)) { + if (!validate(schema.getElementType(), element)) { + return false; + } + } + return true; + case MAP: + if (!(isMap(datum))) { + return false; + } + @SuppressWarnings(value = "unchecked") + Map map = (Map) datum; + for (Map.Entry entry : map.entrySet()) { + if (!validate(schema.getValueType(), entry.getValue())) { + return false; + } + } + return true; + case UNION: + try { + int i = resolveUnion(schema, datum); + return validate(schema.getTypes().get(i), datum); + } catch (UnresolvedUnionException e) { + return false; + } + case FIXED: + return (datum instanceof GenericFixed && ((GenericFixed) datum).bytes().length == schema.getFixedSize()) + || DECIMAL_CONVERSION.getConvertedType().isInstance(datum); + case STRING: + return isString(datum) + || UUID_CONVERSION.getConvertedType().isInstance(datum); + case BYTES: + return isBytes(datum) + || DECIMAL_CONVERSION.getConvertedType().isInstance(datum); + case INT: + return isInteger(datum) + || DATE_CONVERSION.getConvertedType().isInstance(datum); + case LONG: + return isLong(datum) + || TIME_MICROS_CONVERSION.getConvertedType().isInstance(datum) + || TIMESTAMP_MICROS_CONVERSION.getConvertedType().isInstance(datum); + case FLOAT: + return isFloat(datum); + case DOUBLE: + return isDouble(datum); + case BOOLEAN: + return isBoolean(datum); + case NULL: + return datum == null; + default: + return false; + } + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index 3b356a7924d0e..0fcd3a2c3d3da 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -18,19 +18,26 @@ package org.apache.hudi.avro; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.SchemaCompatibilityException; +import 
org.apache.avro.AvroRuntimeException; +import org.apache.avro.Conversions; import org.apache.avro.Conversions.DecimalConversion; import org.apache.avro.JsonProperties; import org.apache.avro.LogicalTypes; import org.apache.avro.LogicalTypes.Decimal; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; +import org.apache.avro.SchemaCompatibility; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; import org.apache.avro.generic.GenericDatumReader; @@ -45,43 +52,63 @@ import org.apache.avro.io.EncoderFactory; import org.apache.avro.io.JsonDecoder; import org.apache.avro.io.JsonEncoder; -import org.codehaus.jackson.node.NullNode; +import org.apache.avro.specific.SpecificRecordBase; + +import org.apache.hadoop.util.VersionUtil; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.math.BigDecimal; +import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.sql.Date; +import java.sql.Timestamp; import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZonedDateTime; import java.util.ArrayList; -import java.util.Arrays; +import java.util.Collection; import java.util.Collections; -import java.util.LinkedHashSet; +import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Deque; +import java.util.LinkedList; +import java.util.Set; +import java.util.TimeZone; import java.util.stream.Collectors; -import java.util.zip.DeflaterOutputStream; -import java.util.zip.InflaterInputStream; + +import static org.apache.avro.Schema.Type.UNION; +import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.isNullable; +import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.resolveUnionSchema; +import static org.apache.hudi.common.util.ValidationUtils.checkState; /** * Helper class to do common stuff across Avro. */ public class HoodieAvroUtils { - private static ThreadLocal reuseEncoder = ThreadLocal.withInitial(() -> null); + public static final String AVRO_VERSION = Schema.class.getPackage().getImplementationVersion(); + private static final ThreadLocal BINARY_ENCODER = ThreadLocal.withInitial(() -> null); + private static final ThreadLocal BINARY_DECODER = ThreadLocal.withInitial(() -> null); - private static ThreadLocal reuseDecoder = ThreadLocal.withInitial(() -> null); + private static final long MILLIS_PER_DAY = 86400000L; + + //Export for test + public static final Conversions.DecimalConversion DECIMAL_CONVERSION = new Conversions.DecimalConversion(); // As per https://avro.apache.org/docs/current/spec.html#names - private static String INVALID_AVRO_CHARS_IN_NAMES = "[^A-Za-z0-9_]"; - private static String INVALID_AVRO_FIRST_CHAR_IN_NAMES = "[^A-Za-z_]"; - private static String MASK_FOR_INVALID_CHARS_IN_NAMES = "__"; + private static final String INVALID_AVRO_CHARS_IN_NAMES = "[^A-Za-z0-9_]"; + private static final String INVALID_AVRO_FIRST_CHAR_IN_NAMES = "[^A-Za-z_]"; + private static final String MASK_FOR_INVALID_CHARS_IN_NAMES = "__"; // All metadata fields are optional strings. 
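For reference, the "optional string" shape used by the metadata columns below can be produced with the AvroSchemaUtils helpers introduced earlier in this patch; a minimal sketch:

    import org.apache.avro.Schema;
    import org.apache.hudi.avro.AvroSchemaUtils;

    // An optional string is the union ["null", "string"].
    Schema optionalString = AvroSchemaUtils.createNullableSchema(Schema.Type.STRING);
    // AvroSchemaUtils.isNullable(optionalString)                      -> true
    // AvroSchemaUtils.resolveNullableSchema(optionalString).getType() -> Schema.Type.STRING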
- public static final Schema METADATA_FIELD_SCHEMA = - Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING))); + public static final Schema METADATA_FIELD_SCHEMA = createNullableSchema(Schema.Type.STRING); public static final Schema RECORD_KEY_SCHEMA = initRecordKeySchema(); @@ -93,10 +120,10 @@ public static byte[] avroToBytes(GenericRecord record) { } public static byte[] indexedRecordToBytes(T record) { - GenericDatumWriter writer = new GenericDatumWriter<>(record.getSchema()); + GenericDatumWriter writer = new GenericDatumWriter<>(record.getSchema(), ConvertingGenericData.INSTANCE); try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { - BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, reuseEncoder.get()); - reuseEncoder.set(encoder); + BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, BINARY_ENCODER.get()); + BINARY_ENCODER.set(encoder); writer.write(record, encoder); encoder.flush(); return out.toByteArray(); @@ -131,8 +158,8 @@ public static GenericRecord bytesToAvro(byte[] bytes, Schema schema) throws IOEx * Convert serialized bytes back into avro record. */ public static GenericRecord bytesToAvro(byte[] bytes, Schema writerSchema, Schema readerSchema) throws IOException { - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(bytes, reuseDecoder.get()); - reuseDecoder.set(decoder); + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(bytes, BINARY_DECODER.get()); + BINARY_DECODER.set(decoder); GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema); return reader.read(null, decoder); } @@ -148,11 +175,7 @@ public static GenericRecord jsonBytesToAvro(byte[] bytes, Schema schema) throws } public static boolean isMetadataField(String fieldName) { - return HoodieRecord.COMMIT_TIME_METADATA_FIELD.equals(fieldName) - || HoodieRecord.COMMIT_SEQNO_METADATA_FIELD.equals(fieldName) - || HoodieRecord.RECORD_KEY_METADATA_FIELD.equals(fieldName) - || HoodieRecord.PARTITION_PATH_METADATA_FIELD.equals(fieldName) - || HoodieRecord.FILENAME_METADATA_FIELD.equals(fieldName); + return HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION.contains(fieldName); } public static Schema createHoodieWriteSchema(Schema originalSchema) { @@ -163,10 +186,26 @@ public static Schema createHoodieWriteSchema(String originalSchema) { return createHoodieWriteSchema(new Schema.Parser().parse(originalSchema)); } + public static Schema createHoodieWriteSchema(String originalSchema, boolean withOperationField) { + return addMetadataFields(new Schema.Parser().parse(originalSchema), withOperationField); + } + /** * Adds the Hoodie metadata fields to the given schema. + * + * @param schema The schema */ public static Schema addMetadataFields(Schema schema) { + return addMetadataFields(schema, false); + } + + /** + * Adds the Hoodie metadata fields to the given schema. 
+ * + * @param schema The schema + * @param withOperationField Whether to include the '_hoodie_operation' field + */ + public static Schema addMetadataFields(Schema schema, boolean withOperationField) { List parentFields = new ArrayList<>(); Schema.Field commitTimeField = @@ -185,6 +224,13 @@ public static Schema addMetadataFields(Schema schema) { parentFields.add(recordKeyField); parentFields.add(partitionPathField); parentFields.add(fileNameField); + + if (withOperationField) { + final Schema.Field operationField = + new Schema.Field(HoodieRecord.OPERATION_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); + parentFields.add(operationField); + } + for (Schema.Field field : schema.getFields()) { if (!isMetadataField(field.name())) { Schema.Field newField = new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()); @@ -201,11 +247,15 @@ public static Schema addMetadataFields(Schema schema) { } public static Schema removeMetadataFields(Schema schema) { + return removeFields(schema, HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION); + } + + public static Schema removeFields(Schema schema, Set fieldsToRemove) { List filteredFields = schema.getFields() - .stream() - .filter(field -> !HoodieRecord.HOODIE_META_COLUMNS.contains(field.name())) - .map(field -> new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal())) - .collect(Collectors.toList()); + .stream() + .filter(field -> !fieldsToRemove.contains(field.name())) + .map(field -> new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal())) + .collect(Collectors.toList()); Schema filteredSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); filteredSchema.setFields(filteredFields); return filteredSchema; @@ -217,7 +267,7 @@ public static String addMetadataColumnTypes(String hiveColumnTypes) { private static Schema initRecordKeySchema() { Schema.Field recordKeyField = - new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); Schema recordKeySchema = Schema.createRecord("HoodieRecordKey", "", "", false); recordKeySchema.setFields(Collections.singletonList(recordKeyField)); return recordKeySchema; @@ -235,9 +285,9 @@ public static Schema getRecordKeyPartitionPathSchema() { Schema recordSchema = Schema.createRecord("HoodieRecordKey", "", "", false); Schema.Field recordKeyField = - new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + new Schema.Field(HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); Schema.Field partitionPathField = - new Schema.Field(HoodieRecord.PARTITION_PATH_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + new Schema.Field(HoodieRecord.PARTITION_PATH_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); toBeAddedFields.add(recordKeyField); toBeAddedFields.add(partitionPathField); @@ -245,31 +295,33 @@ public static Schema getRecordKeyPartitionPathSchema() { return recordSchema; } + /** + * Fetch schema for record key and partition path. 
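A hedged sketch of how the metadata-field helpers above combine; the source schema is invented for illustration.

    import org.apache.avro.Schema;
    import org.apache.avro.SchemaBuilder;
    import org.apache.hudi.avro.HoodieAvroUtils;

    // Add the _hoodie_* metadata columns (including the optional '_hoodie_operation' column)
    // to a writer schema, then strip them again for a reader-facing schema.
    Schema source = SchemaBuilder.record("trip").fields().requiredString("uuid").endRecord();
    Schema writeSchema = HoodieAvroUtils.addMetadataFields(source, true);
    Schema readSchema = HoodieAvroUtils.removeMetadataFields(writeSchema); // back to just "uuid"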
+ */ + public static Schema getSchemaForFields(Schema fileSchema, List fields) { + List toBeAddedFields = new ArrayList<>(); + Schema recordSchema = Schema.createRecord("HoodieRecordKey", "", "", false); + + for (Schema.Field schemaField : fileSchema.getFields()) { + if (fields.contains(schemaField.name())) { + toBeAddedFields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), schemaField.defaultVal())); + } + } + recordSchema.setFields(toBeAddedFields); + return recordSchema; + } + public static GenericRecord addHoodieKeyToRecord(GenericRecord record, String recordKey, String partitionPath, - String fileName) { + String fileName) { record.put(HoodieRecord.FILENAME_METADATA_FIELD, fileName); record.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath); record.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey); return record; } - /** - * Add null fields to passed in schema. Caller is responsible for ensuring there is no duplicates. As different query - * engines have varying constraints regarding treating the case-sensitivity of fields, its best to let caller - * determine that. - * - * @param schema Passed in schema - * @param newFieldNames Null Field names to be added - */ - public static Schema appendNullSchemaFields(Schema schema, List newFieldNames) { - List newFields = schema.getFields().stream() - .map(field -> new Field(field.name(), field.schema(), field.doc(), field.defaultValue())).collect(Collectors.toList()); - for (String newField : newFieldNames) { - newFields.add(new Schema.Field(newField, METADATA_FIELD_SCHEMA, "", NullNode.getInstance())); - } - Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError()); - newSchema.setFields(newFields); - return newSchema; + public static GenericRecord addOperationToRecord(GenericRecord record, HoodieOperation operation) { + record.put(HoodieRecord.OPERATION_METADATA_FIELD, operation.getName()); + return record; } /** @@ -293,78 +345,111 @@ public static GenericRecord stitchRecords(GenericRecord left, GenericRecord righ } /** - * Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the old + * Given an Avro record with a given schema, rewrites it into the new schema while setting fields only from the new * schema. + * + * NOTE: This method is rewriting every record's field that is record itself recursively. It's + * caller's responsibility to make sure that no unnecessary re-writing occurs (by preemptively + * checking whether the record does require re-writing to adhere to the new schema) + * + * NOTE: Here, the assumption is that you cannot go from an evolved schema (schema with (N) fields) + * to an older schema (schema with (N-1) fields). All fields present in the older record schema MUST be present in the + * new schema and the default/existing values are carried over. + * + * This particular method does the following: + *
+ * <ol>
+ *   <li>Create a new empty GenericRecord with the new schema.</li>
+ *   <li>For GenericRecord, copy over the data from the old schema to the new schema or set default values for all
+ *   fields of this transformed schema</li>
+ *   <li>For SpecificRecord, hoodie_metadata_fields have a special treatment (see below)</li>
+ * </ol>
    + * + * For SpecificRecord we ignore Hudi Metadata fields, because for code generated + * avro classes (HoodieMetadataRecord), the avro record is a SpecificBaseRecord type instead of a GenericRecord. + * SpecificBaseRecord throws null pointer exception for record.get(name) if name is not present in the schema of the + * record (which happens when converting a SpecificBaseRecord without hoodie_metadata_fields to a new record with it). + * In this case, we do NOT set the defaults for the hoodie_metadata_fields explicitly, instead, the new record assumes + * the default defined in the avro schema itself. + * TODO: See if we can always pass GenericRecord instead of SpecificBaseRecord in some cases. */ - public static GenericRecord rewriteRecord(GenericRecord record, Schema newSchema) { - return rewrite(record, getCombinedFieldsToWrite(record.getSchema(), newSchema), newSchema); - } + public static GenericRecord rewriteRecord(GenericRecord oldRecord, Schema newSchema) { + GenericRecord newRecord = new GenericData.Record(newSchema); + boolean isSpecificRecord = oldRecord instanceof SpecificRecordBase; + for (Schema.Field f : newSchema.getFields()) { + if (!(isSpecificRecord && isMetadataField(f.name()))) { + copyOldValueOrSetDefault(oldRecord, newRecord, f); + } + } - /** - * Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new - * schema. - */ - public static GenericRecord rewriteRecordWithOnlyNewSchemaFields(GenericRecord record, Schema newSchema) { - return rewrite(record, new LinkedHashSet<>(newSchema.getFields()), newSchema); + if (!ConvertingGenericData.INSTANCE.validate(newSchema, newRecord)) { + throw new SchemaCompatibilityException( + "Unable to validate the rewritten record " + oldRecord + " against schema " + newSchema); + } + + return newRecord; } - private static GenericRecord rewrite(GenericRecord record, LinkedHashSet fieldsToWrite, Schema newSchema) { + public static GenericRecord rewriteRecordWithMetadata(GenericRecord genericRecord, Schema newSchema, String fileName) { GenericRecord newRecord = new GenericData.Record(newSchema); - for (Schema.Field f : fieldsToWrite) { - if (record.get(f.name()) == null) { - if (f.defaultVal() instanceof JsonProperties.Null) { - newRecord.put(f.name(), null); - } else { - newRecord.put(f.name(), f.defaultVal()); - } - } else { - newRecord.put(f.name(), record.get(f.name())); - } + for (Schema.Field f : newSchema.getFields()) { + copyOldValueOrSetDefault(genericRecord, newRecord, f); } + // do not preserve FILENAME_METADATA_FIELD + newRecord.put(HoodieRecord.FILENAME_META_FIELD_ORD, fileName); if (!GenericData.get().validate(newSchema, newRecord)) { throw new SchemaCompatibilityException( - "Unable to validate the rewritten record " + record + " against schema " + newSchema); + "Unable to validate the rewritten record " + genericRecord + " against schema " + newSchema); } return newRecord; } + // TODO Unify the logical of rewriteRecordWithMetadata and rewriteEvolutionRecordWithMetadata, and delete this function. + public static GenericRecord rewriteEvolutionRecordWithMetadata(GenericRecord genericRecord, Schema newSchema, String fileName) { + GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(genericRecord, newSchema, new HashMap<>()); + // do not preserve FILENAME_METADATA_FIELD + newRecord.put(HoodieRecord.FILENAME_META_FIELD_ORD, fileName); + return newRecord; + } + /** - * Generates a super set of fields from both old and new schema. 
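The rewrite behavior described above can be illustrated with a small sketch; the schemas and record here are invented for illustration.

    import org.apache.avro.Schema;
    import org.apache.avro.SchemaBuilder;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hudi.avro.HoodieAvroUtils;

    // Rewrite a record against an evolved schema that adds a nullable column "age":
    // existing values are carried over and the new column falls back to its default (null).
    Schema v1 = SchemaBuilder.record("User").fields().requiredString("name").endRecord();
    Schema v2 = SchemaBuilder.record("User").fields()
        .requiredString("name")
        .optionalLong("age")                 // union(null, long), default null
        .endRecord();

    GenericRecord r1 = new GenericData.Record(v1);
    r1.put("name", "alice");

    GenericRecord r2 = HoodieAvroUtils.rewriteRecord(r1, v2);  // r2.get("age") == null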
+ * Converts the given list of {@link GenericRecord}s into {@link GenericRecord}s adhering to the + * provided {@code newSchema}. + *

    + * To better understand conversion rules please check {@link #rewriteRecord(GenericRecord, Schema)} */ - private static LinkedHashSet getCombinedFieldsToWrite(Schema oldSchema, Schema newSchema) { - LinkedHashSet allFields = new LinkedHashSet<>(oldSchema.getFields()); - for (Schema.Field f : newSchema.getFields()) { - if (!allFields.contains(f) && !isMetadataField(f.name())) { - allFields.add(f); - } - } - return allFields; + public static List rewriteRecords(List records, Schema newSchema) { + return records.stream().map(r -> rewriteRecord(r, newSchema)).collect(Collectors.toList()); } - public static byte[] compress(String text) { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try { - OutputStream out = new DeflaterOutputStream(baos); - out.write(text.getBytes(StandardCharsets.UTF_8)); - out.close(); - } catch (IOException e) { - throw new HoodieIOException("IOException while compressing text " + text, e); - } - return baos.toByteArray(); + /** + * Given an Avro record and list of columns to remove, this method removes the list of columns from + * the given avro record using rewriteRecord method. + *

    + * To better understand how it removes please check {@link #rewriteRecord(GenericRecord, Schema)} + */ + public static GenericRecord removeFields(GenericRecord record, Set fieldsToRemove) { + Schema newSchema = removeFields(record.getSchema(), fieldsToRemove); + return rewriteRecord(record, newSchema); } - public static String decompress(byte[] bytes) { - InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes)); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try { - byte[] buffer = new byte[8192]; - int len; - while ((len = in.read(buffer)) > 0) { - baos.write(buffer, 0, len); + private static void copyOldValueOrSetDefault(GenericRecord oldRecord, GenericRecord newRecord, Schema.Field field) { + Schema oldSchema = oldRecord.getSchema(); + Object fieldValue = oldSchema.getField(field.name()) == null ? null : oldRecord.get(field.name()); + + if (fieldValue != null) { + // In case field's value is a nested record, we have to rewrite it as well + Object newFieldValue; + if (fieldValue instanceof GenericRecord) { + GenericRecord record = (GenericRecord) fieldValue; + newFieldValue = rewriteRecord(record, resolveUnionSchema(field.schema(), record.getSchema().getFullName())); + } else { + newFieldValue = fieldValue; } - return new String(baos.toByteArray(), StandardCharsets.UTF_8); - } catch (IOException e) { - throw new HoodieIOException("IOException while decompressing text", e); + newRecord.put(field.name(), newFieldValue); + } else if (field.defaultVal() instanceof JsonProperties.Null) { + newRecord.put(field.name(), null); + } else { + newRecord.put(field.name(), field.defaultVal()); } } @@ -381,7 +466,7 @@ public static Schema generateProjectionSchema(Schema originalSchema, List(schemaFieldsMap.keySet())); } else { - projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue())); + projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal())); } } @@ -391,88 +476,209 @@ public static Schema generateProjectionSchema(Schema originalSchema, List getNullableValAsString(GenericRecord rec, String fieldName) { + Schema.Field field = rec.getSchema().getField(fieldName); + String fieldVal = field == null ? null : StringUtils.objToString(rec.get(field.pos())); + return Option.ofNullable(fieldVal); + } + /** * This method converts values for fields with certain Avro/Parquet data types that require special handling. * * @param fieldSchema avro field schema - * @param fieldValue avro field value + * @param fieldValue avro field value * @return field value either converted (for certain data types) or as it is. 
*/ - private static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object fieldValue) { + public static Object convertValueForSpecificDataTypes(Schema fieldSchema, + Object fieldValue, + boolean consistentLogicalTimestampEnabled) { if (fieldSchema == null) { return fieldValue; + } else if (fieldValue == null) { + checkState(isNullable(fieldSchema)); + return null; } - if (fieldSchema.getType() == Schema.Type.UNION) { - for (Schema schema : fieldSchema.getTypes()) { - if (schema.getType() != Schema.Type.NULL) { - return convertValueForAvroLogicalTypes(schema, fieldValue); - } - } - } - return convertValueForAvroLogicalTypes(fieldSchema, fieldValue); + return convertValueForAvroLogicalTypes(resolveNullableSchema(fieldSchema), fieldValue, consistentLogicalTimestampEnabled); } /** * This method converts values for fields with certain Avro Logical data types that require special handling. - * + *

    * Logical Date Type is converted to actual Date value instead of Epoch Integer which is how it is * represented/stored in parquet. - * + *

    * Decimal Data Type is converted to actual decimal value instead of bytes/fixed which is how it is * represented/stored in parquet. * * @param fieldSchema avro field schema - * @param fieldValue avro field value + * @param fieldValue avro field value * @return field value either converted (for certain data types) or as it is. */ - private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object fieldValue) { + private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled) { if (fieldSchema.getLogicalType() == LogicalTypes.date()) { return LocalDate.ofEpochDay(Long.parseLong(fieldValue.toString())); + } else if (fieldSchema.getLogicalType() == LogicalTypes.timestampMillis() && consistentLogicalTimestampEnabled) { + return new Timestamp(Long.parseLong(fieldValue.toString())); + } else if (fieldSchema.getLogicalType() == LogicalTypes.timestampMicros() && consistentLogicalTimestampEnabled) { + return new Timestamp(Long.parseLong(fieldValue.toString()) / 1000); } else if (fieldSchema.getLogicalType() instanceof LogicalTypes.Decimal) { Decimal dc = (Decimal) fieldSchema.getLogicalType(); DecimalConversion decimalConversion = new DecimalConversion(); @@ -480,8 +686,11 @@ private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object return decimalConversion.fromFixed((GenericFixed) fieldValue, fieldSchema, LogicalTypes.decimal(dc.getPrecision(), dc.getScale())); } else if (fieldSchema.getType() == Schema.Type.BYTES) { - return decimalConversion.fromBytes((ByteBuffer) fieldValue, fieldSchema, + ByteBuffer byteBuffer = (ByteBuffer) fieldValue; + BigDecimal convertedValue = decimalConversion.fromBytes(byteBuffer, fieldSchema, LogicalTypes.decimal(dc.getPrecision(), dc.getScale())); + byteBuffer.rewind(); + return convertedValue; } } return fieldValue; @@ -494,13 +703,381 @@ public static Schema getNullSchema() { /** * Sanitizes Name according to Avro rule for names. * Removes characters other than the ones mentioned in https://avro.apache.org/docs/current/spec.html#names . + * * @param name input name * @return sanitized name */ public static String sanitizeName(String name) { - if (name.substring(0,1).matches(INVALID_AVRO_FIRST_CHAR_IN_NAMES)) { + if (name.substring(0, 1).matches(INVALID_AVRO_FIRST_CHAR_IN_NAMES)) { name = name.replaceFirst(INVALID_AVRO_FIRST_CHAR_IN_NAMES, MASK_FOR_INVALID_CHARS_IN_NAMES); } return name.replaceAll(INVALID_AVRO_CHARS_IN_NAMES, MASK_FOR_INVALID_CHARS_IN_NAMES); } + + /** + * Gets record column values into one object. + * + * @param record Hoodie record. + * @param columns Names of the columns to get values. + * @param schema {@link Schema} instance. + * @return Column value if a single column, or concatenated String values by comma. 
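Two brief sketches of the helpers above (logical-type conversion and Avro name sanitizing); the input values are illustrative.

    import org.apache.avro.LogicalTypes;
    import org.apache.avro.Schema;
    import org.apache.hudi.avro.HoodieAvroUtils;

    // A DATE logical-type value arrives as an epoch-day int and is converted back to a LocalDate.
    Schema dateSchema = LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT));
    Object asDate = HoodieAvroUtils.convertValueForSpecificDataTypes(dateSchema, 18628, false); // LocalDate 2021-01-01

    // sanitizeName masks characters that are not legal in Avro names.
    String sanitized = HoodieAvroUtils.sanitizeName("1st-col"); // "__st__col"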
+ */ + public static Object getRecordColumnValues(HoodieRecord record, + String[] columns, + Schema schema, boolean consistentLogicalTimestampEnabled) { + try { + GenericRecord genericRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); + if (columns.length == 1) { + return HoodieAvroUtils.getNestedFieldVal(genericRecord, columns[0], true, consistentLogicalTimestampEnabled); + } else { + // TODO this is inefficient, instead we can simply return array of Comparable + StringBuilder sb = new StringBuilder(); + for (String col : columns) { + sb.append(HoodieAvroUtils.getNestedFieldValAsString(genericRecord, col, true, consistentLogicalTimestampEnabled)); + } + + return sb.toString(); + } + } catch (IOException e) { + throw new HoodieIOException("Unable to read record with key:" + record.getKey(), e); + } + } + + /** + * Gets record column values into one object. + * + * @param record Hoodie record. + * @param columns Names of the columns to get values. + * @param schema {@link SerializableSchema} instance. + * @return Column value if a single column, or concatenated String values by comma. + */ + public static Object getRecordColumnValues(HoodieRecord record, + String[] columns, + SerializableSchema schema, boolean consistentLogicalTimestampEnabled) { + return getRecordColumnValues(record, columns, schema.get(), consistentLogicalTimestampEnabled); + } + + /** + * Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new schema. + * support deep rewrite for nested record. + * This particular method does the following things : + * a) Create a new empty GenericRecord with the new schema. + * b) For GenericRecord, copy over the data from the old schema to the new schema or set default values for all fields of this transformed schema + * + * @param oldRecord oldRecord to be rewritten + * @param newSchema newSchema used to rewrite oldRecord + * @param renameCols a map store all rename cols, (k, v)-> (colNameFromNewSchema, colNameFromOldSchema) + * @return newRecord for new Schema + */ + public static GenericRecord rewriteRecordWithNewSchema(IndexedRecord oldRecord, Schema newSchema, Map renameCols) { + Object newRecord = rewriteRecordWithNewSchema(oldRecord, oldRecord.getSchema(), newSchema, renameCols, new LinkedList<>()); + return (GenericData.Record) newRecord; + } + + /** + * Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new schema. + * support deep rewrite for nested record and adjust rename operation. + * This particular method does the following things : + * a) Create a new empty GenericRecord with the new schema. + * b) For GenericRecord, copy over the data from the old schema to the new schema or set default values for all fields of this transformed schema + * + * @param oldRecord oldRecord to be rewritten + * @param oldAvroSchema old avro schema. + * @param newSchema newSchema used to rewrite oldRecord + * @param renameCols a map store all rename cols, (k, v)-> (colNameFromNewSchema, colNameFromOldSchema) + * @param fieldNames track the full name of visited field when we travel new schema. 
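A hedged usage sketch of the renameCols mapping described above: per the lookup logic, keys are dot-separated full names in the new schema and values are field names as they appear in the old parent record; oldRecord and newSchema are assumed to exist.

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hudi.avro.HoodieAvroUtils;

    Map<String, String> renameCols = new HashMap<>();
    renameCols.put("name", "user_name");        // top-level rename: new name -> old name
    renameCols.put("address.zip", "zip_code");  // nested rename, keyed by the new schema's full name
    GenericRecord rewritten = HoodieAvroUtils.rewriteRecordWithNewSchema(oldRecord, newSchema, renameCols);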
+ * @return newRecord for new Schema + */ + private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldAvroSchema, Schema newSchema, Map renameCols, Deque fieldNames) { + if (oldRecord == null) { + return null; + } + // try to get real schema for union type + Schema oldSchema = getActualSchemaFromUnion(oldAvroSchema, oldRecord); + switch (newSchema.getType()) { + case RECORD: + if (!(oldRecord instanceof IndexedRecord)) { + throw new IllegalArgumentException("cannot rewrite record with different type"); + } + IndexedRecord indexedRecord = (IndexedRecord) oldRecord; + List fields = newSchema.getFields(); + GenericData.Record newRecord = new GenericData.Record(newSchema); + for (int i = 0; i < fields.size(); i++) { + Schema.Field field = fields.get(i); + String fieldName = field.name(); + fieldNames.push(fieldName); + if (oldSchema.getField(field.name()) != null && !renameCols.containsKey(field.name())) { + Schema.Field oldField = oldSchema.getField(field.name()); + newRecord.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames)); + } else { + String fieldFullName = createFullName(fieldNames); + String fieldNameFromOldSchema = renameCols.getOrDefault(fieldFullName, ""); + // deal with rename + if (oldSchema.getField(fieldNameFromOldSchema) != null) { + // find rename + Schema.Field oldField = oldSchema.getField(fieldNameFromOldSchema); + newRecord.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames)); + } else { + // deal with default value + if (fields.get(i).defaultVal() instanceof JsonProperties.Null) { + newRecord.put(i, null); + } else { + newRecord.put(i, fields.get(i).defaultVal()); + } + } + } + fieldNames.pop(); + } + return newRecord; + case ARRAY: + if (!(oldRecord instanceof Collection)) { + throw new IllegalArgumentException("cannot rewrite record with different type"); + } + Collection array = (Collection)oldRecord; + List newArray = new ArrayList(); + fieldNames.push("element"); + for (Object element : array) { + newArray.add(rewriteRecordWithNewSchema(element, oldSchema.getElementType(), newSchema.getElementType(), renameCols, fieldNames)); + } + fieldNames.pop(); + return newArray; + case MAP: + if (!(oldRecord instanceof Map)) { + throw new IllegalArgumentException("cannot rewrite record with different type"); + } + Map map = (Map) oldRecord; + Map newMap = new HashMap<>(); + fieldNames.push("value"); + for (Map.Entry entry : map.entrySet()) { + newMap.put(entry.getKey(), rewriteRecordWithNewSchema(entry.getValue(), oldSchema.getValueType(), newSchema.getValueType(), renameCols, fieldNames)); + } + fieldNames.pop(); + return newMap; + case UNION: + return rewriteRecordWithNewSchema(oldRecord, getActualSchemaFromUnion(oldSchema, oldRecord), getActualSchemaFromUnion(newSchema, oldRecord), renameCols, fieldNames); + default: + return rewritePrimaryType(oldRecord, oldSchema, newSchema); + } + } + + private static String createFullName(Deque fieldNames) { + String result = ""; + if (!fieldNames.isEmpty()) { + List parentNames = new ArrayList<>(); + fieldNames.descendingIterator().forEachRemaining(parentNames::add); + result = parentNames.stream().collect(Collectors.joining(".")); + } + return result; + } + + private static Object rewritePrimaryType(Object oldValue, Schema oldSchema, Schema newSchema) { + Schema realOldSchema = oldSchema; + if (realOldSchema.getType() == UNION) { + realOldSchema = 
getActualSchemaFromUnion(oldSchema, oldValue); + } + if (realOldSchema.getType() == newSchema.getType()) { + switch (realOldSchema.getType()) { + case NULL: + case BOOLEAN: + case INT: + case LONG: + case FLOAT: + case DOUBLE: + case BYTES: + case STRING: + return oldValue; + case FIXED: + // fixed size and name must match: + if (!SchemaCompatibility.schemaNameEquals(realOldSchema, newSchema) || realOldSchema.getFixedSize() != newSchema.getFixedSize()) { + // deal with the precision change for decimalType + if (realOldSchema.getLogicalType() instanceof LogicalTypes.Decimal) { + final byte[] bytes; + bytes = ((GenericFixed) oldValue).bytes(); + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) realOldSchema.getLogicalType(); + BigDecimal bd = new BigDecimal(new BigInteger(bytes), decimal.getScale()).setScale(((LogicalTypes.Decimal) newSchema.getLogicalType()).getScale()); + return DECIMAL_CONVERSION.toFixed(bd, newSchema, newSchema.getLogicalType()); + } + } else { + return oldValue; + } + return oldValue; + default: + throw new AvroRuntimeException("Unknown schema type: " + newSchema.getType()); + } + } else { + return rewritePrimaryTypeWithDiffSchemaType(oldValue, realOldSchema, newSchema); + } + } + + private static Object rewritePrimaryTypeWithDiffSchemaType(Object oldValue, Schema oldSchema, Schema newSchema) { + switch (newSchema.getType()) { + case NULL: + case BOOLEAN: + break; + case INT: + if (newSchema.getLogicalType() == LogicalTypes.date() && oldSchema.getType() == Schema.Type.STRING) { + return fromJavaDate(java.sql.Date.valueOf(oldValue.toString())); + } + break; + case LONG: + if (oldSchema.getType() == Schema.Type.INT) { + return ((Integer) oldValue).longValue(); + } + break; + case FLOAT: + if ((oldSchema.getType() == Schema.Type.INT) + || (oldSchema.getType() == Schema.Type.LONG)) { + return oldSchema.getType() == Schema.Type.INT ? 
((Integer) oldValue).floatValue() : ((Long) oldValue).floatValue(); + } + break; + case DOUBLE: + if (oldSchema.getType() == Schema.Type.FLOAT) { + // java float cannot convert to double directly, deal with float precision change + return Double.valueOf(oldValue + ""); + } else if (oldSchema.getType() == Schema.Type.INT) { + return ((Integer) oldValue).doubleValue(); + } else if (oldSchema.getType() == Schema.Type.LONG) { + return ((Long) oldValue).doubleValue(); + } + break; + case BYTES: + if (oldSchema.getType() == Schema.Type.STRING) { + return (oldValue.toString()).getBytes(StandardCharsets.UTF_8); + } + break; + case STRING: + if (oldSchema.getType() == Schema.Type.BYTES) { + return String.valueOf(((byte[]) oldValue)); + } + if (oldSchema.getLogicalType() == LogicalTypes.date()) { + return toJavaDate((Integer) oldValue).toString(); + } + if (oldSchema.getType() == Schema.Type.INT + || oldSchema.getType() == Schema.Type.LONG + || oldSchema.getType() == Schema.Type.FLOAT + || oldSchema.getType() == Schema.Type.DOUBLE) { + return oldValue.toString(); + } + if (oldSchema.getType() == Schema.Type.FIXED && oldSchema.getLogicalType() instanceof LogicalTypes.Decimal) { + final byte[] bytes; + bytes = ((GenericFixed) oldValue).bytes(); + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) oldSchema.getLogicalType(); + BigDecimal bd = new BigDecimal(new BigInteger(bytes), decimal.getScale()); + return bd.toString(); + } + break; + case FIXED: + // deal with decimal Type + if (newSchema.getLogicalType() instanceof LogicalTypes.Decimal) { + // TODO: support more types + if (oldSchema.getType() == Schema.Type.STRING + || oldSchema.getType() == Schema.Type.DOUBLE + || oldSchema.getType() == Schema.Type.INT + || oldSchema.getType() == Schema.Type.LONG + || oldSchema.getType() == Schema.Type.FLOAT) { + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) newSchema.getLogicalType(); + BigDecimal bigDecimal = null; + if (oldSchema.getType() == Schema.Type.STRING) { + bigDecimal = new java.math.BigDecimal(oldValue.toString()) + .setScale(decimal.getScale()); + } else { + // Due to Java, there will be precision problems in direct conversion, we should use string instead of use double + bigDecimal = new java.math.BigDecimal(oldValue.toString()) + .setScale(decimal.getScale()); + } + return DECIMAL_CONVERSION.toFixed(bigDecimal, newSchema, newSchema.getLogicalType()); + } + } + break; + default: + } + throw new AvroRuntimeException(String.format("cannot support rewrite value for schema type: %s since the old schema type is: %s", newSchema, oldSchema)); + } + + /** + * convert days to Date + * + * NOTE: This method could only be used in tests + * + * @VisibleForTesting + */ + public static java.sql.Date toJavaDate(int days) { + LocalDate date = LocalDate.ofEpochDay(days); + ZoneId defaultZoneId = ZoneId.systemDefault(); + ZonedDateTime zonedDateTime = date.atStartOfDay(defaultZoneId); + return new java.sql.Date(zonedDateTime.toInstant().toEpochMilli()); + } + + /** + * convert Date to days + * + * NOTE: This method could only be used in tests + * + * @VisibleForTesting + */ + public static int fromJavaDate(Date date) { + long millisUtc = date.getTime(); + long millisLocal = millisUtc + TimeZone.getDefault().getOffset(millisUtc); + int julianDays = Math.toIntExact(Math.floorDiv(millisLocal, MILLIS_PER_DAY)); + return julianDays; + } + + private static Schema getActualSchemaFromUnion(Schema schema, Object data) { + Schema actualSchema; + if (!schema.getType().equals(UNION)) { + return schema; + } + if 
(schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + } else { + // deal complex union. this should not happened in hoodie, + // since flink/spark do not write this type. + int i = GenericData.get().resolveUnion(schema, data); + actualSchema = schema.getTypes().get(i); + } + return actualSchema; + } + + /** + * Given avro records, rewrites them with new schema. + * + * @param oldRecords oldRecords to be rewrite + * @param newSchema newSchema used to rewrite oldRecord + * @param renameCols a map store all rename cols, (k, v)-> (colNameFromNewSchema, colNameFromOldSchema) + * @return a iterator of rewrote GeneriRcords + */ + public static Iterator rewriteRecordWithNewSchema(Iterator oldRecords, Schema newSchema, Map renameCols) { + if (oldRecords == null || newSchema == null) { + return Collections.emptyIterator(); + } + return new Iterator() { + @Override + public boolean hasNext() { + return oldRecords.hasNext(); + } + + @Override + public GenericRecord next() { + return rewriteRecordWithNewSchema(oldRecords.next(), newSchema, renameCols); + } + }; + } + + public static GenericRecord rewriteRecordDeep(GenericRecord oldRecord, Schema newSchema) { + return rewriteRecordWithNewSchema(oldRecord, newSchema, Collections.EMPTY_MAP); + } + + public static boolean gteqAvro1_9() { + return VersionUtil.compareVersions(AVRO_VERSION, "1.9") >= 0; + } + + public static boolean gteqAvro1_10() { + return VersionUtil.compareVersions(AVRO_VERSION, "1.10") >= 0; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java index cb96e6f3f09f1..e87364fb90970 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java @@ -18,64 +18,63 @@ package org.apache.hudi.avro; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; - import org.apache.avro.Schema; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; import org.apache.parquet.avro.AvroWriteSupport; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.schema.MessageType; +import java.nio.charset.StandardCharsets; +import java.util.Collections; import java.util.HashMap; +import java.util.Map; /** * Wrap AvroWriterSupport for plugging in the bloom filter. 
*/ public class HoodieAvroWriteSupport extends AvroWriteSupport { - private BloomFilter bloomFilter; - private String minRecordKey; - private String maxRecordKey; + private final Option> bloomFilterWriteSupportOpt; + private final Map footerMetadata = new HashMap<>(); public static final String OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY = "com.uber.hoodie.bloomfilter"; public static final String HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY = "org.apache.hudi.bloomfilter"; - public static final String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key"; - public static final String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key"; - public static final String HOODIE_BLOOM_FILTER_TYPE_CODE = "hoodie_bloom_filter_type_code"; - public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, BloomFilter bloomFilter) { - super(schema, avroSchema); - this.bloomFilter = bloomFilter; + public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, Option bloomFilterOpt) { + super(schema, avroSchema, ConvertingGenericData.INSTANCE); + this.bloomFilterWriteSupportOpt = bloomFilterOpt.map(HoodieBloomFilterAvroWriteSupport::new); } @Override public WriteSupport.FinalizedWriteContext finalizeWrite() { - HashMap extraMetaData = new HashMap<>(); - if (bloomFilter != null) { - extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); - if (minRecordKey != null && maxRecordKey != null) { - extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey); - extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey); - } - if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) { - extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name()); - } - } - return new WriteSupport.FinalizedWriteContext(extraMetaData); + Map extraMetadata = + CollectionUtils.combine(footerMetadata, + bloomFilterWriteSupportOpt.map(HoodieBloomFilterWriteSupport::finalizeMetadata) + .orElse(Collections.emptyMap()) + ); + + return new WriteSupport.FinalizedWriteContext(extraMetadata); } public void add(String recordKey) { - this.bloomFilter.add(recordKey); - if (minRecordKey != null) { - minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey; - } else { - minRecordKey = recordKey; + this.bloomFilterWriteSupportOpt.ifPresent(bloomFilterWriteSupport -> + bloomFilterWriteSupport.addKey(recordKey)); + } + + public void addFooterMetadata(String key, String value) { + footerMetadata.put(key, value); + } + + private static class HoodieBloomFilterAvroWriteSupport extends HoodieBloomFilterWriteSupport { + public HoodieBloomFilterAvroWriteSupport(BloomFilter bloomFilter) { + super(bloomFilter); } - if (maxRecordKey != null) { - maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey; - } else { - maxRecordKey = recordKey; + @Override + protected byte[] getUTF8Bytes(String key) { + return key.getBytes(StandardCharsets.UTF_8); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieBloomFilterWriteSupport.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieBloomFilterWriteSupport.java new file mode 100644 index 0000000000000..1a689791ba3fd --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieBloomFilterWriteSupport.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.avro; + +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; + +import java.util.HashMap; +import java.util.Map; + +import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY; + +/** + * This is write-support utility base-class taking up handling of + * + *
+ * <ul>
+ *   <li>Adding record keys to the Bloom Filter</li>
+ *   <li>Keeping track of min/max record key (w/in single file)</li>
+ * </ul>
    + * + * @param record-key type being ingested by this clas + */ +public abstract class HoodieBloomFilterWriteSupport> { + + public static final String HOODIE_MIN_RECORD_KEY_FOOTER = "hoodie_min_record_key"; + public static final String HOODIE_MAX_RECORD_KEY_FOOTER = "hoodie_max_record_key"; + public static final String HOODIE_BLOOM_FILTER_TYPE_CODE = "hoodie_bloom_filter_type_code"; + + private final BloomFilter bloomFilter; + + private T minRecordKey; + private T maxRecordKey; + + public HoodieBloomFilterWriteSupport(BloomFilter bloomFilter) { + this.bloomFilter = bloomFilter; + } + + public void addKey(T recordKey) { + bloomFilter.add(getUTF8Bytes(recordKey)); + + if (minRecordKey == null || minRecordKey.compareTo(recordKey) > 0) { + minRecordKey = dereference(recordKey); + } + + if (maxRecordKey == null || maxRecordKey.compareTo(recordKey) < 0) { + maxRecordKey = dereference(recordKey); + } + } + + public Map finalizeMetadata() { + HashMap extraMetadata = new HashMap<>(); + + extraMetadata.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); + if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) { + extraMetadata.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name()); + } + + if (minRecordKey != null && maxRecordKey != null) { + extraMetadata.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey.toString()); + extraMetadata.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey.toString()); + } + + return extraMetadata; + } + + /** + * Since Bloom Filter ingests record-keys represented as UTF8 encoded byte string, + * this method have to be implemented for converting the original record key into one + */ + protected abstract byte[] getUTF8Bytes(T key); + + /** + * This method allows to dereference the key object (t/h cloning, for ex) that might be + * pointing at a shared mutable buffer, to make sure that we're not keeping references + * to mutable objects + */ + protected T dereference(T key) { + return key; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java index d759a8debf602..15335193414ae 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java @@ -18,15 +18,14 @@ package org.apache.hudi.avro; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; - import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.avro.Schema; import org.apache.avro.Schema.Type; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; import java.io.IOException; import java.io.Serializable; @@ -293,7 +292,7 @@ public Pair convert(Object value, String name, Schema schema) { for (Object v : (List) value) { listRes.add(convertJsonToAvroField(v, name, elementSchema)); } - return Pair.of(true, listRes); + return Pair.of(true, new GenericData.Array<>(schema, listRes)); } }; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java index e9de502f78bbf..0913a7440f020 100644 --- 
a/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieCleanStat.java @@ -47,28 +47,36 @@ public class HoodieCleanStat implements Serializable { private final List failedDeleteBootstrapBaseFiles; // Earliest commit that was retained in this clean private final String earliestCommitToRetain; + // Last completed commit timestamp before clean + private final String lastCompletedCommitTimestamp; + // set to true if partition is deleted + private final boolean isPartitionDeleted; public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, List deletePathPatterns, - List successDeleteFiles, List failedDeleteFiles, String earliestCommitToRetain) { + List successDeleteFiles, List failedDeleteFiles, String earliestCommitToRetain,String lastCompletedCommitTimestamp) { this(policy, partitionPath, deletePathPatterns, successDeleteFiles, failedDeleteFiles, earliestCommitToRetain, - CollectionUtils.createImmutableList(), CollectionUtils.createImmutableList(), - CollectionUtils.createImmutableList()); + lastCompletedCommitTimestamp, CollectionUtils.createImmutableList(), CollectionUtils.createImmutableList(), + CollectionUtils.createImmutableList(), false); } public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, List deletePathPatterns, List successDeleteFiles, List failedDeleteFiles, - String earliestCommitToRetain, List deleteBootstrapBasePathPatterns, + String earliestCommitToRetain,String lastCompletedCommitTimestamp, + List deleteBootstrapBasePathPatterns, List successDeleteBootstrapBaseFiles, - List failedDeleteBootstrapBaseFiles) { + List failedDeleteBootstrapBaseFiles, + boolean isPartitionDeleted) { this.policy = policy; this.partitionPath = partitionPath; this.deletePathPatterns = deletePathPatterns; this.successDeleteFiles = successDeleteFiles; this.failedDeleteFiles = failedDeleteFiles; this.earliestCommitToRetain = earliestCommitToRetain; + this.lastCompletedCommitTimestamp = lastCompletedCommitTimestamp; this.deleteBootstrapBasePathPatterns = deleteBootstrapBasePathPatterns; this.successDeleteBootstrapBaseFiles = successDeleteBootstrapBaseFiles; this.failedDeleteBootstrapBaseFiles = failedDeleteBootstrapBaseFiles; + this.isPartitionDeleted = isPartitionDeleted; } public HoodieCleaningPolicy getPolicy() { @@ -107,7 +115,15 @@ public String getEarliestCommitToRetain() { return earliestCommitToRetain; } - public static HoodieCleanStat.Builder newBuilder() { + public String getLastCompletedCommitTimestamp() { + return lastCompletedCommitTimestamp; + } + + public boolean isPartitionDeleted() { + return isPartitionDeleted; + } + + public static Builder newBuilder() { return new Builder(); } @@ -122,9 +138,11 @@ public static class Builder { private List failedDeleteFiles; private String partitionPath; private String earliestCommitToRetain; + private String lastCompletedCommitTimestamp; private List deleteBootstrapBasePathPatterns; private List successDeleteBootstrapBaseFiles; private List failedDeleteBootstrapBaseFiles; + private boolean isPartitionDeleted; public Builder withPolicy(HoodieCleaningPolicy policy) { this.policy = policy; @@ -172,10 +190,20 @@ public Builder withEarliestCommitRetained(Option earliestCommitTo return this; } + public Builder withLastCompletedCommitTimestamp(String lastCompletedCommitTimestamp) { + this.lastCompletedCommitTimestamp = lastCompletedCommitTimestamp; + return this; + } + + public Builder isPartitionDeleted(boolean isPartitionDeleted) { + 
this.isPartitionDeleted = isPartitionDeleted; + return this; + } + public HoodieCleanStat build() { return new HoodieCleanStat(policy, partitionPath, deletePathPatterns, successDeleteFiles, failedDeleteFiles, - earliestCommitToRetain, deleteBootstrapBasePathPatterns, successDeleteBootstrapBaseFiles, - failedDeleteBootstrapBaseFiles); + earliestCommitToRetain, lastCompletedCommitTimestamp, deleteBootstrapBasePathPatterns, + successDeleteBootstrapBaseFiles, failedDeleteBootstrapBaseFiles, isPartitionDeleted); } } @@ -190,7 +218,8 @@ public String toString() { + ", earliestCommitToRetain='" + earliestCommitToRetain + ", deleteBootstrapBasePathPatterns=" + deleteBootstrapBasePathPatterns + ", successDeleteBootstrapBaseFiles=" + successDeleteBootstrapBaseFiles - + ", failedDeleteBootstrapBaseFiles=" + failedDeleteBootstrapBaseFiles + '\'' + + ", failedDeleteBootstrapBaseFiles=" + failedDeleteBootstrapBaseFiles + + ", isPartitionDeleted=" + isPartitionDeleted + '\'' + '}'; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java index 1c15c66410e50..10869fc56828e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java @@ -50,7 +50,7 @@ public HoodieJsonPayload(String json) throws IOException { } @Override - public HoodieJsonPayload preCombine(HoodieJsonPayload another) { + public HoodieJsonPayload preCombine(HoodieJsonPayload oldValue) { return this; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodiePendingRollbackInfo.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodiePendingRollbackInfo.java new file mode 100644 index 0000000000000..c53babf350102 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodiePendingRollbackInfo.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common; + +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.common.table.timeline.HoodieInstant; + +/** + * Holds rollback instant and rollback plan for a pending rollback. 
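// A hedged usage sketch of the extended HoodieCleanStat.Builder, limited to setters visible in this
// change. HoodieCleaningPolicy.KEEP_LATEST_COMMITS and the pre-existing setters (partition path,
// delete patterns, ...) are assumed unchanged; the timestamp value is hypothetical.
HoodieCleanStat stat = HoodieCleanStat.newBuilder()
    .withPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
    .withLastCompletedCommitTimestamp("20220101000000") // last completed commit before this clean ran
    .isPartitionDeleted(false)                          // the partition itself was not dropped
    .build();
String lastCompleted = stat.getLastCompletedCommitTimestamp();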
+ */ +public class HoodiePendingRollbackInfo { + + private final HoodieInstant rollbackInstant; + private final HoodieRollbackPlan rollbackPlan; + + public HoodiePendingRollbackInfo(HoodieInstant rollbackInstant, HoodieRollbackPlan rollbackPlan) { + this.rollbackInstant = rollbackInstant; + this.rollbackPlan = rollbackPlan; + } + + public HoodieInstant getRollbackInstant() { + return rollbackInstant; + } + + public HoodieRollbackPlan getRollbackPlan() { + return rollbackPlan; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bloom/BloomFilter.java b/hudi-common/src/main/java/org/apache/hudi/common/bloom/BloomFilter.java index 7997da159b7f9..fbc46827dee68 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/BloomFilter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/BloomFilter.java @@ -24,12 +24,19 @@ public interface BloomFilter { /** - * Add a key to the {@link BloomFilter}. + * Add a key represented by a {@link String} to the {@link BloomFilter}. * * @param key the key to the added to the {@link BloomFilter} */ void add(String key); + /** + * Add a key's bytes, representing UTF8-encoded string, to the {@link BloomFilter}. + * + * @param key the key bytes to the added to the {@link BloomFilter} + */ + void add(byte[] key); + /** * Tests for key membership. * diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java b/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java index 343822b13adec..32093fc9c511f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java @@ -63,7 +63,7 @@ public class HoodieDynamicBoundedBloomFilter implements BloomFilter { * @param serString the serialized string which represents the {@link HoodieDynamicBoundedBloomFilter} * @param typeCode type code of the bloom filter */ - HoodieDynamicBoundedBloomFilter(String serString, BloomFilterTypeCode typeCode) { + public HoodieDynamicBoundedBloomFilter(String serString, BloomFilterTypeCode typeCode) { // ignoring the type code for now, since we have just one version byte[] bytes = Base64CodecUtil.decode(serString); DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bytes)); @@ -78,7 +78,12 @@ public class HoodieDynamicBoundedBloomFilter implements BloomFilter { @Override public void add(String key) { - internalDynamicBloomFilter.add(new Key(key.getBytes(StandardCharsets.UTF_8))); + add(key.getBytes(StandardCharsets.UTF_8)); + } + + @Override + public void add(byte[] keyBytes) { + internalDynamicBloomFilter.add(new Key(keyBytes)); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bloom/SimpleBloomFilter.java b/hudi-common/src/main/java/org/apache/hudi/common/bloom/SimpleBloomFilter.java index b0278319fce46..43b19a19536b0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/SimpleBloomFilter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/SimpleBloomFilter.java @@ -77,16 +77,21 @@ public SimpleBloomFilter(String serString) { @Override public void add(String key) { - if (key == null) { - throw new NullPointerException("Key cannot by null"); + add(key.getBytes(StandardCharsets.UTF_8)); + } + + @Override + public void add(byte[] keyBytes) { + if (keyBytes == null) { + throw new NullPointerException("Key cannot be null"); } - filter.add(new 
Key(key.getBytes(StandardCharsets.UTF_8))); + filter.add(new Key(keyBytes)); } @Override public boolean mightContain(String key) { if (key == null) { - throw new NullPointerException("Key cannot by null"); + throw new NullPointerException("Key cannot be null"); } return filter.membershipTest(new Key(key.getBytes(StandardCharsets.UTF_8))); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java index 08d7f86ae078a..abd3ac51a20c2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/BootstrapIndex.java @@ -21,12 +21,12 @@ import org.apache.hudi.common.model.BootstrapFileMapping; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ReflectionUtils; import java.io.Serializable; import java.util.List; import java.util.Map; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.ReflectionUtils; /** * Bootstrap Index Interface. @@ -64,10 +64,14 @@ public BootstrapIndex(HoodieTableMetaClient metaClient) { * @return */ public final boolean useIndex() { - boolean validInstantTime = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant() - .map(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, - HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)).orElse(false); - return validInstantTime && metaClient.getTableConfig().getBootstrapBasePath().isPresent() && isPresent(); + if (isPresent()) { + boolean validInstantTime = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant() + .map(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, + HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)).orElse(false); + return validInstantTime && metaClient.getTableConfig().getBootstrapBasePath().isPresent(); + } else { + return false; + } } /** @@ -157,6 +161,6 @@ public abstract void appendNextPartition(String partitionPath, public static BootstrapIndex getBootstrapIndex(HoodieTableMetaClient metaClient) { return ((BootstrapIndex)(ReflectionUtils.loadClass( - metaClient.getTableConfig().getBootstrapIndexClass(), metaClient))); + metaClient.getTableConfig().getBootstrapIndexClass(), new Class[]{HoodieTableMetaClient.class}, metaClient))); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java index d4a77b0822847..718fff3cf3a73 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java @@ -33,10 +33,13 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.HoodieHFileUtils; +import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.CellComparatorImpl; import 
org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeyValue; @@ -99,7 +102,10 @@ public HFileBootstrapIndex(HoodieTableMetaClient metaClient) { Path indexByFilePath = fileIdIndexPath(metaClient); try { FileSystem fs = metaClient.getFs(); - isPresent = fs.exists(indexByPartitionPath) && fs.exists(indexByFilePath); + // The metadata table is never bootstrapped, so the bootstrap index is always absent + // for the metadata table. The fs.exists calls are avoided for metadata table. + isPresent = !HoodieTableMetadata.isMetadataTable(metaClient.getBasePathV2().toString()) + && fs.exists(indexByPartitionPath) && fs.exists(indexByFilePath); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } @@ -178,9 +184,7 @@ private static String getUserKeyFromCellKey(String cellKey) { private static HFile.Reader createReader(String hFilePath, Configuration conf, FileSystem fileSystem) { try { LOG.info("Opening HFile for reading :" + hFilePath); - HFile.Reader reader = HFile.createReader(fileSystem, new HFilePathForReader(hFilePath), - new CacheConfig(conf), conf); - return reader; + return HoodieHFileUtils.createHFileReader(fileSystem, new HFilePathForReader(hFilePath), new CacheConfig(conf), conf); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } @@ -259,7 +263,7 @@ private void initIndexInfo() { private HoodieBootstrapIndexInfo fetchBootstrapIndexInfo() throws IOException { return TimelineMetadataUtils.deserializeAvroMetadata( - partitionIndexReader().loadFileInfo().get(INDEX_INFO_KEY), + partitionIndexReader().getHFileInfo().get(INDEX_INFO_KEY), HoodieBootstrapIndexInfo.class); } @@ -291,13 +295,13 @@ private HFile.Reader fileIdIndexReader() { @Override public List getIndexedPartitionPaths() { - HFileScanner scanner = partitionIndexReader().getScanner(true, true); + HFileScanner scanner = partitionIndexReader().getScanner(true, false); return getAllKeys(scanner, HFileBootstrapIndex::getPartitionFromKey); } @Override public List getIndexedFileGroupIds() { - HFileScanner scanner = fileIdIndexReader().getScanner(true, true); + HFileScanner scanner = fileIdIndexReader().getScanner(true, false); return getAllKeys(scanner, HFileBootstrapIndex::getFileGroupFromKey); } @@ -306,7 +310,7 @@ private List getAllKeys(HFileScanner scanner, Function convert try { boolean available = scanner.seekTo(); while (available) { - keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getKeyValue())))); + keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getCell())))); available = scanner.next(); } } catch (IOException ioe) { @@ -319,7 +323,7 @@ private List getAllKeys(HFileScanner scanner, Function convert @Override public List getSourceFileMappingForPartition(String partition) { try { - HFileScanner scanner = partitionIndexReader().getScanner(true, true); + HFileScanner scanner = partitionIndexReader().getScanner(true, false); KeyValue keyValue = new KeyValue(Bytes.toBytes(getPartitionKey(partition)), new byte[0], new byte[0], HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put, new byte[0]); if (scanner.seekTo(keyValue) == 0) { @@ -352,7 +356,7 @@ public Map getSourceFileMappingForFileI List fileGroupIds = new ArrayList<>(ids); Collections.sort(fileGroupIds); try { - HFileScanner scanner = fileIdIndexReader().getScanner(true, true); + HFileScanner scanner = fileIdIndexReader().getScanner(true, false); for (HoodieFileGroupId fileGroupId : fileGroupIds) { 
KeyValue keyValue = new KeyValue(Bytes.toBytes(getFileGroupKey(fileGroupId)), new byte[0], new byte[0], HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put, new byte[0]); @@ -528,13 +532,13 @@ public void close() { @Override public void begin() { try { - HFileContext meta = new HFileContextBuilder().build(); + HFileContext meta = new HFileContextBuilder().withCellComparator(new HoodieKVComparator()).build(); this.indexByPartitionWriter = HFile.getWriterFactory(metaClient.getHadoopConf(), new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByPartitionPath) - .withFileContext(meta).withComparator(new HoodieKVComparator()).create(); + .withFileContext(meta).create(); this.indexByFileIdWriter = HFile.getWriterFactory(metaClient.getHadoopConf(), new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByFileIdPath) - .withFileContext(meta).withComparator(new HoodieKVComparator()).create(); + .withFileContext(meta).create(); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } @@ -581,6 +585,6 @@ public String getName() { * This class is explicitly used as Key Comparator to workaround hard coded * legacy format class names inside HBase. Otherwise we will face issues with shading. */ - public static class HoodieKVComparator extends KeyValue.KVComparator { + public static class HoodieKVComparator extends CellComparatorImpl { } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/NoOpBootstrapIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/NoOpBootstrapIndex.java new file mode 100644 index 0000000000000..e4e32fa1277ac --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/NoOpBootstrapIndex.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.bootstrap.index; + +import org.apache.hudi.common.table.HoodieTableMetaClient; + +/** + * No Op Bootstrap Index , which is a empty implement and not do anything. 
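// Illustrative sketch of the explicit-constructor reflective loading now used by getBootstrapIndex(...),
// shown here with NoOpBootstrapIndex as the concrete class; `metaClient` is assumed to be an existing
// HoodieTableMetaClient obtained elsewhere.
BootstrapIndex index = (BootstrapIndex) ReflectionUtils.loadClass(
    NoOpBootstrapIndex.class.getName(),
    new Class[] {HoodieTableMetaClient.class},
    metaClient);
// With NoOpBootstrapIndex, isPresent() returns false, so useIndex() short-circuits to false
// and callers fall back to regular file listing instead of the bootstrap mapping.
boolean usable = index.useIndex(); // false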
+ */ +public class NoOpBootstrapIndex extends BootstrapIndex { + + public NoOpBootstrapIndex(HoodieTableMetaClient metaClient) { + super(metaClient); + } + + @Override + public IndexReader createReader() { + throw new RuntimeException("DefaultBootstrapIndex not support create reader!"); + } + + @Override + public IndexWriter createWriter(String sourceBasePath) { + throw new RuntimeException("DefaultBootstrapIndex not support create writer!"); + } + + @Override + public void dropIndex() { + throw new RuntimeException("DefaultBootstrapIndex not support drop index!"); + } + + @Override + protected boolean isPresent() { + return false; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigClassProperty.java b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigClassProperty.java new file mode 100644 index 0000000000000..f5d72a6bece71 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigClassProperty.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) + +/** + * Annotation for superclasses of {@link HoodieConfig} that includes the + * human-readable name of the config class, the config group ({@link ConfigGroupName}) + * it belongs to (e.g., spark/ flink/ write) + * and the description of the config class. + */ +public @interface ConfigClassProperty { + String name(); + ConfigGroups.Names groupName(); + String description(); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java new file mode 100644 index 0000000000000..fef00389d8c54 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +/** + * In Hudi, we have multiple superclasses, aka Config Classes of {@link HoodieConfig} that maintain + * several configs. This class group one or more of these superclasses into higher + * level groups, such as Spark Config, Flink Configs, Metrics .... + * This class maintains the human readable name and description of each config group. + */ +public class ConfigGroups { + public enum Names { + SPARK_DATASOURCE("Spark Datasource Configs"), + FLINK_SQL("Flink Sql Configs"), + WRITE_CLIENT("Write Client Configs"), + METRICS("Metrics Configs"), + RECORD_PAYLOAD("Record Payload Config"), + KAFKA_CONNECT("Kafka Connect Configs"), + AWS("Amazon Web Services Configs"); + + public final String name; + + Names(String name) { + this.name = name; + } + } + + public static String getDescription(Names names) { + String description; + switch (names) { + case SPARK_DATASOURCE: + description = "These configs control the Hudi Spark Datasource, " + + "providing ability to define keys/partitioning, pick out the write operation, " + + "specify how to merge records or choosing query type to read."; + break; + case FLINK_SQL: + description = "These configs control the Hudi Flink SQL source/sink connectors, " + + "providing ability to define record keys, pick out the write operation, " + + "specify how to merge records, enable/disable asynchronous compaction " + + "or choosing query type to read."; + break; + case WRITE_CLIENT: + description = "Internally, the Hudi datasource uses a RDD based HoodieWriteClient API " + + "to actually perform writes to storage. These configs provide deep control over " + + "lower level aspects like file sizing, compression, parallelism, compaction, " + + "write schema, cleaning etc. Although Hudi provides sane defaults, from time-time " + + "these configs may need to be tweaked to optimize for specific workloads."; + break; + case RECORD_PAYLOAD: + description = "This is the lowest level of customization offered by Hudi. " + + "Record payloads define how to produce new values to upsert based on incoming " + + "new record and stored old record. Hudi provides default implementations such as " + + "OverwriteWithLatestAvroPayload which simply update table with the latest/last-written record. " + + "This can be overridden to a custom class extending HoodieRecordPayload class, " + + "on both datasource and WriteClient levels."; + break; + case METRICS: + description = "These set of configs are used to enable monitoring and reporting of key" + + "Hudi stats and metrics."; + break; + case KAFKA_CONNECT: + description = "These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables"; + break; + default: + description = "Please fill in the description for Config Group Name: " + names.name; + break; + } + return description; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java new file mode 100644 index 0000000000000..934803d8d315e --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieException; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Function; +import java.util.Objects; + +/** + * ConfigProperty describes a configuration property. It contains the configuration + * key, deprecated older versions of the key, and an optional default value for the configuration, + * configuration descriptions and also the an infer mechanism to infer the configuration value + * based on other configurations. + * + * @param The type of the default value. + */ +public class ConfigProperty implements Serializable { + + private final String key; + + private final T defaultValue; + + private final String doc; + + private final Option sinceVersion; + + private final Option deprecatedVersion; + + private final Set validValues; + + private final String[] alternatives; + + // provide the ability to infer config value based on other configs + private final Option>> inferFunction; + + ConfigProperty(String key, T defaultValue, String doc, Option sinceVersion, + Option deprecatedVersion, Option>> inferFunc, Set validValues, String... alternatives) { + this.key = Objects.requireNonNull(key); + this.defaultValue = defaultValue; + this.doc = doc; + this.sinceVersion = sinceVersion; + this.deprecatedVersion = deprecatedVersion; + this.inferFunction = inferFunc; + this.validValues = validValues; + this.alternatives = alternatives; + } + + public String key() { + return key; + } + + public T defaultValue() { + if (defaultValue == null) { + throw new HoodieException("There's no default value for this config"); + } + return defaultValue; + } + + public boolean hasDefaultValue() { + return defaultValue != null; + } + + public String doc() { + return StringUtils.isNullOrEmpty(doc) ? StringUtils.EMPTY_STRING : doc; + } + + public Option getSinceVersion() { + return sinceVersion; + } + + public Option getDeprecatedVersion() { + return deprecatedVersion; + } + + Option>> getInferFunc() { + return inferFunction; + } + + public void checkValues(String value) { + if (validValues != null && !validValues.isEmpty() && !validValues.contains(value)) { + throw new IllegalArgumentException( + "The value of " + key + " should be one of " + + String.join(",", validValues) + ", but was " + value); + } + } + + public List getAlternatives() { + return Arrays.asList(alternatives); + } + + public ConfigProperty withDocumentation(String doc) { + Objects.requireNonNull(doc); + return new ConfigProperty<>(key, defaultValue, doc, sinceVersion, deprecatedVersion, inferFunction, validValues, alternatives); + } + + public ConfigProperty withValidValues(String... 
validValues) { + Objects.requireNonNull(validValues); + return new ConfigProperty<>(key, defaultValue, doc, sinceVersion, deprecatedVersion, inferFunction, new HashSet<>(Arrays.asList(validValues)), alternatives); + } + + public ConfigProperty withAlternatives(String... alternatives) { + Objects.requireNonNull(alternatives); + return new ConfigProperty<>(key, defaultValue, doc, sinceVersion, deprecatedVersion, inferFunction, validValues, alternatives); + } + + public ConfigProperty sinceVersion(String sinceVersion) { + Objects.requireNonNull(sinceVersion); + return new ConfigProperty<>(key, defaultValue, doc, Option.of(sinceVersion), deprecatedVersion, inferFunction, validValues, alternatives); + } + + public ConfigProperty deprecatedAfter(String deprecatedVersion) { + Objects.requireNonNull(deprecatedVersion); + return new ConfigProperty<>(key, defaultValue, doc, sinceVersion, Option.of(deprecatedVersion), inferFunction, validValues, alternatives); + } + + public ConfigProperty withInferFunction(Function> inferFunction) { + Objects.requireNonNull(inferFunction); + return new ConfigProperty<>(key, defaultValue, doc, sinceVersion, deprecatedVersion, Option.of(inferFunction), validValues, alternatives); + } + + /** + * Create a OptionBuilder with key. + * + * @param key The key of the option + * @return Return a OptionBuilder. + */ + public static PropertyBuilder key(String key) { + Objects.requireNonNull(key); + return new PropertyBuilder(key); + } + + @Override + public String toString() { + return String.format( + "Key: '%s' , default: %s description: %s since version: %s deprecated after: %s)", + key, defaultValue, doc, sinceVersion.isPresent() ? sinceVersion.get() : "version is not defined", + deprecatedVersion.isPresent() ? deprecatedVersion.get() : "version is not defined"); + } + + /** + * The PropertyBuilder is used to build the ConfigProperty. 
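// A sketch of defining a config with the fluent API above; the key and values are hypothetical,
// chosen only to exercise key() -> defaultValue() -> withAlternatives()/sinceVersion()/withDocumentation().
public static final ConfigProperty<Integer> EXAMPLE_RETRY_TIMES = ConfigProperty
    .key("hoodie.example.retry.times")          // hypothetical key
    .defaultValue(3)
    .withAlternatives("hoodie.example.retries") // older spelling still honored via getAlternatives()
    .sinceVersion("0.9.0")
    .withDocumentation("Number of times the example operation is retried before failing.");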
+ */ + public static final class PropertyBuilder { + + private final String key; + + PropertyBuilder(String key) { + this.key = key; + } + + public ConfigProperty defaultValue(T value) { + Objects.requireNonNull(value); + ConfigProperty configProperty = new ConfigProperty<>(key, value, "", Option.empty(), Option.empty(), Option.empty(), Collections.emptySet()); + return configProperty; + } + + public ConfigProperty noDefaultValue() { + ConfigProperty configProperty = new ConfigProperty<>(key, null, "", Option.empty(), + Option.empty(), Option.empty(), Collections.emptySet()); + return configProperty; + } + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java b/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java index cd4ade1ef31a2..08cbd568df5d5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java @@ -18,14 +18,27 @@ package org.apache.hudi.common.config; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + import java.io.BufferedReader; +import java.io.File; import java.io.IOException; import java.io.InputStreamReader; +import java.net.URI; +import java.net.URL; import java.util.HashSet; import java.util.Set; @@ -43,81 +56,186 @@ public class DFSPropertiesConfiguration { private static final Logger LOG = LogManager.getLogger(DFSPropertiesConfiguration.class); - private final FileSystem fs; + public static final String DEFAULT_PROPERTIES_FILE = "hudi-defaults.conf"; + public static final String CONF_FILE_DIR_ENV_NAME = "HUDI_CONF_DIR"; + public static final String DEFAULT_CONF_FILE_DIR = "file:/etc/hudi/conf"; + public static final Path DEFAULT_PATH = new Path(DEFAULT_CONF_FILE_DIR + "/" + DEFAULT_PROPERTIES_FILE); + + // props read from hudi-defaults.conf + private static TypedProperties GLOBAL_PROPS = loadGlobalProps(); - private final Path rootFile; + @Nullable + private final Configuration hadoopConfig; - private final TypedProperties props; + private Path currentFilePath; + + // props read from user defined configuration file or input stream + private final HoodieConfig hoodieConfig; // Keep track of files visited, to detect loops - private final Set visitedFiles; - - public DFSPropertiesConfiguration(FileSystem fs, Path rootFile, TypedProperties defaults) { - this.fs = fs; - this.rootFile = rootFile; - this.props = defaults; - this.visitedFiles = new HashSet<>(); - visitFile(rootFile); + private final Set visitedFilePaths; + + public DFSPropertiesConfiguration(@Nonnull Configuration hadoopConf, @Nonnull Path filePath) { + this.hadoopConfig = hadoopConf; + this.currentFilePath = filePath; + this.hoodieConfig = new HoodieConfig(); + this.visitedFilePaths = new HashSet<>(); + addPropsFromFile(filePath); } - public DFSPropertiesConfiguration(FileSystem fs, Path rootFile) { - this(fs, rootFile, new TypedProperties()); + public DFSPropertiesConfiguration() { + this.hadoopConfig = null; + 
this.currentFilePath = null; + this.hoodieConfig = new HoodieConfig(); + this.visitedFilePaths = new HashSet<>(); } - public DFSPropertiesConfiguration() { - this.fs = null; - this.rootFile = null; - this.props = new TypedProperties(); - this.visitedFiles = new HashSet<>(); + /** + * Load global props from hudi-defaults.conf which is under class loader or CONF_FILE_DIR_ENV_NAME. + * @return Typed Properties + */ + public static TypedProperties loadGlobalProps() { + DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(); + + // First try loading the external config file from class loader + URL configFile = Thread.currentThread().getContextClassLoader().getResource(DEFAULT_PROPERTIES_FILE); + if (configFile != null) { + try (BufferedReader br = new BufferedReader(new InputStreamReader(configFile.openStream()))) { + conf.addPropsFromStream(br); + return conf.getProps(); + } catch (IOException ioe) { + throw new HoodieIOException( + String.format("Failed to read %s from class loader", DEFAULT_PROPERTIES_FILE), ioe); + } + } + // Try loading the external config file from local file system + Option defaultConfPath = getConfPathFromEnv(); + if (defaultConfPath.isPresent()) { + conf.addPropsFromFile(defaultConfPath.get()); + } else { + try { + conf.addPropsFromFile(DEFAULT_PATH); + } catch (Exception e) { + LOG.warn("Cannot load default config file: " + DEFAULT_PATH, e); + } + } + return conf.getProps(); } - private String[] splitProperty(String line) { - int ind = line.indexOf('='); - String k = line.substring(0, ind).trim(); - String v = line.substring(ind + 1).trim(); - return new String[] {k, v}; + public static void refreshGlobalProps() { + GLOBAL_PROPS = loadGlobalProps(); + } + + public static void clearGlobalProps() { + GLOBAL_PROPS = new TypedProperties(); } - private void visitFile(Path file) { + /** + * Add properties from external configuration files. + * + * @param filePath File path for configuration file + */ + public void addPropsFromFile(Path filePath) { + if (visitedFilePaths.contains(filePath.toString())) { + throw new IllegalStateException("Loop detected; file " + filePath + " already referenced"); + } + + FileSystem fs = FSUtils.getFs( + filePath.toString(), + Option.ofNullable(hadoopConfig).orElseGet(Configuration::new) + ); + try { - if (visitedFiles.contains(file.getName())) { - throw new IllegalStateException("Loop detected; file " + file + " already referenced"); + if (filePath.equals(DEFAULT_PATH) && !fs.exists(filePath)) { + LOG.warn("Properties file " + filePath + " not found. Ignoring to load props file"); + return; } - visitedFiles.add(file.getName()); - BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(file))); - addProperties(reader); } catch (IOException ioe) { - LOG.error("Error reading in properies from dfs", ioe); - throw new IllegalArgumentException("Cannot read properties from dfs", ioe); + throw new HoodieIOException("Cannot check if the properties file exist: " + filePath, ioe); + } + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(filePath)))) { + visitedFilePaths.add(filePath.toString()); + currentFilePath = filePath; + addPropsFromStream(reader); + } catch (IOException ioe) { + LOG.error("Error reading in properties from dfs from file " + filePath); + throw new HoodieIOException("Cannot read properties from dfs from file " + filePath, ioe); } } /** - * Add properties from input stream. - * + * Add properties from buffered reader. 
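// A hedged usage sketch: load overrides from an explicit properties file and layer them on top of the
// global props read from hudi-defaults.conf. The file path is hypothetical; Configuration and Path are
// the Hadoop classes already imported above.
Configuration hadoopConf = new Configuration();
Path overrides = new Path("file:///tmp/hudi-table-overrides.properties"); // hypothetical location
DFSPropertiesConfiguration dfsProps = new DFSPropertiesConfiguration(hadoopConf, overrides);

TypedProperties merged = DFSPropertiesConfiguration.getGlobalProps(); // copy of hudi-defaults.conf props
merged.putAll(dfsProps.getProps());                                   // file-level values take precedence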
+ * * @param reader Buffered Reader * @throws IOException */ - public void addProperties(BufferedReader reader) throws IOException { + public void addPropsFromStream(BufferedReader reader) throws IOException { try { - String line; - while ((line = reader.readLine()) != null) { - if (line.startsWith("#") || line.equals("") || !line.contains("=")) { - continue; + reader.lines().forEach(line -> { + if (!isValidLine(line)) { + return; } String[] split = splitProperty(line); if (line.startsWith("include=") || line.startsWith("include =")) { - visitFile(new Path(rootFile.getParent(), split[1])); + Path includeFilePath = new Path(currentFilePath.getParent(), split[1]); + addPropsFromFile(includeFilePath); } else { - props.setProperty(split[0], split[1]); + hoodieConfig.setValue(split[0], split[1]); } - } + }); + } finally { reader.close(); } } - public TypedProperties getConfig() { - return props; + public static TypedProperties getGlobalProps() { + final TypedProperties globalProps = new TypedProperties(); + globalProps.putAll(GLOBAL_PROPS); + return globalProps; + } + + // test only + public static TypedProperties addToGlobalProps(String key, String value) { + GLOBAL_PROPS.put(key, value); + return GLOBAL_PROPS; + } + + public TypedProperties getProps() { + return new TypedProperties(hoodieConfig.getProps()); + } + + public TypedProperties getProps(boolean includeGlobalProps) { + return new TypedProperties(hoodieConfig.getProps(includeGlobalProps)); + } + + private static Option getConfPathFromEnv() { + String confDir = System.getenv(CONF_FILE_DIR_ENV_NAME); + if (confDir == null) { + LOG.warn("Cannot find " + CONF_FILE_DIR_ENV_NAME + ", please set it as the dir of " + DEFAULT_PROPERTIES_FILE); + return Option.empty(); + } + if (StringUtils.isNullOrEmpty(URI.create(confDir).getScheme())) { + confDir = "file://" + confDir; + } + return Option.of(new Path(confDir + File.separator + DEFAULT_PROPERTIES_FILE)); + } + + private String[] splitProperty(String line) { + line = line.replaceAll("\\s+", " "); + String delimiter = line.contains("=") ? "=" : " "; + int ind = line.indexOf(delimiter); + String k = line.substring(0, ind).trim(); + String v = line.substring(ind + 1).trim(); + return new String[] {k, v}; + } + + private boolean isValidLine(String line) { + ValidationUtils.checkArgument(line != null, "passed line is null"); + if (line.startsWith("#") || line.equals("")) { + return false; + } + return line.contains("=") || line.matches(".*\\s.*"); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/DefaultHoodieConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/DefaultHoodieConfig.java deleted file mode 100644 index e0766db8ce685..0000000000000 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/DefaultHoodieConfig.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.common.config; - -import java.io.Serializable; -import java.util.Properties; - -/** - * Default Way to load Hoodie config through a {@link java.util.Properties}. - */ -public class DefaultHoodieConfig implements Serializable { - - protected final Properties props; - - public DefaultHoodieConfig(Properties props) { - this.props = props; - } - - public static void setDefaultOnCondition(Properties props, boolean condition, String propName, String defaultValue) { - if (condition) { - props.setProperty(propName, defaultValue); - } - } - - public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) { - if (condition) { - props.putAll(config.getProps()); - } - } - - public Properties getProps() { - return props; - } - -} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieCommonConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieCommonConfig.java new file mode 100644 index 0000000000000..00ff7e5683307 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieCommonConfig.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +import org.apache.hudi.common.util.collection.ExternalSpillableMap; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Locale; +import java.util.Properties; + +@ConfigClassProperty(name = "Common Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "The following set of configurations are common across Hudi.") +public class HoodieCommonConfig extends HoodieConfig { + + public static final ConfigProperty SCHEMA_EVOLUTION_ENABLE = ConfigProperty + .key("hoodie.schema.on.read.enable") + .defaultValue(false) + .withDocumentation("Enables support for Schema Evolution feature"); + + public static final ConfigProperty TIMESTAMP_AS_OF = ConfigProperty + .key("as.of.instant") + .noDefaultValue() + .withDocumentation("The query instant for time travel. Without specified this option, we query the latest snapshot."); + + public static final ConfigProperty RECONCILE_SCHEMA = ConfigProperty + .key("hoodie.datasource.write.reconcile.schema") + .defaultValue(false) + .withDocumentation("When a new batch of write has records with old schema, but latest table schema got " + + "evolved, this config will upgrade the records to leverage latest table schema(default values will be " + + "injected to missing fields). 
If not, the write batch would fail."); + + public static final ConfigProperty SPILLABLE_DISK_MAP_TYPE = ConfigProperty + .key("hoodie.common.spillable.diskmap.type") + .defaultValue(ExternalSpillableMap.DiskMapType.BITCASK) + .withDocumentation("When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. " + + "By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. " + + "Change this to `ROCKS_DB` to prefer using rocksDB, for handling the spill."); + + public static final ConfigProperty DISK_MAP_BITCASK_COMPRESSION_ENABLED = ConfigProperty + .key("hoodie.common.diskmap.compression.enabled") + .defaultValue(true) + .withDocumentation("Turn on compression for BITCASK disk map used by the External Spillable Map"); + + public ExternalSpillableMap.DiskMapType getSpillableDiskMapType() { + return ExternalSpillableMap.DiskMapType.valueOf(getString(SPILLABLE_DISK_MAP_TYPE).toUpperCase(Locale.ROOT)); + } + + public boolean isBitCaskDiskMapCompressionEnabled() { + return getBoolean(DISK_MAP_BITCASK_COMPRESSION_ENABLED); + } + + private HoodieCommonConfig() { + super(); + } + + public static HoodieCommonConfig.Builder newBuilder() { + return new HoodieCommonConfig.Builder(); + } + + public static class Builder { + + private final HoodieCommonConfig commonConfig = new HoodieCommonConfig(); + + public HoodieCommonConfig.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + commonConfig.getProps().load(reader); + return this; + } + } + + public HoodieCommonConfig.Builder fromProperties(Properties props) { + commonConfig.getProps().putAll(props); + return this; + } + + public Builder withSpillableDiskMapType(ExternalSpillableMap.DiskMapType diskMapType) { + commonConfig.setValue(SPILLABLE_DISK_MAP_TYPE, diskMapType.name()); + return this; + } + + public Builder withBitcaskDiskMapCompressionEnabled(boolean bitcaskDiskMapCompressionEnabled) { + commonConfig.setValue(DISK_MAP_BITCASK_COMPRESSION_ENABLED, String.valueOf(bitcaskDiskMapCompressionEnabled)); + return this; + } + + public HoodieCommonConfig build() { + commonConfig.setDefaults(HoodieCommonConfig.class.getName()); + return commonConfig; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java new file mode 100644 index 0000000000000..366d19fe6ebc0 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.config; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.lang.reflect.Modifier; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +/** + * This class deals with {@link ConfigProperty} and provides get/set functionalities. + */ +public class HoodieConfig implements Serializable { + + private static final Logger LOG = LogManager.getLogger(HoodieConfig.class); + + protected static final String CONFIG_VALUES_DELIMITER = ","; + + public static HoodieConfig create(FSDataInputStream inputStream) throws IOException { + HoodieConfig config = new HoodieConfig(); + config.props.load(inputStream); + return config; + } + + protected TypedProperties props; + + public HoodieConfig() { + this.props = new TypedProperties(); + } + + public HoodieConfig(Properties props) { + this.props = new TypedProperties(props); + } + + public void setValue(ConfigProperty cfg, String val) { + cfg.checkValues(val); + props.setProperty(cfg.key(), val); + } + + public void setValue(String key, String val) { + props.setProperty(key, val); + } + + public void setAll(Properties properties) { + props.putAll(properties); + } + + public void setDefaultValue(ConfigProperty configProperty) { + if (!contains(configProperty)) { + Option inferValue = Option.empty(); + if (configProperty.getInferFunc().isPresent()) { + inferValue = configProperty.getInferFunc().get().apply(this); + } + props.setProperty(configProperty.key(), inferValue.isPresent() ? inferValue.get().toString() : configProperty.defaultValue().toString()); + } + } + + public void setDefaultValue(ConfigProperty configProperty, T defaultVal) { + if (!contains(configProperty)) { + props.setProperty(configProperty.key(), defaultVal.toString()); + } + } + + public Boolean contains(String key) { + return props.containsKey(key); + } + + public boolean contains(ConfigProperty configProperty) { + if (props.containsKey(configProperty.key())) { + return true; + } + return configProperty.getAlternatives().stream().anyMatch(props::containsKey); + } + + private Option getRawValue(ConfigProperty configProperty) { + if (props.containsKey(configProperty.key())) { + return Option.ofNullable(props.get(configProperty.key())); + } + for (String alternative : configProperty.getAlternatives()) { + if (props.containsKey(alternative)) { + LOG.warn(String.format("The configuration key '%s' has been deprecated " + + "and may be removed in the future. 
Please use the new key '%s' instead.", + alternative, configProperty.key())); + return Option.ofNullable(props.get(alternative)); + } + } + return Option.empty(); + } + + protected void setDefaults(String configClassName) { + Class configClass = ReflectionUtils.getClass(configClassName); + Arrays.stream(configClass.getDeclaredFields()) + .filter(f -> Modifier.isStatic(f.getModifiers())) + .filter(f -> f.getType().isAssignableFrom(ConfigProperty.class)) + .forEach(f -> { + try { + ConfigProperty cfgProp = (ConfigProperty) f.get("null"); + if (cfgProp.hasDefaultValue()) { + setDefaultValue(cfgProp); + } + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + }); + } + + public String getString(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(Object::toString).orElse(null); + } + + public List getSplitStrings(ConfigProperty configProperty) { + return getSplitStrings(configProperty, ","); + } + + public List getSplitStrings(ConfigProperty configProperty, String delimiter) { + return StringUtils.split(getString(configProperty), delimiter); + } + + public String getString(String key) { + return props.getProperty(key); + } + + public Integer getInt(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Integer.parseInt(v.toString())).orElse(null); + } + + public Integer getIntOrDefault(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Integer.parseInt(v.toString())) + .orElse((Integer) configProperty.defaultValue()); + } + + public Boolean getBoolean(ConfigProperty configProperty) { + if (configProperty.hasDefaultValue()) { + return getBooleanOrDefault(configProperty); + } + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Boolean.parseBoolean(v.toString())).orElse(null); + } + + public boolean getBooleanOrDefault(String key, boolean defaultVal) { + return Option.ofNullable(props.getProperty(key)).map(Boolean::parseBoolean).orElse(defaultVal); + } + + public boolean getBooleanOrDefault(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Boolean.parseBoolean(v.toString())) + .orElseGet(() -> Boolean.parseBoolean(configProperty.defaultValue().toString())); + } + + public boolean getBooleanOrDefault(ConfigProperty configProperty, boolean defaultVal) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Boolean.parseBoolean(v.toString())).orElse(defaultVal); + } + + public Long getLong(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Long.parseLong(v.toString())).orElse(null); + } + + public Float getFloat(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Float.parseFloat(v.toString())).orElse(null); + } + + public Double getDouble(ConfigProperty configProperty) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(v -> Double.parseDouble(v.toString())).orElse(null); + } + + public String getStringOrDefault(ConfigProperty configProperty) { + return getStringOrDefault(configProperty, configProperty.defaultValue().toString()); + } + + public String getStringOrDefault(ConfigProperty configProperty, String defaultVal) { + Option rawValue = getRawValue(configProperty); + return rawValue.map(Object::toString).orElse(defaultVal); + } + + public TypedProperties getProps() { + return 
getProps(false); + } + + public TypedProperties getProps(boolean includeGlobalProps) { + if (includeGlobalProps) { + TypedProperties mergedProps = DFSPropertiesConfiguration.getGlobalProps(); + mergedProps.putAll(props); + return mergedProps; + } else { + return props; + } + } + + public void setDefaultOnCondition(boolean condition, HoodieConfig config) { + if (condition) { + setDefault(config); + } + } + + public void setDefault(HoodieConfig config) { + props.putAll(config.getProps()); + } + + public String getStringOrThrow(ConfigProperty configProperty, String errorMessage) throws HoodieException { + Option rawValue = getRawValue(configProperty); + if (rawValue.isPresent()) { + return rawValue.get().toString(); + } else { + throw new HoodieException(errorMessage); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java new file mode 100644 index 0000000000000..b16373ef83436 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java @@ -0,0 +1,608 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieNotSupportedException; + +import javax.annotation.concurrent.Immutable; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.List; +import java.util.Properties; + +/** + * Configurations used by the HUDI Metadata Table. + */ +@Immutable +@ConfigClassProperty(name = "Metadata Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations used by the Hudi Metadata Table. 
" + + "This table maintains the metadata about a given Hudi table (e.g file listings) " + + " to avoid overhead of accessing cloud storage, during queries.") +public final class HoodieMetadataConfig extends HoodieConfig { + + public static final String METADATA_PREFIX = "hoodie.metadata"; + + // Enable the internal Metadata Table which saves file listings + public static final ConfigProperty ENABLE = ConfigProperty + .key(METADATA_PREFIX + ".enable") + .defaultValue(true) + .sinceVersion("0.7.0") + .withDocumentation("Enable the internal metadata table which serves table metadata like level file listings"); + + public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = false; + + // Enable metrics for internal Metadata Table + public static final ConfigProperty METRICS_ENABLE = ConfigProperty + .key(METADATA_PREFIX + ".metrics.enable") + .defaultValue(false) + .sinceVersion("0.7.0") + .withDocumentation("Enable publishing of metrics around metadata table."); + + // Parallelism for inserts + public static final ConfigProperty INSERT_PARALLELISM_VALUE = ConfigProperty + .key(METADATA_PREFIX + ".insert.parallelism") + .defaultValue(1) + .sinceVersion("0.7.0") + .withDocumentation("Parallelism to use when inserting to the metadata table"); + + // Async clean + public static final ConfigProperty ASYNC_CLEAN_ENABLE = ConfigProperty + .key(METADATA_PREFIX + ".clean.async") + .defaultValue(false) + .sinceVersion("0.7.0") + .withDocumentation("Enable asynchronous cleaning for metadata table"); + + // Async index + public static final ConfigProperty ASYNC_INDEX_ENABLE = ConfigProperty + .key(METADATA_PREFIX + ".index.async") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("Enable asynchronous indexing of metadata table."); + + // Maximum delta commits before compaction occurs + public static final ConfigProperty COMPACT_NUM_DELTA_COMMITS = ConfigProperty + .key(METADATA_PREFIX + ".compact.max.delta.commits") + .defaultValue(10) + .sinceVersion("0.7.0") + .withDocumentation("Controls how often the metadata table is compacted."); + + // Archival settings + public static final ConfigProperty MIN_COMMITS_TO_KEEP = ConfigProperty + .key(METADATA_PREFIX + ".keep.min.commits") + .defaultValue(20) + .sinceVersion("0.7.0") + .withDocumentation("Archiving service moves older entries from metadata table’s timeline " + + "into an archived log after each write, to keep the overhead constant, even as the " + + "metadata table size grows. 
This config controls the minimum number of instants " + + "to retain in the active timeline."); + + public static final ConfigProperty MAX_COMMITS_TO_KEEP = ConfigProperty + .key(METADATA_PREFIX + ".keep.max.commits") + .defaultValue(30) + .sinceVersion("0.7.0") + .withDocumentation("Similar to " + MIN_COMMITS_TO_KEEP.key() + ", this config controls " + + "the maximum number of instants to retain in the active timeline."); + + // Cleaner commits retained + public static final ConfigProperty CLEANER_COMMITS_RETAINED = ConfigProperty + .key(METADATA_PREFIX + ".cleaner.commits.retained") + .defaultValue(3) + .sinceVersion("0.7.0") + .withDocumentation("Number of commits to retain, without cleaning, on metadata table."); + + // Regex to filter out matching directories during bootstrap + public static final ConfigProperty DIR_FILTER_REGEX = ConfigProperty + .key(METADATA_PREFIX + ".dir.filter.regex") + .defaultValue("") + .sinceVersion("0.7.0") + .withDocumentation("Directories matching this regex, will be filtered out when initializing metadata table from lake storage for the first time."); + + public static final ConfigProperty ASSUME_DATE_PARTITIONING = ConfigProperty + .key("hoodie.assume.date.partitioning") + .defaultValue("false") + .sinceVersion("0.3.0") + .withDocumentation("Should HoodieWriteClient assume the data is partitioned by dates, i.e three levels from base path. " + + "This is a stop-gap to support tables created by versions < 0.3.1. Will be removed eventually"); + + public static final ConfigProperty FILE_LISTING_PARALLELISM_VALUE = ConfigProperty + .key("hoodie.file.listing.parallelism") + .defaultValue(200) + .sinceVersion("0.7.0") + .withDocumentation("Parallelism to use, when listing the table on lake storage."); + + public static final ConfigProperty ENABLE_FULL_SCAN_LOG_FILES = ConfigProperty + .key(METADATA_PREFIX + ".enable.full.scan.log.files") + .defaultValue(true) + .sinceVersion("0.10.0") + .withDocumentation("Enable full scanning of log files while reading log records. If disabled, Hudi does look up of only interested entries."); + + public static final ConfigProperty ENABLE_METADATA_INDEX_BLOOM_FILTER = ConfigProperty + .key(METADATA_PREFIX + ".index.bloom.filter.enable") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("Enable indexing bloom filters of user data files under metadata table. When enabled, " + + "metadata table will have a partition to store the bloom filter index and will be " + + "used during the index lookups."); + + public static final ConfigProperty METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT = ConfigProperty + .key(METADATA_PREFIX + ".index.bloom.filter.file.group.count") + .defaultValue(4) + .sinceVersion("0.11.0") + .withDocumentation("Metadata bloom filter index partition file group count. This controls the size of the base and " + + "log files and read parallelism in the bloom filter index partition. 
The recommendation is to size the " + + "file group count such that the base files are under 1GB."); + + public static final ConfigProperty BLOOM_FILTER_INDEX_PARALLELISM = ConfigProperty + .key(METADATA_PREFIX + ".index.bloom.filter.parallelism") + .defaultValue(200) + .sinceVersion("0.11.0") + .withDocumentation("Parallelism to use for generating bloom filter index in metadata table."); + + public static final ConfigProperty ENABLE_METADATA_INDEX_COLUMN_STATS = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.enable") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("Enable indexing column ranges of user data files under metadata table key lookups. When " + + "enabled, metadata table will have a partition to store the column ranges and will be " + + "used for pruning files during the index lookups."); + + public static final ConfigProperty METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.file.group.count") + .defaultValue(2) + .sinceVersion("0.11.0") + .withDocumentation("Metadata column stats partition file group count. This controls the size of the base and " + + "log files and read parallelism in the column stats index partition. The recommendation is to size the " + + "file group count such that the base files are under 1GB."); + + public static final ConfigProperty COLUMN_STATS_INDEX_PARALLELISM = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.parallelism") + .defaultValue(10) + .sinceVersion("0.11.0") + .withDocumentation("Parallelism to use, when generating column stats index."); + + public static final ConfigProperty COLUMN_STATS_INDEX_FOR_COLUMNS = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.column.list") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Comma-separated list of columns for which column stats index will be built. If not set, all columns will be indexed"); + + public static final String COLUMN_STATS_INDEX_PROCESSING_MODE_IN_MEMORY = "in-memory"; + public static final String COLUMN_STATS_INDEX_PROCESSING_MODE_ENGINE = "engine"; + + public static final ConfigProperty COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.processing.mode.override") + .noDefaultValue() + .withValidValues(COLUMN_STATS_INDEX_PROCESSING_MODE_IN_MEMORY, COLUMN_STATS_INDEX_PROCESSING_MODE_ENGINE) + .sinceVersion("0.12.0") + .withDocumentation("By default Column Stats Index is automatically determining whether it should be read and processed either" + + "'in-memory' (w/in executing process) or using Spark (on a cluster), based on some factors like the size of the Index " + + "and how many columns are read. This config allows to override this behavior."); + + public static final ConfigProperty COLUMN_STATS_INDEX_IN_MEMORY_PROJECTION_THRESHOLD = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.inMemory.projection.threshold") + .defaultValue(100000) + .sinceVersion("0.12.0") + .withDocumentation("When reading Column Stats Index, if the size of the expected resulting projection is below the in-memory" + + " threshold (counted by the # of rows), it will be attempted to be loaded \"in-memory\" (ie not using the execution engine" + + " like Spark, Flink, etc). 
If the value is above the threshold execution engine will be used to compose the projection."); + + public static final ConfigProperty BLOOM_FILTER_INDEX_FOR_COLUMNS = ConfigProperty + .key(METADATA_PREFIX + ".index.bloom.filter.column.list") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Comma-separated list of columns for which bloom filter index will be built. If not set, only record key will be indexed."); + + public static final ConfigProperty METADATA_INDEX_CHECK_TIMEOUT_SECONDS = ConfigProperty + .key(METADATA_PREFIX + ".index.check.timeout.seconds") + .defaultValue(900) + .sinceVersion("0.11.0") + .withDocumentation("After the async indexer has finished indexing upto the base instant, it will ensure that all inflight writers " + + "reliably write index updates as well. If this timeout expires, then the indexer will abort itself safely."); + + public static final ConfigProperty POPULATE_META_FIELDS = ConfigProperty + .key(METADATA_PREFIX + ".populate.meta.fields") + .defaultValue(false) + .sinceVersion("0.10.0") + .withDocumentation("When enabled, populates all meta fields. When disabled, no meta fields are populated."); + + public static final ConfigProperty IGNORE_SPURIOUS_DELETES = ConfigProperty + .key("_" + METADATA_PREFIX + ".ignore.spurious.deletes") + .defaultValue(true) + .sinceVersion("0.10.0") + .withDocumentation("There are cases when extra files are requested to be deleted from " + + "metadata table which are never added before. This config determines how to handle " + + "such spurious deletes"); + + private HoodieMetadataConfig() { + super(); + } + + public static HoodieMetadataConfig.Builder newBuilder() { + return new Builder(); + } + + public int getFileListingParallelism() { + return Math.max(getInt(HoodieMetadataConfig.FILE_LISTING_PARALLELISM_VALUE), 1); + } + + public Boolean shouldAssumeDatePartitioning() { + return getBoolean(HoodieMetadataConfig.ASSUME_DATE_PARTITIONING); + } + + public boolean enabled() { + return getBoolean(ENABLE); + } + + public boolean isBloomFilterIndexEnabled() { + return getBooleanOrDefault(ENABLE_METADATA_INDEX_BLOOM_FILTER); + } + + public boolean isColumnStatsIndexEnabled() { + return getBooleanOrDefault(ENABLE_METADATA_INDEX_COLUMN_STATS); + } + + public List getColumnsEnabledForColumnStatsIndex() { + return StringUtils.split(getString(COLUMN_STATS_INDEX_FOR_COLUMNS), CONFIG_VALUES_DELIMITER); + } + + public String getColumnStatsIndexProcessingModeOverride() { + return getString(COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE); + } + + public Integer getColumnStatsIndexInMemoryProjectionThreshold() { + return getIntOrDefault(COLUMN_STATS_INDEX_IN_MEMORY_PROJECTION_THRESHOLD); + } + + public List getColumnsEnabledForBloomFilterIndex() { + return StringUtils.split(getString(BLOOM_FILTER_INDEX_FOR_COLUMNS), CONFIG_VALUES_DELIMITER); + } + + public int getBloomFilterIndexFileGroupCount() { + return getIntOrDefault(METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT); + } + + public int getColumnStatsIndexFileGroupCount() { + return getIntOrDefault(METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT); + } + + public int getBloomFilterIndexParallelism() { + return getIntOrDefault(BLOOM_FILTER_INDEX_PARALLELISM); + } + + public int getColumnStatsIndexParallelism() { + return getIntOrDefault(COLUMN_STATS_INDEX_PARALLELISM); + } + + public int getIndexingCheckTimeoutSeconds() { + return getIntOrDefault(METADATA_INDEX_CHECK_TIMEOUT_SECONDS); + } + + public boolean enableMetrics() { + return getBoolean(METRICS_ENABLE); + } + + public 
String getDirectoryFilterRegex() { + return getString(DIR_FILTER_REGEX); + } + + public boolean allowFullScan() { + return getBooleanOrDefault(ENABLE_FULL_SCAN_LOG_FILES); + } + + public boolean populateMetaFields() { + return getBooleanOrDefault(HoodieMetadataConfig.POPULATE_META_FIELDS); + } + + public boolean ignoreSpuriousDeletes() { + return getBoolean(IGNORE_SPURIOUS_DELETES); + } + + public static class Builder { + + private EngineType engineType = EngineType.SPARK; + private final HoodieMetadataConfig metadataConfig = new HoodieMetadataConfig(); + + public Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.metadataConfig.getProps().load(reader); + return this; + } + } + + public Builder fromProperties(Properties props) { + this.metadataConfig.getProps().putAll(props); + return this; + } + + public Builder enable(boolean enable) { + metadataConfig.setValue(ENABLE, String.valueOf(enable)); + return this; + } + + public Builder withMetadataIndexBloomFilter(boolean enable) { + metadataConfig.setValue(ENABLE_METADATA_INDEX_BLOOM_FILTER, String.valueOf(enable)); + return this; + } + + public Builder withMetadataIndexBloomFilterFileGroups(int fileGroupCount) { + metadataConfig.setValue(METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT, String.valueOf(fileGroupCount)); + return this; + } + + public Builder withBloomFilterIndexParallelism(int parallelism) { + metadataConfig.setValue(BLOOM_FILTER_INDEX_PARALLELISM, String.valueOf(parallelism)); + return this; + } + + public Builder withMetadataIndexColumnStats(boolean enable) { + metadataConfig.setValue(ENABLE_METADATA_INDEX_COLUMN_STATS, String.valueOf(enable)); + return this; + } + + public Builder withMetadataIndexColumnStatsFileGroupCount(int fileGroupCount) { + metadataConfig.setValue(METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT, String.valueOf(fileGroupCount)); + return this; + } + + public Builder withColumnStatsIndexParallelism(int parallelism) { + metadataConfig.setValue(COLUMN_STATS_INDEX_PARALLELISM, String.valueOf(parallelism)); + return this; + } + + public Builder withColumnStatsIndexForColumns(String columns) { + metadataConfig.setValue(COLUMN_STATS_INDEX_FOR_COLUMNS, columns); + return this; + } + + public Builder withBloomFilterIndexForColumns(String columns) { + metadataConfig.setValue(BLOOM_FILTER_INDEX_FOR_COLUMNS, columns); + return this; + } + + public Builder withIndexingCheckTimeout(int timeoutInSeconds) { + metadataConfig.setValue(METADATA_INDEX_CHECK_TIMEOUT_SECONDS, String.valueOf(timeoutInSeconds)); + return this; + } + + public Builder enableMetrics(boolean enableMetrics) { + metadataConfig.setValue(METRICS_ENABLE, String.valueOf(enableMetrics)); + return this; + } + + public Builder withInsertParallelism(int parallelism) { + metadataConfig.setValue(INSERT_PARALLELISM_VALUE, String.valueOf(parallelism)); + return this; + } + + public Builder withAsyncClean(boolean asyncClean) { + metadataConfig.setValue(ASYNC_CLEAN_ENABLE, String.valueOf(asyncClean)); + return this; + } + + public Builder withAsyncIndex(boolean asyncIndex) { + metadataConfig.setValue(ASYNC_INDEX_ENABLE, String.valueOf(asyncIndex)); + return this; + } + + public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) { + metadataConfig.setValue(COMPACT_NUM_DELTA_COMMITS, String.valueOf(maxNumDeltaCommitsBeforeCompaction)); + return this; + } + + public Builder withPopulateMetaFields(boolean populateMetaFields) { + 
metadataConfig.setValue(POPULATE_META_FIELDS, Boolean.toString(populateMetaFields)); + return this; + } + + public Builder archiveCommitsWith(int minToKeep, int maxToKeep) { + metadataConfig.setValue(MIN_COMMITS_TO_KEEP, String.valueOf(minToKeep)); + metadataConfig.setValue(MAX_COMMITS_TO_KEEP, String.valueOf(maxToKeep)); + return this; + } + + public Builder retainCommits(int commitsRetained) { + metadataConfig.setValue(CLEANER_COMMITS_RETAINED, String.valueOf(commitsRetained)); + return this; + } + + public Builder withFileListingParallelism(int parallelism) { + metadataConfig.setValue(FILE_LISTING_PARALLELISM_VALUE, String.valueOf(parallelism)); + return this; + } + + public Builder withAssumeDatePartitioning(boolean assumeDatePartitioning) { + metadataConfig.setValue(ASSUME_DATE_PARTITIONING, String.valueOf(assumeDatePartitioning)); + return this; + } + + public Builder withDirectoryFilterRegex(String regex) { + metadataConfig.setValue(DIR_FILTER_REGEX, regex); + return this; + } + + public Builder enableFullScan(boolean enableFullScan) { + metadataConfig.setValue(ENABLE_FULL_SCAN_LOG_FILES, String.valueOf(enableFullScan)); + return this; + } + + public Builder ignoreSpuriousDeletes(boolean validateMetadataPayloadConsistency) { + metadataConfig.setValue(IGNORE_SPURIOUS_DELETES, String.valueOf(validateMetadataPayloadConsistency)); + return this; + } + + public Builder withEngineType(EngineType engineType) { + this.engineType = engineType; + return this; + } + + public Builder withProperties(Properties properties) { + this.metadataConfig.getProps().putAll(properties); + return this; + } + + public HoodieMetadataConfig build() { + metadataConfig.setDefaultValue(ENABLE, getDefaultMetadataEnable(engineType)); + metadataConfig.setDefaults(HoodieMetadataConfig.class.getName()); + return metadataConfig; + } + + private boolean getDefaultMetadataEnable(EngineType engineType) { + switch (engineType) { + case FLINK: + case SPARK: + return ENABLE.defaultValue(); + case JAVA: + return false; + default: + throw new HoodieNotSupportedException("Unsupported engine " + engineType); + } + } + } + + /** + * @deprecated Use {@link #ENABLE} and its methods. + */ + @Deprecated + public static final String METADATA_ENABLE_PROP = ENABLE.key(); + /** + * @deprecated Use {@link #ENABLE} and its methods. + */ + @Deprecated + public static final boolean DEFAULT_METADATA_ENABLE = ENABLE.defaultValue(); + + /** + * @deprecated Use {@link #METRICS_ENABLE} and its methods. + */ + @Deprecated + public static final String METADATA_METRICS_ENABLE_PROP = METRICS_ENABLE.key(); + /** + * @deprecated Use {@link #METRICS_ENABLE} and its methods. + */ + @Deprecated + public static final boolean DEFAULT_METADATA_METRICS_ENABLE = METRICS_ENABLE.defaultValue(); + + /** + * @deprecated Use {@link #INSERT_PARALLELISM_VALUE} and its methods. + */ + @Deprecated + public static final String METADATA_INSERT_PARALLELISM_PROP = INSERT_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #INSERT_PARALLELISM_VALUE} and its methods. + */ + @Deprecated + public static final int DEFAULT_METADATA_INSERT_PARALLELISM = INSERT_PARALLELISM_VALUE.defaultValue(); + + /** + * @deprecated Use {@link #ASYNC_CLEAN_ENABLE} and its methods. + */ + @Deprecated + public static final String METADATA_ASYNC_CLEAN_PROP = ASYNC_CLEAN_ENABLE.key(); + /** + * @deprecated Use {@link #ASYNC_CLEAN_ENABLE} and its methods. 
+ */ + @Deprecated + public static final boolean DEFAULT_METADATA_ASYNC_CLEAN = ASYNC_CLEAN_ENABLE.defaultValue(); + + /** + * @deprecated Use {@link #COMPACT_NUM_DELTA_COMMITS} and its methods. + */ + @Deprecated + public static final String METADATA_COMPACT_NUM_DELTA_COMMITS_PROP = COMPACT_NUM_DELTA_COMMITS.key(); + /** + * @deprecated Use {@link #COMPACT_NUM_DELTA_COMMITS} and its methods. + */ + @Deprecated + public static final int DEFAULT_METADATA_COMPACT_NUM_DELTA_COMMITS = COMPACT_NUM_DELTA_COMMITS.defaultValue(); + + /** + * @deprecated Use {@link #MIN_COMMITS_TO_KEEP} and its methods. + */ + @Deprecated + public static final String MIN_COMMITS_TO_KEEP_PROP = MIN_COMMITS_TO_KEEP.key(); + /** + * @deprecated Use {@link #MIN_COMMITS_TO_KEEP} and its methods. + */ + @Deprecated + public static final int DEFAULT_MIN_COMMITS_TO_KEEP = MIN_COMMITS_TO_KEEP.defaultValue(); + /** + * @deprecated Use {@link #MAX_COMMITS_TO_KEEP} and its methods. + */ + @Deprecated + public static final String MAX_COMMITS_TO_KEEP_PROP = MAX_COMMITS_TO_KEEP.key(); + /** + * @deprecated Use {@link #MAX_COMMITS_TO_KEEP} and its methods. + */ + @Deprecated + public static final int DEFAULT_MAX_COMMITS_TO_KEEP = MAX_COMMITS_TO_KEEP.defaultValue(); + /** + * @deprecated Use {@link #CLEANER_COMMITS_RETAINED} and its methods. + */ + @Deprecated + public static final String CLEANER_COMMITS_RETAINED_PROP = CLEANER_COMMITS_RETAINED.key(); + /** + * @deprecated Use {@link #CLEANER_COMMITS_RETAINED} and its methods. + */ + @Deprecated + public static final int DEFAULT_CLEANER_COMMITS_RETAINED = CLEANER_COMMITS_RETAINED.defaultValue(); + /** + * @deprecated No longer takes any effect. + */ + @Deprecated + public static final String ENABLE_FALLBACK_PROP = METADATA_PREFIX + ".fallback.enable"; + /** + * @deprecated No longer takes any effect. + */ + @Deprecated + public static final String DEFAULT_ENABLE_FALLBACK = "true"; + /** + * @deprecated Use {@link #DIR_FILTER_REGEX} and its methods. + */ + @Deprecated + public static final String DIRECTORY_FILTER_REGEX = DIR_FILTER_REGEX.key(); + /** + * @deprecated Use {@link #DIR_FILTER_REGEX} and its methods. + */ + @Deprecated + public static final String DEFAULT_DIRECTORY_FILTER_REGEX = DIR_FILTER_REGEX.defaultValue(); + /** + * @deprecated Use {@link #ASSUME_DATE_PARTITIONING} and its methods. + */ + @Deprecated + public static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = ASSUME_DATE_PARTITIONING.key(); + /** + * @deprecated Use {@link #ASSUME_DATE_PARTITIONING} and its methods. + */ + @Deprecated + public static final String DEFAULT_ASSUME_DATE_PARTITIONING = ASSUME_DATE_PARTITIONING.defaultValue(); + /** + * @deprecated Use {@link #FILE_LISTING_PARALLELISM_VALUE} and its methods. + */ + @Deprecated + public static final String FILE_LISTING_PARALLELISM_PROP = FILE_LISTING_PARALLELISM_VALUE.key(); + /** + * @deprecated Use {@link #FILE_LISTING_PARALLELISM_VALUE} and its methods. + */ + @Deprecated + public static final int DEFAULT_FILE_LISTING_PARALLELISM = FILE_LISTING_PARALLELISM_VALUE.defaultValue(); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetastoreConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetastoreConfig.java new file mode 100644 index 0000000000000..36e2798a4d32a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetastoreConfig.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +import javax.annotation.concurrent.Immutable; +import java.util.Properties; + +/** + * Configurations used by the HUDI Metastore. + */ +@Immutable +@ConfigClassProperty(name = "Metastore Configs", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations used by the Hudi Metastore.") +public class HoodieMetastoreConfig extends HoodieConfig { + + public static final String METASTORE_PREFIX = "hoodie.metastore"; + + public static final ConfigProperty METASTORE_ENABLE = ConfigProperty + .key(METASTORE_PREFIX + ".enable") + .defaultValue(false) + .withDocumentation("Use metastore server to store hoodie table metadata"); + + public static final ConfigProperty METASTORE_URLS = ConfigProperty + .key(METASTORE_PREFIX + ".uris") + .defaultValue("thrift://localhost:9090") + .withDocumentation("Metastore server uris"); + + public static final ConfigProperty METASTORE_CONNECTION_RETRIES = ConfigProperty + .key(METASTORE_PREFIX + ".connect.retries") + .defaultValue(3) + .withDocumentation("Number of retries while opening a connection to metastore"); + + public static final ConfigProperty METASTORE_CONNECTION_RETRY_DELAY = ConfigProperty + .key(METASTORE_PREFIX + ".connect.retry.delay") + .defaultValue(1) + .withDocumentation("Number of seconds for the client to wait between consecutive connection attempts"); + + public static HoodieMetastoreConfig.Builder newBuilder() { + return new HoodieMetastoreConfig.Builder(); + } + + public boolean enableMetastore() { + return getBoolean(METASTORE_ENABLE); + } + + public String getMetastoreUris() { + return getStringOrDefault(METASTORE_URLS); + } + + public int getConnectionRetryLimit() { + return getIntOrDefault(METASTORE_CONNECTION_RETRIES); + } + + public int getConnectionRetryDelay() { + return getIntOrDefault(METASTORE_CONNECTION_RETRY_DELAY); + } + + public static class Builder { + private final HoodieMetastoreConfig config = new HoodieMetastoreConfig(); + + public Builder fromProperties(Properties props) { + this.config.getProps().putAll(props); + return this; + } + + public Builder setUris(String uris) { + config.setValue(METASTORE_URLS, uris); + return this; + } + + public HoodieMetastoreConfig build() { + config.setDefaults(HoodieMetastoreConfig.class.getName()); + return config; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/LockConfiguration.java b/hudi-common/src/main/java/org/apache/hudi/common/config/LockConfiguration.java new file mode 100644 index 0000000000000..c6ebc54e95d78 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/LockConfiguration.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +import java.io.Serializable; +import java.util.Properties; + +/** + * Configuration for managing locks. Since this configuration needs to be shared with HiveMetaStore based lock, + * which is in a different package than other lock providers, we use this as a data transfer object in hoodie-common + */ +public class LockConfiguration implements Serializable { + + public static final String LOCK_PREFIX = "hoodie.write.lock."; + + public static final String LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY = LOCK_PREFIX + "wait_time_ms_between_retry"; + public static final String DEFAULT_LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS = String.valueOf(1000L); + + public static final String LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY = LOCK_PREFIX + "max_wait_time_ms_between_retry"; + + public static final String LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY = LOCK_PREFIX + "client.wait_time_ms_between_retry"; + + public static final String LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY = LOCK_PREFIX + "num_retries"; + public static final String DEFAULT_LOCK_ACQUIRE_NUM_RETRIES = String.valueOf(15); + + public static final String LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY = LOCK_PREFIX + "client.num_retries"; + + public static final String LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY = LOCK_PREFIX + "wait_time_ms"; + + // configs for file system based locks. 
NOTE: This only works for DFS with atomic create/delete operation + public static final String FILESYSTEM_BASED_LOCK_PROPERTY_PREFIX = LOCK_PREFIX + "filesystem."; + + public static final String FILESYSTEM_LOCK_PATH_PROP_KEY = FILESYSTEM_BASED_LOCK_PROPERTY_PREFIX + "path"; + + public static final String FILESYSTEM_LOCK_EXPIRE_PROP_KEY = FILESYSTEM_BASED_LOCK_PROPERTY_PREFIX + "expire"; + + // configs for metastore based locks + public static final String HIVE_METASTORE_LOCK_PROPERTY_PREFIX = LOCK_PREFIX + "hivemetastore."; + + public static final String HIVE_DATABASE_NAME_PROP_KEY = HIVE_METASTORE_LOCK_PROPERTY_PREFIX + "database"; + + public static final String HIVE_TABLE_NAME_PROP_KEY = HIVE_METASTORE_LOCK_PROPERTY_PREFIX + "table"; + + public static final String HIVE_METASTORE_URI_PROP_KEY = HIVE_METASTORE_LOCK_PROPERTY_PREFIX + "uris"; + + // Zookeeper configs for zk based locks + public static final String ZOOKEEPER_BASED_LOCK_PROPERTY_PREFIX = LOCK_PREFIX + "zookeeper."; + + public static final String ZK_BASE_PATH_PROP_KEY = ZOOKEEPER_BASED_LOCK_PROPERTY_PREFIX + "base_path"; + + public static final String ZK_SESSION_TIMEOUT_MS_PROP_KEY = ZOOKEEPER_BASED_LOCK_PROPERTY_PREFIX + "session_timeout_ms"; + public static final int DEFAULT_ZK_SESSION_TIMEOUT_MS = 60 * 1000; + + public static final String ZK_CONNECTION_TIMEOUT_MS_PROP_KEY = ZOOKEEPER_BASED_LOCK_PROPERTY_PREFIX + "connection_timeout_ms"; + public static final int DEFAULT_ZK_CONNECTION_TIMEOUT_MS = 15 * 1000; + + public static final String ZK_CONNECT_URL_PROP_KEY = ZOOKEEPER_BASED_LOCK_PROPERTY_PREFIX + "url"; + + public static final String ZK_PORT_PROP_KEY = ZOOKEEPER_BASED_LOCK_PROPERTY_PREFIX + "port"; + + public static final String ZK_LOCK_KEY_PROP_KEY = ZOOKEEPER_BASED_LOCK_PROPERTY_PREFIX + "lock_key"; + + /** @deprecated Use {@link #LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY} */ + @Deprecated + public static final String LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP = LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; + /** @deprecated Use {@link #LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY} */ + @Deprecated + public static final String LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP = LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY; + @Deprecated + public static final String DEFAULT_LOCK_ACQUIRE_MAX_RETRY_WAIT_TIME_IN_MILLIS = String.valueOf(5000L); + /** @deprecated Use {@link #LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY} */ + @Deprecated + public static final String LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP = LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY; + @Deprecated + public static final String DEFAULT_LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS = String.valueOf(10000L); + /** @deprecated Use {@link #LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY} */ + @Deprecated + public static final String LOCK_ACQUIRE_NUM_RETRIES_PROP = LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY; + /** @deprecated Use {@link #LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY} */ + @Deprecated + public static final String LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP = LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY; + @Deprecated + public static final String DEFAULT_LOCK_ACQUIRE_CLIENT_NUM_RETRIES = String.valueOf(0); + /** @deprecated Use {@link #LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY} */ + @Deprecated + public static final String LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP = LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY; + @Deprecated + public static final int DEFAULT_ACQUIRE_LOCK_WAIT_TIMEOUT_MS = 60 * 1000; + /** @deprecated Use {@link 
#HIVE_DATABASE_NAME_PROP_KEY} */ + @Deprecated + public static final String HIVE_DATABASE_NAME_PROP = HIVE_DATABASE_NAME_PROP_KEY; + /** @deprecated Use {@link #HIVE_TABLE_NAME_PROP_KEY} */ + @Deprecated + public static final String HIVE_TABLE_NAME_PROP = HIVE_TABLE_NAME_PROP_KEY; + /** @deprecated Use {@link #HIVE_METASTORE_URI_PROP_KEY} */ + @Deprecated + public static final String HIVE_METASTORE_URI_PROP = HIVE_METASTORE_URI_PROP_KEY; + /** @deprecated Use {@link #ZK_BASE_PATH_PROP_KEY} */ + @Deprecated + public static final String ZK_BASE_PATH_PROP = ZK_BASE_PATH_PROP_KEY; + /** @deprecated Use {@link #ZK_SESSION_TIMEOUT_MS_PROP_KEY} */ + @Deprecated + public static final String ZK_SESSION_TIMEOUT_MS_PROP = ZK_SESSION_TIMEOUT_MS_PROP_KEY; + /** @deprecated Use {@link #ZK_CONNECTION_TIMEOUT_MS_PROP_KEY} */ + @Deprecated + public static final String ZK_CONNECTION_TIMEOUT_MS_PROP = ZK_CONNECTION_TIMEOUT_MS_PROP_KEY; + /** @deprecated Use {@link #ZK_CONNECT_URL_PROP_KEY} */ + @Deprecated + public static final String ZK_CONNECT_URL_PROP = ZK_CONNECT_URL_PROP_KEY; + /** @deprecated Use {@link #ZK_PORT_PROP_KEY} */ + @Deprecated + public static final String ZK_PORT_PROP = ZK_PORT_PROP_KEY; + /** @deprecated Use {@link #ZK_LOCK_KEY_PROP_KEY} */ + @Deprecated + public static final String ZK_LOCK_KEY_PROP = ZK_LOCK_KEY_PROP_KEY; + + private final TypedProperties props; + + public LockConfiguration(Properties props) { + this.props = new TypedProperties(props); + } + + public TypedProperties getConfig() { + return props; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/OrderedProperties.java b/hudi-common/src/main/java/org/apache/hudi/common/config/OrderedProperties.java new file mode 100644 index 0000000000000..fa2a61574a84e --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/OrderedProperties.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.config; + +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.Set; + +/** + * An extension of {@link java.util.Properties} that maintains the order. + * The implementation is not thread-safe. 
+ */ +public class OrderedProperties extends Properties { + + private final HashSet keys = new LinkedHashSet<>(); + + public OrderedProperties() { + super(null); + } + + public OrderedProperties(Properties defaults) { + if (Objects.nonNull(defaults)) { + for (String key : defaults.stringPropertyNames()) { + put(key, defaults.getProperty(key)); + } + } + } + + @Override + public Enumeration propertyNames() { + return Collections.enumeration(keys); + } + + @Override + public synchronized Enumeration keys() { + return Collections.enumeration(keys); + } + + @Override + public Set stringPropertyNames() { + Set set = new LinkedHashSet<>(); + for (Object key : this.keys) { + if (key instanceof String) { + set.add((String) key); + } + } + return set; + } + + public synchronized void putAll(Properties t) { + for (Map.Entry e : t.entrySet()) { + if (!containsKey(String.valueOf(e.getKey()))) { + keys.add(e.getKey()); + } + super.put(e.getKey(), e.getValue()); + } + } + + @Override + public synchronized Object put(Object key, Object value) { + keys.remove(key); + keys.add(key); + return super.put(key, value); + } + + public synchronized Object putIfAbsent(Object key, Object value) { + if (!containsKey(String.valueOf(key))) { + keys.add(key); + } + return super.putIfAbsent(key, value); + } + + @Override + public Object remove(Object key) { + keys.remove(key); + return super.remove(key); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/SerializableSchema.java b/hudi-common/src/main/java/org/apache/hudi/common/config/SerializableSchema.java new file mode 100644 index 0000000000000..4f6de8ba5f3c3 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/SerializableSchema.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.config; + +import org.apache.avro.Schema; + +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; + +/** + * A wrapped Schema which can be serialized. 
+ */ +public class SerializableSchema implements Serializable { + + private transient Schema schema; + + public SerializableSchema() { + } + + public SerializableSchema(Schema schema) { + this.schema = newCopy(schema); + } + + public SerializableSchema(SerializableSchema serializableSchema) { + this(serializableSchema.schema); + } + + public static Schema newCopy(Schema schemaObject) { + return new Schema.Parser().parse(schemaObject.toString()); + } + + public Schema get() { + return schema; + } + + private void writeObject(ObjectOutputStream out) throws IOException { + out.defaultWriteObject(); + writeObjectTo(out); + } + + private void readObject(ObjectInputStream in) throws IOException { + readObjectFrom(in); + } + + // create a public write method for unit test + public void writeObjectTo(ObjectOutputStream out) throws IOException { + // Note: writeUTF cannot support string length > 64K. So use writeObject which has small overhead (relatively). + out.writeObject(schema.toString()); + } + + // create a public read method for unit test + public void readObjectFrom(ObjectInputStream in) throws IOException { + try { + schema = new Schema.Parser().parse(in.readObject().toString()); + } catch (ClassNotFoundException e) { + throw new IOException("unable to parse schema", e); + } + } + + @Override + public String toString() { + return schema.toString(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java b/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java index c780ded740e24..f246b75be7aeb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java @@ -18,11 +18,14 @@ package org.apache.hudi.common.config; +import org.apache.hudi.common.util.StringUtils; + import java.io.Serializable; import java.util.Arrays; +import java.util.Enumeration; import java.util.List; +import java.util.Objects; import java.util.Properties; -import java.util.Set; import java.util.stream.Collectors; /** @@ -35,21 +38,34 @@ public TypedProperties() { } public TypedProperties(Properties defaults) { - super(defaults); + if (Objects.nonNull(defaults)) { + for (Enumeration e = defaults.propertyNames(); e.hasMoreElements(); ) { + Object k = e.nextElement(); + Object v = defaults.get(k); + if (v != null) { + put(k, v); + } + } + } } - private void checkKey(String property) { - if (!keyExists(property)) { - throw new IllegalArgumentException("Property " + property + " not found"); + public void setPropertyIfNonNull(String key, Object value) { + if (value != null) { + setProperty(key, value.toString()); } } - private boolean keyExists(String property) { - Set keys = super.stringPropertyNames(); - if (keys.contains(property)) { - return true; + @Override + public String getProperty(String key) { + Object oval = super.get(key); + String sval = (oval != null) ? String.valueOf(oval) : null; + return ((sval == null) && (defaults != null)) ? defaults.getProperty(key) : sval; + } + + private void checkKey(String property) { + if (!containsKey(property)) { + throw new IllegalArgumentException("Property " + property + " not found"); } - return false; } public String getString(String property) { @@ -58,14 +74,14 @@ public String getString(String property) { } public String getString(String property, String defaultValue) { - return keyExists(property) ? getProperty(property) : defaultValue; + return containsKey(property) ? 
getProperty(property) : defaultValue; } public List getStringList(String property, String delimiter, List defaultVal) { - if (!keyExists(property)) { + if (!containsKey(property)) { return defaultVal; } - return Arrays.stream(getProperty(property).split(delimiter)).map(String::trim).collect(Collectors.toList()); + return Arrays.stream(getProperty(property).split(delimiter)).map(String::trim).filter(s -> !StringUtils.isNullOrEmpty(s)).collect(Collectors.toList()); } public int getInteger(String property) { @@ -74,7 +90,7 @@ public int getInteger(String property) { } public int getInteger(String property, int defaultValue) { - return keyExists(property) ? Integer.parseInt(getProperty(property)) : defaultValue; + return containsKey(property) ? Integer.parseInt(getProperty(property)) : defaultValue; } public long getLong(String property) { @@ -83,7 +99,7 @@ public long getLong(String property) { } public long getLong(String property, long defaultValue) { - return keyExists(property) ? Long.parseLong(getProperty(property)) : defaultValue; + return containsKey(property) ? Long.parseLong(getProperty(property)) : defaultValue; } public boolean getBoolean(String property) { @@ -92,7 +108,7 @@ public boolean getBoolean(String property) { } public boolean getBoolean(String property, boolean defaultValue) { - return keyExists(property) ? Boolean.parseBoolean(getProperty(property)) : defaultValue; + return containsKey(property) ? Boolean.parseBoolean(getProperty(property)) : defaultValue; } public double getDouble(String property) { @@ -101,6 +117,6 @@ public double getDouble(String property) { } public double getDouble(String property, double defaultValue) { - return keyExists(property) ? Double.parseDouble(getProperty(property)) : defaultValue; + return containsKey(property) ? Double.parseDouble(getProperty(property)) : defaultValue; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java new file mode 100644 index 0000000000000..61fb98e1acc25 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import java.io.Serializable; + +/** + * An abstraction for accumulator on counts. + */ +public abstract class HoodieAccumulator implements Serializable { + /** + * @return the count. + */ + public abstract long value(); + + /** + * Increments the count based on the input. + * + * @param increment the value to add. 
+ */ + public abstract void add(long increment); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAtomicLongAccumulator.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAtomicLongAccumulator.java new file mode 100644 index 0000000000000..3ace1c7a4a099 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAtomicLongAccumulator.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * An accumulator on counts based on {@link AtomicLong} implementation. + */ +public class HoodieAtomicLongAccumulator extends HoodieAccumulator { + + private final AtomicLong accumulator; + + private HoodieAtomicLongAccumulator() { + accumulator = new AtomicLong(0L); + } + + public static HoodieAtomicLongAccumulator create() { + return new HoodieAtomicLongAccumulator(); + } + + @Override + public long value() { + return accumulator.get(); + } + + @Override + public void add(long increment) { + accumulator.addAndGet(increment); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieBaseListData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieBaseListData.java new file mode 100644 index 0000000000000..398762dc63070 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieBaseListData.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.util.Either; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public abstract class HoodieBaseListData { + + protected final Either, List> data; + protected final boolean lazy; + + protected HoodieBaseListData(List data, boolean lazy) { + this.data = lazy ? 
Either.left(data.stream().parallel()) : Either.right(data); + this.lazy = lazy; + } + + protected HoodieBaseListData(Stream dataStream, boolean lazy) { + // NOTE: In case this container is being instantiated by an eager parent, we have to + // pre-materialize the stream + this.data = lazy ? Either.left(dataStream) : Either.right(dataStream.collect(Collectors.toList())); + this.lazy = lazy; + } + + protected Stream asStream() { + return lazy ? data.asLeft() : data.asRight().parallelStream(); + } + + protected boolean isEmpty() { + if (lazy) { + return data.asLeft().findAny().isPresent(); + } else { + return data.asRight().isEmpty(); + } + } + + protected long count() { + if (lazy) { + return data.asLeft().count(); + } else { + return data.asRight().size(); + } + } + + protected List collectAsList() { + if (lazy) { + return data.asLeft().collect(Collectors.toList()); + } else { + return data.asRight(); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieData.java new file mode 100644 index 0000000000000..1d56e63fad928 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieData.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.collection.Pair; + +import java.io.Serializable; +import java.util.Iterator; +import java.util.List; + +/** + * An interface abstracting a container holding a collection of objects of type {@code T} + * allowing to perform common transformation on it. + * + * This abstraction provides common API implemented by + *
+ * <ol>
+ *   <li>In-memory implementation ({@code HoodieListData}, {@code HoodieListPairData}), where all objects
+ *   are held in-memory by the executing process</li>
+ *   <li>RDD-based implementation ({@code HoodieJavaRDD}, etc), where underlying collection is held
+ *   by an RDD allowing to execute transformations using Spark engine on the cluster</li>
+ * </ol>
+ *
+ * All implementations provide for consistent semantic, where
+ * <ul>
+ *   <li>All non-terminal* operations are executed lazily (for ex, {@code map}, {@code filter}, etc)</li>
+ *   <li>All terminal operations are executed eagerly, executing all previously accumulated transformations.
+ *   Note that, collection could not be re-used after invoking terminal operation on it.</li>
+ * </ul>
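For illustration only (not part of the patch): a minimal sketch of the intermediate/terminal contract listed above, using the in-memory `HoodieListData` container introduced in this change. The class name and sample values are invented.

```java
// Hypothetical usage sketch; only HoodieListData/HoodieData APIs from this patch are assumed.
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.data.HoodieListData;

import java.util.Arrays;
import java.util.List;

public class HoodieDataSemanticsSketch {
  public static void main(String[] args) {
    List<Integer> source = Arrays.asList(1, 2, 3, 4);

    // Intermediate operations (map, filter) are only stacked up on a lazy container...
    HoodieData<Integer> transformed = HoodieListData.lazy(source)
        .map(i -> i * 10)
        .filter(i -> i > 10);

    // ...and run when a terminal operation such as collectAsList() is invoked.
    System.out.println(transformed.collectAsList()); // [20, 30, 40]

    // An eager container materializes results right away and can be de-referenced repeatedly.
    HoodieData<Integer> eager = HoodieListData.eager(source).map(i -> i + 1);
    System.out.println(eager.count());         // 4
    System.out.println(eager.collectAsList()); // [2, 3, 4]
  }
}
```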
    + * + * @param type of object + */ +public interface HoodieData extends Serializable { + + /** + * Persists the data w/ provided {@code level} (if applicable) + */ + void persist(String level); + + /** + * Un-persists the data (if previously persisted) + */ + void unpersist(); + + /** + * Returns whether the collection is empty. + */ + boolean isEmpty(); + + /** + * Returns number of objects held in the collection + *

    + * NOTE: This is a terminal operation + */ + long count(); + + /** + * @return the number of data partitions in the engine-specific representation. + */ + int getNumPartitions(); + + /** + * Maps every element in the collection using provided mapping {@code func}. + *

    + * This is an intermediate operation + * + * @param func serializable map function + * @param output object type + * @return {@link HoodieData} holding mapped elements + */ + HoodieData map(SerializableFunction func); + + /** + * Maps every element in the collection's partition (if applicable) by applying provided + * mapping {@code func} to every collection's partition + * + * This is an intermediate operation + * + * @param func serializable map function accepting {@link Iterator} of a single + * partition's elements and returning a new {@link Iterator} mapping + * every element of the partition into a new one + * @param preservesPartitioning whether to preserve partitioning in the resulting collection + * @param output object type + * @return {@link HoodieData} holding mapped elements + */ + HoodieData mapPartitions(SerializableFunction, + Iterator> func, boolean preservesPartitioning); + + /** + * Maps every element in the collection into a collection of the new elements (provided by + * {@link Iterator}) using provided mapping {@code func}, subsequently flattening the result + * (by concatenating) into a single collection + * + * This is an intermediate operation + * + * @param func serializable function mapping every element {@link T} into {@code Iterator} + * @param output object type + * @return {@link HoodieData} holding mapped elements + */ + HoodieData flatMap(SerializableFunction> func); + + /** + * Maps every element in the collection using provided mapping {@code func} into a {@link Pair} + * of elements {@code K} and {@code V} + *

    + * This is an intermediate operation + * + * @param func serializable map function + * @param key type of the pair + * @param value type of the pair + * @return {@link HoodiePairData} holding mapped elements + */ + HoodiePairData mapToPair(SerializablePairFunction func); + + /** + * Returns new {@link HoodieData} collection holding only distinct objects of the original one + * + * This is a stateful intermediate operation + */ + HoodieData distinct(); + + /** + * Returns new {@link HoodieData} collection holding only distinct objects of the original one + * + * This is a stateful intermediate operation + */ + HoodieData distinct(int parallelism); + + /** + * Returns new instance of {@link HoodieData} collection only containing elements matching provided + * {@code filterFunc} (ie ones it returns true on) + * + * @param filterFunc filtering func either accepting or rejecting the elements + * @return {@link HoodieData} holding filtered elements + */ + HoodieData filter(SerializableFunction filterFunc); + + /** + * Unions {@link HoodieData} with another instance of {@link HoodieData}. + * Note that, it's only able to union same underlying collection implementations. + * + * This is a stateful intermediate operation + * + * @param other {@link HoodieData} collection + * @return {@link HoodieData} holding superset of elements of this and {@code other} collections + */ + HoodieData union(HoodieData other); + + /** + * Collects results of the underlying collection into a {@link List} + * + * This is a terminal operation + */ + List collectAsList(); + + /** + * Re-partitions underlying collection (if applicable) making sure new {@link HoodieData} has + * exactly {@code parallelism} partitions + * + * @param parallelism target number of partitions in the underlying collection + * @return {@link HoodieData} holding re-partitioned collection + */ + HoodieData repartition(int parallelism); + + default HoodieData distinctWithKey(SerializableFunction keyGetter, int parallelism) { + return mapToPair(i -> Pair.of(keyGetter.apply(i), i)) + .reduceByKey((value1, value2) -> value1, parallelism) + .values(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieListData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieListData.java new file mode 100644 index 0000000000000..b2a503a85b323 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieListData.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.Iterator; +import java.util.List; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.function.Function; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; + +/** + * In-memory implementation of {@link HoodieData} holding internally a {@link Stream} of objects. + * + * {@link HoodieListData} can have either of the 2 execution semantics: + * + *

+ * <ol>
+ *   <li>Eager: with every operation being executed right away</li>
+ *   <li>Lazy: with every operation being "stacked up", with it execution postponed until
+ *   "terminal" operation is invoked</li>
+ * </ol>
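As a hypothetical illustration of the eager/lazy split listed above (and the re-use caveat noted just below), assuming only the factory methods added in this file:

```java
// Sketch only: a lazy HoodieListData wraps a one-shot Java Stream, so it can be
// de-referenced exactly once; an eager one is backed by the List and can be re-read.
import org.apache.hudi.common.data.HoodieListData;

import java.util.Arrays;

public class HoodieListDataReuseSketch {
  public static void main(String[] args) {
    HoodieListData<Integer> eager = HoodieListData.eager(Arrays.asList(1, 2, 3));
    eager.collectAsList(); // ok
    eager.collectAsList(); // still ok: the backing List is streamed again on each call

    HoodieListData<Integer> lazy = HoodieListData.lazy(Arrays.asList(1, 2, 3));
    lazy.collectAsList();  // ok: the first terminal operation consumes the stream
    // lazy.collectAsList(); // would fail: the underlying Stream has already been operated upon
  }
}
```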
    + * + * NOTE: This is an in-memory counterpart for {@code HoodieJavaRDD}, and it strives to provide + * similar semantic as RDD container -- all intermediate (non-terminal, not de-referencing + * the stream like "collect", "groupBy", etc) operations are executed *lazily*. + * This allows to make sure that compute/memory churn is minimal since only necessary + * computations will ultimately be performed. + * + * Please note, however, that while RDD container allows the same collection to be + * de-referenced more than once (ie terminal operation invoked more than once), + * {@link HoodieListData} allows that only when instantiated w/ an eager execution semantic. + * + * @param type of object. + */ +public class HoodieListData extends HoodieBaseListData implements HoodieData { + + private HoodieListData(List data, boolean lazy) { + super(data, lazy); + } + + HoodieListData(Stream dataStream, boolean lazy) { + super(dataStream, lazy); + } + + /** + * Creates instance of {@link HoodieListData} bearing *eager* execution semantic + * + * @param listData a {@link List} of objects in type T + * @param type of object + * @return a new instance containing the {@link List} reference + */ + public static HoodieListData eager(List listData) { + return new HoodieListData<>(listData, false); + } + + /** + * Creates instance of {@link HoodieListData} bearing *lazy* execution semantic + * + * @param listData a {@link List} of objects in type T + * @param type of object + * @return a new instance containing the {@link List} reference + */ + public static HoodieListData lazy(List listData) { + return new HoodieListData<>(listData, true); + } + + @Override + public void persist(String level) { + // No OP + } + + @Override + public void unpersist() { + // No OP + } + + @Override + public HoodieData map(SerializableFunction func) { + return new HoodieListData<>(asStream().map(throwingMapWrapper(func)), lazy); + } + + @Override + public HoodieData mapPartitions(SerializableFunction, Iterator> func, boolean preservesPartitioning) { + Function, Iterator> mapper = throwingMapWrapper(func); + return new HoodieListData<>( + StreamSupport.stream( + Spliterators.spliteratorUnknownSize( + mapper.apply(asStream().iterator()), Spliterator.ORDERED), true), + lazy + ); + } + + @Override + public HoodieData flatMap(SerializableFunction> func) { + Function> mapper = throwingMapWrapper(func); + Stream mappedStream = asStream().flatMap(e -> + StreamSupport.stream( + Spliterators.spliteratorUnknownSize(mapper.apply(e), Spliterator.ORDERED), true)); + return new HoodieListData<>(mappedStream, lazy); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction func) { + Function> throwableMapToPairFunc = throwingMapToPairWrapper(func); + return new HoodieListPairData<>(asStream().map(throwableMapToPairFunc), lazy); + } + + @Override + public HoodieData distinct() { + return new HoodieListData<>(asStream().distinct(), lazy); + } + + @Override + public HoodieData distinct(int parallelism) { + return distinct(); + } + + @Override + public HoodieData distinctWithKey(SerializableFunction keyGetter, int parallelism) { + return mapToPair(i -> Pair.of(keyGetter.apply(i), i)) + .reduceByKey((value1, value2) -> value1, parallelism) + .values(); + } + + @Override + public HoodieData filter(SerializableFunction filterFunc) { + return new HoodieListData<>(asStream().filter(r -> throwingMapWrapper(filterFunc).apply(r)), lazy); + } + + @Override + public HoodieData union(HoodieData other) { + 
ValidationUtils.checkArgument(other instanceof HoodieListData); + return new HoodieListData<>(Stream.concat(asStream(), ((HoodieListData)other).asStream()), lazy); + } + + @Override + public HoodieData repartition(int parallelism) { + // no op + return this; + } + + @Override + public boolean isEmpty() { + return super.isEmpty(); + } + + @Override + public long count() { + return super.count(); + } + + @Override + public int getNumPartitions() { + return 1; + } + + @Override + public List collectAsList() { + return super.collectAsList(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieListPairData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieListPairData.java new file mode 100644 index 0000000000000..a389649548e98 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieListPairData.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collector; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; + +/** + * In-memory implementation of {@link HoodiePairData} holding internally a {@link Stream} of {@link Pair}s. + * + * {@link HoodieListData} can have either of the 2 execution semantics: + * + *
+ * <ol>
+ *   <li>Eager: with every operation being executed right away</li>
+ *   <li>Lazy: with every operation being "stacked up", with it execution postponed until
+ *   "terminal" operation is invoked</li>
+ * </ol>
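For illustration only: a small sketch of the key/value operations this class provides (reduceByKey, leftOuterJoin), using only APIs visible in this patch; the sample keys and values are invented.

```java
// Sketch only: invented keys/values exercising the in-memory pair container from this patch.
import org.apache.hudi.common.data.HoodieListPairData;
import org.apache.hudi.common.data.HoodiePairData;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

import java.util.Arrays;
import java.util.List;

public class HoodiePairDataSketch {
  public static void main(String[] args) {
    List<Pair<String, Integer>> counts = Arrays.asList(
        Pair.of("2021/01/01", 2), Pair.of("2021/01/01", 3), Pair.of("2021/01/02", 5));
    HoodiePairData<String, Integer> pairs = HoodieListPairData.eager(counts);

    // Values sharing a key are combined; the parallelism hint is ignored by the list-backed impl.
    System.out.println(pairs.reduceByKey(Integer::sum, 1).collectAsList());
    // e.g. [(2021/01/01,5), (2021/01/02,5)]

    // Left outer join keeps every left-side pair and wraps the right side in Option.
    HoodiePairData<String, String> labels =
        HoodieListPairData.eager(Arrays.asList(Pair.of("2021/01/01", "holiday")));
    List<Pair<String, Pair<Integer, Option<String>>>> joined = pairs.leftOuterJoin(labels).collectAsList();
    System.out.println(joined);
  }
}
```

Since the list-backed implementation groups through in-memory HashMaps, result ordering is not guaranteed.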
    + * + * + * NOTE: This is an in-memory counterpart for {@code HoodieJavaPairRDD}, and it strives to provide + * similar semantic as RDD container -- all intermediate (non-terminal, not de-referencing + * the stream like "collect", "groupBy", etc) operations are executed *lazily*. + * This allows to make sure that compute/memory churn is minimal since only necessary + * computations will ultimately be performed. + * + * Please note, however, that while RDD container allows the same collection to be + * de-referenced more than once (ie terminal operation invoked more than once), + * {@link HoodieListData} allows that only when instantiated w/ an eager execution semantic. + * + * @param type of the key in the pair + * @param type of the value in the pair + */ +public class HoodieListPairData extends HoodieBaseListData> implements HoodiePairData { + + private HoodieListPairData(List> data, boolean lazy) { + super(data, lazy); + } + + HoodieListPairData(Stream> dataStream, boolean lazy) { + super(dataStream, lazy); + } + + @Override + public List> get() { + return collectAsList(); + } + + @Override + public void persist(String cacheConfig) { + // no-op + } + + @Override + public void unpersist() { + // no-op + } + + @Override + public HoodieData keys() { + return new HoodieListData<>(asStream().map(Pair::getKey), lazy); + } + + @Override + public HoodieData values() { + return new HoodieListData<>(asStream().map(Pair::getValue), lazy); + } + + @Override + public Map countByKey() { + return asStream().collect(Collectors.groupingBy(Pair::getKey, Collectors.counting())); + } + + @Override + public HoodiePairData> groupByKey() { + Collector, ?, List> mappingCollector = Collectors.mapping(Pair::getValue, Collectors.toList()); + Collector, ?, Map>> groupingCollector = + Collectors.groupingBy(Pair::getKey, mappingCollector); + + Map> groupedByKey = asStream().collect(groupingCollector); + return new HoodieListPairData<>( + groupedByKey.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())), + lazy + ); + } + + @Override + public HoodiePairData reduceByKey(SerializableBiFunction combiner, int parallelism) { + Map> reducedMap = asStream().collect( + Collectors.groupingBy( + Pair::getKey, + HashMap::new, + Collectors.mapping(Pair::getValue, Collectors.reducing(combiner::apply)))); + + return new HoodieListPairData<>( + reducedMap.entrySet() + .stream() + .map(e -> Pair.of(e.getKey(), e.getValue().orElse(null))), + lazy + ); + } + + @Override + public HoodieData map(SerializableFunction, O> func) { + Function, O> uncheckedMapper = throwingMapWrapper(func); + return new HoodieListData<>(asStream().map(uncheckedMapper), lazy); + } + + @Override + public HoodiePairData mapToPair(SerializablePairFunction, L, W> mapToPairFunc) { + return new HoodieListPairData<>(asStream().map(p -> throwingMapToPairWrapper(mapToPairFunc).apply(p)), lazy); + } + + @Override + public HoodiePairData>> leftOuterJoin(HoodiePairData other) { + ValidationUtils.checkArgument(other instanceof HoodieListPairData); + + // Transform right-side container to a multi-map of [[K]] to [[List]] values + HashMap> rightStreamMap = ((HoodieListPairData) other).asStream().collect( + Collectors.groupingBy( + Pair::getKey, + HashMap::new, + Collectors.mapping(Pair::getValue, Collectors.toList()))); + + Stream>>> leftOuterJoined = asStream().flatMap(pair -> { + K key = pair.getKey(); + V leftValue = pair.getValue(); + List rightValues = rightStreamMap.get(key); + + if (rightValues == null) { + return Stream.of(Pair.of(key, 
Pair.of(leftValue, Option.empty()))); + } else { + return rightValues.stream().map(rightValue -> + Pair.of(key, Pair.of(leftValue, Option.of(rightValue)))); + } + }); + + return new HoodieListPairData<>(leftOuterJoined, lazy); + } + + @Override + public long count() { + return super.count(); + } + + @Override + public List> collectAsList() { + return super.collectAsList(); + } + + public static HoodieListPairData lazy(List> data) { + return new HoodieListPairData<>(data, true); + } + + public static HoodieListPairData eager(List> data) { + return new HoodieListPairData<>(data, false); + } + + public static HoodieListPairData lazy(Map> data) { + return new HoodieListPairData<>(explode(data), true); + } + + public static HoodieListPairData eager(Map> data) { + return new HoodieListPairData<>(explode(data), false); + } + + private static Stream> explode(Map> data) { + return data.entrySet().stream() + .flatMap(e -> e.getValue().stream().map(v -> Pair.of(e.getKey(), v))); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodiePairData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodiePairData.java new file mode 100644 index 0000000000000..49fa7174da9a6 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodiePairData.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; + +/** + * An abstraction for pairs of key in type K and value in type V to store the reference + * and do transformation. + * + * @param type of key. + * @param type of value. + */ +public interface HoodiePairData extends Serializable { + /** + * @return the collection of pairs. + */ + Object get(); + + /** + * Persists the data (if applicable) + * + * @param cacheConfig config value for caching. 
+ */ + void persist(String cacheConfig); + + /** + * Un-persists the data (if applicable) + */ + void unpersist(); + + /** + * Returns a {@link HoodieData} holding the key from every corresponding pair + */ + HoodieData keys(); + + /** + * Returns a {@link HoodieData} holding the value from every corresponding pair + */ + HoodieData values(); + + /** + * Returns number of held pairs + */ + long count(); + + /** + * Counts the number of pairs grouping them by key + */ + Map countByKey(); + + /** + * Groups the values for each key in the dataset into a single sequence + */ + HoodiePairData> groupByKey(); + + /** + * Reduces original sequence by de-duplicating the pairs w/ the same key, using provided + * binary operator {@code combiner}. Returns an instance of {@link HoodiePairData} holding + * the "de-duplicated" pairs, ie only pairs with unique keys. + * + * @param combiner method to combine values of the pairs with the same key + * @param parallelism target parallelism (if applicable) + */ + HoodiePairData reduceByKey(SerializableBiFunction combiner, int parallelism); + + /** + * @param func serializable map function. + * @param output object type. + * @return {@link HoodieData} containing the result. Actual execution may be deferred. + */ + HoodieData map(SerializableFunction, O> func); + + /** + * @param mapToPairFunc serializable map function to generate another pair. + * @param new key type. + * @param new value type. + * @return containing the result. Actual execution may be deferred. + */ + HoodiePairData mapToPair( + SerializablePairFunction, L, W> mapToPairFunc); + + /** + * Performs a left outer join of this dataset against {@code other}. + * + * For each element (k, v) in this, the resulting {@link HoodiePairData} will either contain all + * pairs {@code (k, (v, Some(w)))} for every {@code w} in the {@code other}, or the pair {@code (k, (v, None))} + * if no elements in {@code other} have the pair w/ a key {@code k} + * + * @param other the other {@link HoodiePairData} + * @param value type of the other {@link HoodiePairData} + * @return containing the result of the left outer join + */ + HoodiePairData>> leftOuterJoin(HoodiePairData other); + + /** + * Collects results of the underlying collection into a {@link List>} + * + * This is a terminal operation + */ + List> collectAsList(); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/EngineProperty.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/EngineProperty.java similarity index 91% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/EngineProperty.java rename to hudi-common/src/main/java/org/apache/hudi/common/engine/EngineProperty.java index aeaec32747780..36e7594937b04 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/EngineProperty.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/EngineProperty.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.client.common; +package org.apache.hudi.common.engine; /** * Properties specific to each engine, that can be set/obtained from. @@ -26,8 +26,10 @@ public enum EngineProperty { EMBEDDED_SERVER_HOST, // Pool/queue to use to run compaction. 
COMPACTION_POOL_NAME, + CLUSTERING_POOL_NAME, + TOTAL_CORES_PER_EXECUTOR, // Amount of total memory available to each engine executor TOTAL_MEMORY_AVAILABLE, // Fraction of that memory, that is already in use by the engine - MEMORY_FRACTION_IN_USE, + MEMORY_FRACTION_IN_USE } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/EngineType.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/EngineType.java similarity index 83% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/EngineType.java rename to hudi-common/src/main/java/org/apache/hudi/common/engine/EngineType.java index 1ecb0e9557b82..b951d66a42280 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/EngineType.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/EngineType.java @@ -16,11 +16,13 @@ * limitations under the License. */ -package org.apache.hudi.client.common; +package org.apache.hudi.common.engine; /** - * Hoodie data processing engine. support only Apache Spark and Apache Flink for now. + * Hoodie data processing engine. + *

    + * Support only Apache Spark, Apache Flink and Java for now. */ public enum EngineType { - SPARK, FLINK + SPARK, FLINK, JAVA } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieEngineContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieEngineContext.java new file mode 100644 index 0000000000000..d400a10f68a10 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieEngineContext.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.engine; + +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableConsumer; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +/** + * Base class contains the context information needed by the engine at runtime. It will be extended by different + * engine implementation if needed. + */ +public abstract class HoodieEngineContext { + + /** + * A wrapped hadoop configuration which can be serialized. 
+ */ + private SerializableConfiguration hadoopConf; + + protected TaskContextSupplier taskContextSupplier; + + public HoodieEngineContext(SerializableConfiguration hadoopConf, TaskContextSupplier taskContextSupplier) { + this.hadoopConf = hadoopConf; + this.taskContextSupplier = taskContextSupplier; + } + + public SerializableConfiguration getHadoopConf() { + return hadoopConf; + } + + public TaskContextSupplier getTaskContextSupplier() { + return taskContextSupplier; + } + + public abstract HoodieAccumulator newAccumulator(); + + public abstract HoodieData emptyHoodieData(); + + public HoodieData parallelize(List data) { + return parallelize(data, data.size()); + } + + public abstract HoodieData parallelize(List data, int parallelism); + + public abstract List map(List data, SerializableFunction func, int parallelism); + + public abstract List mapToPairAndReduceByKey( + List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism); + + public abstract Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism); + + public abstract List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism); + + public abstract List flatMap(List data, SerializableFunction> func, int parallelism); + + public abstract void foreach(List data, SerializableConsumer consumer, int parallelism); + + public abstract Map mapToPair(List data, SerializablePairFunction func, Integer parallelism); + + public abstract void setProperty(EngineProperty key, String value); + + public abstract Option getProperty(EngineProperty key); + + public abstract void setJobStatus(String activeModule, String activityDescription); + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieLocalEngineContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieLocalEngineContext.java new file mode 100644 index 0000000000000..5d7d193dc6b8f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieLocalEngineContext.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.engine; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieAccumulator; +import org.apache.hudi.common.data.HoodieAtomicLongAccumulator; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.function.SerializableBiFunction; +import org.apache.hudi.common.function.SerializableConsumer; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.function.SerializablePairFlatMapFunction; +import org.apache.hudi.common.function.SerializablePairFunction; +import org.apache.hudi.common.util.Option; + +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; + +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static java.util.stream.Collectors.toList; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper; +import static org.apache.hudi.common.function.FunctionWrapper.throwingReduceWrapper; + +/** + * A java based engine context, use this implementation on the query engine integrations if needed. + */ +public final class HoodieLocalEngineContext extends HoodieEngineContext { + + public HoodieLocalEngineContext(Configuration conf) { + this(conf, new LocalTaskContextSupplier()); + } + + public HoodieLocalEngineContext(Configuration conf, TaskContextSupplier taskContextSupplier) { + super(new SerializableConfiguration(conf), taskContextSupplier); + } + + @Override + public HoodieAccumulator newAccumulator() { + return HoodieAtomicLongAccumulator.create(); + } + + @Override + public HoodieData emptyHoodieData() { + return HoodieListData.eager(Collections.emptyList()); + } + + @Override + public HoodieData parallelize(List data, int parallelism) { + return HoodieListData.eager(data); + } + + @Override + public List map(List data, SerializableFunction func, int parallelism) { + return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList()); + } + + @Override + public List mapToPairAndReduceByKey( + List data, SerializablePairFunction mapToPairFunc, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel().map(throwingMapToPairWrapper(mapToPairFunc)) + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).get()) + .collect(Collectors.toList()); + } + + @Override + public Stream> mapPartitionsToPairAndReduceByKey( + Stream data, SerializablePairFlatMapFunction, K, V> flatMapToPairFunc, + SerializableBiFunction reduceFunc, int parallelism) { + return throwingFlatMapToPairWrapper(flatMapToPairFunc).apply(data.parallel().iterator()) + .collect(Collectors.groupingBy(Pair::getKey)).entrySet().stream() + .map(entry -> new ImmutablePair<>(entry.getKey(), entry.getValue().stream().map( + 
Pair::getValue).reduce(throwingReduceWrapper(reduceFunc)).orElse(null))) + .filter(Objects::nonNull); + } + + @Override + public List reduceByKey( + List> data, SerializableBiFunction reduceFunc, int parallelism) { + return data.stream().parallel() + .collect(Collectors.groupingBy(p -> p.getKey())).values().stream() + .map(list -> list.stream().map(e -> e.getValue()).reduce(throwingReduceWrapper(reduceFunc)).orElse(null)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + @Override + public List flatMap(List data, SerializableFunction> func, int parallelism) { + return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList()); + } + + @Override + public void foreach(List data, SerializableConsumer consumer, int parallelism) { + data.stream().forEach(throwingForeachWrapper(consumer)); + } + + @Override + public Map mapToPair(List data, SerializablePairFunction func, Integer parallelism) { + return data.stream().map(throwingMapToPairWrapper(func)).collect( + Collectors.toMap(Pair::getLeft, Pair::getRight, (oldVal, newVal) -> newVal) + ); + } + + @Override + public void setProperty(EngineProperty key, String value) { + // no operation for now + } + + @Override + public Option getProperty(EngineProperty key) { + return Option.empty(); + } + + @Override + public void setJobStatus(String activeModule, String activityDescription) { + // no operation for now + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/LocalTaskContextSupplier.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/LocalTaskContextSupplier.java new file mode 100644 index 0000000000000..0c7ae20e1d5e8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/LocalTaskContextSupplier.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
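For illustration only (not part of the patch): a hypothetical sketch of driving the new local, single-JVM engine context; it assumes only the `HoodieLocalEngineContext` and `HoodieAccumulator` APIs introduced above, and the partition names are made up.

```java
// Sketch only: the local engine context runs everything on Java parallel streams; an
// engine-specific context (e.g. Spark) would back the same calls with its own runtime.
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.data.HoodieAccumulator;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;

import java.util.Arrays;
import java.util.List;

public class LocalEngineContextSketch {
  public static void main(String[] args) {
    HoodieEngineContext context = new HoodieLocalEngineContext(new Configuration());

    // map() fans the function out over the input list with the requested parallelism.
    List<String> partitions = Arrays.asList("2021/01/01", "2021/01/02");
    List<Integer> lengths = context.map(partitions, String::length, partitions.size());
    System.out.println(lengths); // [10, 10]

    // Accumulators are engine-neutral counters; the local flavor is an AtomicLong.
    HoodieAccumulator filesSeen = context.newAccumulator();
    context.foreach(partitions, p -> filesSeen.add(1), partitions.size());
    System.out.println(filesSeen.value()); // 2
  }
}
```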
+ */ + +package org.apache.hudi.common.engine; + +import org.apache.hudi.common.util.Option; + +import java.util.function.Supplier; + +public final class LocalTaskContextSupplier extends TaskContextSupplier { + @Override + public Supplier getPartitionIdSupplier() { + return () -> 0; + } + + @Override + public Supplier getStageIdSupplier() { + return () -> 0; + } + + @Override + public Supplier getAttemptIdSupplier() { + return () -> 0L; + } + + @Override + public Option getProperty(EngineProperty prop) { + return Option.empty(); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/TaskContextSupplier.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/TaskContextSupplier.java similarity index 97% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/TaskContextSupplier.java rename to hudi-common/src/main/java/org/apache/hudi/common/engine/TaskContextSupplier.java index 3a350d61d65c0..813236c07a842 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/TaskContextSupplier.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/TaskContextSupplier.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.client.common; +package org.apache.hudi.common.engine; import org.apache.hudi.common.util.Option; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/BoundedFsDataInputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/BoundedFsDataInputStream.java new file mode 100644 index 0000000000000..27315f85e62c9 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/BoundedFsDataInputStream.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + */ + +package org.apache.hudi.common.fs; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; + +public class BoundedFsDataInputStream extends FSDataInputStream { + private FileSystem fs; + private Path file; + private long fileLen = -1L; + + public BoundedFsDataInputStream(FileSystem fs, Path file, InputStream in) { + super(in); + this.fs = fs; + this.file = file; + } + + @Override + public boolean markSupported() { + return false; + } + + /* Return the file length */ + private long getFileLength() throws IOException { + if (fileLen == -1L) { + fileLen = fs.getContentSummary(file).getLength(); + } + return fileLen; + } + + @Override + public synchronized void seek(long pos) throws IOException { + if (pos < 0 || pos > getFileLength()) { + throw new EOFException("Try to seek pos[" + pos + "] , but fileSize is " + getFileLength()); + } + super.seek(pos); + } + + @Override + public synchronized long skip(long n) throws IOException { + long curPos = getPos(); + long fileLength = getFileLength(); + if (n + curPos > fileLength) { + n = fileLength - curPos; + } + return super.skip(n); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/ConsistencyGuardConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/ConsistencyGuardConfig.java index e55fb2423a204..323e41f074efe 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/ConsistencyGuardConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/ConsistencyGuardConfig.java @@ -18,7 +18,10 @@ package org.apache.hudi.common.fs; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; import java.io.File; import java.io.FileReader; @@ -28,34 +31,58 @@ /** * The consistency guard relevant config options. */ -public class ConsistencyGuardConfig extends DefaultHoodieConfig { - - private static final String CONSISTENCY_CHECK_ENABLED_PROP = "hoodie.consistency.check.enabled"; - private static final String DEFAULT_CONSISTENCY_CHECK_ENABLED = "false"; - - // time between successive attempts to ensure written data's metadata is consistent on storage - private static final String INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP = - "hoodie.consistency.check.initial_interval_ms"; - private static long DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS = 400L; - - // max interval time - private static final String MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP = "hoodie.consistency.check.max_interval_ms"; - private static long DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS = 20000L; +@ConfigClassProperty(name = "Consistency Guard Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "The consistency guard related config options, to help talk to eventually consistent object storage." + + "(Tip: S3 is NOT eventually consistent anymore!)") +public class ConsistencyGuardConfig extends HoodieConfig { + + public static final ConfigProperty ENABLE = ConfigProperty + .key("hoodie.consistency.check.enabled") + .defaultValue("false") + .sinceVersion("0.5.0") + .deprecatedAfter("0.7.0") + .withDocumentation("Enabled to handle S3 eventual consistency issue. 
This property is no longer required " + + "since S3 is now strongly consistent. Will be removed in the future releases."); + + public static final ConfigProperty INITIAL_CHECK_INTERVAL_MS = ConfigProperty + .key("hoodie.consistency.check.initial_interval_ms") + .defaultValue(400L) + .sinceVersion("0.5.0") + .deprecatedAfter("0.7.0") + .withDocumentation("Amount of time (in ms) to wait, before checking for consistency after an operation on storage."); + + public static final ConfigProperty MAX_CHECK_INTERVAL_MS = ConfigProperty + .key("hoodie.consistency.check.max_interval_ms") + .defaultValue(20000L) + .sinceVersion("0.5.0") + .deprecatedAfter("0.7.0") + .withDocumentation("Maximum amount of time (in ms), to wait for consistency checking."); // maximum number of checks, for consistency of written data. Will wait upto 140 Secs - private static final String MAX_CONSISTENCY_CHECKS_PROP = "hoodie.consistency.check.max_checks"; - private static int DEFAULT_MAX_CONSISTENCY_CHECKS = 6; + public static final ConfigProperty MAX_CHECKS = ConfigProperty + .key("hoodie.consistency.check.max_checks") + .defaultValue(6) + .sinceVersion("0.5.0") + .deprecatedAfter("0.7.0") + .withDocumentation("Maximum number of consistency checks to perform, with exponential backoff."); // sleep time for OptimisticConsistencyGuard - private static final String OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP = "hoodie.optimistic.consistency.guard.sleep_time_ms"; - private static long DEFAULT_OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP = 500L; + public static final ConfigProperty OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS = ConfigProperty + .key("hoodie.optimistic.consistency.guard.sleep_time_ms") + .defaultValue(500L) + .sinceVersion("0.6.0") + .withDocumentation("Amount of time (in ms), to wait after which we assume storage is consistent."); // config to enable OptimisticConsistencyGuard in finalizeWrite instead of FailSafeConsistencyGuard - private static final String ENABLE_OPTIMISTIC_CONSISTENCY_GUARD = "_hoodie.optimistic.consistency.guard.enable"; - private static boolean DEFAULT_ENABLE_OPTIMISTIC_CONSISTENCY_GUARD = true; - - public ConsistencyGuardConfig(Properties props) { - super(props); + public static final ConfigProperty OPTIMISTIC_CONSISTENCY_GUARD_ENABLE = ConfigProperty + .key("_hoodie.optimistic.consistency.guard.enable") + .defaultValue(false) + .sinceVersion("0.6.0") + .withDocumentation("Enable consistency guard, which optimistically assumes consistency is achieved after a certain time period."); + + private ConsistencyGuardConfig() { + super(); } public static ConsistencyGuardConfig.Builder newBuilder() { @@ -63,27 +90,27 @@ public static ConsistencyGuardConfig.Builder newBuilder() { } public boolean isConsistencyCheckEnabled() { - return Boolean.parseBoolean(props.getProperty(CONSISTENCY_CHECK_ENABLED_PROP)); + return getBoolean(ENABLE); } public int getMaxConsistencyChecks() { - return Integer.parseInt(props.getProperty(MAX_CONSISTENCY_CHECKS_PROP)); + return getInt(MAX_CHECKS); } public int getInitialConsistencyCheckIntervalMs() { - return Integer.parseInt(props.getProperty(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP)); + return getInt(INITIAL_CHECK_INTERVAL_MS); } public int getMaxConsistencyCheckIntervalMs() { - return Integer.parseInt(props.getProperty(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP)); + return getInt(MAX_CHECK_INTERVAL_MS); } public long getOptimisticConsistencyGuardSleepTimeMs() { - return Long.parseLong(props.getProperty(OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP)); + return 
getLong(OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS); } public boolean shouldEnableOptimisticConsistencyGuard() { - return Boolean.parseBoolean(props.getProperty(ENABLE_OPTIMISTIC_CONSISTENCY_GUARD)); + return getBoolean(OPTIMISTIC_CONSISTENCY_GUARD_ENABLE); } /** @@ -91,65 +118,114 @@ public boolean shouldEnableOptimisticConsistencyGuard() { */ public static class Builder { - private final Properties props = new Properties(); + private final ConsistencyGuardConfig consistencyGuardConfig = new ConsistencyGuardConfig(); public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - props.load(reader); + consistencyGuardConfig.getProps().load(reader); return this; } } public Builder fromProperties(Properties props) { - this.props.putAll(props); + this.consistencyGuardConfig.getProps().putAll(props); return this; } public Builder withConsistencyCheckEnabled(boolean enabled) { - props.setProperty(CONSISTENCY_CHECK_ENABLED_PROP, String.valueOf(enabled)); + consistencyGuardConfig.setValue(ENABLE, String.valueOf(enabled)); return this; } public Builder withInitialConsistencyCheckIntervalMs(int initialIntevalMs) { - props.setProperty(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(initialIntevalMs)); + consistencyGuardConfig.setValue(INITIAL_CHECK_INTERVAL_MS, String.valueOf(initialIntevalMs)); return this; } public Builder withMaxConsistencyCheckIntervalMs(int maxIntervalMs) { - props.setProperty(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(maxIntervalMs)); + consistencyGuardConfig.setValue(MAX_CHECK_INTERVAL_MS, String.valueOf(maxIntervalMs)); return this; } public Builder withMaxConsistencyChecks(int maxConsistencyChecks) { - props.setProperty(MAX_CONSISTENCY_CHECKS_PROP, String.valueOf(maxConsistencyChecks)); + consistencyGuardConfig.setValue(MAX_CHECKS, String.valueOf(maxConsistencyChecks)); return this; } public Builder withOptimisticConsistencyGuardSleepTimeMs(long sleepTimeMs) { - props.setProperty(OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP, String.valueOf(sleepTimeMs)); + consistencyGuardConfig.setValue(OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS, String.valueOf(sleepTimeMs)); return this; } public Builder withEnableOptimisticConsistencyGuard(boolean enableOptimisticConsistencyGuard) { - props.setProperty(ENABLE_OPTIMISTIC_CONSISTENCY_GUARD, String.valueOf(enableOptimisticConsistencyGuard)); + consistencyGuardConfig.setValue(OPTIMISTIC_CONSISTENCY_GUARD_ENABLE, String.valueOf(enableOptimisticConsistencyGuard)); return this; } public ConsistencyGuardConfig build() { - setDefaultOnCondition(props, !props.containsKey(CONSISTENCY_CHECK_ENABLED_PROP), CONSISTENCY_CHECK_ENABLED_PROP, - DEFAULT_CONSISTENCY_CHECK_ENABLED); - setDefaultOnCondition(props, !props.containsKey(INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP), - INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS)); - setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP), - MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP, String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS)); - setDefaultOnCondition(props, !props.containsKey(MAX_CONSISTENCY_CHECKS_PROP), MAX_CONSISTENCY_CHECKS_PROP, - String.valueOf(DEFAULT_MAX_CONSISTENCY_CHECKS)); - setDefaultOnCondition(props, !props.containsKey(OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP), - OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP, String.valueOf(DEFAULT_OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP)); - 
setDefaultOnCondition(props, !props.containsKey(ENABLE_OPTIMISTIC_CONSISTENCY_GUARD), - ENABLE_OPTIMISTIC_CONSISTENCY_GUARD, - String.valueOf(DEFAULT_ENABLE_OPTIMISTIC_CONSISTENCY_GUARD)); - return new ConsistencyGuardConfig(props); + consistencyGuardConfig.setDefaults(ConsistencyGuardConfig.class.getName()); + return consistencyGuardConfig; } } + + /** + * @deprecated use {@link #ENABLE} and its methods. + */ + @Deprecated + private static final String CONSISTENCY_CHECK_ENABLED_PROP = ENABLE.key(); + /** + * @deprecated use {@link #ENABLE} and its methods. + */ + @Deprecated + private static final String DEFAULT_CONSISTENCY_CHECK_ENABLED = ENABLE.defaultValue(); + /** + * @deprecated use {@link #INITIAL_CHECK_INTERVAL_MS} and its methods. + */ + @Deprecated + private static final String INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP = INITIAL_CHECK_INTERVAL_MS.key(); + /** + * @deprecated use {@link #INITIAL_CHECK_INTERVAL_MS} and its methods. + */ + @Deprecated + private static long DEFAULT_INITIAL_CONSISTENCY_CHECK_INTERVAL_MS = INITIAL_CHECK_INTERVAL_MS.defaultValue(); + /** + * @deprecated use {@link #MAX_CHECK_INTERVAL_MS} and its methods. + */ + @Deprecated + private static final String MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP = MAX_CHECK_INTERVAL_MS.key(); + /** + * @deprecated use {@link #MAX_CHECK_INTERVAL_MS} and its methods. + */ + @Deprecated + private static long DEFAULT_MAX_CONSISTENCY_CHECK_INTERVAL_MS = MAX_CHECK_INTERVAL_MS.defaultValue(); + /** + * @deprecated use {@link #MAX_CHECKS} and its methods. + */ + @Deprecated + private static final String MAX_CONSISTENCY_CHECKS_PROP = MAX_CHECKS.key(); + /** + * @deprecated use {@link #MAX_CHECKS} and its methods. + */ + @Deprecated + private static int DEFAULT_MAX_CONSISTENCY_CHECKS = MAX_CHECKS.defaultValue(); + /** + * @deprecated use {@link #OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS} and its methods. + */ + @Deprecated + private static final String OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP = OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS.key(); + /** + * @deprecated use {@link #OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS} and its methods. + */ + @Deprecated + private static long DEFAULT_OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS_PROP = OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS.defaultValue(); + /** + * @deprecated use {@link #OPTIMISTIC_CONSISTENCY_GUARD_ENABLE} and its methods. + */ + @Deprecated + private static final String ENABLE_OPTIMISTIC_CONSISTENCY_GUARD = OPTIMISTIC_CONSISTENCY_GUARD_ENABLE.key(); + /** + * @deprecated use {@link #OPTIMISTIC_CONSISTENCY_GUARD_ENABLE} and its methods. 
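For illustration only: a hedged sketch of configuring the migrated ConsistencyGuardConfig through its Builder; every setter and getter used here appears in the diff above, and unset options fall back to the ConfigProperty defaults applied by `setDefaults()` in `build()`.

```java
// Sketch only: builds the config programmatically; the chosen values are examples, not recommendations.
import org.apache.hudi.common.fs.ConsistencyGuardConfig;

public class ConsistencyGuardConfigSketch {
  public static void main(String[] args) {
    ConsistencyGuardConfig config = ConsistencyGuardConfig.newBuilder()
        .withConsistencyCheckEnabled(true)           // hoodie.consistency.check.enabled
        .withInitialConsistencyCheckIntervalMs(400)  // hoodie.consistency.check.initial_interval_ms
        .withMaxConsistencyCheckIntervalMs(20000)    // hoodie.consistency.check.max_interval_ms
        .withMaxConsistencyChecks(6)                 // hoodie.consistency.check.max_checks
        .build();                                    // unset keys fall back to ConfigProperty defaults

    System.out.println(config.isConsistencyCheckEnabled());       // true
    System.out.println(config.getMaxConsistencyCheckIntervalMs()); // 20000
  }
}
```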
+ */ + @Deprecated + private static boolean DEFAULT_ENABLE_OPTIMISTIC_CONSISTENCY_GUARD = OPTIMISTIC_CONSISTENCY_GUARD_ENABLE.defaultValue(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index 0ce557348b795..f4895f988218c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -18,20 +18,26 @@ package org.apache.hudi.common.fs; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.InvalidHoodiePathException; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; @@ -43,20 +49,28 @@ import org.apache.log4j.Logger; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; +import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; - +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Map.Entry; import java.util.Objects; +import java.util.Set; import java.util.UUID; import java.util.function.Function; +import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.hadoop.CachingPath.getPathWithoutSchemeAndAuthority; + /** * Utility functions related to accessing the file storage. 
*/ @@ -64,9 +78,8 @@ public class FSUtils { private static final Logger LOG = LogManager.getLogger(FSUtils.class); // Log files are of this pattern - .b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.log.1 - private static final Pattern LOG_FILE_PATTERN = + public static final Pattern LOG_FILE_PATTERN = Pattern.compile("\\.(.*)_(.*)\\.(.*)\\.([0-9]*)(_(([0-9]*)-([0-9]*)-([0-9]*)))?"); - private static final String LOG_FILE_PREFIX = "."; private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10; private static final long MIN_CLEAN_TO_KEEP = 10; private static final long MIN_ROLLBACK_TO_KEEP = 10; @@ -75,9 +88,6 @@ public class FSUtils { private static final PathFilter ALLOW_ALL_FILTER = file -> true; public static Configuration prepareHadoopConf(Configuration conf) { - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - // look for all properties, prefixed to be picked up for (Entry prop : System.getenv().entrySet()) { if (prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) { @@ -88,24 +98,36 @@ public static Configuration prepareHadoopConf(Configuration conf) { return conf; } - public static FileSystem getFs(String path, Configuration conf) { + public static FileSystem getFs(String pathStr, Configuration conf) { + return getFs(new Path(pathStr), conf); + } + + public static FileSystem getFs(Path path, Configuration conf) { FileSystem fs; prepareHadoopConf(conf); try { - fs = new Path(path).getFileSystem(conf); + fs = path.getFileSystem(conf); } catch (IOException e) { throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e); } - LOG.info(String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]", - conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString())); return fs; } - public static FileSystem getFs(String path, Configuration conf, boolean localByDefault) { + public static FileSystem getFs(String pathStr, Configuration conf, boolean localByDefault) { if (localByDefault) { - return getFs(addSchemeIfLocalPath(path).toString(), conf); + return getFs(addSchemeIfLocalPath(pathStr), conf); } - return getFs(path, conf); + return getFs(pathStr, conf); + } + + /** + * Check if table already exists in the given path. + * @param path base path of the table. + * @param fs instance of {@link FileSystem}. + * @return {@code true} if table exists. {@code false} otherwise. + */ + public static boolean isTableExists(String path, FileSystem fs) throws IOException { + return fs.exists(new Path(path + "/" + HoodieTableMetaClient.METAFOLDER_NAME)); } public static Path addSchemeIfLocalPath(String path) { @@ -120,6 +142,17 @@ public static Path addSchemeIfLocalPath(String path) { return providedPath; } + /** + * Makes path qualified w/ {@link FileSystem}'s URI + * + * @param fs instance of {@link FileSystem} path belongs to + * @param path path to be qualified + * @return qualified path, prefixed w/ the URI of the target FS object provided + */ + public static Path makeQualified(FileSystem fs, Path path) { + return path.makeQualified(fs.getUri(), fs.getWorkingDirectory()); + } + /** * A write token uniquely identifies an attempt at one of the IOHandle operations (Merge/Create/Append). 
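The new getFs(Path, Configuration), makeQualified, and isTableExists helpers above can be combined to probe a table location without a meta client. A minimal sketch of that flow, assuming an example base path of /tmp/hoodie_table:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;

public class TableProbeSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path basePath = new Path("/tmp/hoodie_table");          // example path, not from the patch
    FileSystem fs = FSUtils.getFs(basePath, conf);          // new Path-based overload
    Path qualified = FSUtils.makeQualified(fs, basePath);   // prefix with the FileSystem's URI
    // True only when the ".hoodie" meta-folder exists under the base path.
    System.out.println(FSUtils.isTableExists(qualified.toString(), fs));
  }
}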
*/ @@ -128,11 +161,12 @@ public static String makeWriteToken(int taskPartitionId, int stageId, long taskA } // TODO: this should be removed - public static String makeDataFileName(String instantTime, String writeToken, String fileId) { - return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, HoodieFileFormat.PARQUET.getFileExtension()); + public static String makeBaseFileName(String instantTime, String writeToken, String fileId) { + return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()); } - public static String makeDataFileName(String instantTime, String writeToken, String fileId, String fileExtension) { + public static String makeBaseFileName(String instantTime, String writeToken, String fileId, String fileExtension) { return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, fileExtension); } @@ -141,7 +175,8 @@ public static String makeBootstrapIndexFileName(String instantTime, String fileI } public static String maskWithoutFileId(String instantTime, int taskPartitionId) { - return String.format("*_%s_%s%s", taskPartitionId, instantTime, HoodieFileFormat.PARQUET.getFileExtension()); + return String.format("*_%s_%s%s", taskPartitionId, instantTime, HoodieTableConfig.BASE_FILE_FORMAT + .defaultValue().getFileExtension()); } public static String getCommitFromCommitFile(String commitFileName) { @@ -149,6 +184,9 @@ public static String getCommitFromCommitFile(String commitFileName) { } public static String getCommitTime(String fullFileName) { + if (isLogFile(fullFileName)) { + return fullFileName.split("_")[1].split("\\.")[0]; + } return fullFileName.split("_")[2].split("\\.")[0]; } @@ -180,9 +218,15 @@ public static List getAllPartitionFoldersThreeLevelsDown(FileSystem fs, * Given a base partition and a partition path, return relative path of partition path to the base path. */ public static String getRelativePartitionPath(Path basePath, Path fullPartitionPath) { - basePath = Path.getPathWithoutSchemeAndAuthority(basePath); - fullPartitionPath = Path.getPathWithoutSchemeAndAuthority(fullPartitionPath); + basePath = getPathWithoutSchemeAndAuthority(basePath); + fullPartitionPath = getPathWithoutSchemeAndAuthority(fullPartitionPath); + String fullPartitionPathStr = fullPartitionPath.toString(); + + if (!fullPartitionPathStr.startsWith(basePath.toString())) { + throw new IllegalArgumentException("Partition path does not belong to base-path"); + } + int partitionStartIndex = fullPartitionPathStr.indexOf(basePath.getName(), basePath.getParent() == null ? 0 : basePath.getParent().toString().length()); // Partition-Path could be empty for non-partitioned tables @@ -192,25 +236,34 @@ public static String getRelativePartitionPath(Path basePath, Path fullPartitionP /** * Obtain all the partition paths, that are present in this table, denoted by presence of - * {@link HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE}. + * {@link HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE_PREFIX}. + * + * If the basePathStr is a subdirectory of .hoodie folder then we assume that the partitions of an internal + * table (a hoodie table within the .hoodie directory) are to be obtained. + * + * @param fs FileSystem instance + * @param basePathStr base directory */ public static List getAllFoldersWithPartitionMetaFile(FileSystem fs, String basePathStr) throws IOException { + // If the basePathStr is a folder within the .hoodie directory then we are listing partitions within an + // internal table. 
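The reworked getRelativePartitionPath above strips scheme and authority before comparing paths and now rejects partitions that do not live under the base path. A small sketch of the expected behavior, with an invented bucket and table name:

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;

public class RelativePartitionPathSketch {
  public static void main(String[] args) {
    Path basePath = new Path("s3://bucket/tables/trips");   // example locations only
    // Scheme and authority are dropped before comparison, so this prints "2022/07/29".
    System.out.println(FSUtils.getRelativePartitionPath(
        basePath, new Path("s3://bucket/tables/trips/2022/07/29")));
    // A path outside the base path now fails fast instead of yielding a bogus relative path.
    try {
      FSUtils.getRelativePartitionPath(basePath, new Path("s3://bucket/elsewhere/2022/07/29"));
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage());   // "Partition path does not belong to base-path"
    }
  }
}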
+ final boolean isMetadataTable = HoodieTableMetadata.isMetadataTable(basePathStr); final Path basePath = new Path(basePathStr); final List partitions = new ArrayList<>(); processFiles(fs, basePathStr, (locatedFileStatus) -> { Path filePath = locatedFileStatus.getPath(); - if (filePath.getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)) { + if (filePath.getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) { partitions.add(getRelativePartitionPath(basePath, filePath.getParent())); } return true; - }, true); + }, !isMetadataTable); return partitions; } /** * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its subdirs * are skipped - * + * * @param fs File System * @param basePathStr Base-Path * @param consumer Callback for processing @@ -240,12 +293,41 @@ public static void processFiles(FileSystem fs, String basePathStr, Function getAllPartitionPaths(FileSystem fs, String basePathStr, boolean assumeDatePartitioning) - throws IOException { - if (assumeDatePartitioning) { - return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr); - } else { - return getAllFoldersWithPartitionMetaFile(fs, basePathStr); + public static List getAllPartitionPaths(HoodieEngineContext engineContext, String basePathStr, + boolean useFileListingFromMetadata, + boolean assumeDatePartitioning) { + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() + .enable(useFileListingFromMetadata) + .withAssumeDatePartitioning(assumeDatePartitioning) + .build(); + try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr, + FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) { + return tableMetadata.getAllPartitionPaths(); + } catch (Exception e) { + throw new HoodieException("Error fetching partition paths from metadata table", e); + } + } + + public static List getAllPartitionPaths(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, + String basePathStr) { + try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr, + FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) { + return tableMetadata.getAllPartitionPaths(); + } catch (Exception e) { + throw new HoodieException("Error fetching partition paths from metadata table", e); + } + } + + public static Map getFilesInPartitions(HoodieEngineContext engineContext, + HoodieMetadataConfig metadataConfig, + String basePathStr, + String[] partitionPaths, + String spillableMapPath) { + try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr, + spillableMapPath, true)) { + return tableMetadata.getAllFilesInPartitions(Arrays.asList(partitionPaths)); + } catch (Exception ex) { + throw new HoodieException("Error get files in partitions: " + String.join(",", partitionPaths), ex); } } @@ -268,6 +350,10 @@ public static String createNewFileIdPfx() { return UUID.randomUUID().toString(); } + public static String createNewFileId(String idPfx, int id) { + return String.format("%s-%d", idPfx, id); + } + /** * Get the file extension from the log file. */ @@ -292,7 +378,7 @@ public static String getFileIdFromLogPath(Path path) { } /** - * Check if the file is a parquet file of a log file. Then get the fileId appropriately. + * Check if the file is a base file of a log file. Then get the fileId appropriately. 
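The new getAllPartitionPaths overloads above route partition listing through HoodieTableMetadata. A sketch of the HoodieMetadataConfig-based variant; HoodieLocalEngineContext and the base path are assumptions made only for this example:

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;

public class PartitionListingSketch {
  public static void main(String[] args) {
    // A local engine context for illustration; Spark or Flink contexts work the same way.
    HoodieEngineContext engineContext = new HoodieLocalEngineContext(new Configuration());
    // Enable listing via the metadata table, mirroring the builder usage in FSUtils above.
    HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).build();
    List<String> partitions = FSUtils.getAllPartitionPaths(engineContext, metadataConfig, "/tmp/hoodie_table");
    partitions.forEach(System.out::println);
  }
}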
*/ public static String getFileIdFromFilePath(Path filePath) { if (FSUtils.isLogFile(filePath)) { @@ -373,33 +459,77 @@ public static int getFileVersionFromLog(Path logPath) { public static String makeLogFileName(String fileId, String logFileExtension, String baseCommitTime, int version, String writeToken) { - String suffix = - (writeToken == null) ? String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version) - : String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken); - return LOG_FILE_PREFIX + suffix; + String suffix = (writeToken == null) + ? String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version) + : String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken); + return HoodieLogFile.LOG_FILE_PREFIX + suffix; + } + + public static boolean isBaseFile(Path path) { + String extension = getFileExtension(path.getName()); + return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension); } public static boolean isLogFile(Path logPath) { - Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); - return matcher.find() && logPath.getName().contains(".log"); + return isLogFile(logPath.getName()); + } + + public static boolean isLogFile(String fileName) { + Matcher matcher = LOG_FILE_PATTERN.matcher(fileName); + return matcher.find() && fileName.contains(".log"); } /** - * Get the latest log file written from the list of log files passed in. + * Returns true if the given path is a Base file or a Log file. */ - public static Option getLatestLogFile(Stream logFiles) { - return Option.fromJavaOptional(logFiles.min(HoodieLogFile.getReverseLogFileComparator())); + public static boolean isDataFile(Path path) { + return isBaseFile(path) || isLogFile(path); } /** - * Get all the log files for the passed in FileId in the partition path. + * Get the names of all the base and log files in the given partition path. + */ + public static FileStatus[] getAllDataFilesInPartition(FileSystem fs, Path partitionPath) throws IOException { + final Set validFileExtensions = Arrays.stream(HoodieFileFormat.values()) + .map(HoodieFileFormat::getFileExtension).collect(Collectors.toCollection(HashSet::new)); + final String logFileExtension = HoodieFileFormat.HOODIE_LOG.getFileExtension(); + + try { + return Arrays.stream(fs.listStatus(partitionPath, path -> { + String extension = FSUtils.getFileExtension(path.getName()); + return validFileExtensions.contains(extension) || path.getName().contains(logFileExtension); + })).filter(FileStatus::isFile).toArray(FileStatus[]::new); + } catch (IOException e) { + // return empty FileStatus if partition does not exist already + if (!fs.exists(partitionPath)) { + return new FileStatus[0]; + } else { + throw e; + } + } + } + + /** + * Get the latest log file for the passed in file-id in the partition path + */ + public static Option getLatestLogFile(FileSystem fs, Path partitionPath, String fileId, + String logFileExtension, String baseCommitTime) throws IOException { + return getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime)); + } + + /** + * Get all the log files for the passed in file-id in the partition path. */ public static Stream getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException { - return Arrays - .stream(fs.listStatus(partitionPath, - path -> path.getName().startsWith("." 
+ fileId) && path.getName().contains(logFileExtension))) - .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime)); + try { + PathFilter pathFilter = path -> path.getName().startsWith("." + fileId) && path.getName().contains(logFileExtension); + return Arrays.stream(fs.listStatus(partitionPath, pathFilter)) + .map(HoodieLogFile::new) + .filter(s -> s.getBaseCommitTime().equals(baseCommitTime)); + } catch (FileNotFoundException e) { + return Stream.of(); + } } /** @@ -459,39 +589,6 @@ public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final return recovered; } - public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath, Stream instants) { - // TODO - this should be archived when archival is made general for all meta-data - // skip MIN_CLEAN_TO_KEEP and delete rest - instants.skip(MIN_CLEAN_TO_KEEP).forEach(s -> { - try { - fs.delete(new Path(metaPath, s.getFileName()), false); - } catch (IOException e) { - throw new HoodieIOException("Could not delete clean meta files" + s.getFileName(), e); - } - }); - } - - public static void deleteOlderRollbackMetaFiles(FileSystem fs, String metaPath, Stream instants) { - // TODO - this should be archived when archival is made general for all meta-data - // skip MIN_ROLLBACK_TO_KEEP and delete rest - instants.skip(MIN_ROLLBACK_TO_KEEP).forEach(s -> { - try { - fs.delete(new Path(metaPath, s.getFileName()), false); - } catch (IOException e) { - throw new HoodieIOException("Could not delete rollback meta files " + s.getFileName(), e); - } - }); - } - - public static void deleteInstantFile(FileSystem fs, String metaPath, HoodieInstant instant) { - try { - LOG.warn("try to delete instant file: " + instant); - fs.delete(new Path(metaPath, instant.getFileName()), false); - } catch (IOException e) { - throw new HoodieIOException("Could not delete instant file" + instant.getFileName(), e); - } - } - public static void createPathIfNotExists(FileSystem fs, Path partitionPath) throws IOException { if (!fs.exists(partitionPath)) { fs.mkdirs(partitionPath); @@ -503,12 +600,39 @@ public static Long getSizeInMB(long sizeInBytes) { } public static Path getPartitionPath(String basePath, String partitionPath) { - return getPartitionPath(new Path(basePath), partitionPath); + if (StringUtils.isNullOrEmpty(partitionPath)) { + return new Path(basePath); + } + + // NOTE: We have to chop leading "/" to make sure Hadoop does not treat it like + // absolute path + String properPartitionPath = partitionPath.startsWith("/") + ? partitionPath.substring(1) + : partitionPath; + return getPartitionPath(new CachingPath(basePath), properPartitionPath); } public static Path getPartitionPath(Path basePath, String partitionPath) { - // FOr non-partitioned table, return only base-path - return ((partitionPath == null) || (partitionPath.isEmpty())) ? basePath : new Path(basePath, partitionPath); + // For non-partitioned table, return only base-path + return StringUtils.isNullOrEmpty(partitionPath) ? basePath : new CachingPath(basePath, partitionPath); + } + + /** + * Extracts the file name from the relative path based on the table base path. For example: + * "/2022/07/29/file1.parquet", "/2022/07/29" -> "file1.parquet" + * "2022/07/29/file2.parquet", "2022/07/29" -> "file2.parquet" + * "/file3.parquet", "" -> "file3.parquet" + * "file4.parquet", "" -> "file4.parquet" + * + * @param filePathWithPartition the relative file path based on the table base path. + * @param partition the relative partition path. 
For partitioned table, `partition` contains the relative partition path; + * for non-partitioned table, `partition` is empty + * @return Extracted file name in String. + */ + public static String getFileName(String filePathWithPartition, String partition) { + int offset = StringUtils.isNullOrEmpty(partition) + ? (filePathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1; + return filePathWithPartition.substring(offset); } /** @@ -520,14 +644,20 @@ public static String getDFSFullPartitionPath(FileSystem fs, Path fullPartitionPa /** * This is due to HUDI-140 GCS has a different behavior for detecting EOF during seek(). - * - * @param inputStream FSDataInputStream + * + * @param fs fileSystem instance. * @return true if the inputstream or the wrapped one is of type GoogleHadoopFSInputStream */ - public static boolean isGCSInputStream(FSDataInputStream inputStream) { - return inputStream.getClass().getCanonicalName().equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream") - || inputStream.getWrappedStream().getClass().getCanonicalName() - .equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream"); + public static boolean isGCSFileSystem(FileSystem fs) { + return fs.getScheme().equals(StorageSchemes.GCS.getScheme()); + } + + /** + * Chdfs will throw {@code IOException} instead of {@code EOFException}. It will cause error in isBlockCorrupted(). + * Wrapped by {@code BoundedFsDataInputStream}, to check whether the desired offset is out of the file size in advance. + */ + public static boolean isCHDFileSystem(FileSystem fs) { + return StorageSchemes.CHDFS.getScheme().equals(fs.getScheme()); } public static Configuration registerFileSystem(Path file, Configuration conf) { @@ -558,8 +688,8 @@ public static HoodieWrapperFileSystem getFs(String path, SerializableConfigurati * Helper to filter out paths under metadata folder when running fs.globStatus. * @param fs File System * @param globPath Glob Path - * @return - * @throws IOException + * @return the file status list of globPath exclude the meta folder + * @throws IOException when having trouble listing the path */ public static List getGlobStatusExcludingMetaFolder(FileSystem fs, Path globPath) throws IOException { FileStatus[] statuses = fs.globStatus(globPath); @@ -567,4 +697,153 @@ public static List getGlobStatusExcludingMetaFolder(FileSystem fs, P .filter(fileStatus -> !fileStatus.getPath().toString().contains(HoodieTableMetaClient.METAFOLDER_NAME)) .collect(Collectors.toList()); } + + /** + * Deletes a directory by deleting sub-paths in parallel on the file system. + * + * @param hoodieEngineContext {@code HoodieEngineContext} instance + * @param fs file system + * @param dirPath directory path + * @param parallelism parallelism to use for sub-paths + * @return {@code true} if the directory is delete; {@code false} otherwise. + */ + public static boolean deleteDir( + HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism) { + try { + if (fs.exists(dirPath)) { + FSUtils.parallelizeSubPathProcess(hoodieEngineContext, fs, dirPath, parallelism, e -> true, + pairOfSubPathAndConf -> deleteSubPath( + pairOfSubPathAndConf.getKey(), pairOfSubPathAndConf.getValue(), true) + ); + boolean result = fs.delete(dirPath, false); + LOG.info("Removed directory at " + dirPath); + return result; + } + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + return false; + } + + /** + * Processes sub-path in parallel. 
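The deleteDir helper above fans the deletion of each sub-path out through the engine context before removing the now-empty directory itself. A hedged usage sketch; the marker directory and parallelism are arbitrary examples:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;

public class ParallelDeleteSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    Path markerDir = new Path("/tmp/hoodie_table/.hoodie/.temp/20220729");   // example directory
    FileSystem fs = FSUtils.getFs(markerDir, conf);
    // Children of markerDir are deleted with up to 10 parallel tasks, then the directory itself.
    boolean deleted = FSUtils.deleteDir(new HoodieLocalEngineContext(conf), fs, markerDir, 10);
    System.out.println(deleted);
  }
}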
+ * + * @param hoodieEngineContext {@code HoodieEngineContext} instance + * @param fs file system + * @param dirPath directory path + * @param parallelism parallelism to use for sub-paths + * @param subPathPredicate predicate to use to filter sub-paths for processing + * @param pairFunction actual processing logic for each sub-path + * @param type of result to return for each sub-path + * @return a map of sub-path to result of the processing + */ + public static Map parallelizeSubPathProcess( + HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism, + Predicate subPathPredicate, SerializableFunction, T> pairFunction) { + Map result = new HashMap<>(); + try { + FileStatus[] fileStatuses = fs.listStatus(dirPath); + List subPaths = Arrays.stream(fileStatuses) + .filter(subPathPredicate) + .map(fileStatus -> fileStatus.getPath().toString()) + .collect(Collectors.toList()); + result = parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pairFunction, subPaths); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + return result; + } + + public static Map parallelizeFilesProcess( + HoodieEngineContext hoodieEngineContext, + FileSystem fs, + int parallelism, + SerializableFunction, T> pairFunction, + List subPaths) { + Map result = new HashMap<>(); + if (subPaths.size() > 0) { + SerializableConfiguration conf = new SerializableConfiguration(fs.getConf()); + int actualParallelism = Math.min(subPaths.size(), parallelism); + + hoodieEngineContext.setJobStatus(FSUtils.class.getSimpleName(), + "Parallel listing paths " + String.join(",", subPaths)); + + result = hoodieEngineContext.mapToPair(subPaths, + subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))), + actualParallelism); + } + return result; + } + + /** + * Deletes a sub-path. + * + * @param subPathStr sub-path String + * @param conf serializable config + * @param recursive is recursive or not + * @return {@code true} if the sub-path is deleted; {@code false} otherwise. + */ + public static boolean deleteSubPath(String subPathStr, SerializableConfiguration conf, boolean recursive) { + try { + Path subPath = new Path(subPathStr); + FileSystem fileSystem = subPath.getFileSystem(conf.get()); + return fileSystem.delete(subPath, recursive); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + } + + /** + * Lists file status at a certain level in the directory hierarchy. + *
+   * <p>
    + * E.g., given "/tmp/hoodie_table" as the rootPath, and 3 as the expected level, + * this method gives back the {@link FileStatus} of all files under + * "/tmp/hoodie_table/[*]/[*]/[*]/" folders. + * + * @param hoodieEngineContext {@link HoodieEngineContext} instance. + * @param fs {@link FileSystem} instance. + * @param rootPath Root path for the file listing. + * @param expectLevel Expected level of directory hierarchy for files to be added. + * @param parallelism Parallelism for the file listing. + * @return A list of file status of files at the level. + */ + + public static List getFileStatusAtLevel( + HoodieEngineContext hoodieEngineContext, FileSystem fs, Path rootPath, + int expectLevel, int parallelism) { + List levelPaths = new ArrayList<>(); + List result = new ArrayList<>(); + levelPaths.add(rootPath.toString()); + + for (int i = 0; i <= expectLevel; i++) { + result = FSUtils.parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, + pairOfSubPathAndConf -> { + Path path = new Path(pairOfSubPathAndConf.getKey()); + try { + FileSystem fileSystem = path.getFileSystem(pairOfSubPathAndConf.getValue().get()); + return Arrays.stream(fileSystem.listStatus(path)) + .collect(Collectors.toList()); + } catch (IOException e) { + throw new HoodieIOException("Failed to list " + path, e); + } + }, + levelPaths) + .values().stream() + .flatMap(list -> list.stream()).collect(Collectors.toList()); + if (i < expectLevel) { + levelPaths = result.stream() + .filter(FileStatus::isDirectory) + .map(fileStatus -> fileStatus.getPath().toString()) + .collect(Collectors.toList()); + } + } + return result; + } + + public interface SerializableFunction extends Function, Serializable { + } + + private static Option getLatestLogFile(Stream logFiles) { + return Option.fromJavaOptional(logFiles.min(HoodieLogFile.getReverseLogFileComparator())); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FileSystemRetryConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FileSystemRetryConfig.java new file mode 100644 index 0000000000000..c7f99ece7e45d --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FileSystemRetryConfig.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.fs; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * The file system retry relevant config options. 
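Before the retry configuration, one more sketch for the getFileStatusAtLevel helper added to FSUtils above, mirroring the /tmp/hoodie_table example in its javadoc; the engine context and parallelism are illustrative:

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;

public class LevelListingSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    Path rootPath = new Path("/tmp/hoodie_table");
    FileSystem fs = FSUtils.getFs(rootPath, conf);
    // Files exactly three directory levels below the root, e.g. year/month/day partition folders.
    List<FileStatus> statuses = FSUtils.getFileStatusAtLevel(
        new HoodieLocalEngineContext(conf), fs, rootPath, 3, 10);
    statuses.forEach(status -> System.out.println(status.getPath()));
  }
}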
+ */ +@ConfigClassProperty(name = "FileSystem Guard Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "The filesystem retry related config options, to help deal with runtime exception like list/get/put/delete performance issues.") +public class FileSystemRetryConfig extends HoodieConfig { + + public static final ConfigProperty FILESYSTEM_RETRY_ENABLE = ConfigProperty + .key("hoodie.filesystem.operation.retry.enable") + .defaultValue("false") + .sinceVersion("0.11.0") + .withDocumentation("Enabled to handle list/get/delete etc file system performance issue."); + + public static final ConfigProperty INITIAL_RETRY_INTERVAL_MS = ConfigProperty + .key("hoodie.filesystem.operation.retry.initial_interval_ms") + .defaultValue(100L) + .sinceVersion("0.11.0") + .withDocumentation("Amount of time (in ms) to wait, before retry to do operations on storage."); + + public static final ConfigProperty MAX_RETRY_INTERVAL_MS = ConfigProperty + .key("hoodie.filesystem.operation.retry.max_interval_ms") + .defaultValue(2000L) + .sinceVersion("0.11.0") + .withDocumentation("Maximum amount of time (in ms), to wait for next retry."); + + public static final ConfigProperty MAX_RETRY_NUMBERS = ConfigProperty + .key("hoodie.filesystem.operation.retry.max_numbers") + .defaultValue(4) + .sinceVersion("0.11.0") + .withDocumentation("Maximum number of retry actions to perform, with exponential backoff."); + + public static final ConfigProperty RETRY_EXCEPTIONS = ConfigProperty + .key("hoodie.filesystem.operation.retry.exceptions") + .defaultValue("") + .sinceVersion("0.11.0") + .withDocumentation("The class name of the Exception that needs to be re-tryed, separated by commas. " + + "Default is empty which means retry all the IOException and RuntimeException from FileSystem"); + + private FileSystemRetryConfig() { + super(); + } + + public long getInitialRetryIntervalMs() { + return getLong(INITIAL_RETRY_INTERVAL_MS); + } + + public long getMaxRetryIntervalMs() { + return getLong(MAX_RETRY_INTERVAL_MS); + } + + public int getMaxRetryNumbers() { + return getInt(MAX_RETRY_NUMBERS); + } + + public boolean isFileSystemActionRetryEnable() { + return Boolean.parseBoolean(getStringOrDefault(FILESYSTEM_RETRY_ENABLE)); + } + + public static FileSystemRetryConfig.Builder newBuilder() { + return new Builder(); + } + + public String getRetryExceptions() { + return getString(RETRY_EXCEPTIONS); + } + + /** + * The builder used to build filesystem configurations. 
+ */ + public static class Builder { + + private final FileSystemRetryConfig fileSystemRetryConfig = new FileSystemRetryConfig(); + + public Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + fileSystemRetryConfig.getProps().load(reader); + return this; + } + } + + public Builder fromProperties(Properties props) { + this.fileSystemRetryConfig.getProps().putAll(props); + return this; + } + + public Builder withMaxRetryNumbers(int numbers) { + fileSystemRetryConfig.setValue(MAX_RETRY_NUMBERS, String.valueOf(numbers)); + return this; + } + + public Builder withInitialRetryIntervalMs(long intervalMs) { + fileSystemRetryConfig.setValue(INITIAL_RETRY_INTERVAL_MS, String.valueOf(intervalMs)); + return this; + } + + public Builder withMaxRetryIntervalMs(long intervalMs) { + fileSystemRetryConfig.setValue(MAX_RETRY_INTERVAL_MS, String.valueOf(intervalMs)); + return this; + } + + public Builder withFileSystemActionRetryEnabled(boolean enabled) { + fileSystemRetryConfig.setValue(FILESYSTEM_RETRY_ENABLE, String.valueOf(enabled)); + return this; + } + + public FileSystemRetryConfig build() { + fileSystemRetryConfig.setDefaults(FileSystemRetryConfig.class.getName()); + return fileSystemRetryConfig; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieRetryWrapperFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieRetryWrapperFileSystem.java new file mode 100644 index 0000000000000..075f811a42ea7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieRetryWrapperFileSystem.java @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
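A minimal sketch of how the builder above might be used; the values simply echo the documented defaults and are not prescriptive:

import org.apache.hudi.common.fs.FileSystemRetryConfig;

public class RetryConfigSketch {
  public static void main(String[] args) {
    FileSystemRetryConfig retryConfig = FileSystemRetryConfig.newBuilder()
        .withFileSystemActionRetryEnabled(true)   // hoodie.filesystem.operation.retry.enable
        .withInitialRetryIntervalMs(100L)         // first backoff interval
        .withMaxRetryIntervalMs(2000L)            // cap for the exponential backoff
        .withMaxRetryNumbers(4)                   // total retry attempts
        .build();
    System.out.println(retryConfig.isFileSystemActionRetryEnable());
  }
}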
+ */ + +package org.apache.hudi.common.fs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CreateFlag; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Options; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.Progressable; +import org.apache.hudi.common.util.RetryHelper; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.EnumSet; + +public class HoodieRetryWrapperFileSystem extends FileSystem { + + private FileSystem fileSystem; + private long maxRetryIntervalMs; + private int maxRetryNumbers; + private long initialRetryIntervalMs; + private String retryExceptionsList; + + public HoodieRetryWrapperFileSystem(FileSystem fs, long maxRetryIntervalMs, int maxRetryNumbers, long initialRetryIntervalMs, String retryExceptions) { + this.fileSystem = fs; + this.maxRetryIntervalMs = maxRetryIntervalMs; + this.maxRetryNumbers = maxRetryNumbers; + this.initialRetryIntervalMs = initialRetryIntervalMs; + this.retryExceptionsList = retryExceptions; + + } + + @Override + public URI getUri() { + return fileSystem.getUri(); + } + + @Override + public FSDataInputStream open(Path f, int bufferSize) throws IOException { + return (FSDataInputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.open(f, bufferSize)).start(); + } + + @Override + public FSDataInputStream open(Path f) throws IOException { + return (FSDataInputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.open(f)).start(); + } + + @Override + public FSDataOutputStream create(Path f, + FsPermission permission, + boolean overwrite, + int bufferSize, + short replication, + long blockSize, + Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, permission, overwrite, bufferSize, replication, blockSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, overwrite)).start(); + } + + @Override + public FSDataOutputStream create(Path f) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f)).start(); + } + + @Override + public FSDataOutputStream create(Path f, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, short replication) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, 
replication)).start(); + } + + @Override + public FSDataOutputStream create(Path f, short replication, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, replication, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, Progressable progress) + throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize, + Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize, replication, blockSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, + short replication, long blockSize, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, permission, flags, bufferSize, replication, blockSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, + short replication, long blockSize, Progressable progress, Options.ChecksumOpt checksumOpt) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, permission, flags, bufferSize, replication, + blockSize, progress, checksumOpt)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize) + throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize, replication, blockSize)).start(); + } + + @Override + public boolean createNewFile(Path f) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.createNewFile(f)).start(); + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.append(f, bufferSize, progress)).start(); + } + + @Override + public FSDataOutputStream append(Path f) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.append(f)).start(); + } + + 
@Override + public FSDataOutputStream append(Path f, int bufferSize) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.append(f, bufferSize)).start(); + } + + @Override + public boolean rename(Path src, Path dst) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.rename(src, dst)).start(); + } + + @Override + public boolean delete(Path f, boolean recursive) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.delete(f, recursive)).start(); + } + + @Override + public boolean delete(Path f) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.delete(f, true)).start(); + } + + @Override + public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(f)).start(); + } + + @Override + public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(f, filter)).start(); + } + + @Override + public FileStatus[] listStatus(Path[] files) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(files)).start(); + } + + @Override + public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(files, filter)).start(); + } + + @Override + public FileStatus[] globStatus(Path pathPattern) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.globStatus(pathPattern)).start(); + } + + @Override + public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.globStatus(pathPattern, filter)).start(); + } + + @Override + public RemoteIterator listLocatedStatus(Path f) throws IOException { + return (RemoteIterator) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listLocatedStatus(f)).start(); + } + + @Override + public RemoteIterator listFiles(Path f, boolean recursive) throws IOException { + return (RemoteIterator) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.listFiles(f, recursive)).start(); + } + + @Override + public void setWorkingDirectory(Path newDir) { + fileSystem.setWorkingDirectory(newDir); + } + + @Override + public Path getWorkingDirectory() { + return fileSystem.getWorkingDirectory(); + } + + @Override + public boolean mkdirs(Path f, FsPermission permission) throws 
IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.mkdirs(f, permission)).start(); + } + + @Override + public FileStatus getFileStatus(Path f) throws IOException { + return (FileStatus) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.getFileStatus(f)).start(); + } + + @Override + public boolean exists(Path f) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.exists(f)).start(); + } + + @Override + public Configuration getConf() { + return fileSystem.getConf(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java index c3f6189e8a97e..2979696be7157 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java @@ -19,6 +19,9 @@ package org.apache.hudi.common.fs; import org.apache.hudi.common.metrics.Registry; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; @@ -46,6 +49,7 @@ import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Progressable; +import org.apache.hudi.hadoop.CachingPath; import java.io.IOException; import java.net.URI; @@ -57,6 +61,8 @@ import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeoutException; +import static org.apache.hudi.common.fs.StorageSchemes.HDFS; + /** * HoodieWrapperFileSystem wraps the default file system. It holds state about the open streams in the file system to * support getting the written size to each of the open streams. @@ -65,15 +71,58 @@ public class HoodieWrapperFileSystem extends FileSystem { public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; - private enum MetricName { - create, rename, delete, listStatus, mkdirs, getFileStatus, globStatus, listFiles + private static final String TMP_PATH_POSTFIX = ".tmp"; + + protected enum MetricName { + create, rename, delete, listStatus, mkdirs, getFileStatus, globStatus, listFiles, read, write + } + + private static Registry METRICS_REGISTRY_DATA; + private static Registry METRICS_REGISTRY_META; + + public static void setMetricsRegistry(Registry registry, Registry registryMeta) { + METRICS_REGISTRY_DATA = registry; + METRICS_REGISTRY_META = registryMeta; } + private ConcurrentMap openStreams = new ConcurrentHashMap<>(); private FileSystem fileSystem; private URI uri; private ConsistencyGuard consistencyGuard = new NoOpConsistencyGuard(); - private Registry metricsRegistry = Registry.getRegistry(this.getClass().getSimpleName()); + + @FunctionalInterface + public interface CheckedFunction { + R get() throws IOException; + } + + private static Registry getMetricRegistryForPath(Path p) { + return ((p != null) && (p.toString().contains(HoodieTableMetaClient.METAFOLDER_NAME))) + ? 
METRICS_REGISTRY_META : METRICS_REGISTRY_DATA; + } + + protected static R executeFuncWithTimeMetrics(String metricName, Path p, CheckedFunction func) throws IOException { + HoodieTimer timer = new HoodieTimer().startTimer(); + R res = func.get(); + + Registry registry = getMetricRegistryForPath(p); + if (registry != null) { + registry.increment(metricName); + registry.add(metricName + ".totalDuration", timer.endTimer()); + } + + return res; + } + + protected static R executeFuncWithTimeAndByteMetrics(String metricName, Path p, long byteCount, + CheckedFunction func) throws IOException { + Registry registry = getMetricRegistryForPath(p); + if (registry != null) { + registry.add(metricName + ".totalBytes", byteCount); + } + + return executeFuncWithTimeMetrics(metricName, p, func); + } public HoodieWrapperFileSystem() {} @@ -92,13 +141,16 @@ public static Path convertToHoodiePath(Path file, Configuration conf) { } } - private static Path convertPathWithScheme(Path oldPath, String newScheme) { + public static Path convertPathWithScheme(Path oldPath, String newScheme) { URI oldURI = oldPath.toUri(); URI newURI; try { - newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), oldURI.getPath(), - oldURI.getQuery(), oldURI.getFragment()); - return new Path(newURI); + newURI = new URI(newScheme, + oldURI.getAuthority(), + oldURI.getPath(), + oldURI.getQuery(), + oldURI.getFragment()); + return new CachingPath(newURI); } catch (URISyntaxException e) { // TODO - Better Exception handling throw new RuntimeException(e); @@ -140,16 +192,17 @@ public URI getUri() { @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { - return fileSystem.open(convertToDefaultPath(f), bufferSize); + return wrapInputStream(f, fileSystem.open(convertToDefaultPath(f), bufferSize)); } @Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, - short replication, long blockSize, Progressable progress) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - final Path translatedPath = convertToDefaultPath(f); - return wrapOutputStream(f, - fileSystem.create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, progress)); + short replication, long blockSize, Progressable progress) throws IOException { + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + final Path translatedPath = convertToDefaultPath(f); + return wrapOutputStream(f, + fileSystem.create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, progress)); + }); } private FSDataOutputStream wrapOutputStream(final Path path, FSDataOutputStream fsDataOutputStream) @@ -164,79 +217,97 @@ private FSDataOutputStream wrapOutputStream(final Path path, FSDataOutputStream return os; } + private FSDataInputStream wrapInputStream(final Path path, FSDataInputStream fsDataInputStream) throws IOException { + if (fsDataInputStream instanceof TimedFSDataInputStream) { + return fsDataInputStream; + } + return new TimedFSDataInputStream(path, fsDataInputStream); + } + @Override public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite)); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite)); + }); } @Override public 
FSDataOutputStream create(Path f) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f))); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f))); + }); } @Override public FSDataOutputStream create(Path f, Progressable progress) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), progress)); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), progress)); + }); } @Override public FSDataOutputStream create(Path f, short replication) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), replication)); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), replication)); + }); } @Override public FSDataOutputStream create(Path f, short replication, Progressable progress) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), replication, progress)); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), replication, progress)); + }); } @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize)); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize)); + }); } @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, Progressable progress) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress)); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress)); + }); } @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize, - Progressable progress) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, - fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, progress)); + Progressable progress) throws IOException { + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, + fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, progress)); + }); } @Override public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, - short replication, long blockSize, Progressable progress) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, - fileSystem.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, progress)); + short 
replication, long blockSize, Progressable progress) throws IOException { + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, + fileSystem.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, progress)); + }); } @Override public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, - short replication, long blockSize, Progressable progress, Options.ChecksumOpt checksumOpt) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, - blockSize, progress, checksumOpt)); + short replication, long blockSize, Progressable progress, Options.ChecksumOpt checksumOpt) throws IOException { + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), permission, flags, bufferSize, replication, + blockSize, progress, checksumOpt)); + }); } @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize) throws IOException { - this.metricsRegistry.increment(MetricName.create.name()); - return wrapOutputStream(f, - fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize)); + return executeFuncWithTimeMetrics(MetricName.create.name(), f, () -> { + return wrapOutputStream(f, + fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize)); + }); } @Override @@ -246,50 +317,53 @@ public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) @Override public boolean rename(Path src, Path dst) throws IOException { - this.metricsRegistry.increment(MetricName.rename.name()); - try { - consistencyGuard.waitTillFileAppears(convertToDefaultPath(src)); - } catch (TimeoutException e) { - throw new HoodieException("Timed out waiting for " + src + " to appear", e); - } - - boolean success = fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst)); - - if (success) { + return executeFuncWithTimeMetrics(MetricName.rename.name(), src, () -> { try { - consistencyGuard.waitTillFileAppears(convertToDefaultPath(dst)); + consistencyGuard.waitTillFileAppears(convertToDefaultPath(src)); } catch (TimeoutException e) { - throw new HoodieException("Timed out waiting for " + dst + " to appear", e); + throw new HoodieException("Timed out waiting for " + src + " to appear", e); } - try { - consistencyGuard.waitTillFileDisappears(convertToDefaultPath(src)); - } catch (TimeoutException e) { - throw new HoodieException("Timed out waiting for " + src + " to disappear", e); + boolean success = fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst)); + + if (success) { + try { + consistencyGuard.waitTillFileAppears(convertToDefaultPath(dst)); + } catch (TimeoutException e) { + throw new HoodieException("Timed out waiting for " + dst + " to appear", e); + } + + try { + consistencyGuard.waitTillFileDisappears(convertToDefaultPath(src)); + } catch (TimeoutException e) { + throw new HoodieException("Timed out waiting for " + src + " to disappear", e); + } } - } - return success; + return success; + }); } @Override public boolean delete(Path f, boolean recursive) throws IOException { - this.metricsRegistry.increment(MetricName.delete.name()); - boolean success = fileSystem.delete(convertToDefaultPath(f), recursive); - - if (success) { - try { - 
consistencyGuard.waitTillFileDisappears(f); - } catch (TimeoutException e) { - throw new HoodieException("Timed out waiting for " + f + " to disappear", e); + return executeFuncWithTimeMetrics(MetricName.delete.name(), f, () -> { + boolean success = fileSystem.delete(convertToDefaultPath(f), recursive); + + if (success) { + try { + consistencyGuard.waitTillFileDisappears(f); + } catch (TimeoutException e) { + throw new HoodieException("Timed out waiting for " + f + " to disappear", e); + } } - } - return success; + return success; + }); } @Override public FileStatus[] listStatus(Path f) throws IOException { - this.metricsRegistry.increment(MetricName.listStatus.name()); - return fileSystem.listStatus(convertToDefaultPath(f)); + return executeFuncWithTimeMetrics(MetricName.listStatus.name(), f, () -> { + return fileSystem.listStatus(convertToDefaultPath(f)); + }); } @Override @@ -304,27 +378,29 @@ public void setWorkingDirectory(Path newDir) { @Override public boolean mkdirs(Path f, FsPermission permission) throws IOException { - this.metricsRegistry.increment(MetricName.mkdirs.name()); - boolean success = fileSystem.mkdirs(convertToDefaultPath(f), permission); - if (success) { - try { - consistencyGuard.waitTillFileAppears(convertToDefaultPath(f)); - } catch (TimeoutException e) { - throw new HoodieException("Timed out waiting for directory " + f + " to appear", e); + return executeFuncWithTimeMetrics(MetricName.mkdirs.name(), f, () -> { + boolean success = fileSystem.mkdirs(convertToDefaultPath(f), permission); + if (success) { + try { + consistencyGuard.waitTillFileAppears(convertToDefaultPath(f)); + } catch (TimeoutException e) { + throw new HoodieException("Timed out waiting for directory " + f + " to appear", e); + } } - } - return success; + return success; + }); } @Override public FileStatus getFileStatus(Path f) throws IOException { - this.metricsRegistry.increment(MetricName.getFileStatus.name()); - try { - consistencyGuard.waitTillFileAppears(convertToDefaultPath(f)); - } catch (TimeoutException e) { - // pass - } - return fileSystem.getFileStatus(convertToDefaultPath(f)); + return executeFuncWithTimeMetrics(MetricName.getFileStatus.name(), f, () -> { + try { + consistencyGuard.waitTillFileAppears(convertToDefaultPath(f)); + } catch (TimeoutException e) { + // pass + } + return fileSystem.getFileStatus(convertToDefaultPath(f)); + }); } @Override @@ -389,12 +465,12 @@ public Path resolvePath(Path p) throws IOException { @Override public FSDataInputStream open(Path f) throws IOException { - return fileSystem.open(convertToDefaultPath(f)); + return wrapInputStream(f, fileSystem.open(convertToDefaultPath(f))); } @Override public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, short replication, - long blockSize, Progressable progress) throws IOException { + long blockSize, Progressable progress) throws IOException { Path p = convertToDefaultPath(f); return wrapOutputStream(p, fileSystem.createNonRecursive(p, overwrite, bufferSize, replication, blockSize, progress)); @@ -402,7 +478,7 @@ public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int buff @Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, int bufferSize, - short replication, long blockSize, Progressable progress) throws IOException { + short replication, long blockSize, Progressable progress) throws IOException { Path p = convertToDefaultPath(f); return wrapOutputStream(p, fileSystem.createNonRecursive(p, permission, 
overwrite, bufferSize, replication, blockSize, progress)); @@ -410,7 +486,7 @@ public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, bo @Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, EnumSet flags, - int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { + int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { Path p = convertToDefaultPath(f); return wrapOutputStream(p, fileSystem.createNonRecursive(p, permission, flags, bufferSize, replication, blockSize, progress)); @@ -462,8 +538,9 @@ public boolean setReplication(Path src, short replication) throws IOException { @Override public boolean delete(Path f) throws IOException { - this.metricsRegistry.increment(MetricName.delete.name()); - return delete(f, true); + return executeFuncWithTimeMetrics(MetricName.delete.name(), f, () -> { + return delete(f, true); + }); } @Override @@ -508,32 +585,37 @@ public RemoteIterator listCorruptFileBlocks(Path path) throws IOException @Override public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException { - this.metricsRegistry.increment(MetricName.listStatus.name()); - return fileSystem.listStatus(convertToDefaultPath(f), filter); + return executeFuncWithTimeMetrics(MetricName.listStatus.name(), f, () -> { + return fileSystem.listStatus(convertToDefaultPath(f), filter); + }); } @Override public FileStatus[] listStatus(Path[] files) throws IOException { - this.metricsRegistry.increment(MetricName.listStatus.name()); - return fileSystem.listStatus(convertDefaults(files)); + return executeFuncWithTimeMetrics(MetricName.listStatus.name(), files.length > 0 ? files[0] : null, () -> { + return fileSystem.listStatus(convertDefaults(files)); + }); } @Override public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException { - this.metricsRegistry.increment(MetricName.listStatus.name()); - return fileSystem.listStatus(convertDefaults(files), filter); + return executeFuncWithTimeMetrics(MetricName.listStatus.name(), files.length > 0 ? 
files[0] : null, () -> { + return fileSystem.listStatus(convertDefaults(files), filter); + }); } @Override public FileStatus[] globStatus(Path pathPattern) throws IOException { - this.metricsRegistry.increment(MetricName.globStatus.name()); - return fileSystem.globStatus(convertToDefaultPath(pathPattern)); + return executeFuncWithTimeMetrics(MetricName.globStatus.name(), pathPattern, () -> { + return fileSystem.globStatus(convertToDefaultPath(pathPattern)); + }); } @Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException { - this.metricsRegistry.increment(MetricName.globStatus.name()); - return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter); + return executeFuncWithTimeMetrics(MetricName.globStatus.name(), pathPattern, () -> { + return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter); + }); } @Override @@ -543,8 +625,9 @@ public RemoteIterator listLocatedStatus(Path f) throws IOExce @Override public RemoteIterator listFiles(Path f, boolean recursive) throws IOException { - this.metricsRegistry.increment(MetricName.listFiles.name()); - return fileSystem.listFiles(convertToDefaultPath(f), recursive); + return executeFuncWithTimeMetrics(MetricName.listFiles.name(), f, () -> { + return fileSystem.listFiles(convertToDefaultPath(f), recursive); + }); } @Override @@ -554,16 +637,17 @@ public Path getHomeDirectory() { @Override public boolean mkdirs(Path f) throws IOException { - this.metricsRegistry.increment(MetricName.mkdirs.name()); - boolean success = fileSystem.mkdirs(convertToDefaultPath(f)); - if (success) { - try { - consistencyGuard.waitTillFileAppears(convertToDefaultPath(f)); - } catch (TimeoutException e) { - throw new HoodieException("Timed out waiting for directory " + f + " to appear", e); + return executeFuncWithTimeMetrics(MetricName.mkdirs.name(), f, () -> { + boolean success = fileSystem.mkdirs(convertToDefaultPath(f)); + if (success) { + try { + consistencyGuard.waitTillFileAppears(convertToDefaultPath(f)); + } catch (TimeoutException e) { + throw new HoodieException("Timed out waiting for directory " + f + " to appear", e); + } } - } - return success; + return success; + }); } @Override @@ -910,6 +994,65 @@ public long getBytesWritten(Path file) { file.toString() + " does not have a open stream. Cannot get the bytes written on the stream"); } + protected boolean needCreateTempFile() { + return HDFS.getScheme().equals(fileSystem.getScheme()); + } + + /** + * Creates a new file with overwrite set to false. This ensures files are created + * only once and never rewritten, also, here we take care if the content is not + * empty, will first write the content to a temp file if {needCreateTempFile} is + * true, and then rename it back after the content is written. 
+ * + * @param fullPath File Path + * @param content Content to be stored + */ + public void createImmutableFileInPath(Path fullPath, Option content) + throws HoodieIOException { + FSDataOutputStream fsout = null; + Path tmpPath = null; + + boolean needTempFile = needCreateTempFile(); + + try { + if (!content.isPresent()) { + fsout = fileSystem.create(fullPath, false); + } + + if (content.isPresent() && needTempFile) { + Path parent = fullPath.getParent(); + tmpPath = new Path(parent, fullPath.getName() + TMP_PATH_POSTFIX); + fsout = fileSystem.create(tmpPath, false); + fsout.write(content.get()); + } + + if (content.isPresent() && !needTempFile) { + fsout = fileSystem.create(fullPath, false); + fsout.write(content.get()); + } + } catch (IOException e) { + String errorMsg = "Failed to create file" + (tmpPath != null ? tmpPath : fullPath); + throw new HoodieIOException(errorMsg, e); + } finally { + try { + if (null != fsout) { + fsout.close(); + } + } catch (IOException e) { + String errorMsg = "Failed to close file" + (needTempFile ? tmpPath : fullPath); + throw new HoodieIOException(errorMsg, e); + } + + try { + if (null != tmpPath) { + fileSystem.rename(tmpPath, fullPath); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to rename " + tmpPath + " to the target " + fullPath, e); + } + } + } + public FileSystem getFileSystem() { return fileSystem; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/NoOpConsistencyGuard.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/NoOpConsistencyGuard.java index 058b3a104360d..ef4d7a4035300 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/NoOpConsistencyGuard.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/NoOpConsistencyGuard.java @@ -23,7 +23,8 @@ import java.util.List; /** - * Default Consistency guard that does nothing. Used for HDFS deployments + * Default Consistency guard that does nothing. Used for lake storage which provided read-after-write + * guarantees. */ public class NoOpConsistencyGuard implements ConsistencyGuard { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/SchemeAwareFSDataInputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/SchemeAwareFSDataInputStream.java new file mode 100644 index 0000000000000..8795bf19d3568 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/SchemeAwareFSDataInputStream.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.fs; + +import org.apache.hadoop.fs.FSDataInputStream; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; + +/** + * Scheme aware FSDataInputStream so that we manipulate seeks for GS filesystem. 
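/*
 * Usage sketch for createImmutableFileInPath(...) above (illustrative only): the content Option is
 * assumed to carry a byte[] payload, since the generics were elided in this hunk. On HDFS the bytes
 * first land in a temporary side file and are then renamed into place, so readers never observe a
 * partially written file; on other storages the file is created directly with overwrite=false.
 */
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.util.Option;

class ImmutableFileWriteSketch {

  // Writes the payload exactly once; a HoodieIOException is raised if the file already exists
  // or the write/rename fails, per the catch blocks shown above.
  static void writeOnce(HoodieWrapperFileSystem wrapperFs, Path target, byte[] payload) {
    wrapperFs.createImmutableFileInPath(target, Option.of(payload));
  }
}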
+ */ +public class SchemeAwareFSDataInputStream extends FSDataInputStream { + + private final boolean isGCSFileSystem; + + public SchemeAwareFSDataInputStream(InputStream in, boolean isGCSFileSystem) { + super(in); + this.isGCSFileSystem = isGCSFileSystem; + } + + @Override + public void seek(long desired) throws IOException { + try { + super.seek(desired); + } catch (EOFException e) { + // with GCSFileSystem, accessing the last byte might throw EOFException and hence this fix. + if (isGCSFileSystem) { + super.seek(desired - 1); + } else { + throw e; + } + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareFSDataOutputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareFSDataOutputStream.java index 0b70bedc0988c..361d418c2f7f9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareFSDataOutputStream.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareFSDataOutputStream.java @@ -43,8 +43,8 @@ public class SizeAwareFSDataOutputStream extends FSDataOutputStream { private final ConsistencyGuard consistencyGuard; public SizeAwareFSDataOutputStream(Path path, FSDataOutputStream out, ConsistencyGuard consistencyGuard, - Runnable closeCallback) throws IOException { - super(out, null); + Runnable closeCallback) throws IOException { + super(out, null, out.getPos()); this.path = path; this.closeCallback = closeCallback; this.consistencyGuard = consistencyGuard; @@ -52,14 +52,22 @@ public SizeAwareFSDataOutputStream(Path path, FSDataOutputStream out, Consistenc @Override public synchronized void write(byte[] b, int off, int len) throws IOException { - bytesWritten.addAndGet(len); - super.write(b, off, len); + HoodieWrapperFileSystem.executeFuncWithTimeAndByteMetrics(HoodieWrapperFileSystem.MetricName.write.name(), path, + len, () -> { + bytesWritten.addAndGet(len); + super.write(b, off, len); + return null; + }); } @Override public void write(byte[] b) throws IOException { - bytesWritten.addAndGet(b.length); - super.write(b); + HoodieWrapperFileSystem.executeFuncWithTimeAndByteMetrics(HoodieWrapperFileSystem.MetricName.write.name(), path, + b.length, () -> { + bytesWritten.addAndGet(b.length); + super.write(b); + return null; + }); } @Override @@ -76,5 +84,4 @@ public void close() throws IOException { public long getBytesWritten() { return bytesWritten.get(); } - } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java index 7ebf641197bb3..10619f8b3afaf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/StorageSchemes.java @@ -28,6 +28,8 @@ public enum StorageSchemes { FILE("file", false), // Hadoop File System HDFS("hdfs", true), + // Baidu Advanced File System + AFS("afs", true), // Mapr File System MAPRFS("maprfs", true), // Apache Ignite FS @@ -50,10 +52,24 @@ public enum StorageSchemes { ALLUXIO("alluxio", false), // Tencent Cloud Object Storage COSN("cosn", false), + // Tencent Cloud HDFS + CHDFS("ofs", true), + // Tencent Cloud CacheFileSystem + GOOSEFS("gfs", false), // Databricks file system DBFS("dbfs", false), // IBM Cloud Object Storage - COS("cos", false); + COS("cos", false), + // Huawei Cloud Object Storage + OBS("obs", false), + // Kingsoft Standard Storage ks3 + KS3("ks3", false), + // JuiceFileSystem + JFS("jfs", true), + // Baidu Object Storage + BOS("bos", false), + // Oracle Cloud Infrastructure 
Object Storage + OCI("oci", false); private String scheme; private boolean supportsAppend; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/TimedFSDataInputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/TimedFSDataInputStream.java new file mode 100644 index 0000000000000..eca8ec368b869 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/TimedFSDataInputStream.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.fs; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.ReadOption; +import org.apache.hadoop.io.ByteBufferPool; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.EnumSet; + +/** + * Wrapper over FSDataInputStream that also times the operations. + */ +public class TimedFSDataInputStream extends FSDataInputStream { + + // Path + private final Path path; + + public TimedFSDataInputStream(Path path, FSDataInputStream in) { + super(in); + this.path = path; + } + + @Override + public int read(ByteBuffer buf) throws IOException { + return HoodieWrapperFileSystem.executeFuncWithTimeAndByteMetrics(HoodieWrapperFileSystem.MetricName.read.name(), + path, 0, () -> super.read(buf)); + } + + @Override + public int read(long position, byte[] buffer, int offset, int length) throws IOException { + return HoodieWrapperFileSystem.executeFuncWithTimeAndByteMetrics(HoodieWrapperFileSystem.MetricName.read.name(), + path, length, () -> super.read(position, buffer, offset, length)); + } + + @Override + public ByteBuffer read(ByteBufferPool bufferPool, int maxLength, EnumSet opts) + throws IOException, UnsupportedOperationException { + return HoodieWrapperFileSystem.executeFuncWithTimeAndByteMetrics(HoodieWrapperFileSystem.MetricName.read.name(), + path, maxLength, () -> super.read(bufferPool, maxLength, opts)); + } + + @Override + public void readFully(long position, byte[] buffer) throws IOException { + HoodieWrapperFileSystem.executeFuncWithTimeAndByteMetrics(HoodieWrapperFileSystem.MetricName.read.name(), + path, buffer.length, () -> { + super.readFully(position, buffer); + return null; + }); + } + + @Override + public void readFully(long position, byte[] buffer, int offset, int length) throws IOException { + HoodieWrapperFileSystem.executeFuncWithTimeAndByteMetrics(HoodieWrapperFileSystem.MetricName.read.name(), + path, length, () -> { + super.readFully(position, buffer, offset, length); + return null; + }); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java index e4570f94227cf..6031f29d907d3 100644 --- 
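/*
 * Usage sketch for TimedFSDataInputStream above (illustrative only): it is a thin decorator, so any
 * FSDataInputStream can be wrapped and every positional read is then counted and timed via
 * executeFuncWithTimeAndByteMetrics. The local Configuration and file path are placeholders.
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.TimedFSDataInputStream;

import java.io.IOException;

class TimedReadSketch {

  static byte[] readWithMetrics(Path file, int len) throws IOException {
    FileSystem fs = file.getFileSystem(new Configuration());
    byte[] buffer = new byte[len];
    try (FSDataInputStream in = new TimedFSDataInputStream(file, fs.open(file))) {
      in.readFully(0, buffer);   // goes through the overridden readFully(position, buffer) above
    }
    return buffer;
  }
}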
a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java @@ -20,6 +20,10 @@ import org.apache.hadoop.fs.Path; +import java.io.File; + +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; + /** * Utils to parse InLineFileSystem paths. * Inline FS format: @@ -29,70 +33,90 @@ public class InLineFSUtils { private static final String START_OFFSET_STR = "start_offset"; private static final String LENGTH_STR = "length"; + private static final String PATH_SEPARATOR = "/"; + private static final String SCHEME_SEPARATOR = ":"; private static final String EQUALS_STR = "="; + private static final String LOCAL_FILESYSTEM_SCHEME = "file"; /** - * Fetch inline file path from outer path. - * Eg - * Input: - * Path = s3a://file1, origScheme: file, startOffset = 20, length = 40 - * Output: "inlinefs:/file1/s3a/?start_offset=20&length=40" + * Get the InlineFS Path for a given schema and its Path. + *

    + * Examples: + * Input Path: s3a://file1, origScheme: file, startOffset = 20, length = 40 + * Output: "inlinefs://file1/s3a/?start_offset=20&length=40" * - * @param outerPath - * @param origScheme - * @param inLineStartOffset - * @param inLineLength - * @return + * @param outerPath The outer file Path + * @param origScheme The file schema + * @param inLineStartOffset Start offset for the inline file + * @param inLineLength Length for the inline file + * @return InlineFS Path for the requested outer path and schema */ public static Path getInlineFilePath(Path outerPath, String origScheme, long inLineStartOffset, long inLineLength) { - String subPath = outerPath.toString().substring(outerPath.toString().indexOf(":") + 1); + final String subPath = new File(outerPath.toString().substring(outerPath.toString().indexOf(":") + 1)).getPath(); return new Path( - InLineFileSystem.SCHEME + "://" + subPath + "/" + origScheme - + "/" + "?" + START_OFFSET_STR + EQUALS_STR + inLineStartOffset + InLineFileSystem.SCHEME + SCHEME_SEPARATOR + PATH_SEPARATOR + subPath + PATH_SEPARATOR + origScheme + + PATH_SEPARATOR + "?" + START_OFFSET_STR + EQUALS_STR + inLineStartOffset + "&" + LENGTH_STR + EQUALS_STR + inLineLength ); } /** - * Inline file format - * "inlinefs:////?start_offset=start_offset>&length=" - * Outer File format - * "://" + * InlineFS Path format: + * "inlinefs://path/to/outer/file/outer_file_scheme/?start_offset=start_offset>&length=" *

    - * Eg input : "inlinefs://file1/sa3/?start_offset=20&length=40". - * Output : "sa3://file1" + * Outer File Path format: + * "outer_file_scheme://path/to/outer/file" + *

    + * Example + * Input: "inlinefs://file1/s3a/?start_offset=20&length=40". + * Output: "s3a://file1" * - * @param inlinePath inline file system path - * @return + * @param inlineFSPath InLineFS Path to get the outer file Path + * @return Outer file Path from the InLineFS Path */ - public static Path getOuterfilePathFromInlinePath(Path inlinePath) { - String scheme = inlinePath.getParent().getName(); - Path basePath = inlinePath.getParent().getParent(); - return new Path(basePath.toString().replaceFirst(InLineFileSystem.SCHEME, scheme)); + public static Path getOuterFilePathFromInlinePath(Path inlineFSPath) { + assertInlineFSPath(inlineFSPath); + + final String outerFileScheme = inlineFSPath.getParent().getName(); + final Path basePath = inlineFSPath.getParent().getParent(); + checkArgument(basePath.toString().contains(SCHEME_SEPARATOR), + "Invalid InLineFS path: " + inlineFSPath); + + final String pathExceptScheme = basePath.toString().substring(basePath.toString().indexOf(SCHEME_SEPARATOR) + 1); + final String fullPath = outerFileScheme + SCHEME_SEPARATOR + + (outerFileScheme.equals(LOCAL_FILESYSTEM_SCHEME) ? PATH_SEPARATOR : "") + + pathExceptScheme; + return new Path(fullPath); } /** - * Eg input : "inlinefs://file1/s3a/?start_offset=20&length=40". - * output: 20 + * Returns start offset w/in the base for the block identified by the given InlineFS path * - * @param inlinePath - * @return + * input: "inlinefs://file1/s3a/?start_offset=20&length=40". + * output: 20 */ - public static int startOffset(Path inlinePath) { - String[] slices = inlinePath.toString().split("[?&=]"); - return Integer.parseInt(slices[slices.length - 3]); + public static long startOffset(Path inlineFSPath) { + assertInlineFSPath(inlineFSPath); + + String[] slices = inlineFSPath.toString().split("[?&=]"); + return Long.parseLong(slices[slices.length - 3]); } /** - * Eg input : "inlinefs:/file1/s3a/?start_offset=20&length=40". - * Output: 40 + * Returns length of the block (embedded w/in the base file) identified by the given InlineFS path * - * @param inlinePath - * @return + * input: "inlinefs:/file1/s3a/?start_offset=20&length=40". 
+ * output: 40 */ - public static int length(Path inlinePath) { + public static long length(Path inlinePath) { + assertInlineFSPath(inlinePath); + String[] slices = inlinePath.toString().split("[?&=]"); - return Integer.parseInt(slices[slices.length - 1]); + return Long.parseLong(slices[slices.length - 1]); } + private static void assertInlineFSPath(Path inlinePath) { + String scheme = inlinePath.toUri().getScheme(); + checkArgument(InLineFileSystem.SCHEME.equals(scheme)); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java index 4c693c5c5d0f4..1b2ea3cbedcf5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java @@ -57,13 +57,14 @@ public URI getUri() { return URI.create(getScheme()); } + @Override public String getScheme() { return SCHEME; } @Override public FSDataInputStream open(Path inlinePath, int bufferSize) throws IOException { - Path outerPath = InLineFSUtils.getOuterfilePathFromInlinePath(inlinePath); + Path outerPath = InLineFSUtils.getOuterFilePathFromInlinePath(inlinePath); FileSystem outerFs = outerPath.getFileSystem(conf); FSDataInputStream outerStream = outerFs.open(outerPath, bufferSize); return new InLineFsDataInputStream(InLineFSUtils.startOffset(inlinePath), outerStream, InLineFSUtils.length(inlinePath)); @@ -80,7 +81,7 @@ public boolean exists(Path f) { @Override public FileStatus getFileStatus(Path inlinePath) throws IOException { - Path outerPath = InLineFSUtils.getOuterfilePathFromInlinePath(inlinePath); + Path outerPath = InLineFSUtils.getOuterFilePathFromInlinePath(inlinePath); FileSystem outerFs = outerPath.getFileSystem(conf); FileStatus status = outerFs.getFileStatus(outerPath); FileStatus toReturn = new FileStatus(InLineFSUtils.length(inlinePath), status.isDirectory(), status.getReplication(), status.getBlockSize(), @@ -129,5 +130,4 @@ public Path getWorkingDirectory() { public boolean mkdirs(Path path, FsPermission fsPermission) throws IOException { throw new UnsupportedOperationException("Can't set working directory"); } - } \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFsDataInputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFsDataInputStream.java index 4e8701244c2ad..fbd067c6c18cb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFsDataInputStream.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFsDataInputStream.java @@ -33,11 +33,11 @@ */ public class InLineFsDataInputStream extends FSDataInputStream { - private final int startOffset; + private final long startOffset; private final FSDataInputStream outerStream; - private final int length; + private final long length; - public InLineFsDataInputStream(int startOffset, FSDataInputStream outerStream, int length) throws IOException { + public InLineFsDataInputStream(long startOffset, FSDataInputStream outerStream, long length) throws IOException { super(outerStream.getWrappedStream()); this.startOffset = startOffset; this.outerStream = outerStream; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/FunctionWrapper.java b/hudi-common/src/main/java/org/apache/hudi/common/function/FunctionWrapper.java new file mode 100644 index 0000000000000..40e1a9d3f7c46 --- /dev/null +++ 
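/*
 * Round-trip sketch for the reworked InLineFSUtils above (illustrative paths and offsets): an inline
 * path embeds the outer file's scheme together with a start offset and length as query parameters,
 * and the renamed getOuterFilePathFromInlinePath plus the long-returning startOffset/length helpers
 * recover all three pieces from it.
 */
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.inline.InLineFSUtils;

class InlinePathRoundTripSketch {

  public static void main(String[] args) {
    Path outer = new Path("s3a://bucket/tbl/.hoodie/some.log");

    // Roughly: inlinefs://bucket/tbl/.hoodie/some.log/s3a/?start_offset=20&length=40
    Path inline = InLineFSUtils.getInlineFilePath(outer, "s3a", 20L, 40L);

    Path back = InLineFSUtils.getOuterFilePathFromInlinePath(inline); // s3a://bucket/tbl/.hoodie/some.log
    long offset = InLineFSUtils.startOffset(inline);                  // 20
    long length = InLineFSUtils.length(inline);                       // 40
    System.out.println(back + " @ " + offset + " len " + length);
  }
}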
b/hudi-common/src/main/java/org/apache/hudi/common/function/FunctionWrapper.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.function; + +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; + +import java.util.function.BinaryOperator; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Stream; + +/** + * Function wrapper util class, which catches the exception thrown by input function and return a similar function + * with no exception thrown. + */ +public class FunctionWrapper { + + public static Function throwingMapWrapper(SerializableFunction throwingMapFunction) { + return v1 -> { + try { + return throwingMapFunction.apply(v1); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing map", e); + } + }; + } + + public static Function> throwingFlatMapWrapper(SerializableFunction> throwingFlatMapFunction) { + return v1 -> { + try { + return throwingFlatMapFunction.apply(v1); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing flatMap", e); + } + }; + } + + public static Consumer throwingForeachWrapper(SerializableConsumer throwingConsumer) { + return v1 -> { + try { + throwingConsumer.accept(v1); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing foreach", e); + } + }; + } + + public static Function> throwingMapToPairWrapper(SerializablePairFunction throwingPairFunction) { + return v1 -> { + try { + return throwingPairFunction.call(v1); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing mapToPair", e); + } + }; + } + + public static Function>> throwingFlatMapToPairWrapper( + SerializablePairFlatMapFunction throwingPairFlatMapFunction) { + return v1 -> { + try { + return throwingPairFlatMapFunction.call(v1); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing mapToPair", e); + } + }; + } + + public static BinaryOperator throwingReduceWrapper(SerializableBiFunction throwingReduceFunction) { + return (v1, v2) -> { + try { + return throwingReduceFunction.apply(v1, v2); + } catch (Exception e) { + throw new HoodieException("Error occurs when executing mapToPair", e); + } + }; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableBiFunction.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableBiFunction.java new file mode 100644 index 0000000000000..940396cf8e1ec --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableBiFunction.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
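/*
 * Usage sketch for FunctionWrapper above (illustrative only): the wrappers adapt the checked,
 * serializable functional interfaces to plain java.util.function types so they can drive ordinary
 * Java streams, rethrowing any failure as a HoodieException. The <I, O> generics on
 * SerializableFunction and throwingMapWrapper are assumed here, since the diff text elides them.
 */
import org.apache.hudi.common.function.FunctionWrapper;
import org.apache.hudi.common.function.SerializableFunction;

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

class FunctionWrapperSketch {

  static List<Integer> parseAll(List<String> raw) {
    // The lambda is allowed to throw; the wrapper converts that into an unchecked HoodieException.
    SerializableFunction<String, Integer> parse = Integer::valueOf;
    return raw.stream()
        .map(FunctionWrapper.throwingMapWrapper(parse))
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    System.out.println(parseAll(Arrays.asList("1", "2", "3"))); // [1, 2, 3]
  }
}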
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.function; + +import java.io.Serializable; + +/** + * A function that accepts two arguments and produces a result. + * + * @param the type of the first argument to the function + * @param the type of the second argument to the function + * @param the type of the result of the function + */ +@FunctionalInterface +public interface SerializableBiFunction extends Serializable { + R apply(T t, U u); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializableConsumer.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableConsumer.java similarity index 95% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializableConsumer.java rename to hudi-common/src/main/java/org/apache/hudi/common/function/SerializableConsumer.java index d7c420522e605..5448ee164bb32 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializableConsumer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableConsumer.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.client.common.function; +package org.apache.hudi.common.function; import java.io.Serializable; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializableFunction.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableFunction.java similarity index 95% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializableFunction.java rename to hudi-common/src/main/java/org/apache/hudi/common/function/SerializableFunction.java index d3714bc5b6bd1..7e9a270c622c2 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializableFunction.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableFunction.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hudi.client.common.function; +package org.apache.hudi.common.function; import java.io.Serializable; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableFunctionUnchecked.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableFunctionUnchecked.java new file mode 100644 index 0000000000000..fd62d033243ed --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableFunctionUnchecked.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.function; + +import java.io.Serializable; +import java.util.function.Function; + +/** + * Serializable {@link Function} interface that only might be throwing unchecked exceptions + * + * @param input type + * @param output type + */ +@FunctionalInterface +public interface SerializableFunctionUnchecked extends Serializable { + O apply(I v1); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFlatMapFunction.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFlatMapFunction.java new file mode 100644 index 0000000000000..4cc34ce6ee84c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFlatMapFunction.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.function; + +import org.apache.hudi.common.util.collection.Pair; + +import java.io.Serializable; +import java.util.stream.Stream; + +/** + * A function that returns a stream of key-value pairs (Pair<K, V>). + */ +@FunctionalInterface +public interface SerializablePairFlatMapFunction extends Serializable { + Stream> call(I t) throws Exception; +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializablePairFunction.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFunction.java similarity index 88% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializablePairFunction.java rename to hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFunction.java index 155837b7f0112..e3e730b6376c4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/common/function/SerializablePairFunction.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializablePairFunction.java @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -package org.apache.hudi.client.common.function; +package org.apache.hudi.common.function; -import scala.Tuple2; +import org.apache.hudi.common.util.collection.Pair; import java.io.Serializable; @@ -27,5 +27,5 @@ */ @FunctionalInterface public interface SerializablePairFunction extends Serializable { - Tuple2 call(I t) throws Exception; + Pair call(I t) throws Exception; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableSupplier.java b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableSupplier.java new file mode 100644 index 0000000000000..0500955f94dc8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/function/SerializableSupplier.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.function; + +import java.io.Serializable; + +@FunctionalInterface +public interface SerializableSupplier extends Serializable { + T get(); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/index/HoodieIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/index/HoodieIndex.java new file mode 100644 index 0000000000000..6dabb1a41f8cd --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/index/HoodieIndex.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
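/*
 * Sketch for the SerializablePairFunction change above (illustrative only): the interface now returns
 * Hudi's own Pair instead of Scala's Tuple2, which keeps hudi-common free of a Scala dependency. The
 * <I, K, V> generics are assumed, since the diff text elides them.
 */
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.collection.Pair;

class PairFunctionSketch {

  // Pair each input string with its length, purely for illustration.
  static final SerializablePairFunction<String, String, Integer> KEY_BY_LENGTH =
      s -> Pair.of(s, s.length());

  public static void main(String[] args) throws Exception {
    System.out.println(KEY_BY_LENGTH.call("2021/01/01")); // a Pair of the path and 10
  }
}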
+ */ + +package org.apache.hudi.common.index; + +import java.util.Arrays; +import java.util.Map; + +public class HoodieIndex { + private String indexName; + private String[] colNames; + private HoodieIndexType indexType; + private Map> colOptions; + private Map options; + + public HoodieIndex() { + } + + public HoodieIndex( + String indexName, + String[] colNames, + HoodieIndexType indexType, + Map> colOptions, + Map options) { + this.indexName = indexName; + this.colNames = colNames; + this.indexType = indexType; + this.colOptions = colOptions; + this.options = options; + } + + public String getIndexName() { + return indexName; + } + + public String[] getColNames() { + return colNames; + } + + public HoodieIndexType getIndexType() { + return indexType; + } + + public Map> getColOptions() { + return colOptions; + } + + public Map getOptions() { + return options; + } + + public static Builder builder() { + return new Builder(); + } + + @Override + public String toString() { + return "HoodieIndex{" + + "indexName='" + indexName + '\'' + + ", colNames='" + Arrays.toString(colNames) + '\'' + + ", indexType=" + indexType + + ", colOptions=" + colOptions + + ", options=" + options + + '}'; + } + + public static class Builder { + private String indexName; + private String[] colNames; + private HoodieIndexType indexType; + private Map> colOptions; + private Map options; + + public Builder setIndexName(String indexName) { + this.indexName = indexName; + return this; + } + + public Builder setColNames(String[] colNames) { + this.colNames = colNames; + return this; + } + + public Builder setIndexType(String indexType) { + this.indexType = HoodieIndexType.of(indexType); + return this; + } + + public Builder setColOptions(Map> colOptions) { + this.colOptions = colOptions; + return this; + } + + public Builder setOptions(Map options) { + this.options = options; + return this; + } + + public HoodieIndex build() { + return new HoodieIndex(indexName, colNames, indexType, colOptions, options); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/index/HoodieIndexType.java b/hudi-common/src/main/java/org/apache/hudi/common/index/HoodieIndexType.java new file mode 100644 index 0000000000000..03618a767906d --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/index/HoodieIndexType.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
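/*
 * Builder sketch for the new HoodieIndex value object above (illustrative only): the option maps are
 * assumed to be Map<String, Map<String, String>> and Map<String, String>, since the diff text elides
 * the generics; empty maps are used here.
 */
import org.apache.hudi.common.index.HoodieIndex;

import java.util.Collections;

class HoodieIndexBuilderSketch {

  public static void main(String[] args) {
    HoodieIndex index = HoodieIndex.builder()
        .setIndexName("idx_city")
        .setColNames(new String[] {"city"})
        .setIndexType("LUCENE")                // resolved through HoodieIndexType.of(String)
        .setColOptions(Collections.emptyMap())
        .setOptions(Collections.emptyMap())
        .build();
    System.out.println(index);                 // uses the toString() defined above
  }
}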
+ */ + +package org.apache.hudi.common.index; + +import org.apache.hudi.exception.HoodieIndexException; + +import java.util.Arrays; + +public enum HoodieIndexType { + LUCENE((byte) 1); + + private final byte type; + + HoodieIndexType(byte type) { + this.type = type; + } + + public byte getValue() { + return type; + } + + public static HoodieIndexType of(byte indexType) { + return Arrays.stream(HoodieIndexType.values()) + .filter(t -> t.type == indexType) + .findAny() + .orElseThrow(() -> + new HoodieIndexException("Unknown hoodie index type:" + indexType)); + } + + public static HoodieIndexType of(String indexType) { + return Arrays.stream(HoodieIndexType.values()) + .filter(t -> t.name().equals(indexType.toUpperCase())) + .findAny() + .orElseThrow(() -> + new HoodieIndexException("Unknown hoodie index type:" + indexType)); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/lock/LockProvider.java b/hudi-common/src/main/java/org/apache/hudi/common/lock/LockProvider.java new file mode 100644 index 0000000000000..7d8e527384542 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/lock/LockProvider.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.lock; + +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.Lock; + +/** + * Pluggable lock implementations using this provider class. + */ +public interface LockProvider extends Lock, AutoCloseable { + + @Override + default void lockInterruptibly() { + throw new UnsupportedOperationException(); + } + + @Override + default void lock() { + throw new UnsupportedOperationException(); + } + + @Override + default boolean tryLock() { + throw new UnsupportedOperationException(); + } + + @Override + default Condition newCondition() { + throw new UnsupportedOperationException(); + } + + default T getLock() { + throw new IllegalArgumentException(); + } + + @Override + default void close() { + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/lock/LockState.java b/hudi-common/src/main/java/org/apache/hudi/common/lock/LockState.java new file mode 100644 index 0000000000000..e40d5e5a9dd46 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/lock/LockState.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.lock; + +/** + * Enum to signal the state of the lock. + */ +public enum LockState { + ACQUIRING, ACQUIRED, ALREADY_ACQUIRED, RELEASING, RELEASED, ALREADY_RELEASED, + FAILED_TO_ACQUIRE, FAILED_TO_RELEASE +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/metrics/Counter.java b/hudi-common/src/main/java/org/apache/hudi/common/metrics/Counter.java index 546956d00b457..7e307e6fcc8c7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/metrics/Counter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/metrics/Counter.java @@ -35,6 +35,10 @@ public void add(long n) { this.count.addAndGet(n); } + public void set(long n) { + this.count.set(n); + } + @Override public Long getValue() { return count.get(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/metrics/LocalRegistry.java b/hudi-common/src/main/java/org/apache/hudi/common/metrics/LocalRegistry.java new file mode 100644 index 0000000000000..330068f6d6551 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/metrics/LocalRegistry.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.metrics; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Registry that tracks metrics local to a single jvm process. + */ +public class LocalRegistry implements Registry { + ConcurrentHashMap counters = new ConcurrentHashMap<>(); + private final String name; + + public LocalRegistry(String name) { + this.name = name; + } + + @Override + public void clear() { + counters.clear(); + } + + @Override + public void increment(String name) { + getCounter(name).increment(); + } + + @Override + public void add(String name, long value) { + getCounter(name).add(value); + } + + @Override + public void set(String name, long value) { + getCounter(name).set(value); + } + + /** + * Get all Counter type metrics. + */ + @Override + public Map getAllCounts(boolean prefixWithRegistryName) { + HashMap countersMap = new HashMap<>(); + counters.forEach((k, v) -> { + String key = prefixWithRegistryName ? name + "." 
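/*
 * Minimal implementation sketch for the LockProvider interface above (illustrative only, not one of
 * the providers shipped with Hudi): everything except tryLock(time, unit) and unlock() has a default,
 * so a JVM-local provider can simply delegate to a ReentrantLock. The generic parameter on
 * LockProvider is assumed from the "default T getLock()" signature in the hunk.
 */
import org.apache.hudi.common.lock.LockProvider;

import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

class InProcessLockProviderSketch implements LockProvider<ReentrantLock> {

  private final ReentrantLock delegate = new ReentrantLock();

  @Override
  public boolean tryLock(long time, TimeUnit unit) {
    try {
      return delegate.tryLock(time, unit);     // LockState.ACQUIRED vs FAILED_TO_ACQUIRE could be logged here
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      return false;
    }
  }

  @Override
  public void unlock() {
    delegate.unlock();                         // LockState.RELEASED
  }

  @Override
  public ReentrantLock getLock() {
    return delegate;
  }
}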
+ k : k; + countersMap.put(key, v.getValue()); + }); + return countersMap; + } + + private synchronized Counter getCounter(String name) { + if (!counters.containsKey(name)) { + counters.put(name, new Counter()); + } + return counters.get(name); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/metrics/Metric.java b/hudi-common/src/main/java/org/apache/hudi/common/metrics/Metric.java index 12b42de23c832..79a7764823b7c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/metrics/Metric.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/metrics/Metric.java @@ -18,9 +18,11 @@ package org.apache.hudi.common.metrics; +import java.io.Serializable; + /** * Interface for Hudi Metric Types. */ -public interface Metric { +public interface Metric extends Serializable { Long getValue(); } \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/metrics/Registry.java b/hudi-common/src/main/java/org/apache/hudi/common/metrics/Registry.java index 169e8bc9003ca..bf5ee7e7b71c2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/metrics/Registry.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/metrics/Registry.java @@ -18,87 +18,110 @@ package org.apache.hudi.common.metrics; +import java.io.Serializable; import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import org.apache.hudi.common.util.ReflectionUtils; + /** - * Lightweight Metrics Registry to track Hudi events. + * Interface which defines a lightweight Metrics Registry to track Hudi events. */ -public class Registry { - ConcurrentHashMap counters = new ConcurrentHashMap<>(); - final String name; +public interface Registry extends Serializable { - private static ConcurrentHashMap registryMap = new ConcurrentHashMap<>(); + ConcurrentHashMap REGISTRY_MAP = new ConcurrentHashMap<>(); - private Registry(String name) { - this.name = name; + /** + * Get (or create) the registry for a provided name. + * + * This function creates a {@code LocalRegistry}. + * + * @param registryName Name of the registry + */ + static Registry getRegistry(String registryName) { + return getRegistry(registryName, LocalRegistry.class.getName()); } /** - * Get (or create) the registry for a provided name. + * Get (or create) the registry for a provided name and given class. + * + * @param registryName Name of the registry. + * @param clazz The fully qualified name of the registry class to create. */ - public static synchronized Registry getRegistry(String registryName) { - if (!registryMap.containsKey(registryName)) { - registryMap.put(registryName, new Registry(registryName)); + static Registry getRegistry(String registryName, String clazz) { + synchronized (Registry.class) { + if (!REGISTRY_MAP.containsKey(registryName)) { + Registry registry = (Registry)ReflectionUtils.loadClass(clazz, registryName); + REGISTRY_MAP.put(registryName, registry); + } + return REGISTRY_MAP.get(registryName); } - return registryMap.get(registryName); } /** * Get all registered metrics. - * @param flush clean all metrics as part of this operation. + * + * @param flush clear all metrics after this operation. * @param prefixWithRegistryName prefix each metric name with the registry name. 
* @return */ - public static synchronized Map getAllMetrics(boolean flush, boolean prefixWithRegistryName) { - HashMap allMetrics = new HashMap<>(); - registryMap.forEach((registryName, registry) -> { - allMetrics.putAll(registry.getAllCounts(prefixWithRegistryName)); - if (flush) { - registry.clear(); - } - }); - return allMetrics; + static Map getAllMetrics(boolean flush, boolean prefixWithRegistryName) { + synchronized (Registry.class) { + HashMap allMetrics = new HashMap<>(); + REGISTRY_MAP.forEach((registryName, registry) -> { + allMetrics.putAll(registry.getAllCounts(prefixWithRegistryName)); + if (flush) { + registry.clear(); + } + }); + return allMetrics; + } } - public void clear() { - counters.clear(); - } + /** + * Clear all metrics. + */ + void clear(); - public void increment(String name) { - getCounter(name).increment(); - } + /** + * Increment the metric. + * + * @param name Name of the metric to increment. + */ + void increment(String name); - public void add(String name, long value) { - getCounter(name).add(value); - } + /** + * Add value to the metric. + * + * @param name Name of the metric. + * @param value The value to add to the metrics. + */ + void add(String name, long value); - private synchronized Counter getCounter(String name) { - if (!counters.containsKey(name)) { - counters.put(name, new Counter()); - } - return counters.get(name); - } + /** + * Set the value to the metric. + * + * If the metric does not exist, it is added. If the metrics already exists, its value is replaced with the + * provided value. + * + * @param name Name of the metric. + * @param value The value to set for the metrics. + */ + void set(String name, long value); /** * Get all Counter type metrics. */ - public Map getAllCounts() { + default Map getAllCounts() { return getAllCounts(false); } /** * Get all Counter type metrics. + * + * @param prefixWithRegistryName If true, the names of all metrics are prefixed with name of this registry. */ - public Map getAllCounts(boolean prefixWithRegistryName) { - HashMap countersMap = new HashMap<>(); - counters.forEach((k, v) -> { - String key = prefixWithRegistryName ? name + "." + k : k; - countersMap.put(key, v.getValue()); - }); - return countersMap; - } - -} \ No newline at end of file + Map getAllCounts(boolean prefixWithRegistryName); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/AWSDmsAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/AWSDmsAvroPayload.java new file mode 100644 index 0000000000000..fe044e0b431f1 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/AWSDmsAvroPayload.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
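/*
 * Usage sketch for the Registry refactor above (illustrative metric and registry names): getRegistry(name)
 * now returns a process-local LocalRegistry by default, and a different implementation can be loaded
 * reflectively via getRegistry(name, className).
 */
import org.apache.hudi.common.metrics.Registry;

class RegistrySketch {

  public static void main(String[] args) {
    Registry registry = Registry.getRegistry("SketchMetrics");
    registry.increment("create");                 // bump a counter
    registry.add("write.totalBytes", 4096L);      // accumulate a value
    registry.set("open.streams", 3L);             // overwrite, using the new Counter.set(...)

    // flush=false keeps the counters, prefixWithRegistryName=true prefixes keys with "SketchMetrics."
    System.out.println(Registry.getAllMetrics(false, true));
  }
}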
+ */ + +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.common.util.Option; + +import java.io.IOException; +import java.util.Properties; + +/** + * Provides support for seamlessly applying changes captured via Amazon Database Migration Service onto S3. + * + * Typically, we get the following pattern of full change records corresponding to DML against the + * source database + * + * - Full load records with no `Op` field + * - For inserts against the source table, records contain full after image with `Op=I` + * - For updates against the source table, records contain full after image with `Op=U` + * - For deletes against the source table, records contain full before image with `Op=D` + * + * This payload implementation will issue matching insert, delete, updates against the hudi table + * + */ +public class AWSDmsAvroPayload extends OverwriteWithLatestAvroPayload { + + public static final String OP_FIELD = "Op"; + + public AWSDmsAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public AWSDmsAvroPayload(Option record) { + this(record.isPresent() ? record.get() : null, 0); // natural order + } + + /** + * + * Handle a possible delete - check for "D" in Op column and return empty row if found. + * @param insertValue The new row that is being "inserted". + */ + private Option handleDeleteOperation(IndexedRecord insertValue) throws IOException { + boolean delete = false; + if (insertValue instanceof GenericRecord) { + GenericRecord record = (GenericRecord) insertValue; + delete = record.get(OP_FIELD) != null && record.get(OP_FIELD).toString().equalsIgnoreCase("D"); + } + + return delete ? Option.empty() : Option.of(insertValue); + } + + @Override + public Option getInsertValue(Schema schema, Properties properties) throws IOException { + return getInsertValue(schema); + } + + @Override + public Option getInsertValue(Schema schema) throws IOException { + IndexedRecord insertValue = super.getInsertValue(schema).get(); + return handleDeleteOperation(insertValue); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) + throws IOException { + return combineAndGetUpdateValue(currentValue, schema); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) + throws IOException { + Option insertValue = super.getInsertValue(schema); + if (!insertValue.isPresent()) { + return Option.empty(); + } + return handleDeleteOperation(insertValue.get()); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/ActionType.java b/hudi-common/src/main/java/org/apache/hudi/common/model/ActionType.java index 6be321c9860b4..c10c99d8dc8ee 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/ActionType.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/ActionType.java @@ -22,6 +22,5 @@ * The supported action types. 
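/*
 * Behavioral sketch for AWSDmsAvroPayload above (illustrative schema and record): a record whose "Op"
 * column is "D" is treated as a DMS delete image, so the payload resolves to Option.empty() and the key
 * is removed on merge. This assumes the usual OverwriteWithLatestAvroPayload round-trip of the record
 * through Avro bytes and an Option<GenericRecord> constructor argument, which the hunk's elided
 * generics do not show.
 */
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.AWSDmsAvroPayload;
import org.apache.hudi.common.util.Option;

class DmsPayloadSketch {

  private static final Schema SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"row\",\"fields\":["
          + "{\"name\":\"id\",\"type\":\"string\"},"
          + "{\"name\":\"Op\",\"type\":\"string\"}]}");

  public static void main(String[] args) throws Exception {
    GenericRecord deleteImage = new GenericData.Record(SCHEMA);
    deleteImage.put("id", "42");
    deleteImage.put("Op", "D");

    AWSDmsAvroPayload payload = new AWSDmsAvroPayload(Option.of(deleteImage));
    System.out.println(payload.getInsertValue(SCHEMA).isPresent()); // false -> row is deleted
  }
}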
*/ public enum ActionType { - //TODO HUDI-1281 make deltacommit part of this - commit, savepoint, compaction, clean, rollback, replacecommit + commit, savepoint, compaction, clean, rollback, replacecommit, deltacommit } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java index 3b35b0d4dca16..cd3a95e6bf786 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java @@ -37,7 +37,7 @@ public abstract class BaseAvroPayload implements Serializable { /** * For purposes of preCombining. */ - protected final Comparable orderingVal; + public final Comparable orderingVal; /** * Instantiate {@link BaseAvroPayload}. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java index f12c207ee75b6..fe9837e6c693b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hudi.hadoop.CachingPath; import java.io.Serializable; import java.util.Objects; @@ -31,34 +32,51 @@ public class BaseFile implements Serializable { private static final long serialVersionUID = 1L; + private transient FileStatus fileStatus; private final String fullPath; + private final String fileName; private long fileLen; public BaseFile(BaseFile dataFile) { - this.fileStatus = dataFile.fileStatus; - this.fullPath = dataFile.fullPath; - this.fileLen = dataFile.fileLen; + this(dataFile.fileStatus, + dataFile.fullPath, + dataFile.getFileName(), + dataFile.getFileLen()); } public BaseFile(FileStatus fileStatus) { - this.fileStatus = fileStatus; - this.fullPath = fileStatus.getPath().toString(); - this.fileLen = fileStatus.getLen(); + this(fileStatus, + fileStatus.getPath().toString(), + fileStatus.getPath().getName(), + fileStatus.getLen()); } public BaseFile(String filePath) { - this.fileStatus = null; - this.fullPath = filePath; - this.fileLen = -1; + this(null, filePath, getFileName(filePath), -1); + } + + private BaseFile(FileStatus fileStatus, String fullPath, String fileName, long fileLen) { + this.fileStatus = fileStatus; + this.fullPath = fullPath; + this.fileLen = fileLen; + this.fileName = fileName; } public String getPath() { return fullPath; } + public Path getHadoopPath() { + if (fileStatus != null) { + return fileStatus.getPath(); + } + + return new CachingPath(fullPath); + } + public String getFileName() { - return new Path(fullPath).getName(); + return fileName; } public FileStatus getFileStatus() { @@ -98,4 +116,8 @@ public int hashCode() { public String toString() { return "BaseFile{fullPath=" + fullPath + ", fileLen=" + fileLen + '}'; } + + private static String getFileName(String fullPath) { + return new Path(fullPath).getName(); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/ClusteringGroupInfo.java b/hudi-common/src/main/java/org/apache/hudi/common/model/ClusteringGroupInfo.java new file mode 100644 index 0000000000000..24a666a532ffe --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/ClusteringGroupInfo.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; + +import java.io.Serializable; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Encapsulates all the needed information about a clustering group. This is needed because spark serialization + * does not work with avro objects. + */ +public class ClusteringGroupInfo implements Serializable { + + private List operations; + private int numOutputGroups; + + public static ClusteringGroupInfo create(HoodieClusteringGroup clusteringGroup) { + List operations = clusteringGroup.getSlices().stream() + .map(ClusteringOperation::create).collect(Collectors.toList()); + + return new ClusteringGroupInfo(operations, clusteringGroup.getNumOutputFileGroups()); + } + + // Only for serialization/de-serialization + @Deprecated + public ClusteringGroupInfo() {} + + private ClusteringGroupInfo(final List operations, final int numOutputGroups) { + this.operations = operations; + this.numOutputGroups = numOutputGroups; + } + + public List getOperations() { + return this.operations; + } + + public void setOperations(final List operations) { + this.operations = operations; + } + + public int getNumOutputGroups() { + return this.numOutputGroups; + } + + public void setNumOutputGroups(final int numOutputGroups) { + this.numOutputGroups = numOutputGroups; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final ClusteringGroupInfo that = (ClusteringGroupInfo) o; + return Objects.equals(getFilePathsInGroup(), that.getFilePathsInGroup()); + } + + @Override + public int hashCode() { + return Objects.hash(getFilePathsInGroup()); + } + + private String getFilePathsInGroup() { + return getOperations().stream().map(op -> op.getDataFilePath()).collect(Collectors.joining(",")); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/ClusteringOperation.java b/hudi-common/src/main/java/org/apache/hudi/common/model/ClusteringOperation.java new file mode 100644 index 0000000000000..3d732fc7fa01a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/ClusteringOperation.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.avro.model.HoodieSliceInfo; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * Encapsulates all the needed information about a clustering file slice. This is needed because spark serialization + * does not work with avro objects. + */ +public class ClusteringOperation implements Serializable { + + private String dataFilePath; + private List deltaFilePaths; + private String fileId; + private String partitionPath; + private String bootstrapFilePath; + private int version; + + public static ClusteringOperation create(HoodieSliceInfo sliceInfo) { + return new ClusteringOperation(sliceInfo.getDataFilePath(), new ArrayList<>(sliceInfo.getDeltaFilePaths()), sliceInfo.getFileId(), + sliceInfo.getPartitionPath(), sliceInfo.getBootstrapFilePath(), sliceInfo.getVersion()); + } + + // Only for serialization/de-serialization + @Deprecated + public ClusteringOperation() {} + + private ClusteringOperation(final String dataFilePath, final List deltaFilePaths, final String fileId, + final String partitionPath, final String bootstrapFilePath, final int version) { + this.dataFilePath = dataFilePath; + this.deltaFilePaths = deltaFilePaths; + this.fileId = fileId; + this.partitionPath = partitionPath; + this.bootstrapFilePath = bootstrapFilePath; + this.version = version; + } + + public String getDataFilePath() { + return this.dataFilePath; + } + + public void setDataFilePath(final String dataFilePath) { + this.dataFilePath = dataFilePath; + } + + public List getDeltaFilePaths() { + return this.deltaFilePaths; + } + + public void setDeltaFilePaths(final List deltaFilePaths) { + this.deltaFilePaths = deltaFilePaths; + } + + public String getFileId() { + return this.fileId; + } + + public void setFileId(final String fileId) { + this.fileId = fileId; + } + + public String getPartitionPath() { + return this.partitionPath; + } + + public void setPartitionPath(final String partitionPath) { + this.partitionPath = partitionPath; + } + + public String getBootstrapFilePath() { + return this.bootstrapFilePath; + } + + public void setBootstrapFilePath(final String bootstrapFilePath) { + this.bootstrapFilePath = bootstrapFilePath; + } + + public int getVersion() { + return this.version; + } + + public void setVersion(final int version) { + this.version = version; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final ClusteringOperation that = (ClusteringOperation) o; + return getVersion() == that.getVersion() + && Objects.equals(getDataFilePath(), that.getDataFilePath()) + && Objects.equals(getDeltaFilePaths(), that.getDeltaFilePaths()) + && Objects.equals(getFileId(), that.getFileId()) + && Objects.equals(getPartitionPath(), that.getPartitionPath()) + && Objects.equals(getBootstrapFilePath(), that.getBootstrapFilePath()); + } + + @Override + public int hashCode() { + return Objects.hash(getDataFilePath(), getDeltaFilePaths(), getFileId(), 
getPartitionPath(), getBootstrapFilePath(), getVersion()); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/ConsistentHashingNode.java b/hudi-common/src/main/java/org/apache/hudi/common/model/ConsistentHashingNode.java new file mode 100644 index 0000000000000..262bb963223bb --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/ConsistentHashingNode.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.util.JsonUtils; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Used in consistent hashing index, representing nodes in the consistent hash ring. + * Record the end hash range value and its corresponding file group id. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class ConsistentHashingNode implements Serializable { + + private final int value; + private final String fileIdPrefix; + + @JsonCreator + public ConsistentHashingNode(@JsonProperty("value") int value, @JsonProperty("fileIdPrefix") String fileIdPrefix) { + this.value = value; + this.fileIdPrefix = fileIdPrefix; + } + + public static String toJsonString(List nodes) throws IOException { + return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(nodes); + } + + public static List fromJsonString(String json) throws Exception { + if (json == null || json.isEmpty()) { + return Collections.emptyList(); + } + + ConsistentHashingNode[] nodes = JsonUtils.getObjectMapper().readValue(json, ConsistentHashingNode[].class); + return Arrays.asList(nodes); + } + + public int getValue() { + return value; + } + + public String getFileIdPrefix() { + return fileIdPrefix; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("ConsistentHashingNode{"); + sb.append("value=").append(value); + sb.append(", fileIdPfx='").append(fileIdPrefix).append('\''); + sb.append('}'); + return sb.toString(); + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java new file mode 100644 index 0000000000000..5a588eafa5f3f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
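A minimal sketch of how the clustering beans above fit together (file names and ids below are placeholders). The no-arg constructors and setters are reserved for serializers, so building the objects by hand like this is purely for illustration of the shape of the data Spark ends up shipping around instead of Avro objects.

import org.apache.hudi.common.model.ClusteringGroupInfo;
import org.apache.hudi.common.model.ClusteringOperation;

import java.util.Arrays;
import java.util.Collections;

public class ClusteringPlanExample {
  public static void main(String[] args) {
    ClusteringOperation op = new ClusteringOperation(); // bean-style constructor kept for serializers
    op.setFileId("fg-0001");
    op.setPartitionPath("2022/01/01");
    op.setDataFilePath("2022/01/01/fg-0001_base.parquet");          // placeholder
    op.setDeltaFilePaths(Arrays.asList(".fg-0001_20220101.log.1")); // placeholder

    ClusteringGroupInfo group = new ClusteringGroupInfo();
    group.setOperations(Collections.singletonList(op));
    group.setNumOutputGroups(1);
    System.out.println(group.getOperations().size() + " slice(s) -> "
        + group.getNumOutputGroups() + " output group(s)");
  }
}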
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +/** + * {@link HoodieRecordPayload} impl that honors ordering field in both preCombine and combineAndGetUpdateValue. + *

    + * 1. preCombine - Picks the latest delta record for a key, based on an ordering field 2. combineAndGetUpdateValue/getInsertValue - Chooses the latest record based on ordering field value. + */ +public class DefaultHoodieRecordPayload extends OverwriteWithLatestAvroPayload { + + public static final String METADATA_EVENT_TIME_KEY = "metadata.event_time.key"; + private Option eventTime = Option.empty(); + + public DefaultHoodieRecordPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public DefaultHoodieRecordPayload(Option record) { + this(record.isPresent() ? record.get() : null, 0); // natural order + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException { + if (recordBytes.length == 0) { + return Option.empty(); + } + + GenericRecord incomingRecord = HoodieAvroUtils.bytesToAvro(recordBytes, schema); + + // Null check is needed here to support schema evolution. The record in storage may be from old schema where + // the new ordering column might not be present and hence returns null. + if (!needUpdatingPersistedRecord(currentValue, incomingRecord, properties)) { + return Option.of(currentValue); + } + + /* + * We reached a point where the value is disk is older than the incoming record. + */ + eventTime = updateEventTime(incomingRecord, properties); + + /* + * Now check if the incoming record is a delete record. + */ + return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); + } + + @Override + public Option getInsertValue(Schema schema, Properties properties) throws IOException { + if (recordBytes.length == 0) { + return Option.empty(); + } + GenericRecord incomingRecord = HoodieAvroUtils.bytesToAvro(recordBytes, schema); + eventTime = updateEventTime(incomingRecord, properties); + + return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); + } + + private static Option updateEventTime(GenericRecord record, Properties properties) { + boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(properties.getProperty( + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())); + String eventTimeField = properties + .getProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY); + if (eventTimeField == null) { + return Option.empty(); + } + return Option.ofNullable( + HoodieAvroUtils.getNestedFieldVal( + record, + eventTimeField, + true, + consistentLogicalTimestampEnabled) + ); + } + + @Override + public Option> getMetadata() { + Map metadata = new HashMap<>(); + if (eventTime.isPresent()) { + metadata.put(METADATA_EVENT_TIME_KEY, String.valueOf(eventTime.get())); + } + return metadata.isEmpty() ? Option.empty() : Option.of(metadata); + } + + protected boolean needUpdatingPersistedRecord(IndexedRecord currentValue, + IndexedRecord incomingRecord, Properties properties) { + /* + * Combining strategy here returns currentValue on disk if incoming record is older. + * The incoming record can be either a delete (sent as an upsert with _hoodie_is_deleted set to true) + * or an insert/update record. In any case, if it is older than the record in disk, the currentValue + * in disk is returned (to be rewritten with new commit time). + * + * NOTE: Deletes sent via EmptyHoodieRecordPayload and/or Delete operation type do not hit this code path + * and need to be dealt with separately. 
+ */ + String orderField = properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY); + if (orderField == null) { + return true; + } + boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(properties.getProperty( + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())); + Object persistedOrderingVal = HoodieAvroUtils.getNestedFieldVal((GenericRecord) currentValue, + orderField, + true, consistentLogicalTimestampEnabled); + Comparable incomingOrderingVal = (Comparable) HoodieAvroUtils.getNestedFieldVal((GenericRecord) incomingRecord, + orderField, + true, consistentLogicalTimestampEnabled); + return persistedOrderingVal == null || ((Comparable) persistedOrderingVal).compareTo(incomingOrderingVal) <= 0; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/DeleteRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/DeleteRecord.java new file mode 100644 index 0000000000000..003b591c20c05 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/DeleteRecord.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import java.io.Serializable; +import java.util.Objects; + +/** + * Delete record is a combination of HoodieKey and ordering value. + * The record is used for {@link org.apache.hudi.common.table.log.block.HoodieDeleteBlock} + * to support per-record deletions. The deletion block is always appended after the data block, + * we need to keep the ordering val to combine with the data records when merging, or the data loss + * may occur if there are intermediate deletions for the inputs + * (a new INSERT comes after a DELETE in one input batch). + */ +public class DeleteRecord implements Serializable { + private static final long serialVersionUID = 1L; + + /** + * The record key and partition path. + */ + private final HoodieKey hoodieKey; + + /** + * For purposes of preCombining. 
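A hedged sketch of the ordering-field semantics described above, assuming a simple "id"/"ts" schema and made-up values: an incoming record with a smaller "ts" does not overwrite the persisted record.

import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.DefaultHoodieRecordPayload;
import org.apache.hudi.common.model.HoodiePayloadProps;

public class OrderingFieldExample {
  public static void main(String[] args) throws Exception {
    Schema schema = SchemaBuilder.record("row").fields()
        .requiredString("id").requiredLong("ts").endRecord();      // assumed schema
    GenericRecord persisted = new GenericData.Record(schema);
    persisted.put("id", "1"); persisted.put("ts", 5L);
    GenericRecord incoming = new GenericData.Record(schema);
    incoming.put("id", "1"); incoming.put("ts", 3L);               // older event

    Properties props = new Properties();
    props.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, "ts");

    DefaultHoodieRecordPayload payload = new DefaultHoodieRecordPayload(incoming, 3L);
    // The record already on disk wins because its ordering value (5) >= incoming (3).
    System.out.println(payload.combineAndGetUpdateValue(persisted, schema, props).get());
  }
}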
+ */ + private final Comparable orderingVal; + + private DeleteRecord(HoodieKey hoodieKey, Comparable orderingVal) { + this.hoodieKey = hoodieKey; + this.orderingVal = orderingVal; + } + + public static DeleteRecord create(HoodieKey hoodieKey) { + return create(hoodieKey, 0); + } + + public static DeleteRecord create(String recordKey, String partitionPath) { + return create(recordKey, partitionPath, 0); + } + + public static DeleteRecord create(String recordKey, String partitionPath, Comparable orderingVal) { + return create(new HoodieKey(recordKey, partitionPath), orderingVal); + } + + public static DeleteRecord create(HoodieKey hoodieKey, Comparable orderingVal) { + return new DeleteRecord(hoodieKey, orderingVal); + } + + public String getRecordKey() { + return hoodieKey.getRecordKey(); + } + + public String getPartitionPath() { + return hoodieKey.getPartitionPath(); + } + + public HoodieKey getHoodieKey() { + return hoodieKey; + } + + public Comparable getOrderingValue() { + return orderingVal; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof DeleteRecord)) { + return false; + } + DeleteRecord that = (DeleteRecord) o; + return this.hoodieKey.equals(that.hoodieKey) && this.orderingVal.equals(that.orderingVal); + } + + @Override + public int hashCode() { + return Objects.hash(this.hoodieKey, this.orderingVal); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("DeleteRecord {"); + sb.append(" key=").append(hoodieKey); + sb.append(" orderingVal=").append(this.orderingVal); + sb.append('}'); + return sb.toString(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java index 783422fc648f2..abcad8d922f0f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java @@ -36,8 +36,8 @@ public EmptyHoodieRecordPayload(GenericRecord record, Comparable orderingVal) { } @Override - public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another) { - return another; + public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload oldValue) { + return oldValue; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/EventTimeAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/EventTimeAvroPayload.java new file mode 100644 index 0000000000000..7c8efb66e5cb6 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/EventTimeAvroPayload.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
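A minimal sketch of the DeleteRecord described above, with an illustrative key, partition and ordering value: the delete carries its ordering value so a later merge can decide whether the delete or a subsequent insert wins.

import org.apache.hudi.common.model.DeleteRecord;

public class DeleteRecordExample {
  public static void main(String[] args) {
    DeleteRecord del = DeleteRecord.create("uuid-001", "2022/01/01", 7L); // illustrative values
    System.out.println(del.getRecordKey() + " @ ordering value " + del.getOrderingValue());
  }
}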
+ */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; + +import java.io.IOException; +import java.util.Map; +import java.util.Properties; + +import static org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro; + +/** + * The only difference with {@link DefaultHoodieRecordPayload} is that is does not + * track the event time metadata for efficiency. + */ +public class EventTimeAvroPayload extends DefaultHoodieRecordPayload { + + public EventTimeAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public EventTimeAvroPayload(Option record) { + this(record.isPresent() ? record.get() : null, 0); // natural order + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException { + if (recordBytes.length == 0) { + return Option.empty(); + } + + GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); + + // Null check is needed here to support schema evolution. The record in storage may be from old schema where + // the new ordering column might not be present and hence returns null. + if (!needUpdatingPersistedRecord(currentValue, incomingRecord, properties)) { + return Option.of(currentValue); + } + + /* + * Now check if the incoming record is a delete record. + */ + return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); + } + + @Override + public Option getInsertValue(Schema schema, Properties properties) throws IOException { + if (recordBytes.length == 0) { + return Option.empty(); + } + GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); + + return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); + } + + @Override + public Option> getMetadata() { + return Option.empty(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java b/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java index 688e72bd786a2..0fc580db0b657 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/FileSlice.java @@ -34,12 +34,12 @@ public class FileSlice implements Serializable { /** * File Group Id of the Slice. */ - private HoodieFileGroupId fileGroupId; + private final HoodieFileGroupId fileGroupId; /** * Point in the timeline, at which the slice was created. */ - private String baseInstantTime; + private final String baseInstantTime; /** * data file, with the compacted data, for this slice. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java index a3ab2b71ae980..3fbcb8a620e0e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java @@ -36,17 +36,20 @@ public class HoodieAvroPayload implements HoodieRecordPayload // Store the GenericRecord converted to bytes - 1) Doesn't store schema hence memory efficient 2) Makes the payload // java serializable private final byte[] recordBytes; + private final Comparable orderingVal; + + public HoodieAvroPayload(GenericRecord record, Comparable orderingVal) { + this.recordBytes = record == null ? 
new byte[0] : HoodieAvroUtils.avroToBytes(record); + this.orderingVal = orderingVal; + } public HoodieAvroPayload(Option record) { - if (record.isPresent()) { - this.recordBytes = HoodieAvroUtils.avroToBytes(record.get()); - } else { - this.recordBytes = new byte[0]; - } + this.recordBytes = record.isPresent() ? HoodieAvroUtils.avroToBytes(record.get()) : new byte[0]; + this.orderingVal = 0; } @Override - public HoodieAvroPayload preCombine(HoodieAvroPayload another) { + public HoodieAvroPayload preCombine(HoodieAvroPayload oldValue) { return this; } @@ -67,4 +70,9 @@ public Option getInsertValue(Schema schema) throws IOException { public byte[] getRecordBytes() { return recordBytes; } + + @Override + public Comparable getOrderingValue() { + return orderingVal; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java new file mode 100644 index 0000000000000..9a9bbb2b7427f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.model; + +public class HoodieAvroRecord extends HoodieRecord { + public HoodieAvroRecord(HoodieKey key, T data) { + super(key, data); + } + + public HoodieAvroRecord(HoodieKey key, T data, HoodieOperation operation) { + super(key, data, operation); + } + + public HoodieAvroRecord(HoodieRecord record) { + super(record); + } + + public HoodieAvroRecord() { + } + + @Override + public HoodieRecord newInstance() { + return new HoodieAvroRecord<>(this); + } + + @Override + public T getData() { + if (data == null) { + throw new IllegalStateException("Payload already deflated for record."); + } + return data; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java index faf22019a53d4..3eb8f784dbab7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java @@ -22,5 +22,5 @@ * Hoodie cleaning policies. 
*/ public enum HoodieCleaningPolicy { - KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_COMMITS + KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_COMMITS, KEEP_LATEST_BY_HOURS } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java new file mode 100644 index 0000000000000..e3c5a70d5cf16 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import javax.annotation.Nullable; +import java.io.Serializable; +import java.util.Objects; + +/** + * Hoodie metadata for the column range of data stored in columnar format (like Parquet) + * + * NOTE: {@link Comparable} is used as raw-type so that we can handle polymorphism, where + * caller apriori is not aware of the type {@link HoodieColumnRangeMetadata} is + * associated with + */ +@SuppressWarnings("rawtype") +public class HoodieColumnRangeMetadata implements Serializable { + private final String filePath; + private final String columnName; + @Nullable + private final T minValue; + @Nullable + private final T maxValue; + private final long nullCount; + private final long valueCount; + private final long totalSize; + private final long totalUncompressedSize; + + private HoodieColumnRangeMetadata(String filePath, + String columnName, + @Nullable T minValue, + @Nullable T maxValue, + long nullCount, + long valueCount, + long totalSize, + long totalUncompressedSize) { + this.filePath = filePath; + this.columnName = columnName; + this.minValue = minValue; + this.maxValue = maxValue; + this.nullCount = nullCount; + this.valueCount = valueCount; + this.totalSize = totalSize; + this.totalUncompressedSize = totalUncompressedSize; + } + + public String getFilePath() { + return this.filePath; + } + + public String getColumnName() { + return this.columnName; + } + + @Nullable + public T getMinValue() { + return this.minValue; + } + + @Nullable + public T getMaxValue() { + return this.maxValue; + } + + public long getNullCount() { + return nullCount; + } + + public long getValueCount() { + return valueCount; + } + + public long getTotalSize() { + return totalSize; + } + + public long getTotalUncompressedSize() { + return totalUncompressedSize; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final HoodieColumnRangeMetadata that = (HoodieColumnRangeMetadata) o; + return Objects.equals(getFilePath(), that.getFilePath()) + && Objects.equals(getColumnName(), that.getColumnName()) + && Objects.equals(getMinValue(), 
that.getMinValue()) + && Objects.equals(getMaxValue(), that.getMaxValue()) + && Objects.equals(getNullCount(), that.getNullCount()) + && Objects.equals(getValueCount(), that.getValueCount()) + && Objects.equals(getTotalSize(), that.getTotalSize()) + && Objects.equals(getTotalUncompressedSize(), that.getTotalUncompressedSize()); + } + + @Override + public int hashCode() { + return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNullCount()); + } + + @Override + public String toString() { + return "HoodieColumnRangeMetadata{" + + "filePath ='" + filePath + '\'' + + ", columnName='" + columnName + '\'' + + ", minValue=" + minValue + + ", maxValue=" + maxValue + + ", nullCount=" + nullCount + + ", valueCount=" + valueCount + + ", totalSize=" + totalSize + + ", totalUncompressedSize=" + totalUncompressedSize + + '}'; + } + + public static > HoodieColumnRangeMetadata create(String filePath, + String columnName, + @Nullable T minValue, + @Nullable T maxValue, + long nullCount, + long valueCount, + long totalSize, + long totalUncompressedSize) { + return new HoodieColumnRangeMetadata<>(filePath, columnName, minValue, maxValue, nullCount, valueCount, totalSize, totalUncompressedSize); + } + + @SuppressWarnings("rawtype") + public static HoodieColumnRangeMetadata stub(String filePath, + String columnName) { + return new HoodieColumnRangeMetadata<>(filePath, columnName, null, null, -1, -1, -1, -1); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java index 3e760f6bd77d7..41d83813f1e0f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java @@ -18,14 +18,15 @@ package org.apache.hudi.common.model; -import org.apache.hadoop.fs.Path; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.JsonUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; -import com.fasterxml.jackson.annotation.JsonAutoDetect; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.fasterxml.jackson.annotation.PropertyAccessor; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -33,9 +34,12 @@ import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; /** * All the metadata that gets stored along with a commit. 
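An illustrative sketch of the column range metadata above; the file path, column name and statistics are assumptions chosen only to show what each constructor argument represents.

import org.apache.hudi.common.model.HoodieColumnRangeMetadata;

public class ColumnRangeExample {
  public static void main(String[] args) {
    HoodieColumnRangeMetadata<Integer> range = HoodieColumnRangeMetadata.create(
        "2022/01/01/fg-0001_base.parquet", // file path (placeholder)
        "age",                             // column name (placeholder)
        18, 95,                            // min / max value observed in the file
        2,                                 // null count
        1000,                              // value count
        4096,                              // total size on disk
        16384);                            // total uncompressed size
    System.out.println(range.getColumnName() + " in [" + range.getMinValue()
        + ", " + range.getMaxValue() + "]");
  }
}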
@@ -86,6 +90,10 @@ public Map> getPartitionToWriteStats() { return partitionToWriteStats; } + public List getWriteStats() { + return partitionToWriteStats.values().stream().flatMap(Collection::stream).collect(Collectors.toList()); + } + public String getMetadata(String metaKey) { return extraMetadata.get(metaKey); } @@ -101,8 +109,8 @@ public void setCompacted(Boolean compacted) { public HashMap getFileIdAndRelativePaths() { HashMap filePaths = new HashMap<>(); // list all partitions paths - for (Map.Entry> entry : getPartitionToWriteStats().entrySet()) { - for (HoodieWriteStat stat : entry.getValue()) { + for (List stats : getPartitionToWriteStats().values()) { + for (HoodieWriteStat stat : stats) { filePaths.put(stat.getFileId(), stat.getPath()); } } @@ -117,16 +125,30 @@ public WriteOperationType getOperationType() { return this.operationType; } - public HashMap getFileIdAndFullPaths(String basePath) { + public HashMap getFileIdAndFullPaths(Path basePath) { HashMap fullPaths = new HashMap<>(); for (Map.Entry entry : getFileIdAndRelativePaths().entrySet()) { - String fullPath = - (entry.getValue() != null) ? (FSUtils.getPartitionPath(basePath, entry.getValue())).toString() : null; + String fullPath = entry.getValue() != null + ? FSUtils.getPartitionPath(basePath, entry.getValue()).toString() + : null; fullPaths.put(entry.getKey(), fullPath); } return fullPaths; } + public List getFullPathsByPartitionPath(String basePath, String partitionPath) { + HashSet fullPaths = new HashSet<>(); + if (getPartitionToWriteStats().get(partitionPath) != null) { + for (HoodieWriteStat stat : getPartitionToWriteStats().get(partitionPath)) { + if ((stat.getFileId() != null)) { + String fullPath = FSUtils.getPartitionPath(basePath, stat.getPath()).toString(); + fullPaths.add(fullPath); + } + } + } + return new ArrayList<>(fullPaths); + } + public Map getFileGroupIdAndFullPaths(String basePath) { Map fileGroupIdToFullPaths = new HashMap<>(); for (Map.Entry> entry : getPartitionToWriteStats().entrySet()) { @@ -139,12 +161,71 @@ public Map getFileGroupIdAndFullPaths(String basePath return fileGroupIdToFullPaths; } + /** + * Extract the file status of all affected files from the commit metadata. If a file has + * been touched multiple times in the given commits, the return value will keep the one + * from the latest commit. + * + * + * @param hadoopConf + * @param basePath The base path + * @return the file full path to file status mapping + */ + public Map getFullPathToFileStatus(Configuration hadoopConf, String basePath) { + Map fullPathToFileStatus = new HashMap<>(); + for (List stats : getPartitionToWriteStats().values()) { + // Iterate through all the written files. + for (HoodieWriteStat stat : stats) { + String relativeFilePath = stat.getPath(); + Path fullPath = relativeFilePath != null ? FSUtils.getPartitionPath(basePath, relativeFilePath) : null; + if (fullPath != null) { + long blockSize = FSUtils.getFs(fullPath.toString(), hadoopConf).getDefaultBlockSize(fullPath); + FileStatus fileStatus = new FileStatus(stat.getFileSizeInBytes(), false, 0, blockSize, + 0, fullPath); + fullPathToFileStatus.put(fullPath.getName(), fileStatus); + } + } + } + return fullPathToFileStatus; + } + + /** + * Extract the file status of all affected files from the commit metadata. If a file has + * been touched multiple times in the given commits, the return value will keep the one + * from the latest commit by file group ID. + * + *

    Note: different with {@link #getFullPathToFileStatus(Configuration, String)}, + * only the latest commit file for a file group is returned, + * this is an optimization for COPY_ON_WRITE table to eliminate legacy files for filesystem view. + * + * + * @param hadoopConf + * @param basePath The base path + * @return the file ID to file status mapping + */ + public Map getFileIdToFileStatus(Configuration hadoopConf, String basePath) { + Map fileIdToFileStatus = new HashMap<>(); + for (List stats : getPartitionToWriteStats().values()) { + // Iterate through all the written files. + for (HoodieWriteStat stat : stats) { + String relativeFilePath = stat.getPath(); + Path fullPath = relativeFilePath != null ? FSUtils.getPartitionPath(basePath, relativeFilePath) : null; + if (fullPath != null) { + FileStatus fileStatus = new FileStatus(stat.getFileSizeInBytes(), false, 0, 0, + 0, fullPath); + fileIdToFileStatus.put(stat.getFileId(), fileStatus); + } + } + } + return fileIdToFileStatus; + } + public String toJsonString() throws IOException { if (partitionToWriteStats.containsKey(null)) { LOG.info("partition path is null for " + partitionToWriteStats.get(null)); partitionToWriteStats.remove(null); } - return getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); + return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); } public static T fromJsonString(String jsonStr, Class clazz) throws Exception { @@ -152,7 +233,7 @@ public static T fromJsonString(String jsonStr, Class clazz) throws Except // For empty commit file (no data or somethings bad happen). return clazz.newInstance(); } - return getObjectMapper().readValue(jsonStr, clazz); + return JsonUtils.getObjectMapper().readValue(jsonStr, clazz); } // Here the functions are named "fetch" instead of "get", to get avoid of the json conversion. @@ -323,6 +404,24 @@ public Long getTotalUpsertTime() { return totalUpsertTime; } + public Pair, Option> getMinAndMaxEventTime() { + long minEventTime = Long.MAX_VALUE; + long maxEventTime = Long.MIN_VALUE; + for (Map.Entry> entry : partitionToWriteStats.entrySet()) { + for (HoodieWriteStat writeStat : entry.getValue()) { + minEventTime = writeStat.getMinEventTime() != null ? Math.min(writeStat.getMinEventTime(), minEventTime) : minEventTime; + maxEventTime = writeStat.getMaxEventTime() != null ? Math.max(writeStat.getMaxEventTime(), maxEventTime) : maxEventTime; + } + } + return Pair.of( + minEventTime == Long.MAX_VALUE ? Option.empty() : Option.of(minEventTime), + maxEventTime == Long.MIN_VALUE ? 
Option.empty() : Option.of(maxEventTime)); + } + + public HashSet getWritePartitionPaths() { + return new HashSet<>(partitionToWriteStats.keySet()); + } + @Override public boolean equals(Object o) { if (this == o) { @@ -356,13 +455,6 @@ public static T fromBytes(byte[] bytes, Class clazz) throws IOException { } } - protected static ObjectMapper getObjectMapper() { - ObjectMapper mapper = new ObjectMapper(); - mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES); - mapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); - return mapper; - } - @Override public String toString() { return "HoodieCommitMetadata{" + "partitionToWriteStats=" + partitionToWriteStats diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieConsistentHashingMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieConsistentHashingMetadata.java new file mode 100644 index 0000000000000..46f115262745f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieConsistentHashingMetadata.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
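A sketch of the new convenience accessors on HoodieCommitMetadata shown above. The paths and ids are placeholders, and HoodieCommitMetadata.addWriteStat plus the HoodieWriteStat setters are assumed from the existing class; they are not part of this hunk.

import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;

public class CommitMetadataExample {
  public static void main(String[] args) {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();
    HoodieWriteStat stat = new HoodieWriteStat();                          // assumed existing setters
    stat.setFileId("fg-0001");
    stat.setPath("2022/01/01/fg-0001_0-1-1_20220101000000.parquet");       // placeholder path
    metadata.addWriteStat("2022/01/01", stat);                             // assumed existing method

    System.out.println(metadata.getWriteStats().size());                   // 1
    System.out.println(metadata.getWritePartitionPaths());                 // [2022/01/01]
    System.out.println(metadata.getFullPathsByPartitionPath("/tmp/hudi_table", "2022/01/01"));
  }
}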
+ */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.JsonUtils; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * All the metadata that is used for consistent hashing bucket index + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class HoodieConsistentHashingMetadata implements Serializable { + + private static final Logger LOG = LogManager.getLogger(HoodieConsistentHashingMetadata.class); + /** + * Upper-bound of the hash value + */ + public static final int HASH_VALUE_MASK = Integer.MAX_VALUE; + public static final String HASHING_METADATA_FILE_SUFFIX = ".hashing_meta"; + + private final short version; + private final String partitionPath; + private final String instant; + private final int numBuckets; + private final int seqNo; + private final List nodes; + + @JsonCreator + public HoodieConsistentHashingMetadata(@JsonProperty("version") short version, @JsonProperty("partitionPath") String partitionPath, + @JsonProperty("instant") String instant, @JsonProperty("numBuckets") int numBuckets, + @JsonProperty("seqNo") int seqNo, @JsonProperty("nodes") List nodes) { + this.version = version; + this.partitionPath = partitionPath; + this.instant = instant; + this.numBuckets = numBuckets; + this.seqNo = seqNo; + this.nodes = nodes; + } + + /** + * Construct default metadata with all bucket's file group uuid initialized + */ + public HoodieConsistentHashingMetadata(String partitionPath, int numBuckets) { + this((short) 0, partitionPath, HoodieTimeline.INIT_INSTANT_TS, numBuckets, 0, constructDefaultHashingNodes(numBuckets)); + } + + private static List constructDefaultHashingNodes(int numBuckets) { + long step = ((long) HASH_VALUE_MASK + numBuckets - 1) / numBuckets; + return IntStream.range(1, numBuckets + 1) + .mapToObj(i -> new ConsistentHashingNode((int) Math.min(step * i, HASH_VALUE_MASK), FSUtils.createNewFileIdPfx())).collect(Collectors.toList()); + } + + public short getVersion() { + return version; + } + + public String getPartitionPath() { + return partitionPath; + } + + public String getInstant() { + return instant; + } + + public int getNumBuckets() { + return numBuckets; + } + + public int getSeqNo() { + return seqNo; + } + + public List getNodes() { + return nodes; + } + + public String getFilename() { + return instant + HASHING_METADATA_FILE_SUFFIX; + } + + public byte[] toBytes() throws IOException { + return toJsonString().getBytes(StandardCharsets.UTF_8); + } + + public static HoodieConsistentHashingMetadata fromBytes(byte[] bytes) throws IOException { + try { + return fromJsonString(new String(bytes, StandardCharsets.UTF_8), HoodieConsistentHashingMetadata.class); + } catch (Exception e) { + throw new IOException("unable to read hashing metadata", e); + } + } + + private String toJsonString() throws IOException { + return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); + } + + protected static T fromJsonString(String jsonStr, Class clazz) throws Exception { + if (jsonStr == null || jsonStr.isEmpty()) 
{ + // For empty commit file (no data or something bad happen). + return clazz.newInstance(); + } + return JsonUtils.getObjectMapper().readValue(jsonStr, clazz); + } + + /** + * Get instant time from the hashing metadata filename + * Pattern of the filename: .HASHING_METADATA_FILE_SUFFIX + */ + public static String getTimestampFromFile(String filename) { + return filename.split("\\.")[0]; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java index 1b7dcb78d80ba..9626e218a2247 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java @@ -19,15 +19,24 @@ package org.apache.hudi.common.model; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import org.apache.hudi.common.util.Option; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; /** * Statistics about a single Hoodie delta log operation. */ @JsonIgnoreProperties(ignoreUnknown = true) +@SuppressWarnings("rawtypes") public class HoodieDeltaWriteStat extends HoodieWriteStat { private int logVersion; private long logOffset; + private String baseFile; + private List logFiles = new ArrayList<>(); + private Option>> recordsStats = Option.empty(); public void setLogVersion(int logVersion) { this.logVersion = logVersion; @@ -44,4 +53,32 @@ public void setLogOffset(long logOffset) { public long getLogOffset() { return logOffset; } + + public void setBaseFile(String baseFile) { + this.baseFile = baseFile; + } + + public String getBaseFile() { + return baseFile; + } + + public void setLogFiles(List logFiles) { + this.logFiles = logFiles; + } + + public void addLogFiles(String logFile) { + logFiles.add(logFile); + } + + public List getLogFiles() { + return logFiles; + } + + public void setRecordsStats(Map> stats) { + recordsStats = Option.of(stats); + } + + public Option>> getColumnStats() { + return recordsStats; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFailedWritesCleaningPolicy.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFailedWritesCleaningPolicy.java new file mode 100644 index 0000000000000..f7fef9295a7e7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFailedWritesCleaningPolicy.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +/** + * Policy controlling how to perform cleaning for failed writes. 
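A sketch of the consistent hashing metadata introduced above (the partition path is an assumption): the default constructor splits the hash ring evenly, and each node records the end of its hash range together with the file group id prefix it owns.

import org.apache.hudi.common.model.ConsistentHashingNode;
import org.apache.hudi.common.model.HoodieConsistentHashingMetadata;

public class ConsistentHashingExample {
  public static void main(String[] args) throws Exception {
    HoodieConsistentHashingMetadata meta = new HoodieConsistentHashingMetadata("2022/01/01", 4);
    for (ConsistentHashingNode node : meta.getNodes()) {
      System.out.println(node.getValue() + " -> " + node.getFileIdPrefix());
    }
    byte[] bytes = meta.toBytes(); // persisted as "<instant>.hashing_meta"
    HoodieConsistentHashingMetadata restored = HoodieConsistentHashingMetadata.fromBytes(bytes);
    System.out.println(restored.getNumBuckets()); // 4
  }
}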
+ */ +public enum HoodieFailedWritesCleaningPolicy { + // performs cleaning of failed writes inline every write operation + EAGER, + // performs cleaning of failed writes lazily during clean + LAZY, + // Does not clean failed writes + NEVER; + + public boolean isEager() { + return this == EAGER; + } + + public boolean isLazy() { + return this == LAZY; + } + + public boolean isNever() { + return this == NEVER; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileFormat.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileFormat.java index 552c38ffd9bd1..326391035e616 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileFormat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileFormat.java @@ -18,13 +18,24 @@ package org.apache.hudi.common.model; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; + /** * Hoodie file format. */ public enum HoodieFileFormat { PARQUET(".parquet"), HOODIE_LOG(".log"), - HFILE(".hfile"); + HFILE(".hfile"), + ORC(".orc"); + + public static final Set BASE_FILE_EXTENSIONS = Arrays.stream(HoodieFileFormat.values()) + .map(HoodieFileFormat::getFileExtension) + .filter(x -> !x.equals(HoodieFileFormat.HOODIE_LOG.getFileExtension())) + .collect(Collectors.toCollection(HashSet::new)); private final String extension; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java index 849f08eed697b..9b5e8c1dd6f02 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroup.java @@ -21,16 +21,17 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; import java.io.Serializable; import java.util.Comparator; import java.util.List; -import java.util.Map; import java.util.TreeMap; -import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.compareTimestamps; + /** * A set of data/base files + set of log files, that make up an unit for all operations. */ @@ -63,9 +64,7 @@ public static Comparator getReverseCommitTimeComparator() { public HoodieFileGroup(HoodieFileGroup fileGroup) { this.timeline = fileGroup.timeline; this.fileGroupId = fileGroup.fileGroupId; - this.fileSlices = new TreeMap<>(fileGroup.fileSlices.entrySet().stream() - .map(e -> Pair.of(e.getKey(), new FileSlice(e.getValue()))) - .collect(Collectors.toMap(Pair::getKey, Pair::getValue))); + this.fileSlices = new TreeMap<>(fileGroup.fileSlices); this.lastInstant = fileGroup.lastInstant; } @@ -123,21 +122,22 @@ public HoodieFileGroupId getFileGroupId() { * some log files, that are based off a commit or delta commit. 
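A tiny sketch tying together two of the small additions above: the lazy-cleaning check on the new policy enum, and the base-file extension set, which collects every format extension except the log format (and now includes ORC).

import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.common.model.HoodieFileFormat;

public class PolicyAndFormatExample {
  public static void main(String[] args) {
    HoodieFailedWritesCleaningPolicy policy = HoodieFailedWritesCleaningPolicy.LAZY;
    System.out.println(policy.isLazy());                                        // true
    System.out.println(HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(".orc")); // true
    System.out.println(HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(".log")); // false
  }
}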
*/ private boolean isFileSliceCommitted(FileSlice slice) { - String maxCommitTime = lastInstant.get().getTimestamp(); - return timeline.containsOrBeforeTimelineStarts(slice.getBaseInstantTime()) - && HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(), HoodieTimeline.LESSER_THAN_OR_EQUALS, maxCommitTime); + if (!compareTimestamps(slice.getBaseInstantTime(), LESSER_THAN_OR_EQUALS, lastInstant.get().getTimestamp())) { + return false; + } + return timeline.containsOrBeforeTimelineStarts(slice.getBaseInstantTime()); } /** - * Get all the the file slices including in-flight ones as seen in underlying file-system. + * Get all the file slices including in-flight ones as seen in underlying file system. */ public Stream getAllFileSlicesIncludingInflight() { - return fileSlices.entrySet().stream().map(Map.Entry::getValue); + return fileSlices.values().stream(); } /** - * Get latest file slices including in-flight ones. + * Get the latest file slices including inflight ones. */ public Option getLatestFileSlicesIncludingInflight() { return Option.fromJavaOptional(getAllFileSlicesIncludingInflight().findFirst()); @@ -148,11 +148,15 @@ public Option getLatestFileSlicesIncludingInflight() { */ public Stream getAllFileSlices() { if (!timeline.empty()) { - return fileSlices.entrySet().stream().map(Map.Entry::getValue).filter(this::isFileSliceCommitted); + return fileSlices.values().stream().filter(this::isFileSliceCommitted); } return Stream.empty(); } + public Stream getAllFileSlicesBeforeOn(String maxInstantTime) { + return fileSlices.values().stream().filter(slice -> compareTimestamps(slice.getBaseInstantTime(), LESSER_THAN_OR_EQUALS, maxInstantTime)); + } + /** * Gets the latest slice - this can contain either. *
    @@ -174,19 +178,18 @@ public Option getLatestDataFile() { * Obtain the latest file slice, upto a instantTime i.e <= maxInstantTime. */ public Option getLatestFileSliceBeforeOrOn(String maxInstantTime) { - return Option.fromJavaOptional(getAllFileSlices().filter(slice -> HoodieTimeline - .compareTimestamps(slice.getBaseInstantTime(), HoodieTimeline.LESSER_THAN_OR_EQUALS, maxInstantTime)).findFirst()); + return Option.fromJavaOptional(getAllFileSlices().filter(slice -> compareTimestamps(slice.getBaseInstantTime(), LESSER_THAN_OR_EQUALS, maxInstantTime)).findFirst()); } /** * Obtain the latest file slice, upto an instantTime i.e < maxInstantTime. * * @param maxInstantTime Max Instant Time - * @return + * @return the latest file slice */ public Option getLatestFileSliceBefore(String maxInstantTime) { return Option.fromJavaOptional(getAllFileSlices().filter( - slice -> HoodieTimeline.compareTimestamps(slice.getBaseInstantTime(), HoodieTimeline.LESSER_THAN, maxInstantTime)) + slice -> compareTimestamps(slice.getBaseInstantTime(), LESSER_THAN, maxInstantTime)) .findFirst()); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java index ceea16af7aec3..cf98d50868c78 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieFileGroupId.java @@ -69,7 +69,7 @@ public String toString() { public int compareTo(HoodieFileGroupId o) { int ret = partitionPath.compareTo(o.partitionPath); if (ret == 0) { - ret = fileId.compareTo(fileId); + ret = fileId.compareTo(o.fileId); } return ret; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java index c40bdc45c4b32..9030204099ae8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieKey.java @@ -29,19 +29,29 @@ */ public class HoodieKey implements Serializable { - private final String recordKey; + private String recordKey; + private String partitionPath; - private final String partitionPath; + public HoodieKey() { + } public HoodieKey(String recordKey, String partitionPath) { this.recordKey = recordKey; this.partitionPath = partitionPath; } + public void setRecordKey(String recordKey) { + this.recordKey = recordKey; + } + public String getRecordKey() { return recordKey; } + public void setPartitionPath(String partitionPath) { + this.partitionPath = partitionPath; + } + public String getPartitionPath() { return partitionPath; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java index fa7f9b1bbe5d7..059dcba513c8c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java @@ -18,11 +18,10 @@ package org.apache.hudi.common.model; -import org.apache.hudi.common.fs.FSUtils; - import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; import java.io.IOException; import java.io.Serializable; @@ -39,8 +38,12 @@ public class HoodieLogFile implements Serializable { private static final long serialVersionUID = 1L; public static final String 
DELTA_EXTENSION = ".log"; + public static final String LOG_FILE_PREFIX = "."; public static final Integer LOGFILE_BASE_VERSION = 1; + private static final Comparator LOG_FILE_COMPARATOR = new LogFileComparator(); + private static final Comparator LOG_FILE_COMPARATOR_REVERSED = new LogFileComparator().reversed(); + private transient FileStatus fileStatus; private final String pathStr; private long fileLen; @@ -60,7 +63,13 @@ public HoodieLogFile(FileStatus fileStatus) { public HoodieLogFile(Path logPath) { this.fileStatus = null; this.pathStr = logPath.toString(); - this.fileLen = 0; + this.fileLen = -1; + } + + public HoodieLogFile(Path logPath, Long fileLen) { + this.fileStatus = null; + this.pathStr = logPath.toString(); + this.fileLen = fileLen; } public HoodieLogFile(String logPathStr) { @@ -124,11 +133,11 @@ public HoodieLogFile rollOver(FileSystem fs, String logWriteToken) throws IOExce } public static Comparator getLogFileComparator() { - return new LogFileComparator(); + return LOG_FILE_COMPARATOR; } public static Comparator getReverseLogFileComparator() { - return new LogFileComparator().reversed(); + return LOG_FILE_COMPARATOR_REVERSED; } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieMetadataWrapper.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieMetadataWrapper.java new file mode 100644 index 0000000000000..ecc18a7dfaddd --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieMetadataWrapper.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; + +public class HoodieMetadataWrapper { + + private HoodieArchivedMetaEntry avroMetadataFromTimeline; + private HoodieCommitMetadata commitMetadata; + private boolean isAvroMetadata = false; + + public HoodieMetadataWrapper(HoodieArchivedMetaEntry avroMetadataFromTimeline) { + this.avroMetadataFromTimeline = avroMetadataFromTimeline; + this.isAvroMetadata = true; + } + + public HoodieMetadataWrapper(HoodieCommitMetadata commitMetadata) { + this.commitMetadata = commitMetadata; + } + + public HoodieArchivedMetaEntry getMetadataFromTimeline() { + return avroMetadataFromTimeline; + } + + public HoodieCommitMetadata getCommitMetadata() { + return commitMetadata; + } + + public boolean isAvroMetadata() { + return isAvroMetadata; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieOperation.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieOperation.java new file mode 100644 index 0000000000000..47625820a7fbe --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieOperation.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.util.Option; + +/** + * Represents the changes that a row can describe in a changelog. + */ +public enum HoodieOperation { + /** + * Insert operation. + */ + INSERT("I", (byte) 0), + /** + * Update operation with previous record content, + * should be used together with {@link #UPDATE_AFTER} for modeling an update operation. + */ + UPDATE_BEFORE("-U", (byte) 1), + /** + * Update operation with new record content. + */ + UPDATE_AFTER("U", (byte) 2), + /** + * Delete operation. + */ + DELETE("D", (byte) 3); + + private final String name; + private final byte value; + + HoodieOperation(String name, byte value) { + this.name = name; + this.value = value; + } + + public String getName() { + return name; + } + + public byte getValue() { + return value; + } + + public static HoodieOperation fromValue(byte value) { + switch (value) { + case 0: + return INSERT; + case 1: + return UPDATE_BEFORE; + case 2: + return UPDATE_AFTER; + case 3: + return DELETE; + default: + throw new AssertionError(); + } + } + + public static HoodieOperation fromName(Option nameOpt) { + if (!nameOpt.isPresent()) { + return null; + } + return fromName(nameOpt.get()); + } + + public static HoodieOperation fromName(String name) { + switch (name) { + case "I": + return INSERT; + case "-U": + return UPDATE_BEFORE; + case "U": + return UPDATE_AFTER; + case "D": + return DELETE; + default: + throw new AssertionError(); + } + } + + /** + * Returns whether the operation is INSERT. 
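The `HoodieOperation` enum added here carries the changelog flag both as a short name (`I`, `-U`, `U`, `D`) and as a byte, with `fromName`/`fromValue` as the inverse lookups. A quick round-trip sketch, assuming `hudi-common` is on the classpath:

```java
import org.apache.hudi.common.model.HoodieOperation;

// Small round-trip check over the change-flag enum added above.
public class HoodieOperationRoundTrip {
    public static void main(String[] args) {
        for (HoodieOperation op : HoodieOperation.values()) {
            // Both the byte code and the short name map back to the same constant.
            assert HoodieOperation.fromValue(op.getValue()) == op;
            assert HoodieOperation.fromName(op.getName()) == op;
            System.out.println(op + " -> name=" + op.getName() + ", value=" + op.getValue());
        }
    }
}
```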
+ */ + public static boolean isInsert(HoodieOperation operation) { + return operation == INSERT; + } + + /** + * Returns whether the operation is UPDATE_BEFORE. + */ + public static boolean isUpdateBefore(HoodieOperation operation) { + return operation == UPDATE_BEFORE; + } + + /** + * Returns whether the operation is UPDATE_AFTER. + */ + public static boolean isUpdateAfter(HoodieOperation operation) { + return operation == UPDATE_AFTER; + } + + /** + * Returns whether the operation is DELETE. + */ + public static boolean isDelete(HoodieOperation operation) { + return operation == DELETE; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java index faad46653ad25..89bad1c33f599 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java @@ -18,26 +18,47 @@ package org.apache.hudi.common.model; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.avro.Schema; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.orc.OrcFile; +import org.apache.orc.Writer; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Map; import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.Stream; /** * The metadata that goes into the meta file in each partition. */ public class HoodiePartitionMetadata { - public static final String HOODIE_PARTITION_METAFILE = ".hoodie_partition_metadata"; - public static final String PARTITION_DEPTH_KEY = "partitionDepth"; + public static final String HOODIE_PARTITION_METAFILE_PREFIX = ".hoodie_partition_metadata"; public static final String COMMIT_TIME_KEY = "commitTime"; + private static final String PARTITION_DEPTH_KEY = "partitionDepth"; + private static final Logger LOG = LogManager.getLogger(HoodiePartitionMetadata.class); /** * Contents of the metadata. @@ -51,7 +72,8 @@ public class HoodiePartitionMetadata { private final FileSystem fs; - private static final Logger LOG = LogManager.getLogger(HoodiePartitionMetadata.class); + // The format in which to write the partition metadata + private Option format; /** * Construct metadata from existing partition. @@ -60,13 +82,15 @@ public HoodiePartitionMetadata(FileSystem fs, Path partitionPath) { this.fs = fs; this.props = new Properties(); this.partitionPath = partitionPath; + this.format = Option.empty(); } /** * Construct metadata object to be written out. 
*/ - public HoodiePartitionMetadata(FileSystem fs, String instantTime, Path basePath, Path partitionPath) { + public HoodiePartitionMetadata(FileSystem fs, String instantTime, Path basePath, Path partitionPath, Option format) { this(fs, partitionPath); + this.format = format; props.setProperty(COMMIT_TIME_KEY, instantTime); props.setProperty(PARTITION_DEPTH_KEY, String.valueOf(partitionPath.depth() - basePath.depth())); } @@ -82,21 +106,17 @@ public int getPartitionDepth() { * Write the metadata safely into partition atomically. */ public void trySave(int taskPartitionId) { + String extension = getMetafileExtension(); Path tmpMetaPath = - new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE + "_" + taskPartitionId); - Path metaPath = new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); + new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX + "_" + taskPartitionId + extension); + Path metaPath = new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX + extension); boolean metafileExists = false; try { metafileExists = fs.exists(metaPath); if (!metafileExists) { // write to temporary file - FSDataOutputStream os = fs.create(tmpMetaPath, true); - props.store(os, "partition metadata"); - os.hsync(); - os.hflush(); - os.close(); - + writeMetafile(tmpMetaPath); // move to actual path fs.rename(tmpMetaPath, metaPath); } @@ -117,30 +137,171 @@ public void trySave(int taskPartitionId) { } } + private String getMetafileExtension() { + // To be backwards compatible, there is no extension to the properties file base partition metafile + return format.isPresent() ? format.get().getFileExtension() : StringUtils.EMPTY_STRING; + } + + /** + * Write the partition metadata in the correct format in the given file path. + * + * @param filePath Path of the file to write + * @throws IOException + */ + private void writeMetafile(Path filePath) throws IOException { + if (format.isPresent()) { + Schema schema = HoodieAvroUtils.getRecordKeySchema(); + + switch (format.get()) { + case PARQUET: + // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other + // parameters are not important. + MessageType type = Types.buildMessage().optional(PrimitiveTypeName.INT64).named("dummyint").named("dummy"); + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(type, schema, Option.empty()); + try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.UNCOMPRESSED, 1024, 1024)) { + for (String key : props.stringPropertyNames()) { + writeSupport.addFooterMetadata(key, props.getProperty(key)); + } + } + break; + case ORC: + // Since we are only interested in saving metadata to the footer, the schema, blocksizes and other + // parameters are not important. 
+ OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(fs.getConf()).fileSystem(fs) + .setSchema(AvroOrcUtils.createOrcSchema(schema)); + try (Writer writer = OrcFile.createWriter(filePath, writerOptions)) { + for (String key : props.stringPropertyNames()) { + writer.addUserMetadata(key, ByteBuffer.wrap(props.getProperty(key).getBytes())); + } + } + break; + default: + throw new HoodieException("Unsupported format for partition metafiles: " + format.get()); + } + } else { + // Backwards compatible properties file format + FSDataOutputStream os = fs.create(filePath, true); + props.store(os, "partition metadata"); + os.hsync(); + os.hflush(); + os.close(); + } + } + /** * Read out the metadata for this partition. */ public void readFromFS() throws IOException { - FSDataInputStream is = null; - try { - Path metaFile = new Path(partitionPath, HOODIE_PARTITION_METAFILE); - is = fs.open(metaFile); + // first try reading the text format (legacy, currently widespread) + boolean readFile = readTextFormatMetaFile(); + if (!readFile) { + // now try reading the base file formats. + readFile = readBaseFormatMetaFile(); + } + + // throw exception. + if (!readFile) { + throw new HoodieException("Unable to read any partition meta file to locate the table timeline."); + } + } + + private boolean readTextFormatMetaFile() { + // Properties file format + Path metafilePath = textFormatMetaFilePath(partitionPath); + try (FSDataInputStream is = fs.open(metafilePath)) { props.load(is); - } catch (IOException ioe) { - throw new HoodieException("Error reading Hoodie partition metadata for " + partitionPath, ioe); - } finally { - if (is != null) { - is.close(); + format = Option.empty(); + return true; + } catch (Throwable t) { + LOG.debug("Unable to read partition meta properties file for partition " + partitionPath); + return false; + } + } + + private boolean readBaseFormatMetaFile() { + for (Path metafilePath : baseFormatMetaFilePaths(partitionPath)) { + try { + BaseFileUtils reader = BaseFileUtils.getInstance(metafilePath.toString()); + // Data file format + Map metadata = reader.readFooter(fs.getConf(), true, metafilePath, PARTITION_DEPTH_KEY, COMMIT_TIME_KEY); + props.clear(); + props.putAll(metadata); + format = Option.of(reader.getFormat()); + return true; + } catch (Throwable t) { + LOG.debug("Unable to read partition metadata " + metafilePath.getName() + " for partition " + partitionPath); } } + return false; + } + + /** + * Read out the COMMIT_TIME_KEY metadata for this partition. + */ + public Option readPartitionCreatedCommitTime() { + try { + if (!props.containsKey(COMMIT_TIME_KEY)) { + readFromFS(); + } + return Option.of(props.getProperty(COMMIT_TIME_KEY)); + } catch (IOException ioe) { + LOG.warn("Error fetch Hoodie partition metadata for " + partitionPath, ioe); + return Option.empty(); + } } // methods related to partition meta data public static boolean hasPartitionMetadata(FileSystem fs, Path partitionPath) { try { - return fs.exists(new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)); + return textFormatMetaPathIfExists(fs, partitionPath).isPresent() + || baseFormatMetaPathIfExists(fs, partitionPath).isPresent(); + } catch (IOException ioe) { + throw new HoodieIOException("Error checking presence of partition meta file for " + partitionPath, ioe); + } + } + + /** + * Returns the name of the partition metadata. 
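`readFromFS` now probes the legacy properties-format metafile first and only then falls back to the footer-based Parquet/ORC variants. A simplified sketch of that lookup order against a Hadoop `FileSystem`; the helper class and the table path below are placeholders, not Hudi API:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

// Sketch of the partition metafile lookup order: the legacy properties file (no extension)
// is checked first, then the base-file-format variants, Parquet before ORC.
public class PartitionMetafileLookupSketch {

    private static final String METAFILE_PREFIX = ".hoodie_partition_metadata";
    private static final List<String> BASE_FORMAT_EXTENSIONS = Arrays.asList(".parquet", ".orc");

    static Optional<Path> findPartitionMetafile(FileSystem fs, Path partitionPath) throws IOException {
        // 1. Legacy and most common case: plain properties file with no extension.
        Path textFormatPath = new Path(partitionPath, METAFILE_PREFIX);
        if (fs.exists(textFormatPath)) {
            return Optional.of(textFormatPath);
        }
        // 2. Otherwise look for a footer-based metafile.
        for (String ext : BASE_FORMAT_EXTENSIONS) {
            Path candidate = new Path(partitionPath, METAFILE_PREFIX + ext);
            if (fs.exists(candidate)) {
                return Optional.of(candidate);
            }
        }
        return Optional.empty();
    }

    public static void main(String[] args) throws IOException {
        Path partition = new Path("/tmp/hudi_table/2023/01/01");   // placeholder path
        FileSystem fs = partition.getFileSystem(new Configuration());
        System.out.println(findPartitionMetafile(fs, partition));
    }
}
```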
+ * + * @return Name of the partition metafile or empty option + */ + public static Option getPartitionMetafilePath(FileSystem fs, Path partitionPath) { + // The partition listing is a costly operation so instead we are searching for existence of the files instead. + // This is in expected order as properties file based partition metafiles should be the most common. + try { + Option textFormatPath = textFormatMetaPathIfExists(fs, partitionPath); + if (textFormatPath.isPresent()) { + return textFormatPath; + } else { + return baseFormatMetaPathIfExists(fs, partitionPath); + } } catch (IOException ioe) { throw new HoodieException("Error checking Hoodie partition metadata for " + partitionPath, ioe); } } + + public static Option baseFormatMetaPathIfExists(FileSystem fs, Path partitionPath) throws IOException { + // Parquet should be more common than ORC so check it first + for (Path metafilePath : baseFormatMetaFilePaths(partitionPath)) { + if (fs.exists(metafilePath)) { + return Option.of(metafilePath); + } + } + return Option.empty(); + } + + public static Option textFormatMetaPathIfExists(FileSystem fs, Path partitionPath) throws IOException { + Path path = textFormatMetaFilePath(partitionPath); + return Option.ofNullable(fs.exists(path) ? path : null); + } + + static Path textFormatMetaFilePath(Path partitionPath) { + return new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX); + } + + static List baseFormatMetaFilePaths(Path partitionPath) { + return Stream.of(HoodieFileFormat.PARQUET.getFileExtension(), HoodieFileFormat.ORC.getFileExtension()) + .map(ext -> new Path(partitionPath, HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX + ext)) + .collect(Collectors.toList()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePayloadProps.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePayloadProps.java new file mode 100644 index 0000000000000..b04b8dfdd768c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePayloadProps.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; + +import java.util.Properties; + +/** + * Holds payload properties that implementation of {@link HoodieRecordPayload} can leverage. + * Since both payload classes and HoodiePayloadConfig needs to access these props, storing it here in hudi-common. + */ +public class HoodiePayloadProps { + + /** + * Property for payload ordering field; to be used to merge incoming record with that in storage. + * Implementations of {@link HoodieRecordPayload} can leverage if required. 
+ * + * @see DefaultHoodieRecordPayload + */ + public static final String PAYLOAD_ORDERING_FIELD_PROP_KEY = "hoodie.payload.ordering.field"; + + /** + * Property for payload event time field; to be used to extract source event time info. + * + * @see DefaultHoodieRecordPayload + */ + public static final String PAYLOAD_EVENT_TIME_FIELD_PROP_KEY = "hoodie.payload.event.time.field"; + + /** + * A runtime config pass to the {@link HoodieRecordPayload#getInsertValue(Schema, Properties)} + * to tell if the current record is a update record or insert record for mor table. + */ + public static final String PAYLOAD_IS_UPDATE_RECORD_FOR_MOR = "hoodie.is.update.record.for.mor"; + + /** @deprecated Use {@link #PAYLOAD_ORDERING_FIELD_PROP_KEY} */ + @Deprecated + public static final String PAYLOAD_ORDERING_FIELD_PROP = PAYLOAD_ORDERING_FIELD_PROP_KEY; + @Deprecated + public static String DEFAULT_PAYLOAD_ORDERING_FIELD_VAL = "ts"; + /** @deprecated Use {@link #PAYLOAD_EVENT_TIME_FIELD_PROP_KEY} */ + @Deprecated + public static final String PAYLOAD_EVENT_TIME_FIELD_PROP = PAYLOAD_EVENT_TIME_FIELD_PROP_KEY; + @Deprecated + public static String DEFAULT_PAYLOAD_EVENT_TIME_FIELD_VAL = "ts"; +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java index ce2562ddc554d..2a3edafb8f27d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java @@ -18,36 +18,52 @@ package org.apache.hudi.common.model; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import java.io.Serializable; import java.util.List; +import java.util.Map; import java.util.Objects; -import org.apache.hudi.common.util.collection.Pair; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; /** * A Single Record managed by Hoodie. */ -public class HoodieRecord implements Serializable { +public abstract class HoodieRecord implements Serializable { public static final String COMMIT_TIME_METADATA_FIELD = "_hoodie_commit_time"; public static final String COMMIT_SEQNO_METADATA_FIELD = "_hoodie_commit_seqno"; public static final String RECORD_KEY_METADATA_FIELD = "_hoodie_record_key"; public static final String PARTITION_PATH_METADATA_FIELD = "_hoodie_partition_path"; public static final String FILENAME_METADATA_FIELD = "_hoodie_file_name"; + public static final String OPERATION_METADATA_FIELD = "_hoodie_operation"; + public static final String HOODIE_IS_DELETED = "_hoodie_is_deleted"; public static final List HOODIE_META_COLUMNS = CollectionUtils.createImmutableList(COMMIT_TIME_METADATA_FIELD, COMMIT_SEQNO_METADATA_FIELD, RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD, FILENAME_METADATA_FIELD); + // Temporary to support the '_hoodie_operation' field, once we solve + // the compatibility problem, it can be removed. 
+ public static final Set HOODIE_META_COLUMNS_WITH_OPERATION = + CollectionUtils.createImmutableSet(COMMIT_TIME_METADATA_FIELD, COMMIT_SEQNO_METADATA_FIELD, + RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD, FILENAME_METADATA_FIELD, + OPERATION_METADATA_FIELD); + public static final Map HOODIE_META_COLUMNS_NAME_TO_POS = IntStream.range(0, HOODIE_META_COLUMNS.size()).mapToObj(idx -> Pair.of(HOODIE_META_COLUMNS.get(idx), idx)) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + public static int RECORD_KEY_META_FIELD_ORD = HOODIE_META_COLUMNS_NAME_TO_POS.get(RECORD_KEY_METADATA_FIELD); + public static int PARTITION_PATH_META_FIELD_ORD = HOODIE_META_COLUMNS_NAME_TO_POS.get(PARTITION_PATH_METADATA_FIELD); + public static int FILENAME_META_FIELD_ORD = HOODIE_META_COLUMNS_NAME_TO_POS.get(FILENAME_METADATA_FIELD); + public static int COMMIT_TIME_METADATA_FIELD_ORD = HOODIE_META_COLUMNS_NAME_TO_POS.get(COMMIT_TIME_METADATA_FIELD); + public static int COMMIT_SEQNO_METADATA_FIELD_ORD = HOODIE_META_COLUMNS_NAME_TO_POS.get(COMMIT_SEQNO_METADATA_FIELD); + /** * Identifies the record across the table. */ @@ -56,7 +72,7 @@ public class HoodieRecord implements Serializable /** * Actual payload of the record. */ - private T data; + protected T data; /** * Current location of record on storage. Filled in by looking up index @@ -73,12 +89,22 @@ public class HoodieRecord implements Serializable */ private boolean sealed; + /** + * The cdc operation. + */ + private HoodieOperation operation; + public HoodieRecord(HoodieKey key, T data) { + this(key, data, null); + } + + public HoodieRecord(HoodieKey key, T data, HoodieOperation operation) { this.key = key; this.data = data; this.currentLocation = null; this.newLocation = null; this.sealed = false; + this.operation = operation; } public HoodieRecord(HoodieRecord record) { @@ -86,12 +112,22 @@ public HoodieRecord(HoodieRecord record) { this.currentLocation = record.currentLocation; this.newLocation = record.newLocation; this.sealed = record.sealed; + this.operation = record.operation; } + public HoodieRecord() { + } + + public abstract HoodieRecord newInstance(); + public HoodieKey getKey() { return key; } + public HoodieOperation getOperation() { + return operation; + } + public T getData() { if (data == null) { throw new IllegalStateException("Payload already deflated for record."); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordGlobalLocation.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordGlobalLocation.java new file mode 100644 index 0000000000000..f469a1ab451c2 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordGlobalLocation.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
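`HOODIE_META_COLUMNS_NAME_TO_POS` and the new `*_ORD` constants give constant-time access to a meta column's position. The map is built by indexing the column list with an `IntStream`; a stand-alone sketch of the same idea, where the helper class is illustrative and only the column names are taken from the constants above:

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

// Sketch of the name -> position map built for the Hudi meta columns.
public class MetaColumnOrdinals {

    static final List<String> META_COLUMNS = List.of(
        "_hoodie_commit_time", "_hoodie_commit_seqno", "_hoodie_record_key",
        "_hoodie_partition_path", "_hoodie_file_name");

    static final Map<String, Integer> NAME_TO_POS = IntStream.range(0, META_COLUMNS.size())
        .boxed()
        .collect(Collectors.toMap(META_COLUMNS::get, idx -> idx));

    public static void main(String[] args) {
        // Ordinals can then be resolved once and reused, e.g. for positional reads of a row.
        int recordKeyOrd = NAME_TO_POS.get("_hoodie_record_key");   // 2
        System.out.println("record key ordinal = " + recordKeyOrd);
    }
}
```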
+ */ + +package org.apache.hudi.common.model; + +import java.util.Objects; + +/** + * Similar with {@link org.apache.hudi.common.model.HoodieRecordLocation} but with partition path. + */ +public class HoodieRecordGlobalLocation extends HoodieRecordLocation { + private static final long serialVersionUID = 1L; + + private String partitionPath; + + public HoodieRecordGlobalLocation() { + } + + public HoodieRecordGlobalLocation(String partitionPath, String instantTime, String fileId) { + super(instantTime, fileId); + this.partitionPath = partitionPath; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieGlobalRecordLocation {"); + sb.append("partitionPath=").append(partitionPath).append(", "); + sb.append("instantTime=").append(instantTime).append(", "); + sb.append("fileId=").append(fileId); + sb.append('}'); + return sb.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HoodieRecordGlobalLocation otherLoc = (HoodieRecordGlobalLocation) o; + return Objects.equals(partitionPath, otherLoc.partitionPath) + && Objects.equals(instantTime, otherLoc.instantTime) + && Objects.equals(fileId, otherLoc.fileId); + } + + @Override + public int hashCode() { + return Objects.hash(partitionPath, instantTime, fileId); + } + + public String getPartitionPath() { + return partitionPath; + } + + public void setPartitionPath(String partitionPath) { + this.partitionPath = partitionPath; + } + + /** + * Returns the global record location from local. + */ + public static HoodieRecordGlobalLocation fromLocal(String partitionPath, HoodieRecordLocation localLoc) { + return new HoodieRecordGlobalLocation(partitionPath, localLoc.getInstantTime(), localLoc.getFileId()); + } + + /** + * Returns the record location as local. + */ + public HoodieRecordLocation toLocal(String instantTime) { + return new HoodieRecordLocation(instantTime, fileId); + } + + /** + * Copy the location with given partition path. 
+ */ + public HoodieRecordGlobalLocation copy(String partitionPath) { + return new HoodieRecordGlobalLocation(partitionPath, instantTime, fileId); + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java index 690db88375200..2b1feab39b9cb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordLocation.java @@ -26,8 +26,11 @@ */ public class HoodieRecordLocation implements Serializable { - private final String instantTime; - private final String fileId; + protected String instantTime; + protected String fileId; + + public HoodieRecordLocation() { + } public HoodieRecordLocation(String instantTime, String fileId) { this.instantTime = instantTime; @@ -64,7 +67,15 @@ public String getInstantTime() { return instantTime; } + public void setInstantTime(String instantTime) { + this.instantTime = instantTime; + } + public String getFileId() { return fileId; } + + public void setFileId(String fileId) { + this.fileId = fileId; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java index 1afdd1b59af64..6752607d2f48c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java @@ -29,50 +29,101 @@ import java.io.IOException; import java.io.Serializable; import java.util.Map; +import java.util.Properties; /** - * Every Hoodie table has an implementation of the HoodieRecordPayload This abstracts out callbacks which - * depend on record specific logic. + * Every Hoodie table has an implementation of the HoodieRecordPayload This abstracts out callbacks which depend on record specific logic. */ @PublicAPIClass(maturity = ApiMaturityLevel.STABLE) public interface HoodieRecordPayload extends Serializable { /** - * When more than one HoodieRecord have the same HoodieKey, this function combines them before attempting to - * insert/upsert (if combining turned on in HoodieClientConfig). + * This method is deprecated. Please use this {@link #preCombine(HoodieRecordPayload, Properties)} method. + */ + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + T preCombine(T oldValue); + + /** + * When more than one HoodieRecord have the same HoodieKey in the incoming batch, this function combines them before attempting to insert/upsert by taking in a property map. + * Implementation can leverage the property to decide their business logic to do preCombine. + * + * @param oldValue instance of the old {@link HoodieRecordPayload} to be combined with. + * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage. + * + * @return the combined value */ @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) - T preCombine(T another); + default T preCombine(T oldValue, Properties properties) { + return preCombine(oldValue); + } + + /** + * This methods is deprecated. Please refer to {@link #combineAndGetUpdateValue(IndexedRecord, Schema, Properties)} for java docs. 
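`HoodieRecordPayload` evolves here by deprecating the single-argument `preCombine` and adding a `Properties`-aware overload as a default method that delegates to it, so existing payload implementations keep compiling and behaving unchanged. A compact sketch of that interface-evolution pattern with a made-up payload type:

```java
import java.util.Properties;

// Sketch of the default-method evolution used above; Payload and MyPayload are illustrative.
public class PreCombineEvolutionSketch {

    interface Payload<T extends Payload<T>> {
        @Deprecated
        T preCombine(T oldValue);

        default T preCombine(T oldValue, Properties properties) {
            // Old implementations only override the deprecated variant; new callers can still
            // pass properties and get the legacy behavior.
            return preCombine(oldValue);
        }
    }

    static final class MyPayload implements Payload<MyPayload> {
        final long orderingVal;

        MyPayload(long orderingVal) {
            this.orderingVal = orderingVal;
        }

        @Override
        public MyPayload preCombine(MyPayload oldValue) {
            // Keep the payload with the greatest ordering value.
            return oldValue.orderingVal > this.orderingVal ? oldValue : this;
        }
    }

    public static void main(String[] args) {
        MyPayload winner = new MyPayload(1L).preCombine(new MyPayload(5L), new Properties());
        System.out.println(winner.orderingVal);   // 5
    }
}
```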
+ */ + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) + Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException; /** - * This methods lets you write custom merging/combining logic to produce new values as a function of current value on - * storage and whats contained in this object. + * This methods lets you write custom merging/combining logic to produce new values as a function of current value on storage and whats contained + * in this object. Implementations can leverage properties if required. *
    - * eg: 1) You are updating counters, you may want to add counts to currentValue and write back updated counts 2) You - * may be reading DB redo logs, and merge them with current image for a database row on storage + * eg: + * 1) You are updating counters, you may want to add counts to currentValue and write back updated counts + * 2) You may be reading DB redo logs, and merge them with current image for a database row on storage + *
    * * @param currentValue Current value in storage, to merge/combine this payload with * @param schema Schema used for record + * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage. * @return new combined/merged value to be written back to storage. EMPTY to skip writing this record. */ - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) - Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException; + default Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException { + return combineAndGetUpdateValue(currentValue, schema); + } /** - * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. Called when writing a - * new value for the given HoodieKey, wherein there is no existing record in storage to be combined against. (i.e - * insert) Return EMPTY to skip writing this record. + * This method is deprecated. Refer to {@link #getInsertValue(Schema, Properties)} for java docs. + * @param schema Schema used for record + * @return the {@link IndexedRecord} to be inserted. */ - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated + @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) Option getInsertValue(Schema schema) throws IOException; /** - * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is passed to - * {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()} in order to compute some aggregate metrics - * using the metadata in the context of a write success or failure. + * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. Called when writing a new value for the given + * HoodieKey, wherein there is no existing record in storage to be combined against. (i.e insert) Return EMPTY to skip writing this record. + * Implementations can leverage properties if required. + * @param schema Schema used for record + * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage. + * @return the {@link IndexedRecord} to be inserted. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + default Option getInsertValue(Schema schema, Properties properties) throws IOException { + return getInsertValue(schema); + } + + /** + * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is passed to {@code WriteStatus.markSuccess()} and + * {@code WriteStatus.markFailure()} in order to compute some aggregate metrics using the metadata in the context of a write success or failure. + * @return the metadata in the form of Map if any. */ @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) default Option> getMetadata() { return Option.empty(); } + + /** + * This method can be used to extract the ordering value of the payload for combining/merging, + * or 0 if no value is specified which means natural order(arrival time is used). 
+ * + * @return the ordering value + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + default Comparable getOrderingValue() { + // default natural order + return 0; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java index 7cc9ee3a0c146..2dd6cda47d3db 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java @@ -18,11 +18,9 @@ package org.apache.hudi.common.model; -import com.fasterxml.jackson.annotation.JsonAutoDetect; +import org.apache.hudi.common.util.JsonUtils; + import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.fasterxml.jackson.annotation.PropertyAccessor; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -80,7 +78,7 @@ public String toJsonString() throws IOException { LOG.info("partition path is null for " + partitionToReplaceFileIds.get(null)); partitionToReplaceFileIds.remove(null); } - return getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); + return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); } public static T fromJsonString(String jsonStr, Class clazz) throws Exception { @@ -88,7 +86,7 @@ public static T fromJsonString(String jsonStr, Class clazz) throws Except // For empty commit file (no data or somethings bad happen). return clazz.newInstance(); } - return getObjectMapper().readValue(jsonStr, clazz); + return JsonUtils.getObjectMapper().readValue(jsonStr, clazz); } @Override @@ -124,13 +122,6 @@ public static T fromBytes(byte[] bytes, Class clazz) throws IOException { } } - protected static ObjectMapper getObjectMapper() { - ObjectMapper mapper = new ObjectMapper(); - mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES); - mapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); - return mapper; - } - @Override public String toString() { return "HoodieReplaceMetadata{" + "partitionToWriteStats=" + partitionToWriteStats diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRollingStatMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRollingStatMetadata.java index a354092675e4f..0a5240ed55d83 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRollingStatMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRollingStatMetadata.java @@ -18,6 +18,8 @@ package org.apache.hudi.common.model; +import org.apache.hudi.common.util.JsonUtils; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -81,7 +83,7 @@ public String toJsonString() throws IOException { LOG.info("partition path is null for " + partitionToRollingStats.get(null)); partitionToRollingStats.remove(null); } - return HoodieCommitMetadata.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); + return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); } public HoodieRollingStatMetadata merge(HoodieRollingStatMetadata rollingStatMetadata) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableQueryType.java 
b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableQueryType.java new file mode 100644 index 0000000000000..f1d7557ae22f8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableQueryType.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +/** + * Hudi table could be queried in one of the 3 following ways: + * + *
+ * <ol>
+ *   <li>Snapshot: snapshot of the table at the given (latest if not provided) instant is queried</li>
+ *
+ *   <li>Read Optimized (MOR only): snapshot of the table at the given (latest if not provided)
+ *   instant is queried, but w/o reading any of the delta-log files (only reading base-files)</li>
+ *
+ *   <li>Incremental: only records added w/in the given time-window (defined by beginning and ending instant)
+ *   are queried</li>
+ * </ol>
    + */ +public enum HoodieTableQueryType { + SNAPSHOT, + INCREMENTAL, + READ_OPTIMIZED +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTimelineTimeZone.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTimelineTimeZone.java new file mode 100644 index 0000000000000..9b1c695d491ea --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTimelineTimeZone.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +/** + * Hoodie TimelineZone. + */ +public enum HoodieTimelineTimeZone { + LOCAL("local"), + UTC("utc"); + + private final String timeZone; + + HoodieTimelineTimeZone(String timeZone) { + this.timeZone = timeZone; + } + + public String getTimeZone() { + return timeZone; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java index 97288dfe00890..928a186173386 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieWriteStat.java @@ -18,8 +18,6 @@ package org.apache.hudi.common.model; -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import org.apache.hadoop.fs.Path; import javax.annotation.Nullable; @@ -29,7 +27,6 @@ /** * Statistics about a single Hoodie write operation. */ -@JsonIgnoreProperties(ignoreUnknown = true) public class HoodieWriteStat implements Serializable { public static final String NULL_COMMIT = "null"; @@ -71,7 +68,7 @@ public class HoodieWriteStat implements Serializable { private long numInserts; /** - * Total size of file written. + * Total number of bytes written. */ private long totalWriteBytes; @@ -143,8 +140,19 @@ public class HoodieWriteStat implements Serializable { */ private long fileSizeInBytes; + /** + * The earliest of incoming records' event times (Epoch ms) for calculating latency. + */ + @Nullable + private Long minEventTime; + + /** + * The latest of incoming records' event times (Epoch ms) for calculating freshness. 
+ */ + @Nullable + private Long maxEventTime; + @Nullable - @JsonIgnore private RuntimeStats runtimeStats; public HoodieWriteStat() { @@ -303,6 +311,30 @@ public void setFileSizeInBytes(long fileSizeInBytes) { this.fileSizeInBytes = fileSizeInBytes; } + public Long getMinEventTime() { + return minEventTime; + } + + public void setMinEventTime(Long minEventTime) { + if (this.minEventTime == null) { + this.minEventTime = minEventTime; + } else { + this.minEventTime = Math.min(minEventTime, this.minEventTime); + } + } + + public Long getMaxEventTime() { + return maxEventTime; + } + + public void setMaxEventTime(Long maxEventTime) { + if (this.maxEventTime == null) { + this.maxEventTime = maxEventTime; + } else { + this.maxEventTime = Math.max(maxEventTime, this.maxEventTime); + } + } + @Nullable public RuntimeStats getRuntimeStats() { return runtimeStats; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java index fca65684700f2..9ce241bc7822f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java @@ -18,12 +18,13 @@ package org.apache.hudi.common.model; -import org.apache.hudi.common.util.Option; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericRecordBuilder; import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.common.util.Option; + import java.io.IOException; import java.util.List; @@ -60,15 +61,19 @@ public Option combineAndGetUpdateValue(IndexedRecord currentValue if (isDeleteRecord(insertRecord)) { return Option.empty(); } else { + final GenericRecordBuilder builder = new GenericRecordBuilder(schema); List fields = schema.getFields(); fields.forEach(field -> { Object value = insertRecord.get(field.name()); + value = field.schema().getType().equals(Schema.Type.STRING) && value != null ? value.toString() : value; Object defaultValue = field.defaultVal(); if (!overwriteField(value, defaultValue)) { - currentRecord.put(field.name(), value); + builder.set(field, value); + } else { + builder.set(field, currentRecord.get(field.name())); } }); - return Option.of(currentRecord); + return Option.of(builder.build()); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java index e1e61244bd01a..d8469ed5a148e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.model; +import org.apache.avro.JsonProperties; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.util.Option; @@ -26,6 +27,7 @@ import org.apache.avro.generic.IndexedRecord; import java.io.IOException; +import java.util.Objects; /** * Default payload used for delta streamer. 
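The new `minEventTime`/`maxEventTime` setters fold values in rather than overwrite them: the first observation seeds both bounds and later ones only push them outward, which is what makes repeated per-record calls safe. An illustrative stand-in (not `HoodieWriteStat` itself):

```java
// Sketch of how the min/max event-time fields fold incoming values.
public class EventTimeBoundsSketch {

    private Long minEventTime;   // epoch millis, null until the first record is seen
    private Long maxEventTime;

    void observe(long eventTimeMs) {
        minEventTime = (minEventTime == null) ? eventTimeMs : Math.min(minEventTime, eventTimeMs);
        maxEventTime = (maxEventTime == null) ? eventTimeMs : Math.max(maxEventTime, eventTimeMs);
    }

    public static void main(String[] args) {
        EventTimeBoundsSketch stats = new EventTimeBoundsSketch();
        for (long t : new long[] {1_700_000_200_000L, 1_700_000_100_000L, 1_700_000_300_000L}) {
            stats.observe(t);
        }
        // latency ~ (commit time - min), freshness ~ (commit time - max)
        System.out.println(stats.minEventTime + " .. " + stats.maxEventTime);
    }
}
```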
@@ -47,10 +49,14 @@ public OverwriteWithLatestAvroPayload(Option record) { } @Override - public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) { - // pick the payload with greatest ordering value - if (another.orderingVal.compareTo(orderingVal) > 0) { - return another; + public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload oldValue) { + if (oldValue.recordBytes.length == 0) { + // use natural order for delete record + return this; + } + if (oldValue.orderingVal.compareTo(orderingVal) > 0) { + // pick the payload with greatest ordering value + return oldValue; } else { return this; } @@ -79,7 +85,14 @@ public Option getInsertValue(Schema schema) throws IOException { * @returns {@code true} if record represents a delete record. {@code false} otherwise. */ protected boolean isDeleteRecord(GenericRecord genericRecord) { - Object deleteMarker = genericRecord.get("_hoodie_is_deleted"); + final String isDeleteKey = HoodieRecord.HOODIE_IS_DELETED; + // Modify to be compatible with new version Avro. + // The new version Avro throws for GenericRecord.get if the field name + // does not exist in the schema. + if (genericRecord.getSchema().getField(isDeleteKey) == null) { + return false; + } + Object deleteMarker = genericRecord.get(isDeleteKey); return (deleteMarker instanceof Boolean && (boolean) deleteMarker); } @@ -87,6 +100,14 @@ protected boolean isDeleteRecord(GenericRecord genericRecord) { * Return true if value equals defaultValue otherwise false. */ public Boolean overwriteField(Object value, Object defaultValue) { - return defaultValue == null ? value == null : defaultValue.toString().equals(value.toString()); + if (JsonProperties.NULL_VALUE.equals(defaultValue)) { + return value == null; + } + return Objects.equals(value, defaultValue); + } + + @Override + public Comparable getOrderingValue() { + return this.orderingVal; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/RewriteAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/RewriteAvroPayload.java new file mode 100644 index 0000000000000..d5c19b9116bbc --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/RewriteAvroPayload.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.common.util.Option; + +import java.io.IOException; + +/** + * Default payload used for rewrite use cases where we dont change schema. We dont need to serialize/deserialize avro record in payload. 
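`isDeleteRecord` now looks the `_hoodie_is_deleted` field up in the record's schema before reading it, because newer Avro versions throw on `GenericRecord.get` for unknown field names instead of returning null. A small sketch of that guard with a throwaway schema:

```java
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

// Sketch of the schema-guarded delete-marker check; the schema below is a throwaway example.
public class DeleteMarkerCheckSketch {

    static boolean isDeleteRecord(GenericRecord record, String deleteField) {
        if (record.getSchema().getField(deleteField) == null) {
            return false;   // schema has no delete marker at all
        }
        Object marker = record.get(deleteField);
        return marker instanceof Boolean && (Boolean) marker;
    }

    public static void main(String[] args) {
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Row\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"string\"},"
                + "{\"name\":\"_hoodie_is_deleted\",\"type\":\"boolean\",\"default\":false}]}");
        GenericRecord row = new GenericData.Record(schema);
        row.put("id", "k1");
        row.put("_hoodie_is_deleted", true);
        System.out.println(isDeleteRecord(row, "_hoodie_is_deleted"));   // true
    }
}
```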
+ */ +public class RewriteAvroPayload implements HoodieRecordPayload { + + private GenericRecord record; + + public RewriteAvroPayload(GenericRecord record) { + this.record = record; + } + + @Override + public RewriteAvroPayload preCombine(RewriteAvroPayload another) { + throw new UnsupportedOperationException("precombine is not expected for rewrite payload"); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { + return getInsertValue(schema); + } + + @Override + public Option getInsertValue(Schema schema) throws IOException { + return Option.of(record); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/TableServiceType.java b/hudi-common/src/main/java/org/apache/hudi/common/model/TableServiceType.java new file mode 100644 index 0000000000000..69dd30782ff77 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/TableServiceType.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.table.timeline.HoodieTimeline; + +/** + * Supported runtime table services. + */ +public enum TableServiceType { + ARCHIVE, COMPACT, CLUSTER, CLEAN; + + public String getAction() { + switch (this) { + case ARCHIVE: + // for table service type completeness; there is no timeline action associated with archive + return "NONE"; + case COMPACT: + return HoodieTimeline.COMPACTION_ACTION; + case CLEAN: + return HoodieTimeline.CLEAN_ACTION; + case CLUSTER: + return HoodieTimeline.REPLACE_COMMIT_ACTION; + default: + throw new IllegalArgumentException("Unknown table service " + this); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/WriteConcurrencyMode.java b/hudi-common/src/main/java/org/apache/hudi/common/model/WriteConcurrencyMode.java new file mode 100644 index 0000000000000..9fe66ca0a0b68 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/WriteConcurrencyMode.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
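The new `TableServiceType` enum maps each runtime table service to its timeline action, with `ARCHIVE` returning the `"NONE"` placeholder since archiving never creates a timeline instant of its own. A trivial usage sketch, assuming `hudi-common` is on the classpath:

```java
import org.apache.hudi.common.model.TableServiceType;

// Prints the timeline action resolved for each table service added above.
public class TableServiceActions {
    public static void main(String[] args) {
        for (TableServiceType service : TableServiceType.values()) {
            System.out.println(service + " -> " + service.getAction());
        }
    }
}
```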
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.exception.HoodieException; + +import java.util.Locale; + +/** + * Different concurrency modes for write operations. + */ +public enum WriteConcurrencyMode { + // Only a single writer can perform write ops + SINGLE_WRITER("single_writer"), + // Multiple writer can perform write ops with lazy conflict resolution using locks + OPTIMISTIC_CONCURRENCY_CONTROL("optimistic_concurrency_control"); + + private final String value; + + WriteConcurrencyMode(String value) { + this.value = value; + } + + /** + * Getter for write concurrency mode. + * @return + */ + public String value() { + return value; + } + + /** + * Convert string value to WriteConcurrencyMode. + */ + public static WriteConcurrencyMode fromValue(String value) { + switch (value.toLowerCase(Locale.ROOT)) { + case "single_writer": + return SINGLE_WRITER; + case "optimistic_concurrency_control": + return OPTIMISTIC_CONCURRENCY_CONTROL; + default: + throw new HoodieException("Invalid value of Type."); + } + } + + public boolean supportsOptimisticConcurrencyControl() { + return this == OPTIMISTIC_CONCURRENCY_CONTROL; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/WriteOperationType.java b/hudi-common/src/main/java/org/apache/hudi/common/model/WriteOperationType.java index 5f328a9bc69e4..f2f3809cf5c3a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/WriteOperationType.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/WriteOperationType.java @@ -42,8 +42,18 @@ public enum WriteOperationType { INSERT_OVERWRITE("insert_overwrite"), // cluster CLUSTER("cluster"), + // delete partition + DELETE_PARTITION("delete_partition"), // insert overwrite with dynamic partitioning INSERT_OVERWRITE_TABLE("insert_overwrite_table"), + // compact + COMPACT("compact"), + + INDEX("index"), + + // alter schema + ALTER_SCHEMA("alter_schema"), + // used for old version UNKNOWN("unknown"); @@ -74,8 +84,20 @@ public static WriteOperationType fromValue(String value) { return DELETE; case "insert_overwrite": return INSERT_OVERWRITE; + case "delete_partition": + return DELETE_PARTITION; case "insert_overwrite_table": return INSERT_OVERWRITE_TABLE; + case "cluster": + return CLUSTER; + case "compact": + return COMPACT; + case "index": + return INDEX; + case "alter_schema": + return ALTER_SCHEMA; + case "unknown": + return UNKNOWN; default: throw new HoodieException("Invalid value of Type."); } @@ -92,4 +114,8 @@ public String value() { public static boolean isChangingRecords(WriteOperationType operationType) { return operationType == UPSERT || operationType == UPSERT_PREPPED || operationType == DELETE; } + + public static boolean isOverwrite(WriteOperationType operationType) { + return operationType == INSERT_OVERWRITE || operationType == INSERT_OVERWRITE_TABLE; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java new file mode 100644 index 0000000000000..9082d572a4bdb --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
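WriteConcurrencyMode above resolves its value case-insensitively, so a writer can branch on the configured mode as sketched below (the configured string is a hypothetical user setting, not something read from a real config key):

    import org.apache.hudi.common.model.WriteConcurrencyMode;

    public class ConcurrencyModeSketch {
      public static void main(String[] args) {
        String configured = "OPTIMISTIC_CONCURRENCY_CONTROL"; // hypothetical user-supplied value
        WriteConcurrencyMode mode = WriteConcurrencyMode.fromValue(configured);
        if (mode.supportsOptimisticConcurrencyControl()) {
          System.out.println("multi-writer: conflict resolution and a lock provider are expected");
        } else {
          System.out.println("single writer: no lock provider is needed");
        }
      }
    }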
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; + +/** + * Base class that provides support for seamlessly applying changes captured via Debezium. + *
<p>
+ * Debezium change event types are determined from the op field in the payload
+ * <p>
+ * - For inserts, op=i
+ * - For deletes, op=d
+ * - For updates, op=u
+ * - For snapshot inserts, op=r
+ * <p>
    + * This payload implementation will issue matching insert, delete, updates against the hudi table + */ +public abstract class AbstractDebeziumAvroPayload extends OverwriteWithLatestAvroPayload { + + private static final Logger LOG = LogManager.getLogger(AbstractDebeziumAvroPayload.class); + + public AbstractDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public AbstractDebeziumAvroPayload(Option record) { + super(record); + } + + @Override + public Option getInsertValue(Schema schema) throws IOException { + IndexedRecord insertRecord = getInsertRecord(schema); + return handleDeleteOperation(insertRecord); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { + // Step 1: If the time occurrence of the current record in storage is higher than the time occurrence of the + // insert record (including a delete record), pick the current record. + if (shouldPickCurrentRecord(currentValue, getInsertRecord(schema), schema)) { + return Option.of(currentValue); + } + // Step 2: Pick the insert record (as a delete record if its a deleted event) + return getInsertValue(schema); + } + + protected abstract boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException; + + private Option handleDeleteOperation(IndexedRecord insertRecord) { + boolean delete = false; + if (insertRecord instanceof GenericRecord) { + GenericRecord record = (GenericRecord) insertRecord; + Object value = HoodieAvroUtils.getFieldVal(record, DebeziumConstants.FLATTENED_OP_COL_NAME); + delete = value != null && value.toString().equalsIgnoreCase(DebeziumConstants.DELETE_OP); + } + + return delete ? Option.empty() : Option.of(insertRecord); + } + + private IndexedRecord getInsertRecord(Schema schema) throws IOException { + return super.getInsertValue(schema).get(); + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java new file mode 100644 index 0000000000000..3acaca5137555 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Constants used by {@link DebeziumSource} and {@link DebeziumAvroPayload}. 
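handleDeleteOperation() above resolves a flattened Debezium event whose op column is "d" to Option.empty(), which is how a payload signals a delete to Hudi. A standalone sketch of that decision; the two-field schema is made up, and the real payload reads the column through HoodieAvroUtils rather than GenericRecord.get():

    import org.apache.avro.Schema;
    import org.apache.avro.SchemaBuilder;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hudi.common.model.debezium.DebeziumConstants;
    import org.apache.hudi.common.util.Option;

    public class DebeziumDeleteFilterSketch {
      static Option<GenericRecord> filterDelete(GenericRecord event) {
        Object op = event.get(DebeziumConstants.FLATTENED_OP_COL_NAME);
        boolean isDelete = op != null && op.toString().equalsIgnoreCase(DebeziumConstants.DELETE_OP);
        return isDelete ? Option.empty() : Option.of(event);
      }

      public static void main(String[] args) {
        Schema schema = SchemaBuilder.record("event").fields()
            .requiredString("id")
            .requiredString(DebeziumConstants.FLATTENED_OP_COL_NAME)
            .endRecord();
        GenericRecord deleteEvent = new GenericData.Record(schema);
        deleteEvent.put("id", "k1");
        deleteEvent.put(DebeziumConstants.FLATTENED_OP_COL_NAME, DebeziumConstants.DELETE_OP);
        System.out.println(filterDelete(deleteEvent).isPresent()); // false => treated as a delete
      }
    }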
+ */ +public class DebeziumConstants { + + // INPUT COLUMNS + public static final String INCOMING_BEFORE_FIELD = "before"; + public static final String INCOMING_AFTER_FIELD = "after"; + public static final String INCOMING_SOURCE_FIELD = "source"; + public static final String INCOMING_OP_FIELD = "op"; + public static final String INCOMING_TS_MS_FIELD = "ts_ms"; + + public static final String INCOMING_SOURCE_NAME_FIELD = "source.name"; + public static final String INCOMING_SOURCE_SCHEMA_FIELD = "source.schema"; + public static final String INCOMING_SOURCE_TS_MS_FIELD = "source.ts_ms"; + public static final String INCOMING_SOURCE_TXID_FIELD = "source.txId"; + + // INPUT COLUMNS SPECIFIC TO MYSQL + public static final String INCOMING_SOURCE_FILE_FIELD = "source.file"; + public static final String INCOMING_SOURCE_POS_FIELD = "source.pos"; + public static final String INCOMING_SOURCE_ROW_FIELD = "source.row"; + + // INPUT COLUMNS SPECIFIC TO POSTGRES + public static final String INCOMING_SOURCE_LSN_FIELD = "source.lsn"; + public static final String INCOMING_SOURCE_XMIN_FIELD = "source.xmin"; + + // OUTPUT COLUMNS + public static final String FLATTENED_OP_COL_NAME = "_change_operation_type"; + public static final String UPSTREAM_PROCESSING_TS_COL_NAME = "_upstream_event_processed_ts_ms"; + public static final String FLATTENED_SHARD_NAME = "db_shard_source_partition"; + public static final String FLATTENED_SCHEMA_NAME = "db_schema_source_partition"; + public static final String FLATTENED_TS_COL_NAME = "_event_origin_ts_ms"; + public static final String FLATTENED_TX_ID_COL_NAME = "_event_tx_id"; + + // OUTPUT COLUMNS SPECIFIC TO MYSQL + public static final String FLATTENED_FILE_COL_NAME = "_event_bin_file"; + public static final String FLATTENED_POS_COL_NAME = "_event_pos"; + public static final String FLATTENED_ROW_COL_NAME = "_event_row"; + public static final String ADDED_SEQ_COL_NAME = "_event_seq"; + + // OUTPUT COLUMNS SPECIFIC TO POSTGRES + public static final String FLATTENED_LSN_COL_NAME = "_event_lsn"; + public static final String FLATTENED_XMIN_COL_NAME = "_event_xmin"; + + // Other Constants + public static final String DELETE_OP = "d"; + + // List of meta data columns + public static List META_COLUMNS = Collections.unmodifiableList(Arrays.asList( + FLATTENED_OP_COL_NAME, + UPSTREAM_PROCESSING_TS_COL_NAME, + FLATTENED_TS_COL_NAME, + FLATTENED_TX_ID_COL_NAME, + FLATTENED_LSN_COL_NAME, + FLATTENED_XMIN_COL_NAME, + FLATTENED_SHARD_NAME + )); +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java new file mode 100644 index 0000000000000..f4dd9226495d4 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
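DebeziumConstants.META_COLUMNS above enumerates the bookkeeping columns the Debezium flow adds alongside the table data. A small illustrative sketch of filtering them out of a field list; the non-meta field names are hypothetical:

    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;

    import org.apache.hudi.common.model.debezium.DebeziumConstants;

    public class MetaColumnFilterSketch {
      public static void main(String[] args) {
        List<String> allFields = Arrays.asList(
            "id", "name",
            DebeziumConstants.FLATTENED_OP_COL_NAME,
            DebeziumConstants.FLATTENED_TS_COL_NAME);
        List<String> dataFields = allFields.stream()
            .filter(field -> !DebeziumConstants.META_COLUMNS.contains(field))
            .collect(Collectors.toList());
        System.out.println(dataFields); // [id, name]
      }
    }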
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieDebeziumAvroPayloadException; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Objects; + +/** + * Provides support for seamlessly applying changes captured via Debezium for MysqlDB. + *
<p>
+ * Debezium change event types are determined from the op field in the payload
+ * <p>
+ * - For inserts, op=i
+ * - For deletes, op=d
+ * - For updates, op=u
+ * - For snapshot inserts, op=r
+ * <p>
    + * This payload implementation will issue matching insert, delete, updates against the hudi table + */ +public class MySqlDebeziumAvroPayload extends AbstractDebeziumAvroPayload { + + private static final Logger LOG = LogManager.getLogger(MySqlDebeziumAvroPayload.class); + + public MySqlDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public MySqlDebeziumAvroPayload(Option record) { + super(record); + } + + private Option extractSeq(IndexedRecord record) { + Object value = ((GenericRecord) record).get(DebeziumConstants.ADDED_SEQ_COL_NAME); + return Option.ofNullable(Objects.toString(value, null)); + } + + @Override + protected boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException { + String insertSourceSeq = extractSeq(insertRecord) + .orElseThrow(() -> + new HoodieDebeziumAvroPayloadException(String.format("%s cannot be null in insert record: %s", + DebeziumConstants.ADDED_SEQ_COL_NAME, insertRecord))); + Option currentSourceSeqOpt = extractSeq(currentRecord); + // Pick the current value in storage only if its Seq (file+pos) is latest + // compared to the Seq (file+pos) of the insert value + return currentSourceSeqOpt.isPresent() && insertSourceSeq.compareTo(currentSourceSeqOpt.get()) < 0; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java new file mode 100644 index 0000000000000..21fe264e6c8cf --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieDebeziumAvroPayloadException; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Properties; + +/** + * Provides support for seamlessly applying changes captured via Debezium for PostgresDB. + *
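shouldPickCurrentRecord() above keeps the record already in storage only when its _event_seq is strictly newer than the incoming one; the seq is a string built upstream from the binlog file and position, and the comparison is plain lexical ordering. The values in this sketch are placeholders, so read it purely as an illustration of that rule:

    public class MySqlSeqOrderingSketch {
      public static void main(String[] args) {
        String currentSeq = "000003.0000154";  // hypothetical seq of the record already in storage
        String incomingSeq = "000003.0000120"; // hypothetical seq of the incoming change event
        // Keep the stored record only if the incoming event's seq sorts lower (older).
        boolean keepCurrent = incomingSeq.compareTo(currentSeq) < 0;
        System.out.println(keepCurrent); // true => the stale incoming event is ignored
      }
    }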
<p>
+ * Debezium change event types are determined from the op field in the payload
+ * <p>
+ * - For inserts, op=i
+ * - For deletes, op=d
+ * - For updates, op=u
+ * - For snapshot inserts, op=r
+ * <p>
    + * This payload implementation will issue matching insert, delete, updates against the hudi table + */ +public class PostgresDebeziumAvroPayload extends AbstractDebeziumAvroPayload { + + private static final Logger LOG = LogManager.getLogger(PostgresDebeziumAvroPayload.class); + public static final String DEBEZIUM_TOASTED_VALUE = "__debezium_unavailable_value"; + + public PostgresDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public PostgresDebeziumAvroPayload(Option record) { + super(record); + } + + private Option extractLSN(IndexedRecord record) { + Object value = ((GenericRecord) record).get(DebeziumConstants.FLATTENED_LSN_COL_NAME); + return Option.ofNullable(value != null ? (Long) value : null); + } + + @Override + protected boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException { + Long insertSourceLSN = extractLSN(insertRecord) + .orElseThrow(() -> + new HoodieDebeziumAvroPayloadException(String.format("%s cannot be null in insert record: %s", + DebeziumConstants.FLATTENED_LSN_COL_NAME, insertRecord))); + Option currentSourceLSNOpt = extractLSN(currentRecord); + // Pick the current value in storage only if its LSN is latest compared to the LSN of the insert value + return currentSourceLSNOpt.isPresent() && insertSourceLSN < currentSourceLSNOpt.get(); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException { + // Specific to Postgres: If the updated record has TOASTED columns, + // we will need to keep the previous value for those columns + // see https://debezium.io/documentation/reference/connectors/postgresql.html#postgresql-toasted-values + Option insertOrDeleteRecord = super.combineAndGetUpdateValue(currentValue, schema, properties); + + if (insertOrDeleteRecord.isPresent()) { + mergeToastedValuesIfPresent(insertOrDeleteRecord.get(), currentValue); + } + return insertOrDeleteRecord; + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { + // Specific to Postgres: If the updated record has TOASTED columns, + // we will need to keep the previous value for those columns + // see https://debezium.io/documentation/reference/connectors/postgresql.html#postgresql-toasted-values + Option insertOrDeleteRecord = super.combineAndGetUpdateValue(currentValue, schema); + + if (insertOrDeleteRecord.isPresent()) { + mergeToastedValuesIfPresent(insertOrDeleteRecord.get(), currentValue); + } + return insertOrDeleteRecord; + } + + private void mergeToastedValuesIfPresent(IndexedRecord incomingRecord, IndexedRecord currentRecord) { + List fields = incomingRecord.getSchema().getFields(); + + fields.forEach(field -> { + // There are only four avro data types that have unconstrained sizes, which are + // NON-NULLABLE STRING, NULLABLE STRING, NON-NULLABLE BYTES, NULLABLE BYTES + if (((GenericData.Record) incomingRecord).get(field.name()) != null + && (containsStringToastedValues(incomingRecord, field) || containsBytesToastedValues(incomingRecord, field))) { + ((GenericData.Record) incomingRecord).put(field.name(), ((GenericData.Record) currentRecord).get(field.name())); + } + }); + } + + /** + * Returns true if a column is either of type string or a union of one or more strings that contain a debezium toasted value. 
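When Postgres leaves a TOASTed column out of a change event, Debezium ships the sentinel __debezium_unavailable_value instead of the data, and mergeToastedValuesIfPresent() above copies the previously stored value back into the incoming record. A simplified sketch of that carry-forward for a string column; the schema and values are made up:

    import org.apache.avro.Schema;
    import org.apache.avro.SchemaBuilder;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;

    public class ToastedValueMergeSketch {
      static final String TOASTED = "__debezium_unavailable_value";

      public static void main(String[] args) {
        Schema schema = SchemaBuilder.record("row").fields()
            .requiredString("id")
            .requiredString("big_text")
            .endRecord();

        GenericRecord stored = new GenericData.Record(schema);
        stored.put("id", "k1");
        stored.put("big_text", "the real, previously written value");

        GenericRecord incoming = new GenericData.Record(schema);
        incoming.put("id", "k1");
        incoming.put("big_text", TOASTED); // the column was TOASTed, so only the sentinel arrived

        if (TOASTED.equals(String.valueOf(incoming.get("big_text")))) {
          incoming.put("big_text", stored.get("big_text")); // carry the stored value forward
        }
        System.out.println(incoming.get("big_text"));
      }
    }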
+ * + * @param incomingRecord The incoming avro record + * @param field the column of interest + * @return + */ + private boolean containsStringToastedValues(IndexedRecord incomingRecord, Schema.Field field) { + return ((field.schema().getType() == Schema.Type.STRING + || (field.schema().getType() == Schema.Type.UNION && field.schema().getTypes().stream().anyMatch(s -> s.getType() == Schema.Type.STRING))) + // Check length first as an optimization + && ((CharSequence) ((GenericData.Record) incomingRecord).get(field.name())).length() == DEBEZIUM_TOASTED_VALUE.length() + && DEBEZIUM_TOASTED_VALUE.equals(((CharSequence) ((GenericData.Record) incomingRecord).get(field.name())).toString())); + } + + /** + * Returns true if a column is either of type bytes or a union of one or more bytes that contain a debezium toasted value. + * + * @param incomingRecord The incoming avro record + * @param field the column of interest + * @return + */ + private boolean containsBytesToastedValues(IndexedRecord incomingRecord, Schema.Field field) { + return ((field.schema().getType() == Schema.Type.BYTES + || (field.schema().getType() == Schema.Type.UNION && field.schema().getTypes().stream().anyMatch(s -> s.getType() == Schema.Type.BYTES))) + // Check length first as an optimization + && ((ByteBuffer) ((GenericData.Record) incomingRecord).get(field.name())).array().length == DEBEZIUM_TOASTED_VALUE.length() + && DEBEZIUM_TOASTED_VALUE.equals(new String(((ByteBuffer) ((GenericData.Record) incomingRecord).get(field.name())).array(), StandardCharsets.UTF_8))); + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index 7e2dffbfe1362..52e2a11247a95 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -19,14 +19,30 @@ package org.apache.hudi.common.table; import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex; +import org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.OrderedProperties; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieTimelineTimeZone; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.util.BinaryUtil; +import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config; +import org.apache.avro.Schema; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -35,12 +51,18 @@ import org.apache.log4j.Logger; 
import java.io.IOException; -import java.io.Serializable; -import java.util.Date; +import java.time.Instant; +import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; +import java.util.HashSet; +import java.util.function.BiConsumer; import java.util.stream.Collectors; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Configurations on the Hoodie Table like type of ingestion, storage formats, hive table name etc Configurations are loaded from hoodie.properties, these properties are usually set during * initializing a path as hoodie base path and never changes during the lifetime of a hoodie table. @@ -48,119 +70,391 @@ * @see HoodieTableMetaClient * @since 0.3.0 */ -public class HoodieTableConfig implements Serializable { +@ConfigClassProperty(name = "Table Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that persist across writes and read on a Hudi table " + + " like base, log file formats, table name, creation schema, table version layouts. " + + " Configurations are loaded from hoodie.properties, these properties are usually set during " + + "initializing a path as hoodie base path and rarely changes during " + + "the lifetime of the table. Writers/Queries' configurations are validated against these " + + " each time for compatibility.") +public class HoodieTableConfig extends HoodieConfig { private static final Logger LOG = LogManager.getLogger(HoodieTableConfig.class); public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties"; - public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name"; - public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; - public static final String HOODIE_TABLE_VERSION_PROP_NAME = "hoodie.table.version"; - @Deprecated - public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = "hoodie.table.ro.file.format"; - @Deprecated - public static final String HOODIE_RT_FILE_FORMAT_PROP_NAME = "hoodie.table.rt.file.format"; - public static final String HOODIE_BASE_FILE_FORMAT_PROP_NAME = "hoodie.table.base.file.format"; - public static final String HOODIE_LOG_FILE_FORMAT_PROP_NAME = "hoodie.table.log.file.format"; - public static final String HOODIE_TIMELINE_LAYOUT_VERSION = "hoodie.timeline.layout.version"; - public static final String HOODIE_PAYLOAD_CLASS_PROP_NAME = "hoodie.compaction.payload.class"; - public static final String HOODIE_ARCHIVELOG_FOLDER_PROP_NAME = "hoodie.archivelog.folder"; - public static final String HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME = "hoodie.bootstrap.index.class"; - public static final String HOODIE_BOOTSTRAP_BASE_PATH = "hoodie.bootstrap.base.path"; - - public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE; - public static final HoodieTableVersion DEFAULT_TABLE_VERSION = HoodieTableVersion.ZERO; - public static final HoodieFileFormat DEFAULT_BASE_FILE_FORMAT = HoodieFileFormat.PARQUET; - public static final HoodieFileFormat DEFAULT_LOG_FILE_FORMAT = HoodieFileFormat.HOODIE_LOG; - public static final String DEFAULT_PAYLOAD_CLASS = OverwriteWithLatestAvroPayload.class.getName(); - public static final String DEFAULT_BOOTSTRAP_INDEX_CLASS = HFileBootstrapIndex.class.getName(); - public static final String DEFAULT_ARCHIVELOG_FOLDER = ""; - - private Properties props; + public static final String HOODIE_PROPERTIES_FILE_BACKUP = "hoodie.properties.backup"; + public static final String HOODIE_WRITE_TABLE_NAME_KEY = 
"hoodie.datasource.write.table.name"; + public static final String HOODIE_TABLE_NAME_KEY = "hoodie.table.name"; + + public static final ConfigProperty DATABASE_NAME = ConfigProperty + .key("hoodie.database.name") + .noDefaultValue() + .withDocumentation("Database name that will be used for incremental query.If different databases have the same table name during incremental query, " + + "we can set it to limit the table name under a specific database"); + + public static final ConfigProperty NAME = ConfigProperty + .key(HOODIE_TABLE_NAME_KEY) + .noDefaultValue() + .withDocumentation("Table name that will be used for registering with Hive. Needs to be same across runs."); + + public static final ConfigProperty TYPE = ConfigProperty + .key("hoodie.table.type") + .defaultValue(HoodieTableType.COPY_ON_WRITE) + .withDocumentation("The table type for the underlying data, for this write. This can’t change between writes."); + + public static final ConfigProperty VERSION = ConfigProperty + .key("hoodie.table.version") + .defaultValue(HoodieTableVersion.ZERO) + .withDocumentation("Version of table, used for running upgrade/downgrade steps between releases with potentially " + + "breaking/backwards compatible changes."); + + public static final ConfigProperty PRECOMBINE_FIELD = ConfigProperty + .key("hoodie.table.precombine.field") + .noDefaultValue() + .withDocumentation("Field used in preCombining before actual write. By default, when two records have the same key value, " + + "the largest value for the precombine field determined by Object.compareTo(..), is picked."); + + public static final ConfigProperty PARTITION_FIELDS = ConfigProperty + .key("hoodie.table.partition.fields") + .noDefaultValue() + .withDocumentation("Fields used to partition the table. Concatenated values of these fields are used as " + + "the partition path, by invoking toString()"); + + public static final ConfigProperty RECORDKEY_FIELDS = ConfigProperty + .key("hoodie.table.recordkey.fields") + .noDefaultValue() + .withDocumentation("Columns used to uniquely identify the table. 
Concatenated values of these fields are used as " + + " the record key component of HoodieKey."); + + public static final ConfigProperty CREATE_SCHEMA = ConfigProperty + .key("hoodie.table.create.schema") + .noDefaultValue() + .withDocumentation("Schema used when creating the table, for the first time."); + + public static final ConfigProperty BASE_FILE_FORMAT = ConfigProperty + .key("hoodie.table.base.file.format") + .defaultValue(HoodieFileFormat.PARQUET) + .withAlternatives("hoodie.table.ro.file.format") + .withDocumentation("Base file format to store all the base file data."); + + public static final ConfigProperty LOG_FILE_FORMAT = ConfigProperty + .key("hoodie.table.log.file.format") + .defaultValue(HoodieFileFormat.HOODIE_LOG) + .withAlternatives("hoodie.table.rt.file.format") + .withDocumentation("Log format used for the delta logs."); + + public static final ConfigProperty TIMELINE_LAYOUT_VERSION = ConfigProperty + .key("hoodie.timeline.layout.version") + .noDefaultValue() + .withDocumentation("Version of timeline used, by the table."); + + public static final ConfigProperty PAYLOAD_CLASS_NAME = ConfigProperty + .key("hoodie.compaction.payload.class") + .defaultValue(OverwriteWithLatestAvroPayload.class.getName()) + .withDocumentation("Payload class to use for performing compactions, i.e merge delta logs with current base file and then " + + " produce a new base file."); + + public static final ConfigProperty ARCHIVELOG_FOLDER = ConfigProperty + .key("hoodie.archivelog.folder") + .defaultValue("archived") + .withDocumentation("path under the meta folder, to store archived timeline instants at."); + + public static final ConfigProperty BOOTSTRAP_INDEX_ENABLE = ConfigProperty + .key("hoodie.bootstrap.index.enable") + .defaultValue(true) + .withDocumentation("Whether or not, this is a bootstrapped table, with bootstrap base data and an mapping index defined, default true."); + + public static final ConfigProperty BOOTSTRAP_INDEX_CLASS_NAME = ConfigProperty + .key("hoodie.bootstrap.index.class") + .defaultValue(HFileBootstrapIndex.class.getName()) + .withDocumentation("Implementation to use, for mapping base files to bootstrap base file, that contain actual data."); + + public static final ConfigProperty BOOTSTRAP_BASE_PATH = ConfigProperty + .key("hoodie.bootstrap.base.path") + .noDefaultValue() + .withDocumentation("Base path of the dataset that needs to be bootstrapped as a Hudi table"); + + public static final ConfigProperty POPULATE_META_FIELDS = ConfigProperty + .key("hoodie.populate.meta.fields") + .defaultValue(true) + .withDocumentation("When enabled, populates all meta fields. When disabled, no meta fields are populated " + + "and incremental queries will not be functional. This is only meant to be used for append only/immutable data for batch processing"); + + public static final ConfigProperty KEY_GENERATOR_CLASS_NAME = ConfigProperty + .key("hoodie.table.keygenerator.class") + .noDefaultValue() + .withDocumentation("Key Generator class property for the hoodie table"); + + public static final ConfigProperty TIMELINE_TIMEZONE = ConfigProperty + .key("hoodie.table.timeline.timezone") + .defaultValue(HoodieTimelineTimeZone.LOCAL) + .withDocumentation("User can set hoodie commit timeline timezone, such as utc, local and so on. 
local is default"); + + public static final ConfigProperty PARTITION_METAFILE_USE_BASE_FORMAT = ConfigProperty + .key("hoodie.partition.metafile.use.base.format") + .defaultValue(false) + .withDocumentation("If true, partition metafiles are saved in the same format as base-files for this dataset (e.g. Parquet / ORC). " + + "If false (default) partition metafiles are saved as properties files."); + + public static final ConfigProperty DROP_PARTITION_COLUMNS = ConfigProperty + .key("hoodie.datasource.write.drop.partition.columns") + .defaultValue(false) + .withDocumentation("When set to true, will not write the partition columns into hudi. By default, false."); + + public static final ConfigProperty URL_ENCODE_PARTITIONING = KeyGeneratorOptions.URL_ENCODE_PARTITIONING; + public static final ConfigProperty HIVE_STYLE_PARTITIONING_ENABLE = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE; + + public static final List PERSISTED_CONFIG_LIST = Arrays.asList( + Config.DATE_TIME_PARSER_PROP, + Config.INPUT_TIME_UNIT, Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, + Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, + Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, + Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, Config.DATE_TIME_PARSER_PROP + ); + + public static final String NO_OP_BOOTSTRAP_INDEX_CLASS = NoOpBootstrapIndex.class.getName(); + + public static final ConfigProperty TABLE_CHECKSUM = ConfigProperty + .key("hoodie.table.checksum") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Table checksum is used to guard against partial writes in HDFS. It is added as the last entry in hoodie.properties and then used to validate while reading table config."); + + public static final ConfigProperty TABLE_METADATA_PARTITIONS_INFLIGHT = ConfigProperty + .key("hoodie.table.metadata.partitions.inflight") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Comma-separated list of metadata partitions whose building is in progress. " + + "These partitions are not yet ready for use by the readers."); + + public static final ConfigProperty TABLE_METADATA_PARTITIONS = ConfigProperty + .key("hoodie.table.metadata.partitions") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Comma-separated list of metadata partitions that have been completely built and in-sync with data table. " + + "These partitions are ready for use by the readers"); + + private static final String TABLE_CHECKSUM_FORMAT = "%s.%s"; // . public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName) { - Properties props = new Properties(); + super(); Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); LOG.info("Loading table properties from " + propertyPath); try { - try (FSDataInputStream inputStream = fs.open(propertyPath)) { - props.load(inputStream); - } - if (props.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME) && payloadClassName != null - && !props.getProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME).equals(payloadClassName)) { - props.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, payloadClassName); + fetchConfigs(fs, metaPath); + if (contains(PAYLOAD_CLASS_NAME) && payloadClassName != null + && !getString(PAYLOAD_CLASS_NAME).equals(payloadClassName)) { + setValue(PAYLOAD_CLASS_NAME, payloadClassName); + // FIXME(vc): wonder if this can be removed. Need to look into history. 
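The ConfigProperty definitions above pair a key with an optional default, alternatives, and documentation, and HoodieConfig resolves them uniformly. A minimal sketch of that pattern using a hypothetical key that is not a real Hudi config:

    import java.util.Properties;

    import org.apache.hudi.common.config.ConfigProperty;
    import org.apache.hudi.common.config.HoodieConfig;

    public class ConfigPropertySketch {
      // Hypothetical property, declared here only to show the pattern.
      static final ConfigProperty<String> EXAMPLE = ConfigProperty
          .key("hoodie.example.sketch.value")
          .defaultValue("fallback")
          .withDocumentation("Hypothetical property used to illustrate default resolution.");

      public static void main(String[] args) {
        HoodieConfig config = new HoodieConfig(new Properties());
        // Not set explicitly => fall back to the declared default.
        String value = config.contains(EXAMPLE) ? config.getString(EXAMPLE) : EXAMPLE.defaultValue();
        System.out.println(value); // fallback
        config.setValue(EXAMPLE, "explicit");
        System.out.println(config.getString(EXAMPLE)); // explicit
      }
    }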
try (FSDataOutputStream outputStream = fs.create(propertyPath)) { - props.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); + storeProperties(props, outputStream); } } } catch (IOException e) { throw new HoodieIOException("Could not load Hoodie properties from " + propertyPath, e); } - this.props = props; - ValidationUtils.checkArgument(props.containsKey(HOODIE_TABLE_TYPE_PROP_NAME) && props.containsKey(HOODIE_TABLE_NAME_PROP_NAME), + ValidationUtils.checkArgument(contains(TYPE) && contains(NAME), "hoodie.properties file seems invalid. Please check for left over `.updated` files if any, manually copy it to hoodie.properties and retry"); } - public HoodieTableConfig(Properties props) { - this.props = props; + private static Properties getOrderedPropertiesWithTableChecksum(Properties props) { + Properties orderedProps = new OrderedProperties(props); + orderedProps.put(TABLE_CHECKSUM.key(), String.valueOf(generateChecksum(props))); + return orderedProps; } /** - * For serailizing and de-serializing. + * Write the properties to the given output stream and return the table checksum. * - * @deprecated + * @param props - properties to be written + * @param outputStream - output stream to which properties will be written + * @return return the table checksum + * @throws IOException + */ + private static String storeProperties(Properties props, FSDataOutputStream outputStream) throws IOException { + final String checksum; + if (isValidChecksum(props)) { + checksum = props.getProperty(TABLE_CHECKSUM.key()); + props.store(outputStream, "Updated at " + Instant.now()); + } else { + Properties propsWithChecksum = getOrderedPropertiesWithTableChecksum(props); + propsWithChecksum.store(outputStream, "Properties saved on " + Instant.now()); + checksum = propsWithChecksum.getProperty(TABLE_CHECKSUM.key()); + props.setProperty(TABLE_CHECKSUM.key(), checksum); + } + return checksum; + } + + private static boolean isValidChecksum(Properties props) { + return props.containsKey(TABLE_CHECKSUM.key()) && validateChecksum(props); + } + + /** + * For serializing and de-serializing. */ public HoodieTableConfig() { + super(); + } + + private void fetchConfigs(FileSystem fs, String metaPath) throws IOException { + Path cfgPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); + try (FSDataInputStream is = fs.open(cfgPath)) { + props.load(is); + } catch (IOException ioe) { + if (!fs.exists(cfgPath)) { + LOG.warn("Run `table recover-configs` if config update/delete failed midway. Falling back to backed up configs."); + // try the backup. this way no query ever fails if update fails midway. + Path backupCfgPath = new Path(metaPath, HOODIE_PROPERTIES_FILE_BACKUP); + try (FSDataInputStream is = fs.open(backupCfgPath)) { + props.load(is); + } + } else { + throw ioe; + } + } + } + + public static void recover(FileSystem fs, Path metadataFolder) throws IOException { + Path cfgPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); + Path backupCfgPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE_BACKUP); + recoverIfNeeded(fs, cfgPath, backupCfgPath); + } + + static void recoverIfNeeded(FileSystem fs, Path cfgPath, Path backupCfgPath) throws IOException { + if (!fs.exists(cfgPath)) { + // copy over from backup + try (FSDataInputStream in = fs.open(backupCfgPath); + FSDataOutputStream out = fs.create(cfgPath, false)) { + FileIOUtils.copy(in, out); + } + } + // regardless, we don't need the backup anymore. 
+ fs.delete(backupCfgPath, false); + } + + private static void upsertProperties(Properties current, Properties updated) { + updated.forEach((k, v) -> current.setProperty(k.toString(), v.toString())); + } + + private static void deleteProperties(Properties current, Properties deleted) { + deleted.forEach((k, v) -> current.remove(k.toString())); + } + + private static void modify(FileSystem fs, Path metadataFolder, Properties modifyProps, BiConsumer modifyFn) { + Path cfgPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); + Path backupCfgPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE_BACKUP); + try { + // 0. do any recovery from prior attempts. + recoverIfNeeded(fs, cfgPath, backupCfgPath); + + // 1. backup the existing properties. + try (FSDataInputStream in = fs.open(cfgPath); + FSDataOutputStream out = fs.create(backupCfgPath, false)) { + FileIOUtils.copy(in, out); + } + /// 2. delete the properties file, reads will go to the backup, until we are done. + fs.delete(cfgPath, false); + // 3. read current props, upsert and save back. + String checksum; + try (FSDataInputStream in = fs.open(backupCfgPath); + FSDataOutputStream out = fs.create(cfgPath, true)) { + Properties props = new TypedProperties(); + props.load(in); + modifyFn.accept(props, modifyProps); + checksum = storeProperties(props, out); + } + // 4. verify and remove backup. + try (FSDataInputStream in = fs.open(cfgPath)) { + Properties props = new TypedProperties(); + props.load(in); + if (!props.containsKey(TABLE_CHECKSUM.key()) || !props.getProperty(TABLE_CHECKSUM.key()).equals(checksum)) { + // delete the properties file and throw exception indicating update failure + // subsequent writes will recover and update, reads will go to the backup until then + fs.delete(cfgPath, false); + throw new HoodieIOException("Checksum property missing or does not match."); + } + } + fs.delete(backupCfgPath, false); + } catch (IOException e) { + throw new HoodieIOException("Error updating table configs.", e); + } + } + + /** + * Upserts the table config with the set of properties passed in. We implement a fail-safe backup protocol + * here for safely updating with recovery and also ensuring the table config continues to be readable. + */ + public static void update(FileSystem fs, Path metadataFolder, Properties updatedProps) { + modify(fs, metadataFolder, updatedProps, HoodieTableConfig::upsertProperties); + } + + public static void delete(FileSystem fs, Path metadataFolder, Set deletedProps) { + Properties props = new Properties(); + deletedProps.forEach(p -> props.setProperty(p, "")); + modify(fs, metadataFolder, props, HoodieTableConfig::deleteProperties); } /** * Initialize the hoodie meta directory and any necessary files inside the meta (including the hoodie.properties). 
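The update()/delete() pair above runs a backup-then-swap sequence: back up hoodie.properties, rewrite it with a checksum, verify, then drop the backup, so an interrupted update never leaves the table unreadable. A sketch of invoking it, assuming a Hudi table already exists at the hypothetical path below:

    import java.util.Properties;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hudi.common.table.HoodieTableConfig;

    public class TableConfigUpdateSketch {
      public static void main(String[] args) throws Exception {
        // Hypothetical table location; .hoodie must already contain hoodie.properties.
        Path metaFolder = new Path("/tmp/hudi_sketch_table/.hoodie");
        FileSystem fs = metaFolder.getFileSystem(new Configuration());

        Properties updated = new Properties();
        updated.setProperty(HoodieTableConfig.PRECOMBINE_FIELD.key(), "ts");

        // Upserts the property; readers fall back to hoodie.properties.backup if this is interrupted.
        HoodieTableConfig.update(fs, metaFolder, updated);
      }
    }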
*/ - public static void createHoodieProperties(FileSystem fs, Path metadataFolder, Properties properties) + public static void create(FileSystem fs, Path metadataFolder, Properties properties) throws IOException { if (!fs.exists(metadataFolder)) { fs.mkdirs(metadataFolder); } + HoodieConfig hoodieConfig = new HoodieConfig(properties); Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE); try (FSDataOutputStream outputStream = fs.create(propertyPath)) { - if (!properties.containsKey(HOODIE_TABLE_NAME_PROP_NAME)) { - throw new IllegalArgumentException(HOODIE_TABLE_NAME_PROP_NAME + " property needs to be specified"); - } - if (!properties.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { - properties.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name()); + if (!hoodieConfig.contains(NAME)) { + throw new IllegalArgumentException(NAME.key() + " property needs to be specified"); } - if (properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME).equals(HoodieTableType.MERGE_ON_READ.name()) - && !properties.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME)) { - properties.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS); + hoodieConfig.setDefaultValue(TYPE); + if (hoodieConfig.getString(TYPE).equals(HoodieTableType.MERGE_ON_READ.name())) { + hoodieConfig.setDefaultValue(PAYLOAD_CLASS_NAME); } - if (!properties.containsKey(HOODIE_ARCHIVELOG_FOLDER_PROP_NAME)) { - properties.setProperty(HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, DEFAULT_ARCHIVELOG_FOLDER); - } - if (!properties.containsKey(HOODIE_TIMELINE_LAYOUT_VERSION)) { + hoodieConfig.setDefaultValue(ARCHIVELOG_FOLDER); + if (!hoodieConfig.contains(TIMELINE_LAYOUT_VERSION)) { // Use latest Version as default unless forced by client - properties.setProperty(HOODIE_TIMELINE_LAYOUT_VERSION, TimelineLayoutVersion.CURR_VERSION.toString()); + hoodieConfig.setValue(TIMELINE_LAYOUT_VERSION, TimelineLayoutVersion.CURR_VERSION.toString()); } - if (properties.containsKey(HOODIE_BOOTSTRAP_BASE_PATH) && !properties.containsKey(HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME)) { + if (hoodieConfig.contains(BOOTSTRAP_BASE_PATH)) { // Use the default bootstrap index class. - properties.setProperty(HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME, DEFAULT_BOOTSTRAP_INDEX_CLASS); + hoodieConfig.setDefaultValue(BOOTSTRAP_INDEX_CLASS_NAME, getDefaultBootstrapIndexClass(properties)); + } + if (hoodieConfig.contains(TIMELINE_TIMEZONE)) { + HoodieInstantTimeGenerator.setCommitTimeZone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getString(TIMELINE_TIMEZONE))); } - properties.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); + + hoodieConfig.setDefaultValue(DROP_PARTITION_COLUMNS); + + storeProperties(hoodieConfig.getProps(), outputStream); + } + } + + public static long generateChecksum(Properties props) { + if (!props.containsKey(NAME.key())) { + throw new IllegalArgumentException(NAME.key() + " property needs to be specified"); } + String table = props.getProperty(NAME.key()); + String database = props.getProperty(DATABASE_NAME.key(), ""); + return BinaryUtil.generateChecksum(String.format(TABLE_CHECKSUM_FORMAT, database, table).getBytes(UTF_8)); + } + + public static boolean validateChecksum(Properties props) { + return Long.parseLong(props.getProperty(TABLE_CHECKSUM.key())) == generateChecksum(props); } /** * Read the table type from the table properties and if not found, return the default. 
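generateChecksum() and validateChecksum() above derive hoodie.table.checksum from the database and table names, which is how a partially written hoodie.properties gets detected on read. A sketch with made-up names:

    import java.util.Properties;

    import org.apache.hudi.common.table.HoodieTableConfig;

    public class TableChecksumSketch {
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty(HoodieTableConfig.NAME.key(), "trips");
        props.setProperty(HoodieTableConfig.DATABASE_NAME.key(), "demo");

        long checksum = HoodieTableConfig.generateChecksum(props);
        props.setProperty(HoodieTableConfig.TABLE_CHECKSUM.key(), String.valueOf(checksum));

        // A stored checksum that no longer matches is treated as a partial write.
        System.out.println(HoodieTableConfig.validateChecksum(props)); // true
      }
    }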
*/ public HoodieTableType getTableType() { - if (props.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) { - return HoodieTableType.valueOf(props.getProperty(HOODIE_TABLE_TYPE_PROP_NAME)); - } - return DEFAULT_TABLE_TYPE; + return HoodieTableType.valueOf(getStringOrDefault(TYPE)); } public Option getTimelineLayoutVersion() { - return props.containsKey(HOODIE_TIMELINE_LAYOUT_VERSION) - ? Option.of(new TimelineLayoutVersion(Integer.valueOf(props.getProperty(HOODIE_TIMELINE_LAYOUT_VERSION)))) + return contains(TIMELINE_LAYOUT_VERSION) + ? Option.of(new TimelineLayoutVersion(getInt(TIMELINE_LAYOUT_VERSION))) : Option.empty(); } @@ -168,13 +462,13 @@ public Option getTimelineLayoutVersion() { * @return the hoodie.table.version from hoodie.properties file. */ public HoodieTableVersion getTableVersion() { - return props.containsKey(HOODIE_TABLE_VERSION_PROP_NAME) - ? HoodieTableVersion.versionFromCode(Integer.parseInt(props.getProperty(HOODIE_TABLE_VERSION_PROP_NAME))) - : DEFAULT_TABLE_VERSION; + return contains(VERSION) + ? HoodieTableVersion.versionFromCode(getInt(VERSION)) + : VERSION.defaultValue(); } public void setTableVersion(HoodieTableVersion tableVersion) { - props.put(HOODIE_TABLE_VERSION_PROP_NAME, Integer.toString(tableVersion.versionCode())); + setValue(VERSION, Integer.toString(tableVersion.versionCode())); } /** @@ -183,28 +477,85 @@ public void setTableVersion(HoodieTableVersion tableVersion) { public String getPayloadClass() { // There could be tables written with payload class from com.uber.hoodie. Need to transparently // change to org.apache.hudi - return props.getProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS).replace("com.uber.hoodie", + return getStringOrDefault(PAYLOAD_CLASS_NAME).replace("com.uber.hoodie", "org.apache.hudi"); } + public String getPreCombineField() { + return getString(PRECOMBINE_FIELD); + } + + public Option getRecordKeyFields() { + String keyFieldsValue = getStringOrDefault(RECORDKEY_FIELDS, HoodieRecord.RECORD_KEY_METADATA_FIELD); + return Option.of(Arrays.stream(keyFieldsValue.split(",")) + .filter(p -> p.length() > 0).collect(Collectors.toList()).toArray(new String[] {})); + } + + public Option getPartitionFields() { + if (contains(PARTITION_FIELDS)) { + return Option.of(Arrays.stream(getString(PARTITION_FIELDS).split(",")) + .filter(p -> p.length() > 0).collect(Collectors.toList()).toArray(new String[] {})); + } + return Option.empty(); + } + + public boolean isTablePartitioned() { + return getPartitionFields().map(pfs -> pfs.length > 0).orElse(false); + } + + /** + * @returns the partition field prop. + * @deprecated please use {@link #getPartitionFields()} instead + */ + @Deprecated + public String getPartitionFieldProp() { + // NOTE: We're adding a stub returning empty string to stay compatible w/ pre-existing + // behavior until this method is fully deprecated + return Option.ofNullable(getString(PARTITION_FIELDS)).orElse(""); + } + /** * Read the payload class for HoodieRecords from the table properties. */ public String getBootstrapIndexClass() { // There could be tables written with payload class from com.uber.hoodie. 
Need to transparently // change to org.apache.hudi - return props.getProperty(HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME, DEFAULT_BOOTSTRAP_INDEX_CLASS); + return getStringOrDefault(BOOTSTRAP_INDEX_CLASS_NAME, getDefaultBootstrapIndexClass(props)); + } + + public static String getDefaultBootstrapIndexClass(Properties props) { + HoodieConfig hoodieConfig = new HoodieConfig(props); + String defaultClass = BOOTSTRAP_INDEX_CLASS_NAME.defaultValue(); + if (!hoodieConfig.getBooleanOrDefault(BOOTSTRAP_INDEX_ENABLE)) { + defaultClass = NO_OP_BOOTSTRAP_INDEX_CLASS; + } + return defaultClass; } public Option getBootstrapBasePath() { - return Option.ofNullable(props.getProperty(HOODIE_BOOTSTRAP_BASE_PATH)); + return Option.ofNullable(getString(BOOTSTRAP_BASE_PATH)); + } + + public Option getTableCreateSchema() { + if (contains(CREATE_SCHEMA)) { + return Option.of(new Schema.Parser().parse(getString(CREATE_SCHEMA))); + } else { + return Option.empty(); + } + } + + /** + * Read the database name. + */ + public String getDatabaseName() { + return getString(DATABASE_NAME); } /** * Read the table name. */ public String getTableName() { - return props.getProperty(HOODIE_TABLE_NAME_PROP_NAME); + return getString(NAME); } /** @@ -213,13 +564,7 @@ public String getTableName() { * @return HoodieFileFormat for the base file Storage format */ public HoodieFileFormat getBaseFileFormat() { - if (props.containsKey(HOODIE_BASE_FILE_FORMAT_PROP_NAME)) { - return HoodieFileFormat.valueOf(props.getProperty(HOODIE_BASE_FILE_FORMAT_PROP_NAME)); - } - if (props.containsKey(HOODIE_RO_FILE_FORMAT_PROP_NAME)) { - return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RO_FILE_FORMAT_PROP_NAME)); - } - return DEFAULT_BASE_FILE_FORMAT; + return HoodieFileFormat.valueOf(getStringOrDefault(BASE_FILE_FORMAT)); } /** @@ -228,28 +573,179 @@ public HoodieFileFormat getBaseFileFormat() { * @return HoodieFileFormat for the log Storage format */ public HoodieFileFormat getLogFileFormat() { - if (props.containsKey(HOODIE_LOG_FILE_FORMAT_PROP_NAME)) { - return HoodieFileFormat.valueOf(props.getProperty(HOODIE_LOG_FILE_FORMAT_PROP_NAME)); - } - if (props.containsKey(HOODIE_RT_FILE_FORMAT_PROP_NAME)) { - return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RT_FILE_FORMAT_PROP_NAME)); - } - return DEFAULT_LOG_FILE_FORMAT; + return HoodieFileFormat.valueOf(getStringOrDefault(LOG_FILE_FORMAT)); } /** * Get the relative path of archive log folder under metafolder, for this table. */ public String getArchivelogFolder() { - return props.getProperty(HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, DEFAULT_ARCHIVELOG_FOLDER); + return getStringOrDefault(ARCHIVELOG_FOLDER); + } + + /** + * @returns true is meta fields need to be populated. else returns false. + */ + public boolean populateMetaFields() { + return Boolean.parseBoolean(getStringOrDefault(POPULATE_META_FIELDS)); + } + + /** + * @returns the record key field prop. + */ + public String getRecordKeyFieldProp() { + return getStringOrDefault(RECORDKEY_FIELDS, HoodieRecord.RECORD_KEY_METADATA_FIELD); + } + + public String getKeyGeneratorClassName() { + return getString(KEY_GENERATOR_CLASS_NAME); + } + + public String getHiveStylePartitioningEnable() { + return getStringOrDefault(HIVE_STYLE_PARTITIONING_ENABLE); + } + + public String getUrlEncodePartitioning() { + return getStringOrDefault(URL_ENCODE_PARTITIONING); + } + + public Boolean shouldDropPartitionColumns() { + return getBooleanOrDefault(DROP_PARTITION_COLUMNS); } - public Map getProps() { + /** + * Read the table checksum. 
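getTableCreateSchema() above rehydrates the schema stored at table-creation time with Avro's Schema.Parser. A tiny standalone example of that parse step; the JSON literal is made up:

    import org.apache.avro.Schema;

    public class CreateSchemaParseSketch {
      public static void main(String[] args) {
        String storedJson = "{\"type\":\"record\",\"name\":\"trips\",\"fields\":"
            + "[{\"name\":\"id\",\"type\":\"string\"},{\"name\":\"ts\",\"type\":\"long\"}]}";
        Schema schema = new Schema.Parser().parse(storedJson);
        System.out.println(schema.getField("ts").schema().getType()); // LONG
      }
    }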
+ */ + private Long getTableChecksum() { + return getLong(TABLE_CHECKSUM); + } + + public List getMetadataPartitionsInflight() { + return StringUtils.split( + getStringOrDefault(TABLE_METADATA_PARTITIONS_INFLIGHT, StringUtils.EMPTY_STRING), + CONFIG_VALUES_DELIMITER + ); + } + + public Set getMetadataPartitions() { + return new HashSet<>( + StringUtils.split(getStringOrDefault(TABLE_METADATA_PARTITIONS, StringUtils.EMPTY_STRING), + CONFIG_VALUES_DELIMITER)); + } + + /** + * Returns the format to use for partition meta files. + */ + public Option getPartitionMetafileFormat() { + if (getBooleanOrDefault(PARTITION_METAFILE_USE_BASE_FORMAT)) { + return Option.of(getBaseFileFormat()); + } + return Option.empty(); + } + + public Map propsMap() { return props.entrySet().stream() .collect(Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue()))); } - public Properties getProperties() { - return props; - } + /** + * @deprecated Use {@link #BASE_FILE_FORMAT} and its methods. + */ + @Deprecated + public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = "hoodie.table.ro.file.format"; + /** + * @deprecated Use {@link #LOG_FILE_FORMAT} and its methods. + */ + @Deprecated + public static final String HOODIE_RT_FILE_FORMAT_PROP_NAME = "hoodie.table.rt.file.format"; + /** + * @deprecated Use {@link #NAME} and its methods. + */ + @Deprecated + public static final String HOODIE_TABLE_NAME_PROP_NAME = NAME.key(); + /** + * @deprecated Use {@link #TYPE} and its methods. + */ + @Deprecated + public static final String HOODIE_TABLE_TYPE_PROP_NAME = TYPE.key(); + /** + * @deprecated Use {@link #VERSION} and its methods. + */ + @Deprecated + public static final String HOODIE_TABLE_VERSION_PROP_NAME = VERSION.key(); + /** + * @deprecated Use {@link #PRECOMBINE_FIELD} and its methods. + */ + @Deprecated + public static final String HOODIE_TABLE_PRECOMBINE_FIELD = PRECOMBINE_FIELD.key(); + /** + * @deprecated Use {@link #BASE_FILE_FORMAT} and its methods. + */ + @Deprecated + public static final String HOODIE_BASE_FILE_FORMAT_PROP_NAME = BASE_FILE_FORMAT.key(); + /** + * @deprecated Use {@link #LOG_FILE_FORMAT} and its methods. + */ + @Deprecated + public static final String HOODIE_LOG_FILE_FORMAT_PROP_NAME = LOG_FILE_FORMAT.key(); + /** + * @deprecated Use {@link #TIMELINE_LAYOUT_VERSION} and its methods. + */ + @Deprecated + public static final String HOODIE_TIMELINE_LAYOUT_VERSION = TIMELINE_LAYOUT_VERSION.key(); + /** + * @deprecated Use {@link #PAYLOAD_CLASS_NAME} and its methods. + */ + @Deprecated + public static final String HOODIE_PAYLOAD_CLASS_PROP_NAME = PAYLOAD_CLASS_NAME.key(); + /** + * @deprecated Use {@link #ARCHIVELOG_FOLDER} and its methods. + */ + @Deprecated + public static final String HOODIE_ARCHIVELOG_FOLDER_PROP_NAME = ARCHIVELOG_FOLDER.key(); + /** + * @deprecated Use {@link #BOOTSTRAP_INDEX_CLASS_NAME} and its methods. + */ + @Deprecated + public static final String HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME = BOOTSTRAP_INDEX_CLASS_NAME.key(); + /** + * @deprecated Use {@link #BOOTSTRAP_BASE_PATH} and its methods. + */ + @Deprecated + public static final String HOODIE_BOOTSTRAP_BASE_PATH = BOOTSTRAP_BASE_PATH.key(); + /** + * @deprecated Use {@link #TYPE} and its methods. + */ + @Deprecated + public static final HoodieTableType DEFAULT_TABLE_TYPE = TYPE.defaultValue(); + /** + * @deprecated Use {@link #VERSION} and its methods. 
+ */ + @Deprecated + public static final HoodieTableVersion DEFAULT_TABLE_VERSION = VERSION.defaultValue(); + /** + * @deprecated Use {@link #BASE_FILE_FORMAT} and its methods. + */ + @Deprecated + public static final HoodieFileFormat DEFAULT_BASE_FILE_FORMAT = BASE_FILE_FORMAT.defaultValue(); + /** + * @deprecated Use {@link #LOG_FILE_FORMAT} and its methods. + */ + @Deprecated + public static final HoodieFileFormat DEFAULT_LOG_FILE_FORMAT = LOG_FILE_FORMAT.defaultValue(); + /** + * @deprecated Use {@link #PAYLOAD_CLASS_NAME} and its methods. + */ + @Deprecated + public static final String DEFAULT_PAYLOAD_CLASS = PAYLOAD_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #BOOTSTRAP_INDEX_CLASS_NAME} and its methods. + */ + @Deprecated + public static final String DEFAULT_BOOTSTRAP_INDEX_CLASS = BOOTSTRAP_INDEX_CLASS_NAME.defaultValue(); + /** + * @deprecated Use {@link #ARCHIVELOG_FOLDER} and its methods. + */ + @Deprecated + public static final String DEFAULT_ARCHIVELOG_FOLDER = ARCHIVELOG_FOLDER.defaultValue(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index 2e8857b2f5252..16dd373486f61 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -18,13 +18,19 @@ package org.apache.hudi.common.table; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.HoodieMetastoreConfig; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FailSafeConsistencyGuard; +import org.apache.hudi.common.fs.FileSystemRetryConfig; +import org.apache.hudi.common.fs.HoodieRetryWrapperFileSystem; import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.fs.NoOpConsistencyGuard; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieTimelineTimeZone; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -33,9 +39,13 @@ import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.TableNotFoundException; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.hadoop.SerializablePath; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -45,11 +55,11 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.File; import java.io.IOException; import java.io.Serializable; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Properties; import java.util.Set; @@ -72,61 +82,51 @@ public class HoodieTableMetaClient implements Serializable { private static final long serialVersionUID = 1L; private static final Logger LOG = 
LogManager.getLogger(HoodieTableMetaClient.class); public static final String METAFOLDER_NAME = ".hoodie"; - public static final String TEMPFOLDER_NAME = METAFOLDER_NAME + File.separator + ".temp"; - public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + File.separator + ".aux"; - public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + File.separator + ".bootstrap"; - + public static final String TEMPFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".temp"; + public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux"; + public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap"; + public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat"; + public static final String METADATA_TABLE_FOLDER_PATH = METAFOLDER_NAME + Path.SEPARATOR + "metadata"; + public static final String HASHING_METADATA_FOLDER_NAME = ".bucket_index" + Path.SEPARATOR + "consistent_hashing_metadata"; public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH - + File.separator + ".partitions"; - public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + File.separator + + Path.SEPARATOR + ".partitions"; + public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR + ".fileids"; + public static final String SCHEMA_FOLDER_NAME = ".schema"; + public static final String MARKER_EXTN = ".marker"; - private String basePath; + // NOTE: Since those two parameters lay on the hot-path of a lot of computations, we + // use tailored extension of the {@code Path} class allowing to avoid repetitive + // computations secured by its immutability + protected SerializablePath basePath; + protected SerializablePath metaPath; + private transient HoodieWrapperFileSystem fs; - private String metaPath; private boolean loadActiveTimelineOnLoad; - private SerializableConfiguration hadoopConf; + protected SerializableConfiguration hadoopConf; private HoodieTableType tableType; private TimelineLayoutVersion timelineLayoutVersion; - private HoodieTableConfig tableConfig; - private HoodieActiveTimeline activeTimeline; + protected HoodieTableConfig tableConfig; + protected HoodieActiveTimeline activeTimeline; private HoodieArchivedTimeline archivedTimeline; private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); + private FileSystemRetryConfig fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().build(); + protected HoodieMetastoreConfig metastoreConfig; - public HoodieTableMetaClient(Configuration conf, String basePath) { - // Do not load any timeline by default - this(conf, basePath, false); - } - - public HoodieTableMetaClient(Configuration conf, String basePath, String payloadClassName) { - this(conf, basePath, false, ConsistencyGuardConfig.newBuilder().build(), Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION), - payloadClassName); - } - - public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, - ConsistencyGuardConfig consistencyGuardConfig, Option layoutVersion) { - this(conf, basePath, loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, null); - } - - public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad) { - this(conf, basePath, loadActiveTimelineOnLoad, 
ConsistencyGuardConfig.newBuilder().build(), Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION), null); - } - - public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, - ConsistencyGuardConfig consistencyGuardConfig, Option layoutVersion, - String payloadClassName) { + protected HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, + ConsistencyGuardConfig consistencyGuardConfig, Option layoutVersion, + String payloadClassName, FileSystemRetryConfig fileSystemRetryConfig) { LOG.info("Loading HoodieTableMetaClient from " + basePath); - this.basePath = basePath; this.consistencyGuardConfig = consistencyGuardConfig; + this.fileSystemRetryConfig = fileSystemRetryConfig; this.hadoopConf = new SerializableConfiguration(conf); - Path basePathDir = new Path(this.basePath); - this.metaPath = new Path(basePath, METAFOLDER_NAME).toString(); - Path metaPathDir = new Path(this.metaPath); + this.basePath = new SerializablePath(new CachingPath(basePath)); + this.metaPath = new SerializablePath(new CachingPath(basePath, METAFOLDER_NAME)); this.fs = getFs(); - TableNotFoundException.checkTableValidity(fs, basePathDir, metaPathDir); - this.tableConfig = new HoodieTableConfig(fs, metaPath, payloadClassName); + TableNotFoundException.checkTableValidity(fs, this.basePath.get(), metaPath.get()); + this.tableConfig = new HoodieTableConfig(fs, metaPath.toString(), payloadClassName); this.tableType = tableConfig.getTableType(); Option tableConfigVersion = tableConfig.getTimelineLayoutVersion(); if (layoutVersion.isPresent() && tableConfigVersion.isPresent()) { @@ -150,12 +150,18 @@ public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadAc * * @deprecated */ - public HoodieTableMetaClient() {} + public HoodieTableMetaClient() { + } public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) { - return new HoodieTableMetaClient(oldMetaClient.hadoopConf.get(), oldMetaClient.basePath, - oldMetaClient.loadActiveTimelineOnLoad, oldMetaClient.consistencyGuardConfig, - Option.of(oldMetaClient.timelineLayoutVersion), null); + return HoodieTableMetaClient.builder() + .setConf(oldMetaClient.hadoopConf.get()) + .setBasePath(oldMetaClient.basePath.toString()) + .setLoadActiveTimelineOnLoad(oldMetaClient.loadActiveTimelineOnLoad) + .setConsistencyGuardConfig(oldMetaClient.consistencyGuardConfig) + .setLayoutVersion(Option.of(oldMetaClient.timelineLayoutVersion)) + .setPayloadClassName(null) + .setFileSystemRetryConfig(oldMetaClient.fileSystemRetryConfig).build(); } /** @@ -165,18 +171,28 @@ public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) */ private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); - fs = null; // will be lazily inited + + fs = null; // will be lazily initialized } private void writeObject(java.io.ObjectOutputStream out) throws IOException { out.defaultWriteObject(); } + /** + * Returns base path of the table + */ + public Path getBasePathV2() { + return basePath.get(); + } + /** * @return Base path + * @deprecated please use {@link #getBasePathV2()} */ + @Deprecated public String getBasePath() { - return basePath; + return basePath.get().toString(); // this invocation is cached } /** @@ -190,14 +206,28 @@ public HoodieTableType getTableType() { * @return Meta path */ public String getMetaPath() { - return metaPath; + return metaPath.get().toString(); // this invocation is cached + } + + /** 
+ * @return schema folder path + */ + public String getSchemaFolderName() { + return new Path(metaPath.get(), SCHEMA_FOLDER_NAME).toString(); + } + + /** + * @return Hashing metadata base path + */ + public String getHashingMetadataPath() { + return new Path(metaPath.get(), HASHING_METADATA_FOLDER_NAME).toString(); } /** * @return Temp Folder path */ public String getTempFolderPath() { - return basePath + File.separator + TEMPFOLDER_NAME; + return basePath + Path.SEPARATOR + TEMPFOLDER_NAME; } /** @@ -207,28 +237,35 @@ public String getTempFolderPath() { * @return */ public String getMarkerFolderPath(String instantTs) { - return String.format("%s%s%s", getTempFolderPath(), File.separator, instantTs); + return String.format("%s%s%s", getTempFolderPath(), Path.SEPARATOR, instantTs); } /** * @return Auxiliary Meta path */ public String getMetaAuxiliaryPath() { - return basePath + File.separator + AUXILIARYFOLDER_NAME; + return basePath + Path.SEPARATOR + AUXILIARYFOLDER_NAME; + } + + /** + * @return Heartbeat folder path. + */ + public static String getHeartbeatFolderPath(String basePath) { + return String.format("%s%s%s", basePath, Path.SEPARATOR, HEARTBEAT_FOLDER_NAME); } /** * @return Bootstrap Index By Partition Folder */ public String getBootstrapIndexByPartitionFolderPath() { - return basePath + File.separator + BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH; + return basePath + Path.SEPARATOR + BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH; } /** * @return Bootstrap Index By Hudi File Id Folder */ public String getBootstrapIndexByFileIdFolderNameFolderPath() { - return basePath + File.separator + BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH; + return basePath + Path.SEPARATOR + BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH; } /** @@ -236,11 +273,7 @@ public String getBootstrapIndexByFileIdFolderNameFolderPath() { */ public String getArchivePath() { String archiveFolder = tableConfig.getArchivelogFolder(); - if (archiveFolder.equals(HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER)) { - return getMetaPath(); - } else { - return getMetaPath() + "/" + archiveFolder; - } + return getMetaPath() + Path.SEPARATOR + archiveFolder; } /** @@ -259,7 +292,15 @@ public TimelineLayoutVersion getTimelineLayoutVersion() { */ public HoodieWrapperFileSystem getFs() { if (fs == null) { - FileSystem fileSystem = FSUtils.getFs(metaPath, hadoopConf.newCopy()); + FileSystem fileSystem = FSUtils.getFs(metaPath.get(), hadoopConf.newCopy()); + + if (fileSystemRetryConfig.isFileSystemActionRetryEnable()) { + fileSystem = new HoodieRetryWrapperFileSystem(fileSystem, + fileSystemRetryConfig.getMaxRetryIntervalMs(), + fileSystemRetryConfig.getMaxRetryNumbers(), + fileSystemRetryConfig.getInitialRetryIntervalMs(), + fileSystemRetryConfig.getRetryExceptions()); + } ValidationUtils.checkArgument(!(fileSystem instanceof HoodieWrapperFileSystem), "File System not expected to be that of HoodieWrapperFileSystem"); fs = new HoodieWrapperFileSystem(fileSystem, @@ -270,6 +311,10 @@ public HoodieWrapperFileSystem getFs() { return fs; } + public void setFs(HoodieWrapperFileSystem fs) { + this.fs = fs; + } + /** * Return raw file-system. * @@ -309,11 +354,15 @@ public ConsistencyGuardConfig getConsistencyGuardConfig() { return consistencyGuardConfig; } + public FileSystemRetryConfig getFileSystemRetryConfig() { + return fileSystemRetryConfig; + } + /** * Get the archived commits as a timeline. This is costly operation, as all data from the archived files are read. * This should not be used, unless for historical debugging purposes. 
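As a hedged sketch of how the retry wrapping inside getFs() above surfaces to callers (the wrapper class and method below are hypothetical; only APIs visible in this patch are used):

    import java.io.IOException;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;

    import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
    import org.apache.hudi.common.table.HoodieTableMetaClient;

    class RetryingListingExample {
      static FileStatus[] listMetaFolder(HoodieTableMetaClient metaClient) throws IOException {
        // getFs() lazily builds the wrapper: if the FileSystemRetryConfig enables retries,
        // the raw FileSystem is first wrapped in HoodieRetryWrapperFileSystem, so transient
        // failures on calls such as listStatus are retried with the configured backoff.
        HoodieWrapperFileSystem fs = metaClient.getFs();
        return fs.listStatus(new Path(metaClient.getMetaPath()));
      }
    }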
* - * @return Active commit timeline + * @return Archived commit timeline */ public synchronized HoodieArchivedTimeline getArchivedTimeline() { if (archivedTimeline == null) { @@ -322,78 +371,51 @@ public synchronized HoodieArchivedTimeline getArchivedTimeline() { return archivedTimeline; } - /** - * Helper method to initialize a table, with given basePath, tableType, name, archiveFolder, payloadClass and - * base file format. - */ - public static HoodieTableMetaClient initTableTypeWithBootstrap(Configuration hadoopConf, String basePath, HoodieTableType tableType, - String tableName, String archiveLogFolder, String payloadClassName, - String baseFileFormat, String bootstrapIndexClass, - String bootstrapBasePath) throws IOException { - return initTableType(hadoopConf, basePath, tableType, tableName, - archiveLogFolder, payloadClassName, null, baseFileFormat, bootstrapIndexClass, bootstrapBasePath); - } - - public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType, - String tableName, String archiveLogFolder, String payloadClassName, - String baseFileFormat) throws IOException { - return initTableType(hadoopConf, basePath, tableType, tableName, - archiveLogFolder, payloadClassName, null, baseFileFormat, null, null); + public HoodieMetastoreConfig getMetastoreConfig() { + if (metastoreConfig == null) { + metastoreConfig = new HoodieMetastoreConfig(); + } + return metastoreConfig; } /** - * Used primarily by tests, examples. + * Returns fresh new archived commits as a timeline from startTs (inclusive). + * + *

This is a costly operation if a very early startTs is specified. + * Be cautious to use this only when the time range is short. + * + *
    This method is not thread safe. + * + * @return Archived commit timeline */ - public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType, - String tableName, String payloadClassName) throws IOException { - return initTableType(hadoopConf, basePath, tableType, tableName, null, payloadClassName, - null, null, null, null); + public HoodieArchivedTimeline getArchivedTimeline(String startTs) { + return new HoodieArchivedTimeline(this, startTs); } - public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType, - String tableName, String archiveLogFolder, String payloadClassName, - Integer timelineLayoutVersion) throws IOException { - return initTableType(hadoopConf, basePath, tableType, tableName, archiveLogFolder, payloadClassName, - timelineLayoutVersion, null, null, null); - } - - private static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType, - String tableName, String archiveLogFolder, String payloadClassName, - Integer timelineLayoutVersion, String baseFileFormat, - String bootstrapIndexClass, String bootstrapBasePath) throws IOException { - Properties properties = new Properties(); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_VERSION_PROP_NAME, String.valueOf(HoodieTableVersion.current().versionCode())); - if (tableType == HoodieTableType.MERGE_ON_READ && payloadClassName != null) { - properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, payloadClassName); - } - - if (null != archiveLogFolder) { - properties.put(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, archiveLogFolder); - } - - if (null != timelineLayoutVersion) { - properties.put(HoodieTableConfig.HOODIE_TIMELINE_LAYOUT_VERSION, String.valueOf(timelineLayoutVersion)); - } - - if (null != baseFileFormat) { - properties.setProperty(HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP_NAME, baseFileFormat.toUpperCase()); - } - - if (null != bootstrapIndexClass) { - properties.put(HoodieTableConfig.HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME, bootstrapIndexClass); + /** + * Validate table properties. + * @param properties Properties from writeConfig. + */ + public void validateTableProperties(Properties properties) { + // Once meta fields are disabled, it cant be re-enabled for a given table. + if (!getTableConfig().populateMetaFields() + && Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue().toString()))) { + throw new HoodieException(HoodieTableConfig.POPULATE_META_FIELDS.key() + " already disabled for the table. 
Can't be re-enabled back"); } - if (null != bootstrapBasePath) { - properties.put(HoodieTableConfig.HOODIE_BOOTSTRAP_BASE_PATH, bootstrapBasePath); + // Meta fields can be disabled only when either {@code SimpleKeyGenerator}, {@code ComplexKeyGenerator}, {@code NonpartitionedKeyGenerator} is used + if (!getTableConfig().populateMetaFields()) { + String keyGenClass = properties.getProperty(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key(), "org.apache.hudi.keygen.SimpleKeyGenerator"); + if (!keyGenClass.equals("org.apache.hudi.keygen.SimpleKeyGenerator") + && !keyGenClass.equals("org.apache.hudi.keygen.NonpartitionedKeyGenerator") + && !keyGenClass.equals("org.apache.hudi.keygen.ComplexKeyGenerator")) { + throw new HoodieException("Only simple, non-partitioned or complex key generator are supported when meta-fields are disabled. Used: " + keyGenClass); + } } - - return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties); } /** - * Helper method to initialize a given path as a hoodie table with configs passed in as as Properties. + * Helper method to initialize a given path as a hoodie table with configs passed in as Properties. * * @return Instance of HoodieTableMetaClient */ @@ -409,11 +431,15 @@ public static HoodieTableMetaClient initTableAndGetMetaClient(Configuration hado if (!fs.exists(metaPathDir)) { fs.mkdirs(metaPathDir); } + // create schema folder + Path schemaPathDir = new Path(metaPathDir, SCHEMA_FOLDER_NAME); + if (!fs.exists(schemaPathDir)) { + fs.mkdirs(schemaPathDir); + } // if anything other than default archive log folder is specified, create that too - String archiveLogPropVal = props.getProperty(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, - HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER); - if (!archiveLogPropVal.equals(HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER)) { + String archiveLogPropVal = new HoodieConfig(props).getStringOrDefault(HoodieTableConfig.ARCHIVELOG_FOLDER); + if (!StringUtils.isNullOrEmpty(archiveLogPropVal)) { Path archiveLogDir = new Path(metaPathDir, archiveLogPropVal); if (!fs.exists(archiveLogDir)) { fs.mkdirs(archiveLogDir); @@ -433,16 +459,16 @@ public static HoodieTableMetaClient initTableAndGetMetaClient(Configuration hado } initializeBootstrapDirsIfNotExists(hadoopConf, basePath, fs); - HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props); + HoodieTableConfig.create(fs, metaPathDir, props); // We should not use fs.getConf as this might be different from the original configuration // used to create the fs in unit tests - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath) + .setProperties(props).build(); LOG.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() + " from " + basePath); return metaClient; } - public static void initializeBootstrapDirsIfNotExists(Configuration hadoopConf, - String basePath, FileSystem fs) throws IOException { + public static void initializeBootstrapDirsIfNotExists(Configuration hadoopConf, String basePath, FileSystem fs) throws IOException { // Create bootstrap index by partition folder if it does not exist final Path bootstrap_index_folder_by_partition = @@ -473,6 +499,13 @@ public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter na return fs.listStatus(metaPath, nameFilter); } + /** + * @return {@code true} if any commits are found, else {@code false}. 
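To make the intent of validateTableProperties above concrete, a small hedged sketch (the class and variable names are hypothetical):

    import java.util.Properties;

    import org.apache.hudi.common.table.HoodieTableConfig;
    import org.apache.hudi.common.table.HoodieTableMetaClient;

    class ValidateTablePropertiesExample {
      static void check(HoodieTableMetaClient metaClient) {
        Properties writeProps = new Properties();
        writeProps.setProperty(HoodieTableConfig.POPULATE_META_FIELDS.key(), "true");
        // Throws HoodieException if the table was created with meta fields disabled, since
        // they cannot be re-enabled on an existing table; when they stay disabled, the check
        // above also restricts the key generator to simple, complex or non-partitioned.
        metaClient.validateTableProperties(writeProps);
      }
    }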
+ */ + public boolean isTimelineNonEmpty() { + return getCommitsTimeline().filterCompletedInstants().getInstants().collect(Collectors.toList()).size() > 0; + } + /** * Get the commit timeline visible for this table. */ @@ -501,7 +534,7 @@ public HoodieTimeline getCommitsAndCompactionTimeline() { case COPY_ON_WRITE: return getActiveTimeline().getCommitTimeline(); case MERGE_ON_READ: - return getActiveTimeline().getCommitsAndCompactionTimeline(); + return getActiveTimeline().getWriteTimeline(); default: throw new HoodieException("Unsupported table type :" + this.getTableType()); } @@ -539,7 +572,7 @@ public String getCommitActionType() { */ public List scanHoodieInstantsFromFileSystem(Set includedExtensions, boolean applyLayoutVersionFilters) throws IOException { - return scanHoodieInstantsFromFileSystem(new Path(metaPath), includedExtensions, applyLayoutVersionFilters); + return scanHoodieInstantsFromFileSystem(metaPath.get(), includedExtensions, applyLayoutVersionFilters); } /** @@ -596,18 +629,462 @@ public String toString() { } public void initializeBootstrapDirsIfNotExists() throws IOException { - initializeBootstrapDirsIfNotExists(getHadoopConf(), basePath, getFs()); + initializeBootstrapDirsIfNotExists(getHadoopConf(), basePath.toString(), getFs()); } - public void setBasePath(String basePath) { - this.basePath = basePath; + private static HoodieTableMetaClient newMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, + ConsistencyGuardConfig consistencyGuardConfig, Option layoutVersion, + String payloadClassName, FileSystemRetryConfig fileSystemRetryConfig, Properties props) { + HoodieMetastoreConfig metastoreConfig = null == props + ? new HoodieMetastoreConfig.Builder().build() + : new HoodieMetastoreConfig.Builder().fromProperties(props).build(); + return metastoreConfig.enableMetastore() + ? (HoodieTableMetaClient) ReflectionUtils.loadClass("org.apache.hudi.common.table.HoodieTableMetastoreClient", + new Class[]{Configuration.class, ConsistencyGuardConfig.class, FileSystemRetryConfig.class, String.class, String.class, HoodieMetastoreConfig.class}, + conf, consistencyGuardConfig, fileSystemRetryConfig, + props.getProperty(HoodieTableConfig.DATABASE_NAME.key()), props.getProperty(HoodieTableConfig.NAME.key()), metastoreConfig) + : new HoodieTableMetaClient(conf, basePath, + loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName, fileSystemRetryConfig); } - public void setMetaPath(String metaPath) { - this.metaPath = metaPath; + public static Builder builder() { + return new Builder(); } - public void setActiveTimeline(HoodieActiveTimeline activeTimeline) { - this.activeTimeline = activeTimeline; + /** + * Builder for {@link HoodieTableMetaClient}. 
+ */ + public static class Builder { + + private Configuration conf; + private String basePath; + private boolean loadActiveTimelineOnLoad = false; + private String payloadClassName = null; + private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); + private FileSystemRetryConfig fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().build(); + private Option layoutVersion = Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION); + private Properties props; + + public Builder setConf(Configuration conf) { + this.conf = conf; + return this; + } + + public Builder setBasePath(String basePath) { + this.basePath = basePath; + return this; + } + + public Builder setLoadActiveTimelineOnLoad(boolean loadActiveTimelineOnLoad) { + this.loadActiveTimelineOnLoad = loadActiveTimelineOnLoad; + return this; + } + + public Builder setPayloadClassName(String payloadClassName) { + this.payloadClassName = payloadClassName; + return this; + } + + public Builder setConsistencyGuardConfig(ConsistencyGuardConfig consistencyGuardConfig) { + this.consistencyGuardConfig = consistencyGuardConfig; + return this; + } + + public Builder setFileSystemRetryConfig(FileSystemRetryConfig fileSystemRetryConfig) { + this.fileSystemRetryConfig = fileSystemRetryConfig; + return this; + } + + public Builder setLayoutVersion(Option layoutVersion) { + this.layoutVersion = layoutVersion; + return this; + } + + public Builder setProperties(Properties properties) { + this.props = properties; + return this; + } + + public HoodieTableMetaClient build() { + ValidationUtils.checkArgument(conf != null, "Configuration needs to be set to init HoodieTableMetaClient"); + ValidationUtils.checkArgument(basePath != null, "basePath needs to be set to init HoodieTableMetaClient"); + return newMetaClient(conf, basePath, + loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName, fileSystemRetryConfig, props); + } + } + + public static PropertyBuilder withPropertyBuilder() { + return new PropertyBuilder(); + } + + public static class PropertyBuilder { + + private HoodieTableType tableType; + private String databaseName; + private String tableName; + private String tableCreateSchema; + private String recordKeyFields; + private String archiveLogFolder; + private String payloadClassName; + private Integer timelineLayoutVersion; + private String baseFileFormat; + private String preCombineField; + private String partitionFields; + private String bootstrapIndexClass; + private String bootstrapBasePath; + private Boolean bootstrapIndexEnable; + private Boolean populateMetaFields; + private String keyGeneratorClassProp; + private Boolean hiveStylePartitioningEnable; + private Boolean urlEncodePartitioning; + private HoodieTimelineTimeZone commitTimeZone; + private Boolean partitionMetafileUseBaseFormat; + private Boolean shouldDropPartitionColumns; + private String metadataPartitions; + private String inflightMetadataPartitions; + + /** + * Persist the configs that is written at the first time, and should not be changed. + * Like KeyGenerator's configs. 
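Because the public constructors are replaced by these builders, a hedged usage sketch may help. Only setters that appear in this patch are used; the table name, payload class and other inputs are placeholders, not recommendations.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;

    import org.apache.hudi.common.model.HoodieTableType;
    import org.apache.hudi.common.table.HoodieTableMetaClient;

    class MetaClientBuilderExample {
      // Load the meta client of an existing table.
      static HoodieTableMetaClient load(Configuration hadoopConf, String basePath) {
        return HoodieTableMetaClient.builder()
            .setConf(hadoopConf)
            .setBasePath(basePath)
            .setLoadActiveTimelineOnLoad(true)
            .build();
      }

      // Initialize a new table and get its meta client back.
      static HoodieTableMetaClient create(Configuration hadoopConf, String basePath) throws IOException {
        return HoodieTableMetaClient.withPropertyBuilder()
            .setTableType(HoodieTableType.MERGE_ON_READ)
            .setTableName("example_table")
            .setPayloadClassName("org.apache.hudi.common.model.OverwriteWithLatestAvroPayload")
            .initTable(hadoopConf, basePath);
      }
    }

Routing all construction through builder() also lets newMetaClient() transparently switch to the metastore-backed implementation when HoodieMetastoreConfig enables it.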
+ */ + private Properties others = new Properties(); + + private PropertyBuilder() { + + } + + public PropertyBuilder setTableType(HoodieTableType tableType) { + this.tableType = tableType; + return this; + } + + public PropertyBuilder setTableType(String tableType) { + return setTableType(HoodieTableType.valueOf(tableType)); + } + + public PropertyBuilder setDatabaseName(String databaseName) { + this.databaseName = databaseName; + return this; + } + + public PropertyBuilder setTableName(String tableName) { + this.tableName = tableName; + return this; + } + + public PropertyBuilder setTableCreateSchema(String tableCreateSchema) { + this.tableCreateSchema = tableCreateSchema; + return this; + } + + public PropertyBuilder setRecordKeyFields(String recordKeyFields) { + this.recordKeyFields = recordKeyFields; + return this; + } + + public PropertyBuilder setArchiveLogFolder(String archiveLogFolder) { + this.archiveLogFolder = archiveLogFolder; + return this; + } + + public PropertyBuilder setPayloadClassName(String payloadClassName) { + this.payloadClassName = payloadClassName; + return this; + } + + public PropertyBuilder setPayloadClass(Class payloadClass) { + return setPayloadClassName(payloadClass.getName()); + } + + public PropertyBuilder setTimelineLayoutVersion(Integer timelineLayoutVersion) { + this.timelineLayoutVersion = timelineLayoutVersion; + return this; + } + + public PropertyBuilder setBaseFileFormat(String baseFileFormat) { + this.baseFileFormat = baseFileFormat; + return this; + } + + public PropertyBuilder setPreCombineField(String preCombineField) { + this.preCombineField = preCombineField; + return this; + } + + public PropertyBuilder setPartitionFields(String partitionFields) { + this.partitionFields = partitionFields; + return this; + } + + public PropertyBuilder setBootstrapIndexClass(String bootstrapIndexClass) { + this.bootstrapIndexClass = bootstrapIndexClass; + return this; + } + + public PropertyBuilder setBootstrapBasePath(String bootstrapBasePath) { + this.bootstrapBasePath = bootstrapBasePath; + return this; + } + + public PropertyBuilder setBootstrapIndexEnable(Boolean bootstrapIndexEnable) { + this.bootstrapIndexEnable = bootstrapIndexEnable; + return this; + } + + public PropertyBuilder setPopulateMetaFields(boolean populateMetaFields) { + this.populateMetaFields = populateMetaFields; + return this; + } + + public PropertyBuilder setKeyGeneratorClassProp(String keyGeneratorClassProp) { + this.keyGeneratorClassProp = keyGeneratorClassProp; + return this; + } + + public PropertyBuilder setHiveStylePartitioningEnable(Boolean hiveStylePartitioningEnable) { + this.hiveStylePartitioningEnable = hiveStylePartitioningEnable; + return this; + } + + public PropertyBuilder setUrlEncodePartitioning(Boolean urlEncodePartitioning) { + this.urlEncodePartitioning = urlEncodePartitioning; + return this; + } + + public PropertyBuilder setCommitTimezone(HoodieTimelineTimeZone timelineTimeZone) { + this.commitTimeZone = timelineTimeZone; + return this; + } + + public PropertyBuilder setPartitionMetafileUseBaseFormat(Boolean useBaseFormat) { + this.partitionMetafileUseBaseFormat = useBaseFormat; + return this; + } + + public PropertyBuilder setShouldDropPartitionColumns(Boolean shouldDropPartitionColumns) { + this.shouldDropPartitionColumns = shouldDropPartitionColumns; + return this; + } + + public PropertyBuilder setMetadataPartitions(String partitions) { + this.metadataPartitions = partitions; + return this; + } + + public PropertyBuilder setInflightMetadataPartitions(String 
partitions) { + this.inflightMetadataPartitions = partitions; + return this; + } + + private void set(String key, Object value) { + if (HoodieTableConfig.PERSISTED_CONFIG_LIST.contains(key)) { + this.others.put(key, value); + } + } + + public PropertyBuilder set(Map props) { + for (String key: HoodieTableConfig.PERSISTED_CONFIG_LIST) { + Object value = props.get(key); + if (value != null) { + set(key, value); + } + } + return this; + } + + public PropertyBuilder fromMetaClient(HoodieTableMetaClient metaClient) { + return setTableType(metaClient.getTableType()) + .setTableName(metaClient.getTableConfig().getTableName()) + .setArchiveLogFolder(metaClient.getArchivePath()) + .setPayloadClassName(metaClient.getTableConfig().getPayloadClass()); + } + + public PropertyBuilder fromProperties(Properties properties) { + HoodieConfig hoodieConfig = new HoodieConfig(properties); + + for (String key: HoodieTableConfig.PERSISTED_CONFIG_LIST) { + Object value = hoodieConfig.getString(key); + if (value != null) { + set(key, value); + } + } + + if (hoodieConfig.contains(HoodieTableConfig.DATABASE_NAME)) { + setDatabaseName(hoodieConfig.getString(HoodieTableConfig.DATABASE_NAME)); + } + if (hoodieConfig.contains(HoodieTableConfig.NAME)) { + setTableName(hoodieConfig.getString(HoodieTableConfig.NAME)); + } + if (hoodieConfig.contains(HoodieTableConfig.TYPE)) { + setTableType(hoodieConfig.getString(HoodieTableConfig.TYPE)); + } + if (hoodieConfig.contains(HoodieTableConfig.ARCHIVELOG_FOLDER)) { + setArchiveLogFolder( + hoodieConfig.getString(HoodieTableConfig.ARCHIVELOG_FOLDER)); + } + if (hoodieConfig.contains(HoodieTableConfig.PAYLOAD_CLASS_NAME)) { + setPayloadClassName( + hoodieConfig.getString(HoodieTableConfig.PAYLOAD_CLASS_NAME)); + } + if (hoodieConfig.contains(HoodieTableConfig.TIMELINE_LAYOUT_VERSION)) { + setTimelineLayoutVersion(hoodieConfig.getInt(HoodieTableConfig.TIMELINE_LAYOUT_VERSION)); + } + if (hoodieConfig.contains(HoodieTableConfig.BASE_FILE_FORMAT)) { + setBaseFileFormat( + hoodieConfig.getString(HoodieTableConfig.BASE_FILE_FORMAT)); + } + if (hoodieConfig.contains(HoodieTableConfig.BOOTSTRAP_INDEX_CLASS_NAME)) { + setBootstrapIndexClass( + hoodieConfig.getString(HoodieTableConfig.BOOTSTRAP_INDEX_CLASS_NAME)); + } + if (hoodieConfig.contains(HoodieTableConfig.BOOTSTRAP_BASE_PATH)) { + setBootstrapBasePath(hoodieConfig.getString(HoodieTableConfig.BOOTSTRAP_BASE_PATH)); + } + + if (hoodieConfig.contains(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE)) { + setBootstrapIndexEnable(hoodieConfig.getBoolean(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE)); + } + + if (hoodieConfig.contains(HoodieTableConfig.PRECOMBINE_FIELD)) { + setPreCombineField(hoodieConfig.getString(HoodieTableConfig.PRECOMBINE_FIELD)); + } + if (hoodieConfig.contains(HoodieTableConfig.PARTITION_FIELDS)) { + setPartitionFields( + hoodieConfig.getString(HoodieTableConfig.PARTITION_FIELDS)); + } + if (hoodieConfig.contains(HoodieTableConfig.RECORDKEY_FIELDS)) { + setRecordKeyFields(hoodieConfig.getString(HoodieTableConfig.RECORDKEY_FIELDS)); + } + if (hoodieConfig.contains(HoodieTableConfig.CREATE_SCHEMA)) { + setTableCreateSchema(hoodieConfig.getString(HoodieTableConfig.CREATE_SCHEMA)); + } + if (hoodieConfig.contains(HoodieTableConfig.POPULATE_META_FIELDS)) { + setPopulateMetaFields(hoodieConfig.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS)); + } + if (hoodieConfig.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)) { + setKeyGeneratorClassProp(hoodieConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME)); + } + if 
(hoodieConfig.contains(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)) { + setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)); + } + if (hoodieConfig.contains(HoodieTableConfig.URL_ENCODE_PARTITIONING)) { + setUrlEncodePartitioning(hoodieConfig.getBoolean(HoodieTableConfig.URL_ENCODE_PARTITIONING)); + } + if (hoodieConfig.contains(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT)) { + setPartitionMetafileUseBaseFormat(hoodieConfig.getBoolean(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT)); + } + if (hoodieConfig.contains(HoodieTableConfig.DROP_PARTITION_COLUMNS)) { + setShouldDropPartitionColumns(hoodieConfig.getBoolean(HoodieTableConfig.DROP_PARTITION_COLUMNS)); + } + if (hoodieConfig.contains(HoodieTableConfig.TABLE_METADATA_PARTITIONS)) { + setMetadataPartitions(hoodieConfig.getString(HoodieTableConfig.TABLE_METADATA_PARTITIONS)); + } + if (hoodieConfig.contains(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT)) { + setInflightMetadataPartitions(hoodieConfig.getString(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT)); + } + return this; + } + + public Properties build() { + ValidationUtils.checkArgument(tableType != null, "tableType is null"); + ValidationUtils.checkArgument(tableName != null, "tableName is null"); + + HoodieTableConfig tableConfig = new HoodieTableConfig(); + + tableConfig.setAll(others); + + if (databaseName != null) { + tableConfig.setValue(HoodieTableConfig.DATABASE_NAME, databaseName); + } + tableConfig.setValue(HoodieTableConfig.NAME, tableName); + tableConfig.setValue(HoodieTableConfig.TYPE, tableType.name()); + tableConfig.setValue(HoodieTableConfig.VERSION, + String.valueOf(HoodieTableVersion.current().versionCode())); + if (tableType == HoodieTableType.MERGE_ON_READ && payloadClassName != null) { + tableConfig.setValue(HoodieTableConfig.PAYLOAD_CLASS_NAME, payloadClassName); + } + + if (null != tableCreateSchema) { + tableConfig.setValue(HoodieTableConfig.CREATE_SCHEMA, tableCreateSchema); + } + + if (!StringUtils.isNullOrEmpty(archiveLogFolder)) { + tableConfig.setValue(HoodieTableConfig.ARCHIVELOG_FOLDER, archiveLogFolder); + } else { + tableConfig.setDefaultValue(HoodieTableConfig.ARCHIVELOG_FOLDER); + } + + if (null != timelineLayoutVersion) { + tableConfig.setValue(HoodieTableConfig.TIMELINE_LAYOUT_VERSION, + String.valueOf(timelineLayoutVersion)); + } + + if (null != baseFileFormat) { + tableConfig.setValue(HoodieTableConfig.BASE_FILE_FORMAT, baseFileFormat.toUpperCase()); + } + + if (null != bootstrapIndexClass) { + tableConfig.setValue(HoodieTableConfig.BOOTSTRAP_INDEX_CLASS_NAME, bootstrapIndexClass); + } + + if (null != bootstrapIndexEnable) { + tableConfig.setValue(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE, Boolean.toString(bootstrapIndexEnable)); + } + + if (null != bootstrapBasePath) { + tableConfig.setValue(HoodieTableConfig.BOOTSTRAP_BASE_PATH, bootstrapBasePath); + } + + if (null != preCombineField) { + tableConfig.setValue(HoodieTableConfig.PRECOMBINE_FIELD, preCombineField); + } + + if (null != partitionFields) { + tableConfig.setValue(HoodieTableConfig.PARTITION_FIELDS, partitionFields); + } + if (null != recordKeyFields) { + tableConfig.setValue(HoodieTableConfig.RECORDKEY_FIELDS, recordKeyFields); + } + if (null != populateMetaFields) { + tableConfig.setValue(HoodieTableConfig.POPULATE_META_FIELDS, Boolean.toString(populateMetaFields)); + } + if (null != keyGeneratorClassProp) { + tableConfig.setValue(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME, 
keyGeneratorClassProp); + } + if (null != hiveStylePartitioningEnable) { + tableConfig.setValue(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE, Boolean.toString(hiveStylePartitioningEnable)); + } + if (null != urlEncodePartitioning) { + tableConfig.setValue(HoodieTableConfig.URL_ENCODE_PARTITIONING, Boolean.toString(urlEncodePartitioning)); + } + if (null != commitTimeZone) { + tableConfig.setValue(HoodieTableConfig.TIMELINE_TIMEZONE, commitTimeZone.toString()); + } + if (null != partitionMetafileUseBaseFormat) { + tableConfig.setValue(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT, partitionMetafileUseBaseFormat.toString()); + } + if (null != shouldDropPartitionColumns) { + tableConfig.setValue(HoodieTableConfig.DROP_PARTITION_COLUMNS, Boolean.toString(shouldDropPartitionColumns)); + } + if (null != metadataPartitions) { + tableConfig.setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS, metadataPartitions); + } + if (null != inflightMetadataPartitions) { + tableConfig.setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT, inflightMetadataPartitions); + } + return tableConfig.getProps(); + } + + /** + * Init Table with the properties build by this builder. + * + * @param configuration The hadoop config. + * @param basePath The base path for hoodie table. + */ + public HoodieTableMetaClient initTable(Configuration configuration, String basePath) + throws IOException { + return HoodieTableMetaClient.initTableAndGetMetaClient(configuration, basePath, build()); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java index eb2e200de6bde..8a13985d170bf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java @@ -30,7 +30,15 @@ public enum HoodieTableVersion { // < 0.6.0 versions ZERO(0), // 0.6.0 onwards - ONE(1); + ONE(1), + // 0.9.0 onwards + TWO(2), + // 0.10.0 onwards + THREE(3), + // 0.11.0 onwards + FOUR(4), + // 0.12.0 onwards + FIVE(5); private final int versionCode; @@ -43,10 +51,10 @@ public int versionCode() { } public static HoodieTableVersion current() { - return ONE; + return FIVE; } - static HoodieTableVersion versionFromCode(int versionCode) { + public static HoodieTableVersion versionFromCode(int versionCode) { return Arrays.stream(HoodieTableVersion.values()) .filter(v -> v.versionCode == versionCode).findAny() .orElseThrow(() -> new HoodieException("Unknown versionCode:" + versionCode)); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index 372b3936b5b2f..657ac57c6375c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -18,17 +18,11 @@ package org.apache.hudi.common.table; -import java.io.IOException; - -import org.apache.avro.Schema; -import org.apache.avro.Schema.Field; -import org.apache.avro.SchemaCompatibility; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; import 
org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Reader; import org.apache.hudi.common.table.log.block.HoodieDataBlock; @@ -36,11 +30,30 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Functions.Function1; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieIncompatibleSchemaException; import org.apache.hudi.exception.InvalidTableException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; +import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.hudi.io.storage.HoodieOrcReader; +import org.apache.hudi.util.Lazy; + +import org.apache.avro.JsonProperties; +import org.apache.avro.Schema; +import org.apache.avro.Schema.Field; +import org.apache.avro.SchemaCompatibility; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.parquet.avro.AvroSchemaConverter; @@ -49,99 +62,60 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; +import javax.annotation.concurrent.ThreadSafe; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; + +import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.containsFieldInSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; + /** * Helper class to read schema from data files and log files and to convert it between different formats. */ +@ThreadSafe public class TableSchemaResolver { private static final Logger LOG = LogManager.getLogger(TableSchemaResolver.class); - private HoodieTableMetaClient metaClient; - public TableSchemaResolver(HoodieTableMetaClient metaClient) { - this.metaClient = metaClient; - } + private final HoodieTableMetaClient metaClient; /** - * Gets the schema for a hoodie table. Depending on the type of table, read from any file written in the latest - * commit. We will assume that the schema has not changed within a single atomic write. + * Signals whether suite of the meta-fields should have additional field designating + * operation particular record was added by. 
Note, that determining whether this meta-field + * should be appended to the schema requires reading out the actual schema of some data file, + * since it's ultimately the source of truth whether this field has to be represented in + * the schema + */ + private final Lazy hasOperationField; + + /** + * NOTE: {@link HoodieCommitMetadata} could be of non-trivial size for large tables (in 100s of Mbs) + * and therefore we'd want to limit amount of throw-away work being performed while fetching + * commits' metadata * - * @return Parquet schema for this table - * @throws Exception + * Please check out corresponding methods to fetch commonly used instances of {@link HoodieCommitMetadata}: + * {@link #getLatestCommitMetadataWithValidSchema()}, + * {@link #getLatestCommitMetadataWithValidSchema()}, + * {@link #getCachedCommitMetadata(HoodieInstant)} */ - private MessageType getTableParquetSchemaFromDataFile() throws Exception { - HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + private final Lazy> commitMetadataCache; - try { - switch (metaClient.getTableType()) { - case COPY_ON_WRITE: - // If this is COW, get the last commit and read the schema from a file written in the - // last commit - HoodieInstant lastCommit = - activeTimeline.getCommitsTimeline().filterCompletedInstants().lastInstant().orElseThrow(() -> new InvalidTableException(metaClient.getBasePath())); - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata - .fromBytes(activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class); - String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny() - .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for commit " - + lastCommit + ", could not get schema for table " + metaClient.getBasePath() + ", Metadata :" - + commitMetadata)); - return readSchemaFromBaseFile(new Path(filePath)); - case MERGE_ON_READ: - // If this is MOR, depending on whether the latest commit is a delta commit or - // compaction commit - // Get a datafile written and get the schema from that file - Option lastCompactionCommit = - metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); - LOG.info("Found the last compaction commit as " + lastCompactionCommit); - - Option lastDeltaCommit; - if (lastCompactionCommit.isPresent()) { - lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants() - .findInstantsAfter(lastCompactionCommit.get().getTimestamp(), Integer.MAX_VALUE).lastInstant(); - } else { - lastDeltaCommit = - metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant(); - } - LOG.info("Found the last delta commit " + lastDeltaCommit); - - if (lastDeltaCommit.isPresent()) { - HoodieInstant lastDeltaInstant = lastDeltaCommit.get(); - // read from the log file wrote - commitMetadata = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(lastDeltaInstant).get(), - HoodieCommitMetadata.class); - Pair filePathWithFormat = - commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream() - .filter(s -> s.contains(HoodieLogFile.DELTA_EXTENSION)).findAny() - .map(f -> Pair.of(f, HoodieFileFormat.HOODIE_LOG)).orElseGet(() -> { - // No Log files in Delta-Commit. 
Check if there are any parquet files - return commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream() - .filter(s -> s.contains((metaClient.getTableConfig().getBaseFileFormat().getFileExtension()))) - .findAny().map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> - new IllegalArgumentException("Could not find any data file written for commit " - + lastDeltaInstant + ", could not get schema for table " + metaClient.getBasePath() - + ", CommitMetadata :" + commitMetadata)); - }); - switch (filePathWithFormat.getRight()) { - case HOODIE_LOG: - return readSchemaFromLogFile(lastCompactionCommit, new Path(filePathWithFormat.getLeft())); - case PARQUET: - return readSchemaFromBaseFile(new Path(filePathWithFormat.getLeft())); - default: - throw new IllegalArgumentException("Unknown file format :" + filePathWithFormat.getRight() - + " for file " + filePathWithFormat.getLeft()); - } - } else { - return readSchemaFromLastCompaction(lastCompactionCommit); - } - default: - LOG.error("Unknown table type " + metaClient.getTableType()); - throw new InvalidTableException(metaClient.getBasePath()); - } - } catch (IOException e) { - throw new HoodieException("Failed to read data schema", e); - } + private volatile HoodieInstant latestCommitWithValidSchema = null; + private volatile HoodieInstant latestCommitWithValidData = null; + + public TableSchemaResolver(HoodieTableMetaClient metaClient) { + this.metaClient = metaClient; + this.commitMetadataCache = Lazy.lazily(() -> new ConcurrentHashMap<>(2)); + this.hasOperationField = Lazy.lazily(this::hasOperationField); } - private Schema getTableAvroSchemaFromDataFile() throws Exception { + public Schema getTableAvroSchemaFromDataFile() { return convertParquetSchemaToAvro(getTableParquetSchemaFromDataFile()); } @@ -152,69 +126,111 @@ private Schema getTableAvroSchemaFromDataFile() throws Exception { * @throws Exception */ public Schema getTableAvroSchema() throws Exception { - Option schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(true); - return schemaFromCommitMetadata.isPresent() ? schemaFromCommitMetadata.get() : getTableAvroSchemaFromDataFile(); + return getTableAvroSchema(metaClient.getTableConfig().populateMetaFields()); } /** - * Gets full schema (user + metadata) for a hoodie table in Parquet format. + * Gets schema for a hoodie table in Avro format, can choice if include metadata fields. * - * @return Parquet schema for the table + * @param includeMetadataFields choice if include metadata fields + * @return Avro schema for this table * @throws Exception */ - public MessageType getTableParquetSchema() throws Exception { - Option schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(true); - return schemaFromCommitMetadata.isPresent() ? convertAvroSchemaToParquet(schemaFromCommitMetadata.get()) : - getTableParquetSchemaFromDataFile(); + public Schema getTableAvroSchema(boolean includeMetadataFields) throws Exception { + return getTableAvroSchemaInternal(includeMetadataFields, Option.empty()); } /** - * Gets users data schema for a hoodie table in Avro format. 
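For orientation, a hedged sketch of the reworked schema-resolution entry points in TableSchemaResolver (the instant time shown is arbitrary, and the generic types are assumptions since this rendering drops them):

    import org.apache.avro.Schema;

    import org.apache.hudi.common.table.HoodieTableMetaClient;
    import org.apache.hudi.common.table.TableSchemaResolver;
    import org.apache.hudi.common.util.Option;

    class SchemaResolutionExample {
      static void resolve(HoodieTableMetaClient metaClient) throws Exception {
        TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
        // Full schema: user fields plus Hudi meta fields when the table populates them.
        Schema withMetaFields = resolver.getTableAvroSchema(true);
        // User schema only.
        Schema withoutMetaFields = resolver.getTableAvroSchema(false);
        // Schema as of a given instant time (completed commits at or before the timestamp).
        Schema asOfInstant = resolver.getTableAvroSchema("20220101000000");
        // Empty when the table has no completed commits; unlike the calls above, this one
        // does not fall back to the schema recorded at table creation.
        Option<Schema> latestOnly = resolver.getTableAvroSchemaFromLatestCommit(true);
      }
    }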
+ * Fetches tables schema in Avro format as of the given instant * - * @return Avro user data schema - * @throws Exception + * @param timestamp as of which table's schema will be fetched */ - public Schema getTableAvroSchemaWithoutMetadataFields() throws Exception { - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - Option schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(timeline.lastInstant().get(), false); - return schemaFromCommitMetadata.isPresent() ? schemaFromCommitMetadata.get() : - HoodieAvroUtils.removeMetadataFields(getTableAvroSchemaFromDataFile()); + public Schema getTableAvroSchema(String timestamp) throws Exception { + Option instant = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants() + .findInstantsBeforeOrEquals(timestamp) + .lastInstant(); + return getTableAvroSchemaInternal(metaClient.getTableConfig().populateMetaFields(), instant); } /** - * Gets users data schema for a hoodie table in Avro format of the instant. + * Fetches tables schema in Avro format as of the given instant * - * @param instant will get the instant data schema - * @return Avro user data schema - * @throws Exception + * @param instant as of which table's schema will be fetched */ - public Schema getTableAvroSchemaWithoutMetadataFields(HoodieInstant instant) throws Exception { - Option schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(instant, false); - return schemaFromCommitMetadata.isPresent() ? schemaFromCommitMetadata.get() : - HoodieAvroUtils.removeMetadataFields(getTableAvroSchemaFromDataFile()); + public Schema getTableAvroSchema(HoodieInstant instant, boolean includeMetadataFields) throws Exception { + return getTableAvroSchemaInternal(includeMetadataFields, Option.of(instant)); } /** - * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the last commit. + * Gets full schema (user + metadata) for a hoodie table in Parquet format. * - * @return Avro schema for this table + * @return Parquet schema for the table */ - private Option getTableSchemaFromCommitMetadata(boolean includeMetadataFields) { - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - return getTableSchemaFromCommitMetadata(timeline.lastInstant().get(), includeMetadataFields); + public MessageType getTableParquetSchema() throws Exception { + return convertAvroSchemaToParquet(getTableAvroSchema(true)); } - /** - * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the instant. + * Gets users data schema for a hoodie table in Avro format. * - * @return Avro schema for this table + * @return Avro user data schema + * @throws Exception + * + * @deprecated use {@link #getTableAvroSchema(boolean)} instead */ + @Deprecated + public Schema getTableAvroSchemaWithoutMetadataFields() throws Exception { + return getTableAvroSchema(false); + } + + private Schema getTableAvroSchemaInternal(boolean includeMetadataFields, Option instantOpt) { + Schema schema = + (instantOpt.isPresent() + ? getTableSchemaFromCommitMetadata(instantOpt.get(), includeMetadataFields) + : getTableSchemaFromLatestCommitMetadata(includeMetadataFields)) + .or(() -> + metaClient.getTableConfig().getTableCreateSchema() + .map(tableSchema -> + includeMetadataFields + ? 
HoodieAvroUtils.addMetadataFields(tableSchema, hasOperationField.get()) + : tableSchema) + ) + .orElseGet(() -> { + Schema schemaFromDataFile = getTableAvroSchemaFromDataFile(); + return includeMetadataFields + ? schemaFromDataFile + : HoodieAvroUtils.removeMetadataFields(schemaFromDataFile); + }); + + // TODO partition columns have to be appended in all read-paths + if (metaClient.getTableConfig().shouldDropPartitionColumns()) { + return metaClient.getTableConfig().getPartitionFields() + .map(partitionFields -> appendPartitionColumns(schema, Option.ofNullable(partitionFields))) + .orElse(schema); + } + + return schema; + } + + private Option getTableSchemaFromLatestCommitMetadata(boolean includeMetadataFields) { + Option> instantAndCommitMetadata = getLatestCommitMetadataWithValidSchema(); + if (instantAndCommitMetadata.isPresent()) { + HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight(); + String schemaStr = commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY); + Schema schema = new Schema.Parser().parse(schemaStr); + if (includeMetadataFields) { + schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField.get()); + } + return Option.of(schema); + } else { + return Option.empty(); + } + } + private Option getTableSchemaFromCommitMetadata(HoodieInstant instant, boolean includeMetadataFields) { try { - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - byte[] data = timeline.getInstantDetails(instant).get(); - HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); + HoodieCommitMetadata metadata = getCachedCommitMetadata(instant); String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY); if (StringUtils.isNullOrEmpty(existingSchemaStr)) { @@ -223,7 +239,7 @@ private Option getTableSchemaFromCommitMetadata(HoodieInstant instant, b Schema schema = new Schema.Parser().parse(existingSchemaStr); if (includeMetadataFields) { - schema = HoodieAvroUtils.addMetadataFields(schema); + schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField.get()); } return Option.of(schema); } catch (Exception e) { @@ -232,23 +248,46 @@ private Option getTableSchemaFromCommitMetadata(HoodieInstant instant, b } /** - * Convert a parquet scheme to the avro format. - * - * @param parquetSchema The parquet schema to convert - * @return The converted avro schema + * Fetches the schema for a table from any the table's data files */ - public Schema convertParquetSchemaToAvro(MessageType parquetSchema) { + private MessageType getTableParquetSchemaFromDataFile() { + Option> instantAndCommitMetadata = getLatestCommitMetadataWithValidData(); + try { + switch (metaClient.getTableType()) { + case COPY_ON_WRITE: + case MERGE_ON_READ: + // For COW table, data could be written in either Parquet or Orc format currently; + // For MOR table, data could be written in either Parquet, Orc, Hfile or Delta-log format currently; + // + // Determine the file format based on the file name, and then extract schema from it. 
+ if (instantAndCommitMetadata.isPresent()) { + HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight(); + Iterator filePaths = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePathV2()).values().iterator(); + return fetchSchemaFromFiles(filePaths); + } else { + throw new IllegalArgumentException("Could not find any data file written for commit, " + + "so could not get schema for table " + metaClient.getBasePath()); + } + default: + LOG.error("Unknown table type " + metaClient.getTableType()); + throw new InvalidTableException(metaClient.getBasePath()); + } + } catch (IOException e) { + throw new HoodieException("Failed to read data schema", e); + } + } + + public static MessageType convertAvroSchemaToParquet(Schema schema, Configuration hadoopConf) { + AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(hadoopConf); + return avroSchemaConverter.convert(schema); + } + + private Schema convertParquetSchemaToAvro(MessageType parquetSchema) { AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf()); return avroSchemaConverter.convert(parquetSchema); } - /** - * Convert a avro scheme to the parquet format. - * - * @param schema The avro schema to convert - * @return The converted parquet schema - */ - public MessageType convertAvroSchemaToParquet(Schema schema) { + private MessageType convertAvroSchemaToParquet(Schema schema) { AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf()); return avroSchemaConverter.convert(schema); } @@ -292,11 +331,14 @@ public MessageType convertAvroSchemaToParquet(Schema schema) { * @param oldSchema Older schema to check. * @param newSchema Newer schema to check. * @return True if the schema validation is successful + * + * TODO revisit this method: it's implemented incorrectly as it might be applying different criteria + * to top-level record and nested record (for ex, if that nested record is contained w/in an array) */ public static boolean isSchemaCompatible(Schema oldSchema, Schema newSchema) { if (oldSchema.getType() == newSchema.getType() && newSchema.getType() == Schema.Type.RECORD) { // record names must match: - if (!SchemaCompatibility.schemaNameEquals(oldSchema, newSchema)) { + if (!SchemaCompatibility.schemaNameEquals(newSchema, oldSchema)) { return false; } @@ -319,7 +361,7 @@ public static boolean isSchemaCompatible(Schema oldSchema, Schema newSchema) { for (final Field newSchemaField : newSchema.getFields()) { final Field oldSchemaField = SchemaCompatibility.lookupWriterField(oldSchema, newSchemaField); if (oldSchemaField == null) { - if (newSchemaField.defaultValue() == null) { + if (newSchemaField.defaultVal() == null) { // C3: newly added field in newSchema does not have a default value return false; } @@ -329,9 +371,11 @@ public static boolean isSchemaCompatible(Schema oldSchema, Schema newSchema) { // All fields in the newSchema record can be populated from the oldSchema record return true; } else { - // Use the checks implemented by + // Use the checks implemented by Avro + // newSchema is the schema which will be used to read the records written earlier using oldSchema. Hence, in the + // check below, use newSchema as the reader schema and oldSchema as the writer schema. 
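Since the reader/writer orientation called out in the comment above is the heart of this fix, a minimal hedged sketch of the underlying Avro check (the wrapper class is hypothetical):

    import org.apache.avro.Schema;
    import org.apache.avro.SchemaCompatibility;

    class ReaderWriterCompatExample {
      static boolean newSchemaCanReadOldData(Schema oldSchema, Schema newSchema) {
        // newSchema is the evolved schema used to read records that were written with
        // oldSchema, so it goes in the reader position and oldSchema in the writer position.
        SchemaCompatibility.SchemaPairCompatibility result =
            SchemaCompatibility.checkReaderWriterCompatibility(newSchema, oldSchema);
        return result.getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE;
      }
    }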
org.apache.avro.SchemaCompatibility.SchemaPairCompatibility compatResult = - org.apache.avro.SchemaCompatibility.checkReaderWriterCompatibility(oldSchema, newSchema); + org.apache.avro.SchemaCompatibility.checkReaderWriterCompatibility(newSchema, oldSchema); return compatResult.getType() == org.apache.avro.SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE; } } @@ -341,24 +385,84 @@ public static boolean isSchemaCompatible(String oldSchema, String newSchema) { } /** - * Read the parquet schema from a parquet File. + * Returns table's latest Avro {@link Schema} iff table is non-empty (ie there's at least + * a single commit) + * + * This method differs from {@link #getTableAvroSchema(boolean)} in that it won't fallback + * to use table's schema used at creation */ - public MessageType readSchemaFromBaseFile(Path parquetFilePath) throws IOException { + public Option getTableAvroSchemaFromLatestCommit(boolean includeMetadataFields) throws Exception { + if (metaClient.isTimelineNonEmpty()) { + return Option.of(getTableAvroSchemaInternal(includeMetadataFields, Option.empty())); + } + + return Option.empty(); + } + + /** + * Get latest schema either from incoming schema or table schema. + * @param writeSchema incoming batch's write schema. + * @param convertTableSchemaToAddNamespace {@code true} if table schema needs to be converted. {@code false} otherwise. + * @param converterFn converter function to be called over table schema (to add namespace may be). Each caller can decide if any conversion is required. + * @return the latest schema. + * + * @deprecated will be removed (HUDI-4472) + */ + @Deprecated + public Schema getLatestSchema(Schema writeSchema, boolean convertTableSchemaToAddNamespace, + Function1 converterFn) { + Schema latestSchema = writeSchema; + try { + if (metaClient.isTimelineNonEmpty()) { + Schema tableSchema = getTableAvroSchemaWithoutMetadataFields(); + if (convertTableSchemaToAddNamespace && converterFn != null) { + tableSchema = converterFn.apply(tableSchema); + } + if (writeSchema.getFields().size() < tableSchema.getFields().size() && isSchemaCompatible(writeSchema, tableSchema)) { + // if incoming schema is a subset (old schema) compared to table schema. For eg, one of the + // ingestion pipeline is still producing events in old schema + latestSchema = tableSchema; + LOG.debug("Using latest table schema to rewrite incoming records " + tableSchema.toString()); + } + } + } catch (IllegalArgumentException | InvalidTableException e) { + LOG.warn("Could not find any commits, falling back to using incoming batch's write schema"); + } catch (Exception e) { + LOG.warn("Unknown exception thrown " + e.getMessage() + ", Falling back to using incoming batch's write schema"); + } + return latestSchema; + } + + private MessageType readSchemaFromParquetBaseFile(Path parquetFilePath) throws IOException { LOG.info("Reading schema from " + parquetFilePath); FileSystem fs = metaClient.getRawFs(); - if (!fs.exists(parquetFilePath)) { - throw new IllegalArgumentException( - "Failed to read schema from data file " + parquetFilePath + ". 
File does not exist."); - } ParquetMetadata fileFooter = ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER); return fileFooter.getFileMetaData().getSchema(); } + private MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException { + LOG.info("Reading schema from " + hFilePath); + + FileSystem fs = metaClient.getRawFs(); + CacheConfig cacheConfig = new CacheConfig(fs.getConf()); + HoodieHFileReader hFileReader = new HoodieHFileReader<>(fs.getConf(), hFilePath, cacheConfig); + return convertAvroSchemaToParquet(hFileReader.getSchema()); + } + + private MessageType readSchemaFromORCBaseFile(Path orcFilePath) throws IOException { + LOG.info("Reading schema from " + orcFilePath); + + FileSystem fs = metaClient.getRawFs(); + HoodieOrcReader orcReader = new HoodieOrcReader<>(fs.getConf(), orcFilePath); + return convertAvroSchemaToParquet(orcReader.getSchema()); + } + /** * Read schema from a data file from the last compaction commit done. - * @throws Exception + * + * @deprecated please use {@link #getTableAvroSchema(HoodieInstant, boolean)} instead */ public MessageType readSchemaFromLastCompaction(Option lastCompactionCommitOpt) throws Exception { HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); @@ -369,10 +473,14 @@ public MessageType readSchemaFromLastCompaction(Option lastCompac // Read from the compacted file wrote HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class); - String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny() + String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePathV2()).values().stream().findAny() .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction " + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath())); - return readSchemaFromBaseFile(new Path(filePath)); + return readSchemaFromBaseFile(filePath); + } + + private MessageType readSchemaFromLogFile(Path path) throws IOException { + return readSchemaFromLogFile(metaClient.getRawFs(), path); } /** @@ -380,43 +488,193 @@ public MessageType readSchemaFromLastCompaction(Option lastCompac * * @return */ - public MessageType readSchemaFromLogFile(Path path) throws IOException { - return readSchemaFromLogFile(metaClient.getRawFs(), path); + public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException { + try (Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null)) { + HoodieDataBlock lastBlock = null; + while (reader.hasNext()) { + HoodieLogBlock block = reader.next(); + if (block instanceof HoodieDataBlock) { + lastBlock = (HoodieDataBlock) block; + } + } + return lastBlock != null ? new AvroSchemaConverter().convert(lastBlock.getSchema()) : null; + } } /** - * Read the schema from the log file on path. - * @throws Exception + * Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant. + * + * @return InternalSchema for this table + */ + public Option getTableInternalSchemaFromCommitMetadata() { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + return timeline.lastInstant().flatMap(this::getTableInternalSchemaFromCommitMetadata); + } + + /** + * Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant. 
+ * + * @return InternalSchema for this table + */ + public Option getTableInternalSchemaFromCommitMetadata(String timestamp) { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants() + .findInstantsBeforeOrEquals(timestamp); + return timeline.lastInstant().flatMap(this::getTableInternalSchemaFromCommitMetadata); + } + + /** + * Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant. + * + * @return InternalSchema for this table */ - public MessageType readSchemaFromLogFile(Option lastCompactionCommitOpt, Path path) - throws Exception { - MessageType messageType = readSchemaFromLogFile(path); - // Fall back to read the schema from last compaction - if (messageType == null) { - LOG.info("Falling back to read the schema from last compaction " + lastCompactionCommitOpt); - return readSchemaFromLastCompaction(lastCompactionCommitOpt); + private Option getTableInternalSchemaFromCommitMetadata(HoodieInstant instant) { + try { + HoodieCommitMetadata metadata = getCachedCommitMetadata(instant); + String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA); + if (latestInternalSchemaStr != null) { + return SerDeHelper.fromJson(latestInternalSchemaStr); + } else { + return Option.empty(); + } + } catch (Exception e) { + throw new HoodieException("Failed to read schema from commit metadata", e); } - return messageType; } /** - * Read the schema from the log file on path. + * Gets the history schemas as String for a hoodie table from the HoodieCommitMetadata of the instant. * - * @return + * @return history schemas string for this table */ - public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException { - Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null); - HoodieDataBlock lastBlock = null; - while (reader.hasNext()) { - HoodieLogBlock block = reader.next(); - if (block instanceof HoodieDataBlock) { - lastBlock = (HoodieDataBlock) block; + public Option getTableHistorySchemaStrFromCommitMetadata() { + // now we only support FileBaseInternalSchemaManager + FileBasedInternalSchemaStorageManager manager = new FileBasedInternalSchemaStorageManager(metaClient); + String result = manager.getHistorySchemaStr(); + return result.isEmpty() ? 
Option.empty() : Option.of(result); + } + + /** + * NOTE: This method could only be used in tests + * + * @VisibleForTesting + */ + public boolean hasOperationField() { + try { + Schema tableAvroSchema = getTableAvroSchemaFromDataFile(); + return tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null; + } catch (Exception e) { + LOG.info(String.format("Failed to read operation field from avro schema (%s)", e.getMessage())); + return false; + } + } + + private Option> getLatestCommitMetadataWithValidSchema() { + if (latestCommitWithValidSchema == null) { + Option> instantAndCommitMetadata = + metaClient.getActiveTimeline().getLastCommitMetadataWithValidSchema(); + if (instantAndCommitMetadata.isPresent()) { + HoodieInstant instant = instantAndCommitMetadata.get().getLeft(); + HoodieCommitMetadata metadata = instantAndCommitMetadata.get().getRight(); + synchronized (this) { + if (latestCommitWithValidSchema == null) { + latestCommitWithValidSchema = instant; + } + commitMetadataCache.get().putIfAbsent(instant, metadata); + } } } - reader.close(); - if (lastBlock != null) { - return new AvroSchemaConverter().convert(lastBlock.getSchema()); + + return Option.ofNullable(latestCommitWithValidSchema) + .map(instant -> Pair.of(instant, commitMetadataCache.get().get(instant))); + } + + private Option> getLatestCommitMetadataWithValidData() { + if (latestCommitWithValidData == null) { + Option> instantAndCommitMetadata = + metaClient.getActiveTimeline().getLastCommitMetadataWithValidData(); + if (instantAndCommitMetadata.isPresent()) { + HoodieInstant instant = instantAndCommitMetadata.get().getLeft(); + HoodieCommitMetadata metadata = instantAndCommitMetadata.get().getRight(); + synchronized (this) { + if (latestCommitWithValidData == null) { + latestCommitWithValidData = instant; + } + commitMetadataCache.get().putIfAbsent(instant, metadata); + } + } + } + + return Option.ofNullable(latestCommitWithValidData) + .map(instant -> Pair.of(instant, commitMetadataCache.get().get(instant))); + } + + private HoodieCommitMetadata getCachedCommitMetadata(HoodieInstant instant) { + return commitMetadataCache.get() + .computeIfAbsent(instant, (missingInstant) -> { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + byte[] data = timeline.getInstantDetails(missingInstant).get(); + try { + return HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to fetch HoodieCommitMetadata for instant (%s)", missingInstant), e); + } + }); + } + + private MessageType fetchSchemaFromFiles(Iterator filePaths) throws IOException { + MessageType type = null; + while (filePaths.hasNext() && type == null) { + String filePath = filePaths.next(); + if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) { + // this is a log file + type = readSchemaFromLogFile(new Path(filePath)); + } else { + type = readSchemaFromBaseFile(filePath); + } + } + return type; + } + + private MessageType readSchemaFromBaseFile(String filePath) throws IOException { + if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) { + return readSchemaFromParquetBaseFile(new Path(filePath)); + } else if (filePath.contains(HoodieFileFormat.HFILE.getFileExtension())) { + return readSchemaFromHFileBaseFile(new Path(filePath)); + } else if (filePath.contains(HoodieFileFormat.ORC.getFileExtension())) { + return readSchemaFromORCBaseFile(new Path(filePath)); + } else { + throw 
new IllegalArgumentException("Unknown base file format :" + filePath); + } + } + + public static Schema appendPartitionColumns(Schema dataSchema, Option partitionFields) { + // In cases when {@link DROP_PARTITION_COLUMNS} config is set true, partition columns + // won't be persisted w/in the data files, and therefore we need to append such columns + // when schema is parsed from data files + // + // Here we append partition columns with {@code StringType} as the data type + if (!partitionFields.isPresent() || partitionFields.get().length == 0) { + return dataSchema; } - return null; + + boolean hasPartitionColNotInSchema = Arrays.stream(partitionFields.get()).anyMatch(pf -> !containsFieldInSchema(dataSchema, pf)); + boolean hasPartitionColInSchema = Arrays.stream(partitionFields.get()).anyMatch(pf -> containsFieldInSchema(dataSchema, pf)); + if (hasPartitionColNotInSchema && hasPartitionColInSchema) { + throw new HoodieIncompatibleSchemaException("Partition columns could not be partially contained w/in the data schema"); + } + + if (hasPartitionColNotInSchema) { + // when hasPartitionColNotInSchema is true and hasPartitionColInSchema is false, all partition columns + // are not in originSchema. So we create and add them. + List newFields = new ArrayList<>(); + for (String partitionField: partitionFields.get()) { + newFields.add(new Schema.Field( + partitionField, createNullableSchema(Schema.Type.STRING), "", JsonProperties.NULL_VALUE)); + } + return appendFieldsToSchema(dataSchema, newFields); + } + + return dataSchema; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java new file mode 100644 index 0000000000000..4566b1f5cd6b1 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java @@ -0,0 +1,579 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.table.log; + +import org.apache.hudi.common.model.DeleteRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; +import org.apache.hudi.common.table.log.block.HoodieCommandBlock; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; +import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SpillableMapUtils; +import org.apache.hudi.common.util.InternalSchemaCache; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.action.InternalSchemaMerger; +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.Collections; +import java.util.Deque; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME; +import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.COMMAND_BLOCK; +import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK; + +/** + * Implements logic to scan log blocks and expose valid and deleted log records to subclass implementation. Subclass is + * free to either apply merging or expose raw data back to the caller. + * + * NOTE: If readBlockLazily is turned on, does not merge, instead keeps reading log blocks and merges everything at once + * This is an optimization to avoid seek() back and forth to read new block (forward seek()) and lazily read content of + * seen block (reverse and forward seek()) during merge | | Read Block 1 Metadata | | Read Block 1 Data | | | Read Block + * 2 Metadata | | Read Block 2 Data | | I/O Pass 1 | ..................... | I/O Pass 2 | ................. | | | Read + * Block N Metadata | | Read Block N Data | + *

    + * This results in two I/O passes over the log file. + */ +public abstract class AbstractHoodieLogRecordReader { + + private static final Logger LOG = LogManager.getLogger(AbstractHoodieLogRecordReader.class); + + // Reader schema for the records + protected final Schema readerSchema; + // Latest valid instant time + // Log-Blocks belonging to inflight delta-instants are filtered-out using this high-watermark. + private final String latestInstantTime; + private final HoodieTableMetaClient hoodieTableMetaClient; + // Merge strategy to use when combining records from log + private final String payloadClassFQN; + // preCombine field + private final String preCombineField; + // simple key gen fields + private Option> simpleKeyGenFields = Option.empty(); + // Log File Paths + protected final List logFilePaths; + // Read Lazily flag + private final boolean readBlocksLazily; + // Reverse reader - Not implemented yet (NA -> Why do we need ?) + // but present here for plumbing for future implementation + private final boolean reverseReader; + // Buffer Size for log file reader + private final int bufferSize; + // optional instant range for incremental block filtering + private final Option instantRange; + // Read the operation metadata field from the avro record + private final boolean withOperationField; + // FileSystem + private final FileSystem fs; + // Total log files read - for metrics + private AtomicLong totalLogFiles = new AtomicLong(0); + // Internal schema, used to support full schema evolution. + private InternalSchema internalSchema; + // Hoodie table path. + private final String path; + // Total log blocks read - for metrics + private AtomicLong totalLogBlocks = new AtomicLong(0); + // Total log records read - for metrics + private AtomicLong totalLogRecords = new AtomicLong(0); + // Total number of rollbacks written across all log files + private AtomicLong totalRollbacks = new AtomicLong(0); + // Total number of corrupt blocks written across all log files + private AtomicLong totalCorruptBlocks = new AtomicLong(0); + // Store the last instant log blocks (needed to implement rollback) + private Deque currentInstantLogBlocks = new ArrayDeque<>(); + // Enables full scan of log records + protected final boolean forceFullScan; + private int totalScannedLogFiles; + // Progress + private float progress = 0.0f; + // Partition name + private Option partitionName; + // Populate meta fields for the records + private boolean populateMetaFields = true; + + protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List logFilePaths, + Schema readerSchema, + String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, + int bufferSize, Option instantRange, + boolean withOperationField) { + this(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, + instantRange, withOperationField, true, Option.empty(), InternalSchema.getEmptyInternalSchema()); + } + + protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List logFilePaths, + Schema readerSchema, String latestInstantTime, boolean readBlocksLazily, + boolean reverseReader, int bufferSize, Option instantRange, + boolean withOperationField, boolean forceFullScan, + Option partitionName, InternalSchema internalSchema) { + this.readerSchema = readerSchema; + this.latestInstantTime = latestInstantTime; + this.hoodieTableMetaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).build(); + // load class from the payload 
fully qualified class name + HoodieTableConfig tableConfig = this.hoodieTableMetaClient.getTableConfig(); + this.payloadClassFQN = tableConfig.getPayloadClass(); + this.preCombineField = tableConfig.getPreCombineField(); + this.totalLogFiles.addAndGet(logFilePaths.size()); + this.logFilePaths = logFilePaths; + this.reverseReader = reverseReader; + this.readBlocksLazily = readBlocksLazily; + this.fs = fs; + this.bufferSize = bufferSize; + this.instantRange = instantRange; + this.withOperationField = withOperationField; + this.forceFullScan = forceFullScan; + this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema; + this.path = basePath; + + // Key fields when populate meta fields is disabled (that is, virtual keys enabled) + if (!tableConfig.populateMetaFields()) { + this.populateMetaFields = false; + this.simpleKeyGenFields = Option.of( + Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp())); + } + this.partitionName = partitionName; + } + + protected String getKeyField() { + if (this.populateMetaFields) { + return HoodieRecord.RECORD_KEY_METADATA_FIELD; + } + ValidationUtils.checkState(this.simpleKeyGenFields.isPresent()); + return this.simpleKeyGenFields.get().getKey(); + } + + public synchronized void scan() { + scanInternal(Option.empty()); + } + + public synchronized void scan(List keys) { + scanInternal(Option.of(new KeySpec(keys, true))); + } + + protected synchronized void scanInternal(Option keySpecOpt) { + currentInstantLogBlocks = new ArrayDeque<>(); + progress = 0.0f; + totalLogFiles = new AtomicLong(0); + totalRollbacks = new AtomicLong(0); + totalCorruptBlocks = new AtomicLong(0); + totalLogBlocks = new AtomicLong(0); + totalLogRecords = new AtomicLong(0); + HoodieLogFormatReader logFormatReaderWrapper = null; + HoodieTimeline commitsTimeline = this.hoodieTableMetaClient.getCommitsTimeline(); + HoodieTimeline completedInstantsTimeline = commitsTimeline.filterCompletedInstants(); + HoodieTimeline inflightInstantsTimeline = commitsTimeline.filterInflights(); + try { + // Get the key field based on populate meta fields config + // and the table type + final String keyField = getKeyField(); + + // Iterate over the paths + boolean enableRecordLookups = !forceFullScan; + logFormatReaderWrapper = new HoodieLogFormatReader(fs, + logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()), + readerSchema, readBlocksLazily, reverseReader, bufferSize, enableRecordLookups, keyField, internalSchema); + + Set scannedLogFiles = new HashSet<>(); + while (logFormatReaderWrapper.hasNext()) { + HoodieLogFile logFile = logFormatReaderWrapper.getLogFile(); + LOG.info("Scanning log file " + logFile); + scannedLogFiles.add(logFile); + totalLogFiles.set(scannedLogFiles.size()); + // Use the HoodieLogFileReader to iterate through the blocks in the log file + HoodieLogBlock logBlock = logFormatReaderWrapper.next(); + final String instantTime = logBlock.getLogBlockHeader().get(INSTANT_TIME); + totalLogBlocks.incrementAndGet(); + if (logBlock.getBlockType() != CORRUPT_BLOCK + && !HoodieTimeline.compareTimestamps(logBlock.getLogBlockHeader().get(INSTANT_TIME), HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime + )) { + // hit a block with instant time greater than should be processed, stop processing further + break; + } + if (logBlock.getBlockType() != CORRUPT_BLOCK && logBlock.getBlockType() != COMMAND_BLOCK) { + if 
(!completedInstantsTimeline.containsOrBeforeTimelineStarts(instantTime) + || inflightInstantsTimeline.containsInstant(instantTime)) { + // hit an uncommitted block possibly from a failed write, move to the next one and skip processing this one + continue; + } + if (instantRange.isPresent() && !instantRange.get().isInRange(instantTime)) { + // filter the log block by instant range + continue; + } + } + switch (logBlock.getBlockType()) { + case HFILE_DATA_BLOCK: + case AVRO_DATA_BLOCK: + case PARQUET_DATA_BLOCK: + LOG.info("Reading a data block from file " + logFile.getPath() + " at instant " + + logBlock.getLogBlockHeader().get(INSTANT_TIME)); + if (isNewInstantBlock(logBlock) && !readBlocksLazily) { + // If this is an avro data block belonging to a different commit/instant, + // then merge the last blocks and records into the main result + processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keySpecOpt); + } + // store the current block + currentInstantLogBlocks.push(logBlock); + break; + case DELETE_BLOCK: + LOG.info("Reading a delete block from file " + logFile.getPath()); + if (isNewInstantBlock(logBlock) && !readBlocksLazily) { + // If this is a delete data block belonging to a different commit/instant, + // then merge the last blocks and records into the main result + processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keySpecOpt); + } + // store deletes so can be rolled back + currentInstantLogBlocks.push(logBlock); + break; + case COMMAND_BLOCK: + // Consider the following scenario + // (Time 0, C1, Task T1) -> Running + // (Time 1, C1, Task T1) -> Failed (Wrote either a corrupt block or a correct + // DataBlock (B1) with commitTime C1 + // (Time 2, C1, Task T1.2) -> Running (Task T1 was retried and the attempt number is 2) + // (Time 3, C1, Task T1.2) -> Finished (Wrote a correct DataBlock B2) + // Now a logFile L1 can have 2 correct Datablocks (B1 and B2) which are the same. + // Say, commit C1 eventually failed and a rollback is triggered. 
+ // Rollback will write only 1 rollback block (R1) since it assumes one block is + // written per ingestion batch for a file but in reality we need to rollback (B1 & B2) + // The following code ensures the same rollback block (R1) is used to rollback + // both B1 & B2 + LOG.info("Reading a command block from file " + logFile.getPath()); + // This is a command block - take appropriate action based on the command + HoodieCommandBlock commandBlock = (HoodieCommandBlock) logBlock; + String targetInstantForCommandBlock = + logBlock.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME); + switch (commandBlock.getType()) { // there can be different types of command blocks + case ROLLBACK_PREVIOUS_BLOCK: + // Rollback the last read log block + // Get commit time from last record block, compare with targetCommitTime, + // rollback only if equal, this is required in scenarios of invalid/extra + // rollback blocks written due to failures during the rollback operation itself + // and ensures the same rollback block (R1) is used to rollback both B1 & B2 with + // same instant_time + int numBlocksRolledBack = 0; + totalRollbacks.incrementAndGet(); + while (!currentInstantLogBlocks.isEmpty()) { + HoodieLogBlock lastBlock = currentInstantLogBlocks.peek(); + // handle corrupt blocks separately since they may not have metadata + if (lastBlock.getBlockType() == CORRUPT_BLOCK) { + LOG.info("Rolling back the last corrupted log block read in " + logFile.getPath()); + currentInstantLogBlocks.pop(); + numBlocksRolledBack++; + } else if (targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) { + // rollback last data block or delete block + LOG.info("Rolling back the last log block read in " + logFile.getPath()); + currentInstantLogBlocks.pop(); + numBlocksRolledBack++; + } else if (!targetInstantForCommandBlock + .contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME))) { + // invalid or extra rollback block + LOG.warn("TargetInstantTime " + targetInstantForCommandBlock + + " invalid or extra rollback command block in " + logFile.getPath()); + break; + } else { + // this should not happen ideally + LOG.warn("Unable to apply rollback command block in " + logFile.getPath()); + } + } + LOG.info("Number of applied rollback blocks " + numBlocksRolledBack); + break; + default: + throw new UnsupportedOperationException("Command type not yet supported."); + } + break; + case CORRUPT_BLOCK: + LOG.info("Found a corrupt block in " + logFile.getPath()); + totalCorruptBlocks.incrementAndGet(); + // If there is a corrupt block - we will assume that this was the next data block + currentInstantLogBlocks.push(logBlock); + break; + default: + throw new UnsupportedOperationException("Block type not supported yet"); + } + } + // merge the last read block when all the blocks are done reading + if (!currentInstantLogBlocks.isEmpty()) { + LOG.info("Merging the final data blocks"); + processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keySpecOpt); + } + // Done + progress = 1.0f; + } catch (IOException e) { + LOG.error("Got IOException when reading log file", e); + throw new HoodieIOException("IOException when reading log file ", e); + } catch (Exception e) { + LOG.error("Got exception when reading log file", e); + throw new HoodieException("Exception when reading log file ", e); + } finally { + try { + if (null != logFormatReaderWrapper) { + logFormatReaderWrapper.close(); + } + } catch (IOException ioe) { + // Eat 
exception as we do not want to mask the original exception that can happen + LOG.error("Unable to close log format reader", ioe); + } + } + } + + /** + * Checks if the current logblock belongs to a later instant. + */ + private boolean isNewInstantBlock(HoodieLogBlock logBlock) { + return currentInstantLogBlocks.size() > 0 && currentInstantLogBlocks.peek().getBlockType() != CORRUPT_BLOCK + && !logBlock.getLogBlockHeader().get(INSTANT_TIME) + .contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME)); + } + + /** + * Iterate over the GenericRecord in the block, read the hoodie key and partition path and call subclass processors to + * handle it. + */ + private void processDataBlock(HoodieDataBlock dataBlock, Option keySpecOpt) throws Exception { + try (ClosableIterator recordIterator = getRecordsIterator(dataBlock, keySpecOpt)) { + Option schemaOption = getMergedSchema(dataBlock); + while (recordIterator.hasNext()) { + IndexedRecord currentRecord = recordIterator.next(); + IndexedRecord record = schemaOption.isPresent() ? HoodieAvroUtils.rewriteRecordWithNewSchema(currentRecord, schemaOption.get(), Collections.emptyMap()) : currentRecord; + processNextRecord(createHoodieRecord(record, this.hoodieTableMetaClient.getTableConfig(), this.payloadClassFQN, + this.preCombineField, this.withOperationField, this.simpleKeyGenFields, this.partitionName)); + totalLogRecords.incrementAndGet(); + } + } + } + + /** + * Get final Read Schema for support evolution. + * step1: find the fileSchema for current dataBlock. + * step2: determine whether fileSchema is compatible with the final read internalSchema. + * step3: merge fileSchema and read internalSchema to produce final read schema. + * + * @param dataBlock current processed block + * @return final read schema. + */ + private Option getMergedSchema(HoodieDataBlock dataBlock) { + Option result = Option.empty(); + if (!internalSchema.isEmptySchema()) { + Long currentInstantTime = Long.parseLong(dataBlock.getLogBlockHeader().get(INSTANT_TIME)); + InternalSchema fileSchema = InternalSchemaCache + .searchSchemaAndCache(currentInstantTime, hoodieTableMetaClient, false); + Schema mergeSchema = AvroInternalSchemaConverter + .convert(new InternalSchemaMerger(fileSchema, internalSchema, true, false).mergeSchema(), readerSchema.getName()); + result = Option.of(mergeSchema); + } + return result; + } + + /** + * Create @{@link HoodieRecord} from the @{@link IndexedRecord}. 
+ * + * @param rec - IndexedRecord to create the HoodieRecord from + * @param hoodieTableConfig - Table config + * @param payloadClassFQN - Payload class fully qualified name + * @param preCombineField - PreCombine field + * @param withOperationField - Whether operation field is enabled + * @param simpleKeyGenFields - Key generator fields when populate meta fields is tuened off + * @param partitionName - Partition name + * @return HoodieRecord created from the IndexedRecord + */ + protected HoodieAvroRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, + final String payloadClassFQN, final String preCombineField, + final boolean withOperationField, + final Option> simpleKeyGenFields, + final Option partitionName) { + if (this.populateMetaFields) { + return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, payloadClassFQN, + preCombineField, withOperationField); + } else { + return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, payloadClassFQN, + preCombineField, simpleKeyGenFields.get(), withOperationField, partitionName); + } + } + + /** + * Process next record. + * + * @param hoodieRecord Hoodie Record to process + */ + protected abstract void processNextRecord(HoodieRecord hoodieRecord) throws Exception; + + /** + * Process next deleted record. + * + * @param deleteRecord Deleted record(hoodie key and ordering value) + */ + protected abstract void processNextDeletedRecord(DeleteRecord deleteRecord); + + /** + * Process the set of log blocks belonging to the last instant which is read fully. + */ + private void processQueuedBlocksForInstant(Deque logBlocks, int numLogFilesSeen, + Option keySpecOpt) throws Exception { + while (!logBlocks.isEmpty()) { + LOG.info("Number of remaining logblocks to merge " + logBlocks.size()); + // poll the element at the bottom of the stack since that's the order it was inserted + HoodieLogBlock lastBlock = logBlocks.pollLast(); + switch (lastBlock.getBlockType()) { + case AVRO_DATA_BLOCK: + processDataBlock((HoodieAvroDataBlock) lastBlock, keySpecOpt); + break; + case HFILE_DATA_BLOCK: + processDataBlock((HoodieHFileDataBlock) lastBlock, keySpecOpt); + break; + case PARQUET_DATA_BLOCK: + processDataBlock((HoodieParquetDataBlock) lastBlock, keySpecOpt); + break; + case DELETE_BLOCK: + Arrays.stream(((HoodieDeleteBlock) lastBlock).getRecordsToDelete()).forEach(this::processNextDeletedRecord); + break; + case CORRUPT_BLOCK: + LOG.warn("Found a corrupt block which was not rolled back"); + break; + default: + break; + } + } + // At this step the lastBlocks are consumed. We track approximate progress by number of log-files seen + progress = (numLogFilesSeen - 1) / logFilePaths.size(); + } + + private ClosableIterator getRecordsIterator(HoodieDataBlock dataBlock, Option keySpecOpt) throws IOException { + if (keySpecOpt.isPresent()) { + KeySpec keySpec = keySpecOpt.get(); + return dataBlock.getRecordIterator(keySpec.keys, keySpec.fullKey); + } + + return dataBlock.getRecordIterator(); + } + + /** + * Return progress of scanning as a float between 0.0 to 1.0. 
+ */ + public float getProgress() { + return progress; + } + + public long getTotalLogFiles() { + return totalLogFiles.get(); + } + + public long getTotalLogRecords() { + return totalLogRecords.get(); + } + + public long getTotalLogBlocks() { + return totalLogBlocks.get(); + } + + protected String getPayloadClassFQN() { + return payloadClassFQN; + } + + public Option getPartitionName() { + return partitionName; + } + + public long getTotalRollbacks() { + return totalRollbacks.get(); + } + + public long getTotalCorruptBlocks() { + return totalCorruptBlocks.get(); + } + + public boolean isWithOperationField() { + return withOperationField; + } + + protected static class KeySpec { + private final List keys; + private final boolean fullKey; + + public KeySpec(List keys, boolean fullKey) { + this.keys = keys; + this.fullKey = fullKey; + } + } + + /** + * Builder used to build {@code AbstractHoodieLogRecordScanner}. + */ + public abstract static class Builder { + + public abstract Builder withFileSystem(FileSystem fs); + + public abstract Builder withBasePath(String basePath); + + public abstract Builder withLogFilePaths(List logFilePaths); + + public abstract Builder withReaderSchema(Schema schema); + + public abstract Builder withLatestInstantTime(String latestInstantTime); + + public abstract Builder withReadBlocksLazily(boolean readBlocksLazily); + + public abstract Builder withReverseReader(boolean reverseReader); + + public abstract Builder withBufferSize(int bufferSize); + + public Builder withPartition(String partitionName) { + throw new UnsupportedOperationException(); + } + + public Builder withInstantRange(Option instantRange) { + throw new UnsupportedOperationException(); + } + + public Builder withOperationField(boolean withOperationField) { + throw new UnsupportedOperationException(); + } + + public abstract AbstractHoodieLogRecordReader build(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java deleted file mode 100644 index 4ae709eda1de7..0000000000000 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.common.table.log; - -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; -import org.apache.hudi.common.table.log.block.HoodieCommandBlock; -import org.apache.hudi.common.table.log.block.HoodieDataBlock; -import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; -import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; -import org.apache.hudi.common.table.log.block.HoodieLogBlock; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.SpillableMapUtils; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.util.ArrayDeque; -import java.util.Arrays; -import java.util.Deque; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; - -import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME; -import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK; - -/** - * Implements logic to scan log blocks and expose valid and deleted log records to subclass implementation. Subclass is - * free to either apply merging or expose raw data back to the caller. - * - * NOTE: If readBlockLazily is turned on, does not merge, instead keeps reading log blocks and merges everything at once - * This is an optimization to avoid seek() back and forth to read new block (forward seek()) and lazily read content of - * seen block (reverse and forward seek()) during merge | | Read Block 1 Metadata | | Read Block 1 Data | | | Read Block - * 2 Metadata | | Read Block 2 Data | | I/O Pass 1 | ..................... | I/O Pass 2 | ................. | | | Read - * Block N Metadata | | Read Block N Data | - *

    - * This results in two I/O passes over the log file. - */ -public abstract class AbstractHoodieLogRecordScanner { - - private static final Logger LOG = LogManager.getLogger(AbstractHoodieLogRecordScanner.class); - - // Reader schema for the records - protected final Schema readerSchema; - // Latest valid instant time - // Log-Blocks belonging to inflight delta-instants are filtered-out using this high-watermark. - private final String latestInstantTime; - private final HoodieTableMetaClient hoodieTableMetaClient; - // Merge strategy to use when combining records from log - private final String payloadClassFQN; - // Log File Paths - private final List logFilePaths; - // Read Lazily flag - private final boolean readBlocksLazily; - // Reverse reader - Not implemented yet (NA -> Why do we need ?) - // but present here for plumbing for future implementation - private final boolean reverseReader; - // Buffer Size for log file reader - private final int bufferSize; - // FileSystem - private final FileSystem fs; - // Total log files read - for metrics - private AtomicLong totalLogFiles = new AtomicLong(0); - // Total log blocks read - for metrics - private AtomicLong totalLogBlocks = new AtomicLong(0); - // Total log records read - for metrics - private AtomicLong totalLogRecords = new AtomicLong(0); - // Total number of rollbacks written across all log files - private AtomicLong totalRollbacks = new AtomicLong(0); - // Total number of corrupt blocks written across all log files - private AtomicLong totalCorruptBlocks = new AtomicLong(0); - // Store the last instant log blocks (needed to implement rollback) - private Deque currentInstantLogBlocks = new ArrayDeque<>(); - // Progress - private float progress = 0.0f; - - public AbstractHoodieLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, - String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize) { - this.readerSchema = readerSchema; - this.latestInstantTime = latestInstantTime; - this.hoodieTableMetaClient = new HoodieTableMetaClient(fs.getConf(), basePath); - // load class from the payload fully qualified class name - this.payloadClassFQN = this.hoodieTableMetaClient.getTableConfig().getPayloadClass(); - this.totalLogFiles.addAndGet(logFilePaths.size()); - this.logFilePaths = logFilePaths; - this.readBlocksLazily = readBlocksLazily; - this.reverseReader = reverseReader; - this.fs = fs; - this.bufferSize = bufferSize; - } - - /** - * Scan Log files. 
- */ - public void scan() { - HoodieLogFormatReader logFormatReaderWrapper = null; - try { - // iterate over the paths - logFormatReaderWrapper = new HoodieLogFormatReader(fs, - logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()), - readerSchema, readBlocksLazily, reverseReader, bufferSize); - Set scannedLogFiles = new HashSet<>(); - while (logFormatReaderWrapper.hasNext()) { - HoodieLogFile logFile = logFormatReaderWrapper.getLogFile(); - LOG.info("Scanning log file " + logFile); - scannedLogFiles.add(logFile); - totalLogFiles.set(scannedLogFiles.size()); - // Use the HoodieLogFileReader to iterate through the blocks in the log file - HoodieLogBlock r = logFormatReaderWrapper.next(); - totalLogBlocks.incrementAndGet(); - if (r.getBlockType() != CORRUPT_BLOCK - && !HoodieTimeline.compareTimestamps(r.getLogBlockHeader().get(INSTANT_TIME), HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime - )) { - // hit a block with instant time greater than should be processed, stop processing further - break; - } - switch (r.getBlockType()) { - case HFILE_DATA_BLOCK: - case AVRO_DATA_BLOCK: - LOG.info("Reading a data block from file " + logFile.getPath()); - if (isNewInstantBlock(r) && !readBlocksLazily) { - // If this is an avro data block belonging to a different commit/instant, - // then merge the last blocks and records into the main result - processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size()); - } - // store the current block - currentInstantLogBlocks.push(r); - break; - case DELETE_BLOCK: - LOG.info("Reading a delete block from file " + logFile.getPath()); - if (isNewInstantBlock(r) && !readBlocksLazily) { - // If this is a delete data block belonging to a different commit/instant, - // then merge the last blocks and records into the main result - processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size()); - } - // store deletes so can be rolled back - currentInstantLogBlocks.push(r); - break; - case COMMAND_BLOCK: - // Consider the following scenario - // (Time 0, C1, Task T1) -> Running - // (Time 1, C1, Task T1) -> Failed (Wrote either a corrupt block or a correct - // DataBlock (B1) with commitTime C1 - // (Time 2, C1, Task T1.2) -> Running (Task T1 was retried and the attempt number is 2) - // (Time 3, C1, Task T1.2) -> Finished (Wrote a correct DataBlock B2) - // Now a logFile L1 can have 2 correct Datablocks (B1 and B2) which are the same. - // Say, commit C1 eventually failed and a rollback is triggered. 
- // Rollback will write only 1 rollback block (R1) since it assumes one block is - // written per ingestion batch for a file but in reality we need to rollback (B1 & B2) - // The following code ensures the same rollback block (R1) is used to rollback - // both B1 & B2 - LOG.info("Reading a command block from file " + logFile.getPath()); - // This is a command block - take appropriate action based on the command - HoodieCommandBlock commandBlock = (HoodieCommandBlock) r; - String targetInstantForCommandBlock = - r.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME); - switch (commandBlock.getType()) { // there can be different types of command blocks - case ROLLBACK_PREVIOUS_BLOCK: - // Rollback the last read log block - // Get commit time from last record block, compare with targetCommitTime, - // rollback only if equal, this is required in scenarios of invalid/extra - // rollback blocks written due to failures during the rollback operation itself - // and ensures the same rollback block (R1) is used to rollback both B1 & B2 with - // same instant_time - int numBlocksRolledBack = 0; - totalRollbacks.incrementAndGet(); - while (!currentInstantLogBlocks.isEmpty()) { - HoodieLogBlock lastBlock = currentInstantLogBlocks.peek(); - // handle corrupt blocks separately since they may not have metadata - if (lastBlock.getBlockType() == CORRUPT_BLOCK) { - LOG.info("Rolling back the last corrupted log block read in " + logFile.getPath()); - currentInstantLogBlocks.pop(); - numBlocksRolledBack++; - } else if (lastBlock.getBlockType() != CORRUPT_BLOCK - && targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) { - // rollback last data block or delete block - LOG.info("Rolling back the last log block read in " + logFile.getPath()); - currentInstantLogBlocks.pop(); - numBlocksRolledBack++; - } else if (!targetInstantForCommandBlock - .contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME))) { - // invalid or extra rollback block - LOG.warn("TargetInstantTime " + targetInstantForCommandBlock - + " invalid or extra rollback command block in " + logFile.getPath()); - break; - } else { - // this should not happen ideally - LOG.warn("Unable to apply rollback command block in " + logFile.getPath()); - } - } - LOG.info("Number of applied rollback blocks " + numBlocksRolledBack); - break; - default: - throw new UnsupportedOperationException("Command type not yet supported."); - } - break; - case CORRUPT_BLOCK: - LOG.info("Found a corrupt block in " + logFile.getPath()); - totalCorruptBlocks.incrementAndGet(); - // If there is a corrupt block - we will assume that this was the next data block - currentInstantLogBlocks.push(r); - break; - default: - throw new UnsupportedOperationException("Block type not supported yet"); - } - } - // merge the last read block when all the blocks are done reading - if (!currentInstantLogBlocks.isEmpty()) { - LOG.info("Merging the final data blocks"); - processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size()); - } - // Done - progress = 1.0f; - } catch (IOException e) { - LOG.error("Got IOException when reading log file", e); - throw new HoodieIOException("IOException when reading log file ", e); - } catch (Exception e) { - LOG.error("Got exception when reading log file", e); - throw new HoodieException("Exception when reading log file ", e); - } finally { - try { - if (null != logFormatReaderWrapper) { - logFormatReaderWrapper.close(); - } - } catch (IOException ioe) 
{ - // Eat exception as we do not want to mask the original exception that can happen - LOG.error("Unable to close log format reader", ioe); - } - } - } - - /** - * Checks if the current logblock belongs to a later instant. - */ - private boolean isNewInstantBlock(HoodieLogBlock logBlock) { - return currentInstantLogBlocks.size() > 0 && currentInstantLogBlocks.peek().getBlockType() != CORRUPT_BLOCK - && !logBlock.getLogBlockHeader().get(INSTANT_TIME) - .contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME)); - } - - /** - * Iterate over the GenericRecord in the block, read the hoodie key and partition path and call subclass processors to - * handle it. - */ - private void processDataBlock(HoodieDataBlock dataBlock) throws Exception { - // TODO (NA) - Implement getRecordItr() in HoodieAvroDataBlock and use that here - List recs = dataBlock.getRecords(); - totalLogRecords.addAndGet(recs.size()); - for (IndexedRecord rec : recs) { - HoodieRecord hoodieRecord = - SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, this.payloadClassFQN); - processNextRecord(hoodieRecord); - } - } - - /** - * Process next record. - * - * @param hoodieRecord Hoodie Record to process - */ - protected abstract void processNextRecord(HoodieRecord hoodieRecord) throws Exception; - - /** - * Process next deleted key. - * - * @param key Deleted record key - */ - protected abstract void processNextDeletedKey(HoodieKey key); - - /** - * Process the set of log blocks belonging to the last instant which is read fully. - */ - private void processQueuedBlocksForInstant(Deque lastBlocks, int numLogFilesSeen) throws Exception { - while (!lastBlocks.isEmpty()) { - LOG.info("Number of remaining logblocks to merge " + lastBlocks.size()); - // poll the element at the bottom of the stack since that's the order it was inserted - HoodieLogBlock lastBlock = lastBlocks.pollLast(); - switch (lastBlock.getBlockType()) { - case AVRO_DATA_BLOCK: - processDataBlock((HoodieAvroDataBlock) lastBlock); - break; - case HFILE_DATA_BLOCK: - processDataBlock((HoodieHFileDataBlock) lastBlock); - break; - case DELETE_BLOCK: - Arrays.stream(((HoodieDeleteBlock) lastBlock).getKeysToDelete()).forEach(this::processNextDeletedKey); - break; - case CORRUPT_BLOCK: - LOG.warn("Found a corrupt block which was not rolled back"); - break; - default: - break; - } - } - // At this step the lastBlocks are consumed. We track approximate progress by number of log-files seen - progress = numLogFilesSeen - 1 / logFilePaths.size(); - } - - /** - * Return progress of scanning as a float between 0.0 to 1.0. - */ - public float getProgress() { - return progress; - } - - public long getTotalLogFiles() { - return totalLogFiles.get(); - } - - public long getTotalLogRecords() { - return totalLogRecords.get(); - } - - public long getTotalLogBlocks() { - return totalLogBlocks.get(); - } - - protected String getPayloadClassFQN() { - return payloadClassFQN; - } - - public long getTotalRollbacks() { - return totalRollbacks.get(); - } - - public long getTotalCorruptBlocks() { - return totalCorruptBlocks.get(); - } - - /** - * Builder used to build {@code AbstractHoodieLogRecordScanner}. 
- */ - public abstract static class Builder { - - public abstract Builder withFileSystem(FileSystem fs); - - public abstract Builder withBasePath(String basePath); - - public abstract Builder withLogFilePaths(List logFilePaths); - - public abstract Builder withReaderSchema(Schema schema); - - public abstract Builder withLatestInstantTime(String latestInstantTime); - - public abstract Builder withReadBlocksLazily(boolean readBlocksLazily); - - public abstract Builder withReverseReader(boolean reverseReader); - - public abstract Builder withBufferSize(int bufferSize); - - public abstract AbstractHoodieLogRecordScanner build(); - } -} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AppendResult.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AppendResult.java new file mode 100644 index 0000000000000..8246edada9161 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AppendResult.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.log; + +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; + +/** + * Pojo holding information on the result of a {@link org.apache.hudi.common.table.log.HoodieLogFormat.Writer#appendBlock(HoodieLogBlock)}. 
+ */ +public class AppendResult { + + private final HoodieLogFile logFile; + private final long offset; + private final long size; + + public AppendResult(HoodieLogFile logFile, long offset, long size) { + this.logFile = logFile; + this.offset = offset; + this.size = size; + } + + public HoodieLogFile logFile() { + return logFile; + } + + public long offset() { + return offset; + } + + public long size() { + return size; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java index 14d523ad9825e..11d9e75f4b186 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java @@ -18,8 +18,12 @@ package org.apache.hudi.common.table.log; +import org.apache.hudi.common.fs.BoundedFsDataInputStream; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.SchemeAwareFSDataInputStream; +import org.apache.hudi.common.fs.TimedFSDataInputStream; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; import org.apache.hudi.common.table.log.block.HoodieCommandBlock; import org.apache.hudi.common.table.log.block.HoodieCorruptBlock; @@ -28,20 +32,25 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.CorruptedLogFileException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieNotSupportedException; +import org.apache.hudi.internal.schema.InternalSchema; import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BufferedFSInputStream; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSInputStream; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hbase.util.Bytes; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import javax.annotation.Nullable; + import java.io.EOFException; import java.io.IOException; import java.util.Arrays; @@ -49,6 +58,9 @@ import java.util.Map; import java.util.Objects; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.common.util.ValidationUtils.checkState; + /** * Scans a log file and provides block level iterator on the log file Loads the entire block contents in memory Can emit * either a DataBlock, CommandBlock, DeleteBlock or CorruptBlock (if one is found). 
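// A minimal usage sketch for the new AppendResult type introduced above, assuming a
// HoodieLogFormat.Writer and a HoodieLogBlock constructed elsewhere (their builders are
// not part of this diff). Only the accessors defined in AppendResult.java are relied on;
// the method name and its placement here are illustrative, not part of the patch.
private static String describeAppend(HoodieLogFormat.Writer writer, HoodieLogBlock block)
    throws IOException, InterruptedException {
  AppendResult result = writer.appendBlock(block);
  HoodieLogFile logFile = result.logFile(); // log file the block was appended to
  long startOffset = result.offset();       // byte offset at which the appended block starts
  long appendedSize = result.size();        // number of bytes written for this block
  return "Appended block to " + logFile.getPath()
      + " at offset " + startOffset + " (" + appendedSize + " bytes)";
}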
@@ -56,48 +68,61 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader { public static final int DEFAULT_BUFFER_SIZE = 16 * 1024 * 1024; // 16 MB + private static final int BLOCK_SCAN_READ_BUFFER_SIZE = 1024 * 1024; // 1 MB private static final Logger LOG = LogManager.getLogger(HoodieLogFileReader.class); + private final Configuration hadoopConf; private final FSDataInputStream inputStream; private final HoodieLogFile logFile; private final byte[] magicBuffer = new byte[6]; private final Schema readerSchema; + private InternalSchema internalSchema; + private final String keyField; private boolean readBlockLazily; private long reverseLogFilePosition; private long lastReverseLogFilePosition; private boolean reverseReader; + private boolean enableRecordLookups; private boolean closed = false; private transient Thread shutdownThread = null; public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, - boolean readBlockLazily, boolean reverseReader) throws IOException { - FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize); - if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { - this.inputStream = new FSDataInputStream( - new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)); - } else { - // fsDataInputStream.getWrappedStream() maybe a BufferedFSInputStream - // need to wrap in another BufferedFSInputStream the make bufferSize work? - this.inputStream = fsDataInputStream; - } + boolean readBlockLazily) throws IOException { + this(fs, logFile, readerSchema, bufferSize, readBlockLazily, false); + } + + public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, + boolean readBlockLazily, boolean reverseReader) throws IOException { + this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, false, + HoodieRecord.RECORD_KEY_METADATA_FIELD); + } - this.logFile = logFile; + public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, + boolean readBlockLazily, boolean reverseReader, boolean enableRecordLookups, + String keyField) throws IOException { + this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, enableRecordLookups, keyField, InternalSchema.getEmptyInternalSchema()); + } + + public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, + boolean readBlockLazily, boolean reverseReader, boolean enableRecordLookups, + String keyField, InternalSchema internalSchema) throws IOException { + this.hadoopConf = fs.getConf(); + // NOTE: We repackage {@code HoodieLogFile} here to make sure that the provided path + // is prefixed with an appropriate scheme given that we're not propagating the FS + // further + this.logFile = new HoodieLogFile(FSUtils.makeQualified(fs, logFile.getPath()), logFile.getFileSize()); + this.inputStream = getFSDataInputStream(fs, this.logFile, bufferSize); this.readerSchema = readerSchema; this.readBlockLazily = readBlockLazily; this.reverseReader = reverseReader; + this.enableRecordLookups = enableRecordLookups; + this.keyField = keyField; + this.internalSchema = internalSchema == null ? 
InternalSchema.getEmptyInternalSchema() : internalSchema; if (this.reverseReader) { - this.reverseLogFilePosition = this.lastReverseLogFilePosition = fs.getFileStatus(logFile.getPath()).getLen(); + this.reverseLogFilePosition = this.lastReverseLogFilePosition = this.logFile.getFileSize(); } - addShutDownHook(); - } - - public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, boolean readBlockLazily, - boolean reverseReader) throws IOException { - this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, readBlockLazily, reverseReader); - } - public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException { - this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false); + addShutDownHook(); } @Override @@ -123,119 +148,136 @@ private void addShutDownHook() { // TODO : convert content and block length to long by using ByteBuffer, raw byte [] allows // for max of Integer size private HoodieLogBlock readBlock() throws IOException { - - int blocksize; - int type; - HoodieLogBlockType blockType = null; - Map header = null; - + int blockSize; + long blockStartPos = inputStream.getPos(); try { // 1 Read the total size of the block - blocksize = (int) inputStream.readLong(); + blockSize = (int) inputStream.readLong(); } catch (EOFException | CorruptedLogFileException e) { // An exception reading any of the above indicates a corrupt block // Create a corrupt block by finding the next MAGIC marker or EOF - return createCorruptBlock(); + return createCorruptBlock(blockStartPos); } // We may have had a crash which could have written this block partially - // Skip blocksize in the stream and we should either find a sync marker (start of the next + // Skip blockSize in the stream and we should either find a sync marker (start of the next // block) or EOF. If we did not find either of it, then this block is a corrupted block. - boolean isCorrupted = isBlockCorrupt(blocksize); + boolean isCorrupted = isBlockCorrupted(blockSize); if (isCorrupted) { - return createCorruptBlock(); + return createCorruptBlock(blockStartPos); } // 2. Read the version for this log format HoodieLogFormat.LogFormatVersion nextBlockVersion = readVersion(); // 3. Read the block type for a log block - if (nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION) { - type = inputStream.readInt(); - - ValidationUtils.checkArgument(type < HoodieLogBlockType.values().length, "Invalid block byte type found " + type); - blockType = HoodieLogBlockType.values()[type]; - } + HoodieLogBlockType blockType = tryReadBlockType(nextBlockVersion); // 4. Read the header for a log block, if present - if (nextBlockVersion.hasHeader()) { - header = HoodieLogBlock.getLogMetadata(inputStream); - } - int contentLength = blocksize; + Map header = + nextBlockVersion.hasHeader() ? HoodieLogBlock.getLogMetadata(inputStream) : null; + // 5. Read the content length for the content - if (nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION) { - contentLength = (int) inputStream.readLong(); - } + // Fallback to full-block size if no content-length + // TODO replace w/ hasContentLength + int contentLength = + nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION ? (int) inputStream.readLong() : blockSize; // 6. 
Read the content or skip content based on IO vs Memory trade-off by client - // TODO - have a max block size and reuse this buffer in the ByteBuffer - // (hard to guess max block size for now) long contentPosition = inputStream.getPos(); - byte[] content = HoodieLogBlock.readOrSkipContent(inputStream, contentLength, readBlockLazily); + boolean shouldReadLazily = readBlockLazily && nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION; + Option content = HoodieLogBlock.tryReadContent(inputStream, contentLength, shouldReadLazily); // 7. Read footer if any - Map footer = null; - if (nextBlockVersion.hasFooter()) { - footer = HoodieLogBlock.getLogMetadata(inputStream); - } + Map footer = + nextBlockVersion.hasFooter() ? HoodieLogBlock.getLogMetadata(inputStream) : null; // 8. Read log block length, if present. This acts as a reverse pointer when traversing a // log file in reverse - @SuppressWarnings("unused") - long logBlockLength = 0; if (nextBlockVersion.hasLogBlockLength()) { - logBlockLength = inputStream.readLong(); + inputStream.readLong(); } // 9. Read the log block end position in the log file long blockEndPos = inputStream.getPos(); + HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = + new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, contentLength, blockEndPos); + switch (Objects.requireNonNull(blockType)) { - // based on type read the block case AVRO_DATA_BLOCK: if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) { - return HoodieAvroDataBlock.getBlock(content, readerSchema); + return HoodieAvroDataBlock.getBlock(content.get(), readerSchema, internalSchema); } else { - return new HoodieAvroDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, readerSchema, header, footer); + return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, + Option.ofNullable(readerSchema), header, footer, keyField, internalSchema); } + case HFILE_DATA_BLOCK: - return new HoodieHFileDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, readerSchema, header, footer); + checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, + String.format("HFile block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION)); + + return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, + Option.ofNullable(readerSchema), header, footer, enableRecordLookups, logFile.getPath()); + + case PARQUET_DATA_BLOCK: + checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, + String.format("Parquet block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION)); + + return new HoodieParquetDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, + Option.ofNullable(readerSchema), header, footer, keyField); + case DELETE_BLOCK: - return HoodieDeleteBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, header, footer); + return new HoodieDeleteBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer); + case COMMAND_BLOCK: - return HoodieCommandBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, header, footer); + return new HoodieCommandBlock(content, inputStream, 
readBlockLazily, Option.of(logBlockContentLoc), header, footer); + default: throw new HoodieNotSupportedException("Unsupported Block " + blockType); } } - private HoodieLogBlock createCorruptBlock() throws IOException { - LOG.info("Log " + logFile + " has a corrupted block at " + inputStream.getPos()); - long currentPos = inputStream.getPos(); + @Nullable + private HoodieLogBlockType tryReadBlockType(HoodieLogFormat.LogFormatVersion blockVersion) throws IOException { + if (blockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) { + return null; + } + + int type = inputStream.readInt(); + checkArgument(type < HoodieLogBlockType.values().length, "Invalid block byte type found " + type); + return HoodieLogBlockType.values()[type]; + } + + private HoodieLogBlock createCorruptBlock(long blockStartPos) throws IOException { + LOG.info("Log " + logFile + " has a corrupted block at " + blockStartPos); + inputStream.seek(blockStartPos); long nextBlockOffset = scanForNextAvailableBlockOffset(); // Rewind to the initial start and read corrupted bytes till the nextBlockOffset - inputStream.seek(currentPos); + inputStream.seek(blockStartPos); LOG.info("Next available block in " + logFile + " starts at " + nextBlockOffset); - int corruptedBlockSize = (int) (nextBlockOffset - currentPos); + int corruptedBlockSize = (int) (nextBlockOffset - blockStartPos); long contentPosition = inputStream.getPos(); - byte[] corruptedBytes = HoodieLogBlock.readOrSkipContent(inputStream, corruptedBlockSize, readBlockLazily); - return HoodieCorruptBlock.getBlock(logFile, inputStream, Option.ofNullable(corruptedBytes), readBlockLazily, - contentPosition, corruptedBlockSize, corruptedBlockSize, new HashMap<>(), new HashMap<>()); + Option corruptedBytes = HoodieLogBlock.tryReadContent(inputStream, corruptedBlockSize, readBlockLazily); + HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = + new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, corruptedBlockSize, nextBlockOffset); + return new HoodieCorruptBlock(corruptedBytes, inputStream, readBlockLazily, Option.of(logBlockContentLoc), new HashMap<>(), new HashMap<>()); } - private boolean isBlockCorrupt(int blocksize) throws IOException { + private boolean isBlockCorrupted(int blocksize) throws IOException { long currentPos = inputStream.getPos(); + long blockSizeFromFooter; + try { - if (FSUtils.isGCSInputStream(inputStream)) { - inputStream.seek(currentPos + blocksize - 1); - } else { - inputStream.seek(currentPos + blocksize); - } + // check if the blocksize mentioned in the footer is the same as the header; + // by seeking and checking the length of a long. We do not seek `currentPos + blocksize` + // which can be the file size for the last block in the file, causing EOFException + // for some FSDataInputStream implementation + inputStream.seek(currentPos + blocksize - Long.BYTES); + // Block size in the footer includes the magic header, which the header does not include. 
+ // So we have to shorten the footer block size by the size of magic hash + blockSizeFromFooter = inputStream.readLong() - magicBuffer.length; } catch (EOFException e) { LOG.info("Found corrupted block in file " + logFile + " with block size(" + blocksize + ") running past EOF"); // this is corrupt @@ -246,19 +288,13 @@ private boolean isBlockCorrupt(int blocksize) throws IOException { return true; } - // check if the blocksize mentioned in the footer is the same as the header; by seeking back the length of a long - // the backward seek does not incur additional IO as {@link org.apache.hadoop.hdfs.DFSInputStream#seek()} - // only moves the index. actual IO happens on the next read operation - inputStream.seek(inputStream.getPos() - Long.BYTES); - // Block size in the footer includes the magic header, which the header does not include. - // So we have to shorten the footer block size by the size of magic hash - long blockSizeFromFooter = inputStream.readLong() - magicBuffer.length; if (blocksize != blockSizeFromFooter) { LOG.info("Found corrupted block in file " + logFile + ". Header block size(" + blocksize - + ") did not match the footer block size(" + blockSizeFromFooter + ")"); + + ") did not match the footer block size(" + blockSizeFromFooter + ")"); inputStream.seek(currentPos); return true; } + try { readMagic(); // all good - either we found the sync marker or EOF. Reset position and continue @@ -273,19 +309,25 @@ private boolean isBlockCorrupt(int blocksize) throws IOException { } private long scanForNextAvailableBlockOffset() throws IOException { + // Make buffer large enough to scan through the file as quick as possible especially if it is on S3/GCS. + byte[] dataBuf = new byte[BLOCK_SCAN_READ_BUFFER_SIZE]; + boolean eof = false; while (true) { long currentPos = inputStream.getPos(); try { - boolean hasNextMagic = hasNextMagic(); - if (hasNextMagic) { - return currentPos; - } else { - // No luck - advance and try again - inputStream.seek(currentPos + 1); - } + Arrays.fill(dataBuf, (byte) 0); + inputStream.readFully(dataBuf, 0, dataBuf.length); } catch (EOFException e) { + eof = true; + } + long pos = Bytes.indexOf(dataBuf, HoodieLogFormat.MAGIC); + if (pos >= 0) { + return currentPos + pos; + } + if (eof) { return inputStream.getPos(); } + inputStream.seek(currentPos + dataBuf.length - HoodieLogFormat.MAGIC.length); } } @@ -420,4 +462,63 @@ public long moveToPrev() throws IOException { public void remove() { throw new UnsupportedOperationException("Remove not supported for HoodieLogFileReader"); } + + /** + * Fetch the right {@link FSDataInputStream} to be used by wrapping with required input streams. + * @param fs instance of {@link FileSystem} in use. + * @param bufferSize buffer size to be used. + * @return the right {@link FSDataInputStream} as required. 
+ */ + private static FSDataInputStream getFSDataInputStream(FileSystem fs, + HoodieLogFile logFile, + int bufferSize) throws IOException { + FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize); + + if (FSUtils.isGCSFileSystem(fs)) { + // in GCS FS, we might need to interceptor seek offsets as we might get EOF exception + return new SchemeAwareFSDataInputStream(getFSDataInputStreamForGCS(fsDataInputStream, logFile, bufferSize), true); + } + + if (FSUtils.isCHDFileSystem(fs)) { + return new BoundedFsDataInputStream(fs, logFile.getPath(), fsDataInputStream); + } + + if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { + return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( + new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); + } + + // fsDataInputStream.getWrappedStream() maybe a BufferedFSInputStream + // need to wrap in another BufferedFSInputStream the make bufferSize work? + return fsDataInputStream; + } + + /** + * GCS FileSystem needs some special handling for seek and hence this method assists to fetch the right {@link FSDataInputStream} to be + * used by wrapping with required input streams. + * @param fsDataInputStream original instance of {@link FSDataInputStream}. + * @param bufferSize buffer size to be used. + * @return the right {@link FSDataInputStream} as required. + */ + private static FSDataInputStream getFSDataInputStreamForGCS(FSDataInputStream fsDataInputStream, + HoodieLogFile logFile, + int bufferSize) { + // incase of GCS FS, there are two flows. + // a. fsDataInputStream.getWrappedStream() instanceof FSInputStream + // b. fsDataInputStream.getWrappedStream() not an instanceof FSInputStream, but an instance of FSDataInputStream. + // (a) is handled in the first if block and (b) is handled in the second if block. If not, we fallback to original fsDataInputStream + if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { + return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( + new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); + } + + if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream + && ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream() instanceof FSInputStream) { + FSInputStream inputStream = (FSInputStream)((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream(); + return new TimedFSDataInputStream(logFile.getPath(), + new FSDataInputStream(new BufferedFSInputStream(inputStream, bufferSize))); + } + + return fsDataInputStream; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java index b3700fbedf0cc..569b4a23b683b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormat.java @@ -33,6 +33,7 @@ import java.io.Closeable; import java.io.IOException; import java.util.Iterator; +import java.util.List; /** * File Format for Hoodie Log Files. The File Format consists of blocks each separated with a MAGIC sync marker. A Block @@ -55,20 +56,29 @@ public interface HoodieLogFormat { String UNKNOWN_WRITE_TOKEN = "1-0-1"; + String DEFAULT_WRITE_TOKEN = "0-0-0"; + /** * Writer interface to allow appending block to this file format. 
*/ interface Writer extends Closeable { /** - * @return the path to this {@link HoodieLogFormat} + * @return the path to the current {@link HoodieLogFile} being written to. */ HoodieLogFile getLogFile(); /** - * Append Block returns a new Writer if the log is rolled. + * Append Block to a log file. + * @return {@link AppendResult} containing result of the append. + */ + AppendResult appendBlock(HoodieLogBlock block) throws IOException, InterruptedException; + + /** + * Appends the list of blocks to a logfile. + * @return {@link AppendResult} containing result of the append. */ - Writer appendBlock(HoodieLogBlock block) throws IOException, InterruptedException; + AppendResult appendBlocks(List blocks) throws IOException, InterruptedException; long getCurrentSize() throws IOException; } @@ -88,7 +98,7 @@ interface Reader extends Closeable, Iterator { * * @return */ - public boolean hasPrev(); + boolean hasPrev(); /** * Read log file in reverse order and return prev block if present. @@ -96,7 +106,7 @@ interface Reader extends Closeable, Iterator { * @return * @throws IOException */ - public HoodieLogBlock prev() throws IOException; + HoodieLogBlock prev() throws IOException; } /** @@ -125,6 +135,8 @@ class WriterBuilder { // version number for this log file. If not specified, then the current version will be // computed by inspecting the file system private Integer logVersion; + // file len of this log file + private Long fileLen = 0L; // Location of the directory containing the log private Path parentPath; // Log File Write Token @@ -142,13 +154,13 @@ public WriterBuilder withReplication(short replication) { return this; } - public WriterBuilder withLogWriteToken(String writeToken) { - this.logWriteToken = writeToken; + public WriterBuilder withRolloverLogWriteToken(String rolloverLogWriteToken) { + this.rolloverLogWriteToken = rolloverLogWriteToken; return this; } - public WriterBuilder withRolloverLogWriteToken(String rolloverLogWriteToken) { - this.rolloverLogWriteToken = rolloverLogWriteToken; + public WriterBuilder withLogWriteToken(String logWriteToken) { + this.logWriteToken = logWriteToken; return this; } @@ -182,12 +194,17 @@ public WriterBuilder withLogVersion(int version) { return this; } + public WriterBuilder withFileSize(long fileLen) { + this.fileLen = fileLen; + return this; + } + public WriterBuilder onParentPath(Path parentPath) { this.parentPath = parentPath; return this; } - public Writer build() throws IOException, InterruptedException { + public Writer build() throws IOException { LOG.info("Building HoodieLogFormat Writer"); if (fs == null) { throw new IllegalArgumentException("fs is not specified"); @@ -229,13 +246,14 @@ public Writer build() throws IOException, InterruptedException { if (logWriteToken == null) { // This is the case where we have existing log-file with old format. 
rollover to avoid any conflicts logVersion += 1; + fileLen = 0L; logWriteToken = rolloverLogWriteToken; } Path logPath = new Path(parentPath, FSUtils.makeLogFileName(logFileId, fileExtension, instantTime, logVersion, logWriteToken)); LOG.info("HoodieLogFile on path " + logPath); - HoodieLogFile logFile = new HoodieLogFile(logPath); + HoodieLogFile logFile = new HoodieLogFile(logPath, fileLen); if (bufferSize == null) { bufferSize = FSUtils.getDefaultBufferSize(fs); @@ -246,8 +264,7 @@ public Writer build() throws IOException, InterruptedException { if (sizeThreshold == null) { sizeThreshold = DEFAULT_SIZE_THRESHOLD; } - return new HoodieLogFormatWriter(fs, logFile, bufferSize, replication, sizeThreshold, logWriteToken, - rolloverLogWriteToken); + return new HoodieLogFormatWriter(fs, logFile, bufferSize, replication, sizeThreshold, rolloverLogWriteToken); } } @@ -257,7 +274,7 @@ static WriterBuilder newWriterBuilder() { static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException { - return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false, false); + return new HoodieLogFileReader(fs, logFile, readerSchema, HoodieLogFileReader.DEFAULT_BUFFER_SIZE, false); } static HoodieLogFormat.Reader newReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java index a5834d227fe0c..c48107e392515 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatReader.java @@ -24,6 +24,7 @@ import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; +import org.apache.hudi.internal.schema.InternalSchema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -42,14 +43,18 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { private HoodieLogFileReader currentReader; private final FileSystem fs; private final Schema readerSchema; + private InternalSchema internalSchema = InternalSchema.getEmptyInternalSchema(); private final boolean readBlocksLazily; private final boolean reverseLogReader; + private final String recordKeyField; + private final boolean enableInlineReading; private int bufferSize; private static final Logger LOG = LogManager.getLogger(HoodieLogFormatReader.class); HoodieLogFormatReader(FileSystem fs, List logFiles, Schema readerSchema, boolean readBlocksLazily, - boolean reverseLogReader, int bufferSize) throws IOException { + boolean reverseLogReader, int bufferSize, boolean enableRecordLookups, + String recordKeyField, InternalSchema internalSchema) throws IOException { this.logFiles = logFiles; this.fs = fs; this.readerSchema = readerSchema; @@ -57,9 +62,13 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader { this.reverseLogReader = reverseLogReader; this.bufferSize = bufferSize; this.prevReadersInOpenState = new ArrayList<>(); + this.recordKeyField = recordKeyField; + this.enableInlineReading = enableRecordLookups; + this.internalSchema = internalSchema == null ? 
InternalSchema.getEmptyInternalSchema() : internalSchema; if (logFiles.size() > 0) { HoodieLogFile nextLogFile = logFiles.remove(0); - this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false); + this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false, + enableRecordLookups, recordKeyField, internalSchema); } } @@ -98,13 +107,13 @@ public boolean hasNext() { } else { this.prevReadersInOpenState.add(currentReader); } - this.currentReader = - new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false); + this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false, + enableInlineReading, recordKeyField, internalSchema); } catch (IOException io) { throw new HoodieIOException("unable to initialize read with log file ", io); } LOG.info("Moving to the next reader for logfile " + currentReader.getLogFile()); - return this.currentReader.hasNext(); + return hasNext(); } return false; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java index 7fe21e9b2955d..8dbe85efd1164 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java @@ -21,7 +21,6 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.StorageSchemes; import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; import org.apache.hudi.common.table.log.HoodieLogFormat.WriterBuilder; import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.exception.HoodieException; @@ -38,6 +37,8 @@ import org.apache.log4j.Logger; import java.io.IOException; +import java.util.Collections; +import java.util.List; /** * HoodieLogFormatWriter can be used to append blocks to a log file Use HoodieLogFormat.WriterBuilder to construct. 
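A minimal sketch (illustrative only, not part of this patch) of the new batch append path on the writer: every block passed to appendBlocks lands in the same log file, the single AppendResult spans the whole batch, and the size-threshold rollover is only evaluated after the batch has been flushed:

import org.apache.hudi.common.table.log.AppendResult;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;

import java.io.IOException;
import java.util.List;

class BatchAppendSketch {
  static AppendResult appendBatch(HoodieLogFormat.Writer writer, List<HoodieLogBlock> blocks)
      throws IOException, InterruptedException {
    // offset() is where the first block of the batch starts; size() is the total bytes written
    // for the batch (magic markers, headers, content, footers and reverse pointers).
    AppendResult result = writer.appendBlocks(blocks);

    // If the size threshold was crossed, the writer has already rolled over to the next file
    // version, while the result still identifies the file the batch was actually written to.
    boolean rolledOver = !result.logFile().getPath().equals(writer.getLogFile().getPath());
    System.out.println("Wrote " + blocks.size() + " block(s); rolled over: " + rolledOver);
    return result;
  }
}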
@@ -47,33 +48,24 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { private static final Logger LOG = LogManager.getLogger(HoodieLogFormatWriter.class); private HoodieLogFile logFile; + private FSDataOutputStream output; + private final FileSystem fs; private final long sizeThreshold; private final Integer bufferSize; private final Short replication; - private final String logWriteToken; private final String rolloverLogWriteToken; - private FSDataOutputStream output; private boolean closed = false; private transient Thread shutdownThread = null; private static final String APPEND_UNAVAILABLE_EXCEPTION_MESSAGE = "not sufficiently replicated yet"; - /** - * @param fs - * @param logFile - * @param bufferSize - * @param replication - * @param sizeThreshold - */ - HoodieLogFormatWriter(FileSystem fs, HoodieLogFile logFile, Integer bufferSize, Short replication, Long sizeThreshold, - String logWriteToken, String rolloverLogWriteToken) { + HoodieLogFormatWriter(FileSystem fs, HoodieLogFile logFile, Integer bufferSize, Short replication, Long sizeThreshold, String rolloverLogWriteToken) { this.fs = fs; this.logFile = logFile; this.sizeThreshold = sizeThreshold; this.bufferSize = bufferSize; this.replication = replication; - this.logWriteToken = logWriteToken; this.rolloverLogWriteToken = rolloverLogWriteToken; addShutDownHook(); } @@ -105,6 +97,7 @@ private FSDataOutputStream getOutputStream() throws IOException, InterruptedExce if (isAppendSupported) { LOG.info(logFile + " exists. Appending to existing file"); try { + // open the path for append and record the offset this.output = fs.append(path, bufferSize); } catch (RemoteException e) { LOG.warn("Remote Exception, attempting to handle or recover lease", e); @@ -124,9 +117,9 @@ private FSDataOutputStream getOutputStream() throws IOException, InterruptedExce } } if (!isAppendSupported) { - this.logFile = logFile.rollOver(fs, rolloverLogWriteToken); - LOG.info("Append not supported.. Rolling over to " + logFile); + rollOver(); createNewFile(); + LOG.info("Append not supported.. Rolling over to " + logFile); } } else { LOG.info(logFile + " does not exist. Create a new file"); @@ -138,52 +131,72 @@ private FSDataOutputStream getOutputStream() throws IOException, InterruptedExce } @Override - public Writer appendBlock(HoodieLogBlock block) throws IOException, InterruptedException { + public AppendResult appendBlock(HoodieLogBlock block) throws IOException, InterruptedException { + return appendBlocks(Collections.singletonList(block)); + } + @Override + public AppendResult appendBlocks(List blocks) throws IOException, InterruptedException { // Find current version HoodieLogFormat.LogFormatVersion currentLogFormatVersion = new HoodieLogFormatVersion(HoodieLogFormat.CURRENT_VERSION); - FSDataOutputStream outputStream = getOutputStream(); - long currentSize = outputStream.size(); - - // 1. Write the magic header for the start of the block - outputStream.write(HoodieLogFormat.MAGIC); - - // bytes for header - byte[] headerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockHeader()); - // content bytes - byte[] content = block.getContentBytes(); - // bytes for footer - byte[] footerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockFooter()); - - // 2. Write the total size of the block (excluding Magic) - outputStream.writeLong(getLogBlockLength(content.length, headerBytes.length, footerBytes.length)); - - // 3. 
Write the version of this log block - outputStream.writeInt(currentLogFormatVersion.getVersion()); - // 4. Write the block type - outputStream.writeInt(block.getBlockType().ordinal()); - - // 5. Write the headers for the log block - outputStream.write(headerBytes); - // 6. Write the size of the content block - outputStream.writeLong(content.length); - // 7. Write the contents of the data block - outputStream.write(content); - // 8. Write the footers for the log block - outputStream.write(footerBytes); - // 9. Write the total size of the log block (including magic) which is everything written - // until now (for reverse pointer) - // Update: this information is now used in determining if a block is corrupt by comparing to the - // block size in header. This change assumes that the block size will be the last data written - // to a block. Read will break if any data is written past this point for a block. - outputStream.writeLong(outputStream.size() - currentSize); - // Flush every block to disk + FSDataOutputStream originalOutputStream = getOutputStream(); + long startPos = originalOutputStream.getPos(); + long sizeWritten = 0; + // HUDI-2655. here we wrap originalOutputStream to ensure huge blocks can be correctly written + FSDataOutputStream outputStream = new FSDataOutputStream(originalOutputStream, new FileSystem.Statistics(fs.getScheme()), startPos); + for (HoodieLogBlock block: blocks) { + long startSize = outputStream.size(); + + // 1. Write the magic header for the start of the block + outputStream.write(HoodieLogFormat.MAGIC); + + // bytes for header + byte[] headerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockHeader()); + // content bytes + byte[] content = block.getContentBytes(); + // bytes for footer + byte[] footerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockFooter()); + + // 2. Write the total size of the block (excluding Magic) + outputStream.writeLong(getLogBlockLength(content.length, headerBytes.length, footerBytes.length)); + + // 3. Write the version of this log block + outputStream.writeInt(currentLogFormatVersion.getVersion()); + // 4. Write the block type + outputStream.writeInt(block.getBlockType().ordinal()); + + // 5. Write the headers for the log block + outputStream.write(headerBytes); + // 6. Write the size of the content block + outputStream.writeLong(content.length); + // 7. Write the contents of the data block + outputStream.write(content); + // 8. Write the footers for the log block + outputStream.write(footerBytes); + // 9. Write the total size of the log block (including magic) which is everything written + // until now (for reverse pointer) + // Update: this information is now used in determining if a block is corrupt by comparing to the + // block size in header. This change assumes that the block size will be the last data written + // to a block. Read will break if any data is written past this point for a block. + outputStream.writeLong(outputStream.size() - startSize); + + // Fetch the size again, so it accounts also (9). + + // HUDI-2655. Check the size written to avoid log blocks whose size overflow. + if (outputStream.size() == Integer.MAX_VALUE) { + throw new HoodieIOException("Blocks appended may overflow. 
Please decrease log block size or log block amount"); + } + sizeWritten += outputStream.size() - startSize; + } + // Flush all blocks to disk flush(); + AppendResult result = new AppendResult(logFile, startPos, sizeWritten); // roll over if size is past the threshold - return rolloverIfNeeded(); + rolloverIfNeeded(); + return result; } /** @@ -201,20 +214,19 @@ private int getLogBlockLength(int contentLength, int headerLength, int footerLen Long.BYTES; // bytes to write totalLogBlockLength at end of block (for reverse ptr) } - private Writer rolloverIfNeeded() throws IOException, InterruptedException { + private void rolloverIfNeeded() throws IOException { // Roll over if the size is past the threshold if (getCurrentSize() > sizeThreshold) { - // TODO - make an end marker which seals the old log file (no more appends possible to that - // file). LOG.info("CurrentSize " + getCurrentSize() + " has reached threshold " + sizeThreshold + ". Rolling over to the next version"); - HoodieLogFile newLogFile = logFile.rollOver(fs, rolloverLogWriteToken); - // close this writer and return the new writer - close(); - return new HoodieLogFormatWriter(fs, newLogFile, bufferSize, replication, sizeThreshold, logWriteToken, - rolloverLogWriteToken); + rollOver(); } - return this; + } + + private void rollOver() throws IOException { + closeStream(); + this.logFile = logFile.rollOver(fs, rolloverLogWriteToken); + this.closed = false; } private void createNewFile() throws IOException { @@ -292,13 +304,12 @@ private void handleAppendExceptionOrRecoverLease(Path path, RemoteException e) // appended to, then the NN will throw an exception saying that it couldn't find any active replica with the // last block. Find more information here : https://issues.apache.org/jira/browse/HDFS-6325 LOG.warn("Failed to open an append stream to the log file. Opening a new log file..", e); - // Rollover the current log file (since cannot get a stream handle) and create new one - this.logFile = logFile.rollOver(fs, rolloverLogWriteToken); + rollOver(); createNewFile(); } else if (e.getClassName().contentEquals(AlreadyBeingCreatedException.class.getName())) { LOG.warn("Another task executor writing to the same log file(" + logFile + ". Rolling over"); // Rollover the current log file (since cannot get a stream handle) and create new one - this.logFile = logFile.rollOver(fs, rolloverLogWriteToken); + rollOver(); createNewFile(); } else if (e.getClassName().contentEquals(RecoveryInProgressException.class.getName()) && (fs instanceof DistributedFileSystem)) { @@ -311,8 +322,9 @@ private void handleAppendExceptionOrRecoverLease(Path path, RemoteException e) // try again this.output = fs.append(path, bufferSize); } else { - LOG.warn("Failed to recover lease on path " + path); - throw new HoodieException(e); + final String msg = "Failed to recover lease on path " + path; + LOG.warn(msg); + throw new HoodieException(msg, e); } } else { // When fs.append() has failed and an exception is thrown, by closing the output stream @@ -320,16 +332,16 @@ private void handleAppendExceptionOrRecoverLease(Path path, RemoteException e) // new attemptId, say taskId.1) it will be able to acquire lease on the log file (as output stream was // closed properly by taskId.0). // - // If close() call were to fail throwing an exception, our best bet is to rollover to a new log file. + // If closeStream() call were to fail throwing an exception, our best bet is to rollover to a new log file. 
try { - close(); + closeStream(); // output stream has been successfully closed and lease on the log file has been released, // before throwing an exception for the append failure. throw new HoodieIOException("Failed to append to the output stream ", e); } catch (Exception ce) { LOG.warn("Failed to close the output stream for " + fs.getClass().getName() + " on path " + path + ". Rolling over to a new log file."); - this.logFile = logFile.rollOver(fs, rolloverLogWriteToken); + rollOver(); createNewFile(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java index 18f2167b7e346..acff6e70a5ecf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java @@ -18,18 +18,26 @@ package org.apache.hudi.common.table.log; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.model.DeleteRecord; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.HoodieRecordSizeEstimator; import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.exception.HoodieIOException; - -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hudi.internal.schema.InternalSchema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -38,10 +46,13 @@ import java.util.List; import java.util.Map; +import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; +import static org.apache.hudi.common.util.ValidationUtils.checkState; + /** * Scans through all the blocks in a list of HoodieLogFile and builds up a compacted/merged list of records which will * be used as a lookup table when merging the base columnar file with the redo log file. - * + *

    * NOTE: If readBlockLazily is turned on, does not merge, instead keeps reading log blocks and merges everything at once * This is an optimization to avoid seek() back and forth to read new block (forward seek()) and lazily read content of * seen block (reverse and forward seek()) during merge | | Read Block 1 Metadata | | Read Block 1 Data | | | Read Block @@ -51,53 +62,70 @@ * This results in two I/O passes over the log file. */ -public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordScanner +public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader implements Iterable> { private static final Logger LOG = LogManager.getLogger(HoodieMergedLogRecordScanner.class); - + // A timer for calculating elapsed time in millis + public final HoodieTimer timer = new HoodieTimer(); // Final map of compacted/merged records - private final ExternalSpillableMap> records; - + protected final ExternalSpillableMap> records; // count of merged records in log private long numMergedRecordsInLog; - + private long maxMemorySizeInBytes; // Stores the total time taken to perform reading and merging of log blocks - private final long totalTimeTakenToReadAndMergeBlocks; - // A timer for calculating elapsed time in millis - public final HoodieTimer timer = new HoodieTimer(); + private long totalTimeTakenToReadAndMergeBlocks; @SuppressWarnings("unchecked") - public HoodieMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, - String latestInstantTime, Long maxMemorySizeInBytes, boolean readBlocksLazily, boolean reverseReader, - int bufferSize, String spillableMapBasePath) { - super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize); + protected HoodieMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, + String latestInstantTime, Long maxMemorySizeInBytes, boolean readBlocksLazily, + boolean reverseReader, int bufferSize, String spillableMapBasePath, + Option instantRange, + ExternalSpillableMap.DiskMapType diskMapType, + boolean isBitCaskDiskMapCompressionEnabled, + boolean withOperationField, boolean forceFullScan, + Option partitionName, InternalSchema internalSchema) { + super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, + instantRange, withOperationField, + forceFullScan, partitionName, internalSchema); try { // Store merged records for all versions for this log file, set the in-memory footprint to maxInMemoryMapSize this.records = new ExternalSpillableMap<>(maxMemorySizeInBytes, spillableMapBasePath, new DefaultSizeEstimator(), - new HoodieRecordSizeEstimator(readerSchema)); - // Do the scan and merge - timer.startTimer(); - scan(); - this.totalTimeTakenToReadAndMergeBlocks = timer.endTimer(); - this.numMergedRecordsInLog = records.size(); - LOG.info("MaxMemoryInBytes allowed for compaction => " + maxMemorySizeInBytes); - LOG.info("Number of entries in MemoryBasedMap in ExternalSpillableMap => " + records.getInMemoryMapNumEntries()); - LOG.info( - "Total size in bytes of MemoryBasedMap in ExternalSpillableMap => " + records.getCurrentInMemoryMapSize()); - LOG.info("Number of entries in DiskBasedMap in ExternalSpillableMap => " + records.getDiskBasedMapNumEntries()); - LOG.info("Size of file spilled to disk => " + records.getSizeOfFileOnDiskInBytes()); + new HoodieRecordSizeEstimator(readerSchema), diskMapType, isBitCaskDiskMapCompressionEnabled); + + 
this.maxMemorySizeInBytes = maxMemorySizeInBytes; } catch (IOException e) { - throw new HoodieIOException("IOException when reading log file ", e); + throw new HoodieIOException("IOException when creating ExternalSpillableMap at " + spillableMapBasePath, e); } + + if (forceFullScan) { + performScan(); + } + } + + protected void performScan() { + // Do the scan and merge + timer.startTimer(); + scan(); + this.totalTimeTakenToReadAndMergeBlocks = timer.endTimer(); + this.numMergedRecordsInLog = records.size(); + LOG.info("Number of log files scanned => " + logFilePaths.size()); + LOG.info("MaxMemoryInBytes allowed for compaction => " + maxMemorySizeInBytes); + LOG.info("Number of entries in MemoryBasedMap in ExternalSpillableMap => " + records.getInMemoryMapNumEntries()); + LOG.info( + "Total size in bytes of MemoryBasedMap in ExternalSpillableMap => " + records.getCurrentInMemoryMapSize()); + LOG.info("Number of entries in BitCaskDiskMap in ExternalSpillableMap => " + records.getDiskBasedMapNumEntries()); + LOG.info("Size of file spilled to disk => " + records.getSizeOfFileOnDiskInBytes()); } @Override public Iterator> iterator() { + checkState(forceFullScan, "Record reader has to be in full-scan mode to use this API"); return records.iterator(); } public Map> getRecords() { + checkState(forceFullScan, "Record reader has to be in full-scan mode to use this API"); return records; } @@ -117,9 +145,16 @@ protected void processNextRecord(HoodieRecord hoo String key = hoodieRecord.getRecordKey(); if (records.containsKey(key)) { // Merge and store the merged record. The HoodieRecordPayload implementation is free to decide what should be - // done when a delete (empty payload) is encountered before or after an insert/update. - HoodieRecordPayload combinedValue = hoodieRecord.getData().preCombine(records.get(key).getData()); - records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue)); + // done when a DELETE (empty payload) is encountered before or after an insert/update. + + HoodieRecord oldRecord = records.get(key); + HoodieRecordPayload oldValue = oldRecord.getData(); + HoodieRecordPayload combinedValue = hoodieRecord.getData().preCombine(oldValue); + // If combinedValue is oldValue, no need rePut oldRecord + if (combinedValue != oldValue) { + HoodieOperation operation = hoodieRecord.getOperation(); + records.put(key, new HoodieAvroRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue, operation)); + } } else { // Put the record as is records.put(key, hoodieRecord); @@ -127,71 +162,119 @@ protected void processNextRecord(HoodieRecord hoo } @Override - protected void processNextDeletedKey(HoodieKey hoodieKey) { - records.put(hoodieKey.getRecordKey(), SpillableMapUtils.generateEmptyPayload(hoodieKey.getRecordKey(), - hoodieKey.getPartitionPath(), getPayloadClassFQN())); + protected void processNextDeletedRecord(DeleteRecord deleteRecord) { + String key = deleteRecord.getRecordKey(); + HoodieRecord oldRecord = records.get(key); + if (oldRecord != null) { + // Merge and store the merged record. The ordering val is taken to decide whether the same key record + // should be deleted or be kept. The old record is kept only if the DELETE record has smaller ordering val. + // For same ordering values, uses the natural order(arrival time semantics). 
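+      // For example: if the stored record carries orderingVal = 5 and the incoming DELETE carries
+      // orderingVal = 3, the DELETE is dropped; a DELETE carrying the default orderingVal of 0
+      // always takes effect, regardless of the stored record's ordering value.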
+ + Comparable curOrderingVal = oldRecord.getData().getOrderingValue(); + Comparable deleteOrderingVal = deleteRecord.getOrderingValue(); + // Checks the ordering value does not equal to 0 + // because we use 0 as the default value which means natural order + boolean choosePrev = !deleteOrderingVal.equals(0) + && ReflectionUtils.isSameClass(curOrderingVal, deleteOrderingVal) + && curOrderingVal.compareTo(deleteOrderingVal) > 0; + if (choosePrev) { + // The DELETE message is obsolete if the old message has greater orderingVal. + return; + } + } + // Put the DELETE record + records.put(key, SpillableMapUtils.generateEmptyPayload(key, + deleteRecord.getPartitionPath(), deleteRecord.getOrderingValue(), getPayloadClassFQN())); } public long getTotalTimeTakenToReadAndMergeBlocks() { return totalTimeTakenToReadAndMergeBlocks; } + public void close() { + if (records != null) { + records.close(); + } + } + /** * Builder used to build {@code HoodieUnMergedLogRecordScanner}. */ - public static class Builder extends AbstractHoodieLogRecordScanner.Builder { - private FileSystem fs; - private String basePath; - private List logFilePaths; - private Schema readerSchema; - private String latestInstantTime; - private boolean readBlocksLazily; - private boolean reverseReader; - private int bufferSize; + public static class Builder extends AbstractHoodieLogRecordReader.Builder { + protected FileSystem fs; + protected String basePath; + protected List logFilePaths; + protected Schema readerSchema; + private InternalSchema internalSchema = InternalSchema.getEmptyInternalSchema(); + protected String latestInstantTime; + protected boolean readBlocksLazily; + protected boolean reverseReader; + protected int bufferSize; // specific configurations - private Long maxMemorySizeInBytes; - private String spillableMapBasePath; + protected Long maxMemorySizeInBytes; + protected String spillableMapBasePath; + protected ExternalSpillableMap.DiskMapType diskMapType = HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue(); + protected boolean isBitCaskDiskMapCompressionEnabled = HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue(); + // incremental filtering + protected Option instantRange = Option.empty(); + protected String partitionName; + // operation field default false + private boolean withOperationField = false; + @Override public Builder withFileSystem(FileSystem fs) { this.fs = fs; return this; } + @Override public Builder withBasePath(String basePath) { this.basePath = basePath; return this; } + @Override public Builder withLogFilePaths(List logFilePaths) { this.logFilePaths = logFilePaths; return this; } + @Override public Builder withReaderSchema(Schema schema) { this.readerSchema = schema; return this; } + @Override public Builder withLatestInstantTime(String latestInstantTime) { this.latestInstantTime = latestInstantTime; return this; } + @Override public Builder withReadBlocksLazily(boolean readBlocksLazily) { this.readBlocksLazily = readBlocksLazily; return this; } + @Override public Builder withReverseReader(boolean reverseReader) { this.reverseReader = reverseReader; return this; } + @Override public Builder withBufferSize(int bufferSize) { this.bufferSize = bufferSize; return this; } + @Override + public Builder withInstantRange(Option instantRange) { + this.instantRange = instantRange; + return this; + } + public Builder withMaxMemorySizeInBytes(Long maxMemorySizeInBytes) { this.maxMemorySizeInBytes = maxMemorySizeInBytes; return this; @@ -202,11 +285,42 @@ public Builder 
withSpillableMapBasePath(String spillableMapBasePath) { return this; } + public Builder withDiskMapType(ExternalSpillableMap.DiskMapType diskMapType) { + this.diskMapType = diskMapType; + return this; + } + + public Builder withBitCaskDiskMapCompressionEnabled(boolean isBitCaskDiskMapCompressionEnabled) { + this.isBitCaskDiskMapCompressionEnabled = isBitCaskDiskMapCompressionEnabled; + return this; + } + + public Builder withInternalSchema(InternalSchema internalSchema) { + this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema; + return this; + } + + public Builder withOperationField(boolean withOperationField) { + this.withOperationField = withOperationField; + return this; + } + + @Override + public Builder withPartition(String partitionName) { + this.partitionName = partitionName; + return this; + } + @Override public HoodieMergedLogRecordScanner build() { + if (this.partitionName == null && CollectionUtils.nonEmpty(this.logFilePaths)) { + this.partitionName = getRelativePartitionPath(new Path(basePath), new Path(this.logFilePaths.get(0)).getParent()); + } return new HoodieMergedLogRecordScanner(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, readBlocksLazily, reverseReader, - bufferSize, spillableMapBasePath); + bufferSize, spillableMapBasePath, instantRange, + diskMapType, isBitCaskDiskMapCompressionEnabled, withOperationField, true, + Option.ofNullable(partitionName), internalSchema); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index 1aac6330e06ba..8ea34d6f2fa0d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -18,9 +18,10 @@ package org.apache.hudi.common.table.log; -import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.DeleteRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; @@ -30,14 +31,14 @@ /** * A scanner used to scan hoodie unmerged log records. 
*/ -public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordScanner { +public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReader { private final LogRecordScannerCallback callback; - public HoodieUnMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, - String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize, - LogRecordScannerCallback callback) { - super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize); + private HoodieUnMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, + String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize, + LogRecordScannerCallback callback, Option instantRange) { + super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange, false); this.callback = callback; } @@ -55,7 +56,7 @@ protected void processNextRecord(HoodieRecord hoo } @Override - protected void processNextDeletedKey(HoodieKey key) { + protected void processNextDeletedRecord(DeleteRecord deleteRecord) { throw new IllegalStateException("Not expected to see delete records in this log-scan mode. Check Job Config"); } @@ -63,15 +64,15 @@ protected void processNextDeletedKey(HoodieKey key) { * A callback for log record scanner. */ @FunctionalInterface - public static interface LogRecordScannerCallback { + public interface LogRecordScannerCallback { - public void apply(HoodieRecord record) throws Exception; + void apply(HoodieRecord record) throws Exception; } /** * Builder used to build {@code HoodieUnMergedLogRecordScanner}. */ - public static class Builder extends AbstractHoodieLogRecordScanner.Builder { + public static class Builder extends AbstractHoodieLogRecordReader.Builder { private FileSystem fs; private String basePath; private List logFilePaths; @@ -80,6 +81,7 @@ public static class Builder extends AbstractHoodieLogRecordScanner.Builder { private boolean readBlocksLazily; private boolean reverseReader; private int bufferSize; + private Option instantRange = Option.empty(); // specific configurations private LogRecordScannerCallback callback; @@ -123,6 +125,11 @@ public Builder withBufferSize(int bufferSize) { return this; } + public Builder withInstantRange(Option instantRange) { + this.instantRange = instantRange; + return this; + } + public Builder withLogRecordScannerCallback(LogRecordScannerCallback callback) { this.callback = callback; return this; @@ -131,7 +138,7 @@ public Builder withLogRecordScannerCallback(LogRecordScannerCallback callback) { @Override public HoodieUnMergedLogRecordScanner build() { return new HoodieUnMergedLogRecordScanner(fs, basePath, logFilePaths, readerSchema, - latestInstantTime, readBlocksLazily, reverseReader, bufferSize, callback); + latestInstantTime, readBlocksLazily, reverseReader, bufferSize, callback, instantRange); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/InstantRange.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/InstantRange.java new file mode 100644 index 0000000000000..0a49f0f343af2 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/InstantRange.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.log; + +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ValidationUtils; + +import java.io.Serializable; +import java.util.Objects; + +/** + * A instant commits range used for incremental reader filtering. + */ +public abstract class InstantRange implements Serializable { + private static final long serialVersionUID = 1L; + + protected final String startInstant; + protected final String endInstant; + + public InstantRange(String startInstant, String endInstant) { + this.startInstant = startInstant; + this.endInstant = endInstant; + } + + /** + * Returns the builder. + */ + public static Builder builder() { + return new Builder(); + } + + public String getStartInstant() { + return startInstant; + } + + public String getEndInstant() { + return endInstant; + } + + public abstract boolean isInRange(String instant); + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Represents a range type. 
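+   * OPEN_CLOSE matches instants in (startInstant, endInstant], i.e. the start boundary is exclusive,
+   * while CLOSE_CLOSE matches instants in [startInstant, endInstant], i.e. both boundaries are inclusive.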
+ */ + public static enum RangeType { + OPEN_CLOSE, CLOSE_CLOSE + } + + private static class OpenCloseRange extends InstantRange { + + public OpenCloseRange(String startInstant, String endInstant) { + super(Objects.requireNonNull(startInstant), endInstant); + } + + @Override + public boolean isInRange(String instant) { + // No need to do comparison: + // HoodieTimeline.compareTimestamps(instant, HoodieTimeline.LESSER_THAN_OR_EQUALS, endInstant) + // because the logic is ensured by the log scanner + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.GREATER_THAN, startInstant); + } + } + + private static class OpenCloseRangeNullableBoundary extends InstantRange { + + public OpenCloseRangeNullableBoundary(String startInstant, String endInstant) { + super(startInstant, endInstant); + ValidationUtils.checkArgument(startInstant != null || endInstant != null, + "Start and end instants can not both be null"); + } + + @Override + public boolean isInRange(String instant) { + if (startInstant == null) { + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.LESSER_THAN_OR_EQUALS, endInstant); + } else if (endInstant == null) { + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.GREATER_THAN, startInstant); + } else { + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.GREATER_THAN, startInstant) + && HoodieTimeline.compareTimestamps(instant, HoodieTimeline.LESSER_THAN_OR_EQUALS, endInstant); + } + } + } + + private static class CloseCloseRange extends InstantRange { + + public CloseCloseRange(String startInstant, String endInstant) { + super(Objects.requireNonNull(startInstant), endInstant); + } + + @Override + public boolean isInRange(String instant) { + // No need to do comparison: + // HoodieTimeline.compareTimestamps(instant, HoodieTimeline.LESSER_THAN_OR_EQUALS, endInstant) + // because the logic is ensured by the log scanner + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.GREATER_THAN_OR_EQUALS, startInstant); + } + } + + private static class CloseCloseRangeNullableBoundary extends InstantRange { + + public CloseCloseRangeNullableBoundary(String startInstant, String endInstant) { + super(startInstant, endInstant); + ValidationUtils.checkArgument(startInstant != null || endInstant != null, + "Start and end instants can not both be null"); + } + + @Override + public boolean isInRange(String instant) { + if (startInstant == null) { + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.LESSER_THAN_OR_EQUALS, endInstant); + } else if (endInstant == null) { + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.GREATER_THAN_OR_EQUALS, startInstant); + } else { + return HoodieTimeline.compareTimestamps(instant, HoodieTimeline.GREATER_THAN_OR_EQUALS, startInstant) + && HoodieTimeline.compareTimestamps(instant, HoodieTimeline.LESSER_THAN_OR_EQUALS, endInstant); + } + } + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Builder for {@link InstantRange}. 
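To make the range semantics above concrete, here is a minimal usage sketch (not part of the patch) of the fluent builder this class exposes; an Option wrapping such a range is what HoodieUnMergedLogRecordScanner.Builder#withInstantRange threads into the log record reader. The instant timestamps are placeholder values.

import org.apache.hudi.common.table.log.InstantRange;
import org.apache.hudi.common.util.Option;

// Illustrative sketch only: build a CLOSE_CLOSE range and test candidate instants against it.
final class InstantRangeSketch {
  static Option<InstantRange> example() {
    InstantRange range = InstantRange.builder()
        .startInstant("20220101000000")                 // placeholder commit times
        .endInstant("20220102000000")
        .rangeType(InstantRange.RangeType.CLOSE_CLOSE)
        .nullableBoundary(false)
        .build();

    // CLOSE_CLOSE treats the start as inclusive; the end bound is left to the log
    // scanner itself, as noted in the CloseCloseRange comment above.
    boolean atStart = range.isInRange("20220101000000");  // true
    boolean inside = range.isInRange("20220101120000");   // true
    // OPEN_CLOSE would exclude the start instant instead.

    return Option.of(range);                              // the form the scanner builder accepts
  }
}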
+ */ + public static class Builder { + private String startInstant; + private String endInstant; + private RangeType rangeType; + private boolean nullableBoundary = false; + + private Builder() { + } + + public Builder startInstant(String startInstant) { + this.startInstant = startInstant; + return this; + } + + public Builder endInstant(String endInstant) { + this.endInstant = endInstant; + return this; + } + + public Builder rangeType(RangeType rangeType) { + this.rangeType = rangeType; + return this; + } + + public Builder nullableBoundary(boolean nullable) { + this.nullableBoundary = nullable; + return this; + } + + public InstantRange build() { + ValidationUtils.checkState(this.rangeType != null, "Range type is required"); + switch (rangeType) { + case OPEN_CLOSE: + return nullableBoundary + ? new OpenCloseRangeNullableBoundary(startInstant, endInstant) + : new OpenCloseRange(startInstant, endInstant); + case CLOSE_CLOSE: + return nullableBoundary + ? new CloseCloseRangeNullableBoundary(startInstant, endInstant) + : new CloseCloseRange(startInstant, endInstant); + default: + throw new AssertionError(); + } + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/LogReaderUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/LogReaderUtils.java index ffc4b858207ee..c1b20cbb4c55c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/LogReaderUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/LogReaderUtils.java @@ -27,14 +27,15 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.collection.Pair; import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import java.io.IOException; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; /** @@ -42,9 +43,10 @@ */ public class LogReaderUtils { - private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, Path path) + private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActiveTimeline activeTimeline, HoodieLogFile hoodieLogFile) throws IOException { - Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null, true, true); + // set length for the HoodieLogFile as it will be leveraged by HoodieLogFormat.Reader with reverseReading enabled + Reader reader = HoodieLogFormat.newReader(fs, hoodieLogFile, null, true, true); Schema writerSchema = null; HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); while (reader.hasPrev()) { @@ -62,17 +64,17 @@ private static Schema readSchemaFromLogFileInReverse(FileSystem fs, HoodieActive return writerSchema; } - public static Schema readLatestSchemaFromLogFiles(String basePath, List deltaFilePaths, Configuration config) + public static Schema readLatestSchemaFromLogFiles(String basePath, List logFiles, Configuration config) throws IOException { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(config, basePath); - List deltaPaths = deltaFilePaths.stream().map(s -> new HoodieLogFile(new Path(s))) - .sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString()) + HoodieTableMetaClient metaClient = 
HoodieTableMetaClient.builder().setConf(config).setBasePath(basePath).build(); + List deltaPaths = logFiles.stream().sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString()) .collect(Collectors.toList()); if (deltaPaths.size() > 0) { + Map deltaFilePathToFileStatus = logFiles.stream().map(entry -> Pair.of(entry.getPath().toString(), entry)) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); for (String logPath : deltaPaths) { FileSystem fs = FSUtils.getFs(logPath, config); - Schema schemaFromLogFile = - readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), new Path(logPath)); + Schema schemaFromLogFile = readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), deltaFilePathToFileStatus.get(logPath)); if (schemaFromLogFile != null) { return schemaFromLogFile; } @@ -80,5 +82,4 @@ public static Schema readLatestSchemaFromLogFiles(String basePath, List } return null; } - } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java index 31fc352acad09..9e74d14c048f2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java @@ -18,12 +18,6 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.fs.SizeAwareDataInputStream; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieIOException; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; @@ -35,45 +29,66 @@ import org.apache.avro.io.Encoder; import org.apache.avro.io.EncoderFactory; import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hudi.common.fs.SizeAwareDataInputStream; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.internal.schema.InternalSchema; +import javax.annotation.Nonnull; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.zip.DeflaterOutputStream; +import java.util.zip.InflaterInputStream; -import javax.annotation.Nonnull; +import static org.apache.hudi.common.util.ValidationUtils.checkState; /** * HoodieAvroDataBlock contains a list of records serialized using Avro. It is used with the Parquet base file format. 
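For reference on the LogReaderUtils change above (the method now takes HoodieLogFile instances so the reverse reader can use the already-known file length instead of re-fetching it), a hedged calling sketch follows. The table path and log-file name are placeholders; real callers would obtain HoodieLogFile objects, with lengths populated, from the file-system view.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.LogReaderUtils;

import java.io.IOException;
import java.util.Collections;
import java.util.List;

// Sketch only: resolve the latest writer schema recorded in a set of log files.
final class LatestLogSchemaSketch {
  static Schema latestSchema(Configuration conf) throws IOException {
    String basePath = "/tmp/hudi/trips";  // placeholder table base path
    List<HoodieLogFile> logFiles = Collections.singletonList(
        new HoodieLogFile(new Path(basePath + "/2022/01/01/.f1_20220101000000.log.1_1-0-1")));  // placeholder file
    return LogReaderUtils.readLatestSchemaFromLogFiles(basePath, logFiles, conf);
  }
}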
*/ public class HoodieAvroDataBlock extends HoodieDataBlock { - private ThreadLocal encoderCache = new ThreadLocal<>(); - private ThreadLocal decoderCache = new ThreadLocal<>(); - - public HoodieAvroDataBlock(@Nonnull Map logBlockHeader, - @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, @Nonnull Option content, - FSDataInputStream inputStream, boolean readBlockLazily) { - super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily); + private final ThreadLocal encoderCache = new ThreadLocal<>(); + + public HoodieAvroDataBlock(FSDataInputStream inputStream, + Option content, + boolean readBlockLazily, + HoodieLogBlockContentLocation logBlockContentLocation, + Option readerSchema, + Map header, + Map footer, + String keyField, InternalSchema internalSchema) { + super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false, internalSchema); } - public HoodieAvroDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema, - Map header, Map footer) { - super(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header, - footer); + public HoodieAvroDataBlock(FSDataInputStream inputStream, + Option content, + boolean readBlockLazily, + HoodieLogBlockContentLocation logBlockContentLocation, + Option readerSchema, + Map header, + Map footer, + String keyField) { + super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false); } - public HoodieAvroDataBlock(@Nonnull List records, @Nonnull Map header) { - super(records, header, new HashMap<>()); + public HoodieAvroDataBlock(@Nonnull List records, + @Nonnull Map header, + @Nonnull String keyField) { + super(records, header, new HashMap<>(), keyField); } @Override @@ -82,7 +97,7 @@ public HoodieLogBlockType getBlockType() { } @Override - protected byte[] serializeRecords() throws IOException { + protected byte[] serializeRecords(List records) throws IOException { Schema schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); GenericDatumWriter writer = new GenericDatumWriter<>(schema); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -95,9 +110,7 @@ protected byte[] serializeRecords() throws IOException { output.writeInt(records.size()); // 3. 
Write the records - Iterator itr = records.iterator(); - while (itr.hasNext()) { - IndexedRecord s = itr.next(); + for (IndexedRecord s : records) { ByteArrayOutputStream temp = new ByteArrayOutputStream(); BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(temp, encoderCache.get()); encoderCache.set(encoder); @@ -112,56 +125,93 @@ protected byte[] serializeRecords() throws IOException { output.writeInt(size); // Write the content output.write(temp.toByteArray()); - itr.remove(); } catch (IOException e) { throw new HoodieIOException("IOException converting HoodieAvroDataBlock to bytes", e); } } + encoderCache.remove(); output.close(); return baos.toByteArray(); } // TODO (na) - Break down content into smaller chunks of byte [] to be GC as they are used - // TODO (na) - Implement a recordItr instead of recordList @Override - protected void deserializeRecords() throws IOException { - SizeAwareDataInputStream dis = - new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(getContent().get()))); + protected ClosableIterator deserializeRecords(byte[] content) throws IOException { + checkState(this.readerSchema != null, "Reader's schema has to be non-null"); + return RecordIterator.getInstance(this, content, internalSchema); + } + + private static class RecordIterator implements ClosableIterator { + private byte[] content; + private final SizeAwareDataInputStream dis; + private final GenericDatumReader reader; + private final ThreadLocal decoderCache = new ThreadLocal<>(); - // 1. Read version for this data block - int version = dis.readInt(); - HoodieAvroDataBlockVersion logBlockVersion = new HoodieAvroDataBlockVersion(version); + private int totalRecords = 0; + private int readRecords = 0; - // Get schema from the header - Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content, InternalSchema internalSchema) throws IOException { + this.content = content; - // If readerSchema was not present, use writerSchema - if (schema == null) { - schema = writerSchema; + this.dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(this.content))); + + // 1. Read version for this data block + int version = this.dis.readInt(); + HoodieAvroDataBlockVersion logBlockVersion = new HoodieAvroDataBlockVersion(version); + + Schema finalReadSchema = readerSchema; + if (!internalSchema.isEmptySchema()) { + // Read with the writer schema when schema evolution (DDL) is in play: after a DDL change the readerSchema can diverge from the writerSchema and the Avro reader would fail on old log files. + // e.g. the original writerSchema is "a string, b double"; after a column is added the readerSchema becomes "a string, c int, b double", so old log blocks must not be decoded with the readerSchema. + // Records read with the writerSchema are rewritten with the readerSchema in AbstractHoodieLogRecordReader. + finalReadSchema = writerSchema; + } + + this.reader = new GenericDatumReader<>(writerSchema, finalReadSchema); + + if (logBlockVersion.hasRecordCount()) { + this.totalRecords = this.dis.readInt(); + } } - GenericDatumReader reader = new GenericDatumReader<>(writerSchema, schema); - // 2.
Get the total records - int totalRecords = 0; - if (logBlockVersion.hasRecordCount()) { - totalRecords = dis.readInt(); + public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content, InternalSchema internalSchema) throws IOException { + // Get schema from the header + Schema writerSchema = new Schema.Parser().parse(dataBlock.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + return new RecordIterator(dataBlock.readerSchema, writerSchema, content, internalSchema); } - List records = new ArrayList<>(totalRecords); - // 3. Read the content - for (int i = 0; i < totalRecords; i++) { - int recordLength = dis.readInt(); - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(getContent().get(), dis.getNumberOfBytesRead(), - recordLength, decoderCache.get()); - decoderCache.set(decoder); - IndexedRecord record = reader.read(null, decoder); - records.add(record); - dis.skipBytes(recordLength); + @Override + public void close() { + try { + this.dis.close(); + this.decoderCache.remove(); + this.content = null; + } catch (IOException e) { + // ignore + } + } + + @Override + public boolean hasNext() { + return readRecords < totalRecords; + } + + @Override + public IndexedRecord next() { + try { + int recordLength = this.dis.readInt(); + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(this.content, this.dis.getNumberOfBytesRead(), + recordLength, this.decoderCache.get()); + this.decoderCache.set(decoder); + IndexedRecord record = this.reader.read(null, decoder); + this.dis.skipBytes(recordLength); + this.readRecords++; + return record; + } catch (IOException e) { + throw new HoodieIOException("Unable to convert bytes to record.", e); + } } - dis.close(); - this.records = records; - // Free up content to be GC'd, deflate - deflate(); } //---------------------------------------------------------------------------------------- @@ -177,9 +227,11 @@ protected void deserializeRecords() throws IOException { */ @Deprecated public HoodieAvroDataBlock(List records, Schema schema) { - super(new HashMap<>(), new HashMap<>(), Option.empty(), Option.empty(), null, false); - this.records = records; - this.schema = schema; + super(records, Collections.singletonMap(HeaderMetadataType.SCHEMA, schema.toString()), new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD); + } + + public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) throws IOException { + return getBlock(content, readerSchema, InternalSchema.getEmptyInternalSchema()); } /** @@ -187,7 +239,7 @@ public HoodieAvroDataBlock(List records, Schema schema) { * HoodieLogFormat V1. 
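Since deserialization is now iterator-based rather than materializing a record list up front, callers are expected to close the iterator when done; a minimal consumption sketch follows (the HoodieDataBlock argument stands in for any already-parsed data block, it is not something defined in this patch).

import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.util.ClosableIterator;

import java.util.ArrayList;
import java.util.List;

// Sketch only: drain a data block through the iterator API introduced in this patch.
final class DataBlockDrainSketch {
  static List<IndexedRecord> readAll(HoodieDataBlock dataBlock) {
    List<IndexedRecord> out = new ArrayList<>();
    // try-with-resources mirrors the pattern getBytes() uses below; closing the
    // iterator releases per-iterator resources such as the Avro decoder cache.
    try (ClosableIterator<IndexedRecord> it = dataBlock.getRecordIterator()) {
      while (it.hasNext()) {
        out.add(it.next());
      }
    }
    return out;
  }
}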
*/ @Deprecated - public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) throws IOException { + public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema, InternalSchema internalSchema) throws IOException { SizeAwareDataInputStream dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(content))); @@ -195,12 +247,16 @@ public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) int schemaLength = dis.readInt(); byte[] compressedSchema = new byte[schemaLength]; dis.readFully(compressedSchema, 0, schemaLength); - Schema writerSchema = new Schema.Parser().parse(HoodieAvroUtils.decompress(compressedSchema)); + Schema writerSchema = new Schema.Parser().parse(decompress(compressedSchema)); if (readerSchema == null) { readerSchema = writerSchema; } + if (!internalSchema.isEmptySchema()) { + readerSchema = writerSchema; + } + GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema); // 2. Get the total records int totalRecords = dis.readInt(); @@ -218,6 +274,33 @@ public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) return new HoodieAvroDataBlock(records, readerSchema); } + private static byte[] compress(String text) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + OutputStream out = new DeflaterOutputStream(baos); + out.write(text.getBytes(StandardCharsets.UTF_8)); + out.close(); + } catch (IOException e) { + throw new HoodieIOException("IOException while compressing text " + text, e); + } + return baos.toByteArray(); + } + + private static String decompress(byte[] bytes) { + InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes)); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + byte[] buffer = new byte[8192]; + int len; + while ((len = in.read(buffer)) > 0) { + baos.write(buffer, 0, len); + } + return new String(baos.toByteArray(), StandardCharsets.UTF_8); + } catch (IOException e) { + throw new HoodieIOException("IOException while decompressing text", e); + } + } + @Deprecated public byte[] getBytes(Schema schema) throws IOException { @@ -225,15 +308,20 @@ public byte[] getBytes(Schema schema) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream output = new DataOutputStream(baos); - // 2. Compress and Write schema out - byte[] schemaContent = HoodieAvroUtils.compress(schema.toString()); + // 1. Compress and Write schema out + byte[] schemaContent = compress(schema.toString()); output.writeInt(schemaContent.length); output.write(schemaContent); - // 3. Write total number of records + List records = new ArrayList<>(); + try (ClosableIterator recordItr = getRecordIterator()) { + recordItr.forEachRemaining(records::add); + } + + // 2. Write total number of records output.writeInt(records.size()); - // 4. Write the records + // 3. 
Write the records Iterator itr = records.iterator(); while (itr.hasNext()) { IndexedRecord s = itr.next(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java index 558053bc3f39f..0ff3a77b5007b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; import org.apache.hadoop.fs.FSDataInputStream; @@ -44,9 +43,9 @@ public HoodieCommandBlock(Map header) { this(Option.empty(), null, false, Option.empty(), header, new HashMap<>()); } - private HoodieCommandBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, - Option blockContentLocation, Map header, - Map footer) { + public HoodieCommandBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Map header, + Map footer) { super(header, footer, blockContentLocation, content, inputStream, readBlockLazily); this.type = HoodieCommandBlockTypeEnum.values()[Integer.parseInt(header.get(HeaderMetadataType.COMMAND_BLOCK_TYPE))]; @@ -65,12 +64,4 @@ public HoodieLogBlockType getBlockType() { public byte[] getContentBytes() { return new byte[0]; } - - public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndpos, Map header, - Map footer) { - - return new HoodieCommandBlock(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), header, footer); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java index 873be1315e50b..3e4f571588684 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; import org.apache.hadoop.fs.FSDataInputStream; @@ -32,15 +31,14 @@ */ public class HoodieCorruptBlock extends HoodieLogBlock { - private HoodieCorruptBlock(Option corruptedBytes, FSDataInputStream inputStream, boolean readBlockLazily, - Option blockContentLocation, Map header, - Map footer) { + public HoodieCorruptBlock(Option corruptedBytes, FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Map header, + Map footer) { super(header, footer, blockContentLocation, corruptedBytes, inputStream, readBlockLazily); } @Override public byte[] getContentBytes() throws IOException { - if (!getContent().isPresent() && readBlockLazily) { // read content from disk inflate(); @@ -53,11 +51,4 @@ public HoodieLogBlockType getBlockType() { return HoodieLogBlockType.CORRUPT_BLOCK; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, - Option corruptedBytes, boolean readBlockLazily, long position, long blockSize, long blockEndPos, - Map header, Map footer) { - - return new 
HoodieCorruptBlock(corruptedBytes, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java index 8f5b741f37909..c83b3bc82d56c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java @@ -18,25 +18,29 @@ package org.apache.hudi.common.table.log.block; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FSDataInputStream; - -import javax.annotation.Nonnull; +import org.apache.hudi.internal.schema.InternalSchema; import java.io.IOException; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.function.Function; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; /** * DataBlock contains a list of records serialized using formats compatible with the base file format. * For each base file format there is a corresponding DataBlock format. - * + *

    * The Datablock contains: * 1. Data Block version * 2. Total number of records in the block @@ -44,91 +48,256 @@ */ public abstract class HoodieDataBlock extends HoodieLogBlock { - protected List records; - protected Schema schema; + // TODO rebase records/content to leverage Either to warrant + // that they are mutex (used by read/write flows respectively) + private final Option> records; - public HoodieDataBlock(@Nonnull Map logBlockHeader, - @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, @Nonnull Option content, - FSDataInputStream inputStream, boolean readBlockLazily) { - super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily); - } + /** + * Key field's name w/in the record's schema + */ + private final String keyFieldName; - public HoodieDataBlock(@Nonnull List records, @Nonnull Map header, - @Nonnull Map footer) { - super(header, footer, Option.empty(), Option.empty(), null, false); - this.records = records; - this.schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); - } + private final boolean enablePointLookups; + + protected final Schema readerSchema; - public HoodieDataBlock(@Nonnull List records, @Nonnull Map header) { - this(records, header, new HashMap<>()); + protected InternalSchema internalSchema = InternalSchema.getEmptyInternalSchema(); + + /** + * NOTE: This ctor is used on the write-path (ie when records ought to be written into the log) + */ + public HoodieDataBlock(List records, + Map header, + Map footer, + String keyFieldName) { + super(header, footer, Option.empty(), Option.empty(), null, false); + this.records = Option.of(records); + this.keyFieldName = keyFieldName; + // If no reader-schema has been provided assume writer-schema as one + this.readerSchema = getWriterSchema(super.getLogBlockHeader()); + this.enablePointLookups = false; } - protected HoodieDataBlock(Option content, @Nonnull FSDataInputStream inputStream, boolean readBlockLazily, - Option blockContentLocation, Schema readerSchema, - @Nonnull Map headers, @Nonnull Map footer) { + /** + * NOTE: This ctor is used on the write-path (ie when records ought to be written into the log) + */ + protected HoodieDataBlock(Option content, + FSDataInputStream inputStream, + boolean readBlockLazily, + Option blockContentLocation, + Option readerSchema, + Map headers, + Map footer, + String keyFieldName, + boolean enablePointLookups) { super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily); - this.schema = readerSchema; + this.records = Option.empty(); + this.keyFieldName = keyFieldName; + // If no reader-schema has been provided assume writer-schema as one + this.readerSchema = readerSchema.orElseGet(() -> getWriterSchema(super.getLogBlockHeader())); + this.enablePointLookups = enablePointLookups; } - public static HoodieLogBlock getBlock(HoodieLogBlockType logDataBlockFormat, List recordList, - Map header) { - switch (logDataBlockFormat) { - case AVRO_DATA_BLOCK: - return new HoodieAvroDataBlock(recordList, header); - case HFILE_DATA_BLOCK: - return new HoodieHFileDataBlock(recordList, header); - default: - throw new HoodieException("Data block format " + logDataBlockFormat + " not implemented"); - } + protected HoodieDataBlock(Option content, + FSDataInputStream inputStream, + boolean readBlockLazily, + Option blockContentLocation, + Option readerSchema, + Map headers, + Map footer, + String keyFieldName, + boolean enablePointLookups, + InternalSchema internalSchema) { + 
super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily); + this.records = Option.empty(); + this.keyFieldName = keyFieldName; + // If no reader-schema has been provided assume writer-schema as one + this.readerSchema = readerSchema.orElseGet(() -> getWriterSchema(super.getLogBlockHeader())); + this.enablePointLookups = enablePointLookups; + this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema; } @Override public byte[] getContentBytes() throws IOException { // In case this method is called before realizing records from content - if (getContent().isPresent()) { - return getContent().get(); - } else if (readBlockLazily && !getContent().isPresent() && records == null) { - // read block lazily - createRecordsFromContentBytes(); + Option content = getContent(); + + checkState(content.isPresent() || records.isPresent(), "Block is in invalid state"); + + if (content.isPresent()) { + return content.get(); } - return serializeRecords(); + return serializeRecords(records.get()); } - public abstract HoodieLogBlockType getBlockType(); + protected static Schema getWriterSchema(Map logBlockHeader) { + return new Schema.Parser().parse(logBlockHeader.get(HeaderMetadataType.SCHEMA)); + } - public List getRecords() { - if (records == null) { - try { - // in case records are absent, read content lazily and then convert to IndexedRecords - createRecordsFromContentBytes(); - } catch (IOException io) { - throw new HoodieIOException("Unable to convert content bytes to records", io); - } + /** + * Returns all the records iterator contained w/in this block. + */ + public final ClosableIterator getRecordIterator() { + if (records.isPresent()) { + return list2Iterator(records.get()); + } + try { + // in case records are absent, read content lazily and then convert to IndexedRecords + return readRecordsFromBlockPayload(); + } catch (IOException io) { + throw new HoodieIOException("Unable to convert content bytes to records", io); } - return records; } public Schema getSchema() { - // if getSchema was invoked before converting byte [] to records - if (records == null) { - getRecords(); + return readerSchema; + } + + /** + * Batch get of keys of interest. Implementation can choose to either do full scan and return matched entries or + * do a seek based parsing and return matched entries. + * + * @param keys keys of interest. + * @return List of IndexedRecords for the keys of interest. 
+ * @throws IOException in case of failures encountered when reading/parsing records + */ + public final ClosableIterator getRecordIterator(List keys, boolean fullKey) throws IOException { + boolean fullScan = keys.isEmpty(); + if (enablePointLookups && !fullScan) { + return lookupRecords(keys, fullKey); + } + + // Otherwise, we fetch all the records and filter out all the records, but the + // ones requested + ClosableIterator allRecords = getRecordIterator(); + if (fullScan) { + return allRecords; } - return schema; + + HashSet keySet = new HashSet<>(keys); + return FilteringIterator.getInstance(allRecords, keySet, fullKey, this::getRecordKey); } - private void createRecordsFromContentBytes() throws IOException { + protected ClosableIterator readRecordsFromBlockPayload() throws IOException { if (readBlockLazily && !getContent().isPresent()) { // read log block contents from disk inflate(); } - deserializeRecords(); + try { + return deserializeRecords(getContent().get()); + } finally { + // Free up content to be GC'd by deflating the block + deflate(); + } + } + + protected ClosableIterator lookupRecords(List keys, boolean fullKey) throws IOException { + throw new UnsupportedOperationException( + String.format("Point lookups are not supported by this Data block type (%s)", getBlockType()) + ); + } + + protected abstract byte[] serializeRecords(List records) throws IOException; + + protected abstract ClosableIterator deserializeRecords(byte[] content) throws IOException; + + public abstract HoodieLogBlockType getBlockType(); + + protected Option getKeyField(Schema schema) { + return Option.ofNullable(schema.getField(keyFieldName)); + } + + protected Option getRecordKey(IndexedRecord record) { + return getKeyField(record.getSchema()) + .map(keyField -> record.get(keyField.pos())) + .map(Object::toString); + } + + /** + * Converts the given list to closable iterator. + */ + static ClosableIterator list2Iterator(List list) { + Iterator iterator = list.iterator(); + return new ClosableIterator() { + @Override + public void close() { + // ignored + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + }; } - protected abstract byte[] serializeRecords() throws IOException; + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * A {@link ClosableIterator} that supports filtering strategy with given keys. + * User should supply the key extraction function for fetching string format keys. 
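A short sketch of the key-based read path described above: with point lookups enabled the block can seek directly (as the HFile block does), otherwise the same call degrades to a full scan filtered by the FilteringIterator below. The key values are placeholders.

import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.util.ClosableIterator;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

// Sketch only: fetch a subset of a block's records by record key.
final class KeyLookupSketch {
  static void lookups(HoodieDataBlock dataBlock) throws IOException {
    // Full-key match: only records whose key equals one of the given keys are returned.
    List<String> keys = Arrays.asList("uuid-001", "uuid-002");  // placeholder keys
    try (ClosableIterator<IndexedRecord> it = dataBlock.getRecordIterator(keys, true)) {
      it.forEachRemaining(record -> { /* handle matched record */ });
    }

    // Prefix match (fullKey = false): keys are matched with startsWith, per FilteringIterator.
    List<String> prefixes = Arrays.asList("uuid-0");            // placeholder prefix
    try (ClosableIterator<IndexedRecord> it = dataBlock.getRecordIterator(prefixes, false)) {
      it.forEachRemaining(record -> { /* handle matched record */ });
    }
  }
}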
+ * + * @param the element type + */ + private static class FilteringIterator implements ClosableIterator { + private final ClosableIterator nested; // nested iterator + + private final Set keys; // the filtering keys + private final boolean fullKey; - protected abstract void deserializeRecords() throws IOException; + private final Function> keyExtract; // function to extract the key + + private T next; + + private FilteringIterator(ClosableIterator nested, Set keys, boolean fullKey, Function> keyExtract) { + this.nested = nested; + this.keys = keys; + this.fullKey = fullKey; + this.keyExtract = keyExtract; + } + + public static FilteringIterator getInstance( + ClosableIterator nested, + Set keys, + boolean fullKey, + Function> keyExtract) { + return new FilteringIterator<>(nested, keys, fullKey, keyExtract); + } + + @Override + public void close() { + this.nested.close(); + } + + @Override + public boolean hasNext() { + while (this.nested.hasNext()) { + this.next = this.nested.next(); + String key = keyExtract.apply(this.next) + .orElseGet(() -> { + throw new IllegalStateException(String.format("Record without a key (%s)", this.next)); + }); + + if (fullKey && keys.contains(key) + || !fullKey && keys.stream().anyMatch(key::startsWith)) { + return true; + } + } + return false; + } + + @Override + public T next() { + return this.next; + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java index 45534f7b51013..a5168072d014d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java @@ -19,8 +19,8 @@ package org.apache.hudi.common.table.log.block; import org.apache.hudi.common.fs.SizeAwareDataInputStream; +import org.apache.hudi.common.model.DeleteRecord; import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SerializationUtils; import org.apache.hudi.exception.HoodieIOException; @@ -32,6 +32,7 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -40,14 +41,14 @@ */ public class HoodieDeleteBlock extends HoodieLogBlock { - private HoodieKey[] keysToDelete; + private DeleteRecord[] recordsToDelete; - public HoodieDeleteBlock(HoodieKey[] keysToDelete, Map header) { + public HoodieDeleteBlock(DeleteRecord[] recordsToDelete, Map header) { this(Option.empty(), null, false, Option.empty(), header, new HashMap<>()); - this.keysToDelete = keysToDelete; + this.recordsToDelete = recordsToDelete; } - private HoodieDeleteBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, + public HoodieDeleteBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, Option blockContentLocation, Map header, Map footer) { super(header, footer, blockContentLocation, content, inputStream, readBlockLazily); @@ -55,27 +56,28 @@ private HoodieDeleteBlock(Option content, FSDataInputStream inputStream, @Override public byte[] getContentBytes() throws IOException { + Option content = getContent(); // In case this method is called before realizing keys from content - if (getContent().isPresent()) { - return getContent().get(); - } else if (readBlockLazily && 
!getContent().isPresent() && keysToDelete == null) { + if (content.isPresent()) { + return content.get(); + } else if (readBlockLazily && recordsToDelete == null) { // read block lazily - getKeysToDelete(); + getRecordsToDelete(); } ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream output = new DataOutputStream(baos); - byte[] bytesToWrite = SerializationUtils.serialize(getKeysToDelete()); + byte[] bytesToWrite = SerializationUtils.serialize(getRecordsToDelete()); output.writeInt(version); output.writeInt(bytesToWrite.length); output.write(bytesToWrite); return baos.toByteArray(); } - public HoodieKey[] getKeysToDelete() { + public DeleteRecord[] getRecordsToDelete() { try { - if (keysToDelete == null) { + if (recordsToDelete == null) { if (!getContent().isPresent() && readBlockLazily) { // read content from disk inflate(); @@ -86,25 +88,28 @@ public HoodieKey[] getKeysToDelete() { int dataLength = dis.readInt(); byte[] data = new byte[dataLength]; dis.readFully(data); - this.keysToDelete = SerializationUtils.deserialize(data); + this.recordsToDelete = deserialize(version, data); deflate(); } - return keysToDelete; + return recordsToDelete; } catch (IOException io) { throw new HoodieIOException("Unable to generate keys to delete from block content", io); } } + private static DeleteRecord[] deserialize(int version, byte[] data) { + if (version == 1) { + // legacy version + HoodieKey[] keys = SerializationUtils.deserialize(data); + return Arrays.stream(keys).map(DeleteRecord::create).toArray(DeleteRecord[]::new); + } else { + return SerializationUtils.deserialize(data); + } + } + @Override public HoodieLogBlockType getBlockType() { return HoodieLogBlockType.DELETE_BLOCK; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndPos, Map header, - Map footer) throws IOException { - - return new HoodieDeleteBlock(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java index 61d9b7f233bee..b36aa135a2cef 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java @@ -18,39 +18,42 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.io.storage.HoodieHFileReader; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import org.apache.avro.Schema; -import org.apache.avro.Schema.Field; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hadoop.hbase.io.hfile.HFile; import 
org.apache.hadoop.hbase.io.hfile.HFileContext; import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; -import org.apache.hadoop.hbase.util.Pair; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.fs.inline.InLineFSUtils; +import org.apache.hudi.common.fs.inline.InLineFileSystem; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.HoodieHBaseKVComparator; +import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; -import java.util.stream.Collectors; -import javax.annotation.Nonnull; +import static org.apache.hudi.common.util.ValidationUtils.checkState; /** * HoodieHFileDataBlock contains a list of records stored inside an HFile format. It is used with the HFile @@ -58,26 +61,35 @@ */ public class HoodieHFileDataBlock extends HoodieDataBlock { private static final Logger LOG = LogManager.getLogger(HoodieHFileDataBlock.class); - private static Compression.Algorithm compressionAlgorithm = Compression.Algorithm.GZ; - private static int blockSize = 1 * 1024 * 1024; - - public HoodieHFileDataBlock(@Nonnull Map logBlockHeader, - @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, @Nonnull Option content, - FSDataInputStream inputStream, boolean readBlockLazily) { - super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily); - } - public HoodieHFileDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema, - Map header, Map footer) { - super(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header, - footer); + private static final int DEFAULT_BLOCK_SIZE = 1024 * 1024; + + private final Option compressionAlgorithm; + // This path is used for constructing HFile reader context, which should not be + // interpreted as the actual file path for the HFile data blocks + private final Path pathForReader; + + public HoodieHFileDataBlock(FSDataInputStream inputStream, + Option content, + boolean readBlockLazily, + HoodieLogBlockContentLocation logBlockContentLocation, + Option readerSchema, + Map header, + Map footer, + boolean enablePointLookups, + Path pathForReader) { + super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, HoodieHFileReader.KEY_FIELD_NAME, enablePointLookups); + this.compressionAlgorithm = Option.empty(); + this.pathForReader = pathForReader; } - public HoodieHFileDataBlock(@Nonnull List records, @Nonnull Map header) { - super(records, header, new HashMap<>()); + public HoodieHFileDataBlock(List records, + Map header, + Compression.Algorithm compressionAlgorithm, + Path pathForReader) { + super(records, header, new HashMap<>(), HoodieHFileReader.KEY_FIELD_NAME); + this.compressionAlgorithm = Option.of(compressionAlgorithm); + this.pathForReader = pathForReader; } @Override @@ -86,41 +98,46 @@ 
public HoodieLogBlockType getBlockType() { } @Override - protected byte[] serializeRecords() throws IOException { - HFileContext context = new HFileContextBuilder().withBlockSize(blockSize).withCompression(compressionAlgorithm) + protected byte[] serializeRecords(List records) throws IOException { + HFileContext context = new HFileContextBuilder() + .withBlockSize(DEFAULT_BLOCK_SIZE) + .withCompression(compressionAlgorithm.get()) + .withCellComparator(new HoodieHBaseKVComparator()) .build(); + Configuration conf = new Configuration(); CacheConfig cacheConfig = new CacheConfig(conf); ByteArrayOutputStream baos = new ByteArrayOutputStream(); FSDataOutputStream ostream = new FSDataOutputStream(baos, null); - HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig) - .withOutputStream(ostream).withFileContext(context).create(); + // Use simple incrementing counter as a key + boolean useIntegerKey = !getRecordKey(records.get(0)).isPresent(); + // This is set here to avoid re-computing this in the loop + int keyWidth = useIntegerKey ? (int) Math.ceil(Math.log(records.size())) + 1 : -1; // Serialize records into bytes Map sortedRecordsMap = new TreeMap<>(); Iterator itr = records.iterator(); - boolean useIntegerKey = false; - int key = 0; - int keySize = 0; - Field keyField = records.get(0).getSchema().getField(HoodieRecord.RECORD_KEY_METADATA_FIELD); - if (keyField == null) { - // Missing key metadata field so we should use an integer sequence key - useIntegerKey = true; - keySize = (int) Math.ceil(Math.log(records.size())) + 1; - } + + int id = 0; while (itr.hasNext()) { IndexedRecord record = itr.next(); String recordKey; if (useIntegerKey) { - recordKey = String.format("%" + keySize + "s", key++); + recordKey = String.format("%" + keyWidth + "s", id++); } else { - recordKey = record.get(keyField.pos()).toString(); + recordKey = getRecordKey(record).get(); } - byte[] recordBytes = HoodieAvroUtils.indexedRecordToBytes(record); + + final byte[] recordBytes = serializeRecord(record); + ValidationUtils.checkState(!sortedRecordsMap.containsKey(recordKey), + "Writing multiple records with same key not supported for " + this.getClass().getName()); sortedRecordsMap.put(recordKey, recordBytes); } + HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig) + .withOutputStream(ostream).withFileContext(context).create(); + // Write the records sortedRecordsMap.forEach((recordKey, recordBytes) -> { try { @@ -131,6 +148,8 @@ protected byte[] serializeRecords() throws IOException { } }); + writer.appendFileInfo(HoodieHFileReader.SCHEMA_KEY.getBytes(), getSchema().toString().getBytes()); + writer.close(); ostream.flush(); ostream.close(); @@ -139,21 +158,87 @@ protected byte[] serializeRecords() throws IOException { } @Override - protected void deserializeRecords() throws IOException { + protected ClosableIterator deserializeRecords(byte[] content) throws IOException { + checkState(readerSchema != null, "Reader's schema has to be non-null"); + // Get schema from the header Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); - // If readerSchema was not present, use writerSchema - if (schema == null) { - schema = writerSchema; - } - // Read the content - HoodieHFileReader reader = new HoodieHFileReader<>(getContent().get()); - List> records = reader.readAllRecords(writerSchema, schema); - this.records = records.stream().map(t -> t.getSecond()).collect(Collectors.toList()); + HoodieHFileReader reader = new HoodieHFileReader<>(null, 
pathForReader, content, Option.of(writerSchema)); + Iterator recordIterator = reader.getRecordIterator(readerSchema); + return new ClosableIterator() { + @Override + public void close() { + reader.close(); + } + + @Override + public boolean hasNext() { + return recordIterator.hasNext(); + } - // Free up content to be GC'd, deflate - deflate(); + @Override + public IndexedRecord next() { + return recordIterator.next(); + } + }; + } + + // TODO abstract this w/in HoodieDataBlock + @Override + protected ClosableIterator lookupRecords(List keys, boolean fullKey) throws IOException { + HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get(); + + // NOTE: It's important to extend Hadoop configuration here to make sure configuration + // is appropriately carried over + Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf()); + inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName()); + inlineConf.setClassLoader(InLineFileSystem.class.getClassLoader()); + + Path inlinePath = InLineFSUtils.getInlineFilePath( + blockContentLoc.getLogFile().getPath(), + blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(), + blockContentLoc.getContentPositionInLogFile(), + blockContentLoc.getBlockSize()); + + // HFile read will be efficient if keys are sorted, since on storage records are sorted by key. + // This will avoid unnecessary seeks. + List sortedKeys = new ArrayList<>(keys); + Collections.sort(sortedKeys); + + final HoodieHFileReader reader = + new HoodieHFileReader<>(inlineConf, inlinePath, new CacheConfig(inlineConf), inlinePath.getFileSystem(inlineConf)); + + // Get writer's schema from the header + final ClosableIterator recordIterator = + fullKey ? reader.getRecordsByKeysIterator(sortedKeys, readerSchema) : reader.getRecordsByKeyPrefixIterator(sortedKeys, readerSchema); + + return new ClosableIterator() { + @Override + public boolean hasNext() { + return recordIterator.hasNext(); + } + + @Override + public IndexedRecord next() { + return recordIterator.next(); + } + + @Override + public void close() { + recordIterator.close(); + reader.close(); + } + }; + } + + private byte[] serializeRecord(IndexedRecord record) { + Option keyField = getKeyField(record.getSchema()); + // Reset key value w/in the record to avoid duplicating the key w/in payload + if (keyField.isPresent()) { + record.put(keyField.get().pos(), StringUtils.EMPTY_STRING); + } + return HoodieAvroUtils.indexedRecordToBytes(record); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java index 1d185e49bf040..71336be883781 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java @@ -18,16 +18,18 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.common.fs.FSUtils; +import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.TypeUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hadoop.fs.FSDataInputStream; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import 
java.io.ByteArrayOutputStream; import java.io.DataInputStream; @@ -37,6 +39,8 @@ import java.util.HashMap; import java.util.Map; +import static org.apache.hudi.common.util.ValidationUtils.checkState; + /** * Abstract class defining a block in HoodieLogFile. */ @@ -47,7 +51,7 @@ public abstract class HoodieLogBlock { * corresponding changes need to be made to {@link HoodieLogBlockVersion} TODO : Change this to a class, something * like HoodieLogBlockVersionV1/V2 and implement/override operations there */ - public static int version = 1; + public static int version = 2; // Header for each log block private final Map logBlockHeader; // Footer for each log block @@ -59,14 +63,17 @@ public abstract class HoodieLogBlock { // TODO : change this to just InputStream so this works for any FileSystem // create handlers to return specific type of inputstream based on FS // input stream corresponding to the log file where this logBlock belongs - protected FSDataInputStream inputStream; + private final FSDataInputStream inputStream; // Toggle flag, whether to read blocks lazily (I/O intensive) or not (Memory intensive) protected boolean readBlockLazily; - public HoodieLogBlock(@Nonnull Map logBlockHeader, + public HoodieLogBlock( + @Nonnull Map logBlockHeader, @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, @Nonnull Option content, - FSDataInputStream inputStream, boolean readBlockLazily) { + @Nonnull Option blockContentLocation, + @Nonnull Option content, + @Nullable FSDataInputStream inputStream, + boolean readBlockLazily) { this.logBlockHeader = logBlockHeader; this.logBlockFooter = logBlockFooter; this.blockContentLocation = blockContentLocation; @@ -110,7 +117,25 @@ public Option getContent() { * Type of the log block WARNING: This enum is serialized as the ordinal. Only add new enums at the end. 
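The bump of the writer `version` above from 1 to 2 is what lets readers tell the two delete-block payloads apart: version-1 blocks carry a serialized HoodieKey[], while version-2 blocks carry the DeleteRecord[] introduced earlier in this patch (see HoodieDeleteBlock.deserialize). A hedged sketch of building a new-format delete block; the key and partition values are placeholders and the block headers are elided.

import org.apache.hudi.common.model.DeleteRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;

import java.util.HashMap;
import java.util.Map;

// Sketch only: a delete block in the new DeleteRecord[] format.
final class DeleteBlockSketch {
  static HoodieDeleteBlock deleteBlock() {
    DeleteRecord[] deletes = new DeleteRecord[] {
        DeleteRecord.create(new HoodieKey("uuid-001", "2022/01/01"))  // placeholder key and partition
    };
    Map<HeaderMetadataType, String> header = new HashMap<>();         // real writers fill in block headers here
    return new HoodieDeleteBlock(deletes, header);
  }
}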
*/ public enum HoodieLogBlockType { - COMMAND_BLOCK, DELETE_BLOCK, CORRUPT_BLOCK, AVRO_DATA_BLOCK, HFILE_DATA_BLOCK + COMMAND_BLOCK(":command"), + DELETE_BLOCK(":delete"), + CORRUPT_BLOCK(":corrupted"), + AVRO_DATA_BLOCK("avro"), + HFILE_DATA_BLOCK("hfile"), + PARQUET_DATA_BLOCK("parquet"); + + private static final Map ID_TO_ENUM_MAP = + TypeUtils.getValueToEnumMap(HoodieLogBlockType.class, e -> e.id); + + private final String id; + + HoodieLogBlockType(String id) { + this.id = id; + } + + public static HoodieLogBlockType fromId(String id) { + return ID_TO_ENUM_MAP.get(id); + } } /** @@ -133,7 +158,8 @@ public enum FooterMetadataType { * intensive CompactedScanner, the location helps to lazily read contents from the log file */ public static final class HoodieLogBlockContentLocation { - + // Hadoop Config required to access the file + private final Configuration hadoopConf; // The logFile that contains this block private final HoodieLogFile logFile; // The filePosition in the logFile for the contents of this block @@ -143,14 +169,22 @@ public static final class HoodieLogBlockContentLocation { // The final position where the complete block ends private final long blockEndPos; - HoodieLogBlockContentLocation(HoodieLogFile logFile, long contentPositionInLogFile, long blockSize, - long blockEndPos) { + public HoodieLogBlockContentLocation(Configuration hadoopConf, + HoodieLogFile logFile, + long contentPositionInLogFile, + long blockSize, + long blockEndPos) { + this.hadoopConf = hadoopConf; this.logFile = logFile; this.contentPositionInLogFile = contentPositionInLogFile; this.blockSize = blockSize; this.blockEndPos = blockEndPos; } + public Configuration getHadoopConf() { + return hadoopConf; + } + public HoodieLogFile getLogFile() { return logFile; } @@ -211,30 +245,33 @@ public static Map getLogMetadata(DataInputStream dis * Read or Skip block content of a log block in the log file. Depends on lazy reading enabled in * {@link HoodieMergedLogRecordScanner} */ - public static byte[] readOrSkipContent(FSDataInputStream inputStream, Integer contentLength, boolean readBlockLazily) + public static Option tryReadContent(FSDataInputStream inputStream, Integer contentLength, boolean readLazily) throws IOException { - byte[] content = null; - if (!readBlockLazily) { - // Read the contents in memory - content = new byte[contentLength]; - inputStream.readFully(content, 0, contentLength); - } else { + if (readLazily) { // Seek to the end of the content block - safeSeek(inputStream, inputStream.getPos() + contentLength); + inputStream.seek(inputStream.getPos() + contentLength); + return Option.empty(); } - return content; + + // TODO re-use buffer if stream is backed by buffer + // Read the contents in memory + byte[] content = new byte[contentLength]; + inputStream.readFully(content, 0, contentLength); + return Option.of(content); } /** * When lazyReading of blocks is turned on, inflate the content of a log block from disk. 
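Block types are still serialized by ordinal (hence the warning above), so the string ids introduced in this patch exist for name-based resolution, for example mapping a format name such as "parquet" back to a block type. A small illustrative sketch:

import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;

// Sketch only: resolve block types from the new string ids.
final class BlockTypeIdSketch {
  static void example() {
    HoodieLogBlockType avro = HoodieLogBlockType.fromId("avro");        // AVRO_DATA_BLOCK
    HoodieLogBlockType hfile = HoodieLogBlockType.fromId("hfile");      // HFILE_DATA_BLOCK
    HoodieLogBlockType parquet = HoodieLogBlockType.fromId("parquet");  // PARQUET_DATA_BLOCK
    // Control blocks use the prefixed ids ":command", ":delete" and ":corrupted".
  }
}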
*/ protected void inflate() throws HoodieIOException { + checkState(!content.isPresent(), "Block has already been inflated"); + checkState(inputStream != null, "Block should have input-stream provided"); try { content = Option.of(new byte[(int) this.getBlockContentLocation().get().getBlockSize()]); - safeSeek(inputStream, this.getBlockContentLocation().get().getContentPositionInLogFile()); + inputStream.seek(this.getBlockContentLocation().get().getContentPositionInLogFile()); inputStream.readFully(content.get(), 0, content.get().length); - safeSeek(inputStream, this.getBlockContentLocation().get().getBlockEndPos()); + inputStream.seek(this.getBlockContentLocation().get().getBlockEndPos()); } catch (IOException e) { // TODO : fs.open() and return inputstream again, need to pass FS configuration // because the inputstream might close/timeout for large number of log blocks to be merged @@ -249,23 +286,4 @@ protected void inflate() throws HoodieIOException { protected void deflate() { content = Option.empty(); } - - /** - * Handles difference in seek behavior for GCS and non-GCS input stream. - * - * @param inputStream Input Stream - * @param pos Position to seek - * @throws IOException - - */ - private static void safeSeek(FSDataInputStream inputStream, long pos) throws IOException { - try { - inputStream.seek(pos); - } catch (EOFException e) { - if (FSUtils.isGCSInputStream(inputStream)) { - inputStream.seek(pos - 1); - } else { - throw e; - } - } - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java new file mode 100644 index 0000000000000..afb448f844891 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.table.log.block; + +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.common.fs.inline.InLineFSUtils; +import org.apache.hudi.common.fs.inline.InLineFileSystem; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetReaderIterator; +import org.apache.hudi.io.storage.HoodieParquetConfig; +import org.apache.hudi.io.storage.HoodieParquetStreamWriter; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; + +import javax.annotation.Nonnull; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * HoodieParquetDataBlock contains a list of records serialized using Parquet. + */ +public class HoodieParquetDataBlock extends HoodieDataBlock { + + private final Option compressionCodecName; + + public HoodieParquetDataBlock(FSDataInputStream inputStream, + Option content, + boolean readBlockLazily, + HoodieLogBlockContentLocation logBlockContentLocation, + Option readerSchema, + Map header, + Map footer, + String keyField) { + super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false); + + this.compressionCodecName = Option.empty(); + } + + public HoodieParquetDataBlock( + @Nonnull List records, + @Nonnull Map header, + @Nonnull String keyField, + @Nonnull CompressionCodecName compressionCodecName + ) { + super(records, header, new HashMap<>(), keyField); + + this.compressionCodecName = Option.of(compressionCodecName); + } + + @Override + public HoodieLogBlockType getBlockType() { + return HoodieLogBlockType.PARQUET_DATA_BLOCK; + } + + @Override + protected byte[] serializeRecords(List records) throws IOException { + if (records.size() == 0) { + return new byte[0]; + } + + Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( + new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty()); + + HoodieParquetConfig avroParquetConfig = + new HoodieParquetConfig<>( + writeSupport, + compressionCodecName.get(), + ParquetWriter.DEFAULT_BLOCK_SIZE, + ParquetWriter.DEFAULT_PAGE_SIZE, + 1024 * 1024 * 1024, + new Configuration(), + Double.parseDouble(String.valueOf(0.1)));//HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue())); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + try (FSDataOutputStream outputStream = new FSDataOutputStream(baos)) { + try (HoodieParquetStreamWriter parquetWriter = new HoodieParquetStreamWriter<>(outputStream, avroParquetConfig)) { + for (IndexedRecord record : records) { + String recordKey = getRecordKey(record).orElse(null); + parquetWriter.writeAvro(recordKey, record); + } + outputStream.flush(); + } + 
} + + return baos.toByteArray(); + } + + public static ClosableIterator getProjectedParquetRecordsIterator(Configuration conf, + Schema readerSchema, + InputFile inputFile) throws IOException { + AvroReadSupport.setAvroReadSchema(conf, readerSchema); + AvroReadSupport.setRequestedProjection(conf, readerSchema); + + ParquetReader reader = + AvroParquetReader.builder(inputFile).withConf(conf).build(); + return new ParquetReaderIterator<>(reader); + } + + /** + * NOTE: We're overriding the whole reading sequence to make sure we properly respect + * the requested Reader's schema and only fetch the columns that have been explicitly + * requested by the caller (providing projected Reader's schema) + */ + @Override + protected ClosableIterator readRecordsFromBlockPayload() throws IOException { + HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get(); + + // NOTE: It's important to extend Hadoop configuration here to make sure configuration + // is appropriately carried over + Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf()); + inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName()); + + Path inlineLogFilePath = InLineFSUtils.getInlineFilePath( + blockContentLoc.getLogFile().getPath(), + blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(), + blockContentLoc.getContentPositionInLogFile(), + blockContentLoc.getBlockSize()); + + return getProjectedParquetRecordsIterator( + inlineConf, + readerSchema, + HadoopInputFile.fromPath(inlineLogFilePath, inlineConf)); + } + + @Override + protected ClosableIterator deserializeRecords(byte[] content) { + throw new UnsupportedOperationException("Should not be invoked"); + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/marker/MarkerOperation.java b/hudi-common/src/main/java/org/apache/hudi/common/table/marker/MarkerOperation.java new file mode 100644 index 0000000000000..94da60c39c1ab --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/marker/MarkerOperation.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.marker; + +import java.io.Serializable; + +/** + * Stores URLs to timeline server for marker-related operations. 
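`getProjectedParquetRecordsIterator` is exposed as a public static helper, so callers can read only the columns named in a projected reader schema. A hedged usage sketch follows; the schema string and file path are hypothetical, and the iterator is assumed to be closeable in a try-with-resources (as the archived-timeline reader in this diff does with `ClosableIterator`):

```java
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
import org.apache.hudi.common.util.ClosableIterator;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ProjectedParquetReadExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Hypothetical projection: only the column the caller actually needs.
    Schema projectedSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Row\",\"fields\":["
            + "{\"name\":\"_hoodie_record_key\",\"type\":[\"null\",\"string\"],\"default\":null}]}");

    Path parquetFile = new Path("/tmp/example.parquet"); // hypothetical input file

    try (ClosableIterator<IndexedRecord> it = HoodieParquetDataBlock.getProjectedParquetRecordsIterator(
        conf, projectedSchema, HadoopInputFile.fromPath(parquetFile, conf))) {
      while (it.hasNext()) {
        System.out.println(it.next());
      }
    }
  }
}
```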
+ */ +public class MarkerOperation implements Serializable { + + private static final String BASE_URL = "/v1/hoodie/marker"; + + public static final String MARKER_DIR_PATH_PARAM = "markerdirpath"; + public static final String MARKER_NAME_PARAM = "markername"; + + // GET requests + public static final String ALL_MARKERS_URL = String.format("%s/%s", BASE_URL, "all"); + public static final String CREATE_AND_MERGE_MARKERS_URL = String.format("%s/%s", BASE_URL, "create-and-merge"); + public static final String MARKERS_DIR_EXISTS_URL = String.format("%s/%s", BASE_URL, "dir/exists"); + + // POST requests + public static final String CREATE_MARKER_URL = String.format("%s/%s", BASE_URL, "create"); + public static final String DELETE_MARKER_DIR_URL = String.format("%s/%s", BASE_URL, "dir/delete"); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/marker/MarkerType.java b/hudi-common/src/main/java/org/apache/hudi/common/table/marker/MarkerType.java new file mode 100644 index 0000000000000..2b0a28df88326 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/marker/MarkerType.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.table.marker; + +/** + * Marker type indicating how markers are stored in the file system. 
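MarkerOperation only defines URL fragments and query-parameter names; the timeline server address and parameter values come from the caller. A small sketch of how a client might assemble a request URL from these constants; the host/port and marker values are hypothetical, and real callers would URL-encode the parameters:

```java
import org.apache.hudi.common.table.marker.MarkerOperation;

public class MarkerUrlExample {
  public static void main(String[] args) {
    String timelineServer = "http://localhost:26754"; // hypothetical timeline server address

    String url = String.format("%s%s?%s=%s&%s=%s",
        timelineServer,
        MarkerOperation.CREATE_MARKER_URL,
        MarkerOperation.MARKER_DIR_PATH_PARAM, "/tmp/table/.hoodie/.temp/001",
        MarkerOperation.MARKER_NAME_PARAM, "part-0001.parquet.marker.CREATE");

    // e.g. http://localhost:26754/v1/hoodie/marker/create?markerdirpath=...&markername=...
    System.out.println(url);
  }
}
```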
+ */ +public enum MarkerType { + DIRECT, + TIMELINE_SERVER_BASED +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java index 99fd793e334c6..2b27d3ab5e568 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java @@ -18,15 +18,19 @@ package org.apache.hudi.common.table.timeline; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieIOException; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -34,16 +38,16 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.Serializable; -import java.text.SimpleDateFormat; +import java.text.ParseException; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.Date; import java.util.HashSet; import java.util.Objects; import java.util.Set; -import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; -import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Represents the Active Timeline for the Hoodie table. 
Instants for the last 12 hours (configurable) is in the @@ -59,31 +63,91 @@ */ public class HoodieActiveTimeline extends HoodieDefaultTimeline { - public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss"); - public static final Set VALID_EXTENSIONS_IN_ACTIVE_TIMELINE = new HashSet<>(Arrays.asList( - COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, REQUESTED_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION, - INFLIGHT_DELTA_COMMIT_EXTENSION, REQUESTED_DELTA_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, - INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, REQUESTED_CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, - INFLIGHT_COMPACTION_EXTENSION, REQUESTED_COMPACTION_EXTENSION, INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION, - REQUESTED_REPLACE_COMMIT_EXTENSION, INFLIGHT_REPLACE_COMMIT_EXTENSION, REPLACE_COMMIT_EXTENSION)); + COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, REQUESTED_COMMIT_EXTENSION, + DELTA_COMMIT_EXTENSION, INFLIGHT_DELTA_COMMIT_EXTENSION, REQUESTED_DELTA_COMMIT_EXTENSION, + SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION, + CLEAN_EXTENSION, REQUESTED_CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, + INFLIGHT_COMPACTION_EXTENSION, REQUESTED_COMPACTION_EXTENSION, + REQUESTED_RESTORE_EXTENSION, INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION, + ROLLBACK_EXTENSION, REQUESTED_ROLLBACK_EXTENSION, INFLIGHT_ROLLBACK_EXTENSION, + REQUESTED_REPLACE_COMMIT_EXTENSION, INFLIGHT_REPLACE_COMMIT_EXTENSION, REPLACE_COMMIT_EXTENSION, + REQUESTED_INDEX_COMMIT_EXTENSION, INFLIGHT_INDEX_COMMIT_EXTENSION, INDEX_COMMIT_EXTENSION, + REQUESTED_SAVE_SCHEMA_ACTION_EXTENSION, INFLIGHT_SAVE_SCHEMA_ACTION_EXTENSION, SAVE_SCHEMA_ACTION_EXTENSION)); + + private static final Set NOT_PARSABLE_TIMESTAMPS = new HashSet(3) {{ + add(HoodieTimeline.INIT_INSTANT_TS); + add(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS); + add(HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS); + }}; private static final Logger LOG = LogManager.getLogger(HoodieActiveTimeline.class); protected HoodieTableMetaClient metaClient; - private static AtomicReference lastInstantTime = new AtomicReference<>(String.valueOf(Integer.MIN_VALUE)); /** - * Returns next instant time in the {@link #COMMIT_FORMATTER} format. + * Parse the timestamp of an Instant and return a {@code Date}. + * Throw ParseException if timestamp is not valid format as + * {@link org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator#SECS_INSTANT_TIMESTAMP_FORMAT}. + * + * @param timestamp a timestamp String which follow pattern as + * {@link org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator#SECS_INSTANT_TIMESTAMP_FORMAT}. + * @return Date of instant timestamp + */ + public static Date parseDateFromInstantTime(String timestamp) throws ParseException { + return HoodieInstantTimeGenerator.parseDateFromInstantTime(timestamp); + } + + /** + * The same parsing method as above, but this method will mute ParseException. + * If the given timestamp is invalid, returns {@code Option.empty}. + * Or a corresponding Date value if these timestamp strings are provided + * {@link org.apache.hudi.common.table.timeline.HoodieTimeline#INIT_INSTANT_TS}, + * {@link org.apache.hudi.common.table.timeline.HoodieTimeline#METADATA_BOOTSTRAP_INSTANT_TS}, + * {@link org.apache.hudi.common.table.timeline.HoodieTimeline#FULL_BOOTSTRAP_INSTANT_TS}. + * This method is useful when parsing timestamp for metrics + * + * @param timestamp a timestamp String which follow pattern as + * {@link org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator#SECS_INSTANT_TIMESTAMP_FORMAT}. 
+ * @return {@code Option} of instant timestamp, {@code Option.empty} if invalid timestamp + */ + public static Option parseDateFromInstantTimeSafely(String timestamp) { + Option parsedDate; + try { + parsedDate = Option.of(HoodieInstantTimeGenerator.parseDateFromInstantTime(timestamp)); + } catch (ParseException e) { + if (NOT_PARSABLE_TIMESTAMPS.contains(timestamp)) { + parsedDate = Option.of(new Date(Integer.parseInt(timestamp))); + } else { + LOG.warn("Failed to parse timestamp " + timestamp + ": " + e.getMessage()); + parsedDate = Option.empty(); + } + } + return parsedDate; + } + + /** + * Format the Date to a String representing the timestamp of a Hoodie Instant. + */ + public static String formatDate(Date timestamp) { + return HoodieInstantTimeGenerator.formatDate(timestamp); + } + + /** + * Returns next instant time in the correct format. * Ensures each instant time is atleast 1 second apart since we create instant times at second granularity */ public static String createNewInstantTime() { - return lastInstantTime.updateAndGet((oldVal) -> { - String newCommitTime; - do { - newCommitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date()); - } while (HoodieTimeline.compareTimestamps(newCommitTime, LESSER_THAN_OR_EQUALS, oldVal)); - return newCommitTime; - }); + return HoodieInstantTimeGenerator.createNewInstantTime(0); + } + + /** + * Returns next instant time that adds N milliseconds to current time. + * Ensures each instant time is atleast 1 second apart since we create instant times at second granularity + * + * @param milliseconds Milliseconds to add to current time while generating the new instant time + */ + public static String createNewInstantTime(long milliseconds) { + return HoodieInstantTimeGenerator.createNewInstantTime(milliseconds); } protected HoodieActiveTimeline(HoodieTableMetaClient metaClient, Set includedExtensions) { @@ -103,7 +167,7 @@ protected HoodieActiveTimeline(HoodieTableMetaClient metaClient, Set inc // multiple casts will make this lambda serializable - // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 this.details = (Function> & Serializable) this::getInstantDetails; - LOG.info("Loaded instants " + getInstants().collect(Collectors.toList())); + LOG.info("Loaded instants upto : " + lastInstant()); } public HoodieActiveTimeline(HoodieTableMetaClient metaClient) { @@ -119,6 +183,7 @@ public HoodieActiveTimeline(HoodieTableMetaClient metaClient, boolean applyLayou * * @deprecated */ + @Deprecated public HoodieActiveTimeline() { } @@ -127,6 +192,7 @@ public HoodieActiveTimeline() { * * @deprecated */ + @Deprecated private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); } @@ -137,6 +203,18 @@ public void createNewInstant(HoodieInstant instant) { createFileInMetaPath(instant.getFileName(), Option.empty(), false); } + public void createRequestedReplaceCommit(String instantTime, String actionType) { + try { + HoodieInstant instant = new HoodieInstant(State.REQUESTED, actionType, instantTime); + LOG.info("Creating a new instant " + instant); + // Create the request replace file + createFileInMetaPath(instant.getFileName(), + TimelineMetadataUtils.serializeRequestedReplaceMetadata(new HoodieRequestedReplaceMetadata()), false); + } catch (IOException e) { + throw new HoodieIOException("Error create requested replace commit ", e); + } + } + public void saveAsComplete(HoodieInstant instant, Option data) { LOG.info("Marking instant complete " + instant); 
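The instant-time helpers added above delegate to HoodieInstantTimeGenerator. A usage sketch of the public entry points shown in this hunk (`createNewInstantTime`, `parseDateFromInstantTimeSafely`, `formatDate`); it only exercises the signatures visible in the diff:

```java
import java.util.Date;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;

public class InstantTimeExample {
  public static void main(String[] args) {
    // Generate a new instant time; the overload adds N milliseconds to the current time
    // before formatting, which is useful when two instants must not collide.
    String instantTime = HoodieActiveTimeline.createNewInstantTime();
    String laterInstantTime = HoodieActiveTimeline.createNewInstantTime(5000L);

    // Lenient parsing: an invalid timestamp yields Option.empty() instead of a ParseException.
    Option<Date> parsed = HoodieActiveTimeline.parseDateFromInstantTimeSafely(instantTime);
    if (parsed.isPresent()) {
      System.out.println(instantTime + " -> " + HoodieActiveTimeline.formatDate(parsed.get()));
    }
    System.out.println("Later instant: " + laterInstantTime);
  }
}
```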
ValidationUtils.checkArgument(instant.isInflight(), @@ -163,15 +241,52 @@ public void deletePending(HoodieInstant instant) { deleteInstantFile(instant); } + public void deleteCompletedRollback(HoodieInstant instant) { + ValidationUtils.checkArgument(instant.isCompleted()); + deleteInstantFile(instant); + } + + public static void deleteInstantFile(FileSystem fs, String metaPath, HoodieInstant instant) { + try { + fs.delete(new Path(metaPath, instant.getFileName()), false); + } catch (IOException e) { + throw new HoodieIOException("Could not delete instant file" + instant.getFileName(), e); + } + } + + public void deleteEmptyInstantIfExists(HoodieInstant instant) { + ValidationUtils.checkArgument(isEmpty(instant)); + deleteInstantFileIfExists(instant); + } + public void deleteCompactionRequested(HoodieInstant instant) { ValidationUtils.checkArgument(instant.isRequested()); ValidationUtils.checkArgument(Objects.equals(instant.getAction(), HoodieTimeline.COMPACTION_ACTION)); deleteInstantFile(instant); } - private void deleteInstantFile(HoodieInstant instant) { + public void deleteInstantFileIfExists(HoodieInstant instant) { LOG.info("Deleting instant " + instant); - Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), instant.getFileName()); + Path inFlightCommitFilePath = getInstantFileNamePath(instant.getFileName()); + try { + if (metaClient.getFs().exists(inFlightCommitFilePath)) { + boolean result = metaClient.getFs().delete(inFlightCommitFilePath, false); + if (result) { + LOG.info("Removed instant " + instant); + } else { + throw new HoodieIOException("Could not delete instant " + instant); + } + } else { + LOG.warn("The commit " + inFlightCommitFilePath + " to remove does not exist"); + } + } catch (IOException e) { + throw new HoodieIOException("Could not remove inflight commit " + inFlightCommitFilePath, e); + } + } + + protected void deleteInstantFile(HoodieInstant instant) { + LOG.info("Deleting instant " + instant); + Path inFlightCommitFilePath = getInstantFileNamePath(instant.getFileName()); try { boolean result = metaClient.getFs().delete(inFlightCommitFilePath, false); if (result) { @@ -186,12 +301,66 @@ private void deleteInstantFile(HoodieInstant instant) { @Override public Option getInstantDetails(HoodieInstant instant) { - Path detailPath = new Path(metaClient.getMetaPath(), instant.getFileName()); + Path detailPath = getInstantFileNamePath(instant.getFileName()); return readDataFromPath(detailPath); } + /** + * Returns most recent instant having valid schema in its {@link HoodieCommitMetadata} + */ + public Option> getLastCommitMetadataWithValidSchema() { + return Option.fromJavaOptional( + getCommitMetadataStream() + .filter(instantCommitMetadataPair -> + !StringUtils.isNullOrEmpty(instantCommitMetadataPair.getValue().getMetadata(HoodieCommitMetadata.SCHEMA_KEY))) + .findFirst() + ); + } + + /** + * Get the last instant with valid data, and convert this to HoodieCommitMetadata + */ + public Option> getLastCommitMetadataWithValidData() { + return Option.fromJavaOptional( + getCommitMetadataStream() + .filter(instantCommitMetadataPair -> + !instantCommitMetadataPair.getValue().getFileIdAndRelativePaths().isEmpty()) + .findFirst() + ); + } + + /** + * Returns stream of {@link HoodieCommitMetadata} in order reverse to chronological (ie most + * recent metadata being the first element) + */ + private Stream> getCommitMetadataStream() { + // NOTE: Streams are lazy + return getCommitsTimeline().filterCompletedInstants() + .getInstants() + 
.sorted(Comparator.comparing(HoodieInstant::getTimestamp).reversed()) + .map(instant -> { + try { + HoodieCommitMetadata commitMetadata = + HoodieCommitMetadata.fromBytes(getInstantDetails(instant).get(), HoodieCommitMetadata.class); + return Pair.of(instant, commitMetadata); + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to fetch HoodieCommitMetadata for instant (%s)", instant), e); + } + }); + } + public Option readCleanerInfoAsBytes(HoodieInstant instant) { // Cleaner metadata are always stored only in timeline .hoodie + return readDataFromPath(getInstantFileNamePath(instant.getFileName())); + } + + public Option readRollbackInfoAsBytes(HoodieInstant instant) { + // Rollback metadata are always stored only in timeline .hoodie + return readDataFromPath(getInstantFileNamePath(instant.getFileName())); + } + + public Option readRestoreInfoAsBytes(HoodieInstant instant) { + // Rollback metadata are always stored only in timeline .hoodie return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName())); } @@ -214,17 +383,20 @@ public Option readCompactionPlanAsBytes(HoodieInstant instant) { } } + public Option readIndexPlanAsBytes(HoodieInstant instant) { + return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName())); + } + /** - * Revert compaction State from inflight to requested. + * Revert instant state from inflight to requested. * * @param inflightInstant Inflight Instant * @return requested instant */ - public HoodieInstant revertCompactionInflightToRequested(HoodieInstant inflightInstant) { - ValidationUtils.checkArgument(inflightInstant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)); + public HoodieInstant revertInstantFromInflightToRequested(HoodieInstant inflightInstant) { ValidationUtils.checkArgument(inflightInstant.isInflight()); HoodieInstant requestedInstant = - new HoodieInstant(State.REQUESTED, COMPACTION_ACTION, inflightInstant.getTimestamp()); + new HoodieInstant(State.REQUESTED, inflightInstant.getAction(), inflightInstant.getTimestamp()); if (metaClient.getTimelineLayoutVersion().isNullVersion()) { // Pass empty data since it is read from the corresponding .aux/.compaction instant file transitionState(inflightInstant, requestedInstant, Option.empty()); @@ -267,7 +439,7 @@ public HoodieInstant transitionCompactionInflightToComplete(HoodieInstant inflig private void createFileInAuxiliaryFolder(HoodieInstant instant, Option data) { // This will be removed in future release. See HUDI-546 Path fullPath = new Path(metaClient.getMetaAuxiliaryPath(), instant.getFileName()); - createFileInPath(fullPath, data); + FileIOUtils.createFileInPath(metaClient.getFs(), fullPath, data); } //----------------------------------------------------------------- @@ -306,7 +478,68 @@ public HoodieInstant transitionCleanRequestedToInflight(HoodieInstant requestedI } /** - * Transition Clean State from inflight to Committed. + * Transition Rollback State from inflight to Committed. 
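`getLastCommitMetadataWithValidSchema` walks completed commits in reverse chronological order and stops at the first one carrying a non-empty schema. A hedged sketch of reading the latest writer schema off the active timeline; the builder-style construction of HoodieTableMetaClient and the base path are assumptions not shown in this diff:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

public class LastCommitSchemaExample {
  public static void main(String[] args) {
    // Assumed meta-client construction; adjust to the Hudi version in use.
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(new Configuration())
        .setBasePath("/tmp/hoodie_table") // hypothetical table base path
        .build();

    HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> last =
        timeline.getLastCommitMetadataWithValidSchema();

    if (last.isPresent()) {
      String schema = last.get().getValue().getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
      System.out.println("Latest writer schema from " + last.get().getKey() + ": " + schema);
    }
  }
}
```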
+ * + * @param inflightInstant Inflight instant + * @param data Extra Metadata + * @return commit instant + */ + public HoodieInstant transitionRollbackInflightToComplete(HoodieInstant inflightInstant, Option data) { + ValidationUtils.checkArgument(inflightInstant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)); + ValidationUtils.checkArgument(inflightInstant.isInflight()); + HoodieInstant commitInstant = new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, inflightInstant.getTimestamp()); + // Then write to timeline + transitionState(inflightInstant, commitInstant, data); + return commitInstant; + } + + /** + * Transition Rollback State from requested to inflight. + * + * @param requestedInstant requested instant + * @return commit instant + */ + public HoodieInstant transitionRollbackRequestedToInflight(HoodieInstant requestedInstant) { + ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)); + ValidationUtils.checkArgument(requestedInstant.isRequested()); + HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, ROLLBACK_ACTION, requestedInstant.getTimestamp()); + transitionState(requestedInstant, inflight, Option.empty()); + return inflight; + } + + /** + * Transition Restore State from requested to inflight. + * + * @param requestedInstant requested instant + * @return commit instant + */ + public HoodieInstant transitionRestoreRequestedToInflight(HoodieInstant requestedInstant) { + ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.RESTORE_ACTION), "Transition to inflight requested for a restore instant with diff action " + + requestedInstant.toString()); + ValidationUtils.checkArgument(requestedInstant.isRequested(), "Transition to inflight requested for an instant not in requested state " + requestedInstant.toString()); + HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, RESTORE_ACTION, requestedInstant.getTimestamp()); + transitionState(requestedInstant, inflight, Option.empty()); + return inflight; + } + + /** + * Transition replace requested file to replace inflight. + * + * @param requestedInstant Requested instant + * @param data Extra Metadata + * @return inflight instant + */ + public HoodieInstant transitionReplaceRequestedToInflight(HoodieInstant requestedInstant, Option data) { + ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)); + ValidationUtils.checkArgument(requestedInstant.isRequested()); + HoodieInstant inflightInstant = new HoodieInstant(State.INFLIGHT, REPLACE_COMMIT_ACTION, requestedInstant.getTimestamp()); + // Then write to timeline + transitionState(requestedInstant, inflightInstant, data); + return inflightInstant; + } + + /** + * Transition replace inflight to Committed. 
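The new rollback transitions follow the usual requested, inflight, completed lifecycle. A sketch stitching together the methods added in this hunk; the timeline is assumed to come from an existing table's `metaClient.getActiveTimeline()`, and empty payloads stand in for the serialized rollback plan and rollback metadata a real writer would pass:

```java
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;

public class RollbackLifecycleExample {

  // timeline is assumed to be metaClient.getActiveTimeline() for an existing table.
  static void runRollback(HoodieActiveTimeline timeline, String rollbackInstantTime) {
    // 1. Schedule: persist the requested rollback instant (normally with a serialized plan).
    HoodieInstant requested =
        new HoodieInstant(State.REQUESTED, HoodieTimeline.ROLLBACK_ACTION, rollbackInstantTime);
    timeline.saveToRollbackRequested(requested, Option.empty());

    // 2. Execute: move requested -> inflight.
    HoodieInstant inflight = timeline.transitionRollbackRequestedToInflight(requested);

    // 3. Finish: move inflight -> completed (normally with serialized HoodieRollbackMetadata).
    timeline.transitionRollbackInflightToComplete(inflight, Option.empty());
  }
}
```

The restore, replace, and indexing transitions added in the surrounding hunks follow the same three-step shape with their respective action constants.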
* * @param inflightInstant Inflight instant * @param data Extra Metadata @@ -325,41 +558,40 @@ private void transitionState(HoodieInstant fromInstant, HoodieInstant toInstant, transitionState(fromInstant, toInstant, data, false); } - private void transitionState(HoodieInstant fromInstant, HoodieInstant toInstant, Option data, + protected void transitionState(HoodieInstant fromInstant, HoodieInstant toInstant, Option data, boolean allowRedundantTransitions) { ValidationUtils.checkArgument(fromInstant.getTimestamp().equals(toInstant.getTimestamp())); try { if (metaClient.getTimelineLayoutVersion().isNullVersion()) { // Re-create the .inflight file by opening a new file and write the commit metadata in createFileInMetaPath(fromInstant.getFileName(), data, allowRedundantTransitions); - Path fromInstantPath = new Path(metaClient.getMetaPath(), fromInstant.getFileName()); - Path toInstantPath = new Path(metaClient.getMetaPath(), toInstant.getFileName()); + Path fromInstantPath = getInstantFileNamePath(fromInstant.getFileName()); + Path toInstantPath = getInstantFileNamePath(toInstant.getFileName()); boolean success = metaClient.getFs().rename(fromInstantPath, toInstantPath); if (!success) { throw new HoodieIOException("Could not rename " + fromInstantPath + " to " + toInstantPath); } } else { // Ensures old state exists in timeline - LOG.info("Checking for file exists ?" + new Path(metaClient.getMetaPath(), fromInstant.getFileName())); - ValidationUtils.checkArgument(metaClient.getFs().exists(new Path(metaClient.getMetaPath(), - fromInstant.getFileName()))); + LOG.info("Checking for file exists ?" + getInstantFileNamePath(fromInstant.getFileName())); + ValidationUtils.checkArgument(metaClient.getFs().exists(getInstantFileNamePath(fromInstant.getFileName()))); // Use Write Once to create Target File if (allowRedundantTransitions) { - createFileInPath(new Path(metaClient.getMetaPath(), toInstant.getFileName()), data); + FileIOUtils.createFileInPath(metaClient.getFs(), getInstantFileNamePath(toInstant.getFileName()), data); } else { - createImmutableFileInPath(new Path(metaClient.getMetaPath(), toInstant.getFileName()), data); + metaClient.getFs().createImmutableFileInPath(getInstantFileNamePath(toInstant.getFileName()), data); } - LOG.info("Create new file for toInstant ?" + new Path(metaClient.getMetaPath(), toInstant.getFileName())); + LOG.info("Create new file for toInstant ?" 
+ getInstantFileNamePath(toInstant.getFileName())); } } catch (IOException e) { throw new HoodieIOException("Could not complete " + fromInstant, e); } } - private void revertCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) { + protected void revertCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) { ValidationUtils.checkArgument(completed.getTimestamp().equals(inflight.getTimestamp())); - Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), inflight.getFileName()); - Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName()); + Path inFlightCommitFilePath = getInstantFileNamePath(inflight.getFileName()); + Path commitFilePath = getInstantFileNamePath(completed.getFileName()); try { if (metaClient.getTimelineLayoutVersion().isNullVersion()) { if (!metaClient.getFs().exists(inFlightCommitFilePath)) { @@ -370,8 +602,8 @@ private void revertCompleteToInflight(HoodieInstant completed, HoodieInstant inf } } } else { - Path requestedInstantFilePath = new Path(metaClient.getMetaPath(), - new HoodieInstant(State.REQUESTED, inflight.getAction(), inflight.getTimestamp()).getFileName()); + Path requestedInstantFilePath = getInstantFileNamePath(new HoodieInstant(State.REQUESTED, + inflight.getAction(), inflight.getTimestamp()).getFileName()); // If inflight and requested files do not exist, create one if (!metaClient.getFs().exists(requestedInstantFilePath)) { @@ -390,6 +622,15 @@ private void revertCompleteToInflight(HoodieInstant completed, HoodieInstant inf } } + private Path getInstantFileNamePath(String fileName) { + return new Path(fileName.contains(SCHEMA_COMMIT_ACTION) ? metaClient.getSchemaFolderName() : metaClient.getMetaPath(), fileName); + } + + public void transitionRequestedToInflight(String commitType, String inFlightInstant) { + HoodieInstant requested = new HoodieInstant(HoodieInstant.State.REQUESTED, commitType, inFlightInstant); + transitionRequestedToInflight(requested, Option.empty(), false); + } + public void transitionRequestedToInflight(HoodieInstant requested, Option content) { transitionRequestedToInflight(requested, content, false); } @@ -413,7 +654,7 @@ public void saveToCompactionRequested(HoodieInstant instant, Option cont } /** - * Saves content for inflight/requested REPLACE instant. + * Saves content for requested REPLACE instant. 
*/ public void saveToPendingReplaceCommit(HoodieInstant instant, Option content) { ValidationUtils.checkArgument(instant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)); @@ -427,59 +668,85 @@ public void saveToCleanRequested(HoodieInstant instant, Option content) createFileInMetaPath(instant.getFileName(), content, false); } - private void createFileInMetaPath(String filename, Option content, boolean allowOverwrite) { - Path fullPath = new Path(metaClient.getMetaPath(), filename); - if (allowOverwrite || metaClient.getTimelineLayoutVersion().isNullVersion()) { - createFileInPath(fullPath, content); - } else { - createImmutableFileInPath(fullPath, content); - } + public void saveToRollbackRequested(HoodieInstant instant, Option content) { + ValidationUtils.checkArgument(instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)); + ValidationUtils.checkArgument(instant.getState().equals(State.REQUESTED)); + // Plan is stored in meta path + createFileInMetaPath(instant.getFileName(), content, false); } - private void createFileInPath(Path fullPath, Option content) { - try { - // If the path does not exist, create it first - if (!metaClient.getFs().exists(fullPath)) { - if (metaClient.getFs().createNewFile(fullPath)) { - LOG.info("Created a new file in meta path: " + fullPath); - } else { - throw new HoodieIOException("Failed to create file " + fullPath); - } - } + public void saveToRestoreRequested(HoodieInstant instant, Option content) { + ValidationUtils.checkArgument(instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)); + ValidationUtils.checkArgument(instant.getState().equals(State.REQUESTED)); + // Plan is stored in meta path + createFileInMetaPath(instant.getFileName(), content, false); + } - if (content.isPresent()) { - FSDataOutputStream fsout = metaClient.getFs().create(fullPath, true); - fsout.write(content.get()); - fsout.close(); - } - } catch (IOException e) { - throw new HoodieIOException("Failed to create file " + fullPath, e); + /** + * Transition index instant state from requested to inflight. + * + * @param requestedInstant Inflight Instant + * @return inflight instant + */ + public HoodieInstant transitionIndexRequestedToInflight(HoodieInstant requestedInstant, Option data) { + ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.INDEXING_ACTION), + String.format("%s is not equal to %s action", requestedInstant.getAction(), INDEXING_ACTION)); + ValidationUtils.checkArgument(requestedInstant.isRequested(), + String.format("Instant %s not in requested state", requestedInstant.getTimestamp())); + HoodieInstant inflightInstant = new HoodieInstant(State.INFLIGHT, INDEXING_ACTION, requestedInstant.getTimestamp()); + transitionState(requestedInstant, inflightInstant, data); + return inflightInstant; + } + + /** + * Transition index instant state from inflight to completed. 
+ * @param inflightInstant Inflight Instant + * @return completed instant + */ + public HoodieInstant transitionIndexInflightToComplete(HoodieInstant inflightInstant, Option data) { + ValidationUtils.checkArgument(inflightInstant.getAction().equals(HoodieTimeline.INDEXING_ACTION), + String.format("%s is not equal to %s action", inflightInstant.getAction(), INDEXING_ACTION)); + ValidationUtils.checkArgument(inflightInstant.isInflight(), + String.format("Instant %s not inflight", inflightInstant.getTimestamp())); + HoodieInstant commitInstant = new HoodieInstant(State.COMPLETED, INDEXING_ACTION, inflightInstant.getTimestamp()); + transitionState(inflightInstant, commitInstant, data); + return commitInstant; + } + + /** + * Revert index instant state from inflight to requested. + * @param inflightInstant Inflight Instant + * @return requested instant + */ + public HoodieInstant revertIndexInflightToRequested(HoodieInstant inflightInstant) { + ValidationUtils.checkArgument(inflightInstant.getAction().equals(HoodieTimeline.INDEXING_ACTION), + String.format("%s is not equal to %s action", inflightInstant.getAction(), INDEXING_ACTION)); + ValidationUtils.checkArgument(inflightInstant.isInflight(), + String.format("Instant %s not inflight", inflightInstant.getTimestamp())); + HoodieInstant requestedInstant = new HoodieInstant(State.REQUESTED, INDEXING_ACTION, inflightInstant.getTimestamp()); + if (metaClient.getTimelineLayoutVersion().isNullVersion()) { + transitionState(inflightInstant, requestedInstant, Option.empty()); + } else { + deleteInflight(inflightInstant); } + return requestedInstant; } /** - * Creates a new file in timeline with overwrite set to false. This ensures - * files are created only once and never rewritten - * @param fullPath File Path - * @param content Content to be stored + * Save content for inflight/requested index instant. 
*/ - private void createImmutableFileInPath(Path fullPath, Option content) { - FSDataOutputStream fsout = null; - try { - fsout = metaClient.getFs().create(fullPath, false); - if (content.isPresent()) { - fsout.write(content.get()); - } - } catch (IOException e) { - throw new HoodieIOException("Failed to create file " + fullPath, e); - } finally { - try { - if (null != fsout) { - fsout.close(); - } - } catch (IOException e) { - throw new HoodieIOException("Failed to close file " + fullPath, e); - } + public void saveToPendingIndexAction(HoodieInstant instant, Option content) { + ValidationUtils.checkArgument(instant.getAction().equals(HoodieTimeline.INDEXING_ACTION), + String.format("%s is not equal to %s action", instant.getAction(), INDEXING_ACTION)); + createFileInMetaPath(instant.getFileName(), content, false); + } + + protected void createFileInMetaPath(String filename, Option content, boolean allowOverwrite) { + Path fullPath = getInstantFileNamePath(filename); + if (allowOverwrite || metaClient.getTimelineLayoutVersion().isNullVersion()) { + FileIOUtils.createFileInPath(metaClient.getFs(), fullPath, content); + } else { + metaClient.getFs().createImmutableFileInPath(fullPath, content); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java index 9f8c4393b8df8..4df30b115e0ea 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java @@ -18,13 +18,21 @@ package org.apache.hudi.common.table.timeline; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; +import org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.generic.GenericRecord; @@ -34,20 +42,26 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import javax.annotation.Nonnull; + import java.io.IOException; import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.Spliterator; +import java.util.Spliterators; import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; -import java.util.stream.Stream; +import java.util.stream.StreamSupport; /** * Represents the Archived Timeline for the Hoodie table. 
Instants for the last 12 hours (configurable) is in the @@ -61,19 +75,21 @@ * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. */ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { + public static final String MERGE_ARCHIVE_PLAN_NAME = "mergeArchivePlan"; private static final Pattern ARCHIVE_FILE_PATTERN = - Pattern.compile("^\\.commits_\\.archive\\.([0-9]*)$"); + Pattern.compile("^\\.commits_\\.archive\\.([0-9]+).*"); private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE_PREFIX = "commits"; private static final String ACTION_TYPE_KEY = "actionType"; + private static final String ACTION_STATE = "actionState"; private HoodieTableMetaClient metaClient; - private Map readCommits = new HashMap<>(); + private final Map readCommits = new HashMap<>(); private static final Logger LOG = LogManager.getLogger(HoodieArchivedTimeline.class); /** - * Loads instants between (startTs, endTs]. - * Note that there is no lazy loading, so this may not work if really long time range (endTs-startTs) is specified. + * Loads all the archived instants. + * Note that there is no lazy loading, so this may not work if the archived timeline range is really long. * TBD: Should we enforce maximum time range? */ public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) { @@ -84,12 +100,26 @@ public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) { this.details = (Function> & Serializable) this::getInstantDetails; } + /** + * Loads completed instants from startTs(inclusive). + * Note that there is no lazy loading, so this may not work if really early startTs is specified. + */ + public HoodieArchivedTimeline(HoodieTableMetaClient metaClient, String startTs) { + this.metaClient = metaClient; + setInstants(loadInstants(new StartTsFilter(startTs), true, + record -> HoodieInstant.State.COMPLETED.toString().equals(record.get(ACTION_STATE).toString()))); + // multiple casts will make this lambda serializable - + // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 + this.details = (Function> & Serializable) this::getInstantDetails; + } + /** * For serialization and de-serialization only. * * @deprecated */ - public HoodieArchivedTimeline() {} + public HoodieArchivedTimeline() { + } /** * This method is only used when this object is deserialized in a spark executor. 
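The new `HoodieArchivedTimeline(metaClient, startTs)` constructor loads only completed archived instants whose timestamp is at or after `startTs`, and it does so eagerly. A short sketch of how a caller might use it; the meta client is assumed to be built elsewhere:

```java
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;

public class ArchivedTimelineExample {

  // metaClient is assumed to be constructed elsewhere (e.g. via HoodieTableMetaClient.builder()).
  static void printArchivedCommits(HoodieTableMetaClient metaClient, String startTs) {
    // Loads completed archived instants with timestamp >= startTs; details are read eagerly,
    // so keep the requested range reasonable.
    HoodieArchivedTimeline archived = new HoodieArchivedTimeline(metaClient, startTs);
    archived.getInstants().forEach(instant ->
        System.out.println(instant.getTimestamp() + " " + instant.getAction()));
  }
}
```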
@@ -108,6 +138,27 @@ public void loadInstantDetailsInMemory(String startTs, String endTs) { loadInstants(startTs, endTs); } + public void loadCompletedInstantDetailsInMemory() { + loadInstants(null, true, + record -> HoodieInstant.State.COMPLETED.toString().equals(record.get(ACTION_STATE).toString())); + } + + public void loadCompactionDetailsInMemory(String compactionInstantTime) { + loadCompactionDetailsInMemory(compactionInstantTime, compactionInstantTime); + } + + public void loadCompactionDetailsInMemory(String startTs, String endTs) { + // load compactionPlan + loadInstants(new TimeRangeFilter(startTs, endTs), true, record -> + record.get(ACTION_TYPE_KEY).toString().equals(HoodieTimeline.COMPACTION_ACTION) + && HoodieInstant.State.INFLIGHT.toString().equals(record.get(ACTION_STATE).toString()) + ); + } + + public void clearInstantDetailsFromMemory(String instantTime) { + this.readCommits.remove(instantTime); + } + public void clearInstantDetailsFromMemory(String startTs, String endTs) { this.findInstantsInRange(startTs, endTs).getInstants().forEach(instant -> this.readCommits.remove(instant.getTimestamp())); @@ -123,30 +174,45 @@ public HoodieArchivedTimeline reload() { } private HoodieInstant readCommit(GenericRecord record, boolean loadDetails) { - final String instantTime = record.get(HoodiePartitionMetadata.COMMIT_TIME_KEY).toString(); + final String instantTime = record.get(HoodiePartitionMetadata.COMMIT_TIME_KEY).toString(); final String action = record.get(ACTION_TYPE_KEY).toString(); if (loadDetails) { - Option.ofNullable(record.get(getMetadataKey(action))).map(actionData -> - this.readCommits.put(instantTime, actionData.toString().getBytes(StandardCharsets.UTF_8)) - ); + getMetadataKey(action).map(key -> { + Object actionData = record.get(key); + if (actionData != null) { + if (action.equals(HoodieTimeline.COMPACTION_ACTION)) { + this.readCommits.put(instantTime, HoodieAvroUtils.indexedRecordToBytes((IndexedRecord) actionData)); + } else { + this.readCommits.put(instantTime, actionData.toString().getBytes(StandardCharsets.UTF_8)); + } + } + return null; + }); } - return new HoodieInstant(false, action, instantTime); + return new HoodieInstant(HoodieInstant.State.valueOf(record.get(ACTION_STATE).toString()), action, instantTime); } - private String getMetadataKey(String action) { + @Nonnull + private Option getMetadataKey(String action) { switch (action) { case HoodieTimeline.CLEAN_ACTION: - return "hoodieCleanMetadata"; + return Option.of("hoodieCleanMetadata"); case HoodieTimeline.COMMIT_ACTION: - return "hoodieCommitMetadata"; case HoodieTimeline.DELTA_COMMIT_ACTION: - return "hoodieCommitMetadata"; + return Option.of("hoodieCommitMetadata"); case HoodieTimeline.ROLLBACK_ACTION: - return "hoodieRollbackMetadata"; + return Option.of("hoodieRollbackMetadata"); case HoodieTimeline.SAVEPOINT_ACTION: - return "hoodieSavePointMetadata"; + return Option.of("hoodieSavePointMetadata"); + case HoodieTimeline.COMPACTION_ACTION: + return Option.of("hoodieCompactionPlan"); + case HoodieTimeline.REPLACE_COMMIT_ACTION: + return Option.of("hoodieReplaceCommitMetadata"); + case HoodieTimeline.INDEXING_ACTION: + return Option.of("hoodieIndexCommitMetadata"); default: - throw new HoodieIOException("Unknown action in metadata " + action); + LOG.error(String.format("Unknown action in metadata (%s)", action)); + return Option.empty(); } } @@ -158,42 +224,48 @@ private List loadInstants(String startTs, String endTs) { return loadInstants(new TimeRangeFilter(startTs, endTs), true); } + private 
List loadInstants(TimeRangeFilter filter, boolean loadInstantDetails) { + return loadInstants(filter, loadInstantDetails, record -> true); + } + /** * This is method to read selected instants. Do NOT use this directly use one of the helper methods above * If loadInstantDetails is set to true, this would also update 'readCommits' map with commit details * If filter is specified, only the filtered instants are loaded + * If commitsFilter is specified, only the filtered records are loaded */ - private List loadInstants(TimeRangeFilter filter, boolean loadInstantDetails) { + private List loadInstants(TimeRangeFilter filter, boolean loadInstantDetails, + Function commitsFilter) { try { - // list all files + // List all files FileStatus[] fsStatuses = metaClient.getFs().globStatus( new Path(metaClient.getArchivePath() + "/.commits_.archive*")); - // sort files by version suffix in reverse (implies reverse chronological order) + // Sort files by version suffix in reverse (implies reverse chronological order) Arrays.sort(fsStatuses, new ArchiveFileVersionComparator()); - List instantsInRange = new ArrayList<>(); + Set instantsInRange = new HashSet<>(); for (FileStatus fs : fsStatuses) { - //read the archived file - HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(), - new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); - try { + // Read the archived file + try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(), + new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) { int instantsInPreviousFile = instantsInRange.size(); - //read the avro blocks + // Read the avro blocks while (reader.hasNext()) { - HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - // TODO If we can store additional metadata in datablock, we can skip parsing records - // (such as startTime, endTime of records in the block) - List records = blk.getRecords(); - // filter blocks in desired time window - Stream instantsInBlkStream = records.stream() - .map(r -> readCommit((GenericRecord) r, loadInstantDetails)); - - if (filter != null) { - instantsInBlkStream = instantsInBlkStream.filter(filter::isInRange); + HoodieLogBlock block = reader.next(); + if (block instanceof HoodieAvroDataBlock) { + HoodieAvroDataBlock avroBlock = (HoodieAvroDataBlock) block; + // TODO If we can store additional metadata in datablock, we can skip parsing records + // (such as startTime, endTime of records in the block) + try (ClosableIterator itr = avroBlock.getRecordIterator()) { + StreamSupport.stream(Spliterators.spliteratorUnknownSize(itr, Spliterator.IMMUTABLE), true) + // Filter blocks in desired time window + .filter(r -> commitsFilter.apply((GenericRecord) r)) + .map(r -> readCommit((GenericRecord) r, loadInstantDetails)) + .filter(c -> filter == null || filter.isInRange(c)) + .forEach(instantsInRange::add); + } } - - instantsInRange.addAll(instantsInBlkStream.collect(Collectors.toList())); } if (filter != null) { @@ -204,12 +276,32 @@ private List loadInstants(TimeRangeFilter filter, boolean loadIns break; } } - } finally { - reader.close(); + } catch (Exception originalException) { + // merge small archive files may left uncompleted archive file which will cause exception. + // need to ignore this kind of exception here. 
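`loadInstants` now streams records out of each Avro block through `Spliterators`/`StreamSupport` instead of materializing an intermediate list. A minimal, plain-Java sketch of that iterator-to-stream pattern (the archived-timeline code applies it to the block's record iterator and uses a parallel stream):

```java
import java.util.Arrays;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.StreamSupport;

public class IteratorToStreamExample {
  public static void main(String[] args) {
    Iterator<String> itr = Arrays.asList("commit", "deltacommit", "rollback").iterator();

    // Wrap an iterator of unknown size into a Stream, then filter and consume it lazily.
    StreamSupport.stream(Spliterators.spliteratorUnknownSize(itr, Spliterator.IMMUTABLE), false)
        .filter(action -> !"rollback".equals(action))
        .forEach(System.out::println);
  }
}
```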
+ try { + Path planPath = new Path(metaClient.getArchivePath(), MERGE_ARCHIVE_PLAN_NAME); + HoodieWrapperFileSystem fileSystem = metaClient.getFs(); + if (fileSystem.exists(planPath)) { + HoodieMergeArchiveFilePlan plan = TimelineMetadataUtils.deserializeAvroMetadata(FileIOUtils.readDataFromPath(fileSystem, planPath).get(), HoodieMergeArchiveFilePlan.class); + String mergedArchiveFileName = plan.getMergedArchiveFileName(); + if (!StringUtils.isNullOrEmpty(mergedArchiveFileName) && fs.getPath().getName().equalsIgnoreCase(mergedArchiveFileName)) { + LOG.warn("Catch exception because of reading uncompleted merging archive file " + mergedArchiveFileName + ". Ignore it here."); + continue; + } + } + throw originalException; + } catch (Exception e) { + // If anything wrong during parsing merge archive plan, we need to throw the original exception. + // For example corrupted archive file and corrupted plan are both existed. + throw originalException; + } } } - return instantsInRange; + ArrayList result = new ArrayList<>(instantsInRange); + Collections.sort(result); + return result; } catch (IOException e) { throw new HoodieIOException( "Could not load archived commit timeline from path " + metaClient.getArchivePath(), e); @@ -230,6 +322,19 @@ public boolean isInRange(HoodieInstant instant) { } } + private static class StartTsFilter extends TimeRangeFilter { + private final String startTs; + + public StartTsFilter(String startTs) { + super(startTs, null); // endTs is never used + this.startTs = startTs; + } + + public boolean isInRange(HoodieInstant instant) { + return HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN_OR_EQUALS, startTs); + } + } + /** * Sort files by reverse order of version suffix in file name. */ @@ -254,4 +359,13 @@ private int getArchivedFileSuffix(FileStatus f) { return 0; } } + + @Override + public HoodieDefaultTimeline getWriteTimeline() { + // filter in-memory instants + Set validActions = CollectionUtils.createSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, COMPACTION_ACTION, REPLACE_COMMIT_ACTION); + return new HoodieDefaultTimeline(getInstants().filter(i -> + readCommits.containsKey(i.getTimestamp())) + .filter(s -> validActions.contains(s.getAction())), details); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java index 484d91b279468..7324421894c0d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java @@ -35,6 +35,7 @@ import java.util.stream.Stream; import static java.util.Collections.reverse; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.compareTimestamps; /** * HoodieDefaultTimeline is a default implementation of the HoodieTimeline. 
It provides methods to inspect a @@ -75,7 +76,8 @@ public void setInstants(List instants) { * * @deprecated */ - public HoodieDefaultTimeline() {} + public HoodieDefaultTimeline() { + } @Override public HoodieTimeline filterInflights() { @@ -107,15 +109,25 @@ public HoodieTimeline filterCompletedAndCompactionInstants() { } @Override - public HoodieDefaultTimeline getCommitsAndCompactionTimeline() { + public HoodieDefaultTimeline getWriteTimeline() { Set validActions = CollectionUtils.createSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, COMPACTION_ACTION, REPLACE_COMMIT_ACTION); return new HoodieDefaultTimeline(instants.stream().filter(s -> validActions.contains(s.getAction())), details); } + @Override + public HoodieTimeline getContiguousCompletedWriteTimeline() { + Option earliestPending = getWriteTimeline().filterInflightsAndRequested().firstInstant(); + if (earliestPending.isPresent()) { + return getWriteTimeline().filterCompletedInstants() + .filter(instant -> compareTimestamps(instant.getTimestamp(), LESSER_THAN, earliestPending.get().getTimestamp())); + } + return getWriteTimeline().filterCompletedInstants(); + } + @Override public HoodieTimeline getCompletedReplaceTimeline() { return new HoodieDefaultTimeline( - instants.stream().filter(s -> s.getAction().equals(REPLACE_COMMIT_ACTION)).filter(s -> s.isCompleted()), details); + instants.stream().filter(s -> s.getAction().equals(REPLACE_COMMIT_ACTION)).filter(HoodieInstant::isCompleted), details); } @Override @@ -124,10 +136,16 @@ public HoodieTimeline filterPendingReplaceTimeline() { s -> s.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION) && !s.isCompleted()), details); } + @Override + public HoodieTimeline filterPendingRollbackTimeline() { + return new HoodieDefaultTimeline(instants.stream().filter( + s -> s.getAction().equals(HoodieTimeline.ROLLBACK_ACTION) && !s.isCompleted()), details); + } + @Override public HoodieTimeline filterPendingCompactionTimeline() { return new HoodieDefaultTimeline( - instants.stream().filter(s -> s.getAction().equals(HoodieTimeline.COMPACTION_ACTION)), details); + instants.stream().filter(s -> s.getAction().equals(HoodieTimeline.COMPACTION_ACTION) && !s.isCompleted()), details); } @Override @@ -139,29 +157,52 @@ public HoodieDefaultTimeline findInstantsInRange(String startTs, String endTs) { @Override public HoodieDefaultTimeline findInstantsAfter(String instantTime, int numCommits) { return new HoodieDefaultTimeline(instants.stream() - .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN, instantTime)).limit(numCommits), + .filter(s -> compareTimestamps(s.getTimestamp(), GREATER_THAN, instantTime)).limit(numCommits), details); } + @Override + public HoodieTimeline findInstantsAfter(String instantTime) { + return new HoodieDefaultTimeline(instants.stream() + .filter(s -> compareTimestamps(s.getTimestamp(), GREATER_THAN, instantTime)), details); + } + @Override public HoodieDefaultTimeline findInstantsAfterOrEquals(String commitTime, int numCommits) { return new HoodieDefaultTimeline(instants.stream() - .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, commitTime)) + .filter(s -> compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, commitTime)) .limit(numCommits), details); } @Override public HoodieDefaultTimeline findInstantsBefore(String instantTime) { return new HoodieDefaultTimeline(instants.stream() - .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), LESSER_THAN, instantTime)), + .filter(s -> 
compareTimestamps(s.getTimestamp(), LESSER_THAN, instantTime)), details); } + @Override + public HoodieDefaultTimeline findInstantsBeforeOrEquals(String instantTime) { + return new HoodieDefaultTimeline(instants.stream() + .filter(s -> compareTimestamps(s.getTimestamp(), LESSER_THAN_OR_EQUALS, instantTime)), + details); + } + @Override public HoodieTimeline filter(Predicate filter) { return new HoodieDefaultTimeline(instants.stream().filter(filter), details); } + @Override + public HoodieTimeline filterPendingIndexTimeline() { + return new HoodieDefaultTimeline(instants.stream().filter(s -> s.getAction().equals(INDEXING_ACTION) && !s.isCompleted()), details); + } + + @Override + public HoodieTimeline filterCompletedIndexTimeline() { + return new HoodieDefaultTimeline(instants.stream().filter(s -> s.getAction().equals(INDEXING_ACTION) && s.isCompleted()), details); + } + /** * Get all instants (commits, delta commits) that produce new data, in the active timeline. */ @@ -170,12 +211,12 @@ public HoodieTimeline getCommitsTimeline() { } /** - * Get all instants (commits, delta commits, compaction, clean, savepoint, rollback) that result in actions, + * Get all instants (commits, delta commits, compaction, clean, savepoint, rollback, replace commits, index) that result in actions, * in the active timeline. */ public HoodieTimeline getAllCommitsTimeline() { return getTimelineOfActions(CollectionUtils.createSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, - CLEAN_ACTION, COMPACTION_ACTION, SAVEPOINT_ACTION, ROLLBACK_ACTION, REPLACE_COMMIT_ACTION)); + CLEAN_ACTION, COMPACTION_ACTION, SAVEPOINT_ACTION, ROLLBACK_ACTION, REPLACE_COMMIT_ACTION, INDEXING_ACTION)); } /** @@ -216,7 +257,14 @@ public HoodieTimeline getCleanerTimeline() { */ public HoodieTimeline getRollbackTimeline() { return new HoodieDefaultTimeline(filterInstantsByAction(ROLLBACK_ACTION), - (Function> & Serializable) this::getInstantDetails); + (Function> & Serializable) this::getInstantDetails); + } + + /** + * Get only the rollback and restore action (inflight and completed) in the active timeline. 
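filterPendingIndexTimeline and filterCompletedIndexTimeline above partition indexing instants purely by action name and completion state. The same split over a toy instant list, as a self-contained illustration (the Instant record is a stand-in for HoodieInstant, not the real class):

import java.util.List;
import java.util.stream.Collectors;

public class IndexTimelineFilterSketch {

  // Minimal stand-in for HoodieInstant: action + completion flag + timestamp.
  record Instant(String action, boolean completed, String timestamp) {}

  public static void main(String[] args) {
    List<Instant> instants = List.of(
        new Instant("commit", true, "001"),
        new Instant("indexing", false, "002"),   // requested/inflight index instant
        new Instant("indexing", true, "003"));   // completed index instant

    List<Instant> pendingIndex = instants.stream()
        .filter(i -> i.action().equals("indexing") && !i.completed())
        .collect(Collectors.toList());
    List<Instant> completedIndex = instants.stream()
        .filter(i -> i.action().equals("indexing") && i.completed())
        .collect(Collectors.toList());

    System.out.println("pending=" + pendingIndex + " completed=" + completedIndex);
  }
}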
+ */ + public HoodieTimeline getRollbackAndRestoreTimeline() { + return getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION, RESTORE_ACTION)); } /** @@ -254,6 +302,12 @@ public Option firstInstant() { return Option.fromJavaOptional(instants.stream().findFirst()); } + @Override + public Option firstInstant(String action, State state) { + return Option.fromJavaOptional(instants.stream() + .filter(s -> action.equals(s.getAction()) && state.equals(s.getState())).findFirst()); + } + @Override public Option nthInstant(int n) { if (empty() || n >= countInstants()) { @@ -280,6 +334,11 @@ public boolean containsInstant(HoodieInstant instant) { return instants.stream().anyMatch(s -> s.equals(instant)); } + @Override + public boolean containsInstant(String ts) { + return instants.stream().anyMatch(s -> s.getTimestamp().equals(ts)); + } + @Override public boolean containsOrBeforeTimelineStarts(String instant) { return instants.stream().anyMatch(s -> s.getTimestamp().equals(instant)) || isBeforeTimelineStarts(instant); @@ -304,18 +363,55 @@ public Stream getReverseOrderedInstants() { @Override public boolean isBeforeTimelineStarts(String instant) { - Option firstCommit = firstInstant(); - return firstCommit.isPresent() - && HoodieTimeline.compareTimestamps(instant, LESSER_THAN, firstCommit.get().getTimestamp()); + Option firstNonSavepointCommit = getFirstNonSavepointCommit(); + return firstNonSavepointCommit.isPresent() + && compareTimestamps(instant, LESSER_THAN, firstNonSavepointCommit.get().getTimestamp()); } + public Option getFirstNonSavepointCommit() { + Option firstCommit = firstInstant(); + Set savepointTimestamps = instants.stream() + .filter(entry -> entry.getAction().equals(HoodieTimeline.SAVEPOINT_ACTION)) + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toSet()); + Option firstNonSavepointCommit = firstCommit; + if (!savepointTimestamps.isEmpty()) { + // There are chances that there could be holes in the timeline due to archival and savepoint interplay. + // So, the first non-savepoint commit is considered as beginning of the active timeline. + firstNonSavepointCommit = Option.fromJavaOptional(instants.stream() + .filter(entry -> !savepointTimestamps.contains(entry.getTimestamp())) + .findFirst()); + } + return firstNonSavepointCommit; + } + @Override public Option getInstantDetails(HoodieInstant instant) { return details.apply(instant); } + @Override + public boolean isEmpty(HoodieInstant instant) { + return getInstantDetails(instant).get().length == 0; + } + @Override public String toString() { return this.getClass().getName() + ": " + instants.stream().map(Object::toString).collect(Collectors.joining(",")); } + + /** + * Merge this timeline with the given timeline. 
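getFirstNonSavepointCommit above exists because archival may remove commits older than a savepoint, leaving holes: the earliest instants left in the timeline can be savepointed commits that no longer mark the true start of the contiguous active timeline. A hedged, self-contained sketch of the same selection over plain timestamp strings (the sample timeline mirrors the second example in the HoodieTimeline Javadoc further down; the Instant record is illustrative):

import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

public class FirstNonSavepointCommitSketch {

  record Instant(String timestamp, String action) {}

  public static void main(String[] args) {
    // Active timeline after archival: C3/C5 are savepointed commits, C6/C7 are newer commits.
    List<Instant> timeline = List.of(
        new Instant("C3", "commit"), new Instant("C3", "savepoint"),
        new Instant("C5", "commit"), new Instant("C5", "savepoint"),
        new Instant("C6", "commit"), new Instant("C7", "commit"));

    Set<String> savepointed = timeline.stream()
        .filter(i -> i.action().equals("savepoint"))
        .map(Instant::timestamp)
        .collect(Collectors.toSet());

    // The first instant whose timestamp is not savepointed marks the start of the
    // contiguous active timeline; here that is C6, matching the Javadoc example.
    Optional<Instant> firstNonSavepoint = timeline.stream()
        .filter(i -> !savepointed.contains(i.timestamp()))
        .findFirst();

    System.out.println(firstNonSavepoint);
  }
}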
+ */ + public HoodieDefaultTimeline mergeTimeline(HoodieDefaultTimeline timeline) { + Stream instantStream = Stream.concat(instants.stream(), timeline.getInstants()).sorted(); + Function> details = instant -> { + if (instants.stream().anyMatch(i -> i.equals(instant))) { + return this.getInstantDetails(instant); + } else { + return timeline.getInstantDetails(instant); + } + }; + return new HoodieDefaultTimeline(instantStream, details); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java index 65376b48e07c5..bd29e2d6a2f94 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java @@ -68,7 +68,7 @@ public enum State { // Committed instant COMPLETED, // Invalid instant - INVALID + NIL } private State state = State.COMPLETED; @@ -147,7 +147,8 @@ public String getFileName() { : HoodieTimeline.makeCleanerFileName(timestamp); } else if (HoodieTimeline.ROLLBACK_ACTION.equals(action)) { return isInflight() ? HoodieTimeline.makeInflightRollbackFileName(timestamp) - : HoodieTimeline.makeRollbackFileName(timestamp); + : isRequested() ? HoodieTimeline.makeRequestedRollbackFileName(timestamp) + : HoodieTimeline.makeRollbackFileName(timestamp); } else if (HoodieTimeline.SAVEPOINT_ACTION.equals(action)) { return isInflight() ? HoodieTimeline.makeInflightSavePointFileName(timestamp) : HoodieTimeline.makeSavePointFileName(timestamp); @@ -165,11 +166,20 @@ public String getFileName() { } } else if (HoodieTimeline.RESTORE_ACTION.equals(action)) { return isInflight() ? HoodieTimeline.makeInflightRestoreFileName(timestamp) + : isRequested() ? HoodieTimeline.makeRequestedRestoreFileName(timestamp) : HoodieTimeline.makeRestoreFileName(timestamp); } else if (HoodieTimeline.REPLACE_COMMIT_ACTION.equals(action)) { return isInflight() ? HoodieTimeline.makeInflightReplaceFileName(timestamp) : isRequested() ? HoodieTimeline.makeRequestedReplaceFileName(timestamp) : HoodieTimeline.makeReplaceFileName(timestamp); + } else if (HoodieTimeline.INDEXING_ACTION.equals(action)) { + return isInflight() ? HoodieTimeline.makeInflightIndexFileName(timestamp) + : isRequested() ? HoodieTimeline.makeRequestedIndexFileName(timestamp) + : HoodieTimeline.makeIndexCommitFileName(timestamp); + } else if (HoodieTimeline.SCHEMA_COMMIT_ACTION.equals(action)) { + return isInflight() ? HoodieTimeline.makeInflightSchemaFileName(timestamp) + : isRequested() ? HoodieTimeline.makeRequestSchemaFileName(timestamp) + : HoodieTimeline.makeSchemaFileName(timestamp); } throw new IllegalArgumentException("Cannot get file name for unknown action " + action); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java new file mode 100644 index 0000000000000..f2d2d7e29dcb1 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.timeline; + +import org.apache.hudi.common.model.HoodieTimelineTimeZone; +import java.text.ParseException; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.temporal.ChronoField; +import java.time.temporal.TemporalAccessor; +import java.util.Date; +import java.util.concurrent.atomic.AtomicReference; + +/** + * Utility class to generate and parse timestamps used in Instants. + */ +public class HoodieInstantTimeGenerator { + // Format of the timestamp used for an Instant + public static final String SECS_INSTANT_TIMESTAMP_FORMAT = "yyyyMMddHHmmss"; + public static final int SECS_INSTANT_ID_LENGTH = SECS_INSTANT_TIMESTAMP_FORMAT.length(); + public static final String MILLIS_INSTANT_TIMESTAMP_FORMAT = "yyyyMMddHHmmssSSS"; + public static final int MILLIS_INSTANT_ID_LENGTH = MILLIS_INSTANT_TIMESTAMP_FORMAT.length(); + public static final int MILLIS_INSTANT_TIMESTAMP_FORMAT_LENGTH = MILLIS_INSTANT_TIMESTAMP_FORMAT.length(); + // Formatter to generate Instant timestamps + // Unfortunately millisecond format is not parsable as is https://bugs.openjdk.java.net/browse/JDK-8031085. hence have to do appendValue() + private static DateTimeFormatter MILLIS_INSTANT_TIME_FORMATTER = new DateTimeFormatterBuilder().appendPattern(SECS_INSTANT_TIMESTAMP_FORMAT) + .appendValue(ChronoField.MILLI_OF_SECOND, 3).toFormatter(); + private static final String MILLIS_GRANULARITY_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss.SSS"; + private static DateTimeFormatter MILLIS_GRANULARITY_DATE_FORMATTER = DateTimeFormatter.ofPattern(MILLIS_GRANULARITY_DATE_FORMAT); + + // The last Instant timestamp generated + private static AtomicReference lastInstantTime = new AtomicReference<>(String.valueOf(Integer.MIN_VALUE)); + + // The default number of milliseconds that we add if they are not present + // We prefer the max timestamp as it mimics the current behavior with second granularity + // when performing comparisons such as LESS_THAN_OR_EQUAL_TO + private static final String DEFAULT_MILLIS_EXT = "999"; + + private static HoodieTimelineTimeZone commitTimeZone = HoodieTimelineTimeZone.LOCAL; + + /** + * Returns next instant time that adds N milliseconds to the current time. 
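The formatter above works around JDK-8031085, where a single "yyyyMMddHHmmssSSS" pattern cannot be parsed, by appending MILLI_OF_SECOND as an explicit three-digit value after the seconds-granularity pattern. A short standalone demonstration of that builder trick using only java.time (the timestamp values are made up):

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.temporal.ChronoField;

public class MillisInstantFormatDemo {

  public static void main(String[] args) {
    // Seconds-granularity pattern plus an explicit 3-digit millisecond field,
    // which both formats and parses millisecond instant times correctly.
    DateTimeFormatter millisFormatter = new DateTimeFormatterBuilder()
        .appendPattern("yyyyMMddHHmmss")
        .appendValue(ChronoField.MILLI_OF_SECOND, 3)
        .toFormatter();

    String instant = LocalDateTime.of(2022, 3, 1, 12, 30, 45, 123_000_000).format(millisFormatter);
    System.out.println(instant);                                       // 20220301123045123
    System.out.println(LocalDateTime.parse(instant, millisFormatter)); // 2022-03-01T12:30:45.123
  }
}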
+ * Ensures each instant time is atleast 1 second apart since we create instant times at second granularity + * + * @param milliseconds Milliseconds to add to current time while generating the new instant time + */ + public static String createNewInstantTime(long milliseconds) { + return lastInstantTime.updateAndGet((oldVal) -> { + String newCommitTime; + do { + if (commitTimeZone.equals(HoodieTimelineTimeZone.UTC)) { + LocalDateTime now = LocalDateTime.now(ZoneOffset.UTC); + newCommitTime = now.format(MILLIS_INSTANT_TIME_FORMATTER); + } else { + Date d = new Date(System.currentTimeMillis() + milliseconds); + newCommitTime = MILLIS_INSTANT_TIME_FORMATTER.format(convertDateToTemporalAccessor(d)); + } + } while (HoodieTimeline.compareTimestamps(newCommitTime, HoodieActiveTimeline.LESSER_THAN_OR_EQUALS, oldVal)); + return newCommitTime; + }); + } + + public static Date parseDateFromInstantTime(String timestamp) throws ParseException { + try { + // Enables backwards compatibility with non-millisecond granularity instants + String timestampInMillis = timestamp; + if (isSecondGranularity(timestamp)) { + // Add milliseconds to the instant in order to parse successfully + timestampInMillis = timestamp + DEFAULT_MILLIS_EXT; + } else if (timestamp.length() > MILLIS_INSTANT_TIMESTAMP_FORMAT_LENGTH) { + // compaction and cleaning in metadata has special format. handling it by trimming extra chars and treating it with ms granularity + timestampInMillis = timestamp.substring(0, MILLIS_INSTANT_TIMESTAMP_FORMAT_LENGTH); + } + + LocalDateTime dt = LocalDateTime.parse(timestampInMillis, MILLIS_INSTANT_TIME_FORMATTER); + return Date.from(dt.atZone(ZoneId.systemDefault()).toInstant()); + } catch (DateTimeParseException e) { + throw new ParseException(e.getMessage(), e.getErrorIndex()); + } + } + + private static boolean isSecondGranularity(String instant) { + return instant.length() == SECS_INSTANT_ID_LENGTH; + } + + public static String formatDate(Date timestamp) { + return getInstantFromTemporalAccessor(convertDateToTemporalAccessor(timestamp)); + } + + public static String getInstantFromTemporalAccessor(TemporalAccessor temporalAccessor) { + return MILLIS_INSTANT_TIME_FORMATTER.format(temporalAccessor); + } + + /** + * Creates an instant string given a valid date-time string. 
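createNewInstantTime above keeps instant times strictly increasing across threads by retrying inside AtomicReference.updateAndGet until the freshly formatted timestamp compares greater than the last one handed out. A stripped-down sketch of that monotonic-generator pattern (lexicographic String comparison stands in for HoodieTimeline.compareTimestamps; the seed value is arbitrary):

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.temporal.ChronoField;
import java.util.concurrent.atomic.AtomicReference;

public class MonotonicInstantGenerator {

  private static final DateTimeFormatter FORMATTER = new DateTimeFormatterBuilder()
      .appendPattern("yyyyMMddHHmmss")
      .appendValue(ChronoField.MILLI_OF_SECOND, 3)
      .toFormatter();

  // Last timestamp handed out; every new value must compare strictly greater.
  // "0" sorts before any real fixed-width timestamp.
  private static final AtomicReference<String> LAST = new AtomicReference<>("0");

  public static String newInstantTime() {
    return LAST.updateAndGet(previous -> {
      String candidate;
      do {
        candidate = LocalDateTime.now().format(FORMATTER);
      } while (candidate.compareTo(previous) <= 0); // spin until the clock moves past the last value
      return candidate;
    });
  }

  public static void main(String[] args) {
    System.out.println(newInstantTime());
    System.out.println(newInstantTime()); // strictly greater than the first
  }
}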
+ * @param dateString A date-time string in the format yyyy-MM-dd HH:mm:ss[:SSS] + * @return A timeline instant + * @throws ParseException If we cannot parse the date string + */ + public static String getInstantForDateString(String dateString) { + try { + return getInstantFromTemporalAccessor(LocalDateTime.parse(dateString, MILLIS_GRANULARITY_DATE_FORMATTER)); + } catch (Exception e) { + // Attempt to add the milliseconds in order to complete parsing + return getInstantFromTemporalAccessor(LocalDateTime.parse( + String.format("%s:%s", dateString, DEFAULT_MILLIS_EXT), MILLIS_GRANULARITY_DATE_FORMATTER)); + } + } + + private static TemporalAccessor convertDateToTemporalAccessor(Date d) { + return d.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime(); + } + + public static void setCommitTimeZone(HoodieTimelineTimeZone commitTimeZone) { + HoodieInstantTimeGenerator.commitTimeZone = commitTimeZone; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java index ff251e3144487..e52a2795969ab 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java @@ -55,10 +55,13 @@ public interface HoodieTimeline extends Serializable { String COMPACTION_ACTION = "compaction"; String REQUESTED_EXTENSION = ".requested"; String RESTORE_ACTION = "restore"; + String INDEXING_ACTION = "indexing"; + // only for schema save + String SCHEMA_COMMIT_ACTION = "schemacommit"; String[] VALID_ACTIONS_IN_TIMELINE = {COMMIT_ACTION, DELTA_COMMIT_ACTION, CLEAN_ACTION, SAVEPOINT_ACTION, RESTORE_ACTION, ROLLBACK_ACTION, - COMPACTION_ACTION, REPLACE_COMMIT_ACTION}; + COMPACTION_ACTION, REPLACE_COMMIT_ACTION, INDEXING_ACTION}; String COMMIT_EXTENSION = "." + COMMIT_ACTION; String DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION; @@ -73,15 +76,23 @@ public interface HoodieTimeline extends Serializable { String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION; String REQUESTED_CLEAN_EXTENSION = "." + CLEAN_ACTION + REQUESTED_EXTENSION; String INFLIGHT_ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION + INFLIGHT_EXTENSION; + String REQUESTED_ROLLBACK_EXTENSION = "." + ROLLBACK_ACTION + REQUESTED_EXTENSION; String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION; String REQUESTED_COMPACTION_SUFFIX = StringUtils.join(COMPACTION_ACTION, REQUESTED_EXTENSION); String REQUESTED_COMPACTION_EXTENSION = StringUtils.join(".", REQUESTED_COMPACTION_SUFFIX); String INFLIGHT_COMPACTION_EXTENSION = StringUtils.join(".", COMPACTION_ACTION, INFLIGHT_EXTENSION); + String REQUESTED_RESTORE_EXTENSION = "." + RESTORE_ACTION + REQUESTED_EXTENSION; String INFLIGHT_RESTORE_EXTENSION = "." + RESTORE_ACTION + INFLIGHT_EXTENSION; String RESTORE_EXTENSION = "." + RESTORE_ACTION; String INFLIGHT_REPLACE_COMMIT_EXTENSION = "." + REPLACE_COMMIT_ACTION + INFLIGHT_EXTENSION; String REQUESTED_REPLACE_COMMIT_EXTENSION = "." + REPLACE_COMMIT_ACTION + REQUESTED_EXTENSION; String REPLACE_COMMIT_EXTENSION = "." + REPLACE_COMMIT_ACTION; + String INFLIGHT_INDEX_COMMIT_EXTENSION = "." + INDEXING_ACTION + INFLIGHT_EXTENSION; + String REQUESTED_INDEX_COMMIT_EXTENSION = "." + INDEXING_ACTION + REQUESTED_EXTENSION; + String INDEX_COMMIT_EXTENSION = "." + INDEXING_ACTION; + String SAVE_SCHEMA_ACTION_EXTENSION = "." 
+ SCHEMA_COMMIT_ACTION; + String INFLIGHT_SAVE_SCHEMA_ACTION_EXTENSION = "." + SCHEMA_COMMIT_ACTION + INFLIGHT_EXTENSION; + String REQUESTED_SAVE_SCHEMA_ACTION_EXTENSION = "." + SCHEMA_COMMIT_ACTION + REQUESTED_EXTENSION; String INVALID_INSTANT_TS = "0"; @@ -109,7 +120,7 @@ public interface HoodieTimeline extends Serializable { /** * Filter this timeline to just include the in-flights excluding compaction instants. * - * @return New instance of HoodieTimeline with just in-flights excluding compaction inflights + * @return New instance of HoodieTimeline with just in-flights excluding compaction instants */ HoodieTimeline filterPendingExcludingCompaction(); @@ -131,11 +142,20 @@ public interface HoodieTimeline extends Serializable { HoodieTimeline filterCompletedAndCompactionInstants(); /** - * Timeline to just include commits (commit/deltacommit) and compaction actions. + * Timeline to just include commits (commit/deltacommit), compaction and replace actions. * * @return */ - HoodieTimeline getCommitsAndCompactionTimeline(); + HoodieTimeline getWriteTimeline(); + + /** + * Timeline to just include commits (commit/deltacommit), compaction and replace actions that are completed and contiguous. + * For example, if timeline is [C0.completed, C1.completed, C2.completed, C3.inflight, C4.completed]. + * Then, a timeline of [C0.completed, C1.completed, C2.completed] will be returned. + * + * @return + */ + HoodieTimeline getContiguousCompletedWriteTimeline(); /** * Timeline to just include replace instants that have valid (commit/deltacommit) actions. @@ -156,6 +176,10 @@ public interface HoodieTimeline extends Serializable { */ HoodieTimeline filterPendingReplaceTimeline(); + /** + * Filter this timeline to include pending rollbacks. + */ + HoodieTimeline filterPendingRollbackTimeline(); /** * Create a new Timeline with all the instants after startTs. @@ -172,16 +196,36 @@ public interface HoodieTimeline extends Serializable { */ HoodieTimeline findInstantsAfter(String instantTime, int numCommits); + /** + * Create a new Timeline with all the instants after startTs. + */ + HoodieTimeline findInstantsAfter(String instantTime); + /** * Create a new Timeline with all instants before specified time. */ HoodieTimeline findInstantsBefore(String instantTime); + /** + * Create new timeline with all instants before or equals specified time. + */ + HoodieTimeline findInstantsBeforeOrEquals(String instantTime); + /** * Custom Filter of Instants. */ HoodieTimeline filter(Predicate filter); + /** + * Filter this timeline to just include requested and inflight index instants. + */ + HoodieTimeline filterPendingIndexTimeline(); + + /** + * Filter this timeline to just include completed index instants. + */ + HoodieTimeline filterCompletedIndexTimeline(); + /** * If the timeline has any instants. * @@ -199,6 +243,13 @@ public interface HoodieTimeline extends Serializable { */ Option firstInstant(); + /** + * @param action Instant action String. + * @param state Instant State. 
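The getContiguousCompletedWriteTimeline contract described above ([C0.completed, C1.completed, C2.completed, C3.inflight, C4.completed] yields [C0, C1, C2]) amounts to: keep completed write instants strictly before the earliest pending one. A compact self-contained rendering of that rule, assuming the timeline list is already ordered by timestamp (the Instant record is illustrative, not the Hudi class):

import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

public class ContiguousCompletedTimelineSketch {

  record Instant(String timestamp, boolean completed) {}

  static List<Instant> contiguousCompleted(List<Instant> writeTimeline) {
    // Earliest instant that is still requested/inflight, if any.
    Optional<String> earliestPending = writeTimeline.stream()
        .filter(i -> !i.completed())
        .map(Instant::timestamp)
        .findFirst();

    return writeTimeline.stream()
        .filter(Instant::completed)
        // Only completed instants strictly before the earliest pending one survive.
        .filter(i -> earliestPending.map(p -> i.timestamp().compareTo(p) < 0).orElse(true))
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<Instant> timeline = List.of(
        new Instant("C0", true), new Instant("C1", true), new Instant("C2", true),
        new Instant("C3", false), new Instant("C4", true));
    System.out.println(contiguousCompleted(timeline)); // keeps C0, C1, C2
  }
}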
+ * @return first instant of a specific action and state if available + */ + Option firstInstant(String action, State state); + /** * @return nth completed instant from the first completed instant */ @@ -227,6 +278,11 @@ public interface HoodieTimeline extends Serializable { */ boolean containsInstant(HoodieInstant instant); + /** + * @return true if the passed instant is present as a completed instant on the timeline + */ + boolean containsInstant(String ts); + /** * @return true if the passed instant is present as a completed instant on the timeline or if the instant is before * the first completed instant in the timeline @@ -249,11 +305,22 @@ public interface HoodieTimeline extends Serializable { */ boolean isBeforeTimelineStarts(String ts); + /** + * First non-savepoint commit in the active data timeline. Examples: + * 1. An active data timeline C1, C2, C3, C4, C5 returns C1. + * 2. If archival is allowed beyond savepoint and let's say C1, C2, C4 have been archived + * while C3, C5 have been savepointed, then for the data timeline + * C3, C3_Savepoint, C5, C5_Savepoint, C6, C7 returns C6. + */ + Option getFirstNonSavepointCommit(); + /** * Read the completed instant details. */ Option getInstantDetails(HoodieInstant instant); + boolean isEmpty(HoodieInstant instant); + /** * Helper methods to compare instants. **/ @@ -299,6 +366,26 @@ static HoodieInstant getCompactionInflightInstant(final String timestamp) { return new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION, timestamp); } + static HoodieInstant getReplaceCommitRequestedInstant(final String timestamp) { + return new HoodieInstant(State.REQUESTED, REPLACE_COMMIT_ACTION, timestamp); + } + + static HoodieInstant getReplaceCommitInflightInstant(final String timestamp) { + return new HoodieInstant(State.INFLIGHT, REPLACE_COMMIT_ACTION, timestamp); + } + + static HoodieInstant getRollbackRequestedInstant(HoodieInstant instant) { + return instant.isRequested() ? instant : HoodieTimeline.getRequestedInstant(instant); + } + + static HoodieInstant getIndexRequestedInstant(final String timestamp) { + return new HoodieInstant(State.REQUESTED, INDEXING_ACTION, timestamp); + } + + static HoodieInstant getIndexInflightInstant(final String timestamp) { + return new HoodieInstant(State.INFLIGHT, INDEXING_ACTION, timestamp); + } + /** * Returns the inflight instant corresponding to the instant being passed. Takes care of changes in action names * between inflight and completed instants (compaction <=> commit). 
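The factory helpers above pair with the new meta-file extensions: judging from the constants added in this interface, an indexing instant would move through `<ts>.indexing.requested`, `<ts>.indexing.inflight` and `<ts>.indexing` files on the timeline. A tiny illustration of that naming scheme (file names are examples only, derived from those constants rather than taken from the codebase):

public class IndexInstantFileNames {

  // Mirrors INDEXING_ACTION plus the requested/inflight extensions defined above.
  static String requested(String ts) { return ts + ".indexing.requested"; }
  static String inflight(String ts)  { return ts + ".indexing.inflight"; }
  static String completed(String ts) { return ts + ".indexing"; }

  public static void main(String[] args) {
    String ts = "20220301123045123"; // millisecond-granularity instant time
    System.out.println(requested(ts)); // 20220301123045123.indexing.requested
    System.out.println(inflight(ts));  // 20220301123045123.indexing.inflight
    System.out.println(completed(ts)); // 20220301123045123.indexing
  }
}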
@@ -341,6 +428,14 @@ static String makeRollbackFileName(String instant) { return StringUtils.join(instant, HoodieTimeline.ROLLBACK_EXTENSION); } + static String makeRequestedRollbackFileName(String instant) { + return StringUtils.join(instant, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION); + } + + static String makeRequestedRestoreFileName(String instant) { + return StringUtils.join(instant, HoodieTimeline.REQUESTED_RESTORE_EXTENSION); + } + static String makeInflightRollbackFileName(String instant) { return StringUtils.join(instant, HoodieTimeline.INFLIGHT_ROLLBACK_EXTENSION); } @@ -404,4 +499,28 @@ static String makeFileNameAsComplete(String fileName) { static String makeFileNameAsInflight(String fileName) { return StringUtils.join(fileName, HoodieTimeline.INFLIGHT_EXTENSION); } + + static String makeIndexCommitFileName(String instant) { + return StringUtils.join(instant, HoodieTimeline.INDEX_COMMIT_EXTENSION); + } + + static String makeInflightIndexFileName(String instant) { + return StringUtils.join(instant, HoodieTimeline.INFLIGHT_INDEX_COMMIT_EXTENSION); + } + + static String makeRequestedIndexFileName(String instant) { + return StringUtils.join(instant, HoodieTimeline.REQUESTED_INDEX_COMMIT_EXTENSION); + } + + static String makeSchemaFileName(String instantTime) { + return StringUtils.join(instantTime, HoodieTimeline.SAVE_SCHEMA_ACTION_EXTENSION); + } + + static String makeInflightSchemaFileName(String instantTime) { + return StringUtils.join(instantTime, HoodieTimeline.INFLIGHT_SAVE_SCHEMA_ACTION_EXTENSION); + } + + static String makeRequestSchemaFileName(String instantTime) { + return StringUtils.join(instantTime, HoodieTimeline.REQUESTED_SAVE_SCHEMA_ACTION_EXTENSION); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java index 640d4894feb69..b50846b8780bf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java @@ -18,29 +18,39 @@ package org.apache.hudi.common.table.timeline; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.file.FileReader; -import org.apache.avro.file.SeekableByteArrayInput; -import org.apache.avro.io.DatumReader; -import org.apache.avro.io.DatumWriter; -import org.apache.avro.specific.SpecificDatumReader; -import org.apache.avro.specific.SpecificDatumWriter; -import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieIndexCommitMetadata; +import org.apache.hudi.avro.model.HoodieIndexPlan; import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieReplaceCommitMetadata; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import 
org.apache.hudi.avro.model.HoodieSavepointPartitionMetadata; import org.apache.hudi.common.HoodieRollbackStat; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.file.FileReader; +import org.apache.avro.file.SeekableByteArrayInput; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DatumWriter; +import org.apache.avro.specific.SpecificData; +import org.apache.avro.specific.SpecificDatumReader; +import org.apache.avro.specific.SpecificDatumWriter; +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.fs.FileStatus; + import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Collections; @@ -68,8 +78,10 @@ public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbac Map partitionMetadataBuilder = new HashMap<>(); int totalDeleted = 0; for (HoodieRollbackStat stat : rollbackStats) { + Map rollbackLogFiles = stat.getCommandBlocksCount().keySet().stream() + .collect(Collectors.toMap(f -> f.getPath().toString(), FileStatus::getLen)); HoodieRollbackPartitionMetadata metadata = new HoodieRollbackPartitionMetadata(stat.getPartitionPath(), - stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles()); + stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles(), rollbackLogFiles); partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); totalDeleted += stat.getSuccessDeleteFiles().size(); } @@ -99,6 +111,14 @@ public static Option serializeCleanerPlan(HoodieCleanerPlan cleanPlan) t return serializeAvroMetadata(cleanPlan, HoodieCleanerPlan.class); } + public static Option serializeRollbackPlan(HoodieRollbackPlan rollbackPlan) throws IOException { + return serializeAvroMetadata(rollbackPlan, HoodieRollbackPlan.class); + } + + public static Option serializeRestorePlan(HoodieRestorePlan restorePlan) throws IOException { + return serializeAvroMetadata(restorePlan, HoodieRestorePlan.class); + } + public static Option serializeCleanMetadata(HoodieCleanMetadata metadata) throws IOException { return serializeAvroMetadata(metadata, HoodieCleanMetadata.class); } @@ -119,6 +139,14 @@ public static Option serializeRequestedReplaceMetadata(HoodieRequestedRe return serializeAvroMetadata(clusteringPlan, HoodieRequestedReplaceMetadata.class); } + public static Option serializeIndexPlan(HoodieIndexPlan indexPlan) throws IOException { + return serializeAvroMetadata(indexPlan, HoodieIndexPlan.class); + } + + public static Option serializeIndexCommitMetadata(HoodieIndexCommitMetadata indexCommitMetadata) throws IOException { + return serializeAvroMetadata(indexCommitMetadata, HoodieIndexCommitMetadata.class); + } + public static Option serializeAvroMetadata(T metadata, Class clazz) throws IOException { DatumWriter datumWriter = new SpecificDatumWriter<>(clazz); @@ -146,14 +174,30 @@ public static HoodieRollbackMetadata deserializeHoodieRollbackMetadata(byte[] by return deserializeAvroMetadata(bytes, HoodieRollbackMetadata.class); } + public static HoodieRestoreMetadata deserializeHoodieRestoreMetadata(byte[] bytes) throws IOException { + return deserializeAvroMetadata(bytes, HoodieRestoreMetadata.class); + } + public static HoodieSavepointMetadata deserializeHoodieSavepointMetadata(byte[] bytes) throws IOException { return deserializeAvroMetadata(bytes, HoodieSavepointMetadata.class); } - public static HoodieRequestedReplaceMetadata 
deserializeRequestedReplaceMetadta(byte[] bytes) throws IOException { + public static HoodieRequestedReplaceMetadata deserializeRequestedReplaceMetadata(byte[] bytes) throws IOException { return deserializeAvroMetadata(bytes, HoodieRequestedReplaceMetadata.class); } + public static HoodieReplaceCommitMetadata deserializeHoodieReplaceMetadata(byte[] bytes) throws IOException { + return deserializeAvroMetadata(bytes, HoodieReplaceCommitMetadata.class); + } + + public static HoodieIndexPlan deserializeIndexPlan(byte[] bytes) throws IOException { + return deserializeAvroMetadata(bytes, HoodieIndexPlan.class); + } + + public static HoodieIndexCommitMetadata deserializeIndexCommitMetadata(byte[] bytes) throws IOException { + return deserializeAvroMetadata(bytes, HoodieIndexCommitMetadata.class); + } + public static T deserializeAvroMetadata(byte[] bytes, Class clazz) throws IOException { DatumReader reader = new SpecificDatumReader<>(clazz); @@ -161,4 +205,13 @@ public static T deserializeAvroMetadata(byte[] by ValidationUtils.checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz); return fileReader.next(); } + + public static T deserializeAvroRecordMetadata(byte[] bytes, Schema schema) + throws IOException { + return deserializeAvroRecordMetadata(HoodieAvroUtils.bytesToAvro(bytes, schema), schema); + } + + public static T deserializeAvroRecordMetadata(Object object, Schema schema) { + return (T) SpecificData.get().deepCopy(schema, object); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java index 95a2ae618cfc3..a070a7e94d1c0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java @@ -21,14 +21,21 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + import java.io.IOException; import java.util.Collection; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -40,16 +47,40 @@ * 2) Incremental reads - InputFormats can use this API to query */ public class TimelineUtils { + private static final Logger LOG = LogManager.getLogger(TimelineUtils.class); /** * Returns partitions that have new data strictly after commitTime. * Does not include internal operations such as clean in the timeline. */ - public static List getPartitionsWritten(HoodieTimeline timeline) { - HoodieTimeline timelineToSync = timeline.getCommitsAndCompactionTimeline(); + public static List getWrittenPartitions(HoodieTimeline timeline) { + HoodieTimeline timelineToSync = timeline.getWriteTimeline(); return getAffectedPartitions(timelineToSync); } + /** + * Returns partitions that have been deleted or marked for deletion in the given timeline. + * Does not include internal operations such as clean in the timeline. 
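TimelineMetadataUtils above pushes every plan and metadata object through the same Avro data-file serialize/deserialize pattern. The round trip below shows the equivalent flow with a generic record so it stays self-contained; it is a sketch of the pattern, not the Hudi utility itself, and the schema is made up:

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.file.SeekableByteArrayInput;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

import java.io.ByteArrayOutputStream;
import java.io.IOException;

public class AvroMetadataRoundTrip {

  private static final Schema SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"DemoPlan\",\"fields\":["
          + "{\"name\":\"instantTime\",\"type\":\"string\"}]}");

  static byte[] serialize(GenericRecord record) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<>(SCHEMA))) {
      writer.create(SCHEMA, baos); // Avro data-file container, as used for timeline meta files
      writer.append(record);
    }
    return baos.toByteArray();
  }

  static GenericRecord deserialize(byte[] bytes) throws IOException {
    try (DataFileReader<GenericRecord> reader =
             new DataFileReader<>(new SeekableByteArrayInput(bytes), new GenericDatumReader<>(SCHEMA))) {
      return reader.next();
    }
  }

  public static void main(String[] args) throws IOException {
    GenericRecord plan = new GenericData.Record(SCHEMA);
    plan.put("instantTime", "20220301123045123");
    System.out.println(deserialize(serialize(plan)));
  }
}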
+ */ + public static List getDroppedPartitions(HoodieTimeline timeline) { + HoodieTimeline replaceCommitTimeline = timeline.getWriteTimeline().filterCompletedInstants().getCompletedReplaceTimeline(); + + return replaceCommitTimeline.getInstants().flatMap(instant -> { + try { + HoodieReplaceCommitMetadata commitMetadata = HoodieReplaceCommitMetadata.fromBytes( + replaceCommitTimeline.getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class); + if (WriteOperationType.DELETE_PARTITION.equals(commitMetadata.getOperationType())) { + Map> partitionToReplaceFileIds = commitMetadata.getPartitionToReplaceFileIds(); + return partitionToReplaceFileIds.keySet().stream(); + } else { + return Stream.empty(); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to get partitions modified at " + instant, e); + } + }).distinct().filter(partition -> !partition.isEmpty()).collect(Collectors.toList()); + } + /** * Returns partitions that have been modified including internal operations such as clean in the passed timeline. */ @@ -63,6 +94,17 @@ public static List getAffectedPartitions(HoodieTimeline timeline) { return commitMetadata.getPartitionToWriteStats().keySet().stream(); } catch (IOException e) { throw new HoodieIOException("Failed to get partitions written at " + s, e); + } + case HoodieTimeline.REPLACE_COMMIT_ACTION: + try { + HoodieReplaceCommitMetadata commitMetadata = HoodieReplaceCommitMetadata.fromBytes( + timeline.getInstantDetails(s).get(), HoodieReplaceCommitMetadata.class); + Set partitions = new HashSet<>(); + partitions.addAll(commitMetadata.getPartitionToReplaceFileIds().keySet()); + partitions.addAll(commitMetadata.getPartitionToWriteStats().keySet()); + return partitions.stream(); + } catch (IOException e) { + throw new HoodieIOException("Failed to get partitions modified at " + s, e); } case HoodieTimeline.CLEAN_ACTION: try { @@ -103,13 +145,26 @@ public static List getAffectedPartitions(HoodieTimeline timeline) { } /** - * Get extra metadata for specified key from latest commit/deltacommit instant. + * Get extra metadata for specified key from latest commit/deltacommit/replacecommit(eg. insert_overwrite) instant. */ public static Option getExtraMetadataFromLatest(HoodieTableMetaClient metaClient, String extraMetadataKey) { - return metaClient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants().findFirst().map(instant -> + return metaClient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants() + // exclude clustering commits for returning user stored extra metadata + .filter(instant -> !isClusteringCommit(metaClient, instant)) + .findFirst().map(instant -> getMetadataValue(metaClient, extraMetadataKey, instant)).orElse(Option.empty()); } + /** + * Get extra metadata for specified key from latest commit/deltacommit/replacecommit instant including internal commits + * such as clustering. + */ + public static Option getExtraMetadataFromLatestIncludeClustering(HoodieTableMetaClient metaClient, String extraMetadataKey) { + return metaClient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants() + .findFirst().map(instant -> + getMetadataValue(metaClient, extraMetadataKey, instant)).orElse(Option.empty()); + } + /** * Get extra metadata for specified key from all active commit/deltacommit instants. 
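getExtraMetadataFromLatest above walks commits newest-first and skips clustering replacecommits, so that user-stored keys (for example an ingestion checkpoint) are not read from an internal clustering commit that never carried them. A toy version of that lookup order over plain collections, purely illustrative (the "checkpoint" key and commit contents are invented):

import java.util.List;
import java.util.Map;
import java.util.Optional;

public class LatestExtraMetadataLookup {

  record Commit(String timestamp, boolean clusteringCommit, Map<String, String> extraMetadata) {}

  static Optional<String> latestValue(List<Commit> commitsOldestFirst, String key) {
    // Newest first, skipping clustering commits, then read the key from the first match.
    for (int i = commitsOldestFirst.size() - 1; i >= 0; i--) {
      Commit c = commitsOldestFirst.get(i);
      if (c.clusteringCommit()) {
        continue;
      }
      return Optional.ofNullable(c.extraMetadata().get(key));
    }
    return Optional.empty();
  }

  public static void main(String[] args) {
    List<Commit> commits = List.of(
        new Commit("001", false, Map.of("checkpoint", "offset-42")),
        new Commit("002", true, Map.of())); // clustering replacecommit, no user metadata
    System.out.println(latestValue(commits, "checkpoint")); // Optional[offset-42]
  }
}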
*/ @@ -120,6 +175,7 @@ public static Map> getAllExtraMetadataForKey(HoodieTableM private static Option getMetadataValue(HoodieTableMetaClient metaClient, String extraMetadataKey, HoodieInstant instant) { try { + LOG.info("reading checkpoint info for:" + instant + " key: " + extraMetadataKey); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( metaClient.getCommitsTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class); @@ -128,4 +184,29 @@ private static Option getMetadataValue(HoodieTableMetaClient metaClient, throw new HoodieIOException("Unable to parse instant metadata " + instant, e); } } + + public static boolean isClusteringCommit(HoodieTableMetaClient metaClient, HoodieInstant instant) { + try { + if (HoodieTimeline.REPLACE_COMMIT_ACTION.equals(instant.getAction())) { + // replacecommit is used for multiple operations: insert_overwrite/cluster etc. + // Check operation type to see if this instant is related to clustering. + HoodieReplaceCommitMetadata replaceMetadata = HoodieReplaceCommitMetadata.fromBytes( + metaClient.getActiveTimeline().getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class); + return WriteOperationType.CLUSTER.equals(replaceMetadata.getOperationType()); + } + + return false; + } catch (IOException e) { + throw new HoodieIOException("Unable to read instant information: " + instant + " for " + metaClient.getBasePath(), e); + } + } + + public static HoodieDefaultTimeline getTimeline(HoodieTableMetaClient metaClient, boolean includeArchivedTimeline) { + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + if (includeArchivedTimeline) { + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); + return archivedTimeline.mergeTimeline(activeTimeline); + } + return activeTimeline; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/TimelineLayoutVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/TimelineLayoutVersion.java index 994c86778aba5..0ed83ab59c05a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/TimelineLayoutVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/TimelineLayoutVersion.java @@ -34,7 +34,7 @@ public class TimelineLayoutVersion implements Serializable, Comparable partitionMetadataMap = input.getPartitionMetadata() .entrySet() @@ -80,6 +74,7 @@ public HoodieCleanMetadata upgradeFrom(HoodieCleanMetadata input) { return HoodieCleanMetadata.newBuilder() .setEarliestCommitToRetain(input.getEarliestCommitToRetain()) + .setLastCompletedCommitTimestamp(input.getLastCompletedCommitTimestamp()) .setStartCleanTime(input.getStartCleanTime()) .setTimeTakenInMillis(input.getTimeTakenInMillis()) .setTotalFilesDeleted(input.getTotalFilesDeleted()) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV1MigrationHandler.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV1MigrationHandler.java index 0010aa21fb1c1..5c8d9b8fb3e26 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV1MigrationHandler.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV1MigrationHandler.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.table.timeline.versioning.clean; +import java.util.ArrayList; import java.util.HashMap; import 
org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -56,11 +57,9 @@ public HoodieCleanerPlan downgradeFrom(HoodieCleanerPlan plan) { "This version do not support METADATA_ONLY bootstrapped tables. Failed to downgrade."); } Map> filesPerPartition = plan.getFilePathsToBeDeletedPerPartition().entrySet().stream() - .map(e -> { - return Pair.of(e.getKey(), e.getValue().stream().map(v -> new Path(v.getFilePath()).getName()) - .collect(Collectors.toList())); - }).collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - return new HoodieCleanerPlan(plan.getEarliestInstantToRetain(), plan.getPolicy(), filesPerPartition, VERSION, - new HashMap<>()); + .map(e -> Pair.of(e.getKey(), e.getValue().stream().map(v -> new Path(v.getFilePath()).getName()) + .collect(Collectors.toList()))).collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + return new HoodieCleanerPlan(plan.getEarliestInstantToRetain(), plan.getLastCompletedCommitTimestamp(), + plan.getPolicy(), filesPerPartition, VERSION, new HashMap<>(), new ArrayList<>()); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV2MigrationHandler.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV2MigrationHandler.java index e141e9a15499f..c17af4020a3ca 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV2MigrationHandler.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/versioning/clean/CleanPlanV2MigrationHandler.java @@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -52,8 +53,8 @@ public HoodieCleanerPlan upgradeFrom(HoodieCleanerPlan plan) { .map(v -> new HoodieCleanFileInfo( new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), e.getKey()), v).toString(), false)) .collect(Collectors.toList()))).collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - return new HoodieCleanerPlan(plan.getEarliestInstantToRetain(), plan.getPolicy(), new HashMap<>(), VERSION, - filePathsPerPartition); + return new HoodieCleanerPlan(plan.getEarliestInstantToRetain(), plan.getLastCompletedCommitTimestamp(), + plan.getPolicy(), new HashMap<>(), VERSION, filePathsPerPartition, new ArrayList<>()); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index 1dd6b006b7a30..39bb3a2a5da27 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -18,8 +18,6 @@ package org.apache.hudi.common.table.view; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; import org.apache.hudi.common.bootstrap.index.BootstrapIndex; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.BootstrapBaseFileMapping; @@ -41,9 +39,13 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieIOException; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.FileNotFoundException; import java.io.IOException; import 
java.io.Serializable; import java.util.AbstractMap; @@ -59,9 +61,11 @@ import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import java.util.function.Predicate; +import java.util.regex.Matcher; import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS; import static org.apache.hudi.common.table.timeline.HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS; @@ -92,8 +96,8 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV private BootstrapIndex bootstrapIndex; - private String getPartitionPathFromFilePath(String fullPath) { - return FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), new Path(fullPath).getParent()); + private String getPartitionPathFor(HoodieBaseFile baseFile) { + return FSUtils.getRelativePartitionPath(metaClient.getBasePathV2(), baseFile.getHadoopPath().getParent()); } /** @@ -117,13 +121,13 @@ protected void init(HoodieTableMetaClient metaClient, HoodieTimeline visibleActi * @param visibleActiveTimeline Visible Active Timeline */ protected void refreshTimeline(HoodieTimeline visibleActiveTimeline) { - this.visibleCommitsAndCompactionTimeline = visibleActiveTimeline.getCommitsAndCompactionTimeline(); + this.visibleCommitsAndCompactionTimeline = visibleActiveTimeline.getWriteTimeline(); } /** * Adds the provided statuses into the file system view, and also caches it inside this object. */ - protected List addFilesToView(FileStatus[] statuses) { + public List addFilesToView(FileStatus[] statuses) { HoodieTimer timer = new HoodieTimer().startTimer(); List fileGroups = buildFileGroups(statuses, visibleCommitsAndCompactionTimeline, true); long fgBuildTimeTakenMs = timer.endTimer(); @@ -163,14 +167,14 @@ protected List buildFileGroups(FileStatus[] statuses, HoodieTim protected List buildFileGroups(Stream baseFileStream, Stream logFileStream, HoodieTimeline timeline, boolean addPendingCompactionFileSlice) { Map, List> baseFiles = - baseFileStream.collect(Collectors.groupingBy((baseFile) -> { - String partitionPathStr = getPartitionPathFromFilePath(baseFile.getPath()); + baseFileStream.collect(Collectors.groupingBy(baseFile -> { + String partitionPathStr = getPartitionPathFor(baseFile); return Pair.of(partitionPathStr, baseFile.getFileId()); })); Map, List> logFiles = logFileStream.collect(Collectors.groupingBy((logFile) -> { String partitionPathStr = - FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), logFile.getPath().getParent()); + FSUtils.getRelativePartitionPath(metaClient.getBasePathV2(), logFile.getPath().getParent()); return Pair.of(partitionPathStr, logFile.getFileId()); })); @@ -180,7 +184,8 @@ protected List buildFileGroups(Stream baseFileS List fileGroups = new ArrayList<>(); fileIdSet.forEach(pair -> { String fileId = pair.getValue(); - HoodieFileGroup group = new HoodieFileGroup(pair.getKey(), fileId, timeline); + String partitionPath = pair.getKey(); + HoodieFileGroup group = new HoodieFileGroup(partitionPath, fileId, timeline); if (baseFiles.containsKey(pair)) { baseFiles.get(pair).forEach(group::addBaseFile); } @@ -219,6 +224,17 @@ private void resetFileGroupsReplaced(HoodieTimeline timeline) { // get replace instant mapping for each partition, fileId return replaceMetadata.getPartitionToReplaceFileIds().entrySet().stream().flatMap(entry -> 
entry.getValue().stream().map(e -> new AbstractMap.SimpleEntry<>(new HoodieFileGroupId(entry.getKey(), e), instant))); + } catch (HoodieIOException ex) { + + if (ex.getIOException() instanceof FileNotFoundException) { + // Replace instant could be deleted by archive and FileNotFoundException could be threw during getInstantDetails function + // So that we need to catch the FileNotFoundException here and continue + LOG.warn(ex.getMessage()); + return Stream.empty(); + } else { + throw ex; + } + } catch (IOException e) { throw new HoodieIOException("error reading commit metadata for " + instant); } @@ -230,19 +246,24 @@ private void resetFileGroupsReplaced(HoodieTimeline timeline) { + replacedFileGroups.size() + " replaced file groups"); } + @Override + public void close() { + try { + writeLock.lock(); + clear(); + } finally { + writeLock.unlock(); + } + } + /** * Clears the partition Map and reset view states. */ @Override - public final void reset() { + public void reset() { try { writeLock.lock(); - - addedPartitions.clear(); - resetViewState(); - - bootstrapIndex = null; - + clear(); // Initialize with new Hoodie timeline. init(metaClient, getTimeline()); } finally { @@ -250,6 +271,15 @@ public final void reset() { } } + /** + * Clear the resource. + */ + private void clear() { + addedPartitions.clear(); + resetViewState(); + bootstrapIndex = null; + } + /** * Allows all view metadata in file system view storage to be reset by subclasses. */ @@ -272,13 +302,11 @@ private void ensurePartitionLoadedCorrectly(String partition) { try { LOG.info("Building file system view for partition (" + partitionPathStr + ")"); - // Create the path if it does not exist already - Path partitionPath = FSUtils.getPartitionPath(metaClient.getBasePath(), partitionPathStr); - FSUtils.createPathIfNotExists(metaClient.getFs(), partitionPath); + Path partitionPath = FSUtils.getPartitionPath(metaClient.getBasePathV2(), partitionPathStr); long beginLsTs = System.currentTimeMillis(); - FileStatus[] statuses = metaClient.getFs().listStatus(partitionPath); + FileStatus[] statuses = listPartition(partitionPath); long endLsTs = System.currentTimeMillis(); - LOG.info("#files found in partition (" + partitionPathStr + ") =" + statuses.length + ", Time taken =" + LOG.debug("#files found in partition (" + partitionPathStr + ") =" + statuses.length + ", Time taken =" + (endLsTs - beginLsTs)); List groups = addFilesToView(statuses); @@ -292,11 +320,32 @@ private void ensurePartitionLoadedCorrectly(String partition) { LOG.debug("View already built for Partition :" + partitionPathStr + ", FOUND is "); } long endTs = System.currentTimeMillis(); - LOG.info("Time to load partition (" + partitionPathStr + ") =" + (endTs - beginTs)); + LOG.debug("Time to load partition (" + partitionPathStr + ") =" + (endTs - beginTs)); return true; }); } + /** + * Return all the files from the partition. + * + * @param partitionPath The absolute path of the partition + * @throws IOException + */ + protected FileStatus[] listPartition(Path partitionPath) throws IOException { + try { + return metaClient.getFs().listStatus(partitionPath); + } catch (IOException e) { + // Create the path if it does not exist already + if (!metaClient.getFs().exists(partitionPath)) { + metaClient.getFs().mkdirs(partitionPath); + return new FileStatus[0]; + } else { + // in case the partition path was created by another caller + return metaClient.getFs().listStatus(partitionPath); + } + } + } + /** * Helper to convert file-status to base-files. 
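listPartition above replaces the old create-then-list behavior: it lists optimistically and only creates the partition directory when the listing fails because the path is genuinely missing, retrying the listing if another caller created it in the meantime. A standalone sketch of that fallback with the Hadoop FileSystem API against the local file system (the partition path is just an example):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class ListPartitionWithFallback {

  static FileStatus[] listPartition(FileSystem fs, Path partitionPath) throws IOException {
    try {
      return fs.listStatus(partitionPath);    // optimistic listing, the common case
    } catch (IOException e) {
      if (!fs.exists(partitionPath)) {
        fs.mkdirs(partitionPath);             // first reader of a brand-new partition
        return new FileStatus[0];
      }
      return fs.listStatus(partitionPath);    // another caller created it concurrently
    }
  }

  public static void main(String[] args) throws IOException {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path partition = new Path("/tmp/demo-table/2022/03/01");
    System.out.println("files: " + listPartition(fs, partition).length);
  }
}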
* @@ -314,8 +363,11 @@ private Stream convertFileStatusesToBaseFiles(FileStatus[] statu * @param statuses List of FIle-Status */ private Stream convertFileStatusesToLogFiles(FileStatus[] statuses) { - Predicate rtFilePredicate = fileStatus -> fileStatus.getPath().getName() - .contains(metaClient.getTableConfig().getLogFileFormat().getFileExtension()); + Predicate rtFilePredicate = fileStatus -> { + String fileName = fileStatus.getPath().getName(); + Matcher matcher = FSUtils.LOG_FILE_PATTERN.matcher(fileName); + return matcher.find() && fileName.contains(metaClient.getTableConfig().getLogFileFormat().getFileExtension()); + }; return Arrays.stream(statuses).filter(rtFilePredicate).map(HoodieLogFile::new); } @@ -326,7 +378,7 @@ private Stream convertFileStatusesToLogFiles(FileStatus[] statuse * @param baseFile base File */ protected boolean isBaseFileDueToPendingCompaction(HoodieBaseFile baseFile) { - final String partitionPath = getPartitionPathFromFilePath(baseFile.getPath()); + final String partitionPath = getPartitionPathFor(baseFile); Option> compactionWithInstantTime = getPendingCompactionOperationWithInstant(new HoodieFileGroupId(partitionPath, baseFile.getFileId())); @@ -334,6 +386,19 @@ protected boolean isBaseFileDueToPendingCompaction(HoodieBaseFile baseFile) { && baseFile.getCommitTime().equals(compactionWithInstantTime.get().getKey()); } + /** + * With async clustering, it is possible to see partial/complete base-files due to inflight-clustering, Ignore those + * base-files. + * + * @param baseFile base File + */ + protected boolean isBaseFileDueToPendingClustering(HoodieBaseFile baseFile) { + List pendingReplaceInstants = + metaClient.getActiveTimeline().filterPendingReplaceTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + + return !pendingReplaceInstants.isEmpty() && pendingReplaceInstants.contains(baseFile.getCommitTime()); + } + /** * Returns true if the file-group is under pending-compaction and the file-slice' baseInstant matches compaction * Instant. @@ -343,7 +408,6 @@ protected boolean isBaseFileDueToPendingCompaction(HoodieBaseFile baseFile) { protected boolean isFileSliceAfterPendingCompaction(FileSlice fileSlice) { Option> compactionWithInstantTime = getPendingCompactionOperationWithInstant(fileSlice.getFileGroupId()); - LOG.info("Pending Compaction instant for (" + fileSlice + ") is :" + compactionWithInstantTime); return (compactionWithInstantTime.isPresent()) && fileSlice.getBaseInstantTime().equals(compactionWithInstantTime.get().getKey()); } @@ -353,18 +417,21 @@ protected boolean isFileSliceAfterPendingCompaction(FileSlice fileSlice) { * base-files. * * @param fileSlice File Slice + * @param includeEmptyFileSlice include empty file-slice */ - protected FileSlice filterBaseFileAfterPendingCompaction(FileSlice fileSlice) { + protected Stream filterBaseFileAfterPendingCompaction(FileSlice fileSlice, boolean includeEmptyFileSlice) { if (isFileSliceAfterPendingCompaction(fileSlice)) { - LOG.info("File Slice (" + fileSlice + ") is in pending compaction"); + LOG.debug("File Slice (" + fileSlice + ") is in pending compaction"); // Base file is filtered out of the file-slice as the corresponding compaction // instant not completed yet. 
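isBaseFileDueToPendingClustering above hides base files whose commit time matches a pending replacecommit, so readers never see partially written clustering output. The core of that check, reduced to plain collections (the BaseFile record and timestamps are invented for the sketch):

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class PendingClusteringFilterSketch {

  record BaseFile(String fileId, String commitTime) {}

  public static void main(String[] args) {
    // Timestamps of replacecommit instants that are still requested/inflight.
    Set<String> pendingReplaceInstants = Set.of("20220301120000000");

    List<BaseFile> baseFiles = List.of(
        new BaseFile("f1", "20220301110000000"),   // committed data, stays visible
        new BaseFile("f2", "20220301120000000"));  // written by in-flight clustering, hidden

    List<BaseFile> visible = baseFiles.stream()
        .filter(bf -> !pendingReplaceInstants.contains(bf.commitTime()))
        .collect(Collectors.toList());

    System.out.println(visible); // only f1
  }
}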
- FileSlice transformed = - new FileSlice(fileSlice.getPartitionPath(), fileSlice.getBaseInstantTime(), fileSlice.getFileId()); + FileSlice transformed = new FileSlice(fileSlice.getPartitionPath(), fileSlice.getBaseInstantTime(), fileSlice.getFileId()); fileSlice.getLogFiles().forEach(transformed::addLogFile); - return transformed; + if (transformed.isEmpty() && !includeEmptyFileSlice) { + return Stream.of(); + } + return Stream.of(transformed); } - return fileSlice; + return Stream.of(fileSlice); } protected HoodieFileGroup addBootstrapBaseFileIfPresent(HoodieFileGroup fileGroup) { @@ -412,6 +479,20 @@ public final Stream> getPendingCompactionOpera } } + public final List getPartitionPaths() { + try { + readLock.lock(); + return fetchAllStoredFileGroups() + .filter(fg -> !isFileGroupReplaced(fg)) + .map(HoodieFileGroup::getPartitionPath) + .distinct() + .map(name -> name.isEmpty() ? metaClient.getBasePathV2() : new Path(metaClient.getBasePathV2(), name)) + .collect(Collectors.toList()); + } finally { + readLock.unlock(); + } + } + @Override public final Stream getLatestBaseFiles(String partitionStr) { try { @@ -447,7 +528,7 @@ public final Stream getLatestBaseFilesBeforeOrOn(String partitio .map(fileGroup -> Option.fromJavaOptional(fileGroup.getAllBaseFiles() .filter(baseFile -> HoodieTimeline.compareTimestamps(baseFile.getCommitTime(), HoodieTimeline.LESSER_THAN_OR_EQUALS, maxCommitTime )) - .filter(df -> !isBaseFileDueToPendingCompaction(df)).findFirst())) + .filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)).findFirst())) .filter(Option::isPresent).map(Option::get) .map(df -> addBootstrapBaseFileIfPresent(new HoodieFileGroupId(partitionPath, df.getFileId()), df)); } finally { @@ -466,7 +547,7 @@ public final Option getBaseFileOn(String partitionStr, String in } else { return fetchHoodieFileGroup(partitionPath, fileId).map(fileGroup -> fileGroup.getAllBaseFiles() .filter(baseFile -> HoodieTimeline.compareTimestamps(baseFile.getCommitTime(), HoodieTimeline.EQUALS, - instantTime)).filter(df -> !isBaseFileDueToPendingCompaction(df)).findFirst().orElse(null)) + instantTime)).filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)).findFirst().orElse(null)) .map(df -> addBootstrapBaseFileIfPresent(new HoodieFileGroupId(partitionPath, fileId), df)); } } finally { @@ -502,7 +583,7 @@ public final Stream getLatestBaseFilesInRange(List commi .filter(fileGroup -> !isFileGroupReplacedBeforeAny(fileGroup.getFileGroupId(), commitsToReturn)) .map(fileGroup -> Pair.of(fileGroup.getFileGroupId(), Option.fromJavaOptional( fileGroup.getAllBaseFiles().filter(baseFile -> commitsToReturn.contains(baseFile.getCommitTime()) - && !isBaseFileDueToPendingCompaction(baseFile)).findFirst()))).filter(p -> p.getValue().isPresent()) + && !isBaseFileDueToPendingCompaction(baseFile) && !isBaseFileDueToPendingClustering(baseFile)).findFirst()))).filter(p -> p.getValue().isPresent()) .map(p -> addBootstrapBaseFileIfPresent(p.getKey(), p.getValue().get())); } finally { readLock.unlock(); @@ -518,7 +599,7 @@ public final Stream getAllBaseFiles(String partitionStr) { return fetchAllBaseFiles(partitionPath) .filter(df -> !isFileGroupReplaced(partitionPath, df.getFileId())) .filter(df -> visibleCommitsAndCompactionTimeline.containsOrBeforeTimelineStarts(df.getCommitTime())) - .filter(df -> !isBaseFileDueToPendingCompaction(df)) + .filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)) .map(df -> 
addBootstrapBaseFileIfPresent(new HoodieFileGroupId(partitionPath, df.getFileId()), df)); } finally { readLock.unlock(); @@ -532,9 +613,9 @@ public final Stream getLatestFileSlices(String partitionStr) { String partitionPath = formatPartitionKey(partitionStr); ensurePartitionLoadedCorrectly(partitionPath); return fetchLatestFileSlices(partitionPath) - .filter(slice -> !isFileGroupReplaced(slice.getFileGroupId())) - .map(this::filterBaseFileAfterPendingCompaction) - .map(this::addBootstrapBaseFileIfPresent); + .filter(slice -> !isFileGroupReplaced(slice.getFileGroupId())) + .flatMap(slice -> this.filterBaseFileAfterPendingCompaction(slice, true)) + .map(this::addBootstrapBaseFileIfPresent); } finally { readLock.unlock(); } @@ -553,7 +634,10 @@ public final Option getLatestFileSlice(String partitionStr, String fi return Option.empty(); } else { Option fs = fetchLatestFileSlice(partitionPath, fileId); - return fs.map(this::filterBaseFileAfterPendingCompaction).map(this::addBootstrapBaseFileIfPresent); + if (!fs.isPresent()) { + return Option.empty(); + } + return Option.ofNullable(filterBaseFileAfterPendingCompaction(fs.get(), true).map(this::addBootstrapBaseFileIfPresent).findFirst().orElse(null)); } } finally { readLock.unlock(); @@ -591,13 +675,21 @@ public final Stream getLatestFileSlicesBeforeOrOn(String partitionStr readLock.lock(); String partitionPath = formatPartitionKey(partitionStr); ensurePartitionLoadedCorrectly(partitionPath); - Stream fileSliceStream = fetchLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime) - .filter(slice -> !isFileGroupReplacedBeforeOrOn(slice.getFileGroupId(), maxCommitTime)); + Stream> allFileSliceStream = fetchAllStoredFileGroups(partitionPath) + .filter(slice -> !isFileGroupReplacedBeforeOrOn(slice.getFileGroupId(), maxCommitTime)) + .map(fg -> fg.getAllFileSlicesBeforeOn(maxCommitTime)); if (includeFileSlicesInPendingCompaction) { - return fileSliceStream.map(this::filterBaseFileAfterPendingCompaction).map(this::addBootstrapBaseFileIfPresent); + return allFileSliceStream.map(sliceStream -> sliceStream.flatMap(slice -> this.filterBaseFileAfterPendingCompaction(slice, false))) + .map(sliceStream -> Option.fromJavaOptional(sliceStream.findFirst())).filter(Option::isPresent).map(Option::get) + .map(this::addBootstrapBaseFileIfPresent); } else { - return fileSliceStream.filter(fs -> !isPendingCompactionScheduledForFileId(fs.getFileGroupId())) - .map(this::addBootstrapBaseFileIfPresent); + return allFileSliceStream + .map(sliceStream -> + Option.fromJavaOptional(sliceStream + .filter(slice -> !isPendingCompactionScheduledForFileId(slice.getFileGroupId())) + .filter(slice -> !slice.isEmpty()) + .findFirst())) + .filter(Option::isPresent).map(Option::get).map(this::addBootstrapBaseFileIfPresent); } } finally { readLock.unlock(); @@ -680,6 +772,16 @@ public Stream getReplacedFileGroupsBeforeOrOn(String maxCommitT return getAllFileGroupsIncludingReplaced(partitionPath).filter(fg -> isFileGroupReplacedBeforeOrOn(fg.getFileGroupId(), maxCommitTime)); } + @Override + public Stream getReplacedFileGroupsBefore(String maxCommitTime, String partitionPath) { + return getAllFileGroupsIncludingReplaced(partitionPath).filter(fg -> isFileGroupReplacedBefore(fg.getFileGroupId(), maxCommitTime)); + } + + @Override + public Stream getAllReplacedFileGroups(String partitionPath) { + return getAllFileGroupsIncludingReplaced(partitionPath).filter(fg -> isFileGroupReplaced(fg.getFileGroupId())); + } + @Override public final Stream> getFileGroupsInPendingClustering() { try { @@ 
-809,7 +911,6 @@ protected abstract Option> getPendingCompactio */ abstract Stream fetchBootstrapBaseFiles(); - /** * Checks if partition is pre-loaded and available in store. * @@ -883,14 +984,15 @@ Stream fetchLatestFileSliceInRange(List commitsToReturn) { */ Stream fetchAllFileSlices(String partitionPath) { return fetchAllStoredFileGroups(partitionPath).map(this::addBootstrapBaseFileIfPresent) - .map(HoodieFileGroup::getAllFileSlices).flatMap(sliceList -> sliceList); + .flatMap(HoodieFileGroup::getAllFileSlices); } /** * Default implementation for fetching latest base-files for the partition-path. */ - Stream fetchLatestBaseFiles(final String partitionPath) { + public Stream fetchLatestBaseFiles(final String partitionPath) { return fetchAllStoredFileGroups(partitionPath) + .filter(fg -> !isFileGroupReplaced(fg)) .map(fg -> Pair.of(fg.getFileGroupId(), getLatestBaseFile(fg))) .filter(p -> p.getValue().isPresent()) .map(p -> addBootstrapBaseFileIfPresent(p.getKey(), p.getValue().get())); @@ -898,7 +1000,7 @@ Stream fetchLatestBaseFiles(final String partitionPath) { protected Option getLatestBaseFile(HoodieFileGroup fileGroup) { return Option - .fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> !isBaseFileDueToPendingCompaction(df)).findFirst()); + .fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)).findFirst()); } /** @@ -918,8 +1020,7 @@ private Stream fetchLatestBaseFiles() { * @param partitionPath partition-path */ Stream fetchAllBaseFiles(String partitionPath) { - return fetchAllStoredFileGroups(partitionPath).map(HoodieFileGroup::getAllBaseFiles) - .flatMap(baseFileList -> baseFileList); + return fetchAllStoredFileGroups(partitionPath).flatMap(HoodieFileGroup::getAllBaseFiles); } /** @@ -938,18 +1039,6 @@ Stream fetchLatestFileSlices(String partitionPath) { .map(Option::get); } - /** - * Default implementation for fetching latest file-slices for a partition path as of instant. - * - * @param partitionPath Partition Path - * @param maxCommitTime Instant Time - */ - Stream fetchLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime) { - return fetchAllStoredFileGroups(partitionPath) - .map(fileGroup -> fileGroup.getLatestFileSliceBeforeOrOn(maxCommitTime)).filter(Option::isPresent) - .map(Option::get); - } - /** * Helper to merge last 2 file-slices. These 2 file-slices do not have compaction done yet. * @@ -993,7 +1082,7 @@ private FileSlice fetchMergedFileSlice(HoodieFileGroup fileGroup, FileSlice file /** * Default implementation for fetching latest base-file. - * + * * @param partitionPath Partition path * @param fileId File Id * @return base File if present @@ -1005,7 +1094,7 @@ protected Option fetchLatestBaseFile(String partitionPath, Strin /** * Default implementation for fetching file-slice. 
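// The hunks above replace the map(...).flatMap(list -> list) chains with a single flatMap(...)
// call. A minimal stand-alone illustration of that equivalence using plain java.util.stream;
// FileGroup below is a hypothetical stand-in for HoodieFileGroup, not the Hudi class.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class FlatMapRefactorSketch {

  static class FileGroup {
    private final List<String> slices;
    FileGroup(String... slices) { this.slices = Arrays.asList(slices); }
    Stream<String> getAllSlices() { return slices.stream(); }
  }

  public static void main(String[] args) {
    List<FileGroup> groups = Arrays.asList(new FileGroup("fs1", "fs2"), new FileGroup("fs3"));

    // Old form: map each group to a Stream, then flatten with an identity lambda.
    List<String> before = groups.stream().map(FileGroup::getAllSlices).flatMap(s -> s).collect(Collectors.toList());

    // New form: flatMap directly on the method reference; same elements, one less stage.
    List<String> after = groups.stream().flatMap(FileGroup::getAllSlices).collect(Collectors.toList());

    System.out.println(before.equals(after)); // true
  }
}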
- * + * * @param partitionPath Partition path * @param fileId File Id * @return File Slice if present @@ -1031,6 +1120,15 @@ private boolean isFileGroupReplacedBeforeAny(HoodieFileGroupId fileGroupId, List return isFileGroupReplacedBeforeOrOn(fileGroupId, instants.stream().max(Comparator.naturalOrder()).get()); } + private boolean isFileGroupReplacedBefore(HoodieFileGroupId fileGroupId, String instant) { + Option hoodieInstantOption = getReplaceInstant(fileGroupId); + if (!hoodieInstantOption.isPresent()) { + return false; + } + + return HoodieTimeline.compareTimestamps(instant, GREATER_THAN, hoodieInstantOption.get().getTimestamp()); + } + private boolean isFileGroupReplacedBeforeOrOn(HoodieFileGroupId fileGroupId, String instant) { Option hoodieInstantOption = getReplaceInstant(fileGroupId); if (!hoodieInstantOption.isPresent()) { @@ -1071,15 +1169,14 @@ public void sync() { */ protected void runSync(HoodieTimeline oldTimeline, HoodieTimeline newTimeline) { refreshTimeline(newTimeline); - addedPartitions.clear(); - resetViewState(); + clear(); // Initialize with new Hoodie timeline. init(metaClient, newTimeline); } /** * Return Only Commits and Compaction timeline for building file-groups. - * + * * @return {@code HoodieTimeline} */ public HoodieTimeline getVisibleCommitsAndCompactionTimeline() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java index d31018123c3a5..48023d50463d2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java @@ -18,10 +18,19 @@ package org.apache.hudi.common.table.view; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.HoodieMetastoreConfig; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.function.SerializableSupplier; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Functions.Function2; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.metadata.HoodieMetadataFileSystemView; +import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -52,6 +61,8 @@ public class FileSystemViewManager { private static final Logger LOG = LogManager.getLogger(FileSystemViewManager.class); + private static final String HOODIE_METASTORE_FILE_SYSTEM_VIEW_CLASS = "org.apache.hudi.common.table.view.HoodieMetastoreFileSystemView"; + private final SerializableConfiguration conf; // The View Storage config used to store file-system views private final FileSystemViewStorageConfig viewStorageConfig; @@ -60,9 +71,9 @@ public class FileSystemViewManager { // Factory Map to create file-system views private final Function2 viewCreator; - public FileSystemViewManager(SerializableConfiguration conf, FileSystemViewStorageConfig viewStorageConfig, + private FileSystemViewManager(HoodieEngineContext context, FileSystemViewStorageConfig viewStorageConfig, Function2 viewCreator) { - this.conf = new SerializableConfiguration(conf); + this.conf = context.getHadoopConf(); 
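// The getFileSystemView method just below keeps one SyncableFileSystemView per base path in
// a ConcurrentHashMap and builds it lazily through the injected view creator. A minimal
// sketch of that caching pattern, assuming a hypothetical View type in place of Hudi's
// SyncableFileSystemView and a plain java.util.function.Function in place of Function2.
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;

public class ViewCacheSketch {

  static class View {
    final String basePath;
    View(String basePath) { this.basePath = basePath; }
  }

  private final ConcurrentHashMap<String, View> viewsByBasePath = new ConcurrentHashMap<>();
  private final Function<String, View> viewCreator;

  ViewCacheSketch(Function<String, View> viewCreator) { this.viewCreator = viewCreator; }

  View getView(String basePath) {
    // The creator runs at most once per base path; later callers reuse the cached instance.
    return viewsByBasePath.computeIfAbsent(basePath, viewCreator);
  }

  public static void main(String[] args) {
    ViewCacheSketch manager = new ViewCacheSketch(View::new);
    System.out.println(manager.getView("/tmp/table") == manager.getView("/tmp/table")); // true: cached
  }
}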
this.viewStorageConfig = viewStorageConfig; this.globalViewMap = new ConcurrentHashMap<>(); this.viewCreator = viewCreator; @@ -88,7 +99,7 @@ public void clearFileSystemView(String basePath) { */ public SyncableFileSystemView getFileSystemView(String basePath) { return globalViewMap.computeIfAbsent(basePath, (path) -> { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(conf.newCopy(), path); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(conf.newCopy()).setBasePath(path).build(); return viewCreator.apply(metaClient, viewStorageConfig); }); } @@ -108,8 +119,10 @@ public SyncableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient * Closes all views opened. */ public void close() { - this.globalViewMap.values().forEach(SyncableFileSystemView::close); - this.globalViewMap.clear(); + if (!this.globalViewMap.isEmpty()) { + this.globalViewMap.values().forEach(SyncableFileSystemView::close); + this.globalViewMap.clear(); + } } // FACTORY METHODS FOR CREATING FILE-SYSTEM VIEWS @@ -137,27 +150,57 @@ private static RocksDbBasedFileSystemView createRocksDBBasedFileSystemView(Seria * @return */ private static SpillableMapBasedFileSystemView createSpillableMapBasedFileSystemView(SerializableConfiguration conf, - FileSystemViewStorageConfig viewConf, HoodieTableMetaClient metaClient) { + FileSystemViewStorageConfig viewConf, HoodieTableMetaClient metaClient, HoodieCommonConfig commonConfig) { LOG.info("Creating SpillableMap based view for basePath " + metaClient.getBasePath()); HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); - return new SpillableMapBasedFileSystemView(metaClient, timeline, viewConf); + return new SpillableMapBasedFileSystemView(metaClient, timeline, viewConf, commonConfig); } /** * Create an in-memory file System view for a table. - * - * @param conf Hadoop Configuration - * @param viewConf View Storage Configuration - * @param metaClient HoodieTableMetaClient - * @return + * */ - private static HoodieTableFileSystemView createInMemoryFileSystemView(SerializableConfiguration conf, - FileSystemViewStorageConfig viewConf, HoodieTableMetaClient metaClient) { + private static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieMetadataConfig metadataConfig, FileSystemViewStorageConfig viewConf, + HoodieTableMetaClient metaClient, SerializableSupplier metadataSupplier) { LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath()); HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); + if (metadataConfig.enabled()) { + ValidationUtils.checkArgument(metadataSupplier != null, "Metadata supplier is null. 
Cannot instantiate metadata file system view"); + return new HoodieMetadataFileSystemView(metaClient, metaClient.getActiveTimeline().filterCompletedAndCompactionInstants(), + metadataSupplier.get()); + } + if (metaClient.getMetastoreConfig().enableMetastore()) { + return (HoodieTableFileSystemView) ReflectionUtils.loadClass(HOODIE_METASTORE_FILE_SYSTEM_VIEW_CLASS, + new Class[] {HoodieTableMetaClient.class, HoodieTimeline.class, HoodieMetastoreConfig.class}, + metaClient, metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), metaClient.getMetastoreConfig()); + } return new HoodieTableFileSystemView(metaClient, timeline, viewConf.isIncrementalTimelineSyncEnabled()); } + public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieEngineContext engineContext, HoodieTableMetaClient metaClient, + HoodieMetadataConfig metadataConfig) { + + return createInMemoryFileSystemViewWithTimeline(engineContext, metaClient, metadataConfig, + metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()); + + } + + public static HoodieTableFileSystemView createInMemoryFileSystemViewWithTimeline(HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, + HoodieMetadataConfig metadataConfig, + HoodieTimeline timeline) { + LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath()); + if (metadataConfig.enabled()) { + return new HoodieMetadataFileSystemView(engineContext, metaClient, timeline, metadataConfig); + } + if (metaClient.getMetastoreConfig().enableMetastore()) { + return (HoodieTableFileSystemView) ReflectionUtils.loadClass(HOODIE_METASTORE_FILE_SYSTEM_VIEW_CLASS, + new Class[] {HoodieTableMetaClient.class, HoodieTimeline.class, HoodieMetadataConfig.class}, + metaClient, metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), metaClient.getMetastoreConfig()); + } + return new HoodieTableFileSystemView(metaClient, timeline); + } + /** * Create a remote file System view for a table. * @@ -171,52 +214,68 @@ private static RemoteHoodieTableFileSystemView createRemoteFileSystemView(Serial LOG.info("Creating remote view for basePath " + metaClient.getBasePath() + ". Server=" + viewConf.getRemoteViewServerHost() + ":" + viewConf.getRemoteViewServerPort() + ", Timeout=" + viewConf.getRemoteTimelineClientTimeoutSecs()); - return new RemoteHoodieTableFileSystemView(viewConf.getRemoteViewServerHost(), viewConf.getRemoteViewServerPort(), - metaClient, viewConf.getRemoteTimelineClientTimeoutSecs()); + return new RemoteHoodieTableFileSystemView(metaClient, viewConf); + } + + public static FileSystemViewManager createViewManager(final HoodieEngineContext context, + final HoodieMetadataConfig metadataConfig, + final FileSystemViewStorageConfig config, + final HoodieCommonConfig commonConfig) { + return createViewManager(context, metadataConfig, config, commonConfig, (SerializableSupplier) null); + } + + public static FileSystemViewManager createViewManager(final HoodieEngineContext context, + final HoodieMetadataConfig metadataConfig, + final FileSystemViewStorageConfig config, + final HoodieCommonConfig commonConfig, + final String basePath) { + return createViewManager(context, metadataConfig, config, commonConfig, + () -> HoodieTableMetadata.create(context, metadataConfig, basePath, config.getSpillableDir(), true)); } /** * Main Factory method for building file-system views. 
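// The factory methods above hand in the metadata table as a supplier
// (() -> HoodieTableMetadata.create(...)) rather than as a ready-made instance, so the
// metadata reader is only built if a metadata-backed view is actually requested. A small
// stand-alone sketch of that deferred-creation idea; ExpensiveResource is a hypothetical
// stand-in for HoodieTableMetadata.
import java.util.function.Supplier;

public class LazySupplierSketch {

  static class ExpensiveResource {
    ExpensiveResource() { System.out.println("resource created"); }
  }

  public static void main(String[] args) {
    Supplier<ExpensiveResource> supplier = ExpensiveResource::new;
    System.out.println("supplier wired, nothing created yet");

    boolean metadataEnabled = true;                 // mirrors metadataConfig.enabled() in the factory
    if (metadataEnabled) {
      ExpensiveResource resource = supplier.get();  // only this branch pays the construction cost
      System.out.println("built lazily: " + (resource != null));
    }
  }
}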
- * - * @param conf Hadoop Configuration - * @param config View Storage Configuration - * @return + * */ - public static FileSystemViewManager createViewManager(final SerializableConfiguration conf, - final FileSystemViewStorageConfig config) { + public static FileSystemViewManager createViewManager(final HoodieEngineContext context, + final HoodieMetadataConfig metadataConfig, + final FileSystemViewStorageConfig config, + final HoodieCommonConfig commonConfig, + final SerializableSupplier metadataSupplier) { LOG.info("Creating View Manager with storage type :" + config.getStorageType()); + final SerializableConfiguration conf = context.getHadoopConf(); switch (config.getStorageType()) { case EMBEDDED_KV_STORE: LOG.info("Creating embedded rocks-db based Table View"); - return new FileSystemViewManager(conf, config, + return new FileSystemViewManager(context, config, (metaClient, viewConf) -> createRocksDBBasedFileSystemView(conf, viewConf, metaClient)); case SPILLABLE_DISK: LOG.info("Creating Spillable Disk based Table View"); - return new FileSystemViewManager(conf, config, - (metaClient, viewConf) -> createSpillableMapBasedFileSystemView(conf, viewConf, metaClient)); + return new FileSystemViewManager(context, config, + (metaClient, viewConf) -> createSpillableMapBasedFileSystemView(conf, viewConf, metaClient, commonConfig)); case MEMORY: LOG.info("Creating in-memory based Table View"); - return new FileSystemViewManager(conf, config, - (metaClient, viewConfig) -> createInMemoryFileSystemView(conf, viewConfig, metaClient)); + return new FileSystemViewManager(context, config, + (metaClient, viewConfig) -> createInMemoryFileSystemView(metadataConfig, viewConfig, metaClient, metadataSupplier)); case REMOTE_ONLY: LOG.info("Creating remote only table view"); - return new FileSystemViewManager(conf, config, (metaClient, viewConfig) -> createRemoteFileSystemView(conf, + return new FileSystemViewManager(context, config, (metaClient, viewConfig) -> createRemoteFileSystemView(conf, viewConfig, metaClient)); case REMOTE_FIRST: LOG.info("Creating remote first table view"); - return new FileSystemViewManager(conf, config, (metaClient, viewConfig) -> { + return new FileSystemViewManager(context, config, (metaClient, viewConfig) -> { RemoteHoodieTableFileSystemView remoteFileSystemView = createRemoteFileSystemView(conf, viewConfig, metaClient); SyncableFileSystemView secondaryView; switch (viewConfig.getSecondaryStorageType()) { case MEMORY: - secondaryView = createInMemoryFileSystemView(conf, viewConfig, metaClient); + secondaryView = createInMemoryFileSystemView(metadataConfig, viewConfig, metaClient, metadataSupplier); break; case EMBEDDED_KV_STORE: secondaryView = createRocksDBBasedFileSystemView(conf, viewConfig, metaClient); break; case SPILLABLE_DISK: - secondaryView = createSpillableMapBasedFileSystemView(conf, viewConfig, metaClient); + secondaryView = createSpillableMapBasedFileSystemView(conf, viewConfig, metaClient, commonConfig); break; default: throw new IllegalArgumentException("Secondary Storage type can only be in-memory or spillable. 
Was :" diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java index ff3a78f77f29d..92937f61e2c2c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java @@ -18,137 +18,228 @@ package org.apache.hudi.common.table.view; -import org.apache.hudi.common.config.DefaultHoodieConfig; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.util.ValidationUtils; import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.util.Arrays; import java.util.Properties; +import java.util.stream.Collectors; /** * File System View Storage Configurations. */ -public class FileSystemViewStorageConfig extends DefaultHoodieConfig { +@ConfigClassProperty(name = "File System View Storage Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Configurations that control how file metadata is stored by Hudi, for transaction processing and queries.") +public class FileSystemViewStorageConfig extends HoodieConfig { // Property Names - public static final String FILESYSTEM_VIEW_STORAGE_TYPE = "hoodie.filesystem.view.type"; - public static final String FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE = "hoodie.filesystem.view.incr.timeline.sync.enable"; - public static final String FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE = "hoodie.filesystem.view.secondary.type"; - public static final String FILESYSTEM_VIEW_REMOTE_HOST = "hoodie.filesystem.view.remote.host"; - public static final String FILESYSTEM_VIEW_REMOTE_PORT = "hoodie.filesystem.view.remote.port"; - public static final String FILESYSTEM_VIEW_SPILLABLE_DIR = "hoodie.filesystem.view.spillable.dir"; - public static final String FILESYSTEM_VIEW_SPILLABLE_MEM = "hoodie.filesystem.view.spillable.mem"; - public static final String FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION = - "hoodie.filesystem.view.spillable.compaction.mem.fraction"; - public static final String FILESYSTEM_VIEW_BOOTSTRAP_BASE_FILE_FRACTION = - "hoodie.filesystem.view.spillable.bootstrap.base.file.mem.fraction"; - public static final String FILESYSTEM_VIEW_REPLACED_MEM_FRACTION = - "hoodie.filesystem.view.spillable.replaced.mem.fraction"; - public static final String FILESYSTEM_VIEW_PENDING_CLUSTERING_MEM_FRACTION = - "hoodie.filesystem.view.spillable.clustering.mem.fraction"; - private static final String ROCKSDB_BASE_PATH_PROP = "hoodie.filesystem.view.rocksdb.base.path"; - public static final String FILESTYSTEM_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS = - "hoodie.filesystem.view.remote.timeout.secs"; - - - public static final FileSystemViewStorageType DEFAULT_VIEW_STORAGE_TYPE = FileSystemViewStorageType.MEMORY; - public static final FileSystemViewStorageType DEFAULT_SECONDARY_VIEW_STORAGE_TYPE = FileSystemViewStorageType.MEMORY; - public static final String DEFAULT_ROCKSDB_BASE_PATH = "/tmp/hoodie_timeline_rocksdb"; - - public static final String DEFAULT_FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE = "false"; - public static final String DEFUALT_REMOTE_VIEW_SERVER_HOST = "localhost"; - public static final Integer DEFAULT_REMOTE_VIEW_SERVER_PORT = 26754; - public static final Integer 
DEFAULT_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS = 5 * 60; // 5 min - public static final String DEFAULT_VIEW_SPILLABLE_DIR = "/tmp/view_map/"; - private static final Double DEFAULT_MEM_FRACTION_FOR_PENDING_COMPACTION = 0.01; - private static final Double DEFAULT_MEM_FRACTION_FOR_EXTERNAL_DATA_FILE = 0.05; - private static final Double DEFAULT_MEM_FRACTION_FOR_REPLACED_FILEGROUPS = 0.01; - private static final Double DEFAULT_MEM_FRACTION_FOR_PENDING_CLUSTERING_FILEGROUPS = 0.01; - private static final Long DEFAULT_MAX_MEMORY_FOR_VIEW = 100 * 1024 * 1024L; // 100 MB - - /** - * Configs to control whether backup needs to be configured if clients were not able to reach - * timeline service. - */ - public static final String REMOTE_BACKUP_VIEW_HANDLER_ENABLE = - "hoodie.filesystem.remote.backup.view.enable"; - // Need to be disabled only for tests. - public static final String DEFAULT_REMOTE_BACKUP_VIEW_HANDLER_ENABLE = "true"; + public static final ConfigProperty VIEW_TYPE = ConfigProperty + .key("hoodie.filesystem.view.type") + .defaultValue(FileSystemViewStorageType.MEMORY) + .withDocumentation("File system view provides APIs for viewing the files on the underlying lake storage, " + + " as file groups and file slices. This config controls how such a view is held. Options include " + + Arrays.stream(FileSystemViewStorageType.values()).map(Enum::name).collect(Collectors.joining(",")) + + " which provide different trade offs for memory usage and API request performance."); + + public static final ConfigProperty INCREMENTAL_TIMELINE_SYNC_ENABLE = ConfigProperty + .key("hoodie.filesystem.view.incr.timeline.sync.enable") + .defaultValue("false") + .withDocumentation("Controls whether or not, the file system view is incrementally updated as " + + "new actions are performed on the timeline."); + + public static final ConfigProperty SECONDARY_VIEW_TYPE = ConfigProperty + .key("hoodie.filesystem.view.secondary.type") + .defaultValue(FileSystemViewStorageType.MEMORY) + .withDocumentation("Specifies the secondary form of storage for file system view, if the primary (e.g timeline server) " + + " is unavailable."); + + public static final ConfigProperty REMOTE_HOST_NAME = ConfigProperty + .key("hoodie.filesystem.view.remote.host") + .defaultValue("localhost") + .withDocumentation("We expect this to be rarely hand configured."); + + public static final ConfigProperty REMOTE_PORT_NUM = ConfigProperty + .key("hoodie.filesystem.view.remote.port") + .defaultValue(26754) + .withDocumentation("Port to serve file system view queries, when remote. 
We expect this to be rarely hand configured."); + + public static final ConfigProperty SPILLABLE_DIR = ConfigProperty + .key("hoodie.filesystem.view.spillable.dir") + .defaultValue("/tmp/") + .withDocumentation("Path on local storage to use, when file system view is held in a spillable map."); + + public static final ConfigProperty SPILLABLE_MEMORY = ConfigProperty + .key("hoodie.filesystem.view.spillable.mem") + .defaultValue(100 * 1024 * 1024L) // 100 MB + .withDocumentation("Amount of memory to be used in bytes for holding file system view, before spilling to disk."); + + public static final ConfigProperty SPILLABLE_COMPACTION_MEM_FRACTION = ConfigProperty + .key("hoodie.filesystem.view.spillable.compaction.mem.fraction") + .defaultValue(0.8) + .withDocumentation("Fraction of the file system view memory, to be used for holding compaction related metadata."); + + public static final ConfigProperty BOOTSTRAP_BASE_FILE_MEM_FRACTION = ConfigProperty + .key("hoodie.filesystem.view.spillable.bootstrap.base.file.mem.fraction") + .defaultValue(0.05) + .withDocumentation("Fraction of the file system view memory, to be used for holding mapping to bootstrap base files."); + + public static final ConfigProperty SPILLABLE_REPLACED_MEM_FRACTION = ConfigProperty + .key("hoodie.filesystem.view.spillable.replaced.mem.fraction") + .defaultValue(0.01) + .withDocumentation("Fraction of the file system view memory, to be used for holding replace commit related metadata."); + + public static final ConfigProperty SPILLABLE_CLUSTERING_MEM_FRACTION = ConfigProperty + .key("hoodie.filesystem.view.spillable.clustering.mem.fraction") + .defaultValue(0.01) + .withDocumentation("Fraction of the file system view memory, to be used for holding clustering related metadata."); + + public static final ConfigProperty ROCKSDB_BASE_PATH = ConfigProperty + .key("hoodie.filesystem.view.rocksdb.base.path") + .defaultValue("/tmp/hoodie_timeline_rocksdb") + .withDocumentation("Path on local storage to use, when storing file system view in embedded kv store/rocksdb."); + + public static final ConfigProperty REMOTE_TIMEOUT_SECS = ConfigProperty + .key("hoodie.filesystem.view.remote.timeout.secs") + .defaultValue(5 * 60) // 5 min + .withDocumentation("Timeout in seconds, to wait for API requests against a remote file system view. e.g timeline server."); + + public static final ConfigProperty REMOTE_RETRY_ENABLE = ConfigProperty + .key("hoodie.filesystem.view.remote.retry.enable") + .defaultValue("false") + .sinceVersion("0.12.1") + .withDocumentation("Whether to enable API request retry for remote file system view."); + + public static final ConfigProperty REMOTE_MAX_RETRY_NUMBERS = ConfigProperty + .key("hoodie.filesystem.view.remote.retry.max_numbers") + .defaultValue(3) // 3 times + .sinceVersion("0.12.1") + .withDocumentation("Maximum number of retry for API requests against a remote file system view. 
e.g timeline server."); + + public static final ConfigProperty REMOTE_INITIAL_RETRY_INTERVAL_MS = ConfigProperty + .key("hoodie.filesystem.view.remote.retry.initial_interval_ms") + .defaultValue(100L) + .sinceVersion("0.12.1") + .withDocumentation("Amount of time (in ms) to wait, before retry to do operations on storage."); + + public static final ConfigProperty REMOTE_MAX_RETRY_INTERVAL_MS = ConfigProperty + .key("hoodie.filesystem.view.remote.retry.max_interval_ms") + .defaultValue(2000L) + .sinceVersion("0.12.1") + .withDocumentation("Maximum amount of time (in ms), to wait for next retry."); + + public static final ConfigProperty RETRY_EXCEPTIONS = ConfigProperty + .key("hoodie.filesystem.view.remote.retry.exceptions") + .defaultValue("") + .sinceVersion("0.12.1") + .withDocumentation("The class name of the Exception that needs to be re-tryed, separated by commas. " + + "Default is empty which means retry all the IOException and RuntimeException from Remote Request."); + + public static final ConfigProperty REMOTE_BACKUP_VIEW_ENABLE = ConfigProperty + .key("hoodie.filesystem.remote.backup.view.enable") + .defaultValue("true") // Need to be disabled only for tests. + .withDocumentation("Config to control whether backup needs to be configured if clients were not able to reach" + + " timeline service."); public static FileSystemViewStorageConfig.Builder newBuilder() { return new Builder(); } - private FileSystemViewStorageConfig(Properties props) { - super(props); + private FileSystemViewStorageConfig() { + super(); } public FileSystemViewStorageType getStorageType() { - return FileSystemViewStorageType.valueOf(props.getProperty(FILESYSTEM_VIEW_STORAGE_TYPE)); + return FileSystemViewStorageType.valueOf(getString(VIEW_TYPE)); } public boolean isIncrementalTimelineSyncEnabled() { - return Boolean.parseBoolean(props.getProperty(FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE)); + return getBoolean(INCREMENTAL_TIMELINE_SYNC_ENABLE); } public String getRemoteViewServerHost() { - return props.getProperty(FILESYSTEM_VIEW_REMOTE_HOST); + return getString(REMOTE_HOST_NAME); } public Integer getRemoteViewServerPort() { - return Integer.parseInt(props.getProperty(FILESYSTEM_VIEW_REMOTE_PORT)); + return getInt(REMOTE_PORT_NUM); } public Integer getRemoteTimelineClientTimeoutSecs() { - return Integer.parseInt(props.getProperty(FILESTYSTEM_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS)); + return getInt(REMOTE_TIMEOUT_SECS); + } + + public boolean isRemoteTimelineClientRetryEnabled() { + return getBoolean(REMOTE_RETRY_ENABLE); + } + + public Integer getRemoteTimelineClientMaxRetryNumbers() { + return getInt(REMOTE_MAX_RETRY_NUMBERS); + } + + public Long getRemoteTimelineInitialRetryIntervalMs() { + return getLong(REMOTE_INITIAL_RETRY_INTERVAL_MS); + } + + public Long getRemoteTimelineClientMaxRetryIntervalMs() { + return getLong(REMOTE_MAX_RETRY_INTERVAL_MS); + } + + public String getRemoteTimelineClientRetryExceptions() { + return getString(RETRY_EXCEPTIONS); } public long getMaxMemoryForFileGroupMap() { - long totalMemory = Long.parseLong(props.getProperty(FILESYSTEM_VIEW_SPILLABLE_MEM)); + long totalMemory = getLong(SPILLABLE_MEMORY); return totalMemory - getMaxMemoryForPendingCompaction() - getMaxMemoryForBootstrapBaseFile(); } public long getMaxMemoryForPendingCompaction() { - long totalMemory = Long.parseLong(props.getProperty(FILESYSTEM_VIEW_SPILLABLE_MEM)); - return new Double(totalMemory * Double.parseDouble(props.getProperty(FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION))) + long totalMemory = 
getLong(SPILLABLE_MEMORY); + return new Double(totalMemory * getDouble(SPILLABLE_COMPACTION_MEM_FRACTION)) .longValue(); } public long getMaxMemoryForBootstrapBaseFile() { - long totalMemory = Long.parseLong(props.getProperty(FILESYSTEM_VIEW_SPILLABLE_MEM)); + long totalMemory = getLong(SPILLABLE_MEMORY); long reservedForExternalDataFile = - new Double(totalMemory * Double.parseDouble(props.getProperty(FILESYSTEM_VIEW_BOOTSTRAP_BASE_FILE_FRACTION))) + new Double(totalMemory * getDouble(BOOTSTRAP_BASE_FILE_MEM_FRACTION)) .longValue(); return reservedForExternalDataFile; } public long getMaxMemoryForReplacedFileGroups() { - long totalMemory = Long.parseLong(props.getProperty(FILESYSTEM_VIEW_SPILLABLE_MEM)); - return new Double(totalMemory * Double.parseDouble(props.getProperty(FILESYSTEM_VIEW_REPLACED_MEM_FRACTION))) + long totalMemory = getLong(SPILLABLE_MEMORY); + return new Double(totalMemory * getDouble(SPILLABLE_REPLACED_MEM_FRACTION)) .longValue(); } public long getMaxMemoryForPendingClusteringFileGroups() { - long totalMemory = Long.parseLong(props.getProperty(FILESYSTEM_VIEW_SPILLABLE_MEM)); - return new Double(totalMemory * Double.parseDouble(props.getProperty(FILESYSTEM_VIEW_PENDING_CLUSTERING_MEM_FRACTION))) + long totalMemory = getLong(SPILLABLE_MEMORY); + return new Double(totalMemory * getDouble(SPILLABLE_CLUSTERING_MEM_FRACTION)) .longValue(); } - public String getBaseStoreDir() { - return props.getProperty(FILESYSTEM_VIEW_SPILLABLE_DIR); + public String getSpillableDir() { + return getString(SPILLABLE_DIR); } public FileSystemViewStorageType getSecondaryStorageType() { - return FileSystemViewStorageType.valueOf(props.getProperty(FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE)); + return FileSystemViewStorageType.valueOf(getString(SECONDARY_VIEW_TYPE)); } public boolean shouldEnableBackupForRemoteFileSystemView() { - return Boolean.parseBoolean(props.getProperty(REMOTE_BACKUP_VIEW_HANDLER_ENABLE)); + return getBoolean(REMOTE_BACKUP_VIEW_ENABLE); } public String getRocksdbBasePath() { - return props.getProperty(ROCKSDB_BASE_PATH_PROP); + return getString(ROCKSDB_BASE_PATH); } /** @@ -156,119 +247,243 @@ public String getRocksdbBasePath() { */ public static class Builder { - private final Properties props = new Properties(); + private final FileSystemViewStorageConfig fileSystemViewStorageConfig = new FileSystemViewStorageConfig(); public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { - props.load(reader); + fileSystemViewStorageConfig.getProps().load(reader); return this; } } public Builder fromProperties(Properties props) { - this.props.putAll(props); + this.fileSystemViewStorageConfig.getProps().putAll(props); return this; } public Builder withStorageType(FileSystemViewStorageType storageType) { - props.setProperty(FILESYSTEM_VIEW_STORAGE_TYPE, storageType.name()); + fileSystemViewStorageConfig.setValue(VIEW_TYPE, storageType.name()); return this; } public Builder withSecondaryStorageType(FileSystemViewStorageType storageType) { - props.setProperty(FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE, storageType.name()); + fileSystemViewStorageConfig.setValue(SECONDARY_VIEW_TYPE, storageType.name()); return this; } public Builder withIncrementalTimelineSync(boolean enableIncrTimelineSync) { - props.setProperty(FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE, Boolean.toString(enableIncrTimelineSync)); + fileSystemViewStorageConfig.setValue(INCREMENTAL_TIMELINE_SYNC_ENABLE, Boolean.toString(enableIncrTimelineSync)); return this; } public 
Builder withRemoteServerHost(String remoteServerHost) { - props.setProperty(FILESYSTEM_VIEW_REMOTE_HOST, remoteServerHost); + fileSystemViewStorageConfig.setValue(REMOTE_HOST_NAME, remoteServerHost); return this; } public Builder withRemoteServerPort(Integer remoteServerPort) { - props.setProperty(FILESYSTEM_VIEW_REMOTE_PORT, remoteServerPort.toString()); + fileSystemViewStorageConfig.setValue(REMOTE_PORT_NUM, remoteServerPort.toString()); return this; } public Builder withMaxMemoryForView(Long maxMemoryForView) { - props.setProperty(FILESYSTEM_VIEW_SPILLABLE_MEM, maxMemoryForView.toString()); + fileSystemViewStorageConfig.setValue(SPILLABLE_MEMORY, maxMemoryForView.toString()); + return this; + } + + public Builder withRemoteTimelineClientTimeoutSecs(Integer timelineClientTimeoutSecs) { + fileSystemViewStorageConfig.setValue(REMOTE_TIMEOUT_SECS, timelineClientTimeoutSecs.toString()); return this; } - public Builder withRemoteTimelineClientTimeoutSecs(Long timelineClientTimeoutSecs) { - props.setProperty(FILESTYSTEM_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS, timelineClientTimeoutSecs.toString()); + public Builder withRemoteTimelineClientRetry(boolean enableRetry) { + fileSystemViewStorageConfig.setValue(REMOTE_RETRY_ENABLE, Boolean.toString(enableRetry)); + return this; + } + + public Builder withRemoteTimelineClientMaxRetryNumbers(Integer maxRetryNumbers) { + fileSystemViewStorageConfig.setValue(REMOTE_MAX_RETRY_NUMBERS, maxRetryNumbers.toString()); + return this; + } + + public Builder withRemoteTimelineInitialRetryIntervalMs(Long initialRetryIntervalMs) { + fileSystemViewStorageConfig.setValue(REMOTE_INITIAL_RETRY_INTERVAL_MS, initialRetryIntervalMs.toString()); + return this; + } + + public Builder withRemoteTimelineClientMaxRetryIntervalMs(Long maxRetryIntervalMs) { + fileSystemViewStorageConfig.setValue(REMOTE_MAX_RETRY_INTERVAL_MS, maxRetryIntervalMs.toString()); + return this; + } + + public Builder withRemoteTimelineClientRetryExceptions(String retryExceptions) { + fileSystemViewStorageConfig.setValue(RETRY_EXCEPTIONS, retryExceptions); return this; } public Builder withMemFractionForPendingCompaction(Double memFractionForPendingCompaction) { - props.setProperty(FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION, memFractionForPendingCompaction.toString()); + fileSystemViewStorageConfig.setValue(SPILLABLE_COMPACTION_MEM_FRACTION, memFractionForPendingCompaction.toString()); return this; } public Builder withMemFractionForExternalDataFile(Double memFractionForExternalDataFile) { - props.setProperty(FILESYSTEM_VIEW_BOOTSTRAP_BASE_FILE_FRACTION, memFractionForExternalDataFile.toString()); + fileSystemViewStorageConfig.setValue(BOOTSTRAP_BASE_FILE_MEM_FRACTION, memFractionForExternalDataFile.toString()); return this; } public Builder withBaseStoreDir(String baseStorePath) { - props.setProperty(FILESYSTEM_VIEW_SPILLABLE_DIR, baseStorePath); + fileSystemViewStorageConfig.setValue(SPILLABLE_DIR, baseStorePath); return this; } public Builder withRocksDBPath(String basePath) { - props.setProperty(ROCKSDB_BASE_PATH_PROP, basePath); + fileSystemViewStorageConfig.setValue(ROCKSDB_BASE_PATH, basePath); return this; } public Builder withEnableBackupForRemoteFileSystemView(boolean enable) { - props.setProperty(REMOTE_BACKUP_VIEW_HANDLER_ENABLE, Boolean.toString(enable)); + fileSystemViewStorageConfig.setValue(REMOTE_BACKUP_VIEW_ENABLE, Boolean.toString(enable)); return this; } public FileSystemViewStorageConfig build() { - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_STORAGE_TYPE), 
FILESYSTEM_VIEW_STORAGE_TYPE, - DEFAULT_VIEW_STORAGE_TYPE.name()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE), - FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE, DEFAULT_FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE), - FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE, DEFAULT_SECONDARY_VIEW_STORAGE_TYPE.name()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_REMOTE_HOST), FILESYSTEM_VIEW_REMOTE_HOST, - DEFUALT_REMOTE_VIEW_SERVER_HOST); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_REMOTE_PORT), FILESYSTEM_VIEW_REMOTE_PORT, - DEFAULT_REMOTE_VIEW_SERVER_PORT.toString()); - - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_SPILLABLE_DIR), FILESYSTEM_VIEW_SPILLABLE_DIR, - DEFAULT_VIEW_SPILLABLE_DIR); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_SPILLABLE_MEM), FILESYSTEM_VIEW_SPILLABLE_MEM, - DEFAULT_MAX_MEMORY_FOR_VIEW.toString()); - setDefaultOnCondition(props, !props.containsKey(FILESTYSTEM_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS), - FILESTYSTEM_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS, DEFAULT_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS.toString()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION), - FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION, DEFAULT_MEM_FRACTION_FOR_PENDING_COMPACTION.toString()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_BOOTSTRAP_BASE_FILE_FRACTION), - FILESYSTEM_VIEW_BOOTSTRAP_BASE_FILE_FRACTION, DEFAULT_MEM_FRACTION_FOR_EXTERNAL_DATA_FILE.toString()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_REPLACED_MEM_FRACTION), - FILESYSTEM_VIEW_REPLACED_MEM_FRACTION, DEFAULT_MEM_FRACTION_FOR_REPLACED_FILEGROUPS.toString()); - setDefaultOnCondition(props, !props.containsKey(FILESYSTEM_VIEW_PENDING_CLUSTERING_MEM_FRACTION), - FILESYSTEM_VIEW_PENDING_CLUSTERING_MEM_FRACTION, DEFAULT_MEM_FRACTION_FOR_PENDING_CLUSTERING_FILEGROUPS.toString()); - - setDefaultOnCondition(props, !props.containsKey(ROCKSDB_BASE_PATH_PROP), ROCKSDB_BASE_PATH_PROP, - DEFAULT_ROCKSDB_BASE_PATH); - - setDefaultOnCondition(props, !props.containsKey(REMOTE_BACKUP_VIEW_HANDLER_ENABLE), - REMOTE_BACKUP_VIEW_HANDLER_ENABLE, DEFAULT_REMOTE_BACKUP_VIEW_HANDLER_ENABLE); - + fileSystemViewStorageConfig.setDefaults(FileSystemViewStorageConfig.class.getName()); // Validations - FileSystemViewStorageType.valueOf(props.getProperty(FILESYSTEM_VIEW_STORAGE_TYPE)); - FileSystemViewStorageType.valueOf(props.getProperty(FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE)); - ValidationUtils.checkArgument(Integer.parseInt(props.getProperty(FILESYSTEM_VIEW_REMOTE_PORT)) > 0); - return new FileSystemViewStorageConfig(props); + FileSystemViewStorageType.valueOf(fileSystemViewStorageConfig.getString(VIEW_TYPE)); + FileSystemViewStorageType.valueOf(fileSystemViewStorageConfig.getString(SECONDARY_VIEW_TYPE)); + ValidationUtils.checkArgument(fileSystemViewStorageConfig.getInt(REMOTE_PORT_NUM) > 0); + return fileSystemViewStorageConfig; } } + /** + * @deprecated Use {@link #VIEW_TYPE} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_STORAGE_TYPE = VIEW_TYPE.key(); + /** + * @deprecated Use {@link #VIEW_TYPE} and its methods. + */ + @Deprecated + public static final FileSystemViewStorageType DEFAULT_VIEW_STORAGE_TYPE = VIEW_TYPE.defaultValue(); + /** + * @deprecated Use {@link #INCREMENTAL_TIMELINE_SYNC_ENABLE} and its methods. 
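// Hedged usage sketch of the rewritten ConfigProperty-based builder; every builder method and
// getter used below appears in this patch, but the surrounding class/package wiring assumes
// hudi-common on the classpath and may differ slightly from the released API.
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.FileSystemViewStorageType;

public class ViewStorageConfigExample {
  public static void main(String[] args) {
    FileSystemViewStorageConfig viewConfig = FileSystemViewStorageConfig.newBuilder()
        .withStorageType(FileSystemViewStorageType.REMOTE_FIRST)     // prefer the timeline server
        .withSecondaryStorageType(FileSystemViewStorageType.MEMORY)  // fall back to in-memory
        .withRemoteServerHost("localhost")
        .withRemoteServerPort(26754)
        .withRemoteTimelineClientTimeoutSecs(300)
        .withRemoteTimelineClientRetry(true)                         // retry options added in 0.12.1 per sinceVersion above
        .withRemoteTimelineClientMaxRetryNumbers(3)
        .withRemoteTimelineInitialRetryIntervalMs(100L)
        .withRemoteTimelineClientMaxRetryIntervalMs(2000L)
        .build();

    System.out.println(viewConfig.getStorageType() + ", retry=" + viewConfig.isRemoteTimelineClientRetryEnabled());
  }
}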
+ */ + @Deprecated + public static final String FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE = INCREMENTAL_TIMELINE_SYNC_ENABLE.key(); + /** + * @deprecated Use {@link #INCREMENTAL_TIMELINE_SYNC_ENABLE} and its methods. + */ + @Deprecated + public static final String DEFAULT_FILESYSTEM_VIEW_INCREMENTAL_SYNC_MODE = INCREMENTAL_TIMELINE_SYNC_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #SECONDARY_VIEW_TYPE} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_SECONDARY_VIEW_STORAGE_TYPE = SECONDARY_VIEW_TYPE.key(); + /** + * @deprecated Use {@link #SECONDARY_VIEW_TYPE} and its methods. + */ + @Deprecated + public static final FileSystemViewStorageType DEFAULT_SECONDARY_VIEW_STORAGE_TYPE = SECONDARY_VIEW_TYPE.defaultValue(); + /** + * @deprecated Use {@link #REMOTE_HOST_NAME} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_REMOTE_HOST = REMOTE_HOST_NAME.key(); + /** + * @deprecated Use {@link #REMOTE_HOST_NAME} and its methods. + */ + @Deprecated + public static final String DEFUALT_REMOTE_VIEW_SERVER_HOST = REMOTE_HOST_NAME.defaultValue(); + /** + * @deprecated Use {@link #REMOTE_PORT_NUM} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_REMOTE_PORT = REMOTE_PORT_NUM.key(); + /** + * @deprecated Use {@link #REMOTE_PORT_NUM} and its methods. + */ + @Deprecated + public static final Integer DEFAULT_REMOTE_VIEW_SERVER_PORT = REMOTE_PORT_NUM.defaultValue(); + /** + * @deprecated Use {@link #SPILLABLE_DIR} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_SPILLABLE_DIR = SPILLABLE_DIR.key(); + /** + * @deprecated Use {@link #SPILLABLE_DIR} and its methods. + */ + @Deprecated + public static final String DEFAULT_VIEW_SPILLABLE_DIR = SPILLABLE_DIR.defaultValue(); + /** + * @deprecated Use {@link #SPILLABLE_MEMORY} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_SPILLABLE_MEM = SPILLABLE_MEMORY.key(); + /** + * @deprecated Use {@link #SPILLABLE_MEMORY} and its methods. + */ + @Deprecated + private static final Long DEFAULT_MAX_MEMORY_FOR_VIEW = SPILLABLE_MEMORY.defaultValue(); + /** + * @deprecated Use {@link #SPILLABLE_COMPACTION_MEM_FRACTION} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_PENDING_COMPACTION_MEM_FRACTION = SPILLABLE_COMPACTION_MEM_FRACTION.key(); + /** + * @deprecated Use {@link #SPILLABLE_COMPACTION_MEM_FRACTION} and its methods. + */ + @Deprecated + private static final Double DEFAULT_MEM_FRACTION_FOR_PENDING_COMPACTION = SPILLABLE_COMPACTION_MEM_FRACTION.defaultValue(); + /** + * @deprecated Use {@link #BOOTSTRAP_BASE_FILE_MEM_FRACTION} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_BOOTSTRAP_BASE_FILE_FRACTION = BOOTSTRAP_BASE_FILE_MEM_FRACTION.key(); + /** + * @deprecated Use {@link #SPILLABLE_REPLACED_MEM_FRACTION} and its methods. + */ + @Deprecated + public static final String FILESYSTEM_VIEW_REPLACED_MEM_FRACTION = SPILLABLE_REPLACED_MEM_FRACTION.key(); + /** + * @deprecated Use {@link #SPILLABLE_REPLACED_MEM_FRACTION} and its methods. + */ + @Deprecated + private static final Double DEFAULT_MEM_FRACTION_FOR_REPLACED_FILEGROUPS = SPILLABLE_REPLACED_MEM_FRACTION.defaultValue(); + /** + * @deprecated Use {@link #SPILLABLE_CLUSTERING_MEM_FRACTION} and its methods. 
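// The getMaxMemoryFor* accessors earlier in this file carve the single spillable budget
// (hoodie.filesystem.view.spillable.mem) into slices using the fraction configs. A small
// arithmetic sketch with the default values from this patch (100 MB total, 0.8 for pending
// compaction, 0.05 for bootstrap base files); the class name here is illustrative only.
public class SpillableMemoryBudgetSketch {
  public static void main(String[] args) {
    long totalBytes = 100 * 1024 * 1024L;                 // SPILLABLE_MEMORY default
    long pendingCompaction = (long) (totalBytes * 0.8);   // SPILLABLE_COMPACTION_MEM_FRACTION default
    long bootstrapBaseFiles = (long) (totalBytes * 0.05); // BOOTSTRAP_BASE_FILE_MEM_FRACTION default

    // Mirrors getMaxMemoryForFileGroupMap(): whatever remains after the reserved slices.
    long fileGroupMapBudget = totalBytes - pendingCompaction - bootstrapBaseFiles;

    System.out.println("file group map budget = " + fileGroupMapBudget + " bytes"); // 15728640 (15 MB)
  }
}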
+ */ + @Deprecated + public static final String FILESYSTEM_VIEW_PENDING_CLUSTERING_MEM_FRACTION = SPILLABLE_CLUSTERING_MEM_FRACTION.key(); + /** + * @deprecated Use {@link #SPILLABLE_CLUSTERING_MEM_FRACTION} and its methods. + */ + @Deprecated + private static final Double DEFAULT_MEM_FRACTION_FOR_PENDING_CLUSTERING_FILEGROUPS = SPILLABLE_CLUSTERING_MEM_FRACTION.defaultValue(); + /** + * @deprecated Use {@link #ROCKSDB_BASE_PATH} and its methods. + */ + @Deprecated + private static final String ROCKSDB_BASE_PATH_PROP = ROCKSDB_BASE_PATH.key(); + /** + * @deprecated Use {@link #ROCKSDB_BASE_PATH} and its methods. + */ + @Deprecated + public static final String DEFAULT_ROCKSDB_BASE_PATH = ROCKSDB_BASE_PATH.defaultValue(); + /** + * @deprecated Use {@link #REMOTE_TIMEOUT_SECS} and its methods. + */ + @Deprecated + public static final String FILESTYSTEM_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS = REMOTE_TIMEOUT_SECS.key(); + /** + * @deprecated Use {@link #REMOTE_TIMEOUT_SECS} and its methods. + */ + @Deprecated + public static final Integer DEFAULT_REMOTE_TIMELINE_CLIENT_TIMEOUT_SECS = REMOTE_TIMEOUT_SECS.defaultValue(); + /** + * @deprecated Use {@link #BOOTSTRAP_BASE_FILE_MEM_FRACTION} and its methods. + */ + @Deprecated + private static final Double DEFAULT_MEM_FRACTION_FOR_EXTERNAL_DATA_FILE = BOOTSTRAP_BASE_FILE_MEM_FRACTION.defaultValue(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java index f0c095f598b00..9dac36081384b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTableFileSystemView.java @@ -18,9 +18,9 @@ package org.apache.hudi.common.table.view; -import org.apache.hadoop.fs.FileStatus; import org.apache.hudi.common.model.BootstrapBaseFileMapping; import org.apache.hudi.common.model.CompactionOperation; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -29,6 +29,8 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; + +import org.apache.hadoop.fs.FileStatus; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -114,11 +116,22 @@ public void init(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveT @Override protected void resetViewState() { - this.fgIdToPendingCompaction = null; - this.partitionToFileGroupsMap = null; - this.fgIdToBootstrapBaseFile = null; - this.fgIdToReplaceInstants = null; - this.fgIdToPendingClustering = null; + // do not nullify the members to avoid NPE. + + // there are two cases that #resetViewState is called: + // 1. when #sync is invoked, the view clear the state through calling #resetViewState, + // then re-initialize the view; + // 2. when #close is invoked. + // (see AbstractTableFileSystemView for details.) 
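// A compact stand-in for the two call paths enumerated in the comment above: sync() clears
// state and immediately re-initializes it, keeping the maps non-null for any concurrent
// readers, while close() is the only terminal path that releases them. The View class below
// is a hypothetical simplification, not the actual AbstractTableFileSystemView API.
import java.util.HashMap;
import java.util.Map;

public class ViewLifecycleSketch {

  static class View {
    private Map<String, String> partitionToFileGroups = new HashMap<>();

    void init() {
      partitionToFileGroups.put("2023/01/01", "file-group-1");
    }

    void sync() {
      // Case 1: clear and rebuild; never leave fields null while readers may still call in.
      partitionToFileGroups.clear();
      init();
    }

    void close() {
      // Case 2: terminal; releasing the reference is safe because no further calls follow.
      partitionToFileGroups = null;
    }
  }

  public static void main(String[] args) {
    View view = new View();
    view.init();
    view.sync();
    System.out.println("after sync: " + view.partitionToFileGroups);  // still usable
    view.close();
    System.out.println("after close: " + view.partitionToFileGroups); // null, view is done
  }
}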
+ + // for the 1st case, we better do not nullify the members when #resetViewState + // because there is possibility that this in-memory view is a backend view under TimelineServer, + // and many methods in the RequestHandler is not thread safe, when performRefreshCheck flag in ViewHandler + // is set as false, the view does not perform refresh check, if #sync is called just before and the members + // are nullified, the methods that use these members would throw NPE. + + // actually there is no need to nullify the members here for 1st case, the members are assigned with new values + // when calling #init, for 2nd case, the #close method already nullify the members. } protected Map> createPartitionToFileGroups() { @@ -317,7 +330,7 @@ protected boolean isPartitionAvailableInStore(String partitionPath) { @Override protected void storePartitionView(String partitionPath, List fileGroups) { - LOG.info("Adding file-groups for partition :" + partitionPath + ", #FileGroups=" + fileGroups.size()); + LOG.debug("Adding file-groups for partition :" + partitionPath + ", #FileGroups=" + fileGroups.size()); List newList = new ArrayList<>(fileGroups); partitionToFileGroupsMap.put(partitionPath, newList); } @@ -347,14 +360,28 @@ protected Option getReplaceInstant(final HoodieFileGroupId fileGr return Option.ofNullable(fgIdToReplaceInstants.get(fileGroupId)); } + /** + * Get the latest file slices for a given partition including the inflight ones. + * + * @param partitionPath + * @return Stream of latest {@link FileSlice} in the partition path. + */ + public Stream fetchLatestFileSlicesIncludingInflight(String partitionPath) { + return fetchAllStoredFileGroups(partitionPath) + .map(HoodieFileGroup::getLatestFileSlicesIncludingInflight) + .filter(Option::isPresent) + .map(Option::get); + } + @Override public void close() { + super.close(); + this.fgIdToPendingCompaction = null; + this.partitionToFileGroupsMap = null; + this.fgIdToBootstrapBaseFile = null; + this.fgIdToReplaceInstants = null; + this.fgIdToPendingClustering = null; closed = true; - super.reset(); - partitionToFileGroupsMap = null; - fgIdToPendingCompaction = null; - fgIdToBootstrapBaseFile = null; - fgIdToReplaceInstants = null; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTablePreCommitFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTablePreCommitFileSystemView.java new file mode 100644 index 0000000000000..7401617a6abb6 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTablePreCommitFileSystemView.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.table.view; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Create PreCommitFileSystemView by only filtering instants that are of interest. + * For example, we want to exclude + * other inflight instants. This is achieved by combining + * 1) FileSystemView with completed commits + * 2) Using list of files written/replaced by inflight instant that we are validating + * + */ +public class HoodieTablePreCommitFileSystemView { + + private Map> partitionToReplaceFileIds; + private List filesWritten; + private String preCommitInstantTime; + private SyncableFileSystemView completedCommitsFileSystemView; + private HoodieTableMetaClient tableMetaClient; + + /** + * Create a file system view for the inflight commit that we are validating. + */ + public HoodieTablePreCommitFileSystemView(HoodieTableMetaClient metaClient, + SyncableFileSystemView completedCommitsFileSystemView, + List filesWritten, + Map> partitionToReplaceFileIds, + String instantTime) { + this.completedCommitsFileSystemView = completedCommitsFileSystemView; + this.filesWritten = filesWritten; + this.partitionToReplaceFileIds = partitionToReplaceFileIds; + this.preCommitInstantTime = instantTime; + this.tableMetaClient = metaClient; + } + + /** + * Combine committed base files + new files created/replaced for given partition. + */ + public final Stream getLatestBaseFiles(String partitionStr) { + // get fileIds replaced by current inflight commit + List replacedFileIdsForPartition = partitionToReplaceFileIds.getOrDefault(partitionStr, Collections.emptyList()); + + // get new files written by current inflight commit + Map newFilesWrittenForPartition = filesWritten.stream() + .filter(file -> partitionStr.equals(file.getPartitionPath())) + .collect(Collectors.toMap(HoodieWriteStat::getFileId, writeStat -> + new HoodieBaseFile(new Path(tableMetaClient.getBasePath(), writeStat.getPath()).toString()))); + + Stream committedBaseFiles = this.completedCommitsFileSystemView.getLatestBaseFiles(partitionStr); + Map allFileIds = committedBaseFiles + // Remove files replaced by current inflight commit + .filter(baseFile -> !replacedFileIdsForPartition.contains(baseFile.getFileId())) + .collect(Collectors.toMap(HoodieBaseFile::getFileId, baseFile -> baseFile)); + + allFileIds.putAll(newFilesWrittenForPartition); + return allFileIds.values().stream(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java index f7244eefdf9d4..ff44c7cef017b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java @@ -32,6 +32,8 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; +import org.apache.http.HttpStatus; +import org.apache.http.client.HttpResponseException; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -65,7 +67,7 @@ private R execute(Function0 preferredFunction, Function0 secondaryFunc try { return preferredFunction.apply(); } catch 
(RuntimeException re) { - LOG.error("Got error running preferred function. Trying secondary", re); + handleRuntimeException(re); errorOnPreferredView = true; return secondaryFunction.apply(); } @@ -80,7 +82,7 @@ private R execute(T1 val, Function1 preferredFunction, Function1< try { return preferredFunction.apply(val); } catch (RuntimeException re) { - LOG.error("Got error running preferred function. Trying secondary", re); + handleRuntimeException(re); errorOnPreferredView = true; return secondaryFunction.apply(val); } @@ -96,7 +98,7 @@ private R execute(T1 val, T2 val2, Function2 preferredFun try { return preferredFunction.apply(val, val2); } catch (RuntimeException re) { - LOG.error("Got error running preferred function. Trying secondary", re); + handleRuntimeException(re); errorOnPreferredView = true; return secondaryFunction.apply(val, val2); } @@ -112,13 +114,21 @@ private R execute(T1 val, T2 val2, T3 val3, Function3 getLatestBaseFiles(String partitionPath) { return execute(partitionPath, preferredView::getLatestBaseFiles, secondaryView::getLatestBaseFiles); @@ -199,6 +209,16 @@ public Stream getReplacedFileGroupsBeforeOrOn(String maxCommitT return execute(maxCommitTime, partitionPath, preferredView::getReplacedFileGroupsBeforeOrOn, secondaryView::getReplacedFileGroupsBeforeOrOn); } + @Override + public Stream getReplacedFileGroupsBefore(String maxCommitTime, String partitionPath) { + return execute(maxCommitTime, partitionPath, preferredView::getReplacedFileGroupsBefore, secondaryView::getReplacedFileGroupsBefore); + } + + @Override + public Stream getAllReplacedFileGroups(String partitionPath) { + return execute(partitionPath, preferredView::getAllReplacedFileGroups, secondaryView::getAllReplacedFileGroups); + } + @Override public Stream> getPendingCompactionOperations() { return execute(preferredView::getPendingCompactionOperations, secondaryView::getPendingCompactionOperations); @@ -219,6 +239,7 @@ public void close() { public void reset() { preferredView.reset(); secondaryView.reset(); + errorOnPreferredView = false; } @Override @@ -233,8 +254,9 @@ public HoodieTimeline getTimeline() { @Override public void sync() { - preferredView.reset(); - secondaryView.reset(); + preferredView.sync(); + secondaryView.sync(); + errorOnPreferredView = false; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java index 91a28a861fada..bd18ba22a25d6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java @@ -18,11 +18,6 @@ package org.apache.hudi.common.table.view; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.http.client.fluent.Request; -import org.apache.http.client.fluent.Response; -import org.apache.http.client.utils.URIBuilder; import org.apache.hudi.common.model.CompactionOperation; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; @@ -39,10 +34,18 @@ import org.apache.hudi.common.table.timeline.dto.InstantDTO; import org.apache.hudi.common.table.timeline.dto.TimelineDTO; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.RetryHelper; import org.apache.hudi.common.util.StringUtils; import 
org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieRemoteException; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.http.Consts; +import org.apache.http.client.fluent.Request; +import org.apache.http.client.fluent.Response; +import org.apache.http.client.utils.URIBuilder; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -91,6 +94,12 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, public static final String ALL_REPLACED_FILEGROUPS_BEFORE_OR_ON = String.format("%s/%s", BASE_URL, "filegroups/replaced/beforeoron/"); + public static final String ALL_REPLACED_FILEGROUPS_BEFORE = + String.format("%s/%s", BASE_URL, "filegroups/replaced/before/"); + + public static final String ALL_REPLACED_FILEGROUPS_PARTITION = + String.format("%s/%s", BASE_URL, "filegroups/replaced/partition/"); + public static final String PENDING_CLUSTERING_FILEGROUPS = String.format("%s/%s", BASE_URL, "clustering/pending/"); @@ -120,28 +129,38 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, private final int serverPort; private final String basePath; private final HoodieTableMetaClient metaClient; - private final HoodieTimeline timeline; + private HoodieTimeline timeline; private final ObjectMapper mapper; - private final int timeoutSecs; + private final int timeoutMs; private boolean closed = false; + private RetryHelper retryHelper; + private enum RequestMethod { GET, POST } public RemoteHoodieTableFileSystemView(String server, int port, HoodieTableMetaClient metaClient) { - this(server, port, metaClient, 300); + this(metaClient, FileSystemViewStorageConfig.newBuilder().withRemoteServerHost(server).withRemoteServerPort(port).build()); } - public RemoteHoodieTableFileSystemView(String server, int port, HoodieTableMetaClient metaClient, int timeoutSecs) { + public RemoteHoodieTableFileSystemView(HoodieTableMetaClient metaClient, FileSystemViewStorageConfig viewConf) { this.basePath = metaClient.getBasePath(); - this.serverHost = server; - this.serverPort = port; this.mapper = new ObjectMapper(); this.metaClient = metaClient; this.timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); - this.timeoutSecs = timeoutSecs; + this.serverHost = viewConf.getRemoteViewServerHost(); + this.serverPort = viewConf.getRemoteViewServerPort(); + this.timeoutMs = viewConf.getRemoteTimelineClientTimeoutSecs() * 1000; + if (viewConf.isRemoteTimelineClientRetryEnabled()) { + retryHelper = new RetryHelper( + viewConf.getRemoteTimelineClientMaxRetryIntervalMs(), + viewConf.getRemoteTimelineClientMaxRetryNumbers(), + viewConf.getRemoteTimelineInitialRetryIntervalMs(), + viewConf.getRemoteTimelineClientRetryExceptions(), + "Sending request"); + } } private T executeRequest(String requestPath, Map queryParameters, TypeReference reference, @@ -159,18 +178,8 @@ private T executeRequest(String requestPath, Map queryParame String url = builder.toString(); LOG.info("Sending request : (" + url + ")"); - Response response; - int timeout = this.timeoutSecs * 1000; // msec - switch (method) { - case GET: - response = Request.Get(url).connectTimeout(timeout).socketTimeout(timeout).execute(); - break; - case POST: - default: - response = Request.Post(url).connectTimeout(timeout).socketTimeout(timeout).execute(); - break; - } - String content = response.returnContent().asString(); + 
Response response = retryHelper != null ? retryHelper.start(() -> get(timeoutMs, url, method)) : get(timeoutMs, url, method); + String content = response.returnContent().asString(Consts.UTF_8); return (T) mapper.readValue(content, reference); } @@ -380,9 +389,35 @@ public Stream getReplacedFileGroupsBeforeOrOn(String maxCommitT } } + @Override + public Stream getReplacedFileGroupsBefore(String maxCommitTime, String partitionPath) { + Map paramsMap = getParamsWithAdditionalParam(partitionPath, MAX_INSTANT_PARAM, maxCommitTime); + try { + List fileGroups = executeRequest(ALL_REPLACED_FILEGROUPS_BEFORE, paramsMap, + new TypeReference>() {}, RequestMethod.GET); + return fileGroups.stream().map(dto -> FileGroupDTO.toFileGroup(dto, metaClient)); + } catch (IOException e) { + throw new HoodieRemoteException(e); + } + } + + @Override + public Stream getAllReplacedFileGroups(String partitionPath) { + Map paramsMap = getParamsWithPartitionPath(partitionPath); + try { + List fileGroups = executeRequest(ALL_REPLACED_FILEGROUPS_PARTITION, paramsMap, + new TypeReference>() {}, RequestMethod.GET); + return fileGroups.stream().map(dto -> FileGroupDTO.toFileGroup(dto, metaClient)); + } catch (IOException e) { + throw new HoodieRemoteException(e); + } + } + public boolean refresh() { Map paramsMap = getParams(); try { + // refresh the local timeline first. + this.timeline = metaClient.reloadActiveTimeline().filterCompletedAndCompactionInstants(); return executeRequest(REFRESH_TABLE, paramsMap, new TypeReference() {}, RequestMethod.POST); } catch (IOException e) { throw new HoodieRemoteException(e); @@ -449,7 +484,7 @@ public HoodieTimeline getTimeline() { @Override public void sync() { - // noop + refresh(); } @Override @@ -463,4 +498,14 @@ public Option getLatestBaseFile(String partitionPath, String fil throw new HoodieRemoteException(e); } } + + private Response get(int timeoutMs, String url, RequestMethod method) throws IOException { + switch (method) { + case GET: + return Request.Get(url).connectTimeout(timeoutMs).socketTimeout(timeoutMs).execute(); + case POST: + default: + return Request.Post(url).connectTimeout(timeoutMs).socketTimeout(timeoutMs).execute(); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java index af0dc130162aa..02a406e7e0763 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RocksDbBasedFileSystemView.java @@ -199,6 +199,7 @@ protected void resetViewState() { LOG.info("Deleting all rocksdb data associated with table filesystem view"); rocksDB.close(); rocksDB = new RocksDBDAO(metaClient.getBasePath(), config.getRocksdbBasePath()); + schemaHelper.getAllColumnFamilies().forEach(rocksDB::addColumnFamily); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java index d72516921ada1..e4144420831c7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/SpillableMapBasedFileSystemView.java @@ -19,6 +19,8 @@ package org.apache.hudi.common.table.view; import org.apache.hadoop.fs.FileStatus; + +import 
org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.model.BootstrapBaseFileMapping; import org.apache.hudi.common.model.CompactionOperation; import org.apache.hudi.common.model.HoodieFileGroup; @@ -53,22 +55,26 @@ public class SpillableMapBasedFileSystemView extends HoodieTableFileSystemView { private final long maxMemoryForReplaceFileGroups; private final long maxMemoryForClusteringFileGroups; private final String baseStoreDir; + private final ExternalSpillableMap.DiskMapType diskMapType; + private final boolean isBitCaskDiskMapCompressionEnabled; public SpillableMapBasedFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, - FileSystemViewStorageConfig config) { + FileSystemViewStorageConfig config, HoodieCommonConfig commonConfig) { super(config.isIncrementalTimelineSyncEnabled()); this.maxMemoryForFileGroupMap = config.getMaxMemoryForFileGroupMap(); this.maxMemoryForPendingCompaction = config.getMaxMemoryForPendingCompaction(); this.maxMemoryForBootstrapBaseFile = config.getMaxMemoryForBootstrapBaseFile(); this.maxMemoryForReplaceFileGroups = config.getMaxMemoryForReplacedFileGroups(); this.maxMemoryForClusteringFileGroups = config.getMaxMemoryForPendingClusteringFileGroups(); - this.baseStoreDir = config.getBaseStoreDir(); + this.baseStoreDir = config.getSpillableDir(); + diskMapType = commonConfig.getSpillableDiskMapType(); + isBitCaskDiskMapCompressionEnabled = commonConfig.isBitCaskDiskMapCompressionEnabled(); init(metaClient, visibleActiveTimeline); } public SpillableMapBasedFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline, - FileStatus[] fileStatuses, FileSystemViewStorageConfig config) { - this(metaClient, visibleActiveTimeline, config); + FileStatus[] fileStatuses, FileSystemViewStorageConfig config, HoodieCommonConfig commonConfig) { + this(metaClient, visibleActiveTimeline, config, commonConfig); addFilesToView(fileStatuses); } @@ -79,7 +85,8 @@ protected Map> createPartitionToFileGroups() { + ", BaseDir=" + baseStoreDir); new File(baseStoreDir).mkdirs(); return (Map>) (new ExternalSpillableMap<>(maxMemoryForFileGroupMap, baseStoreDir, - new DefaultSizeEstimator(), new DefaultSizeEstimator<>())); + new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), + diskMapType, isBitCaskDiskMapCompressionEnabled)); } catch (IOException e) { throw new RuntimeException(e); } @@ -93,7 +100,8 @@ protected Map> createFileId + ", BaseDir=" + baseStoreDir); new File(baseStoreDir).mkdirs(); Map> pendingMap = new ExternalSpillableMap<>( - maxMemoryForPendingCompaction, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>()); + maxMemoryForPendingCompaction, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), + diskMapType, isBitCaskDiskMapCompressionEnabled); pendingMap.putAll(fgIdToPendingCompaction); return pendingMap; } catch (IOException e) { @@ -109,7 +117,8 @@ protected Map createFileIdToBootstr + ", BaseDir=" + baseStoreDir); new File(baseStoreDir).mkdirs(); Map pendingMap = new ExternalSpillableMap<>( - maxMemoryForBootstrapBaseFile, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>()); + maxMemoryForBootstrapBaseFile, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), + diskMapType, isBitCaskDiskMapCompressionEnabled); pendingMap.putAll(fileGroupIdBootstrapBaseFileMap); return pendingMap; } catch (IOException e) { @@ -124,7 +133,8 @@ protected Map createFileIdToReplaceInstantMap( + ", BaseDir=" + baseStoreDir); 
new File(baseStoreDir).mkdirs(); Map pendingMap = new ExternalSpillableMap<>( - maxMemoryForReplaceFileGroups, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>()); + maxMemoryForReplaceFileGroups, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), + diskMapType, isBitCaskDiskMapCompressionEnabled); pendingMap.putAll(replacedFileGroups); return pendingMap; } catch (IOException e) { @@ -139,7 +149,8 @@ protected Map createFileIdToPendingClusteringM + ", BaseDir=" + baseStoreDir); new File(baseStoreDir).mkdirs(); Map pendingMap = new ExternalSpillableMap<>( - maxMemoryForClusteringFileGroups, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>()); + maxMemoryForClusteringFileGroups, baseStoreDir, new DefaultSizeEstimator(), new DefaultSizeEstimator<>(), + diskMapType, isBitCaskDiskMapCompressionEnabled); pendingMap.putAll(fileGroupsInClustering); return pendingMap; } catch (IOException e) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java index 504f95a9ee089..c32e2cabb1012 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java @@ -124,7 +124,7 @@ Stream getLatestFileSlicesBeforeOrOn(String partitionPath, String max * @param maxInstantTime Max Instant Time * @return */ - public Stream getLatestMergedFileSlicesBeforeOrOn(String partitionPath, String maxInstantTime); + Stream getLatestMergedFileSlicesBeforeOrOn(String partitionPath, String maxInstantTime); /** * Stream all the latest file slices, in the given range. @@ -167,10 +167,20 @@ interface SliceView extends SliceViewWithLatestSlice { HoodieTimeline getTimeline(); /** - * Stream all the replaced file groups before maxCommitTime. + * Stream all the replaced file groups before or on maxCommitTime for given partition. */ Stream getReplacedFileGroupsBeforeOrOn(String maxCommitTime, String partitionPath); + /** + * Stream all the replaced file groups before maxCommitTime for given partition. + */ + Stream getReplacedFileGroupsBefore(String maxCommitTime, String partitionPath); + + /** + * Stream all the replaced file groups for given partition. + */ + Stream getAllReplacedFileGroups(String partitionPath); + /** * Filegroups that are in pending clustering. */ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java new file mode 100644 index 0000000000000..c31184244390f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java @@ -0,0 +1,875 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
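The TableFileSystemView interface above gains getReplacedFileGroupsBefore and getAllReplacedFileGroups next to the existing getReplacedFileGroupsBeforeOrOn, and both PriorityBasedFileSystemView and RemoteHoodieTableFileSystemView route them through their usual execute/REST plumbing. A minimal caller-side sketch of the new queries, assuming they return Stream<HoodieFileGroup> (as the FileGroupDTO conversion above suggests) and that HoodieFileGroup exposes its id via getFileGroupId().getFileId(), which is not shown in this diff:

    import java.util.List;
    import java.util.stream.Collectors;

    import org.apache.hudi.common.table.view.SyncableFileSystemView;

    public class ReplacedFileGroupLookup {

      // File-group ids in a partition that were replaced strictly before maxCommitTime.
      public static List<String> replacedBefore(SyncableFileSystemView fsView,
                                                String partitionPath,
                                                String maxCommitTime) {
        return fsView.getReplacedFileGroupsBefore(maxCommitTime, partitionPath)
            .map(fg -> fg.getFileGroupId().getFileId()) // assumed accessors, not part of this diff
            .collect(Collectors.toList());
      }

      // Every replaced file-group id in the partition, regardless of commit time.
      public static List<String> allReplaced(SyncableFileSystemView fsView, String partitionPath) {
        return fsView.getAllReplacedFileGroups(partitionPath)
            .map(fg -> fg.getFileGroupId().getFileId())
            .collect(Collectors.toList());
      }
    }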
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.util.Base64; +import java.util.Date; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.avro.Conversions; +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema.Field; +import org.apache.avro.generic.GenericData; +import java.nio.charset.StandardCharsets; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData.StringType; +import org.apache.avro.util.Utf8; +import org.apache.orc.storage.common.type.HiveDecimal; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; +import org.apache.orc.storage.ql.exec.vector.DoubleColumnVector; +import org.apache.orc.storage.ql.exec.vector.ListColumnVector; +import org.apache.orc.storage.ql.exec.vector.LongColumnVector; +import org.apache.orc.storage.ql.exec.vector.MapColumnVector; +import org.apache.orc.storage.ql.exec.vector.StructColumnVector; +import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; +import org.apache.orc.storage.ql.exec.vector.UnionColumnVector; +import org.apache.orc.storage.serde2.io.DateWritable; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.orc.TypeDescription; + +import static org.apache.avro.JsonProperties.NULL_VALUE; +import static org.apache.hudi.common.util.BinaryUtil.toBytes; + +/** + * Methods including addToVector, addUnionValue, createOrcSchema are originally from + * https://github.com/streamsets/datacollector. + * Source classes: + * - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcRecordConverter + * - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcSchemaConverter + * + * Changes made: + * 1. Flatten nullable Avro schema type when the value is not null in `addToVector`. + * 2. Use getLogicalType(), constants from LogicalTypes instead of getJsonProp() to handle Avro logical types. + */ +public class AvroOrcUtils { + + private static final int MICROS_PER_MILLI = 1000; + private static final int NANOS_PER_MICRO = 1000; + + /** + * Add an object (of a given ORC type) to the column vector at a given position. + * + * @param type ORC schema of the value Object. + * @param colVector The column vector to store the value Object. + * @param avroSchema Avro schema of the value Object. + * Only used to check logical types for timestamp unit conversion. + * @param value Object to be added to the column vector + * @param vectorPos The position in the vector where value will be stored at. + */ + public static void addToVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, Object value, int vectorPos) { + + final int currentVecLength = colVector.isNull.length; + if (vectorPos >= currentVecLength) { + colVector.ensureSize(2 * currentVecLength, true); + } + if (value == null) { + colVector.isNull[vectorPos] = true; + colVector.noNulls = false; + return; + } + + if (avroSchema.getType().equals(Schema.Type.UNION)) { + avroSchema = getActualSchemaType(avroSchema); + } + + LogicalType logicalType = avroSchema != null ? 
avroSchema.getLogicalType() : null; + + switch (type.getCategory()) { + case BOOLEAN: + LongColumnVector boolVec = (LongColumnVector) colVector; + boolVec.vector[vectorPos] = (boolean) value ? 1 : 0; + break; + case BYTE: + LongColumnVector byteColVec = (LongColumnVector) colVector; + byteColVec.vector[vectorPos] = (byte) value; + break; + case SHORT: + LongColumnVector shortColVec = (LongColumnVector) colVector; + shortColVec.vector[vectorPos] = (short) value; + break; + case INT: + // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but we will ignore that fact here + // since Orc has no way to represent a time in the way Avro defines it; we will simply preserve the int value + LongColumnVector intColVec = (LongColumnVector) colVector; + intColVec.vector[vectorPos] = (int) value; + break; + case LONG: + // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but we will ignore that fact here + // since Orc has no way to represent a time in the way Avro defines it; we will simply preserve the long value + LongColumnVector longColVec = (LongColumnVector) colVector; + longColVec.vector[vectorPos] = (long) value; + break; + case FLOAT: + DoubleColumnVector floatColVec = (DoubleColumnVector) colVector; + floatColVec.vector[vectorPos] = (float) value; + break; + case DOUBLE: + DoubleColumnVector doubleColVec = (DoubleColumnVector) colVector; + doubleColVec.vector[vectorPos] = (double) value; + break; + case VARCHAR: + case CHAR: + case STRING: + BytesColumnVector bytesColVec = (BytesColumnVector) colVector; + byte[] bytes = null; + + if (value instanceof String) { + bytes = ((String) value).getBytes(StandardCharsets.UTF_8); + } else if (value instanceof Utf8) { + final Utf8 utf8 = (Utf8) value; + bytes = utf8.getBytes(); + } else if (value instanceof GenericData.EnumSymbol) { + bytes = ((GenericData.EnumSymbol) value).toString().getBytes(StandardCharsets.UTF_8); + } else { + throw new IllegalStateException(String.format( + "Unrecognized type for Avro %s field value, which has type %s, value %s", + type.getCategory().getName(), + value.getClass().getName(), + value.toString() + )); + } + + if (bytes == null) { + bytesColVec.isNull[vectorPos] = true; + bytesColVec.noNulls = false; + } else { + bytesColVec.setRef(vectorPos, bytes, 0, bytes.length); + } + break; + case DATE: + LongColumnVector dateColVec = (LongColumnVector) colVector; + int daysSinceEpoch; + if (logicalType instanceof LogicalTypes.Date) { + daysSinceEpoch = (int) value; + } else if (value instanceof java.sql.Date) { + daysSinceEpoch = DateWritable.dateToDays((java.sql.Date) value); + } else if (value instanceof Date) { + daysSinceEpoch = DateWritable.millisToDays(((Date) value).getTime()); + } else { + throw new IllegalStateException(String.format( + "Unrecognized type for Avro DATE field value, which has type %s, value %s", + value.getClass().getName(), + value.toString() + )); + } + dateColVec.vector[vectorPos] = daysSinceEpoch; + break; + case TIMESTAMP: + TimestampColumnVector tsColVec = (TimestampColumnVector) colVector; + + long time; + int nanos = 0; + + // The unit for Timestamp in ORC is millis, convert timestamp to millis if needed + if (logicalType instanceof LogicalTypes.TimestampMillis) { + time = (long) value; + } else if (logicalType instanceof LogicalTypes.TimestampMicros) { + final long logicalTsValue = (long) value; + time = logicalTsValue / MICROS_PER_MILLI; + nanos = NANOS_PER_MICRO * ((int) (logicalTsValue % MICROS_PER_MILLI)); + } else if (value instanceof Timestamp) { + 
Timestamp tsValue = (Timestamp) value; + time = tsValue.getTime(); + nanos = tsValue.getNanos(); + } else if (value instanceof java.sql.Date) { + java.sql.Date sqlDateValue = (java.sql.Date) value; + time = sqlDateValue.getTime(); + } else if (value instanceof Date) { + Date dateValue = (Date) value; + time = dateValue.getTime(); + } else { + throw new IllegalStateException(String.format( + "Unrecognized type for Avro TIMESTAMP field value, which has type %s, value %s", + value.getClass().getName(), + value.toString() + )); + } + + tsColVec.time[vectorPos] = time; + tsColVec.nanos[vectorPos] = nanos; + break; + case BINARY: + BytesColumnVector binaryColVec = (BytesColumnVector) colVector; + + byte[] binaryBytes; + if (value instanceof GenericData.Fixed) { + binaryBytes = ((GenericData.Fixed)value).bytes(); + } else if (value instanceof ByteBuffer) { + final ByteBuffer byteBuffer = (ByteBuffer) value; + binaryBytes = toBytes(byteBuffer); + } else if (value instanceof byte[]) { + binaryBytes = (byte[]) value; + } else { + throw new IllegalStateException(String.format( + "Unrecognized type for Avro BINARY field value, which has type %s, value %s", + value.getClass().getName(), + value.toString() + )); + } + binaryColVec.setRef(vectorPos, binaryBytes, 0, binaryBytes.length); + break; + case DECIMAL: + DecimalColumnVector decimalColVec = (DecimalColumnVector) colVector; + HiveDecimal decimalValue; + if (value instanceof BigDecimal) { + final BigDecimal decimal = (BigDecimal) value; + decimalValue = HiveDecimal.create(decimal); + } else if (value instanceof ByteBuffer) { + final ByteBuffer byteBuffer = (ByteBuffer) value; + final byte[] decimalBytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(decimalBytes); + final BigInteger bigInt = new BigInteger(decimalBytes); + final int scale = type.getScale(); + BigDecimal bigDecVal = new BigDecimal(bigInt, scale); + + decimalValue = HiveDecimal.create(bigDecVal); + if (decimalValue == null && decimalBytes.length > 0) { + throw new IllegalStateException( + "Unexpected read null HiveDecimal from bytes (base-64 encoded): " + + Base64.getEncoder().encodeToString(decimalBytes) + ); + } + } else if (value instanceof GenericData.Fixed) { + final BigDecimal decimal = new Conversions.DecimalConversion() + .fromFixed((GenericData.Fixed) value, avroSchema, logicalType); + decimalValue = HiveDecimal.create(decimal); + } else { + throw new IllegalStateException(String.format( + "Unexpected type for decimal (%s), cannot convert from Avro value", + value.getClass().getCanonicalName() + )); + } + if (decimalValue == null) { + decimalColVec.isNull[vectorPos] = true; + decimalColVec.noNulls = false; + } else { + decimalColVec.set(vectorPos, decimalValue); + } + break; + case LIST: + List list = (List) value; + ListColumnVector listColVec = (ListColumnVector) colVector; + listColVec.offsets[vectorPos] = listColVec.childCount; + listColVec.lengths[vectorPos] = list.size(); + + TypeDescription listType = type.getChildren().get(0); + for (Object listItem : list) { + addToVector(listType, listColVec.child, avroSchema.getElementType(), listItem, listColVec.childCount++); + } + break; + case MAP: + Map mapValue = (Map) value; + + MapColumnVector mapColumnVector = (MapColumnVector) colVector; + mapColumnVector.offsets[vectorPos] = mapColumnVector.childCount; + mapColumnVector.lengths[vectorPos] = mapValue.size(); + + // keys are always strings + Schema keySchema = Schema.create(Schema.Type.STRING); + for (Map.Entry entry : mapValue.entrySet()) { + addToVector( + 
type.getChildren().get(0), + mapColumnVector.keys, + keySchema, + entry.getKey(), + mapColumnVector.childCount + ); + + addToVector( + type.getChildren().get(1), + mapColumnVector.values, + avroSchema.getValueType(), + entry.getValue(), + mapColumnVector.childCount + ); + + mapColumnVector.childCount++; + } + + break; + case STRUCT: + StructColumnVector structColVec = (StructColumnVector) colVector; + + GenericData.Record record = (GenericData.Record) value; + + for (int i = 0; i < type.getFieldNames().size(); i++) { + String fieldName = type.getFieldNames().get(i); + Object fieldValue = record.get(fieldName); + TypeDescription fieldType = type.getChildren().get(i); + addToVector(fieldType, structColVec.fields[i], avroSchema.getFields().get(i).schema(), fieldValue, vectorPos); + } + + break; + case UNION: + UnionColumnVector unionColVec = (UnionColumnVector) colVector; + + List childTypes = type.getChildren(); + boolean added = addUnionValue(unionColVec, childTypes, avroSchema, value, vectorPos); + + if (!added) { + throw new IllegalStateException(String.format( + "Failed to add value %s to union with type %s", + value == null ? "null" : value.toString(), + type.toString() + )); + } + + break; + default: + throw new IllegalArgumentException("Invalid TypeDescription " + type.toString() + "."); + } + } + + /** + * Match value with its ORC type and add to the union vector at a given position. + * + * @param unionVector The vector to store value. + * @param unionChildTypes All possible types for the value Object. + * @param avroSchema Avro union schema for the value Object. + * @param value Object to be added to the unionVector + * @param vectorPos The position in the vector where value will be stored at. + * @return succeeded or failed + */ + public static boolean addUnionValue( + UnionColumnVector unionVector, + List unionChildTypes, + Schema avroSchema, + Object value, + int vectorPos + ) { + int matchIndex = -1; + TypeDescription matchType = null; + Object matchValue = null; + + for (int t = 0; t < unionChildTypes.size(); t++) { + TypeDescription childType = unionChildTypes.get(t); + boolean matches = false; + + switch (childType.getCategory()) { + case BOOLEAN: + matches = value instanceof Boolean; + break; + case BYTE: + matches = value instanceof Byte; + break; + case SHORT: + matches = value instanceof Short; + break; + case INT: + matches = value instanceof Integer; + break; + case LONG: + matches = value instanceof Long; + break; + case FLOAT: + matches = value instanceof Float; + break; + case DOUBLE: + matches = value instanceof Double; + break; + case STRING: + case VARCHAR: + case CHAR: + if (value instanceof String) { + matches = true; + matchValue = ((String) value).getBytes(StandardCharsets.UTF_8); + } else if (value instanceof Utf8) { + matches = true; + matchValue = ((Utf8) value).getBytes(); + } + break; + case DATE: + matches = value instanceof Date; + break; + case TIMESTAMP: + matches = value instanceof Timestamp; + break; + case BINARY: + matches = value instanceof byte[] || value instanceof GenericData.Fixed; + break; + case DECIMAL: + matches = value instanceof BigDecimal; + break; + case LIST: + matches = value instanceof List; + break; + case MAP: + matches = value instanceof Map; + break; + case STRUCT: + throw new UnsupportedOperationException("Cannot handle STRUCT within UNION."); + case UNION: + List children = childType.getChildren(); + if (value == null) { + matches = children == null || children.size() == 0; + } else { + matches = addUnionValue(unionVector, 
children, avroSchema, value, vectorPos); + } + break; + default: + throw new IllegalArgumentException("Invalid TypeDescription " + childType.getCategory().toString() + "."); + } + + if (matches) { + matchIndex = t; + matchType = childType; + break; + } + } + + if (value == null && matchValue != null) { + value = matchValue; + } + + if (matchIndex >= 0) { + unionVector.tags[vectorPos] = matchIndex; + if (value == null) { + unionVector.isNull[vectorPos] = true; + unionVector.noNulls = false; + } else { + addToVector(matchType, unionVector.fields[matchIndex], avroSchema.getTypes().get(matchIndex), value, vectorPos); + } + return true; + } else { + return false; + } + } + + /** + * Read the Column vector at a given position conforming to a given ORC schema. + * + * @param type ORC schema of the object to read. + * @param colVector The column vector to read. + * @param avroSchema Avro schema of the object to read. + * Only used to check logical types for timestamp unit conversion. + * @param vectorPos The position in the vector where the value to read is stored at. + * @return The object being read. + */ + public static Object readFromVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, int vectorPos) { + + if (colVector.isRepeating) { + vectorPos = 0; + } + + if (colVector.isNull[vectorPos]) { + return null; + } + + if (avroSchema.getType().equals(Schema.Type.UNION)) { + avroSchema = getActualSchemaType(avroSchema); + } + LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null; + + switch (type.getCategory()) { + case BOOLEAN: + return ((LongColumnVector) colVector).vector[vectorPos] != 0; + case BYTE: + return (byte) ((LongColumnVector) colVector).vector[vectorPos]; + case SHORT: + return (short) ((LongColumnVector) colVector).vector[vectorPos]; + case INT: + return (int) ((LongColumnVector) colVector).vector[vectorPos]; + case LONG: + return ((LongColumnVector) colVector).vector[vectorPos]; + case FLOAT: + return (float) ((DoubleColumnVector) colVector).vector[vectorPos]; + case DOUBLE: + return ((DoubleColumnVector) colVector).vector[vectorPos]; + case VARCHAR: + case CHAR: + int maxLength = type.getMaxLength(); + String result = ((BytesColumnVector) colVector).toString(vectorPos); + if (result.length() <= maxLength) { + return result; + } else { + throw new HoodieIOException("CHAR/VARCHAR has length " + result.length() + " greater than Max Length allowed"); + } + case STRING: + String stringType = avroSchema.getProp(GenericData.STRING_PROP); + if (stringType == null || !stringType.equals(StringType.String)) { + int stringLength = ((BytesColumnVector) colVector).length[vectorPos]; + int stringOffset = ((BytesColumnVector) colVector).start[vectorPos]; + byte[] stringBytes = new byte[stringLength]; + System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], stringOffset, stringBytes, 0, stringLength); + return new Utf8(stringBytes); + } else { + return ((BytesColumnVector) colVector).toString(vectorPos); + } + case DATE: + // convert to daysSinceEpoch for LogicalType.Date + return (int) ((LongColumnVector) colVector).vector[vectorPos]; + case TIMESTAMP: + // The unit of time in ORC is millis. 
Convert (time,nanos) to the desired unit per logicalType + long time = ((TimestampColumnVector) colVector).time[vectorPos]; + int nanos = ((TimestampColumnVector) colVector).nanos[vectorPos]; + if (logicalType instanceof LogicalTypes.TimestampMillis) { + return time; + } else if (logicalType instanceof LogicalTypes.TimestampMicros) { + return time * MICROS_PER_MILLI + nanos / NANOS_PER_MICRO; + } else { + return ((TimestampColumnVector) colVector).getTimestampAsLong(vectorPos); + } + case BINARY: + int binaryLength = ((BytesColumnVector) colVector).length[vectorPos]; + int binaryOffset = ((BytesColumnVector) colVector).start[vectorPos]; + byte[] binaryBytes = new byte[binaryLength]; + System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], binaryOffset, binaryBytes, 0, binaryLength); + // return a ByteBuffer to be consistent with AvroRecordConverter + return ByteBuffer.wrap(binaryBytes); + case DECIMAL: + // HiveDecimal always ignores trailing zeros, thus modifies the scale implicitly, + // therefore, the scale must be enforced here. + BigDecimal bigDecimal = ((DecimalColumnVector) colVector).vector[vectorPos] + .getHiveDecimal().bigDecimalValue() + .setScale(((LogicalTypes.Decimal) logicalType).getScale()); + Schema.Type baseType = avroSchema.getType(); + if (baseType.equals(Schema.Type.FIXED)) { + return new Conversions.DecimalConversion().toFixed(bigDecimal, avroSchema, logicalType); + } else if (baseType.equals(Schema.Type.BYTES)) { + return bigDecimal.unscaledValue().toByteArray(); + } else { + throw new HoodieIOException(baseType.getName() + "is not a valid type for LogicalTypes.DECIMAL."); + } + case LIST: + ArrayList list = new ArrayList<>(); + ListColumnVector listVector = (ListColumnVector) colVector; + int listLength = (int) listVector.lengths[vectorPos]; + int listOffset = (int) listVector.offsets[vectorPos]; + list.ensureCapacity(listLength); + TypeDescription childType = type.getChildren().get(0); + for (int i = 0; i < listLength; i++) { + list.add(readFromVector(childType, listVector.child, avroSchema.getElementType(), listOffset + i)); + } + return list; + case MAP: + Map map = new HashMap(); + MapColumnVector mapVector = (MapColumnVector) colVector; + int mapLength = (int) mapVector.lengths[vectorPos]; + int mapOffset = (int) mapVector.offsets[vectorPos]; + // keys are always strings for maps in Avro + Schema keySchema = Schema.create(Schema.Type.STRING); + for (int i = 0; i < mapLength; i++) { + map.put( + readFromVector(type.getChildren().get(0), mapVector.keys, keySchema, i + mapOffset).toString(), + readFromVector(type.getChildren().get(1), mapVector.values, + avroSchema.getValueType(), i + mapOffset)); + } + return map; + case STRUCT: + StructColumnVector structVector = (StructColumnVector) colVector; + List children = type.getChildren(); + GenericData.Record record = new GenericData.Record(avroSchema); + for (int i = 0; i < children.size(); i++) { + record.put(i, readFromVector(children.get(i), structVector.fields[i], + avroSchema.getFields().get(i).schema(), vectorPos)); + } + return record; + case UNION: + UnionColumnVector unionVector = (UnionColumnVector) colVector; + int tag = unionVector.tags[vectorPos]; + ColumnVector fieldVector = unionVector.fields[tag]; + return readFromVector(type.getChildren().get(tag), fieldVector, avroSchema.getTypes().get(tag), vectorPos); + default: + throw new HoodieIOException("Unrecognized TypeDescription " + type.toString()); + } + } + + public static TypeDescription createOrcSchema(Schema avroSchema) { + + LogicalType 
logicalType = avroSchema.getLogicalType(); + + if (logicalType != null) { + if (logicalType instanceof LogicalTypes.Decimal) { + return TypeDescription.createDecimal() + .withPrecision(((LogicalTypes.Decimal) logicalType).getPrecision()) + .withScale(((LogicalTypes.Decimal) logicalType).getScale()); + } else if (logicalType instanceof LogicalTypes.Date) { + // The date logical type represents a date within the calendar, with no reference to a particular time zone + // or time of day. + // + // A date logical type annotates an Avro int, where the int stores the number of days from the unix epoch, 1 + // January 1970 (ISO calendar). + return TypeDescription.createDate(); + } else if (logicalType instanceof LogicalTypes.TimeMillis) { + // The time-millis logical type represents a time of day, with no reference to a particular calendar, time + // zone or date, with a precision of one millisecond. + // + // A time-millis logical type annotates an Avro int, where the int stores the number of milliseconds after + // midnight, 00:00:00.000. + return TypeDescription.createInt(); + } else if (logicalType instanceof LogicalTypes.TimeMicros) { + // The time-micros logical type represents a time of day, with no reference to a particular calendar, time + // zone or date, with a precision of one microsecond. + // + // A time-micros logical type annotates an Avro long, where the long stores the number of microseconds after + // midnight, 00:00:00.000000. + return TypeDescription.createLong(); + } else if (logicalType instanceof LogicalTypes.TimestampMillis) { + // The timestamp-millis logical type represents an instant on the global timeline, independent of a + // particular time zone or calendar, with a precision of one millisecond. + // + // A timestamp-millis logical type annotates an Avro long, where the long stores the number of milliseconds + // from the unix epoch, 1 January 1970 00:00:00.000 UTC. + return TypeDescription.createTimestamp(); + } else if (logicalType instanceof LogicalTypes.TimestampMicros) { + // The timestamp-micros logical type represents an instant on the global timeline, independent of a + // particular time zone or calendar, with a precision of one microsecond. + // + // A timestamp-micros logical type annotates an Avro long, where the long stores the number of microseconds + // from the unix epoch, 1 January 1970 00:00:00.000000 UTC. 
+ return TypeDescription.createTimestamp(); + } + } + + final Schema.Type type = avroSchema.getType(); + switch (type) { + case NULL: + // empty union represents null type + final TypeDescription nullUnion = TypeDescription.createUnion(); + return nullUnion; + case LONG: + return TypeDescription.createLong(); + case INT: + return TypeDescription.createInt(); + case BYTES: + return TypeDescription.createBinary(); + case ARRAY: + return TypeDescription.createList(createOrcSchema(avroSchema.getElementType())); + case RECORD: + final TypeDescription recordStruct = TypeDescription.createStruct(); + for (Schema.Field field : avroSchema.getFields()) { + final Schema fieldSchema = field.schema(); + final TypeDescription fieldType = createOrcSchema(fieldSchema); + if (fieldType != null) { + recordStruct.addField(field.name(), fieldType); + } + } + return recordStruct; + case MAP: + return TypeDescription.createMap( + // in Avro maps, keys are always strings + TypeDescription.createString(), + createOrcSchema(avroSchema.getValueType()) + ); + case UNION: + final List nonNullMembers = avroSchema.getTypes().stream().filter( + schema -> !Schema.Type.NULL.equals(schema.getType()) + ).collect(Collectors.toList()); + + if (nonNullMembers.isEmpty()) { + // no non-null union members; represent as an ORC empty union + return TypeDescription.createUnion(); + } else if (nonNullMembers.size() == 1) { + // a single non-null union member + // this is how Avro represents "nullable" types; as a union of the NULL type with another + // since ORC already supports nullability of all types, just use the child type directly + return createOrcSchema(nonNullMembers.get(0)); + } else { + // more than one non-null type; represent as an actual ORC union of them + final TypeDescription union = TypeDescription.createUnion(); + for (final Schema childSchema : nonNullMembers) { + union.addUnionChild(createOrcSchema(childSchema)); + } + return union; + } + case STRING: + return TypeDescription.createString(); + case FLOAT: + return TypeDescription.createFloat(); + case DOUBLE: + return TypeDescription.createDouble(); + case BOOLEAN: + return TypeDescription.createBoolean(); + case ENUM: + // represent as String for now + return TypeDescription.createString(); + case FIXED: + return TypeDescription.createBinary(); + default: + throw new IllegalStateException(String.format("Unrecognized Avro type: %s", type.getName())); + } + } + + public static Schema createAvroSchema(TypeDescription orcSchema) { + switch (orcSchema.getCategory()) { + case BOOLEAN: + return Schema.create(Schema.Type.BOOLEAN); + case BYTE: + // tinyint (8 bit), use int to hold it + return Schema.create(Schema.Type.INT); + case SHORT: + // smallint (16 bit), use int to hold it + return Schema.create(Schema.Type.INT); + case INT: + // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but there is no way to distinguish + return Schema.create(Schema.Type.INT); + case LONG: + // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but there is no way to distinguish + return Schema.create(Schema.Type.LONG); + case FLOAT: + return Schema.create(Schema.Type.FLOAT); + case DOUBLE: + return Schema.create(Schema.Type.DOUBLE); + case VARCHAR: + case CHAR: + case STRING: + return Schema.create(Schema.Type.STRING); + case DATE: + Schema date = Schema.create(Schema.Type.INT); + LogicalTypes.date().addToSchema(date); + return date; + case TIMESTAMP: + // Cannot distinguish between TIMESTAMP_MILLIS and TIMESTAMP_MICROS + // Assume 
TIMESTAMP_MILLIS because Timestamp in ORC is in millis + Schema timestamp = Schema.create(Schema.Type.LONG); + LogicalTypes.timestampMillis().addToSchema(timestamp); + return timestamp; + case BINARY: + return Schema.create(Schema.Type.BYTES); + case DECIMAL: + Schema decimal = Schema.create(Schema.Type.BYTES); + LogicalTypes.decimal(orcSchema.getPrecision(), orcSchema.getScale()).addToSchema(decimal); + return decimal; + case LIST: + return Schema.createArray(createAvroSchema(orcSchema.getChildren().get(0))); + case MAP: + return Schema.createMap(createAvroSchema(orcSchema.getChildren().get(1))); + case STRUCT: + List childFields = new ArrayList<>(); + for (int i = 0; i < orcSchema.getChildren().size(); i++) { + TypeDescription childType = orcSchema.getChildren().get(i); + String childName = orcSchema.getFieldNames().get(i); + childFields.add(new Field(childName, createAvroSchema(childType), "", null)); + } + return Schema.createRecord(childFields); + case UNION: + return Schema.createUnion(orcSchema.getChildren().stream() + .map(AvroOrcUtils::createAvroSchema) + .collect(Collectors.toList())); + default: + throw new IllegalStateException(String.format("Unrecognized ORC type: %s", orcSchema.getCategory().getName())); + } + } + + /** + * Returns the actual schema of a field. + * + * All types in ORC is nullable whereas Avro uses a union that contains the NULL type to imply + * the nullability of an Avro type. To achieve consistency between the Avro and ORC schema, + * non-NULL types are extracted from the union type. + * @param unionSchema A schema of union type. + * @return An Avro schema that is either NULL or a UNION without NULL fields. + */ + private static Schema getActualSchemaType(Schema unionSchema) { + final List nonNullMembers = unionSchema.getTypes().stream().filter( + schema -> !Schema.Type.NULL.equals(schema.getType()) + ).collect(Collectors.toList()); + if (nonNullMembers.isEmpty()) { + return Schema.create(Schema.Type.NULL); + } else if (nonNullMembers.size() == 1) { + return nonNullMembers.get(0); + } else { + return Schema.createUnion(nonNullMembers); + } + } + + public static Schema createAvroSchemaWithDefaultValue(TypeDescription orcSchema, String recordName, String namespace, boolean nullable) { + Schema avroSchema = createAvroSchemaWithNamespace(orcSchema,recordName,namespace); + List fields = new ArrayList(); + List fieldList = avroSchema.getFields(); + for (Field field : fieldList) { + Schema fieldSchema = field.schema(); + Schema nullableSchema = Schema.createUnion(Schema.create(Schema.Type.NULL),fieldSchema); + if (nullable) { + fields.add(new Schema.Field(field.name(), nullableSchema, null, NULL_VALUE)); + } else { + fields.add(new Schema.Field(field.name(), fieldSchema, null, (Object) null)); + } + } + Schema schema = Schema.createRecord(recordName, null, null, false); + schema.setFields(fields); + return schema; + } + + private static Schema createAvroSchemaWithNamespace(TypeDescription orcSchema, String recordName, String namespace) { + switch (orcSchema.getCategory()) { + case BOOLEAN: + return Schema.create(Schema.Type.BOOLEAN); + case BYTE: + // tinyint (8 bit), use int to hold it + return Schema.create(Schema.Type.INT); + case SHORT: + // smallint (16 bit), use int to hold it + return Schema.create(Schema.Type.INT); + case INT: + // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but there is no way to distinguish + return Schema.create(Schema.Type.INT); + case LONG: + // the Avro logical type could be 
AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but there is no way to distinguish + return Schema.create(Schema.Type.LONG); + case FLOAT: + return Schema.create(Schema.Type.FLOAT); + case DOUBLE: + return Schema.create(Schema.Type.DOUBLE); + case VARCHAR: + case CHAR: + case STRING: + return Schema.create(Schema.Type.STRING); + case DATE: + Schema date = Schema.create(Schema.Type.INT); + LogicalTypes.date().addToSchema(date); + return date; + case TIMESTAMP: + Schema timestamp = Schema.create(Schema.Type.LONG); + LogicalTypes.timestampMillis().addToSchema(timestamp); + return timestamp; + case BINARY: + return Schema.create(Schema.Type.BYTES); + case DECIMAL: + Schema decimal = Schema.create(Schema.Type.BYTES); + LogicalTypes.decimal(orcSchema.getPrecision(), orcSchema.getScale()).addToSchema(decimal); + return decimal; + case LIST: + return Schema.createArray(createAvroSchemaWithNamespace(orcSchema.getChildren().get(0), recordName, "")); + case MAP: + return Schema.createMap(createAvroSchemaWithNamespace(orcSchema.getChildren().get(1), recordName, "")); + case STRUCT: + List childFields = new ArrayList<>(); + for (int i = 0; i < orcSchema.getChildren().size(); i++) { + TypeDescription childType = orcSchema.getChildren().get(i); + String childName = orcSchema.getFieldNames().get(i); + childFields.add(new Field(childName, createAvroSchemaWithNamespace(childType, childName, ""), null, null)); + } + return Schema.createRecord(recordName, null, namespace, false, childFields); + default: + throw new IllegalStateException(String.format("Unrecognized ORC type: %s", orcSchema.getCategory().getName())); + + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java index a86879ad6305e..97e9133cfa51c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java @@ -26,11 +26,11 @@ public final class Base64CodecUtil { /** * Decodes data from the input string into using the encoding scheme. * - * @param serString + * @param encodedString - Base64 encoded string to decode * @return A newly-allocated byte array containing the decoded bytes. */ - public static byte[] decode(String serString) { - return Base64.getDecoder().decode(serString.getBytes(StandardCharsets.UTF_8)); + public static byte[] decode(String encodedString) { + return Base64.getDecoder().decode(encodedString.getBytes(StandardCharsets.UTF_8)); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/BaseFileUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/BaseFileUtils.java new file mode 100644 index 0000000000000..badb5e37a70f3 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/BaseFileUtils.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
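AvroOrcUtils above maps Avro schemas onto ORC TypeDescriptions (honoring logical types first) and copies values into and out of ORC column vectors. A small round-trip sketch, assuming the ORC "nohive" storage-api classes imported above (org.apache.orc.storage.ql.exec.vector.*) are on the classpath; the values and class name are illustrative:

    import org.apache.avro.LogicalTypes;
    import org.apache.avro.Schema;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.storage.ql.exec.vector.LongColumnVector;

    import org.apache.hudi.common.util.AvroOrcUtils;

    public class AvroOrcRoundTrip {
      public static void main(String[] args) {
        // Logical types are checked first: decimal(10,2) on BYTES becomes an ORC decimal(10,2).
        Schema decimalSchema = LogicalTypes.decimal(10, 2).addToSchema(Schema.create(Schema.Type.BYTES));
        System.out.println(AvroOrcUtils.createOrcSchema(decimalSchema)); // decimal(10,2)

        // A plain Avro LONG maps to ORC bigint; push one value into a vector and read it back.
        Schema longSchema = Schema.create(Schema.Type.LONG);
        TypeDescription orcLong = AvroOrcUtils.createOrcSchema(longSchema);

        LongColumnVector vector = new LongColumnVector(1024);
        AvroOrcUtils.addToVector(orcLong, vector, longSchema, 42L, 0);
        Object back = AvroOrcUtils.readFromVector(orcLong, vector, longSchema, 0);
        System.out.println(back); // 42
      }
    }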
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.BaseKeyGenerator; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public abstract class BaseFileUtils { + + public static BaseFileUtils getInstance(String path) { + if (path.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return new ParquetUtils(); + } else if (path.endsWith(HoodieFileFormat.ORC.getFileExtension())) { + return new OrcUtils(); + } + throw new UnsupportedOperationException("The format for file " + path + " is not supported yet."); + } + + public static BaseFileUtils getInstance(HoodieFileFormat fileFormat) { + if (HoodieFileFormat.PARQUET.equals(fileFormat)) { + return new ParquetUtils(); + } else if (HoodieFileFormat.ORC.equals(fileFormat)) { + return new OrcUtils(); + } + throw new UnsupportedOperationException(fileFormat.name() + " format not supported yet."); + } + + public static BaseFileUtils getInstance(HoodieTableMetaClient metaClient) { + return getInstance(metaClient.getTableConfig().getBaseFileFormat()); + } + + /** + * Read the rowKey list from the given data file. + * @param filePath The data file path + * @param configuration configuration to build fs object + * @return Set Set of row keys + */ + public Set readRowKeys(Configuration configuration, Path filePath) { + return filterRowKeys(configuration, filePath, new HashSet<>()); + } + + /** + * Read the bloom filter from the metadata of the given data file. 
+ * @param configuration Configuration + * @param filePath The data file path + * @return a BloomFilter object + */ + public BloomFilter readBloomFilterFromMetadata(Configuration configuration, Path filePath) { + Map footerVals = + readFooter(configuration, false, filePath, + HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, + HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, + HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE); + String footerVal = footerVals.get(HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY); + if (null == footerVal) { + // We use old style key "com.uber.hoodie.bloomfilter" + footerVal = footerVals.get(HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY); + } + BloomFilter toReturn = null; + if (footerVal != null) { + if (footerVals.containsKey(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)) { + toReturn = BloomFilterFactory.fromString(footerVal, + footerVals.get(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)); + } else { + toReturn = BloomFilterFactory.fromString(footerVal, BloomFilterTypeCode.SIMPLE.name()); + } + } + return toReturn; + } + + /** + * Read the min and max record key from the metadata of the given data file. + * @param configuration Configuration + * @param filePath The data file path + * @return A array of two string where the first is min record key and the second is max record key + */ + public String[] readMinMaxRecordKeys(Configuration configuration, Path filePath) { + Map minMaxKeys = readFooter(configuration, true, filePath, + HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER); + if (minMaxKeys.size() != 2) { + throw new HoodieException( + String.format("Could not read min/max record key out of footer correctly from %s. read) : %s", + filePath, minMaxKeys)); + } + return new String[] {minMaxKeys.get(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER), + minMaxKeys.get(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)}; + } + + /** + * Read the data file + * NOTE: This literally reads the entire file contents, thus should be used with caution. + * @param configuration Configuration + * @param filePath The data file path + * @return A list of GenericRecord + */ + public abstract List readAvroRecords(Configuration configuration, Path filePath); + + /** + * Read the data file using the given schema + * NOTE: This literally reads the entire file contents, thus should be used with caution. + * @param configuration Configuration + * @param filePath The data file path + * @return A list of GenericRecord + */ + public abstract List readAvroRecords(Configuration configuration, Path filePath, Schema schema); + + /** + * Read the footer data of the given data file. + * @param configuration Configuration + * @param required require the footer data to be in data file + * @param filePath The data file path + * @param footerNames The footer names to read + * @return A map where the key is the footer name and the value is the footer value + */ + public abstract Map readFooter(Configuration configuration, boolean required, Path filePath, + String... footerNames); + + /** + * Returns the number of records in the data file. + * @param configuration Configuration + * @param filePath The data file path + */ + public abstract long getRowCount(Configuration configuration, Path filePath); + + /** + * Read the rowKey list matching the given filter, from the given data file. 
+ * If the filter is empty, then this will return all the row keys. + * @param filePath The data file path + * @param configuration configuration to build fs object + * @param filter record keys filter + * @return Set Set of row keys matching candidateRecordKeys + */ + public abstract Set filterRowKeys(Configuration configuration, Path filePath, Set filter); + + /** + * Fetch {@link HoodieKey}s from the given data file. + * @param configuration configuration to build fs object + * @param filePath The data file path + * @return {@link List} of {@link HoodieKey}s fetched from the data file + */ + public abstract List fetchHoodieKeys(Configuration configuration, Path filePath); + + /** + * Provides a closable iterator for reading the given data file. + * @param configuration configuration to build fs object + * @param filePath The data file path + * @param keyGeneratorOpt instance of KeyGenerator. + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file + */ + public abstract ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath, Option keyGeneratorOpt); + + /** + * Provides a closable iterator for reading the given data file. + * @param configuration configuration to build fs object + * @param filePath The data file path + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file + */ + public abstract ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath); + + /** + * Fetch {@link HoodieKey}s from the given data file. + * @param configuration configuration to build fs object + * @param filePath The data file path + * @param keyGeneratorOpt instance of KeyGenerator. + * @return {@link List} of {@link HoodieKey}s fetched from the data file + */ + public abstract List fetchHoodieKeys(Configuration configuration, Path filePath, Option keyGeneratorOpt); + + /** + * Read the Avro schema of the data file. + * @param configuration Configuration + * @param filePath The data file path + * @return The Avro schema of the data file + */ + public abstract Schema readAvroSchema(Configuration configuration, Path filePath); + + /** + * @return The subclass's {@link HoodieFileFormat}. + */ + public abstract HoodieFileFormat getFormat(); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/BinaryUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/BinaryUtil.java new file mode 100644 index 0000000000000..9d8f6c8e90cf3 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/BinaryUtil.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
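BaseFileUtils above is a small factory over the Parquet and ORC readers, dispatched on the base-file extension (or the table's HoodieFileFormat), with footer-backed helpers for bloom filters and min/max record keys. A hedged usage sketch; the file path is illustrative:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;

    import org.apache.hudi.common.bloom.BloomFilter;
    import org.apache.hudi.common.util.BaseFileUtils;

    public class BaseFileUtilsExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        Path baseFile = new Path("/tmp/hudi/2021/03/01/some-base-file.parquet"); // illustrative

        // getInstance picks ParquetUtils or OrcUtils from the file extension.
        BaseFileUtils fileUtils = BaseFileUtils.getInstance(baseFile.toString());

        // Footer-backed metadata reads declared above.
        String[] minMax = fileUtils.readMinMaxRecordKeys(conf, baseFile);
        BloomFilter bloomFilter = fileUtils.readBloomFilterFromMetadata(conf, baseFile);

        System.out.println("min=" + minMax[0] + ", max=" + minMax[1]
            + ", hasBloomFilter=" + (bloomFilter != null));
      }
    }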
+ */ + +package org.apache.hudi.common.util; + +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.zip.CRC32; + +public class BinaryUtil { + + /** + * Lexicographically compare two arrays. + * copy from hbase + * @param buffer1 left operand + * @param buffer2 right operand + * @param offset1 Where to start comparing in the left buffer + * @param offset2 Where to start comparing in the right buffer + * @param length1 How much to compare from the left buffer + * @param length2 How much to compare from the right buffer + * @return 0 if equal, < 0 if left is less than right, etc. + */ + public static int compareTo(byte[] buffer1, int offset1, int length1, + byte[] buffer2, int offset2, int length2) { + // Short circuit equal case + if (buffer1 == buffer2 + && offset1 == offset2 + && length1 == length2) { + return 0; + } + // Bring WritableComparator code local + int end1 = offset1 + length1; + int end2 = offset2 + length2; + for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) { + int a = (buffer1[i] & 0xff); + int b = (buffer2[j] & 0xff); + if (a != b) { + return a - b; + } + } + return length1 - length2; + } + + public static byte[] paddingTo8Byte(byte[] a) { + if (a.length == 8) { + return a; + } + if (a.length > 8) { + byte[] result = new byte[8]; + System.arraycopy(a, 0, result, 0, 8); + return result; + } + int paddingSize = 8 - a.length; + byte[] result = new byte[8]; + for (int i = 0; i < paddingSize; i++) { + result[i] = 0; + } + System.arraycopy(a, 0, result, paddingSize, a.length); + + return result; + } + + /** + * Interleaving array bytes. + * Interleaving means take one bit from the first matrix element, one bit + * from the next, etc, then take the second bit from the first matrix + * element, second bit from the second, all the way to the last bit of the + * last element. 
Combine those bits in that order into a single BigInteger, + * @param buffer candidate element to do interleaving + * @return byte size of candidate element + */ + public static byte[] interleaving(byte[][] buffer, int size) { + int candidateSize = buffer.length; + byte[] result = new byte[size * candidateSize]; + int resBitPos = 0; + int totalBits = size * 8; + for (int bitStep = 0; bitStep < totalBits; bitStep++) { + int currentBytePos = (int) Math.floor(bitStep / 8); + int currentBitPos = bitStep % 8; + + for (int i = 0; i < candidateSize; i++) { + int tempResBytePos = (int) Math.floor(resBitPos / 8); + int tempResBitPos = resBitPos % 8; + result[tempResBytePos] = updatePos(result[tempResBytePos], tempResBitPos, buffer[i][currentBytePos], currentBitPos); + resBitPos++; + } + } + return result; + } + + public static byte updatePos(byte a, int apos, byte b, int bpos) { + byte temp = (byte) (b & (1 << (7 - bpos))); + if (apos < bpos) { + temp = (byte) (temp << (bpos - apos)); + } + if (apos > bpos) { + temp = (byte) (temp >> (apos - bpos)); + } + byte atemp = (byte) (a & (1 << (7 - apos))); + if ((byte) (atemp ^ temp) == 0) { + return a; + } + return (byte) (a ^ (1 << (7 - apos))); + } + + /** + * Copies {@link ByteBuffer} into allocated {@code byte[]} array + */ + public static byte[] toBytes(ByteBuffer buffer) { + byte[] bytes = new byte[buffer.remaining()]; + buffer.get(bytes); + return bytes; + } + + public static byte[] toBytes(int val) { + byte[] b = new byte[4]; + for (int i = 3; i > 0; i--) { + b[i] = (byte) val; + val >>>= 8; + } + b[0] = (byte) val; + return b; + } + + public static byte[] toBytes(long val) { + long temp = val; + byte[] b = new byte[8]; + for (int i = 7; i > 0; i--) { + b[i] = (byte) temp; + temp >>>= 8; + } + b[0] = (byte) temp; + return b; + } + + public static byte[] toBytes(final double d) { + return toBytes(Double.doubleToRawLongBits(d)); + } + + public static byte[] intTo8Byte(int a) { + int temp = a; + temp = temp ^ (1 << 31); + return paddingTo8Byte(toBytes(temp)); + } + + public static byte[] byteTo8Byte(byte a) { + return paddingTo8Byte(new byte[] { a }); + } + + public static byte[] longTo8Byte(long a) { + long temp = a; + temp = temp ^ (1L << 63); + return toBytes(temp); + } + + public static byte[] doubleTo8Byte(double a) { + byte[] temp = toBytes(a); + if (a > 0) { + temp[0] = (byte) (temp[0] ^ (1 << 7)); + } + if (a < 0) { + for (int i = 0; i < temp.length; i++) { + temp[i] = (byte) ~temp[i]; + } + } + return temp; + } + + public static byte[] utf8To8Byte(String a) { + return paddingTo8Byte(a.getBytes(Charset.forName("utf-8"))); + } + + public static Long convertStringToLong(String a) { + byte[] bytes = utf8To8Byte(a); + return convertBytesToLong(bytes); + } + + public static long convertBytesToLong(byte[] bytes) { + byte[] paddedBytes = paddingTo8Byte(bytes); + long temp = 0L; + for (int i = 7; i >= 0; i--) { + temp = temp | (((long) paddedBytes[i] & 0xff) << (7 - i) * 8); + } + return temp; + } + + /** + * Generate a checksum for a given set of bytes. 
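Editorial aside, not part of this patch: the `BinaryUtil` helpers above are low-level building blocks for byte-ordered keys, and a minimal usage sketch may make their interplay clearer. Class and method names are taken from the hunk above; the composite-key scenario and the sample values are assumptions.

```java
// Illustrative sketch only, not part of this patch. Assumes hudi-common is on the classpath
// and composes the BinaryUtil helpers added above into a bit-interleaved composite key.
import org.apache.hudi.common.util.BinaryUtil;

public class BinaryUtilExample {
  public static void main(String[] args) {
    // Each value becomes a sign-adjusted, 8-byte big-endian array so that unsigned
    // lexicographic comparison matches the natural ordering of the original values.
    byte[] a = BinaryUtil.longTo8Byte(42L);
    byte[] b = BinaryUtil.utf8To8Byte("user_001");

    // Interleave the two 8-byte arrays bit by bit into a single 16-byte key.
    byte[] interleaved = BinaryUtil.interleaving(new byte[][] {a, b}, 8);

    // Unsigned lexicographic comparison of the padded representations.
    int cmp = BinaryUtil.compareTo(a, 0, a.length, b, 0, b.length);

    System.out.println("interleaved key length = " + interleaved.length + ", compare = " + cmp);
  }
}
```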
+ */ + public static long generateChecksum(byte[] data) { + CRC32 crc = new CRC32(); + crc.update(data); + return crc.getValue(); + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java index 6049ee30752ba..513c4fa29ed25 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java @@ -25,20 +25,30 @@ import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.common.HoodieCleanStat; import org.apache.hudi.common.model.CleanFileInfo; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator; import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataV1MigrationHandler; import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataV2MigrationHandler; import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; + public class CleanerUtils { + + private static final Logger LOG = LogManager.getLogger(CleanerUtils.class); + public static final Integer CLEAN_METADATA_VERSION_1 = CleanMetadataV1MigrationHandler.VERSION; public static final Integer CLEAN_METADATA_VERSION_2 = CleanMetadataV2MigrationHandler.VERSION; public static final Integer LATEST_CLEAN_METADATA_VERSION = CLEAN_METADATA_VERSION_2; @@ -51,27 +61,29 @@ public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime, int totalDeleted = 0; String earliestCommitToRetain = null; + String lastCompletedCommitTimestamp = ""; for (HoodieCleanStat stat : cleanStats) { HoodieCleanPartitionMetadata metadata = new HoodieCleanPartitionMetadata(stat.getPartitionPath(), stat.getPolicy().name(), - stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles()); + stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles(), stat.isPartitionDeleted()); partitionMetadataMap.put(stat.getPartitionPath(), metadata); if ((null != stat.getDeleteBootstrapBasePathPatterns()) && (!stat.getDeleteBootstrapBasePathPatterns().isEmpty())) { HoodieCleanPartitionMetadata bootstrapMetadata = new HoodieCleanPartitionMetadata(stat.getPartitionPath(), stat.getPolicy().name(), stat.getDeleteBootstrapBasePathPatterns(), stat.getSuccessDeleteBootstrapBaseFiles(), - stat.getFailedDeleteBootstrapBaseFiles()); + stat.getFailedDeleteBootstrapBaseFiles(), stat.isPartitionDeleted()); partitionBootstrapMetadataMap.put(stat.getPartitionPath(), bootstrapMetadata); } totalDeleted += stat.getSuccessDeleteFiles().size(); if (earliestCommitToRetain == null) { // This will be the same for all partitions earliestCommitToRetain = stat.getEarliestCommitToRetain(); + lastCompletedCommitTimestamp = stat.getLastCompletedCommitTimestamp(); } } - return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), totalDeleted, - earliestCommitToRetain, 
partitionMetadataMap, CLEAN_METADATA_VERSION_2, partitionBootstrapMetadataMap); + return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), totalDeleted, earliestCommitToRetain, + lastCompletedCommitTimestamp, partitionMetadataMap, CLEAN_METADATA_VERSION_2, partitionBootstrapMetadataMap); } /** @@ -112,4 +124,38 @@ public static HoodieCleanerPlan getCleanerPlan(HoodieTableMetaClient metaClient, public static List convertToHoodieCleanFileInfoList(List cleanFileInfoList) { return cleanFileInfoList.stream().map(CleanFileInfo::toHoodieFileCleanInfo).collect(Collectors.toList()); } + + /** + * Execute {@link HoodieFailedWritesCleaningPolicy} to rollback failed writes for different actions. + * @param cleaningPolicy + * @param actionType + * @param rollbackFailedWritesFunc + */ + public static void rollbackFailedWrites(HoodieFailedWritesCleaningPolicy cleaningPolicy, String actionType, + Functions.Function0 rollbackFailedWritesFunc) { + switch (actionType) { + case HoodieTimeline.CLEAN_ACTION: + if (cleaningPolicy.isEager()) { + // No need to do any special cleanup for failed operations during clean + return; + } else if (cleaningPolicy.isLazy()) { + LOG.info("Cleaned failed attempts if any"); + // Perform rollback of failed operations for all types of actions during clean + rollbackFailedWritesFunc.apply(); + return; + } + // No action needed for cleaning policy NEVER + break; + case COMMIT_ACTION: + // For any other actions, perform rollback of failed writes + if (cleaningPolicy.isEager()) { + LOG.info("Cleaned failed attempts if any"); + rollbackFailedWritesFunc.apply(); + return; + } + break; + default: + throw new IllegalArgumentException("Unsupported action type " + actionType); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ClosableIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ClosableIterator.java new file mode 100644 index 0000000000000..9e1d0c2b2b954 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ClosableIterator.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import java.util.Iterator; + +/** + * An iterator that give a chance to release resources. 
+ * + * @param The return type + */ +public interface ClosableIterator extends Iterator, AutoCloseable { + @Override + void close(); // override to not throw exception +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java index c0c88c04ac151..9d741a03f82ec 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java @@ -31,8 +31,10 @@ import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -68,27 +70,52 @@ public static Stream> getAllPendingClu .filter(Option::isPresent).map(Option::get); } - public static Option> getClusteringPlan(HoodieTableMetaClient metaClient, HoodieInstant requestedReplaceInstant) { + /** + * Get requested replace metadata from timeline. + * @param metaClient + * @param pendingReplaceInstant + * @return + * @throws IOException + */ + private static Option getRequestedReplaceMetadata(HoodieTableMetaClient metaClient, HoodieInstant pendingReplaceInstant) throws IOException { + final HoodieInstant requestedInstant; + if (!pendingReplaceInstant.isRequested()) { + // inflight replacecommit files don't have clustering plan. + // This is because replacecommit inflight can have workload profile for 'insert_overwrite'. + // Get the plan from corresponding requested instant. + requestedInstant = HoodieTimeline.getReplaceCommitRequestedInstant(pendingReplaceInstant.getTimestamp()); + } else { + requestedInstant = pendingReplaceInstant; + } + Option content = metaClient.getActiveTimeline().getInstantDetails(requestedInstant); + if (!content.isPresent() || content.get().length == 0) { + // few operations create requested file without any content. Assume these are not clustering + return Option.empty(); + } + return Option.of(TimelineMetadataUtils.deserializeRequestedReplaceMetadata(content.get())); + } + + /** + * Get Clustering plan from timeline. + * @param metaClient + * @param pendingReplaceInstant + * @return + */ + public static Option> getClusteringPlan(HoodieTableMetaClient metaClient, HoodieInstant pendingReplaceInstant) { try { - Option content = metaClient.getActiveTimeline().getInstantDetails(requestedReplaceInstant); - if (!content.isPresent() || content.get().length == 0) { - // few operations create requested file without any content. 
Assume these are not clustering - LOG.warn("No content found in requested file for instant " + requestedReplaceInstant); - return Option.empty(); - } - HoodieRequestedReplaceMetadata requestedReplaceMetadata = TimelineMetadataUtils.deserializeRequestedReplaceMetadta(content.get()); - if (WriteOperationType.CLUSTER.name().equals(requestedReplaceMetadata.getOperationType())) { - return Option.of(Pair.of(requestedReplaceInstant, requestedReplaceMetadata.getClusteringPlan())); + Option requestedReplaceMetadata = getRequestedReplaceMetadata(metaClient, pendingReplaceInstant); + if (requestedReplaceMetadata.isPresent() && WriteOperationType.CLUSTER.name().equals(requestedReplaceMetadata.get().getOperationType())) { + return Option.of(Pair.of(pendingReplaceInstant, requestedReplaceMetadata.get().getClusteringPlan())); } return Option.empty(); } catch (IOException e) { - throw new HoodieIOException("Error reading clustering plan " + requestedReplaceInstant.getTimestamp(), e); + throw new HoodieIOException("Error reading clustering plan " + pendingReplaceInstant.getTimestamp(), e); } } /** * Get filegroups to pending clustering instant mapping for all pending clustering plans. - * This includes all clustering operattions in 'requested' and 'inflight' states. + * This includes all clustering operations in 'requested' and 'inflight' states. */ public static Map getAllFileGroupsInPendingClusteringPlans( HoodieTableMetaClient metaClient) { @@ -97,7 +124,16 @@ public static Map getAllFileGroupsInPendingClu // get all filegroups in the plan getFileGroupEntriesInClusteringPlan(clusteringPlan.getLeft(), clusteringPlan.getRight())); - Map resultMap = resultStream.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + Map resultMap; + try { + resultMap = resultStream.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } catch (Exception e) { + if (e instanceof IllegalStateException && e.getMessage().contains("Duplicate key")) { + throw new HoodieException("Found duplicate file groups pending clustering. If you're running deltastreamer in continuous mode, consider adding delay using --min-sync-interval-seconds. 
" + + "Or consider setting write concurrency mode to optimistic_concurrency_control.", e); + } + throw new HoodieException("Error getting all file groups in pending clustering", e); + } LOG.info("Found " + resultMap.size() + " files in pending clustering operations"); return resultMap; } @@ -114,7 +150,11 @@ private static Stream> getFileGroupE new AbstractMap.SimpleEntry<>(entry.getLeft(), entry.getRight())); } - private static Stream getFileGroupsFromClusteringGroup(HoodieClusteringGroup group) { + public static Stream getFileGroupsFromClusteringPlan(HoodieClusteringPlan clusteringPlan) { + return clusteringPlan.getInputGroups().stream().flatMap(ClusteringUtils::getFileGroupsFromClusteringGroup); + } + + public static Stream getFileGroupsFromClusteringGroup(HoodieClusteringGroup group) { return group.getSlices().stream().map(slice -> new HoodieFileGroupId(slice.getPartitionPath(), slice.getFileId())); } @@ -135,22 +175,20 @@ public static HoodieClusteringPlan createClusteringPlan(String strategyClassName .setStrategyClassName(strategyClassName).setStrategyParams(strategyParams) .build(); - HoodieClusteringPlan plan = HoodieClusteringPlan.newBuilder() + return HoodieClusteringPlan.newBuilder() .setInputGroups(clusteringGroups) .setExtraMetadata(extraMetadata) .setStrategy(strategy) .build(); - - return plan; } private static List getFileSliceInfo(List slices) { - return slices.stream().map(slice -> new HoodieSliceInfo().newBuilder() + return slices.stream().map(slice -> HoodieSliceInfo.newBuilder() .setPartitionPath(slice.getPartitionPath()) .setFileId(slice.getFileId()) .setDataFilePath(slice.getBaseFile().map(BaseFile::getPath).orElse(null)) .setDeltaFilePaths(slice.getLogFiles().map(f -> f.getPath().getName()).collect(Collectors.toList())) - .setBootstrapFilePath(slice.getBaseFile().map(bf -> bf.getBootstrapBaseFile().map(bbf -> bbf.getPath()).orElse(null)).orElse(null)) + .setBootstrapFilePath(slice.getBaseFile().map(bf -> bf.getBootstrapBaseFile().map(BaseFile::getPath).orElse(null)).orElse(null)) .build()).collect(Collectors.toList()); } @@ -175,4 +213,14 @@ private static Map buildMetrics(List fileSlices) { metrics.put(TOTAL_LOG_FILES, (double) numLogFiles); return metrics; } + + public static List getPendingClusteringInstantTimes(HoodieTableMetaClient metaClient) { + return metaClient.getActiveTimeline().filterPendingReplaceTimeline().getInstants() + .filter(instant -> isPendingClusteringInstant(metaClient, instant)) + .collect(Collectors.toList()); + } + + public static boolean isPendingClusteringInstant(HoodieTableMetaClient metaClient, HoodieInstant instant) { + return getClusteringPlan(metaClient, instant).isPresent(); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java index 0fb238e0f76b2..90d6e6ae90fb0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java @@ -20,17 +20,170 @@ import org.apache.hudi.common.util.collection.Pair; +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Properties; import java.util.Set; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.function.BiFunction; 
import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; public class CollectionUtils { + + private static final Properties EMPTY_PROPERTIES = new Properties(); + + /** + * Returns an empty {@code Properties} instance. The props instance is a singleton, + * it should not be modified in any case. + */ + public static Properties emptyProps() { + return EMPTY_PROPERTIES; + } + + public static boolean isNullOrEmpty(Collection c) { + return Objects.isNull(c) || c.isEmpty(); + } + + public static boolean nonEmpty(Collection c) { + return !isNullOrEmpty(c); + } + + /** + * Makes a copy of provided {@link Properties} object + */ + public static Properties copy(Properties props) { + Properties copy = new Properties(); + copy.putAll(props); + return copy; + } + + /** + * Returns last element of the array of {@code T} + */ + public static T tail(T[] ts) { + checkArgument(ts.length > 0); + return ts[ts.length - 1]; + } + + /** + * Collects provided {@link Iterator} to a {@link Stream} + */ + public static Stream toStream(Iterator iterator) { + return StreamSupport.stream( + Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), + false + ); + } + + /** + * Combines provided arrays into one + */ + @SuppressWarnings("unchecked") + public static T[] combine(T[] one, T[] another) { + T[] combined = (T[]) Array.newInstance(one.getClass().getComponentType(), one.length + another.length); + System.arraycopy(one, 0, combined, 0, one.length); + System.arraycopy(another, 0, combined, one.length, another.length); + return combined; + } + + /** + * Combines provided array and an element into a new array + */ + @SuppressWarnings("unchecked") + public static T[] append(T[] array, T elem) { + T[] combined = (T[]) Array.newInstance(array.getClass().getComponentType(), array.length + 1); + System.arraycopy(array, 0, combined, 0, array.length); + combined[array.length] = elem; + return combined; + } + + + /** + * Combines provided {@link List}s into one, returning new instance of {@link ArrayList} + */ + public static List combine(List one, List another) { + ArrayList combined = new ArrayList<>(one.size() + another.size()); + combined.addAll(one); + combined.addAll(another); + return combined; + } + + /** + * Combines provided {@link Map}s into one, returning new instance of {@link HashMap}. + * + * NOTE: That values associated with overlapping keys from the second map, will override + * values from the first one + */ + public static HashMap combine(Map one, Map another) { + HashMap combined = new HashMap<>(one.size() + another.size()); + combined.putAll(one); + combined.putAll(another); + return combined; + } + + /** + * Combines provided {@link Map}s into one, returning new instance of {@link HashMap}. 
+ * + * NOTE: That values associated with overlapping keys from the second map, will override + * values from the first one + */ + public static HashMap combine(Map one, Map another, BiFunction merge) { + HashMap combined = new HashMap<>(one.size() + another.size()); + combined.putAll(one); + another.forEach((k, v) -> combined.merge(k, v, merge)); + return combined; + } + + /** + * Returns difference b/w {@code one} {@link Set} of elements and {@code another} + */ + public static Set diff(Set one, Set another) { + Set diff = new HashSet<>(one); + diff.removeAll(another); + return diff; + } + + /** + * Returns difference b/w {@code one} {@link List} of elements and {@code another} + * + * NOTE: This is less optimal counterpart to {@link #diff(Set, Set)}, accepting {@link List} + * as a holding collection to support duplicate elements use-cases + */ + public static List diff(List one, List another) { + List diff = new ArrayList<>(one); + diff.removeAll(another); + return diff; + } + + public static Stream> batchesAsStream(List list, int batchSize) { + checkArgument(batchSize > 0, "batch size must be positive."); + int total = list.size(); + if (total <= 0) { + return Stream.empty(); + } + int numFullBatches = (total - 1) / batchSize; + return IntStream.range(0, numFullBatches + 1).mapToObj( + n -> list.subList(n * batchSize, n == numFullBatches ? total : (n + 1) * batchSize)); + } + + public static List> batches(List list, int batchSize) { + return batchesAsStream(list, batchSize).collect(Collectors.toList()); + } + /** * Determines whether two iterators contain equal elements in the same order. More specifically, * this method returns {@code true} if {@code iterator1} and {@code iterator2} contain the same diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java index 7b4c7c5cad1f2..6baacf1ec88f5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java @@ -26,9 +26,14 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.avro.Schema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.IOException; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -38,9 +43,26 @@ public class CommitUtils { private static final Logger LOG = LogManager.getLogger(CommitUtils.class); + private static final String NULL_SCHEMA_STR = Schema.create(Schema.Type.NULL).toString(); + + /** + * Gets the commit action type for given write operation and table type. + * Use this API when commit action type can differ not only on the basis of table type but also write operation type. + * For example, INSERT_OVERWRITE/INSERT_OVERWRITE_TABLE operations have REPLACE commit action type. + */ + public static String getCommitActionType(WriteOperationType operation, HoodieTableType tableType) { + if (operation == WriteOperationType.INSERT_OVERWRITE || operation == WriteOperationType.INSERT_OVERWRITE_TABLE + || operation == WriteOperationType.DELETE_PARTITION) { + return HoodieTimeline.REPLACE_COMMIT_ACTION; + } else { + return getCommitActionType(tableType); + } + } /** * Gets the commit action type for given table type. 
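Editorial aside, not part of this patch: the batching helpers added to `CollectionUtils` above are easiest to grasp from a tiny usage sketch. The element values below are made up; the method names come from the hunk above.

```java
// Illustrative sketch only, not part of this patch: splitting a list into fixed-size
// batches with CollectionUtils.batches / batchesAsStream added above.
import org.apache.hudi.common.util.CollectionUtils;

import java.util.Arrays;
import java.util.List;

public class BatchingExample {
  public static void main(String[] args) {
    List<Integer> ids = Arrays.asList(1, 2, 3, 4, 5, 6, 7);

    // Eager form: [[1, 2, 3], [4, 5, 6], [7]] - the trailing batch may be smaller.
    List<List<Integer>> batches = CollectionUtils.batches(ids, 3);
    batches.forEach(System.out::println);

    // Streaming form, useful when each batch feeds another stream stage.
    long count = CollectionUtils.batchesAsStream(ids, 3).count();
    System.out.println(count + " batches");
  }
}
```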
+ * Note: Use this API only when the commit action type is not dependent on the write operation type. + * See {@link CommitUtils#getCommitActionType(WriteOperationType, HoodieTableType)} for more details. */ public static String getCommitActionType(HoodieTableType tableType) { switch (tableType) { @@ -66,7 +88,8 @@ public static HoodieCommitMetadata buildMetadata(List writeStat if (extraMetadata.isPresent()) { extraMetadata.get().forEach(commitMetadata::addMetadata); } - commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schemaToStoreInCommit); + commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, (schemaToStoreInCommit == null || schemaToStoreInCommit.equals(NULL_SCHEMA_STR)) + ? "" : schemaToStoreInCommit); commitMetadata.setOperationType(operationType); return commitMetadata; } @@ -76,7 +99,7 @@ private static HoodieCommitMetadata buildMetadataFromStats(List String commitActionType, WriteOperationType operationType) { final HoodieCommitMetadata commitMetadata; - if (commitActionType == HoodieTimeline.REPLACE_COMMIT_ACTION) { + if (HoodieTimeline.REPLACE_COMMIT_ACTION.equals(commitActionType)) { HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); replaceMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds); commitMetadata = replaceMetadata; @@ -90,7 +113,56 @@ private static HoodieCommitMetadata buildMetadataFromStats(List } LOG.info("Creating metadata for " + operationType + " numWriteStats:" + writeStats.size() - + "numReplaceFileIds:" + partitionToReplaceFileIds.values().stream().mapToInt(e -> e.size()).sum()); + + " numReplaceFileIds:" + partitionToReplaceFileIds.values().stream().mapToInt(e -> e.size()).sum()); return commitMetadata; } + + public static HashMap getFileIdWithoutSuffixAndRelativePathsFromSpecificRecord(Map> + partitionToWriteStats) { + HashMap fileIdToPath = new HashMap<>(); + // list all partitions paths + for (Map.Entry> entry : partitionToWriteStats.entrySet()) { + for (org.apache.hudi.avro.model.HoodieWriteStat stat : entry.getValue()) { + fileIdToPath.put(stat.getFileId(), stat.getPath()); + } + } + return fileIdToPath; + } + + public static HashMap getFileIdWithoutSuffixAndRelativePaths(Map> + partitionToWriteStats) { + HashMap fileIdToPath = new HashMap<>(); + // list all partitions paths + for (Map.Entry> entry : partitionToWriteStats.entrySet()) { + for (HoodieWriteStat stat : entry.getValue()) { + fileIdToPath.put(stat.getFileId(), stat.getPath()); + } + } + return fileIdToPath; + } + + /** + * Process previous commits metadata in the timeline to determine the checkpoint given a checkpoint key. + * NOTE: This is very similar in intent to DeltaSync#getLatestCommitMetadataWithValidCheckpointInfo except that + * different deployment models (deltastreamer or spark structured streaming) could have different checkpoint keys. + * + * @param timeline completed commits in active timeline. + * @param checkpointKey the checkpoint key in the extra metadata of the commit. + * @return An optional commit metadata with latest checkpoint. 
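Editorial aside, not part of this patch: a small sketch of how the operation-aware overload of `getCommitActionType` behaves, based on the logic shown in the hunk above. The expectation that a MERGE_ON_READ upsert falls back to a delta commit is standard Hudi behavior and is stated here as an assumption, not something visible in this diff.

```java
// Illustrative sketch only, not part of this patch. Enum names are the standard
// hudi-common model types referenced by the new method signature above.
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.CommitUtils;

public class CommitActionTypeExample {
  public static void main(String[] args) {
    // INSERT_OVERWRITE maps to a replace commit regardless of table type.
    System.out.println(CommitUtils.getCommitActionType(
        WriteOperationType.INSERT_OVERWRITE, HoodieTableType.MERGE_ON_READ));

    // Other operations fall back to the table-type rule (delta commit for MERGE_ON_READ).
    System.out.println(CommitUtils.getCommitActionType(
        WriteOperationType.UPSERT, HoodieTableType.MERGE_ON_READ));
  }
}
```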
+ */ + public static Option getLatestCommitMetadataWithValidCheckpointInfo(HoodieTimeline timeline, String checkpointKey) { + return (Option) timeline.getReverseOrderedInstants().map(instant -> { + try { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); + if (StringUtils.nonEmpty(commitMetadata.getMetadata(checkpointKey))) { + return Option.of(commitMetadata); + } else { + return Option.empty(); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to parse HoodieCommitMetadata for " + instant.toString(), e); + } + }).filter(Option::isPresent).findFirst().orElse(Option.empty()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java index f3e4dc62837c1..cf9b5fb3ced8c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CompactionUtils.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; @@ -59,13 +60,13 @@ public class CompactionUtils { /** * Generate compaction operation from file-slice. * - * @param partitionPath Partition path - * @param fileSlice File Slice + * @param partitionPath Partition path + * @param fileSlice File Slice * @param metricsCaptureFunction Metrics Capture function * @return Compaction Operation */ public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, FileSlice fileSlice, - Option, Map>> metricsCaptureFunction) { + Option, Map>> metricsCaptureFunction) { HoodieCompactionOperation.Builder builder = HoodieCompactionOperation.newBuilder(); builder.setPartitionPath(partitionPath); builder.setFileId(fileSlice.getFileId()); @@ -87,12 +88,12 @@ public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, * Generate compaction plan from file-slices. * * @param partitionFileSlicePairs list of partition file-slice pairs - * @param extraMetadata Extra Metadata - * @param metricsCaptureFunction Metrics Capture function + * @param extraMetadata Extra Metadata + * @param metricsCaptureFunction Metrics Capture function */ public static HoodieCompactionPlan buildFromFileSlices(List> partitionFileSlicePairs, - Option> extraMetadata, - Option, Map>> metricsCaptureFunction) { + Option> extraMetadata, + Option, Map>> metricsCaptureFunction) { HoodieCompactionPlan.Builder builder = HoodieCompactionPlan.newBuilder(); extraMetadata.ifPresent(builder::setExtraMetadata); @@ -195,10 +196,106 @@ public static Stream getPendingCompactionInstantTimes(HoodieTableMetaClient metaClient) { return metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants().collect(Collectors.toList()); } + + /** + * Returns a pair of (timeline containing the delta commits after the latest completed + * compaction commit, the completed compaction commit instant), if the latest completed + * compaction commit is present; a pair of (timeline containing all the delta commits, + * the first delta commit instant), if there is no completed compaction commit. 
+ * + * @param activeTimeline Active timeline of a table. + * @return Pair of timeline containing delta commits and an instant. + */ + public static Option> getDeltaCommitsSinceLatestCompaction( + HoodieActiveTimeline activeTimeline) { + Option lastCompaction = activeTimeline.getCommitTimeline() + .filterCompletedInstants().lastInstant(); + HoodieTimeline deltaCommits = activeTimeline.getDeltaCommitTimeline(); + + HoodieInstant latestInstant; + if (lastCompaction.isPresent()) { + latestInstant = lastCompaction.get(); + // timeline containing the delta commits after the latest completed compaction commit, + // and the completed compaction commit instant + return Option.of(Pair.of(deltaCommits.findInstantsAfter( + latestInstant.getTimestamp(), Integer.MAX_VALUE), lastCompaction.get())); + } else { + if (deltaCommits.countInstants() > 0) { + latestInstant = deltaCommits.firstInstant().get(); + // timeline containing all the delta commits, and the first delta commit instant + return Option.of(Pair.of(deltaCommits.findInstantsAfterOrEquals( + latestInstant.getTimestamp(), Integer.MAX_VALUE), latestInstant)); + } else { + return Option.empty(); + } + } + } + + public static Option> getDeltaCommitsSinceLatestCompactionRequest( + HoodieActiveTimeline activeTimeline) { + Option lastCompaction = activeTimeline.getCommitTimeline() + .filterCompletedInstants().lastInstant(); + Option lastRequestCompaction = activeTimeline.getAllCommitsTimeline() + .filterPendingCompactionTimeline().lastInstant(); + if (lastRequestCompaction.isPresent()) { + lastCompaction = lastRequestCompaction; + } + HoodieTimeline deltaCommits = activeTimeline.getDeltaCommitTimeline(); + + HoodieInstant latestInstant; + if (lastCompaction.isPresent()) { + latestInstant = lastCompaction.get(); + // timeline containing the delta commits after the latest completed compaction commit, + // and the completed compaction commit instant + return Option.of(Pair.of(deltaCommits.findInstantsAfter( + latestInstant.getTimestamp(), Integer.MAX_VALUE), lastCompaction.get())); + } else { + if (deltaCommits.countInstants() > 0) { + latestInstant = deltaCommits.firstInstant().get(); + // timeline containing all the delta commits, and the first delta commit instant + return Option.of(Pair.of(deltaCommits.findInstantsAfterOrEquals( + latestInstant.getTimestamp(), Integer.MAX_VALUE), latestInstant)); + } else { + return Option.empty(); + } + } + } + + /** + * Gets the oldest instant to retain for MOR compaction. + * If there is no completed compaction, + * num delta commits >= "hoodie.compact.inline.max.delta.commits" + * If there is a completed compaction, + * num delta commits after latest completed compaction >= "hoodie.compact.inline.max.delta.commits" + * + * @param activeTimeline Active timeline of a table. + * @param maxDeltaCommits Maximum number of delta commits that trigger the compaction plan, + * i.e., "hoodie.compact.inline.max.delta.commits". + * @return the oldest instant to keep for MOR compaction. 
+ */ + public static Option getOldestInstantToRetainForCompaction( + HoodieActiveTimeline activeTimeline, int maxDeltaCommits) { + Option> deltaCommitsInfoOption = + CompactionUtils.getDeltaCommitsSinceLatestCompaction(activeTimeline); + if (deltaCommitsInfoOption.isPresent()) { + Pair deltaCommitsInfo = deltaCommitsInfoOption.get(); + HoodieTimeline deltaCommitTimeline = deltaCommitsInfo.getLeft(); + int numDeltaCommits = deltaCommitTimeline.countInstants(); + if (numDeltaCommits < maxDeltaCommits) { + return Option.of(deltaCommitsInfo.getRight()); + } else { + // delta commits with the last one to keep + List instants = deltaCommitTimeline.getInstants() + .limit(numDeltaCommits - maxDeltaCommits + 1).collect(Collectors.toList()); + return Option.of(instants.get(instants.size() - 1)); + } + } + return Option.empty(); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CustomizedThreadFactory.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CustomizedThreadFactory.java new file mode 100644 index 0000000000000..738be514b2cbf --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CustomizedThreadFactory.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import org.jetbrains.annotations.NotNull; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.atomic.AtomicLong; + +/** + * A thread factory for creation of threads + */ +public class CustomizedThreadFactory implements ThreadFactory { + + private static final AtomicLong POOL_NUM = new AtomicLong(1); + private final AtomicLong threadNum = new AtomicLong(1); + + private final String threadName; + private final boolean daemon; + + public CustomizedThreadFactory() { + this("pool-" + POOL_NUM.getAndIncrement(), false); + } + + public CustomizedThreadFactory(String threadNamePrefix) { + this(threadNamePrefix, false); + } + + public CustomizedThreadFactory(String threadNamePrefix, boolean daemon) { + this.threadName = threadNamePrefix + "-thread-"; + this.daemon = daemon; + } + + @Override + public Thread newThread(@NotNull Runnable r) { + Thread runThread = new Thread(r); + runThread.setDaemon(daemon); + runThread.setName(threadName + threadNum.getAndIncrement()); + return runThread; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/DateTimeUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/DateTimeUtils.java new file mode 100644 index 0000000000000..cf90eff8d6185 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/DateTimeUtils.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
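Editorial aside, not part of this patch: `CustomizedThreadFactory` above plugs directly into the JDK executor APIs; the sketch below shows the intended wiring. The pool size, prefix, and timeout are arbitrary example values.

```java
// Illustrative sketch only, not part of this patch: giving executor worker threads a
// recognizable name prefix and daemon status via the factory added above.
import org.apache.hudi.common.util.CustomizedThreadFactory;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ThreadFactoryExample {
  public static void main(String[] args) throws InterruptedException {
    ExecutorService pool =
        Executors.newFixedThreadPool(4, new CustomizedThreadFactory("hoodie-example", true));

    // Worker threads are named "<prefix>-thread-<n>", e.g. "hoodie-example-thread-1".
    pool.submit(() -> System.out.println(Thread.currentThread().getName()));

    pool.shutdown();
    pool.awaitTermination(10, TimeUnit.SECONDS);
  }
}
```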
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util; + +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +public class DateTimeUtils { + private static final Map LABEL_TO_UNIT_MAP = + Collections.unmodifiableMap(initMap()); + + /** + * Converts provided microseconds (from epoch) to {@link Instant} + */ + public static Instant microsToInstant(long microsFromEpoch) { + long epochSeconds = microsFromEpoch / (1_000_000L); + long nanoAdjustment = (microsFromEpoch % (1_000_000L)) * 1_000L; + + return Instant.ofEpochSecond(epochSeconds, nanoAdjustment); + } + + /** + * Converts provided {@link Instant} to microseconds (from epoch) + */ + public static long instantToMicros(Instant instant) { + long seconds = instant.getEpochSecond(); + int nanos = instant.getNano(); + + if (seconds < 0 && nanos > 0) { + long micros = Math.multiplyExact(seconds + 1, 1_000_000L); + long adjustment = (nanos / 1_000L) - 1_000_000; + + return Math.addExact(micros, adjustment); + } else { + long micros = Math.multiplyExact(seconds, 1_000_000L); + + return Math.addExact(micros, nanos / 1_000L); + } + } + + /** + * Parse input String to a {@link java.time.Instant}. + * + * @param s Input String should be Epoch time in millisecond or ISO-8601 format. + */ + public static Instant parseDateTime(String s) throws DateTimeParseException { + ValidationUtils.checkArgument(Objects.nonNull(s), "Input String cannot be null."); + try { + return Instant.ofEpochMilli(Long.parseLong(s)); + } catch (NumberFormatException e) { + return Instant.parse(s); + } + } + + /** + * Parse the given string to a java {@link Duration}. The string is in format "{length + * value}{time unit label}", e.g. "123ms", "321 s". If no time unit label is specified, it will + * be considered as milliseconds. + * + *


+   * <p>Supported time unit labels are:
+   * <ul>
+   *   <li>DAYS: "d", "day"</li>
+   *   <li>HOURS: "h", "hour"</li>
+   *   <li>MINUTES: "min", "minute"</li>
+   *   <li>SECONDS: "s", "sec", "second"</li>
+   *   <li>MILLISECONDS: "ms", "milli", "millisecond"</li>
+   *   <li>MICROSECONDS: "µs", "micro", "microsecond"</li>
+   *   <li>NANOSECONDS: "ns", "nano", "nanosecond"</li>
+   * </ul>
    + * + * @param text string to parse. + */ + public static Duration parseDuration(String text) { + ValidationUtils.checkArgument(!StringUtils.isNullOrEmpty(text)); + + final String trimmed = text.trim(); + ValidationUtils.checkArgument(!trimmed.isEmpty(), "argument is an empty- or whitespace-only string"); + + final int len = trimmed.length(); + int pos = 0; + + char current; + while (pos < len && (current = trimmed.charAt(pos)) >= '0' && current <= '9') { + pos++; + } + + final String number = trimmed.substring(0, pos); + final String unitLabel = trimmed.substring(pos).trim().toLowerCase(Locale.US); + + if (number.isEmpty()) { + throw new NumberFormatException("text does not start with a number"); + } + + final long value; + try { + value = Long.parseLong(number); // this throws a NumberFormatException on overflow + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "The value '" + + number + + "' cannot be re represented as 64bit number (numeric overflow)."); + } + + if (unitLabel.isEmpty()) { + return Duration.of(value, ChronoUnit.MILLIS); + } + + ChronoUnit unit = LABEL_TO_UNIT_MAP.get(unitLabel); + if (unit != null) { + return Duration.of(value, unit); + } else { + throw new IllegalArgumentException( + "Time interval unit label '" + + unitLabel + + "' does not match any of the recognized units: " + + TimeUnit.getAllUnits()); + } + } + + private static Map initMap() { + Map labelToUnit = new HashMap<>(); + for (TimeUnit timeUnit : TimeUnit.values()) { + for (String label : timeUnit.getLabels()) { + labelToUnit.put(label, timeUnit.getUnit()); + } + } + return labelToUnit; + } + + /** + * Convert UNIX_TIMESTAMP to string in given format. + * + * @param unixTimestamp UNIX_TIMESTAMP + * @param timeFormat string time format + */ + public static String formatUnixTimestamp(long unixTimestamp, String timeFormat) { + ValidationUtils.checkArgument(!StringUtils.isNullOrEmpty(timeFormat)); + DateTimeFormatter dtf = DateTimeFormatter.ofPattern(timeFormat); + return LocalDateTime + .ofInstant(Instant.ofEpochSecond(unixTimestamp), ZoneId.systemDefault()) + .format(dtf); + } + + /** + * Enum which defines time unit, mostly used to parse value from configuration file. + */ + private enum TimeUnit { + DAYS(ChronoUnit.DAYS, singular("d"), plural("day")), + HOURS(ChronoUnit.HOURS, singular("h"), plural("hour")), + MINUTES(ChronoUnit.MINUTES, singular("min"), plural("minute")), + SECONDS(ChronoUnit.SECONDS, singular("s"), plural("sec"), plural("second")), + MILLISECONDS(ChronoUnit.MILLIS, singular("ms"), plural("milli"), plural("millisecond")), + MICROSECONDS(ChronoUnit.MICROS, singular("µs"), plural("micro"), plural("microsecond")), + NANOSECONDS(ChronoUnit.NANOS, singular("ns"), plural("nano"), plural("nanosecond")); + + private static final String PLURAL_SUFFIX = "s"; + + private final List labels; + + private final ChronoUnit unit; + + TimeUnit(ChronoUnit unit, String[]... 
labels) { + this.unit = unit; + this.labels = + Arrays.stream(labels) + .flatMap(Arrays::stream) + .collect(Collectors.toList()); + } + + /** + * @param label the original label + * @return the singular format of the original label + */ + private static String[] singular(String label) { + return new String[] {label}; + } + + /** + * @param label the original label + * @return both the singular format and plural format of the original label + */ + private static String[] plural(String label) { + return new String[] {label, label + PLURAL_SUFFIX}; + } + + public List getLabels() { + return labels; + } + + public ChronoUnit getUnit() { + return unit; + } + + public static String getAllUnits() { + return Arrays.stream(TimeUnit.values()) + .map(TimeUnit::createTimeUnitString) + .collect(Collectors.joining(", ")); + } + + private static String createTimeUnitString(TimeUnit timeUnit) { + return timeUnit.name() + ": (" + String.join(" | ", timeUnit.getLabels()) + ")"; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java index 0e6d78f1f224d..8032a04df4647 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java @@ -18,12 +18,14 @@ package org.apache.hudi.common.util; +import java.io.Serializable; + /** * Default implementation of size-estimator that uses Twitter's ObjectSizeCalculator. * * @param */ -public class DefaultSizeEstimator implements SizeEstimator { +public class DefaultSizeEstimator implements SizeEstimator, Serializable { @Override public long sizeEstimate(T t) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Either.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Either.java new file mode 100644 index 0000000000000..fb624c6075349 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Either.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import javax.annotation.Nonnull; + +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; + +/** + * Utility that could hold exclusively only either of (hence the name): + *
+ * <ul>
+ *   <li>Non-null value of type {@link L}</li>
+ *   <li>Non-null value of type {@link R}</li>
+ * </ul>
    + * + * @param type of the "left" potential element + * @param type of the "right" potential element + */ +public abstract class Either { + + @Nonnull + protected abstract Object getValue(); + + public final boolean isLeft() { + return this instanceof EitherLeft; + } + + public final boolean isRight() { + return this instanceof EitherRight; + } + + public R asRight() { + ValidationUtils.checkArgument(isRight(), "Trying to access non-existent value of Either"); + EitherRight right = unsafeCast(this); + return right.getValue(); + } + + public L asLeft() { + ValidationUtils.checkArgument(isLeft(), "Trying to access non-existent value of Either"); + EitherLeft left = unsafeCast(this); + return left.getValue(); + } + + public static Either right(R right) { + return new EitherRight<>(right); + } + + public static Either left(L left) { + return new EitherLeft<>(left); + } + + public static class EitherRight extends Either { + private final R value; + private EitherRight(@Nonnull R right) { + this.value = right; + } + + @Nonnull + @Override + protected R getValue() { + return value; + } + } + + public static class EitherLeft extends Either { + private final L value; + private EitherLeft(@Nonnull L value) { + this.value = value; + } + + @Nonnull + @Override + protected L getValue() { + return value; + } + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java index f1095b6845c0f..426a703503328 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/FileIOUtils.java @@ -18,23 +18,37 @@ package org.apache.hudi.common.util; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.BufferedReader; import java.io.ByteArrayOutputStream; +import java.io.Closeable; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; /** * Bunch of utility methods for working with files and byte streams. */ public class FileIOUtils { - + public static final Logger LOG = LogManager.getLogger(FileIOUtils.class); public static final long KB = 1024; public static void deleteDirectory(File directory) throws IOException { @@ -67,6 +81,20 @@ public static String readAsUTFString(InputStream input, int length) throws IOExc return new String(bos.toByteArray(), StandardCharsets.UTF_8); } + /** + * Reads the input stream into String lines. + * + * @param input {@code InputStream} instance. + * @return String lines in a list. 
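Editorial aside, not part of this patch: the `Either` container added above is typically used to return either a result or an error description without throwing. The parsing scenario below is invented; only the `Either` API itself comes from the hunk above.

```java
// Illustrative sketch only, not part of this patch: returning a value or an error
// message through the Either type added above.
import org.apache.hudi.common.util.Either;

public class EitherExample {

  static Either<String, Integer> parsePositiveInt(String raw) {
    try {
      int value = Integer.parseInt(raw);
      if (value <= 0) {
        return Either.left("not positive: " + raw);
      }
      return Either.right(value);
    } catch (NumberFormatException e) {
      return Either.left("not a number: " + raw);
    }
  }

  public static void main(String[] args) {
    Either<String, Integer> ok = parsePositiveInt("42");
    Either<String, Integer> bad = parsePositiveInt("abc");

    System.out.println(ok.isRight() ? ok.asRight() : ok.asLeft());    // 42
    System.out.println(bad.isRight() ? bad.asRight() : bad.asLeft()); // not a number: abc
  }
}
```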
+ */ + public static List readAsUTFStringLines(InputStream input) { + List lines = new ArrayList<>(); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8)); + lines = bufferedReader.lines().collect(Collectors.toList()); + closeQuietly(bufferedReader); + return lines; + } + public static void copy(InputStream inputStream, OutputStream outputStream) throws IOException { byte[] buffer = new byte[1024]; int len; @@ -75,6 +103,31 @@ public static void copy(InputStream inputStream, OutputStream outputStream) thro } } + /** + * Copies the file content from source path to destination path. + * + * @param fileSystem {@link FileSystem} instance. + * @param sourceFilePath Source file path. + * @param destFilePath Destination file path. + */ + public static void copy( + FileSystem fileSystem, org.apache.hadoop.fs.Path sourceFilePath, + org.apache.hadoop.fs.Path destFilePath) { + FSDataInputStream fsDataInputStream = null; + FSDataOutputStream fsDataOutputStream = null; + try { + fsDataInputStream = fileSystem.open(sourceFilePath); + fsDataOutputStream = fileSystem.create(destFilePath, false); + copy(fsDataInputStream, fsDataOutputStream); + } catch (IOException e) { + throw new HoodieIOException(String.format("Cannot copy from %s to %s", + sourceFilePath.toString(), destFilePath.toString()), e); + } finally { + closeQuietly(fsDataInputStream); + closeQuietly(fsDataOutputStream); + } + } + public static byte[] readAsByteArray(InputStream input) throws IOException { return readAsByteArray(input, 128); } @@ -91,4 +144,100 @@ public static void writeStringToFile(String str, String filePath) throws IOExcep out.flush(); out.close(); } + + /** + * Closes {@code Closeable} quietly. + * + * @param closeable {@code Closeable} to close + */ + public static void closeQuietly(Closeable closeable) { + if (closeable == null) { + return; + } + try { + closeable.close(); + } catch (IOException e) { + LOG.warn("IOException during close", e); + } + } + + public static void createFileInPath(FileSystem fileSystem, org.apache.hadoop.fs.Path fullPath, Option content, boolean ignoreIOE) { + try { + // If the path does not exist, create it first + if (!fileSystem.exists(fullPath)) { + if (fileSystem.createNewFile(fullPath)) { + LOG.info("Created a new file in meta path: " + fullPath); + } else { + throw new HoodieIOException("Failed to create file " + fullPath); + } + } + + if (content.isPresent()) { + FSDataOutputStream fsout = fileSystem.create(fullPath, true); + fsout.write(content.get()); + fsout.close(); + } + } catch (IOException e) { + LOG.warn("Failed to create file " + fullPath, e); + if (!ignoreIOE) { + throw new HoodieIOException("Failed to create file " + fullPath, e); + } + } + } + + public static void createFileInPath(FileSystem fileSystem, org.apache.hadoop.fs.Path fullPath, Option content) { + createFileInPath(fileSystem, fullPath, content, false); + } + + public static Option readDataFromPath(FileSystem fileSystem, org.apache.hadoop.fs.Path detailPath, boolean ignoreIOE) { + try (FSDataInputStream is = fileSystem.open(detailPath)) { + return Option.of(FileIOUtils.readAsByteArray(is)); + } catch (IOException e) { + LOG.warn("Could not read commit details from " + detailPath, e); + if (!ignoreIOE) { + throw new HoodieIOException("Could not read commit details from " + detailPath, e); + } + return Option.empty(); + } + } + + public static Option readDataFromPath(FileSystem fileSystem, org.apache.hadoop.fs.Path detailPath) { + return 
readDataFromPath(fileSystem, detailPath, false); + } + + /** + * Return the configured local directories where hudi can write files. This + * method does not create any directories on its own, it only encapsulates the + * logic of locating the local directories according to deployment mode. + */ + public static String[] getConfiguredLocalDirs() { + if (isRunningInYarnContainer()) { + // If we are in yarn mode, systems can have different disk layouts so we must set it + // to what Yarn on this system said was available. Note this assumes that Yarn has + // created the directories already, and that they are secured so that only the + // user has access to them. + return getYarnLocalDirs().split(","); + } else if (System.getProperty("java.io.tmpdir") != null) { + return System.getProperty("java.io.tmpdir").split(","); + } else { + return null; + } + } + + private static boolean isRunningInYarnContainer() { + // These environment variables are set by YARN. + return System.getenv("CONTAINER_ID") != null; + } + + /** + * Get the Yarn approved local directories. + */ + private static String getYarnLocalDirs() { + String localDirs = Option.of(System.getenv("LOCAL_DIRS")).orElse(""); + + if (localDirs.isEmpty()) { + throw new HoodieIOException("Yarn Local dirs can't be empty"); + } + return localDirs; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java index 3ec96be207330..728ac717e4cd5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java @@ -25,31 +25,36 @@ */ public interface Functions { + static Runnable noop() { + return () -> { + }; + } + /** * A function which has not any parameter. */ - public interface Function0 extends Serializable { + interface Function0 extends Serializable { R apply(); } /** * A function which contains only one parameter. */ - public interface Function1 extends Serializable { + interface Function1 extends Serializable { R apply(T1 val1); } /** * A function which contains two parameters. */ - public interface Function2 extends Serializable { + interface Function2 extends Serializable { R apply(T1 val1, T2 val2); } /** * A function which contains three parameters. */ - public interface Function3 extends Serializable { + interface Function3 extends Serializable { R apply(T1 val1, T2 val2, T3 val3); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/FutureUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/FutureUtils.java new file mode 100644 index 0000000000000..b0029917eebdf --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/FutureUtils.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import javax.annotation.Nonnull; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; + +/** + * A utility class for future operation. + */ +public class FutureUtils { + + /** + * Parallel CompletableFutures + * + * @param futures CompletableFuture list + * @return a new CompletableFuture which will completed when all of the given CompletableFutures complete. + */ + public static CompletableFuture> allOf(@Nonnull List> futures) { + return CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])) + .thenApply(aVoid -> + futures.stream() + // NOTE: This join wouldn't block, since all the + // futures are completed at this point. + .map(CompletableFuture::join) + .collect(Collectors.toList())); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java index 78f7b1b9bb105..88a0f70cd831c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java @@ -26,12 +26,14 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.Serializable; + /** * Size Estimator for Hoodie record payload. * * @param */ -public class HoodieRecordSizeEstimator implements SizeEstimator> { +public class HoodieRecordSizeEstimator implements SizeEstimator>, Serializable { private static final Logger LOG = LogManager.getLogger(HoodieRecordSizeEstimator.class); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java index 0ccc7ca110a3a..a0a8ca0867e93 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieTimer.java @@ -30,7 +30,21 @@ public class HoodieTimer { // Ordered stack of TimeInfo's to make sure stopping the timer returns the correct elapsed time - Deque timeInfoDeque = new ArrayDeque<>(); + private final Deque timeInfoDeque = new ArrayDeque<>(); + + /** + * @deprecated please use either {@link HoodieTimer#start} or {@link HoodieTimer#create} APIs + */ + @Deprecated + public HoodieTimer() { + this(false); + } + + private HoodieTimer(boolean shouldStart) { + if (shouldStart) { + startTimer(); + } + } static class TimeInfo { @@ -69,4 +83,12 @@ public long endTimer() { } return timeInfoDeque.pop().stop(); } + + public static HoodieTimer start() { + return new HoodieTimer(true); + } + + public static HoodieTimer create() { + return new HoodieTimer(false); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/InternalSchemaCache.java b/hudi-common/src/main/java/org/apache/hudi/common/util/InternalSchemaCache.java new file mode 100644 index 0000000000000..1d2786197780c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/InternalSchemaCache.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; +import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; +import org.apache.hudi.internal.schema.utils.SerDeHelper; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.TreeMap; +import java.util.stream.Collectors; + +public class InternalSchemaCache { + private static final Logger LOG = LogManager.getLogger(InternalSchemaCache.class); + // Use segment lock to reduce competition. + // the lock size should be powers of 2 for better hash. + private static Object[] lockList = new Object[16]; + + static { + for (int i = 0; i < lockList.length; i++) { + lockList[i] = new Object(); + } + } + + // historySchemas cache maintain a map about (tablePath, HistorySchemas). + // this is a Global cache, all threads in one container/executor share the same cache. + private static final Cache> + HISTORICAL_SCHEMA_CACHE = Caffeine.newBuilder().maximumSize(1000).weakValues().build(); + + /** + * Search internalSchema based on versionID. + * first step: try to get internalSchema from hoodie commit files, we no need to add lock. + * if we cannot get internalSchema by first step, then we try to get internalSchema from cache. + * + * @param versionID schema version_id need to search + * @param metaClient current hoodie metaClient + * @return internalSchema + */ + public static InternalSchema searchSchemaAndCache(long versionID, HoodieTableMetaClient metaClient, boolean cacheEnable) { + Option candidateSchema = getSchemaByReadingCommitFile(versionID, metaClient); + if (candidateSchema.isPresent()) { + return candidateSchema.get(); + } + if (!cacheEnable) { + // parse history schema and return directly + return InternalSchemaUtils.searchSchema(versionID, getHistoricalSchemas(metaClient)); + } + String tablePath = metaClient.getBasePath(); + // use segment lock to reduce competition. 
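Stepping back to the FutureUtils.allOf helper introduced a few hunks above, a small usage sketch, assuming the erased signature takes a List of CompletableFuture<T> and returns CompletableFuture<List<T>> (the generics were lost in this hunk); the class name and values are illustrative.

import org.apache.hudi.common.util.FutureUtils;

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CompletableFuture;

public class FutureUtilsSketch {
  public static void main(String[] args) {
    // Two independent async computations.
    CompletableFuture<Integer> f1 = CompletableFuture.supplyAsync(() -> 1);
    CompletableFuture<Integer> f2 = CompletableFuture.supplyAsync(() -> 2);

    // allOf flips List<CompletableFuture<T>> into CompletableFuture<List<T>>;
    // the join() below blocks only until both inputs have completed.
    List<Integer> results = FutureUtils.allOf(Arrays.asList(f1, f2)).join();
    System.out.println(results); // [1, 2]
  }
}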
+ synchronized (lockList[tablePath.hashCode() & (lockList.length - 1)]) { + TreeMap historicalSchemas = HISTORICAL_SCHEMA_CACHE.getIfPresent(tablePath); + if (historicalSchemas == null || InternalSchemaUtils.searchSchema(versionID, historicalSchemas) == null) { + historicalSchemas = getHistoricalSchemas(metaClient); + HISTORICAL_SCHEMA_CACHE.put(tablePath, historicalSchemas); + } else { + long maxVersionId = historicalSchemas.keySet().stream().max(Long::compareTo).get(); + if (versionID > maxVersionId) { + historicalSchemas = getHistoricalSchemas(metaClient); + HISTORICAL_SCHEMA_CACHE.put(tablePath, historicalSchemas); + } + } + return InternalSchemaUtils.searchSchema(versionID, historicalSchemas); + } + } + + private static TreeMap getHistoricalSchemas(HoodieTableMetaClient metaClient) { + TreeMap result = new TreeMap<>(); + FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(metaClient); + String historySchemaStr = schemasManager.getHistorySchemaStr(); + if (!StringUtils.isNullOrEmpty(historySchemaStr)) { + result = SerDeHelper.parseSchemas(historySchemaStr); + } + return result; + } + + private static Option getSchemaByReadingCommitFile(long versionID, HoodieTableMetaClient metaClient) { + try { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + List instants = timeline.getInstants().filter(f -> f.getTimestamp().equals(String.valueOf(versionID))).collect(Collectors.toList()); + if (instants.isEmpty()) { + return Option.empty(); + } + byte[] data = timeline.getInstantDetails(instants.get(0)).get(); + HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); + String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA); + return SerDeHelper.fromJson(latestInternalSchemaStr); + } catch (Exception e) { + throw new HoodieException("Failed to read schema from commit metadata", e); + } + } + + /** + * Get internalSchema and avroSchema for compaction/cluster operation. 
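The segment lock in searchSchemaAndCache above indexes its stripe with tablePath.hashCode() & (lockList.length - 1); because the stripe count is a power of two, the mask keeps only the low bits, which is equivalent to a modulo and stays in [0, 16) even for negative hash codes. A standalone sketch of the same indexing (class name and path are illustrative):

public class StripeIndexSketch {
  public static void main(String[] args) {
    int stripes = 16; // must be a power of two for the mask to behave like modulo
    String tablePath = "s3a://bucket/warehouse/trips"; // hypothetical table base path
    int index = tablePath.hashCode() & (stripes - 1);
    System.out.println("lock stripe = " + index); // always in [0, 16)
  }
}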
+ * + * @param metaClient current hoodie metaClient + * @param compactionAndClusteringInstant first instant before current compaction/cluster instant + * @return (internalSchemaStrOpt, avroSchemaStrOpt) a pair of InternalSchema/avroSchema + */ + public static Pair, Option> getInternalSchemaAndAvroSchemaForClusteringAndCompaction(HoodieTableMetaClient metaClient, String compactionAndClusteringInstant) { + // try to load internalSchema to support Schema Evolution + HoodieTimeline timelineBeforeCurrentCompaction = metaClient.getCommitsAndCompactionTimeline().findInstantsBefore(compactionAndClusteringInstant).filterCompletedInstants(); + Option lastInstantBeforeCurrentCompaction = timelineBeforeCurrentCompaction.lastInstant(); + if (lastInstantBeforeCurrentCompaction.isPresent()) { + // try to find internalSchema + byte[] data = timelineBeforeCurrentCompaction.getInstantDetails(lastInstantBeforeCurrentCompaction.get()).get(); + HoodieCommitMetadata metadata; + try { + metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); + } catch (Exception e) { + throw new HoodieException(String.format("cannot read metadata from commit: %s", lastInstantBeforeCurrentCompaction.get()), e); + } + String internalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA); + if (internalSchemaStr != null) { + String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY); + return Pair.of(Option.of(internalSchemaStr), Option.of(existingSchemaStr)); + } + } + return Pair.of(Option.empty(), Option.empty()); + } + + /** + * Give a schema versionId return its internalSchema. + * This method will be called by spark tasks, we should minimize time cost. + * We try our best to not use metaClient, since the initialization of metaClient is time cost + * step1: + * try to parser internalSchema from HoodieInstant directly + * step2: + * if we cannot parser internalSchema in step1, + * try to find internalSchema in historySchema. + * + * @param versionId the internalSchema version to be search. + * @param tablePath table path + * @param hadoopConf conf + * @param validCommits current validate commits, use to make up the commit file path/verify the validity of the history schema files + * @return a internalSchema. 
+ */ + public static InternalSchema getInternalSchemaByVersionId(long versionId, String tablePath, Configuration hadoopConf, String validCommits) { + Set commitSet = Arrays.stream(validCommits.split(",")).collect(Collectors.toSet()); + List validateCommitList = commitSet.stream().map(fileName -> { + String fileExtension = HoodieInstant.getTimelineFileExtension(fileName); + return fileName.replace(fileExtension, ""); + }).collect(Collectors.toList()); + + FileSystem fs = FSUtils.getFs(tablePath, hadoopConf); + Path hoodieMetaPath = new Path(tablePath, HoodieTableMetaClient.METAFOLDER_NAME); + //step1: + Path candidateCommitFile = commitSet.stream().filter(fileName -> { + String fileExtension = HoodieInstant.getTimelineFileExtension(fileName); + return fileName.replace(fileExtension, "").equals(versionId + ""); + }).findFirst().map(f -> new Path(hoodieMetaPath, f)).orElse(null); + if (candidateCommitFile != null) { + try { + byte[] data; + try (FSDataInputStream is = fs.open(candidateCommitFile)) { + data = FileIOUtils.readAsByteArray(is); + } catch (IOException e) { + throw e; + } + HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); + String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA); + if (latestInternalSchemaStr != null) { + return SerDeHelper.fromJson(latestInternalSchemaStr).orElse(null); + } + } catch (Exception e1) { + // swallow this exception. + LOG.warn(String.format("Cannot find internal schema from commit file %s. Falling back to parsing historical internal schema", candidateCommitFile.toString())); + } + } + // step2: + FileBasedInternalSchemaStorageManager fileBasedInternalSchemaStorageManager = new FileBasedInternalSchemaStorageManager(hadoopConf, new Path(tablePath)); + String lastestHistorySchema = fileBasedInternalSchemaStorageManager.getHistorySchemaStrByGivenValidCommits(validateCommitList); + return InternalSchemaUtils.searchSchema(versionId, SerDeHelper.parseSchemas(lastestHistorySchema)); + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java new file mode 100644 index 0000000000000..7c41fe4f29d95 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
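A hedged caller-side sketch for getInternalSchemaByVersionId above; the table path, version id, and validCommits value are purely illustrative (per the javadoc, validCommits is a comma-separated list of completed timeline file names), and a real table at that location is assumed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.util.InternalSchemaCache;
import org.apache.hudi.internal.schema.InternalSchema;

public class InternalSchemaLookupSketch {
  public static void main(String[] args) {
    Configuration hadoopConf = new Configuration();
    String tablePath = "/tmp/hoodie/trips";          // hypothetical table base path
    long versionId = 20220101010101L;                // schema version equals the commit instant time
    String validCommits = "20220101010101.commit";   // comma-separated completed timeline file names

    // Step 1 parses the matching commit file directly; step 2 falls back to the
    // persisted history schema files, as described in the javadoc above.
    InternalSchema schema =
        InternalSchemaCache.getInternalSchemaByVersionId(versionId, tablePath, hadoopConf, validCommits);
    System.out.println(schema);
  }
}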
+ */ + +package org.apache.hudi.common.util; + +import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.PropertyAccessor; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class JsonUtils { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + static { + MAPPER.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES); + // We need to exclude custom getters, setters and creators which can use member fields + // to derive new fields, so that they are not included in the serialization + MAPPER.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); + MAPPER.setVisibility(PropertyAccessor.GETTER, JsonAutoDetect.Visibility.NONE); + MAPPER.setVisibility(PropertyAccessor.IS_GETTER, JsonAutoDetect.Visibility.NONE); + MAPPER.setVisibility(PropertyAccessor.SETTER, JsonAutoDetect.Visibility.NONE); + MAPPER.setVisibility(PropertyAccessor.CREATOR, JsonAutoDetect.Visibility.NONE); + } + + public static ObjectMapper getObjectMapper() { + return MAPPER; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/MapUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/MapUtils.java new file mode 100644 index 0000000000000..c39f6fd74f424 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/MapUtils.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util; + +import java.util.Map; +import java.util.Objects; + +public class MapUtils { + + public static boolean isNullOrEmpty(Map m) { + return Objects.isNull(m) || m.isEmpty(); + } + + public static boolean nonEmpty(Map m) { + return !isNullOrEmpty(m); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/MarkerUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/MarkerUtils.java new file mode 100644 index 0000000000000..0aff8f594a5df --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/MarkerUtils.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util; + +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.function.Predicate; + +import static org.apache.hudi.common.util.FileIOUtils.closeQuietly; + +/** + * A utility class for marker related operations. + */ +public class MarkerUtils { + public static final String MARKERS_FILENAME_PREFIX = "MARKERS"; + public static final String MARKER_TYPE_FILENAME = MARKERS_FILENAME_PREFIX + ".type"; + private static final Logger LOG = LogManager.getLogger(MarkerUtils.class); + + /** + * Strips the folder prefix of the marker file path corresponding to a data file. + * + * @param fullMarkerPath the full path of the marker file + * @param basePath the base path + * @param instantTime instant of interest + * @return marker file name + */ + public static String stripMarkerFolderPrefix(String fullMarkerPath, String basePath, String instantTime) { + ValidationUtils.checkArgument(fullMarkerPath.contains(HoodieTableMetaClient.MARKER_EXTN), + String.format("Using DIRECT markers but marker path does not contain extension: %s", HoodieTableMetaClient.MARKER_EXTN)); + String markerRootPath = Path.getPathWithoutSchemeAndAuthority( + new Path(String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTime))).toString(); + return stripMarkerFolderPrefix(fullMarkerPath, markerRootPath); + } + + /** + * Strips the marker folder prefix of any file path under the marker directory. + * + * @param fullMarkerPath the full path of the file + * @param markerDir marker directory + * @return file name + */ + public static String stripMarkerFolderPrefix(String fullMarkerPath, String markerDir) { + int begin = fullMarkerPath.indexOf(markerDir); + ValidationUtils.checkArgument(begin >= 0, + "Not in marker dir. Marker Path=" + fullMarkerPath + ", Expected Marker Root=" + markerDir); + return fullMarkerPath.substring(begin + markerDir.length() + 1); + } + + /** + * @param fileSystem file system to use. + * @param markerDir marker directory. + * @return {@code true} if the MARKERS.type file exists; {@code false} otherwise. 
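A small sketch of the stripMarkerFolderPrefix contract shown above, using the two-argument overload; the marker directory layout and marker file name below are illustrative only, not guaranteed to match a real table.

import org.apache.hudi.common.util.MarkerUtils;

public class MarkerPathSketch {
  public static void main(String[] args) {
    // Hypothetical marker directory and marker file, to show the contract:
    // the result is the marker path relative to the marker directory.
    String markerDir = "/tmp/hoodie/trips/.hoodie/.temp/20220101010101";
    String fullMarkerPath = markerDir + "/2022/01/01/file-0001.parquet.marker.CREATE";

    String relative = MarkerUtils.stripMarkerFolderPrefix(fullMarkerPath, markerDir);
    System.out.println(relative); // 2022/01/01/file-0001.parquet.marker.CREATE
  }
}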
+ */ + public static boolean doesMarkerTypeFileExist(FileSystem fileSystem, String markerDir) throws IOException { + return fileSystem.exists(new Path(markerDir, MARKER_TYPE_FILENAME)); + } + + /** + * Reads the marker type from `MARKERS.type` file. + * + * @param fileSystem file system to use. + * @param markerDir marker directory. + * @return the marker type, or empty if the marker type file does not exist. + */ + public static Option readMarkerType(FileSystem fileSystem, String markerDir) { + Path markerTypeFilePath = new Path(markerDir, MARKER_TYPE_FILENAME); + FSDataInputStream fsDataInputStream = null; + Option content = Option.empty(); + try { + if (!doesMarkerTypeFileExist(fileSystem, markerDir)) { + return Option.empty(); + } + fsDataInputStream = fileSystem.open(markerTypeFilePath); + content = Option.of(MarkerType.valueOf(FileIOUtils.readAsUTFString(fsDataInputStream))); + } catch (IOException e) { + throw new HoodieIOException("Cannot read marker type file " + markerTypeFilePath.toString() + + "; " + e.getMessage(), e); + } finally { + closeQuietly(fsDataInputStream); + } + return content; + } + + /** + * Writes the marker type to the file `MARKERS.type`. + * + * @param markerType marker type. + * @param fileSystem file system to use. + * @param markerDir marker directory. + */ + public static void writeMarkerTypeToFile(MarkerType markerType, FileSystem fileSystem, String markerDir) { + Path markerTypeFilePath = new Path(markerDir, MARKER_TYPE_FILENAME); + FSDataOutputStream fsDataOutputStream = null; + BufferedWriter bufferedWriter = null; + try { + fsDataOutputStream = fileSystem.create(markerTypeFilePath, false); + bufferedWriter = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); + bufferedWriter.write(markerType.toString()); + } catch (IOException e) { + throw new HoodieException("Failed to create marker type file " + markerTypeFilePath.toString() + + "; " + e.getMessage(), e); + } finally { + closeQuietly(bufferedWriter); + closeQuietly(fsDataOutputStream); + } + } + + /** + * Deletes `MARKERS.type` file. + * + * @param fileSystem file system to use. + * @param markerDir marker directory. + */ + public static void deleteMarkerTypeFile(FileSystem fileSystem, String markerDir) { + Path markerTypeFilePath = new Path(markerDir, MARKER_TYPE_FILENAME); + try { + fileSystem.delete(markerTypeFilePath, false); + } catch (IOException e) { + throw new HoodieIOException("Cannot delete marker type file " + markerTypeFilePath.toString() + + "; " + e.getMessage(), e); + } + } + + /** + * Reads files containing the markers written by timeline-server-based marker mechanism. + * + * @param markerDir marker directory. + * @param fileSystem file system to use. + * @param context instance of {@link HoodieEngineContext} to use + * @param parallelism parallelism to use + * @return A {@code Map} of file name to the set of markers stored in the file. 
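A round-trip sketch for writeMarkerTypeToFile and readMarkerType above, assuming the TIMELINE_SERVER_BASED constant of the MarkerType enum; the local FileSystem, class name, and marker directory are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.table.marker.MarkerType;
import org.apache.hudi.common.util.MarkerUtils;
import org.apache.hudi.common.util.Option;

public class MarkerTypeFileSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    String markerDir = "/tmp/hoodie/trips/.hoodie/.temp/20220101010101"; // hypothetical

    // Persists the marker mechanism into MARKERS.type, then reads it back;
    // readMarkerType returns an empty Option when the file does not exist.
    MarkerUtils.writeMarkerTypeToFile(MarkerType.TIMELINE_SERVER_BASED, fs, markerDir);
    Option<MarkerType> markerType = MarkerUtils.readMarkerType(fs, markerDir);
    System.out.println(markerType.get()); // TIMELINE_SERVER_BASED
  }
}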
+ */ + public static Map> readTimelineServerBasedMarkersFromFileSystem( + String markerDir, FileSystem fileSystem, HoodieEngineContext context, int parallelism) { + Path dirPath = new Path(markerDir); + try { + if (fileSystem.exists(dirPath)) { + Predicate prefixFilter = fileStatus -> + fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX); + Predicate markerTypeFilter = fileStatus -> + !fileStatus.getPath().getName().equals(MARKER_TYPE_FILENAME); + return FSUtils.parallelizeSubPathProcess( + context, fileSystem, dirPath, parallelism, prefixFilter.and(markerTypeFilter), + pairOfSubPathAndConf -> { + String markersFilePathStr = pairOfSubPathAndConf.getKey(); + SerializableConfiguration conf = pairOfSubPathAndConf.getValue(); + return readMarkersFromFile(new Path(markersFilePathStr), conf); + }); + } + return new HashMap<>(); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + } + + /** + * Reads the markers stored in the underlying file. + * + * @param markersFilePath file path for the markers + * @param conf serializable config + * @return markers in a {@code Set} of String. + */ + public static Set readMarkersFromFile(Path markersFilePath, SerializableConfiguration conf) { + FSDataInputStream fsDataInputStream = null; + Set markers = new HashSet<>(); + try { + LOG.debug("Read marker file: " + markersFilePath); + FileSystem fs = markersFilePath.getFileSystem(conf.get()); + fsDataInputStream = fs.open(markersFilePath); + markers = new HashSet<>(FileIOUtils.readAsUTFStringLines(fsDataInputStream)); + } catch (IOException e) { + throw new HoodieIOException("Failed to read MARKERS file " + markersFilePath, e); + } finally { + closeQuietly(fsDataInputStream); + } + return markers; + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/NetworkUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/NetworkUtils.java index d6a56fe39ca97..329c1090ea3bd 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/NetworkUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/NetworkUtils.java @@ -21,7 +21,15 @@ import org.apache.hudi.exception.HoodieException; import java.io.IOException; -import java.net.ServerSocket; +import java.net.DatagramSocket; +import java.net.Inet4Address; +import java.net.InetAddress; +import java.net.NetworkInterface; +import java.net.SocketException; +import java.net.UnknownHostException; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; /** * A utility class for network. @@ -29,20 +37,51 @@ public class NetworkUtils { public static synchronized String getHostname() { - ServerSocket s = null; - try { - s = new ServerSocket(0); - return s.getInetAddress().getHostAddress(); + InetAddress localAddress; + try (DatagramSocket s = new DatagramSocket()) { + // see https://stackoverflow.com/questions/9481865/getting-the-ip-address-of-the-current-machine-using-java + // for details. 
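The getHostname rewrite here leans on a property of UDP sockets: connect() puts nothing on the wire, it only asks the OS to resolve the route to the target, so 8.8.8.8 is never actually contacted and the socket's local address becomes the machine's egress address. A standalone sketch of that trick outside Hudi (class name is illustrative):

import java.net.DatagramSocket;
import java.net.InetAddress;

public class LocalAddressSketch {
  public static void main(String[] args) throws Exception {
    try (DatagramSocket socket = new DatagramSocket()) {
      // "Connecting" a UDP socket sends no packets; it only fixes the route,
      // which lets us ask for the local address the OS would use for egress.
      socket.connect(InetAddress.getByName("8.8.8.8"), 10002);
      System.out.println(socket.getLocalAddress().getHostAddress());
    }
  }
}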
+ s.connect(InetAddress.getByName("8.8.8.8"), 10002); + localAddress = s.getLocalAddress(); + if (validAddress(localAddress)) { + return localAddress.getHostAddress(); + } } catch (IOException e) { throw new HoodieException("Unable to find server port", e); - } finally { - if (null != s) { - try { - s.close(); - } catch (IOException e) { - throw new HoodieException("Unable to close server port", e); + } + + // fallback + try { + List activeNetworkIFs = Collections.list(NetworkInterface.getNetworkInterfaces()); + // On unix-like system, getNetworkInterfaces returns ifs in reverse order + // compared to ifconfig output order, + // pick ip address following system output order. + Collections.reverse(activeNetworkIFs); + for (NetworkInterface ni : activeNetworkIFs) { + List addresses = Collections.list(ni.getInetAddresses()).stream() + .filter(NetworkUtils::validAddress) + .collect(Collectors.toList()); + if (addresses.size() > 0) { + // IPv4 has higher priority + InetAddress address = addresses.stream() + .filter(addr -> addr instanceof Inet4Address).findAny() + .orElse(addresses.get(0)); + try { + // Inet6Address.toHostName may add interface at the end if it knows about it + return InetAddress.getByAddress(address.getAddress()).getHostAddress(); + } catch (UnknownHostException e) { + throw new HoodieException("Unable to fetch raw IP address for: " + address); + } } } + + return localAddress.getHostAddress(); + } catch (SocketException e) { + throw new HoodieException("Unable to find server port", e); } } + + private static boolean validAddress(InetAddress address) { + return !(address.isLinkLocalAddress() || address.isLoopbackAddress() || address.isAnyLocalAddress()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java index f3944152faefb..86f1d9215e8c3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java @@ -18,33 +18,11 @@ package org.apache.hudi.common.util; -import org.apache.hudi.common.util.jvm.MemoryLayoutSpecification; -import org.apache.hudi.common.util.jvm.HotSpotMemoryLayoutSpecification32bit; -import org.apache.hudi.common.util.jvm.HotSpotMemoryLayoutSpecification64bit; -import org.apache.hudi.common.util.jvm.HotSpotMemoryLayoutSpecification64bitCompressed; -import org.apache.hudi.common.util.jvm.OpenJ9MemoryLayoutSpecification32bit; -import org.apache.hudi.common.util.jvm.OpenJ9MemoryLayoutSpecification64bit; -import org.apache.hudi.common.util.jvm.OpenJ9MemoryLayoutSpecification64bitCompressed; - -import java.lang.management.ManagementFactory; -import java.lang.management.MemoryPoolMXBean; -import java.lang.reflect.Array; -import java.lang.reflect.Field; -import java.lang.reflect.Modifier; -import java.util.ArrayDeque; -import java.util.Arrays; -import java.util.Collections; -import java.util.Deque; -import java.util.IdentityHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Set; +import org.openjdk.jol.info.GraphLayout; /** * Contains utility methods for calculating the memory usage of objects. It only works on the HotSpot and OpenJ9 JVMs, and infers - * the actual memory layout (32 bit vs. 64 bit word size, compressed object pointers vs. uncompressed) from best + * the actual memory layout (32 bit vs. 
64 bit word size, compressed object pointers vs. uncompressed) from the best * available indicators. It can reliably detect a 32 bit vs. 64 bit JVM. It can only make an educated guess at whether * compressed OOPs are used, though; specifically, it knows what the JVM's default choice of OOP compression would be * based on HotSpot version and maximum heap sizes, but if the choice is explicitly overridden with the @@ -54,14 +32,9 @@ * @author Attila Szegedi */ public class ObjectSizeCalculator { - private static class CurrentLayout { - - private static final MemoryLayoutSpecification SPEC = getEffectiveMemoryLayoutSpecification(); - } - /** * Given an object, returns the total allocated size, in bytes, of the object and all other objects reachable from it. - * Attempts to to detect the current JVM memory layout, but may fail with {@link UnsupportedOperationException}; + * Attempts to detect the current JVM memory layout, but may fail with {@link UnsupportedOperationException}; * * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do anything special, it * measures the size of all objects reachable through it (which will include its class loader, and by @@ -71,282 +44,16 @@ private static class CurrentLayout { * @throws UnsupportedOperationException if the current vm memory layout cannot be detected. */ public static long getObjectSize(Object obj) throws UnsupportedOperationException { - return obj == null ? 0 : new ObjectSizeCalculator(CurrentLayout.SPEC).calculateObjectSize(obj); - } - - // Fixed object header size for arrays. - private final int arrayHeaderSize; - // Fixed object header size for non-array objects. - private final int objectHeaderSize; - // Padding for the object size - if the object size is not an exact multiple - // of this, it is padded to the next multiple. - private final int objectPadding; - // Size of reference (pointer) fields. - private final int referenceSize; - // Padding for the fields of superclass before fields of subclasses are - // added. - private final int superclassFieldPadding; - - private final Map, ClassSizeInfo> classSizeInfos = new IdentityHashMap<>(); - - private final Set alreadyVisited = Collections.newSetFromMap(new IdentityHashMap<>()); - private final Deque pending = new ArrayDeque<>(16 * 1024); - private long size; - - /** - * Creates an object size calculator that can calculate object sizes for a given {@code memoryLayoutSpecification}. - * - * @param memoryLayoutSpecification a description of the JVM memory layout. - */ - public ObjectSizeCalculator(MemoryLayoutSpecification memoryLayoutSpecification) { - Objects.requireNonNull(memoryLayoutSpecification); - arrayHeaderSize = memoryLayoutSpecification.getArrayHeaderSize(); - objectHeaderSize = memoryLayoutSpecification.getObjectHeaderSize(); - objectPadding = memoryLayoutSpecification.getObjectPadding(); - referenceSize = memoryLayoutSpecification.getReferenceSize(); - superclassFieldPadding = memoryLayoutSpecification.getSuperclassFieldPadding(); - } - - /** - * Given an object, returns the total allocated size, in bytes, of the object and all other objects reachable from it. - * - * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do anything special, it - * measures the size of all objects reachable through it (which will include its class loader, and by - * extension, all other Class objects loaded by the same loader, and all the parent class loaders). 
It doesn't - * provide the size of the static fields in the JVM class that the Class object represents. - * @return the total allocated size of the object and all other objects it retains. - */ - public synchronized long calculateObjectSize(Object obj) { - // Breadth-first traversal instead of naive depth-first with recursive - // implementation, so we don't blow the stack traversing long linked lists. - try { - for (;;) { - visit(obj); - if (pending.isEmpty()) { - return size; - } - obj = pending.removeFirst(); - } - } finally { - alreadyVisited.clear(); - pending.clear(); - size = 0; - } - } - - private ClassSizeInfo getClassSizeInfo(final Class clazz) { - ClassSizeInfo csi = classSizeInfos.get(clazz); - if (csi == null) { - csi = new ClassSizeInfo(clazz); - classSizeInfos.put(clazz, csi); - } - return csi; - } - - private void visit(Object obj) { - if (alreadyVisited.contains(obj)) { - return; - } - final Class clazz = obj.getClass(); - if (clazz == ArrayElementsVisitor.class) { - ((ArrayElementsVisitor) obj).visit(this); - } else { - alreadyVisited.add(obj); - if (clazz.isArray()) { - visitArray(obj); - } else { - getClassSizeInfo(clazz).visit(obj, this); - } - } - } - - private void visitArray(Object array) { - final Class componentType = array.getClass().getComponentType(); - final int length = Array.getLength(array); - if (componentType.isPrimitive()) { - increaseByArraySize(length, getPrimitiveFieldSize(componentType)); - } else { - increaseByArraySize(length, referenceSize); - // If we didn't use an ArrayElementsVisitor, we would be enqueueing every - // element of the array here instead. For large arrays, it would - // tremendously enlarge the queue. In essence, we're compressing it into - // a small command object instead. This is different than immediately - // visiting the elements, as their visiting is scheduled for the end of - // the current queue. - switch (length) { - case 0: { - break; - } - case 1: { - enqueue(Array.get(array, 0)); - break; - } - default: { - enqueue(new ArrayElementsVisitor((Object[]) array)); - } - } - } - } - - private void increaseByArraySize(int length, long elementSize) { - increaseSize(roundTo(arrayHeaderSize + length * elementSize, objectPadding)); - } - - private static class ArrayElementsVisitor { - - private final Object[] array; - - ArrayElementsVisitor(Object[] array) { - this.array = array; - } - - public void visit(ObjectSizeCalculator calc) { - for (Object elem : array) { - if (elem != null) { - calc.visit(elem); - } - } - } - } - - void enqueue(Object obj) { - if (obj != null) { - pending.addLast(obj); - } - } - - void increaseSize(long objectSize) { - size += objectSize; - } - - static long roundTo(long x, int multiple) { - return ((x + multiple - 1) / multiple) * multiple; - } - - private class ClassSizeInfo { - - // Padded fields + header size - private final long objectSize; - // Only the fields size - used to calculate the subclasses' memory - // footprint. 
- private final long fieldsSize; - private final Field[] referenceFields; - - public ClassSizeInfo(Class clazz) { - long fieldsSize = 0; - final List referenceFields = new LinkedList<>(); - for (Field f : clazz.getDeclaredFields()) { - if (Modifier.isStatic(f.getModifiers())) { - continue; - } - final Class type = f.getType(); - if (type.isPrimitive()) { - fieldsSize += getPrimitiveFieldSize(type); - } else { - f.setAccessible(true); - referenceFields.add(f); - fieldsSize += referenceSize; - } - } - final Class superClass = clazz.getSuperclass(); - if (superClass != null) { - final ClassSizeInfo superClassInfo = getClassSizeInfo(superClass); - fieldsSize += roundTo(superClassInfo.fieldsSize, superclassFieldPadding); - referenceFields.addAll(Arrays.asList(superClassInfo.referenceFields)); - } - this.fieldsSize = fieldsSize; - this.objectSize = roundTo(objectHeaderSize + fieldsSize, objectPadding); - this.referenceFields = referenceFields.toArray(new Field[referenceFields.size()]); - } - - void visit(Object obj, ObjectSizeCalculator calc) { - calc.increaseSize(objectSize); - enqueueReferencedObjects(obj, calc); - } - - public void enqueueReferencedObjects(Object obj, ObjectSizeCalculator calc) { - for (Field f : referenceFields) { - try { - calc.enqueue(f.get(obj)); - } catch (IllegalAccessException e) { - throw new AssertionError("Unexpected denial of access to " + f, e); - } - } - } - } - - private static long getPrimitiveFieldSize(Class type) { - if (type == boolean.class || type == byte.class) { - return 1; - } - if (type == char.class || type == short.class) { - return 2; - } - if (type == int.class || type == float.class) { - return 4; - } - if (type == long.class || type == double.class) { - return 8; - } - throw new AssertionError("Encountered unexpected primitive type " + type.getName()); - } - - static MemoryLayoutSpecification getEffectiveMemoryLayoutSpecification() { - final String vmName = System.getProperty("java.vm.name"); - if (vmName == null || !(vmName.startsWith("Java HotSpot(TM) ") || vmName.startsWith("OpenJDK") - || vmName.startsWith("TwitterJDK") || vmName.startsWith("Eclipse OpenJ9"))) { - throw new UnsupportedOperationException("ObjectSizeCalculator only supported on HotSpot or Eclipse OpenJ9 VMs"); - } - - final String strVmVersion = System.getProperty("java.vm.version"); - // Support for OpenJ9 JVM - if (strVmVersion.startsWith("openj9")) { - final String dataModel = System.getProperty("sun.arch.data.model"); - if ("32".equals(dataModel)) { - // Running with 32-bit data model - return new OpenJ9MemoryLayoutSpecification32bit(); - } else if (!"64".equals(dataModel)) { - throw new UnsupportedOperationException( - "Unrecognized value '" + dataModel + "' of sun.arch.data.model system property"); - } - - long maxMemory = 0; - for (MemoryPoolMXBean mp : ManagementFactory.getMemoryPoolMXBeans()) { - maxMemory += mp.getUsage().getMax(); - } - if (maxMemory < 57L * 1024 * 1024 * 1024) { - // OpenJ9 use compressed references below 57GB of RAM total - return new OpenJ9MemoryLayoutSpecification64bitCompressed(); - } else { - // it's a 64-bit uncompressed references object model - return new OpenJ9MemoryLayoutSpecification64bit(); - } - } else { - // Support for HotSpot JVM - final String dataModel = System.getProperty("sun.arch.data.model"); - if ("32".equals(dataModel)) { - // Running with 32-bit data model - return new HotSpotMemoryLayoutSpecification32bit(); - } else if (!"64".equals(dataModel)) { - throw new UnsupportedOperationException( - "Unrecognized value '" + 
dataModel + "' of sun.arch.data.model system property"); - } - - final int vmVersion = Integer.parseInt(strVmVersion.substring(0, strVmVersion.indexOf('.'))); - if (vmVersion >= 17) { - long maxMemory = 0; - for (MemoryPoolMXBean mp : ManagementFactory.getMemoryPoolMXBeans()) { - maxMemory += mp.getUsage().getMax(); - } - if (maxMemory < 30L * 1024 * 1024 * 1024) { - // HotSpot 17.0 and above use compressed OOPs below 30GB of RAM total - // for all memory pools (yes, including code cache). - return new HotSpotMemoryLayoutSpecification64bitCompressed(); - } - } - - // In other cases, it's a 64-bit uncompressed OOPs object model - return new HotSpotMemoryLayoutSpecification64bit(); - } + // JDK versions 16 or later enforce strong encapsulation and block illegal reflective access. + // In effect, we cannot calculate object size by deep reflection and invoking `setAccessible` on a field, + // especially when the `isAccessible` is false. More details in JEP 403. While integrating Hudi with other + // software packages that compile against JDK 16 or later (e.g. Trino), the IllegalAccessException will be thrown. + // In that case, we use Java Object Layout (JOL) to estimate the object size. + // + // NOTE: We cannot get the object size base on the amount of byte serialized because there is no guarantee + // that the incoming object is serializable. We could have used Java's Instrumentation API, but it + // needs an instrumentation agent that can be hooked to the JVM. In lieu of that, we are using JOL. + // GraphLayout gives the deep size of an object, including the size of objects that are referenced from the given object. + return obj == null ? 0 : GraphLayout.parseInstance(obj).totalSize(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java index 42d6057968f97..3d4bfcb6c84e4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java @@ -34,7 +34,7 @@ public final class Option implements Serializable { private static final long serialVersionUID = 0L; - private static final Option NULL_VAL = new Option<>(); + private static final Option EMPTY = new Option<>(); private final T val; @@ -67,8 +67,9 @@ private Option(T val) { this.val = val; } + @SuppressWarnings("unchecked") public static Option empty() { - return (Option) NULL_VAL; + return (Option) EMPTY; } public static Option of(T value) { @@ -108,14 +109,42 @@ public Option map(Function mapper) { } } + public Option flatMap(Function> mapper) { + if (null == mapper) { + throw new NullPointerException("mapper should not be null"); + } + if (!isPresent()) { + return empty(); + } else { + return Objects.requireNonNull(mapper.apply(val)); + } + } + + /** + * Returns this {@link Option} if not empty, otherwise evaluates the provided supplier + * and returns the alternative + */ + public Option or(Supplier> other) { + return val != null ? this : other.get(); + } + + /** + * Identical to {@code Optional.orElse} + */ public T orElse(T other) { return val != null ? val : other; } + /** + * Identical to {@code Optional.orElseGet} + */ public T orElseGet(Supplier other) { return val != null ? 
val : other.get(); } + /** + * Identical to {@code Optional.orElseThrow} + */ public T orElseThrow(Supplier exceptionSupplier) throws X { if (val != null) { return val; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcReaderIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcReaderIterator.java new file mode 100644 index 0000000000000..d9ceeeee40f63 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcReaderIterator.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericData.Record; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; + +import java.io.IOException; + +/** + * This class wraps a ORC reader and provides an iterator based api to read from an ORC file. + */ +public class OrcReaderIterator implements ClosableIterator { + + private final RecordReader recordReader; + private final Schema avroSchema; + private final List fieldNames; + private final List orcFieldTypes; + private final Schema[] avroFieldSchemas; + private final VectorizedRowBatch batch; + private int rowInBatch; + private T next; + + public OrcReaderIterator(RecordReader recordReader, Schema schema, TypeDescription orcSchema) { + this.recordReader = recordReader; + this.avroSchema = schema; + this.fieldNames = orcSchema.getFieldNames(); + this.orcFieldTypes = orcSchema.getChildren(); + this.avroFieldSchemas = fieldNames.stream() + .map(fieldName -> avroSchema.getField(fieldName).schema()) + .toArray(Schema[]::new); + this.batch = orcSchema.createRowBatch(); + this.rowInBatch = 0; + } + + /** + * If the current batch is empty, get a new one. + * @return true if we have rows available. 
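A quick sketch of the new Option combinators added above: or is lazy and only evaluates its supplier when the Option is empty, while flatMap chains a computation that itself returns an Option without nesting. Class name and values are illustrative.

import org.apache.hudi.common.util.Option;

public class OptionSketch {
  public static void main(String[] args) {
    Option<String> primary = Option.empty();

    // or(...) keeps this Option when it holds a value and only evaluates the
    // supplier when it is empty (lazy, unlike orElse, which takes an eager value).
    Option<String> resolved = primary.or(() -> Option.of("fallback"));

    // flatMap avoids Option<Option<...>> results when the mapper returns an Option.
    Option<Integer> length = resolved.flatMap(s -> Option.of(s.length()));

    System.out.println(resolved.get() + " / " + length.get()); // fallback / 8
  }
}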
+ * @throws IOException + */ + private boolean ensureBatch() throws IOException { + if (rowInBatch >= batch.size) { + rowInBatch = 0; + return recordReader.nextBatch(batch); + } + return true; + } + + @Override + public boolean hasNext() { + try { + ensureBatch(); + if (this.next == null) { + this.next = (T) readRecordFromBatch(); + } + return this.next != null; + } catch (IOException io) { + throw new HoodieIOException("unable to read next record from ORC file ", io); + } + } + + @Override + public T next() { + try { + // To handle case when next() is called before hasNext() + if (this.next == null) { + if (!hasNext()) { + throw new HoodieIOException("No more records left to read from ORC file"); + } + } + T retVal = this.next; + this.next = (T) readRecordFromBatch(); + return retVal; + } catch (IOException io) { + throw new HoodieIOException("unable to read next record from ORC file ", io); + } + } + + private GenericData.Record readRecordFromBatch() throws IOException { + // No more records left to read from ORC file + if (!ensureBatch()) { + return null; + } + + GenericData.Record record = new Record(avroSchema); + int numFields = orcFieldTypes.size(); + for (int i = 0; i < numFields; i++) { + Object data = AvroOrcUtils.readFromVector(orcFieldTypes.get(i), batch.cols[i], avroFieldSchemas[i], rowInBatch); + record.put(fieldNames.get(i), data); + } + rowInBatch++; + return record; + } + + @Override + public void close() { + FileIOUtils.closeQuietly(this.recordReader); + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java new file mode 100644 index 0000000000000..5afe354d0e755 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
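OrcReaderIterator above (like ParquetReaderIterator later in this patch) follows a look-ahead pattern: it pre-fetches one record so hasNext() can be answered truthfully and next() still works when it is called before hasNext(). A generic, self-contained sketch of that pattern; the class and demo source are illustrative, not Hudi code.

import java.util.Iterator;
import java.util.NoSuchElementException;

abstract class LookAheadIterator<T> implements Iterator<T> {
  private T next; // pre-fetched element; null once the source is drained

  /** Reads one element from the underlying source, or returns null at end of input. */
  protected abstract T readNext();

  @Override
  public boolean hasNext() {
    if (next == null) {
      next = readNext();
    }
    return next != null;
  }

  @Override
  public T next() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    T result = next;
    next = null; // force a fresh read on the following call
    return result;
  }

  public static void main(String[] args) {
    int[] cursor = {0};
    LookAheadIterator<Integer> it = new LookAheadIterator<Integer>() {
      @Override
      protected Integer readNext() {
        return cursor[0] < 3 ? cursor[0]++ : null; // yields 0, 1, 2, then end of input
      }
    };
    System.out.println(it.next()); // next() before hasNext() is fine: prints 0
    while (it.hasNext()) {
      System.out.println(it.next()); // prints 1, then 2
    }
  }
}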
+ */ + +package org.apache.hudi.common.util; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.MetadataNotFoundException; +import org.apache.hudi.keygen.BaseKeyGenerator; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.orc.OrcFile; +import org.apache.orc.OrcProto.UserMetadataItem; +import org.apache.orc.Reader; +import org.apache.orc.Reader.Options; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.BinaryUtil.toBytes; + +/** + * Utility functions for ORC files. + */ +public class OrcUtils extends BaseFileUtils { + + /** + * Provides a closable iterator for reading the given ORC file. + * + * @param configuration configuration to build fs object + * @param filePath The ORC file path + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the ORC file + */ + @Override + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath) { + try { + Configuration conf = new Configuration(configuration); + conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf()); + Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf)); + + Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema(); + TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema); + RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema)); + List fieldNames = orcSchema.getFieldNames(); + + // column indices for the RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD fields + int keyCol = -1; + int partitionCol = -1; + for (int i = 0; i < fieldNames.size(); i++) { + if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) { + keyCol = i; + } + if (fieldNames.get(i).equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) { + partitionCol = i; + } + } + if (keyCol == -1 || partitionCol == -1) { + throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath)); + } + return new OrcReaderIterator<>(recordReader, readSchema, orcSchema); + } catch (IOException e) { + throw new HoodieIOException("Failed to open reader from ORC file:" + filePath, e); + } + } + + /** + * Fetch {@link HoodieKey}s from the given ORC file. + * + * @param filePath The ORC file path. 
+ * @param configuration configuration to build fs object + * @return {@link List} of {@link HoodieKey}s fetched from the ORC file + */ + @Override + public List fetchHoodieKeys(Configuration configuration, Path filePath) { + try { + if (!filePath.getFileSystem(configuration).exists(filePath)) { + return Collections.emptyList(); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to read from ORC file:" + filePath, e); + } + List hoodieKeys = new ArrayList<>(); + try (ClosableIterator iterator = getHoodieKeyIterator(configuration, filePath, Option.empty())) { + iterator.forEachRemaining(hoodieKeys::add); + } + return hoodieKeys; + } + + @Override + public List fetchHoodieKeys(Configuration configuration, Path filePath, Option keyGeneratorOpt) { + throw new UnsupportedOperationException("Custom key generator is not supported yet"); + } + + @Override + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath, Option keyGeneratorOpt) { + throw new UnsupportedOperationException("Custom key generator is not supported yet"); + } + + /** + * NOTE: This literally reads the entire file contents, thus should be used with caution. + */ + @Override + public List readAvroRecords(Configuration configuration, Path filePath) { + Schema avroSchema; + try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) { + avroSchema = AvroOrcUtils.createAvroSchema(reader.getSchema()); + } catch (IOException io) { + throw new HoodieIOException("Unable to read Avro records from an ORC file:" + filePath, io); + } + return readAvroRecords(configuration, filePath, avroSchema); + } + + /** + * NOTE: This literally reads the entire file contents, thus should be used with caution. + */ + @Override + public List readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) { + List records = new ArrayList<>(); + try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) { + TypeDescription orcSchema = reader.getSchema(); + try (RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema))) { + OrcReaderIterator iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema); + while (iterator.hasNext()) { + GenericRecord record = iterator.next(); + records.add(record); + } + } + } catch (IOException io) { + throw new HoodieIOException("Unable to create an ORC reader for ORC file:" + filePath, io); + } + return records; + } + + /** + * Read the rowKey list matching the given filter, from the given ORC file. If the filter is empty, then this will + * return all the rowkeys. + * + * @param conf configuration to build fs object. + * @param filePath The ORC file path. 
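A usage sketch for OrcUtils.readAvroRecords above; as its javadoc warns, it materializes the whole file in memory, so it is only reasonable for small files (tests, tooling). The path below is hypothetical and a real ORC file at that location is assumed.

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.OrcUtils;

import java.util.List;

public class OrcReadSketch {
  public static void main(String[] args) {
    Path orcFile = new Path("/tmp/hoodie/trips/2022/01/01/file-0001.orc"); // hypothetical
    // Reads every record into a list; the schema is derived from the ORC footer.
    List<GenericRecord> records = new OrcUtils().readAvroRecords(new Configuration(), orcFile);
    records.forEach(System.out::println);
  }
}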
+ * @param filter record keys filter + * @return Set Set of row keys matching candidateRecordKeys + */ + @Override + public Set filterRowKeys(Configuration conf, Path filePath, Set filter) + throws HoodieIOException { + try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));) { + TypeDescription schema = reader.getSchema(); + try (RecordReader recordReader = reader.rows(new Options(conf).schema(schema))) { + Set filteredRowKeys = new HashSet<>(); + List fieldNames = schema.getFieldNames(); + VectorizedRowBatch batch = schema.createRowBatch(); + + // column index for the RECORD_KEY_METADATA_FIELD field + int colIndex = -1; + for (int i = 0; i < fieldNames.size(); i++) { + if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) { + colIndex = i; + break; + } + } + if (colIndex == -1) { + throw new HoodieException(String.format("Couldn't find row keys in %s.", filePath)); + } + while (recordReader.nextBatch(batch)) { + BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[colIndex]; + for (int i = 0; i < batch.size; i++) { + String rowKey = rowKeys.toString(i); + if (filter.isEmpty() || filter.contains(rowKey)) { + filteredRowKeys.add(rowKey); + } + } + } + return filteredRowKeys; + } + } catch (IOException io) { + throw new HoodieIOException("Unable to read row keys for ORC file:" + filePath, io); + } + } + + @Override + public Map readFooter(Configuration conf, boolean required, + Path orcFilePath, String... footerNames) { + try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) { + Map footerVals = new HashMap<>(); + List metadataItemList = reader.getFileTail().getFooter().getMetadataList(); + Map metadata = metadataItemList.stream().collect(Collectors.toMap( + UserMetadataItem::getName, + metadataItem -> metadataItem.getValue().toStringUtf8())); + for (String footerName : footerNames) { + if (metadata.containsKey(footerName)) { + footerVals.put(footerName, metadata.get(footerName)); + } else if (required) { + throw new MetadataNotFoundException( + "Could not find index in ORC footer. 
Looked for key " + footerName + " in " + orcFilePath); + } + } + return footerVals; + } catch (IOException io) { + throw new HoodieIOException("Unable to read footer for ORC file:" + orcFilePath, io); + } + } + + @Override + public Schema readAvroSchema(Configuration conf, Path orcFilePath) { + try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) { + if (reader.hasMetadataValue("orc.avro.schema")) { + ByteBuffer metadataValue = reader.getMetadataValue("orc.avro.schema"); + byte[] bytes = toBytes(metadataValue); + return new Schema.Parser().parse(new String(bytes)); + } else { + TypeDescription orcSchema = reader.getSchema(); + return AvroOrcUtils.createAvroSchema(orcSchema); + } + } catch (IOException io) { + throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io); + } + } + + @Override + public HoodieFileFormat getFormat() { + return HoodieFileFormat.ORC; + } + + @Override + public long getRowCount(Configuration conf, Path orcFilePath) { + try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) { + return reader.getNumberOfRows(); + } catch (IOException io) { + throw new HoodieIOException("Unable to get row count for ORC file:" + orcFilePath, io); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java index 20c79dd78e130..03bd471b606f1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java @@ -19,18 +19,17 @@ package org.apache.hudi.common.util; import org.apache.hudi.common.util.queue.BoundedInMemoryQueue; -import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieException; import org.apache.parquet.hadoop.ParquetReader; import java.io.IOException; -import java.util.Iterator; /** * This class wraps a parquet reader and provides an iterator based api to read from a parquet file. 
This is used in * {@link BoundedInMemoryQueue} */ -public class ParquetReaderIterator implements Iterator { +public class ParquetReaderIterator implements ClosableIterator { // Parquet reader for an existing parquet file private final ParquetReader parquetReader; @@ -49,8 +48,9 @@ public boolean hasNext() { this.next = parquetReader.read(); } return this.next != null; - } catch (IOException io) { - throw new HoodieIOException("unable to read next record from parquet file ", io); + } catch (Exception e) { + FileIOUtils.closeQuietly(parquetReader); + throw new HoodieException("unable to read next record from parquet file ", e); } } @@ -60,18 +60,23 @@ public T next() { // To handle case when next() is called before hasNext() if (this.next == null) { if (!hasNext()) { - throw new HoodieIOException("No more records left to read from parquet file"); + throw new HoodieException("No more records left to read from parquet file"); } } T retVal = this.next; this.next = parquetReader.read(); return retVal; - } catch (IOException io) { - throw new HoodieIOException("unable to read next record from parquet file ", io); + } catch (Exception e) { + FileIOUtils.closeQuietly(parquetReader); + throw new HoodieException("unable to read next record from parquet file ", e); } } - public void close() throws IOException { - parquetReader.close(); + public void close() { + try { + parquetReader.close(); + } catch (IOException e) { + throw new HoodieException("Exception while closing the parquet reader", e); + } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index dc444aa21bf14..ddd28fc4ea10a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -19,21 +19,21 @@ package org.apache.hudi.common.util; import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.HoodieAvroWriteSupport; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.bloom.BloomFilterTypeCode; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.MetadataNotFoundException; +import org.apache.hudi.keygen.BaseKeyGenerator; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.avro.AvroSchemaConverter; @@ -41,9 +41,17 @@ import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; + +import javax.annotation.Nonnull; import java.io.IOException; +import java.math.BigDecimal; 
+import java.math.BigInteger; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -51,22 +59,16 @@ import java.util.Map; import java.util.Set; import java.util.function.Function; +import java.util.stream.Collector; +import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Utility functions involving with parquet. */ -public class ParquetUtils { +public class ParquetUtils extends BaseFileUtils { - /** - * Read the rowKey list from the given parquet file. - * - * @param filePath The parquet file path. - * @param configuration configuration to build fs object - * @return Set Set of row keys - */ - public static Set readRowKeysFromParquet(Configuration configuration, Path filePath) { - return filterParquetRowKeys(configuration, filePath, new HashSet<>()); - } + private static final Logger LOG = LogManager.getLogger(ParquetUtils.class); /** * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will @@ -77,10 +79,22 @@ public static Set readRowKeysFromParquet(Configuration configuration, Pa * @param filter record keys filter * @return Set Set of row keys matching candidateRecordKeys */ - public static Set filterParquetRowKeys(Configuration configuration, Path filePath, Set filter) { + @Override + public Set filterRowKeys(Configuration configuration, Path filePath, Set filter) { return filterParquetRowKeys(configuration, filePath, filter, HoodieAvroUtils.getRecordKeySchema()); } + public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) { + ParquetMetadata footer; + try { + // TODO(vc): Should we use the parallel reading version here? + footer = ParquetFileReader.readFooter(FSUtils.getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath); + } catch (IOException e) { + throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e); + } + return footer; + } + /** * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will * return all the rowkeys. @@ -128,53 +142,72 @@ private static Set filterParquetRowKeys(Configuration configuration, Pat * @param configuration configuration to build fs object * @return {@link List} of {@link HoodieKey}s fetched from the parquet file */ - public static List fetchRecordKeyPartitionPathFromParquet(Configuration configuration, Path filePath) { - List hoodieKeys = new ArrayList<>(); - try { - if (!filePath.getFileSystem(configuration).exists(filePath)) { - return new ArrayList<>(); - } + @Override + public List fetchHoodieKeys(Configuration configuration, Path filePath) { + return fetchHoodieKeys(configuration, filePath, Option.empty()); + } + @Override + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath) { + return getHoodieKeyIterator(configuration, filePath, Option.empty()); + } + + /** + * Returns a closable iterator for reading the given parquet file. 
+ * + * @param configuration configuration to build fs object + * @param filePath The parquet file path + * @param keyGeneratorOpt instance of KeyGenerator + * + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the parquet file + */ + @Override + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath, Option keyGeneratorOpt) { + try { Configuration conf = new Configuration(configuration); conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf()); - Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema(); + Schema readSchema = keyGeneratorOpt.map(keyGenerator -> { + List fields = new ArrayList<>(); + fields.addAll(keyGenerator.getRecordKeyFieldNames()); + fields.addAll(keyGenerator.getPartitionPathFields()); + return HoodieAvroUtils.getSchemaForFields(readAvroSchema(conf, filePath), fields); + }) + .orElse(HoodieAvroUtils.getRecordKeyPartitionPathSchema()); AvroReadSupport.setAvroReadSchema(conf, readSchema); AvroReadSupport.setRequestedProjection(conf, readSchema); - ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build(); - Object obj = reader.read(); - while (obj != null) { - if (obj instanceof GenericRecord) { - String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - hoodieKeys.add(new HoodieKey(recordKey, partitionPath)); - obj = reader.read(); - } - } + ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build(); + return HoodieKeyIterator.getInstance(new ParquetReaderIterator<>(reader), keyGeneratorOpt); } catch (IOException e) { throw new HoodieIOException("Failed to read from Parquet file " + filePath, e); } - return hoodieKeys; } - public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) { - ParquetMetadata footer; - try { - // TODO(vc): Should we use the parallel reading version here? - footer = ParquetFileReader.readFooter(FSUtils.getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath); - } catch (IOException e) { - throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e); + /** + * Fetch {@link HoodieKey}s from the given parquet file. + * + * @param configuration configuration to build fs object + * @param filePath The parquet file path. + * @param keyGeneratorOpt instance of KeyGenerator. + * @return {@link List} of {@link HoodieKey}s fetched from the parquet file + */ + @Override + public List fetchHoodieKeys(Configuration configuration, Path filePath, Option keyGeneratorOpt) { + List hoodieKeys = new ArrayList<>(); + try (ClosableIterator iterator = getHoodieKeyIterator(configuration, filePath, keyGeneratorOpt)) { + iterator.forEachRemaining(hoodieKeys::add); + return hoodieKeys; } - return footer; } /** * Get the schema of the given parquet file. */ - public static MessageType readSchema(Configuration configuration, Path parquetFilePath) { + public MessageType readSchema(Configuration configuration, Path parquetFilePath) { return readMetadata(configuration, parquetFilePath).getFileMetaData().getSchema(); } - private static Map readParquetFooter(Configuration configuration, boolean required, + @Override + public Map readFooter(Configuration configuration, boolean required, Path parquetFilePath, String... 
footerNames) { Map footerVals = new HashMap<>(); ParquetMetadata footer = readMetadata(configuration, parquetFilePath); @@ -190,56 +223,24 @@ private static Map readParquetFooter(Configuration configuration return footerVals; } - public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) { - return new AvroSchemaConverter(configuration).convert(readSchema(configuration, parquetFilePath)); + @Override + public Schema readAvroSchema(Configuration configuration, Path parquetFilePath) { + MessageType parquetSchema = readSchema(configuration, parquetFilePath); + return new AvroSchemaConverter(configuration).convert(parquetSchema); } - /** - * Read out the bloom filter from the parquet file meta data. - */ - public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration, Path parquetFilePath) { - Map footerVals = - readParquetFooter(configuration, false, parquetFilePath, - HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, - HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, - HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE); - String footerVal = footerVals.get(HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY); - if (null == footerVal) { - // We use old style key "com.uber.hoodie.bloomfilter" - footerVal = footerVals.get(HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY); - } - BloomFilter toReturn = null; - if (footerVal != null) { - if (footerVals.containsKey(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)) { - toReturn = BloomFilterFactory.fromString(footerVal, - footerVals.get(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)); - } else { - toReturn = BloomFilterFactory.fromString(footerVal, BloomFilterTypeCode.SIMPLE.name()); - } - } - return toReturn; - } - - public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) { - Map minMaxKeys = readParquetFooter(configuration, true, parquetFilePath, - HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER); - if (minMaxKeys.size() != 2) { - throw new HoodieException( - String.format("Could not read min/max record key out of footer correctly from %s. read) : %s", - parquetFilePath, minMaxKeys)); - } - return new String[] {minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER), - minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)}; + @Override + public HoodieFileFormat getFormat() { + return HoodieFileFormat.PARQUET; } /** * NOTE: This literally reads the entire file contents, thus should be used with caution. 
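As a usage note for the iterator-based key reading introduced above: the sketch below streams record key and partition path pairs from a base file with the new `getHoodieKeyIterator` API instead of materializing whole records. This is a minimal illustration assuming the signatures shown in this patch; the file path is hypothetical.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.ClosableIterator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetUtils;

public class HoodieKeyScanExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    Path baseFile = new Path("/tmp/hudi/2022/01/01/some-base-file.parquet"); // hypothetical file

    // No custom key generator: keys come from the _hoodie_record_key /
    // _hoodie_partition_path metadata columns.
    try (ClosableIterator<HoodieKey> keys =
             new ParquetUtils().getHoodieKeyIterator(conf, baseFile, Option.empty())) {
      while (keys.hasNext()) {
        HoodieKey key = keys.next();
        System.out.println(key.getRecordKey() + " -> " + key.getPartitionPath());
      }
    } // the iterator closes the underlying parquet reader
  }
}
```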
*/ - public static List readAvroRecords(Configuration configuration, Path filePath) { - ParquetReader reader = null; + @Override + public List readAvroRecords(Configuration configuration, Path filePath) { List records = new ArrayList<>(); - try { - reader = AvroParquetReader.builder(filePath).withConf(configuration).build(); + try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(configuration).build()) { Object obj = reader.read(); while (obj != null) { if (obj instanceof GenericRecord) { @@ -250,25 +251,24 @@ public static List readAvroRecords(Configuration configuration, P } catch (IOException e) { throw new HoodieIOException("Failed to read avro records from Parquet " + filePath, e); - } finally { - if (reader != null) { - try { - reader.close(); - } catch (IOException e) { - // ignore - } - } } return records; } + @Override + public List readAvroRecords(Configuration configuration, Path filePath, Schema schema) { + AvroReadSupport.setAvroReadSchema(configuration, schema); + return readAvroRecords(configuration, filePath); + } + /** * Returns the number of records in the parquet file. * - * @param conf Configuration + * @param conf Configuration * @param parquetFilePath path of the file */ - public static long getRowCount(Configuration conf, Path parquetFilePath) { + @Override + public long getRowCount(Configuration conf, Path parquetFilePath) { ParquetMetadata footer; long rowCount = 0; footer = readMetadata(conf, parquetFilePath); @@ -291,4 +291,203 @@ public Boolean apply(String recordKey) { return candidateKeys.contains(recordKey); } } + + /** + * Parse min/max statistics stored in parquet footers for all columns. + */ + @SuppressWarnings("rawtype") + public List> readRangeFromParquetMetadata( + @Nonnull Configuration conf, + @Nonnull Path parquetFilePath, + @Nonnull List cols + ) { + ParquetMetadata metadata = readMetadata(conf, parquetFilePath); + + // NOTE: This collector has to have fully specialized generic type params since + // Java 1.8 struggles to infer them + Collector, ?, Map>>> groupingByCollector = + Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName); + + // Collect stats from all individual Parquet blocks + Map>> columnToStatsListMap = + (Map>>) metadata.getBlocks().stream().sequential() + .flatMap(blockMetaData -> + blockMetaData.getColumns().stream() + .filter(f -> cols.contains(f.getPath().toDotString())) + .map(columnChunkMetaData -> + HoodieColumnRangeMetadata.create( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + columnChunkMetaData.getStatistics().genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + columnChunkMetaData.getStatistics().genericGetMax()), + columnChunkMetaData.getStatistics().getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize())) + ) + .collect(groupingByCollector); + + // Combine those into file-level statistics + // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer + // expression type correctly) + Stream> stream = columnToStatsListMap.values() + .stream() + .map(this::getColumnRangeInFile); + + return stream.collect(Collectors.toList()); + } + + private > HoodieColumnRangeMetadata getColumnRangeInFile( + @Nonnull List> blockRanges + ) { + if (blockRanges.size() == 1) { + // only one block in parquet file. we can just return that range. 
+ return blockRanges.get(0); + } + + // there are multiple blocks. Compute min(block_mins) and max(block_maxs) + return blockRanges.stream() + .sequential() + .reduce(this::combineRanges).get(); + } + + private > HoodieColumnRangeMetadata combineRanges( + HoodieColumnRangeMetadata one, + HoodieColumnRangeMetadata another + ) { + final T minValue; + final T maxValue; + if (one.getMinValue() != null && another.getMinValue() != null) { + minValue = one.getMinValue().compareTo(another.getMinValue()) < 0 ? one.getMinValue() : another.getMinValue(); + } else if (one.getMinValue() == null) { + minValue = another.getMinValue(); + } else { + minValue = one.getMinValue(); + } + + if (one.getMaxValue() != null && another.getMaxValue() != null) { + maxValue = one.getMaxValue().compareTo(another.getMaxValue()) < 0 ? another.getMaxValue() : one.getMaxValue(); + } else if (one.getMaxValue() == null) { + maxValue = another.getMaxValue(); + } else { + maxValue = one.getMaxValue(); + } + + return HoodieColumnRangeMetadata.create( + one.getFilePath(), + one.getColumnName(), minValue, maxValue, + one.getNullCount() + another.getNullCount(), + one.getValueCount() + another.getValueCount(), + one.getTotalSize() + another.getTotalSize(), + one.getTotalUncompressedSize() + another.getTotalUncompressedSize()); + } + + private static Comparable convertToNativeJavaType(PrimitiveType primitiveType, Comparable val) { + if (val == null) { + return null; + } + + if (primitiveType.getOriginalType() == OriginalType.DECIMAL) { + return extractDecimal(val, primitiveType.getDecimalMetadata()); + } else if (primitiveType.getOriginalType() == OriginalType.DATE) { + // NOTE: This is a workaround to address race-condition in using + // {@code SimpleDataFormat} concurrently (w/in {@code DateStringifier}) + // TODO cleanup after Parquet upgrade to 1.12 + synchronized (primitiveType.stringifier()) { + // Date logical type is implemented as a signed INT32 + // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + return java.sql.Date.valueOf( + primitiveType.stringifier().stringify((Integer) val) + ); + } + } else if (primitiveType.getOriginalType() == OriginalType.UTF8) { + // NOTE: UTF8 type designates a byte array that should be interpreted as a + // UTF-8 encoded character string + // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + return ((Binary) val).toStringUsingUTF8(); + } else if (primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY) { + // NOTE: `getBytes` access makes a copy of the underlying byte buffer + return ((Binary) val).toByteBuffer(); + } + + return val; + } + + @Nonnull + private static BigDecimal extractDecimal(Object val, DecimalMetadata decimalMetadata) { + // In Parquet, Decimal could be represented as either of + // 1. INT32 (for 1 <= precision <= 9) + // 2. INT64 (for 1 <= precision <= 18) + // 3. FIXED_LEN_BYTE_ARRAY (precision is limited by the array size. Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits) + // 4. 
BINARY (precision is not limited) + // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL + int scale = decimalMetadata.getScale(); + if (val == null) { + return null; + } else if (val instanceof Integer) { + return BigDecimal.valueOf((Integer) val, scale); + } else if (val instanceof Long) { + return BigDecimal.valueOf((Long) val, scale); + } else if (val instanceof Binary) { + // NOTE: Unscaled number is stored in BE format (most significant byte is 0th) + return new BigDecimal(new BigInteger(((Binary) val).getBytesUnsafe()), scale); + } else { + throw new UnsupportedOperationException(String.format("Unsupported value type (%s)", val.getClass().getName())); + } + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * An iterator that can apply the given function {@code func} to transform records + * from the underneath record iterator to hoodie keys. + */ + private static class HoodieKeyIterator implements ClosableIterator { + private final ClosableIterator nestedItr; + private final Function func; + + public static HoodieKeyIterator getInstance(ClosableIterator nestedItr, Option keyGenerator) { + return new HoodieKeyIterator(nestedItr, keyGenerator); + } + + private HoodieKeyIterator(ClosableIterator nestedItr, Option keyGenerator) { + this.nestedItr = nestedItr; + if (keyGenerator.isPresent()) { + this.func = retVal -> { + String recordKey = keyGenerator.get().getRecordKey(retVal); + String partitionPath = keyGenerator.get().getPartitionPath(retVal); + return new HoodieKey(recordKey, partitionPath); + }; + } else { + this.func = retVal -> { + String recordKey = retVal.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partitionPath = retVal.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + return new HoodieKey(recordKey, partitionPath); + }; + } + } + + @Override + public void close() { + if (this.nestedItr != null) { + this.nestedItr.close(); + } + } + + @Override + public boolean hasNext() { + return this.nestedItr.hasNext(); + } + + @Override + public HoodieKey next() { + return this.func.apply(this.nestedItr.next()); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/PartitionPathEncodeUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/PartitionPathEncodeUtils.java new file mode 100644 index 0000000000000..e8562c21157b1 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/PartitionPathEncodeUtils.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
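The comment above lists the physical encodings Parquet may use for DECIMAL values. The following self-contained sketch (JDK types only, values invented for illustration) shows how an unscaled INT32/INT64 statistic and a big-endian byte array map to the same `BigDecimal`, which is what `extractDecimal` does for footer min/max values.

```java
import java.math.BigDecimal;
import java.math.BigInteger;

public class DecimalEncodingExample {
  public static void main(String[] args) {
    int scale = 2; // for a DECIMAL(10, 2) column the scale comes from the schema, not the value

    // INT32 / INT64 encodings store the unscaled value: 12345 means 123.45
    BigDecimal fromLong = BigDecimal.valueOf(12345L, scale);

    // FIXED_LEN_BYTE_ARRAY / BINARY encodings store the unscaled value as big-endian bytes
    byte[] unscaledBigEndian = new byte[] {0x30, 0x39}; // 0x3039 == 12345
    BigDecimal fromBytes = new BigDecimal(new BigInteger(unscaledBigEndian), scale);

    System.out.println(fromLong);                   // 123.45
    System.out.println(fromLong.equals(fromBytes)); // true
  }
}
```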
+ */ + +package org.apache.hudi.common.util; + +import java.util.BitSet; + +/** + * Utils to encode/decode the partition path. + * This code is mainly copy from Hive (org.apache.hadoop.hive.common.FileUtils). + */ +public class PartitionPathEncodeUtils { + + public static final String DEPRECATED_DEFAULT_PARTITION_PATH = "default"; + public static final String DEFAULT_PARTITION_PATH = "__HIVE_DEFAULT_PARTITION__"; + + static BitSet charToEscape = new BitSet(128); + static { + for (char c = 0; c < ' '; c++) { + charToEscape.set(c); + } + + /** + * ASCII 01-1F are HTTP control characters that need to be escaped. + * \u000A and \u000D are \n and \r, respectively. + */ + char[] clist = new char[] {'\u0001', '\u0002', '\u0003', '\u0004', + '\u0005', '\u0006', '\u0007', '\u0008', '\u0009', '\n', '\u000B', + '\u000C', '\r', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', + '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', + '\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F', + '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '\u007F', '{', + '[', ']', '^'}; + + for (char c : clist) { + charToEscape.set(c); + } + } + + static boolean needsEscaping(char c) { + return c >= 0 && c < charToEscape.size() && charToEscape.get(c); + } + + public static String escapePathName(String path) { + return escapePathName(path, null); + } + + /** + * Escapes a path name. + * @param path The path to escape. + * @param defaultPath + * The default name for the path, if the given path is empty or null. + * @return An escaped path name. + */ + public static String escapePathName(String path, String defaultPath) { + if (path == null || path.length() == 0) { + if (defaultPath == null) { + // previously, when path is empty or null and no default path is specified, + // "default" was the return value for escapePathName + return DEFAULT_PARTITION_PATH; + } else { + return defaultPath; + } + } + + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < path.length(); i++) { + char c = path.charAt(i); + if (needsEscaping(c)) { + sb.append('%'); + sb.append(String.format("%1$02X", (int) c)); + } else { + sb.append(c); + } + } + return sb.toString(); + } + + public static String unescapePathName(String path) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < path.length(); i++) { + char c = path.charAt(i); + if (c == '%' && i + 2 < path.length()) { + int code = -1; + try { + code = Integer.parseInt(path.substring(i + 1, i + 3), 16); + } catch (Exception e) { + code = -1; + } + if (code >= 0) { + sb.append((char) code); + i += 2; + continue; + } + } + sb.append(c); + } + return sb.toString(); + } + + public static String escapePartitionValue(String value) { + if (value == null || value.isEmpty()) { + return DEFAULT_PARTITION_PATH; + } else { + return escapePathName(value); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/RateLimiter.java b/hudi-common/src/main/java/org/apache/hudi/common/util/RateLimiter.java index e156ccffdbb97..4915e454af215 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/RateLimiter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/RateLimiter.java @@ -53,19 +53,22 @@ private RateLimiter(int permits, TimeUnit timePeriod) { } public boolean tryAcquire(int numPermits) { - if (numPermits > maxPermits) { - acquire(maxPermits); - return tryAcquire(numPermits - maxPermits); - } else { - return acquire(numPermits); + int remainingPermits = numPermits; + while (remainingPermits > 0) { + if 
(remainingPermits > maxPermits) { + acquire(maxPermits); + remainingPermits -= maxPermits; + } else { + return acquire(remainingPermits); + } } + return true; } public boolean acquire(int numOps) { try { - if (!semaphore.tryAcquire(numOps)) { + while (!semaphore.tryAcquire(numOps)) { Thread.sleep(WAIT_BEFORE_NEXT_ACQUIRE_PERMIT_IN_MS); - return acquire(numOps); } LOG.debug(String.format("acquire permits: %s, maxPremits: %s", numOps, maxPermits)); } catch (InterruptedException e) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java index 23a87e77076de..6ee7928c759da 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java @@ -45,25 +45,27 @@ public class ReflectionUtils { private static final Logger LOG = LogManager.getLogger(ReflectionUtils.class); - private static Map> clazzCache = new HashMap<>(); + private static final Map> CLAZZ_CACHE = new HashMap<>(); - private static Class getClass(String clazzName) { - if (!clazzCache.containsKey(clazzName)) { - try { - Class clazz = Class.forName(clazzName); - clazzCache.put(clazzName, clazz); - } catch (ClassNotFoundException e) { - throw new HoodieException("Unable to load class", e); + public static Class getClass(String clazzName) { + synchronized (CLAZZ_CACHE) { + if (!CLAZZ_CACHE.containsKey(clazzName)) { + try { + Class clazz = Class.forName(clazzName); + CLAZZ_CACHE.put(clazzName, clazz); + } catch (ClassNotFoundException e) { + throw new HoodieException("Unable to load class", e); + } } } - return clazzCache.get(clazzName); + return CLAZZ_CACHE.get(clazzName); } - public static T loadClass(String fqcn) { + public static T loadClass(String className) { try { - return (T) getClass(fqcn).newInstance(); + return (T) getClass(className).newInstance(); } catch (InstantiationException | IllegalAccessException e) { - throw new HoodieException("Could not load class " + fqcn, e); + throw new HoodieException("Could not load class " + className, e); } } @@ -80,13 +82,31 @@ public static T loadPayload(String recordPayload } /** - * Creates an instnace of the given class. Use this version when dealing with interface types as constructor args. + * Creates an instance of the given class. Use this version when dealing with interface types as constructor args. */ public static Object loadClass(String clazz, Class[] constructorArgTypes, Object... constructorArgs) { try { return getClass(clazz).getConstructor(constructorArgTypes).newInstance(constructorArgs); } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { - throw new HoodieException("Unable to instantiate class ", e); + throw new HoodieException("Unable to instantiate class " + clazz, e); + } + } + + /** + * Check if the clazz has the target constructor or not. + * + * When catch {@link HoodieException} from {@link #loadClass}, it's inconvenient to say if the exception was thrown + * due to the instantiation's own logic or missing constructor. + * + * TODO: ReflectionUtils should throw a specific exception to indicate Reflection problem. 
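A short sketch of how the new `hasConstructor` probe can be combined with the reflective loaders above: check for the richer constructor first, then fall back to the no-arg path. The class name and constructor arguments are hypothetical.

```java
import org.apache.hudi.common.util.ReflectionUtils;

public class ReflectiveLoadExample {
  public static void main(String[] args) {
    String clazzName = "com.example.CustomMergeHandler"; // hypothetical, usually read from config
    Class<?>[] ctorArgTypes = new Class<?>[] {String.class, Integer.class};

    Object handler;
    if (ReflectionUtils.hasConstructor(clazzName, ctorArgTypes)) {
      // Instantiate via the (String, Integer) constructor.
      handler = ReflectionUtils.loadClass(clazzName, ctorArgTypes, "my_table", 4);
    } else {
      // Fall back to the no-arg constructor instead of misreading a missing
      // constructor as an instantiation failure.
      handler = ReflectionUtils.loadClass(clazzName);
    }
    System.out.println("Loaded " + handler.getClass().getName());
  }
}
```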
+ */ + public static boolean hasConstructor(String clazz, Class[] constructorArgTypes) { + try { + getClass(clazz).getConstructor(constructorArgTypes); + return true; + } catch (NoSuchMethodException e) { + LOG.warn("Unable to instantiate class " + clazz, e); + return false; } } @@ -153,4 +173,11 @@ private static List findClasses(File directory, String packageName) { } return classes; } + + /** + * Returns whether the given two comparable values come from the same runtime class. + */ + public static boolean isSameClass(Comparable v, Comparable o) { + return v.getClass() == o.getClass(); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/RetryHelper.java b/hudi-common/src/main/java/org/apache/hudi/common/util/RetryHelper.java new file mode 100644 index 0000000000000..2e82b548f0da7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/RetryHelper.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import org.apache.hudi.exception.HoodieException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; + +public class RetryHelper implements Serializable { + private static final Logger LOG = LogManager.getLogger(RetryHelper.class); + private transient CheckedFunction func; + private final int num; + private final long maxIntervalTime; + private final long initialIntervalTime; + private String taskInfo = "N/A"; + private List> retryExceptionsClasses; + + public RetryHelper(long maxRetryIntervalMs, int maxRetryNumbers, long initialRetryIntervalMs, String retryExceptions) { + this.num = maxRetryNumbers; + this.initialIntervalTime = initialRetryIntervalMs; + this.maxIntervalTime = maxRetryIntervalMs; + if (StringUtils.isNullOrEmpty(retryExceptions)) { + this.retryExceptionsClasses = new ArrayList<>(); + } else { + try { + this.retryExceptionsClasses = Arrays.stream(retryExceptions.split(",")) + .map(exception -> (Exception) ReflectionUtils.loadClass(exception, "")) + .map(Exception::getClass) + .collect(Collectors.toList()); + } catch (HoodieException e) { + LOG.error("Exception while loading retry exceptions classes '" + retryExceptions + "'.", e); + this.retryExceptionsClasses = new ArrayList<>(); + } + } + } + + public RetryHelper(long maxRetryIntervalMs, int maxRetryNumbers, long initialRetryIntervalMs, String retryExceptions, String taskInfo) { + this(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptions); + this.taskInfo = taskInfo; + } + + public RetryHelper tryWith(CheckedFunction func) { + 
this.func = func; + return this; + } + + public T start(CheckedFunction func) throws IOException { + int retries = 0; + T functionResult = null; + + while (true) { + long waitTime = Math.min(getWaitTimeExp(retries), maxIntervalTime); + try { + functionResult = func.get(); + break; + } catch (IOException | RuntimeException e) { + if (!checkIfExceptionInRetryList(e)) { + throw e; + } + if (retries++ >= num) { + String message = "Still failed to " + taskInfo + " after retried " + num + " times."; + LOG.error(message, e); + if (e instanceof IOException) { + throw new IOException(message, e); + } + throw e; + } + LOG.warn("Catch Exception for " + taskInfo + ", will retry after " + waitTime + " ms.", e); + try { + Thread.sleep(waitTime); + } catch (InterruptedException ex) { + // ignore InterruptedException here + } + } + } + + if (retries > 0) { + LOG.info("Success to " + taskInfo + " after retried " + retries + " times."); + } + + return functionResult; + } + + public T start() throws IOException { + return start(this.func); + } + + private boolean checkIfExceptionInRetryList(Exception e) { + boolean inRetryList = false; + + // if users didn't set hoodie.filesystem.operation.retry.exceptions + // we will retry all the IOException and RuntimeException + if (retryExceptionsClasses.isEmpty()) { + return true; + } + + for (Class clazz : retryExceptionsClasses) { + if (clazz.isInstance(e)) { + inRetryList = true; + break; + } + } + return inRetryList; + } + + private long getWaitTimeExp(int retryCount) { + Random random = new Random(); + if (0 == retryCount) { + return initialIntervalTime; + } + + return (long) Math.pow(2, retryCount) * initialIntervalTime + random.nextInt(100); + } + + @FunctionalInterface + public interface CheckedFunction extends Serializable { + T get() throws IOException; + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java index 9041db5144458..872848a5d4979 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SerializationUtils.java @@ -19,8 +19,10 @@ package org.apache.hudi.common.util; import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.Serializer; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; +import org.apache.avro.util.Utf8; import org.objenesis.strategy.StdInstantiatorStrategy; import java.io.ByteArrayOutputStream; @@ -36,9 +38,6 @@ public class SerializationUtils { private static final ThreadLocal SERIALIZER_REF = ThreadLocal.withInitial(KryoSerializerInstance::new); - // Serialize - // ----------------------------------------------------------------------- - /** *

    * Serializes an {@code Object} to a byte array for storage/serialization. @@ -52,9 +51,6 @@ public static byte[] serialize(final Object obj) throws IOException { return SERIALIZER_REF.get().serialize(obj); } - // Deserialize - // ----------------------------------------------------------------------- - /** *

    * Deserializes a single {@code Object} from an array of bytes. @@ -112,17 +108,42 @@ Object deserialize(byte[] objectData) { private static class KryoInstantiator implements Serializable { public Kryo newKryo() { - Kryo kryo = new Kryo(); - // ensure that kryo doesn't fail if classes are not registered with kryo. + + // This instance of Kryo should not require prior registration of classes kryo.setRegistrationRequired(false); - // This would be used for object initialization if nothing else works out. kryo.setInstantiatorStrategy(new Kryo.DefaultInstantiatorStrategy(new StdInstantiatorStrategy())); // Handle cases where we may have an odd classloader setup like with libjars // for hadoop kryo.setClassLoader(Thread.currentThread().getContextClassLoader()); + + // Register serializers + kryo.register(Utf8.class, new AvroUtf8Serializer()); + return kryo; } } + + /** + * NOTE: This {@link Serializer} could deserialize instance of {@link Utf8} serialized + * by implicitly generated Kryo serializer (based on {@link com.esotericsoftware.kryo.serializers.FieldSerializer} + */ + private static class AvroUtf8Serializer extends Serializer { + + @SuppressWarnings("unchecked") + @Override + public void write(Kryo kryo, Output output, Utf8 utf8String) { + Serializer bytesSerializer = kryo.getDefaultSerializer(byte[].class); + bytesSerializer.write(kryo, output, utf8String.getBytes()); + } + + @SuppressWarnings("unchecked") + @Override + public Utf8 read(Kryo kryo, Input input, Class type) { + Serializer bytesSerializer = kryo.getDefaultSerializer(byte[].class); + byte[] bytes = bytesSerializer.read(kryo, input, byte[].class); + return new Utf8(bytes); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java index b6eab3cfb5621..d4bafd9c9feee 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java @@ -19,17 +19,23 @@ package org.apache.hudi.common.util; import org.apache.hudi.common.fs.SizeAwareDataOutputStream; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.collection.DiskBasedMap.FileEntry; +import org.apache.hudi.common.util.collection.BitCaskDiskMap.FileEntry; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieCorruptedDataException; +import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import java.io.IOException; import java.io.RandomAccessFile; -import java.util.zip.CRC32; + +import static org.apache.hudi.avro.HoodieAvroUtils.getNullableValAsString; +import static org.apache.hudi.common.util.BinaryUtil.generateChecksum; /** * A utility class supports spillable map. @@ -89,15 +95,6 @@ private static long spill(SizeAwareDataOutputStream outputStream, FileEntry file return outputStream.getSize(); } - /** - * Generate a checksum for a given set of bytes. 
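The Kryo setup above now registers an explicit serializer for Avro's `Utf8` (backed by its raw bytes). A minimal round-trip sketch, assuming the `serialize`/`deserialize` helpers shown in this class:

```java
import org.apache.avro.util.Utf8;
import org.apache.hudi.common.util.SerializationUtils;

import java.io.IOException;

public class KryoUtf8RoundTrip {
  public static void main(String[] args) throws IOException {
    Utf8 original = new Utf8("hoodie-record-key-001");

    // Serialized by the thread-local Kryo instance; Utf8 goes through the
    // byte[]-based AvroUtf8Serializer registered in this patch.
    byte[] bytes = SerializationUtils.serialize(original);

    Object copy = SerializationUtils.deserialize(bytes);
    System.out.println(original.equals(copy)); // true
  }
}
```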
- */ - public static long generateChecksum(byte[] data) { - CRC32 crc = new CRC32(); - crc.update(data); - return crc.getValue(); - } - /** * Compute a bytes representation of the payload by serializing the contents This is used to estimate the size of the * payload (either in memory or when written to disk). @@ -109,20 +106,64 @@ public static long computePayloadSize(R value, SizeEstimator valueSizeEst /** * Utility method to convert bytes to HoodieRecord using schema and payload class. */ - public static R convertToHoodieRecordPayload(GenericRecord rec, String payloadClazz) { - String recKey = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String partitionPath = rec.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord hoodieRecord = new HoodieRecord<>(new HoodieKey(recKey, partitionPath), - ReflectionUtils.loadPayload(payloadClazz, new Object[] {Option.of(rec)}, Option.class)); + public static R convertToHoodieRecordPayload(GenericRecord rec, String payloadClazz, String preCombineField, boolean withOperationField) { + return convertToHoodieRecordPayload(rec, payloadClazz, preCombineField, + Pair.of(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.PARTITION_PATH_METADATA_FIELD), + withOperationField, Option.empty()); + } + + public static R convertToHoodieRecordPayload(GenericRecord record, String payloadClazz, + String preCombineField, + boolean withOperationField, + Option partitionName) { + return convertToHoodieRecordPayload(record, payloadClazz, preCombineField, + Pair.of(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.PARTITION_PATH_METADATA_FIELD), + withOperationField, partitionName); + } + + /** + * Utility method to convert bytes to HoodieRecord using schema and payload class. + */ + public static R convertToHoodieRecordPayload(GenericRecord record, String payloadClazz, + String preCombineField, + Pair recordKeyPartitionPathFieldPair, + boolean withOperationField, + Option partitionName) { + final String recKey = record.get(recordKeyPartitionPathFieldPair.getKey()).toString(); + final String partitionPath = (partitionName.isPresent() ? partitionName.get() : + record.get(recordKeyPartitionPathFieldPair.getRight()).toString()); + + Object preCombineVal = getPreCombineVal(record, preCombineField); + HoodieOperation operation = withOperationField + ? HoodieOperation.fromName(getNullableValAsString(record, HoodieRecord.OPERATION_METADATA_FIELD)) : null; + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(new HoodieKey(recKey, partitionPath), + ReflectionUtils.loadPayload(payloadClazz, new Object[]{record, preCombineVal}, GenericRecord.class, + Comparable.class), operation); + return (R) hoodieRecord; } + /** + * Returns the preCombine value with given field name. + * + * @param rec The avro record + * @param preCombineField The preCombine field name + * @return the preCombine field value or 0 if the field does not exist in the avro schema + */ + private static Object getPreCombineVal(GenericRecord rec, String preCombineField) { + if (preCombineField == null) { + return 0; + } + Schema.Field field = rec.getSchema().getField(preCombineField); + return field == null ? 0 : rec.get(field.pos()); + } + /** * Utility method to convert bytes to HoodieRecord using schema and payload class. 
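The conversion above resolves the ordering (preCombine) value directly from the Avro record and quietly falls back to `0` when the configured field is not part of the schema. A self-contained sketch of that lookup rule (schema and field names invented for illustration):

```java
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class PreCombineLookupExample {
  public static void main(String[] args) {
    Schema schema = SchemaBuilder.record("Row").fields()
        .requiredString("_hoodie_record_key")
        .requiredLong("ts")
        .endRecord();

    GenericRecord rec = new GenericData.Record(schema);
    rec.put("_hoodie_record_key", "key-1");
    rec.put("ts", 1699999999L);

    System.out.println(preCombineVal(rec, "ts"));      // 1699999999
    System.out.println(preCombineVal(rec, "missing")); // 0 (field absent from the schema)
  }

  // Mirrors the getPreCombineVal logic above.
  static Object preCombineVal(GenericRecord rec, String preCombineField) {
    if (preCombineField == null) {
      return 0;
    }
    Schema.Field field = rec.getSchema().getField(preCombineField);
    return field == null ? 0 : rec.get(field.pos());
  }
}
```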
*/ - public static R generateEmptyPayload(String recKey, String partitionPath, String payloadClazz) { - HoodieRecord hoodieRecord = new HoodieRecord<>(new HoodieKey(recKey, partitionPath), - ReflectionUtils.loadPayload(payloadClazz, new Object[] {Option.empty()}, Option.class)); + public static R generateEmptyPayload(String recKey, String partitionPath, Comparable orderingVal, String payloadClazz) { + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(new HoodieKey(recKey, partitionPath), + ReflectionUtils.loadPayload(payloadClazz, new Object[] {null, orderingVal}, GenericRecord.class, Comparable.class)); return (R) hoodieRecord; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java index 49f1075508c86..a4f2c62437caa 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/StringUtils.java @@ -20,11 +20,19 @@ import javax.annotation.Nullable; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + /** * Simple utility for operations on strings. */ public class StringUtils { + public static final String EMPTY_STRING = ""; + /** *

    * Joins the elements of the provided array into a single String containing the provided list of elements. @@ -44,7 +52,7 @@ public class StringUtils { * */ public static String join(final String... elements) { - return join(elements, ""); + return join(elements, EMPTY_STRING); } public static String joinUsingDelim(String delim, final String... elements) { @@ -58,6 +66,18 @@ public static String join(final String[] array, final String separator) { return org.apache.hadoop.util.StringUtils.join(separator, array); } + /** + * Wrapper of {@link java.lang.String#join(CharSequence, Iterable)}. + * + * Allow return {@code null} when {@code Iterable} is {@code null}. + */ + public static String join(CharSequence delimiter, Iterable elements) { + if (elements == null) { + return null; + } + return String.join(delimiter, elements); + } + public static String toHexString(byte[] bytes) { StringBuilder sb = new StringBuilder(bytes.length * 2); for (byte b : bytes) { @@ -70,6 +90,9 @@ public static boolean isNullOrEmpty(String str) { return str == null || str.length() == 0; } + public static boolean nonEmpty(String str) { + return !isNullOrEmpty(str); + } /** * Returns the given string if it is non-null; the empty string otherwise. @@ -82,7 +105,10 @@ public static String nullToEmpty(@Nullable String string) { } public static String objToString(@Nullable Object obj) { - return obj == null ? null : obj.toString(); + if (obj == null) { + return null; + } + return obj instanceof ByteBuffer ? toHexString(((ByteBuffer) obj).array()) : obj.toString(); } /** @@ -98,4 +124,15 @@ public static String objToString(@Nullable Object obj) { private static boolean stringIsNullOrEmpty(@Nullable String string) { return string == null || string.isEmpty(); } + + /** + * Splits input string, delimited {@code delimiter} into a list of non-empty strings + * (skipping any empty string produced during splitting) + */ + public static List split(@Nullable String input, String delimiter) { + if (isNullOrEmpty(input)) { + return Collections.emptyList(); + } + return Stream.of(input.split(delimiter)).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/TablePathUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/TablePathUtils.java index 6982fdbbdf112..9d279d5328ccc 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/TablePathUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/TablePathUtils.java @@ -50,13 +50,13 @@ public static Option getTablePath(FileSystem fs, Path path) throws HoodieE FileStatus fileStatus = fs.getFileStatus(path); Path directory = fileStatus.isFile() ? 
fileStatus.getPath().getParent() : fileStatus.getPath(); - if (TablePathUtils.hasTableMetadataFolder(fs, directory)) { + if (hasTableMetadataFolder(fs, directory)) { // Handle table folder itself return Option.of(directory); } // Handle metadata folder or metadata sub folder path - Option tablePath = getTablePathFromTableMetadataPath(fs, directory); + Option tablePath = getTablePathFromMetaFolderPath(directory); if (tablePath.isPresent()) { return tablePath; } @@ -65,20 +65,20 @@ public static Option getTablePath(FileSystem fs, Path path) throws HoodieE return getTablePathFromPartitionPath(fs, directory); } - private static boolean isTableMetadataFolder(String path) { - return path != null && path.endsWith("/" + HoodieTableMetaClient.METAFOLDER_NAME); + private static boolean isInsideTableMetaFolder(String path) { + return path != null && path.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME); } - private static boolean isInsideTableMetadataFolder(String path) { - return path != null && path.contains("/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"); + private static boolean isInsideMetadataTableInMetaFolder(String path) { + return path != null && path.contains("/" + HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH); } - private static Option getTablePathFromTableMetadataPath(FileSystem fs, Path path) { + private static Option getTablePathFromMetaFolderPath(Path path) { String pathStr = path.toString(); - if (isTableMetadataFolder(pathStr)) { - return Option.of(path.getParent()); - } else if (isInsideTableMetadataFolder(pathStr)) { + // NOTE: Since Metadata Table itself resides w/in the Meta-folder, we need to make sure + // that we don't misinterpret attempt to read MT table itself + if (isInsideTableMetaFolder(pathStr) && !isInsideMetadataTableInMetaFolder(pathStr)) { int index = pathStr.indexOf("/" + HoodieTableMetaClient.METAFOLDER_NAME); return Option.of(new Path(pathStr.substring(0, index))); } @@ -92,12 +92,21 @@ private static Option getTablePathFromPartitionPath(FileSystem fs, Path pa HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, partitionPath); metadata.readFromFS(); return Option.of(getNthParent(partitionPath, metadata.getPartitionDepth())); + } else { + // Simply traverse directory structure until found .hoodie folder + Path current = partitionPath; + while (current != null) { + if (hasTableMetadataFolder(fs, current)) { + return Option.of(current); + } + current = current.getParent(); + } + + return Option.empty(); } } catch (IOException ioe) { throw new HoodieException("Error reading partition metadata for " + partitionPath, ioe); } - - return Option.empty(); } private static Path getNthParent(Path path, int n) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/TypeUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/TypeUtils.java new file mode 100644 index 0000000000000..87ce471baa4d7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/TypeUtils.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
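With the TablePathUtils change above, a path anywhere under a table can be resolved to the table base path either via partition metadata or by walking up until the `.hoodie` meta-folder is found. A hedged usage sketch; the paths are hypothetical:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.TablePathUtils;

public class TablePathExample {
  public static void main(String[] args) throws Exception {
    Path somePathUnderTable = new Path("/warehouse/my_table/2022/01/01"); // hypothetical partition dir
    FileSystem fs = somePathUnderTable.getFileSystem(new Configuration());

    Option<Path> tablePath = TablePathUtils.getTablePath(fs, somePathUnderTable);
    if (tablePath.isPresent()) {
      System.out.println("Table base path: " + tablePath.get());
    } else {
      System.out.println("Not a path under a Hudi table");
    }
  }
}
```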
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import javax.annotation.Nonnull; +import java.util.Arrays; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +public final class TypeUtils { + + private TypeUtils() {} + + /** + * Maps values from the provided Enum's {@link Class} into corresponding values, + * extracted by provided {@code valueMapper} + */ + public static > Map getValueToEnumMap( + @Nonnull Class klass, + @Nonnull Function valueMapper + ) { + return Arrays.stream(klass.getEnumConstants()) + .collect(Collectors.toMap(valueMapper, Function.identity())); + } + + /** + * This utility abstracts unsafe type-casting in a way that allows to + *

+   * <ul>
+   *   <li>Search for such type-casts more easily (just searching for usages of this method)</li>
+   *   <li>Avoid type-cast warnings from the compiler</li>
+   * </ul>
    + */ + @SuppressWarnings("unchecked") + public static T unsafeCast(Object o) { + return (T) o; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java new file mode 100644 index 0000000000000..9fb0b20e74f2c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java @@ -0,0 +1,458 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util.collection; + +import org.apache.hudi.common.fs.SizeAwareDataOutputStream; +import org.apache.hudi.common.util.BufferedRandomAccessFile; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.SerializationUtils; +import org.apache.hudi.common.util.SpillableMapUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieNotSupportedException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; +import java.io.Serializable; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Stream; +import java.util.zip.Deflater; +import java.util.zip.DeflaterOutputStream; +import java.util.zip.InflaterInputStream; + +import static org.apache.hudi.common.util.BinaryUtil.generateChecksum; + +/** + * This class provides a disk spillable only map implementation. All of the data is currenly written to one file, + * without any rollover support. It uses the following : 1) An in-memory map that tracks the key-> latest ValueMetadata. 
+ * 2) Current position in the file NOTE : Only String.class type supported for Key + * + * Inspired by https://github.com/basho/bitcask + */ +public final class BitCaskDiskMap extends DiskMap { + + public static final int BUFFER_SIZE = 128 * 1024; // 128 KB + private static final Logger LOG = LogManager.getLogger(BitCaskDiskMap.class); + // Caching byte compression/decompression to avoid creating instances for every operation + private static final ThreadLocal DISK_COMPRESSION_REF = + ThreadLocal.withInitial(CompressionHandler::new); + + // Stores the key and corresponding value's latest metadata spilled to disk + private final Map valueMetadataMap; + // Enables compression for all values stored in the disk map + private final boolean isCompressionEnabled; + // Write only file + private final File writeOnlyFile; + // Write only OutputStream to be able to ONLY append to the file + private final SizeAwareDataOutputStream writeOnlyFileHandle; + // FileOutputStream for the file handle to be able to force fsync + // since FileOutputStream's flush() does not force flush to disk + private final FileOutputStream fileOutputStream; + // Current position in the file + private final AtomicLong filePosition; + // FilePath to store the spilled data + private final String filePath; + // Thread-safe random access file + private final ThreadLocal randomAccessFile = new ThreadLocal<>(); + private final Queue openedAccessFiles = new ConcurrentLinkedQueue<>(); + + private final List> iterators = new ArrayList<>(); + + public BitCaskDiskMap(String baseFilePath, boolean isCompressionEnabled) throws IOException { + super(baseFilePath, ExternalSpillableMap.DiskMapType.BITCASK.name()); + this.valueMetadataMap = new ConcurrentHashMap<>(); + this.isCompressionEnabled = isCompressionEnabled; + this.writeOnlyFile = new File(diskMapPath, UUID.randomUUID().toString()); + this.filePath = writeOnlyFile.getPath(); + initFile(writeOnlyFile); + this.fileOutputStream = new FileOutputStream(writeOnlyFile, true); + this.writeOnlyFileHandle = new SizeAwareDataOutputStream(fileOutputStream, BUFFER_SIZE); + this.filePosition = new AtomicLong(0L); + } + + public BitCaskDiskMap(String baseFilePath) throws IOException { + this(baseFilePath, false); + } + + /** + * RandomAcessFile is not thread-safe. This API opens a new file handle per thread and returns. + * + * @return + */ + private BufferedRandomAccessFile getRandomAccessFile() { + try { + BufferedRandomAccessFile readHandle = randomAccessFile.get(); + if (readHandle == null) { + readHandle = new BufferedRandomAccessFile(filePath, "r", BUFFER_SIZE); + readHandle.seek(0); + randomAccessFile.set(readHandle); + openedAccessFiles.offer(readHandle); + } + return readHandle; + } catch (IOException ioe) { + throw new HoodieException(ioe); + } + } + + private void initFile(File writeOnlyFile) throws IOException { + // delete the file if it exists + if (writeOnlyFile.exists()) { + writeOnlyFile.delete(); + } + if (!writeOnlyFile.getParentFile().exists()) { + writeOnlyFile.getParentFile().mkdir(); + } + writeOnlyFile.createNewFile(); + LOG.debug("Spilling to file location " + writeOnlyFile.getAbsolutePath()); + // Make sure file is deleted when JVM exits + writeOnlyFile.deleteOnExit(); + } + + private void flushToDisk() { + try { + writeOnlyFileHandle.flush(); + } catch (IOException e) { + throw new HoodieIOException("Failed to flush to BitCaskDiskMap file", e); + } + } + + /** + * Custom iterator to iterate over values written to disk. 
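`getRandomAccessFile` above hands out one read handle per thread because `RandomAccessFile` is not thread-safe, while a shared queue remembers every opened handle so `close()` can release them all. A stripped-down sketch of that pattern with plain JDK types (the spill file path is hypothetical):

```java
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

public class PerThreadReadHandles {
  private final String filePath = "/tmp/spill.data"; // hypothetical spill file
  private final ThreadLocal<RandomAccessFile> readHandle = new ThreadLocal<>();
  private final Queue<RandomAccessFile> opened = new ConcurrentLinkedQueue<>();

  RandomAccessFile handle() {
    RandomAccessFile handle = readHandle.get();
    if (handle == null) {
      try {
        handle = new RandomAccessFile(filePath, "r");
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
      readHandle.set(handle); // reused by this thread on subsequent reads
      opened.offer(handle);   // tracked so close() can release it
    }
    return handle;
  }

  void close() throws IOException {
    RandomAccessFile f;
    while ((f = opened.poll()) != null) {
      f.close();
    }
  }
}
```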
+ */ + @Override + public Iterator iterator() { + ClosableIterator iterator = new LazyFileIterable(filePath, valueMetadataMap, isCompressionEnabled).iterator(); + this.iterators.add(iterator); + return iterator; + } + + /** + * Number of bytes spilled to disk. + */ + @Override + public long sizeOfFileOnDiskInBytes() { + return filePosition.get(); + } + + @Override + public int size() { + return valueMetadataMap.size(); + } + + @Override + public boolean isEmpty() { + return valueMetadataMap.isEmpty(); + } + + @Override + public boolean containsKey(Object key) { + return valueMetadataMap.containsKey(key); + } + + @Override + public boolean containsValue(Object value) { + throw new HoodieNotSupportedException("unable to compare values in map"); + } + + @Override + public R get(Object key) { + ValueMetadata entry = valueMetadataMap.get(key); + if (entry == null) { + return null; + } + return get(entry); + } + + private R get(ValueMetadata entry) { + return get(entry, getRandomAccessFile(), isCompressionEnabled); + } + + public static R get(ValueMetadata entry, RandomAccessFile file, boolean isCompressionEnabled) { + try { + byte[] bytesFromDisk = SpillableMapUtils.readBytesFromDisk(file, entry.getOffsetOfValue(), entry.getSizeOfValue()); + if (isCompressionEnabled) { + return SerializationUtils.deserialize(DISK_COMPRESSION_REF.get().decompressBytes(bytesFromDisk)); + } + return SerializationUtils.deserialize(bytesFromDisk); + } catch (IOException e) { + throw new HoodieIOException("Unable to readFromDisk Hoodie Record from disk", e); + } + } + + private synchronized R put(T key, R value, boolean flush) { + try { + byte[] val = isCompressionEnabled ? DISK_COMPRESSION_REF.get().compressBytes(SerializationUtils.serialize(value)) : + SerializationUtils.serialize(value); + Integer valueSize = val.length; + Long timestamp = System.currentTimeMillis(); + this.valueMetadataMap.put(key, + new BitCaskDiskMap.ValueMetadata(this.filePath, valueSize, filePosition.get(), timestamp)); + byte[] serializedKey = SerializationUtils.serialize(key); + filePosition + .set(SpillableMapUtils.spillToDisk(writeOnlyFileHandle, new FileEntry(generateChecksum(val), + serializedKey.length, valueSize, serializedKey, val, timestamp))); + if (flush) { + flushToDisk(); + } + } catch (IOException io) { + throw new HoodieIOException("Unable to store data in Disk Based map", io); + } + return value; + } + + @Override + public R put(T key, R value) { + return put(key, value, true); + } + + @Override + public R remove(Object key) { + R value = get(key); + valueMetadataMap.remove(key); + return value; + } + + @Override + public void putAll(Map m) { + for (Map.Entry entry : m.entrySet()) { + put(entry.getKey(), entry.getValue(), false); + } + flushToDisk(); + } + + @Override + public void clear() { + valueMetadataMap.clear(); + // Do not delete file-handles & file as there is no way to do it without synchronizing get/put(and + // reducing concurrency). Instead, just clear the pointer map. The file will be removed on exit. 
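Taken together, BitCaskDiskMap keeps only a small ValueMetadata record (file path, value size, offset, timestamp) per key in memory, appends the serialized and optionally compressed value to a single write-only file, and re-reads it on lookup through a thread-local BufferedRandomAccessFile. A minimal usage sketch, assuming the type parameters erased above are <T extends Serializable, R extends Serializable> and using an illustrative spill directory:

import org.apache.hudi.common.util.collection.BitCaskDiskMap;

public class BitCaskDiskMapExample {
  public static void main(String[] args) throws Exception {
    // Spill directory is illustrative; 'true' enables per-value compression via the CompressionHandler further down
    BitCaskDiskMap<String, String> diskMap = new BitCaskDiskMap<>("/tmp/hudi-spill", true);
    try {
      diskMap.put("key1", "value1");                         // appends the serialized value to the spill file
      System.out.println(diskMap.get("key1"));               // random read via the in-memory offset/size metadata
      System.out.println(diskMap.sizeOfFileOnDiskInBytes()); // bytes spilled to disk so far
    } finally {
      diskMap.close();                                       // flushes, closes handles and deletes the spill file
    }
  }
}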
+ } + + @Override + public void close() { + valueMetadataMap.clear(); + try { + if (writeOnlyFileHandle != null) { + writeOnlyFileHandle.flush(); + fileOutputStream.getChannel().force(false); + writeOnlyFileHandle.close(); + } + + while (!openedAccessFiles.isEmpty()) { + BufferedRandomAccessFile file = openedAccessFiles.poll(); + if (null != file) { + try { + file.close(); + } catch (IOException ioe) { + // skip exception + } + } + } + writeOnlyFile.delete(); + this.iterators.forEach(ClosableIterator::close); + } catch (Exception e) { + // delete the file for any sort of exception + writeOnlyFile.delete(); + } finally { + super.close(); + } + } + + @Override + public Set keySet() { + return valueMetadataMap.keySet(); + } + + @Override + public Collection values() { + throw new HoodieException("Unsupported Operation Exception"); + } + + @Override + public Stream valueStream() { + final BufferedRandomAccessFile file = getRandomAccessFile(); + return valueMetadataMap.values().stream().sorted().sequential().map(valueMetaData -> (R) get(valueMetaData, file, isCompressionEnabled)); + } + + @Override + public Set> entrySet() { + Set> entrySet = new HashSet<>(); + for (T key : valueMetadataMap.keySet()) { + entrySet.add(new AbstractMap.SimpleEntry<>(key, get(key))); + } + return entrySet; + } + + /** + * The file metadata that should be spilled to disk. + */ + public static final class FileEntry { + + // Checksum of the value written to disk, compared during every readFromDisk to make sure no corruption + private Long crc; + // Size (numberOfBytes) of the key written to disk + private Integer sizeOfKey; + // Size (numberOfBytes) of the value written to disk + private Integer sizeOfValue; + // Actual key + private byte[] key; + // Actual value + private byte[] value; + // Current timestamp when the value was written to disk + private Long timestamp; + + public FileEntry(long crc, int sizeOfKey, int sizeOfValue, byte[] key, byte[] value, long timestamp) { + this.crc = crc; + this.sizeOfKey = sizeOfKey; + this.sizeOfValue = sizeOfValue; + this.key = key; + this.value = value; + this.timestamp = timestamp; + } + + public long getCrc() { + return crc; + } + + public int getSizeOfKey() { + return sizeOfKey; + } + + public int getSizeOfValue() { + return sizeOfValue; + } + + public byte[] getKey() { + return key; + } + + public byte[] getValue() { + return value; + } + + public long getTimestamp() { + return timestamp; + } + } + + /** + * The value relevant metadata. 
+ */ + public static final class ValueMetadata implements Comparable { + + // FilePath to store the spilled data + private String filePath; + // Size (numberOfBytes) of the value written to disk + private Integer sizeOfValue; + // FilePosition of the value written to disk + private Long offsetOfValue; + // Current timestamp when the value was written to disk + private Long timestamp; + + protected ValueMetadata(String filePath, int sizeOfValue, long offsetOfValue, long timestamp) { + this.filePath = filePath; + this.sizeOfValue = sizeOfValue; + this.offsetOfValue = offsetOfValue; + this.timestamp = timestamp; + } + + public String getFilePath() { + return filePath; + } + + public int getSizeOfValue() { + return sizeOfValue; + } + + public Long getOffsetOfValue() { + return offsetOfValue; + } + + public long getTimestamp() { + return timestamp; + } + + @Override + public int compareTo(ValueMetadata o) { + return Long.compare(this.offsetOfValue, o.offsetOfValue); + } + } + + private static class CompressionHandler implements Serializable { + private static final int DISK_COMPRESSION_INITIAL_BUFFER_SIZE = 1048576; + private static final int DECOMPRESS_INTERMEDIATE_BUFFER_SIZE = 8192; + + // Caching ByteArrayOutputStreams to avoid recreating it for every operation + private final ByteArrayOutputStream compressBaos; + private final ByteArrayOutputStream decompressBaos; + private final byte[] decompressIntermediateBuffer; + + CompressionHandler() { + compressBaos = new ByteArrayOutputStream(DISK_COMPRESSION_INITIAL_BUFFER_SIZE); + decompressBaos = new ByteArrayOutputStream(DISK_COMPRESSION_INITIAL_BUFFER_SIZE); + decompressIntermediateBuffer = new byte[DECOMPRESS_INTERMEDIATE_BUFFER_SIZE]; + } + + private byte[] compressBytes(final byte[] value) throws IOException { + compressBaos.reset(); + Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION); + DeflaterOutputStream dos = new DeflaterOutputStream(compressBaos, deflater); + try { + dos.write(value); + } finally { + dos.close(); + deflater.end(); + } + return compressBaos.toByteArray(); + } + + private byte[] decompressBytes(final byte[] bytes) throws IOException { + decompressBaos.reset(); + InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes)); + try { + int len; + while ((len = in.read(decompressIntermediateBuffer)) > 0) { + decompressBaos.write(decompressIntermediateBuffer, 0, len); + } + return decompressBaos.toByteArray(); + } catch (IOException e) { + throw new HoodieIOException("IOException while decompressing bytes", e); + } + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskBasedMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskBasedMap.java deleted file mode 100644 index fe4666305d3f6..0000000000000 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskBasedMap.java +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.common.util.collection; - -import org.apache.hudi.common.fs.SizeAwareDataOutputStream; -import org.apache.hudi.common.util.BufferedRandomAccessFile; -import org.apache.hudi.common.util.SerializationUtils; -import org.apache.hudi.common.util.SpillableMapUtils; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieNotSupportedException; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.io.Serializable; -import java.net.InetAddress; -import java.util.AbstractMap; -import java.util.Collection; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Queue; -import java.util.Set; -import java.util.UUID; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Stream; - -/** - * This class provides a disk spillable only map implementation. All of the data is currenly written to one file, - * without any rollover support. It uses the following : 1) An in-memory map that tracks the key-> latest ValueMetadata. - * 2) Current position in the file NOTE : Only String.class type supported for Key - */ -public final class DiskBasedMap implements Map, Iterable { - - public static final int BUFFER_SIZE = 128 * 1024; // 128 KB - private static final Logger LOG = LogManager.getLogger(DiskBasedMap.class); - // Stores the key and corresponding value's latest metadata spilled to disk - private final Map valueMetadataMap; - // Write only file - private File writeOnlyFile; - // Write only OutputStream to be able to ONLY append to the file - private SizeAwareDataOutputStream writeOnlyFileHandle; - // FileOutputStream for the file handle to be able to force fsync - // since FileOutputStream's flush() does not force flush to disk - private FileOutputStream fileOutputStream; - // Current position in the file - private AtomicLong filePosition; - // FilePath to store the spilled data - private String filePath; - // Thread-safe random access file - private ThreadLocal randomAccessFile = new ThreadLocal<>(); - private Queue openedAccessFiles = new ConcurrentLinkedQueue<>(); - - private transient Thread shutdownThread = null; - - public DiskBasedMap(String baseFilePath) throws IOException { - this.valueMetadataMap = new ConcurrentHashMap<>(); - this.writeOnlyFile = new File(baseFilePath, UUID.randomUUID().toString()); - this.filePath = writeOnlyFile.getPath(); - initFile(writeOnlyFile); - this.fileOutputStream = new FileOutputStream(writeOnlyFile, true); - this.writeOnlyFileHandle = new SizeAwareDataOutputStream(fileOutputStream, BUFFER_SIZE); - this.filePosition = new AtomicLong(0L); - } - - /** - * RandomAcessFile is not thread-safe. This API opens a new file handle per thread and returns. 
- * - * @return - */ - private BufferedRandomAccessFile getRandomAccessFile() { - try { - BufferedRandomAccessFile readHandle = randomAccessFile.get(); - if (readHandle == null) { - readHandle = new BufferedRandomAccessFile(filePath, "r", BUFFER_SIZE); - readHandle.seek(0); - randomAccessFile.set(readHandle); - openedAccessFiles.offer(readHandle); - } - return readHandle; - } catch (IOException ioe) { - throw new HoodieException(ioe); - } - } - - private void initFile(File writeOnlyFile) throws IOException { - // delete the file if it exists - if (writeOnlyFile.exists()) { - writeOnlyFile.delete(); - } - if (!writeOnlyFile.getParentFile().exists()) { - writeOnlyFile.getParentFile().mkdir(); - } - writeOnlyFile.createNewFile(); - LOG.info("Spilling to file location " + writeOnlyFile.getAbsolutePath() + " in host (" - + InetAddress.getLocalHost().getHostAddress() + ") with hostname (" + InetAddress.getLocalHost().getHostName() - + ")"); - // Make sure file is deleted when JVM exits - writeOnlyFile.deleteOnExit(); - addShutDownHook(); - } - - /** - * Register shutdown hook to force flush contents of the data written to FileOutputStream from OS page cache - * (typically 4 KB) to disk. - */ - private void addShutDownHook() { - shutdownThread = new Thread(this::cleanup); - Runtime.getRuntime().addShutdownHook(shutdownThread); - } - - private void flushToDisk() { - try { - writeOnlyFileHandle.flush(); - } catch (IOException e) { - throw new HoodieIOException("Failed to flush to DiskBasedMap file", e); - } - } - - /** - * Custom iterator to iterate over values written to disk. - */ - @Override - public Iterator iterator() { - return new LazyFileIterable(filePath, valueMetadataMap).iterator(); - } - - /** - * Number of bytes spilled to disk. - */ - public long sizeOfFileOnDiskInBytes() { - return filePosition.get(); - } - - @Override - public int size() { - return valueMetadataMap.size(); - } - - @Override - public boolean isEmpty() { - return valueMetadataMap.isEmpty(); - } - - @Override - public boolean containsKey(Object key) { - return valueMetadataMap.containsKey(key); - } - - @Override - public boolean containsValue(Object value) { - throw new HoodieNotSupportedException("unable to compare values in map"); - } - - @Override - public R get(Object key) { - ValueMetadata entry = valueMetadataMap.get(key); - if (entry == null) { - return null; - } - return get(entry); - } - - private R get(ValueMetadata entry) { - return get(entry, getRandomAccessFile()); - } - - public static R get(ValueMetadata entry, RandomAccessFile file) { - try { - return SerializationUtils - .deserialize(SpillableMapUtils.readBytesFromDisk(file, entry.getOffsetOfValue(), entry.getSizeOfValue())); - } catch (IOException e) { - throw new HoodieIOException("Unable to readFromDisk Hoodie Record from disk", e); - } - } - - private synchronized R put(T key, R value, boolean flush) { - try { - byte[] val = SerializationUtils.serialize(value); - Integer valueSize = val.length; - Long timestamp = System.currentTimeMillis(); - this.valueMetadataMap.put(key, - new DiskBasedMap.ValueMetadata(this.filePath, valueSize, filePosition.get(), timestamp)); - byte[] serializedKey = SerializationUtils.serialize(key); - filePosition - .set(SpillableMapUtils.spillToDisk(writeOnlyFileHandle, new FileEntry(SpillableMapUtils.generateChecksum(val), - serializedKey.length, valueSize, serializedKey, val, timestamp))); - if (flush) { - flushToDisk(); - } - } catch (IOException io) { - throw new HoodieIOException("Unable to store data in Disk Based 
map", io); - } - return value; - } - - @Override - public R put(T key, R value) { - return put(key, value, true); - } - - @Override - public R remove(Object key) { - R value = get(key); - valueMetadataMap.remove(key); - return value; - } - - @Override - public void putAll(Map m) { - for (Map.Entry entry : m.entrySet()) { - put(entry.getKey(), entry.getValue(), false); - } - flushToDisk(); - } - - @Override - public void clear() { - valueMetadataMap.clear(); - // Do not delete file-handles & file as there is no way to do it without synchronizing get/put(and - // reducing concurrency). Instead, just clear the pointer map. The file will be removed on exit. - } - - public void close() { - cleanup(); - if (shutdownThread != null) { - Runtime.getRuntime().removeShutdownHook(shutdownThread); - } - } - - private void cleanup() { - valueMetadataMap.clear(); - try { - if (writeOnlyFileHandle != null) { - writeOnlyFileHandle.flush(); - fileOutputStream.getChannel().force(false); - writeOnlyFileHandle.close(); - } - - while (!openedAccessFiles.isEmpty()) { - BufferedRandomAccessFile file = openedAccessFiles.poll(); - if (null != file) { - try { - file.close(); - } catch (IOException ioe) { - // skip exception - } - } - } - writeOnlyFile.delete(); - } catch (Exception e) { - // delete the file for any sort of exception - writeOnlyFile.delete(); - } - } - - @Override - public Set keySet() { - return valueMetadataMap.keySet(); - } - - @Override - public Collection values() { - throw new HoodieException("Unsupported Operation Exception"); - } - - public Stream valueStream() { - final BufferedRandomAccessFile file = getRandomAccessFile(); - return valueMetadataMap.values().stream().sorted().sequential().map(valueMetaData -> (R) get(valueMetaData, file)); - } - - @Override - public Set> entrySet() { - Set> entrySet = new HashSet<>(); - for (T key : valueMetadataMap.keySet()) { - entrySet.add(new AbstractMap.SimpleEntry<>(key, get(key))); - } - return entrySet; - } - - /** - * The file metadata that should be spilled to disk. - */ - public static final class FileEntry { - - // Checksum of the value written to disk, compared during every readFromDisk to make sure no corruption - private Long crc; - // Size (numberOfBytes) of the key written to disk - private Integer sizeOfKey; - // Size (numberOfBytes) of the value written to disk - private Integer sizeOfValue; - // Actual key - private byte[] key; - // Actual value - private byte[] value; - // Current timestamp when the value was written to disk - private Long timestamp; - - public FileEntry(long crc, int sizeOfKey, int sizeOfValue, byte[] key, byte[] value, long timestamp) { - this.crc = crc; - this.sizeOfKey = sizeOfKey; - this.sizeOfValue = sizeOfValue; - this.key = key; - this.value = value; - this.timestamp = timestamp; - } - - public long getCrc() { - return crc; - } - - public int getSizeOfKey() { - return sizeOfKey; - } - - public int getSizeOfValue() { - return sizeOfValue; - } - - public byte[] getKey() { - return key; - } - - public byte[] getValue() { - return value; - } - - public long getTimestamp() { - return timestamp; - } - } - - /** - * The value relevant metadata. 
- */ - public static final class ValueMetadata implements Comparable { - - // FilePath to store the spilled data - private String filePath; - // Size (numberOfBytes) of the value written to disk - private Integer sizeOfValue; - // FilePosition of the value written to disk - private Long offsetOfValue; - // Current timestamp when the value was written to disk - private Long timestamp; - - protected ValueMetadata(String filePath, int sizeOfValue, long offsetOfValue, long timestamp) { - this.filePath = filePath; - this.sizeOfValue = sizeOfValue; - this.offsetOfValue = offsetOfValue; - this.timestamp = timestamp; - } - - public String getFilePath() { - return filePath; - } - - public int getSizeOfValue() { - return sizeOfValue; - } - - public Long getOffsetOfValue() { - return offsetOfValue; - } - - public long getTimestamp() { - return timestamp; - } - - @Override - public int compareTo(ValueMetadata o) { - return Long.compare(this.offsetOfValue, o.offsetOfValue); - } - } -} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskMap.java new file mode 100644 index 0000000000000..c609212124f5e --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/DiskMap.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util.collection; + +import org.apache.hudi.common.util.FileIOUtils; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Stream; + +/** + * This interface provides the map interface for storing records in disk after they + * spill over from memory. Used by {@link ExternalSpillableMap}. 
+ * + * @param The generic type of the keys + * @param The generic type of the values + */ +public abstract class DiskMap implements Map, Iterable { + + private static final Logger LOG = LogManager.getLogger(DiskMap.class); + private static final String SUBFOLDER_PREFIX = "hudi"; + private final File diskMapPathFile; + private transient Thread shutdownThread = null; + + // Base path for the write file + protected final String diskMapPath; + + public DiskMap(String basePath, String prefix) throws IOException { + this.diskMapPath = + String.format("%s/%s-%s-%s", basePath, SUBFOLDER_PREFIX, prefix, UUID.randomUUID().toString()); + diskMapPathFile = new File(diskMapPath); + FileIOUtils.deleteDirectory(diskMapPathFile); + FileIOUtils.mkdir(diskMapPathFile); + // Make sure the folder is deleted when JVM exits + diskMapPathFile.deleteOnExit(); + addShutDownHook(); + } + + /** + * Register shutdown hook to force flush contents of the data written to FileOutputStream from OS page cache + * (typically 4 KB) to disk. + */ + private void addShutDownHook() { + shutdownThread = new Thread(this::cleanup); + Runtime.getRuntime().addShutdownHook(shutdownThread); + } + + /** + * @returns a stream of the values stored in the disk. + */ + abstract Stream valueStream(); + + /** + * Number of bytes spilled to disk. + */ + abstract long sizeOfFileOnDiskInBytes(); + + /** + * Close and cleanup the Map. + */ + public void close() { + cleanup(false); + } + + /** + * Cleanup all resources, files and folders + * triggered by shutdownhook. + */ + private void cleanup() { + cleanup(true); + } + + /** + * Cleanup all resources, files and folders. + */ + private void cleanup(boolean isTriggeredFromShutdownHook) { + try { + FileIOUtils.deleteDirectory(diskMapPathFile); + } catch (IOException exception) { + LOG.warn("Error while deleting the disk map directory=" + diskMapPath, exception); + } + if (!isTriggeredFromShutdownHook && shutdownThread != null) { + Runtime.getRuntime().removeShutdownHook(shutdownThread); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java index 003d525b66d5e..8d2707d6045bb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java @@ -52,7 +52,7 @@ * map may occupy more memory than is available, resulting in OOM. However, if the spill threshold is too low, we spill * frequently and incur unnecessary disk writes. 
*/ -public class ExternalSpillableMap implements Map { +public class ExternalSpillableMap implements Map, Serializable { // Find the actual estimated payload size after inserting N records private static final int NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE = 100; @@ -61,8 +61,8 @@ public class ExternalSpillableMap inMemoryMap; - // Map to store key-valuemetadata important to find the values spilled to disk - private transient volatile DiskBasedMap diskBasedMap; + // Map to store key-values on disk or db after it spilled over the memory + private transient volatile DiskMap diskBasedMap; // TODO(na) : a dynamic sizing factor to ensure we have space for other objects in memory and // incorrect payload estimation private final Double sizingFactorForInMemoryMap = 0.8; @@ -70,6 +70,10 @@ public class ExternalSpillableMap keySizeEstimator; // Size Estimator for key types private final SizeEstimator valueSizeEstimator; + // Type of the disk map + private final DiskMapType diskMapType; + // Enables compression of values stored in disc + private final boolean isCompressionEnabled; // current space occupied by this map in-memory private Long currentInMemoryMapSize; // An estimate of the size of each payload written to this map @@ -80,22 +84,40 @@ public class ExternalSpillableMap keySizeEstimator, - SizeEstimator valueSizeEstimator) throws IOException { + SizeEstimator valueSizeEstimator) throws IOException { + this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, DiskMapType.BITCASK); + } + + public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator keySizeEstimator, + SizeEstimator valueSizeEstimator, DiskMapType diskMapType) throws IOException { + this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, diskMapType, false); + } + + public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator keySizeEstimator, + SizeEstimator valueSizeEstimator, DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException { this.inMemoryMap = new HashMap<>(); this.baseFilePath = baseFilePath; - this.diskBasedMap = new DiskBasedMap<>(baseFilePath); this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * sizingFactorForInMemoryMap); this.currentInMemoryMapSize = 0L; this.keySizeEstimator = keySizeEstimator; this.valueSizeEstimator = valueSizeEstimator; + this.diskMapType = diskMapType; + this.isCompressionEnabled = isCompressionEnabled; } - private DiskBasedMap getDiskBasedMap() { + private DiskMap getDiskBasedMap() { if (null == diskBasedMap) { synchronized (this) { if (null == diskBasedMap) { try { - diskBasedMap = new DiskBasedMap<>(baseFilePath); + switch (diskMapType) { + case ROCKS_DB: + diskBasedMap = new RocksDbDiskMap<>(baseFilePath); + break; + case BITCASK: + default: + diskBasedMap = new BitCaskDiskMap<>(baseFilePath, isCompressionEnabled); + } } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -113,7 +135,7 @@ public Iterator iterator() { } /** - * Number of entries in DiskBasedMap. + * Number of entries in BitCaskDiskMap. 
*/ public int getDiskBasedMapNumEntries() { return getDiskBasedMap().size(); @@ -160,6 +182,14 @@ public boolean containsValue(Object value) { return inMemoryMap.containsValue(value) || getDiskBasedMap().containsValue(value); } + public boolean inMemoryContainsKey(Object key) { + return inMemoryMap.containsKey(key); + } + + public boolean inDiskContainsKey(Object key) { + return getDiskBasedMap().containsKey(key); + } + @Override public R get(Object key) { if (inMemoryMap.containsKey(key)) { @@ -178,7 +208,8 @@ public R put(T key, R value) { // Note, the converter may over estimate the size of a record in the JVM this.estimatedPayloadSize = keySizeEstimator.sizeEstimate(key) + valueSizeEstimator.sizeEstimate(value); LOG.info("Estimated Payload size => " + estimatedPayloadSize); - } else if (shouldEstimatePayloadSize && inMemoryMap.size() % NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE == 0) { + } else if (shouldEstimatePayloadSize && !inMemoryMap.isEmpty() + && (inMemoryMap.size() % NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE == 0)) { // Re-estimate the size of a record by calculating the size of the entire map containing // N entries and then dividing by the number of entries present (N). This helps to get a // correct estimation of the size of each record in the JVM. @@ -226,7 +257,9 @@ public void clear() { } public void close() { + inMemoryMap.clear(); getDiskBasedMap().close(); + currentInMemoryMapSize = 0L; } @Override @@ -259,14 +292,24 @@ public Set> entrySet() { return entrySet; } + /** + * The type of map to use for storing the Key, values on disk after it spills + * from memory in the {@link ExternalSpillableMap}. + */ + public enum DiskMapType { + BITCASK, + ROCKS_DB, + UNKNOWN + } + /** * Iterator that wraps iterating over all the values for this map 1) inMemoryIterator - Iterates over all the data * in-memory map 2) diskLazyFileIterator - Iterates over all the data spilled to disk. 
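The new constructor overloads let callers choose the spill backend (DiskMapType) and value compression explicitly instead of always getting the BitCask file layout. A construction sketch, assuming <String, String> payloads, an illustrative base path, and the DefaultSizeEstimator already used elsewhere in this patch:

import java.io.IOException;

import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.common.util.collection.ExternalSpillableMap.DiskMapType;

public class SpillableMapExample {
  public static void main(String[] args) throws IOException {
    ExternalSpillableMap<String, String> map = new ExternalSpillableMap<>(
        16 * 1024 * 1024L,            // spill once ~80% of this in-memory budget is consumed
        "/tmp/hudi-spill",            // illustrative base path for the spill files
        new DefaultSizeEstimator<>(), // key size estimator
        new DefaultSizeEstimator<>(), // value size estimator
        DiskMapType.ROCKS_DB,         // or DiskMapType.BITCASK (the default)
        false);                       // value compression is only honored by the BitCask map
    map.put("key1", "value1");
    System.out.println(map.get("key1") + ", entries spilled: " + map.getDiskBasedMapNumEntries());
    map.close();
  }
}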
*/ private class IteratorWrapper implements Iterator { - private Iterator inMemoryIterator; - private Iterator diskLazyFileIterator; + private final Iterator inMemoryIterator; + private final Iterator diskLazyFileIterator; public IteratorWrapper(Iterator inMemoryIterator, Iterator diskLazyFileIterator) { this.inMemoryIterator = inMemoryIterator; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java index 95b1ac2b37b36..49d81443151a3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/LazyFileIterable.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.util.collection; import org.apache.hudi.common.util.BufferedRandomAccessFile; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.exception.HoodieException; import java.io.IOException; @@ -36,17 +37,24 @@ public class LazyFileIterable implements Iterable { // Used to access the value written at a specific position in the file private final String filePath; // Stores the key and corresponding value's latest metadata spilled to disk - private final Map inMemoryMetadataOfSpilledData; + private final Map inMemoryMetadataOfSpilledData; + // Was compressions enabled for the values when inserted into the file/ map + private final boolean isCompressionEnabled; private transient Thread shutdownThread = null; - public LazyFileIterable(String filePath, Map map) { + public LazyFileIterable(String filePath, Map map) { + this(filePath, map, false); + } + + public LazyFileIterable(String filePath, Map map, boolean isCompressionEnabled) { this.filePath = filePath; this.inMemoryMetadataOfSpilledData = map; + this.isCompressionEnabled = isCompressionEnabled; } @Override - public Iterator iterator() { + public ClosableIterator iterator() { try { return new LazyFileIterator<>(filePath, inMemoryMetadataOfSpilledData); } catch (IOException io) { @@ -57,20 +65,20 @@ public Iterator iterator() { /** * Iterator implementation for the iterable defined above. */ - public class LazyFileIterator implements Iterator { + public class LazyFileIterator implements ClosableIterator { private final String filePath; private BufferedRandomAccessFile readOnlyFileHandle; - private final Iterator> metadataIterator; + private final Iterator> metadataIterator; - public LazyFileIterator(String filePath, Map map) throws IOException { + public LazyFileIterator(String filePath, Map map) throws IOException { this.filePath = filePath; - this.readOnlyFileHandle = new BufferedRandomAccessFile(filePath, "r", DiskBasedMap.BUFFER_SIZE); + this.readOnlyFileHandle = new BufferedRandomAccessFile(filePath, "r", BitCaskDiskMap.BUFFER_SIZE); readOnlyFileHandle.seek(0); // sort the map in increasing order of offset of value so disk seek is only in one(forward) direction this.metadataIterator = map.entrySet().stream() - .sorted((Map.Entry o1, Map.Entry o2) -> o1 + .sorted((Map.Entry o1, Map.Entry o2) -> o1 .getValue().getOffsetOfValue().compareTo(o2.getValue().getOffsetOfValue())) .collect(Collectors.toList()).iterator(); this.addShutdownHook(); @@ -90,8 +98,8 @@ public R next() { if (!hasNext()) { throw new IllegalStateException("next() called on EOF'ed stream. 
File :" + filePath); } - Map.Entry entry = this.metadataIterator.next(); - return DiskBasedMap.get(entry.getValue(), readOnlyFileHandle); + Map.Entry entry = this.metadataIterator.next(); + return BitCaskDiskMap.get(entry.getValue(), readOnlyFileHandle, isCompressionEnabled); } @Override @@ -104,7 +112,7 @@ public void forEachRemaining(Consumer action) { action.accept(next()); } - private void close() { + public void close() { closeHandle(); Runtime.getRuntime().removeShutdownHook(shutdownThread); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java index c4fbd3a8cb0db..67b4ffabe2e57 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBBasedMap.java @@ -29,12 +29,12 @@ /** * A map's implementation based on RocksDB. */ -public final class RocksDBBasedMap implements Map { +public final class RocksDBBasedMap implements Map, Serializable { private static final String COL_FAMILY_NAME = "map_handle"; private final String rocksDbStoragePath; - private RocksDBDAO rocksDBDAO; + private transient RocksDBDAO rocksDBDAO; private final String columnFamilyName; public RocksDBBasedMap(String rocksDbStoragePath) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBDAO.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBDAO.java index 3c08460f2339a..fe40d98594287 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBDAO.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDBDAO.java @@ -45,6 +45,7 @@ import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; +import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.UUID; @@ -64,11 +65,13 @@ public class RocksDBDAO { private transient RocksDB rocksDB; private boolean closed = false; private final String rocksDBBasePath; + private long totalBytesWritten; public RocksDBDAO(String basePath, String rocksDBBasePath) { this.rocksDBBasePath = String.format("%s/%s/%s", rocksDBBasePath, basePath.replace("/", "_"), UUID.randomUUID().toString()); init(); + totalBytesWritten = 0L; } /** @@ -169,7 +172,7 @@ public void writeBatch(BatchHandler handler) { */ public void putInBatch(WriteBatch batch, String columnFamilyName, String key, T value) { try { - byte[] payload = SerializationUtils.serialize(value); + byte[] payload = serializePayload(value); batch.put(managedHandlesMap.get(columnFamilyName), key.getBytes(), payload); } catch (Exception e) { throw new HoodieException(e); @@ -189,7 +192,7 @@ public void putInBatch(WriteBat K key, T value) { try { byte[] keyBytes = SerializationUtils.serialize(key); - byte[] payload = SerializationUtils.serialize(value); + byte[] payload = serializePayload(value); batch.put(managedHandlesMap.get(columnFamilyName), keyBytes, payload); } catch (Exception e) { throw new HoodieException(e); @@ -206,7 +209,7 @@ public void putInBatch(WriteBat */ public void put(String columnFamilyName, String key, T value) { try { - byte[] payload = SerializationUtils.serialize(value); + byte[] payload = serializePayload(value); getRocksDB().put(managedHandlesMap.get(columnFamilyName), key.getBytes(), payload); } catch (Exception e) { throw new HoodieException(e); @@ -223,7 +226,7 @@ public void put(String 
columnFamilyName, String key, T */ public void put(String columnFamilyName, K key, T value) { try { - byte[] payload = SerializationUtils.serialize(value); + byte[] payload = serializePayload(value); getRocksDB().put(managedHandlesMap.get(columnFamilyName), SerializationUtils.serialize(key), payload); } catch (Exception e) { throw new HoodieException(e); @@ -351,6 +354,16 @@ public Stream> prefixSearch(String colu return results.stream(); } + /** + * Return Iterator of key-value pairs from RocksIterator. + * + * @param columnFamilyName Column Family Name + * @param Type of value stored + */ + public Iterator iterator(String columnFamilyName) { + return new IteratorWrapper<>(getRocksDB().newIterator(managedHandlesMap.get(columnFamilyName))); + } + /** * Perform a prefix delete and return stream of key-value pairs retrieved. * @@ -448,10 +461,48 @@ public synchronized void close() { } } + public long getTotalBytesWritten() { + return totalBytesWritten; + } + + private byte[] serializePayload(T value) throws IOException { + byte[] payload = SerializationUtils.serialize(value); + totalBytesWritten += payload.length; + return payload; + } + String getRocksDBBasePath() { return rocksDBBasePath; } + /** + * {@link Iterator} wrapper for RocksDb Iterator {@link RocksIterator}. + */ + private static class IteratorWrapper implements Iterator { + + private final RocksIterator iterator; + + public IteratorWrapper(final RocksIterator iterator) { + this.iterator = iterator; + iterator.seekToFirst(); + } + + @Override + public boolean hasNext() { + return iterator.isValid(); + } + + @Override + public R next() { + if (!hasNext()) { + throw new IllegalStateException("next() called on rocksDB with no more valid entries"); + } + R val = SerializationUtils.deserialize(iterator.value()); + iterator.next(); + return val; + } + } + /** * Functional interface for stacking operation to Write batch. */ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDbDiskMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDbDiskMap.java new file mode 100644 index 0000000000000..21211a5700e51 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/RocksDbDiskMap.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.util.collection; + +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieNotSupportedException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.AbstractMap; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +/** + * This class provides a disk spillable only map implementation. + * All of the data is stored using the RocksDB implementation. + */ +public final class RocksDbDiskMap extends DiskMap { + // ColumnFamily allows partitioning data within RockDB, which allows + // independent configuration and faster deletes across partitions + // https://github.com/facebook/rocksdb/wiki/Column-Families + // For this use case, we use a single static column family/ partition + // + private static final String ROCKSDB_COL_FAMILY = "rocksdb-diskmap"; + + private static final Logger LOG = LogManager.getLogger(RocksDbDiskMap.class); + // Stores the key and corresponding value's latest metadata spilled to disk + private final Set keySet; + private RocksDBDAO rocksDb; + + public RocksDbDiskMap(String rocksDbStoragePath) throws IOException { + super(rocksDbStoragePath, ExternalSpillableMap.DiskMapType.ROCKS_DB.name()); + this.keySet = new HashSet<>(); + } + + @Override + public int size() { + return keySet.size(); + } + + @Override + public boolean isEmpty() { + return keySet.isEmpty(); + } + + @Override + public boolean containsKey(Object key) { + return keySet.contains((T) key); + } + + @Override + public boolean containsValue(Object value) { + throw new HoodieNotSupportedException("unable to compare values in map"); + } + + @Override + public R get(Object key) { + if (!containsKey(key)) { + return null; + } + return getRocksDb().get(ROCKSDB_COL_FAMILY, (T) key); + } + + @Override + public R put(T key, R value) { + getRocksDb().put(ROCKSDB_COL_FAMILY, key, value); + keySet.add(key); + return value; + } + + @Override + public R remove(Object key) { + R value = get(key); + if (value != null) { + keySet.remove((T) key); + getRocksDb().delete(ROCKSDB_COL_FAMILY, (T) key); + } + return value; + } + + @Override + public void putAll(Map keyValues) { + getRocksDb().writeBatch(batch -> keyValues.forEach((key, value) -> getRocksDb().putInBatch(batch, ROCKSDB_COL_FAMILY, key, value))); + keySet.addAll(keyValues.keySet()); + } + + @Override + public void clear() { + close(); + } + + @Override + public Set keySet() { + return keySet; + } + + @Override + public Collection values() { + throw new HoodieException("Unsupported Operation Exception"); + } + + @Override + public Set> entrySet() { + Set> entrySet = new HashSet<>(); + for (T key : keySet) { + entrySet.add(new AbstractMap.SimpleEntry<>(key, get(key))); + } + return entrySet; + } + + /** + * Custom iterator to iterate over values written to disk. 
+ */ + @Override + public Iterator iterator() { + return getRocksDb().iterator(ROCKSDB_COL_FAMILY); + } + + @Override + public Stream valueStream() { + return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), 0), false); + } + + @Override + public long sizeOfFileOnDiskInBytes() { + return getRocksDb().getTotalBytesWritten(); + } + + @Override + public void close() { + keySet.clear(); + if (null != rocksDb) { + rocksDb.close(); + } + rocksDb = null; + super.close(); + } + + private RocksDBDAO getRocksDb() { + if (null == rocksDb) { + synchronized (this) { + if (null == rocksDb) { + rocksDb = new RocksDBDAO(ROCKSDB_COL_FAMILY, diskMapPath); + rocksDb.addColumnFamily(ROCKSDB_COL_FAMILY); + } + } + } + return rocksDb; + } + +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnIndexID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnIndexID.java new file mode 100644 index 0000000000000..92e60b30a311f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnIndexID.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * A stateful Hoodie object ID representing any table column. + */ +public class ColumnIndexID extends HoodieIndexID { + + private static final Type TYPE = Type.COLUMN; + public static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64; + private final String column; + private final byte[] hash; + + public ColumnIndexID(final String column) { + this.column = column; + this.hash = HashID.hash(column, ID_COLUMN_HASH_SIZE); + } + + @Override + public String getName() { + return column; + } + + @Override + public int bits() { + return ID_COLUMN_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileIndexID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileIndexID.java new file mode 100644 index 0000000000000..3f9616908bb39 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileIndexID.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
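RocksDbDiskMap defers creating the RocksDBDAO until the first access, keeps only the key set in memory, and reports its on-disk footprint through the DAO's new totalBytesWritten counter. A minimal usage sketch with an illustrative storage path:

import org.apache.hudi.common.util.collection.RocksDbDiskMap;

public class RocksDbDiskMapExample {
  public static void main(String[] args) throws Exception {
    RocksDbDiskMap<String, String> map = new RocksDbDiskMap<>("/tmp/hudi-rocksdb-spill");
    try {
      map.put("key1", "value1");                         // written straight to the single column family
      System.out.println(map.get("key1"));
      System.out.println(map.sizeOfFileOnDiskInBytes()); // total serialized bytes handed to RocksDB so far
    } finally {
      map.close();                                       // closes RocksDB and removes the spill folder
    }
  }
}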
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * Hoodie object ID representing any file. + */ +public class FileIndexID extends HoodieIndexID { + + private static final Type TYPE = Type.FILE; + private static final HashID.Size ID_FILE_HASH_SIZE = HashID.Size.BITS_128; + private final String fileName; + private final byte[] hash; + + public FileIndexID(final String fileName) { + this.fileName = fileName; + this.hash = HashID.hash(fileName, ID_FILE_HASH_SIZE); + } + + @Override + public String getName() { + return fileName; + } + + @Override + public int bits() { + return ID_FILE_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java new file mode 100644 index 0000000000000..ccb29dfbb580d --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import net.jpountz.xxhash.XXHash32; +import net.jpountz.xxhash.XXHash64; +import net.jpountz.xxhash.XXHashFactory; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hudi.exception.HoodieIOException; + +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +/** + * A stateless Hash class which generates ID for the desired bit count. + */ +public class HashID implements Serializable { + + private static final String MD5_ALGORITHM_NAME = "MD5"; + private static final int HASH_SEED = 0xdabadaba; + + /** + * Represents HashID size in bits. 
+ */ + public enum Size { + BITS_32(32), + BITS_64(64), + BITS_128(128); + + private final int bits; + + Size(int bitCount) { + this.bits = bitCount; + } + + /** + * Get this Hash size in bytes. + * + * @return Bytes needed to represent this size + */ + public int byteSize() { + return (((this.bits - 1) / Byte.SIZE) + 1); + } + + /** + * Get this Hash size in bits. + * + * @return bits needed to represent the size + */ + public int bits() { + return this.bits; + } + + @Override + public String toString() { + return "HashSize{" + bits + "}"; + } + } + + /** + * Get the hash value for a string message and for the desired @{@link Size}. + * + * @param message - String message to get the hash value for + * @param bits - @{@link Size} of the hash value + * @return Hash value for the message as byte array + */ + public static byte[] hash(final String message, final Size bits) { + return hash(message.getBytes(StandardCharsets.UTF_8), bits); + } + + /** + * Get the hash value for a byte array and for the desired @{@link Size}. + * + * @param messageBytes - Byte array message to get the hash value for + * @param bits - @{@link Size} of the hash value + * @return Hash value for the message as byte array + */ + public static byte[] hash(final byte[] messageBytes, final Size bits) { + switch (bits) { + case BITS_32: + case BITS_64: + return getXXHash(messageBytes, bits); + case BITS_128: + return getMD5Hash(messageBytes); + default: + throw new IllegalArgumentException("Unexpected Hash size bits: " + bits); + } + } + + public static int getXXHash32(final String message, int hashSeed) { + return getXXHash32(message.getBytes(StandardCharsets.UTF_8), hashSeed); + } + + public static int getXXHash32(final byte[] message, int hashSeed) { + XXHashFactory factory = XXHashFactory.fastestInstance(); + return factory.hash32().hash(message, 0, message.length, hashSeed); + } + + private static byte[] getXXHash(final byte[] message, final Size bits) { + XXHashFactory factory = XXHashFactory.fastestInstance(); + switch (bits) { + case BITS_32: + XXHash32 hash32 = factory.hash32(); + return Bytes.toBytes(hash32.hash(message, 0, message.length, HASH_SEED)); + case BITS_64: + XXHash64 hash64 = factory.hash64(); + return Bytes.toBytes(hash64.hash(message, 0, message.length, HASH_SEED)); + default: + throw new HoodieIOException("XX" + bits + " hash is unsupported!"); + } + } + + private static byte[] getMD5Hash(final byte[] message) throws HoodieIOException { + try { + MessageDigest messageDigest = MessageDigest.getInstance(MD5_ALGORITHM_NAME); + messageDigest.update(message); + return messageDigest.digest(); + } catch (NoSuchAlgorithmException e) { + throw new HoodieIOException("Failed to create MD5 Hash: " + e); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieIndexID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieIndexID.java new file mode 100644 index 0000000000000..139efd17ed0ae --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieIndexID.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.exception.HoodieNotSupportedException; + +import java.io.Serializable; + +/** + * A serializable ID that can be used to identify any Hoodie table fields and + * resources in the on-disk index. + */ +public abstract class HoodieIndexID implements Serializable { + + private static final long serialVersionUID = 1L; + + /** + * Supported ID types. + */ + public enum Type { + COLUMN("HoodieColumnID"), + PARTITION("HoodiePartitionID"), + FILE("HoodieFileID"); + + private final String name; + + Type(final String name) { + this.name = name; + } + + @Override + public String toString() { + return "Type{name='" + name + "'}"; + } + } + + /** + * Get the resource name for which this index id is generated. + * + * @return The resource name + */ + public abstract String getName(); + + /** + * Get the number of bits representing this ID in memory. + *
    + * Note: Will be in multiples of 8 only. + * + * @return The number of bits in this ID + */ + public abstract int bits(); + + /** + * Get this ID as a byte array. + * + * @return A byte array representing this ID + */ + public abstract byte[] asBytes(); + + /** + * Get the String version of this ID. + * + * @return String version of this ID. + */ + public abstract String toString(); + + /** + * Get the Base64 encoded version of the ID. + */ + public String asBase64EncodedString() { + throw new HoodieNotSupportedException("Unsupported hash for " + getType()); + } + + /** + * Get the ID type. + * + * @return This ID type + */ + protected abstract Type getType(); + + /** + * Is this ID a ColumnID type ? + * + * @return True if this ID of ColumnID type + */ + public final boolean isColumnID() { + return (getType() == Type.COLUMN); + } + + /** + * Is this ID a Partition type ? + * + * @return True if this ID of PartitionID type + */ + public final boolean isPartition() { + return (getType() == Type.PARTITION); + } + + /** + * Is this ID a FileID type ? + * + * @return True if this ID of FileID type + */ + public final boolean isFileID() { + return (getType() == Type.FILE); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionIndexID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionIndexID.java new file mode 100644 index 0000000000000..0fbae27b80de8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionIndexID.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * Hoodie object ID representing any partition. 
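Each concrete ID pins a fixed HashID.Size to its resource type: 64-bit xxHash for columns and partitions, and 128-bit MD5 for files. A small sketch of generating and encoding such IDs (the column and file names are made up for illustration):

import org.apache.hudi.common.util.hash.ColumnIndexID;
import org.apache.hudi.common.util.hash.FileIndexID;
import org.apache.hudi.common.util.hash.HashID;

public class IndexIdExample {
  public static void main(String[] args) {
    ColumnIndexID colId = new ColumnIndexID("rider");                 // 64-bit xxHash -> 8 bytes
    FileIndexID fileId = new FileIndexID("0a1b2c3d-0_1-2-3.parquet"); // 128-bit MD5 -> 16 bytes
    System.out.println(colId.asBase64EncodedString() + " (" + colId.asBytes().length + " bytes)");
    System.out.println(fileId.asBase64EncodedString() + " (" + fileId.asBytes().length + " bytes)");

    // The underlying hasher can also be called directly
    byte[] raw = HashID.hash("rider", HashID.Size.BITS_64);
    System.out.println(raw.length); // 8
  }
}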
+ */ +public class PartitionIndexID extends HoodieIndexID { + + private static final Type TYPE = Type.PARTITION; + private static final HashID.Size ID_PARTITION_HASH_SIZE = HashID.Size.BITS_64; + private final String partition; + private final byte[] hash; + + public PartitionIndexID(final String partition) { + this.partition = partition; + this.hash = HashID.hash(partition, ID_PARTITION_HASH_SIZE); + } + + @Override + public String getName() { + return partition; + } + + @Override + public int bits() { + return ID_PARTITION_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/io/ByteBufferBackedInputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/util/io/ByteBufferBackedInputStream.java new file mode 100644 index 0000000000000..0f96d1011a3f0 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/io/ByteBufferBackedInputStream.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util.io; + +import javax.annotation.Nonnull; +import java.io.InputStream; +import java.nio.ByteBuffer; + +/** + * Instance of {@link InputStream} backed by {@link ByteBuffer}, implementing following + * functionality (on top of what's required by {@link InputStream}) + * + *
+ *   1. Seeking: enables random access by allowing to seek to an arbitrary position w/in the stream
+ *   2. (Thread-safe) Copying: enables to copy from the underlying buffer not modifying the state of the stream
    + * + * NOTE: Generally methods of this class are NOT thread-safe, unless specified otherwise + */ +public class ByteBufferBackedInputStream extends InputStream { + + private final ByteBuffer buffer; + private final int bufferOffset; + + public ByteBufferBackedInputStream(ByteBuffer buf) { + this.buffer = buf.duplicate(); + // We're marking current buffer position, so that we will be able + // to reset it later on appropriately (to support seek operations) + this.buffer.mark(); + this.bufferOffset = buffer.position(); + } + + public ByteBufferBackedInputStream(byte[] array) { + this(array, 0, array.length); + } + + public ByteBufferBackedInputStream(byte[] array, int offset, int length) { + this(ByteBuffer.wrap(array, offset, length)); + } + + @Override + public int read() { + if (!buffer.hasRemaining()) { + throw new IllegalArgumentException("Reading past backed buffer boundary"); + } + return buffer.get() & 0xFF; + } + + @Override + public int read(@Nonnull byte[] bytes, int offset, int length) { + if (!buffer.hasRemaining()) { + throw new IllegalArgumentException("Reading past backed buffer boundary"); + } + // Determine total number of bytes available to read + int available = Math.min(length, buffer.remaining()); + // Copy bytes into the target buffer + buffer.get(bytes, offset, available); + return available; + } + + /** + * Returns current position of the stream + */ + public int getPosition() { + return buffer.position() - bufferOffset; + } + + /** + * Seeks to a position w/in the stream + * + * NOTE: Position is relative to the start of the stream (ie its absolute w/in this stream), + * with following invariant being assumed: + *
+ *   0 <= pos <= length (of the stream)
    + * + * This method is NOT thread-safe + * + * @param pos target position to seek to w/in the holding buffer + */ + public void seek(long pos) { + buffer.reset(); // to mark + int offset = buffer.position(); + // NOTE: That the new pos is still relative to buffer's offset + int newPos = offset + (int) pos; + if (newPos > buffer.limit() || newPos < offset) { + throw new IllegalArgumentException( + String.format("Can't seek past the backing buffer (limit %d, offset %d, new %d)", buffer.limit(), offset, newPos) + ); + } + + buffer.position(newPos); + } + + /** + * Copies at most {@code length} bytes starting from position {@code pos} into the target + * buffer with provided {@code offset}. Returns number of bytes copied from the backing buffer + * + * NOTE: This does not change the current position of the stream and is thread-safe + * + * @param pos absolute position w/in stream to read from + * @param targetBuffer target buffer to copy into + * @param offset target buffer offset to copy at + * @param length length of the sequence to copy + * @return number of bytes copied + */ + public int copyFrom(long pos, byte[] targetBuffer, int offset, int length) { + int bufferPos = bufferOffset + (int) pos; + if (bufferPos > buffer.limit()) { + throw new IllegalArgumentException( + String.format("Can't read past the backing buffer boundary (offset %d, length %d)", pos, buffer.limit() - bufferOffset) + ); + } else if (length > targetBuffer.length) { + throw new IllegalArgumentException( + String.format("Target buffer is too small (length %d, buffer size %d)", length, targetBuffer.length) + ); + } + // Determine total number of bytes available to read + int available = Math.min(length, buffer.limit() - bufferPos); + // Get current buffer position in the backing array + System.arraycopy(buffer.array(), bufferPos, targetBuffer, offset, available); + return available; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java index cec9ab61a9c1b..46ef5dc40caf8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java @@ -18,7 +18,9 @@ package org.apache.hudi.common.util.queue; +import org.apache.hudi.common.util.CustomizedThreadFactory; import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.Functions; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SizeEstimator; import org.apache.hudi.exception.HoodieException; @@ -26,7 +28,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CountDownLatch; @@ -34,6 +37,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.function.Function; import java.util.stream.Collectors; @@ -45,38 +49,48 @@ public class BoundedInMemoryExecutor { private static final Logger LOG = LogManager.getLogger(BoundedInMemoryExecutor.class); - - // Executor service used for launching writer thread. 
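Stepping back from the diff for a moment, a minimal usage sketch of the ByteBufferBackedInputStream added above, exercising sequential reads, seek, and the position-preserving copyFrom; the byte payload is just an illustrative value.

```java
import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;

import java.nio.charset.StandardCharsets;

public class ByteBufferBackedInputStreamExample {
  public static void main(String[] args) {
    byte[] data = "hello hudi".getBytes(StandardCharsets.UTF_8);
    ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(data);

    int first = stream.read();      // sequential read advances the stream ('h')

    stream.seek(0);                 // absolute position within the stream, back to the start

    byte[] target = new byte[4];
    // copyFrom never moves the stream position and is safe to call concurrently
    int copied = stream.copyFrom(6, target, 0, 4);  // copies "hudi"

    System.out.println(first + " " + copied + " " + stream.getPosition()); // 104 4 0
  }
}
```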
- private final ExecutorService executorService; + private static final long TERMINATE_WAITING_TIME_SECS = 60L; + // Executor service used for launching write thread. + private final ExecutorService producerExecutorService; + // Executor service used for launching read thread. + private final ExecutorService consumerExecutorService; // Used for buffering records which is controlled by HoodieWriteConfig#WRITE_BUFFER_LIMIT_BYTES. private final BoundedInMemoryQueue queue; // Producers private final List> producers; // Consumer private final Option> consumer; + // pre-execute function to implement environment specific behavior before executors (producers/consumer) run + private final Runnable preExecuteRunnable; + + public BoundedInMemoryExecutor(final long bufferLimitInBytes, final Iterator inputItr, + BoundedInMemoryQueueConsumer consumer, Function transformFunction, Runnable preExecuteRunnable) { + this(bufferLimitInBytes, new IteratorBasedQueueProducer<>(inputItr), Option.of(consumer), transformFunction, preExecuteRunnable); + } + + public BoundedInMemoryExecutor(final long bufferLimitInBytes, BoundedInMemoryQueueProducer producer, + Option> consumer, final Function transformFunction) { + this(bufferLimitInBytes, producer, consumer, transformFunction, Functions.noop()); + } public BoundedInMemoryExecutor(final long bufferLimitInBytes, BoundedInMemoryQueueProducer producer, - Option> consumer, final Function transformFunction) { - this(bufferLimitInBytes, Arrays.asList(producer), consumer, transformFunction, new DefaultSizeEstimator<>()); + Option> consumer, final Function transformFunction, Runnable preExecuteRunnable) { + this(bufferLimitInBytes, Collections.singletonList(producer), consumer, transformFunction, new DefaultSizeEstimator<>(), preExecuteRunnable); } public BoundedInMemoryExecutor(final long bufferLimitInBytes, List> producers, - Option> consumer, final Function transformFunction, - final SizeEstimator sizeEstimator) { + Option> consumer, final Function transformFunction, + final SizeEstimator sizeEstimator, Runnable preExecuteRunnable) { this.producers = producers; this.consumer = consumer; - // Ensure single thread for each producer thread and one for consumer - this.executorService = Executors.newFixedThreadPool(producers.size() + 1); + this.preExecuteRunnable = preExecuteRunnable; + // Ensure fixed thread for each producer thread + this.producerExecutorService = Executors.newFixedThreadPool(producers.size(), new CustomizedThreadFactory("producer")); + // Ensure single thread for consumer + this.consumerExecutorService = Executors.newSingleThreadExecutor(new CustomizedThreadFactory("consumer")); this.queue = new BoundedInMemoryQueue<>(bufferLimitInBytes, transformFunction, sizeEstimator); } - /** - * Callback to implement environment specific behavior before executors (producers/consumer) run. - */ - public void preExecute() { - // Do Nothing in general context - } - /** * Start all Producers. 
*/ @@ -84,13 +98,13 @@ public ExecutorCompletionService startProducers() { // Latch to control when and which producer thread will close the queue final CountDownLatch latch = new CountDownLatch(producers.size()); final ExecutorCompletionService completionService = - new ExecutorCompletionService(executorService); + new ExecutorCompletionService(producerExecutorService); producers.stream().map(producer -> { return completionService.submit(() -> { try { - preExecute(); + preExecuteRunnable.run(); producer.produce(queue); - } catch (Exception e) { + } catch (Throwable e) { LOG.error("error producing records", e); queue.markAsFailed(e); throw e; @@ -114,9 +128,9 @@ public ExecutorCompletionService startProducers() { */ private Future startConsumer() { return consumer.map(consumer -> { - return executorService.submit(() -> { + return consumerExecutorService.submit(() -> { LOG.info("starting consumer thread"); - preExecute(); + preExecuteRunnable.run(); try { E result = consumer.consume(queue); LOG.info("Queue Consumption is done; notifying producer threads"); @@ -135,10 +149,14 @@ private Future startConsumer() { */ public E execute() { try { - ExecutorCompletionService producerService = startProducers(); + startProducers(); Future future = startConsumer(); // Wait for consumer to be done return future.get(); + } catch (InterruptedException ie) { + shutdownNow(); + Thread.currentThread().interrupt(); + throw new HoodieException(ie); } catch (Exception e) { throw new HoodieException(e); } @@ -149,7 +167,29 @@ public boolean isRemaining() { } public void shutdownNow() { - executorService.shutdownNow(); + producerExecutorService.shutdownNow(); + consumerExecutorService.shutdownNow(); + // close queue to force producer stop + queue.close(); + } + + public boolean awaitTermination() { + // if current thread has been interrupted before awaitTermination was called, we still give + // executor a chance to proceeding. So clear the interrupt flag and reset it if needed before return. + boolean interruptedBefore = Thread.interrupted(); + boolean producerTerminated = false; + boolean consumerTerminated = false; + try { + producerTerminated = producerExecutorService.awaitTermination(TERMINATE_WAITING_TIME_SECS, TimeUnit.SECONDS); + consumerTerminated = consumerExecutorService.awaitTermination(TERMINATE_WAITING_TIME_SECS, TimeUnit.SECONDS); + } catch (InterruptedException ie) { + // fail silently for any other interruption + } + // reset interrupt flag if needed + if (interruptedBefore) { + Thread.currentThread().interrupt(); + } + return producerTerminated && consumerTerminated; } public BoundedInMemoryQueue getQueue() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java index f1ebdcd449dec..dfe33b49ec0c7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryQueue.java @@ -78,10 +78,10 @@ public class BoundedInMemoryQueue implements Iterable { private final long memoryLimit; /** - * it holds the root cause of the exception in case either queueing records + * it holds the root cause of the Throwable in case either queueing records * (consuming from inputIterator) fails or thread reading records from queue fails. 
*/ - private final AtomicReference hasFailed = new AtomicReference<>(null); + private final AtomicReference hasFailed = new AtomicReference<>(null); /** Used for indicating that all the records from queue are read successfully. **/ private final AtomicBoolean isReadDone = new AtomicBoolean(false); @@ -172,7 +172,7 @@ private void adjustBufferSizeIfNeeded(final O payload) throws InterruptedExcepti /** * Inserts record into queue after applying transformation. * - * @param t Item to be queueed + * @param t Item to be queued */ public void insertRecord(I t) throws Exception { // If already closed, throw exception @@ -222,7 +222,7 @@ private Option readNextRecord() { throw new HoodieException(e); } } - // Check one more time here as it is possible producer errored out and closed immediately + // Check one more time here as it is possible producer erred out and closed immediately throwExceptionIfFailed(); if (newRecord != null && newRecord.isPresent()) { @@ -244,6 +244,7 @@ public void close() { private void throwExceptionIfFailed() { if (this.hasFailed.get() != null) { + close(); throw new HoodieException("operation has failed", this.hasFailed.get()); } } @@ -251,7 +252,7 @@ private void throwExceptionIfFailed() { /** * API to allow producers and consumer to communicate termination due to failure. */ - public void markAsFailed(Exception e) { + public void markAsFailed(Throwable e) { this.hasFailed.set(e); // release the permits so that if the queueing thread is waiting for permits then it will // get it. diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/ExceptionUtil.java b/hudi-common/src/main/java/org/apache/hudi/exception/ExceptionUtil.java new file mode 100644 index 0000000000000..a0550ba9eaf60 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/ExceptionUtil.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
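To illustrate the reworked producer/consumer wiring above (separate named executor services, the new iterator-based constructor, and the preExecuteRunnable hook), here is a hedged sketch. The consumeOneRecord/finish/getResult contract of BoundedInMemoryQueueConsumer is assumed from the existing class, which this hunk does not touch, and the buffer limit is an arbitrary illustrative value.

```java
import org.apache.hudi.common.util.Functions;
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer;

import java.util.Arrays;
import java.util.Iterator;

public class BoundedInMemoryExecutorExample {
  public static void main(String[] args) {
    Iterator<String> input = Arrays.asList("1", "2", "3").iterator();

    // Assumed consumer contract (consumeOneRecord/finish/getResult) -- not part of this diff
    BoundedInMemoryQueueConsumer<Integer, Integer> summingConsumer =
        new BoundedInMemoryQueueConsumer<Integer, Integer>() {
          private int sum = 0;

          @Override
          public void consumeOneRecord(Integer record) {
            sum += record;
          }

          @Override
          public void finish() {
            // nothing to flush
          }

          @Override
          public Integer getResult() {
            return sum;
          }
        };

    // Iterator-based constructor added in this change; Functions.noop() stands in for the
    // old preExecute() hook when no environment-specific setup is needed
    BoundedInMemoryExecutor<String, Integer, Integer> executor =
        new BoundedInMemoryExecutor<>(16 * 1024 * 1024L, input, summingConsumer, Integer::parseInt, Functions.noop());
    try {
      System.out.println(executor.execute());  // blocks until the consumer finishes -> 6
    } finally {
      executor.shutdownNow();
      executor.awaitTermination();             // bounded wait for both thread pools
    }
  }
}
```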
+ */ + +package org.apache.hudi.exception; + +import javax.annotation.Nonnull; + +/** + * Class collecting common utilities helping in handling {@link Exception}s + */ +public final class ExceptionUtil { + private ExceptionUtil() {} + + /** + * Fetches inner-most cause of the provided {@link Throwable} + */ + @Nonnull + public static Throwable getRootCause(@Nonnull Throwable t) { + Throwable cause = t; + while (cause.getCause() != null) { + cause = cause.getCause(); + } + + return cause; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCatalogException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCatalogException.java new file mode 100644 index 0000000000000..ccfef909096b5 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCatalogException.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + * Exception thrown for Hoodie Catalog errors. + */ +public class HoodieCatalogException extends RuntimeException { + + public HoodieCatalogException() { + super(); + } + + public HoodieCatalogException(String message) { + super(message); + } + + public HoodieCatalogException(String message, Throwable t) { + super(message, t); + } + + public HoodieCatalogException(Throwable t) { + super(t); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCompactException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCompactException.java new file mode 100644 index 0000000000000..0d51706bbec5b --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieCompactException.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
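The new ExceptionUtil above is small enough to demonstrate inline; a quick sketch of unwrapping a nested failure (the exception chain here is purely illustrative):

```java
import org.apache.hudi.exception.ExceptionUtil;
import org.apache.hudi.exception.HoodieException;

import java.io.IOException;

public class RootCauseExample {
  public static void main(String[] args) {
    // An illustrative chain: HoodieException -> RuntimeException -> IOException
    Throwable wrapped = new HoodieException("commit failed",
        new RuntimeException("retries exhausted", new IOException("connection reset")));

    // Walks getCause() down to the inner-most throwable
    Throwable root = ExceptionUtil.getRootCause(wrapped);
    System.out.println(root.getClass().getSimpleName() + ": " + root.getMessage());
    // -> IOException: connection reset
  }
}
```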
+ */ + +package org.apache.hudi.exception; + +public class HoodieCompactException extends HoodieException { + + public HoodieCompactException(String msg) { + super(msg); + } + + public HoodieCompactException(String msg, Throwable e) { + super(msg, e); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieDebeziumAvroPayloadException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieDebeziumAvroPayloadException.java new file mode 100644 index 0000000000000..a1c83477dc3b9 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieDebeziumAvroPayloadException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.exception; + +import java.io.IOException; + +public class HoodieDebeziumAvroPayloadException extends IOException { + + public HoodieDebeziumAvroPayloadException(String msg) { + super(msg); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieDuplicateKeyException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieDuplicateKeyException.java new file mode 100644 index 0000000000000..b75b7e97a09ad --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieDuplicateKeyException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + * Exception throws when insert a duplicate key to the table using sql insert statement. 
+ */ +public class HoodieDuplicateKeyException extends HoodieException { + + public HoodieDuplicateKeyException(String duplicateKey) { + super("Duplicate key found for insert statement, key is: " + duplicateKey); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java index 58adc0e4307ad..9bf01b34cdb50 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieException.java @@ -18,7 +18,6 @@ package org.apache.hudi.exception; -import java.io.Serializable; /** *
@@ -29,7 +28,7 @@ * exception. *
    */ -public class HoodieException extends RuntimeException implements Serializable { +public class HoodieException extends RuntimeException { public HoodieException() { super(); diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieHeartbeatException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieHeartbeatException.java new file mode 100644 index 0000000000000..eef6baf40e285 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieHeartbeatException.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +import java.io.Serializable; + +/** + *
+ * Exception thrown for Hoodie heartbeat failures.
+ *
+ * Hoodie Write/Read clients will throw this exception if any of its operations fail. This is a runtime (unchecked)
+ * exception.
    + */ +public class HoodieHeartbeatException extends RuntimeException implements Serializable { + + public HoodieHeartbeatException() { + super(); + } + + public HoodieHeartbeatException(String message) { + super(message); + } + + public HoodieHeartbeatException(String message, Throwable t) { + super(message, t); + } + + public HoodieHeartbeatException(Throwable t) { + super(t); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java new file mode 100644 index 0000000000000..a739af67909b0 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + * Exception for incompatible schema. + */ +public class HoodieIncompatibleSchemaException extends RuntimeException { + + public HoodieIncompatibleSchemaException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieIncompatibleSchemaException(String msg) { + super(msg); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieLockException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieLockException.java new file mode 100644 index 0000000000000..cd1ff7f0eaf02 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieLockException.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + *
+ * Exception thrown for Hoodie lock acquisition failures.
+ *
+ * Hoodie Write clients will throw this exception if unable to acquire a lock. This is a runtime (unchecked)
+ * exception.
    + */ +public class HoodieLockException extends HoodieException { + + public HoodieLockException(String msg) { + super(msg); + } + + public HoodieLockException(Throwable e) { + super(e); + } + + public HoodieLockException(String msg, Throwable e) { + super(msg, e); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieMetadataException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieMetadataException.java new file mode 100644 index 0000000000000..132a9f804edf2 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieMetadataException.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + *
+ * Exception thrown for table metadata related failures.
    + */ +public class HoodieMetadataException extends HoodieException { + public HoodieMetadataException(String msg, Exception t) { + super(msg, t); + } + + public HoodieMetadataException(String msg) { + super(msg); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRemoteException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRemoteException.java index 24eb2a1699e7d..2cf52805b544a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRemoteException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRemoteException.java @@ -29,4 +29,7 @@ public HoodieRemoteException(IOException t) { super(t.getMessage(), t); } + public HoodieRemoteException(String message, IOException t) { + super(message + "\n" + t.getMessage(), t); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieValidationException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieValidationException.java new file mode 100644 index 0000000000000..04b696d309f23 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieValidationException.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + *
+ * Exception thrown for validation failures.
    + */ +public class HoodieValidationException extends HoodieException { + + public HoodieValidationException(String msg, Throwable t) { + super(msg, t); + } + + public HoodieValidationException(String msg) { + super(msg); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieWriteConflictException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieWriteConflictException.java new file mode 100644 index 0000000000000..f0f6dcbf0ab17 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieWriteConflictException.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +/** + *
+ * Exception thrown for Hoodie write conflict failures.
+ *
+ * Hoodie Write clients will throw this exception if unable to commit due to conflicts. This is a runtime (unchecked)
+ * exception.
    + */ +public class HoodieWriteConflictException extends HoodieException { + + public HoodieWriteConflictException(String msg) { + super(msg); + } + + public HoodieWriteConflictException(Throwable e) { + super(e); + } + + public HoodieWriteConflictException(String msg, Throwable e) { + super(msg, e); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/TableNotFoundException.java b/hudi-common/src/main/java/org/apache/hudi/exception/TableNotFoundException.java index 7666e90a74f90..fd5fe102decb5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/TableNotFoundException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/TableNotFoundException.java @@ -18,9 +18,11 @@ package org.apache.hudi.exception; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import java.io.FileNotFoundException; import java.io.IOException; /** @@ -34,23 +36,25 @@ public TableNotFoundException(String basePath) { super(getErrorMessage(basePath)); } + public TableNotFoundException(String basePath, Throwable t) { + super(getErrorMessage(basePath), t); + } + private static String getErrorMessage(String basePath) { return "Hoodie table not found in path " + basePath; } public static void checkTableValidity(FileSystem fs, Path basePathDir, Path metaPathDir) { - // Check if the base path is found + // Check if the base and meta paths are found try { - if (!fs.exists(basePathDir) || !fs.isDirectory(basePathDir)) { - throw new TableNotFoundException(basePathDir.toString()); - } - // Check if the meta path is found - if (!fs.exists(metaPathDir) || !fs.isDirectory(metaPathDir)) { + // Since metaPath is within the basePath, it is enough to check the metaPath exists + FileStatus status = fs.getFileStatus(metaPathDir); + if (!status.isDirectory()) { throw new TableNotFoundException(metaPathDir.toString()); } - } catch (IllegalArgumentException e) { + } catch (FileNotFoundException | IllegalArgumentException e) { // if the base path is file:///, then we have a IllegalArgumentException - throw new TableNotFoundException(metaPathDir.toString()); + throw new TableNotFoundException(metaPathDir.toString(), e); } catch (IOException e) { throw new HoodieIOException("Could not check if " + basePathDir + " is a valid table", e); } diff --git a/hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java b/hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java new file mode 100644 index 0000000000000..d6e35dbbdc5a8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.exception.HoodieException; + +import javax.annotation.concurrent.ThreadSafe; +import java.net.URI; +import java.net.URISyntaxException; + +/** + * This is an extension of the {@code Path} class allowing to avoid repetitive + * computations (like {@code getFileName}, {@code toString}) which are secured + * by its immutability + * + * NOTE: This class is thread-safe + */ +@ThreadSafe +public class CachingPath extends Path { + + // NOTE: `volatile` keyword is redundant here and put mostly for reader notice, since all + // reads/writes to references are always atomic (including 64-bit JVMs) + // https://docs.oracle.com/javase/specs/jls/se8/html/jls-17.html#jls-17.7 + private volatile Path parent; + private volatile String fileName; + private volatile String fullPathStr; + + public CachingPath(String parent, String child) { + super(parent, child); + } + + public CachingPath(Path parent, String child) { + super(parent, child); + } + + public CachingPath(String parent, Path child) { + super(parent, child); + } + + public CachingPath(Path parent, Path child) { + super(parent, child); + } + + public CachingPath(String pathString) throws IllegalArgumentException { + super(pathString); + } + + public CachingPath(URI aUri) { + super(aUri); + } + + @Override + public String getName() { + // This value could be overwritten concurrently and that's okay, since + // {@code Path} is immutable + if (fileName == null) { + fileName = super.getName(); + } + return fileName; + } + + @Override + public Path getParent() { + // This value could be overwritten concurrently and that's okay, since + // {@code Path} is immutable + if (parent == null) { + parent = super.getParent(); + } + + return parent; + } + + @Override + public String toString() { + // This value could be overwritten concurrently and that's okay, since + // {@code Path} is immutable + if (fullPathStr == null) { + fullPathStr = super.toString(); + } + return fullPathStr; + } + + public CachingPath subPath(String relativePath) { + return new CachingPath(this, createPathUnsafe(relativePath)); + } + + public static CachingPath wrap(Path path) { + if (path instanceof CachingPath) { + return (CachingPath) path; + } + + return new CachingPath(path.toUri()); + } + + /** + * Creates path based on the provided *relative* path + * + * NOTE: This is an unsafe version that is relying on the fact that the caller is aware + * what they are doing this is not going to work with paths having scheme (which require + * parsing) and is only meant to work w/ relative paths in a few specific cases. + */ + public static CachingPath createPathUnsafe(String relativePath) { + try { + // NOTE: {@code normalize} is going to be invoked by {@code Path} ctor, so there's no + // point in invoking it here + URI uri = new URI(null, null, relativePath, null, null); + return new CachingPath(uri); + } catch (URISyntaxException e) { + throw new HoodieException("Failed to instantiate relative path", e); + } + } + + /** + * This is {@link Path#getPathWithoutSchemeAndAuthority(Path)} counterpart, instantiating + * {@link CachingPath} + */ + public static Path getPathWithoutSchemeAndAuthority(Path path) { + // This code depends on Path.toString() to remove the leading slash before + // the drive specification on Windows. + return path.isUriPathAbsolute() + ? 
createPathUnsafe(path.toUri().getPath()) + : path; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/hadoop/SerializablePath.java b/hudi-common/src/main/java/org/apache/hudi/hadoop/SerializablePath.java new file mode 100644 index 0000000000000..796600a7e838e --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/hadoop/SerializablePath.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.net.URI; +import java.util.Objects; + +/** + * {@link Serializable} wrapper encapsulating {@link Path} + */ +public class SerializablePath implements Serializable { + + private Path path; + + public SerializablePath(Path path) { + this.path = path; + } + + public Path get() { + return path; + } + + private void writeObject(ObjectOutputStream out) throws IOException { + out.writeObject(path.toUri()); + } + + private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { + URI uri = (URI) in.readObject(); + path = new CachingPath(uri); + } + + @Override + public String toString() { + return path.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SerializablePath that = (SerializablePath) o; + return Objects.equals(path, that.path); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/HoodieSchemaException.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/HoodieSchemaException.java new file mode 100644 index 0000000000000..7fdafc5238d10 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/HoodieSchemaException.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
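A brief usage sketch for the CachingPath and SerializablePath classes above; the s3a path is only an illustrative value.

```java
import org.apache.hadoop.fs.Path;
import org.apache.hudi.hadoop.CachingPath;
import org.apache.hudi.hadoop.SerializablePath;

public class CachingPathExample {
  public static void main(String[] args) {
    // getName()/getParent()/toString() results are computed once and then served from cache
    CachingPath tablePath = new CachingPath("s3a://bucket/warehouse/hudi_table");
    CachingPath partitionPath = tablePath.subPath("dt=2022-01-01"); // relative, skips scheme parsing

    System.out.println(partitionPath.getName());    // dt=2022-01-01
    System.out.println(partitionPath.getParent());  // s3a://bucket/warehouse/hudi_table
    System.out.println(CachingPath.getPathWithoutSchemeAndAuthority(partitionPath));

    // SerializablePath keeps the path usable across serialization boundaries (e.g. engine tasks)
    SerializablePath serializable = new SerializablePath(CachingPath.wrap(new Path("/tmp/hudi")));
    System.out.println(serializable.get());
  }
}
```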
+ */ + +package org.apache.hudi.internal.schema; + +import org.apache.hudi.exception.HoodieException; + +/** + * Exception thrown for Hoodie schema convert failures. The root of the exception hierarchy. + * Hoodie Write/Read clients will throw this exception if any of its operations fail. This is a runtime (unchecked) + * exception. + */ +public class HoodieSchemaException extends HoodieException { + public HoodieSchemaException() { + super(); + } + + public HoodieSchemaException(String message) { + super(message); + } + + public HoodieSchemaException(String message, Throwable t) { + super(message, t); + } + + public HoodieSchemaException(Throwable t) { + super(t); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/InternalSchema.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/InternalSchema.java new file mode 100644 index 0000000000000..659612cd5cfaf --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/InternalSchema.java @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema; + +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.internal.schema.Types.Field; +import org.apache.hudi.internal.schema.Types.RecordType; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Internal schema for hudi table. + * used to support schema evolution. + */ +public class InternalSchema implements Serializable { + + private static final long DEFAULT_VERSION_ID = 0; + + private final RecordType record; + + private int maxColumnId; + private long versionId; + + private transient Map idToField = null; + private transient Map nameToId = null; + private transient Map idToName = null; + + public static InternalSchema getEmptyInternalSchema() { + return new InternalSchema(-1L, new ArrayList<>()); + } + + public boolean isEmptySchema() { + return versionId < 0; + } + + public InternalSchema(List columns) { + this(DEFAULT_VERSION_ID, columns); + } + + public InternalSchema(Field... columns) { + this(DEFAULT_VERSION_ID, Arrays.asList(columns)); + } + + public InternalSchema(long versionId, List cols) { + this.versionId = versionId; + this.record = RecordType.get(cols); + idToName = cols.isEmpty() ? new HashMap<>() : InternalSchemaBuilder.getBuilder().buildIdToName(record); + nameToId = cols.isEmpty() ? new HashMap<>() : idToName.entrySet().stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); + maxColumnId = idToName.isEmpty() ? 
-1 : idToName.keySet().stream().max(Comparator.comparing(Integer::valueOf)).get(); + } + + public InternalSchema(long versionId, int maxColumnId, List cols) { + this.maxColumnId = maxColumnId; + this.versionId = versionId; + this.record = RecordType.get(cols); + buildIdToName(); + } + + public InternalSchema(long versionId, int maxColumnId, Field... cols) { + this(versionId, maxColumnId, Arrays.asList(cols)); + } + + public RecordType getRecord() { + return record; + } + + private Map buildIdToName() { + if (idToName == null) { + idToName = InternalSchemaBuilder.getBuilder().buildIdToName(record); + } + return idToName; + } + + private Map buildNameToId() { + if (nameToId == null) { + if (idToName != null && !idToName.isEmpty()) { + nameToId = idToName.entrySet().stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); + return nameToId; + } + nameToId = InternalSchemaBuilder.getBuilder().buildNameToId(record); + } + return nameToId; + } + + private Map buildIdToField() { + if (idToField == null) { + idToField = InternalSchemaBuilder.getBuilder().buildIdToField(record); + } + return idToField; + } + + /** + * Get all columns full name. + */ + public List getAllColsFullName() { + if (nameToId == null) { + nameToId = InternalSchemaBuilder.getBuilder().buildNameToId(record); + } + return Arrays.asList(nameToId.keySet().toArray(new String[0])); + } + + /** + * Set the version ID for this schema. + */ + public InternalSchema setSchemaId(long versionId) { + this.versionId = versionId; + return this; + } + + /** + * Returns the version ID for this schema. + */ + public long schemaId() { + return this.versionId; + } + + /** + * Set the version ID for this schema. + */ + public void setMaxColumnId(int maxColumnId) { + this.maxColumnId = maxColumnId; + } + + /** + * Returns the max column id for this schema. + */ + public int getMaxColumnId() { + return this.maxColumnId; + } + + /** + * Returns a List of the {@link Field columns} in this Schema. + */ + public List columns() { + return record.fields(); + } + + /** + * Returns the {@link Type} of a sub-field identified by the field name. + * + * @param id a field id + * @return fullName of field of + */ + public String findfullName(int id) { + if (idToName == null) { + buildIdToName(); + } + String result = idToName.get(id); + return result == null ? "" : result; + } + + /** + * Returns the {@link Type} of a sub-field identified by the field name. + * + * @param name a field name + * @return a Type for the sub-field or null if it is not found + */ + public Type findType(String name) { + if (name == null || name.isEmpty()) { + return null; + } + Integer id = buildNameToId().get(name); + if (id != null) { // name is found + return findType(id); + } + return null; + } + + /** + * Returns the {@link Type} of a sub-field identified by the field id. + * + * @param id a field id + * @return a Type for the sub-field or null if it is not found + */ + public Type findType(int id) { + Field field = buildIdToField().get(id); + if (field != null) { + return field.type(); + } + return null; + } + + /** + * Returns all field ids + */ + public Set getAllIds() { + if (idToName == null) { + buildIdToName(); + } + return idToName.keySet(); + } + + /** + * Returns the sub-field identified by the field id. + * + * @param id a field id + * @return the sub-field or null if it is not found + */ + public Field findField(int id) { + return buildIdToField().get(id); + } + + /** + * Returns a sub-field by name as a {@link Field}. 
+ * The result may be a top-level or a nested field. + * + * @param name a String name + * @return a Type for the sub-field or null if it is not found + */ + public Field findField(String name) { + if (name == null || name.isEmpty()) { + return null; + } + Integer id = buildNameToId().get(name); + if (id != null) { + return buildIdToField().get(id); + } + return null; + } + + /** + * Whether colName exists in current Schema. + * Case insensitive. + * + * @param colName a colName + * @return Whether colName exists in current Schema + */ + public boolean findDuplicateCol(String colName) { + return idToName.entrySet().stream().map(e -> e.getValue().toLowerCase(Locale.ROOT)) + .collect(Collectors.toSet()).contains(colName); + } + + public int findIdByName(String name) { + if (name == null || name.isEmpty()) { + return -1; + } + return buildNameToId().getOrDefault(name, -1); + } + + @Override + public String toString() { + return String.format("table {\n%s\n}", + StringUtils.join(record.fields().stream() + .map(f -> " " + f) + .collect(Collectors.toList()).toArray(new String[0]), "\n")); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof InternalSchema)) { + return false; + } + InternalSchema that = (InternalSchema) o; + if (versionId != that.schemaId()) { + return false; + } + return record.equals(that.record); + } + + @Override + public int hashCode() { + return record.hashCode(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/InternalSchemaBuilder.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/InternalSchemaBuilder.java new file mode 100644 index 0000000000000..5fc86ef723958 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/InternalSchemaBuilder.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema; + +import org.apache.hudi.internal.schema.visitor.InternalSchemaVisitor; +import org.apache.hudi.internal.schema.visitor.NameToIDVisitor; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Deque; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * A build class to help build fields for InternalSchema + */ +public class InternalSchemaBuilder implements Serializable { + private static final InternalSchemaBuilder INSTANCE = new InternalSchemaBuilder(); + + public static InternalSchemaBuilder getBuilder() { + return INSTANCE; + } + + private InternalSchemaBuilder() { + } + + + /** + * Build a mapping from id to full field name for a internal Type. 
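Before the InternalSchemaBuilder javadoc continues below, a hedged construction-and-lookup sketch for the InternalSchema class above. The primitive factories Types.StringType.get() and Types.LongType.get() are assumptions (the Types class body is not shown in this excerpt); Types.Field.get(id, isOptional, name, type, doc) follows the five-argument form used by InternalSchemaBuilder#refreshNewId.

```java
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Types;

public class InternalSchemaExample {
  public static void main(String[] args) {
    // Assumed: Types.StringType.get()/Types.LongType.get() primitive factories (not shown in this excerpt)
    InternalSchema schema = new InternalSchema(
        Types.Field.get(0, false, "uuid", Types.StringType.get(), "record key"),
        Types.Field.get(1, true, "ts", Types.LongType.get(), "event time"));

    System.out.println(schema.getAllColsFullName());   // [uuid, ts] (order not guaranteed)
    System.out.println(schema.findIdByName("ts"));     // 1
    System.out.println(schema.findType("ts"));         // long type of the "ts" column
    System.out.println(schema.findField(0).name());    // uuid
  }
}
```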
+ * if a field y belong to a struct filed x, then the full name of y is x.y + * + * @param type hoodie internal type + * @return a mapping from id to full field name + */ + public Map buildIdToName(Type type) { + Map result = new HashMap<>(); + buildNameToId(type).forEach((k, v) -> result.put(v, k)); + return result; + } + + /** + * Build a mapping from full field name to id for a internal Type. + * if a field y belong to a struct filed x, then the full name of y is x.y + * + * @param type hoodie internal type + * @return a mapping from full field name to id + */ + public Map buildNameToId(Type type) { + return visit(type, new NameToIDVisitor()); + } + + /** + * Use to traverse all types in internalSchema with visitor. + * + * @param schema hoodie internal schema + * @return vistor expected result. + */ + public T visit(InternalSchema schema, InternalSchemaVisitor visitor) { + return visitor.schema(schema, visit(schema.getRecord(), visitor)); + } + + public T visit(Type type, InternalSchemaVisitor visitor) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List results = new ArrayList<>(); + for (Types.Field f : record.fields()) { + visitor.beforeField(f); + T result; + try { + result = visit(f.type(), visitor); + } finally { + visitor.afterField(f); + } + results.add(visitor.field(f, result)); + } + return visitor.record(record, results); + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + T elementResult; + Types.Field elementField = array.field(array.elementId()); + visitor.beforeArrayElement(elementField); + try { + elementResult = visit(elementField.type(), visitor); + } finally { + visitor.afterArrayElement(elementField); + } + return visitor.array(array, elementResult); + case MAP: + Types.MapType map = (Types.MapType) type; + T keyResult; + T valueResult; + Types.Field keyField = map.field(map.keyId()); + visitor.beforeMapKey(keyField); + try { + keyResult = visit(map.keyType(), visitor); + } finally { + visitor.afterMapKey(keyField); + } + Types.Field valueField = map.field(map.valueId()); + visitor.beforeMapValue(valueField); + try { + valueResult = visit(map.valueType(), visitor); + } finally { + visitor.afterMapValue(valueField); + } + return visitor.map(map, keyResult, valueResult); + default: + return visitor.primitive((Type.PrimitiveType)type); + } + } + + /** + * Build a mapping from id to field for a internal Type. + * + * @param type hoodie internal type + * @return a mapping from id to field + */ + public Map buildIdToField(Type type) { + Map idToField = new HashMap<>(); + visitIdToField(type, idToField); + return idToField; + } + + private void visitIdToField(Type type, Map index) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + for (Types.Field field : record.fields()) { + visitIdToField(field.type(), index); + index.put(field.fieldId(), field); + } + return; + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + visitIdToField(array.elementType(), index); + for (Types.Field field : array.fields()) { + index.put(field.fieldId(), field); + } + return; + case MAP: + Types.MapType map = (Types.MapType) type; + visitIdToField(map.keyType(), index); + visitIdToField(map.valueType(), index); + for (Types.Field field : map.fields()) { + index.put(field.fieldId(), field); + } + return; + default: + return; + } + } + + /** + * Build a mapping which maintain the relation between child field id and it's parent field id. 
+ * if a child field y(which id is 9) belong to a nest field x(which id is 6), then (9 -> 6) will be added to the result map. + * if a field has no parent field, nothings will be added. + * + * @param record hoodie record type. + * @return a mapping from id to parentId for a record Type + */ + public Map index2Parents(Types.RecordType record) { + Map result = new HashMap<>(); + Deque parentIds = new LinkedList<>(); + index2Parents(record, parentIds, result); + return result; + } + + private void index2Parents(Type type, Deque pids, Map id2p) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType)type; + for (Types.Field f : record.fields()) { + pids.push(f.fieldId()); + index2Parents(f.type(), pids, id2p); + pids.pop(); + } + + for (Types.Field f : record.fields()) { + // root record has no parent id. + if (!pids.isEmpty()) { + Integer pid = pids.peek(); + id2p.put(f.fieldId(), pid); + } + } + return; + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + Types.Field elementField = array.field(array.elementId()); + pids.push(elementField.fieldId()); + index2Parents(elementField.type(), pids, id2p); + pids.pop(); + id2p.put(array.elementId(), pids.peek()); + return; + case MAP: + Types.MapType map = (Types.MapType) type; + Types.Field keyField = map.field(map.keyId()); + Types.Field valueField = map.field(map.valueId()); + // visit key + pids.push(map.keyId()); + index2Parents(keyField.type(), pids, id2p); + pids.pop(); + // visit value + pids.push(map.valueId()); + index2Parents(valueField.type(), pids, id2p); + pids.pop(); + id2p.put(map.keyId(), pids.peek()); + id2p.put(map.valueId(), pids.peek()); + return; + default: + } + } + + /** + * Assigns new ids for all fields in a Type, based on initial id. + * + * @param type a type. 
+ * @param nextId initial id which used to fresh ids for all fields in a type + * @return a new type with new ids + */ + public Type refreshNewId(Type type, AtomicInteger nextId) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List oldFields = record.fields(); + int currentId = nextId.get(); + nextId.set(currentId + record.fields().size()); + List internalFields = new ArrayList<>(); + for (int i = 0; i < oldFields.size(); i++) { + Types.Field oldField = oldFields.get(i); + Type fieldType = refreshNewId(oldField.type(), nextId); + internalFields.add(Types.Field.get(currentId++, oldField.isOptional(), oldField.name(), fieldType, oldField.doc())); + } + return Types.RecordType.get(internalFields); + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + int elementId = nextId.get(); + nextId.set(elementId + 1); + Type elementType = refreshNewId(array.elementType(), nextId); + return Types.ArrayType.get(elementId, array.isElementOptional(), elementType); + case MAP: + Types.MapType map = (Types.MapType) type; + int keyId = nextId.get(); + int valueId = keyId + 1; + nextId.set(keyId + 2); + Type keyType = refreshNewId(map.keyType(), nextId); + Type valueType = refreshNewId(map.valueType(), nextId); + return Types.MapType.get(keyId, valueId, keyType, valueType, map.isValueOptional()); + default: + return type; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/Type.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Type.java new file mode 100644 index 0000000000000..5fec01a822477 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Type.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema; + +import java.io.Serializable; +import java.util.List; +import java.util.Locale; +import java.util.Objects; + +/** + * The type of a schema, reference avro schema. + * now avro version used by hoodie, not support localTime. 
+ * to do add support for localTime if avro version is updated + */ +public interface Type extends Serializable { + enum TypeID { + RECORD, ARRAY, MAP, FIXED, STRING, BINARY, + INT, LONG, FLOAT, DOUBLE, DATE, BOOLEAN, TIME, TIMESTAMP, DECIMAL, UUID; + private String name; + TypeID() { + this.name = this.name().toLowerCase(Locale.ROOT); + } + + public String getName() { + return name; + } + } + + static TypeID fromValue(String value) { + try { + return TypeID.valueOf(value.toUpperCase(Locale.ROOT)); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(String.format("Invalid value of Type: %s", value)); + } + } + + TypeID typeId(); + + default boolean isNestedType() { + return false; + } + + abstract class PrimitiveType implements Type { + @Override + public boolean isNestedType() { + return false; + } + + /** + * We need to override equals because the check {@code intType1 == intType2} can return {@code false}. + * Despite the fact that most subclasses look like singleton with static field {@code INSTANCE}, + * they can still be created by deserializer. + */ + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof PrimitiveType)) { + return false; + } + PrimitiveType that = (PrimitiveType) o; + return typeId().equals(that.typeId()); + } + + @Override + public int hashCode() { + return Objects.hashCode(typeId()); + } + } + + abstract class NestedType implements Type { + + @Override + public boolean isNestedType() { + return true; + } + + public abstract List fields(); + + public abstract Type fieldType(String name); + + public abstract Types.Field field(int id); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/Types.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Types.java new file mode 100644 index 0000000000000..fff10a700f618 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Types.java @@ -0,0 +1,716 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.internal.schema; + +import org.apache.hudi.internal.schema.Type.PrimitiveType; +import org.apache.hudi.internal.schema.Type.NestedType; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +public class Types { + private Types() { + } + + public static class BooleanType extends PrimitiveType { + private static final BooleanType INSTANCE = new BooleanType(); + + public static BooleanType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return Type.TypeID.BOOLEAN; + } + + @Override + public String toString() { + return "boolean"; + } + } + + public static class IntType extends PrimitiveType { + private static final IntType INSTANCE = new IntType(); + + public static IntType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.INT; + } + + @Override + public String toString() { + return "int"; + } + } + + public static class LongType extends PrimitiveType { + private static final LongType INSTANCE = new LongType(); + + public static LongType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.LONG; + } + + @Override + public String toString() { + return "long"; + } + } + + public static class FloatType extends PrimitiveType { + private static final FloatType INSTANCE = new FloatType(); + + public static FloatType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.FLOAT; + } + + @Override + public String toString() { + return "float"; + } + } + + public static class DoubleType extends PrimitiveType { + private static final DoubleType INSTANCE = new DoubleType(); + + public static DoubleType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.DOUBLE; + } + + @Override + public String toString() { + return "double"; + } + } + + public static class DateType extends PrimitiveType { + private static final DateType INSTANCE = new DateType(); + + public static DateType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.DATE; + } + + @Override + public String toString() { + return "date"; + } + } + + public static class TimeType extends PrimitiveType { + private static final TimeType INSTANCE = new TimeType(); + + public static TimeType get() { + return INSTANCE; + } + + private TimeType() { + } + + @Override + public TypeID typeId() { + return TypeID.TIME; + } + + @Override + public String toString() { + return "time"; + } + } + + public static class TimestampType extends PrimitiveType { + private static final TimestampType INSTANCE = new TimestampType(); + + public static TimestampType get() { + return INSTANCE; + } + + private TimestampType() { + } + + @Override + public TypeID typeId() { + return TypeID.TIMESTAMP; + } + + @Override + public String toString() { + return "timestamp"; + } + } + + public static class StringType extends PrimitiveType { + private static final StringType INSTANCE = new StringType(); + + public static StringType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.STRING; + } + + @Override + public String toString() { + return "string"; + } + } + + public static class BinaryType extends PrimitiveType { + private static final BinaryType INSTANCE = new BinaryType(); + + public static BinaryType get() { + return INSTANCE; + } + + @Override 
+ public TypeID typeId() { + return TypeID.BINARY; + } + + @Override + public String toString() { + return "binary"; + } + } + + public static class FixedType extends PrimitiveType { + public static FixedType getFixed(int size) { + return new FixedType(size); + } + + private final int size; + + private FixedType(int length) { + this.size = length; + } + + public int getFixedSize() { + return size; + } + + @Override + public TypeID typeId() { + return TypeID.FIXED; + } + + @Override + public String toString() { + return String.format("fixed[%d]", size); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof FixedType)) { + return false; + } + + FixedType fixedType = (FixedType) o; + return size == fixedType.size; + } + + @Override + public int hashCode() { + return Objects.hash(FixedType.class, size); + } + } + + public static class DecimalType extends PrimitiveType { + public static DecimalType get(int precision, int scale) { + return new DecimalType(precision, scale); + } + + private final int scale; + private final int precision; + + private DecimalType(int precision, int scale) { + this.scale = scale; + this.precision = precision; + } + + /** + * Returns whether this DecimalType is wider than `other`. If yes, it means `other` + * can be casted into `this` safely without losing any precision or range. + */ + public boolean isWiderThan(PrimitiveType other) { + if (other instanceof DecimalType) { + DecimalType dt = (DecimalType) other; + return (precision - scale) >= (dt.precision - dt.scale) && scale > dt.scale; + } + if (other instanceof IntType) { + return isWiderThan(get(10, 0)); + } + return false; + } + + /** + * Returns whether this DecimalType is tighter than `other`. If yes, it means `this` + * can be casted into `other` safely without losing any precision or range. + */ + public boolean isTighterThan(PrimitiveType other) { + if (other instanceof DecimalType) { + DecimalType dt = (DecimalType) other; + return (precision - scale) <= (dt.precision - dt.scale) && scale <= dt.scale; + } + if (other instanceof IntType) { + return isTighterThan(get(10, 0)); + } + return false; + } + + public int scale() { + return scale; + } + + public int precision() { + return precision; + } + + @Override + public TypeID typeId() { + return TypeID.DECIMAL; + } + + @Override + public String toString() { + return String.format("decimal(%d, %d)", precision, scale); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof DecimalType)) { + return false; + } + + DecimalType that = (DecimalType) o; + if (scale != that.scale) { + return false; + } + return precision == that.precision; + } + + @Override + public int hashCode() { + return Objects.hash(DecimalType.class, scale, precision); + } + } + + public static class UUIDType extends PrimitiveType { + private static final UUIDType INSTANCE = new UUIDType(); + + public static UUIDType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.UUID; + } + + @Override + public String toString() { + return "uuid"; + } + } + + /** A field within a record. 
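+ * e.g. {@code Field.get(1, true, "age", Types.IntType.get())} prints as {@code 1: age: optional int}
+ * (the id and name here are purely illustrative).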
*/ + public static class Field implements Serializable { + // Experimental method to support defaultValue + public static Field get(int id, boolean isOptional, String name, Type type, String doc, Object defaultValue) { + return new Field(isOptional, id, name, type, doc, defaultValue); + } + + public static Field get(int id, boolean isOptional, String name, Type type, String doc) { + return new Field(isOptional, id, name, type, doc, null); + } + + public static Field get(int id, boolean isOptional, String name, Type type) { + return new Field(isOptional, id, name, type, null, null); + } + + public static Field get(int id, String name, Type type) { + return new Field(true, id, name, type, null, null); + } + + private final boolean isOptional; + private final int id; + private final String name; + private final Type type; + private final String doc; + // Experimental properties + private final Object defaultValue; + + private Field(boolean isOptional, int id, String name, Type type, String doc, Object defaultValue) { + this.isOptional = isOptional; + this.id = id; + this.name = name; + this.type = type; + this.doc = doc; + this.defaultValue = defaultValue; + } + + public Object getDefaultValue() { + return defaultValue; + } + + public boolean isOptional() { + return isOptional; + } + + public int fieldId() { + return id; + } + + public String name() { + return name; + } + + public Type type() { + return type; + } + + public String doc() { + return doc; + } + + @Override + public String toString() { + return String.format("%d: %s: %s %s", + id, name, isOptional ? "optional" : "required", type) + (doc != null ? " (" + doc + ")" : ""); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof Field)) { + return false; + } + + Field that = (Field) o; + if (isOptional != that.isOptional) { + return false; + } else if (id != that.id) { + return false; + } else if (!name.equals(that.name)) { + return false; + } else if (!Objects.equals(doc, that.doc)) { + return false; + } + return type.equals(that.type); + } + + @Override + public int hashCode() { + return Objects.hash(Field.class, id, isOptional, name, type); + } + } + + public static class RecordType extends NestedType { + + public static RecordType get(List fields) { + return new RecordType(fields); + } + + public static RecordType get(Field... 
fields) { + return new RecordType(Arrays.asList(fields)); + } + + private final Field[] fields; + + private transient Map nameToFields = null; + private transient Map idToFields = null; + + private RecordType(List fields) { + this.fields = new Field[fields.size()]; + for (int i = 0; i < this.fields.length; i += 1) { + this.fields[i] = fields.get(i); + } + } + + @Override + public List fields() { + return Arrays.asList(fields); + } + + public Field field(String name) { + if (nameToFields == null) { + nameToFields = new HashMap<>(); + for (Field field : fields) { + nameToFields.put(field.name().toLowerCase(Locale.ROOT), field); + } + } + return nameToFields.get(name.toLowerCase(Locale.ROOT)); + } + + @Override + public Field field(int id) { + if (idToFields == null) { + idToFields = new HashMap<>(); + for (Field field : fields) { + idToFields.put(field.fieldId(), field); + } + } + return idToFields.get(id); + } + + @Override + public Type fieldType(String name) { + Field field = field(name); + if (field != null) { + return field.type(); + } + return null; + } + + @Override + public TypeID typeId() { + return TypeID.RECORD; + } + + @Override + public String toString() { + return String.format("Record<%s>", Arrays.stream(fields).map(f -> f.toString()).collect(Collectors.joining("-"))); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof RecordType)) { + return false; + } + + RecordType that = (RecordType) o; + return Arrays.equals(fields, that.fields); + } + + @Override + public int hashCode() { + return Objects.hash(Field.class, Arrays.hashCode(fields)); + } + } + + public static class ArrayType extends NestedType { + public static ArrayType get(int elementId, boolean isOptional, Type elementType) { + return new ArrayType(Field.get(elementId, isOptional,"element", elementType)); + } + + private final Field elementField; + + private ArrayType(Field elementField) { + this.elementField = elementField; + } + + public Type elementType() { + return elementField.type(); + } + + @Override + public Type fieldType(String name) { + if ("element".equals(name)) { + return elementType(); + } + return null; + } + + @Override + public Field field(int id) { + if (elementField.fieldId() == id) { + return elementField; + } + return null; + } + + @Override + public List fields() { + return Arrays.asList(elementField); + } + + public int elementId() { + return elementField.fieldId(); + } + + public boolean isElementOptional() { + return elementField.isOptional; + } + + @Override + public TypeID typeId() { + return TypeID.ARRAY; + } + + @Override + public String toString() { + return String.format("list<%s>", elementField.type()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof ArrayType)) { + return false; + } + ArrayType listType = (ArrayType) o; + return elementField.equals(listType.elementField); + } + + @Override + public int hashCode() { + return Objects.hash(ArrayType.class, elementField); + } + } + + public static class MapType extends NestedType { + + public static MapType get(int keyId, int valueId, Type keyType, Type valueType) { + return new MapType( + Field.get(keyId, "key", keyType), + Field.get(valueId, "value", valueType)); + } + + public static MapType get(int keyId, int valueId, Type keyType, Type valueType, boolean isOptional) { + return new MapType( + Field.get(keyId, isOptional, "key", keyType), + Field.get(valueId, isOptional, "value", valueType)); + } + + 
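+    // A usage sketch (the field ids are illustrative):
+    //   Types.MapType.get(1, 2, Types.StringType.get(), Types.LongType.get()) builds map<string, long>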
private final Field keyField; + private final Field valueField; + private transient List fields = null; + + private MapType(Field keyField, Field valueField) { + this.keyField = keyField; + this.valueField = valueField; + } + + public Type keyType() { + return keyField.type(); + } + + public Type valueType() { + return valueField.type(); + } + + @Override + public Type fieldType(String name) { + if ("key".equals(name)) { + return keyField.type(); + } else if ("value".equals(name)) { + return valueField.type(); + } + return null; + } + + @Override + public Field field(int id) { + if (keyField.fieldId() == id) { + return keyField; + } else if (valueField.fieldId() == id) { + return valueField; + } + return null; + } + + @Override + public List fields() { + if (fields == null) { + fields = Arrays.asList(keyField, valueField); + } + return fields; + } + + public int keyId() { + return keyField.fieldId(); + } + + public int valueId() { + return valueField.fieldId(); + } + + public boolean isValueOptional() { + return valueField.isOptional; + } + + @Override + public TypeID typeId() { + return TypeID.MAP; + } + + @Override + public String toString() { + return String.format("map<%s, %s>", keyField.type(), valueField.type()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (!(o instanceof MapType)) { + return false; + } + + MapType mapType = (MapType) o; + if (!keyField.equals(mapType.keyField)) { + return false; + } + return valueField.equals(mapType.valueField); + } + + @Override + public int hashCode() { + return Objects.hash(MapType.class, keyField, valueField); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaChangeApplier.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaChangeApplier.java new file mode 100644 index 0000000000000..36aac462a137e --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaChangeApplier.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.action; + +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.utils.SchemaChangeUtils; + +import java.util.Arrays; + +/** + * Manage schema change for HoodieWriteClient. + */ +public class InternalSchemaChangeApplier { + private InternalSchema latestSchema; + + public InternalSchemaChangeApplier(InternalSchema latestSchema) { + this.latestSchema = latestSchema; + } + + /** + * Add columns to table. + * + * @param colName col name to be added. 
if we want to add col to a nested filed, the fullName should be specify + * @param colType col type to be added. + * @param doc col doc to be added. + * @param position col position to be added + * @param positionType col position change type. now support three change types: first/after/before + */ + public InternalSchema applyAddChange( + String colName, + Type colType, + String doc, + String position, + TableChange.ColumnPositionChange.ColumnPositionType positionType) { + TableChanges.ColumnAddChange add = TableChanges.ColumnAddChange.get(latestSchema); + String parentName = TableChangesHelper.getParentName(colName); + String leafName = TableChangesHelper.getLeafName(colName); + add.addColumns(parentName, leafName, colType, doc); + if (positionType != null) { + switch (positionType) { + case NO_OPERATION: + break; + case FIRST: + add.addPositionChange(colName, "", positionType); + break; + case AFTER: + case BEFORE: + if (position == null || position.isEmpty()) { + throw new IllegalArgumentException("position should not be null/empty_string when specify positionChangeType as after/before"); + } + String referParentName = TableChangesHelper.getParentName(position); + if (!parentName.equals(referParentName)) { + throw new IllegalArgumentException("cannot reorder two columns which has different parent"); + } + add.addPositionChange(colName, position, positionType); + break; + default: + throw new IllegalArgumentException(String.format("only support first/before/after but found: %s", positionType)); + } + } else { + throw new IllegalArgumentException(String.format("positionType should be specified")); + } + return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, add); + } + + /** + * Delete columns to table. + * + * @param colNames col name to be deleted. if we want to delete col from a nested filed, the fullName should be specify + */ + public InternalSchema applyDeleteChange(String... colNames) { + TableChanges.ColumnDeleteChange delete = TableChanges.ColumnDeleteChange.get(latestSchema); + Arrays.stream(colNames).forEach(colName -> delete.deleteColumn(colName)); + return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, delete); + } + + /** + * Rename col name for hudi table. + * + * @param colName col name to be renamed. if we want to rename col from a nested filed, the fullName should be specify + * @param newName new name for current col. no need to specify fullName. + */ + public InternalSchema applyRenameChange(String colName, String newName) { + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema); + updateChange.renameColumn(colName, newName); + return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange); + } + + /** + * Update col nullability for hudi table. + * + * @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify + * @param nullable . + */ + public InternalSchema applyColumnNullabilityChange(String colName, boolean nullable) { + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema); + updateChange.updateColumnNullability(colName, nullable); + return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange); + } + + /** + * Update col type for hudi table. + * + * @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify + * @param newType . 
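+   *                only widening primitive updates are accepted; a hedged sketch (the column name is illustrative):
+   *                {@code applyColumnTypeChange("price", Types.LongType.get())} to widen an int column to long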
+ */ + public InternalSchema applyColumnTypeChange(String colName, Type newType) { + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema); + updateChange.updateColumnType(colName, newType); + return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange); + } + + /** + * Update col comment for hudi table. + * + * @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify + * @param doc . + */ + public InternalSchema applyColumnCommentChange(String colName, String doc) { + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema); + updateChange.updateColumnComment(colName, doc); + return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange); + } + + /** + * Reorder the position of col. + * + * @param colName column which need to be reordered. if we want to change col from a nested filed, the fullName should be specify. + * @param referColName reference position. + * @param positionType col position change type. now support three change types: first/after/before + */ + public InternalSchema applyReOrderColPositionChange( + String colName, + String referColName, + TableChange.ColumnPositionChange.ColumnPositionType positionType) { + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema); + String parentName = TableChangesHelper.getParentName(colName); + String referParentName = TableChangesHelper.getParentName(referColName); + if (positionType.equals(TableChange.ColumnPositionChange.ColumnPositionType.FIRST)) { + updateChange.addPositionChange(colName, "", positionType); + } else if (parentName.equals(referParentName)) { + updateChange.addPositionChange(colName, referColName, positionType); + } else { + throw new IllegalArgumentException("cannot reorder two columns which has different parent"); + } + return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java new file mode 100644 index 0000000000000..cd9bae0541cdc --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.internal.schema.action; + +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; + +import java.util.ArrayList; +import java.util.List; + +/** + * Auxiliary class. + * help to merge file schema and query schema to produce final read schema for avro/parquet file + */ +public class InternalSchemaMerger { + private final InternalSchema fileSchema; + private final InternalSchema querySchema; + // now there exist some bugs when we use spark update/merge api, + // those operation will change col nullability from optional to required which is wrong. + // Before that bug is fixed, we need to do adapt. + // if mergeRequiredFiledForce is true, we will ignore the col's required attribute. + private final boolean ignoreRequiredAttribute; + // Whether to use column Type from file schema to read files when we find some column type has changed. + // spark parquetReader need the original column type to read data, otherwise the parquetReader will failed. + // eg: current column type is StringType, now we changed it to decimalType, + // we should not pass decimalType to parquetReader, we must pass StringType to it; when we read out the data, we convert data from String to Decimal, everything is ok. + // for log reader + // since our reWriteRecordWithNewSchema function support rewrite directly, so we no need this parameter + // eg: current column type is StringType, now we changed it to decimalType, + // we can pass decimalType to reWriteRecordWithNewSchema directly, everything is ok. + private boolean useColumnTypeFromFileSchema = true; + + // deal with rename + // Whether to use column name from file schema to read files when we find some column name has changed. + // spark parquetReader need the original column name to read data, otherwise the parquetReader will read nothing. + // eg: current column name is colOldName, now we rename it to colNewName, + // we should not pass colNewName to parquetReader, we must pass colOldName to it; when we read out the data. + // for log reader + // since our reWriteRecordWithNewSchema function support rewrite directly, so we no need this parameter + // eg: current column name is colOldName, now we rename it to colNewName, + // we can pass colNewName to reWriteRecordWithNewSchema directly, everything is ok. + private boolean useColNameFromFileSchema = true; + + public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema, boolean useColNameFromFileSchema) { + this.fileSchema = fileSchema; + this.querySchema = querySchema; + this.ignoreRequiredAttribute = ignoreRequiredAttribute; + this.useColumnTypeFromFileSchema = useColumnTypeFromFileSchema; + this.useColNameFromFileSchema = useColNameFromFileSchema; + } + + public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema) { + this(fileSchema, querySchema, ignoreRequiredAttribute, useColumnTypeFromFileSchema, true); + } + + /** + * Create final read schema to read avro/parquet file. + * + * @return read schema to read avro/parquet file. 
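+   *         (by default the merged schema keeps the query schema's field ids but reuses the file schema's
+   *         column names and types, so a parquet reader can still resolve renamed or type-changed columns)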
+ */ + public InternalSchema mergeSchema() { + Types.RecordType record = (Types.RecordType) mergeType(querySchema.getRecord(), 0); + return new InternalSchema(record.fields()); + } + + /** + * Create final read schema to read avro/parquet file. + * this is auxiliary function used by mergeSchema. + */ + private Type mergeType(Type type, int currentTypeId) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List newTypes = new ArrayList<>(); + for (Types.Field f : record.fields()) { + Type newType = mergeType(f.type(), f.fieldId()); + newTypes.add(newType); + } + return Types.RecordType.get(buildRecordType(record.fields(), newTypes)); + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + Type newElementType; + Types.Field elementField = array.fields().get(0); + newElementType = mergeType(elementField.type(), elementField.fieldId()); + return buildArrayType(array, newElementType); + case MAP: + Types.MapType map = (Types.MapType) type; + Type newValueType = mergeType(map.valueType(), map.valueId()); + return buildMapType(map, newValueType); + default: + return buildPrimitiveType((Type.PrimitiveType) type, currentTypeId); + } + } + + private List buildRecordType(List oldFields, List newTypes) { + List newFields = new ArrayList<>(); + for (int i = 0; i < newTypes.size(); i++) { + Type newType = newTypes.get(i); + Types.Field oldField = oldFields.get(i); + int fieldId = oldField.fieldId(); + String fullName = querySchema.findfullName(fieldId); + if (fileSchema.findField(fieldId) != null) { + if (fileSchema.findfullName(fieldId).equals(fullName)) { + // maybe col type changed, deal with it. + newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc())); + } else { + // find rename, deal with it. + newFields.add(dealWithRename(fieldId, newType, oldField)); + } + } else { + // buildFullName + fullName = normalizeFullName(fullName); + if (fileSchema.findField(fullName) != null) { + newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name() + "suffix", oldField.type(), oldField.doc())); + } else { + // find add column + // now there exist some bugs when we use spark update/merge api, those operation will change col optional to required. + if (ignoreRequiredAttribute) { + newFields.add(Types.Field.get(oldField.fieldId(), true, oldField.name(), newType, oldField.doc())); + } else { + newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc())); + } + } + } + } + return newFields; + } + + private Types.Field dealWithRename(int fieldId, Type newType, Types.Field oldField) { + Types.Field fieldFromFileSchema = fileSchema.findField(fieldId); + String nameFromFileSchema = fieldFromFileSchema.name(); + String nameFromQuerySchema = querySchema.findField(fieldId).name(); + String finalFieldName = useColNameFromFileSchema ? nameFromFileSchema : nameFromQuerySchema; + Type typeFromFileSchema = fieldFromFileSchema.type(); + // Current design mechanism guarantees nestedType change is not allowed, so no need to consider. + if (newType.isNestedType()) { + return Types.Field.get(oldField.fieldId(), oldField.isOptional(), + finalFieldName, newType, oldField.doc()); + } else { + return Types.Field.get(oldField.fieldId(), oldField.isOptional(), + finalFieldName, useColumnTypeFromFileSchema ? 
typeFromFileSchema : newType, oldField.doc()); + } + } + + private String normalizeFullName(String fullName) { + // find parent rename, and normalize fullName + // eg: we renamed a nest field struct(c, d) to aa, the we delete a.d and add it back later. + String[] nameParts = fullName.split("\\."); + String[] normalizedNameParts = new String[nameParts.length]; + System.arraycopy(nameParts, 0, normalizedNameParts, 0, nameParts.length); + for (int j = 0; j < nameParts.length - 1; j++) { + StringBuilder sb = new StringBuilder(); + for (int k = 0; k <= j; k++) { + sb.append(nameParts[k]); + } + String parentName = sb.toString(); + int parentFieldIdFromQuerySchema = querySchema.findIdByName(parentName); + String parentNameFromFileSchema = fileSchema.findfullName(parentFieldIdFromQuerySchema); + if (parentNameFromFileSchema.isEmpty()) { + break; + } + if (!parentNameFromFileSchema.equalsIgnoreCase(parentName)) { + // find parent rename, update nameParts + String[] parentNameParts = parentNameFromFileSchema.split("\\."); + System.arraycopy(parentNameParts, 0, normalizedNameParts, 0, parentNameParts.length); + } + } + return StringUtils.join(normalizedNameParts, "."); + } + + private Type buildArrayType(Types.ArrayType array, Type newType) { + Types.Field elementField = array.fields().get(0); + int elementId = elementField.fieldId(); + if (elementField.type() == newType) { + return array; + } else { + return Types.ArrayType.get(elementId, elementField.isOptional(), newType); + } + } + + private Type buildMapType(Types.MapType map, Type newValue) { + Types.Field valueFiled = map.fields().get(1); + if (valueFiled.type() == newValue) { + return map; + } else { + return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValue, map.isValueOptional()); + } + } + + private Type buildPrimitiveType(Type.PrimitiveType typeFromQuerySchema, int currentPrimitiveTypeId) { + Type typeFromFileSchema = fileSchema.findType(currentPrimitiveTypeId); + if (typeFromFileSchema == null) { + return typeFromQuerySchema; + } else { + return useColumnTypeFromFileSchema ? typeFromFileSchema : typeFromQuerySchema; + } + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChange.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChange.java new file mode 100644 index 0000000000000..7594f94732a90 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChange.java @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.internal.schema.action; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.internal.schema.HoodieSchemaException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.InternalSchemaBuilder; +import org.apache.hudi.internal.schema.Types; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +/** + * TableChange subclasses represent requested changes to a table. + * now only column changes support. + * to do support partition changes + */ +public interface TableChange { + /* The action Type of schema change. */ + enum ColumnChangeID { + ADD, UPDATE, DELETE, PROPERTY_CHANGE, REPLACE; + private String name; + + private ColumnChangeID() { + this.name = this.name().toLowerCase(Locale.ROOT); + } + + public String getName() { + return name; + } + } + + static ColumnChangeID fromValue(String value) { + switch (value.toLowerCase(Locale.ROOT)) { + case "add": + return ColumnChangeID.ADD; + case "change": + return ColumnChangeID.UPDATE; + case "delete": + return ColumnChangeID.DELETE; + case "property": + return ColumnChangeID.PROPERTY_CHANGE; + case "replace": + return ColumnChangeID.REPLACE; + default: + throw new IllegalArgumentException("Invalid value of Type."); + } + } + + ColumnChangeID columnChangeId(); + + default boolean withPositionChange() { + return false; + } + + abstract class BaseColumnChange implements TableChange { + protected final InternalSchema internalSchema; + protected final Map id2parent; + protected final Map> positionChangeMap = new HashMap<>(); + + BaseColumnChange(InternalSchema schema) { + this.internalSchema = schema; + this.id2parent = InternalSchemaBuilder.getBuilder().index2Parents(schema.getRecord()); + } + + /** + * Add position change. + * + * @param srcName column which need to be reordered + * @param dsrName reference position + * @param orderType change types + * @return this + */ + public BaseColumnChange addPositionChange(String srcName, String dsrName, ColumnPositionChange.ColumnPositionType orderType) { + Integer srcId = findIdByFullName(srcName); + Option dsrIdOpt = dsrName.isEmpty() ? Option.empty() : Option.of(findIdByFullName(dsrName)); + Integer srcParentId = id2parent.get(srcId); + Option dsrParentIdOpt = dsrIdOpt.map(id2parent::get); + // forbid adjust hoodie metadata columns. 
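+      // e.g. reordering an ordinary column to sit before a meta column such as "_hoodie_commit_time" is rejected below.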
+ switch (orderType) { + case BEFORE: + checkColModifyIsLegal(dsrName); + break; + case FIRST: + if (srcId == null || srcId == -1 || srcParentId == null || srcParentId == -1) { + throw new HoodieSchemaException("forbid adjust top-level columns position by using through first syntax"); + } + break; + case AFTER: + List checkColumns = HoodieRecord.HOODIE_META_COLUMNS.subList(0, HoodieRecord.HOODIE_META_COLUMNS.size() - 2); + if (checkColumns.stream().anyMatch(f -> f.equalsIgnoreCase(dsrName))) { + throw new HoodieSchemaException("forbid adjust the position of ordinary columns between meta columns"); + } + break; + case NO_OPERATION: + default: + break; + } + int parentId; + if (srcParentId != null && dsrParentIdOpt.isPresent() && srcParentId.equals(dsrParentIdOpt.get())) { + Types.Field parentField = internalSchema.findField(srcParentId); + if (!(parentField.type() instanceof Types.RecordType)) { + throw new HoodieSchemaException(String.format("only support reorder fields in struct type, but find: %s", parentField.type())); + } + parentId = parentField.fieldId(); + } else if (srcParentId == null && !dsrParentIdOpt.isPresent()) { + parentId = -1; + } else if (srcParentId != null && !dsrParentIdOpt.isPresent() && orderType.equals(ColumnPositionChange.ColumnPositionType.FIRST)) { + parentId = srcParentId; + } else { + throw new HoodieSchemaException("cannot order position from different parent"); + } + + ArrayList changes = positionChangeMap.getOrDefault(parentId, new ArrayList<>()); + changes.add(ColumnPositionChange.get(srcId, dsrIdOpt.orElse(-1), orderType)); + positionChangeMap.put(parentId, changes); + return this; + } + + public BaseColumnChange addPositionChange(String srcName, String dsrName, String orderType) { + return addPositionChange(srcName, dsrName, ColumnPositionChange.fromTypeValue(orderType)); + } + + /** + * Abstract method. + * give a column fullName and return the field id + * + * @param fullName column fullName + * @return field id of current column + */ + protected abstract Integer findIdByFullName(String fullName); + + // Modify hudi meta columns is prohibited + protected void checkColModifyIsLegal(String colNeedToModfiy) { + if (HoodieRecord.HOODIE_META_COLUMNS.stream().anyMatch(f -> f.equalsIgnoreCase(colNeedToModfiy))) { + throw new IllegalArgumentException(String.format("cannot modify hudi meta col: %s", colNeedToModfiy)); + } + } + + @Override + public boolean withPositionChange() { + return false; + } + } + + /** + * Column position change. + * now support three change types: FIRST/AFTER/BEFORE + * FIRST means the specified column should be the first column. + * AFTER means the specified column should be put after the given column. + * BEFORE means the specified column should be put before the given column. + * Note that, the specified column may be a nested field: + * AFTER/BEFORE means the given columns should in the same struct; + * FIRST means this field should be the first one within the struct. + */ + class ColumnPositionChange { + public enum ColumnPositionType { + FIRST, + BEFORE, + AFTER, + // only expose to internal use. 
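+      // NO_OPERATION means no reorder was requested; callers such as applyAddChange simply skip adding a position change.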
+ NO_OPERATION + } + + static ColumnPositionType fromTypeValue(String value) { + switch (value.toLowerCase(Locale.ROOT)) { + case "first": + return ColumnPositionType.FIRST; + case "before": + return ColumnPositionType.BEFORE; + case "after": + return ColumnPositionType.AFTER; + case "no_operation": + return ColumnPositionType.NO_OPERATION; + default: + throw new IllegalArgumentException(String.format("only support first/before/after but found: %s", value)); + } + } + + private final int srcId; + private final int dsrId; + private final ColumnPositionType type; + + static ColumnPositionChange first(int srcId) { + return new ColumnPositionChange(srcId, -1, ColumnPositionType.FIRST); + } + + static ColumnPositionChange before(int srcId, int dsrId) { + return new ColumnPositionChange(srcId, dsrId, ColumnPositionType.BEFORE); + } + + static ColumnPositionChange after(int srcId, int dsrId) { + return new ColumnPositionChange(srcId, dsrId, ColumnPositionType.AFTER); + } + + static ColumnPositionChange get(int srcId, int dsrId, String type) { + return get(srcId, dsrId, fromTypeValue(type)); + } + + static ColumnPositionChange get(int srcId, int dsrId, ColumnPositionType type) { + switch (type) { + case FIRST: + return ColumnPositionChange.first(srcId); + case BEFORE: + return ColumnPositionChange.before(srcId, dsrId); + case AFTER: + return ColumnPositionChange.after(srcId, dsrId); + default: + throw new IllegalArgumentException(String.format("only support first/before/after but found: %s", type)); + } + } + + private ColumnPositionChange(int srcId, int dsrId, ColumnPositionType type) { + this.srcId = srcId; + this.dsrId = dsrId; + this.type = type; + } + + public int getSrcId() { + return srcId; + } + + public int getDsrId() { + return dsrId; + } + + public ColumnPositionType type() { + return type; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChanges.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChanges.java new file mode 100644 index 0000000000000..4e0adc27895af --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChanges.java @@ -0,0 +1,398 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.internal.schema.action; + +import org.apache.hudi.internal.schema.HoodieSchemaException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.InternalSchemaBuilder; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.internal.schema.utils.SchemaChangeUtils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +public class TableChanges { + + /** Deal with update columns changes for table. */ + public static class ColumnUpdateChange extends TableChange.BaseColumnChange { + private final Map updates = new HashMap<>(); + + public static ColumnUpdateChange get(InternalSchema schema) { + return new ColumnUpdateChange(schema); + } + + private ColumnUpdateChange(InternalSchema schema) { + super(schema); + } + + @Override + public boolean withPositionChange() { + return true; + } + + public Type applyUpdates(Types.Field oldField, Type type) { + Types.Field update = updates.get(oldField.fieldId()); + if (update != null && update.type() != oldField.type()) { + return update.type(); + } + // + ArrayList pchanges = positionChangeMap.getOrDefault(oldField.fieldId(), new ArrayList<>()); + if (!pchanges.isEmpty()) { + // when we build ColumnAddChange,we have already done some check, so it's safe to convert newType to RecordType + List newFields = TableChangesHelper.applyAddChange2Fields(((Types.RecordType) type).fields(), new ArrayList<>(), pchanges); + return Types.RecordType.get(newFields); + } + return type; + } + + public Map getUpdates() { + return updates; + } + + /** + * Update a column in the schema to a new type. + * only support update primitive type. + * Only updates that widen types are allowed. + * + * @param name name of the column to update + * @param newType new type for the column + * @return this + * @throws IllegalArgumentException + */ + public ColumnUpdateChange updateColumnType(String name, Type newType) { + checkColModifyIsLegal(name); + if (newType.isNestedType()) { + throw new IllegalArgumentException(String.format("only support update primitive type but find nest column: %s", name)); + } + Types.Field field = internalSchema.findField(name); + if (field == null) { + throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name)); + } + + if (!SchemaChangeUtils.isTypeUpdateAllow(field.type(), newType)) { + throw new IllegalArgumentException(String.format("cannot update origin type: %s to a incompatibility type: %s", field.type(), newType)); + } + + if (field.type().equals(newType)) { + // do nothings + return this; + } + // save update info + Types.Field update = updates.get(field.fieldId()); + if (update == null) { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), field.isOptional(), field.name(), newType, field.doc())); + } else { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), update.isOptional(), update.name(), newType, update.doc())); + } + return this; + } + + /** + * Update a column doc in the schema to a new primitive type. 
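+   * e.g. {@code updateColumnComment("age", "age in years")} sets the doc of column "age" (name and doc are illustrative).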
+ * + * @param name name of the column to update + * @param newDoc new documentation for the column + * @return this + * @throws IllegalArgumentException + */ + public ColumnUpdateChange updateColumnComment(String name, String newDoc) { + checkColModifyIsLegal(name); + Types.Field field = internalSchema.findField(name); + if (field == null) { + throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name)); + } + // consider null + if (Objects.equals(field.doc(), newDoc)) { + // do nothings + return this; + } + // save update info + Types.Field update = updates.get(field.fieldId()); + if (update == null) { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), field.isOptional(), field.name(), field.type(), newDoc)); + } else { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), update.isOptional(), update.name(), update.type(), newDoc)); + } + return this; + } + + /** + * Rename a column in the schema. + * + * @param name name of the column to rename + * @param newName new name for the column + * @return this + * @throws IllegalArgumentException + */ + public ColumnUpdateChange renameColumn(String name, String newName) { + checkColModifyIsLegal(name); + Types.Field field = internalSchema.findField(name); + if (field == null) { + throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name)); + } + if (newName == null || newName.isEmpty()) { + throw new IllegalArgumentException(String.format("cannot rename column: %s to empty", name)); + } + // keep consisitent with hive. column names insensitive, so we check 'newName.toLowerCase(Locale.ROOT)' + if (internalSchema.findDuplicateCol(newName.toLowerCase(Locale.ROOT))) { + throw new IllegalArgumentException(String.format("cannot rename column: %s to a existing name", name)); + } + // save update info + Types.Field update = updates.get(field.fieldId()); + if (update == null) { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), field.isOptional(), newName, field.type(), field.doc())); + } else { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), update.isOptional(), newName, update.type(), update.doc())); + } + return this; + } + + /** + * Update nullable for column. 
+ * only support required type -> optional type + * + * @param name name of the column to update + * @param nullable nullable for updated name + * @return this + * @throws IllegalArgumentException + */ + public ColumnUpdateChange updateColumnNullability(String name, boolean nullable) { + return updateColumnNullability(name, nullable, false); + } + + public ColumnUpdateChange updateColumnNullability(String name, boolean nullable, boolean force) { + checkColModifyIsLegal(name); + Types.Field field = internalSchema.findField(name); + if (field == null) { + throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name)); + } + if (field.isOptional() == nullable) { + // do nothings + return this; + } + if (field.isOptional() && !nullable && !force) { + throw new IllegalArgumentException("cannot update column Nullability: optional to required"); + } + // save update info + Types.Field update = updates.get(field.fieldId()); + if (update == null) { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), nullable, field.name(), field.type(), field.doc())); + } else { + updates.put(field.fieldId(), Types.Field.get(field.fieldId(), nullable, update.name(), update.type(), update.doc())); + } + + return this; + } + + public Map> getPositionChangeMap() { + return positionChangeMap; + } + + @Override + public ColumnChangeID columnChangeId() { + return ColumnChangeID.UPDATE; + } + + @Override + protected Integer findIdByFullName(String fullName) { + Types.Field field = internalSchema.findField(fullName); + if (field != null) { + return field.fieldId(); + } else { + throw new IllegalArgumentException(String.format("cannot find col id for given column fullName: %s", fullName)); + } + } + } + + /** Deal with delete columns changes for table. */ + public static class ColumnDeleteChange extends TableChange.BaseColumnChange { + private final Set deletes = new HashSet<>(); + + @Override + public ColumnChangeID columnChangeId() { + return ColumnChangeID.DELETE; + } + + public static ColumnDeleteChange get(InternalSchema schema) { + return new ColumnDeleteChange(schema); + } + + private ColumnDeleteChange(InternalSchema schema) { + super(schema); + } + + @Override + public boolean withPositionChange() { + return false; + } + + @Override + public BaseColumnChange addPositionChange(String srcId, String dsrId, String orderType) { + throw new UnsupportedOperationException("no support add position change for ColumnDeleteChange"); + } + + public ColumnDeleteChange deleteColumn(String name) { + checkColModifyIsLegal(name); + Types.Field field = internalSchema.findField(name); + if (field == null) { + throw new IllegalArgumentException(String.format("cannot delete missing columns: %s", name)); + } + deletes.add(field.fieldId()); + return this; + } + + public Type applyDelete(int id, Type type) { + if (deletes.contains(id)) { + return null; + } + return type; + } + + public Set getDeletes() { + return deletes; + } + + @Override + protected Integer findIdByFullName(String fullName) { + throw new UnsupportedOperationException("delete change cannot support this method"); + } + } + + /** Deal with add columns changes for table. 
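+   * A usage sketch (the schema variable and column names are illustrative):
+   *   TableChanges.ColumnAddChange add = TableChanges.ColumnAddChange.get(schema);
+   *   add.addColumns("userInfo", "age", Types.IntType.get(), "user age");
+   *   InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(schema, add);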
*/ + public static class ColumnAddChange extends TableChange.BaseColumnChange { + private final Map fullColName2Id = new HashMap<>(); + private final Map> parentId2AddCols = new HashMap<>(); + private int nextId; + + public static ColumnAddChange get(InternalSchema internalSchema) { + return new ColumnAddChange(internalSchema); + } + + public Type applyAdd(Types.Field orignalField, Type type) { + int fieldId = orignalField.fieldId(); + ArrayList addFields = parentId2AddCols.getOrDefault(fieldId, new ArrayList<>()); + ArrayList pchanges = positionChangeMap.getOrDefault(fieldId, new ArrayList<>()); + + if (!addFields.isEmpty() || !pchanges.isEmpty()) { + // when we build ColumnAddChange,we have already done some check, so it's safe to convert newType to RecordType + List newFields = TableChangesHelper.applyAddChange2Fields(((Types.RecordType) type).fields(), addFields, pchanges); + return Types.RecordType.get(newFields); + } + return type; + } + + public ColumnAddChange addColumns(String name, Type type, String doc) { + checkColModifyIsLegal(name); + return addColumns("", name, type, doc); + } + + public ColumnAddChange addColumns(String parent, String name, Type type, String doc) { + checkColModifyIsLegal(name); + addColumnsInternal(parent, name, type, doc); + return this; + } + + private void addColumnsInternal(String parent, String name, Type type, String doc) { + // root record has no parent, so set parentId to -1 as default + int parentId = -1; + // do check + String fullName = name; + if (!parent.isEmpty()) { + Types.Field parentField = internalSchema.findField(parent); + if (parentField == null) { + throw new HoodieSchemaException(String.format("cannot add column: %s which parent: %s is not exist", name, parent)); + } + Type parentType = parentField.type(); + if (!(parentField.type() instanceof Types.RecordType)) { + throw new HoodieSchemaException("only support add nested columns to struct column"); + } + parentId = parentField.fieldId(); + Types.Field newParentField = internalSchema.findField(parent + "." + name); + if (newParentField != null) { + throw new HoodieSchemaException(String.format("cannot add column: %s which already exist", name)); + } + fullName = parent + "." + name; + } else { + // keep consistent with hive, column name case insensitive + if (internalSchema.findDuplicateCol(name.toLowerCase(Locale.ROOT))) { + throw new HoodieSchemaException(String.format("cannot add column: %s which already exist", name)); + } + } + if (fullColName2Id.containsKey(fullName)) { + throw new HoodieSchemaException(String.format("cannot repeat add column: %s", name)); + } + fullColName2Id.put(fullName, nextId); + if (parentId != -1) { + id2parent.put(nextId, parentId); + } + AtomicInteger assignNextId = new AtomicInteger(nextId + 1); + Type typeWithNewId = InternalSchemaBuilder.getBuilder().refreshNewId(type, assignNextId); + // only allow add optional columns. 
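+      // (the field is registered with isOptional=true below, regardless of the nullability of the requested type)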
+ ArrayList adds = parentId2AddCols.getOrDefault(parentId, new ArrayList<>()); + adds.add(Types.Field.get(nextId, true, name, typeWithNewId, doc)); + parentId2AddCols.put(parentId, adds); + nextId = assignNextId.get(); + } + + private ColumnAddChange(InternalSchema internalSchema) { + super(internalSchema); + this.nextId = internalSchema.getMaxColumnId() + 1; + } + + public Map> getParentId2AddCols() { + return parentId2AddCols; + } + + public Map> getPositionChangeMap() { + return positionChangeMap; + } + + // expose to test + public Map getFullColName2Id() { + return fullColName2Id; + } + + protected Integer findIdByFullName(String fullName) { + Types.Field field = internalSchema.findField(fullName); + if (field != null) { + return field.fieldId(); + } + return fullColName2Id.getOrDefault(fullName, -1); + } + + @Override + public ColumnChangeID columnChangeId() { + return ColumnChangeID.ADD; + } + + @Override + public boolean withPositionChange() { + return true; + } + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChangesHelper.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChangesHelper.java new file mode 100644 index 0000000000000..80b9c6298dd89 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/TableChangesHelper.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.action; + +import org.apache.hudi.internal.schema.Types; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +/** + * Helper class to support Table schema changes. + */ +public class TableChangesHelper { + /** + * Apply add operation and column position change operation. + * + * @param fields origin column fields. + * @param adds column fields to be added. + * @param pchanges a wrapper class hold all the position change operations. + * @return column fields after adjusting the position. 
+ */ + public static List applyAddChange2Fields(List fields, ArrayList adds, ArrayList pchanges) { + if (adds == null && pchanges == null) { + return fields; + } + LinkedList result = new LinkedList<>(fields); + // apply add columns + if (adds != null && !adds.isEmpty()) { + result.addAll(adds); + } + // apply position change + if (pchanges != null && !pchanges.isEmpty()) { + for (TableChange.ColumnPositionChange pchange : pchanges) { + Types.Field srcField = result.stream().filter(f -> f.fieldId() == pchange.getSrcId()).findFirst().get(); + Types.Field dsrField = result.stream().filter(f -> f.fieldId() == pchange.getDsrId()).findFirst().orElse(null); + // we remove srcField first + result.remove(srcField); + switch (pchange.type()) { + case AFTER: + // add srcField after dsrField + result.add(result.indexOf(dsrField) + 1, srcField); + break; + case BEFORE: + // add srcField before dsrField + result.add(result.indexOf(dsrField), srcField); + break; + case FIRST: + result.addFirst(srcField); + break; + default: + // should not reach here + } + } + } + return result; + } + + public static String getParentName(String fullColName) { + int offset = fullColName.lastIndexOf("."); + return offset > 0 ? fullColName.substring(0, offset) : ""; + } + + public static String getLeafName(String fullColName) { + int offset = fullColName.lastIndexOf("."); + return offset > 0 ? fullColName.substring(offset + 1) : fullColName; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java new file mode 100644 index 0000000000000..d941b27328aba --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java @@ -0,0 +1,443 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.convert; + +import org.apache.avro.JsonProperties; +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.hudi.internal.schema.HoodieSchemaException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; + +import java.util.ArrayList; +import java.util.Deque; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.apache.avro.Schema.Type.UNION; + +/** + * Auxiliary class. 
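Before moving on to the Avro converter below, a tiny hypothetical illustration of the name-splitting helpers getParentName/getLeafName introduced just above:

import org.apache.hudi.internal.schema.action.TableChangesHelper;

public class NameSplitSketch {
  public static void main(String[] args) {
    System.out.println(TableChangesHelper.getParentName("user.address.zip")); // user.address
    System.out.println(TableChangesHelper.getLeafName("user.address.zip"));   // zip
    System.out.println(TableChangesHelper.getParentName("id"));               // "" (top-level column has no parent)
  }
}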
+ * Converts an avro schema into InternalSchema, or convert InternalSchema to an avro schema + */ +public class AvroInternalSchemaConverter { + + /** + * Convert internalSchema to avro Schema. + * + * @param internalSchema internal schema. + * @param tableName the record name. + * @return an avro Schema. + */ + public static Schema convert(InternalSchema internalSchema, String tableName, String namespace) { + return buildAvroSchemaFromInternalSchema(internalSchema, tableName, namespace); + } + + public static Schema convert(InternalSchema internalSchema, String tableName) { + return buildAvroSchemaFromInternalSchema(internalSchema, tableName, ""); + } + + /** + * Convert RecordType to avro Schema. + * + * @param type internal schema. + * @param name the record name. + * @return an avro Schema. + */ + public static Schema convert(Types.RecordType type, String name) { + return buildAvroSchemaFromType(type, name); + } + + /** + * Convert internal type to avro Schema. + * + * @param type internal type. + * @param name the record name. + * @return an avro Schema. + */ + public static Schema convert(Type type, String name) { + return buildAvroSchemaFromType(type, name); + } + + /** Convert an avro schema into internal type. */ + public static Type convertToField(Schema schema) { + return buildTypeFromAvroSchema(schema); + } + + /** Convert an avro schema into internalSchema. */ + public static InternalSchema convert(Schema schema) { + List fields = ((Types.RecordType) convertToField(schema)).fields(); + return new InternalSchema(fields); + } + + /** Check whether current avro schema is optional?. */ + public static boolean isOptional(Schema schema) { + if (schema.getType() == UNION && schema.getTypes().size() == 2) { + return schema.getTypes().get(0).getType() == Schema.Type.NULL || schema.getTypes().get(1).getType() == Schema.Type.NULL; + } + return false; + } + + /** Returns schema with nullable true. */ + public static Schema nullableSchema(Schema schema) { + if (schema.getType() == UNION) { + if (!isOptional(schema)) { + throw new HoodieSchemaException(String.format("Union schemas are not supported: %s", schema)); + } + return schema; + } else { + return Schema.createUnion(Schema.create(Schema.Type.NULL), schema); + } + } + + /** + * Build hudi type from avro schema. + * + * @param schema a avro schema. + * @return a hudi type. + */ + public static Type buildTypeFromAvroSchema(Schema schema) { + // set flag to check this has not been visited. + Deque visited = new LinkedList(); + AtomicInteger nextId = new AtomicInteger(1); + return visitAvroSchemaToBuildType(schema, visited, true, nextId); + } + + /** + * Converts an avro schema into hudi type. + * + * @param schema a avro schema. + * @param visited track the visit node when do traversal for avro schema; used to check if the name of avro record schema is correct. + * @param firstVisitRoot track whether the current visited schema node is a root node. + * @param nextId a initial id which used to create id for all fields. + * @return a hudi type match avro schema. 
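To make the conversion entry points above concrete, a hedged round-trip sketch; the Avro schema literal and record name are invented, while the converter methods are the ones declared above.

import org.apache.avro.Schema;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;

public class ConvertSketch {
  public static void main(String[] args) {
    String avroJson = "{\"type\":\"record\",\"name\":\"rec\",\"fields\":["
        + "{\"name\":\"id\",\"type\":\"long\"},"
        + "{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null}]}";
    Schema avroSchema = new Schema.Parser().parse(avroJson);

    // Avro -> internal schema (field ids are assigned while visiting the schema)
    InternalSchema internal = AvroInternalSchemaConverter.convert(avroSchema);

    // internal schema -> Avro again, under a given record name
    Schema roundTripped = AvroInternalSchemaConverter.convert(internal, "rec");
    System.out.println(roundTripped.toString(true));
  }
}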
+ */ + private static Type visitAvroSchemaToBuildType(Schema schema, Deque visited, Boolean firstVisitRoot, AtomicInteger nextId) { + switch (schema.getType()) { + case RECORD: + String name = schema.getFullName(); + if (visited.contains(name)) { + throw new HoodieSchemaException(String.format("cannot convert recursive avro record %s", name)); + } + visited.push(name); + List fields = schema.getFields(); + List fieldTypes = new ArrayList<>(fields.size()); + int nextAssignId = nextId.get(); + // when first visit root record, set nextAssignId = 0; + if (firstVisitRoot) { + nextAssignId = 0; + } + nextId.set(nextAssignId + fields.size()); + fields.stream().forEach(field -> { + fieldTypes.add(visitAvroSchemaToBuildType(field.schema(), visited, false, nextId)); + }); + visited.pop(); + List internalFields = new ArrayList<>(fields.size()); + + for (int i = 0; i < fields.size(); i++) { + Schema.Field field = fields.get(i); + Type fieldType = fieldTypes.get(i); + internalFields.add(Types.Field.get(nextAssignId, AvroInternalSchemaConverter.isOptional(field.schema()), field.name(), fieldType, field.doc())); + nextAssignId += 1; + } + return Types.RecordType.get(internalFields); + case UNION: + List fTypes = new ArrayList<>(); + schema.getTypes().stream().forEach(t -> { + fTypes.add(visitAvroSchemaToBuildType(t, visited, false, nextId)); + }); + return fTypes.get(0) == null ? fTypes.get(1) : fTypes.get(0); + case ARRAY: + Schema elementSchema = schema.getElementType(); + int elementId = nextId.get(); + nextId.set(elementId + 1); + Type elementType = visitAvroSchemaToBuildType(elementSchema, visited, false, nextId); + return Types.ArrayType.get(elementId, AvroInternalSchemaConverter.isOptional(schema.getElementType()), elementType); + case MAP: + int keyId = nextId.get(); + int valueId = keyId + 1; + nextId.set(valueId + 1); + Type valueType = visitAvroSchemaToBuildType(schema.getValueType(), visited, false, nextId); + return Types.MapType.get(keyId, valueId, Types.StringType.get(), valueType, AvroInternalSchemaConverter.isOptional(schema.getValueType())); + default: + return visitAvroPrimitiveToBuildInternalType(schema); + } + } + + private static Type visitAvroPrimitiveToBuildInternalType(Schema primitive) { + LogicalType logical = primitive.getLogicalType(); + if (logical != null) { + String name = logical.getName(); + if (logical instanceof LogicalTypes.Decimal) { + return Types.DecimalType.get( + ((LogicalTypes.Decimal) logical).getPrecision(), + ((LogicalTypes.Decimal) logical).getScale()); + + } else if (logical instanceof LogicalTypes.Date) { + return Types.DateType.get(); + + } else if ( + logical instanceof LogicalTypes.TimeMillis + || logical instanceof LogicalTypes.TimeMicros) { + return Types.TimeType.get(); + + } else if ( + logical instanceof LogicalTypes.TimestampMillis + || logical instanceof LogicalTypes.TimestampMicros) { + return Types.TimestampType.get(); + } else if (LogicalTypes.uuid().getName().equals(name)) { + return Types.UUIDType.get(); + } + } + + switch (primitive.getType()) { + case BOOLEAN: + return Types.BooleanType.get(); + case INT: + return Types.IntType.get(); + case LONG: + return Types.LongType.get(); + case FLOAT: + return Types.FloatType.get(); + case DOUBLE: + return Types.DoubleType.get(); + case STRING: + case ENUM: + return Types.StringType.get(); + case FIXED: + return Types.FixedType.getFixed(primitive.getFixedSize()); + case BYTES: + return Types.BinaryType.get(); + case NULL: + return null; + default: + throw new 
UnsupportedOperationException("Unsupported primitive type: " + primitive); + } + } + + /** + * Converts hudi type into an Avro Schema. + * + * @param type a hudi type. + * @param recordName the record name + * @return a Avro schema match this type + */ + public static Schema buildAvroSchemaFromType(Type type, String recordName) { + Map cache = new HashMap<>(); + return visitInternalSchemaToBuildAvroSchema(type, cache, recordName, ""); + } + + /** + * Converts hudi internal Schema into an Avro Schema. + * + * @param schema a hudi internal Schema. + * @param recordName the record name + * @return a Avro schema match hudi internal schema. + */ + public static Schema buildAvroSchemaFromInternalSchema(InternalSchema schema, String recordName, String namespace) { + Map cache = new HashMap<>(); + return visitInternalSchemaToBuildAvroSchema(schema.getRecord(), cache, recordName, namespace); + } + + /** + * Converts hudi type into an Avro Schema. + * + * @param type a hudi type. + * @param cache use to cache intermediate convert result to save cost. + * @param recordName the record name + * @return a Avro schema match this type + */ + private static Schema visitInternalSchemaToBuildAvroSchema( + Type type, Map cache, String recordName, String namespace) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List schemas = new ArrayList<>(); + record.fields().forEach(f -> { + Schema tempSchema = visitInternalSchemaToBuildAvroSchema( + f.type(), cache, recordName + "_" + f.name(), namespace); + // convert tempSchema + Schema result = f.isOptional() ? AvroInternalSchemaConverter.nullableSchema(tempSchema) : tempSchema; + schemas.add(result); + }); + // check visited + Schema recordSchema; + recordSchema = cache.get(record); + if (recordSchema != null) { + return recordSchema; + } + recordSchema = visitInternalRecordToBuildAvroRecord(record, schemas, recordName, namespace); + cache.put(record, recordSchema); + return recordSchema; + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + Schema elementSchema; + elementSchema = visitInternalSchemaToBuildAvroSchema(array.elementType(), cache, recordName, namespace); + Schema arraySchema; + arraySchema = cache.get(array); + if (arraySchema != null) { + return arraySchema; + } + arraySchema = visitInternalArrayToBuildAvroArray(array, elementSchema); + cache.put(array, arraySchema); + return arraySchema; + case MAP: + Types.MapType map = (Types.MapType) type; + Schema keySchema; + Schema valueSchema; + keySchema = visitInternalSchemaToBuildAvroSchema(map.keyType(), cache, recordName, namespace); + valueSchema = visitInternalSchemaToBuildAvroSchema(map.valueType(), cache, recordName, namespace); + Schema mapSchema; + mapSchema = cache.get(map); + if (mapSchema != null) { + return mapSchema; + } + mapSchema = visitInternalMapToBuildAvroMap(map, keySchema, valueSchema); + cache.put(map, mapSchema); + return mapSchema; + default: + Schema primitiveSchema = visitInternalPrimitiveToBuildAvroPrimitiveType((Type.PrimitiveType) type); + cache.put(type, primitiveSchema); + return primitiveSchema; + } + } + + /** + * Converts hudi RecordType to Avro RecordType. 
+ * this is auxiliary function used by visitInternalSchemaToBuildAvroSchema + */ + private static Schema visitInternalRecordToBuildAvroRecord( + Types.RecordType record, List fieldSchemas, String recordName, String namespace) { + List fields = record.fields(); + List avroFields = new ArrayList<>(); + for (int i = 0; i < fields.size(); i++) { + Types.Field f = fields.get(i); + Schema.Field field = new Schema.Field(f.name(), fieldSchemas.get(i), f.doc(), f.isOptional() ? JsonProperties.NULL_VALUE : null); + avroFields.add(field); + } + return Schema.createRecord(recordName, null, namespace, false, avroFields); + } + + /** + * Converts hudi ArrayType to Avro ArrayType. + * this is auxiliary function used by visitInternalSchemaToBuildAvroSchema + */ + private static Schema visitInternalArrayToBuildAvroArray(Types.ArrayType array, Schema elementSchema) { + Schema result; + if (array.isElementOptional()) { + result = Schema.createArray(AvroInternalSchemaConverter.nullableSchema(elementSchema)); + } else { + result = Schema.createArray(elementSchema); + } + return result; + } + + /** + * Converts hudi MapType to Avro MapType. + * this is auxiliary function used by visitInternalSchemaToBuildAvroSchema + */ + private static Schema visitInternalMapToBuildAvroMap(Types.MapType map, Schema keySchema, Schema valueSchema) { + Schema mapSchema; + if (keySchema.getType() == Schema.Type.STRING) { + mapSchema = Schema.createMap(map.isValueOptional() ? AvroInternalSchemaConverter.nullableSchema(valueSchema) : valueSchema); + } else { + throw new HoodieSchemaException("only support StringType key for avro MapType"); + } + return mapSchema; + } + + /** + * Converts hudi PrimitiveType to Avro PrimitiveType. + * this is auxiliary function used by visitInternalSchemaToBuildAvroSchema + */ + private static Schema visitInternalPrimitiveToBuildAvroPrimitiveType(Type.PrimitiveType primitive) { + Schema primitiveSchema; + switch (primitive.typeId()) { + case BOOLEAN: + primitiveSchema = Schema.create(Schema.Type.BOOLEAN); + break; + case INT: + primitiveSchema = Schema.create(Schema.Type.INT); + break; + case LONG: + primitiveSchema = Schema.create(Schema.Type.LONG); + break; + case FLOAT: + primitiveSchema = Schema.create(Schema.Type.FLOAT); + break; + case DOUBLE: + primitiveSchema = Schema.create(Schema.Type.DOUBLE); + break; + case DATE: + primitiveSchema = LogicalTypes.date() + .addToSchema(Schema.create(Schema.Type.INT)); + break; + case TIME: + primitiveSchema = LogicalTypes.timeMicros() + .addToSchema(Schema.create(Schema.Type.LONG)); + break; + case TIMESTAMP: + primitiveSchema = LogicalTypes.timestampMicros() + .addToSchema(Schema.create(Schema.Type.LONG)); + break; + case STRING: + primitiveSchema = Schema.create(Schema.Type.STRING); + break; + case UUID: + primitiveSchema = LogicalTypes.uuid() + .addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)); + break; + case FIXED: + Types.FixedType fixed = (Types.FixedType) primitive; + primitiveSchema = Schema.createFixed("fixed_" + fixed.getFixedSize(), null, null, fixed.getFixedSize()); + break; + case BINARY: + primitiveSchema = Schema.create(Schema.Type.BYTES); + break; + case DECIMAL: + Types.DecimalType decimal = (Types.DecimalType) primitive; + primitiveSchema = LogicalTypes.decimal(decimal.precision(), decimal.scale()) + .addToSchema(Schema.createFixed( + "decimal_" + decimal.precision() + "_" + decimal.scale(), + null, null, computeMinBytesForPrecision(decimal.precision()))); + break; + default: + throw new UnsupportedOperationException( + 
"Unsupported type ID: " + primitive.typeId()); + } + return primitiveSchema; + } + + /** + * Return the minimum number of bytes needed to store a decimal with a give 'precision'. + * reference from Spark release 3.1 . + */ + private static int computeMinBytesForPrecision(int precision) { + int numBytes = 1; + while (Math.pow(2.0, 8 * numBytes - 1) < Math.pow(10.0, precision)) { + numBytes += 1; + } + return numBytes; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/io/AbstractInternalSchemaStorageManager.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/io/AbstractInternalSchemaStorageManager.java new file mode 100644 index 0000000000000..d4db68425fda8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/io/AbstractInternalSchemaStorageManager.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.io; + +import org.apache.hudi.common.util.Option; + +import java.util.List; + +abstract class AbstractInternalSchemaStorageManager { + + /** + * Persist history schema str. + */ + public abstract void persistHistorySchemaStr(String instantTime, String historySchemaStr); + + /** + * Get latest history schema string. + */ + public abstract String getHistorySchemaStr(); + + /** + * Get latest history schema string. + * Using give validCommits to validate all legal histroy Schema files, and return the latest one. + * If the passed valid commits is null or empty, valid instants will be fetched from the file-system and used. + */ + public abstract String getHistorySchemaStrByGivenValidCommits(List validCommits); + + /** + * Get internalSchema by using given versionId + * + * @param versionId schema version_id need to search + * @return internalSchema + */ + public abstract Option getSchemaByKey(String versionId); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/io/FileBasedInternalSchemaStorageManager.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/io/FileBasedInternalSchemaStorageManager.java new file mode 100644 index 0000000000000..6cca0728a8312 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/io/FileBasedInternalSchemaStorageManager.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.io; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; +import org.apache.hudi.internal.schema.utils.SerDeHelper; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.TreeMap; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.SCHEMA_COMMIT_ACTION; + +public class FileBasedInternalSchemaStorageManager extends AbstractInternalSchemaStorageManager { + private static final Logger LOG = LogManager.getLogger(FileBasedInternalSchemaStorageManager.class); + + public static final String SCHEMA_NAME = ".schema"; + private final Path baseSchemaPath; + private final Configuration conf; + private HoodieTableMetaClient metaClient; + + public FileBasedInternalSchemaStorageManager(Configuration conf, Path baseTablePath) { + Path metaPath = new Path(baseTablePath, ".hoodie"); + this.baseSchemaPath = new Path(metaPath, SCHEMA_NAME); + this.conf = conf; + } + + public FileBasedInternalSchemaStorageManager(HoodieTableMetaClient metaClient) { + Path metaPath = new Path(metaClient.getBasePath(), ".hoodie"); + this.baseSchemaPath = new Path(metaPath, SCHEMA_NAME); + this.conf = metaClient.getHadoopConf(); + this.metaClient = metaClient; + } + + // make metaClient build lazy + private HoodieTableMetaClient getMetaClient() { + if (metaClient == null) { + metaClient = HoodieTableMetaClient.builder().setBasePath(baseSchemaPath.getParent().getParent().toString()).setConf(conf).build(); + } + return metaClient; + } + + @Override + public void persistHistorySchemaStr(String instantTime, String historySchemaStr) { + cleanResidualFiles(); + HoodieActiveTimeline timeline = getMetaClient().getActiveTimeline(); + HoodieInstant hoodieInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, SCHEMA_COMMIT_ACTION, instantTime); + timeline.createNewInstant(hoodieInstant); + byte[] writeContent = historySchemaStr.getBytes(StandardCharsets.UTF_8); + timeline.transitionRequestedToInflight(hoodieInstant, Option.empty()); + timeline.saveAsComplete(new HoodieInstant(HoodieInstant.State.INFLIGHT, hoodieInstant.getAction(), hoodieInstant.getTimestamp()), Option.of(writeContent)); + LOG.info(String.format("persist history schema success on commit time: %s", instantTime)); 
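A hedged usage sketch for the storage manager above; it assumes tableBasePath points at an existing Hudi table (the manager resolves the active timeline from it), and the method and variable names here are invented.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;

public class SchemaStoreSketch {
  public static Option<InternalSchema> persistAndLoad(
      String tableBasePath, String instantTime, String historySchemaJson, String versionId) {
    FileBasedInternalSchemaStorageManager store =
        new FileBasedInternalSchemaStorageManager(new Configuration(), new Path(tableBasePath));
    // persists the history schema under <tableBasePath>/.hoodie/.schema, keyed by the instant time
    store.persistHistorySchemaStr(instantTime, historySchemaJson);
    // read the latest history string back (shown for completeness) ...
    String latestHistory = store.getHistorySchemaStr();
    // ... or resolve a single schema by its version id
    return store.getSchemaByKey(versionId);
  }
}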
+ } + + private void cleanResidualFiles() { + List validateCommits = getValidInstants(); + try { + FileSystem fs = baseSchemaPath.getFileSystem(conf); + if (fs.exists(baseSchemaPath)) { + List candidateSchemaFiles = Arrays.stream(fs.listStatus(baseSchemaPath)).filter(f -> f.isFile()) + .map(file -> file.getPath().getName()).collect(Collectors.toList()); + List residualSchemaFiles = candidateSchemaFiles.stream().filter(f -> !validateCommits.contains(f.split("\\.")[0])).collect(Collectors.toList()); + // clean residual files + residualSchemaFiles.forEach(f -> { + try { + fs.delete(new Path(getMetaClient().getSchemaFolderName(), f)); + } catch (IOException o) { + throw new HoodieException(o); + } + }); + } + } catch (IOException e) { + throw new HoodieException(e); + } + } + + public void cleanOldFiles(List validateCommits) { + try { + FileSystem fs = baseSchemaPath.getFileSystem(conf); + if (fs.exists(baseSchemaPath)) { + List candidateSchemaFiles = Arrays.stream(fs.listStatus(baseSchemaPath)).filter(f -> f.isFile()) + .map(file -> file.getPath().getName()).collect(Collectors.toList()); + List validateSchemaFiles = candidateSchemaFiles.stream().filter(f -> validateCommits.contains(f.split("\\.")[0])).collect(Collectors.toList()); + for (int i = 0; i < validateSchemaFiles.size(); i++) { + fs.delete(new Path(validateSchemaFiles.get(i))); + } + } + } catch (IOException e) { + throw new HoodieException(e); + } + } + + private List getValidInstants() { + return getMetaClient().getCommitsTimeline() + .filterCompletedInstants().getInstants().map(f -> f.getTimestamp()).collect(Collectors.toList()); + } + + @Override + public String getHistorySchemaStr() { + return getHistorySchemaStrByGivenValidCommits(Collections.EMPTY_LIST); + } + + @Override + public String getHistorySchemaStrByGivenValidCommits(List validCommits) { + List commitList = validCommits == null || validCommits.isEmpty() ? 
getValidInstants() : validCommits; + try { + FileSystem fs = FSUtils.getFs(baseSchemaPath.toString(), conf); + if (fs.exists(baseSchemaPath)) { + List validaSchemaFiles = Arrays.stream(fs.listStatus(baseSchemaPath)) + .filter(f -> f.isFile() && f.getPath().getName().endsWith(SCHEMA_COMMIT_ACTION)) + .map(file -> file.getPath().getName()).filter(f -> commitList.contains(f.split("\\.")[0])).sorted().collect(Collectors.toList()); + if (!validaSchemaFiles.isEmpty()) { + Path latestFilePath = new Path(baseSchemaPath, validaSchemaFiles.get(validaSchemaFiles.size() - 1)); + byte[] content; + try (FSDataInputStream is = fs.open(latestFilePath)) { + content = FileIOUtils.readAsByteArray(is); + LOG.info(String.format("read history schema success from file : %s", latestFilePath)); + return new String(content, StandardCharsets.UTF_8); + } catch (IOException e) { + throw new HoodieIOException("Could not read history schema from " + latestFilePath, e); + } + } + } + } catch (IOException io) { + throw new HoodieException(io); + } + LOG.info("failed to read history schema"); + return ""; + } + + @Override + public Option getSchemaByKey(String versionId) { + String historySchemaStr = getHistorySchemaStr(); + TreeMap treeMap; + if (historySchemaStr.isEmpty()) { + return Option.empty(); + } else { + treeMap = SerDeHelper.parseSchemas(historySchemaStr); + InternalSchema result = InternalSchemaUtils.searchSchema(Long.valueOf(versionId), treeMap); + if (result == null) { + return Option.empty(); + } + return Option.of(result); + } + } +} + + diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/AvroSchemaEvolutionUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/AvroSchemaEvolutionUtils.java new file mode 100644 index 0000000000000..413a3c4df1bc3 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/AvroSchemaEvolutionUtils.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.utils; + +import org.apache.avro.Schema; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.internal.schema.action.TableChanges; +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; + +import java.util.ArrayList; +import java.util.List; +import java.util.TreeMap; +import java.util.stream.Collectors; + +/** + * Utility methods to support evolve old avro schema based on a given schema. + */ +public class AvroSchemaEvolutionUtils { + + /** + * Support reconcile from a new avroSchema. 
+ * 1) incoming data has missing columns that were already defined in the table –> null values will be injected into missing columns + * 2) incoming data contains new columns not defined yet in the table -> columns will be added to the table schema (incoming dataframe?) + * 3) incoming data has missing columns that are already defined in the table and new columns not yet defined in the table -> + * new columns will be added to the table schema, missing columns will be injected with null values + * 4) support nested schema change. + * Notice: + * the incoming schema should not have delete/rename semantics. + * for example: incoming schema: int a, int b, int d; oldTableSchema int a, int b, int c, int d + * we must guarantee the column c is missing semantic, instead of delete semantic. + * @param incomingSchema implicitly evolution of avro when hoodie write operation + * @param oldTableSchema old internalSchema + * @return reconcile Schema + */ + public static InternalSchema reconcileSchema(Schema incomingSchema, InternalSchema oldTableSchema) { + InternalSchema inComingInternalSchema = AvroInternalSchemaConverter.convert(incomingSchema); + // do check, only support add column evolution + List colNamesFromIncoming = inComingInternalSchema.getAllColsFullName(); + List colNamesFromOldSchema = oldTableSchema.getAllColsFullName(); + List diffFromOldSchema = colNamesFromOldSchema.stream().filter(f -> !colNamesFromIncoming.contains(f)).collect(Collectors.toList()); + List newFields = new ArrayList<>(); + if (colNamesFromIncoming.size() == colNamesFromOldSchema.size() && diffFromOldSchema.size() == 0) { + return oldTableSchema; + } + List diffFromEvolutionSchema = colNamesFromIncoming.stream().filter(f -> !colNamesFromOldSchema.contains(f)).collect(Collectors.toList()); + // Remove redundancy from diffFromEvolutionSchema. + // for example, now we add a struct col in evolvedSchema, the struct col is " user struct " + // when we do diff operation: user, user.name, user.age will appeared in the resultSet which is redundancy, user.name and user.age should be excluded. + // deal with add operation + TreeMap finalAddAction = new TreeMap<>(); + for (int i = 0; i < diffFromEvolutionSchema.size(); i++) { + String name = diffFromEvolutionSchema.get(i); + int splitPoint = name.lastIndexOf("."); + String parentName = splitPoint > 0 ? name.substring(0, splitPoint) : ""; + if (!parentName.isEmpty() && diffFromEvolutionSchema.contains(parentName)) { + // find redundancy, skip it + continue; + } + finalAddAction.put(inComingInternalSchema.findIdByName(name), name); + } + + TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldTableSchema); + finalAddAction.entrySet().stream().forEach(f -> { + String name = f.getValue(); + int splitPoint = name.lastIndexOf("."); + String parentName = splitPoint > 0 ? name.substring(0, splitPoint) : ""; + String rawName = splitPoint > 0 ? name.substring(splitPoint + 1) : name; + // try to infer add position. 
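A hypothetical end-to-end sketch of the reconcile behavior enumerated above (the table has columns a, b, c; the incoming batch has a, b, d). The Avro literals are invented; the reconcile and convert calls are the ones introduced in this patch.

import org.apache.avro.Schema;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils;

public class ReconcileSketch {
  public static void main(String[] args) {
    Schema tableAvro = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"t\",\"fields\":["
            + "{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"},{\"name\":\"c\",\"type\":\"int\"}]}");
    Schema incomingAvro = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"t\",\"fields\":["
            + "{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"},{\"name\":\"d\",\"type\":\"int\"}]}");

    InternalSchema tableSchema = AvroInternalSchemaConverter.convert(tableAvro);
    // result keeps a, b, c and appends d as a new nullable column;
    // the absent c is treated as "missing", not as a delete
    InternalSchema reconciled = AvroSchemaEvolutionUtils.reconcileSchema(incomingAvro, tableSchema);
  }
}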
+ java.util.Optional inferPosition = + colNamesFromIncoming.stream().filter(c -> + c.lastIndexOf(".") == splitPoint + && c.startsWith(parentName) + && inComingInternalSchema.findIdByName(c) > inComingInternalSchema.findIdByName(name) + && oldTableSchema.findIdByName(c) > 0).sorted((s1, s2) -> oldTableSchema.findIdByName(s1) - oldTableSchema.findIdByName(s2)).findFirst(); + addChange.addColumns(parentName, rawName, inComingInternalSchema.findType(name), null); + inferPosition.map(i -> addChange.addPositionChange(name, i, "before")); + }); + + return SchemaChangeUtils.applyTableChanges2Schema(oldTableSchema, addChange); + } + + /** + * Canonical the nullability. + * Do not allow change cols Nullability field from optional to required. + * If above problem occurs, try to correct it. + * + * @param writeSchema writeSchema hoodie used to write data. + * @param readSchema read schema + * @return canonical Schema + */ + public static Schema canonicalizeColumnNullability(Schema writeSchema, Schema readSchema) { + if (writeSchema.getFields().isEmpty() || readSchema.getFields().isEmpty()) { + return writeSchema; + } + InternalSchema writeInternalSchema = AvroInternalSchemaConverter.convert(writeSchema); + InternalSchema readInternalSchema = AvroInternalSchemaConverter.convert(readSchema); + List colNamesWriteSchema = writeInternalSchema.getAllColsFullName(); + List colNamesFromReadSchema = readInternalSchema.getAllColsFullName(); + // try to deal with optional change. now when we use sparksql to update hudi table, + // sparksql Will change the col type from optional to required, this is a bug. + List candidateUpdateCols = colNamesWriteSchema.stream().filter(f -> { + boolean exist = colNamesFromReadSchema.contains(f); + if (exist && (writeInternalSchema.findField(f).isOptional() != readInternalSchema.findField(f).isOptional())) { + return true; + } else { + return false; + } + }).collect(Collectors.toList()); + if (candidateUpdateCols.isEmpty()) { + return writeSchema; + } + // try to correct all changes + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(writeInternalSchema); + candidateUpdateCols.stream().forEach(f -> updateChange.updateColumnNullability(f, true)); + Schema result = AvroInternalSchemaConverter.convert( + SchemaChangeUtils.applyTableChanges2Schema(writeInternalSchema, updateChange), + writeSchema.getName(), writeSchema.getNamespace()); + return result; + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/InternalSchemaUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/InternalSchemaUtils.java new file mode 100644 index 0000000000000..c799c236d0db0 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/InternalSchemaUtils.java @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.utils; + +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.internal.schema.HoodieSchemaException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.internal.schema.Types.Field; + +import java.util.ArrayList; +import java.util.Deque; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.SortedMap; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Util methods to help us do some operations on InternalSchema. + * eg: column prune, filter rebuild for query engine... + */ +public class InternalSchemaUtils { + + private InternalSchemaUtils() { + } + + /** + * Create project internalSchema, based on the project names which produced by query engine. + * support nested project. + * + * @param schema a internal schema. + * @param names project names produced by query engine. + * @return a project internalSchema. + */ + public static InternalSchema pruneInternalSchema(InternalSchema schema, List names) { + // do check + List prunedIds = names.stream().map(name -> { + int id = schema.findIdByName(name); + if (id == -1) { + throw new IllegalArgumentException(String.format("cannot prune col: %s which not exisit in hudi table", name)); + } + return id; + }).collect(Collectors.toList()); + // find top parent field ID. eg: a.b.c, f.g.h, only collect id of a and f ignore all child field. + List topParentFieldIds = new ArrayList<>(); + names.stream().forEach(f -> { + int id = schema.findIdByName(f.split("\\.")[0]); + if (!topParentFieldIds.contains(id)) { + topParentFieldIds.add(id); + } + }); + return pruneInternalSchemaByID(schema, prunedIds, topParentFieldIds); + } + + /** + * Create project internalSchema. + * support nested project. + * + * @param schema a internal schema. + * @param fieldIds project col field_ids. + * @return a project internalSchema. + */ + public static InternalSchema pruneInternalSchemaByID(InternalSchema schema, List fieldIds, List topParentFieldIds) { + Types.RecordType recordType = (Types.RecordType)pruneType(schema.getRecord(), fieldIds); + // reorder top parent fields, since the recordType.fields() produced by pruneType maybe out of order. + List newFields = new ArrayList<>(); + if (topParentFieldIds != null && !topParentFieldIds.isEmpty()) { + for (int id : topParentFieldIds) { + Types.Field f = recordType.field(id); + if (f != null) { + newFields.add(f); + } else { + throw new HoodieSchemaException(String.format("cannot find pruned id %s in currentSchema %s", id, schema.toString())); + } + } + } + return new InternalSchema(newFields.isEmpty() ? recordType.fields() : newFields); + } + + /** + * Project hudi type by projected cols field_ids + * this is auxiliary function used by pruneInternalSchema. 
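A brief hypothetical sketch of column pruning with the method above; the projected column names are invented and must exist in the table schema, otherwise pruneInternalSchema throws.

import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;

import java.util.Arrays;

public class PruneSketch {
  public static InternalSchema projectColumns(InternalSchema tableSchema) {
    // keep only the columns the query engine asked for; nested names like "user.name" are supported
    return InternalSchemaUtils.pruneInternalSchema(tableSchema, Arrays.asList("id", "user.name"));
  }
}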
+ */ + private static Type pruneType(Type type, List fieldIds) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List fields = record.fields(); + List newTypes = new ArrayList<>(); + for (Types.Field f : fields) { + Type newType = pruneType(f.type(), fieldIds); + if (fieldIds.contains(f.fieldId())) { + newTypes.add(f.type()); + } else if (newType != null) { + newTypes.add(newType); + } else { + newTypes.add(null); + } + } + boolean changed = false; + List newFields = new ArrayList<>(); + for (int i = 0; i < fields.size(); i++) { + Types.Field oldField = fields.get(i); + Type newType = newTypes.get(i); + if (oldField.type() == newType) { + newFields.add(oldField); + } else if (newType != null) { + changed = true; + newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc())); + } + } + if (newFields.isEmpty()) { + return null; + } + if (newFields.size() == fields.size() && !changed) { + return record; + } else { + return Types.RecordType.get(newFields); + } + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + Type newElementType = pruneType(array.elementType(), fieldIds); + if (fieldIds.contains(array.elementId())) { + return array; + } else if (newElementType != null) { + if (array.elementType() == newElementType) { + return array; + } + return Types.ArrayType.get(array.elementId(), array.isElementOptional(), newElementType); + } + return null; + case MAP: + Types.MapType map = (Types.MapType) type; + Type newValueType = pruneType(map.valueType(), fieldIds); + if (fieldIds.contains(map.valueId())) { + return map; + } else if (newValueType != null) { + if (map.valueType() == newValueType) { + return map; + } + return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, map.isValueOptional()); + } + return null; + default: + return null; + } + } + + /** + * A helper function to help correct the colName of pushed filters. + * + * @param name origin col name from pushed filters. + * @param fileSchema the real schema of avro/parquet file. + * @param querySchema the query schema which query engine produced. + * @return a corrected name. + */ + public static String reBuildFilterName(String name, InternalSchema fileSchema, InternalSchema querySchema) { + int nameId = querySchema.findIdByName(name); + if (nameId == -1) { + throw new IllegalArgumentException(String.format("cannot found filter col name:%s from querySchema: %s", name, querySchema)); + } + if (fileSchema.findField(nameId) == null) { + // added operation found + // the read file does not contain current col, so current colFilter is invalid + return ""; + } else { + if (name.equals(fileSchema.findfullName(nameId))) { + // no change happened on current col + return name; + } else { + // find rename operation on current col + // return the name from fileSchema + return fileSchema.findfullName(nameId); + } + } + } + + /** + * Collect all type changed cols to build a colPosition -> (newColType, oldColType) map. + * only collect top level col changed. eg: a is a nest field(record(b int, d long), now a.b is changed from int to long, + * only a will be collected, a.b will excluded. + * + * @param schema a type changed internalSchema + * @param oldSchema an old internalSchema. + * @return a map. 
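For completeness, a thin hypothetical wrapper showing how the type-change collection above might be invoked; the generic signature is inferred from the method body, and the wrapper name is invented.

import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;

import java.util.Map;

public class TypeChangeSketch {
  public static Map<Integer, Pair<Type, Type>> diffTypes(InternalSchema current, InternalSchema previous) {
    // key: position of the changed top-level column in `current`,
    // value: (new type, old type), e.g. a "price" column widened from int to long
    return InternalSchemaUtils.collectTypeChangedCols(current, previous);
  }
}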
+ */ + public static Map> collectTypeChangedCols(InternalSchema schema, InternalSchema oldSchema) { + Set ids = schema.getAllIds(); + Set otherIds = oldSchema.getAllIds(); + Map> result = new HashMap<>(); + ids.stream().filter(f -> otherIds.contains(f)).forEach(f -> { + if (!schema.findType(f).equals(oldSchema.findType(f))) { + String[] fieldNameParts = schema.findfullName(f).split("\\."); + String[] otherFieldNameParts = oldSchema.findfullName(f).split("\\."); + String parentName = fieldNameParts[0]; + String otherParentName = otherFieldNameParts[0]; + if (fieldNameParts.length == otherFieldNameParts.length && schema.findIdByName(parentName) == oldSchema.findIdByName(otherParentName)) { + int index = schema.findIdByName(parentName); + int position = schema.getRecord().fields().stream().map(s -> s.fieldId()).collect(Collectors.toList()).indexOf(index); + if (!result.containsKey(position)) { + result.put(position, Pair.of(schema.findType(parentName), oldSchema.findType(otherParentName))); + } + } + } + }); + return result; + } + + /** + * Search target internalSchema by version number. + * + * @param versionId the internalSchema version to be search. + * @param internalSchemas internalSchemas to be searched. + * @return a internalSchema. + */ + public static InternalSchema searchSchema(long versionId, List internalSchemas) { + TreeMap treeMap = new TreeMap<>(); + internalSchemas.forEach(s -> treeMap.put(s.schemaId(), s)); + return searchSchema(versionId, treeMap); + } + + /** + * Search target internalSchema by version number. + * + * @param versionId the internalSchema version to be search. + * @param treeMap internalSchemas collections to be searched. + * @return a internalSchema. + */ + public static InternalSchema searchSchema(long versionId, TreeMap treeMap) { + if (treeMap.containsKey(versionId)) { + return treeMap.get(versionId); + } else { + SortedMap headMap = treeMap.headMap(versionId); + if (!headMap.isEmpty()) { + return headMap.get(headMap.lastKey()); + } + } + return InternalSchema.getEmptyInternalSchema(); + } + + public static String createFullName(String name, Deque fieldNames) { + String result = name; + if (!fieldNames.isEmpty()) { + List parentNames = new ArrayList<>(); + fieldNames.descendingIterator().forEachRemaining(parentNames::add); + result = parentNames.stream().collect(Collectors.joining(".")) + "." + result; + } + return result; + } + + /** + * Try to find all renamed cols between oldSchema and newSchema. + * + * @param oldSchema oldSchema + * @param newSchema newSchema which modified from oldSchema + * @return renameCols Map. (k, v) -> (colNameFromNewSchema, colNameLastPartFromOldSchema) + */ + public static Map collectRenameCols(InternalSchema oldSchema, InternalSchema newSchema) { + List colNamesFromWriteSchema = oldSchema.getAllColsFullName(); + return colNamesFromWriteSchema.stream().filter(f -> { + int filedIdFromWriteSchema = oldSchema.findIdByName(f); + // try to find the cols which has the same id, but have different colName; + return newSchema.getAllIds().contains(filedIdFromWriteSchema) && !newSchema.findfullName(filedIdFromWriteSchema).equalsIgnoreCase(f); + }).collect(Collectors.toMap(e -> newSchema.findfullName(oldSchema.findIdByName(e)), e -> { + int lastDotIndex = e.lastIndexOf("."); + return e.substring(lastDotIndex == -1 ? 
0 : lastDotIndex + 1); + })); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java new file mode 100644 index 0000000000000..d719008042021 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.utils; + +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.internal.schema.action.TableChanges; +import org.apache.hudi.internal.schema.action.TableChangesHelper; + +import java.util.ArrayList; +import java.util.List; + +/** + * Helper methods for schema Change. + */ +public class SchemaChangeUtils { + private SchemaChangeUtils() { + + } + + /** + * Whether to allow the column type to be updated. + * now only support: + * int => long/float/double/string + * long => float/double/string + * float => double/String + * double => String/Decimal + * Decimal => Decimal/String + * String => date/decimal + * date => String + * TODO: support more type update. + * + * @param src origin column type. + * @param dsr new column type. + * @return whether to allow the column type to be updated. 
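A quick hypothetical check of the promotion matrix above; the decimal case assumes DecimalType.isWiderThan treats a larger precision at the same scale as wider, per the check in the implementation.

import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.utils.SchemaChangeUtils;

public class TypePromotionSketch {
  public static void main(String[] args) {
    // widening promotions are allowed ...
    System.out.println(SchemaChangeUtils.isTypeUpdateAllow(Types.IntType.get(), Types.LongType.get()));     // true
    System.out.println(SchemaChangeUtils.isTypeUpdateAllow(Types.FloatType.get(), Types.DoubleType.get())); // true
    System.out.println(SchemaChangeUtils.isTypeUpdateAllow(
        Types.DecimalType.get(10, 2), Types.DecimalType.get(12, 2)));                                       // true (wider precision)
    // ... narrowing is not
    System.out.println(SchemaChangeUtils.isTypeUpdateAllow(Types.LongType.get(), Types.IntType.get()));     // false
  }
}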
+ */ + public static boolean isTypeUpdateAllow(Type src, Type dsr) { + if (src.isNestedType() || dsr.isNestedType()) { + throw new IllegalArgumentException("only support update primitive type"); + } + if (src.equals(dsr)) { + return true; + } + switch (src.typeId()) { + case INT: + return dsr == Types.LongType.get() || dsr == Types.FloatType.get() + || dsr == Types.DoubleType.get() || dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL; + case LONG: + return dsr == Types.FloatType.get() || dsr == Types.DoubleType.get() || dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL; + case FLOAT: + return dsr == Types.DoubleType.get() || dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL; + case DOUBLE: + return dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL; + case DATE: + return dsr == Types.StringType.get(); + case DECIMAL: + if (dsr.typeId() == Type.TypeID.DECIMAL) { + Types.DecimalType decimalSrc = (Types.DecimalType)src; + Types.DecimalType decimalDsr = (Types.DecimalType)dsr; + if (decimalDsr.isWiderThan(decimalSrc)) { + return true; + } + } else if (dsr.typeId() == Type.TypeID.STRING) { + return true; + } + break; + case STRING: + return dsr == Types.DateType.get() || dsr.typeId() == Type.TypeID.DECIMAL; + default: + return false; + } + return false; + } + + /** + * Apply all the DDL add operations to internalSchema to produce a new internalSchema. + * + * @param internalSchema origin internalSchema. + * @param adds a wrapper class for all the DDL add operations. + * @return a new internalSchema. + */ + public static InternalSchema applyTableChanges2Schema(InternalSchema internalSchema, TableChanges.ColumnAddChange adds) { + Types.RecordType newType = (Types.RecordType)applyTableChange2Type(internalSchema.getRecord(), adds); + // deal with root level changes + List newFields = TableChangesHelper.applyAddChange2Fields(newType.fields(), + adds.getParentId2AddCols().get(-1), adds.getPositionChangeMap().get(-1)); + return new InternalSchema(newFields); + } + + /** + * Apply all the DDL add operations to Type to produce a new internalSchema. + * do not call this method directly. expose this method only for UT. + * + * @param type origin hudi Type. + * @param adds a wrapper class for all the DDL add operations. + * @return a new internalSchema. + */ + public static Type applyTableChange2Type(Type type, TableChanges.ColumnAddChange adds) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List newTypes = new ArrayList<>(); + for (Types.Field f : record.fields()) { + Type newType = applyTableChange2Type(f.type(), adds); + // try to apply add + newTypes.add(newType.isNestedType() ? adds.applyAdd(f, newType) : newType); + } + List newFields = new ArrayList<>(); + boolean hasChanged = false; + for (int i = 0; i < newTypes.size(); i++) { + Type newType = newTypes.get(i); + Types.Field oldfield = record.fields().get(i); + if (oldfield.type() == newType) { + newFields.add(oldfield); + } else { + hasChanged = true; + newFields.add(Types.Field.get(oldfield.fieldId(), oldfield.isOptional(), oldfield.name(), newType, oldfield.doc())); + } + } + return hasChanged ? 
Types.RecordType.get(newFields) : record; + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + Type newElementType; + Types.Field elementField = array.field(array.elementId()); + newElementType = applyTableChange2Type(array.elementType(), adds); + // try to apply add + newElementType = adds.applyAdd(elementField, newElementType); + if (newElementType == array.elementType()) { + return array; + } + return Types.ArrayType.get(array.elementId(), array.isElementOptional(), newElementType); + case MAP: + Types.MapType map = (Types.MapType) type; + Type newValueType; + Types.Field valueField = map.field(map.valueId()); + if (adds.getParentId2AddCols().containsKey(map.keyId())) { + throw new IllegalArgumentException("Cannot add fields to map keys: " + map); + } + newValueType = applyTableChange2Type(map.valueType(), adds); + // try to apply add + newValueType = adds.applyAdd(valueField, newValueType); + if (newValueType == map.valueType()) { + return map; + } + return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, map.isValueOptional()); + default: + return type; + } + } + + /** + * Apply all the DDL delete operations to internalSchema to produce a new internalSchema. + * + * @param internalSchema origin internalSchema. + * @param deletes a wrapper class for all the DDL delete operations. + * @return a new internalSchema. + */ + public static InternalSchema applyTableChanges2Schema(InternalSchema internalSchema, TableChanges.ColumnDeleteChange deletes) { + Types.RecordType newType = (Types.RecordType)applyTableChange2Type(internalSchema.getRecord(), deletes); + return new InternalSchema(newType.fields()); + } + + /** + * Apply all the DDL delete operations to Type to produce a new internalSchema. + * do not call this method directly. expose this method only for UT. + * + * @param type origin type. + * @param deletes a wrapper class for all the DDL delete operations. + * @return a new internalSchema. 
+ */ + private static Type applyTableChange2Type(Type type, TableChanges.ColumnDeleteChange deletes) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List fields = new ArrayList<>(); + for (Types.Field f : record.fields()) { + Type newType = applyTableChange2Type(f.type(), deletes); + // apply delete + newType = deletes.applyDelete(f.fieldId(), newType); + if (newType != null) { + fields.add(Types.Field.get(f.fieldId(), f.isOptional(), f.name(), newType, f.doc())); + } + } + if (fields.isEmpty()) { + throw new UnsupportedOperationException("cannot support delete all columns from Struct"); + } + return Types.RecordType.get(fields); + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + Type newElementType = applyTableChange2Type(array.elementType(), deletes); + newElementType = deletes.applyDelete(array.elementId(), newElementType); + if (newElementType == null) { + throw new IllegalArgumentException(String.format("cannot delete element from arrayType: %s", array)); + } + return Types.ArrayType.get(array.elementId(), array.isElementOptional(), newElementType); + case MAP: + Types.MapType map = (Types.MapType) type; + int keyId = map.fields().get(0).fieldId(); + if (deletes.getDeletes().contains(keyId)) { + throw new IllegalArgumentException(String.format("cannot delete key from mapType: %s", map)); + } + Type newValueType = applyTableChange2Type(map.valueType(), deletes); + newValueType = deletes.applyDelete(map.valueId(), newValueType); + if (newValueType == null) { + throw new IllegalArgumentException(String.format("cannot delete value from mapType: %s", map)); + } + return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, map.isValueOptional()); + default: + return type; + } + } + + /** + * Apply all the DDL update operations to internalSchema to produce a new internalSchema. + * + * @param internalSchema origin internalSchema. + * @param updates a wrapper class for all the DDL update operations. + * @return a new internalSchema. + */ + public static InternalSchema applyTableChanges2Schema(InternalSchema internalSchema, TableChanges.ColumnUpdateChange updates) { + Types.RecordType newType = (Types.RecordType)applyTableChange2Type(internalSchema.getRecord(), updates); + // deal with root level changes + List newFields = TableChangesHelper.applyAddChange2Fields(newType.fields(), + new ArrayList<>(), updates.getPositionChangeMap().get(-1)); + return new InternalSchema(newFields); + } + + /** + * Apply all the DDL update operations to type to produce a new internalSchema. + * do not call this method directly. expose this method only for UT. + * + * @param type origin internalSchema. + * @param updates a wrapper class for all the DDL update operations. + * @return a new internalSchema. 
+ */ + private static Type applyTableChange2Type(Type type, TableChanges.ColumnUpdateChange updates) { + switch (type.typeId()) { + case RECORD: + Types.RecordType record = (Types.RecordType) type; + List newTypes = new ArrayList<>(); + for (Types.Field f : record.fields()) { + Type newType = applyTableChange2Type(f.type(), updates); + newTypes.add(updates.applyUpdates(f, newType)); + } + List newFields = new ArrayList<>(); + for (int i = 0; i < newTypes.size(); i++) { + Type newType = newTypes.get(i); + Types.Field oldField = record.fields().get(i); + Types.Field updateField = updates.getUpdates().get(oldField.fieldId()); + if (updateField != null) { + newFields.add(Types.Field.get(oldField.fieldId(), updateField.isOptional(), updateField.name(), newType, updateField.doc())); + } else if (!oldField.type().equals(newType)) { + newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc())); + } else { + newFields.add(oldField); + } + } + return Types.RecordType.get(newFields); + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + Type newElementType; + Types.Field elementField = array.fields().get(0); + newElementType = applyTableChange2Type(array.elementType(), updates); + newElementType = updates.applyUpdates(elementField, newElementType); + Types.Field elementUpdate = updates.getUpdates().get(elementField.fieldId()); + boolean optional = elementUpdate == null ? array.isElementOptional() : elementUpdate.isOptional(); + if (optional == elementField.isOptional() && array.elementType() == newElementType) { + return array; + } + return Types.ArrayType.get(array.elementId(), optional, newElementType); + case MAP: + Types.MapType map = (Types.MapType) type; + Types.Field valueFiled = map.fields().get(1); + Type newValueType; + newValueType = applyTableChange2Type(map.valueType(), updates); + newValueType = updates.applyUpdates(valueFiled, newValueType); + Types.Field valueUpdate = updates.getUpdates().get(valueFiled.fieldId()); + boolean valueOptional = valueUpdate == null ? map.isValueOptional() : valueUpdate.isOptional(); + if (valueOptional == map.isValueOptional() && map.valueType() == newValueType) { + return map; + } + return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, valueOptional); + default: + return type; + } + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SerDeHelper.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SerDeHelper.java new file mode 100644 index 0000000000000..aebda533e324a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SerDeHelper.java @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.internal.schema.utils; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; + +import java.io.IOException; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class SerDeHelper { + private SerDeHelper() { + + } + + public static final String LATEST_SCHEMA = "latest_schema"; + public static final String SCHEMAS = "schemas"; + private static final String MAX_COLUMN_ID = "max_column_id"; + private static final String VERSION_ID = "version_id"; + private static final String TYPE = "type"; + private static final String RECORD = "record"; + private static final String ARRAY = "array"; + private static final String MAP = "map"; + private static final String FIELDS = "fields"; + private static final String ELEMENT = "element"; + private static final String KEY = "key"; + private static final String VALUE = "value"; + private static final String DOC = "doc"; + private static final String NAME = "name"; + private static final String ID = "id"; + private static final String ELEMENT_ID = "element_id"; + private static final String KEY_ID = "key_id"; + private static final String VALUE_ID = "value_id"; + private static final String OPTIONAL = "optional"; + private static final String ELEMENT_OPTIONAL = "element_optional"; + private static final String VALUE_OPTIONAL = "value_optional"; + + private static final Pattern FIXED = Pattern.compile("fixed\\[(\\d+)\\]"); + private static final Pattern DECIMAL = Pattern.compile("decimal\\((\\d+),\\s+(\\d+)\\)"); + + /** + * Convert history internalSchemas to json. + * this is used when save history schemas into hudi. + * + * @param internalSchemas history internal schemas + * @return a string + */ + public static String toJson(List internalSchemas) { + try { + StringWriter writer = new StringWriter(); + JsonGenerator generator = (new JsonFactory()).createGenerator(writer); + generator.writeStartObject(); + generator.writeArrayFieldStart(SCHEMAS); + for (InternalSchema schema : internalSchemas) { + toJson(schema, generator); + } + generator.writeEndArray(); + generator.writeEndObject(); + generator.flush(); + return writer.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Convert internalSchemas to json. 
+ * + * @param internalSchema a internal schema + * @return a string + */ + public static String toJson(InternalSchema internalSchema) { + if (internalSchema == null || internalSchema.isEmptySchema()) { + return ""; + } + try { + StringWriter writer = new StringWriter(); + JsonGenerator generator = (new JsonFactory()).createGenerator(writer); + toJson(internalSchema, generator); + generator.flush(); + return writer.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void toJson(InternalSchema internalSchema, JsonGenerator generator) throws IOException { + toJson(internalSchema.getRecord(), internalSchema.getMaxColumnId(), internalSchema.schemaId(), generator); + } + + private static void toJson(Types.RecordType record, Integer maxColumnId, Long versionId, JsonGenerator generator) throws IOException { + generator.writeStartObject(); + if (maxColumnId != null) { + generator.writeNumberField(MAX_COLUMN_ID, maxColumnId); + } + if (versionId != null) { + generator.writeNumberField(VERSION_ID, versionId); + } + generator.writeStringField(TYPE, RECORD); + generator.writeArrayFieldStart(FIELDS); + for (Types.Field field : record.fields()) { + generator.writeStartObject(); + generator.writeNumberField(ID, field.fieldId()); + generator.writeStringField(NAME, field.name()); + generator.writeBooleanField(OPTIONAL, field.isOptional()); + generator.writeFieldName(TYPE); + toJson(field.type(), generator); + if (field.doc() != null) { + generator.writeStringField(DOC, field.doc()); + } + generator.writeEndObject(); + } + generator.writeEndArray(); + generator.writeEndObject(); + } + + private static void toJson(Type type, JsonGenerator generator) throws IOException { + switch (type.typeId()) { + case RECORD: + toJson((Types.RecordType) type, null, null, generator); + break; + case ARRAY: + Types.ArrayType array = (Types.ArrayType) type; + generator.writeStartObject(); + generator.writeStringField(TYPE, ARRAY); + generator.writeNumberField(ELEMENT_ID, array.elementId()); + generator.writeFieldName(ELEMENT); + toJson(array.elementType(), generator); + generator.writeBooleanField(ELEMENT_OPTIONAL, array.isElementOptional()); + generator.writeEndObject(); + break; + case MAP: + Types.MapType map = (Types.MapType) type; + generator.writeStartObject(); + generator.writeStringField(TYPE, MAP); + generator.writeNumberField(KEY_ID, map.keyId()); + generator.writeFieldName(KEY); + toJson(map.keyType(), generator); + generator.writeNumberField(VALUE_ID, map.valueId()); + generator.writeFieldName(VALUE); + toJson(map.valueType(), generator); + generator.writeBooleanField(VALUE_OPTIONAL, map.isValueOptional()); + generator.writeEndObject(); + break; + default: + if (!type.isNestedType()) { + generator.writeString(type.toString()); + } else { + throw new HoodieIOException(String.format("cannot write unknown types: %s", type)); + } + } + } + + private static Type parserTypeFromJson(JsonNode jsonNode) { + if (jsonNode.isTextual()) { + String type = jsonNode.asText().toLowerCase(Locale.ROOT); + // deal with fixed and decimal + Matcher fixed = FIXED.matcher(type); + if (fixed.matches()) { + return Types.FixedType.getFixed(Integer.parseInt(fixed.group(1))); + } + Matcher decimal = DECIMAL.matcher(type); + if (decimal.matches()) { + return Types.DecimalType.get( + Integer.parseInt(decimal.group(1)), + Integer.parseInt(decimal.group(2))); + } + // deal with other type + switch (Type.fromValue(type)) { + case BOOLEAN: + return Types.BooleanType.get(); + case INT: + return 
Types.IntType.get(); + case LONG: + return Types.LongType.get(); + case FLOAT: + return Types.FloatType.get(); + case DOUBLE: + return Types.DoubleType.get(); + case DATE: + return Types.DateType.get(); + case TIME: + return Types.TimeType.get(); + case TIMESTAMP: + return Types.TimestampType.get(); + case STRING: + return Types.StringType.get(); + case UUID: + return Types.UUIDType.get(); + case BINARY: + return Types.BinaryType.get(); + default: + throw new IllegalArgumentException("cannot parser types from jsonNode"); + } + } else if (jsonNode.isObject()) { + String typeStr = jsonNode.get(TYPE).asText(); + if (RECORD.equals(typeStr)) { + JsonNode fieldNodes = jsonNode.get(FIELDS); + Iterator iter = fieldNodes.elements(); + List fields = new ArrayList<>(); + while (iter.hasNext()) { + JsonNode field = iter.next(); + // extract + int id = field.get(ID).asInt(); + String name = field.get(NAME).asText(); + Type type = parserTypeFromJson(field.get(TYPE)); + String doc = field.has(DOC) ? field.get(DOC).asText() : null; + boolean optional = field.get(OPTIONAL).asBoolean(); + // build fields + fields.add(Types.Field.get(id, optional, name, type, doc)); + } + return Types.RecordType.get(fields); + } else if (ARRAY.equals(typeStr)) { + int elementId = jsonNode.get(ELEMENT_ID).asInt(); + Type elementType = parserTypeFromJson(jsonNode.get(ELEMENT)); + boolean optional = jsonNode.get(ELEMENT_OPTIONAL).asBoolean(); + return Types.ArrayType.get(elementId, optional, elementType); + } else if (MAP.equals(typeStr)) { + int keyId = jsonNode.get(KEY_ID).asInt(); + Type keyType = parserTypeFromJson(jsonNode.get(KEY)); + int valueId = jsonNode.get(VALUE_ID).asInt(); + Type valueType = parserTypeFromJson(jsonNode.get(VALUE)); + boolean optional = jsonNode.get(VALUE_OPTIONAL).asBoolean(); + return Types.MapType.get(keyId, valueId, keyType, valueType, optional); + } + } + throw new IllegalArgumentException(String.format("cannot parse type from jsonNode: %s", jsonNode)); + } + + /** + * Convert jsonNode to internalSchema. + * + * @param jsonNode a jsonNode. + * @return a internalSchema. + */ + public static InternalSchema fromJson(JsonNode jsonNode) { + Integer maxColumnId = !jsonNode.has(MAX_COLUMN_ID) ? null : jsonNode.get(MAX_COLUMN_ID).asInt(); + Long versionId = !jsonNode.has(VERSION_ID) ? null : jsonNode.get(VERSION_ID).asLong(); + Types.RecordType type = (Types.RecordType)parserTypeFromJson(jsonNode); + if (versionId == null) { + return new InternalSchema(type.fields()); + } else { + if (maxColumnId != null) { + return new InternalSchema(versionId, maxColumnId, type.fields()); + } else { + return new InternalSchema(versionId, type.fields()); + } + } + } + + /** + * Convert string to internalSchema. + * + * @param json a json string. + * @return a internalSchema. + */ + public static Option fromJson(String json) { + if (json == null || json.isEmpty()) { + return Option.empty(); + } + try { + return Option.of(fromJson((new ObjectMapper(new JsonFactory())).readValue(json, JsonNode.class))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Convert json string to history internalSchemas. + * TreeMap is used to hold history internalSchemas. 
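
Taken together, the toJson/fromJson/inheritSchemas/parseSchemas methods give a simple JSON-based persistence path for versioned schemas. A minimal round-trip sketch, using only the signatures visible in this file; the generic parameters elided by the diff are assumed to be InternalSchema, Long and String, as the surrounding code implies.

    import java.util.Arrays;
    import java.util.TreeMap;

    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.internal.schema.InternalSchema;
    import org.apache.hudi.internal.schema.Types;
    import org.apache.hudi.internal.schema.utils.SerDeHelper;

    public class SerDeHelperSketch {
      public static void main(String[] args) {
        InternalSchema v1 = new InternalSchema(1L, Arrays.asList(
            Types.Field.get(0, false, "id", Types.LongType.get(), null)));
        InternalSchema v2 = new InternalSchema(2L, Arrays.asList(
            Types.Field.get(0, false, "id", Types.LongType.get(), null),
            Types.Field.get(1, true, "name", Types.StringType.get(), null)));

        // Single-schema round trip: toJson(...) and fromJson(...) are inverses.
        String json = SerDeHelper.toJson(v1);
        Option<InternalSchema> restored = SerDeHelper.fromJson(json);
        System.out.println(restored.isPresent());

        // Keep the schema history as one JSON string and read it back keyed by version id.
        String history = SerDeHelper.toJson(Arrays.asList(v1));
        history = SerDeHelper.inheritSchemas(v2, history);
        TreeMap<Long, InternalSchema> byVersion = SerDeHelper.parseSchemas(history);
        System.out.println(byVersion.lastKey()); // expected: 2
      }
    }
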
+ * + * @param json a json string + * @return a TreeMap + */ + public static TreeMap parseSchemas(String json) { + TreeMap result = new TreeMap<>(); + try { + JsonNode jsonNode = (new ObjectMapper(new JsonFactory())).readValue(json, JsonNode.class); + if (!jsonNode.has(SCHEMAS)) { + throw new IllegalArgumentException(String.format("cannot parser schemas from current json string, missing key name: %s", SCHEMAS)); + } + JsonNode schemas = jsonNode.get(SCHEMAS); + Iterator iter = schemas.elements(); + while (iter.hasNext()) { + JsonNode schema = iter.next(); + InternalSchema current = fromJson(schema); + result.put(current.schemaId(), current); + } + } catch (IOException e) { + throw new HoodieException(e); + } + return result; + } + + /** + * Add the new schema to the historical schemas. + * use string operations to reduce overhead. + * + * @param newSchema a new internalSchema + * @param oldSchemas historical schemas string. + * @return a string. + */ + public static String inheritSchemas(InternalSchema newSchema, String oldSchemas) { + if (newSchema == null) { + return ""; + } + if (oldSchemas == null || oldSchemas.isEmpty()) { + return toJson(Arrays.asList(newSchema)); + } + String checkedString = "{\"schemas\":["; + if (!oldSchemas.startsWith("{\"schemas\":")) { + return ""; + } + String oldSchemasSuffix = oldSchemas.substring(checkedString.length()); + return checkedString + toJson(newSchema) + "," + oldSchemasSuffix; + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/visitor/InternalSchemaVisitor.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/visitor/InternalSchemaVisitor.java new file mode 100644 index 0000000000000..79a9410c65555 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/visitor/InternalSchemaVisitor.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.visitor; + +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; + +import java.util.List; + +/** + * Base class of schema visitor. 
+ */ +public abstract class InternalSchemaVisitor { + + public void beforeField(Types.Field field) { + } + + public void afterField(Types.Field field) { + } + + public void beforeArrayElement(Types.Field elementField) { + beforeField(elementField); + } + + public void afterArrayElement(Types.Field elementField) { + afterField(elementField); + } + + public void beforeMapKey(Types.Field keyField) { + beforeField(keyField); + } + + public void afterMapKey(Types.Field keyField) { + afterField(keyField); + } + + public void beforeMapValue(Types.Field valueField) { + beforeField(valueField); + } + + public void afterMapValue(Types.Field valueField) { + afterField(valueField); + } + + public T schema(InternalSchema schema, T recordResult) { + return null; + } + + public T record(Types.RecordType record, List fieldResults) { + return null; + } + + public T field(Types.Field field, T fieldResult) { + return null; + } + + public T array(Types.ArrayType array, T elementResult) { + return null; + } + + public T map(Types.MapType map, T keyResult, T valueResult) { + return null; + } + + public T primitive(Type.PrimitiveType primitive) { + return null; + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/visitor/NameToIDVisitor.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/visitor/NameToIDVisitor.java new file mode 100644 index 0000000000000..4960f434eeb23 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/visitor/NameToIDVisitor.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.visitor; + +import static org.apache.hudi.internal.schema.utils.InternalSchemaUtils.createFullName; + +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; + +import java.util.Deque; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * Schema visitor to produce name -> id map for internalSchema. 
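
The visitor above only defines traversal hooks; NameToIDVisitor below is one concrete subclass. As another minimal illustration, a visitor that simply collects field names could look like the following sketch. The class-level generic parameter, which the diff has elided, is assumed to be a single type parameter T, and the driver that walks an InternalSchema and invokes the visitor is outside this excerpt.

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hudi.internal.schema.Types;
    import org.apache.hudi.internal.schema.visitor.InternalSchemaVisitor;

    // Collects every field name it is shown; all other callbacks keep the default no-op/null behavior.
    public class FieldNameCollector extends InternalSchemaVisitor<List<String>> {
      private final List<String> names = new ArrayList<>();

      @Override
      public List<String> field(Types.Field field, List<String> fieldResult) {
        names.add(field.name());
        return names;
      }

      @Override
      public List<String> record(Types.RecordType record, List<List<String>> fieldResults) {
        return names;
      }
    }
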
+ */ +public class NameToIDVisitor extends InternalSchemaVisitor> { + private final Deque fieldNames = new LinkedList<>(); + private final Map nameToId = new HashMap<>(); + + @Override + public void beforeField(Types.Field field) { + fieldNames.push(field.name()); + } + + @Override + public void afterField(Types.Field field) { + fieldNames.pop(); + } + + @Override + public void beforeArrayElement(Types.Field elementField) { + fieldNames.push(elementField.name()); + } + + @Override + public void afterArrayElement(Types.Field elementField) { + fieldNames.pop(); + } + + @Override + public void beforeMapKey(Types.Field keyField) { + fieldNames.push(keyField.name()); + } + + @Override + public void afterMapKey(Types.Field keyField) { + fieldNames.pop(); + } + + @Override + public void beforeMapValue(Types.Field valueField) { + fieldNames.push(valueField.name()); + } + + @Override + public void afterMapValue(Types.Field valueField) { + fieldNames.pop(); + } + + @Override + public Map schema(InternalSchema schema, Map recordResult) { + return nameToId; + } + + @Override + public Map record(Types.RecordType record, List> fieldResults) { + return nameToId; + } + + @Override + public Map field(Types.Field field, Map fieldResult) { + nameToId.put(createFullName(field.name(), fieldNames), field.fieldId()); + return nameToId; + } + + @Override + public Map array(Types.ArrayType array, Map elementResult) { + nameToId.put(createFullName("element", fieldNames), array.elementId()); + return nameToId; + } + + @Override + public Map map(Types.MapType map, Map keyResult, Map valueResult) { + nameToId.put(createFullName("key", fieldNames), map.keyId()); + nameToId.put(createFullName("value", fieldNames), map.valueId()); + return nameToId; + } + + @Override + public Map primitive(Type.PrimitiveType primitive) { + return nameToId; + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java index fefe7eb7e5cc6..6490425c42b75 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java @@ -18,26 +18,28 @@ package org.apache.hudi.io.storage; -import java.io.IOException; -import java.util.Iterator; -import java.util.Set; - import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; -public interface HoodieFileReader { +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.List; +import java.util.Set; + +public interface HoodieFileReader extends AutoCloseable { - public String[] readMinMaxRecordKeys(); + String[] readMinMaxRecordKeys(); - public BloomFilter readBloomFilter(); + BloomFilter readBloomFilter(); - public Set filterRowKeys(Set candidateRowKeys); + Set filterRowKeys(Set candidateRowKeys); - public Iterator getRecordIterator(Schema readerSchema) throws IOException; + ClosableIterator getRecordIterator(Schema readerSchema) throws IOException; - default Iterator getRecordIterator() throws IOException { + default ClosableIterator getRecordIterator() throws IOException { return getRecordIterator(getSchema()); } @@ -49,6 +51,22 @@ default Option getRecordByKey(String key) throws IOException { return getRecordByKey(key, getSchema()); } + default ClosableIterator 
getRecordsByKeysIterator(List keys, Schema schema) throws IOException { + throw new UnsupportedOperationException(); + } + + default ClosableIterator getRecordsByKeysIterator(List keys) throws IOException { + return getRecordsByKeysIterator(keys, getSchema()); + } + + default ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes, Schema schema) throws IOException { + throw new UnsupportedEncodingException(); + } + + default ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes) throws IOException { + return getRecordsByKeyPrefixIterator(keyPrefixes, getSchema()); + } + Schema getSchema(); void close(); diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java index ff559c5593712..f913df7e152a9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java @@ -27,6 +27,7 @@ import java.io.IOException; +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; import static org.apache.hudi.common.model.HoodieFileFormat.HFILE; @@ -40,6 +41,9 @@ public static HoodieFileReader getFileReader(Config if (HFILE.getFileExtension().equals(extension)) { return newHFileFileReader(conf, path); } + if (ORC.getFileExtension().equals(extension)) { + return newOrcFileReader(conf, path); + } throw new UnsupportedOperationException(extension + " format not supported yet."); } @@ -52,4 +56,8 @@ private static HoodieFileReader newHFileFileReader( CacheConfig cacheConfig = new CacheConfig(conf); return new HoodieHFileReader<>(conf, path, cacheConfig); } + + private static HoodieFileReader newOrcFileReader(Configuration conf, Path path) { + return new HoodieOrcReader<>(conf, path); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java new file mode 100644 index 0000000000000..aaf1dcd7037b7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.hbase.CellComparatorImpl; + +/** + * This class is explicitly used as Key Comparator to work around the hard coded + * legacy format class names inside HBase. Otherwise, we will face issues with shading. 
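
Putting the HoodieFileReader and HoodieFileReaderFactory changes above together, a typical read path looks like the sketch below. The factory signature is truncated in the hunk header and is assumed to be getFileReader(Configuration, Path); the generic parameter on HoodieFileReader (elided by the diff) and ClosableIterator being usable in try-with-resources are also assumptions.

    import org.apache.avro.generic.IndexedRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hudi.common.util.ClosableIterator;
    import org.apache.hudi.io.storage.HoodieFileReader;
    import org.apache.hudi.io.storage.HoodieFileReaderFactory;

    public class FileReaderSketch {
      public static long countRecords(Configuration conf, Path baseFilePath) throws Exception {
        // The factory picks a Parquet, HFile or (with this patch) ORC reader from the file extension.
        HoodieFileReader<IndexedRecord> reader = HoodieFileReaderFactory.getFileReader(conf, baseFilePath);
        long count = 0;
        try (ClosableIterator<IndexedRecord> it = reader.getRecordIterator()) {
          while (it.hasNext()) {
            it.next();
            count++;
          }
        } finally {
          reader.close();
        }
        return count;
      }
    }
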
+ */ +public class HoodieHBaseKVComparator extends CellComparatorImpl { +} diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java index 1d76929533905..3e5b3ff6acba0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java @@ -18,100 +18,121 @@ package org.apache.hudi.io.storage; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PositionedReadable; import org.apache.hadoop.fs.Seekable; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.KeyValue; -import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper; import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileInfo; import org.apache.hadoop.hbase.io.hfile.HFileScanner; -import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.nio.ByteBuff; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.io.ByteBufferBackedInputStream; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.util.Lazy; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; -public class HoodieHFileReader implements HoodieFileReader { - private Path path; - private Configuration conf; - private HFile.Reader reader; - private Schema schema; +import static org.apache.hudi.common.util.CollectionUtils.toStream; +import static org.apache.hudi.common.util.ValidationUtils.checkState; - public static final String KEY_SCHEMA = "schema"; +/** + * NOTE: PLEASE READ DOCS & COMMENTS CAREFULLY BEFORE MAKING CHANGES + *
    + * {@link HoodieFileReader} implementation allowing to read from {@link HFile}. + */ +public class HoodieHFileReader implements HoodieFileReader { + + // TODO HoodieHFileReader right now tightly coupled to MT, we should break that coupling + public static final String SCHEMA_KEY = "schema"; public static final String KEY_BLOOM_FILTER_META_BLOCK = "bloomFilter"; public static final String KEY_BLOOM_FILTER_TYPE_CODE = "bloomFilterTypeCode"; + + public static final String KEY_FIELD_NAME = "key"; public static final String KEY_MIN_RECORD = "minRecordKey"; public static final String KEY_MAX_RECORD = "maxRecordKey"; - public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig) throws IOException { - this.conf = configuration; - this.path = path; - this.reader = HFile.createReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, conf); + private static final Logger LOG = LogManager.getLogger(HoodieHFileReader.class); + + private final Path path; + + private final Lazy schema; + + // NOTE: Reader is ONLY THREAD-SAFE for {@code Scanner} operating in Positional Read ("pread") + // mode (ie created w/ "pread = true") + private final HFile.Reader reader; + // NOTE: Scanner caches read blocks, therefore it's important to re-use scanner + // wherever possible + private final HFileScanner sharedScanner; + + private final Object sharedScannerLock = new Object(); + + public HoodieHFileReader(Configuration hadoopConf, Path path, CacheConfig cacheConfig) throws IOException { + this(path, + HoodieHFileUtils.createHFileReader(FSUtils.getFs(path.toString(), hadoopConf), path, cacheConfig, hadoopConf), + Option.empty()); } - public HoodieHFileReader(byte[] content) throws IOException { - Configuration conf = new Configuration(); - Path path = new Path("hoodie"); - SeekableByteArrayInputStream bis = new SeekableByteArrayInputStream(content); - FSDataInputStream fsdis = new FSDataInputStream(bis); - this.reader = HFile.createReader(FSUtils.getFs("hoodie", conf), path, new FSDataInputStreamWrapper(fsdis), - content.length, new CacheConfig(conf), conf); + public HoodieHFileReader(Configuration hadoopConf, Path path, CacheConfig cacheConfig, FileSystem fs) throws IOException { + this(path, HoodieHFileUtils.createHFileReader(fs, path, cacheConfig, hadoopConf), Option.empty()); } - @Override - public String[] readMinMaxRecordKeys() { - try { - Map fileInfo = reader.loadFileInfo(); - return new String[] { new String(fileInfo.get(KEY_MIN_RECORD.getBytes())), - new String(fileInfo.get(KEY_MAX_RECORD.getBytes()))}; - } catch (IOException e) { - throw new HoodieException("Could not read min/max record key out of file information block correctly from path", e); - } + public HoodieHFileReader(FileSystem fs, Path dummyPath, byte[] content, Option schemaOpt) throws IOException { + this(null, HoodieHFileUtils.createHFileReader(fs, dummyPath, content), schemaOpt); } - @Override - public Schema getSchema() { - if (schema == null) { - try { - Map fileInfo = reader.loadFileInfo(); - schema = new Schema.Parser().parse(new String(fileInfo.get(KEY_SCHEMA.getBytes()))); - } catch (IOException e) { - throw new HoodieException("Could not read schema of file from path", e); - } - } + public HoodieHFileReader(Path path, HFile.Reader reader, Option schemaOpt) throws IOException { + this.path = path; + this.reader = reader; + // For shared scanner, which is primarily used for point-lookups, we're caching blocks + // by default, to minimize amount of traffic to the underlying storage + 
this.sharedScanner = getHFileScanner(reader, true); + this.schema = schemaOpt.map(Lazy::eagerly) + .orElseGet(() -> Lazy.lazily(() -> fetchSchema(reader))); + } - return schema; + @Override + public String[] readMinMaxRecordKeys() { + // NOTE: This access to reader is thread-safe + HFileInfo fileInfo = reader.getHFileInfo(); + return new String[]{new String(fileInfo.get(KEY_MIN_RECORD.getBytes())), + new String(fileInfo.get(KEY_MAX_RECORD.getBytes()))}; } @Override public BloomFilter readBloomFilter() { - Map fileInfo; try { - fileInfo = reader.loadFileInfo(); - ByteBuffer serializedFilter = reader.getMetaBlock(KEY_BLOOM_FILTER_META_BLOCK, false); - byte[] filterBytes = new byte[serializedFilter.remaining()]; - serializedFilter.get(filterBytes); // read the bytes that were written - return BloomFilterFactory.fromString(new String(filterBytes), + // NOTE: This access to reader is thread-safe + HFileInfo fileInfo = reader.getHFileInfo(); + ByteBuff buf = reader.getMetaBlock(KEY_BLOOM_FILTER_META_BLOCK, false).getBufferWithoutHeader(); + // We have to copy bytes here, since we can't reuse buffer's underlying + // array as is, since it contains additional metadata (header) + byte[] bytes = new byte[buf.remaining()]; + buf.get(bytes); + return BloomFilterFactory.fromString(new String(bytes), new String(fileInfo.get(KEY_BLOOM_FILTER_TYPE_CODE.getBytes()))); } catch (IOException e) { throw new HoodieException("Could not read bloom filter from " + path, e); @@ -119,152 +140,451 @@ public BloomFilter readBloomFilter() { } @Override - public Set filterRowKeys(Set candidateRowKeys) { - // Current implementation reads all records and filters them. In certain cases, it many be better to: - // 1. Scan a limited subset of keys (min/max range of candidateRowKeys) - // 2. Lookup keys individually (if the size of candidateRowKeys is much less than the total keys in file) - try { - List> allRecords = readAllRecords(); - Set rowKeys = new HashSet<>(); - allRecords.forEach(t -> { - if (candidateRowKeys.contains(t.getFirst())) { - rowKeys.add(t.getFirst()); + public Schema getSchema() { + return schema.get(); + } + + /** + * Filter keys by availability. + *
    + * Note: This method is performant when the caller passes in a sorted candidate keys. + * + * @param candidateRowKeys - Keys to check for the availability + * @return Subset of candidate keys that are available + */ + @Override + public Set filterRowKeys(Set candidateRowKeys) { + checkState(candidateRowKeys instanceof TreeSet, + String.format("HFile reader expects a TreeSet as iterating over ordered keys is more performant, got (%s)", candidateRowKeys.getClass().getSimpleName())); + + synchronized (sharedScannerLock) { + return candidateRowKeys.stream().filter(k -> { + try { + return isKeyAvailable(k, sharedScanner); + } catch (IOException e) { + LOG.error("Failed to check key availability: " + k); + return false; } - }); - return rowKeys; - } catch (IOException e) { - throw new HoodieIOException("Failed to read row keys from " + path, e); + }).collect(Collectors.toSet()); } } - public List> readAllRecords(Schema writerSchema, Schema readerSchema) throws IOException { - List> recordList = new LinkedList<>(); + @SuppressWarnings("unchecked") + @Override + public Option getRecordByKey(String key, Schema readerSchema) throws IOException { + synchronized (sharedScannerLock) { + return (Option) fetchRecordByKeyInternal(sharedScanner, key, getSchema(), readerSchema); + } + } + + @SuppressWarnings("unchecked") + @Override + public ClosableIterator getRecordIterator(Schema readerSchema) throws IOException { + // TODO eval whether seeking scanner would be faster than pread + HFileScanner scanner = getHFileScanner(reader, false); + return (ClosableIterator) new RecordIterator(scanner, getSchema(), readerSchema); + } + + @SuppressWarnings("unchecked") + @Override + public ClosableIterator getRecordsByKeysIterator(List keys, Schema readerSchema) throws IOException { + // We're caching blocks for this scanner to minimize amount of traffic + // to the underlying storage as we fetched (potentially) sparsely distributed + // keys + HFileScanner scanner = getHFileScanner(reader, true); + return (ClosableIterator) new RecordByKeyIterator(scanner, keys, getSchema(), readerSchema); + } + + @SuppressWarnings("unchecked") + @Override + public ClosableIterator getRecordsByKeyPrefixIterator(List keyPrefixes, Schema readerSchema) throws IOException { + // We're caching blocks for this scanner to minimize amount of traffic + // to the underlying storage as we fetched (potentially) sparsely distributed + // keys + HFileScanner scanner = getHFileScanner(reader, true); + return (ClosableIterator) new RecordByKeyPrefixIterator(scanner, keyPrefixes, getSchema(), readerSchema); + } + + @Override + public long getTotalRecords() { + // NOTE: This access to reader is thread-safe + return reader.getEntries(); + } + + @Override + public void close() { try { - HFileScanner scanner = reader.getScanner(false, false); - if (scanner.seekTo()) { - do { - Cell c = scanner.getKeyValue(); - byte[] keyBytes = Arrays.copyOfRange(c.getRowArray(), c.getRowOffset(), c.getRowOffset() + c.getRowLength()); - R record = getRecordFromCell(c, writerSchema, readerSchema); - recordList.add(new Pair<>(new String(keyBytes), record)); - } while (scanner.next()); + synchronized (this) { + reader.close(); } - - return recordList; } catch (IOException e) { - throw new HoodieException("Error reading hfile " + path + " as a dataframe", e); + throw new HoodieIOException("Error closing the hfile reader", e); } } - public List> readAllRecords() throws IOException { - Schema schema = new Schema.Parser().parse(new 
String(reader.loadFileInfo().get(KEY_SCHEMA.getBytes()))); - return readAllRecords(schema, schema); + private boolean isKeyAvailable(String key, HFileScanner keyScanner) throws IOException { + final KeyValue kv = new KeyValue(key.getBytes(), null, null, null); + return keyScanner.seekTo(kv) == 0; } - @Override - public Iterator getRecordIterator(Schema readerSchema) throws IOException { - final HFileScanner scanner = reader.getScanner(false, false); - return new Iterator() { - private R next = null; + private static Iterator getRecordByKeyPrefixIteratorInternal(HFileScanner scanner, + String keyPrefix, + Schema writerSchema, + Schema readerSchema) throws IOException { + KeyValue kv = new KeyValue(keyPrefix.getBytes(), null, null, null); + + // NOTE: HFile persists both keys/values as bytes, therefore lexicographical sorted is + // essentially employed + // + // For the HFile containing list of cells c[0], c[1], ..., c[N], `seekTo(cell)` would return + // following: + // a) -1, if cell < c[0], no position; + // b) 0, such that c[i] = cell and scanner is left in position i; + // c) and 1, such that c[i] < cell, and scanner is left in position i. + // + // Consider entries w/ the following keys in HFile: [key01, key02, key03, key04,..., key20]; + // In case looked up key-prefix is + // - "key", `seekTo()` will return -1 and place the cursor just before "key01", + // `getCell()` will return "key01" entry + // - "key03", `seekTo()` will return 0 (exact match) and place the cursor just before "key03", + // `getCell()` will return "key03" entry + // - "key1", `seekTo()` will return 1 (first not lower than) and place the cursor just before + // "key10" (i.e. on "key09"); + // + int val = scanner.seekTo(kv); + if (val == 1) { + // Try moving to next entry, matching the prefix key; if we're at the EOF, + // `next()` will return false + if (!scanner.next()) { + return Collections.emptyIterator(); + } + } else if (val == -1) { + // Whenever val == -1 HFile reader will place the pointer right before the first record. 
We have to advance it to the first record + // of the file to validate whether it matches our search criteria + scanner.seekTo(); + } + + class KeyPrefixIterator implements Iterator { + private GenericRecord next = null; private boolean eof = false; @Override public boolean hasNext() { + if (next != null) { + return true; + } else if (eof) { + return false; + } + + Cell c = Objects.requireNonNull(scanner.getCell()); + byte[] keyBytes = copyKeyFromCell(c); + String key = new String(keyBytes); + // Check whether we're still reading records corresponding to the key-prefix + if (!key.startsWith(keyPrefix)) { + return false; + } + + // Extract the byte value before releasing the lock since we cannot hold on to the returned cell afterwards + byte[] valueBytes = copyValueFromCell(c); try { - // To handle when hasNext() is called multiple times for idempotency and/or the first time - if (this.next == null && !this.eof) { - if (!scanner.isSeeked() && scanner.seekTo()) { - this.next = (R)getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema); - } - } - return this.next != null; - } catch (IOException io) { - throw new HoodieIOException("unable to read next record from hfile ", io); + next = deserialize(keyBytes, valueBytes, writerSchema, readerSchema); + // In case scanner is not able to advance, it means we reached EOF + eof = !scanner.next(); + } catch (IOException e) { + throw new HoodieIOException("Failed to deserialize payload", e); } + + return true; } @Override - public R next() { - try { - // To handle case when next() is called before hasNext() - if (this.next == null) { - if (!hasNext()) { - throw new HoodieIOException("No more records left to read from hfile"); - } - } - R retVal = this.next; - if (scanner.next()) { - this.next = (R)getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema); - } else { - this.next = null; - this.eof = true; - } - return retVal; - } catch (IOException io) { - throw new HoodieIOException("unable to read next record from parquet file ", io); - } + public GenericRecord next() { + GenericRecord next = this.next; + this.next = null; + return next; } - }; + } + + return new KeyPrefixIterator(); } - @Override - public Option getRecordByKey(String key, Schema readerSchema) throws IOException { - HFileScanner scanner = reader.getScanner(false, true); + private static Option fetchRecordByKeyInternal(HFileScanner scanner, String key, Schema writerSchema, Schema readerSchema) throws IOException { KeyValue kv = new KeyValue(key.getBytes(), null, null, null); - if (scanner.seekTo(kv) == 0) { - Cell c = scanner.getKeyValue(); - byte[] keyBytes = Arrays.copyOfRange(c.getRowArray(), c.getRowOffset(), c.getRowOffset() + c.getRowLength()); - R record = getRecordFromCell(c, getSchema(), readerSchema); - return Option.of(record); + if (scanner.seekTo(kv) != 0) { + return Option.empty(); } - return Option.empty(); - } + Cell c = scanner.getCell(); + byte[] valueBytes = copyValueFromCell(c); + GenericRecord record = deserialize(key.getBytes(), valueBytes, writerSchema, readerSchema); - private R getRecordFromCell(Cell c, Schema writerSchema, Schema readerSchema) throws IOException { - byte[] value = Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength()); - return (R)HoodieAvroUtils.bytesToAvro(value, writerSchema, readerSchema); + return Option.of(record); } - @Override - public long getTotalRecords() { - return reader.getEntries(); + private static GenericRecord getRecordFromCell(Cell cell, Schema writerSchema, 
Schema readerSchema) throws IOException { + final byte[] keyBytes = copyKeyFromCell(cell); + final byte[] valueBytes = copyValueFromCell(cell); + return deserialize(keyBytes, valueBytes, writerSchema, readerSchema); } - @Override - public void close() { + private static GenericRecord deserializeUnchecked(final byte[] keyBytes, + final byte[] valueBytes, + Schema writerSchema, + Schema readerSchema) { try { - reader.close(); - reader = null; + return deserialize(keyBytes, valueBytes, writerSchema, readerSchema); } catch (IOException e) { - e.printStackTrace(); + throw new HoodieIOException("Failed to deserialize payload", e); } } - static class SeekableByteArrayInputStream extends ByteArrayInputStream implements Seekable, PositionedReadable { - public SeekableByteArrayInputStream(byte[] buf) { - super(buf); + private static GenericRecord deserialize(final byte[] keyBytes, + final byte[] valueBytes, + Schema writerSchema, + Schema readerSchema) throws IOException { + GenericRecord record = HoodieAvroUtils.bytesToAvro(valueBytes, writerSchema, readerSchema); + + getKeySchema(readerSchema).ifPresent(keyFieldSchema -> { + final Object keyObject = record.get(keyFieldSchema.pos()); + if (keyObject != null && keyObject.toString().isEmpty()) { + record.put(keyFieldSchema.pos(), new String(keyBytes)); + } + }); + + return record; + } + + private static Schema fetchSchema(HFile.Reader reader) { + HFileInfo fileInfo = reader.getHFileInfo(); + return new Schema.Parser().parse(new String(fileInfo.get(SCHEMA_KEY.getBytes()))); + } + + private static byte[] copyKeyFromCell(Cell cell) { + return Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength()); + } + + private static byte[] copyValueFromCell(Cell c) { + return Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength()); + } + + /** + * NOTE: THIS SHOULD ONLY BE USED FOR TESTING, RECORDS ARE MATERIALIZED EAGERLY + *
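
The point-lookup and prefix-scan iterators implemented below are the externally visible payoff of the scanner handling described in the comments above. A minimal lookup sketch, assuming the reader's generic parameter (elided by the diff) is an Avro record type and that ClosableIterator can be used in try-with-resources; the key values are purely illustrative.

    import java.util.Arrays;
    import java.util.List;

    import org.apache.avro.generic.GenericRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.io.hfile.CacheConfig;
    import org.apache.hudi.common.util.ClosableIterator;
    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.io.storage.HoodieHFileReader;

    public class HFileLookupSketch {
      public static void lookups(Configuration conf, Path hfilePath) throws Exception {
        HoodieHFileReader<GenericRecord> reader =
            new HoodieHFileReader<>(conf, hfilePath, new CacheConfig(conf));

        // Exact-key lookup, served by the shared positional-read scanner.
        Option<GenericRecord> single = reader.getRecordByKey("key00000042");
        System.out.println(single.isPresent());

        // Batched lookups; passing keys in sorted order lets the scanner move only forward.
        List<String> keys = Arrays.asList("key00000001", "key00000002");
        try (ClosableIterator<GenericRecord> it = reader.getRecordsByKeysIterator(keys)) {
          while (it.hasNext()) {
            System.out.println(it.next());
          }
        }

        // Prefix scan: every record whose key starts with "key0000".
        try (ClosableIterator<GenericRecord> prefixIt =
                 reader.getRecordsByKeyPrefixIterator(Arrays.asList("key0000"))) {
          while (prefixIt.hasNext()) {
            System.out.println(prefixIt.next());
          }
        }
        reader.close();
      }
    }
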
    + * Reads all the records using the reader's schema. + */ + public static List readAllRecords(HoodieHFileReader reader) throws IOException { + Schema schema = reader.getSchema(); + return toStream(reader.getRecordIterator(schema)) + .collect(Collectors.toList()); + } + + /** + * NOTE: THIS SHOULD ONLY BE USED FOR TESTING, RECORDS ARE MATERIALIZED EAGERLY + *
    + * Reads all the records matching the provided keys, using the reader's schema. + */ + public static List readRecords(HoodieHFileReader reader, + List keys) throws IOException { + return readRecords(reader, keys, reader.getSchema()); + } + + /** + * NOTE: THIS SHOULD ONLY BE USED FOR TESTING, RECORDS ARE MATERIALIZED EAGERLY + *
    + * Reads all the records with given schema and filtering keys. + */ + public static List readRecords(HoodieHFileReader reader, + List keys, + Schema schema) throws IOException { + Collections.sort(keys); + return toStream(reader.getRecordsByKeysIterator(keys, schema)) + .collect(Collectors.toList()); + } + + private static HFileScanner getHFileScanner(HFile.Reader reader, boolean cacheBlocks) { + // NOTE: Only scanners created in Positional Read ("pread") mode could share the same reader, + // since scanners in default mode will be seeking w/in the underlying stream + return reader.getScanner(cacheBlocks, true); + } + + private static Option getKeySchema(Schema schema) { + return Option.ofNullable(schema.getField(KEY_FIELD_NAME)); + } + + private static class RecordByKeyPrefixIterator implements ClosableIterator { + private final Iterator keyPrefixesIterator; + private Iterator recordsIterator; + + private final HFileScanner scanner; + + private final Schema writerSchema; + private final Schema readerSchema; + + private GenericRecord next = null; + + RecordByKeyPrefixIterator(HFileScanner scanner, List keyPrefixes, Schema writerSchema, Schema readerSchema) throws IOException { + this.keyPrefixesIterator = keyPrefixes.iterator(); + + this.scanner = scanner; + this.scanner.seekTo(); // position at the beginning of the file + + this.writerSchema = writerSchema; + this.readerSchema = readerSchema; } @Override - public long getPos() throws IOException { - return pos; + public boolean hasNext() { + try { + while (true) { + // NOTE: This is required for idempotency + if (next != null) { + return true; + } else if (recordsIterator != null && recordsIterator.hasNext()) { + next = recordsIterator.next(); + return true; + } else if (keyPrefixesIterator.hasNext()) { + String currentKeyPrefix = keyPrefixesIterator.next(); + recordsIterator = + getRecordByKeyPrefixIteratorInternal(scanner, currentKeyPrefix, writerSchema, readerSchema); + } else { + return false; + } + } + } catch (IOException e) { + throw new HoodieIOException("Unable to read next record from HFile", e); + } + } + + @Override + public GenericRecord next() { + GenericRecord next = this.next; + this.next = null; + return next; + } + + @Override + public void close() { + scanner.close(); + } + } + + private static class RecordByKeyIterator implements ClosableIterator { + private final Iterator keyIterator; + + private final HFileScanner scanner; + + private final Schema readerSchema; + private final Schema writerSchema; + + private GenericRecord next = null; + + RecordByKeyIterator(HFileScanner scanner, List keys, Schema writerSchema, Schema readerSchema) throws IOException { + this.keyIterator = keys.iterator(); + + this.scanner = scanner; + this.scanner.seekTo(); // position at the beginning of the file + + this.writerSchema = writerSchema; + this.readerSchema = readerSchema; } @Override - public void seek(long pos) throws IOException { - if (mark != 0) { - throw new IllegalStateException(); + public boolean hasNext() { + try { + // NOTE: This is required for idempotency + if (next != null) { + return true; + } + + while (keyIterator.hasNext()) { + Option value = fetchRecordByKeyInternal(scanner, keyIterator.next(), writerSchema, readerSchema); + if (value.isPresent()) { + next = value.get(); + return true; + } + } + return false; + } catch (IOException e) { + throw new HoodieIOException("unable to read next record from hfile ", e); } + } + + @Override + public GenericRecord next() { + GenericRecord next = this.next; + this.next = 
null; + return next; + } + + @Override + public void close() { + scanner.close(); + } + } + + private static class RecordIterator implements ClosableIterator { + private final HFileScanner scanner; + + private final Schema writerSchema; + private final Schema readerSchema; + + private GenericRecord next = null; + + RecordIterator(HFileScanner scanner, Schema writerSchema, Schema readerSchema) { + this.scanner = scanner; + this.writerSchema = writerSchema; + this.readerSchema = readerSchema; + } + + @Override + public boolean hasNext() { + try { + // NOTE: This is required for idempotency + if (next != null) { + return true; + } - reset(); - long skipped = skip(pos); + boolean hasRecords; + if (!scanner.isSeeked()) { + hasRecords = scanner.seekTo(); + } else { + hasRecords = scanner.next(); + } + + if (!hasRecords) { + return false; + } - if (skipped != pos) { - throw new IOException(); + this.next = getRecordFromCell(scanner.getCell(), writerSchema, readerSchema); + return true; + } catch (IOException io) { + throw new HoodieIOException("unable to read next record from hfile ", io); } } + @Override + public GenericRecord next() { + GenericRecord next = this.next; + this.next = null; + return next; + } + + @Override + public void close() { + scanner.close(); + } + } + + static class SeekableByteArrayInputStream extends ByteBufferBackedInputStream implements Seekable, PositionedReadable { + public SeekableByteArrayInputStream(byte[] buf) { + super(buf); + } + + @Override + public long getPos() throws IOException { + return getPosition(); + } + @Override public boolean seekToNewSource(long targetPos) throws IOException { return false; @@ -272,19 +592,7 @@ public boolean seekToNewSource(long targetPos) throws IOException { @Override public int read(long position, byte[] buffer, int offset, int length) throws IOException { - - if (position >= buf.length) { - throw new IllegalArgumentException(); - } - if (position + length > buf.length) { - throw new IllegalArgumentException(); - } - if (length > buffer.length) { - throw new IllegalArgumentException(); - } - - System.arraycopy(buf, (int) position, buffer, offset, length); - return length; + return copyFrom(position, buffer, offset, length); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileUtils.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileUtils.java new file mode 100644 index 0000000000000..878a3c563b6f5 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileUtils.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileInfo; +import org.apache.hadoop.hbase.io.hfile.ReaderContext; +import org.apache.hadoop.hbase.io.hfile.ReaderContextBuilder; + +import java.io.IOException; + +/** + * Util class for HFile reading and writing in Hudi + */ +public class HoodieHFileUtils { + // Based on HBase 2.4.9, the primaryReplicaReader is mainly used for constructing + // block cache key, so if we do not use block cache then it is OK to set it as any + // value. We use true here. + private static final boolean USE_PRIMARY_REPLICA_READER = true; + + /** + * Creates HFile reader for a file with default `primaryReplicaReader` as true. + * + * @param fs File system. + * @param path Path to file to read. + * @param cacheConfig Cache configuration. + * @param configuration Configuration + * @return HFile reader + * @throws IOException Upon error. + */ + public static HFile.Reader createHFileReader( + FileSystem fs, Path path, CacheConfig cacheConfig, Configuration configuration) throws IOException { + return HFile.createReader(fs, path, cacheConfig, USE_PRIMARY_REPLICA_READER, configuration); + } + + /** + * Creates HFile reader for byte array with default `primaryReplicaReader` as true. + * + * @param fs File system. + * @param dummyPath Dummy path to file to read. + * @param content Content in byte array. + * @return HFile reader + * @throws IOException Upon error. + */ + public static HFile.Reader createHFileReader( + FileSystem fs, Path dummyPath, byte[] content) throws IOException { + // Avoid loading default configs, from the FS, since this configuration is mostly + // used as a stub to initialize HFile reader + Configuration conf = new Configuration(false); + HoodieHFileReader.SeekableByteArrayInputStream bis = new HoodieHFileReader.SeekableByteArrayInputStream(content); + FSDataInputStream fsdis = new FSDataInputStream(bis); + FSDataInputStreamWrapper stream = new FSDataInputStreamWrapper(fsdis); + ReaderContext context = new ReaderContextBuilder() + .withFilePath(dummyPath) + .withInputStreamWrapper(stream) + .withFileSize(content.length) + .withFileSystem(fs) + .withPrimaryReplicaReader(USE_PRIMARY_REPLICA_READER) + .withReaderType(ReaderContext.ReaderType.STREAM) + .build(); + HFileInfo fileInfo = new HFileInfo(context, conf); + HFile.Reader reader = HFile.createReader(context, fileInfo, new CacheConfig(conf), conf); + fileInfo.initMetaAndIndex(reader); + return reader; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcReader.java new file mode 100644 index 0000000000000..5431bf3782af2 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcReader.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
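
HoodieHFileUtils above centralizes HFile.Reader construction, including a variant that reads from an in-memory byte array via a dummy path. A small sketch of that byte-array variant, using only the signatures shown in the new file; the dummy path value and the way the FileSystem is obtained are illustrative choices, not requirements of the API.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.io.hfile.HFile;
    import org.apache.hudi.io.storage.HoodieHFileUtils;

    public class HFileUtilsSketch {
      public static HFile.Reader openFromBytes(byte[] hfileBytes) throws Exception {
        // The path is only a stub for bookkeeping; the byte array is the actual data source.
        FileSystem fs = FileSystem.get(new Configuration());
        return HoodieHFileUtils.createHFileReader(fs, new Path("/tmp/dummy.hfile"), hfileBytes);
      }
    }
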
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.OrcReaderIterator; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.Reader.Options; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; + +import java.io.IOException; +import java.util.Set; + +public class HoodieOrcReader implements HoodieFileReader { + private Path path; + private Configuration conf; + private final BaseFileUtils orcUtils; + + public HoodieOrcReader(Configuration configuration, Path path) { + this.conf = configuration; + this.path = path; + this.orcUtils = BaseFileUtils.getInstance(HoodieFileFormat.ORC); + } + + @Override + public String[] readMinMaxRecordKeys() { + return orcUtils.readMinMaxRecordKeys(conf, path); + } + + @Override + public BloomFilter readBloomFilter() { + return orcUtils.readBloomFilterFromMetadata(conf, path); + } + + @Override + public Set filterRowKeys(Set candidateRowKeys) { + return orcUtils.filterRowKeys(conf, path, candidateRowKeys); + } + + @Override + public ClosableIterator getRecordIterator(Schema schema) throws IOException { + try { + Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(schema); + RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema)); + return new OrcReaderIterator<>(recordReader, schema, orcSchema); + } catch (IOException io) { + throw new HoodieIOException("Unable to create an ORC reader.", io); + } + } + + @Override + public Schema getSchema() { + return orcUtils.readAvroSchema(conf, path); + } + + @Override + public void close() { + } + + @Override + public long getTotalRecords() { + return orcUtils.getRowCount(conf, path); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java new file mode 100644 index 0000000000000..77fea6beee520 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io.storage; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +/** + * Base ParquetConfig to hold config params for writing to Parquet. + * @param + */ +public class HoodieParquetConfig { + private final T writeSupport; + private final CompressionCodecName compressionCodecName; + private final int blockSize; + private final int pageSize; + private final long maxFileSize; + private final Configuration hadoopConf; + private final double compressionRatio; + private final boolean dictionaryEnabled; + + public HoodieParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize, + int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) { + this(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, false); + } + + public HoodieParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize, + int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio, boolean dictionaryEnabled) { + this.writeSupport = writeSupport; + this.compressionCodecName = compressionCodecName; + this.blockSize = blockSize; + this.pageSize = pageSize; + this.maxFileSize = maxFileSize; + this.hadoopConf = hadoopConf; + this.compressionRatio = compressionRatio; + this.dictionaryEnabled = dictionaryEnabled; + } + + public CompressionCodecName getCompressionCodecName() { + return compressionCodecName; + } + + public int getBlockSize() { + return blockSize; + } + + public int getPageSize() { + return pageSize; + } + + public long getMaxFileSize() { + return maxFileSize; + } + + public Configuration getHadoopConf() { + return hadoopConf; + } + + public double getCompressionRatio() { + return compressionRatio; + } + + public T getWriteSupport() { + return writeSupport; + } + + public boolean dictionaryEnabled() { + return dictionaryEnabled; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java index feacbda54606b..804e4354c749e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java @@ -18,62 +18,73 @@ package org.apache.hudi.io.storage; -import java.io.IOException; -import java.util.Iterator; -import java.util.Set; - import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.ParquetReaderIterator; -import org.apache.hudi.common.util.ParquetUtils; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.hadoop.ParquetReader; -public class 
HoodieParquetReader implements HoodieFileReader { - private Path path; - private Configuration conf; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +public class HoodieParquetReader implements HoodieFileReader { + + private final Path path; + private final Configuration conf; + private final BaseFileUtils parquetUtils; + private List readerIterators = new ArrayList<>(); public HoodieParquetReader(Configuration configuration, Path path) { this.conf = configuration; this.path = path; + this.parquetUtils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET); } + @Override public String[] readMinMaxRecordKeys() { - return ParquetUtils.readMinMaxRecordKeys(conf, path); + return parquetUtils.readMinMaxRecordKeys(conf, path); } @Override public BloomFilter readBloomFilter() { - return ParquetUtils.readBloomFilterFromParquetMetadata(conf, path); + return parquetUtils.readBloomFilterFromMetadata(conf, path); } @Override - public Set filterRowKeys(Set candidateRowKeys) { - return ParquetUtils.filterParquetRowKeys(conf, path, candidateRowKeys); + public Set filterRowKeys(Set candidateRowKeys) { + return parquetUtils.filterRowKeys(conf, path, candidateRowKeys); } @Override - public Iterator getRecordIterator(Schema schema) throws IOException { + public ClosableIterator getRecordIterator(Schema schema) throws IOException { AvroReadSupport.setAvroReadSchema(conf, schema); - ParquetReader reader = AvroParquetReader.builder(path).withConf(conf).build(); - return new ParquetReaderIterator(reader); + ParquetReader reader = AvroParquetReader.builder(path).withConf(conf).build(); + ParquetReaderIterator parquetReaderIterator = new ParquetReaderIterator<>(reader); + readerIterators.add(parquetReaderIterator); + return parquetReaderIterator; } @Override public Schema getSchema() { - return ParquetUtils.readAvroSchema(conf, path); + return parquetUtils.readAvroSchema(conf, path); } @Override public void close() { + readerIterators.forEach(ParquetReaderIterator::close); } @Override public long getTotalRecords() { - return ParquetUtils.getRowCount(conf, path); + return parquetUtils.getRowCount(conf, path); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetStreamWriter.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetStreamWriter.java new file mode 100644 index 0000000000000..c8f78c3501158 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetStreamWriter.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.parquet.io.OutputStreamBackedOutputFile; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.WriteSupport; +import org.apache.parquet.io.OutputFile; + +import java.io.IOException; + +// TODO(HUDI-3035) unify w/ HoodieParquetWriter +public class HoodieParquetStreamWriter implements AutoCloseable { + + private final ParquetWriter writer; + private final HoodieAvroWriteSupport writeSupport; + + public HoodieParquetStreamWriter(FSDataOutputStream outputStream, + HoodieParquetConfig parquetConfig) throws IOException { + this.writeSupport = parquetConfig.getWriteSupport(); + this.writer = new Builder(new OutputStreamBackedOutputFile(outputStream), writeSupport) + .withWriteMode(ParquetFileWriter.Mode.CREATE) + .withCompressionCodec(parquetConfig.getCompressionCodecName()) + .withRowGroupSize(parquetConfig.getBlockSize()) + .withPageSize(parquetConfig.getPageSize()) + .withDictionaryPageSize(parquetConfig.getPageSize()) + .withDictionaryEncoding(parquetConfig.dictionaryEnabled()) + .withWriterVersion(ParquetWriter.DEFAULT_WRITER_VERSION) + .withConf(parquetConfig.getHadoopConf()) + .build(); + } + + public void writeAvro(String key, R object) throws IOException { + writer.write(object); + writeSupport.add(key); + } + + @Override + public void close() throws IOException { + writer.close(); + } + + private static class Builder extends ParquetWriter.Builder> { + private final WriteSupport writeSupport; + + private Builder(Path file, WriteSupport writeSupport) { + super(file); + this.writeSupport = writeSupport; + } + + private Builder(OutputFile file, WriteSupport writeSupport) { + super(file); + this.writeSupport = writeSupport; + } + + @Override + protected Builder self() { + return this; + } + + @Override + protected WriteSupport getWriteSupport(Configuration conf) { + return writeSupport; + } + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/keygen/BaseKeyGenerator.java b/hudi-common/src/main/java/org/apache/hudi/keygen/BaseKeyGenerator.java new file mode 100644 index 0000000000000..a09101dedfbeb --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/keygen/BaseKeyGenerator.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.keygen; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.exception.HoodieKeyException; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import java.util.List; + +public abstract class BaseKeyGenerator extends KeyGenerator { + + protected List recordKeyFields; + protected List partitionPathFields; + protected final boolean encodePartitionPath; + protected final boolean hiveStylePartitioning; + protected final boolean consistentLogicalTimestampEnabled; + + protected BaseKeyGenerator(TypedProperties config) { + super(config); + this.encodePartitionPath = config.getBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING.key(), + Boolean.parseBoolean(KeyGeneratorOptions.URL_ENCODE_PARTITIONING.defaultValue())); + this.hiveStylePartitioning = config.getBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key(), + Boolean.parseBoolean(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.defaultValue())); + this.consistentLogicalTimestampEnabled = config.getBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())); + } + + /** + * Generate a record Key out of provided generic record. + */ + public abstract String getRecordKey(GenericRecord record); + + /** + * Generate a partition path out of provided generic record. + */ + public abstract String getPartitionPath(GenericRecord record); + + /** + * Generate a Hoodie Key out of provided generic record. + */ + @Override + public final HoodieKey getKey(GenericRecord record) { + if (getRecordKeyFieldNames() == null || getPartitionPathFields() == null) { + throw new HoodieKeyException("Unable to find field names for record key or partition path in cfg"); + } + return new HoodieKey(getRecordKey(record), getPartitionPath(record)); + } + + @Override + public List getRecordKeyFieldNames() { + return recordKeyFields; + } + + public List getPartitionPathFields() { + return partitionPathFields; + } + + public boolean isConsistentLogicalTimestampEnabled() { + return consistentLogicalTimestampEnabled; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java b/hudi-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java similarity index 98% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java rename to hudi-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java index 8c3f794ee6fa8..691b1f4d5560c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java +++ b/hudi-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java @@ -34,7 +34,7 @@ @PublicAPIClass(maturity = ApiMaturityLevel.STABLE) public abstract class KeyGenerator implements KeyGeneratorInterface { - protected TypedProperties config; + protected final TypedProperties config; protected KeyGenerator(TypedProperties config) { this.config = config; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGeneratorInterface.java b/hudi-common/src/main/java/org/apache/hudi/keygen/KeyGeneratorInterface.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGeneratorInterface.java rename to hudi-common/src/main/java/org/apache/hudi/keygen/KeyGeneratorInterface.java diff --git 
a/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java b/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java new file mode 100644 index 0000000000000..ff182c4c1661f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.keygen.constant; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +@ConfigClassProperty(name = "Key Generator Options", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "Hudi maintains keys (record key + partition path) " + + "for uniquely identifying a particular record. " + + "This config allows developers to setup the Key generator class that " + + "will extract these out of incoming records.") +public class KeyGeneratorOptions extends HoodieConfig { + + public static final ConfigProperty URL_ENCODE_PARTITIONING = ConfigProperty + .key("hoodie.datasource.write.partitionpath.urlencode") + .defaultValue("false") + .withDocumentation("Should we url encode the partition path value, before creating the folder structure."); + + public static final ConfigProperty HIVE_STYLE_PARTITIONING_ENABLE = ConfigProperty + .key("hoodie.datasource.write.hive_style_partitioning") + .defaultValue("false") + .withDocumentation("Flag to indicate whether to use Hive style partitioning.\n" + + "If set true, the names of partition folders follow = format.\n" + + "By default false (the names of partition folders are only partition values)"); + + public static final ConfigProperty RECORDKEY_FIELD_NAME = ConfigProperty + .key("hoodie.datasource.write.recordkey.field") + .defaultValue("uuid") + .withDocumentation("Record key field. Value to be used as the `recordKey` component of `HoodieKey`.\n" + + "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using\n" + + "the dot notation eg: `a.b.c`"); + + public static final ConfigProperty PARTITIONPATH_FIELD_NAME = ConfigProperty + .key("hoodie.datasource.write.partitionpath.field") + .noDefaultValue() + .withDocumentation("Partition path field. Value to be used at the partitionPath component of HoodieKey. 
" + + "Actual value ontained by invoking .toString()"); + + public static final ConfigProperty KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED = ConfigProperty + .key("hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled") + .defaultValue("false") + .withDocumentation("When set to true, consistent value will be generated for a logical timestamp type column, " + + "like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so " + + "as not to break the pipeline that deploy either fully row-writer path or non row-writer path. For example, " + + "if it is kept disabled then record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp " + + "`2016-12-29 09:54:00.0` in row-writer path, while it will be written as long value `1483023240000000` in non row-writer path. " + + "If enabled, then the timestamp value will be written in both the cases."); + + /** + * @deprecated Use {@link #URL_ENCODE_PARTITIONING} and its methods. + */ + @Deprecated + public static final String URL_ENCODE_PARTITIONING_OPT_KEY = URL_ENCODE_PARTITIONING.key(); + /** + * @deprecated Use {@link #URL_ENCODE_PARTITIONING} and its methods. + */ + @Deprecated + public static final String DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL = URL_ENCODE_PARTITIONING.defaultValue(); + /** + * @deprecated Use {@link #HIVE_STYLE_PARTITIONING_ENABLE} and its methods. + */ + @Deprecated + public static final String HIVE_STYLE_PARTITIONING_OPT_KEY = HIVE_STYLE_PARTITIONING_ENABLE.key(); + /** + * @deprecated Use {@link #HIVE_STYLE_PARTITIONING_ENABLE} and its methods. + */ + @Deprecated + public static final String DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL = HIVE_STYLE_PARTITIONING_ENABLE.defaultValue(); + /** + * @deprecated Use {@link #RECORDKEY_FIELD_NAME} and its methods. + */ + @Deprecated + public static final String RECORDKEY_FIELD_OPT_KEY = RECORDKEY_FIELD_NAME.key(); + /** + * @deprecated Use {@link #PARTITIONPATH_FIELD_NAME} and its methods. + */ + @Deprecated + public static final String PARTITIONPATH_FIELD_OPT_KEY = PARTITIONPATH_FIELD_NAME.key(); + + /** + * Supported configs. + */ + public static class Config { + + // One value from TimestampType above + public static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen.timebased.timestamp.type"; + public static final String INPUT_TIME_UNIT = + "hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit"; + //This prop can now accept list of input date formats. + public static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = + "hoodie.deltastreamer.keygen.timebased.input.dateformat"; + public static final String TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP = "hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex"; + public static final String TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.input.timezone"; + public static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = + "hoodie.deltastreamer.keygen.timebased.output.dateformat"; + //still keeping this prop for backward compatibility so that functionality for existing users does not break. 
+ public static final String TIMESTAMP_TIMEZONE_FORMAT_PROP = + "hoodie.deltastreamer.keygen.timebased.timezone"; + public static final String TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.output.timezone"; + public static final String DATE_TIME_PARSER_PROP = "hoodie.deltastreamer.keygen.datetime.parser.class"; + } +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorType.java b/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorType.java new file mode 100644 index 0000000000000..4babda59249a3 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorType.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.keygen.constant; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Types of {@link org.apache.hudi.keygen.KeyGenerator}. + */ +public enum KeyGeneratorType { + /** + * Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs. + */ + SIMPLE, + + /** + * Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs. + */ + COMPLEX, + + /** + * Key generator, that relies on timestamps for partitioning field. Still picks record key by name. + */ + TIMESTAMP, + + /** + * This is a generic implementation type of KeyGenerator where users can configure record key as a single field or + * a combination of fields. Similarly partition path can be configured to have multiple fields or only one field. + *
+   * <p>
    + * This KeyGenerator expects value for prop "hoodie.datasource.write.partitionpath.field" in a specific format. + * For example: + * properties.put("hoodie.datasource.write.partitionpath.field", "field1:PartitionKeyType1,field2:PartitionKeyType2"). + */ + CUSTOM, + + /** + * Simple Key generator for unpartitioned Hive Tables. + */ + NON_PARTITION, + + /** + * Key generator for deletes using global indices. + */ + GLOBAL_DELETE; + + public static List getNames() { + List names = new ArrayList<>(KeyGeneratorType.values().length); + Arrays.stream(KeyGeneratorType.values()) + .forEach(x -> names.add(x.name())); + return names; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java new file mode 100644 index 0000000000000..37a209b0a8719 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java @@ -0,0 +1,414 @@ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hudi.avro.model.HoodieMetadataBloomFilter; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.metrics.Registry; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.common.util.hash.ColumnIndexID; +import org.apache.hudi.common.util.hash.FileIndexID; +import org.apache.hudi.common.util.hash.PartitionIndexID; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieMetadataException; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.hadoop.SerializablePath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.Collectors; + +public abstract class BaseTableMetadata implements HoodieTableMetadata { + + private static final Logger LOG = LogManager.getLogger(BaseTableMetadata.class); + + public static final long MAX_MEMORY_SIZE_IN_BYTES = 1024 * 1024 * 1024; + public static final int BUFFER_SIZE = 10 * 1024 * 1024; + + protected final transient HoodieEngineContext engineContext; + protected final SerializableConfiguration hadoopConf; + protected final SerializablePath dataBasePath; + protected final HoodieTableMetaClient dataMetaClient; + protected final Option metrics; + protected final HoodieMetadataConfig metadataConfig; + // Directory used for Spillable Map when merging records + protected final String spillableMapDirectory; + + protected boolean isMetadataTableEnabled; + protected boolean isBloomFilterIndexEnabled = false; + protected boolean isColumnStatsIndexEnabled = false; + + protected BaseTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, + String dataBasePath, String spillableMapDirectory) { + this.engineContext = engineContext; + this.hadoopConf = new SerializableConfiguration(engineContext.getHadoopConf()); + this.dataBasePath = new SerializablePath(new CachingPath(dataBasePath)); + this.dataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(dataBasePath).build(); + this.spillableMapDirectory = spillableMapDirectory; + this.metadataConfig = metadataConfig; + + this.isMetadataTableEnabled = metadataConfig.enabled(); + if (metadataConfig.enableMetrics()) { + this.metrics = Option.of(new 
HoodieMetadataMetrics(Registry.getRegistry("HoodieMetadata"))); + } else { + this.metrics = Option.empty(); + } + } + + /** + * Return the list of partitions in the dataset. + *
+   * <p>
    + * If the Metadata Table is enabled, the listing is retrieved from the stored metadata. Otherwise, the list of + * partitions is retrieved directly from the underlying {@code FileSystem}. + *
+   * <p>
    + * On any errors retrieving the listing from the metadata, defaults to using the file system listings. + */ + @Override + public List getAllPartitionPaths() throws IOException { + if (isMetadataTableEnabled) { + try { + return fetchAllPartitionPaths(); + } catch (Exception e) { + throw new HoodieMetadataException("Failed to retrieve list of partition from metadata", e); + } + } + return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, dataBasePath.toString(), + metadataConfig.shouldAssumeDatePartitioning()).getAllPartitionPaths(); + } + + /** + * Return the list of files in a partition. + *
+   * <p>
    + * If the Metadata Table is enabled, the listing is retrieved from the stored metadata. Otherwise, the list of + * partitions is retrieved directly from the underlying {@code FileSystem}. + *
+   * <p>
    + * On any errors retrieving the listing from the metadata, defaults to using the file system listings. + * + * @param partitionPath The absolute path of the partition to list + */ + @Override + public FileStatus[] getAllFilesInPartition(Path partitionPath) + throws IOException { + if (isMetadataTableEnabled) { + try { + return fetchAllFilesInPartition(partitionPath); + } catch (Exception e) { + throw new HoodieMetadataException("Failed to retrieve files in partition " + partitionPath + " from metadata", e); + } + } + + return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, dataBasePath.toString(), metadataConfig.shouldAssumeDatePartitioning()) + .getAllFilesInPartition(partitionPath); + } + + @Override + public Map getAllFilesInPartitions(List partitions) + throws IOException { + if (partitions.isEmpty()) { + return Collections.emptyMap(); + } + + if (isMetadataTableEnabled) { + try { + List partitionPaths = partitions.stream().map(Path::new).collect(Collectors.toList()); + return fetchAllFilesInPartitionPaths(partitionPaths); + } catch (Exception e) { + throw new HoodieMetadataException("Failed to retrieve files in partition from metadata", e); + } + } + + return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, dataBasePath.toString(), metadataConfig.shouldAssumeDatePartitioning()) + .getAllFilesInPartitions(partitions); + } + + @Override + public Option getBloomFilter(final String partitionName, final String fileName) + throws HoodieMetadataException { + if (!isBloomFilterIndexEnabled) { + LOG.error("Metadata bloom filter index is disabled!"); + return Option.empty(); + } + + final Pair partitionFileName = Pair.of(partitionName, fileName); + Map, BloomFilter> bloomFilters = getBloomFilters(Collections.singletonList(partitionFileName)); + if (bloomFilters.isEmpty()) { + LOG.error("Meta index: missing bloom filter for partition: " + partitionName + ", file: " + fileName); + return Option.empty(); + } + + ValidationUtils.checkState(bloomFilters.containsKey(partitionFileName)); + return Option.of(bloomFilters.get(partitionFileName)); + } + + @Override + public Map, BloomFilter> getBloomFilters(final List> partitionNameFileNameList) + throws HoodieMetadataException { + if (!isBloomFilterIndexEnabled) { + LOG.error("Metadata bloom filter index is disabled!"); + return Collections.emptyMap(); + } + if (partitionNameFileNameList.isEmpty()) { + return Collections.emptyMap(); + } + + HoodieTimer timer = new HoodieTimer().startTimer(); + Set partitionIDFileIDSortedStrings = new TreeSet<>(); + Map> fileToKeyMap = new HashMap<>(); + partitionNameFileNameList.forEach(partitionNameFileNamePair -> { + final String bloomFilterIndexKey = HoodieMetadataPayload.getBloomFilterIndexKey( + new PartitionIndexID(partitionNameFileNamePair.getLeft()), new FileIndexID(partitionNameFileNamePair.getRight())); + partitionIDFileIDSortedStrings.add(bloomFilterIndexKey); + fileToKeyMap.put(bloomFilterIndexKey, partitionNameFileNamePair); + } + ); + + List partitionIDFileIDStrings = new ArrayList<>(partitionIDFileIDSortedStrings); + List>>> hoodieRecordList = + getRecordsByKeys(partitionIDFileIDStrings, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_BLOOM_FILTERS_METADATA_STR, + (timer.endTimer() / partitionIDFileIDStrings.size()))); + + Map, BloomFilter> partitionFileToBloomFilterMap = new HashMap<>(); + for (final Pair>> entry : hoodieRecordList) { + if (entry.getRight().isPresent()) { + final Option 
bloomFilterMetadata = + entry.getRight().get().getData().getBloomFilterMetadata(); + if (bloomFilterMetadata.isPresent()) { + if (!bloomFilterMetadata.get().getIsDeleted()) { + ValidationUtils.checkState(fileToKeyMap.containsKey(entry.getLeft())); + final ByteBuffer bloomFilterByteBuffer = bloomFilterMetadata.get().getBloomFilter(); + final String bloomFilterType = bloomFilterMetadata.get().getType(); + final BloomFilter bloomFilter = BloomFilterFactory.fromString( + StandardCharsets.UTF_8.decode(bloomFilterByteBuffer).toString(), bloomFilterType); + partitionFileToBloomFilterMap.put(fileToKeyMap.get(entry.getLeft()), bloomFilter); + } + } else { + LOG.error("Meta index bloom filter missing for: " + fileToKeyMap.get(entry.getLeft())); + } + } + } + return partitionFileToBloomFilterMap; + } + + @Override + public Map, HoodieMetadataColumnStats> getColumnStats(final List> partitionNameFileNameList, final String columnName) + throws HoodieMetadataException { + if (!isColumnStatsIndexEnabled) { + LOG.error("Metadata column stats index is disabled!"); + return Collections.emptyMap(); + } + + Map> columnStatKeyToFileNameMap = new HashMap<>(); + TreeSet sortedKeys = new TreeSet<>(); + final ColumnIndexID columnIndexID = new ColumnIndexID(columnName); + for (Pair partitionNameFileNamePair : partitionNameFileNameList) { + final String columnStatsIndexKey = HoodieMetadataPayload.getColumnStatsIndexKey( + new PartitionIndexID(partitionNameFileNamePair.getLeft()), + new FileIndexID(partitionNameFileNamePair.getRight()), + columnIndexID); + sortedKeys.add(columnStatsIndexKey); + columnStatKeyToFileNameMap.put(columnStatsIndexKey, partitionNameFileNamePair); + } + + List columnStatKeys = new ArrayList<>(sortedKeys); + HoodieTimer timer = new HoodieTimer().startTimer(); + List>>> hoodieRecordList = + getRecordsByKeys(columnStatKeys, MetadataPartitionType.COLUMN_STATS.getPartitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, timer.endTimer())); + + Map, HoodieMetadataColumnStats> fileToColumnStatMap = new HashMap<>(); + for (final Pair>> entry : hoodieRecordList) { + if (entry.getRight().isPresent()) { + final Option columnStatMetadata = + entry.getRight().get().getData().getColumnStatMetadata(); + if (columnStatMetadata.isPresent()) { + if (!columnStatMetadata.get().getIsDeleted()) { + ValidationUtils.checkState(columnStatKeyToFileNameMap.containsKey(entry.getLeft())); + final Pair partitionFileNamePair = columnStatKeyToFileNameMap.get(entry.getLeft()); + ValidationUtils.checkState(!fileToColumnStatMap.containsKey(partitionFileNamePair)); + fileToColumnStatMap.put(partitionFileNamePair, columnStatMetadata.get()); + } + } else { + LOG.error("Meta index column stats missing for: " + entry.getLeft()); + } + } + } + return fileToColumnStatMap; + } + + /** + * Returns a list of all partitions. 
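+   * <p>
+   * The listing is served from the FILES partition of the metadata table: a single record stored
+   * under the {@code RECORDKEY_PARTITION_LIST} key carries the partition names, and a lone
+   * {@code NON_PARTITIONED_NAME} entry is mapped back to the empty partition path for
+   * non-partitioned tables, as the method body below shows.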
+ */ + protected List fetchAllPartitionPaths() { + HoodieTimer timer = new HoodieTimer().startTimer(); + Option> recordOpt = getRecordByKey(RECORDKEY_PARTITION_LIST, + MetadataPartitionType.FILES.getPartitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_PARTITIONS_STR, timer.endTimer())); + + List partitions = recordOpt.map(record -> { + HoodieMetadataPayload metadataPayload = record.getData(); + checkForSpuriousDeletes(metadataPayload, "\"all partitions\""); + + List relativePaths = metadataPayload.getFilenames(); + // Non-partitioned tables have a single empty partition + if (relativePaths.size() == 1 && relativePaths.get(0).equals(NON_PARTITIONED_NAME)) { + return Collections.singletonList(""); + } else { + return relativePaths; + } + }) + .orElse(Collections.emptyList()); + + LOG.info("Listed partitions from metadata: #partitions=" + partitions.size()); + return partitions; + } + + /** + * Return all the files from the partition. + * + * @param partitionPath The absolute path of the partition + */ + FileStatus[] fetchAllFilesInPartition(Path partitionPath) throws IOException { + String relativePartitionPath = FSUtils.getRelativePartitionPath(dataBasePath.get(), partitionPath); + String recordKey = relativePartitionPath.isEmpty() ? NON_PARTITIONED_NAME : relativePartitionPath; + + HoodieTimer timer = new HoodieTimer().startTimer(); + Option> recordOpt = getRecordByKey(recordKey, + MetadataPartitionType.FILES.getPartitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer())); + + FileStatus[] statuses = recordOpt.map(record -> { + HoodieMetadataPayload metadataPayload = record.getData(); + checkForSpuriousDeletes(metadataPayload, recordKey); + try { + return metadataPayload.getFileStatuses(hadoopConf.get(), partitionPath); + } catch (IOException e) { + throw new HoodieIOException("Failed to extract file-statuses from the payload", e); + } + }) + .orElse(new FileStatus[0]); + + LOG.info("Listed file in partition from metadata: partition=" + relativePartitionPath + ", #files=" + statuses.length); + return statuses; + } + + Map fetchAllFilesInPartitionPaths(List partitionPaths) throws IOException { + Map partitionIdToPathMap = + partitionPaths.parallelStream() + .collect( + Collectors.toMap(partitionPath -> { + String partitionId = FSUtils.getRelativePartitionPath(dataBasePath.get(), partitionPath); + return partitionId.isEmpty() ? 
NON_PARTITIONED_NAME : partitionId; + }, Function.identity()) + ); + + HoodieTimer timer = new HoodieTimer().startTimer(); + List>>> partitionIdRecordPairs = + getRecordsByKeys(new ArrayList<>(partitionIdToPathMap.keySet()), MetadataPartitionType.FILES.getPartitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer())); + + FileSystem fs = partitionPaths.get(0).getFileSystem(hadoopConf.get()); + + Map partitionPathToFilesMap = partitionIdRecordPairs.parallelStream() + .map(pair -> { + String partitionId = pair.getKey(); + Option> recordOpt = pair.getValue(); + + Path partitionPath = partitionIdToPathMap.get(partitionId); + + return recordOpt.map(record -> { + HoodieMetadataPayload metadataPayload = record.getData(); + checkForSpuriousDeletes(metadataPayload, partitionId); + + FileStatus[] files = metadataPayload.getFileStatuses(fs, partitionPath); + return Pair.of(partitionPath.toString(), files); + }) + .orElse(null); + }) + .filter(Objects::nonNull) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + + LOG.info("Listed files in partitions from metadata: partition list =" + Arrays.toString(partitionPaths.toArray())); + + return partitionPathToFilesMap; + } + + /** + * Handle spurious deletes. Depending on config, throw an exception or log a warn msg. + */ + private void checkForSpuriousDeletes(HoodieMetadataPayload metadataPayload, String partitionName) { + if (!metadataPayload.getDeletions().isEmpty()) { + if (metadataConfig.ignoreSpuriousDeletes()) { + LOG.warn("Metadata record for " + partitionName + " encountered some files to be deleted which was not added before. " + + "Ignoring the spurious deletes as the `" + HoodieMetadataConfig.IGNORE_SPURIOUS_DELETES.key() + "` config is set to true"); + } else { + throw new HoodieMetadataException("Metadata record for " + partitionName + " is inconsistent: " + + metadataPayload); + } + } + } + + protected abstract Option> getRecordByKey(String key, String partitionName); + + public abstract List>>> getRecordsByKeys(List key, String partitionName); + + protected HoodieEngineContext getEngineContext() { + return engineContext != null ? engineContext : new HoodieLocalEngineContext(hadoopConf.get()); + } + + public HoodieMetadataConfig getMetadataConfig() { + return metadataConfig; + } + + protected String getLatestDataInstantTime() { + return dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant() + .map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java new file mode 100644 index 0000000000000..bcfd891711a46 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieMetadataException; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.stream.Collectors; + +public class FileSystemBackedTableMetadata implements HoodieTableMetadata { + + private static final int DEFAULT_LISTING_PARALLELISM = 1500; + + private final transient HoodieEngineContext engineContext; + private final SerializableConfiguration hadoopConf; + private final String datasetBasePath; + private final boolean assumeDatePartitioning; + + public FileSystemBackedTableMetadata(HoodieEngineContext engineContext, SerializableConfiguration conf, String datasetBasePath, + boolean assumeDatePartitioning) { + this.engineContext = engineContext; + this.hadoopConf = conf; + this.datasetBasePath = datasetBasePath; + this.assumeDatePartitioning = assumeDatePartitioning; + } + + @Override + public FileStatus[] getAllFilesInPartition(Path partitionPath) throws IOException { + FileSystem fs = partitionPath.getFileSystem(hadoopConf.get()); + return FSUtils.getAllDataFilesInPartition(fs, partitionPath); + } + + @Override + public List getAllPartitionPaths() throws IOException { + Path basePath = new Path(datasetBasePath); + FileSystem fs = basePath.getFileSystem(hadoopConf.get()); + if (assumeDatePartitioning) { + return FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, datasetBasePath); + } + + List pathsToList = new CopyOnWriteArrayList<>(); + pathsToList.add(basePath); + List partitionPaths = new CopyOnWriteArrayList<>(); + + while (!pathsToList.isEmpty()) { + // TODO: Get the parallelism from HoodieWriteConfig + int listingParallelism = Math.min(DEFAULT_LISTING_PARALLELISM, pathsToList.size()); + + // List all directories in parallel + List dirToFileListing = engineContext.flatMap(pathsToList, path -> { + FileSystem fileSystem = path.getFileSystem(hadoopConf.get()); + return Arrays.stream(fileSystem.listStatus(path)); + }, listingParallelism); + pathsToList.clear(); + + // if current dictionary contains PartitionMetadata, add it to result + // if current dictionary does not contain PartitionMetadata, add it to queue to be processed. 
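+      // In other words, each listed FileStatus is classified in parallel below: a directory that
+      // already carries HoodiePartitionMetadata is emitted as a partition path; any other directory,
+      // except the HoodieTableMetaClient.METAFOLDER_NAME folder, is queued for the next level of this
+      // breadth-first walk; and a partition metafile seen directly marks its parent as a partition.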
+ int fileListingParallelism = Math.min(DEFAULT_LISTING_PARALLELISM, dirToFileListing.size()); + if (!dirToFileListing.isEmpty()) { + // result below holds a list of pair. first entry in the pair optionally holds the deduced list of partitions. + // and second entry holds optionally a directory path to be processed further. + List, Option>> result = engineContext.map(dirToFileListing, fileStatus -> { + FileSystem fileSystem = fileStatus.getPath().getFileSystem(hadoopConf.get()); + if (fileStatus.isDirectory()) { + if (HoodiePartitionMetadata.hasPartitionMetadata(fileSystem, fileStatus.getPath())) { + return Pair.of(Option.of(FSUtils.getRelativePartitionPath(new Path(datasetBasePath), fileStatus.getPath())), Option.empty()); + } else if (!fileStatus.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) { + return Pair.of(Option.empty(), Option.of(fileStatus.getPath())); + } + } else if (fileStatus.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) { + String partitionName = FSUtils.getRelativePartitionPath(new Path(datasetBasePath), fileStatus.getPath().getParent()); + return Pair.of(Option.of(partitionName), Option.empty()); + } + return Pair.of(Option.empty(), Option.empty()); + }, fileListingParallelism); + + partitionPaths.addAll(result.stream().filter(entry -> entry.getKey().isPresent()).map(entry -> entry.getKey().get()) + .collect(Collectors.toList())); + + pathsToList.addAll(result.stream().filter(entry -> entry.getValue().isPresent()).map(entry -> entry.getValue().get()) + .collect(Collectors.toList())); + } + } + return partitionPaths; + } + + @Override + public Map getAllFilesInPartitions(List partitionPaths) + throws IOException { + if (partitionPaths == null || partitionPaths.isEmpty()) { + return Collections.emptyMap(); + } + + int parallelism = Math.min(DEFAULT_LISTING_PARALLELISM, partitionPaths.size()); + + List> partitionToFiles = engineContext.map(partitionPaths, partitionPathStr -> { + Path partitionPath = new Path(partitionPathStr); + FileSystem fs = partitionPath.getFileSystem(hadoopConf.get()); + return Pair.of(partitionPathStr, FSUtils.getAllDataFilesInPartition(fs, partitionPath)); + }, parallelism); + + return partitionToFiles.stream().collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); + } + + @Override + public Option getSyncedInstantTime() { + throw new UnsupportedOperationException(); + } + + @Override + public Option getLatestCompactionTime() { + throw new UnsupportedOperationException(); + } + + @Override + public void close() throws Exception { + // no-op + } + + @Override + public void reset() { + // no-op + } + + public Option getBloomFilter(final String partitionName, final String fileName) + throws HoodieMetadataException { + throw new HoodieMetadataException("Unsupported operation: getBloomFilter for " + fileName); + } + + @Override + public Map, BloomFilter> getBloomFilters(final List> partitionNameFileNameList) + throws HoodieMetadataException { + throw new HoodieMetadataException("Unsupported operation: getBloomFilters!"); + } + + @Override + public Map, HoodieMetadataColumnStats> getColumnStats(final List> partitionNameFileNameList, final String columnName) + throws HoodieMetadataException { + throw new HoodieMetadataException("Unsupported operation: getColumnsStats!"); + } + + @Override + public HoodieData> getRecordsByKeyPrefixes(List keyPrefixes, String partitionName, boolean shouldLoadInMemory) { + throw new HoodieMetadataException("Unsupported operation: getRecordsByKeyPrefixes!"); + } 
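+
+  // Illustrative usage (a sketch only, not exercised by this patch): this fallback implementation
+  // can be constructed directly when the metadata table is unavailable, e.g.
+  //
+  //   HoodieEngineContext engineContext = new HoodieLocalEngineContext(hadoopConf);
+  //   HoodieTableMetadata metadata = new FileSystemBackedTableMetadata(
+  //       engineContext, new SerializableConfiguration(hadoopConf), basePath, false);
+  //   List<String> partitions = metadata.getAllPartitionPaths();
+  //
+  // where hadoopConf is an org.apache.hadoop.conf.Configuration and basePath is the table base
+  // path; getAllPartitionPaths() may throw IOException.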
+} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java new file mode 100644 index 0000000000000..187791558a24a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java @@ -0,0 +1,658 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.function.SerializableFunction; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SpillableMapUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.exception.TableNotFoundException; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import 
java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.CollectionUtils.isNullOrEmpty; +import static org.apache.hudi.common.util.CollectionUtils.toStream; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_FILES; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getFileSystemView; + +/** + * Table metadata provided by an internal DFS backed Hudi metadata table. + */ +public class HoodieBackedTableMetadata extends BaseTableMetadata { + + private static final Logger LOG = LogManager.getLogger(HoodieBackedTableMetadata.class); + + private static final Schema METADATA_RECORD_SCHEMA = HoodieMetadataRecord.getClassSchema(); + + private String metadataBasePath; + // Metadata table's timeline and metaclient + private HoodieTableMetaClient metadataMetaClient; + private HoodieTableConfig metadataTableConfig; + private HoodieTableFileSystemView metadataFileSystemView; + // should we reuse the open file handles, across calls + private final boolean reuse; + + // Readers for the latest file slice corresponding to file groups in the metadata partition + private Map, Pair> partitionReaders = + new ConcurrentHashMap<>(); + + public HoodieBackedTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, + String datasetBasePath, String spillableMapDirectory) { + this(engineContext, metadataConfig, datasetBasePath, spillableMapDirectory, false); + } + + public HoodieBackedTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, + String datasetBasePath, String spillableMapDirectory, boolean reuse) { + super(engineContext, metadataConfig, datasetBasePath, spillableMapDirectory); + this.reuse = reuse; + initIfNeeded(); + } + + private void initIfNeeded() { + this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(dataBasePath.toString()); + if (!isMetadataTableEnabled) { + if (!HoodieTableMetadata.isMetadataTable(metadataBasePath)) { + LOG.info("Metadata table is disabled."); + } + } else if (this.metadataMetaClient == null) { + try { + this.metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(metadataBasePath).build(); + this.metadataFileSystemView = getFileSystemView(metadataMetaClient); + this.metadataTableConfig = metadataMetaClient.getTableConfig(); + this.isBloomFilterIndexEnabled = metadataConfig.isBloomFilterIndexEnabled(); + this.isColumnStatsIndexEnabled = metadataConfig.isColumnStatsIndexEnabled(); + } catch (TableNotFoundException e) { + LOG.warn("Metadata table was not found at path " + metadataBasePath); + this.isMetadataTableEnabled = false; + this.metadataMetaClient = null; + this.metadataFileSystemView = null; + this.metadataTableConfig = null; + } catch (Exception e) { + LOG.error("Failed to initialize metadata table at path " + metadataBasePath, e); + this.isMetadataTableEnabled = false; + this.metadataMetaClient = null; + this.metadataFileSystemView = null; + this.metadataTableConfig = null; + } + } + } + + @Override + protected Option> getRecordByKey(String key, String partitionName) { + List>>> recordsByKeys = 
getRecordsByKeys(Collections.singletonList(key), partitionName); + return recordsByKeys.size() == 0 ? Option.empty() : recordsByKeys.get(0).getValue(); + } + + @Override + public HoodieData> getRecordsByKeyPrefixes(List keyPrefixes, + String partitionName, + boolean shouldLoadInMemory) { + // Sort the columns so that keys are looked up in order + List sortedKeyPrefixes = new ArrayList<>(keyPrefixes); + Collections.sort(sortedKeyPrefixes); + + // NOTE: Since we partition records to a particular file-group by full key, we will have + // to scan all file-groups for all key-prefixes as each of these might contain some + // records matching the key-prefix + List partitionFileSlices = + HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices( + metadataMetaClient, metadataFileSystemView, partitionName); + + return (shouldLoadInMemory ? HoodieListData.lazy(partitionFileSlices) : engineContext.parallelize(partitionFileSlices)) + .flatMap((SerializableFunction>>) fileSlice -> { + // NOTE: Since this will be executed by executors, we can't access previously cached + // readers, and therefore have to always open new ones + Pair readers = + openReaders(partitionName, fileSlice); + + try { + List timings = new ArrayList<>(); + + HoodieFileReader baseFileReader = readers.getKey(); + HoodieMetadataMergedLogRecordReader logRecordScanner = readers.getRight(); + + if (baseFileReader == null && logRecordScanner == null) { + // TODO: what do we do if both does not exist? should we throw an exception and let caller do the fallback ? + return Collections.emptyIterator(); + } + + boolean fullKeys = false; + + Map>> logRecords = + readLogRecords(logRecordScanner, sortedKeyPrefixes, fullKeys, timings); + + List>>> mergedRecords = + readFromBaseAndMergeWithLogRecords(baseFileReader, sortedKeyPrefixes, fullKeys, logRecords, timings, partitionName); + + LOG.debug(String.format("Metadata read for %s keys took [baseFileRead, logMerge] %s ms", + sortedKeyPrefixes.size(), timings)); + + return mergedRecords.stream() + .map(keyRecordPair -> keyRecordPair.getValue().orElse(null)) + .iterator(); + } catch (IOException ioe) { + throw new HoodieIOException("Error merging records from metadata table for " + sortedKeyPrefixes.size() + " key : ", ioe); + } finally { + closeReader(readers); + } + }) + .filter(Objects::nonNull); + } + + @Override + public List>>> getRecordsByKeys(List keys, + String partitionName) { + // Sort the columns so that keys are looked up in order + List sortedKeys = new ArrayList<>(keys); + Collections.sort(sortedKeys); + Map, List> partitionFileSliceToKeysMap = getPartitionFileSliceToKeysMapping(partitionName, sortedKeys); + List>>> result = new ArrayList<>(); + AtomicInteger fileSlicesKeysCount = new AtomicInteger(); + partitionFileSliceToKeysMap.forEach((partitionFileSlicePair, fileSliceKeys) -> { + Pair readers = + getOrCreateReaders(partitionName, partitionFileSlicePair.getRight()); + try { + List timings = new ArrayList<>(); + HoodieFileReader baseFileReader = readers.getKey(); + HoodieMetadataMergedLogRecordReader logRecordScanner = readers.getRight(); + if (baseFileReader == null && logRecordScanner == null) { + return; + } + + boolean fullKeys = true; + Map>> logRecords = + readLogRecords(logRecordScanner, fileSliceKeys, fullKeys, timings); + + result.addAll(readFromBaseAndMergeWithLogRecords(baseFileReader, fileSliceKeys, fullKeys, logRecords, + timings, partitionName)); + + LOG.debug(String.format("Metadata read for %s keys took [baseFileRead, logMerge] %s ms", + fileSliceKeys.size(), 
timings)); + fileSlicesKeysCount.addAndGet(fileSliceKeys.size()); + } catch (IOException ioe) { + throw new HoodieIOException("Error merging records from metadata table for " + sortedKeys.size() + " key : ", ioe); + } finally { + if (!reuse) { + closeReader(readers); + } + } + }); + + return result; + } + + private Map>> readLogRecords(HoodieMetadataMergedLogRecordReader logRecordScanner, + List keys, + boolean fullKey, + List timings) { + HoodieTimer timer = new HoodieTimer().startTimer(); + timer.startTimer(); + + if (logRecordScanner == null) { + timings.add(timer.endTimer()); + return Collections.emptyMap(); + } + + String partitionName = logRecordScanner.getPartitionName().get(); + + Map>> logRecords = new HashMap<>(); + if (isFullScanAllowedForPartition(partitionName)) { + checkArgument(fullKey, "If full-scan is required, only full keys could be used!"); + // Path which does full scan of log files + for (String key : keys) { + logRecords.put(key, logRecordScanner.getRecordByKey(key).get(0).getValue()); + } + } else { + // This path will do seeks pertaining to the keys passed in + List>>> logRecordsList = + fullKey ? logRecordScanner.getRecordsByKeys(keys) + : logRecordScanner.getRecordsByKeyPrefixes(keys) + .stream() + .map(record -> Pair.of(record.getRecordKey(), Option.of(record))) + .collect(Collectors.toList()); + + for (Pair>> entry : logRecordsList) { + logRecords.put(entry.getKey(), entry.getValue()); + } + } + + timings.add(timer.endTimer()); + return logRecords; + } + + private List>>> readFromBaseAndMergeWithLogRecords(HoodieFileReader baseFileReader, + List keys, + boolean fullKeys, + Map>> logRecords, + List timings, + String partitionName) throws IOException { + HoodieTimer timer = new HoodieTimer().startTimer(); + timer.startTimer(); + + if (baseFileReader == null) { + // No base file at all + timings.add(timer.endTimer()); + if (fullKeys) { + // In case full-keys (not key-prefixes) were provided, it's expected that the list of + // records will contain an (optional) entry for each corresponding key + return keys.stream() + .map(key -> Pair.of(key, logRecords.getOrDefault(key, Option.empty()))) + .collect(Collectors.toList()); + } else { + return logRecords.entrySet().stream() + .map(entry -> Pair.of(entry.getKey(), entry.getValue())) + .collect(Collectors.toList()); + } + } + + HoodieTimer readTimer = new HoodieTimer(); + readTimer.startTimer(); + + Map> records = + fetchBaseFileRecordsByKeys(baseFileReader, keys, fullKeys, partitionName); + + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer())); + + // Iterate over all provided log-records, merging them into existing records + for (Option> logRecordOpt : logRecords.values()) { + if (logRecordOpt.isPresent()) { + HoodieRecord logRecord = logRecordOpt.get(); + records.merge( + logRecord.getRecordKey(), + logRecord, + (oldRecord, newRecord) -> + new HoodieAvroRecord<>(oldRecord.getKey(), newRecord.getData().preCombine(oldRecord.getData())) + ); + } + } + + timings.add(timer.endTimer()); + + if (fullKeys) { + // In case full-keys (not key-prefixes) were provided, it's expected that the list of + // records will contain an (optional) entry for each corresponding key + return keys.stream() + .map(key -> Pair.of(key, Option.ofNullable(records.get(key)))) + .collect(Collectors.toList()); + } else { + return records.values().stream() + .map(record -> Pair.of(record.getRecordKey(), Option.of(record))) + .collect(Collectors.toList()); + } + } + + private Map> 
fetchBaseFileRecordsByKeys(HoodieFileReader baseFileReader, + List keys, + boolean fullKeys, + String partitionName) throws IOException { + ClosableIterator records = fullKeys ? baseFileReader.getRecordsByKeysIterator(keys) + : baseFileReader.getRecordsByKeyPrefixIterator(keys); + + return toStream(records) + .map(record -> Pair.of( + (String) record.get(HoodieMetadataPayload.KEY_FIELD_NAME), + composeRecord(record, partitionName))) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + } + + private HoodieRecord composeRecord(GenericRecord avroRecord, String partitionName) { + if (metadataTableConfig.populateMetaFields()) { + return SpillableMapUtils.convertToHoodieRecordPayload(avroRecord, + metadataTableConfig.getPayloadClass(), metadataTableConfig.getPreCombineField(), false); + } + return SpillableMapUtils.convertToHoodieRecordPayload(avroRecord, + metadataTableConfig.getPayloadClass(), metadataTableConfig.getPreCombineField(), + Pair.of(metadataTableConfig.getRecordKeyFieldProp(), metadataTableConfig.getPartitionFieldProp()), + false, Option.of(partitionName)); + } + + /** + * Get the latest file slices for the interested keys in a given partition. + * + * @param partitionName - Partition to get the file slices from + * @param keys - Interested keys + * @return FileSlices for the keys + */ + private Map, List> getPartitionFileSliceToKeysMapping(final String partitionName, final List keys) { + // Metadata is in sync till the latest completed instant on the dataset + List latestFileSlices = + HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices( + metadataMetaClient, metadataFileSystemView, partitionName); + + Map, List> partitionFileSliceToKeysMap = new HashMap<>(); + for (String key : keys) { + if (!isNullOrEmpty(latestFileSlices)) { + final FileSlice slice = latestFileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(key, + latestFileSlices.size())); + final Pair partitionNameFileSlicePair = Pair.of(partitionName, slice); + partitionFileSliceToKeysMap.computeIfAbsent(partitionNameFileSlicePair, k -> new ArrayList<>()).add(key); + } + } + return partitionFileSliceToKeysMap; + } + + /** + * Create a file reader and the record scanner for a given partition and file slice + * if readers are not already available. 
+ * + * @param partitionName - Partition name + * @param slice - The file slice to open readers for + * @return File reader and the record scanner pair for the requested file slice + */ + private Pair getOrCreateReaders(String partitionName, FileSlice slice) { + if (reuse) { + return partitionReaders.computeIfAbsent(Pair.of(partitionName, slice.getFileId()), k -> { + return openReaders(partitionName, slice); }); + } else { + return openReaders(partitionName, slice); + } + } + + private Pair openReaders(String partitionName, FileSlice slice) { + try { + HoodieTimer timer = new HoodieTimer().startTimer(); + // Open base file reader + Pair baseFileReaderOpenTimePair = getBaseFileReader(slice, timer); + HoodieFileReader baseFileReader = baseFileReaderOpenTimePair.getKey(); + final long baseFileOpenMs = baseFileReaderOpenTimePair.getValue(); + + // Open the log record scanner using the log files from the latest file slice + List logFiles = slice.getLogFiles().collect(Collectors.toList()); + Pair logRecordScannerOpenTimePair = + getLogRecordScanner(logFiles, partitionName, Option.empty()); + HoodieMetadataMergedLogRecordReader logRecordScanner = logRecordScannerOpenTimePair.getKey(); + final long logScannerOpenMs = logRecordScannerOpenTimePair.getValue(); + + metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, + +baseFileOpenMs + logScannerOpenMs)); + return Pair.of(baseFileReader, logRecordScanner); + } catch (IOException e) { + throw new HoodieIOException("Error opening readers for metadata table partition " + partitionName, e); + } + } + + private Pair getBaseFileReader(FileSlice slice, HoodieTimer timer) throws IOException { + HoodieFileReader baseFileReader = null; + Long baseFileOpenMs; + // If the base file is present then create a reader + Option basefile = slice.getBaseFile(); + if (basefile.isPresent()) { + String basefilePath = basefile.get().getPath(); + baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath)); + baseFileOpenMs = timer.endTimer(); + LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath, + basefile.get().getCommitTime(), baseFileOpenMs)); + } else { + baseFileOpenMs = 0L; + timer.endTimer(); + } + return Pair.of(baseFileReader, baseFileOpenMs); + } + + private Set getValidInstantTimestamps() { + // Only those log files which have a corresponding completed instant on the dataset should be read + // This is because the metadata table is updated before the dataset instants are committed. + HoodieActiveTimeline datasetTimeline = dataMetaClient.getActiveTimeline(); + Set validInstantTimestamps = datasetTimeline.filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); + + // For any rollbacks and restores, we cannot neglect the instants that they are rolling back. + // The rollback instant should be more recent than the start of the timeline for it to have rolled back any + // instant which we have a log block for. + final String earliestInstantTime = validInstantTimestamps.isEmpty() ? 
SOLO_COMMIT_TIMESTAMP : Collections.min(validInstantTimestamps); + datasetTimeline.getRollbackAndRestoreTimeline().filterCompletedInstants().getInstants() + .filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN, earliestInstantTime)) + .forEach(instant -> { + validInstantTimestamps.addAll(getRollbackedCommits(instant, datasetTimeline)); + }); + + // SOLO_COMMIT_TIMESTAMP is used during bootstrap so it is a valid timestamp + validInstantTimestamps.add(SOLO_COMMIT_TIMESTAMP); + return validInstantTimestamps; + } + + public Pair getLogRecordScanner(List logFiles, + String partitionName, + Option allowFullScanOverride) { + HoodieTimer timer = new HoodieTimer().startTimer(); + List sortedLogFilePaths = logFiles.stream() + .sorted(HoodieLogFile.getLogFileComparator()) + .map(o -> o.getPath().toString()) + .collect(Collectors.toList()); + + // Only those log files which have a corresponding completed instant on the dataset should be read + // This is because the metadata table is updated before the dataset instants are committed. + Set validInstantTimestamps = getValidInstantTimestamps(); + + Option latestMetadataInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant(); + String latestMetadataInstantTime = latestMetadataInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP); + + boolean allowFullScan = allowFullScanOverride.orElseGet(() -> isFullScanAllowedForPartition(partitionName)); + + // Load the schema + Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema()); + HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build(); + HoodieMetadataMergedLogRecordReader logRecordScanner = HoodieMetadataMergedLogRecordReader.newBuilder() + .withFileSystem(metadataMetaClient.getFs()) + .withBasePath(metadataBasePath) + .withLogFilePaths(sortedLogFilePaths) + .withReaderSchema(schema) + .withLatestInstantTime(latestMetadataInstantTime) + .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES) + .withBufferSize(BUFFER_SIZE) + .withSpillableMapBasePath(spillableMapDirectory) + .withDiskMapType(commonConfig.getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled()) + .withLogBlockTimestamps(validInstantTimestamps) + .allowFullScan(allowFullScan) + .withPartition(partitionName) + .build(); + + Long logScannerOpenMs = timer.endTimer(); + LOG.info(String.format("Opened %d metadata log files (dataset instant=%s, metadata instant=%s) in %d ms", + sortedLogFilePaths.size(), getLatestDataInstantTime(), latestMetadataInstantTime, logScannerOpenMs)); + return Pair.of(logRecordScanner, logScannerOpenMs); + } + + // NOTE: We're allowing eager full-scan of the log-files only for "files" partition. + // Other partitions (like "column_stats", "bloom_filters") will have to be fetched + // t/h point-lookups + private boolean isFullScanAllowedForPartition(String partitionName) { + switch (partitionName) { + case PARTITION_NAME_FILES: + return metadataConfig.allowFullScan(); + + case PARTITION_NAME_COLUMN_STATS: + case PARTITION_NAME_BLOOM_FILTERS: + default: + return false; + } + } + + /** + * Returns a list of commits which were rolled back as part of a Rollback or Restore operation. + * + * @param instant The Rollback operation to read + * @param timeline instant of timeline from dataset. 
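Keys are routed to metadata file groups by hashing, as getPartitionFileSliceToKeysMapping earlier in this class does with HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex. The standalone sketch below only illustrates that hash-and-modulo bucketing; the class name, hash function, and helper names are illustrative and are not Hudi's actual implementation.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class KeyToFileGroupShardingSketch {

  // Illustrative stand-in for mapRecordKeyToFileGroupIndex: a stable hash of the
  // key reduced modulo the number of latest file slices in the partition.
  static int fileGroupIndex(String key, int fileGroupCount) {
    int h = 0;
    for (int i = 0; i < key.length(); i++) {
      h = 31 * h + key.charAt(i);
    }
    return Math.floorMod(h, fileGroupCount);
  }

  // Group the requested keys by owning file group so each (base file + log files)
  // pair is opened once and probed only for its own keys.
  static Map<Integer, List<String>> groupKeysByFileGroup(List<String> keys, int fileGroupCount) {
    Map<Integer, List<String>> grouped = new HashMap<>();
    for (String key : keys) {
      grouped.computeIfAbsent(fileGroupIndex(key, fileGroupCount), i -> new ArrayList<>()).add(key);
    }
    return grouped;
  }

  public static void main(String[] args) {
    List<String> keys = Arrays.asList("2020/01/01", "2020/01/02", "__all_partitions__");
    System.out.println(groupKeysByFileGroup(keys, 4));
  }
}

Because the mapping depends only on the key and the file-group count, readers and writers can agree on which file group to probe without any extra lookup.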
+ */ + private List getRollbackedCommits(HoodieInstant instant, HoodieActiveTimeline timeline) { + try { + if (instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) { + HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata( + timeline.getInstantDetails(instant).get()); + return rollbackMetadata.getCommitsRollback(); + } + + List rollbackedCommits = new LinkedList<>(); + if (instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)) { + // Restore is made up of several rollbacks + HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeHoodieRestoreMetadata( + timeline.getInstantDetails(instant).get()); + restoreMetadata.getHoodieRestoreMetadata().values().forEach(rms -> { + rms.forEach(rm -> rollbackedCommits.addAll(rm.getCommitsRollback())); + }); + } + return rollbackedCommits; + } catch (IOException e) { + throw new HoodieMetadataException("Error retrieving rollback commits for instant " + instant, e); + } + } + + @Override + public void close() { + closePartitionReaders(); + } + + /** + * Close the file reader and the record scanner for the given file slice. + * + * @param partitionFileSlicePair - Partition and FileSlice + */ + private synchronized void close(Pair partitionFileSlicePair) { + Pair readers = + partitionReaders.remove(partitionFileSlicePair); + closeReader(readers); + } + + /** + * Close and clear all the partitions readers. + */ + private void closePartitionReaders() { + for (Pair partitionFileSlicePair : partitionReaders.keySet()) { + close(partitionFileSlicePair); + } + partitionReaders.clear(); + } + + private void closeReader(Pair readers) { + if (readers != null) { + try { + if (readers.getKey() != null) { + readers.getKey().close(); + } + if (readers.getValue() != null) { + readers.getValue().close(); + } + } catch (Exception e) { + throw new HoodieException("Error closing resources during metadata table merge", e); + } + } + } + + public boolean enabled() { + return isMetadataTableEnabled; + } + + public SerializableConfiguration getHadoopConf() { + return hadoopConf; + } + + public HoodieTableMetaClient getMetadataMetaClient() { + return metadataMetaClient; + } + + public Map stats() { + return metrics.map(m -> m.getStats(true, metadataMetaClient, this)).orElse(new HashMap<>()); + } + + @Override + public Option getSyncedInstantTime() { + if (metadataMetaClient != null) { + Option latestInstant = metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant(); + if (latestInstant.isPresent()) { + return Option.of(latestInstant.get().getTimestamp()); + } + } + return Option.empty(); + } + + @Override + public Option getLatestCompactionTime() { + if (metadataMetaClient != null) { + Option latestCompaction = metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant(); + if (latestCompaction.isPresent()) { + return Option.of(latestCompaction.get().getTimestamp()); + } + } + return Option.empty(); + } + + @Override + public void reset() { + initIfNeeded(); + dataMetaClient.reloadActiveTimeline(); + if (metadataMetaClient != null) { + metadataMetaClient.reloadActiveTimeline(); + metadataFileSystemView = getFileSystemView(metadataMetaClient); + } + // the cached reader has max instant time restriction, they should be cleared + // because the metadata timeline may have changed. 
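When reuse is enabled, getOrCreateReaders above caches one reader pair per (partition, fileId) via ConcurrentHashMap.computeIfAbsent, and reset() and close() tear that cache down because a cached reader pins an older view of the metadata timeline. A simplified standalone sketch of the cache-and-invalidate pattern follows; the Reader type and open() call are placeholders, not Hudi APIs.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class ReaderCacheSketch {

  // Placeholder for the per-file-slice reader pair held by HoodieBackedTableMetadata.
  interface Reader extends AutoCloseable {
    @Override
    void close();
  }

  private final boolean reuse;
  private final Map<String, Reader> cache = new ConcurrentHashMap<>();

  ReaderCacheSketch(boolean reuse) {
    this.reuse = reuse;
  }

  // Mirrors getOrCreateReaders: only cache when reuse is enabled.
  Reader getOrCreate(String partitionAndFileId) {
    return reuse ? cache.computeIfAbsent(partitionAndFileId, k -> open(k)) : open(partitionAndFileId);
  }

  // Mirrors closePartitionReaders: close everything and forget it, e.g. on reset().
  void invalidateAll() {
    cache.values().forEach(Reader::close);
    cache.clear();
  }

  private Reader open(String key) {
    // Placeholder for opening a base-file reader and a log-record scanner.
    return () -> { };
  }
}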
+ closePartitionReaders(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataFileSystemView.java new file mode 100644 index 0000000000000..ab5b5f6b4db82 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataFileSystemView.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.exception.HoodieException; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +/** + * {@code HoodieTableFileSystemView} implementation that retrieved partition listings from the Metadata Table. + */ +public class HoodieMetadataFileSystemView extends HoodieTableFileSystemView { + + private final HoodieTableMetadata tableMetadata; + + public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient, + HoodieTimeline visibleActiveTimeline, + HoodieTableMetadata tableMetadata) { + super(metaClient, visibleActiveTimeline); + this.tableMetadata = tableMetadata; + } + + public HoodieMetadataFileSystemView(HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, + HoodieTimeline visibleActiveTimeline, + HoodieMetadataConfig metadataConfig) { + super(metaClient, visibleActiveTimeline); + this.tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, metaClient.getBasePath(), + FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue(), true); + } + + /** + * Return all the files in the partition by reading from the Metadata Table. 
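For orientation, here is a hedged usage sketch of the HoodieMetadataFileSystemView defined above: it is built like any HoodieTableFileSystemView but serves listPartition() from the metadata table. The HoodieLocalEngineContext and HoodieMetadataConfig.newBuilder() calls come from the wider Hudi codebase rather than this diff, and the path and partition name are made up, so treat the whole block as illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.metadata.HoodieMetadataFileSystemView;

public class MetadataViewUsageSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(conf)
        .setBasePath("/tmp/hudi_table") // illustrative path
        .build();

    // Assumed config builder; enables metadata-table based file listings.
    HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).build();

    HoodieMetadataFileSystemView view = new HoodieMetadataFileSystemView(
        new HoodieLocalEngineContext(conf),
        metaClient,
        metaClient.getActiveTimeline().filterCompletedInstants(),
        metadataConfig);

    // Partition listings now come from the metadata table's "files" partition.
    view.getLatestBaseFiles("2020/01/01").forEach(bf -> System.out.println(bf.getPath()));
    view.close();
  }
}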
+ * + * @param partitionPath The absolute path of the partition + * @throws IOException + */ + @Override + protected FileStatus[] listPartition(Path partitionPath) throws IOException { + return tableMetadata.getAllFilesInPartition(partitionPath); + } + + @Override + public void reset() { + super.reset(); + tableMetadata.reset(); + } + + @Override + public void sync() { + super.sync(); + tableMetadata.reset(); + } + + @Override + public void close() { + try { + tableMetadata.close(); + } catch (Exception e) { + throw new HoodieException("Error closing metadata file system view.", e); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java new file mode 100644 index 0000000000000..cbd7e6c17511c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.log.InstantRange; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SpillableMapUtils; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; + +/** + * A {@code HoodieMergedLogRecordScanner} implementation which only merged records matching providing keys. This is + * useful in limiting memory usage when only a small subset of updates records are to be read. 
+ */ +public class HoodieMetadataMergedLogRecordReader extends HoodieMergedLogRecordScanner { + + private static final Logger LOG = LogManager.getLogger(HoodieMetadataMergedLogRecordReader.class); + + private HoodieMetadataMergedLogRecordReader(FileSystem fs, String basePath, String partitionName, + List logFilePaths, + Schema readerSchema, String latestInstantTime, + Long maxMemorySizeInBytes, int bufferSize, + String spillableMapBasePath, + ExternalSpillableMap.DiskMapType diskMapType, + boolean isBitCaskDiskMapCompressionEnabled, + Option instantRange, boolean allowFullScan) { + super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, true, false, bufferSize, + spillableMapBasePath, instantRange, diskMapType, isBitCaskDiskMapCompressionEnabled, false, allowFullScan, Option.of(partitionName), InternalSchema.getEmptyInternalSchema()); + } + + @Override + protected HoodieAvroRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, + final String payloadClassFQN, final String preCombineField, + final boolean withOperationField, + final Option> simpleKeyGenFields, + final Option partitionName) { + if (hoodieTableConfig.populateMetaFields()) { + return super.createHoodieRecord(rec, hoodieTableConfig, payloadClassFQN, preCombineField, withOperationField, + simpleKeyGenFields, partitionName); + } + + // When meta fields are not available, create the record using the + // preset key field and the known partition name + return SpillableMapUtils.convertToHoodieRecordPayload((GenericRecord) rec, payloadClassFQN, + preCombineField, simpleKeyGenFields.get(), withOperationField, partitionName); + } + + /** + * Returns the builder for {@code HoodieMetadataMergedLogRecordScanner}. + */ + public static HoodieMetadataMergedLogRecordReader.Builder newBuilder() { + return new HoodieMetadataMergedLogRecordReader.Builder(); + } + + /** + * Retrieve a record given its key. + * + * @param key Key of the record to retrieve + * @return {@code HoodieRecord} if key was found else {@code Option.empty()} + */ + public synchronized List>>> getRecordByKey(String key) { + checkState(forceFullScan, "Record reader has to be in full-scan mode to use this API"); + return Collections.singletonList(Pair.of(key, Option.ofNullable((HoodieRecord) records.get(key)))); + } + + @SuppressWarnings("unchecked") + public List> getRecordsByKeyPrefixes(List keyPrefixes) { + // Following operations have to be atomic, otherwise concurrent + // readers would race with each other and could crash when + // processing log block records as part of scan. + synchronized (this) { + records.clear(); + scanInternal(Option.of(new KeySpec(keyPrefixes, false))); + return records.values().stream() + .filter(Objects::nonNull) + .map(record -> (HoodieRecord) record) + .collect(Collectors.toList()); + } + } + + @SuppressWarnings("unchecked") + public synchronized List>>> getRecordsByKeys(List keys) { + // Following operations have to be atomic, otherwise concurrent + // readers would race with each other and could crash when + // processing log block records as part of scan. + synchronized (this) { + records.clear(); + scan(keys); + return keys.stream() + .map(key -> Pair.of(key, Option.ofNullable((HoodieRecord) records.get(key)))) + .collect(Collectors.toList()); + } + } + + @Override + protected String getKeyField() { + return HoodieMetadataPayload.KEY_FIELD_NAME; + } + + /** + * Builder used to build {@code HoodieMetadataMergedLogRecordScanner}. 
+ */ + public static class Builder extends HoodieMergedLogRecordScanner.Builder { + private boolean allowFullScan = HoodieMetadataConfig.ENABLE_FULL_SCAN_LOG_FILES.defaultValue(); + + @Override + public Builder withFileSystem(FileSystem fs) { + this.fs = fs; + return this; + } + + @Override + public Builder withBasePath(String basePath) { + this.basePath = basePath; + return this; + } + + @Override + public Builder withLogFilePaths(List logFilePaths) { + this.logFilePaths = logFilePaths; + return this; + } + + @Override + public Builder withReaderSchema(Schema schema) { + this.readerSchema = schema; + return this; + } + + @Override + public Builder withLatestInstantTime(String latestInstantTime) { + this.latestInstantTime = latestInstantTime; + return this; + } + + @Override + public Builder withReadBlocksLazily(boolean readBlocksLazily) { + throw new UnsupportedOperationException(); + } + + @Override + public Builder withReverseReader(boolean reverseReader) { + throw new UnsupportedOperationException(); + } + + @Override + public Builder withBufferSize(int bufferSize) { + this.bufferSize = bufferSize; + return this; + } + + @Override + public Builder withPartition(String partitionName) { + this.partitionName = partitionName; + return this; + } + + @Override + public Builder withMaxMemorySizeInBytes(Long maxMemorySizeInBytes) { + this.maxMemorySizeInBytes = maxMemorySizeInBytes; + return this; + } + + @Override + public Builder withSpillableMapBasePath(String spillableMapBasePath) { + this.spillableMapBasePath = spillableMapBasePath; + return this; + } + + @Override + public Builder withDiskMapType(ExternalSpillableMap.DiskMapType diskMapType) { + this.diskMapType = diskMapType; + return this; + } + + @Override + public Builder withBitCaskDiskMapCompressionEnabled(boolean isBitCaskDiskMapCompressionEnabled) { + this.isBitCaskDiskMapCompressionEnabled = isBitCaskDiskMapCompressionEnabled; + return this; + } + + public Builder withLogBlockTimestamps(Set validLogBlockTimestamps) { + withInstantRange(Option.of(new ExplicitMatchRange(validLogBlockTimestamps))); + return this; + } + + public Builder allowFullScan(boolean enableFullScan) { + this.allowFullScan = enableFullScan; + return this; + } + + @Override + public HoodieMetadataMergedLogRecordReader build() { + return new HoodieMetadataMergedLogRecordReader(fs, basePath, partitionName, logFilePaths, readerSchema, + latestInstantTime, maxMemorySizeInBytes, bufferSize, spillableMapBasePath, + diskMapType, isBitCaskDiskMapCompressionEnabled, instantRange, allowFullScan); + } + } + + /** + * Class to assist in checking if an instant is part of a set of instants. + */ + private static class ExplicitMatchRange extends InstantRange { + Set instants; + + public ExplicitMatchRange(Set instants) { + super(Collections.min(instants), Collections.max(instants)); + this.instants = instants; + } + + @Override + public boolean isInRange(String instant) { + return this.instants.contains(instant); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java new file mode 100644 index 0000000000000..fe8612c42e802 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.metrics.Registry; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class HoodieMetadataMetrics implements Serializable { + + // Metric names + public static final String LOOKUP_PARTITIONS_STR = "lookup_partitions"; + public static final String LOOKUP_FILES_STR = "lookup_files"; + public static final String LOOKUP_BLOOM_FILTERS_METADATA_STR = "lookup_meta_index_bloom_filters"; + public static final String LOOKUP_COLUMN_STATS_METADATA_STR = "lookup_meta_index_column_ranges"; + public static final String SCAN_STR = "scan"; + public static final String BASEFILE_READ_STR = "basefile_read"; + public static final String INITIALIZE_STR = "initialize"; + public static final String REBOOTSTRAP_STR = "rebootstrap"; + public static final String BOOTSTRAP_ERR_STR = "bootstrap_error"; + + // Stats names + public static final String STAT_TOTAL_BASE_FILE_SIZE = "totalBaseFileSizeInBytes"; + public static final String STAT_TOTAL_LOG_FILE_SIZE = "totalLogFileSizeInBytes"; + public static final String STAT_COUNT_BASE_FILES = "baseFileCount"; + public static final String STAT_COUNT_LOG_FILES = "logFileCount"; + public static final String STAT_COUNT_PARTITION = "partitionCount"; + public static final String STAT_LAST_COMPACTION_TIMESTAMP = "lastCompactionTimestamp"; + + private static final Logger LOG = LogManager.getLogger(HoodieMetadataMetrics.class); + + private final Registry metricsRegistry; + + public HoodieMetadataMetrics(Registry metricsRegistry) { + this.metricsRegistry = metricsRegistry; + } + + public Map getStats(boolean detailed, HoodieTableMetaClient metaClient, HoodieTableMetadata metadata) { + try { + metaClient.reloadActiveTimeline(); + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline()); + return getStats(fsView, detailed, metadata); + } catch (IOException ioe) { + throw new HoodieIOException("Unable to get metadata stats.", ioe); + } + } + + private Map getStats(HoodieTableFileSystemView fsView, boolean detailed, HoodieTableMetadata tableMetadata) throws IOException { + Map stats = new HashMap<>(); + + // Total size of the metadata and count of base/log files + for (String metadataPartition : MetadataPartitionType.allPaths()) { + List latestSlices = 
fsView.getLatestFileSlices(metadataPartition).collect(Collectors.toList()); + + // Total size of the metadata and count of base/log files + long totalBaseFileSizeInBytes = 0; + long totalLogFileSizeInBytes = 0; + int baseFileCount = 0; + int logFileCount = 0; + + for (FileSlice slice : latestSlices) { + if (slice.getBaseFile().isPresent()) { + totalBaseFileSizeInBytes += slice.getBaseFile().get().getFileStatus().getLen(); + ++baseFileCount; + } + Iterator it = slice.getLogFiles().iterator(); + while (it.hasNext()) { + totalLogFileSizeInBytes += it.next().getFileSize(); + ++logFileCount; + } + } + + stats.put(metadataPartition + "." + STAT_TOTAL_BASE_FILE_SIZE, String.valueOf(totalBaseFileSizeInBytes)); + stats.put(metadataPartition + "." + STAT_TOTAL_LOG_FILE_SIZE, String.valueOf(totalLogFileSizeInBytes)); + stats.put(metadataPartition + "." + STAT_COUNT_BASE_FILES, String.valueOf(baseFileCount)); + stats.put(metadataPartition + "." + STAT_COUNT_LOG_FILES, String.valueOf(logFileCount)); + } + + if (detailed) { + stats.put(HoodieMetadataMetrics.STAT_COUNT_PARTITION, String.valueOf(tableMetadata.getAllPartitionPaths().size())); + } + + return stats; + } + + protected void updateMetrics(String action, long durationInMs) { + if (metricsRegistry == null) { + return; + } + + // Update sum of duration and total for count + String countKey = action + ".count"; + String durationKey = action + ".totalDuration"; + incrementMetric(countKey, 1); + incrementMetric(durationKey, durationInMs); + } + + public void updateSizeMetrics(HoodieTableMetaClient metaClient, HoodieBackedTableMetadata metadata) { + Map stats = getStats(false, metaClient, metadata); + for (Map.Entry e : stats.entrySet()) { + incrementMetric(e.getKey(), Long.parseLong(e.getValue())); + } + } + + protected void incrementMetric(String action, long value) { + LOG.info(String.format("Updating metadata metrics (%s=%d) in %s", action, value, metricsRegistry)); + metricsRegistry.add(action, value); + } + + public Registry registry() { + return metricsRegistry; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java new file mode 100644 index 0000000000000..8e42b7c6eaeff --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java @@ -0,0 +1,809 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.avro.Conversions; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.util.Utf8; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.BooleanWrapper; +import org.apache.hudi.avro.model.BytesWrapper; +import org.apache.hudi.avro.model.DateWrapper; +import org.apache.hudi.avro.model.DecimalWrapper; +import org.apache.hudi.avro.model.DoubleWrapper; +import org.apache.hudi.avro.model.FloatWrapper; +import org.apache.hudi.avro.model.HoodieMetadataBloomFilter; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; +import org.apache.hudi.avro.model.HoodieMetadataFileInfo; +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.avro.model.IntWrapper; +import org.apache.hudi.avro.model.LongWrapper; +import org.apache.hudi.avro.model.StringWrapper; +import org.apache.hudi.avro.model.TimestampMicrosWrapper; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.hash.ColumnIndexID; +import org.apache.hudi.common.util.hash.FileIndexID; +import org.apache.hudi.common.util.hash.PartitionIndexID; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.hudi.util.Lazy; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.sql.Date; +import java.sql.Timestamp; +import java.time.Instant; +import java.time.LocalDate; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.util.DateTimeUtils.instantToMicros; +import static org.apache.hudi.common.util.DateTimeUtils.microsToInstant; +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.common.util.ValidationUtils.checkState; +import static org.apache.hudi.hadoop.CachingPath.createPathUnsafe; +import static org.apache.hudi.metadata.HoodieTableMetadata.RECORDKEY_PARTITION_LIST; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getPartitionIdentifier; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.tryUpcastDecimal; + +/** + * MetadataTable records are persisted with the schema defined in HoodieMetadata.avsc. + * This class represents the payload for the MetadataTable. + *
    + * This single metadata payload is shared by all the partitions under the metadata table. + * The partition specific records are determined by the field "type" saved within the record. + * The following types are supported: + *
    + * METADATA_TYPE_PARTITION_LIST (1): + * -- List of all partitions. There is a single such record + * -- key = @{@link HoodieTableMetadata#RECORDKEY_PARTITION_LIST} + *
    + * METADATA_TYPE_FILE_LIST (2): + * -- List of all files in a partition. There is one such record for each partition + * -- key = partition name + *
    + * METADATA_TYPE_COLUMN_STATS (3): + * -- This is an index for column stats in the table + *
    + * METADATA_TYPE_BLOOM_FILTER (4): + * -- This is an index for base file bloom filters. This is a map of FileID to its BloomFilter byte[]. + *
    + * During compaction on the table, the deletions are merged with additions and hence records are pruned. + */ +public class HoodieMetadataPayload implements HoodieRecordPayload { + + // Type of the record. This can be an enum in the schema but Avro1.8 + // has a bug - https://issues.apache.org/jira/browse/AVRO-1810 + protected static final int METADATA_TYPE_PARTITION_LIST = 1; + protected static final int METADATA_TYPE_FILE_LIST = 2; + protected static final int METADATA_TYPE_COLUMN_STATS = 3; + protected static final int METADATA_TYPE_BLOOM_FILTER = 4; + + // HoodieMetadata schema field ids + public static final String KEY_FIELD_NAME = HoodieHFileReader.KEY_FIELD_NAME; + public static final String SCHEMA_FIELD_NAME_TYPE = "type"; + public static final String SCHEMA_FIELD_NAME_METADATA = "filesystemMetadata"; + public static final String SCHEMA_FIELD_ID_COLUMN_STATS = "ColumnStatsMetadata"; + public static final String SCHEMA_FIELD_ID_BLOOM_FILTER = "BloomFilterMetadata"; + + // HoodieMetadata bloom filter payload field ids + private static final String FIELD_IS_DELETED = "isDeleted"; + private static final String BLOOM_FILTER_FIELD_TYPE = "type"; + private static final String BLOOM_FILTER_FIELD_TIMESTAMP = "timestamp"; + private static final String BLOOM_FILTER_FIELD_BLOOM_FILTER = "bloomFilter"; + private static final String BLOOM_FILTER_FIELD_IS_DELETED = FIELD_IS_DELETED; + + // HoodieMetadata column stats payload field ids + public static final String COLUMN_STATS_FIELD_MIN_VALUE = "minValue"; + public static final String COLUMN_STATS_FIELD_MAX_VALUE = "maxValue"; + public static final String COLUMN_STATS_FIELD_NULL_COUNT = "nullCount"; + public static final String COLUMN_STATS_FIELD_VALUE_COUNT = "valueCount"; + public static final String COLUMN_STATS_FIELD_TOTAL_SIZE = "totalSize"; + public static final String COLUMN_STATS_FIELD_FILE_NAME = "fileName"; + public static final String COLUMN_STATS_FIELD_COLUMN_NAME = "columnName"; + public static final String COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE = "totalUncompressedSize"; + public static final String COLUMN_STATS_FIELD_IS_DELETED = FIELD_IS_DELETED; + + private static final Conversions.DecimalConversion AVRO_DECIMAL_CONVERSION = new Conversions.DecimalConversion(); + + // NOTE: PLEASE READ CAREFULLY + // + // In Avro 1.10 generated builders rely on {@code SpecificData.getForSchema} invocation that in turn + // does use reflection to load the code-gen'd class corresponding to the Avro record model. This has + // serious adverse effects in terms of performance when gets executed on the hot-path (both, in terms + // of runtime and efficiency). + // + // To work this around instead of using default code-gen'd builder invoking {@code SpecificData.getForSchema}, + // we instead rely on overloaded ctor accepting another instance of the builder: {@code Builder(Builder)}, + // which bypasses such invocation. Following corresponding builder's stubs are statically initialized + // to be used exactly for that purpose. 
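The NOTE above avoids repeated reflective SpecificData lookups by building each Avro builder stub once, lazily, and then copying from it on the hot path. Hudi has its own Lazy utility (org.apache.hudi.util.Lazy, imported above); the standalone sketch below only illustrates the memoize-once-then-copy idea with placeholder types.

import java.util.function.Supplier;

public class LazyStubSketch {

  // Minimal memoizing supplier in the spirit of org.apache.hudi.util.Lazy:
  // the expensive factory runs once, later calls reuse the cached value.
  static final class Lazy<T> {
    private Supplier<T> factory;
    private T value;

    private Lazy(Supplier<T> factory) {
      this.factory = factory;
    }

    static <T> Lazy<T> lazily(Supplier<T> factory) {
      return new Lazy<>(factory);
    }

    synchronized T get() {
      if (factory != null) {
        value = factory.get();   // pay the (reflection-heavy) cost exactly once
        factory = null;
      }
      return value;
    }
  }

  // Placeholder for an expensive code-gen'd builder such as HoodieMetadataColumnStats.newBuilder().
  static StringBuilder expensiveBuilderStub() {
    return new StringBuilder("stub");
  }

  private static final Lazy<StringBuilder> BUILDER_STUB = Lazy.lazily(LazyStubSketch::expensiveBuilderStub);

  public static void main(String[] args) {
    // Hot path: copy from the cached stub instead of re-running the expensive factory.
    StringBuilder copy = new StringBuilder(BUILDER_STUB.get());
    System.out.println(copy.append("-record"));
  }
}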
+ // + // You can find more details in HUDI-3834 + private static final Lazy METADATA_COLUMN_STATS_BUILDER_STUB = Lazy.lazily(HoodieMetadataColumnStats::newBuilder); + private static final Lazy STRING_WRAPPER_BUILDER_STUB = Lazy.lazily(StringWrapper::newBuilder); + private static final Lazy BYTES_WRAPPER_BUILDER_STUB = Lazy.lazily(BytesWrapper::newBuilder); + private static final Lazy DOUBLE_WRAPPER_BUILDER_STUB = Lazy.lazily(DoubleWrapper::newBuilder); + private static final Lazy FLOAT_WRAPPER_BUILDER_STUB = Lazy.lazily(FloatWrapper::newBuilder); + private static final Lazy LONG_WRAPPER_BUILDER_STUB = Lazy.lazily(LongWrapper::newBuilder); + private static final Lazy INT_WRAPPER_BUILDER_STUB = Lazy.lazily(IntWrapper::newBuilder); + private static final Lazy BOOLEAN_WRAPPER_BUILDER_STUB = Lazy.lazily(BooleanWrapper::newBuilder); + private static final Lazy TIMESTAMP_MICROS_WRAPPER_BUILDER_STUB = Lazy.lazily(TimestampMicrosWrapper::newBuilder); + private static final Lazy DECIMAL_WRAPPER_BUILDER_STUB = Lazy.lazily(DecimalWrapper::newBuilder); + private static final Lazy DATE_WRAPPER_BUILDER_STUB = Lazy.lazily(DateWrapper::newBuilder); + + private String key = null; + private int type = 0; + private Map filesystemMetadata = null; + private HoodieMetadataBloomFilter bloomFilterMetadata = null; + private HoodieMetadataColumnStats columnStatMetadata = null; + + public HoodieMetadataPayload(GenericRecord record, Comparable orderingVal) { + this(Option.of(record)); + } + + public HoodieMetadataPayload(Option recordOpt) { + if (recordOpt.isPresent()) { + GenericRecord record = recordOpt.get(); + // This can be simplified using SpecificData.deepcopy once this bug is fixed + // https://issues.apache.org/jira/browse/AVRO-1811 + // + // NOTE: {@code HoodieMetadataRecord} has to always carry both "key" and "type" fields + // for it to be handled appropriately, therefore these fields have to be reflected + // in any (read-)projected schema + key = record.get(KEY_FIELD_NAME).toString(); + type = (int) record.get(SCHEMA_FIELD_NAME_TYPE); + + Map metadata = getNestedFieldValue(record, SCHEMA_FIELD_NAME_METADATA); + if (metadata != null) { + filesystemMetadata = metadata; + filesystemMetadata.keySet().forEach(k -> { + GenericRecord v = filesystemMetadata.get(k); + filesystemMetadata.put(k, new HoodieMetadataFileInfo((Long) v.get("size"), (Boolean) v.get("isDeleted"))); + }); + } + + if (type == METADATA_TYPE_BLOOM_FILTER) { + GenericRecord bloomFilterRecord = getNestedFieldValue(record, SCHEMA_FIELD_ID_BLOOM_FILTER); + // NOTE: Only legitimate reason for {@code BloomFilterMetadata} to not be present is when + // it's not been read from the storage (ie it's not been a part of projected schema). 
+ // Otherwise, it has to be present or the record would be considered invalid + if (bloomFilterRecord == null) { + checkArgument(record.getSchema().getField(SCHEMA_FIELD_ID_BLOOM_FILTER) == null, + String.format("Valid %s record expected for type: %s", SCHEMA_FIELD_ID_BLOOM_FILTER, METADATA_TYPE_COLUMN_STATS)); + } else { + bloomFilterMetadata = new HoodieMetadataBloomFilter( + (String) bloomFilterRecord.get(BLOOM_FILTER_FIELD_TYPE), + (String) bloomFilterRecord.get(BLOOM_FILTER_FIELD_TIMESTAMP), + (ByteBuffer) bloomFilterRecord.get(BLOOM_FILTER_FIELD_BLOOM_FILTER), + (Boolean) bloomFilterRecord.get(BLOOM_FILTER_FIELD_IS_DELETED) + ); + } + } + + if (type == METADATA_TYPE_COLUMN_STATS) { + GenericRecord columnStatsRecord = getNestedFieldValue(record, SCHEMA_FIELD_ID_COLUMN_STATS); + // NOTE: Only legitimate reason for {@code ColumnStatsMetadata} to not be present is when + // it's not been read from the storage (ie it's not been a part of projected schema). + // Otherwise, it has to be present or the record would be considered invalid + if (columnStatsRecord == null) { + checkArgument(record.getSchema().getField(SCHEMA_FIELD_ID_COLUMN_STATS) == null, + String.format("Valid %s record expected for type: %s", SCHEMA_FIELD_ID_COLUMN_STATS, METADATA_TYPE_COLUMN_STATS)); + } else { + columnStatMetadata = HoodieMetadataColumnStats.newBuilder(METADATA_COLUMN_STATS_BUILDER_STUB.get()) + .setFileName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_FILE_NAME)) + .setColumnName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_COLUMN_NAME)) + .setMinValue(columnStatsRecord.get(COLUMN_STATS_FIELD_MIN_VALUE)) + .setMaxValue(columnStatsRecord.get(COLUMN_STATS_FIELD_MAX_VALUE)) + .setValueCount((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_VALUE_COUNT)) + .setNullCount((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_NULL_COUNT)) + .setTotalSize((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_TOTAL_SIZE)) + .setTotalUncompressedSize((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE)) + .setIsDeleted((Boolean) columnStatsRecord.get(COLUMN_STATS_FIELD_IS_DELETED)) + .build(); + } + } + } + } + + private HoodieMetadataPayload(String key, int type, Map filesystemMetadata) { + this(key, type, filesystemMetadata, null, null); + } + + private HoodieMetadataPayload(String key, HoodieMetadataBloomFilter metadataBloomFilter) { + this(key, METADATA_TYPE_BLOOM_FILTER, null, metadataBloomFilter, null); + } + + private HoodieMetadataPayload(String key, HoodieMetadataColumnStats columnStats) { + this(key, METADATA_TYPE_COLUMN_STATS, null, null, columnStats); + } + + protected HoodieMetadataPayload(String key, int type, + Map filesystemMetadata, + HoodieMetadataBloomFilter metadataBloomFilter, + HoodieMetadataColumnStats columnStats) { + this.key = key; + this.type = type; + this.filesystemMetadata = filesystemMetadata; + this.bloomFilterMetadata = metadataBloomFilter; + this.columnStatMetadata = columnStats; + } + + /** + * Create and return a {@code HoodieMetadataPayload} to save list of partitions. + * + * @param partitions The list of partitions + */ + public static HoodieRecord createPartitionListRecord(List partitions) { + return createPartitionListRecord(partitions, false); + } + + /** + * Create and return a {@code HoodieMetadataPayload} to save list of partitions. 
+ * + * @param partitions The list of partitions + */ + public static HoodieRecord createPartitionListRecord(List partitions, boolean isDeleted) { + Map fileInfo = new HashMap<>(); + partitions.forEach(partition -> fileInfo.put(getPartitionIdentifier(partition), new HoodieMetadataFileInfo(0L, isDeleted))); + + HoodieKey key = new HoodieKey(RECORDKEY_PARTITION_LIST, MetadataPartitionType.FILES.getPartitionPath()); + HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_PARTITION_LIST, + fileInfo); + return new HoodieAvroRecord<>(key, payload); + } + + /** + * Create and return a {@code HoodieMetadataPayload} to save list of partitions. + * + * @param partitionsAdded The list of added partitions + * @param partitionsDeleted The list of deleted partitions + */ + public static HoodieRecord createPartitionListRecord(List partitionsAdded, List partitionsDeleted) { + Map fileInfo = new HashMap<>(); + partitionsAdded.forEach(partition -> fileInfo.put(partition, new HoodieMetadataFileInfo(0L, false))); + partitionsDeleted.forEach(partition -> fileInfo.put(partition, new HoodieMetadataFileInfo(0L, true))); + + HoodieKey key = new HoodieKey(RECORDKEY_PARTITION_LIST, MetadataPartitionType.FILES.getPartitionPath()); + HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_PARTITION_LIST, + fileInfo); + return new HoodieAvroRecord<>(key, payload); + } + + /** + * Create and return a {@code HoodieMetadataPayload} to save list of files within a partition. + * + * @param partition The name of the partition + * @param filesAdded Mapping of files to their sizes for files which have been added to this partition + * @param filesDeleted List of files which have been deleted from this partition + */ + public static HoodieRecord createPartitionFilesRecord(String partition, + Option> filesAdded, + Option> filesDeleted) { + Map fileInfo = new HashMap<>(); + filesAdded.ifPresent(filesMap -> + fileInfo.putAll( + filesMap.entrySet().stream().collect( + Collectors.toMap(Map.Entry::getKey, (entry) -> { + long fileSize = entry.getValue(); + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + return new HoodieMetadataFileInfo(fileSize, false); + }))) + ); + filesDeleted.ifPresent(filesList -> + fileInfo.putAll( + filesList.stream().collect( + Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true)))) + ); + + HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); + HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); + return new HoodieAvroRecord<>(key, payload); + } + + /** + * Create bloom filter metadata record. 
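A hedged usage sketch of createPartitionFilesRecord defined above: added files map to their positive sizes and deleted files become tombstones within a single "files" partition record. The generic types are inferred from the method body in this diff (a map of file name to size and a list of file names); the partition and file names below are made up.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.metadata.HoodieMetadataPayload;

public class FilesRecordUsageSketch {
  public static void main(String[] args) {
    Map<String, Long> filesAdded = new HashMap<>();
    filesAdded.put("a1b2c3d4_0-1-0_20220401120000.parquet", 1_048_576L); // size must be > 0
    List<String> filesDeleted = Arrays.asList("old_0-1-0_20220301120000.parquet");

    // One record per data-table partition, stored in the metadata table's "files" partition.
    HoodieRecord<HoodieMetadataPayload> record = HoodieMetadataPayload.createPartitionFilesRecord(
        "2020/01/01", Option.of(filesAdded), Option.of(filesDeleted));

    System.out.println(record.getRecordKey());
  }
}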
+ * + * @param partitionName - Partition name + * @param baseFileName - Base file name for which the bloom filter needs to persisted + * @param timestamp - Instant timestamp responsible for this record + * @param bloomFilter - Bloom filter for the File + * @param isDeleted - Is the bloom filter no more valid + * @return Metadata payload containing the fileID and its bloom filter record + */ + public static HoodieRecord createBloomFilterMetadataRecord(final String partitionName, + final String baseFileName, + final String timestamp, + final String bloomFilterType, + final ByteBuffer bloomFilter, + final boolean isDeleted) { + checkArgument(!baseFileName.contains(Path.SEPARATOR) + && FSUtils.isBaseFile(new Path(baseFileName)), + "Invalid base file '" + baseFileName + "' for MetaIndexBloomFilter!"); + final String bloomFilterIndexKey = new PartitionIndexID(partitionName).asBase64EncodedString() + .concat(new FileIndexID(baseFileName).asBase64EncodedString()); + HoodieKey key = new HoodieKey(bloomFilterIndexKey, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath()); + + HoodieMetadataBloomFilter metadataBloomFilter = + new HoodieMetadataBloomFilter(bloomFilterType, timestamp, bloomFilter, isDeleted); + HoodieMetadataPayload metadataPayload = new HoodieMetadataPayload(key.getRecordKey(), metadataBloomFilter); + return new HoodieAvroRecord<>(key, metadataPayload); + } + + @Override + public HoodieMetadataPayload preCombine(HoodieMetadataPayload previousRecord) { + checkArgument(previousRecord.type == type, + "Cannot combine " + previousRecord.type + " with " + type); + + switch (type) { + case METADATA_TYPE_PARTITION_LIST: + case METADATA_TYPE_FILE_LIST: + Map combinedFileInfo = combineFileSystemMetadata(previousRecord); + return new HoodieMetadataPayload(key, type, combinedFileInfo); + case METADATA_TYPE_BLOOM_FILTER: + HoodieMetadataBloomFilter combineBloomFilterMetadata = combineBloomFilterMetadata(previousRecord); + return new HoodieMetadataPayload(key, combineBloomFilterMetadata); + case METADATA_TYPE_COLUMN_STATS: + return new HoodieMetadataPayload(key, combineColumnStatsMetadata(previousRecord)); + default: + throw new HoodieMetadataException("Unknown type of HoodieMetadataPayload: " + type); + } + } + + private HoodieMetadataBloomFilter combineBloomFilterMetadata(HoodieMetadataPayload previousRecord) { + // Bloom filters are always additive. 
No need to merge with previous bloom filter + return this.bloomFilterMetadata; + } + + private HoodieMetadataColumnStats combineColumnStatsMetadata(HoodieMetadataPayload previousRecord) { + checkArgument(previousRecord.getColumnStatMetadata().isPresent()); + checkArgument(getColumnStatMetadata().isPresent()); + + HoodieMetadataColumnStats previousColStatsRecord = previousRecord.getColumnStatMetadata().get(); + HoodieMetadataColumnStats newColumnStatsRecord = getColumnStatMetadata().get(); + + return mergeColumnStatsRecords(previousColStatsRecord, newColumnStatsRecord); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord oldRecord, Schema schema, Properties properties) throws IOException { + HoodieMetadataPayload anotherPayload = new HoodieMetadataPayload(Option.of((GenericRecord) oldRecord)); + HoodieRecordPayload combinedPayload = preCombine(anotherPayload); + return combinedPayload.getInsertValue(schema, properties); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord oldRecord, Schema schema) throws IOException { + return combineAndGetUpdateValue(oldRecord, schema, new Properties()); + } + + @Override + public Option getInsertValue(Schema schemaIgnored, Properties propertiesIgnored) throws IOException { + if (key == null) { + return Option.empty(); + } + + HoodieMetadataRecord record = new HoodieMetadataRecord(key, type, filesystemMetadata, bloomFilterMetadata, + columnStatMetadata); + return Option.of(record); + } + + @Override + public Option getInsertValue(Schema schema) throws IOException { + return getInsertValue(schema, new Properties()); + } + + /** + * Returns the list of filenames added as part of this record. + */ + public List getFilenames() { + return filterFileInfoEntries(false).map(Map.Entry::getKey).sorted().collect(Collectors.toList()); + } + + /** + * Returns the list of filenames deleted as part of this record. + */ + public List getDeletions() { + return filterFileInfoEntries(true).map(Map.Entry::getKey).sorted().collect(Collectors.toList()); + } + + /** + * Get the bloom filter metadata from this payload. + */ + public Option getBloomFilterMetadata() { + if (bloomFilterMetadata == null) { + return Option.empty(); + } + + return Option.of(bloomFilterMetadata); + } + + /** + * Get the bloom filter metadata from this payload. + */ + public Option getColumnStatMetadata() { + if (columnStatMetadata == null) { + return Option.empty(); + } + + return Option.of(columnStatMetadata); + } + + /** + * Returns the files added as part of this record. + */ + public FileStatus[] getFileStatuses(Configuration hadoopConf, Path partitionPath) throws IOException { + FileSystem fs = partitionPath.getFileSystem(hadoopConf); + return getFileStatuses(fs, partitionPath); + } + + /** + * Returns the files added as part of this record. 
+ */ + public FileStatus[] getFileStatuses(FileSystem fs, Path partitionPath) { + long blockSize = fs.getDefaultBlockSize(partitionPath); + return filterFileInfoEntries(false) + .map(e -> { + // NOTE: Since we know that the Metadata Table's Payload is simply a file-name we're + // creating Hadoop's Path using more performant unsafe variant + CachingPath filePath = new CachingPath(partitionPath, createPathUnsafe(e.getKey())); + return new FileStatus(e.getValue().getSize(), false, 0, blockSize, 0, 0, + null, null, null, filePath); + }) + .toArray(FileStatus[]::new); + } + + private Stream> filterFileInfoEntries(boolean isDeleted) { + if (filesystemMetadata == null) { + return Stream.empty(); + } + + return filesystemMetadata.entrySet().stream().filter(e -> e.getValue().getIsDeleted() == isDeleted); + } + + private Map combineFileSystemMetadata(HoodieMetadataPayload previousRecord) { + Map combinedFileInfo = new HashMap<>(); + + // First, add all files listed in the previous record + if (previousRecord.filesystemMetadata != null) { + combinedFileInfo.putAll(previousRecord.filesystemMetadata); + } + + // Second, merge in the files listed in the new record + if (filesystemMetadata != null) { + validatePayload(type, filesystemMetadata); + + filesystemMetadata.forEach((key, fileInfo) -> { + combinedFileInfo.merge(key, fileInfo, + // Combine previous record w/ the new one, new records taking precedence over + // the old one + // + // NOTE: That if previous listing contains the file that is being deleted by the tombstone + // record (`IsDeleted` = true) in the new one, we simply delete the file from the resulting + // listing as well as drop the tombstone itself. + // However, if file is not present in the previous record we have to persist tombstone + // record in the listing to make sure we carry forward information that this file + // was deleted. This special case could occur since the merging flow is 2-stage: + // - First we merge records from all of the delta log-files + // - Then we merge records from base-files with the delta ones (coming as a result + // of the previous step) + (oldFileInfo, newFileInfo) -> + // NOTE: We can’t assume that MT update records will be ordered the same way as actual + // FS operations (since they are not atomic), therefore MT record merging should be a + // _commutative_ & _associative_ operation (ie one that would work even in case records + // will get re-ordered), which is + // - Possible for file-sizes (since file-sizes will ever grow, we can simply + // take max of the old and new records) + // - Not possible for is-deleted flags* + // + // *However, we’re assuming that the case of concurrent write and deletion of the same + // file is _impossible_ -- it would only be possible with concurrent upsert and + // rollback operation (affecting the same log-file), which is implausible, b/c either + // of the following have to be true: + // - We’re appending to failed log-file (then the other writer is trying to + // rollback it concurrently, before it’s own write) + // - Rollback (of completed instant) is running concurrently with append (meaning + // that restore is running concurrently with a write, which is also nut supported + // currently) + newFileInfo.getIsDeleted() + ? null + : new HoodieMetadataFileInfo(Math.max(newFileInfo.getSize(), oldFileInfo.getSize()), false)); + }); + } + + return combinedFileInfo; + } + + /** + * Get bloom filter index key. 
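The comments in combineFileSystemMetadata above argue that the listing merge must be commutative and associative: sizes combine with max, a tombstone drops an entry that was already listed, and a tombstone for an unseen file is carried forward. Below is a simplified standalone model of that merge, using a plain value class instead of the Avro-generated HoodieMetadataFileInfo.

import java.util.HashMap;
import java.util.Map;

public class FileInfoMergeSketch {

  // Simplified stand-in for HoodieMetadataFileInfo: a size and a delete marker.
  static final class FileInfo {
    final long size;
    final boolean deleted;

    FileInfo(long size, boolean deleted) {
      this.size = size;
      this.deleted = deleted;
    }

    @Override
    public String toString() {
      return deleted ? "tombstone" : ("size=" + size);
    }
  }

  // Merge newer listings into older ones, mirroring combineFileSystemMetadata:
  // - a tombstone over an existing entry removes it (merging to null drops the key)
  // - a tombstone with no prior entry is kept, so the deletion is carried forward
  // - otherwise keep the larger size, which makes the merge order-insensitive
  static Map<String, FileInfo> merge(Map<String, FileInfo> older, Map<String, FileInfo> newer) {
    Map<String, FileInfo> combined = new HashMap<>(older);
    newer.forEach((file, info) ->
        combined.merge(file, info, (oldInfo, newInfo) ->
            newInfo.deleted ? null : new FileInfo(Math.max(oldInfo.size, newInfo.size), false)));
    return combined;
  }

  public static void main(String[] args) {
    Map<String, FileInfo> older = new HashMap<>();
    older.put("f1.parquet", new FileInfo(100, false));

    Map<String, FileInfo> newer = new HashMap<>();
    newer.put("f1.parquet", new FileInfo(0, true));    // delete f1
    newer.put("f2.parquet", new FileInfo(200, false)); // add f2

    System.out.println(merge(older, newer)); // f1 dropped, f2 kept
  }
}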
+ * + * @param partitionIndexID - Partition index id + * @param fileIndexID - File index id + * @return Bloom filter index key + */ + public static String getBloomFilterIndexKey(PartitionIndexID partitionIndexID, FileIndexID fileIndexID) { + return partitionIndexID.asBase64EncodedString() + .concat(fileIndexID.asBase64EncodedString()); + } + + /** + * Get column stats index key. + * + * @param partitionIndexID - Partition index id + * @param fileIndexID - File index id + * @param columnIndexID - Column index id + * @return Column stats index key + */ + public static String getColumnStatsIndexKey(PartitionIndexID partitionIndexID, FileIndexID fileIndexID, ColumnIndexID columnIndexID) { + return columnIndexID.asBase64EncodedString() + .concat(partitionIndexID.asBase64EncodedString()) + .concat(fileIndexID.asBase64EncodedString()); + } + + /** + * Get column stats index key from the column range metadata. + * + * @param partitionName - Partition name + * @param columnRangeMetadata - Column range metadata + * @return Column stats index key + */ + public static String getColumnStatsIndexKey(String partitionName, HoodieColumnRangeMetadata columnRangeMetadata) { + final PartitionIndexID partitionIndexID = new PartitionIndexID(partitionName); + final FileIndexID fileIndexID = new FileIndexID(new Path(columnRangeMetadata.getFilePath()).getName()); + final ColumnIndexID columnIndexID = new ColumnIndexID(columnRangeMetadata.getColumnName()); + return getColumnStatsIndexKey(partitionIndexID, fileIndexID, columnIndexID); + } + + public static Stream createColumnStatsRecords(String partitionName, + Collection> columnRangeMetadataList, + boolean isDeleted) { + return columnRangeMetadataList.stream().map(columnRangeMetadata -> { + HoodieKey key = new HoodieKey(getColumnStatsIndexKey(partitionName, columnRangeMetadata), + MetadataPartitionType.COLUMN_STATS.getPartitionPath()); + + HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), + HoodieMetadataColumnStats.newBuilder() + .setFileName(new Path(columnRangeMetadata.getFilePath()).getName()) + .setColumnName(columnRangeMetadata.getColumnName()) + .setMinValue(wrapStatisticValue(columnRangeMetadata.getMinValue())) + .setMaxValue(wrapStatisticValue(columnRangeMetadata.getMaxValue())) + .setNullCount(columnRangeMetadata.getNullCount()) + .setValueCount(columnRangeMetadata.getValueCount()) + .setTotalSize(columnRangeMetadata.getTotalSize()) + .setTotalUncompressedSize(columnRangeMetadata.getTotalUncompressedSize()) + .setIsDeleted(isDeleted) + .build()); + + return new HoodieAvroRecord<>(key, payload); + }); + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private static HoodieMetadataColumnStats mergeColumnStatsRecords(HoodieMetadataColumnStats prevColumnStats, + HoodieMetadataColumnStats newColumnStats) { + checkArgument(Objects.equals(prevColumnStats.getFileName(), newColumnStats.getFileName())); + checkArgument(Objects.equals(prevColumnStats.getColumnName(), newColumnStats.getColumnName())); + + // We're handling 2 cases in here + // - New record is a tombstone: in this case it simply overwrites previous state + // - Previous record is a tombstone: in that case new proper record would also + // be simply overwriting previous state + if (newColumnStats.getIsDeleted() || prevColumnStats.getIsDeleted()) { + return newColumnStats; + } + + Comparable minValue = + (Comparable) Stream.of( + (Comparable) unwrapStatisticValueWrapper(prevColumnStats.getMinValue()), + (Comparable) 
unwrapStatisticValueWrapper(newColumnStats.getMinValue())) + .filter(Objects::nonNull) + .min(Comparator.naturalOrder()) + .orElse(null); + + Comparable maxValue = + (Comparable) Stream.of( + (Comparable) unwrapStatisticValueWrapper(prevColumnStats.getMaxValue()), + (Comparable) unwrapStatisticValueWrapper(newColumnStats.getMaxValue())) + .filter(Objects::nonNull) + .max(Comparator.naturalOrder()) + .orElse(null); + + return HoodieMetadataColumnStats.newBuilder(METADATA_COLUMN_STATS_BUILDER_STUB.get()) + .setFileName(newColumnStats.getFileName()) + .setColumnName(newColumnStats.getColumnName()) + .setMinValue(wrapStatisticValue(minValue)) + .setMaxValue(wrapStatisticValue(maxValue)) + .setValueCount(prevColumnStats.getValueCount() + newColumnStats.getValueCount()) + .setNullCount(prevColumnStats.getNullCount() + newColumnStats.getNullCount()) + .setTotalSize(prevColumnStats.getTotalSize() + newColumnStats.getTotalSize()) + .setTotalUncompressedSize(prevColumnStats.getTotalUncompressedSize() + newColumnStats.getTotalUncompressedSize()) + .setIsDeleted(newColumnStats.getIsDeleted()) + .build(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (!(other instanceof HoodieMetadataPayload)) { + return false; + } + + HoodieMetadataPayload otherMetadataPayload = (HoodieMetadataPayload) other; + + return this.type == otherMetadataPayload.type + && Objects.equals(this.key, otherMetadataPayload.key) + && Objects.equals(this.filesystemMetadata, otherMetadataPayload.filesystemMetadata) + && Objects.equals(this.bloomFilterMetadata, otherMetadataPayload.bloomFilterMetadata) + && Objects.equals(this.columnStatMetadata, otherMetadataPayload.columnStatMetadata); + } + + @Override + public int hashCode() { + return Objects.hash(key, type, filesystemMetadata, bloomFilterMetadata, columnStatMetadata); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("HoodieMetadataPayload {"); + sb.append(KEY_FIELD_NAME + "=").append(key).append(", "); + sb.append(SCHEMA_FIELD_NAME_TYPE + "=").append(type).append(", "); + sb.append("creations=").append(Arrays.toString(getFilenames().toArray())).append(", "); + sb.append("deletions=").append(Arrays.toString(getDeletions().toArray())).append(", "); + if (type == METADATA_TYPE_BLOOM_FILTER) { + checkState(getBloomFilterMetadata().isPresent()); + sb.append("BloomFilter: {"); + sb.append("bloom size: ").append(getBloomFilterMetadata().get().getBloomFilter().array().length).append(", "); + sb.append("timestamp: ").append(getBloomFilterMetadata().get().getTimestamp()).append(", "); + sb.append("deleted: ").append(getBloomFilterMetadata().get().getIsDeleted()); + sb.append("}"); + } + if (type == METADATA_TYPE_COLUMN_STATS) { + checkState(getColumnStatMetadata().isPresent()); + sb.append("ColStats: {"); + sb.append(getColumnStatMetadata().get()); + sb.append("}"); + } + sb.append('}'); + return sb.toString(); + } + + private static Object wrapStatisticValue(Comparable statValue) { + if (statValue == null) { + return null; + } else if (statValue instanceof Date || statValue instanceof LocalDate) { + // NOTE: Due to breaking changes in code-gen b/w Avro 1.8.2 and 1.10, we can't + // rely on logical types to do proper encoding of the native Java types, + // and hereby have to encode statistic manually + LocalDate localDate = statValue instanceof LocalDate + ? 
(LocalDate) statValue + : ((Date) statValue).toLocalDate(); + return DateWrapper.newBuilder(DATE_WRAPPER_BUILDER_STUB.get()) + .setValue((int) localDate.toEpochDay()) + .build(); + } else if (statValue instanceof BigDecimal) { + Schema valueSchema = DecimalWrapper.SCHEMA$.getField("value").schema(); + BigDecimal upcastDecimal = tryUpcastDecimal((BigDecimal) statValue, (LogicalTypes.Decimal) valueSchema.getLogicalType()); + return DecimalWrapper.newBuilder(DECIMAL_WRAPPER_BUILDER_STUB.get()) + .setValue(AVRO_DECIMAL_CONVERSION.toBytes(upcastDecimal, valueSchema, valueSchema.getLogicalType())) + .build(); + } else if (statValue instanceof Timestamp) { + // NOTE: Due to breaking changes in code-gen b/w Avro 1.8.2 and 1.10, we can't + // rely on logical types to do proper encoding of the native Java types, + // and hereby have to encode statistic manually + Instant instant = ((Timestamp) statValue).toInstant(); + return TimestampMicrosWrapper.newBuilder(TIMESTAMP_MICROS_WRAPPER_BUILDER_STUB.get()) + .setValue(instantToMicros(instant)) + .build(); + } else if (statValue instanceof Boolean) { + return BooleanWrapper.newBuilder(BOOLEAN_WRAPPER_BUILDER_STUB.get()).setValue((Boolean) statValue).build(); + } else if (statValue instanceof Integer) { + return IntWrapper.newBuilder(INT_WRAPPER_BUILDER_STUB.get()).setValue((Integer) statValue).build(); + } else if (statValue instanceof Long) { + return LongWrapper.newBuilder(LONG_WRAPPER_BUILDER_STUB.get()).setValue((Long) statValue).build(); + } else if (statValue instanceof Float) { + return FloatWrapper.newBuilder(FLOAT_WRAPPER_BUILDER_STUB.get()).setValue((Float) statValue).build(); + } else if (statValue instanceof Double) { + return DoubleWrapper.newBuilder(DOUBLE_WRAPPER_BUILDER_STUB.get()).setValue((Double) statValue).build(); + } else if (statValue instanceof ByteBuffer) { + return BytesWrapper.newBuilder(BYTES_WRAPPER_BUILDER_STUB.get()).setValue((ByteBuffer) statValue).build(); + } else if (statValue instanceof String || statValue instanceof Utf8) { + return StringWrapper.newBuilder(STRING_WRAPPER_BUILDER_STUB.get()).setValue(statValue.toString()).build(); + } else { + throw new UnsupportedOperationException(String.format("Unsupported type of the statistic (%s)", statValue.getClass())); + } + } + + public static Comparable unwrapStatisticValueWrapper(Object statValueWrapper) { + if (statValueWrapper == null) { + return null; + } else if (statValueWrapper instanceof DateWrapper) { + return LocalDate.ofEpochDay(((DateWrapper) statValueWrapper).getValue()); + } else if (statValueWrapper instanceof DecimalWrapper) { + Schema valueSchema = DecimalWrapper.SCHEMA$.getField("value").schema(); + return AVRO_DECIMAL_CONVERSION.fromBytes(((DecimalWrapper) statValueWrapper).getValue(), valueSchema, valueSchema.getLogicalType()); + } else if (statValueWrapper instanceof TimestampMicrosWrapper) { + return microsToInstant(((TimestampMicrosWrapper) statValueWrapper).getValue()); + } else if (statValueWrapper instanceof BooleanWrapper) { + return ((BooleanWrapper) statValueWrapper).getValue(); + } else if (statValueWrapper instanceof IntWrapper) { + return ((IntWrapper) statValueWrapper).getValue(); + } else if (statValueWrapper instanceof LongWrapper) { + return ((LongWrapper) statValueWrapper).getValue(); + } else if (statValueWrapper instanceof FloatWrapper) { + return ((FloatWrapper) statValueWrapper).getValue(); + } else if (statValueWrapper instanceof DoubleWrapper) { + return ((DoubleWrapper) statValueWrapper).getValue(); + } else if (statValueWrapper 
instanceof BytesWrapper) { + return ((BytesWrapper) statValueWrapper).getValue(); + } else if (statValueWrapper instanceof StringWrapper) { + return ((StringWrapper) statValueWrapper).getValue(); + } else if (statValueWrapper instanceof GenericRecord) { + // NOTE: This branch could be hit b/c Avro records could be reconstructed + // as {@code GenericRecord) + // TODO add logical type decoding + GenericRecord record = (GenericRecord) statValueWrapper; + return (Comparable) record.get("value"); + } else { + throw new UnsupportedOperationException(String.format("Unsupported type of the statistic (%s)", statValueWrapper.getClass())); + } + } + + private static void validatePayload(int type, Map filesystemMetadata) { + if (type == METADATA_TYPE_FILE_LIST) { + filesystemMetadata.forEach((fileName, fileInfo) -> { + checkState(fileInfo.getIsDeleted() || fileInfo.getSize() > 0, "Existing files should have size > 0"); + }); + } + } + + private static T getNestedFieldValue(GenericRecord record, String fieldName) { + // NOTE: This routine is more lightweight than {@code HoodieAvroUtils.getNestedFieldVal} + if (record.getSchema().getField(fieldName) == null) { + return null; + } + + return unsafeCast(record.get(fieldName)); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java new file mode 100644 index 0000000000000..349c0efb482a5 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieMetadataException; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.common.util.ValidationUtils.checkState; + +/** + * Interface that supports querying various pieces of metadata about a hudi table. 
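 * <p>A rough usage sketch (illustrative only; {@code engineContext} and {@code metadataConfig} are assumed
 * to be built by the caller, paths are placeholders, and the factory and read methods used are the ones
 * declared below):
 * <pre>{@code
 *   HoodieTableMetadata metadata = HoodieTableMetadata.create(
 *       engineContext, metadataConfig, "/tmp/hudi_table", "/tmp/spillable");
 *   List<String> partitions = metadata.getAllPartitionPaths();
 *   FileStatus[] files = metadata.getAllFilesInPartition(new Path("/tmp/hudi_table/2022/01/01"));
 * }</pre>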
+ */ +public interface HoodieTableMetadata extends Serializable, AutoCloseable { + + // Table name suffix + String METADATA_TABLE_NAME_SUFFIX = "_metadata"; + /** + * Timestamp for a commit when the base dataset had not had any commits yet. this is < than even + * {@link org.apache.hudi.common.table.timeline.HoodieTimeline#INIT_INSTANT_TS}, such that the metadata table + * can be prepped even before bootstrap is done. + */ + String SOLO_COMMIT_TIMESTAMP = "00000000000000"; + // Key for the record which saves list of all partitions + String RECORDKEY_PARTITION_LIST = "__all_partitions__"; + // The partition name used for non-partitioned tables + String NON_PARTITIONED_NAME = "."; + String EMPTY_PARTITION_NAME = ""; + + /** + * Return the base-path of the Metadata Table for the given Dataset identified by base-path + */ + static String getMetadataTableBasePath(String dataTableBasePath) { + return dataTableBasePath + Path.SEPARATOR + HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH; + } + + /** + * Returns the base path of the Dataset provided the base-path of the Metadata Table of this + * Dataset + */ + static String getDataTableBasePathFromMetadataTable(String metadataTableBasePath) { + checkArgument(isMetadataTable(metadataTableBasePath)); + return metadataTableBasePath.substring(0, metadataTableBasePath.lastIndexOf(HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH) - 1); + } + + /** + * Return the base path of the dataset. + * + * @param metadataTableBasePath The base path of the metadata table + */ + static String getDatasetBasePath(String metadataTableBasePath) { + int endPos = metadataTableBasePath.lastIndexOf(Path.SEPARATOR + HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH); + checkState(endPos != -1, metadataTableBasePath + " should be base path of the metadata table"); + return metadataTableBasePath.substring(0, endPos); + } + + /** + * Returns {@code True} if the given path contains a metadata table. 
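 * <p>For example (assuming the metadata table lives under the data table's {@code .hoodie/metadata}
 * folder, i.e. {@link HoodieTableMetaClient#METADATA_TABLE_FOLDER_PATH}; paths are placeholders):
 * <pre>{@code
 *   isMetadataTable("/tmp/hudi_table/.hoodie/metadata")  // true
 *   isMetadataTable("/tmp/hudi_table")                    // false
 * }</pre>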
+ * + * @param basePath The base path to check + */ + static boolean isMetadataTable(String basePath) { + if (basePath.endsWith(Path.SEPARATOR)) { + basePath = basePath.substring(0, basePath.length() - 1); + } + return basePath.endsWith(HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH); + } + + static HoodieTableMetadata create(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, String datasetBasePath, + String spillableMapPath) { + return create(engineContext, metadataConfig, datasetBasePath, spillableMapPath, false); + } + + static HoodieTableMetadata create(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, String datasetBasePath, + String spillableMapPath, boolean reuse) { + if (metadataConfig.enabled()) { + return createHoodieBackedTableMetadata(engineContext, metadataConfig, datasetBasePath, spillableMapPath, reuse); + } else { + return createFSBackedTableMetadata(engineContext, metadataConfig, datasetBasePath); + } + } + + static FileSystemBackedTableMetadata createFSBackedTableMetadata(HoodieEngineContext engineContext, + HoodieMetadataConfig metadataConfig, + String datasetBasePath) { + return new FileSystemBackedTableMetadata(engineContext, new SerializableConfiguration(engineContext.getHadoopConf()), + datasetBasePath, metadataConfig.shouldAssumeDatePartitioning()); + } + + static HoodieBackedTableMetadata createHoodieBackedTableMetadata(HoodieEngineContext engineContext, + HoodieMetadataConfig metadataConfig, + String datasetBasePath, + String spillableMapPath, + boolean reuse) { + return new HoodieBackedTableMetadata(engineContext, metadataConfig, datasetBasePath, spillableMapPath, reuse); + } + + /** + * Fetch all the files at the given partition path, per the latest snapshot of the metadata. + */ + FileStatus[] getAllFilesInPartition(Path partitionPath) throws IOException; + + /** + * Fetch list of all partition paths, per the latest snapshot of the metadata. + */ + List getAllPartitionPaths() throws IOException; + + /** + * Fetch all files for given partition paths. + */ + Map getAllFilesInPartitions(List partitionPaths) throws IOException; + + /** + * Get the bloom filter for the FileID from the metadata table. + * + * @param partitionName - Partition name + * @param fileName - File name for which bloom filter needs to be retrieved + * @return BloomFilter if available, otherwise empty + * @throws HoodieMetadataException + */ + Option getBloomFilter(final String partitionName, final String fileName) + throws HoodieMetadataException; + + /** + * Get bloom filters for files from the metadata table index. + * + * @param partitionNameFileNameList - List of partition and file name pair for which bloom filters need to be retrieved + * @return Map of partition file name pair to its bloom filter + * @throws HoodieMetadataException + */ + Map, BloomFilter> getBloomFilters(final List> partitionNameFileNameList) + throws HoodieMetadataException; + + /** + * Get column stats for files from the metadata table index. + * + * @param partitionNameFileNameList - List of partition and file name pair for which bloom filters need to be retrieved + * @param columnName - Column name for which stats are needed + * @return Map of partition and file name pair to its column stats + * @throws HoodieMetadataException + */ + Map, HoodieMetadataColumnStats> getColumnStats(final List> partitionNameFileNameList, final String columnName) + throws HoodieMetadataException; + + /** + * Fetch records by key prefixes. 
Key prefix passed is expected to match the same prefix as stored in Metadata table partitions. For eg, in case of col stats partition, + * actual keys in metadata partition is encoded values of column name, partition name and file name. So, key prefixes passed to this method is expected to be encoded already. + * + * @param keyPrefixes list of key prefixes for which interested records are looked up for. + * @param partitionName partition name in metadata table where the records are looked up for. + * @return {@link HoodieData} of {@link HoodieRecord}s with records matching the passed in key prefixes. + */ + HoodieData> getRecordsByKeyPrefixes(List keyPrefixes, + String partitionName, + boolean shouldLoadInMemory); + + /** + * Get the instant time to which the metadata is synced w.r.t data timeline. + */ + Option getSyncedInstantTime(); + + /** + * Returns the timestamp of the latest compaction. + */ + Option getLatestCompactionTime(); + + /** + * Clear the states of the table metadata. + */ + void reset(); +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java new file mode 100644 index 0000000000000..374d6fb46e72a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -0,0 +1,1378 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.avro.ConvertingGenericData; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieDeltaWriteStat; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.util.Lazy; + +import org.apache.avro.AvroTypeException; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import javax.annotation.Nonnull; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collector; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema; +import static org.apache.hudi.avro.HoodieAvroUtils.addMetadataFields; +import static org.apache.hudi.avro.HoodieAvroUtils.convertValueForSpecificDataTypes; +import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldSchemaFromWriteSchema; +import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; +import static 
org.apache.hudi.common.util.ValidationUtils.checkState; +import static org.apache.hudi.metadata.HoodieMetadataPayload.unwrapStatisticValueWrapper; +import static org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME; +import static org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME; + +/** + * A utility to convert timeline information to metadata table records. + */ +public class HoodieTableMetadataUtil { + + private static final Logger LOG = LogManager.getLogger(HoodieTableMetadataUtil.class); + + public static final String PARTITION_NAME_FILES = "files"; + public static final String PARTITION_NAME_COLUMN_STATS = "column_stats"; + public static final String PARTITION_NAME_BLOOM_FILTERS = "bloom_filters"; + + /** + * Returns whether the files partition of metadata table is ready for read. + * + * @param metaClient {@link HoodieTableMetaClient} instance. + * @return true if the files partition of metadata table is ready for read, + * based on the table config; false otherwise. + */ + public static boolean isFilesPartitionAvailable(HoodieTableMetaClient metaClient) { + return metaClient.getTableConfig().getMetadataPartitions() + .contains(HoodieTableMetadataUtil.PARTITION_NAME_FILES); + } + + /** + * Collects {@link HoodieColumnRangeMetadata} for the provided collection of records, pretending + * as if provided records have been persisted w/in given {@code filePath} + * + * @param records target records to compute column range metadata for + * @param targetFields columns (fields) to be collected + * @param filePath file path value required for {@link HoodieColumnRangeMetadata} + * + * @return map of {@link HoodieColumnRangeMetadata} for each of the provided target fields for + * the collection of provided records + */ + public static Map> collectColumnRangeMetadata(List records, + List targetFields, + String filePath) { + // Helper class to calculate column stats + class ColumnStats { + Object minValue; + Object maxValue; + long nullCount; + long valueCount; + } + + HashMap allColumnStats = new HashMap<>(); + + // Collect stats for all columns by iterating through records while accounting + // corresponding stats + records.forEach((record) -> { + // For each column (field) we have to index update corresponding column stats + // with the values from this record + targetFields.forEach(field -> { + ColumnStats colStats = allColumnStats.computeIfAbsent(field.name(), (ignored) -> new ColumnStats()); + + GenericRecord genericRecord = (GenericRecord) record; + + final Object fieldVal = convertValueForSpecificDataTypes(field.schema(), genericRecord.get(field.name()), false); + final Schema fieldSchema = getNestedFieldSchemaFromWriteSchema(genericRecord.getSchema(), field.name()); + + if (fieldVal != null && canCompare(fieldSchema)) { + // Set the min value of the field + if (colStats.minValue == null + || ConvertingGenericData.INSTANCE.compare(fieldVal, colStats.minValue, fieldSchema) < 0) { + colStats.minValue = fieldVal; + } + + // Set the max value of the field + if (colStats.maxValue == null || ConvertingGenericData.INSTANCE.compare(fieldVal, colStats.maxValue, fieldSchema) > 0) { + colStats.maxValue = fieldVal; + } + + colStats.valueCount++; + } else { + colStats.nullCount++; + } + }); + }); + + Collector, ?, Map>> collector = + Collectors.toMap(colRangeMetadata -> colRangeMetadata.getColumnName(), Function.identity()); + + return (Map>) targetFields.stream() + .map(field -> { + ColumnStats colStats = allColumnStats.get(field.name()); + return 
HoodieColumnRangeMetadata.create( + filePath, + field.name(), + colStats == null ? null : coerceToComparable(field.schema(), colStats.minValue), + colStats == null ? null : coerceToComparable(field.schema(), colStats.maxValue), + colStats == null ? 0 : colStats.nullCount, + colStats == null ? 0 : colStats.valueCount, + // NOTE: Size and compressed size statistics are set to 0 to make sure we're not + // mixing up those provided by Parquet with the ones from other encodings, + // since those are not directly comparable + 0, + 0 + ); + }) + .collect(collector); + } + + /** + * Converts instance of {@link HoodieMetadataColumnStats} to {@link HoodieColumnRangeMetadata} + */ + public static HoodieColumnRangeMetadata convertColumnStatsRecordToColumnRangeMetadata(HoodieMetadataColumnStats columnStats) { + return HoodieColumnRangeMetadata.create( + columnStats.getFileName(), + columnStats.getColumnName(), + unwrapStatisticValueWrapper(columnStats.getMinValue()), + unwrapStatisticValueWrapper(columnStats.getMaxValue()), + columnStats.getNullCount(), + columnStats.getValueCount(), + columnStats.getTotalSize(), + columnStats.getTotalUncompressedSize()); + } + + /** + * Delete the metadata table for the dataset. This will be invoked during upgrade/downgrade operation during which + * no other + * process should be running. + * + * @param basePath base path of the dataset + * @param context instance of {@link HoodieEngineContext}. + */ + public static void deleteMetadataTable(String basePath, HoodieEngineContext context) { + final String metadataTablePathStr = HoodieTableMetadata.getMetadataTableBasePath(basePath); + FileSystem fs = FSUtils.getFs(metadataTablePathStr, context.getHadoopConf().get()); + try { + Path metadataTablePath = new Path(metadataTablePathStr); + if (fs.exists(metadataTablePath)) { + fs.delete(metadataTablePath, true); + } + } catch (Exception e) { + throw new HoodieMetadataException("Failed to remove metadata table from path " + metadataTablePathStr, e); + } + } + + /** + * Deletes the metadata partition from the file system. + * + * @param basePath - base path of the dataset + * @param context - instance of {@link HoodieEngineContext} + * @param partitionType - {@link MetadataPartitionType} of the partition to delete + */ + public static void deleteMetadataPartition(String basePath, HoodieEngineContext context, MetadataPartitionType partitionType) { + final String metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + FileSystem fs = FSUtils.getFs(metadataTablePath, context.getHadoopConf().get()); + try { + fs.delete(new Path(metadataTablePath, partitionType.getPartitionPath()), true); + } catch (Exception e) { + throw new HoodieMetadataException(String.format("Failed to remove metadata partition %s from path %s", partitionType, metadataTablePath), e); + } + } + + /** + * Check if the given metadata partition exists. + * + * @param basePath base path of the dataset + * @param context instance of {@link HoodieEngineContext}. 
+ */ + public static boolean metadataPartitionExists(String basePath, HoodieEngineContext context, MetadataPartitionType partitionType) { + final String metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + FileSystem fs = FSUtils.getFs(metadataTablePath, context.getHadoopConf().get()); + try { + return fs.exists(new Path(metadataTablePath, partitionType.getPartitionPath())); + } catch (Exception e) { + throw new HoodieIOException(String.format("Failed to check metadata partition %s exists.", partitionType.getPartitionPath())); + } + } + + /** + * Convert commit action to metadata records for the enabled partition types. + * + * @param commitMetadata - Commit action metadata + * @param instantTime - Action instant time + * @param recordsGenerationParams - Parameters for the record generation + * @return Map of partition to metadata records for the commit action + */ + public static Map> convertMetadataToRecords( + HoodieEngineContext context, HoodieCommitMetadata commitMetadata, String instantTime, + MetadataRecordsGenerationParams recordsGenerationParams) { + final Map> partitionToRecordsMap = new HashMap<>(); + final HoodieData filesPartitionRecordsRDD = context.parallelize( + convertMetadataToFilesPartitionRecords(commitMetadata, instantTime), 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecordsRDD); + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.BLOOM_FILTERS)) { + final HoodieData metadataBloomFilterRecords = convertMetadataToBloomFilterRecords(context, commitMetadata, instantTime, recordsGenerationParams); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecords); + } + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.COLUMN_STATS)) { + final HoodieData metadataColumnStatsRDD = convertMetadataToColumnStatsRecords(commitMetadata, context, recordsGenerationParams); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + return partitionToRecordsMap; + } + + /** + * Finds all new files/partitions created as part of commit and creates metadata table records for them. 
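   * <p>As a rough illustration (file, partition, and size values are placeholders), a commit that writes
   * {@code f1.parquet} (100 bytes) to partition {@code 2022/01/01} yields two upserts into the files
   * partition of the metadata table, built via the payload factories used in this method:
   * <pre>{@code
   *   // 1. the partition-list record, adding "2022/01/01" under the __all_partitions__ key
   *   HoodieMetadataPayload.createPartitionListRecord(
   *       Collections.singletonList("2022/01/01"), Collections.emptyList());
   *   // 2. the per-partition file-listing record, mapping "f1.parquet" -> 100L
   *   HoodieMetadataPayload.createPartitionFilesRecord(
   *       "2022/01/01", Option.of(Collections.singletonMap("f1.parquet", 100L)), Option.empty());
   * }</pre>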
+ * + * @param commitMetadata - Commit action metadata + * @param instantTime - Commit action instant time + * @return List of metadata table records + */ + public static List convertMetadataToFilesPartitionRecords(HoodieCommitMetadata commitMetadata, + String instantTime) { + List records = new ArrayList<>(commitMetadata.getPartitionToWriteStats().size()); + + // Add record bearing added partitions list + List partitionsAdded = getPartitionsAdded(commitMetadata); + + // Add record bearing deleted partitions list + List partitionsDeleted = getPartitionsDeleted(commitMetadata); + + records.add(HoodieMetadataPayload.createPartitionListRecord(partitionsAdded, partitionsDeleted)); + + // Update files listing records for each individual partition + List> updatedPartitionFilesRecords = + commitMetadata.getPartitionToWriteStats().entrySet() + .stream() + .map(entry -> { + String partitionStatName = entry.getKey(); + List writeStats = entry.getValue(); + + String partition = getPartitionIdentifier(partitionStatName); + + HashMap updatedFilesToSizesMapping = + writeStats.stream().reduce(new HashMap<>(writeStats.size()), + (map, stat) -> { + String pathWithPartition = stat.getPath(); + if (pathWithPartition == null) { + // Empty partition + LOG.warn("Unable to find path in write stat to update metadata table " + stat); + return map; + } + + String fileName = FSUtils.getFileName(pathWithPartition, partitionStatName); + + // Since write-stats are coming in no particular order, if the same + // file have previously been appended to w/in the txn, we simply pick max + // of the sizes as reported after every write, since file-sizes are + // monotonically increasing (ie file-size never goes down, unless deleted) + map.merge(fileName, stat.getFileSizeInBytes(), Math::max); + + return map; + }, + CollectionUtils::combine); + + return HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.of(updatedFilesToSizesMapping), + Option.empty()); + }) + .collect(Collectors.toList()); + + records.addAll(updatedPartitionFilesRecords); + + LOG.info("Updating at " + instantTime + " from Commit/" + commitMetadata.getOperationType() + + ". #partitions_updated=" + records.size()); + + return records; + } + + private static List getPartitionsAdded(HoodieCommitMetadata commitMetadata) { + return commitMetadata.getPartitionToWriteStats().keySet().stream() + // We need to make sure we properly handle case of non-partitioned tables + .map(HoodieTableMetadataUtil::getPartitionIdentifier) + .collect(Collectors.toList()); + } + + private static List getPartitionsDeleted(HoodieCommitMetadata commitMetadata) { + if (commitMetadata instanceof HoodieReplaceCommitMetadata + && WriteOperationType.DELETE_PARTITION.equals(commitMetadata.getOperationType())) { + Map> partitionToReplaceFileIds = + ((HoodieReplaceCommitMetadata) commitMetadata).getPartitionToReplaceFileIds(); + + return partitionToReplaceFileIds.keySet().stream() + // We need to make sure we properly handle case of non-partitioned tables + .map(HoodieTableMetadataUtil::getPartitionIdentifier) + .collect(Collectors.toList()); + } + + return Collections.emptyList(); + } + + /** + * Convert commit action metadata to bloom filter records. 
+ * + * @param context - Engine context to use + * @param commitMetadata - Commit action metadata + * @param instantTime - Action instant time + * @param recordsGenerationParams - Parameters for bloom filter record generation + * @return HoodieData of metadata table records + */ + public static HoodieData convertMetadataToBloomFilterRecords( + HoodieEngineContext context, HoodieCommitMetadata commitMetadata, + String instantTime, MetadataRecordsGenerationParams recordsGenerationParams) { + final List allWriteStats = commitMetadata.getPartitionToWriteStats().values().stream() + .flatMap(entry -> entry.stream()).collect(Collectors.toList()); + if (allWriteStats.isEmpty()) { + return context.emptyHoodieData(); + } + + final int parallelism = Math.max(Math.min(allWriteStats.size(), recordsGenerationParams.getBloomIndexParallelism()), 1); + HoodieData allWriteStatsRDD = context.parallelize(allWriteStats, parallelism); + return allWriteStatsRDD.flatMap(hoodieWriteStat -> { + final String partition = hoodieWriteStat.getPartitionPath(); + + // For bloom filter index, delta writes do not change the base file bloom filter entries + if (hoodieWriteStat instanceof HoodieDeltaWriteStat) { + return Collections.emptyListIterator(); + } + + String pathWithPartition = hoodieWriteStat.getPath(); + if (pathWithPartition == null) { + // Empty partition + LOG.error("Failed to find path in write stat to update metadata table " + hoodieWriteStat); + return Collections.emptyListIterator(); + } + + String fileName = FSUtils.getFileName(pathWithPartition, partition); + if (!FSUtils.isBaseFile(new Path(fileName))) { + return Collections.emptyListIterator(); + } + + final Path writeFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition); + try (HoodieFileReader fileReader = + HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), writeFilePath)) { + try { + final BloomFilter fileBloomFilter = fileReader.readBloomFilter(); + if (fileBloomFilter == null) { + LOG.error("Failed to read bloom filter for " + writeFilePath); + return Collections.emptyListIterator(); + } + ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes()); + HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord( + partition, fileName, instantTime, recordsGenerationParams.getBloomFilterType(), bloomByteBuffer, false); + return Collections.singletonList(record).iterator(); + } catch (Exception e) { + LOG.error("Failed to read bloom filter for " + writeFilePath); + return Collections.emptyListIterator(); + } finally { + fileReader.close(); + } + } catch (IOException e) { + LOG.error("Failed to get bloom filter for file: " + writeFilePath + ", write stat: " + hoodieWriteStat); + } + return Collections.emptyListIterator(); + }); + } + + /** + * Convert the clean action to metadata records. 
+ */ + public static Map> convertMetadataToRecords(HoodieEngineContext engineContext, + HoodieCleanMetadata cleanMetadata, + MetadataRecordsGenerationParams recordsGenerationParams, + String instantTime) { + final Map> partitionToRecordsMap = new HashMap<>(); + final HoodieData filesPartitionRecordsRDD = engineContext.parallelize( + convertMetadataToFilesPartitionRecords(cleanMetadata, instantTime), 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecordsRDD); + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.BLOOM_FILTERS)) { + final HoodieData metadataBloomFilterRecordsRDD = + convertMetadataToBloomFilterRecords(cleanMetadata, engineContext, instantTime, recordsGenerationParams); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecordsRDD); + } + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.COLUMN_STATS)) { + final HoodieData metadataColumnStatsRDD = + convertMetadataToColumnStatsRecords(cleanMetadata, engineContext, recordsGenerationParams); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + + return partitionToRecordsMap; + } + + /** + * Finds all files that were deleted as part of a clean and creates metadata table records for them. + * + * @param cleanMetadata + * @param instantTime + * @return a list of metadata table records + */ + public static List convertMetadataToFilesPartitionRecords(HoodieCleanMetadata cleanMetadata, + String instantTime) { + List records = new LinkedList<>(); + int[] fileDeleteCount = {0}; + List deletedPartitions = new ArrayList<>(); + cleanMetadata.getPartitionMetadata().forEach((partitionName, partitionMetadata) -> { + final String partition = getPartitionIdentifier(partitionName); + // Files deleted from a partition + List deletedFiles = partitionMetadata.getDeletePathPatterns(); + HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.empty(), + Option.of(new ArrayList<>(deletedFiles))); + + records.add(record); + fileDeleteCount[0] += deletedFiles.size(); + boolean isPartitionDeleted = partitionMetadata.getIsPartitionDeleted(); + if (isPartitionDeleted) { + deletedPartitions.add(partitionName); + } + }); + + if (!deletedPartitions.isEmpty()) { + // if there are partitions to be deleted, add them to delete list + records.add(HoodieMetadataPayload.createPartitionListRecord(deletedPartitions, true)); + } + LOG.info("Updating at " + instantTime + " from Clean. #partitions_updated=" + records.size() + + ", #files_deleted=" + fileDeleteCount[0] + ", #partitions_deleted=" + deletedPartitions.size()); + return records; + } + + /** + * Convert clean metadata to bloom filter index records. 
+ * + * @param cleanMetadata - Clean action metadata + * @param engineContext - Engine context + * @param instantTime - Clean action instant time + * @param recordsGenerationParams - Parameters for bloom filter record generation + * @return List of bloom filter index records for the clean metadata + */ + public static HoodieData convertMetadataToBloomFilterRecords(HoodieCleanMetadata cleanMetadata, + HoodieEngineContext engineContext, + String instantTime, + MetadataRecordsGenerationParams recordsGenerationParams) { + List> deleteFileList = new ArrayList<>(); + cleanMetadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> { + // Files deleted from a partition + List deletedFiles = partitionMetadata.getDeletePathPatterns(); + deletedFiles.forEach(entry -> { + final Path deletedFilePath = new Path(entry); + if (FSUtils.isBaseFile(deletedFilePath)) { + deleteFileList.add(Pair.of(partition, deletedFilePath.getName())); + } + }); + }); + + final int parallelism = Math.max(Math.min(deleteFileList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1); + HoodieData> deleteFileListRDD = engineContext.parallelize(deleteFileList, parallelism); + return deleteFileListRDD.map(deleteFileInfoPair -> HoodieMetadataPayload.createBloomFilterMetadataRecord( + deleteFileInfoPair.getLeft(), deleteFileInfoPair.getRight(), instantTime, StringUtils.EMPTY_STRING, + ByteBuffer.allocate(0), true)); + } + + /** + * Convert clean metadata to column stats index records. + * + * @param cleanMetadata - Clean action metadata + * @param engineContext - Engine context + * @param recordsGenerationParams - Parameters for bloom filter record generation + * @return List of column stats index records for the clean metadata + */ + public static HoodieData convertMetadataToColumnStatsRecords(HoodieCleanMetadata cleanMetadata, + HoodieEngineContext engineContext, + MetadataRecordsGenerationParams recordsGenerationParams) { + List> deleteFileList = new ArrayList<>(); + cleanMetadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> { + // Files deleted from a partition + List deletedFiles = partitionMetadata.getDeletePathPatterns(); + deletedFiles.forEach(entry -> deleteFileList.add(Pair.of(partition, entry))); + }); + + HoodieTableMetaClient dataTableMetaClient = recordsGenerationParams.getDataMetaClient(); + + List columnsToIndex = + getColumnsToIndex(recordsGenerationParams, + Lazy.lazily(() -> tryResolveSchemaForTable(dataTableMetaClient))); + + if (columnsToIndex.isEmpty()) { + // In case there are no columns to index, bail + return engineContext.emptyHoodieData(); + } + + int parallelism = Math.max(Math.min(deleteFileList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); + return engineContext.parallelize(deleteFileList, parallelism) + .flatMap(deleteFileInfoPair -> { + String partitionPath = deleteFileInfoPair.getLeft(); + String filePath = deleteFileInfoPair.getRight(); + + if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return getColumnStatsRecords(partitionPath, filePath, dataTableMetaClient, columnsToIndex, true).iterator(); + } + return Collections.emptyListIterator(); + }); + } + + /** + * Convert restore action metadata to metadata table records. 
+ */ + public static Map> convertMetadataToRecords( + HoodieEngineContext engineContext, HoodieActiveTimeline metadataTableTimeline, HoodieRestoreMetadata restoreMetadata, + MetadataRecordsGenerationParams recordsGenerationParams, String instantTime, Option lastSyncTs) { + final Map> partitionToRecordsMap = new HashMap<>(); + final Map> partitionToAppendedFiles = new HashMap<>(); + final Map> partitionToDeletedFiles = new HashMap<>(); + + processRestoreMetadata(metadataTableTimeline, restoreMetadata, partitionToAppendedFiles, partitionToDeletedFiles, lastSyncTs); + final HoodieData filesPartitionRecordsRDD = + engineContext.parallelize(convertFilesToFilesPartitionRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Restore"), 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecordsRDD); + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.BLOOM_FILTERS)) { + final HoodieData metadataBloomFilterRecordsRDD = + convertFilesToBloomFilterRecords(engineContext, partitionToDeletedFiles, partitionToAppendedFiles, recordsGenerationParams, instantTime); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecordsRDD); + } + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.COLUMN_STATS)) { + final HoodieData metadataColumnStatsRDD = + convertFilesToColumnStatsRecords(engineContext, partitionToDeletedFiles, partitionToAppendedFiles, recordsGenerationParams); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + return partitionToRecordsMap; + } + + /** + * Aggregates all files deleted and appended to from all rollbacks associated with a restore operation then + * creates metadata table records for them. + * + * @param restoreMetadata - Restore action metadata + * @return a list of metadata table records + */ + private static void processRestoreMetadata(HoodieActiveTimeline metadataTableTimeline, + HoodieRestoreMetadata restoreMetadata, + Map> partitionToAppendedFiles, + Map> partitionToDeletedFiles, + Option lastSyncTs) { + restoreMetadata.getHoodieRestoreMetadata().values().forEach(rms -> rms.forEach(rm -> processRollbackMetadata(metadataTableTimeline, rm, + partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs))); + } + + /** + * Convert rollback action metadata to metadata table records. 
+ */ + public static Map> convertMetadataToRecords( + HoodieEngineContext engineContext, HoodieActiveTimeline metadataTableTimeline, + HoodieRollbackMetadata rollbackMetadata, MetadataRecordsGenerationParams recordsGenerationParams, + String instantTime, Option lastSyncTs, boolean wasSynced) { + final Map> partitionToRecordsMap = new HashMap<>(); + Map> partitionToDeletedFiles = new HashMap<>(); + Map> partitionToAppendedFiles = new HashMap<>(); + + List filesPartitionRecords = + convertMetadataToRollbackRecords(metadataTableTimeline, rollbackMetadata, partitionToDeletedFiles, partitionToAppendedFiles, instantTime, lastSyncTs, wasSynced); + final HoodieData rollbackRecordsRDD = engineContext.parallelize(filesPartitionRecords, 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, rollbackRecordsRDD); + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.BLOOM_FILTERS)) { + final HoodieData metadataBloomFilterRecordsRDD = + convertFilesToBloomFilterRecords(engineContext, partitionToDeletedFiles, partitionToAppendedFiles, recordsGenerationParams, instantTime); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecordsRDD); + } + + if (recordsGenerationParams.getEnabledPartitionTypes().contains(MetadataPartitionType.COLUMN_STATS)) { + final HoodieData metadataColumnStatsRDD = + convertFilesToColumnStatsRecords(engineContext, partitionToDeletedFiles, partitionToAppendedFiles, recordsGenerationParams); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + + return partitionToRecordsMap; + } + + /** + * Convert rollback action metadata to files partition records. + */ + private static List convertMetadataToRollbackRecords(HoodieActiveTimeline metadataTableTimeline, + HoodieRollbackMetadata rollbackMetadata, + Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + String instantTime, + Option lastSyncTs, boolean wasSynced) { + processRollbackMetadata(metadataTableTimeline, rollbackMetadata, partitionToDeletedFiles, + partitionToAppendedFiles, lastSyncTs); + if (!wasSynced) { + // Since the instant-being-rolled-back was never committed to the metadata table, the files added there + // need not be deleted. For MOR Table, the rollback appends logBlocks so we need to keep the appended files. + partitionToDeletedFiles.clear(); + } + return convertFilesToFilesPartitionRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Rollback"); + } + + /** + * Extracts information about the deleted and append files from the {@code HoodieRollbackMetadata}. + *

+   * During a rollback, files may be deleted (COW, MOR) or rollback blocks may be appended (MOR only) to files. This
+   * function extracts these file changes for each partition.
+   *
+   * @param metadataTableTimeline    Current timeline of the Metadata Table
+   * @param rollbackMetadata         {@code HoodieRollbackMetadata}
+   * @param partitionToDeletedFiles  The {@code Map} to fill with files deleted per partition.
+   * @param partitionToAppendedFiles The {@code Map} to fill with files appended per partition and their sizes.
+   */
+  private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTimeline,
+                                              HoodieRollbackMetadata rollbackMetadata,
+                                              Map> partitionToDeletedFiles,
+                                              Map> partitionToAppendedFiles,
+                                              Option lastSyncTs) {
+    rollbackMetadata.getPartitionMetadata().values().forEach(pm -> {
+      final String instantToRollback = rollbackMetadata.getCommitsRollback().get(0);
+      // Has this rollback produced new files?
+      boolean hasRollbackLogFiles = pm.getRollbackLogFiles() != null && !pm.getRollbackLogFiles().isEmpty();
+      boolean hasNonZeroRollbackLogFiles = hasRollbackLogFiles && pm.getRollbackLogFiles().values().stream().mapToLong(Long::longValue).sum() > 0;
+
+      // If the instant-to-rollback has not been synced to the metadata table yet, then there is no need to update the metadata.
+      // This can happen in two cases:
+      //  Case 1: The Metadata Table timeline is behind the instant-to-rollback.
+      boolean shouldSkip = lastSyncTs.isPresent()
+          && HoodieTimeline.compareTimestamps(instantToRollback, HoodieTimeline.GREATER_THAN, lastSyncTs.get());
+
+      if (!hasNonZeroRollbackLogFiles && shouldSkip) {
+        LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, given metadata table is already synced up to %s",
+            instantToRollback, lastSyncTs.get()));
+        return;
+      }
+
+      // Case 2: The instant-to-rollback was never committed to the Metadata Table. This can happen if the instant-to-rollback
+      // was a failed commit (never completed).
+      //
+      // There are two cases of a failed commit that we need to take care of:
+      //   1) The commit was synced to the metadata table successfully, but the dataset meta file failed to switch state
+      //      (from INFLIGHT to COMPLETED). The committed files should be rolled back, thus the rollback metadata
+      //      cannot be skipped; usually a failover should be triggered, and the metadata active timeline is expected
+      //      to contain the commit, so we can check whether the commit was synced to the metadata table
+      //      through HoodieActiveTimeline#containsInstant.
+      //
+      //   2) The sync of the commit to the metadata table failed, or the commit was never synced to the metadata table;
+      //      in this case, the rollback metadata should be skipped.
+      //
+      // And in which case does
+      //   metadataTableTimeline.getCommitsTimeline().isBeforeTimelineStarts(syncedInstant.getTimestamp())
+      // return true?
+      // It is most probably because of a compaction rollback: we schedule a compaction plan early in the timeline (say t1),
+      // then after a long time schedule and execute the plan, and then try to roll it back.
+      //
+      //   scheduled      execution      rollback        compaction actions
+      //   ----- t1 ----- t3 ----- t4 -----              dataset timeline
+      //
+      //   ---------- t2 (archive) -----------           metadata timeline
+      //
+      // When we commit the compaction rollback at time t4, the above check returns true.
+ HoodieInstant syncedInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantToRollback); + if (metadataTableTimeline.getCommitsTimeline().isBeforeTimelineStarts(syncedInstant.getTimestamp())) { + throw new HoodieMetadataException(String.format("The instant %s required to sync rollback of %s has been archived", + syncedInstant, instantToRollback)); + } + shouldSkip = !metadataTableTimeline.containsInstant(syncedInstant); + if (!hasNonZeroRollbackLogFiles && shouldSkip) { + LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, since this instant was never committed to Metadata Table", + instantToRollback)); + return; + } + + final String partition = pm.getPartitionPath(); + if ((!pm.getSuccessDeleteFiles().isEmpty() || !pm.getFailedDeleteFiles().isEmpty()) && !shouldSkip) { + if (!partitionToDeletedFiles.containsKey(partition)) { + partitionToDeletedFiles.put(partition, new ArrayList<>()); + } + + // Extract deleted file name from the absolute paths saved in getSuccessDeleteFiles() + List deletedFiles = pm.getSuccessDeleteFiles().stream().map(p -> new Path(p).getName()) + .collect(Collectors.toList()); + if (!pm.getFailedDeleteFiles().isEmpty()) { + deletedFiles.addAll(pm.getFailedDeleteFiles().stream().map(p -> new Path(p).getName()) + .collect(Collectors.toList())); + } + partitionToDeletedFiles.get(partition).addAll(deletedFiles); + } + + BiFunction fileMergeFn = (oldSize, newSizeCopy) -> { + // if a file exists in both written log files and rollback log files, we want to pick the one that is higher + // as rollback file could have been updated after written log files are computed. + return oldSize > newSizeCopy ? oldSize : newSizeCopy; + }; + + if (hasRollbackLogFiles) { + if (!partitionToAppendedFiles.containsKey(partition)) { + partitionToAppendedFiles.put(partition, new HashMap<>()); + } + + // Extract appended file name from the absolute paths saved in getAppendFiles() + pm.getRollbackLogFiles().forEach((path, size) -> { + partitionToAppendedFiles.get(partition).merge(new Path(path).getName(), size, fileMergeFn); + }); + } + }); + } + + /** + * Convert rollback action metadata to files partition records. 
+ */ + private static List convertFilesToFilesPartitionRecords(Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + String instantTime, String operation) { + List records = new LinkedList<>(); + int[] fileChangeCount = {0, 0}; // deletes, appends + + partitionToDeletedFiles.forEach((partitionName, deletedFiles) -> { + fileChangeCount[0] += deletedFiles.size(); + final String partition = getPartitionIdentifier(partitionName); + + Option> filesAdded = Option.empty(); + if (partitionToAppendedFiles.containsKey(partitionName)) { + filesAdded = Option.of(partitionToAppendedFiles.remove(partitionName)); + } + + HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, filesAdded, + Option.of(new ArrayList<>(deletedFiles))); + records.add(record); + }); + + partitionToAppendedFiles.forEach((partitionName, appendedFileMap) -> { + final String partition = getPartitionIdentifier(partitionName); + fileChangeCount[1] += appendedFileMap.size(); + + // Validate that no appended file has been deleted + checkState( + !appendedFileMap.keySet().removeAll(partitionToDeletedFiles.getOrDefault(partition, Collections.emptyList())), + "Rollback file cannot both be appended and deleted"); + + // New files added to a partition + HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.of(appendedFileMap), + Option.empty()); + records.add(record); + }); + + LOG.info("Found at " + instantTime + " from " + operation + ". #partitions_updated=" + records.size() + + ", #files_deleted=" + fileChangeCount[0] + ", #files_appended=" + fileChangeCount[1]); + + return records; + } + + /** + * Returns partition name for the given path. + */ + public static String getPartitionIdentifier(@Nonnull String relativePartitionPath) { + return EMPTY_PARTITION_NAME.equals(relativePartitionPath) ? NON_PARTITIONED_NAME : relativePartitionPath; + } + + /** + * Convert added and deleted files metadata to bloom filter index records. 
+ */ + public static HoodieData convertFilesToBloomFilterRecords(HoodieEngineContext engineContext, + Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + MetadataRecordsGenerationParams recordsGenerationParams, + String instantTime) { + HoodieData allRecordsRDD = engineContext.emptyHoodieData(); + + List>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet() + .stream().map(e -> Pair.of(e.getKey(), e.getValue())).collect(Collectors.toList()); + int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1); + HoodieData>> partitionToDeletedFilesRDD = engineContext.parallelize(partitionToDeletedFilesList, parallelism); + + HoodieData deletedFilesRecordsRDD = partitionToDeletedFilesRDD.flatMap(partitionToDeletedFilesPair -> { + final String partitionName = partitionToDeletedFilesPair.getLeft(); + final List deletedFileList = partitionToDeletedFilesPair.getRight(); + return deletedFileList.stream().flatMap(deletedFile -> { + if (!FSUtils.isBaseFile(new Path(deletedFile))) { + return Stream.empty(); + } + + final String partition = getPartitionIdentifier(partitionName); + return Stream.of(HoodieMetadataPayload.createBloomFilterMetadataRecord( + partition, deletedFile, instantTime, StringUtils.EMPTY_STRING, ByteBuffer.allocate(0), true)); + }).iterator(); + }); + allRecordsRDD = allRecordsRDD.union(deletedFilesRecordsRDD); + + List>> partitionToAppendedFilesList = partitionToAppendedFiles.entrySet() + .stream().map(entry -> Pair.of(entry.getKey(), entry.getValue())).collect(Collectors.toList()); + parallelism = Math.max(Math.min(partitionToAppendedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1); + HoodieData>> partitionToAppendedFilesRDD = engineContext.parallelize(partitionToAppendedFilesList, parallelism); + + HoodieData appendedFilesRecordsRDD = partitionToAppendedFilesRDD.flatMap(partitionToAppendedFilesPair -> { + final String partitionName = partitionToAppendedFilesPair.getLeft(); + final Map appendedFileMap = partitionToAppendedFilesPair.getRight(); + final String partition = getPartitionIdentifier(partitionName); + return appendedFileMap.entrySet().stream().flatMap(appendedFileLengthPairEntry -> { + final String appendedFile = appendedFileLengthPairEntry.getKey(); + if (!FSUtils.isBaseFile(new Path(appendedFile))) { + return Stream.empty(); + } + final String pathWithPartition = partitionName + "/" + appendedFile; + final Path appendedFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition); + try (HoodieFileReader fileReader = + HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), appendedFilePath)) { + final BloomFilter fileBloomFilter = fileReader.readBloomFilter(); + if (fileBloomFilter == null) { + LOG.error("Failed to read bloom filter for " + appendedFilePath); + return Stream.empty(); + } + ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes()); + HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord( + partition, appendedFile, instantTime, recordsGenerationParams.getBloomFilterType(), bloomByteBuffer, false); + return Stream.of(record); + } catch (IOException e) { + LOG.error("Failed to get bloom filter for file: " + appendedFilePath); + } + return Stream.empty(); + }).iterator(); + }); + allRecordsRDD = allRecordsRDD.union(appendedFilesRecordsRDD); + + return allRecordsRDD; + } + + /** + * Convert added and deleted 
action metadata to column stats index records. + */ + public static HoodieData convertFilesToColumnStatsRecords(HoodieEngineContext engineContext, + Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + MetadataRecordsGenerationParams recordsGenerationParams) { + HoodieData allRecordsRDD = engineContext.emptyHoodieData(); + HoodieTableMetaClient dataTableMetaClient = recordsGenerationParams.getDataMetaClient(); + + final List columnsToIndex = + getColumnsToIndex(recordsGenerationParams, + Lazy.lazily(() -> tryResolveSchemaForTable(dataTableMetaClient))); + + if (columnsToIndex.isEmpty()) { + // In case there are no columns to index, bail + return engineContext.emptyHoodieData(); + } + + final List>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream() + .map(e -> Pair.of(e.getKey(), e.getValue())) + .collect(Collectors.toList()); + + int deletedFilesTargetParallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); + final HoodieData>> partitionToDeletedFilesRDD = + engineContext.parallelize(partitionToDeletedFilesList, deletedFilesTargetParallelism); + + HoodieData deletedFilesRecordsRDD = partitionToDeletedFilesRDD.flatMap(partitionToDeletedFilesPair -> { + final String partitionPath = partitionToDeletedFilesPair.getLeft(); + final String partitionId = getPartitionIdentifier(partitionPath); + final List deletedFileList = partitionToDeletedFilesPair.getRight(); + + return deletedFileList.stream().flatMap(deletedFile -> { + final String filePathWithPartition = partitionPath + "/" + deletedFile; + return getColumnStatsRecords(partitionId, filePathWithPartition, dataTableMetaClient, columnsToIndex, true); + }).iterator(); + }); + + allRecordsRDD = allRecordsRDD.union(deletedFilesRecordsRDD); + + final List>> partitionToAppendedFilesList = partitionToAppendedFiles.entrySet().stream() + .map(entry -> Pair.of(entry.getKey(), entry.getValue())) + .collect(Collectors.toList()); + + int appendedFilesTargetParallelism = Math.max(Math.min(partitionToAppendedFilesList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); + final HoodieData>> partitionToAppendedFilesRDD = + engineContext.parallelize(partitionToAppendedFilesList, appendedFilesTargetParallelism); + + HoodieData appendedFilesRecordsRDD = partitionToAppendedFilesRDD.flatMap(partitionToAppendedFilesPair -> { + final String partitionPath = partitionToAppendedFilesPair.getLeft(); + final String partitionId = getPartitionIdentifier(partitionPath); + final Map appendedFileMap = partitionToAppendedFilesPair.getRight(); + + return appendedFileMap.entrySet().stream().flatMap(appendedFileNameLengthEntry -> { + if (!FSUtils.isBaseFile(new Path(appendedFileNameLengthEntry.getKey())) + || !appendedFileNameLengthEntry.getKey().endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return Stream.empty(); + } + final String filePathWithPartition = partitionPath + "/" + appendedFileNameLengthEntry.getKey(); + return getColumnStatsRecords(partitionId, filePathWithPartition, dataTableMetaClient, columnsToIndex, false); + }).iterator(); + }); + + allRecordsRDD = allRecordsRDD.union(appendedFilesRecordsRDD); + + return allRecordsRDD; + } + + /** + * Map a record key to a file group in partition of interest. + *

    + * Note: For hashing, the algorithm is the same as String.hashCode(), but it is defined here because the hashCode() + * implementation is not guaranteed by the JVM to be consistent across JVM versions and implementations. + * + * @param recordKey record key for which the file group index is looked up. + * @param numFileGroups total number of file groups in the metadata partition. + * @return The file group index derived from the hash of the given record key + */ + public static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) { + int h = 0; + for (int i = 0; i < recordKey.length(); ++i) { + h = 31 * h + recordKey.charAt(i); + } + + return Math.abs(Math.abs(h) % numFileGroups); + } + + /** + * Get the latest file slices for a Metadata Table partition. If the latest file slice is + * due to a pending compaction instant, it is merged with the file slice + * just before the compaction instant time. The list of file slices returned is + * sorted in the correct order of file group name. + * + * @param metaClient Instance of {@link HoodieTableMetaClient}. + * @param fsView Metadata table filesystem view. + * @param partition The name of the partition whose file groups are to be loaded. + * @return List of latest file slices for all file groups in a given partition. + */ + public static List getPartitionLatestMergedFileSlices( + HoodieTableMetaClient metaClient, HoodieTableFileSystemView fsView, String partition) { + LOG.info("Loading latest merged file slices for metadata table partition " + partition); + return getPartitionFileSlices(metaClient, Option.of(fsView), partition, true); + } + + /** + * Get the latest file slices for a Metadata Table partition. The list of file slices + * returned is sorted in the correct order of file group name. + * + * @param metaClient - Instance of {@link HoodieTableMetaClient}. + * @param fsView - Metadata table filesystem view + * @param partition - The name of the partition whose file groups are to be loaded. + * @return List of latest file slices for all file groups in a given partition. + */ + public static List getPartitionLatestFileSlices(HoodieTableMetaClient metaClient, + Option fsView, String partition) { + LOG.info("Loading latest file slices for metadata table partition " + partition); + return getPartitionFileSlices(metaClient, fsView, partition, false); + } + + /** + * Get metadata table file system view. + * + * @param metaClient - Metadata table meta client + * @return Filesystem view for the metadata table + */ + public static HoodieTableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient) { + // If there are no commits on the metadata table then the table's + // default FileSystemView will not return any file slices even + // though we may have initialized them. + HoodieTimeline timeline = metaClient.getActiveTimeline(); + if (timeline.empty()) { + final HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, + HoodieActiveTimeline.createNewInstantTime()); + timeline = new HoodieDefaultTimeline(Stream.of(instant), metaClient.getActiveTimeline()::getInstantDetails); + } + return new HoodieTableFileSystemView(metaClient, timeline); + } + + /** + * Get the latest file slices for a given partition. + * + * @param metaClient - Instance of {@link HoodieTableMetaClient}. + * @param partition - The name of the partition whose file groups are to be loaded. + * @param mergeFileSlices - When enabled, will merge the latest file slices with the last known + * completed instant. This is useful for readers when there are pending + * compactions.
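Editor's note: mapRecordKeyToFileGroupIndex above re-implements the String.hashCode() arithmetic so the key-to-file-group mapping stays stable across JVMs. A quick stand-alone illustration of the same arithmetic (the record key and file group count are arbitrary examples):

public class FileGroupIndexSketch {
  // Same arithmetic as mapRecordKeyToFileGroupIndex: a String.hashCode()-style hash, then modulo.
  static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) {
    int h = 0;
    for (int i = 0; i < recordKey.length(); ++i) {
      h = 31 * h + recordKey.charAt(i);
    }
    return Math.abs(Math.abs(h) % numFileGroups);
  }

  public static void main(String[] args) {
    String recordKey = "driver-123"; // arbitrary example key
    int numFileGroups = 10;          // e.g. a configured file group count for the partition
    int index = mapRecordKeyToFileGroupIndex(recordKey, numFileGroups);
    // The same key always lands in the same file group, on any JVM.
    System.out.println(recordKey + " -> file group " + index);
  }
}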
MergeFileSlices when disabled, will return the latest file + * slices without any merging, and this is needed for the writers. + * @return List of latest file slices for all file groups in a given partition. + */ + private static List getPartitionFileSlices(HoodieTableMetaClient metaClient, + Option fileSystemView, + String partition, + boolean mergeFileSlices) { + HoodieTableFileSystemView fsView = fileSystemView.orElse(getFileSystemView(metaClient)); + Stream fileSliceStream; + if (mergeFileSlices) { + if (metaClient.getActiveTimeline().filterCompletedInstants().lastInstant().isPresent()) { + fileSliceStream = fsView.getLatestMergedFileSlicesBeforeOrOn( + partition, metaClient.getActiveTimeline().filterCompletedInstants().lastInstant().get().getTimestamp()); + } else { + return Collections.EMPTY_LIST; + } + } else { + fileSliceStream = fsView.getLatestFileSlices(partition); + } + return fileSliceStream.sorted(Comparator.comparing(FileSlice::getFileId)).collect(Collectors.toList()); + } + + /** + * Get the latest file slices for a given partition including the inflight ones. + * + * @param metaClient - instance of {@link HoodieTableMetaClient} + * @param fileSystemView - hoodie table file system view, which will be fetched from meta client if not already present + * @param partition - name of the partition whose file groups are to be loaded + * @return + */ + public static List getPartitionLatestFileSlicesIncludingInflight(HoodieTableMetaClient metaClient, + Option fileSystemView, + String partition) { + HoodieTableFileSystemView fsView = fileSystemView.orElse(getFileSystemView(metaClient)); + Stream fileSliceStream = fsView.fetchLatestFileSlicesIncludingInflight(partition); + return fileSliceStream + .sorted(Comparator.comparing(FileSlice::getFileId)) + .collect(Collectors.toList()); + } + + public static HoodieData convertMetadataToColumnStatsRecords(HoodieCommitMetadata commitMetadata, + HoodieEngineContext engineContext, + MetadataRecordsGenerationParams recordsGenerationParams) { + List allWriteStats = commitMetadata.getPartitionToWriteStats().values().stream() + .flatMap(Collection::stream).collect(Collectors.toList()); + + if (allWriteStats.isEmpty()) { + return engineContext.emptyHoodieData(); + } + + try { + Option writerSchema = + Option.ofNullable(commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY)) + .flatMap(writerSchemaStr -> + isNullOrEmpty(writerSchemaStr) + ? Option.empty() + : Option.of(new Schema.Parser().parse(writerSchemaStr))); + + HoodieTableMetaClient dataTableMetaClient = recordsGenerationParams.getDataMetaClient(); + HoodieTableConfig tableConfig = dataTableMetaClient.getTableConfig(); + + // NOTE: Writer schema added to commit metadata will not contain Hudi's metadata fields + Option tableSchema = writerSchema.map(schema -> + tableConfig.populateMetaFields() ? 
addMetadataFields(schema) : schema); + + List columnsToIndex = getColumnsToIndex(recordsGenerationParams, + Lazy.eagerly(tableSchema)); + + if (columnsToIndex.isEmpty()) { + // In case there are no columns to index, bail + return engineContext.emptyHoodieData(); + } + + int parallelism = Math.max(Math.min(allWriteStats.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1); + return engineContext.parallelize(allWriteStats, parallelism) + .flatMap(writeStat -> + translateWriteStatToColumnStats(writeStat, dataTableMetaClient, columnsToIndex).iterator()); + } catch (Exception e) { + throw new HoodieException("Failed to generate column stats records for metadata table", e); + } + } + + /** + * Get the list of columns for the table for column stats indexing + */ + private static List getColumnsToIndex(MetadataRecordsGenerationParams recordsGenParams, + Lazy> lazyWriterSchemaOpt) { + checkState(recordsGenParams.isColumnStatsIndexEnabled()); + + List targetColumns = recordsGenParams.getTargetColumnsForColumnStatsIndex(); + if (!targetColumns.isEmpty()) { + return targetColumns; + } + + Option writerSchemaOpt = lazyWriterSchemaOpt.get(); + return writerSchemaOpt + .map(writerSchema -> + writerSchema.getFields().stream() + .map(Schema.Field::name) + .collect(Collectors.toList())) + .orElse(Collections.emptyList()); + } + + private static Stream translateWriteStatToColumnStats(HoodieWriteStat writeStat, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex) { + if (writeStat instanceof HoodieDeltaWriteStat && ((HoodieDeltaWriteStat) writeStat).getColumnStats().isPresent()) { + Map> columnRangeMap = ((HoodieDeltaWriteStat) writeStat).getColumnStats().get(); + Collection> columnRangeMetadataList = columnRangeMap.values(); + return HoodieMetadataPayload.createColumnStatsRecords(writeStat.getPartitionPath(), columnRangeMetadataList, false); + } + + return getColumnStatsRecords(writeStat.getPartitionPath(), writeStat.getPath(), datasetMetaClient, columnsToIndex, false); + } + + private static Stream getColumnStatsRecords(String partitionPath, + String filePath, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex, + boolean isDeleted) { + String filePartitionPath = filePath.startsWith("/") ? 
filePath.substring(1) : filePath; + String fileName = FSUtils.getFileName(filePath, partitionPath); + + if (isDeleted) { + // TODO we should delete records instead of stubbing them + List> columnRangeMetadataList = columnsToIndex.stream() + .map(entry -> HoodieColumnRangeMetadata.stub(fileName, entry)) + .collect(Collectors.toList()); + + return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, true); + } + + List> columnRangeMetadata = + readColumnRangeMetadataFrom(filePartitionPath, datasetMetaClient, columnsToIndex); + + return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadata, false); + } + + private static List> readColumnRangeMetadataFrom(String filePath, + HoodieTableMetaClient datasetMetaClient, + List columnsToIndex) { + try { + if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePath); + List> columnRangeMetadataList = + new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex); + + return columnRangeMetadataList; + } + + LOG.warn("Column range index not supported for: " + filePath); + return Collections.emptyList(); + } catch (Exception e) { + // NOTE: In case reading column range metadata from individual file failed, + // we simply fall back, in lieu of failing the whole task + LOG.error("Failed to fetch column range metadata for: " + filePath); + return Collections.emptyList(); + } + } + + /** + * Get file group count for a metadata table partition. + * + * @param partitionType - Metadata table partition type + * @param metaClient - Metadata table meta client + * @param fsView - Filesystem view + * @param metadataConfig - Metadata config + * @param isBootstrapCompleted - Is bootstrap completed for the metadata table + * @return File group count for the requested metadata partition type + */ + public static int getPartitionFileGroupCount(final MetadataPartitionType partitionType, + final Option metaClient, + final Option fsView, + final HoodieMetadataConfig metadataConfig, boolean isBootstrapCompleted) { + if (isBootstrapCompleted) { + final List latestFileSlices = HoodieTableMetadataUtil + .getPartitionLatestFileSlices(metaClient.get(), fsView, partitionType.getPartitionPath()); + if (latestFileSlices.size() == 0 && !partitionType.getPartitionPath().equals(MetadataPartitionType.FILES.getPartitionPath())) { + return getFileGroupCount(partitionType, metadataConfig); + } + return Math.max(latestFileSlices.size(), 1); + } + + return getFileGroupCount(partitionType, metadataConfig); + } + + private static int getFileGroupCount(MetadataPartitionType partitionType, final HoodieMetadataConfig metadataConfig) { + switch (partitionType) { + case BLOOM_FILTERS: + return metadataConfig.getBloomFilterIndexFileGroupCount(); + case COLUMN_STATS: + return metadataConfig.getColumnStatsIndexFileGroupCount(); + default: + return 1; + } + } + + /** + * Does an upcast for {@link BigDecimal} instance to align it with scale/precision expected by + * the {@link org.apache.avro.LogicalTypes.Decimal} Avro logical type + */ + public static BigDecimal tryUpcastDecimal(BigDecimal value, final LogicalTypes.Decimal decimal) { + final int scale = decimal.getScale(); + final int valueScale = value.scale(); + + boolean scaleAdjusted = false; + if (valueScale != scale) { + try { + value = value.setScale(scale, RoundingMode.UNNECESSARY); + scaleAdjusted = true; + } catch (ArithmeticException aex) { + throw 
new AvroTypeException( + "Cannot encode decimal with scale " + valueScale + " as scale " + scale + " without rounding"); + } + } + + int precision = decimal.getPrecision(); + int valuePrecision = value.precision(); + if (valuePrecision > precision) { + if (scaleAdjusted) { + throw new AvroTypeException("Cannot encode decimal with precision " + valuePrecision + " as max precision " + + precision + ". This is after safely adjusting scale from " + valueScale + " to required " + scale); + } else { + throw new AvroTypeException( + "Cannot encode decimal with precision " + valuePrecision + " as max precision " + precision); + } + } + + return value; + } + + private static Option tryResolveSchemaForTable(HoodieTableMetaClient dataTableMetaClient) { + if (dataTableMetaClient.getCommitsTimeline().filterCompletedInstants().countInstants() == 0) { + return Option.empty(); + } + + try { + TableSchemaResolver schemaResolver = new TableSchemaResolver(dataTableMetaClient); + return Option.of(schemaResolver.getTableAvroSchema()); + } catch (Exception e) { + throw new HoodieException("Failed to get latest columns for " + dataTableMetaClient.getBasePath(), e); + } + } + + /** + * Given a schema, coerces provided value to instance of {@link Comparable} such that + * it could subsequently used in column stats + * + * NOTE: This method has to stay compatible with the semantic of + * {@link ParquetUtils#readRangeFromParquetMetadata} as they are used in tandem + */ + private static Comparable coerceToComparable(Schema schema, Object val) { + if (val == null) { + return null; + } + + switch (schema.getType()) { + case UNION: + // TODO we need to handle unions in general case as well + return coerceToComparable(resolveNullableSchema(schema), val); + + case FIXED: + case BYTES: + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return (Comparable) val; + } + return (ByteBuffer) val; + + + case INT: + if (schema.getLogicalType() == LogicalTypes.date() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + // NOTE: This type will be either {@code java.sql.Date} or {org.joda.LocalDate} + // depending on the Avro version. Hence, we simply cast it to {@code Comparable} + return (Comparable) val; + } + return (Integer) val; + + case LONG: + if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + || schema.getLogicalType() == LogicalTypes.timestampMillis()) { + // NOTE: This type will be either {@code java.sql.Date} or {org.joda.LocalDate} + // depending on the Avro version. 
Hence, we simply cast it to {@code Comparable} + return (Comparable) val; + } + return (Long) val; + + case STRING: + // unpack the avro Utf8 if possible + return val.toString(); + case FLOAT: + case DOUBLE: + case BOOLEAN: + return (Comparable) val; + + + // TODO add support for those types + case ENUM: + case MAP: + case NULL: + case RECORD: + case ARRAY: + return null; + + default: + throw new IllegalStateException("Unexpected type: " + schema.getType()); + } + } + + private static boolean canCompare(Schema schema) { + return schema.getType() != Schema.Type.MAP; + } + + public static Set getInflightMetadataPartitions(HoodieTableConfig tableConfig) { + return new HashSet<>(tableConfig.getMetadataPartitionsInflight()); + } + + public static Set getInflightAndCompletedMetadataPartitions(HoodieTableConfig tableConfig) { + Set inflightAndCompletedPartitions = getInflightMetadataPartitions(tableConfig); + inflightAndCompletedPartitions.addAll(tableConfig.getMetadataPartitions()); + return inflightAndCompletedPartitions; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java new file mode 100644 index 0000000000000..ddb76ca2579ea --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import java.util.Arrays; +import java.util.List; + +public enum MetadataPartitionType { + FILES(HoodieTableMetadataUtil.PARTITION_NAME_FILES, "files-"), + COLUMN_STATS(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS, "col-stats-"), + BLOOM_FILTERS(HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS, "bloom-filters-"); + + // Partition path in metadata table. + private final String partitionPath; + // FileId prefix used for all file groups in this partition. 
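Editor's note, referring back to tryUpcastDecimal earlier in this class: it only rescales a BigDecimal when that can be done without rounding, and rejects values whose precision would overflow the target Avro decimal type. A JDK-only sketch of the two outcomes (the scale values are arbitrary):

import java.math.BigDecimal;
import java.math.RoundingMode;

public class DecimalUpcastSketch {
  public static void main(String[] args) {
    int targetScale = 2; // arbitrary example target scale

    // Safe upcast: 12.5 -> 12.50 only adds a trailing zero, no rounding needed.
    BigDecimal ok = new BigDecimal("12.5").setScale(targetScale, RoundingMode.UNNECESSARY);
    System.out.println(ok); // 12.50

    // Unsafe upcast: 12.345 cannot be represented at scale 2 without rounding, so
    // RoundingMode.UNNECESSARY throws ArithmeticException (which tryUpcastDecimal
    // rethrows as an AvroTypeException).
    try {
      new BigDecimal("12.345").setScale(targetScale, RoundingMode.UNNECESSARY);
    } catch (ArithmeticException e) {
      System.out.println("cannot rescale without rounding: " + e.getMessage());
    }
  }
}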
+ private final String fileIdPrefix; + // Total file groups + // TODO fix: enum should not have any mutable aspect as this compromises the whole idea + // of the enum being a static, immutable entity + private int fileGroupCount = 1; + + MetadataPartitionType(final String partitionPath, final String fileIdPrefix) { + this.partitionPath = partitionPath; + this.fileIdPrefix = fileIdPrefix; + } + + public String getPartitionPath() { + return partitionPath; + } + + public String getFileIdPrefix() { + return fileIdPrefix; + } + + public void setFileGroupCount(final int fileGroupCount) { + this.fileGroupCount = fileGroupCount; + } + + public int getFileGroupCount() { + return this.fileGroupCount; + } + + public static List allPaths() { + return Arrays.asList( + FILES.getPartitionPath(), + COLUMN_STATS.getPartitionPath(), + BLOOM_FILTERS.getPartitionPath() + ); + } + + @Override + public String toString() { + return "Metadata partition {" + + "name: " + getPartitionPath() + + ", prefix: " + getFileIdPrefix() + + ", groups: " + getFileGroupCount() + + "}"; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java new file mode 100644 index 0000000000000..72a8bf4cd26f8 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataRecordsGenerationParams.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.table.HoodieTableMetaClient; + +import java.io.Serializable; +import java.util.List; + +/** + * Encapsulates all parameters required to generate metadata index for enabled index types.
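Editor's note: the MetadataPartitionType enum above can be used directly to enumerate the metadata table partitions and their file id prefixes. A small usage sketch, assuming hudi-common is on the classpath (the printed values come from constants not shown in this hunk):

import org.apache.hudi.metadata.MetadataPartitionType;

public class MetadataPartitionTypeSketch {
  public static void main(String[] args) {
    // Walk every metadata partition type and print its partition path and file id prefix.
    for (MetadataPartitionType type : MetadataPartitionType.values()) {
      System.out.println(type.getPartitionPath() + " uses file id prefix " + type.getFileIdPrefix());
    }
    // allPaths() returns the same partition paths as a flat list.
    System.out.println(MetadataPartitionType.allPaths());
  }
}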
+ * + * @deprecated this component currently duplicates configuration coming from the {@code HoodieWriteConfig} + * which is problematic; instead we should break this component down and use source of truth + * for each respective data-point directly ({@code HoodieWriteConfig}, {@code HoodieTableMetaClient}, etc) + */ +@Deprecated +public class MetadataRecordsGenerationParams implements Serializable { + + private final HoodieTableMetaClient dataMetaClient; + private final List enabledPartitionTypes; + private final String bloomFilterType; + private final int bloomIndexParallelism; + private final boolean isColumnStatsIndexEnabled; + private final int columnStatsIndexParallelism; + private final List targetColumnsForColumnStatsIndex; + private final List targetColumnsForBloomFilterIndex; + + MetadataRecordsGenerationParams(HoodieTableMetaClient dataMetaClient, List enabledPartitionTypes, String bloomFilterType, int bloomIndexParallelism, + boolean isColumnStatsIndexEnabled, int columnStatsIndexParallelism, List targetColumnsForColumnStatsIndex, List targetColumnsForBloomFilterIndex) { + this.dataMetaClient = dataMetaClient; + this.enabledPartitionTypes = enabledPartitionTypes; + this.bloomFilterType = bloomFilterType; + this.bloomIndexParallelism = bloomIndexParallelism; + this.isColumnStatsIndexEnabled = isColumnStatsIndexEnabled; + this.columnStatsIndexParallelism = columnStatsIndexParallelism; + this.targetColumnsForColumnStatsIndex = targetColumnsForColumnStatsIndex; + this.targetColumnsForBloomFilterIndex = targetColumnsForBloomFilterIndex; + } + + public HoodieTableMetaClient getDataMetaClient() { + return dataMetaClient; + } + + public List getEnabledPartitionTypes() { + return enabledPartitionTypes; + } + + public String getBloomFilterType() { + return bloomFilterType; + } + + public boolean isColumnStatsIndexEnabled() { + return isColumnStatsIndexEnabled; + } + + public int getBloomIndexParallelism() { + return bloomIndexParallelism; + } + + public int getColumnStatsIndexParallelism() { + return columnStatsIndexParallelism; + } + + public List getTargetColumnsForColumnStatsIndex() { + return targetColumnsForColumnStatsIndex; + } + + public List getSecondaryKeysForBloomFilterIndex() { + return targetColumnsForBloomFilterIndex; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/parquet/io/ByteBufferBackedInputFile.java b/hudi-common/src/main/java/org/apache/hudi/parquet/io/ByteBufferBackedInputFile.java new file mode 100644 index 0000000000000..40454d306ac78 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/parquet/io/ByteBufferBackedInputFile.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.parquet.io; + +import org.apache.hudi.common.util.io.ByteBufferBackedInputStream; +import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; + +/** + * Implementation of {@link InputFile} backed by {@code byte[]} buffer + */ +public class ByteBufferBackedInputFile implements InputFile { + private final byte[] buffer; + private final int offset; + private final int length; + + public ByteBufferBackedInputFile(byte[] buffer, int offset, int length) { + this.buffer = buffer; + this.offset = offset; + this.length = length; + } + + public ByteBufferBackedInputFile(byte[] buffer) { + this(buffer, 0, buffer.length); + } + + @Override + public long getLength() { + return length; + } + + @Override + public SeekableInputStream newStream() { + return new DelegatingSeekableInputStream(new ByteBufferBackedInputStream(buffer, offset, length)) { + @Override + public long getPos() { + return ((ByteBufferBackedInputStream) getStream()).getPosition(); + } + + @Override + public void seek(long newPos) { + ((ByteBufferBackedInputStream) getStream()).seek(newPos); + } + }; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/parquet/io/OutputStreamBackedOutputFile.java b/hudi-common/src/main/java/org/apache/hudi/parquet/io/OutputStreamBackedOutputFile.java new file mode 100644 index 0000000000000..48c2c82e7b422 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/parquet/io/OutputStreamBackedOutputFile.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
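Editor's note: a usage sketch for ByteBufferBackedInputFile above — wrap an in-memory byte[] and read through the parquet SeekableInputStream it exposes. The buffer contents are arbitrary; assumes hudi-common and parquet-common are on the classpath:

import org.apache.hudi.parquet.io.ByteBufferBackedInputFile;
import org.apache.parquet.io.SeekableInputStream;

public class ByteBufferBackedInputFileSketch {
  public static void main(String[] args) throws Exception {
    byte[] buffer = new byte[] {10, 20, 30, 40, 50}; // arbitrary in-memory content

    ByteBufferBackedInputFile inputFile = new ByteBufferBackedInputFile(buffer);
    System.out.println("length = " + inputFile.getLength()); // 5

    try (SeekableInputStream in = inputFile.newStream()) {
      in.seek(3);                                  // jump to the 4th byte
      System.out.println("pos  = " + in.getPos()); // 3
      System.out.println("byte = " + in.read());   // 40
    }
  }
}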
+ */ + +package org.apache.hudi.parquet.io; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; + +import javax.annotation.Nonnull; +import java.io.IOException; + +/** + * Implementation of the {@link OutputFile} backed by {@link java.io.OutputStream} + */ +public class OutputStreamBackedOutputFile implements OutputFile { + + private static final long DEFAULT_BLOCK_SIZE = 1024L * 1024L; + + private final FSDataOutputStream outputStream; + + public OutputStreamBackedOutputFile(FSDataOutputStream outputStream) { + this.outputStream = outputStream; + } + + @Override + public PositionOutputStream create(long blockSizeHint) { + return new PositionOutputStreamAdapter(outputStream); + } + + @Override + public PositionOutputStream createOrOverwrite(long blockSizeHint) { + return create(blockSizeHint); + } + + @Override + public boolean supportsBlockSize() { + return false; + } + + @Override + public long defaultBlockSize() { + return DEFAULT_BLOCK_SIZE; + } + + private static class PositionOutputStreamAdapter extends PositionOutputStream { + private final FSDataOutputStream delegate; + + PositionOutputStreamAdapter(FSDataOutputStream delegate) { + this.delegate = delegate; + } + + @Override + public long getPos() throws IOException { + return delegate.getPos(); + } + + @Override + public void write(int b) throws IOException { + delegate.write(b); + } + + @Override + public void write(@Nonnull byte[] buffer, int off, int len) throws IOException { + delegate.write(buffer, off, len); + } + + @Override + public void flush() throws IOException { + delegate.flush(); + } + + @Override + public void close() { + // We're deliberately not closing the delegate stream here to allow caller + // to explicitly manage its lifecycle + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/util/Lazy.java b/hudi-common/src/main/java/org/apache/hudi/util/Lazy.java new file mode 100644 index 0000000000000..1a843430b7d9c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/util/Lazy.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
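Editor's note: similarly, a usage sketch for OutputStreamBackedOutputFile — hand it an FSDataOutputStream (a local one here) and write through the parquet PositionOutputStream it returns. Note that the adapter's close() intentionally leaves the underlying stream open, so the caller closes it. The file path and bytes are placeholders; assumes hadoop-common, parquet-common, and hudi-common on the classpath:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.parquet.io.OutputStreamBackedOutputFile;
import org.apache.parquet.io.PositionOutputStream;

public class OutputStreamBackedOutputFileSketch {
  public static void main(String[] args) throws Exception {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    Path target = new Path("/tmp/output-file-sketch.bin"); // placeholder path

    try (FSDataOutputStream delegate = localFs.create(target)) {
      OutputStreamBackedOutputFile outputFile = new OutputStreamBackedOutputFile(delegate);

      // The block size hint is ignored by this implementation (supportsBlockSize() is false).
      PositionOutputStream out = outputFile.create(0);
      out.write(new byte[] {1, 2, 3});
      out.flush();
      System.out.println("bytes written so far = " + out.getPos()); // 3

      // out.close() would not close 'delegate'; the try-with-resources above does that.
    }
  }
}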
+ */ + +package org.apache.hudi.util; + +import javax.annotation.concurrent.ThreadSafe; +import java.util.function.Supplier; + +/** + * Utility implementing lazy semantics in Java + * + * @param type of the object being held by {@link Lazy} + */ +@ThreadSafe +public class Lazy { + + private volatile boolean initialized; + + private Supplier initializer; + private T ref; + + private Lazy(Supplier initializer) { + this.initializer = initializer; + this.ref = null; + this.initialized = false; + } + + private Lazy(T ref) { + this.initializer = null; + this.ref = ref; + this.initialized = true; + } + + public T get() { + if (!initialized) { + synchronized (this) { + if (!initialized) { + this.ref = initializer.get(); + this.initializer = null; + initialized = true; + } + } + } + + return ref; + } + + /** + * Executes provided {@code initializer} lazily, while providing for "exactly once" semantic, + * to instantiate value of type {@link T} being subsequently held by the returned instance of + * {@link Lazy} + */ + public static Lazy lazily(Supplier initializer) { + return new Lazy<>(initializer); + } + + /** + * Instantiates {@link Lazy} in an "eagerly" fashion setting it w/ the provided value of + * type {@link T} directly, bypassing lazy initialization sequence + */ + public static Lazy eagerly(T ref) { + return new Lazy<>(ref); + } +} diff --git a/hudi-common/src/main/resources/hbase-site.xml b/hudi-common/src/main/resources/hbase-site.xml new file mode 100644 index 0000000000000..67853ae3ec982 --- /dev/null +++ b/hudi-common/src/main/resources/hbase-site.xml @@ -0,0 +1,2185 @@ + + + + + + + + + + + + hbase.tmp.dir + ${java.io.tmpdir}/hbase-${user.name} + Temporary directory on the local filesystem. + Change this setting to point to a location more permanent + than '/tmp', the usual resolve for java.io.tmpdir, as the + '/tmp' directory is cleared on machine restart. + + + + hbase.rootdir + ${hbase.tmp.dir}/hbase + The directory shared by region servers and into + which HBase persists. The URL should be 'fully-qualified' + to include the filesystem scheme. For example, to specify the + HDFS directory '/hbase' where the HDFS instance's namenode is + running at namenode.example.org on port 9000, set this value to: + hdfs://namenode.example.org:9000/hbase. By default, we write + to whatever ${hbase.tmp.dir} is set too -- usually /tmp -- + so change this configuration or else all data will be lost on + machine restart. + + + + hbase.cluster.distributed + false + The mode the cluster will be in. Possible values are + false for standalone mode and true for distributed mode. If + false, startup will run all HBase and ZooKeeper daemons together + in the one JVM. + + + + hbase.zookeeper.quorum + + 127.0.0.1 + Comma separated list of servers in the ZooKeeper ensemble + (This config. should have been named hbase.zookeeper.ensemble). + For example, "host1.mydomain.com,host2.mydomain.com,host3.mydomain.com". + By default this is set to localhost for local and pseudo-distributed modes + of operation. For a fully-distributed setup, this should be set to a full + list of ZooKeeper ensemble servers. If HBASE_MANAGES_ZK is set in hbase-env.sh + this is the list of servers which hbase will start/stop ZooKeeper on as + part of cluster start/stop. Client-side, we will take this list of + ensemble members and put it together with the hbase.zookeeper.property.clientPort + config. and pass it into zookeeper constructor as the connectString + parameter. 
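Editor's note: a quick usage sketch for the Lazy utility above — the supplier runs at most once, on the first get(), while eagerly(...) just wraps an existing value. The "expensive" computation is simulated; assumes hudi-common on the classpath:

import org.apache.hudi.util.Lazy;

public class LazySketch {
  public static void main(String[] args) {
    // The supplier is not invoked until get() is called, and only the first call runs it.
    Lazy<String> lazySchema = Lazy.lazily(() -> {
      System.out.println("resolving schema once..."); // simulated expensive work
      return "resolved-schema";
    });

    System.out.println(lazySchema.get()); // triggers the supplier
    System.out.println(lazySchema.get()); // returns the cached value; supplier is not re-run

    // eagerly() skips the lazy path entirely and just holds the given value.
    Lazy<Integer> eagerCount = Lazy.eagerly(42);
    System.out.println(eagerCount.get());
  }
}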
+ + + + + + zookeeper.recovery.retry.maxsleeptime + 60000 + Max sleep time before retry zookeeper operations in milliseconds, + a max time is needed here so that sleep time won't grow unboundedly + + + + hbase.local.dir + ${hbase.tmp.dir}/local/ + Directory on the local filesystem to be used + as a local storage. + + + + + + hbase.master.port + 16000 + The port the HBase Master should bind to. + + + hbase.master.info.port + 16010 + The port for the HBase Master web UI. + Set to -1 if you do not want a UI instance run. + + + + hbase.master.info.bindAddress + 0.0.0.0 + The bind address for the HBase Master web UI + + + + hbase.master.logcleaner.plugins + + org.apache.hadoop.hbase.master.cleaner.TimeToLiveLogCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveProcedureWALCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveMasterLocalStoreWALCleaner + + A comma-separated list of BaseLogCleanerDelegate invoked by + the LogsCleaner service. These WAL cleaners are called in order, + so put the cleaner that prunes the most files in front. To + implement your own BaseLogCleanerDelegate, just put it in HBase's classpath + and add the fully qualified class name here. Always add the above + default log cleaners in the list. + + + + hbase.master.logcleaner.ttl + 600000 + How long a WAL remain in the archive ({hbase.rootdir}/oldWALs) directory, + after which it will be cleaned by a Master thread. The value is in milliseconds. + + + + hbase.master.hfilecleaner.plugins + + org.apache.hadoop.hbase.master.cleaner.TimeToLiveHFileCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveMasterLocalStoreHFileCleaner + + A comma-separated list of BaseHFileCleanerDelegate invoked by + the HFileCleaner service. These HFiles cleaners are called in order, + so put the cleaner that prunes the most files in front. To + implement your own BaseHFileCleanerDelegate, just put it in HBase's classpath + and add the fully qualified class name here. Always add the above + default hfile cleaners in the list as they will be overwritten in + hbase-site.xml. + + + + hbase.master.infoserver.redirect + true + Whether or not the Master listens to the Master web + UI port (hbase.master.info.port) and redirects requests to the web + UI server shared by the Master and RegionServer. Config. makes + sense when Master is serving Regions (not the default). + + + + hbase.master.fileSplitTimeout + 600000 + Splitting a region, how long to wait on the file-splitting + step before aborting the attempt. Default: 600000. This setting used + to be known as hbase.regionserver.fileSplitTimeout in hbase-1.x. + Split is now run master-side hence the rename (If a + 'hbase.master.fileSplitTimeout' setting found, will use it to + prime the current 'hbase.master.fileSplitTimeout' + Configuration. + + + + + + hbase.regionserver.port + 16020 + The port the HBase RegionServer binds to. + + + hbase.regionserver.info.port + 16030 + The port for the HBase RegionServer web UI + Set to -1 if you do not want the RegionServer UI to run. + + + + hbase.regionserver.info.bindAddress + 0.0.0.0 + The address for the HBase RegionServer web UI + + + hbase.regionserver.info.port.auto + false + Whether or not the Master or RegionServer + UI should search for a port to bind to. Enables automatic port + search if hbase.regionserver.info.port is already in use. + Useful for testing, turned off by default. + + + + hbase.regionserver.handler.count + 30 + Count of RPC Listener instances spun up on RegionServers. 
+ Same property is used by the Master for count of master handlers. + Too many handlers can be counter-productive. Make it a multiple of + CPU count. If mostly read-only, handlers count close to cpu count + does well. Start with twice the CPU count and tune from there. + + + + hbase.ipc.server.callqueue.handler.factor + 0.1 + Factor to determine the number of call queues. + A value of 0 means a single queue shared between all the handlers. + A value of 1 means that each handler has its own queue. + + + + hbase.ipc.server.callqueue.read.ratio + 0 + Split the call queues into read and write queues. + The specified interval (which should be between 0.0 and 1.0) + will be multiplied by the number of call queues. + A value of 0 indicate to not split the call queues, meaning that both read and write + requests will be pushed to the same set of queues. + A value lower than 0.5 means that there will be less read queues than write queues. + A value of 0.5 means there will be the same number of read and write queues. + A value greater than 0.5 means that there will be more read queues than write queues. + A value of 1.0 means that all the queues except one are used to dispatch read requests. + + Example: Given the total number of call queues being 10 + a read.ratio of 0 means that: the 10 queues will contain both read/write requests. + a read.ratio of 0.3 means that: 3 queues will contain only read requests + and 7 queues will contain only write requests. + a read.ratio of 0.5 means that: 5 queues will contain only read requests + and 5 queues will contain only write requests. + a read.ratio of 0.8 means that: 8 queues will contain only read requests + and 2 queues will contain only write requests. + a read.ratio of 1 means that: 9 queues will contain only read requests + and 1 queues will contain only write requests. + + + + hbase.ipc.server.callqueue.scan.ratio + 0 + Given the number of read call queues, calculated from the total number + of call queues multiplied by the callqueue.read.ratio, the scan.ratio property + will split the read call queues into small-read and long-read queues. + A value lower than 0.5 means that there will be less long-read queues than short-read queues. + A value of 0.5 means that there will be the same number of short-read and long-read queues. + A value greater than 0.5 means that there will be more long-read queues than short-read queues + A value of 0 or 1 indicate to use the same set of queues for gets and scans. + + Example: Given the total number of read call queues being 8 + a scan.ratio of 0 or 1 means that: 8 queues will contain both long and short read requests. + a scan.ratio of 0.3 means that: 2 queues will contain only long-read requests + and 6 queues will contain only short-read requests. + a scan.ratio of 0.5 means that: 4 queues will contain only long-read requests + and 4 queues will contain only short-read requests. + a scan.ratio of 0.8 means that: 6 queues will contain only long-read requests + and 2 queues will contain only short-read requests. + + + + hbase.regionserver.msginterval + 3000 + Interval between messages from the RegionServer to Master + in milliseconds. + + + + hbase.regionserver.logroll.period + 3600000 + Period at which we will roll the commit log regardless + of how many edits it has. + + + + hbase.regionserver.logroll.errors.tolerated + 2 + The number of consecutive WAL close errors we will allow + before triggering a server abort. 
A setting of 0 will cause the + region server to abort if closing the current WAL writer fails during + log rolling. Even a small value (2 or 3) will allow a region server + to ride over transient HDFS errors. + + + + hbase.regionserver.hlog.reader.impl + org.apache.hadoop.hbase.regionserver.wal.ProtobufLogReader + The WAL file reader implementation. + + + hbase.regionserver.hlog.writer.impl + org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter + The WAL file writer implementation. + + + hbase.regionserver.global.memstore.size + + Maximum size of all memstores in a region server before new + updates are blocked and flushes are forced. Defaults to 40% of heap (0.4). + Updates are blocked and flushes are forced until size of all memstores + in a region server hits hbase.regionserver.global.memstore.size.lower.limit. + The default value in this configuration has been intentionally left empty in order to + honor the old hbase.regionserver.global.memstore.upperLimit property if present. + + + + hbase.regionserver.global.memstore.size.lower.limit + + Maximum size of all memstores in a region server before flushes + are forced. Defaults to 95% of hbase.regionserver.global.memstore.size + (0.95). A 100% value for this value causes the minimum possible flushing + to occur when updates are blocked due to memstore limiting. The default + value in this configuration has been intentionally left empty in order to + honor the old hbase.regionserver.global.memstore.lowerLimit property if + present. + + + + hbase.systemtables.compacting.memstore.type + NONE + Determines the type of memstore to be used for system tables like + META, namespace tables etc. By default NONE is the type and hence we use the + default memstore for all the system tables. If we need to use compacting + memstore for system tables then set this property to BASIC/EAGER + + + + hbase.regionserver.optionalcacheflushinterval + 3600000 + + Maximum amount of time an edit lives in memory before being automatically flushed. + Default 1 hour. Set it to 0 to disable automatic flushing. + + + + hbase.regionserver.dns.interface + default + The name of the Network Interface from which a region server + should report its IP address. + + + + hbase.regionserver.dns.nameserver + default + The host name or IP address of the name server (DNS) + which a region server should use to determine the host name used by the + master for communication and display purposes. + + + + hbase.regionserver.region.split.policy + org.apache.hadoop.hbase.regionserver.SteppingSplitPolicy + + A split policy determines when a region should be split. The various + other split policies that are available currently are BusyRegionSplitPolicy, + ConstantSizeRegionSplitPolicy, DisabledRegionSplitPolicy, + DelimitedKeyPrefixRegionSplitPolicy, KeyPrefixRegionSplitPolicy, and + SteppingSplitPolicy. DisabledRegionSplitPolicy blocks manual region splitting. + + + + hbase.regionserver.regionSplitLimit + 1000 + + Limit for the number of regions after which no more region splitting + should take place. This is not hard limit for the number of regions + but acts as a guideline for the regionserver to stop splitting after + a certain limit. Default is set to 1000. + + + + + + zookeeper.session.timeout + 90000 + ZooKeeper session timeout in milliseconds. It is used in two different ways. + First, this value is used in the ZK client that HBase uses to connect to the ensemble. + It is also used by HBase when it starts a ZK server and it is passed as the 'maxSessionTimeout'. 
+ See https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#ch_zkSessions. + For example, if an HBase region server connects to a ZK ensemble that's also managed + by HBase, then the session timeout will be the one specified by this configuration. + But, a region server that connects to an ensemble managed with a different configuration + will be subjected that ensemble's maxSessionTimeout. So, even though HBase might propose + using 90 seconds, the ensemble can have a max timeout lower than this and it will take + precedence. The current default maxSessionTimeout that ZK ships with is 40 seconds, which is lower than + HBase's. + + + + zookeeper.znode.parent + /hbase + Root ZNode for HBase in ZooKeeper. All of HBase's ZooKeeper + files that are configured with a relative path will go under this node. + By default, all of HBase's ZooKeeper file paths are configured with a + relative path, so they will all go under this directory unless changed. + + + + zookeeper.znode.acl.parent + acl + Root ZNode for access control lists. + + + hbase.zookeeper.dns.interface + default + The name of the Network Interface from which a ZooKeeper server + should report its IP address. + + + + hbase.zookeeper.dns.nameserver + default + The host name or IP address of the name server (DNS) + which a ZooKeeper server should use to determine the host name used by the + master for communication and display purposes. + + + + + hbase.zookeeper.peerport + 2888 + Port used by ZooKeeper peers to talk to each other. + See https://zookeeper.apache.org/doc/r3.3.3/zookeeperStarted.html#sc_RunningReplicatedZooKeeper + for more information. + + + + hbase.zookeeper.leaderport + 3888 + Port used by ZooKeeper for leader election. + See https://zookeeper.apache.org/doc/r3.3.3/zookeeperStarted.html#sc_RunningReplicatedZooKeeper + for more information. + + + + + + + hbase.zookeeper.property.initLimit + 10 + Property from ZooKeeper's config zoo.cfg. + The number of ticks that the initial synchronization phase can take. + + + + hbase.zookeeper.property.syncLimit + 5 + Property from ZooKeeper's config zoo.cfg. + The number of ticks that can pass between sending a request and getting an + acknowledgment. + + + + hbase.zookeeper.property.dataDir + ${hbase.tmp.dir}/zookeeper + Property from ZooKeeper's config zoo.cfg. + The directory where the snapshot is stored. + + + + hbase.zookeeper.property.clientPort + 2181 + Property from ZooKeeper's config zoo.cfg. + The port at which the clients will connect. + + + + hbase.zookeeper.property.maxClientCnxns + 300 + Property from ZooKeeper's config zoo.cfg. + Limit on number of concurrent connections (at the socket level) that a + single client, identified by IP address, may make to a single member of + the ZooKeeper ensemble. Set high to avoid zk connection issues running + standalone and pseudo-distributed. + + + + + + + hbase.client.write.buffer + 2097152 + Default size of the BufferedMutator write buffer in bytes. + A bigger buffer takes more memory -- on both the client and server + side since server instantiates the passed write buffer to process + it -- but a larger buffer size reduces the number of RPCs made. + For an estimate of server-side memory-used, evaluate + hbase.client.write.buffer * hbase.regionserver.handler.count + + + + hbase.client.pause + 100 + General client pause value. Used mostly as value to wait + before running a retry of a failed get, region lookup, etc. 
+ See hbase.client.retries.number for description of how we backoff from + this initial pause amount and how this pause works w/ retries. + + + + hbase.client.pause.cqtbe + + Whether or not to use a special client pause for + CallQueueTooBigException (cqtbe). Set this property to a higher value + than hbase.client.pause if you observe frequent CQTBE from the same + RegionServer and the call queue there keeps full + + + + hbase.client.retries.number + 15 + Maximum retries. Used as maximum for all retryable + operations such as the getting of a cell's value, starting a row update, + etc. Retry interval is a rough function based on hbase.client.pause. At + first we retry at this interval but then with backoff, we pretty quickly reach + retrying every ten seconds. See HConstants#RETRY_BACKOFF for how the backup + ramps up. Change this setting and hbase.client.pause to suit your workload. + + + + hbase.client.max.total.tasks + 100 + The maximum number of concurrent mutation tasks a single HTable instance will + send to the cluster. + + + + hbase.client.max.perserver.tasks + 2 + The maximum number of concurrent mutation tasks a single HTable instance will + send to a single region server. + + + + hbase.client.max.perregion.tasks + 1 + The maximum number of concurrent mutation tasks the client will + maintain to a single Region. That is, if there is already + hbase.client.max.perregion.tasks writes in progress for this region, new puts + won't be sent to this region until some writes finishes. + + + + hbase.client.perserver.requests.threshold + 2147483647 + The max number of concurrent pending requests for one server in all client threads + (process level). Exceeding requests will be thrown ServerTooBusyException immediately to prevent + user's threads being occupied and blocked by only one slow region server. If you use a fix + number of threads to access HBase in a synchronous way, set this to a suitable value which is + related to the number of threads will help you. See + https://issues.apache.org/jira/browse/HBASE-16388 for details. + + + + hbase.client.scanner.caching + 2147483647 + Number of rows that we try to fetch when calling next + on a scanner if it is not served from (local, client) memory. This configuration + works together with hbase.client.scanner.max.result.size to try and use the + network efficiently. The default value is Integer.MAX_VALUE by default so that + the network will fill the chunk size defined by hbase.client.scanner.max.result.size + rather than be limited by a particular number of rows since the size of rows varies + table to table. If you know ahead of time that you will not require more than a certain + number of rows from a scan, this configuration should be set to that row limit via + Scan#setCaching. Higher caching values will enable faster scanners but will eat up more + memory and some calls of next may take longer and longer times when the cache is empty. + Do not set this value such that the time between invocations is greater than the scanner + timeout; i.e. hbase.client.scanner.timeout.period + + + + hbase.client.keyvalue.maxsize + 10485760 + Specifies the combined maximum allowed size of a KeyValue + instance. This is to set an upper boundary for a single entry saved in a + storage file. Since they cannot be split it helps avoiding that a region + cannot be split any further because the data is too large. It seems wise + to set this to a fraction of the maximum region size. Setting it to zero + or less disables the check. 
+ + + + hbase.server.keyvalue.maxsize + 10485760 + Maximum allowed size of an individual cell, inclusive of value and all key + components. A value of 0 or less disables the check. + The default value is 10MB. + This is a safety setting to protect the server from OOM situations. + + + + hbase.client.scanner.timeout.period + 60000 + Client scanner lease period in milliseconds. + + + hbase.client.localityCheck.threadPoolSize + 2 + + + + + hbase.bulkload.retries.number + 10 + Maximum retries. This is maximum number of iterations + to atomic bulk loads are attempted in the face of splitting operations + 0 means never give up. + + + + hbase.master.balancer.maxRitPercent + 1.0 + The max percent of regions in transition when balancing. + The default value is 1.0. So there are no balancer throttling. If set this config to 0.01, + It means that there are at most 1% regions in transition when balancing. + Then the cluster's availability is at least 99% when balancing. + + + + hbase.balancer.period + + 300000 + Period at which the region balancer runs in the Master, in + milliseconds. + + + + hbase.regions.slop + 0.001 + Rebalance if any regionserver has average + (average * slop) regions. + The default value of this parameter is 0.001 in StochasticLoadBalancer (the default load + balancer), while the default is 0.2 in other load balancers (i.e., + SimpleLoadBalancer). + + + + hbase.normalizer.period + 300000 + Period at which the region normalizer runs in the Master, in + milliseconds. + + + + hbase.normalizer.split.enabled + true + Whether to split a region as part of normalization. + + + hbase.normalizer.merge.enabled + true + Whether to merge a region as part of normalization. + + + hbase.normalizer.min.region.count + 3 + The minimum number of regions in a table to consider it for merge + normalization. + + + + hbase.normalizer.merge.min_region_age.days + 3 + The minimum age for a region to be considered for a merge, in days. + + + hbase.normalizer.merge.min_region_age.days + 3 + The minimum age for a region to be considered for a merge, in days. + + + hbase.normalizer.merge.min_region_size.mb + 1 + The minimum size for a region to be considered for a merge, in whole + MBs. + + + + hbase.table.normalization.enabled + false + This config is used to set default behaviour of normalizer at table level. + To override this at table level one can set NORMALIZATION_ENABLED at table descriptor level + and that property will be honored + + + + hbase.server.thread.wakefrequency + 10000 + Time to sleep in between searches for work (in milliseconds). + Used as sleep interval by service threads such as log roller. + + + + hbase.server.versionfile.writeattempts + 3 + + How many times to retry attempting to write a version file + before just aborting. Each attempt is separated by the + hbase.server.thread.wakefrequency milliseconds. + + + + hbase.hregion.memstore.flush.size + 134217728 + + Memstore will be flushed to disk if size of the memstore + exceeds this number of bytes. Value is checked by a thread that runs + every hbase.server.thread.wakefrequency. + + + + hbase.hregion.percolumnfamilyflush.size.lower.bound.min + 16777216 + + If FlushLargeStoresPolicy is used and there are multiple column families, + then every time that we hit the total memstore limit, we find out all the + column families whose memstores exceed a "lower bound" and only flush them + while retaining the others in memory. 
The "lower bound" will be + "hbase.hregion.memstore.flush.size / column_family_number" by default + unless value of this property is larger than that. If none of the families + have their memstore size more than lower bound, all the memstores will be + flushed (just as usual). + + + + hbase.hregion.preclose.flush.size + 5242880 + + If the memstores in a region are this size or larger when we go + to close, run a "pre-flush" to clear out memstores before we put up + the region closed flag and take the region offline. On close, + a flush is run under the close flag to empty memory. During + this time the region is offline and we are not taking on any writes. + If the memstore content is large, this flush could take a long time to + complete. The preflush is meant to clean out the bulk of the memstore + before putting up the close flag and taking the region offline so the + flush that runs under the close flag has little to do. + + + + hbase.hregion.memstore.block.multiplier + 4 + + Block updates if memstore has hbase.hregion.memstore.block.multiplier + times hbase.hregion.memstore.flush.size bytes. Useful preventing + runaway memstore during spikes in update traffic. Without an + upper-bound, memstore fills such that when it flushes the + resultant flush files take a long time to compact or split, or + worse, we OOME. + + + + hbase.hregion.memstore.mslab.enabled + true + + Enables the MemStore-Local Allocation Buffer, + a feature which works to prevent heap fragmentation under + heavy write loads. This can reduce the frequency of stop-the-world + GC pauses on large heaps. + + + + hbase.hregion.memstore.mslab.chunksize + 2097152 + The maximum byte size of a chunk in the MemStoreLAB. Unit: bytes + + + hbase.regionserver.offheap.global.memstore.size + 0 + The amount of off-heap memory all MemStores in a RegionServer may use. + A value of 0 means that no off-heap memory will be used and all chunks in MSLAB + will be HeapByteBuffer, otherwise the non-zero value means how many megabyte of + off-heap memory will be used for chunks in MSLAB and all chunks in MSLAB will be + DirectByteBuffer. Unit: megabytes. + + + + hbase.hregion.memstore.mslab.max.allocation + 262144 + The maximal size of one allocation in the MemStoreLAB, if the desired byte + size exceed this threshold then it will be just allocated from JVM heap rather than MemStoreLAB. + + + + hbase.hregion.max.filesize + 10737418240 + + Maximum HFile size. If the sum of the sizes of a region's HFiles has grown to exceed this + value, the region is split in two. + + + + hbase.hregion.split.overallfiles + false + If we should sum overall region files size when check to split. + + + hbase.hregion.majorcompaction + 604800000 + Time between major compactions, expressed in milliseconds. Set to 0 to disable + time-based automatic major compactions. User-requested and size-based major compactions will + still run. This value is multiplied by hbase.hregion.majorcompaction.jitter to cause + compaction to start at a somewhat-random time during a given window of time. The default value + is 7 days, expressed in milliseconds. If major compactions are causing disruption in your + environment, you can configure them to run at off-peak times for your deployment, or disable + time-based major compactions by setting this parameter to 0, and run major compactions in a + cron job or by another external mechanism. 
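To make the interaction of the memstore and major-compaction settings above more concrete, the sketch below simply reproduces the arithmetic stated in the descriptions: updates block once a region's memstore reaches the flush size times the block multiplier, and time-based major compactions are scheduled at a somewhat-random point inside period ± period × jitter. The class name is hypothetical, the code only reads configuration values through the plain Hadoop Configuration API, and it is not how HBase itself schedules compactions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class MemstoreCompactionMath {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();

    long flushSize = conf.getLong("hbase.hregion.memstore.flush.size", 134217728L);
    int blockMultiplier = conf.getInt("hbase.hregion.memstore.block.multiplier", 4);
    // Updates to a region are blocked once its memstore reaches this many bytes.
    long blockingBytes = flushSize * blockMultiplier;

    long majorPeriodMs = conf.getLong("hbase.hregion.majorcompaction", 604800000L);
    float jitter = conf.getFloat("hbase.hregion.majorcompaction.jitter", 0.50f);
    // Time-based major compactions fire somewhere inside [period - delta, period + delta].
    long delta = (long) (majorPeriodMs * jitter);

    System.out.printf("memstore blocking threshold: %d bytes%n", blockingBytes);
    System.out.printf("major compaction window: [%d ms, %d ms]%n",
        majorPeriodMs - delta, majorPeriodMs + delta);
  }
}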
+ + + + hbase.hregion.majorcompaction.jitter + 0.50 + A multiplier applied to hbase.hregion.majorcompaction to cause compaction to occur + a given amount of time either side of hbase.hregion.majorcompaction. The smaller the number, + the closer the compactions will happen to the hbase.hregion.majorcompaction + interval. + + + + hbase.hstore.compactionThreshold + 3 + If more than this number of StoreFiles exist in any one Store + (one StoreFile is written per flush of MemStore), a compaction is run to rewrite all + StoreFiles into a single StoreFile. Larger values delay compaction, but when compaction does + occur, it takes longer to complete. + + + + hbase.regionserver.compaction.enabled + true + Enable/disable compactions on by setting true/false. + We can further switch compactions dynamically with the + compaction_switch shell command. + + + + hbase.hstore.flusher.count + 2 + The number of flush threads. With fewer threads, the MemStore flushes will be + queued. With more threads, the flushes will be executed in parallel, increasing the load on + HDFS, and potentially causing more compactions. + + + + hbase.hstore.blockingStoreFiles + 16 + If more than this number of StoreFiles exist in any one Store (one StoreFile + is written per flush of MemStore), updates are blocked for this region until a compaction is + completed, or until hbase.hstore.blockingWaitTime has been exceeded. + + + + hbase.hstore.blockingWaitTime + 90000 + The time for which a region will block updates after reaching the StoreFile limit + defined by hbase.hstore.blockingStoreFiles. After this time has elapsed, the region will stop + blocking updates even if a compaction has not been completed. + + + + hbase.hstore.compaction.min + + The minimum number of StoreFiles which must be eligible for compaction before + compaction can run. The goal of tuning hbase.hstore.compaction.min is to avoid ending up with + too many tiny StoreFiles to compact. Setting this value to 2 would cause a minor compaction + each time you have two StoreFiles in a Store, and this is probably not appropriate. If you + set this value too high, all the other values will need to be adjusted accordingly. For most + cases, the default value is appropriate (empty value here, results in 3 by code logic). In + previous versions of HBase, the parameter hbase.hstore.compaction.min was named + hbase.hstore.compactionThreshold. + + + + hbase.hstore.compaction.max + 10 + The maximum number of StoreFiles which will be selected for a single minor + compaction, regardless of the number of eligible StoreFiles. Effectively, the value of + hbase.hstore.compaction.max controls the length of time it takes a single compaction to + complete. Setting it larger means that more StoreFiles are included in a compaction. For most + cases, the default value is appropriate. + + + + hbase.hstore.compaction.min.size + 134217728 + A StoreFile (or a selection of StoreFiles, when using ExploringCompactionPolicy) + smaller than this size will always be eligible for minor compaction. + HFiles this size or larger are evaluated by hbase.hstore.compaction.ratio to determine if + they are eligible. Because this limit represents the "automatic include" limit for all + StoreFiles smaller than this value, this value may need to be reduced in write-heavy + environments where many StoreFiles in the 1-2 MB range are being flushed, because every + StoreFile will be targeted for compaction and the resulting StoreFiles may still be under the + minimum size and require further compaction. 
If this parameter is lowered, the ratio check is + triggered more quickly. This addressed some issues seen in earlier versions of HBase but + changing this parameter is no longer necessary in most situations. Default: 128 MB expressed + in bytes. + + + + hbase.hstore.compaction.max.size + 9223372036854775807 + A StoreFile (or a selection of StoreFiles, when using ExploringCompactionPolicy) + larger than this size will be excluded from compaction. The effect of + raising hbase.hstore.compaction.max.size is fewer, larger StoreFiles that do not get + compacted often. If you feel that compaction is happening too often without much benefit, you + can try raising this value. Default: the value of LONG.MAX_VALUE, expressed in bytes. + + + + hbase.hstore.compaction.ratio + 1.2F + For minor compaction, this ratio is used to determine whether a given StoreFile + which is larger than hbase.hstore.compaction.min.size is eligible for compaction. Its + effect is to limit compaction of large StoreFiles. The value of hbase.hstore.compaction.ratio + is expressed as a floating-point decimal. A large ratio, such as 10, will produce a single + giant StoreFile. Conversely, a low value, such as .25, will produce behavior similar to the + BigTable compaction algorithm, producing four StoreFiles. A moderate value of between 1.0 and + 1.4 is recommended. When tuning this value, you are balancing write costs with read costs. + Raising the value (to something like 1.4) will have more write costs, because you will + compact larger StoreFiles. However, during reads, HBase will need to seek through fewer + StoreFiles to accomplish the read. Consider this approach if you cannot take advantage of + Bloom filters. Otherwise, you can lower this value to something like 1.0 to reduce the + background cost of writes, and use Bloom filters to control the number of StoreFiles touched + during reads. For most cases, the default value is appropriate. + + + + hbase.hstore.compaction.ratio.offpeak + 5.0F + Allows you to set a different (by default, more aggressive) ratio for determining + whether larger StoreFiles are included in compactions during off-peak hours. Works in the + same way as hbase.hstore.compaction.ratio. Only applies if hbase.offpeak.start.hour and + hbase.offpeak.end.hour are also enabled. + + + + hbase.hstore.time.to.purge.deletes + 0 + The amount of time to delay purging of delete markers with future timestamps. If + unset, or set to 0, all delete markers, including those with future timestamps, are purged + during the next major compaction. Otherwise, a delete marker is kept until the major compaction + which occurs after the marker's timestamp plus the value of this setting, in milliseconds. + + + + hbase.offpeak.start.hour + -1 + The start of off-peak hours, expressed as an integer between 0 and 23, inclusive. + Set to -1 to disable off-peak. + + + + hbase.offpeak.end.hour + -1 + The end of off-peak hours, expressed as an integer between 0 and 23, inclusive. Set + to -1 to disable off-peak. + + + + hbase.regionserver.thread.compaction.throttle + 2684354560 + There are two different thread pools for compactions, one for large compactions and + the other for small compactions. This helps to keep compaction of lean tables (such as + hbase:meta) fast. If a compaction is larger than this threshold, it + goes into the large compaction pool. In most cases, the default value is appropriate. Default: + 2 x hbase.hstore.compaction.max x hbase.hregion.memstore.flush.size (which defaults to 128MB). 
+ The value field assumes that the value of hbase.hregion.memstore.flush.size is unchanged from + the default. + + + + hbase.regionserver.majorcompaction.pagecache.drop + true + Specifies whether to drop pages read/written into the system page cache by + major compactions. Setting it to true helps prevent major compactions from + polluting the page cache, which is almost always required, especially for clusters + with low/moderate memory to storage ratio. + + + + hbase.regionserver.minorcompaction.pagecache.drop + true + Specifies whether to drop pages read/written into the system page cache by + minor compactions. Setting it to true helps prevent minor compactions from + polluting the page cache, which is most beneficial on clusters with low + memory to storage ratio or very write heavy clusters. You may want to set it to + false under moderate to low write workload when bulk of the reads are + on the most recently written data. + + + + hbase.hstore.compaction.kv.max + 10 + The maximum number of KeyValues to read and then write in a batch when flushing or + compacting. Set this lower if you have big KeyValues and problems with Out Of Memory + Exceptions Set this higher if you have wide, small rows. + + + + hbase.storescanner.parallel.seek.enable + false + + Enables StoreFileScanner parallel-seeking in StoreScanner, + a feature which can reduce response latency under special conditions. + + + + hbase.storescanner.parallel.seek.threads + 10 + + The default thread pool size if parallel-seeking feature enabled. + + + + hfile.block.cache.policy + LRU + The eviction policy for the L1 block cache (LRU or TinyLFU). + + + hfile.block.cache.size + 0.4 + Percentage of maximum heap (-Xmx setting) to allocate to block cache + used by a StoreFile. Default of 0.4 means allocate 40%. + Set to 0 to disable but it's not recommended; you need at least + enough cache to hold the storefile indices. + + + + hfile.block.index.cacheonwrite + false + This allows to put non-root multi-level index blocks into the block + cache at the time the index is being written. + + + + hfile.index.block.max.size + 131072 + When the size of a leaf-level, intermediate-level, or root-level + index block in a multi-level block index grows to this size, the + block is written out and a new block is started. + + + + hbase.bucketcache.ioengine + + Where to store the contents of the bucketcache. One of: offheap, + file, files, mmap or pmem. If a file or files, set it to file(s):PATH_TO_FILE. + mmap means the content will be in an mmaped file. Use mmap:PATH_TO_FILE. 'pmem' + is bucket cache over a file on the persistent memory device. + Use pmem:PATH_TO_FILE. + See http://hbase.apache.org/book.html#offheap.blockcache for more information. + + + + hbase.hstore.compaction.throughput.lower.bound + 52428800 + The target lower bound on aggregate compaction throughput, in bytes/sec. Allows + you to tune the minimum available compaction throughput when the + PressureAwareCompactionThroughputController throughput controller is active. (It is active by + default.) + + + + hbase.hstore.compaction.throughput.higher.bound + 104857600 + The target upper bound on aggregate compaction throughput, in bytes/sec. Allows + you to control aggregate compaction throughput demand when the + PressureAwareCompactionThroughputController throughput controller is active. (It is active by + default.) The maximum throughput will be tuned between the lower and upper bounds when + compaction pressure is within the range [0.0, 1.0]. 
If compaction pressure is 1.0 or greater + the higher bound will be ignored until pressure returns to the normal range. + + + + hbase.bucketcache.size + + A float that EITHER represents a percentage of total heap memory + size to give to the cache (if < 1.0) OR, it is the total capacity in + megabytes of BucketCache. Default: 0.0 + + + + hbase.bucketcache.bucket.sizes + + A comma-separated list of sizes for buckets for the bucketcache. + Can be multiple sizes. List block sizes in order from smallest to largest. + The sizes you use will depend on your data access patterns. + Must be a multiple of 256 else you will run into + 'java.io.IOException: Invalid HFile block magic' when you go to read from cache. + If you specify no values here, then you pick up the default bucketsizes set + in code (See BucketAllocator#DEFAULT_BUCKET_SIZES). + + + + hfile.format.version + 3 + The HFile format version to use for new files. + Version 3 adds support for tags in hfiles (See http://hbase.apache.org/book.html#hbase.tags). + Also see the configuration 'hbase.replication.rpc.codec'. + + + + hfile.block.bloom.cacheonwrite + false + Enables cache-on-write for inline blocks of a compound Bloom filter. + + + io.storefile.bloom.block.size + 131072 + The size in bytes of a single block ("chunk") of a compound Bloom + filter. This size is approximate, because Bloom blocks can only be + inserted at data block boundaries, and the number of keys per data + block varies. + + + + hbase.rs.cacheblocksonwrite + false + Whether an HFile block should be added to the block cache when the + block is finished. + + + + hbase.rpc.timeout + 60000 + This is for the RPC layer to define how long (millisecond) HBase client applications + take for a remote call to time out. It uses pings to check connections + but will eventually throw a TimeoutException. + + + + hbase.client.operation.timeout + 1200000 + Operation timeout is a top-level restriction (millisecond) that makes sure a + blocking operation in Table will not be blocked more than this. In each operation, if rpc + request fails because of timeout or other reason, it will retry until success or throw + RetriesExhaustedException. But if the total time being blocking reach the operation timeout + before retries exhausted, it will break early and throw SocketTimeoutException. + + + + hbase.cells.scanned.per.heartbeat.check + 10000 + The number of cells scanned in between heartbeat checks. Heartbeat + checks occur during the processing of scans to determine whether or not the + server should stop scanning in order to send back a heartbeat message to the + client. Heartbeat messages are used to keep the client-server connection alive + during long running scans. Small values mean that the heartbeat checks will + occur more often and thus will provide a tighter bound on the execution time of + the scan. Larger values mean that the heartbeat checks occur less frequently + + + + hbase.rpc.shortoperation.timeout + 10000 + This is another version of "hbase.rpc.timeout". For those RPC operation + within cluster, we rely on this configuration to set a short timeout limitation + for short operation. For example, short rpc timeout for region server's trying + to report to active master can benefit quicker master failover process. + + + + hbase.ipc.client.tcpnodelay + true + Set no delay on rpc socket connections. 
See + http://docs.oracle.com/javase/1.5.0/docs/api/java/net/Socket.html#getTcpNoDelay() + + + + hbase.unsafe.regionserver.hostname + + This config is for experts: don't set its value unless you really know what you are doing. + When set to a non-empty value, this represents the (external facing) hostname for the underlying server. + See https://issues.apache.org/jira/browse/HBASE-12954 for details. + + + + hbase.unsafe.regionserver.hostname.disable.master.reversedns + false + This config is for experts: don't set its value unless you really know what you are doing. + When set to true, regionserver will use the current node hostname for the servername and HMaster will + skip reverse DNS lookup and use the hostname sent by regionserver instead. Note that this config and + hbase.unsafe.regionserver.hostname are mutually exclusive. See https://issues.apache.org/jira/browse/HBASE-18226 + for more details. + + + + + hbase.master.keytab.file + + Full path to the kerberos keytab file to use for logging in + the configured HMaster server principal. + + + + hbase.master.kerberos.principal + + Ex. "hbase/_HOST@EXAMPLE.COM". The kerberos principal name + that should be used to run the HMaster process. The principal name should + be in the form: user/hostname@DOMAIN. If "_HOST" is used as the hostname + portion, it will be replaced with the actual hostname of the running + instance. + + + + hbase.regionserver.keytab.file + + Full path to the kerberos keytab file to use for logging in + the configured HRegionServer server principal. + + + + hbase.regionserver.kerberos.principal + + Ex. "hbase/_HOST@EXAMPLE.COM". The kerberos principal name + that should be used to run the HRegionServer process. The principal name + should be in the form: user/hostname@DOMAIN. If "_HOST" is used as the + hostname portion, it will be replaced with the actual hostname of the + running instance. An entry for this principal must exist in the file + specified in hbase.regionserver.keytab.file + + + + + hadoop.policy.file + hbase-policy.xml + The policy configuration file used by RPC servers to make + authorization decisions on client requests. Only used when HBase + security is enabled. + + + + hbase.superuser + + List of users or groups (comma-separated), who are allowed + full privileges, regardless of stored ACLs, across the cluster. + Only used when HBase security is enabled. + + + + hbase.auth.key.update.interval + 86400000 + The update interval for master key for authentication tokens + in servers in milliseconds. Only used when HBase security is enabled. + + + + hbase.auth.token.max.lifetime + 604800000 + The maximum lifetime in milliseconds after which an + authentication token expires. Only used when HBase security is enabled. + + + + hbase.ipc.client.fallback-to-simple-auth-allowed + false + When a client is configured to attempt a secure connection, but attempts to + connect to an insecure server, that server may instruct the client to + switch to SASL SIMPLE (unsecure) authentication. This setting controls + whether or not the client will accept this instruction from the server. + When false (the default), the client will not allow the fallback to SIMPLE + authentication, and will abort the connection. + + + + hbase.ipc.server.fallback-to-simple-auth-allowed + false + When a server is configured to require secure connections, it will + reject connection attempts from clients using SASL SIMPLE (unsecure) authentication. 
+ This setting allows secure servers to accept SASL SIMPLE connections from clients + when the client requests. When false (the default), the server will not allow the fallback + to SIMPLE authentication, and will reject the connection. WARNING: This setting should ONLY + be used as a temporary measure while converting clients over to secure authentication. It + MUST BE DISABLED for secure operation. + + + + hbase.display.keys + true + When this is set to true the webUI and such will display all start/end keys + as part of the table details, region names, etc. When this is set to false, + the keys are hidden. + + + + hbase.coprocessor.enabled + true + Enables or disables coprocessor loading. If 'false' + (disabled), any other coprocessor related configuration will be ignored. + + + + hbase.coprocessor.user.enabled + true + Enables or disables user (aka. table) coprocessor loading. + If 'false' (disabled), any table coprocessor attributes in table + descriptors will be ignored. If "hbase.coprocessor.enabled" is 'false' + this setting has no effect. + + + + hbase.coprocessor.region.classes + + A comma-separated list of Coprocessors that are loaded by + default on all tables. For any override coprocessor method, these classes + will be called in order. After implementing your own Coprocessor, just put + it in HBase's classpath and add the fully qualified class name here. + A coprocessor can also be loaded on demand by setting HTableDescriptor. + + + + hbase.coprocessor.master.classes + + A comma-separated list of + org.apache.hadoop.hbase.coprocessor.MasterObserver coprocessors that are + loaded by default on the active HMaster process. For any implemented + coprocessor methods, the listed classes will be called in order. After + implementing your own MasterObserver, just put it in HBase's classpath + and add the fully qualified class name here. + + + + hbase.coprocessor.abortonerror + true + Set to true to cause the hosting server (master or regionserver) + to abort if a coprocessor fails to load, fails to initialize, or throws an + unexpected Throwable object. Setting this to false will allow the server to + continue execution but the system wide state of the coprocessor in question + will become inconsistent as it will be properly executing in only a subset + of servers, so this is most useful for debugging only. + + + + hbase.rest.port + 8080 + The port for the HBase REST server. + + + hbase.rest.readonly + false + Defines the mode the REST server will be started in. Possible values are: + false: All HTTP methods are permitted - GET/PUT/POST/DELETE. + true: Only the GET method is permitted. + + + + hbase.rest.threads.max + 100 + The maximum number of threads of the REST server thread pool. + Threads in the pool are reused to process REST requests. This + controls the maximum number of requests processed concurrently. + It may help to control the memory used by the REST server to + avoid OOM issues. If the thread pool is full, incoming requests + will be queued up and wait for some free threads. + + + + hbase.rest.threads.min + 2 + The minimum number of threads of the REST server thread pool. + The thread pool always has at least these number of threads so + the REST server is ready to serve incoming requests. + + + + hbase.rest.support.proxyuser + false + Enables running the REST server to support proxy-user mode. + + + hbase.defaults.for.version + 2.4.9 + This defaults file was compiled for version ${project.version}. 
This variable is used + to make sure that a user doesn't have an old version of hbase-default.xml on the + classpath. + + + + hbase.defaults.for.version.skip + true + Set to true to skip the 'hbase.defaults.for.version' check. + Setting this to true can be useful in contexts other than + the other side of a maven generation; i.e. running in an + IDE. You'll want to set this boolean to true to avoid + seeing the RuntimeException complaint: "hbase-default.xml file + seems to be for and old version of HBase (\${hbase.version}), this + version is X.X.X-SNAPSHOT" + + + + hbase.table.lock.enable + true + Set to true to enable locking the table in zookeeper for schema change operations. + Table locking from master prevents concurrent schema modifications to corrupt table + state. + + + + hbase.table.max.rowsize + 1073741824 + + Maximum size of single row in bytes (default is 1 Gb) for Get'ting + or Scan'ning without in-row scan flag set. If row size exceeds this limit + RowTooBigException is thrown to client. + + + + hbase.thrift.minWorkerThreads + 16 + The "core size" of the thread pool. New threads are created on every + connection until this many threads are created. + + + + hbase.thrift.maxWorkerThreads + 1000 + The maximum size of the thread pool. When the pending request queue + overflows, new threads are created until their number reaches this number. + After that, the server starts dropping connections. + + + + hbase.thrift.maxQueuedRequests + 1000 + The maximum number of pending Thrift connections waiting in the queue. If + there are no idle threads in the pool, the server queues requests. Only + when the queue overflows, new threads are added, up to + hbase.thrift.maxQueuedRequests threads. + + + + hbase.regionserver.thrift.framed + false + Use Thrift TFramedTransport on the server side. + This is the recommended transport for thrift servers and requires a similar setting + on the client side. Changing this to false will select the default transport, + vulnerable to DoS when malformed requests are issued due to THRIFT-601. + + + + hbase.regionserver.thrift.framed.max_frame_size_in_mb + 2 + Default frame size when using framed transport, in MB + + + hbase.regionserver.thrift.compact + false + Use Thrift TCompactProtocol binary serialization protocol. + + + hbase.rootdir.perms + 700 + FS Permissions for the root data subdirectory in a secure (kerberos) setup. + When master starts, it creates the rootdir with this permissions or sets the permissions + if it does not match. + + + + hbase.wal.dir.perms + 700 + FS Permissions for the root WAL directory in a secure(kerberos) setup. + When master starts, it creates the WAL dir with this permissions or sets the permissions + if it does not match. + + + + hbase.data.umask.enable + false + Enable, if true, that file permissions should be assigned + to the files written by the regionserver + + + + hbase.data.umask + 000 + File permissions that should be used to write data + files when hbase.data.umask.enable is true + + + + hbase.snapshot.enabled + true + Set to true to allow snapshots to be taken / restored / cloned. + + + hbase.snapshot.restore.take.failsafe.snapshot + true + Set to true to take a snapshot before the restore operation. + The snapshot taken will be used in case of failure, to restore the previous state. + At the end of the restore operation this snapshot will be deleted + + + + hbase.snapshot.restore.failsafe.name + hbase-failsafe-{snapshot.name}-{restore.timestamp} + Name of the failsafe snapshot taken by the restore operation. 
+ You can use the {snapshot.name}, {table.name} and {restore.timestamp} variables + to create a name based on what you are restoring. + + + + hbase.snapshot.working.dir + + Location where the snapshotting process will occur. The location of the + completed snapshots will not change, but the temporary directory where the snapshot + process occurs will be set to this location. This can be a separate filesystem than + the root directory, for performance increase purposes. See HBASE-21098 for more + information + + + + hbase.server.compactchecker.interval.multiplier + 1000 + The number that determines how often we scan to see if compaction is necessary. + Normally, compactions are done after some events (such as memstore flush), but if + region didn't receive a lot of writes for some time, or due to different compaction + policies, it may be necessary to check it periodically. The interval between checks is + hbase.server.compactchecker.interval.multiplier multiplied by + hbase.server.thread.wakefrequency. + + + + hbase.lease.recovery.timeout + 900000 + How long we wait on dfs lease recovery in total before giving up. + + + hbase.lease.recovery.dfs.timeout + 64000 + How long between dfs recover lease invocations. Should be larger than the sum of + the time it takes for the namenode to issue a block recovery command as part of + datanode; dfs.heartbeat.interval and the time it takes for the primary + datanode, performing block recovery to timeout on a dead datanode; usually + dfs.client.socket-timeout. See the end of HBASE-8389 for more. + + + + hbase.column.max.version + 1 + New column family descriptors will use this value as the default number of versions + to keep. + + + + dfs.client.read.shortcircuit + + + If set to true, this configuration parameter enables short-circuit local + reads. + + + + dfs.domain.socket.path + + + This is a path to a UNIX domain socket that will be used for + communication between the DataNode and local HDFS clients, if + dfs.client.read.shortcircuit is set to true. If the string "_PORT" is + present in this path, it will be replaced by the TCP port of the DataNode. + Be careful about permissions for the directory that hosts the shared + domain socket; dfsclient will complain if open to other users than the HBase user. + + + + hbase.dfs.client.read.shortcircuit.buffer.size + 131072 + If the DFSClient configuration + dfs.client.read.shortcircuit.buffer.size is unset, we will + use what is configured here as the short circuit read default + direct byte buffer size. DFSClient native default is 1MB; HBase + keeps its HDFS files open so number of file blocks * 1MB soon + starts to add up and threaten OOME because of a shortage of + direct memory. So, we set it down from the default. Make + it > the default hbase block size set in the HColumnDescriptor + which is usually 64k. + + + + hbase.regionserver.checksum.verify + true + + If set to true (the default), HBase verifies the checksums for hfile + blocks. HBase writes checksums inline with the data when it writes out + hfiles. HDFS (as of this writing) writes checksums to a separate file + than the data file necessitating extra seeks. Setting this flag saves + some on i/o. Checksum verification by HDFS will be internally disabled + on hfile streams when this flag is set. If the hbase-checksum verification + fails, we will switch back to using HDFS checksums (so do not disable HDFS + checksums! And besides this feature applies to hfiles only, not to WALs). 
+ If this parameter is set to false, then hbase will not verify any checksums, + instead it will depend on checksum verification being done in the HDFS client. + + + + hbase.hstore.bytes.per.checksum + 16384 + + Number of bytes in a newly created checksum chunk for HBase-level + checksums in hfile blocks. + + + + hbase.hstore.checksum.algorithm + CRC32C + + Name of an algorithm that is used to compute checksums. Possible values + are NULL, CRC32, CRC32C. + + + + hbase.client.scanner.max.result.size + 2097152 + Maximum number of bytes returned when calling a scanner's next method. + Note that when a single row is larger than this limit the row is still returned completely. + The default value is 2MB, which is good for 1ge networks. + With faster and/or high latency networks this value should be increased. + + + + hbase.server.scanner.max.result.size + 104857600 + Maximum number of bytes returned when calling a scanner's next method. + Note that when a single row is larger than this limit the row is still returned completely. + The default value is 100MB. + This is a safety setting to protect the server from OOM situations. + + + + hbase.status.published + false + + This setting activates the publication by the master of the status of the region server. + When a region server dies and its recovery starts, the master will push this information + to the client application, to let them cut the connection immediately instead of waiting + for a timeout. + + + + hbase.status.publisher.class + org.apache.hadoop.hbase.master.ClusterStatusPublisher$MulticastPublisher + + Implementation of the status publication with a multicast message. + + + + hbase.status.multicast.address.ip + 226.1.1.3 + + Multicast address to use for the status publication by multicast. + + + + hbase.status.multicast.address.port + 16100 + + Multicast port to use for the status publication by multicast. + + + + hbase.dynamic.jars.dir + ${hbase.rootdir}/lib + + The directory from which the custom filter JARs can be loaded + dynamically by the region server without the need to restart. However, + an already loaded filter/co-processor class would not be un-loaded. See + HBASE-1936 for more details. + + Does not apply to coprocessors. + + + + hbase.security.authentication + simple + + Controls whether or not secure authentication is enabled for HBase. + Possible values are 'simple' (no authentication), and 'kerberos'. + + + + hbase.rest.filter.classes + org.apache.hadoop.hbase.rest.filter.GzipFilter + + Servlet filters for REST service. + + + + hbase.master.loadbalancer.class + org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer + + Class used to execute the regions balancing when the period occurs. + See the class comment for more on how it works + http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.html + It replaces the DefaultLoadBalancer as the default (since renamed + as the SimpleLoadBalancer). + + + + hbase.master.loadbalance.bytable + false + Factor Table name when the balancer runs. + Default: false. + + + + hbase.master.normalizer.class + org.apache.hadoop.hbase.master.normalizer.SimpleRegionNormalizer + + Class used to execute the region normalization when the period occurs. 
+ See the class comment for more on how it works + http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/master/normalizer/SimpleRegionNormalizer.html + + + + hbase.rest.csrf.enabled + false + + Set to true to enable protection against cross-site request forgery (CSRF) + + + + hbase.rest-csrf.browser-useragents-regex + ^Mozilla.*,^Opera.* + + A comma-separated list of regular expressions used to match against an HTTP + request's User-Agent header when protection against cross-site request + forgery (CSRF) is enabled for REST server by setting + hbase.rest.csrf.enabled to true. If the incoming User-Agent matches + any of these regular expressions, then the request is considered to be sent + by a browser, and therefore CSRF prevention is enforced. If the request's + User-Agent does not match any of these regular expressions, then the request + is considered to be sent by something other than a browser, such as scripted + automation. In this case, CSRF is not a potential attack vector, so + the prevention is not enforced. This helps achieve backwards-compatibility + with existing automation that has not been updated to send the CSRF + prevention header. + + + + hbase.security.exec.permission.checks + false + + If this setting is enabled and ACL based access control is active (the + AccessController coprocessor is installed either as a system coprocessor + or on a table as a table coprocessor) then you must grant all relevant + users EXEC privilege if they require the ability to execute coprocessor + endpoint calls. EXEC privilege, like any other permission, can be + granted globally to a user, or to a user on a per table or per namespace + basis. For more information on coprocessor endpoints, see the coprocessor + section of the HBase online manual. For more information on granting or + revoking permissions using the AccessController, see the security + section of the HBase online manual. + + + + hbase.procedure.regionserver.classes + + A comma-separated list of + org.apache.hadoop.hbase.procedure.RegionServerProcedureManager procedure managers that are + loaded by default on the active HRegionServer process. The lifecycle methods (init/start/stop) + will be called by the active HRegionServer process to perform the specific globally barriered + procedure. After implementing your own RegionServerProcedureManager, just put it in + HBase's classpath and add the fully qualified class name here. + + + + hbase.procedure.master.classes + + A comma-separated list of + org.apache.hadoop.hbase.procedure.MasterProcedureManager procedure managers that are + loaded by default on the active HMaster process. A procedure is identified by its signature and + users can use the signature and an instant name to trigger an execution of a globally barriered + procedure. After implementing your own MasterProcedureManager, just put it in HBase's classpath + and add the fully qualified class name here. + + + + hbase.coordinated.state.manager.class + org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager + Fully qualified name of class implementing coordinated state manager. + + + hbase.regionserver.storefile.refresh.period + 0 + + The period (in milliseconds) for refreshing the store files for the secondary regions. 0 + means this feature is disabled. Secondary regions sees new files (from flushes and + compactions) from primary once the secondary region refreshes the list of files in the + region (there is no notification mechanism). But too frequent refreshes might cause + extra Namenode pressure. 
+ If the files cannot be refreshed for longer than HFile TTL
+ (hbase.master.hfilecleaner.ttl) the requests are rejected. Configuring HFile TTL to a larger
+ value is also recommended with this setting.
+ + + + hbase.region.replica.replication.enabled + false + + Whether asynchronous WAL replication to the secondary region replicas is enabled or not.
+ If this is enabled, a replication peer named "region_replica_replication" will be created
+ which will tail the logs and replicate the mutations to region replicas for tables that
+ have region replication > 1. If this is enabled once, disabling this replication also
+ requires disabling the replication peer using shell or Admin java class.
+ Replication to secondary region replicas works over standard inter-cluster replication.
+ + + + hbase.http.filter.initializers + org.apache.hadoop.hbase.http.lib.StaticUserWebFilter + + A comma-separated list of class names. Each class in the list must extend
+ org.apache.hadoop.hbase.http.FilterInitializer. The corresponding Filter will
+ be initialized. Then, the Filter will be applied to all user facing jsp
+ and servlet web pages.
+ The ordering of the list defines the ordering of the filters.
+ The default StaticUserWebFilter adds a user principal as defined by the
+ hbase.http.staticuser.user property.
+ + + + hbase.security.visibility.mutations.checkauths + false + + This property, if enabled, will check whether the labels in the visibility
+ expression are associated with the user issuing the mutation.
+ + + + hbase.http.max.threads + 16 + + The maximum number of threads that the HTTP Server will create in its
+ ThreadPool.
+ + + + hbase.replication.rpc.codec + org.apache.hadoop.hbase.codec.KeyValueCodecWithTags + + The codec that is to be used when replication is enabled so that
+ the tags are also replicated. This is used along with HFileV3 which
+ supports tags in them. If tags are not used or if the hfile version used
+ is HFileV2 then KeyValueCodec can be used as the replication codec. Note that
+ using KeyValueCodecWithTags for replication when there are no tags causes no harm.
+ + + + hbase.replication.source.maxthreads + 10 + + The maximum number of threads any replication source will use for
+ shipping edits to the sinks in parallel. This also limits the number of
+ chunks each replication batch is broken into. Larger values can improve
+ the replication throughput between the master and slave clusters. The
+ default of 10 will rarely need to be changed.
+ + + + + hbase.http.staticuser.user + dr.stack + + The user name to filter as, on static web filters
+ while rendering content. An example use is the HDFS
+ web UI (user to be used for browsing files).
+ + + + hbase.regionserver.handler.abort.on.error.percent + 0.5 + The percent of region server RPC handler threads that must fail for the RS to abort.
+ -1 Disable aborting; 0 Abort if even a single handler has died;
+ 0.x Abort only when this percent of handlers have died;
+ 1 Abort only when all of the handlers have died.
+ + + + + hbase.mob.file.cache.size + 1000 + + Number of opened file handlers to cache.
+ A larger value will benefit reads by providing more file handlers per mob
+ file cache and would reduce frequent file opening and closing.
+ However, if this is set too high, this could lead to a "too many opened file handlers" error.
+ The default value is 1000.
+ + + + hbase.mob.cache.evict.period + 3600 + + The amount of time in seconds before the mob cache evicts cached mob files.
+ The default value is 3600 seconds.
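The abort semantics described for hbase.regionserver.handler.abort.on.error.percent are easy to misread, so here is a small helper that merely restates the rule from the description (-1 never aborts, 0 aborts on the first dead handler, a fraction aborts once that share of handlers has died). It is an interpretation of the documented behavior, not the RegionServer's actual implementation, and the class name is hypothetical.

public final class HandlerAbortPolicy {

  private HandlerAbortPolicy() {
  }

  /**
   * Restates hbase.regionserver.handler.abort.on.error.percent as documented above:
   * -1 disables aborting, 0 aborts as soon as one handler dies, and a value in (0, 1]
   * aborts once deadHandlers / totalHandlers reaches the configured percent.
   */
  public static boolean shouldAbort(double percent, int deadHandlers, int totalHandlers) {
    if (percent < 0) {
      return false;                      // aborting disabled
    }
    if (percent == 0) {
      return deadHandlers >= 1;          // a single dead handler is enough
    }
    return totalHandlers > 0 && (double) deadHandlers / totalHandlers >= percent;
  }

  public static void main(String[] args) {
    // With the default 0.5 and 30 handlers, the server aborts once 15 have died.
    System.out.println(shouldAbort(0.5, 15, 30)); // true
    System.out.println(shouldAbort(0.5, 10, 30)); // false
    System.out.println(shouldAbort(-1, 30, 30));  // false: aborting disabled
  }
}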
+ + + + hbase.mob.cache.evict.remain.ratio + 0.5f + + The ratio (between 0.0 and 1.0) of files that remain cached after an eviction
+ is triggered when the number of cached mob files exceeds the hbase.mob.file.cache.size.
+ The default value is 0.5f.
+ + + + hbase.master.mob.ttl.cleaner.period + 86400 + + The period that ExpiredMobFileCleanerChore runs. The unit is seconds.
+ The default value is one day. The MOB file name uses only the date part of
+ the file creation time in it. We use this time for deciding TTL expiry of
+ the files. So the removal of TTL expired files might be delayed. The max
+ delay might be 24 hrs.
+ + + + hbase.mob.compaction.mergeable.threshold + 1342177280 + + If the size of a mob file is less than this value, it's regarded as a small
+ file and needs to be merged in mob compaction. The default value is 1280MB.
+ + + + hbase.mob.delfile.max.count + 3 + + The max number of del files that is allowed in the mob compaction.
+ In the mob compaction, when the number of existing del files is larger than
+ this value, they are merged until the number of del files is not larger than this value.
+ The default value is 3.
+ + + + hbase.mob.compaction.batch.size + 100 + + The max number of mob files that is allowed in a batch of the mob compaction.
+ The mob compaction merges the small mob files to bigger ones. If the number of the
+ small files is very large, it could lead to a "too many opened file handlers" error in the merge.
+ And the merge has to be split into batches. This value limits the number of mob files
+ that are selected in a batch of the mob compaction. The default value is 100.
+ + + + hbase.mob.compaction.chore.period + 604800 + + The period that MobCompactionChore runs. The unit is seconds.
+ The default value is one week.
+ + + + hbase.mob.compactor.class + org.apache.hadoop.hbase.mob.compactions.PartitionedMobCompactor + + Implementation of mob compactor, the default one is PartitionedMobCompactor.
+ + + + hbase.mob.compaction.threads.max + 1 + + The max number of threads used in MobCompactor.
+ + + + hbase.snapshot.master.timeout.millis + 300000 + + Timeout for master for the snapshot procedure execution.
+ + + + hbase.snapshot.region.timeout + 300000 + + Timeout for regionservers to keep threads in snapshot request pool waiting.
+ + + + hbase.rpc.rows.warning.threshold + 5000 + + Number of rows in a batch operation above which a warning will be logged.
+ + + + hbase.master.wait.on.service.seconds + 30 + Default is 5 minutes. Make it 30 seconds for tests. See
+ HBASE-19794 for some context.
+ + + + hbase.master.cleaner.snapshot.interval + 1800000 + + Snapshot Cleanup chore interval in milliseconds.
+ The cleanup thread keeps running at this interval
+ to find all snapshots that are expired based on TTL
+ and delete them.
+ + + + hbase.master.snapshot.ttl + 0 + + Default Snapshot TTL to be considered when the user does not specify TTL while
+ creating snapshot. Default value 0 indicates FOREVER - the snapshot should not be
+ automatically deleted until it is manually deleted.
+ + + + hbase.master.regions.recovery.check.interval + 1200000 + + Regions Recovery Chore interval in milliseconds.
+ This chore keeps running at this interval to
+ find all regions with configurable max store file ref count
+ and reopens them.
+ + + + hbase.regions.recovery.store.file.ref.count + -1 + + A very large ref count on a compacted
+ store file indicates that it is a ref leak
+ on that object (compacted store file).
+ Such files can not be removed after + it is invalidated via compaction. + Only way to recover in such scenario is to + reopen the region which can release + all resources, like the refcount, + leases, etc. This config represents Store files Ref + Count threshold value considered for reopening + regions. Any region with compacted store files + ref count > this value would be eligible for + reopening by master. Here, we get the max + refCount among all refCounts on all + compacted away store files that belong to a + particular region. Default value -1 indicates + this feature is turned off. Only positive + integer value should be provided to + enable this feature. + + + + hbase.regionserver.slowlog.ringbuffer.size + 256 + + Default size of ringbuffer to be maintained by each RegionServer in order + to store online slowlog responses. This is an in-memory ring buffer of + requests that were judged to be too slow in addition to the responseTooSlow + logging. The in-memory representation would be complete. + For more details, please look into Doc Section: + Get Slow Response Log from shell + + + + hbase.regionserver.slowlog.buffer.enabled + false + + Indicates whether RegionServers have ring buffer running for storing + Online Slow logs in FIFO manner with limited entries. The size of + the ring buffer is indicated by config: hbase.regionserver.slowlog.ringbuffer.size + The default value is false, turn this on and get latest slowlog + responses with complete data. + + + + hbase.regionserver.slowlog.systable.enabled + false + + Should be enabled only if hbase.regionserver.slowlog.buffer.enabled is enabled. If enabled + (true), all slow/large RPC logs would be persisted to system table hbase:slowlog (in addition + to in-memory ring buffer at each RegionServer). The records are stored in increasing + order of time. Operators can scan the table with various combination of ColumnValueFilter. + More details are provided in the doc section: + "Get Slow/Large Response Logs from System table hbase:slowlog" + + + + hbase.rpc.rows.size.threshold.reject + false + + If value is true, RegionServer will abort batch requests of Put/Delete with number of rows + in a batch operation exceeding threshold defined by value of config: + hbase.rpc.rows.warning.threshold. The default value is false and hence, by default, only + warning will be logged. This config should be turned on to prevent RegionServer from serving + very large batch size of rows and this way we can improve CPU usages by discarding + too large batch request. + + + + hbase.namedqueue.provider.classes + + org.apache.hadoop.hbase.namequeues.impl.SlowLogQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerDecisionQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerRejectionQueueService + + + Default values for NamedQueueService implementors. This comma separated full class names + represent all implementors of NamedQueueService that we would like to be invoked by + LogEvent handler service. One example of NamedQueue service is SlowLogQueueService which + is used to store slow/large RPC logs in ringbuffer at each RegionServer. + All implementors of NamedQueueService should be found under package: + "org.apache.hadoop.hbase.namequeues.impl" + + + + hbase.master.balancer.decision.buffer.enabled + false + + Indicates whether active HMaster has ring buffer running for storing + balancer decisions in FIFO manner with limited entries. 
The size of + the ring buffer is indicated by config: hbase.master.balancer.decision.queue.size + + + + hbase.master.balancer.rejection.buffer.enabled + false + + Indicates whether active HMaster has ring buffer running for storing + balancer rejection in FIFO manner with limited entries. The size of + the ring buffer is indicated by config: hbase.master.balancer.rejection.queue.size + + + diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java index 40db67b50870e..483c49b1f50bc 100644 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java @@ -18,20 +18,31 @@ package org.apache.hudi.avro; -import org.apache.avro.JsonProperties; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.SchemaCompatibilityException; +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.JsonProperties; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; -import org.codehaus.jackson.node.NullNode; + import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Assertions; +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.sql.Date; import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; +import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldSchemaFromWriteSchema; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -47,8 +58,10 @@ public class TestHoodieAvroUtils { + "{\"name\": \"timestamp\",\"type\": \"double\"},{\"name\": \"_row_key\", \"type\": \"string\"}," + "{\"name\": \"non_pii_col\", \"type\": \"string\"}," + "{\"name\": \"pii_col\", \"type\": \"string\", \"column_category\": \"user_profile\"}," - + "{\"name\": \"new_col1\", \"type\": \"string\", \"default\": \"dummy_val\"}," - + "{\"name\": \"new_col2\", \"type\": [\"int\", \"null\"]}]}"; + + "{\"name\": \"new_col_not_nullable_default_dummy_val\", \"type\": \"string\", \"default\": \"dummy_val\"}," + + "{\"name\": \"new_col_nullable_wo_default\", \"type\": [\"int\", \"null\"]}," + + "{\"name\": \"new_col_nullable_default_null\", \"type\": [\"null\" ,\"string\"],\"default\": null}," + + "{\"name\": \"new_col_nullable_default_dummy_val\", \"type\": [\"string\" ,\"null\"],\"default\": \"dummy_val\"}]}"; private static String EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ " + "{\"name\": \"timestamp\",\"type\": \"double\"},{\"name\": \"_row_key\", \"type\": \"string\"}," @@ -80,6 +93,23 @@ public class TestHoodieAvroUtils { + "{\"name\": \"nullable_field\",\"type\": [\"null\" ,\"string\"],\"default\": null}," + "{\"name\": \"non_nullable_field_with_default\",\"type\": \"string\", \"default\": \"dummy\"}]}"; + private static String SCHEMA_WITH_DECIMAL_FIELD = "{\"type\":\"record\",\"name\":\"record\",\"fields\":[" + + "{\"name\":\"key_col\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"decimal_col\",\"type\":[\"null\"," + + "{\"type\":\"bytes\",\"logicalType\":\"decimal\",\"precision\":8,\"scale\":4}],\"default\":null}]}"; + + 
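  // Editorial aside (illustrative only, not part of the original patch): SCHEMA_WITH_DECIMAL_FIELD
  // above declares decimal_col with the Avro "decimal" logical type backed by bytes
  // (precision 8, scale 4). A BigDecimal is encoded as the big-endian two's-complement bytes
  // of its unscaled value, which is what testGetNestedFieldValWithDecimalField below relies on:
  //   BigDecimal d = new BigDecimal("1234.5678");
  //   record.put("decimal_col", ByteBuffer.wrap(d.unscaledValue().toByteArray()));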
private static String SCHEMA_WITH_NESTED_FIELD = "{\"name\":\"MyClass\",\"type\":\"record\",\"namespace\":\"com.acme.avro\",\"fields\":[" + + "{\"name\":\"firstname\",\"type\":\"string\"}," + + "{\"name\":\"lastname\",\"type\":\"string\"}," + + "{\"name\":\"student\",\"type\":{\"name\":\"student\",\"type\":\"record\",\"fields\":[" + + "{\"name\":\"firstname\",\"type\":[\"null\" ,\"string\"],\"default\": null},{\"name\":\"lastname\",\"type\":[\"null\" ,\"string\"],\"default\": null}]}}]}"; + + private static String SCHEMA_WITH_NESTED_FIELD_RENAMED = "{\"name\":\"MyClass\",\"type\":\"record\",\"namespace\":\"com.acme.avro\",\"fields\":[" + + "{\"name\":\"fn\",\"type\":\"string\"}," + + "{\"name\":\"ln\",\"type\":\"string\"}," + + "{\"name\":\"ss\",\"type\":{\"name\":\"ss\",\"type\":\"record\",\"fields\":[" + + "{\"name\":\"fn\",\"type\":[\"null\" ,\"string\"],\"default\": null},{\"name\":\"ln\",\"type\":[\"null\" ,\"string\"],\"default\": null}]}}]}"; + @Test public void testPropsPresent() { Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EXAMPLE_SCHEMA)); @@ -112,8 +142,10 @@ public void testDefaultValue() { rec.put("timestamp", 3.5); Schema schemaWithMetadata = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EVOLVED_SCHEMA)); GenericRecord rec1 = HoodieAvroUtils.rewriteRecord(rec, schemaWithMetadata); - assertEquals(rec1.get("new_col1"), "dummy_val"); - assertNull(rec1.get("new_col2")); + assertEquals("dummy_val", rec1.get("new_col_not_nullable_default_dummy_val")); + assertNull(rec1.get("new_col_nullable_wo_default")); + assertNull(rec1.get("new_col_nullable_default_null")); + assertEquals("dummy_val", rec1.get("new_col_nullable_default_dummy_val")); assertNull(rec1.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); } @@ -125,8 +157,8 @@ public void testDefaultValueWithSchemaEvolution() { rec.put("pii_col", "val2"); rec.put("timestamp", 3.5); GenericRecord rec1 = HoodieAvroUtils.rewriteRecord(rec, new Schema.Parser().parse(EVOLVED_SCHEMA)); - assertEquals(rec1.get("new_col1"), "dummy_val"); - assertNull(rec1.get("new_col2")); + assertEquals("dummy_val", rec1.get("new_col_not_nullable_default_dummy_val")); + assertNull(rec1.get("new_col_nullable_wo_default")); } @Test @@ -160,7 +192,7 @@ public void testNonNullableFieldWithDefault() { rec.put("pii_col", "val2"); rec.put("timestamp", 3.5); GenericRecord rec1 = HoodieAvroUtils.rewriteRecord(rec, new Schema.Parser().parse(SCHEMA_WITH_NON_NULLABLE_FIELD_WITH_DEFAULT)); - assertEquals(rec1.get("non_nullable_field_with_default"), "dummy"); + assertEquals("dummy", rec1.get("non_nullable_field_with_default")); } @Test @@ -184,7 +216,7 @@ public void testJsonNodeNullWithDefaultValues() { Schema.Field evolvedField1 = new Schema.Field("key", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); Schema.Field evolvedField2 = new Schema.Field("key1", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); Schema.Field evolvedField3 = new Schema.Field("key2", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); - Schema.Field evolvedField4 = new Schema.Field("evolved_field", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", NullNode.getInstance()); + Schema.Field evolvedField4 = new Schema.Field("evolved_field", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); Schema.Field evolvedField5 = new Schema.Field("evolved_field1", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); evolvedFields.add(evolvedField1); 
evolvedFields.add(evolvedField2); @@ -203,8 +235,196 @@ public void testJsonNodeNullWithDefaultValues() { @Test public void testAddingAndRemovingMetadataFields() { Schema schemaWithMetaCols = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EXAMPLE_SCHEMA)); - assertEquals(schemaWithMetaCols.getFields().size(), NUM_FIELDS_IN_EXAMPLE_SCHEMA + HoodieRecord.HOODIE_META_COLUMNS.size()); + assertEquals(NUM_FIELDS_IN_EXAMPLE_SCHEMA + HoodieRecord.HOODIE_META_COLUMNS.size(), schemaWithMetaCols.getFields().size()); Schema schemaWithoutMetaCols = HoodieAvroUtils.removeMetadataFields(schemaWithMetaCols); - assertEquals(schemaWithoutMetaCols.getFields().size(), NUM_FIELDS_IN_EXAMPLE_SCHEMA); + assertEquals(NUM_FIELDS_IN_EXAMPLE_SCHEMA, schemaWithoutMetaCols.getFields().size()); + } + + @Test + public void testRemoveFields() { + // partitioned table test. + String schemaStr = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ " + + "{\"name\": \"timestamp\",\"type\": \"double\"},{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"non_pii_col\", \"type\": \"string\"}]},"; + Schema expectedSchema = new Schema.Parser().parse(schemaStr); + GenericRecord rec = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA)); + rec.put("_row_key", "key1"); + rec.put("non_pii_col", "val1"); + rec.put("pii_col", "val2"); + rec.put("timestamp", 3.5); + GenericRecord rec1 = HoodieAvroUtils.removeFields(rec, Collections.singleton("pii_col")); + assertEquals("key1", rec1.get("_row_key")); + assertEquals("val1", rec1.get("non_pii_col")); + assertEquals(3.5, rec1.get("timestamp")); + if (HoodieAvroUtils.gteqAvro1_10()) { + GenericRecord finalRec1 = rec1; + assertThrows(AvroRuntimeException.class, () -> finalRec1.get("pii_col")); + } else { + assertNull(rec1.get("pii_col")); + } + assertEquals(expectedSchema, rec1.getSchema()); + + // non-partitioned table test with empty list of fields. + schemaStr = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ " + + "{\"name\": \"timestamp\",\"type\": \"double\"},{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"non_pii_col\", \"type\": \"string\"}," + + "{\"name\": \"pii_col\", \"type\": \"string\"}]},"; + expectedSchema = new Schema.Parser().parse(schemaStr); + rec1 = HoodieAvroUtils.removeFields(rec, Collections.singleton("")); + assertEquals(expectedSchema, rec1.getSchema()); + } + + @Test + public void testGetRootLevelFieldName() { + assertEquals("a", HoodieAvroUtils.getRootLevelFieldName("a.b.c")); + assertEquals("a", HoodieAvroUtils.getRootLevelFieldName("a")); + assertEquals("", HoodieAvroUtils.getRootLevelFieldName("")); + } + + @Test + public void testGetNestedFieldVal() { + GenericRecord rec = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA)); + rec.put("_row_key", "key1"); + rec.put("non_pii_col", "val1"); + rec.put("pii_col", "val2"); + + Object rowKey = HoodieAvroUtils.getNestedFieldVal(rec, "_row_key", true, false); + assertEquals("key1", rowKey); + + Object rowKeyNotExist = HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", true, false); + assertNull(rowKeyNotExist); + + // Field does not exist + assertEquals("fake_key(Part -fake_key) field not found in record. 
Acceptable fields were :[timestamp, _row_key, non_pii_col, pii_col]", + assertThrows(HoodieException.class, () -> + HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", false, false)).getMessage()); + + // Field exists while value not + assertNull(HoodieAvroUtils.getNestedFieldVal(rec, "timestamp", false, false)); + } + + @Test + public void testGetNestedFieldValWithNestedField() { + Schema nestedSchema = new Schema.Parser().parse(SCHEMA_WITH_NESTED_FIELD); + GenericRecord rec = new GenericData.Record(nestedSchema); + + // test get . + assertEquals(". field not found in record. Acceptable fields were :[firstname, lastname, student]", + assertThrows(HoodieException.class, () -> + HoodieAvroUtils.getNestedFieldVal(rec, ".", false, false)).getMessage()); + + // test get fake_key + assertEquals("fake_key(Part -fake_key) field not found in record. Acceptable fields were :[firstname, lastname, student]", + assertThrows(HoodieException.class, () -> + HoodieAvroUtils.getNestedFieldVal(rec, "fake_key", false, false)).getMessage()); + + // test get student(null) + assertNull(HoodieAvroUtils.getNestedFieldVal(rec, "student", false, false)); + + // test get student + GenericRecord studentRecord = new GenericData.Record(rec.getSchema().getField("student").schema()); + studentRecord.put("firstname", "person"); + rec.put("student", studentRecord); + assertEquals(studentRecord, HoodieAvroUtils.getNestedFieldVal(rec, "student", false, false)); + + // test get student.fake_key + assertEquals("student.fake_key(Part -fake_key) field not found in record. Acceptable fields were :[firstname, lastname]", + assertThrows(HoodieException.class, () -> + HoodieAvroUtils.getNestedFieldVal(rec, "student.fake_key", false, false)).getMessage()); + + // test get student.firstname + assertEquals("person", HoodieAvroUtils.getNestedFieldVal(rec, "student.firstname", false, false)); + + // test get student.lastname(null) + assertNull(HoodieAvroUtils.getNestedFieldVal(rec, "student.lastname", false, false)); + + // test get student.firstname.fake_key + assertEquals("Cannot find a record at part value :firstname", + assertThrows(HoodieException.class, () -> + HoodieAvroUtils.getNestedFieldVal(rec, "student.firstname.fake_key", false, false)).getMessage()); + + // test get student.lastname(null).fake_key + assertEquals("Cannot find a record at part value :lastname", + assertThrows(HoodieException.class, () -> + HoodieAvroUtils.getNestedFieldVal(rec, "student.lastname.fake_key", false, false)).getMessage()); + } + + @Test + public void testGetNestedFieldValWithDecimalField() { + GenericRecord rec = new GenericData.Record(new Schema.Parser().parse(SCHEMA_WITH_DECIMAL_FIELD)); + rec.put("key_col", "key"); + BigDecimal bigDecimal = new BigDecimal("1234.5678"); + ByteBuffer byteBuffer = ByteBuffer.wrap(bigDecimal.unscaledValue().toByteArray()); + rec.put("decimal_col", byteBuffer); + + Object decimalCol = HoodieAvroUtils.getNestedFieldVal(rec, "decimal_col", true, false); + assertEquals(bigDecimal, decimalCol); + + Object obj = rec.get(1); + assertTrue(obj instanceof ByteBuffer); + ByteBuffer buffer = (ByteBuffer) obj; + assertEquals(0, buffer.position()); + } + + @Test + public void testGetNestedFieldSchema() throws IOException { + Schema schema = SchemaTestUtil.getEvolvedSchema(); + GenericRecord rec = new GenericData.Record(schema); + rec.put("field1", "key1"); + rec.put("field2", "val1"); + rec.put("name", "val2"); + rec.put("favorite_number", 2); + // test simple field schema + assertEquals(Schema.create(Schema.Type.STRING), 
getNestedFieldSchemaFromWriteSchema(rec.getSchema(), "field1")); + + GenericRecord rec2 = new GenericData.Record(schema); + rec2.put("field1", "key1"); + rec2.put("field2", "val1"); + rec2.put("name", "val2"); + rec2.put("favorite_number", 12); + // test comparison of non-string type + assertEquals(-1, GenericData.get().compare(rec.get("favorite_number"), rec2.get("favorite_number"), getNestedFieldSchemaFromWriteSchema(rec.getSchema(), "favorite_number"))); + + // test nested field schema + Schema nestedSchema = new Schema.Parser().parse(SCHEMA_WITH_NESTED_FIELD); + GenericRecord rec3 = new GenericData.Record(nestedSchema); + rec3.put("firstname", "person1"); + rec3.put("lastname", "person2"); + GenericRecord studentRecord = new GenericData.Record(rec3.getSchema().getField("student").schema()); + studentRecord.put("firstname", "person1"); + studentRecord.put("lastname", "person2"); + rec3.put("student", studentRecord); + + assertEquals(Schema.create(Schema.Type.STRING), getNestedFieldSchemaFromWriteSchema(rec3.getSchema(), "student.firstname")); + assertEquals(Schema.create(Schema.Type.STRING), getNestedFieldSchemaFromWriteSchema(nestedSchema, "student.firstname")); + } + + @Test + public void testReWriteAvroRecordWithNewSchema() { + Schema nestedSchema = new Schema.Parser().parse(SCHEMA_WITH_NESTED_FIELD); + GenericRecord rec3 = new GenericData.Record(nestedSchema); + rec3.put("firstname", "person1"); + rec3.put("lastname", "person2"); + GenericRecord studentRecord = new GenericData.Record(rec3.getSchema().getField("student").schema()); + studentRecord.put("firstname", "person1"); + studentRecord.put("lastname", "person2"); + rec3.put("student", studentRecord); + + Schema nestedSchemaRename = new Schema.Parser().parse(SCHEMA_WITH_NESTED_FIELD_RENAMED); + Map colRenames = new HashMap<>(); + colRenames.put("fn", "firstname"); + colRenames.put("ln", "lastname"); + colRenames.put("ss", "student"); + colRenames.put("ss.fn", "firstname"); + colRenames.put("ss.ln", "lastname"); + GenericRecord studentRecordRename = HoodieAvroUtils.rewriteRecordWithNewSchema(rec3, nestedSchemaRename, colRenames); + Assertions.assertEquals(GenericData.get().validate(nestedSchemaRename, studentRecordRename), true); + } + + @Test + public void testConvertDaysToDate() { + Date now = new Date(System.currentTimeMillis()); + int days = HoodieAvroUtils.fromJavaDate(now); + assertEquals(now.toLocalDate(), HoodieAvroUtils.toJavaDate(days).toLocalDate()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroWriteSupport.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroWriteSupport.java deleted file mode 100644 index 72f1453d55034..0000000000000 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroWriteSupport.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.avro; - -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.bloom.BloomFilterTypeCode; -import org.apache.hudi.common.model.HoodieRecord; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.Path; -import org.apache.parquet.avro.AvroSchemaConverter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.UUID; - -public class TestHoodieAvroWriteSupport { - - @Test - public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException { - List rowKeys = new ArrayList<>(); - for (int i = 0; i < 1000; i++) { - rowKeys.add(UUID.randomUUID().toString()); - } - String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString(); - Schema schema = HoodieAvroUtils.getRecordKeySchema(); - BloomFilter filter = BloomFilterFactory.createBloomFilter( - 1000, 0.0001, 10000, - BloomFilterTypeCode.SIMPLE.name()); - HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( - new AvroSchemaConverter().convert(schema), schema, filter); - ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP, - 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE); - for (String rowKey : rowKeys) { - GenericRecord rec = new GenericData.Record(schema); - rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey); - writer.write(rec); - writeSupport.add(rowKey); - } - writer.close(); - } -} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/bootstrap/TestBootstrapIndex.java b/hudi-common/src/test/java/org/apache/hudi/common/bootstrap/TestBootstrapIndex.java index bbe75cf893770..5b7147111a3ab 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/bootstrap/TestBootstrapIndex.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/bootstrap/TestBootstrapIndex.java @@ -24,13 +24,18 @@ import org.apache.hudi.common.bootstrap.index.BootstrapIndex; import org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter; import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex; +import org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex; import org.apache.hudi.common.model.BootstrapFileMapping; import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsAction; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.ArrayList; @@ -40,6 +45,7 @@ import java.util.Date; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.UUID; import java.util.concurrent.ExecutorService; @@ -48,9 +54,6 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import static 
org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -62,7 +65,7 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness { private static final String[] PARTITIONS = {"2020/03/18", "2020/03/19", "2020/03/20", "2020/03/21"}; private static final Set PARTITION_SET = Arrays.stream(PARTITIONS).collect(Collectors.toSet()); - private static final String BOOTSTRAP_BASE_PATH = "/tmp/source/parquet_tables/table1"; + private static final String BOOTSTRAP_BASE_PATH = "/tmp/source/data_tables/table1"; @BeforeEach public void init() throws IOException { @@ -85,6 +88,19 @@ public void testBootstrapIndexRecreateIndex() throws IOException { testBootstrapIndexOneRound(5); } + @Test + public void testNoOpBootstrapIndex() throws IOException { + Properties props = metaClient.getTableConfig().getProps(); + props.put(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE.key(), "false"); + Properties properties = new Properties(); + properties.putAll(props); + HoodieTableConfig.create(metaClient.getFs(), new Path(metaClient.getMetaPath()), properties); + + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).build(); + BootstrapIndex bootstrapIndex = BootstrapIndex.getBootstrapIndex(metaClient); + assert (bootstrapIndex instanceof NoOpBootstrapIndex); + } + @Test public void testBootstrapIndexConcurrent() throws Exception { Map> bootstrapMapping = generateBootstrapIndex(metaClient, BOOTSTRAP_BASE_PATH, PARTITIONS, 100); @@ -168,7 +184,7 @@ private static Map> generateBootstrapMapping( return Arrays.stream(partitions).map(partition -> { return Pair.of(partition, IntStream.range(0, numEntriesPerPartition).mapToObj(idx -> { String hudiFileId = UUID.randomUUID().toString(); - String sourceFileName = idx + ".parquet"; + String sourceFileName = idx + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); HoodieFileStatus sourceFileStatus = HoodieFileStatus.newBuilder() .setPath(HoodiePath.newBuilder().setUri(sourceBasePath + "/" + partition + "/" + sourceFileName).build()) .setLength(256 * 1024 * 1024L) diff --git a/hudi-common/src/test/java/org/apache/hudi/common/config/TestConfigProperty.java b/hudi-common/src/test/java/org/apache/hudi/common/config/TestConfigProperty.java new file mode 100644 index 0000000000000..6cbb9bd48e496 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/config/TestConfigProperty.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.config; + +import org.apache.hudi.common.util.Option; +import org.junit.jupiter.api.Test; + +import java.util.Properties; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestConfigProperty extends HoodieConfig { + + public static ConfigProperty FAKE_STRING_CONFIG = ConfigProperty + .key("test.fake.string.config") + .defaultValue("1") + .withAlternatives("test.fake.string.alternative.config") + .withDocumentation("Fake config only for testing"); + + public static ConfigProperty FAKE_BOOLEAN_CONFIG = ConfigProperty + .key("test.fake.boolean.config") + .defaultValue("false") + .withDocumentation("Fake config only for testing"); + + public static ConfigProperty FAKE_BOOLEAN_CONFIG_NO_DEFAULT = ConfigProperty + .key("test.fake.boolean.config") + .noDefaultValue() + .withDocumentation("Fake config only for testing"); + + public static ConfigProperty FAKE_INTEGER_CONFIG = ConfigProperty + .key("test.fake.integer.config") + .defaultValue(0) + .withInferFunction(p -> { + if (p.contains(FAKE_STRING_CONFIG) && p.getString(FAKE_STRING_CONFIG).equals("5")) { + return Option.of(100); + } + return Option.empty(); + }) + .withDocumentation("Fake config only for testing"); + + @Test + public void testGetTypedValue() { + HoodieConfig hoodieConfig = new HoodieConfig(); + assertNull(hoodieConfig.getInt(FAKE_STRING_CONFIG)); + hoodieConfig.setValue(FAKE_STRING_CONFIG, "5"); + assertEquals(5, hoodieConfig.getInt(FAKE_STRING_CONFIG)); + + assertEquals(false, hoodieConfig.getBoolean(FAKE_BOOLEAN_CONFIG)); + hoodieConfig.setValue(FAKE_BOOLEAN_CONFIG, "true"); + assertEquals(true, hoodieConfig.getBoolean(FAKE_BOOLEAN_CONFIG)); + } + + @Test + public void testGetBooleanShouldReturnFalseWhenDefaultValueFalseButNotSet() { + HoodieConfig hoodieConfig = new HoodieConfig(); + assertEquals(false, hoodieConfig.getBoolean(FAKE_BOOLEAN_CONFIG)); + } + + @Test + public void testGetBooleanShouldReturnNullWhenNoDefaultValuePresent() { + HoodieConfig hoodieConfig = new HoodieConfig(); + assertNull(hoodieConfig.getBoolean(FAKE_BOOLEAN_CONFIG_NO_DEFAULT)); + } + + @Test + public void testGetOrDefault() { + Properties props = new Properties(); + props.put("test.unknown.config", "abc"); + HoodieConfig hoodieConfig = new HoodieConfig(props); + assertEquals("1", hoodieConfig.getStringOrDefault(FAKE_STRING_CONFIG)); + assertEquals("2", hoodieConfig.getStringOrDefault(FAKE_STRING_CONFIG, "2")); + } + + @Test + public void testAlternatives() { + Properties props = new Properties(); + props.put("test.fake.string.alternative.config", "1"); + HoodieConfig hoodieConfig = new HoodieConfig(props); + assertTrue(hoodieConfig.contains(FAKE_STRING_CONFIG)); + assertEquals("1", hoodieConfig.getString(FAKE_STRING_CONFIG)); + } + + @Test + public void testInference() { + HoodieConfig hoodieConfig1 = new HoodieConfig(); + hoodieConfig1.setDefaultValue(FAKE_INTEGER_CONFIG); + assertEquals(0, hoodieConfig1.getInt(FAKE_INTEGER_CONFIG)); + + HoodieConfig hoodieConfig2 = new HoodieConfig(); + hoodieConfig2.setValue(FAKE_STRING_CONFIG, "5"); + hoodieConfig2.setDefaultValue(FAKE_INTEGER_CONFIG); + assertEquals(100, hoodieConfig2.getInt(FAKE_INTEGER_CONFIG)); + } + + @Test + public void testSetDefaults() { + setDefaults(this.getClass().getName()); + assertEquals(3, getProps().size()); + } +} diff --git 
a/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListData.java b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListData.java new file mode 100644 index 0000000000000..ea19f128d1a98 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListData.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.util.collection.Pair; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class TestHoodieListData { + + private static Stream distinctWithKey() { + return Stream.of( + Arguments.of( + Arrays.asList(Pair.of("k1", 1), Pair.of("k2", 2)), + Arrays.asList(Pair.of("k1", 1), Pair.of("k1", 10), Pair.of("k1", 100), Pair.of("k2", 2))) + ); + } + + @ParameterizedTest + @MethodSource + void distinctWithKey(List> expected, List> originalList) { + List> distinctList = HoodieListData.eager(originalList).distinctWithKey(Pair::getLeft, 1).collectAsList(); + assertEquals(expected, distinctList); + } + + @Test + void testEagerSemantic() { + List sourceList = Arrays.asList("quick", "brown", "fox"); + + HoodieListData originalListData = HoodieListData.eager(sourceList); + HoodieData lengthsListData = originalListData.map(String::length); + + List expectedLengths = sourceList.stream().map(String::length).collect(Collectors.toList()); + assertEquals(expectedLengths, lengthsListData.collectAsList()); + // Here we assert that even though we already de-referenced derivative container, + // we still can dereference its parent (multiple times) + assertEquals(3, originalListData.count()); + assertEquals(sourceList, originalListData.collectAsList()); + } + + @Test + public void testGetNumPartitions() { + HoodieData listData = HoodieListData.eager( + IntStream.rangeClosed(0, 100).boxed().collect(Collectors.toList())); + assertEquals(1, listData.getNumPartitions()); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListDataPairData.java b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListDataPairData.java new file mode 100644 index 0000000000000..bb65909230da0 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListDataPairData.java @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.data; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import static org.apache.hudi.common.util.CollectionUtils.createImmutableList; +import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestHoodieListDataPairData { + + private static final String KEY1 = "key1"; + private static final String KEY2 = "key2"; + private static final String KEY3 = "key3"; + private static final String KEY4 = "key4"; + private static final String KEY5 = "key5"; + + private static final String STRING_VALUE1 = "value1"; + private static final String STRING_VALUE2 = "value2"; + private static final String STRING_VALUE3 = "value3"; + private static final String STRING_VALUE4 = "value4"; + private static final String STRING_VALUE5 = "value5"; + private static final String STRING_VALUE6 = "value6"; + + private static final int INTEGER_VALUE1 = 1; + private static final int INTEGER_VALUE2 = 2; + private static final int INTEGER_VALUE3 = 3; + private static final int INTEGER_VALUE4 = 4; + private static final int INTEGER_VALUE5 = 5; + + private List> testPairs; + private HoodiePairData testHoodiePairData; + + @BeforeEach + public void setup() { + testPairs = constructPairs(); + testHoodiePairData = HoodieListPairData.lazy(testPairs); + } + + @Test + public void testKeys() { + assertHoodieDataEquals(Arrays.asList(KEY1, KEY1, KEY2, KEY2, KEY3, KEY4), testHoodiePairData.keys()); + } + + @Test + public void testValues() { + assertHoodieDataEquals(Arrays.asList( + STRING_VALUE1, STRING_VALUE2, STRING_VALUE3, STRING_VALUE4, STRING_VALUE5, STRING_VALUE6), + testHoodiePairData.values()); + } + + @Test + public void testCount() { + assertEquals(6, testHoodiePairData.count()); + } + + @Test + public void testCountByKey() { + Map expectedResultMap = new HashMap<>(); + expectedResultMap.put(KEY1, 2L); + expectedResultMap.put(KEY2, 2L); + expectedResultMap.put(KEY3, 1L); + expectedResultMap.put(KEY4, 1L); + + assertEquals(expectedResultMap, testHoodiePairData.countByKey()); + } + + @Test + public void testMap() { + assertHoodieDataEquals(Arrays.asList( + "key1,value1", "key1,value2", 
"key2,value3", "key2,value4", "key3,value5", "key4,value6"), + testHoodiePairData.map(pair -> pair.getKey() + "," + pair.getValue())); + } + + @Test + public void testMapToPair() { + Map> expectedResultMap = new HashMap<>(); + expectedResultMap.put("key10", Arrays.asList(1, 2)); + expectedResultMap.put("key20", Arrays.asList(3, 4)); + expectedResultMap.put("key30", Arrays.asList(5)); + expectedResultMap.put("key40", Arrays.asList(6)); + assertEquals(expectedResultMap, toMap( + testHoodiePairData.mapToPair( + pair -> { + String value = pair.getValue(); + return new ImmutablePair<>(pair.getKey() + "0", + Integer.parseInt(String.valueOf(value.charAt(value.length() - 1)))); + }))); + } + + private static Stream testReduceByKey() { + return Stream.of( + Arguments.of( + createImmutableMap( + Pair.of(1, createImmutableList(1001)), + Pair.of(2, createImmutableList(2001)), + Pair.of(3, createImmutableList(3001))), + createImmutableMap( + Pair.of(1, createImmutableList(1001, 1002, 1003)), + Pair.of(2, createImmutableList(2001, 2002)), + Pair.of(3, createImmutableList(3001)), + Pair.of(4, createImmutableList()))) + ); + } + + @ParameterizedTest + @MethodSource + public void testReduceByKey(Map> expected, Map> original) { + HoodiePairData reduced = HoodieListPairData.lazy(original).reduceByKey((a, b) -> a, 1); + assertEquals(expected, toMap(reduced)); + } + + @Test + public void testLeftOuterJoinSingleValuePerKey() { + HoodiePairData pairData1 = HoodieListPairData.lazy(Arrays.asList( + ImmutablePair.of(KEY1, STRING_VALUE1), + ImmutablePair.of(KEY2, STRING_VALUE2), + ImmutablePair.of(KEY3, STRING_VALUE3), + ImmutablePair.of(KEY4, STRING_VALUE4) + )); + + HoodiePairData pairData2 = HoodieListPairData.lazy(Arrays.asList( + ImmutablePair.of(KEY1, INTEGER_VALUE1), + ImmutablePair.of(KEY2, INTEGER_VALUE2), + ImmutablePair.of(KEY5, INTEGER_VALUE3) + )); + + Map>>> expectedResultMap = new HashMap<>(); + expectedResultMap.put(KEY1, Arrays.asList( + ImmutablePair.of(STRING_VALUE1, Option.of(INTEGER_VALUE1)))); + expectedResultMap.put(KEY2, Arrays.asList( + ImmutablePair.of(STRING_VALUE2, Option.of(INTEGER_VALUE2)))); + expectedResultMap.put(KEY3, Arrays.asList( + ImmutablePair.of(STRING_VALUE3, Option.empty()))); + expectedResultMap.put(KEY4, Arrays.asList( + ImmutablePair.of(STRING_VALUE4, Option.empty()))); + + assertEquals(expectedResultMap, + toMap(pairData1.leftOuterJoin(pairData2))); + } + + @Test + public void testLeftOuterJoinMultipleValuesPerKey() { + HoodiePairData otherPairData = HoodieListPairData.lazy(Arrays.asList( + ImmutablePair.of(KEY1, INTEGER_VALUE1), + ImmutablePair.of(KEY2, INTEGER_VALUE2), + ImmutablePair.of(KEY2, INTEGER_VALUE3), + ImmutablePair.of(KEY3, INTEGER_VALUE4), + ImmutablePair.of(KEY5, INTEGER_VALUE5) + )); + + Map>>> expectedResultMap = new HashMap<>(); + expectedResultMap.put(KEY1, Arrays.asList( + ImmutablePair.of(STRING_VALUE1, Option.of(INTEGER_VALUE1)), + ImmutablePair.of(STRING_VALUE2, Option.of(INTEGER_VALUE1)))); + expectedResultMap.put(KEY2, Arrays.asList( + ImmutablePair.of(STRING_VALUE3, Option.of(INTEGER_VALUE2)), + ImmutablePair.of(STRING_VALUE3, Option.of(INTEGER_VALUE3)), + ImmutablePair.of(STRING_VALUE4, Option.of(INTEGER_VALUE2)), + ImmutablePair.of(STRING_VALUE4, Option.of(INTEGER_VALUE3)))); + expectedResultMap.put(KEY3, Arrays.asList( + ImmutablePair.of(STRING_VALUE5, Option.of(INTEGER_VALUE4)))); + expectedResultMap.put(KEY4, Arrays.asList( + ImmutablePair.of(STRING_VALUE6, Option.empty()))); + + assertEquals(expectedResultMap, + 
toMap(testHoodiePairData.leftOuterJoin(otherPairData))); + } + + @Test + void testEagerSemantic() { + List> sourceList = + Stream.of("quick", "brown", "fox") + .map(s -> Pair.of(s, s.length())) + .collect(Collectors.toList()); + + HoodieListPairData originalListData = HoodieListPairData.eager(sourceList); + HoodieData lengthsListData = originalListData.values(); + + List expectedLengths = sourceList.stream().map(Pair::getValue).collect(Collectors.toList()); + assertEquals(expectedLengths, lengthsListData.collectAsList()); + // Here we assert that even though we already de-referenced derivative container, + // we still can dereference its parent (multiple times) + assertEquals(3, originalListData.count()); + assertEquals(sourceList, originalListData.collectAsList()); + } + + private static List> constructPairs() { + return Arrays.asList( + ImmutablePair.of(KEY1, STRING_VALUE1), + ImmutablePair.of(KEY1, STRING_VALUE2), + ImmutablePair.of(KEY2, STRING_VALUE3), + ImmutablePair.of(KEY2, STRING_VALUE4), + ImmutablePair.of(KEY3, STRING_VALUE5), + ImmutablePair.of(KEY4, STRING_VALUE6) + ); + } + + private static Map> toMap(HoodiePairData pairData) { + return ((List>>) pairData.groupByKey().get()).stream() + .collect( + Collectors.toMap( + p -> p.getKey(), + p -> StreamSupport.stream(p.getValue().spliterator(), false).collect(Collectors.toList()) + ) + ); + } + + private static void addPairsToMap( + Map> map, final List> pairs) { + for (Pair pair : pairs) { + String key = pair.getKey(); + V value = pair.getValue(); + List list = map.computeIfAbsent(key, k -> new ArrayList<>()); + list.add(value); + } + } + + private void assertHoodieDataEquals( + List expectedList, HoodieData hoodieData) { + assertHoodieDataEquals(expectedList, hoodieData, Comparator.naturalOrder()); + } + + private void assertHoodieDataEquals( + List expectedList, HoodieData hoodieData, Comparator comparator) { + assertEquals(expectedList, + hoodieData.collectAsList().stream().sorted(comparator).collect(Collectors.toList()) + ); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java index f1d80785c0184..481bb1dd452da 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java @@ -18,38 +18,46 @@ package org.apache.hudi.common.fs; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.Rule; import org.junit.contrib.java.lang.system.EnvironmentVariables; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.io.File; import 
java.io.IOException; +import java.net.URI; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Date; import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.apache.hudi.common.table.timeline.HoodieActiveTimeline.COMMIT_FORMATTER; +import static org.apache.hudi.common.model.HoodieFileFormat.HOODIE_LOG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; /** @@ -61,6 +69,7 @@ public class TestFSUtils extends HoodieCommonTestHarness { private final long minCleanToKeep = 10; private static String TEST_WRITE_TOKEN = "1-0-1"; + public static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); @Rule public final EnvironmentVariables environmentVariables = new EnvironmentVariables(); @@ -68,20 +77,21 @@ public class TestFSUtils extends HoodieCommonTestHarness { @BeforeEach public void setUp() throws IOException { initMetaClient(); + basePath = "file:" + basePath; } @Test public void testMakeDataFileName() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatDate(new Date()); String fileName = UUID.randomUUID().toString(); - assertEquals(FSUtils.makeDataFileName(instantTime, TEST_WRITE_TOKEN, fileName), fileName + "_" + TEST_WRITE_TOKEN + "_" + instantTime + ".parquet"); + assertEquals(FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName), fileName + "_" + TEST_WRITE_TOKEN + "_" + instantTime + BASE_FILE_EXTENSION); } @Test public void testMaskFileName() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatDate(new Date()); int taskPartitionId = 2; - assertEquals(FSUtils.maskWithoutFileId(instantTime, taskPartitionId), "*_" + taskPartitionId + "_" + instantTime + ".parquet"); + assertEquals(FSUtils.maskWithoutFileId(instantTime, taskPartitionId), "*_" + taskPartitionId + "_" + instantTime + BASE_FILE_EXTENSION); } @Test @@ -105,9 +115,12 @@ public void testProcessFiles() throws Exception { }); // Files inside partitions and marker directories - List files = Arrays.asList("2016/04/15/1_1-0-1_20190528120000.parquet", - "2016/05/16/2_1-0-1_20190528120000.parquet", ".hoodie/.temp/2/2016/05/16/2_1-0-1_20190528120000.parquet", - ".hoodie/.temp/2/2016/04/15/1_1-0-1_20190528120000.parquet"); + List files = Stream.of("2016/04/15/1_1-0-1_20190528120000", + "2016/05/16/2_1-0-1_20190528120000", + ".hoodie/.temp/2/2016/05/16/2_1-0-1_20190528120000", + ".hoodie/.temp/2/2016/04/15/1_1-0-1_20190528120000") + .map(fileName -> fileName + BASE_FILE_EXTENSION) + .collect(Collectors.toList()); files.forEach(f -> { try { @@ -144,17 +157,20 @@ public void testProcessFiles() throws Exception { @Test public void testGetCommitTime() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatDate(new Date()); String fileName = UUID.randomUUID().toString(); - String fullFileName = FSUtils.makeDataFileName(instantTime, TEST_WRITE_TOKEN, fileName); + String fullFileName = 
FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName); + assertEquals(instantTime, FSUtils.getCommitTime(fullFileName)); + // test log file name + fullFileName = FSUtils.makeLogFileName(fileName, HOODIE_LOG.getFileExtension(), instantTime, 1, TEST_WRITE_TOKEN); assertEquals(instantTime, FSUtils.getCommitTime(fullFileName)); } @Test public void testGetFileNameWithoutMeta() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatDate(new Date()); String fileName = UUID.randomUUID().toString(); - String fullFileName = FSUtils.makeDataFileName(instantTime, TEST_WRITE_TOKEN, fileName); + String fullFileName = FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName); assertEquals(fileName, FSUtils.getFileId(fullFileName)); } @@ -174,6 +190,9 @@ public void testGetRelativePartitionPath() { Path basePath = new Path("/test/apache"); Path partitionPath = new Path("/test/apache/hudi/sub"); assertEquals("hudi/sub", FSUtils.getRelativePartitionPath(basePath, partitionPath)); + + Path nonPartitionPath = new Path("/test/something/else"); + assertThrows(IllegalArgumentException.class, () -> FSUtils.getRelativePartitionPath(basePath, nonPartitionPath)); } @Test @@ -273,51 +292,6 @@ public static String makeOldLogFileName(String fileId, String logFileExtension, return "." + String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version); } - @Test - public void testDeleteOlderRollbackFiles() throws Exception { - String[] instantTimes = new String[]{"20160501010101", "20160501020101", "20160501030101", "20160501040101", - "20160502020601", "20160502030601", "20160502040601", "20160502050601", "20160506030611", - "20160506040611", "20160506050611", "20160506060611"}; - List hoodieInstants = new ArrayList<>(); - // create rollback files - for (String instantTime : instantTimes) { - Files.createFile(Paths.get(basePath, - HoodieTableMetaClient.METAFOLDER_NAME, - instantTime + HoodieTimeline.ROLLBACK_EXTENSION)); - hoodieInstants.add(new HoodieInstant(false, HoodieTimeline.ROLLBACK_ACTION, instantTime)); - } - - String metaPath = Paths.get(basePath, ".hoodie").toString(); - FSUtils.deleteOlderRollbackMetaFiles(FSUtils.getFs(basePath, new Configuration()), - metaPath, hoodieInstants.stream()); - File[] rollbackFiles = new File(metaPath).listFiles((dir, name) - -> name.contains(HoodieTimeline.ROLLBACK_EXTENSION)); - assertNotNull(rollbackFiles); - assertEquals(rollbackFiles.length, minRollbackToKeep); - } - - @Test - public void testDeleteOlderCleanMetaFiles() throws Exception { - String[] instantTimes = new String[]{"20160501010101", "20160501020101", "20160501030101", "20160501040101", - "20160502020601", "20160502030601", "20160502040601", "20160502050601", "20160506030611", - "20160506040611", "20160506050611", "20160506060611"}; - List hoodieInstants = new ArrayList<>(); - // create rollback files - for (String instantTime : instantTimes) { - Files.createFile(Paths.get(basePath, - HoodieTableMetaClient.METAFOLDER_NAME, - instantTime + HoodieTimeline.CLEAN_EXTENSION)); - hoodieInstants.add(new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, instantTime)); - } - String metaPath = Paths.get(basePath, ".hoodie").toString(); - FSUtils.deleteOlderCleanMetaFiles(FSUtils.getFs(basePath, new Configuration()), - metaPath, hoodieInstants.stream()); - File[] cleanFiles = new File(metaPath).listFiles((dir, name) - -> name.contains(HoodieTimeline.CLEAN_EXTENSION)); - assertNotNull(cleanFiles); - assertEquals(cleanFiles.length, 
minCleanToKeep); - } - @Test public void testFileNameRelatedFunctions() throws Exception { String instantTime = "20160501010101"; @@ -330,7 +304,7 @@ public void testFileNameRelatedFunctions() throws Exception { final String LOG_EXTENTION = "." + LOG_STR; // data file name - String dataFileName = FSUtils.makeDataFileName(instantTime, writeToken, fileId); + String dataFileName = FSUtils.makeBaseFileName(instantTime, writeToken, fileId); assertEquals(instantTime, FSUtils.getCommitTime(dataFileName)); assertEquals(fileId, FSUtils.getFileId(dataFileName)); @@ -342,7 +316,7 @@ public void testFileNameRelatedFunctions() throws Exception { assertEquals(LOG_STR, FSUtils.getFileExtensionFromLog(new Path(logFileName))); // create three versions of log file - java.nio.file.Path partitionPath = Paths.get(basePath, partitionStr); + java.nio.file.Path partitionPath = Paths.get(URI.create(basePath + "/" + partitionStr)); Files.createDirectories(partitionPath); String log1 = FSUtils.makeLogFileName(fileId, LOG_EXTENTION, instantTime, 1, writeToken); Files.createFile(partitionPath.resolve(log1)); @@ -356,4 +330,160 @@ public void testFileNameRelatedFunctions() throws Exception { assertEquals(4, FSUtils.computeNextLogVersion(FSUtils.getFs(basePath, new Configuration()), new Path(partitionPath.toString()), fileId, LOG_EXTENTION, instantTime)); } + + @Test + public void testGetFilename() { + assertEquals("file1.parquet", FSUtils.getFileName("/2022/07/29/file1.parquet", "/2022/07/29")); + assertEquals("file2.parquet", FSUtils.getFileName("2022/07/29/file2.parquet", "2022/07/29")); + assertEquals("file3.parquet", FSUtils.getFileName("/file3.parquet", "")); + assertEquals("file4.parquet", FSUtils.getFileName("file4.parquet", "")); + } + + private void prepareTestDirectory(FileSystem fileSystem, String rootDir) throws IOException { + // Directory structure + // .hoodie/.temp/ + // - subdir1 + // - file1.txt + // - subdir2 + // - file2.txt + // - file3 + Path dirPath = new Path(rootDir); + String subDir1 = rootDir + "/subdir1"; + String file1 = subDir1 + "/file1.txt"; + String subDir2 = rootDir + "/subdir2"; + String file2 = subDir2 + "/file2.txt"; + String file3 = rootDir + "/file3.txt"; + String[] dirs = new String[]{rootDir, subDir1, subDir2}; + String[] files = new String[]{file1, file2, file3}; + // clean up first + cleanUpTestDirectory(fileSystem, rootDir); + for (String dir : dirs) { + fileSystem.mkdirs(new Path(dir)); + } + for (String filename : files) { + fileSystem.create(new Path(filename)); + } + } + + private void cleanUpTestDirectory(FileSystem fileSystem, String rootDir) throws IOException { + fileSystem.delete(new Path(rootDir), true); + } + + @Test + public void testDeleteExistingDir() throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + FileSystem fileSystem = metaClient.getFs(); + prepareTestDirectory(fileSystem, rootDir); + + Path rootDirPath = new Path(rootDir); + assertTrue(fileSystem.exists(rootDirPath)); + assertTrue(FSUtils.deleteDir( + new HoodieLocalEngineContext(metaClient.getHadoopConf()), fileSystem, rootDirPath, 2)); + assertFalse(fileSystem.exists(rootDirPath)); + } + + @Test + public void testDeleteNonExistingDir() throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + FileSystem fileSystem = metaClient.getFs(); + cleanUpTestDirectory(fileSystem, rootDir); + + assertFalse(FSUtils.deleteDir( + new HoodieLocalEngineContext(metaClient.getHadoopConf()), fileSystem, new Path(rootDir), 2)); + } + + @Test + public void testDeleteSubDirectoryRecursively() 
throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + String subPathStr = rootDir + "/subdir1"; + FileSystem fileSystem = metaClient.getFs(); + prepareTestDirectory(fileSystem, rootDir); + + assertTrue(FSUtils.deleteSubPath( + subPathStr, new SerializableConfiguration(fileSystem.getConf()), true)); + } + + @Test + public void testDeleteSubDirectoryNonRecursively() throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + String subPathStr = rootDir + "/subdir1"; + FileSystem fileSystem = metaClient.getFs(); + prepareTestDirectory(fileSystem, rootDir); + + assertThrows( + HoodieIOException.class, + () -> FSUtils.deleteSubPath( + subPathStr, new SerializableConfiguration(fileSystem.getConf()), false)); + } + + @Test + public void testDeleteSubPathAsFile() throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + String subPathStr = rootDir + "/file3.txt"; + FileSystem fileSystem = metaClient.getFs(); + prepareTestDirectory(fileSystem, rootDir); + + assertTrue(FSUtils.deleteSubPath( + subPathStr, new SerializableConfiguration(fileSystem.getConf()), false)); + } + + @Test + public void testDeleteNonExistingSubDirectory() throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + String subPathStr = rootDir + "/subdir10"; + FileSystem fileSystem = metaClient.getFs(); + cleanUpTestDirectory(fileSystem, rootDir); + + assertFalse(FSUtils.deleteSubPath( + subPathStr, new SerializableConfiguration(fileSystem.getConf()), true)); + } + + @Test + public void testParallelizeSubPathProcessWithExistingDir() throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + FileSystem fileSystem = metaClient.getFs(); + prepareTestDirectory(fileSystem, rootDir); + Map> result = FSUtils.parallelizeSubPathProcess( + new HoodieLocalEngineContext(fileSystem.getConf()), fileSystem, new Path(rootDir), 2, + fileStatus -> !fileStatus.getPath().getName().contains("1"), + pairOfSubPathAndConf -> { + Path subPath = new Path(pairOfSubPathAndConf.getKey()); + List listFiles = new ArrayList<>(); + try { + FileSystem fs = subPath.getFileSystem(pairOfSubPathAndConf.getValue().get()); + FileStatus[] fileStatuses = fs.listStatus(subPath); + listFiles = Arrays.stream(fileStatuses) + .map(fileStatus -> fileStatus.getPath().getName()).collect(Collectors.toList()); + } catch (IOException e) { + e.printStackTrace(); + } + return listFiles; + } + ); + assertEquals(2, result.size()); + for (String subPath : result.keySet()) { + if (subPath.contains("subdir2")) { + assertEquals(Collections.singletonList("file2.txt"), result.get(subPath)); + } else if (subPath.contains("file3")) { + assertEquals(Collections.singletonList("file3.txt"), result.get(subPath)); + } + } + } + + @Test + public void testGetFileStatusAtLevel() throws IOException { + String rootDir = basePath + "/.hoodie/.temp"; + FileSystem fileSystem = metaClient.getFs(); + prepareTestDirectory(fileSystem, rootDir); + List fileStatusList = FSUtils.getFileStatusAtLevel( + new HoodieLocalEngineContext(fileSystem.getConf()), fileSystem, + new Path(basePath), 3, 2); + assertEquals(CollectionUtils.createImmutableSet( + basePath + "/.hoodie/.temp/subdir1/file1.txt", + basePath + "/.hoodie/.temp/subdir2/file2.txt"), + fileStatusList.stream() + .map(fileStatus -> fileStatus.getPath().toString()) + .filter(filePath -> filePath.endsWith(".txt")) + .collect(Collectors.toSet())); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtilsWithRetryWrapperEnable.java 
b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtilsWithRetryWrapperEnable.java new file mode 100644 index 0000000000000..0b849ebec8185 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtilsWithRetryWrapperEnable.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.fs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.Progressable; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.Arrays; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * Tests file system utils with the retry wrapper enabled. + * Extends TestFSUtils and sets up a HoodieWrapperFileSystem for the metaClient, so all of the TestFSUtils unit tests also run with the retry wrapper enabled. + */ +public class TestFSUtilsWithRetryWrapperEnable extends TestFSUtils { + + private static final String EXCEPTION_MESSAGE = "Fake runtime exception here."; + private long maxRetryIntervalMs; + private int maxRetryNumbers; + private long initialRetryIntervalMs; + + @Override + @BeforeEach + public void setUp() throws IOException { + initMetaClient(); + basePath = "file:" + basePath; + FileSystemRetryConfig fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().withFileSystemActionRetryEnabled(true).build(); + maxRetryIntervalMs = fileSystemRetryConfig.getMaxRetryIntervalMs(); + maxRetryNumbers = fileSystemRetryConfig.getMaxRetryNumbers(); + initialRetryIntervalMs = fileSystemRetryConfig.getInitialRetryIntervalMs(); + + FakeRemoteFileSystem fakeFs = new FakeRemoteFileSystem(FSUtils.getFs(metaClient.getMetaPath(), metaClient.getHadoopConf()), 2); + FileSystem fileSystem = new HoodieRetryWrapperFileSystem(fakeFs, maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, ""); + +  HoodieWrapperFileSystem fs = new HoodieWrapperFileSystem(fileSystem, new NoOpConsistencyGuard()); + metaClient.setFs(fs); + } + + // Test the scenario where the fs keeps retrying until it fails.
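+ // Here the fake file system only succeeds on every 100th call, so the bounded retries in HoodieRetryWrapperFileSystem are exhausted and the RuntimeException propagates (asserted via assertThrows below).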
+ @Test + public void testProcessFilesWithExceptions() throws Exception { + FakeRemoteFileSystem fakeFs = new FakeRemoteFileSystem(FSUtils.getFs(metaClient.getMetaPath(), metaClient.getHadoopConf()), 100); + FileSystem fileSystem = new HoodieRetryWrapperFileSystem(fakeFs, maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, ""); + HoodieWrapperFileSystem fs = new HoodieWrapperFileSystem(fileSystem, new NoOpConsistencyGuard()); + metaClient.setFs(fs); + List folders = + Arrays.asList("2016/04/15", ".hoodie/.temp/2/2016/04/15"); + folders.forEach(f -> assertThrows(RuntimeException.class, () -> metaClient.getFs().mkdirs(new Path(new Path(basePath), f)))); + } + + /** + * Fake remote FileSystem which will throw RuntimeException something like AmazonS3Exception 503. + */ + class FakeRemoteFileSystem extends FileSystem { + + private FileSystem fs; + private int count = 1; + private int loop; + + public FakeRemoteFileSystem(FileSystem fs, int retryLoop) { + this.fs = fs; + this.loop = retryLoop; + } + + @Override + public URI getUri() { + return fs.getUri(); + } + + @Override + public FSDataInputStream open(Path f, int bufferSize) throws IOException { + if (count % loop == 0) { + count++; + return fs.open(f, bufferSize); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { + if (count % loop == 0) { + count++; + return fs.create(f, permission, overwrite, bufferSize, replication, blockSize, progress); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException { + if (count % loop == 0) { + count++; + return fs.append(f, bufferSize, progress); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public boolean rename(Path src, Path dst) throws IOException { + if (count % loop == 0) { + count++; + return fs.rename(src, dst); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public boolean delete(Path f, boolean recursive) throws IOException { + if (count % loop == 0) { + count++; + return fs.delete(f, recursive); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { + if (count % loop == 0) { + count++; + return fs.listStatus(f); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public void setWorkingDirectory(Path newDir) { + fs.setWorkingDirectory(newDir); + } + + @Override + public Path getWorkingDirectory() { + return fs.getWorkingDirectory(); + } + + @Override + public boolean mkdirs(Path f, FsPermission permission) throws IOException { + if (count % loop == 0) { + count++; + return fs.mkdirs(f, permission); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FileStatus getFileStatus(Path f) throws IOException { + if (count % loop == 0) { + count++; + return fs.getFileStatus(f); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public RemoteIterator listLocatedStatus(Path f) throws IOException { + return fs.listLocatedStatus(f); + } + + @Override + public Configuration getConf() { + return 
fs.getConf(); + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java index 9f12620384c36..354ad6d0cca31 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java @@ -18,8 +18,10 @@ package org.apache.hudi.common.fs; +import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -32,6 +34,7 @@ public class TestStorageSchemes { @Test public void testStorageSchemes() { assertTrue(StorageSchemes.isSchemeSupported("hdfs")); + assertTrue(StorageSchemes.isSchemeSupported("afs")); assertFalse(StorageSchemes.isSchemeSupported("s2")); assertFalse(StorageSchemes.isAppendSupported("s3a")); assertFalse(StorageSchemes.isAppendSupported("gs")); @@ -44,8 +47,33 @@ public void testStorageSchemes() { assertFalse(StorageSchemes.isAppendSupported("cosn")); assertFalse(StorageSchemes.isAppendSupported("dbfs")); assertFalse(StorageSchemes.isAppendSupported("cos")); + assertTrue(StorageSchemes.isAppendSupported("jfs")); + assertFalse(StorageSchemes.isAppendSupported("bos")); + assertFalse(StorageSchemes.isAppendSupported("ks3")); + assertTrue(StorageSchemes.isAppendSupported("ofs")); + assertFalse(StorageSchemes.isAppendSupported("oci")); assertThrows(IllegalArgumentException.class, () -> { StorageSchemes.isAppendSupported("s2"); }, "Should throw exception for unsupported schemes"); } + + @Test + public void testConversionToNewSchema() { + Path s3TablePath1 = new Path("s3://test.1234/table1"); + assertEquals(s3TablePath1, HoodieWrapperFileSystem.convertPathWithScheme(s3TablePath1, "s3")); + + Path s3TablePath2 = new Path("s3://1234.test/table1"); + assertEquals(s3TablePath2, HoodieWrapperFileSystem.convertPathWithScheme(s3TablePath2, "s3")); + + Path s3TablePath3 = new Path("s3://test1234/table1"); + assertEquals(s3TablePath3, HoodieWrapperFileSystem.convertPathWithScheme(s3TablePath3, "s3")); + + Path hdfsTablePath = new Path("hdfs://sandbox.foo.com:8020/test.1234/table1"); + assertEquals(hdfsTablePath, HoodieWrapperFileSystem.convertPathWithScheme(hdfsTablePath, "hdfs")); + + Path localTablePath = new Path("file:/var/table1"); + Path localTablePathNoPrefix = new Path("/var/table1"); + assertEquals(localTablePath, HoodieWrapperFileSystem.convertPathWithScheme(localTablePath, "file")); + assertEquals(localTablePath, HoodieWrapperFileSystem.convertPathWithScheme(localTablePathNoPrefix, "file")); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/InLineFSUtilsTest.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/InLineFSUtilsTest.java new file mode 100644 index 0000000000000..7d704c91126d6 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/InLineFSUtilsTest.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.fs.inline; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.testutils.FileSystemTestUtils; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Tests {@link InLineFileSystem}. + */ +public class InLineFSUtilsTest { + + private static Stream configParams() { + Long[] data = new Long[] { + 0L, + 1000L, + (long) Integer.MAX_VALUE + 1, + Long.MAX_VALUE + }; + return Stream.of(data).map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("configParams") + void startOffset(long startOffset) { + Path inlinePath = FileSystemTestUtils.getPhantomFile(FileSystemTestUtils.getRandomOuterFSPath(), startOffset, 0L); + assertEquals(startOffset, InLineFSUtils.startOffset(inlinePath)); + } + + @ParameterizedTest + @MethodSource("configParams") + void length(long inlineLength) { + Path inlinePath = FileSystemTestUtils.getPhantomFile(FileSystemTestUtils.getRandomOuterFSPath(), 0L, inlineLength); + assertEquals(inlineLength, InLineFSUtils.length(inlinePath)); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java index 4553aa5a923ab..88bd35ef4b536 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java @@ -296,6 +296,64 @@ public void testsetWorkingDirectory() throws IOException { }, "Should have thrown exception"); } + static class TestFSPath { + final Path inputPath; + final Path expectedInLineFSPath; + final Path transformedInputPath; + + TestFSPath(final Path inputPath, final Path expectedInLineFSPath, final Path transformedInputPath) { + this.inputPath = inputPath; + this.expectedInLineFSPath = expectedInLineFSPath; + this.transformedInputPath = transformedInputPath; + } + } + + @Test + public void testInLineFSPathConversions() { + final List expectedInLinePaths = Arrays.asList( + new TestFSPath( + new Path("/zero/524bae7e-f01d-47ae-b7cd-910400a81336"), + new Path("inlinefs://zero/524bae7e-f01d-47ae-b7cd-910400a81336/file/?start_offset=10&length=10"), + new Path("file:/zero/524bae7e-f01d-47ae-b7cd-910400a81336")), + new TestFSPath( + new Path("file:/one/524bae7e-f01d-47ae-b7cd-910400a81336"), + new Path("inlinefs://one/524bae7e-f01d-47ae-b7cd-910400a81336/file/?start_offset=10&length=10"), + new Path("file:/one/524bae7e-f01d-47ae-b7cd-910400a81336")), + new TestFSPath( + new Path("file://two/524bae7e-f01d-47ae-b7cd-910400a81336"), + new Path("inlinefs://two/524bae7e-f01d-47ae-b7cd-910400a81336/file/?start_offset=10&length=10"), + new Path("file:/two/524bae7e-f01d-47ae-b7cd-910400a81336")), + new TestFSPath( + new Path("hdfs://three/524bae7e-f01d-47ae-b7cd-910400a81336"), + new 
Path("inlinefs://three/524bae7e-f01d-47ae-b7cd-910400a81336/hdfs/?start_offset=10&length=10"), + new Path("hdfs://three/524bae7e-f01d-47ae-b7cd-910400a81336")), + new TestFSPath( + new Path("s3://four/524bae7e-f01d-47ae-b7cd-910400a81336"), + new Path("inlinefs://four/524bae7e-f01d-47ae-b7cd-910400a81336/s3/?start_offset=10&length=10"), + new Path("s3://four/524bae7e-f01d-47ae-b7cd-910400a81336")), + new TestFSPath( + new Path("s3a://five/524bae7e-f01d-47ae-b7cd-910400a81336"), + new Path("inlinefs://five/524bae7e-f01d-47ae-b7cd-910400a81336/s3a/?start_offset=10&length=10"), + new Path("s3a://five/524bae7e-f01d-47ae-b7cd-910400a81336")) + ); + + for (TestFSPath entry : expectedInLinePaths) { + final Path inputPath = entry.inputPath; + final Path expectedInLineFSPath = entry.expectedInLineFSPath; + final Path expectedTransformedInputPath = entry.transformedInputPath; + + String scheme = "file"; + if (inputPath.toString().contains(":")) { + scheme = inputPath.toString().split(":")[0]; + } + final Path actualInLineFSPath = InLineFSUtils.getInlineFilePath(inputPath, scheme, 10, 10); + assertEquals(expectedInLineFSPath, actualInLineFSPath); + + final Path actualOuterFilePath = InLineFSUtils.getOuterFilePathFromInlinePath(actualInLineFSPath); + assertEquals(expectedTransformedInputPath, actualOuterFilePath); + } + } + @Test public void testExists() throws IOException { Path inlinePath = getRandomInlinePath(); @@ -311,7 +369,6 @@ private Path getRandomInlinePath() { private void verifyFileStatus(FileStatus expected, Path inlinePath, long expectedLength, FileStatus actual) { assertEquals(inlinePath, actual.getPath()); assertEquals(expectedLength, actual.getLen()); - assertEquals(expected.getAccessTime(), actual.getAccessTime()); assertEquals(expected.getBlockSize(), actual.getBlockSize()); assertEquals(expected.getGroup(), actual.getGroup()); assertEquals(expected.getModificationTime(), actual.getModificationTime()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java index 4122d500b4356..190ad398e1b60 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java @@ -19,11 +19,13 @@ package org.apache.hudi.common.fs.inline; import org.apache.hudi.common.testutils.FileSystemTestUtils; +import org.apache.hudi.io.storage.HoodieHFileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.io.hfile.CacheConfig; @@ -38,10 +40,12 @@ import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.HashSet; import java.util.Set; import java.util.UUID; +import static org.apache.hadoop.hbase.CellComparatorImpl.COMPARATOR; import static org.apache.hudi.common.testutils.FileSystemTestUtils.FILE_SCHEME; import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM; import static org.apache.hudi.common.testutils.FileSystemTestUtils.getPhantomFile; @@ -55,11 +59,12 @@ */ public class TestInLineFileSystemHFileInLining { + private static final String LOCAL_FORMATTER = 
"%010d"; + private static final String VALUE_PREFIX = "value"; + private static final int MIN_BLOCK_BYTES = 1024; private final Configuration inMemoryConf; private final Configuration inlineConf; - private final int minBlockSize = 1024; - private static final String LOCAL_FORMATTER = "%010d"; - private int maxRows = 100 + RANDOM.nextInt(1000); + private final int maxRows = 100 + RANDOM.nextInt(1000); private Path generatedPath; public TestInLineFileSystemHFileInLining() { @@ -87,12 +92,11 @@ public void testSimpleInlineFileSystem() throws IOException { CacheConfig cacheConf = new CacheConfig(inMemoryConf); FSDataOutputStream fout = createFSOutput(outerInMemFSPath, inMemoryConf); HFileContext meta = new HFileContextBuilder() - .withBlockSize(minBlockSize) + .withBlockSize(MIN_BLOCK_BYTES).withCellComparator(COMPARATOR) .build(); HFile.Writer writer = HFile.getWriterFactory(inMemoryConf, cacheConf) .withOutputStream(fout) .withFileContext(meta) - .withComparator(new KeyValue.KVComparator()) .create(); writeRecords(writer); @@ -109,9 +113,8 @@ public void testSimpleInlineFileSystem() throws IOException { InLineFileSystem inlineFileSystem = (InLineFileSystem) inlinePath.getFileSystem(inlineConf); FSDataInputStream fin = inlineFileSystem.open(inlinePath); - HFile.Reader reader = HFile.createReader(inlineFileSystem, inlinePath, cacheConf, inlineConf); - // Load up the index. - reader.loadFileInfo(); + HFile.Reader reader = + HoodieHFileUtils.createHFileReader(inlineFileSystem, inlinePath, cacheConf, inlineConf); // Get a scanner that caches and that does not use pread. HFileScanner scanner = reader.getScanner(true, false); // Align scanner at start of the file. @@ -120,21 +123,24 @@ public void testSimpleInlineFileSystem() throws IOException { Set rowIdsToSearch = getRandomValidRowIds(10); for (int rowId : rowIdsToSearch) { - assertEquals(0, scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))), + KeyValue keyValue = new KeyValue.KeyOnlyKeyValue(getSomeKey(rowId)); + assertEquals(0, scanner.seekTo(keyValue), "location lookup failed"); // read the key and see if it matches - ByteBuffer readKey = scanner.getKey(); - assertArrayEquals(getSomeKey(rowId), Bytes.toBytes(readKey), "seeked key does not match"); - scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))); + Cell cell = scanner.getCell(); + byte[] key = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength()); + byte[] expectedKey = Arrays.copyOfRange(keyValue.getRowArray(), keyValue.getRowOffset(), keyValue.getRowOffset() + keyValue.getRowLength()); + assertArrayEquals(expectedKey, key, "seeked key does not match"); + scanner.seekTo(keyValue); ByteBuffer val1 = scanner.getValue(); - scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))); + scanner.seekTo(keyValue); ByteBuffer val2 = scanner.getValue(); assertArrayEquals(Bytes.toBytes(val1), Bytes.toBytes(val2)); } int[] invalidRowIds = {-4, maxRows, maxRows + 1, maxRows + 120, maxRows + 160, maxRows + 1000}; for (int rowId : invalidRowIds) { - assertNotEquals(0, scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))), + assertNotEquals(0, scanner.seekTo(new KeyValue.KeyOnlyKeyValue(getSomeKey(rowId))), "location lookup should have failed"); } reader.close(); @@ -154,7 +160,7 @@ private Set getRandomValidRowIds(int count) { } private byte[] getSomeKey(int rowId) { - KeyValue kv = new KeyValue(String.format(LOCAL_FORMATTER, Integer.valueOf(rowId)).getBytes(), + KeyValue kv = new 
KeyValue(String.format(LOCAL_FORMATTER, rowId).getBytes(), Bytes.toBytes("family"), Bytes.toBytes("qual"), HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put); return kv.getKey(); } @@ -168,17 +174,15 @@ private void writeRecords(HFile.Writer writer) throws IOException { writer.close(); } - private int writeSomeRecords(HFile.Writer writer) + private void writeSomeRecords(HFile.Writer writer) throws IOException { - String value = "value"; KeyValue kv; for (int i = 0; i < (maxRows); i++) { - String key = String.format(LOCAL_FORMATTER, Integer.valueOf(i)); + String key = String.format(LOCAL_FORMATTER, i); kv = new KeyValue(Bytes.toBytes(key), Bytes.toBytes("family"), Bytes.toBytes("qual"), - Bytes.toBytes(value + key)); + Bytes.toBytes(VALUE_PREFIX + key)); writer.append(kv); } - return (maxRows); } private void readAllRecords(HFileScanner scanner) throws IOException { @@ -186,30 +190,31 @@ private void readAllRecords(HFileScanner scanner) throws IOException { } // read the records and check - private int readAndCheckbytes(HFileScanner scanner, int start, int n) + private void readAndCheckbytes(HFileScanner scanner, int start, int n) throws IOException { - String value = "value"; int i = start; for (; i < (start + n); i++) { - ByteBuffer key = scanner.getKey(); - ByteBuffer val = scanner.getValue(); - String keyStr = String.format(LOCAL_FORMATTER, Integer.valueOf(i)); - String valStr = value + keyStr; + Cell cell = scanner.getCell(); + byte[] key = Arrays.copyOfRange( + cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength()); + byte[] val = Arrays.copyOfRange( + cell.getValueArray(), cell.getValueOffset(), cell.getValueOffset() + cell.getValueLength()); + String keyStr = String.format(LOCAL_FORMATTER, i); + String valStr = VALUE_PREFIX + keyStr; KeyValue kv = new KeyValue(Bytes.toBytes(keyStr), Bytes.toBytes("family"), Bytes.toBytes("qual"), Bytes.toBytes(valStr)); - byte[] keyBytes = new KeyValue.KeyOnlyKeyValue(Bytes.toBytes(key), 0, - Bytes.toBytes(key).length).getKey(); - assertArrayEquals(kv.getKey(), keyBytes, - "bytes for keys do not match " + keyStr + " " + Bytes.toString(Bytes.toBytes(key))); - byte[] valBytes = Bytes.toBytes(val); - assertArrayEquals(Bytes.toBytes(valStr), valBytes, - "bytes for vals do not match " + valStr + " " + Bytes.toString(valBytes)); + byte[] keyBytes = new KeyValue.KeyOnlyKeyValue(key, 0, key.length).getKey(); + byte[] expectedKeyBytes = Arrays.copyOfRange( + kv.getRowArray(), kv.getRowOffset(), kv.getRowOffset() + kv.getRowLength()); + assertArrayEquals(expectedKeyBytes, keyBytes, + "bytes for keys do not match " + keyStr + " " + Bytes.toString(key)); + assertArrayEquals(Bytes.toBytes(valStr), val, + "bytes for vals do not match " + valStr + " " + Bytes.toString(val)); if (!scanner.next()) { break; } } assertEquals(i, start + n - 1); - return (start + n); } private long generateOuterFile(Path outerPath, byte[] inlineBytes) throws IOException { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java index c4e728dc24909..9ed27c4b2d63c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.fs.inline; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import 
org.apache.hudi.common.testutils.FileSystemTestUtils; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -144,7 +145,8 @@ static List getParquetHoodieRecords() throws IOException { List hoodieRecords = dataGenerator.generateInsertsWithHoodieAvroPayload(commitTime, 10); List toReturn = new ArrayList<>(); for (HoodieRecord record : hoodieRecords) { - toReturn.add((GenericRecord) record.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get()); + toReturn.add((GenericRecord) ((HoodieAvroRecord) record).getData() + .getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get()); } return toReturn; } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java index 98ece7309ea7d..abb31ab19ea13 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java @@ -20,11 +20,13 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.DeleteRecord; import org.apache.hudi.common.model.HoodieArchivedLogFile; -import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.log.AppendResult; import org.apache.hudi.common.table.log.HoodieLogFileReader; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Reader; @@ -38,26 +40,39 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HadoopMapRedUtils; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.SchemaTestUtil; import org.apache.hudi.common.testutils.minicluster.MiniClusterUtil; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.exception.CorruptedLogFileException; +import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.compress.Compression; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; import 
org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; import java.io.IOException; @@ -72,13 +87,16 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.params.provider.Arguments.arguments; /** * Tests hoodie log format {@link HoodieLogFormat}. @@ -86,11 +104,12 @@ @SuppressWarnings("Duplicates") public class TestHoodieLogFormat extends HoodieCommonTestHarness { + private static final HoodieLogBlockType DEFAULT_DATA_BLOCK_TYPE = HoodieLogBlockType.AVRO_DATA_BLOCK; + private static String BASE_OUTPUT_PATH = "/tmp/"; private FileSystem fs; private Path partitionPath; private int bufferSize = 4096; - private HoodieLogBlockType dataBlockType = HoodieLogBlockType.AVRO_DATA_BLOCK; @BeforeAll public static void setUpClass() throws IOException, InterruptedException { @@ -116,10 +135,11 @@ public void setUp() throws IOException, InterruptedException { @AfterEach public void tearDown() throws IOException { fs.delete(partitionPath, true); + fs.delete(new Path(basePath), true); } @Test - public void testEmptyLog() throws IOException, InterruptedException { + public void testEmptyLog() throws IOException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); @@ -129,7 +149,7 @@ public void testEmptyLog() throws IOException, InterruptedException { } @ParameterizedTest - @EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" }) + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) public void testBasicAppend(HoodieLogBlockType dataBlockType) throws IOException, InterruptedException, URISyntaxException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) @@ -138,18 +158,21 @@ public void testBasicAppend(HoodieLogBlockType dataBlockType) throws IOException Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); + long pos = writer.getCurrentSize(); HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records, header); - writer = writer.appendBlock(dataBlock); + AppendResult result = writer.appendBlock(dataBlock); + long size = writer.getCurrentSize(); assertTrue(size > 0, "We just wrote a block - size should be > 0"); assertEquals(size, fs.getFileStatus(writer.getLogFile().getPath()).getLen(), "Write should be auto-flushed. 
The size reported by FileStatus and the writer should match"); + assertEquals(size, result.size()); + assertEquals(writer.getLogFile(), result.logFile()); + assertEquals(0, result.offset()); writer.close(); - } - @ParameterizedTest - @EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" }) + @Test public void testRollover() throws IOException, InterruptedException, URISyntaxException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) @@ -158,25 +181,38 @@ public void testRollover() throws IOException, InterruptedException, URISyntaxEx Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); // Write out a block - writer = writer.appendBlock(dataBlock); + AppendResult firstAppend = writer.appendBlock(dataBlock); // Get the size of the block long size = writer.getCurrentSize(); writer.close(); + assertEquals(0, firstAppend.offset()); + assertEquals(size, firstAppend.size()); + // Create a writer with the size threshold as the size we just wrote - so this has to roll writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(size - 1).build(); records = SchemaTestUtil.generateTestRecords(0, 100); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + AppendResult secondAppend = writer.appendBlock(dataBlock); + + assertEquals(firstAppend.logFile(), secondAppend.logFile()); + assertNotEquals(0, secondAppend.offset()); assertEquals(0, writer.getCurrentSize(), "This should be a new log file and hence size should be 0"); assertEquals(2, writer.getLogFile().getLogVersion(), "Version should be rolled to 2"); Path logFilePath = writer.getLogFile().getPath(); assertFalse(fs.exists(logFilePath), "Path (" + logFilePath + ") must not exist"); + + // Write one more block, which should not go to the new log file. 
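Aside on the writer API exercised above: appendBlock now returns an AppendResult describing the log file, offset, and size of the block it just wrote, which is what the rollover assertions check. A minimal usage sketch, limited to the builder methods and AppendResult accessors shown in this test; dataBlock stands for any HoodieDataBlock (e.g. one built with getDataBlock(...)), and the 1 KB threshold is an arbitrary illustrative value:

    Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(partitionPath)
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId("test-fileid1")
        .overBaseCommit("100")
        .withFs(fs)
        .withSizeThreshold(1024)  // small threshold so a later append rolls over to a new log file
        .build();
    AppendResult first = writer.appendBlock(dataBlock);
    // first.logFile(), first.offset() and first.size() say where the block landed
    AppendResult second = writer.appendBlock(dataBlock);
    // once the size threshold is exceeded, a later append reports a different logFile()
    // and an offset() of 0 again, mirroring the assertions in testRollover above
    writer.close();
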
+ records = SchemaTestUtil.generateTestRecords(0, 100); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + AppendResult rolloverAppend = writer.appendBlock(dataBlock); + + assertNotEquals(secondAppend.logFile(), rolloverAppend.logFile()); + assertEquals(0, rolloverAppend.offset()); writer.close(); } @@ -203,17 +239,13 @@ private void testConcurrentAppend(boolean logFileExists, boolean newLogFileForma if (newLogFileFormat && logFileExists) { // Assume there is an existing log-file with write token - builder1 = builder1.withLogVersion(1).withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) - .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); - builder2 = builder2.withLogVersion(1).withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) - .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); + builder1 = builder1.withLogVersion(1).withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); + builder2 = builder2.withLogVersion(1).withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); } else if (newLogFileFormat) { // First log file of the file-slice builder1 = builder1.withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION) - .withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); builder2 = builder2.withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION) - .withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); } else { builder1 = builder1.withLogVersion(1).withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); @@ -223,10 +255,10 @@ private void testConcurrentAppend(boolean logFileExists, boolean newLogFileForma Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + writer.appendBlock(dataBlock); Writer writer2 = builder2.build(); - writer2 = writer2.appendBlock(dataBlock); + writer2.appendBlock(dataBlock); HoodieLogFile logFile1 = writer.getLogFile(); HoodieLogFile logFile2 = writer2.getLogFile(); writer.close(); @@ -235,8 +267,9 @@ private void testConcurrentAppend(boolean logFileExists, boolean newLogFileForma assertEquals(logFile1.getLogVersion(), logFile2.getLogVersion() - 1, "Log Files must have different versions"); } - @Test - public void testMultipleAppend() throws IOException, URISyntaxException, InterruptedException { + @ParameterizedTest + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) + public void testMultipleAppend(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); @@ -244,8 +277,8 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(dataBlockType, 
records, header); + writer.appendBlock(dataBlock); long size1 = writer.getCurrentSize(); writer.close(); @@ -254,8 +287,8 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(dataBlockType, records, header); + writer.appendBlock(dataBlock); long size2 = writer.getCurrentSize(); assertTrue(size2 > size1, "We just wrote a new block - size2 should be > size1"); assertEquals(size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen(), @@ -268,8 +301,8 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(dataBlockType, records, header); + writer.appendBlock(dataBlock); long size3 = writer.getCurrentSize(); assertTrue(size3 > size2, "We just wrote a new block - size3 should be > size2"); assertEquals(size3, fs.getFileStatus(writer.getLogFile().getPath()).getLen(), @@ -287,26 +320,27 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru * This is actually a test on concurrent append and not recovery lease. Commenting this out. * https://issues.apache.org/jira/browse/HUDI-117 */ + /** * @Test public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException { Writer writer - * = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - * .overBaseCommit("100").withFs(fs).build(); List records = - * SchemaTestUtil.generateTestRecords(0, 100); Map header = - * Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); - * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock - * dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 = - * writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying - * without closing the file // writer.close(); - * - * writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - * .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); - * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new - * HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 = - * writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); - * assertEquals("Write should be auto-flushed. 
The size reported by FileStatus and the writer should match", - * size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); } + * = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) + * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") + * .overBaseCommit("100").withFs(fs).build(); List records = + * SchemaTestUtil.generateTestRecords(0, 100); Map header = + * Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock + * dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 = + * writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying + * without closing the file // writer.close(); + *
    + * writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) + * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") + * .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); + * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new + * HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 = + * writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); + * assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", + * size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); } */ @Test @@ -322,12 +356,14 @@ public void testAppendNotSupported() throws IOException, URISyntaxException, Int Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); for (int i = 0; i < 2; i++) { - HoodieLogFormat.newWriterBuilder().onParentPath(testPath) + Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath) .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits.archive").overBaseCommit("") - .withFs(localFs).build().appendBlock(dataBlock).close(); + .withFs(localFs).build(); + writer.appendBlock(dataBlock); + writer.close(); } // ensure there are two log file versions, with same data. @@ -335,8 +371,7 @@ public void testAppendNotSupported() throws IOException, URISyntaxException, Int assertEquals(2, statuses.length); } - @ParameterizedTest - @EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" }) + @Test public void testBasicWriteAndScan() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) @@ -348,28 +383,94 @@ public void testBasicWriteAndScan() throws IOException, URISyntaxException, Inte Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + writer.appendBlock(dataBlock); writer.close(); Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it"); HoodieLogBlock nextBlock = reader.next(); - assertEquals(dataBlockType, nextBlock.getBlockType(), "The next block should be a data block"); + assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block"); HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords.size(), dataBlockRead.getRecords().size(), + List recordsRead = getRecords(dataBlockRead); + assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords, dataBlockRead.getRecords(), + assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. 
(ordering guaranteed)"); reader.close(); } - @ParameterizedTest - @EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" }) - public void testBasicAppendAndRead() throws IOException, URISyntaxException, InterruptedException { + @Test + public void testHugeLogFileWrite() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(3L * 1024 * 1024 * 1024) + .build(); + Schema schema = getSimpleSchema(); + List records = SchemaTestUtil.generateTestRecords(0, 1000); + List copyOfRecords = records.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + Map header = new HashMap<>(); + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); + byte[] dataBlockContentBytes = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header).getContentBytes(); + HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = new HoodieLogBlock.HoodieLogBlockContentLocation(new Configuration(), null, 0, dataBlockContentBytes.length, 0); + HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, Option.ofNullable(dataBlockContentBytes), false, + logBlockContentLoc, Option.ofNullable(getSimpleSchema()), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD); + long writtenSize = 0; + int logBlockWrittenNum = 0; + while (writtenSize < Integer.MAX_VALUE) { + AppendResult appendResult = writer.appendBlock(reusableDataBlock); + assertTrue(appendResult.size() > 0); + writtenSize += appendResult.size(); + logBlockWrittenNum++; + } + writer.close(); + + Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), + true, true); + assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it"); + HoodieLogBlock nextBlock = reader.next(); + assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block"); + HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; + List recordsRead = getRecords(dataBlockRead); + assertEquals(copyOfRecords.size(), recordsRead.size(), + "Read records size should be equal to the written records size"); + assertEquals(copyOfRecords, recordsRead, + "Both records lists should be the same. (ordering guaranteed)"); + int logBlockReadNum = 1; + while (reader.hasNext()) { + reader.next(); + logBlockReadNum++; + } + assertEquals(logBlockWrittenNum, logBlockReadNum, "All written log should be correctly found"); + reader.close(); + + // test writing oversize data block which should be rejected + Writer oversizeWriter = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withSizeThreshold(3L * 1024 * 1024 * 1024).withFs(fs) + .build(); + List dataBlocks = new ArrayList<>(logBlockWrittenNum + 1); + for (int i = 0; i < logBlockWrittenNum + 1; i++) { + dataBlocks.add(reusableDataBlock); + } + assertThrows(HoodieIOException.class, () -> { + oversizeWriter.appendBlocks(dataBlocks); + }, "Blocks appended may overflow. 
Please decrease log block size or log block amount"); + oversizeWriter.close(); + } + + @ParameterizedTest + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) + public void testBasicAppendAndRead(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException { + Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); List records1 = SchemaTestUtil.generateTestRecords(0, 100); Schema schema = getSimpleSchema(); List copyOfRecords1 = records1.stream() @@ -377,64 +478,78 @@ public void testBasicAppendAndRead() throws IOException, URISyntaxException, Int Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records1, header); + writer.appendBlock(dataBlock); writer.close(); - writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); List records2 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records2, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(dataBlockType, records2, header); + writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records - writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); + List records3 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records3, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(dataBlockType, records3, header); + writer.appendBlock(dataBlock); writer.close(); Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue(reader.hasNext(), "First block should be available"); HoodieLogBlock nextBlock = reader.next(); HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(), + List recordsRead1 = getRecords(dataBlockRead); + assertEquals(copyOfRecords1.size(),recordsRead1.size(), "Read records size should be equal to the written records 
size"); - assertEquals(copyOfRecords1, dataBlockRead.getRecords(), + assertEquals(copyOfRecords1, recordsRead1, "Both records lists should be the same. (ordering guaranteed)"); assertEquals(dataBlockRead.getSchema(), getSimpleSchema()); reader.hasNext(); nextBlock = reader.next(); dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords2.size(), dataBlockRead.getRecords().size(), + List recordsRead2 = getRecords(dataBlockRead); + assertEquals(copyOfRecords2.size(), recordsRead2.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords2, dataBlockRead.getRecords(), + assertEquals(copyOfRecords2, recordsRead2, "Both records lists should be the same. (ordering guaranteed)"); reader.hasNext(); nextBlock = reader.next(); dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords3.size(), dataBlockRead.getRecords().size(), + List recordsRead3 = getRecords(dataBlockRead); + assertEquals(copyOfRecords3.size(), recordsRead3.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords3, dataBlockRead.getRecords(), + assertEquals(copyOfRecords3, recordsRead3, "Both records lists should be the same. (ordering guaranteed)"); reader.close(); } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testBasicAppendAndScanMultipleFiles(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testBasicAppendAndScanMultipleFiles(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) @@ -454,11 +569,11 @@ public void testBasicAppendAndScanMultipleFiles(boolean readBlocksLazily) .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); allRecords.add(copyOfRecords1); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); } writer.close(); - + FileCreateUtils.createDeltaCommit(basePath, "100", fs); // scan all log blocks (across multiple log files) HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) @@ -473,11 +588,14 @@ public void testBasicAppendAndScanMultipleFiles(boolean readBlocksLazily) .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); List scannedRecords = new ArrayList<>(); for (HoodieRecord record : scanner) { - scannedRecords.add((IndexedRecord) record.getData().getInsertValue(schema).get()); + scannedRecords.add((IndexedRecord) + ((HoodieAvroRecord) record).getData().getInsertValue(schema).get()); } assertEquals(scannedRecords.size(), allRecords.stream().mapToLong(Collection::size).sum(), @@ -487,20 +605,11 @@ public void testBasicAppendAndScanMultipleFiles(boolean readBlocksLazily) @Test public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxException, InterruptedException { - Writer writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - 
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); - List records = SchemaTestUtil.generateTestRecords(0, 100); - Map header = new HashMap<>(); - header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); - writer.close(); + HoodieLogFile logFile = addValidBlock("test-fileId1", "100", 100); - // Append some arbit byte[] to thee end of the log (mimics a partially written commit) + // Append some arbit byte[] to the end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); - FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); + FSDataOutputStream outputStream = fs.append(logFile.getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); // Write out a length that does not confirm with the content @@ -515,17 +624,10 @@ public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxExcep outputStream.close(); // Append a proper block that is of the missing length of the corrupted block - writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); - records = SchemaTestUtil.generateTestRecords(0, 10); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); - writer.close(); + logFile = addValidBlock("test-fileId1", "100", 10); // First round of reads - we should be able to read the first block and then EOF - Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); + Reader reader = HoodieLogFormat.newReader(fs, logFile, SchemaTestUtil.getSimpleSchema()); assertTrue(reader.hasNext(), "First block should be available"); reader.next(); assertTrue(reader.hasNext(), "We should have corrupted block next"); @@ -538,7 +640,7 @@ public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxExcep reader.close(); // Simulate another failure back to back - outputStream = fs.append(writer.getLogFile().getPath()); + outputStream = fs.append(logFile.getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); // Write out a length that does not confirm with the content @@ -553,17 +655,10 @@ public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxExcep outputStream.close(); // Should be able to append a new block - writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); - records = SchemaTestUtil.generateTestRecords(0, 100); - header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); - writer.close(); + logFile = addValidBlock("test-fileId1", "100", 100); // Second round of reads - we should be able to read the first and last block - reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); + reader = HoodieLogFormat.newReader(fs, logFile, SchemaTestUtil.getSimpleSchema()); assertTrue(reader.hasNext(), "First block should be available"); reader.next(); 
assertTrue(reader.hasNext(), "We should get the 1st corrupted block next"); @@ -579,9 +674,110 @@ public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxExcep reader.close(); } + @Test + public void testMissingBlockExceptMagicBytes() throws IOException, URISyntaxException, InterruptedException { + HoodieLogFile logFile = addValidBlock("test-fileId1", "100", 100); + + // Append just magic bytes and move onto next block + fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); + FSDataOutputStream outputStream = fs.append(logFile.getPath()); + outputStream.write(HoodieLogFormat.MAGIC); + outputStream.flush(); + outputStream.close(); + + // Append a proper block + logFile = addValidBlock("test-fileId1", "100", 10); + + // First round of reads - we should be able to read the first block and then EOF + Reader reader = HoodieLogFormat.newReader(fs, logFile, SchemaTestUtil.getSimpleSchema()); + assertTrue(reader.hasNext(), "First block should be available"); + reader.next(); + assertTrue(reader.hasNext(), "We should have corrupted block next"); + HoodieLogBlock block = reader.next(); + assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block"); + assertTrue(reader.hasNext(), "Third block should be available"); + reader.next(); + assertFalse(reader.hasNext(), "There should be no more block left"); + + reader.close(); + } + + private HoodieLogFile addValidBlock(String fileId, String commitTime, int numRecords) throws IOException, URISyntaxException, InterruptedException { + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId(fileId).overBaseCommit(commitTime).withFs(fs).build(); + List records = SchemaTestUtil.generateTestRecords(0, numRecords); + Map header = new HashMap<>(); + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + writer.appendBlock(dataBlock); + writer.close(); + return writer.getLogFile(); + } + + @Test + public void testValidateCorruptBlockEndPosition() throws IOException, URISyntaxException, InterruptedException { + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + List records = SchemaTestUtil.generateTestRecords(0, 100); + Map header = new HashMap<>(); + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + writer.appendBlock(dataBlock); + writer.close(); + + // Append some arbit byte[] to the end of the log (mimics a partially written commit) + fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); + FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); + // create a block with + outputStream.write(HoodieLogFormat.MAGIC); + // Write out a length that does not confirm with the content + outputStream.writeLong(474); + outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); + outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION); + // Write out a length that does not confirm with the content + outputStream.writeLong(400); + // Write out incomplete content 
+ outputStream.write("something-random".getBytes()); + // get corrupt block end position + long corruptBlockEndPos = outputStream.getPos(); + outputStream.flush(); + outputStream.close(); + + // Append a proper block again + writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + records = SchemaTestUtil.generateTestRecords(0, 10); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + writer.appendBlock(dataBlock); + writer.close(); + + // Read data and corrupt block + Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); + assertTrue(reader.hasNext(), "First block should be available"); + reader.next(); + assertTrue(reader.hasNext(), "We should have corrupted block next"); + HoodieLogBlock block = reader.next(); + assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block"); + // validate the corrupt block end position correctly. + assertEquals(corruptBlockEndPos, block.getBlockContentLocation().get().getBlockEndPos()); + assertTrue(reader.hasNext(), "Third block should be available"); + reader.next(); + assertFalse(reader.hasNext(), "There should be no more block left"); + + reader.close(); + } + @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderBasic(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderBasic(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version @@ -596,22 +792,24 @@ public void testAvroLogRecordReaderBasic(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); // Write 2 List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records2, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); + writer.appendBlock(dataBlock); writer.close(); List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(basePath) @@ -623,6 +821,8 @@ public void testAvroLogRecordReaderBasic(boolean readBlocksLazily) .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + 
.withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(200, scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); @@ -636,8 +836,10 @@ public void testAvroLogRecordReaderBasic(boolean readBlocksLazily) } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithRollbackTombstone(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithRollbackTombstone(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version @@ -653,22 +855,22 @@ public void testAvroLogRecordReaderWithRollbackTombstone(boolean readBlocksLazil header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records2, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); + writer.appendBlock(dataBlock); // Rollback the last write header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); - writer = writer.appendBlock(commandBlock); + writer.appendBlock(commandBlock); // Write 3 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); @@ -676,14 +878,16 @@ public void testAvroLogRecordReaderWithRollbackTombstone(boolean readBlocksLazil List copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records3, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); + writer.appendBlock(dataBlock); writer.close(); List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + FileCreateUtils.createDeltaCommit(basePath, "102", fs); HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(basePath) @@ -695,6 +899,8 @@ public void testAvroLogRecordReaderWithRollbackTombstone(boolean readBlocksLazil .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We read 200 records from 2 write batches"); Set readKeys = new HashSet<>(200); @@ -707,8 +913,10 @@ 
public void testAvroLogRecordReaderWithRollbackTombstone(boolean readBlocksLazil assertEquals(originalKeys, readKeys, "CompositeAvroLogReader should return 200 records from 2 versions"); } - @Test - public void testAvroLogRecordReaderWithRollbackPartialBlock() + @ParameterizedTest + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithFailedPartialBlock(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled) throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version @@ -723,13 +931,13 @@ public void testAvroLogRecordReaderWithRollbackPartialBlock() Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); writer.close(); // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); - // Append some arbit byte[] to thee end of the log (mimics a partially written commit) + // Append some arbit byte[] to the end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with @@ -747,17 +955,9 @@ public void testAvroLogRecordReaderWithRollbackPartialBlock() outputStream.flush(); outputStream.close(); - // Rollback the last write - header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); - header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101"); - header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, - String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); - HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); - writer = writer.appendBlock(commandBlock); - // Write 3 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "103"); List records3 = SchemaTestUtil.generateHoodieTestRecords(0, 100); @@ -765,14 +965,16 @@ public void testAvroLogRecordReaderWithRollbackPartialBlock() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records3, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); + writer.appendBlock(dataBlock); writer.close(); List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + FileCreateUtils.createDeltaCommit(basePath, "103", fs); HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(basePath) @@ -784,6 +986,8 @@ public void testAvroLogRecordReaderWithRollbackPartialBlock() .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + 
.withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We would read 200 records"); Set readKeys = new HashSet<>(200); @@ -797,8 +1001,10 @@ public void testAvroLogRecordReaderWithRollbackPartialBlock() } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithDeleteAndRollback(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithDeleteAndRollback(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version @@ -813,16 +1019,16 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(boolean readBlocksLazil Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); - dataBlock = getDataBlock(records2, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); + writer.appendBlock(dataBlock); copyOfRecords1.addAll(copyOfRecords2); List originalKeys = @@ -830,19 +1036,23 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(boolean readBlocksLazil .collect(Collectors.toList()); // Delete 50 keys - List deletedKeys = copyOfRecords1.stream() - .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deletedRecords = copyOfRecords1.stream() + .map(s -> (DeleteRecord.create(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); - HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); - writer = writer.appendBlock(deleteBlock); + HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedRecords.toArray(new DeleteRecord[50]), header); + writer.appendBlock(deleteBlock); List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + FileCreateUtils.createDeltaCommit(basePath, "101", fs); + FileCreateUtils.createDeltaCommit(basePath, "102", fs); + HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(basePath) @@ -854,7 +1064,10 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(boolean readBlocksLazil .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + 
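Several hunks above and below also add FileCreateUtils.createDeltaCommit(...) calls before the scanner is built, which suggests the scanner now only merges log blocks whose instant times are backed by a completed delta commit on the timeline. The fixture pattern, taken from the hunks themselves, is simply:

    // Mark the instants written by the log blocks as completed on the timeline
    FileCreateUtils.createDeltaCommit(basePath, "100", fs);
    FileCreateUtils.createDeltaCommit(basePath, "102", fs);
    // ... build the scanner and run the assertions ...
    // Some rollback tests later remove a commit again so that its blocks become invalid:
    FileCreateUtils.deleteDeltaCommit(basePath, "102", fs);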
.withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); + assertEquals(200, scanner.getTotalLogRecords(), "We still would read 200 records"); final List readKeys = new ArrayList<>(200); final List emptyPayloads = new ArrayList<>(); @@ -870,7 +1083,7 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(boolean readBlocksLazil }); assertEquals(200, readKeys.size(), "Stream collect should return all 200 records"); assertEquals(50, emptyPayloads.size(), "Stream collect should return all 50 records with empty payloads"); - originalKeys.removeAll(deletedKeys); + originalKeys.removeAll(deletedRecords); Collections.sort(originalKeys); Collections.sort(readKeys); assertEquals(originalKeys, readKeys, "CompositeAvroLogReader should return 150 records from 2 versions"); @@ -883,6 +1096,8 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(boolean readBlocksLazil HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer.appendBlock(commandBlock); + FileCreateUtils.deleteDeltaCommit(basePath, "102", fs); + readKeys.clear(); scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) @@ -895,14 +1110,135 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(boolean readBlocksLazil .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals(200, readKeys.size(), "Stream collect should return all 200 records after rollback of delete"); } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithFailedRollbacks(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithDisorderDelete(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) + throws IOException, URISyntaxException, InterruptedException { + Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + // Set a small threshold so that every block is a new version + Writer writer = + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + + // Write 1 + List records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); + List copyOfRecords1 = records1.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + Map header = new HashMap<>(); + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); + + // Write 2 + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); + List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); + List copyOfRecords2 = records2.stream() + .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); + writer.appendBlock(dataBlock); + + copyOfRecords1.addAll(copyOfRecords2); + List originalKeys = + copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) + .collect(Collectors.toList()); + + // Delete 10 keys + // Default 
orderingVal is 0, which means natural order, the DELETE records + // should overwrite the data records. + List deleteRecords1 = copyOfRecords1.subList(0, 10).stream() + .map(s -> (DeleteRecord.create(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) + .collect(Collectors.toList()); + + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); + HoodieDeleteBlock deleteBlock1 = new HoodieDeleteBlock(deleteRecords1.toArray(new DeleteRecord[0]), header); + writer.appendBlock(deleteBlock1); + + // Delete another 10 keys with -1 as orderingVal. + // The deletion should not work + + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "103"); + HoodieDeleteBlock deleteBlock2 = new HoodieDeleteBlock(copyOfRecords1.subList(10, 20).stream() + .map(s -> (DeleteRecord.create(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(), -1))).toArray(DeleteRecord[]::new), header); + writer.appendBlock(deleteBlock2); + + // Delete another 10 keys with +1 as orderingVal. + // The deletion should work because the keys has greater ordering value. + List deletedRecords3 = copyOfRecords1.subList(20, 30).stream() + .map(s -> (DeleteRecord.create(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(), 1))) + .collect(Collectors.toList()); + + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "104"); + HoodieDeleteBlock deleteBlock3 = new HoodieDeleteBlock(deletedRecords3.toArray(new DeleteRecord[0]), header); + writer.appendBlock(deleteBlock3); + + List allLogFiles = + FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") + .map(s -> s.getPath().toString()).collect(Collectors.toList()); + + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + FileCreateUtils.createDeltaCommit(basePath, "101", fs); + FileCreateUtils.createDeltaCommit(basePath, "102", fs); + FileCreateUtils.createDeltaCommit(basePath, "103", fs); + FileCreateUtils.createDeltaCommit(basePath, "104", fs); + + HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(basePath) + .withLogFilePaths(allLogFiles) + .withReaderSchema(schema) + .withLatestInstantTime("104") + .withMaxMemorySizeInBytes(10240L) + .withReadBlocksLazily(readBlocksLazily) + .withReverseReader(false) + .withBufferSize(bufferSize) + .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) + .build(); + + assertEquals(200, scanner.getTotalLogRecords(), "We still would read 200 records"); + final List readKeys = new ArrayList<>(200); + final List emptyPayloadKeys = new ArrayList<>(); + scanner.forEach(s -> readKeys.add(s.getRecordKey())); + scanner.forEach(s -> { + try { + if (!s.getData().getInsertValue(schema).isPresent()) { + emptyPayloadKeys.add(s.getRecordKey()); + } + } catch (IOException io) { + throw new UncheckedIOException(io); + } + }); + assertEquals(200, readKeys.size(), "Stream collect should return all 200 records"); + assertEquals(20, emptyPayloadKeys.size(), "Stream collect should return all 20 records with empty payloads"); + + originalKeys.removeAll(deleteRecords1.stream().map(DeleteRecord::getRecordKey).collect(Collectors.toSet())); + 
originalKeys.removeAll(deletedRecords3.stream().map(DeleteRecord::getRecordKey).collect(Collectors.toSet())); + readKeys.removeAll(emptyPayloadKeys); + + Collections.sort(originalKeys); + Collections.sort(readKeys); + assertEquals(originalKeys, readKeys, "HoodieMergedLogRecordScanner should return 180 records from 4 versions"); + } + + @ParameterizedTest + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithFailedRollbacks(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { // Write a Data block and Delete block with same InstantTime (written in same batch) @@ -921,31 +1257,33 @@ public void testAvroLogRecordReaderWithFailedRollbacks(boolean readBlocksLazily) header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); // Write 2 List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records2, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); + writer.appendBlock(dataBlock); // Delete 50 keys // Delete 50 keys - List deletedKeys = copyOfRecords1.stream() - .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deleteRecords = copyOfRecords1.stream() + .map(s -> (DeleteRecord.create(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); - HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); - writer = writer.appendBlock(deleteBlock); + HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deleteRecords.toArray(new DeleteRecord[50]), header); + writer.appendBlock(deleteBlock); + + FileCreateUtils.createDeltaCommit(basePath, "100", fs); // Attempt 1 : Write rollback block for a failed write header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); try { - writer = writer.appendBlock(commandBlock); + writer.appendBlock(commandBlock); // Say job failed, retry writing 2 rollback in the next rollback(..) 
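To restate the ordering semantics exercised by testAvroLogRecordReaderWithDisorderDelete above: a DeleteRecord created without an ordering value uses the natural order (0) and overrides the data records, a negative ordering value loses against them, and a higher ordering value wins again. In shorthand (rec is a hypothetical stand-in for one of the GenericRecord instances generated by SchemaTestUtil; header is the usual block header map):

    String key = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
    String partition = rec.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();

    DeleteRecord d0    = DeleteRecord.create(key, partition);       // orderingVal 0: delete takes effect
    DeleteRecord dLow  = DeleteRecord.create(key, partition, -1);   // lower ordering value: delete is ignored
    DeleteRecord dHigh = DeleteRecord.create(key, partition, 1);    // higher ordering value: delete takes effect

    HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(new DeleteRecord[] {d0, dHigh}, header);
    writer.appendBlock(deleteBlock);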
attempt throw new Exception("simulating failure"); } catch (Exception e) { @@ -970,17 +1308,22 @@ public void testAvroLogRecordReaderWithFailedRollbacks(boolean readBlocksLazily) .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would have scanned 0 records because of rollback"); final List readKeys = new ArrayList<>(); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals(0, readKeys.size(), "Stream collect should return all 0 records"); + FileCreateUtils.deleteDeltaCommit(basePath, "100", fs); } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithInsertDeleteAndRollback(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithInsertDeleteAndRollback(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { // Write a Data block and Delete block with same InstantTime (written in same batch) @@ -998,22 +1341,24 @@ public void testAvroLogRecordReaderWithInsertDeleteAndRollback(boolean readBlock header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); // Delete 50 keys - List deletedKeys = copyOfRecords1.stream() - .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deleteRecords = copyOfRecords1.stream() + .map(s -> (DeleteRecord.create(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); - HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); - writer = writer.appendBlock(deleteBlock); + HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deleteRecords.toArray(new DeleteRecord[50]), header); + writer.appendBlock(deleteBlock); + + FileCreateUtils.createDeltaCommit(basePath, "100", fs); // Write 2 rollback blocks (1 data block + 1 delete bloc) for a failed write header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); - writer = writer.appendBlock(commandBlock); + writer.appendBlock(commandBlock); writer.appendBlock(commandBlock); List allLogFiles = @@ -1031,13 +1376,18 @@ public void testAvroLogRecordReaderWithInsertDeleteAndRollback(boolean readBlock .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); + FileCreateUtils.deleteDeltaCommit(basePath, "100", fs); } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void 
testAvroLogRecordReaderWithInvalidRollback(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithInvalidRollback(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version @@ -1050,8 +1400,10 @@ public void testAvroLogRecordReaderWithInvalidRollback(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); + + FileCreateUtils.createDeltaCommit(basePath, "100", fs); // Write invalid rollback for a failed write (possible for in-flight commits) header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101"); @@ -1075,6 +1427,8 @@ public void testAvroLogRecordReaderWithInvalidRollback(boolean readBlocksLazily) .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(100, scanner.getTotalLogRecords(), "We still would read 100 records"); final List readKeys = new ArrayList<>(100); @@ -1083,8 +1437,10 @@ public void testAvroLogRecordReaderWithInvalidRollback(boolean readBlocksLazily) } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithInsertsDeleteAndRollback(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithInsertsDeleteAndRollback(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { // Write a 3 Data blocs with same InstantTime (written in same batch) @@ -1102,22 +1458,25 @@ public void testAvroLogRecordReaderWithInsertsDeleteAndRollback(boolean readBloc header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); - writer = writer.appendBlock(dataBlock); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); + writer.appendBlock(dataBlock); + writer.appendBlock(dataBlock); // Delete 50 keys // Delete 50 keys - List deletedKeys = copyOfRecords1.stream() - .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), + List deleteRecords = copyOfRecords1.stream() + .map(s -> (DeleteRecord.create(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); - HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); - writer = writer.appendBlock(deleteBlock); + HoodieDeleteBlock deleteBlock = new 
HoodieDeleteBlock(deleteRecords.toArray(new DeleteRecord[50]), header); + writer.appendBlock(deleteBlock); + + FileCreateUtils.createDeltaCommit(basePath, "100", fs); // Write 1 rollback block for a failed write header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); + header.put(HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); @@ -1138,13 +1497,17 @@ public void testAvroLogRecordReaderWithInsertsDeleteAndRollback(boolean readBloc .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(boolean readBlocksLazily) + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException { // Write a 3 Data blocs with same InstantTime (written in same batch) @@ -1159,12 +1522,14 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(boolean r Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); - writer = writer.appendBlock(dataBlock); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); + writer.appendBlock(dataBlock); + writer.appendBlock(dataBlock); writer.close(); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + // Append some arbit byte[] to the end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); @@ -1195,7 +1560,7 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(boolean r HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); - writer = writer.appendBlock(dataBlock); + writer.appendBlock(dataBlock); writer.close(); // Append some arbit byte[] to the end of the log (mimics a partially written commit) @@ -1220,7 +1585,7 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(boolean r header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); - writer = writer.appendBlock(commandBlock); + writer.appendBlock(commandBlock); writer.close(); List allLogFiles = @@ -1238,8 +1603,11 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(boolean r .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + 
.withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); + FileCreateUtils.deleteDeltaCommit(basePath, "100", fs); } /* @@ -1256,7 +1624,9 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(boolean r * */ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1, int numRecordsInLog2, - boolean readBlocksLazily) { + ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) { try { // Write one Data block with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); @@ -1271,8 +1641,8 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records.subList(0, numRecordsInLog1), header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records.subList(0, numRecordsInLog1), header); + writer.appendBlock(dataBlock); // Get the size of the block long size = writer.getCurrentSize(); writer.close(); @@ -1285,11 +1655,13 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 Map header2 = new HashMap<>(); header2.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header2.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock2 = getDataBlock(records2.subList(0, numRecordsInLog2), header2); - writer2 = writer2.appendBlock(dataBlock2); + HoodieDataBlock dataBlock2 = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2.subList(0, numRecordsInLog2), header2); + writer2.appendBlock(dataBlock2); // Get the size of the block writer2.close(); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + // From the two log files generated, read the records List allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100").map(s -> s.getPath().toString()).collect(Collectors.toList()); @@ -1305,6 +1677,8 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 .withReverseReader(false) .withBufferSize(bufferSize) .withSpillableMapBasePath(BASE_OUTPUT_PATH) + .withDiskMapType(diskMapType) + .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) .build(); assertEquals(Math.max(numRecordsInLog1, numRecordsInLog2), scanner.getNumMergedRecordsInLog(), @@ -1316,33 +1690,42 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithFailedTaskInFirstStageAttempt(boolean readBlocksLazily) { + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithFailedTaskInFirstStageAttempt(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) { /* * FIRST_ATTEMPT_FAILED: * Original task from the stage attempt failed, but subsequent stage retry succeeded. 
*/ - testAvroLogRecordReaderMergingMultipleLogFiles(77, 100, readBlocksLazily); + testAvroLogRecordReaderMergingMultipleLogFiles(77, 100, + diskMapType, isCompressionEnabled, readBlocksLazily); } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderWithFailedTaskInSecondStageAttempt(boolean readBlocksLazily) { + @MethodSource("testArguments") + public void testAvroLogRecordReaderWithFailedTaskInSecondStageAttempt(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) { /* * SECOND_ATTEMPT_FAILED: * Original task from stage attempt succeeded, but subsequent retry attempt failed. */ - testAvroLogRecordReaderMergingMultipleLogFiles(100, 66, readBlocksLazily); + testAvroLogRecordReaderMergingMultipleLogFiles(100, 66, + diskMapType, isCompressionEnabled, readBlocksLazily); } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testAvroLogRecordReaderTasksSucceededInBothStageAttempts(boolean readBlocksLazily) { + @MethodSource("testArguments") + public void testAvroLogRecordReaderTasksSucceededInBothStageAttempts(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean readBlocksLazily) { /* * BOTH_ATTEMPTS_SUCCEEDED: * Original task from the stage attempt and duplicate task from the stage retry succeeded. */ - testAvroLogRecordReaderMergingMultipleLogFiles(100, 100, readBlocksLazily); + testAvroLogRecordReaderMergingMultipleLogFiles(100, 100, + diskMapType, isCompressionEnabled, readBlocksLazily); } @ParameterizedTest @@ -1359,8 +1742,8 @@ public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); writer.close(); writer = @@ -1369,8 +1752,8 @@ public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) List records2 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); - dataBlock = getDataBlock(records2, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); + writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records @@ -1380,40 +1763,45 @@ public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) List records3 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); - dataBlock = getDataBlock(records3, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); + writer.appendBlock(dataBlock); writer.close(); - HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), - bufferSize, readBlocksLazily, true); - - assertTrue(reader.hasPrev(), "Last block should be available"); - HoodieLogBlock prevBlock = reader.prev(); - HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock; - - assertEquals(copyOfRecords3.size(), 
dataBlockRead.getRecords().size(), - "Third records size should be equal to the written records size"); - assertEquals(copyOfRecords3, dataBlockRead.getRecords(), - "Both records lists should be the same. (ordering guaranteed)"); - - assertTrue(reader.hasPrev(), "Second block should be available"); - prevBlock = reader.prev(); - dataBlockRead = (HoodieDataBlock) prevBlock; - assertEquals(copyOfRecords2.size(), dataBlockRead.getRecords().size(), - "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords2, dataBlockRead.getRecords(), - "Both records lists should be the same. (ordering guaranteed)"); - - assertTrue(reader.hasPrev(), "First block should be available"); - prevBlock = reader.prev(); - dataBlockRead = (HoodieDataBlock) prevBlock; - assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(), - "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords1, dataBlockRead.getRecords(), - "Both records lists should be the same. (ordering guaranteed)"); - - assertFalse(reader.hasPrev()); - reader.close(); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + + HoodieLogFile logFile = new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()); + try (HoodieLogFileReader reader = new HoodieLogFileReader(fs, logFile, SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true)) { + + assertTrue(reader.hasPrev(), "Last block should be available"); + HoodieLogBlock prevBlock = reader.prev(); + HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock; + + List recordsRead1 = getRecords(dataBlockRead); + assertEquals(copyOfRecords3.size(), recordsRead1.size(), + "Third records size should be equal to the written records size"); + assertEquals(copyOfRecords3, recordsRead1, + "Both records lists should be the same. (ordering guaranteed)"); + + assertTrue(reader.hasPrev(), "Second block should be available"); + prevBlock = reader.prev(); + dataBlockRead = (HoodieDataBlock) prevBlock; + List recordsRead2 = getRecords(dataBlockRead); + assertEquals(copyOfRecords2.size(), recordsRead2.size(), + "Read records size should be equal to the written records size"); + assertEquals(copyOfRecords2, recordsRead2, + "Both records lists should be the same. (ordering guaranteed)"); + + assertTrue(reader.hasPrev(), "First block should be available"); + prevBlock = reader.prev(); + dataBlockRead = (HoodieDataBlock) prevBlock; + List recordsRead3 = getRecords(dataBlockRead); + assertEquals(copyOfRecords1.size(), recordsRead3.size(), + "Read records size should be equal to the written records size"); + assertEquals(copyOfRecords1, recordsRead3, + "Both records lists should be the same. 
(ordering guaranteed)"); + + assertFalse(reader.hasPrev()); + } } @ParameterizedTest @@ -1428,11 +1816,13 @@ public void testAppendAndReadOnCorruptedLogInReverse(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + writer.appendBlock(dataBlock); writer.close(); - // Append some arbit byte[] to thee end of the log (mimics a partially written commit) + FileCreateUtils.createDeltaCommit(basePath, "100", fs); + + // Append some arbit byte[] to the end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with @@ -1454,23 +1844,25 @@ public void testAppendAndReadOnCorruptedLogInReverse(boolean readBlocksLazily) HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); + writer.appendBlock(dataBlock); writer.close(); // First round of reads - we should be able to read the first block and then EOF - HoodieLogFileReader reader = - new HoodieLogFileReader(fs, writer.getLogFile(), schema, bufferSize, readBlocksLazily, true); + HoodieLogFile logFile = new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()); - assertTrue(reader.hasPrev(), "Last block should be available"); - HoodieLogBlock block = reader.prev(); - assertTrue(block instanceof HoodieDataBlock, "Last block should be datablock"); + try (HoodieLogFileReader reader = + new HoodieLogFileReader(fs, logFile, schema, bufferSize, readBlocksLazily, true)) { - assertTrue(reader.hasPrev(), "Last block should be available"); - assertThrows(CorruptedLogFileException.class, () -> { - reader.prev(); - }); - reader.close(); + assertTrue(reader.hasPrev(), "Last block should be available"); + HoodieLogBlock block = reader.prev(); + assertTrue(block instanceof HoodieDataBlock, "Last block should be datablock"); + + assertTrue(reader.hasPrev(), "Last block should be available"); + assertThrows(CorruptedLogFileException.class, () -> { + reader.prev(); + }); + } } @ParameterizedTest @@ -1487,16 +1879,16 @@ public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); + writer.appendBlock(dataBlock); writer.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records2 = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records2, header); - writer = 
writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); + writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records @@ -1504,30 +1896,34 @@ public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records3 = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records3, header); - writer = writer.appendBlock(dataBlock); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); + writer.appendBlock(dataBlock); writer.close(); - HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), - bufferSize, readBlocksLazily, true); + FileCreateUtils.createDeltaCommit(basePath, "100", fs); - assertTrue(reader.hasPrev(), "Third block should be available"); - reader.moveToPrev(); + HoodieLogFile logFile = new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()); + try (HoodieLogFileReader reader = + new HoodieLogFileReader(fs, logFile, SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true)) { - assertTrue(reader.hasPrev(), "Second block should be available"); - reader.moveToPrev(); + assertTrue(reader.hasPrev(), "Third block should be available"); + reader.moveToPrev(); - // After moving twice, this last reader.prev() should read the First block written - assertTrue(reader.hasPrev(), "First block should be available"); - HoodieLogBlock prevBlock = reader.prev(); - HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock; - assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(), - "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords1, dataBlockRead.getRecords(), - "Both records lists should be the same. (ordering guaranteed)"); + assertTrue(reader.hasPrev(), "Second block should be available"); + reader.moveToPrev(); - assertFalse(reader.hasPrev()); - reader.close(); + // After moving twice, this last reader.prev() should read the First block written + assertTrue(reader.hasPrev(), "First block should be available"); + HoodieLogBlock prevBlock = reader.prev(); + HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock; + List recordsRead = getRecords(dataBlockRead); + assertEquals(copyOfRecords1.size(), recordsRead.size(), + "Read records size should be equal to the written records size"); + assertEquals(copyOfRecords1, recordsRead, + "Both records lists should be the same. 
(ordering guaranteed)"); + + assertFalse(reader.hasPrev()); + } } @Test @@ -1546,7 +1942,7 @@ public void testV0Format() throws IOException, URISyntaxException { HoodieLogBlock logBlock = HoodieAvroDataBlock.getBlock(content, schema); assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType()); - List readRecords = ((HoodieAvroDataBlock) logBlock).getRecords(); + List readRecords = getRecords((HoodieAvroDataBlock) logBlock); assertEquals(readRecords.size(), recordsCopy.size()); for (int i = 0; i < recordsCopy.size(); ++i) { assertEquals(recordsCopy.get(i), readRecords.get(i)); @@ -1555,26 +1951,117 @@ public void testV0Format() throws IOException, URISyntaxException { // Reader schema is optional if it is same as write schema logBlock = HoodieAvroDataBlock.getBlock(content, null); assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType()); - readRecords = ((HoodieAvroDataBlock) logBlock).getRecords(); + readRecords = getRecords((HoodieAvroDataBlock) logBlock); assertEquals(readRecords.size(), recordsCopy.size()); for (int i = 0; i < recordsCopy.size(); ++i) { assertEquals(recordsCopy.get(i), readRecords.get(i)); } } - private HoodieDataBlock getDataBlock(List records, Map header) { - return getDataBlock(dataBlockType, records, header); + @ParameterizedTest + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) + public void testDataBlockFormatAppendAndReadWithProjectedSchema( + HoodieLogBlockType dataBlockType + ) throws IOException, URISyntaxException, InterruptedException { + Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); + + List records = SchemaTestUtil.generateTestGenericRecords(0, 1000); + + Schema schema = getSimpleSchema(); + + Map header = + new HashMap() {{ + put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); + }}; + + // Init Benchmark to report number of bytes actually read from the Block + BenchmarkCounter.initCounterFromReporter(HadoopMapRedUtils.createTestReporter(), fs.getConf()); + + // NOTE: Have to use this ugly hack since List generic is not covariant in its type param + HoodieDataBlock dataBlock = getDataBlock(dataBlockType, (List)(List) records, header); + + writer.appendBlock(dataBlock); + writer.close(); + + Schema projectedSchema = HoodieAvroUtils.generateProjectionSchema(schema, Collections.singletonList("name")); + + List projectedRecords = HoodieAvroUtils.rewriteRecords(records, projectedSchema); + + try (Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), projectedSchema, true, false)) { + assertTrue(reader.hasNext(), "First block should be available"); + + HoodieLogBlock nextBlock = reader.next(); + + HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; + + Map expectedReadBytes = + new HashMap() {{ + put(HoodieLogBlockType.AVRO_DATA_BLOCK, 0); // not supported + put(HoodieLogBlockType.HFILE_DATA_BLOCK, 0); // not supported + put(HoodieLogBlockType.PARQUET_DATA_BLOCK, HoodieAvroUtils.gteqAvro1_9() ? 2593 : 2605); + }}; + + List recordsRead = getRecords(dataBlockRead); + assertEquals(projectedRecords.size(), recordsRead.size(), + "Read records size should be equal to the written records size"); + assertEquals(projectedRecords, recordsRead, + "Both records lists should be the same. 
(ordering guaranteed)"); + assertEquals(dataBlockRead.getSchema(), projectedSchema); + + int bytesRead = (int) BenchmarkCounter.getBytesRead(); + + assertEquals(expectedReadBytes.get(dataBlockType), bytesRead, "Read bytes have to match"); + } } private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List records, Map header) { + return getDataBlock(dataBlockType, records, header, new Path("dummy_path")); + } + + private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List records, + Map header, Path pathForReader) { switch (dataBlockType) { case AVRO_DATA_BLOCK: - return new HoodieAvroDataBlock(records, header); + return new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); case HFILE_DATA_BLOCK: - return new HoodieHFileDataBlock(records, header); + return new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ, pathForReader); + case PARQUET_DATA_BLOCK: + return new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP); default: throw new RuntimeException("Unknown data block type " + dataBlockType); } } + + private static Stream testArguments() { + // Arg1: ExternalSpillableMap Type, Arg2: isDiskMapCompressionEnabled, Arg3: readBlocksLazily + return Stream.of( + arguments(ExternalSpillableMap.DiskMapType.BITCASK, false, false), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false, false), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, true, false), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true, false), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, false, true), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false, true), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, true, true), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true, true) + ); + } + + /** + * Utility to convert the given iterator to a List. 
+ */ + private static List getRecords(HoodieDataBlock dataBlock) { + ClosableIterator itr = dataBlock.getRecordIterator(); + + List elements = new ArrayList<>(); + itr.forEachRemaining(elements::add); + return elements; + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java index 71616f6683eda..6c4d69a05b296 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.functional; import org.apache.hudi.common.model.HoodieArchivedLogFile; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; @@ -104,13 +105,13 @@ public void testFailedToGetAppendStreamFromHDFSNameNode() Map header = new HashMap<>(2); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); + HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath) .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits.archive") .overBaseCommit("").withFs(fs).build(); - writer = writer.appendBlock(dataBlock); + writer.appendBlock(dataBlock); // get the current log file version to compare later int logFileVersion = writer.getLogFile().getLogVersion(); Path logFilePath = writer.getLogFile().getPath(); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/payload/TestAWSDmsAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestAWSDmsAvroPayload.java similarity index 78% rename from hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/payload/TestAWSDmsAvroPayload.java rename to hudi-common/src/test/java/org/apache/hudi/common/model/TestAWSDmsAvroPayload.java index 802096a3a74e1..07bc1d6f43e1f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/payload/TestAWSDmsAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestAWSDmsAvroPayload.java @@ -16,18 +16,17 @@ * limitations under the License. 
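Both test classes above switch to the data-block constructors that take an explicit record-key field, and they stop reassigning the result of appendBlock back to writer (appendBlock is now treated as a plain mutating call). A condensed view of the archive-log usage from the hunk above:

    HoodieAvroDataBlock dataBlock =
        new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);

    Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath)
        .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
        .withFileId("commits.archive")
        .overBaseCommit("").withFs(fs).build();

    writer.appendBlock(dataBlock);   // return value no longer reassigned to writer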
*/ -package org.apache.hudi.payload; - -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; -import org.apache.hudi.common.util.Option; +package org.apache.hudi.common.model; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; - +import org.apache.hudi.common.util.Option; import org.junit.jupiter.api.Test; +import java.util.Properties; + import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -45,13 +44,14 @@ public void testInsert() { Schema avroSchema = new Schema.Parser().parse(AVRO_SCHEMA_STRING); GenericRecord record = new GenericData.Record(avroSchema); + Properties properties = new Properties(); record.put("field1", 0); record.put("Op", "I"); AWSDmsAvroPayload payload = new AWSDmsAvroPayload(Option.of(record)); try { - Option outputPayload = payload.getInsertValue(avroSchema); + Option outputPayload = payload.getInsertValue(avroSchema, properties); assertTrue((int) outputPayload.get().get(0) == 0); assertTrue(outputPayload.get().get(1).toString().equals("I")); } catch (Exception e) { @@ -64,6 +64,7 @@ public void testInsert() { public void testUpdate() { Schema avroSchema = new Schema.Parser().parse(AVRO_SCHEMA_STRING); GenericRecord newRecord = new GenericData.Record(avroSchema); + Properties properties = new Properties(); newRecord.put("field1", 1); newRecord.put("Op", "U"); @@ -74,7 +75,7 @@ public void testUpdate() { AWSDmsAvroPayload payload = new AWSDmsAvroPayload(Option.of(newRecord)); try { - Option outputPayload = payload.combineAndGetUpdateValue(oldRecord, avroSchema); + Option outputPayload = payload.combineAndGetUpdateValue(oldRecord, avroSchema, properties); assertTrue((int) outputPayload.get().get(0) == 1); assertTrue(outputPayload.get().get(1).toString().equals("U")); } catch (Exception e) { @@ -87,6 +88,7 @@ public void testUpdate() { public void testDelete() { Schema avroSchema = new Schema.Parser().parse(AVRO_SCHEMA_STRING); GenericRecord deleteRecord = new GenericData.Record(avroSchema); + Properties properties = new Properties(); deleteRecord.put("field1", 2); deleteRecord.put("Op", "D"); @@ -97,8 +99,8 @@ public void testDelete() { AWSDmsAvroPayload payload = new AWSDmsAvroPayload(Option.of(deleteRecord)); try { - Option outputPayload = payload.combineAndGetUpdateValue(oldRecord, avroSchema); - // expect nothing to be comitted to table + Option outputPayload = payload.combineAndGetUpdateValue(oldRecord, avroSchema, properties); + // expect nothing to be committed to table assertFalse(outputPayload.isPresent()); } catch (Exception e) { fail("Unexpected exception"); @@ -106,10 +108,32 @@ public void testDelete() { } + @Test + public void testDeleteWithEmptyPayLoad() { + Schema avroSchema = new Schema.Parser().parse(AVRO_SCHEMA_STRING); + Properties properties = new Properties(); + + GenericRecord oldRecord = new GenericData.Record(avroSchema); + oldRecord.put("field1", 2); + oldRecord.put("Op", "U"); + + AWSDmsAvroPayload payload = new AWSDmsAvroPayload(Option.empty()); + + try { + Option outputPayload = payload.combineAndGetUpdateValue(oldRecord, avroSchema, properties); + // expect nothing to be committed to table + assertFalse(outputPayload.isPresent()); + } catch (Exception e) { + e.printStackTrace(); + fail("Unexpected exception"); + } + } + @Test public void testPreCombineWithDelete() { Schema avroSchema = new 
Schema.Parser().parse(AVRO_SCHEMA_STRING); GenericRecord deleteRecord = new GenericData.Record(avroSchema); + Properties properties = new Properties(); deleteRecord.put("field1", 4); deleteRecord.put("Op", "D"); @@ -122,8 +146,8 @@ public void testPreCombineWithDelete() { try { OverwriteWithLatestAvroPayload output = payload.preCombine(insertPayload); - Option outputPayload = output.getInsertValue(avroSchema); - // expect nothing to be comitted to table + Option outputPayload = output.getInsertValue(avroSchema, properties); + // expect nothing to be committed to table assertFalse(outputPayload.isPresent()); } catch (Exception e) { fail("Unexpected exception"); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestDefaultHoodieRecordPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestDefaultHoodieRecordPayload.java new file mode 100644 index 0000000000000..c0896e723ea07 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestDefaultHoodieRecordPayload.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.Schema.Type; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Properties; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Unit tests {@link DefaultHoodieRecordPayload}. 
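The TestAWSDmsAvroPayload changes above move the test into hudi-common and pass an explicit Properties instance to the payload API. A minimal sketch of the delete path the tests assert, assuming the same two-field schema with an "Op" column (AVRO_SCHEMA_STRING) and an oldRecord representing the previously stored row, as in the test; the calls throw a checked IOException, which the tests wrap in try/catch:

    Properties properties = new Properties();

    GenericRecord deleteRecord = new GenericData.Record(avroSchema);
    deleteRecord.put("field1", 2);
    deleteRecord.put("Op", "D");   // DMS delete marker

    AWSDmsAvroPayload payload = new AWSDmsAvroPayload(Option.of(deleteRecord));
    Option outputPayload = payload.combineAndGetUpdateValue(oldRecord, avroSchema, properties);

    // A delete op yields an empty Option, so nothing is committed to the table
    assertFalse(outputPayload.isPresent());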
+ */ +public class TestDefaultHoodieRecordPayload { + + private Schema schema; + private Properties props; + + @BeforeEach + public void setUp() throws Exception { + schema = Schema.createRecord(Arrays.asList( + new Schema.Field("id", Schema.create(Schema.Type.STRING), "", null), + new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", null), + new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null), + new Schema.Field("_hoodie_is_deleted", Schema.create(Type.BOOLEAN), "", false) + )); + props = new Properties(); + props.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, "ts"); + props.setProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY, "ts"); + } + + @Test + public void testActiveRecords() throws IOException { + GenericRecord record1 = new GenericData.Record(schema); + record1.put("id", "1"); + record1.put("partition", "partition0"); + record1.put("ts", 0L); + record1.put("_hoodie_is_deleted", false); + + GenericRecord record2 = new GenericData.Record(schema); + record2.put("id", "2"); + record2.put("partition", "partition1"); + record2.put("ts", 1L); + record2.put("_hoodie_is_deleted", false); + + DefaultHoodieRecordPayload payload1 = new DefaultHoodieRecordPayload(record1, 1); + DefaultHoodieRecordPayload payload2 = new DefaultHoodieRecordPayload(record2, 2); + assertEquals(payload1.preCombine(payload2, props), payload2); + assertEquals(payload2.preCombine(payload1, props), payload2); + + assertEquals(record1, payload1.getInsertValue(schema, props).get()); + assertEquals(record2, payload2.getInsertValue(schema, props).get()); + + assertEquals(payload1.combineAndGetUpdateValue(record2, schema, props).get(), record2); + assertEquals(payload2.combineAndGetUpdateValue(record1, schema, props).get(), record2); + } + + @Test + public void testDeletedRecord() throws IOException { + GenericRecord record1 = new GenericData.Record(schema); + record1.put("id", "1"); + record1.put("partition", "partition0"); + record1.put("ts", 0L); + record1.put("_hoodie_is_deleted", false); + + GenericRecord delRecord1 = new GenericData.Record(schema); + delRecord1.put("id", "2"); + delRecord1.put("partition", "partition1"); + delRecord1.put("ts", 1L); + delRecord1.put("_hoodie_is_deleted", true); + + DefaultHoodieRecordPayload payload1 = new DefaultHoodieRecordPayload(record1, 1); + DefaultHoodieRecordPayload payload2 = new DefaultHoodieRecordPayload(delRecord1, 2); + assertEquals(payload1.preCombine(payload2, props), payload2); + assertEquals(payload2.preCombine(payload1, props), payload2); + + assertEquals(record1, payload1.getInsertValue(schema, props).get()); + assertFalse(payload2.getInsertValue(schema, props).isPresent()); + + assertEquals(payload1.combineAndGetUpdateValue(delRecord1, schema, props).get(), delRecord1); + assertFalse(payload2.combineAndGetUpdateValue(record1, schema, props).isPresent()); + } + + @Test + public void testGetEmptyMetadata() { + GenericRecord record = new GenericData.Record(schema); + record.put("id", "1"); + record.put("partition", "partition0"); + record.put("ts", 0L); + record.put("_hoodie_is_deleted", false); + DefaultHoodieRecordPayload payload = new DefaultHoodieRecordPayload(Option.of(record)); + assertFalse(payload.getMetadata().isPresent()); + } + + @ParameterizedTest + @ValueSource(longs = {1L, 1612542030000L}) + public void testGetEventTimeInMetadata(long eventTime) throws IOException { + GenericRecord record1 = new GenericData.Record(schema); + record1.put("id", "1"); + record1.put("partition", "partition0"); + 
record1.put("ts", 0L); + record1.put("_hoodie_is_deleted", false); + + GenericRecord record2 = new GenericData.Record(schema); + record2.put("id", "1"); + record2.put("partition", "partition0"); + record2.put("ts", eventTime); + record2.put("_hoodie_is_deleted", false); + + DefaultHoodieRecordPayload payload2 = new DefaultHoodieRecordPayload(record2, eventTime); + payload2.combineAndGetUpdateValue(record1, schema, props); + assertTrue(payload2.getMetadata().isPresent()); + assertEquals(eventTime, + Long.parseLong(payload2.getMetadata().get().get(DefaultHoodieRecordPayload.METADATA_EVENT_TIME_KEY))); + } + + @Test + public void testEmptyProperty() throws IOException { + GenericRecord record1 = new GenericData.Record(schema); + record1.put("id", "1"); + record1.put("partition", "partition0"); + record1.put("ts", 0L); + record1.put("_hoodie_is_deleted", false); + + GenericRecord record2 = new GenericData.Record(schema); + record2.put("id", "1"); + record2.put("partition", "partition0"); + record2.put("ts", 1L); + record2.put("_hoodie_is_deleted", false); + + DefaultHoodieRecordPayload payload = new DefaultHoodieRecordPayload(Option.of(record1)); + Properties properties = new Properties(); + payload.getInsertValue(schema, properties); + payload.combineAndGetUpdateValue(record2, schema, properties); + } + + @ParameterizedTest + @ValueSource(longs = {1L, 1612542030000L}) + public void testGetEventTimeInMetadataForInserts(long eventTime) throws IOException { + GenericRecord record = new GenericData.Record(schema); + + record.put("id", "1"); + record.put("partition", "partition0"); + record.put("ts", eventTime); + record.put("_hoodie_is_deleted", false); + DefaultHoodieRecordPayload payload = new DefaultHoodieRecordPayload(record, eventTime); + payload.getInsertValue(schema, props); + assertTrue(payload.getMetadata().isPresent()); + assertEquals(eventTime, + Long.parseLong(payload.getMetadata().get().get(DefaultHoodieRecordPayload.METADATA_EVENT_TIME_KEY))); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java index 0eaaff1267d43..e8c159540a3d3 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieCommitMetadata.java @@ -19,11 +19,16 @@ package org.apache.hudi.common.model; import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.common.util.JsonUtils; import org.junit.jupiter.api.Test; +import java.io.IOException; +import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertSame; @@ -34,6 +39,30 @@ */ public class TestHoodieCommitMetadata { + private static final List EXPECTED_FIELD_NAMES = Arrays.asList( + "partitionToWriteStats", "compacted", "extraMetadata", "operationType"); + + public static void verifyMetadataFieldNames( + HoodieCommitMetadata commitMetadata, List expectedFieldNameList) + throws IOException { + String serializedCommitMetadata = commitMetadata.toJsonString(); + List actualFieldNameList = CollectionUtils.toStream( + JsonUtils.getObjectMapper().readTree(serializedCommitMetadata).fieldNames()) + .collect(Collectors.toList()); + assertEquals( + 
expectedFieldNameList.stream().sorted().collect(Collectors.toList()), + actualFieldNameList.stream().sorted().collect(Collectors.toList()) + ); + } + + @Test + public void verifyFieldNamesInCommitMetadata() throws IOException { + List fakeHoodieWriteStats = HoodieTestUtils.generateFakeHoodieWriteStat(10); + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + fakeHoodieWriteStats.forEach(stat -> commitMetadata.addWriteStat(stat.getPartitionPath(), stat)); + verifyMetadataFieldNames(commitMetadata, EXPECTED_FIELD_NAMES); + } + @Test public void testPerfStatPresenceInHoodieMetadata() throws Exception { @@ -48,8 +77,9 @@ public void testPerfStatPresenceInHoodieMetadata() throws Exception { String serializedCommitMetadata = commitMetadata.toJsonString(); HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(serializedCommitMetadata, HoodieCommitMetadata.class); - // Make sure timing metrics are not written to instant file - assertEquals(0, (long) metadata.getTotalScanTime()); + assertTrue(commitMetadata.getTotalCreateTime() > 0); + assertTrue(commitMetadata.getTotalUpsertTime() > 0); + assertTrue(commitMetadata.getTotalScanTime() > 0); assertTrue(metadata.getTotalLogFilesCompacted() > 0); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieConsistentHashingMetadata.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieConsistentHashingMetadata.java new file mode 100644 index 0000000000000..8aa2e65561c59 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieConsistentHashingMetadata.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class TestHoodieConsistentHashingMetadata { + + @Test + public void testGetTimestamp() { + Assertions.assertTrue(HoodieConsistentHashingMetadata.getTimestampFromFile("0000.hashing_metadata").equals("0000")); + Assertions.assertTrue(HoodieConsistentHashingMetadata.getTimestampFromFile("1234.hashing_metadata").equals("1234")); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieDeltaWriteStat.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieDeltaWriteStat.java new file mode 100644 index 0000000000000..b774e06cea6d3 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieDeltaWriteStat.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.table.HoodieTableConfig; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests hoodie delta write stat {@link HoodieDeltaWriteStat}. + */ +public class TestHoodieDeltaWriteStat { + + @Test + public void testBaseFileAndLogFiles() { + HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat(); + String baseFile = "file1" + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); + String logFile1 = ".log1.log"; + String logFile2 = ".log2.log"; + + writeStat.setBaseFile(baseFile); + writeStat.addLogFiles(logFile1); + writeStat.addLogFiles(logFile2); + assertTrue(writeStat.getLogFiles().contains(logFile1)); + assertTrue(writeStat.getLogFiles().contains(logFile2)); + assertEquals(baseFile, writeStat.getBaseFile()); + + writeStat.setLogFiles(new ArrayList<>()); + assertTrue(writeStat.getLogFiles().isEmpty()); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieFileGroup.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieFileGroup.java new file mode 100644 index 0000000000000..91a2019f10b7c --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieFileGroup.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.MockHoodieTimeline; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieFileGroup { + + @Test + public void testCommittedFileSlices() { + // "000" is archived + Stream completed = Arrays.asList("001").stream(); + Stream inflight = Arrays.asList("002").stream(); + MockHoodieTimeline activeTimeline = new MockHoodieTimeline(completed, inflight); + HoodieFileGroup fileGroup = new HoodieFileGroup("", "data", + activeTimeline.getCommitsTimeline().filterCompletedInstants()); + for (int i = 0; i < 3; i++) { + HoodieBaseFile baseFile = new HoodieBaseFile("data_1_00" + i); + fileGroup.addBaseFile(baseFile); + } + assertEquals(2, fileGroup.getAllFileSlices().count()); + assertTrue(!fileGroup.getAllFileSlices().anyMatch(s -> s.getBaseInstantTime().equals("002"))); + assertEquals(3, fileGroup.getAllFileSlicesIncludingInflight().count()); + assertTrue(fileGroup.getLatestFileSlice().get().getBaseInstantTime().equals("001")); + assertTrue((new HoodieFileGroup(fileGroup)).getLatestFileSlice().get().getBaseInstantTime().equals("001")); + } + + @Test + public void testCommittedFileSlicesWithSavepointAndHoles() { + MockHoodieTimeline activeTimeline = new MockHoodieTimeline(Stream.of( + new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "01"), + new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, "01"), + new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "03"), + new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, "03"), + new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "05") // this can be DELTA_COMMIT/REPLACE_COMMIT as well + ).collect(Collectors.toList())); + HoodieFileGroup fileGroup = new HoodieFileGroup("", "data", activeTimeline.filterCompletedAndCompactionInstants()); + for (int i = 0; i < 7; i++) { + HoodieBaseFile baseFile = new HoodieBaseFile("data_1_0" + i); + fileGroup.addBaseFile(baseFile); + } + List allFileSlices = fileGroup.getAllFileSlices().collect(Collectors.toList()); + assertEquals(6, allFileSlices.size()); + assertTrue(!allFileSlices.stream().anyMatch(s -> s.getBaseInstantTime().equals("06"))); + assertEquals(7, fileGroup.getAllFileSlicesIncludingInflight().count()); + assertTrue(fileGroup.getLatestFileSlice().get().getBaseInstantTime().equals("05")); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodiePartitionMetadata.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodiePartitionMetadata.java new file mode 100644 index 0000000000000..3ec15d4f65d12 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodiePartitionMetadata.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.Arrays; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodiePartitionMetadata extends HoodieCommonTestHarness { + + FileSystem fs; + + @BeforeEach + public void setupTest() throws IOException { + initMetaClient(); + fs = metaClient.getFs(); + } + + static Stream formatProviderFn() { + return Stream.of( + Arguments.arguments(Option.empty()), + Arguments.arguments(Option.of(HoodieFileFormat.PARQUET)), + Arguments.arguments(Option.of(HoodieFileFormat.ORC)) + ); + } + + @ParameterizedTest + @MethodSource("formatProviderFn") + public void testTextFormatMetaFile(Option format) throws IOException { + // given + final Path partitionPath = new Path(basePath, "a/b/" + + format.map(Enum::name).orElse("text")); + fs.mkdirs(partitionPath); + final String commitTime = "000000000001"; + HoodiePartitionMetadata writtenMetadata = new HoodiePartitionMetadata(metaClient.getFs(), commitTime, new Path(basePath), partitionPath, format); + writtenMetadata.trySave(0); + + // when + HoodiePartitionMetadata readMetadata = new HoodiePartitionMetadata(metaClient.getFs(), new Path(metaClient.getBasePath(), partitionPath)); + + // then + assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, partitionPath)); + assertEquals(Option.of(commitTime), readMetadata.readPartitionCreatedCommitTime()); + assertEquals(3, readMetadata.getPartitionDepth()); + } + + @Test + public void testErrorIfAbsent() throws IOException { + final Path partitionPath = new Path(basePath, "a/b/not-a-partition"); + fs.mkdirs(partitionPath); + HoodiePartitionMetadata readMetadata = new HoodiePartitionMetadata(metaClient.getFs(), new Path(metaClient.getBasePath(), partitionPath)); + assertThrows(HoodieException.class, readMetadata::readPartitionCreatedCommitTime); + } + + @Test + public void testFileNames() { + assertEquals(new Path("/a/b/c/.hoodie_partition_metadata"), HoodiePartitionMetadata.textFormatMetaFilePath(new Path("/a/b/c"))); + assertEquals(Arrays.asList(new Path("/a/b/c/.hoodie_partition_metadata.parquet"), + new Path("/a/b/c/.hoodie_partition_metadata.orc")), HoodiePartitionMetadata.baseFormatMetaFilePaths(new Path("/a/b/c"))); + } +} diff --git 
a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java index e31286d10c2cf..b6bbc34cc3de9 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java @@ -44,7 +44,7 @@ public class TestHoodieRecord { public void setUp() throws Exception { final List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); final List hoodieRecords = - indexedRecords.stream().map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + indexedRecords.stream().map(r -> new HoodieAvroRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); hoodieRecord = hoodieRecords.get(0); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieReplaceCommitMetadata.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieReplaceCommitMetadata.java new file mode 100644 index 0000000000000..f2c0c1c043bf8 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieReplaceCommitMetadata.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.model; + +import org.apache.hudi.common.testutils.HoodieTestUtils; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.apache.hudi.common.model.TestHoodieCommitMetadata.verifyMetadataFieldNames; + +public class TestHoodieReplaceCommitMetadata { + + private static final List EXPECTED_FIELD_NAMES = Arrays.asList( + "partitionToWriteStats", "partitionToReplaceFileIds", "compacted", "extraMetadata", "operationType"); + + @Test + public void verifyFieldNamesInReplaceCommitMetadata() throws IOException { + List fakeHoodieWriteStats = HoodieTestUtils.generateFakeHoodieWriteStat(10); + HoodieReplaceCommitMetadata commitMetadata = new HoodieReplaceCommitMetadata(); + fakeHoodieWriteStats.forEach(stat -> { + commitMetadata.addWriteStat(stat.getPartitionPath(), stat); + commitMetadata.addReplaceFileId(stat.getPartitionPath(), stat.getFileId()); + }); + verifyMetadataFieldNames(commitMetadata, EXPECTED_FIELD_NAMES); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java index 7136ce7d372bb..631c7cd41a385 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java @@ -19,14 +19,13 @@ package org.apache.hudi.common.model; import org.apache.hudi.common.fs.FSUtils; - +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.Test; import java.util.Date; import java.util.UUID; -import static org.apache.hudi.common.table.timeline.HoodieActiveTimeline.COMMIT_FORMATTER; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -37,7 +36,7 @@ public class TestHoodieWriteStat { @Test public void testSetPaths() { - String instantTime = COMMIT_FORMATTER.format(new Date()); + String instantTime = HoodieActiveTimeline.formatDate(new Date()); String basePathString = "/data/tables/some-hoodie-table"; String partitionPathString = "2017/12/31"; String fileName = UUID.randomUUID().toString(); @@ -46,7 +45,7 @@ public void testSetPaths() { Path basePath = new Path(basePathString); Path partitionPath = new Path(basePath, partitionPathString); - Path finalizeFilePath = new Path(partitionPath, FSUtils.makeDataFileName(instantTime, writeToken, fileName)); + Path finalizeFilePath = new Path(partitionPath, FSUtils.makeBaseFileName(instantTime, writeToken, fileName)); HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setPath(basePath, finalizeFilePath); assertEquals(finalizeFilePath, new Path(basePath, writeStat.getPath())); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java index cce492a5ed117..4b7e4bda0b36c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java @@ -18,17 +18,22 @@ package org.apache.hudi.common.model; +import org.apache.avro.JsonProperties; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import 
org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.avro.HoodieAvroUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; /** * Unit tests {@link TestOverwriteNonDefaultsWithLatestAvroPayload}. @@ -43,18 +48,22 @@ public void setUp() throws Exception { new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", ""), new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null), new Schema.Field("_hoodie_is_deleted", Schema.create(Schema.Type.BOOLEAN), "", false), - new Schema.Field("city", Schema.create(Schema.Type.STRING), "", "NY") + new Schema.Field("city", Schema.create(Schema.Type.STRING), "", "NY"), + new Schema.Field("child", Schema.createArray(Schema.create(Schema.Type.STRING)), "", Collections.emptyList()) )); } @Test public void testActiveRecords() throws IOException { + Schema writerSchema = HoodieAvroUtils.addMetadataFields(schema); + GenericRecord record1 = new GenericData.Record(schema); record1.put("id", "1"); record1.put("partition", "partition1"); record1.put("ts", 0L); record1.put("_hoodie_is_deleted", false); record1.put("city", "NY0"); + record1.put("child", Collections.singletonList("A")); GenericRecord record2 = new GenericData.Record(schema); record2.put("id", "2"); @@ -62,6 +71,7 @@ public void testActiveRecords() throws IOException { record2.put("ts", 1L); record2.put("_hoodie_is_deleted", false); record2.put("city", "NY"); + record2.put("child", Collections.emptyList()); GenericRecord record3 = new GenericData.Record(schema); record3.put("id", "2"); @@ -69,18 +79,64 @@ public void testActiveRecords() throws IOException { record3.put("ts", 1L); record3.put("_hoodie_is_deleted", false); record3.put("city", "NY0"); - + record3.put("child", Collections.singletonList("A")); + + // same content with record1 plus metadata fields + GenericRecord record4 = createRecordWithMetadataFields(writerSchema, "1", "partition1"); + record4.put("id", "1"); + record4.put("partition", "partition1"); + record4.put("ts", 0L); + record4.put("_hoodie_is_deleted", false); + record4.put("city", "NY0"); + record4.put("child", Collections.singletonList("A")); + + // same content with record2 plus metadata fields + GenericRecord record5 = createRecordWithMetadataFields(writerSchema, "2", ""); + record5.put("id", "2"); + record5.put("partition", ""); + record5.put("ts", 1L); + record5.put("_hoodie_is_deleted", false); + record5.put("city", "NY"); + record5.put("child", Collections.emptyList()); + + // same content with record3 plus metadata fields + GenericRecord record6 = createRecordWithMetadataFields(writerSchema, "2", ""); + record6.put("id", "2"); + record6.put("partition", "partition1"); + record6.put("ts", 1L); + record6.put("_hoodie_is_deleted", false); + record6.put("city", "NY0"); + record6.put("child", Collections.singletonList("A")); OverwriteNonDefaultsWithLatestAvroPayload payload1 = new OverwriteNonDefaultsWithLatestAvroPayload(record1, 1); OverwriteNonDefaultsWithLatestAvroPayload payload2 = new OverwriteNonDefaultsWithLatestAvroPayload(record2, 2); + OverwriteNonDefaultsWithLatestAvroPayload payload5 = new OverwriteNonDefaultsWithLatestAvroPayload(record5, 2); assertEquals(payload1.preCombine(payload2), payload2); 
assertEquals(payload2.preCombine(payload1), payload2); assertEquals(record1, payload1.getInsertValue(schema).get()); assertEquals(record2, payload2.getInsertValue(schema).get()); - assertEquals(payload1.combineAndGetUpdateValue(record2, schema).get(), record1); - assertEquals(payload2.combineAndGetUpdateValue(record1, schema).get(), record3); + IndexedRecord combinedVal1 = payload1.combineAndGetUpdateValue(record2, schema).get(); + assertEquals(combinedVal1, record1); + assertNotSame(combinedVal1, record1); + + IndexedRecord combinedVal2 = payload2.combineAndGetUpdateValue(record1, schema).get(); + assertEquals(combinedVal2, record3); + assertNotSame(combinedVal2, record3); + + // the real case in production is: the current record to be combined includes the metadata fields, + // the payload record could include the metadata fields (for compaction) or not (for normal writer path). + + // case1: validate normal writer path + IndexedRecord combinedVal3 = payload2.combineAndGetUpdateValue(record4, schema).get(); + assertEquals(combinedVal3, record3); + assertNotSame(combinedVal3, record3); + + // case2: validate compaction path + IndexedRecord combinedVal4 = payload5.combineAndGetUpdateValue(record4, writerSchema).get(); + assertEquals(combinedVal4, record6); + assertNotSame(combinedVal4, record6); } @Test @@ -91,6 +147,7 @@ public void testDeletedRecord() throws IOException { record1.put("ts", 0L); record1.put("_hoodie_is_deleted", false); record1.put("city", "NY0"); + record1.put("child", Collections.emptyList()); GenericRecord delRecord1 = new GenericData.Record(schema); delRecord1.put("id", "2"); @@ -98,6 +155,7 @@ public void testDeletedRecord() throws IOException { delRecord1.put("ts", 1L); delRecord1.put("_hoodie_is_deleted", true); delRecord1.put("city", "NY0"); + delRecord1.put("child", Collections.emptyList()); GenericRecord record2 = new GenericData.Record(schema); record2.put("id", "1"); @@ -105,6 +163,7 @@ public void testDeletedRecord() throws IOException { record2.put("ts", 0L); record2.put("_hoodie_is_deleted", true); record2.put("city", "NY0"); + record2.put("child", Collections.emptyList()); OverwriteNonDefaultsWithLatestAvroPayload payload1 = new OverwriteNonDefaultsWithLatestAvroPayload(record1, 1); OverwriteNonDefaultsWithLatestAvroPayload payload2 = new OverwriteNonDefaultsWithLatestAvroPayload(delRecord1, 2); @@ -118,4 +177,44 @@ public void testDeletedRecord() throws IOException { assertEquals(payload1.combineAndGetUpdateValue(delRecord1, schema).get(), record2); assertFalse(payload2.combineAndGetUpdateValue(record1, schema).isPresent()); } + + @Test + public void testNullColumn() throws IOException { + Schema avroSchema = Schema.createRecord(Arrays.asList( + new Schema.Field("id", Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)), "", JsonProperties.NULL_VALUE), + new Schema.Field("name", Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)), "", JsonProperties.NULL_VALUE), + new Schema.Field("age", Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)), "", JsonProperties.NULL_VALUE), + new Schema.Field("job", Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)), "", JsonProperties.NULL_VALUE) + )); + GenericRecord record1 = new GenericData.Record(avroSchema); + record1.put("id", "1"); + record1.put("name", "aa"); + record1.put("age", "1"); + record1.put("job", "1"); + + GenericRecord record2 = new GenericData.Record(avroSchema); + 
record2.put("id", "1"); + record2.put("name", "bb"); + record2.put("age", "2"); + record2.put("job", null); + + GenericRecord record3 = new GenericData.Record(avroSchema); + record3.put("id", "1"); + record3.put("name", "bb"); + record3.put("age", "2"); + record3.put("job", "1"); + + OverwriteNonDefaultsWithLatestAvroPayload payload2 = new OverwriteNonDefaultsWithLatestAvroPayload(record2, 1); + assertEquals(payload2.combineAndGetUpdateValue(record1, avroSchema).get(), record3); + } + + private static GenericRecord createRecordWithMetadataFields(Schema schema, String recordKey, String partitionPath) { + GenericRecord record = new GenericData.Record(schema); + record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, "001"); + record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, "123"); + record.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey); + record.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath); + record.put(HoodieRecord.FILENAME_METADATA_FIELD, "file1"); + return record; + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java new file mode 100644 index 0000000000000..f9e922622d6c1 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or mo contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieDebeziumAvroPayloadException; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Objects; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestMySqlDebeziumAvroPayload { + + private static final String KEY_FIELD_NAME = "Key"; + + private Schema avroSchema; + + @BeforeEach + void setUp() { + this.avroSchema = Schema.createRecord(Arrays.asList( + new Schema.Field(KEY_FIELD_NAME, Schema.create(Schema.Type.INT), "", 0), + new Schema.Field(DebeziumConstants.FLATTENED_OP_COL_NAME, + Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)), "", null), + new Schema.Field(DebeziumConstants.ADDED_SEQ_COL_NAME, + Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)), "", null) + )); + } + + @Test + public void testInsert() throws IOException { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, "00001.111"); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(insertRecord, "00001.111"); + validateRecord(payload.getInsertValue(avroSchema), 0, Operation.INSERT, "00001.111"); + } + + @Test + public void testPreCombine() { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, "00002.111"); + MySqlDebeziumAvroPayload insertPayload = new MySqlDebeziumAvroPayload(insertRecord, "00002.111"); + + GenericRecord updateRecord = createRecord(0, Operation.UPDATE, "00001.111"); + MySqlDebeziumAvroPayload updatePayload = new MySqlDebeziumAvroPayload(updateRecord, "00001.111"); + + GenericRecord deleteRecord = createRecord(0, Operation.DELETE, "00002.11"); + MySqlDebeziumAvroPayload deletePayload = new MySqlDebeziumAvroPayload(deleteRecord, "00002.11"); + + assertEquals(insertPayload, insertPayload.preCombine(updatePayload)); + assertEquals(deletePayload, deletePayload.preCombine(updatePayload)); + assertEquals(insertPayload, deletePayload.preCombine(insertPayload)); + } + + @Test + public void testMergeWithUpdate() throws IOException { + GenericRecord updateRecord = createRecord(1, Operation.UPDATE, "00002.11"); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(updateRecord, "00002.11"); + + GenericRecord existingRecord = createRecord(1, Operation.INSERT, "00001.111"); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.UPDATE, "00002.11"); + + GenericRecord lateRecord = createRecord(1, Operation.UPDATE, "00000.222"); + payload = new MySqlDebeziumAvroPayload(lateRecord, "00000.222"); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.INSERT, "00001.111"); + } + + @Test + public void testMergeWithDelete() throws IOException { + GenericRecord deleteRecord = createRecord(2, Operation.DELETE, "00002.11"); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(deleteRecord, "00002.11"); + + GenericRecord existingRecord = createRecord(2, 
Operation.UPDATE, "00001.111"); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + // expect nothing to be committed to table + assertFalse(mergedRecord.isPresent()); + + GenericRecord lateRecord = createRecord(2, Operation.DELETE, "00000.222"); + payload = new MySqlDebeziumAvroPayload(lateRecord, "00000.222"); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 2, Operation.UPDATE, "00001.111"); + } + + @Test + public void testMergeWithBootstrappedExistingRecords() throws IOException { + GenericRecord incomingRecord = createRecord(3, Operation.UPDATE, "00002.111"); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(incomingRecord, "00002.111"); + + GenericRecord existingRecord = createRecord(3, null, null); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 3, Operation.UPDATE, "00002.111"); + } + + @Test + public void testInvalidIncomingRecord() { + GenericRecord incomingRecord = createRecord(4, null, null); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(incomingRecord, "00002.111"); + + GenericRecord existingRecord = createRecord(4, Operation.INSERT, "00001.111"); + assertThrows(HoodieDebeziumAvroPayloadException.class, + () -> payload.combineAndGetUpdateValue(existingRecord, avroSchema), + "should have thrown because event seq value of the incoming record is null"); + } + + private GenericRecord createRecord(int primaryKeyValue, @Nullable Operation op, @Nullable String seqValue) { + GenericRecord record = new GenericData.Record(avroSchema); + record.put(KEY_FIELD_NAME, primaryKeyValue); + record.put(DebeziumConstants.FLATTENED_OP_COL_NAME, Objects.toString(op, null)); + record.put(DebeziumConstants.ADDED_SEQ_COL_NAME, seqValue); + return record; + } + + private void validateRecord(Option iRecord, int primaryKeyValue, Operation op, String seqValue) { + IndexedRecord record = iRecord.get(); + assertEquals(primaryKeyValue, (int) record.get(0)); + assertEquals(op.op, record.get(1).toString()); + assertEquals(seqValue, record.get(2).toString()); + } + + private enum Operation { + INSERT("c"), + UPDATE("u"), + DELETE("d"); + + public final String op; + + Operation(String op) { + this.op = op; + } + + @Override + public String toString() { + return op; + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java new file mode 100644 index 0000000000000..78599afc1fe16 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or mo contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieDebeziumAvroPayloadException; + +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.util.Utf8; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Objects; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestPostgresDebeziumAvroPayload { + + private static final String KEY_FIELD_NAME = "Key"; + private Schema avroSchema; + + @BeforeEach + void setUp() { + this.avroSchema = Schema.createRecord(Arrays.asList( + new Schema.Field(KEY_FIELD_NAME, Schema.create(Schema.Type.INT), "", 0), + new Schema.Field(DebeziumConstants.FLATTENED_OP_COL_NAME, + Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)), "", null), + new Schema.Field(DebeziumConstants.FLATTENED_LSN_COL_NAME, + Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.LONG)), "", null) + )); + } + + @Test + public void testInsert() throws IOException { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, 100L); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(insertRecord, 100L); + validateRecord(payload.getInsertValue(avroSchema), 0, Operation.INSERT, 100L); + } + + @Test + public void testPreCombine() { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, 120L); + PostgresDebeziumAvroPayload insertPayload = new PostgresDebeziumAvroPayload(insertRecord, 120L); + + GenericRecord updateRecord = createRecord(0, Operation.UPDATE, 99L); + PostgresDebeziumAvroPayload updatePayload = new PostgresDebeziumAvroPayload(updateRecord, 99L); + + GenericRecord deleteRecord = createRecord(0, Operation.DELETE, 111L); + PostgresDebeziumAvroPayload deletePayload = new PostgresDebeziumAvroPayload(deleteRecord, 111L); + + assertEquals(insertPayload, insertPayload.preCombine(updatePayload)); + assertEquals(deletePayload, deletePayload.preCombine(updatePayload)); + assertEquals(insertPayload, deletePayload.preCombine(insertPayload)); + } + + @Test + public void testMergeWithUpdate() throws IOException { + GenericRecord updateRecord = createRecord(1, Operation.UPDATE, 100L); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(updateRecord, 100L); + + GenericRecord existingRecord = createRecord(1, Operation.INSERT, 99L); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.UPDATE, 100L); + + GenericRecord lateRecord = createRecord(1, Operation.UPDATE, 98L); + payload = new PostgresDebeziumAvroPayload(lateRecord, 98L); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.INSERT, 99L); + } + + @Test + public void testMergeWithDelete() throws IOException { + 
GenericRecord deleteRecord = createRecord(2, Operation.DELETE, 100L); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(deleteRecord, 100L); + + GenericRecord existingRecord = createRecord(2, Operation.UPDATE, 99L); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + // expect nothing to be committed to table + assertFalse(mergedRecord.isPresent()); + + GenericRecord lateRecord = createRecord(2, Operation.DELETE, 98L); + payload = new PostgresDebeziumAvroPayload(lateRecord, 98L); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 2, Operation.UPDATE, 99L); + } + + @Test + public void testMergeWithBootstrappedExistingRecords() throws IOException { + GenericRecord incomingRecord = createRecord(3, Operation.UPDATE, 100L); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(incomingRecord, 100L); + + GenericRecord existingRecord = createRecord(3, null, null); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 3, Operation.UPDATE, 100L); + } + + @Test + public void testInvalidIncomingRecord() { + GenericRecord incomingRecord = createRecord(4, null, null); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(incomingRecord, 100L); + + GenericRecord existingRecord = createRecord(4, Operation.INSERT, 99L); + assertThrows(HoodieDebeziumAvroPayloadException.class, + () -> payload.combineAndGetUpdateValue(existingRecord, avroSchema), + "should have thrown because LSN value of the incoming record is null"); + } + + @Test + public void testMergeWithToastedValues() throws IOException { + Schema avroSchema = SchemaBuilder.builder() + .record("test_schema") + .namespace("test_namespace") + .fields() + .name(DebeziumConstants.FLATTENED_LSN_COL_NAME).type().longType().noDefault() + .name("string_col").type().stringType().noDefault() + .name("byte_col").type().bytesType().noDefault() + .name("string_null_col_1").type().nullable().stringType().noDefault() + .name("byte_null_col_1").type().nullable().bytesType().noDefault() + .name("string_null_col_2").type().nullable().stringType().noDefault() + .name("byte_null_col_2").type().nullable().bytesType().noDefault() + .endRecord(); + + GenericRecord oldVal = new GenericData.Record(avroSchema); + oldVal.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, 100L); + oldVal.put("string_col", "valid string value"); + oldVal.put("byte_col", ByteBuffer.wrap("valid byte value".getBytes())); + oldVal.put("string_null_col_1", "valid string value"); + oldVal.put("byte_null_col_1", ByteBuffer.wrap("valid byte value".getBytes())); + oldVal.put("string_null_col_2", null); + oldVal.put("byte_null_col_2", null); + + GenericRecord newVal = new GenericData.Record(avroSchema); + newVal.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, 105L); + newVal.put("string_col", PostgresDebeziumAvroPayload.DEBEZIUM_TOASTED_VALUE); + newVal.put("byte_col", ByteBuffer.wrap(PostgresDebeziumAvroPayload.DEBEZIUM_TOASTED_VALUE.getBytes())); + newVal.put("string_null_col_1", null); + newVal.put("byte_null_col_1", null); + newVal.put("string_null_col_2", "valid string value"); + newVal.put("byte_null_col_2", ByteBuffer.wrap("valid byte value".getBytes())); + + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(Option.of(newVal)); + + GenericRecord outputRecord = (GenericRecord) payload + .combineAndGetUpdateValue(oldVal, avroSchema).get(); + + assertEquals("valid string 
value", outputRecord.get("string_col")); + assertEquals("valid byte value", new String(((ByteBuffer) outputRecord.get("byte_col")).array(), StandardCharsets.UTF_8)); + assertNull(outputRecord.get("string_null_col_1")); + assertNull(outputRecord.get("byte_null_col_1")); + assertEquals("valid string value", ((Utf8) outputRecord.get("string_null_col_2")).toString()); + assertEquals("valid byte value", new String(((ByteBuffer) outputRecord.get("byte_null_col_2")).array(), StandardCharsets.UTF_8)); + } + + private GenericRecord createRecord(int primaryKeyValue, @Nullable Operation op, @Nullable Long lsnValue) { + GenericRecord record = new GenericData.Record(avroSchema); + record.put(KEY_FIELD_NAME, primaryKeyValue); + record.put(DebeziumConstants.FLATTENED_OP_COL_NAME, Objects.toString(op, null)); + record.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, lsnValue); + return record; + } + + private void validateRecord(Option iRecord, int primaryKeyValue, Operation op, long lsnValue) { + IndexedRecord record = iRecord.get(); + assertEquals(primaryKeyValue, (int) record.get(0)); + assertEquals(op.op, record.get(1).toString()); + assertEquals(lsnValue, (long) record.get(2)); + } + + private enum Operation { + INSERT("c"), + UPDATE("u"), + DELETE("d"); + + public final String op; + + Operation(String op) { + this.op = op; + } + + @Override + public String toString() { + return op; + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/properties/TestOrderedProperties.java b/hudi-common/src/test/java/org/apache/hudi/common/properties/TestOrderedProperties.java new file mode 100644 index 0000000000000..c75df04a2a633 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/properties/TestOrderedProperties.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.properties; + +import org.apache.hudi.common.config.OrderedProperties; + +import org.junit.jupiter.api.Test; + +import java.util.Properties; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestOrderedProperties { + + @Test + public void testPutPropertiesOrder() { + Properties properties = new OrderedProperties(); + properties.put("key0", "true"); + properties.put("key1", "false"); + properties.put("key2", "true"); + properties.put("key3", "false"); + properties.put("key4", "true"); + properties.put("key5", "true"); + properties.put("key6", "false"); + properties.put("key7", "true"); + properties.put("key8", "false"); + properties.put("key9", "true"); + + OrderedProperties typedProperties = new OrderedProperties(properties); + assertTypeProperties(typedProperties, 0); + } + + @Test + void testPutAllPropertiesOrder() { + Properties firstProp = new OrderedProperties(); + firstProp.put("key0", "true"); + firstProp.put("key1", "false"); + firstProp.put("key2", "true"); + + OrderedProperties firstProperties = new OrderedProperties(firstProp); + assertTypeProperties(firstProperties, 0); + + OrderedProperties secondProperties = new OrderedProperties(); + secondProperties.put("key3", "true"); + secondProperties.put("key4", "false"); + secondProperties.put("key5", "true"); + assertTypeProperties(secondProperties, 3); + + OrderedProperties thirdProperties = new OrderedProperties(); + thirdProperties.putAll(firstProp); + thirdProperties.putAll(secondProperties); + + assertEquals(3, firstProp.stringPropertyNames().size()); + assertEquals(3, secondProperties.stringPropertyNames().size()); + assertEquals(6, thirdProperties.stringPropertyNames().size()); + } + + private void assertTypeProperties(OrderedProperties typedProperties, int start) { + String[] props = typedProperties.stringPropertyNames().toArray(new String[0]); + for (int i = start; i < props.length; i++) { + assertEquals(String.format("key%d", i), props[i]); + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java b/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java index 95955d4d72a27..954b53651af5c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java @@ -19,11 +19,14 @@ package org.apache.hudi.common.properties; import org.apache.hudi.common.config.TypedProperties; + import org.junit.jupiter.api.Test; import java.util.Properties; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class TestTypedProperties { @Test @@ -77,8 +80,30 @@ public void testGetBoolean() { properties.put("key1", "true"); TypedProperties typedProperties = new TypedProperties(properties); - assertEquals(true, typedProperties.getBoolean("key1")); - assertEquals(true, typedProperties.getBoolean("key1", false)); - assertEquals(false, typedProperties.getBoolean("key2", false)); + assertTrue(typedProperties.getBoolean("key1")); + assertTrue(typedProperties.getBoolean("key1", false)); + assertFalse(typedProperties.getBoolean("key2", false)); + // test getBoolean with non-string value for key2 + properties.put("key2", true); + typedProperties = new TypedProperties(properties); + assertTrue(typedProperties.getBoolean("key1", false)); + 
assertTrue(typedProperties.getBoolean("key2", false)); + // put non-string value in TypedProperties + typedProperties.put("key3", true); + assertTrue(typedProperties.getBoolean("key3", false)); + } + + @Test + public void testTypedPropertiesWithNonStringValue() { + Properties properties = new Properties(); + properties.put("key1", "1"); + properties.put("key2", 2); + + TypedProperties props = new TypedProperties(properties); + assertEquals(1, props.getInteger("key1")); + assertEquals(2, props.getInteger("key2")); + // put non-string value in TypedProperties + props.put("key2", 3); + assertEquals(3, props.getInteger("key2")); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java new file mode 100644 index 0000000000000..0defefe2ea4e4 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.table; + +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.Properties; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieTableConfig extends HoodieCommonTestHarness { + + private FileSystem fs; + private Path metaPath; + private Path cfgPath; + private Path backupCfgPath; + + @BeforeEach + public void setUp() throws Exception { + initPath(); + fs = new Path(basePath).getFileSystem(new Configuration()); + metaPath = new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME); + Properties props = new Properties(); + props.setProperty(HoodieTableConfig.NAME.key(), "test-table"); + HoodieTableConfig.create(fs, metaPath, props); + cfgPath = new Path(metaPath, HoodieTableConfig.HOODIE_PROPERTIES_FILE); + backupCfgPath = new Path(metaPath, HoodieTableConfig.HOODIE_PROPERTIES_FILE_BACKUP); + } + + @Test + public void testCreate() throws IOException { + assertTrue(fs.exists(new Path(metaPath, HoodieTableConfig.HOODIE_PROPERTIES_FILE))); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + assertEquals(6, config.getProps().size()); + } + + @Test + public void testUpdate() throws IOException { + Properties updatedProps = new Properties(); + updatedProps.setProperty(HoodieTableConfig.NAME.key(), "test-table2"); + updatedProps.setProperty(HoodieTableConfig.PRECOMBINE_FIELD.key(), "new_field"); + HoodieTableConfig.update(fs, metaPath, updatedProps); + + assertTrue(fs.exists(cfgPath)); + assertFalse(fs.exists(backupCfgPath)); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + assertEquals(7, config.getProps().size()); + assertEquals("test-table2", config.getTableName()); + assertEquals("new_field", config.getPreCombineField()); + } + + @Test + public void testDelete() throws IOException { + Set deletedProps = CollectionUtils.createSet(HoodieTableConfig.ARCHIVELOG_FOLDER.key(), "hoodie.invalid.config"); + HoodieTableConfig.delete(fs, metaPath, deletedProps); + + assertTrue(fs.exists(cfgPath)); + assertFalse(fs.exists(backupCfgPath)); + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + assertEquals(5, config.getProps().size()); + assertNull(config.getProps().getProperty("hoodie.invalid.config")); + assertFalse(config.getProps().contains(HoodieTableConfig.ARCHIVELOG_FOLDER.key())); + } + + @Test + public void testReadsWhenPropsFileDoesNotExist() throws IOException { + fs.delete(cfgPath, false); + assertThrows(HoodieIOException.class, () -> { + new HoodieTableConfig(fs, metaPath.toString(), null); + }); + } + + @Test + public void testReadsWithUpdateFailures() throws IOException { + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), 
null); + fs.delete(cfgPath, false); + try (FSDataOutputStream out = fs.create(backupCfgPath)) { + config.getProps().store(out, ""); + } + + assertFalse(fs.exists(cfgPath)); + assertTrue(fs.exists(backupCfgPath)); + config = new HoodieTableConfig(fs, metaPath.toString(), null); + assertEquals(6, config.getProps().size()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testUpdateRecovery(boolean shouldPropsFileExist) throws IOException { + HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); + if (!shouldPropsFileExist) { + fs.delete(cfgPath, false); + } + try (FSDataOutputStream out = fs.create(backupCfgPath)) { + config.getProps().store(out, ""); + } + + HoodieTableConfig.recoverIfNeeded(fs, cfgPath, backupCfgPath); + assertTrue(fs.exists(cfgPath)); + assertFalse(fs.exists(backupCfgPath)); + config = new HoodieTableConfig(fs, metaPath.toString(), null); + assertEquals(6, config.getProps().size()); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java index 121e173c655c9..840e6ddf4ad3f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java @@ -54,15 +54,17 @@ public void checkMetadata() { assertEquals(basePath, metaClient.getBasePath(), "Basepath should be the one assigned"); assertEquals(basePath + "/.hoodie", metaClient.getMetaPath(), "Metapath should be ${basepath}/.hoodie"); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + assertTrue(HoodieTableConfig.validateChecksum(metaClient.getTableConfig().getProps())); } @Test public void checkSerDe() { // check if this object is serialized and de-serialized, we are able to read from the file system - HoodieTableMetaClient deseralizedMetaClient = + HoodieTableMetaClient deserializedMetaClient = HoodieTestUtils.serializeDeserialize(metaClient, HoodieTableMetaClient.class); - assertNotNull(deseralizedMetaClient); - HoodieActiveTimeline commitTimeline = deseralizedMetaClient.getActiveTimeline(); + assertNotNull(deserializedMetaClient); + HoodieActiveTimeline commitTimeline = deserializedMetaClient.getActiveTimeline(); HoodieInstant instant = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); commitTimeline.createNewInstant(instant); commitTimeline.saveAsComplete(instant, Option.of("test-detail".getBytes())); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java new file mode 100644 index 0000000000000..5d949431e4937 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table; + +import org.apache.hudi.avro.AvroSchemaUtils; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIncompatibleSchemaException; + +import org.apache.avro.Schema; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestTableSchemaResolver { + + @Test + public void testRecreateSchemaWhenDropPartitionColumns() { + Schema originSchema = new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA); + + // case2: empty partition field list, schema stays unchanged + String[] pts1 = new String[0]; + Schema s2 = TableSchemaResolver.appendPartitionColumns(originSchema, Option.of(pts1)); + assertEquals(originSchema, s2); + + // case3: partition_path is already in originSchema, so nothing is appended + String[] pts2 = {"partition_path"}; + Schema s3 = TableSchemaResolver.appendPartitionColumns(originSchema, Option.of(pts2)); + assertEquals(originSchema, s3); + + // case4: user_partition is not in originSchema, so it is appended as a nullable string field + String[] pts3 = {"user_partition"}; + Schema s4 = TableSchemaResolver.appendPartitionColumns(originSchema, Option.of(pts3)); + assertNotEquals(originSchema, s4); + assertTrue(s4.getFields().stream().anyMatch(f -> f.name().equals("user_partition"))); + Schema.Field f = s4.getField("user_partition"); + assertEquals(f.schema(), AvroSchemaUtils.createNullableSchema(Schema.Type.STRING)); + + // case5: user_partition is not in originSchema, but partition_path is; only part of the partition fields exist in the schema + String[] pts4 = {"user_partition", "partition_path"}; + HoodieIncompatibleSchemaException e = assertThrows(HoodieIncompatibleSchemaException.class, + () -> TableSchemaResolver.appendPartitionColumns(originSchema, Option.of(pts4))); + assertTrue(e.getMessage().contains("Partial partition fields are still in the schema")); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java index e677f491f14d5..da078372b5c3b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java @@ -23,9 +23,12 @@ import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -58,6 +61,45 @@ public void setUp() throws Exception { initMetaClient(); } + @Test + public void 
testGetPartitionsWithReplaceCommits() throws IOException { + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + HoodieTimeline activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertTrue(activeCommitTimeline.empty()); + + String ts1 = "1"; + String replacePartition = "2021/01/01"; + String newFilePartition = "2021/01/02"; + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, ts1); + activeTimeline.createNewInstant(instant1); + // create replace metadata only with replaced file Ids (no new files created) + activeTimeline.saveAsComplete(instant1, + Option.of(getReplaceCommitMetadata(basePath, ts1, replacePartition, 2, + newFilePartition, 0, Collections.emptyMap(), WriteOperationType.CLUSTER))); + metaClient.reloadActiveTimeline(); + + List partitions = TimelineUtils.getAffectedPartitions(metaClient.getActiveTimeline().findInstantsAfter("0", 10)); + assertEquals(1, partitions.size()); + assertEquals(replacePartition, partitions.get(0)); + + String ts2 = "2"; + HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, ts2); + activeTimeline.createNewInstant(instant2); + // create replace metadata only with replaced file Ids (no new files created) + activeTimeline.saveAsComplete(instant2, + Option.of(getReplaceCommitMetadata(basePath, ts2, replacePartition, 0, + newFilePartition, 3, Collections.emptyMap(), WriteOperationType.CLUSTER))); + metaClient.reloadActiveTimeline(); + partitions = TimelineUtils.getAffectedPartitions(metaClient.getActiveTimeline().findInstantsAfter("1", 10)); + assertEquals(1, partitions.size()); + assertEquals(newFilePartition, partitions.get(0)); + + partitions = TimelineUtils.getAffectedPartitions(metaClient.getActiveTimeline().findInstantsAfter("0", 10)); + assertEquals(2, partitions.size()); + assertTrue(partitions.contains(replacePartition)); + assertTrue(partitions.contains(newFilePartition)); + } + @Test public void testGetPartitions() throws IOException { HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); @@ -81,24 +123,24 @@ public void testGetPartitions() throws IOException { // verify modified partitions included cleaned data List partitions = TimelineUtils.getAffectedPartitions(metaClient.getActiveTimeline().findInstantsAfter("1", 10)); assertEquals(5, partitions.size()); - assertEquals(partitions, Arrays.asList(new String[]{"0", "2", "3", "4", "5"})); + assertEquals(partitions, Arrays.asList(new String[] {"0", "2", "3", "4", "5"})); partitions = TimelineUtils.getAffectedPartitions(metaClient.getActiveTimeline().findInstantsInRange("1", "4")); assertEquals(4, partitions.size()); - assertEquals(partitions, Arrays.asList(new String[]{"0", "2", "3", "4"})); + assertEquals(partitions, Arrays.asList(new String[] {"0", "2", "3", "4"})); // verify only commit actions - partitions = TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().findInstantsAfter("1", 10)); + partitions = TimelineUtils.getWrittenPartitions(metaClient.getActiveTimeline().findInstantsAfter("1", 10)); assertEquals(4, partitions.size()); - assertEquals(partitions, Arrays.asList(new String[]{"2", "3", "4", "5"})); + assertEquals(partitions, Arrays.asList(new String[] {"2", "3", "4", "5"})); - partitions = TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().findInstantsInRange("1", "4")); + partitions = TimelineUtils.getWrittenPartitions(metaClient.getActiveTimeline().findInstantsInRange("1", "4")); assertEquals(3, partitions.size()); - 
assertEquals(partitions, Arrays.asList(new String[]{"2", "3", "4"})); + assertEquals(partitions, Arrays.asList(new String[] {"2", "3", "4"})); } @Test - public void testGetPartitionsUnpartitioned() throws IOException { + public void testGetPartitionsUnPartitioned() throws IOException { HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); HoodieTimeline activeCommitTimeline = activeTimeline.getCommitTimeline(); assertTrue(activeCommitTimeline.empty()); @@ -142,10 +184,10 @@ public void testRestoreInstants() throws Exception { // verify modified partitions included cleaned data List partitions = TimelineUtils.getAffectedPartitions(metaClient.getActiveTimeline().findInstantsAfter("1", 10)); - assertEquals(partitions, Arrays.asList(new String[]{"2", "3", "4", "5"})); + assertEquals(partitions, Arrays.asList(new String[] {"2", "3", "4", "5"})); partitions = TimelineUtils.getAffectedPartitions(metaClient.getActiveTimeline().findInstantsInRange("1", "4")); - assertEquals(partitions, Arrays.asList(new String[]{"2", "3", "4"})); + assertEquals(partitions, Arrays.asList(new String[] {"2", "3", "4"})); } @Test @@ -162,7 +204,7 @@ public void testGetExtraMetadata() throws Exception { activeTimeline.createNewInstant(instant); activeTimeline.saveAsComplete(instant, Option.of(getCommitMetadata(basePath, ts, ts, 2, Collections.emptyMap()))); - ts = "1"; + ts = "1"; instant = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, ts); activeTimeline.createNewInstant(instant); Map extraMetadata = new HashMap<>(); @@ -172,16 +214,42 @@ public void testGetExtraMetadata() throws Exception { metaClient.reloadActiveTimeline(); // verify modified partitions included cleaned data - Option extraLatestValue = TimelineUtils.getExtraMetadataFromLatest(metaClient, extraMetadataKey); - assertTrue(extraLatestValue.isPresent()); - assertEquals(extraMetadataValue1, extraLatestValue.get()); + verifyExtraMetadataLatestValue(extraMetadataKey, extraMetadataValue1, false); + assertFalse(TimelineUtils.getExtraMetadataFromLatest(metaClient, "unknownKey").isPresent()); + + // verify adding clustering commit doesn't change behavior of getExtraMetadataFromLatest + String ts2 = "2"; + HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, ts2); + activeTimeline.createNewInstant(instant2); + String newValueForMetadata = "newValue2"; + extraMetadata.put(extraMetadataKey, newValueForMetadata); + activeTimeline.saveAsComplete(instant2, + Option.of(getReplaceCommitMetadata(basePath, ts2, "p2", 0, + "p2", 3, extraMetadata, WriteOperationType.CLUSTER))); + metaClient.reloadActiveTimeline(); + + verifyExtraMetadataLatestValue(extraMetadataKey, extraMetadataValue1, false); + verifyExtraMetadataLatestValue(extraMetadataKey, newValueForMetadata, true); assertFalse(TimelineUtils.getExtraMetadataFromLatest(metaClient, "unknownKey").isPresent()); Map> extraMetadataEntries = TimelineUtils.getAllExtraMetadataForKey(metaClient, extraMetadataKey); - assertEquals(2, extraMetadataEntries.size()); + assertEquals(3, extraMetadataEntries.size()); assertFalse(extraMetadataEntries.get("0").isPresent()); assertTrue(extraMetadataEntries.get("1").isPresent()); assertEquals(extraMetadataValue1, extraMetadataEntries.get("1").get()); + assertTrue(extraMetadataEntries.get("2").isPresent()); + assertEquals(newValueForMetadata, extraMetadataEntries.get("2").get()); + } + + private void verifyExtraMetadataLatestValue(String extraMetadataKey, String expected, boolean includeClustering) { + final Option extraLatestValue; + 
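+    // Annotation added for clarity (not part of the original patch): with includeClustering=false this helper checks TimelineUtils.getExtraMetadataFromLatest, which reads extra metadata from the latest non-clustering write; with includeClustering=true it checks getExtraMetadataFromLatestIncludeClustering, which also considers replacecommit (clustering) instants, matching the two expectations asserted in testGetExtraMetadata above.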
if (includeClustering) { + extraLatestValue = TimelineUtils.getExtraMetadataFromLatestIncludeClustering(metaClient, extraMetadataKey); + } else { + extraLatestValue = TimelineUtils.getExtraMetadataFromLatest(metaClient, extraMetadataKey); + } + assertTrue(extraLatestValue.isPresent()); + assertEquals(expected, extraLatestValue.get()); } private byte[] getRestoreMetadata(String basePath, String partition, String commitTs, int count, String actionType) throws IOException { @@ -215,7 +283,7 @@ private byte[] getCommitMetadata(String basePath, String partition, String commi HoodieWriteStat stat = new HoodieWriteStat(); stat.setFileId(i + ""); stat.setPartitionPath(Paths.get(basePath, partition).toString()); - stat.setPath(commitTs + "." + i + ".parquet"); + stat.setPath(commitTs + "." + i + metaClient.getTableConfig().getBaseFileFormat().getFileExtension()); commit.addWriteStat(partition, stat); } for (Map.Entry extraEntries : extraMetadata.entrySet()) { @@ -224,6 +292,33 @@ private byte[] getCommitMetadata(String basePath, String partition, String commi return commit.toJsonString().getBytes(StandardCharsets.UTF_8); } + private byte[] getReplaceCommitMetadata(String basePath, String commitTs, String replacePartition, int replaceCount, + String newFilePartition, int newFileCount, Map extraMetadata, + WriteOperationType operationType) + throws IOException { + HoodieReplaceCommitMetadata commit = new HoodieReplaceCommitMetadata(); + commit.setOperationType(operationType); + for (int i = 1; i <= newFileCount; i++) { + HoodieWriteStat stat = new HoodieWriteStat(); + stat.setFileId(i + ""); + stat.setPartitionPath(Paths.get(basePath, newFilePartition).toString()); + stat.setPath(commitTs + "." + i + metaClient.getTableConfig().getBaseFileFormat().getFileExtension()); + commit.addWriteStat(newFilePartition, stat); + } + Map> partitionToReplaceFileIds = new HashMap<>(); + if (replaceCount > 0) { + partitionToReplaceFileIds.put(replacePartition, new ArrayList<>()); + } + for (int i = 1; i <= replaceCount; i++) { + partitionToReplaceFileIds.get(replacePartition).add(FSUtils.createNewFileIdPfx()); + } + commit.setPartitionToReplaceFileIds(partitionToReplaceFileIds); + for (Map.Entry extraEntries : extraMetadata.entrySet()) { + commit.addMetadata(extraEntries.getKey(), extraEntries.getValue()); + } + return commit.toJsonString().getBytes(StandardCharsets.UTF_8); + } + private Option getCleanMetadata(String partition, String time) throws IOException { Map partitionToFilesCleaned = new HashMap<>(); List filesDeleted = new ArrayList<>(); @@ -243,6 +338,7 @@ private Option getCleanMetadata(String partition, String time) throws IO .setTotalFilesDeleted(1) .setStartCleanTime(time) .setEarliestCommitToRetain(time) + .setLastCompletedCommitTimestamp("") .setPartitionMetadata(partitionToFilesCleaned).build(); return TimelineMetadataUtils.serializeCleanMetadata(cleanMetadata); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java b/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java index d7e3bde8cb074..182dd086789d0 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java @@ -18,6 +18,8 @@ package org.apache.hudi.common.table.timeline; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.fs.NoOpConsistencyGuard; import 
org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; @@ -25,19 +27,28 @@ import org.apache.hudi.common.testutils.MockHoodieTimeline; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.text.ParseException; import java.util.ArrayList; import java.util.Collections; +import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -109,14 +120,18 @@ public void testLoadingInstantsFromFiles() throws IOException { "Check the instants stream"); // Backwards compatibility testing for reading compaction plans - metaClient = HoodieTableMetaClient.initTableType(metaClient.getHadoopConf(), - metaClient.getBasePath(), metaClient.getTableType(), metaClient.getTableConfig().getTableName(), - metaClient.getArchivePath(), metaClient.getTableConfig().getPayloadClass(), VERSION_0); + metaClient = HoodieTableMetaClient.withPropertyBuilder() + .fromMetaClient(metaClient) + .setTimelineLayoutVersion(VERSION_0) + .initTable(metaClient.getHadoopConf(), metaClient.getBasePath()); + HoodieInstant instant6 = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "9"); byte[] dummy = new byte[5]; - HoodieActiveTimeline oldTimeline = new HoodieActiveTimeline(new HoodieTableMetaClient(metaClient.getHadoopConf(), - metaClient.getBasePath(), true, metaClient.getConsistencyGuardConfig(), - Option.of(new TimelineLayoutVersion(VERSION_0)))); + HoodieActiveTimeline oldTimeline = new HoodieActiveTimeline( + HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()) + .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(metaClient.getConsistencyGuardConfig()) + .setFileSystemRetryConfig(metaClient.getFileSystemRetryConfig()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(VERSION_0))).build()); // Old Timeline writes both to aux and timeline folder oldTimeline.saveToCompactionRequested(instant6, Option.of(dummy)); // Now use latest timeline version @@ -167,6 +182,15 @@ public void testTimelineOperations() { assertFalse(timeline.empty()); assertFalse(timeline.getCommitTimeline().filterPendingExcludingCompaction().empty()); assertEquals(12, timeline.countInstants()); + assertEquals("01", timeline.firstInstant( + HoodieTimeline.COMMIT_ACTION, State.COMPLETED).get().getTimestamp()); + assertEquals("21", timeline.firstInstant( + HoodieTimeline.COMMIT_ACTION, State.INFLIGHT).get().getTimestamp()); + assertFalse(timeline.firstInstant( + HoodieTimeline.COMMIT_ACTION, State.REQUESTED).isPresent()); + assertFalse(timeline.firstInstant( + HoodieTimeline.REPLACE_COMMIT_ACTION, State.COMPLETED).isPresent()); + HoodieTimeline activeCommitTimeline = timeline.getCommitTimeline().filterCompletedInstants(); assertEquals(10, 
activeCommitTimeline.countInstants()); @@ -179,6 +203,116 @@ public void testTimelineOperations() { assertTrue(activeCommitTimeline.isBeforeTimelineStarts("00")); } + @Test + public void testAllowTempCommit() { + shouldAllowTempCommit(true, hoodieMetaClient -> { + timeline = new HoodieActiveTimeline(hoodieMetaClient); + + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "1"); + timeline.createNewInstant(instant1); + + byte[] data = "commit".getBytes(StandardCharsets.UTF_8); + timeline.saveAsComplete(new HoodieInstant(true, instant1.getAction(), + instant1.getTimestamp()), Option.of(data)); + + timeline = timeline.reload(); + + assertTrue(timeline.getContiguousCompletedWriteTimeline().lastInstant().isPresent()); + assertEquals(instant1.getTimestamp(), timeline.getContiguousCompletedWriteTimeline().lastInstant().get().getTimestamp()); + }); + } + + @Test + public void testGetContiguousCompletedWriteTimeline() { + // a mock timeline with holes + timeline = new MockHoodieTimeline(Stream.of("01", "03", "05", "07", "13", "15", "17"), + Stream.of("09", "11", "19")); + assertTrue(timeline.getContiguousCompletedWriteTimeline().lastInstant().isPresent()); + assertEquals("07", timeline.getContiguousCompletedWriteTimeline().lastInstant().get().getTimestamp()); + + // add some instants where two are inflight and one of them (instant8 below) is not part of write timeline + HoodieInstant instant1 = new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "1"); + HoodieInstant instant2 = new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "2"); + HoodieInstant instant3 = new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "3"); + HoodieInstant instant4 = new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "4"); + HoodieInstant instant5 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "5"); + HoodieInstant instant6 = new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "6"); + HoodieInstant instant7 = new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "7"); + HoodieInstant instant8 = new HoodieInstant(true, HoodieTimeline.RESTORE_ACTION, "8"); + + timeline = new HoodieActiveTimeline(metaClient); + timeline.createNewInstant(instant1); + timeline.createNewInstant(instant2); + timeline.createNewInstant(instant3); + timeline.createNewInstant(instant4); + timeline.createNewInstant(instant5); + timeline.createNewInstant(instant6); + timeline.createNewInstant(instant7); + timeline.createNewInstant(instant8); + timeline.setInstants(Stream.of(instant1, instant2, instant3, instant4, instant5, instant6, instant7, instant8).collect(Collectors.toList())); + + assertTrue(timeline.getContiguousCompletedWriteTimeline().lastInstant().isPresent()); + assertEquals(instant4.getTimestamp(), timeline.getContiguousCompletedWriteTimeline().lastInstant().get().getTimestamp()); + // transition both inflight instants to complete + timeline.saveAsComplete(new HoodieInstant(true, instant5.getAction(), instant5.getTimestamp()), Option.empty()); + timeline.saveAsComplete(new HoodieInstant(true, instant8.getAction(), instant8.getTimestamp()), Option.empty()); + timeline = timeline.reload(); + // instant8 in not considered in write timeline, so last completed instant in timeline should be instant7 + assertTrue(timeline.getContiguousCompletedWriteTimeline().lastInstant().isPresent()); + assertEquals(instant7.getTimestamp(), timeline.getContiguousCompletedWriteTimeline().lastInstant().get().getTimestamp()); + } + + @Test + public 
void testTimelineWithSavepointAndHoles() { + timeline = new MockHoodieTimeline(Stream.of( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "01"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, "01"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "03"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, "03"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "05") // this can be DELTA_COMMIT/REPLACE_COMMIT as well + ).collect(Collectors.toList())); + assertTrue(timeline.isBeforeTimelineStarts("00")); + assertTrue(timeline.isBeforeTimelineStarts("01")); + assertTrue(timeline.isBeforeTimelineStarts("02")); + assertTrue(timeline.isBeforeTimelineStarts("03")); + assertTrue(timeline.isBeforeTimelineStarts("04")); + assertFalse(timeline.isBeforeTimelineStarts("05")); + assertFalse(timeline.isBeforeTimelineStarts("06")); + + // with an inflight savepoint in between + timeline = new MockHoodieTimeline(Stream.of( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "01"), + new HoodieInstant(State.INFLIGHT, HoodieTimeline.SAVEPOINT_ACTION, "01"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "03"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, "03"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "05") + ).collect(Collectors.toList())); + assertTrue(timeline.isBeforeTimelineStarts("00")); + assertTrue(timeline.isBeforeTimelineStarts("01")); + assertTrue(timeline.isBeforeTimelineStarts("02")); + assertTrue(timeline.isBeforeTimelineStarts("03")); + assertTrue(timeline.isBeforeTimelineStarts("04")); + assertFalse(timeline.isBeforeTimelineStarts("05")); + assertFalse(timeline.isBeforeTimelineStarts("06")); + + // with a pending replacecommit after savepoints + timeline = new MockHoodieTimeline(Stream.of( + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "01"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, "01"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "03"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, "03"), + new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "05"), + new HoodieInstant(State.INFLIGHT, HoodieTimeline.REPLACE_COMMIT_ACTION, "06") + ).collect(Collectors.toList())); + assertTrue(timeline.isBeforeTimelineStarts("00")); + assertTrue(timeline.isBeforeTimelineStarts("01")); + assertTrue(timeline.isBeforeTimelineStarts("02")); + assertTrue(timeline.isBeforeTimelineStarts("03")); + assertTrue(timeline.isBeforeTimelineStarts("04")); + assertFalse(timeline.isBeforeTimelineStarts("05")); + assertFalse(timeline.isBeforeTimelineStarts("06")); + } + @Test public void testTimelineGetOperations() { List allInstants = getAllInstants(); @@ -198,20 +332,19 @@ public void testTimelineGetOperations() { // Test that various types of getXXX operations from HoodieActiveTimeline // return the correct set of Instant - checkTimeline.accept(timeline.getCommitsTimeline(), - CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)); - checkTimeline.accept(timeline.getCommitsAndCompactionTimeline(), - CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.COMPACTION_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)); + checkTimeline.accept(timeline.getCommitsTimeline(), CollectionUtils.createSet( + 
HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)); + checkTimeline.accept(timeline.getWriteTimeline(), CollectionUtils.createSet( + HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.COMPACTION_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)); checkTimeline.accept(timeline.getCommitTimeline(), CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)); checkTimeline.accept(timeline.getDeltaCommitTimeline(), Collections.singleton(HoodieTimeline.DELTA_COMMIT_ACTION)); checkTimeline.accept(timeline.getCleanerTimeline(), Collections.singleton(HoodieTimeline.CLEAN_ACTION)); checkTimeline.accept(timeline.getRollbackTimeline(), Collections.singleton(HoodieTimeline.ROLLBACK_ACTION)); checkTimeline.accept(timeline.getRestoreTimeline(), Collections.singleton(HoodieTimeline.RESTORE_ACTION)); checkTimeline.accept(timeline.getSavePointTimeline(), Collections.singleton(HoodieTimeline.SAVEPOINT_ACTION)); - checkTimeline.accept(timeline.getAllCommitsTimeline(), - CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, - HoodieTimeline.CLEAN_ACTION, HoodieTimeline.COMPACTION_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION, - HoodieTimeline.SAVEPOINT_ACTION, HoodieTimeline.ROLLBACK_ACTION)); + checkTimeline.accept(timeline.getAllCommitsTimeline(), CollectionUtils.createSet( + HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.CLEAN_ACTION, HoodieTimeline.COMPACTION_ACTION, + HoodieTimeline.REPLACE_COMMIT_ACTION, HoodieTimeline.SAVEPOINT_ACTION, HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.INDEXING_ACTION)); // Get some random Instants Random rand = new Random(); @@ -277,7 +410,7 @@ public void testTimelineInstantOperations() { timeline = timeline.reload(); assertFalse(timeline.containsInstant(compaction)); assertTrue(timeline.containsInstant(inflight)); - compaction = timeline.revertCompactionInflightToRequested(inflight); + compaction = timeline.revertInstantFromInflightToRequested(inflight); timeline = timeline.reload(); assertTrue(timeline.containsInstant(compaction)); assertFalse(timeline.containsInstant(inflight)); @@ -425,6 +558,96 @@ public void testReplaceActionsTimeline() { assertEquals(HoodieTimeline.REPLACE_COMMIT_ACTION, validReplaceInstants.get(0).getAction()); } + @Test + public void testCreateNewInstantTime() throws Exception { + String lastInstantTime = HoodieActiveTimeline.createNewInstantTime(); + for (int i = 0; i < 3; ++i) { + String newInstantTime = HoodieActiveTimeline.createNewInstantTime(); + assertTrue(HoodieTimeline.compareTimestamps(lastInstantTime, HoodieTimeline.LESSER_THAN, newInstantTime)); + lastInstantTime = newInstantTime; + } + + // Multiple thread test + final int numChecks = 100000; + final int numThreads = 100; + final long milliSecondsInYear = 365 * 24 * 3600 * 1000; + ExecutorService executorService = Executors.newFixedThreadPool(numThreads); + List futures = new ArrayList<>(numThreads); + for (int idx = 0; idx < numThreads; ++idx) { + futures.add(executorService.submit(() -> { + Date date = new Date(System.currentTimeMillis() + (int)(Math.random() * numThreads) * milliSecondsInYear); + final String expectedFormat = HoodieActiveTimeline.formatDate(date); + for (int tidx = 0; tidx < numChecks; ++tidx) { + final String curFormat = HoodieActiveTimeline.formatDate(date); + if (!curFormat.equals(expectedFormat)) { + throw new HoodieException("Format error: expected=" + expectedFormat + ", 
curFormat=" + curFormat); + } + } + })); + } + + executorService.shutdown(); + assertTrue(executorService.awaitTermination(60, TimeUnit.SECONDS)); + // required to catch exceptions + for (Future f : futures) { + f.get(); + } + } + + @Test + public void testMetadataCompactionInstantDateParsing() throws ParseException { + // default second granularity instant ID + String secondGranularityInstant = "20210101120101123"; + Date defaultSecsGranularityDate = HoodieActiveTimeline.parseDateFromInstantTime(secondGranularityInstant); + // metadata table compaction/cleaning : ms granularity instant ID + String compactionInstant = secondGranularityInstant + "001"; + Date defaultMsGranularityDate = HoodieActiveTimeline.parseDateFromInstantTime(compactionInstant); + assertEquals(0, defaultMsGranularityDate.getTime() - defaultSecsGranularityDate.getTime(), "Expected the ms part to be 0"); + assertTrue(HoodieTimeline.compareTimestamps(secondGranularityInstant, HoodieTimeline.LESSER_THAN, compactionInstant)); + assertTrue(HoodieTimeline.compareTimestamps(compactionInstant, HoodieTimeline.GREATER_THAN, secondGranularityInstant)); + } + + @Test + public void testMillisGranularityInstantDateParsing() throws ParseException { + // Old second granularity instant ID + String secondGranularityInstant = "20210101120101"; + Date defaultMsGranularityDate = HoodieActiveTimeline.parseDateFromInstantTime(secondGranularityInstant); + // New ms granularity instant ID + String specificMsGranularityInstant = secondGranularityInstant + "009"; + Date msGranularityDate = HoodieActiveTimeline.parseDateFromInstantTime(specificMsGranularityInstant); + assertEquals(999, defaultMsGranularityDate.getTime() % 1000, "Expected the ms part to be 999"); + assertEquals(9, msGranularityDate.getTime() % 1000, "Expected the ms part to be 9"); + + // Ensure that any date math which expects second granularity still works + String laterDateInstant = "20210101120111"; // + 10 seconds from original instant + assertEquals( + 10, + HoodieActiveTimeline.parseDateFromInstantTime(laterDateInstant).getTime() / 1000 + - HoodieActiveTimeline.parseDateFromInstantTime(secondGranularityInstant).getTime() / 1000, + "Expected the difference between later instant and previous instant to be 10 seconds" + ); + } + + @Test + public void testInvalidInstantDateParsing() throws ParseException { + // Test all invalid timestamp in HoodieTimeline, shouldn't throw any error and should return a correct value + assertEquals(Long.parseLong(HoodieTimeline.INIT_INSTANT_TS), + HoodieActiveTimeline.parseDateFromInstantTimeSafely(HoodieTimeline.INIT_INSTANT_TS).get().getTime()); + assertEquals(Long.parseLong(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS), + HoodieActiveTimeline.parseDateFromInstantTimeSafely(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS).get().getTime()); + assertEquals(Long.parseLong(HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS), + HoodieActiveTimeline.parseDateFromInstantTimeSafely(HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS).get().getTime()); + + // Test metadata table compaction instant date parsing with INIT_INSTANT_TS, should return Option.empty + assertEquals(Option.empty(), + HoodieActiveTimeline.parseDateFromInstantTimeSafely(HoodieTimeline.INIT_INSTANT_TS + "001")); + + // Test a valid instant timestamp, should equal the same result as HoodieActiveTimeline.parseDateFromInstantTime + String testInstant = "20210101120101"; + assertEquals(HoodieActiveTimeline.parseDateFromInstantTime(testInstant).getTime(), + 
HoodieActiveTimeline.parseDateFromInstantTimeSafely(testInstant).get().getTime()); + } + /** * Returns an exhaustive list of all possible HoodieInstant. * @return list of HoodieInstant @@ -434,7 +657,7 @@ private List getAllInstants() { List allInstants = new ArrayList<>(); long instantTime = 1; for (State state : State.values()) { - if (state == State.INVALID) { + if (state == State.NIL) { continue; } for (String action : HoodieTimeline.VALID_ACTIONS_IN_TIMELINE) { @@ -462,4 +685,25 @@ private List getAllInstants() { } return allInstants; } + + private void shouldAllowTempCommit(boolean allowTempCommit, Consumer fun) { + if (allowTempCommit) { + HoodieWrapperFileSystem fs = metaClient.getFs(); + HoodieWrapperFileSystem newFs = new HoodieWrapperFileSystem(fs.getFileSystem(), new NoOpConsistencyGuard()) { + @Override + protected boolean needCreateTempFile() { + return true; + } + }; + metaClient.setFs(newFs); + try { + fun.accept(metaClient); + } finally { + metaClient.setFs(fs); + } + return; + } + fun.accept(metaClient); + } + } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFSViewWithClustering.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFSViewWithClustering.java new file mode 100644 index 0000000000000..559375c795a86 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFSViewWithClustering.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.table.view; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class TestHoodieTableFSViewWithClustering extends HoodieCommonTestHarness { + + private static final String TEST_WRITE_TOKEN = "1-0-1"; + private static final String BOOTSTRAP_SOURCE_PATH = "/usr/warehouse/hive/data/tables/src1/"; + + private IncrementalTimelineSyncFileSystemView fsView; + private TableFileSystemView.BaseFileOnlyView roView; + + @BeforeEach + public void setup() throws IOException { + metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH, false); + basePath = metaClient.getBasePathV2().toString(); + refreshFsView(); + } + + protected void refreshFsView() throws IOException { + super.refreshFsView(); + closeFsView(); + fsView = (IncrementalTimelineSyncFileSystemView) getFileSystemView(metaClient.getActiveTimeline().filterCompletedAndCompactionInstants()); + roView = fsView; + } + + private void closeFsView() { + if (null != fsView) { + fsView.close(); + fsView = null; + } + } + + @AfterEach + public void close() { + closeFsView(); + } + + @Test + public void testReplaceFileIdIsExcludedInView() throws IOException { + String partitionPath1 = "2020/06/27"; + String partitionPath2 = "2020/07/14"; + new File(basePath + "/" + partitionPath1).mkdirs(); + new File(basePath + "/" + partitionPath2).mkdirs(); + + // create 2 fileId in partition1 - fileId1 is replaced later on. + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + + // create 2 fileId in partition2 - fileId3, fileId4 is replaced later on. 
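+    // Annotation added for clarity (not part of the original patch): the INSERT_OVERWRITE replacecommit written below marks fileId1 (partition1) and fileId3/fileId4 (partition2) as replaced, so the view is expected to drop them from getLatestBaseFiles/fetchLatestBaseFiles while still reporting them via getReplacedFileGroupsBeforeOrOn.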
+ String fileId3 = UUID.randomUUID().toString(); + String fileId4 = UUID.randomUUID().toString(); + + assertFalse(roView.getLatestBaseFiles(partitionPath1) + .anyMatch(dfile -> dfile.getFileId().equals(fileId1) || dfile.getFileId().equals(fileId2)), + "No commit, should not find any data file"); + assertFalse(roView.getLatestBaseFiles(partitionPath2) + .anyMatch(dfile -> dfile.getFileId().equals(fileId3) || dfile.getFileId().equals(fileId4)), + "No commit, should not find any data file"); + assertFalse(fsView.fetchLatestBaseFiles(partitionPath1) + .anyMatch(dfile -> dfile.getFileId().equals(fileId1) || dfile.getFileId().equals(fileId2)), + "No commit, should not find any data file"); + assertFalse(fsView.fetchLatestBaseFiles(partitionPath2) + .anyMatch(dfile -> dfile.getFileId().equals(fileId3) || dfile.getFileId().equals(fileId4)), + "No commit, should not find any data file"); + + // Only one commit + String commitTime1 = "1"; + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); + String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); + new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); + new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); + new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile(); + new File(basePath + "/" + partitionPath2 + "/" + fileName4).createNewFile(); + + Map> partitionToReplaceFileIds = new HashMap<>(); + List replacedFileIdsP1 = new ArrayList<>(); + replacedFileIdsP1.add(fileId1); + partitionToReplaceFileIds.put(partitionPath1, replacedFileIdsP1); + List replacedFileIdsP2 = new ArrayList<>(); + replacedFileIdsP2.add(fileId3); + replacedFileIdsP2.add(fileId4); + partitionToReplaceFileIds.put(partitionPath2, replacedFileIdsP2); + HoodieCommitMetadata commitMetadata = + CommitUtils.buildMetadata(Collections.emptyList(), partitionToReplaceFileIds, Option.empty(), WriteOperationType.INSERT_OVERWRITE, "", HoodieTimeline.REPLACE_COMMIT_ACTION); + + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, commitTime1); + saveAsComplete(commitTimeline, instant1, Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + refreshFsView(); + assertEquals(0, roView.getLatestBaseFiles(partitionPath1) + .filter(dfile -> dfile.getFileId().equals(fileId1)).count()); + assertEquals(fileName2, roView.getLatestBaseFiles(partitionPath1) + .filter(dfile -> dfile.getFileId().equals(fileId2)).findFirst().get().getFileName()); + assertEquals(0, roView.getLatestBaseFiles(partitionPath2) + .filter(dfile -> dfile.getFileId().equals(fileId3)).count()); + assertEquals(0, roView.getLatestBaseFiles(partitionPath2) + .filter(dfile -> dfile.getFileId().equals(fileId4)).count()); + assertEquals(0, fsView.fetchLatestBaseFiles(partitionPath1) + .filter(dfile -> dfile.getFileId().equals(fileId1)).count()); + assertEquals(fileName2, fsView.fetchLatestBaseFiles(partitionPath1) + .filter(dfile -> dfile.getFileId().equals(fileId2)).findFirst().get().getFileName()); + assertEquals(0, fsView.fetchLatestBaseFiles(partitionPath2) + .filter(dfile -> dfile.getFileId().equals(fileId3)).count()); + assertEquals(0, fsView.fetchLatestBaseFiles(partitionPath2) + .filter(dfile -> 
dfile.getFileId().equals(fileId4)).count()); + + // ensure replacedFileGroupsBefore works with all instants + List replacedOnInstant1 = fsView.getReplacedFileGroupsBeforeOrOn("0", partitionPath1).collect(Collectors.toList()); + assertEquals(0, replacedOnInstant1.size()); + + List allReplaced = fsView.getReplacedFileGroupsBeforeOrOn("2", partitionPath1).collect(Collectors.toList()); + allReplaced.addAll(fsView.getReplacedFileGroupsBeforeOrOn("2", partitionPath2).collect(Collectors.toList())); + assertEquals(3, allReplaced.size()); + Set allReplacedFileIds = allReplaced.stream().map(fg -> fg.getFileGroupId().getFileId()).collect(Collectors.toSet()); + Set actualReplacedFileIds = Stream.of(fileId1, fileId3, fileId4).collect(Collectors.toSet()); + assertEquals(actualReplacedFileIds, allReplacedFileIds); + } + + private static void saveAsComplete(HoodieActiveTimeline timeline, HoodieInstant inflight, Option data) { + if (inflight.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) { + timeline.transitionCompactionInflightToComplete(inflight, data); + } else { + HoodieInstant requested = new HoodieInstant(HoodieInstant.State.REQUESTED, inflight.getAction(), inflight.getTimestamp()); + timeline.createNewInstant(requested); + timeline.transitionRequestedToInflight(requested, Option.empty()); + timeline.saveAsComplete(inflight, data); + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java index 3fceee3bb40c1..02f65886013f7 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java @@ -18,9 +18,6 @@ package org.apache.hudi.common.table.view; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.permission.FsAction; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieFSPermission; @@ -31,6 +28,7 @@ import org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter; import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.BootstrapFileMapping; import org.apache.hudi.common.model.CompactionOperation; @@ -41,6 +39,7 @@ import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -50,12 +49,18 @@ import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.ClusteringUtils; import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.CompactionUtils; import org.apache.hudi.common.util.Option; +import 
org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsAction; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.junit.jupiter.api.BeforeEach; @@ -81,6 +86,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -107,7 +113,7 @@ public static Stream configParams() { @BeforeEach public void setup() throws IOException { - metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH); + metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH, false); basePath = metaClient.getBasePath(); refreshFsView(); } @@ -141,6 +147,46 @@ public void testViewForFileSlicesWithNoBaseFileNonPartitioned() throws Exception testViewForFileSlicesWithNoBaseFile(1, 0, ""); } + @Test + public void testCloseHoodieTableFileSystemView() throws Exception { + String instantTime1 = "1"; + String instantTime2 = "2"; + String clusteringInstantTime3 = "3"; + String clusteringInstantTime4 = "4"; + + // prepare metadata + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + Map> partitionToReplaceFileIds = new HashMap<>(); + List replacedFileIds = new ArrayList<>(); + replacedFileIds.add("fake_file_id"); + partitionToReplaceFileIds.put("fake_partition_path", replacedFileIds); + + // prepare Instants + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1); + HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime2); + HoodieInstant clusteringInstant3 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringInstantTime3); + HoodieInstant clusteringInstant4 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringInstantTime4); + HoodieCommitMetadata commitMetadata = + CommitUtils.buildMetadata(Collections.emptyList(), partitionToReplaceFileIds, Option.empty(), WriteOperationType.CLUSTER, "", HoodieTimeline.REPLACE_COMMIT_ACTION); + + saveAsComplete(commitTimeline, instant1, Option.empty()); + saveAsComplete(commitTimeline, instant2, Option.empty()); + saveAsComplete(commitTimeline, clusteringInstant3, Option.empty()); + saveAsComplete(commitTimeline, clusteringInstant4, Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + refreshFsView(); + + // Now create a scenario where archiving deleted replace commits (requested,inflight and replacecommit) + boolean deleteReplaceCommit = new File(this.basePath + "/.hoodie/" + clusteringInstantTime3 + ".replacecommit").delete(); + boolean deleteReplaceCommitRequested = new File(this.basePath + "/.hoodie/" + clusteringInstantTime3 + ".replacecommit.requested").delete(); + boolean deleteReplaceCommitInflight = new File(this.basePath + "/.hoodie/" + clusteringInstantTime3 + ".replacecommit.inflight").delete(); + + // confirm deleted + assertTrue(deleteReplaceCommit && deleteReplaceCommitInflight && deleteReplaceCommitRequested); + assertDoesNotThrow(() -> fsView.close()); + + } + protected void testViewForFileSlicesWithNoBaseFile(int expNumTotalFileSlices, int expNumTotalDataFiles, 
String partitionPath) throws Exception { Paths.get(basePath, partitionPath).toFile().mkdirs(); @@ -238,6 +284,52 @@ public void testViewForFileSlicesWithBaseFileAndInflightCompaction(boolean testB testViewForFileSlicesWithAsyncCompaction(false, true, 2, 2, true, testBootstrap); } + @Test + protected void testInvalidLogFiles() throws Exception { + String partitionPath = "2016/05/01"; + Paths.get(basePath, partitionPath).toFile().mkdirs(); + String fileId = UUID.randomUUID().toString(); + + String instantTime1 = "1"; + String deltaInstantTime1 = "2"; + String deltaInstantTime2 = "3"; + String fileName1 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); + String fileName2 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 1, TEST_WRITE_TOKEN); + // create a dummy log file mimicking cloud store marker files + String fileName3 = "_DUMMY_" + fileName1.substring(1, fileName1.length()); + // this file must not be picked up as a log file. + + Paths.get(basePath, partitionPath, fileName1).toFile().createNewFile(); + Paths.get(basePath, partitionPath, fileName2).toFile().createNewFile(); + Paths.get(basePath, partitionPath, fileName3).toFile().createNewFile(); + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1); + HoodieInstant deltaInstant2 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime1); + HoodieInstant deltaInstant3 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime2); + + saveAsComplete(commitTimeline, instant1, Option.empty()); + saveAsComplete(commitTimeline, deltaInstant2, Option.empty()); + saveAsComplete(commitTimeline, deltaInstant3, Option.empty()); + + refreshFsView(); + + List<HoodieBaseFile> dataFiles = roView.getLatestBaseFiles().collect(Collectors.toList()); + assertTrue(dataFiles.isEmpty(), "No data file expected"); + List<FileSlice> fileSliceList = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList()); + assertEquals(1, fileSliceList.size()); + FileSlice fileSlice = fileSliceList.get(0); + assertEquals(fileId, fileSlice.getFileId(), "File-Id must be set correctly"); + assertFalse(fileSlice.getBaseFile().isPresent(), "Data file for base instant must not be present"); + assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant for file-group set correctly"); + List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList()); + assertEquals(2, logFiles.size(), "Correct number of log-files shows up in file-slice"); + assertEquals(fileName2, logFiles.get(0).getFileName(), "Log File Order check"); + assertEquals(fileName1, logFiles.get(1).getFileName(), "Log File Order check"); + } + /** * Returns all file-slices including uncommitted ones. 
* @@ -303,10 +395,15 @@ private void checkExternalFile(HoodieFileStatus srcFileStatus, Option protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingDataFile, boolean isCompactionInFlight, int expTotalFileSlices, int expTotalDataFiles, boolean includeInvalidAndInflight, boolean testBootstrap) throws Exception { + + if (testBootstrap) { + metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH, testBootstrap); + } + String partitionPath = "2016/05/01"; new File(basePath + "/" + partitionPath).mkdirs(); String fileId = UUID.randomUUID().toString(); - String srcName = "part_0000.parquet"; + String srcName = "part_0000" + metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); HoodieFileStatus srcFileStatus = HoodieFileStatus.newBuilder() .setPath(HoodiePath.newBuilder().setUri(BOOTSTRAP_SOURCE_PATH + partitionPath + "/" + srcName).build()) .setLength(256 * 1024 * 1024L) @@ -327,7 +424,7 @@ protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingData String dataFileName = null; if (!skipCreatingDataFile) { - dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId); + dataFileName = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId); new File(basePath + "/" + partitionPath + "/" + dataFileName).createNewFile(); } String fileName1 = @@ -366,7 +463,7 @@ protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingData checkExternalFile(srcFileStatus, fileSlice.getBaseFile().get().getBootstrapBaseFile(), testBootstrap); } String compactionRequestedTime = "4"; - String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); + String compactDataFileName = FSUtils.makeBaseFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); List> partitionFileSlicesPairs = new ArrayList<>(); partitionFileSlicesPairs.add(Pair.of(partitionPath, fileSlices.get(0))); HoodieCompactionPlan compactionPlan = @@ -501,12 +598,12 @@ protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingData final String orphanFileId2 = UUID.randomUUID().toString(); final String invalidInstantId = "INVALIDTIME"; String inflightDeltaInstantTime = "7"; - String orphanDataFileName = FSUtils.makeDataFileName(invalidInstantId, TEST_WRITE_TOKEN, orphanFileId1); + String orphanDataFileName = FSUtils.makeBaseFileName(invalidInstantId, TEST_WRITE_TOKEN, orphanFileId1); new File(basePath + "/" + partitionPath + "/" + orphanDataFileName).createNewFile(); String orphanLogFileName = FSUtils.makeLogFileName(orphanFileId2, HoodieLogFile.DELTA_EXTENSION, invalidInstantId, 0, TEST_WRITE_TOKEN); new File(basePath + "/" + partitionPath + "/" + orphanLogFileName).createNewFile(); - String inflightDataFileName = FSUtils.makeDataFileName(inflightDeltaInstantTime, TEST_WRITE_TOKEN, inflightFileId1); + String inflightDataFileName = FSUtils.makeBaseFileName(inflightDeltaInstantTime, TEST_WRITE_TOKEN, inflightFileId1); new File(basePath + "/" + partitionPath + "/" + inflightDataFileName).createNewFile(); String inflightLogFileName = FSUtils.makeLogFileName(inflightFileId2, HoodieLogFile.DELTA_EXTENSION, inflightDeltaInstantTime, 0, TEST_WRITE_TOKEN); @@ -661,7 +758,7 @@ public void testGetLatestDataFilesForFileId() throws IOException { // Only one commit, but is not safe String commitTime1 = "1"; - String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, 
TEST_WRITE_TOKEN, fileId); new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); refreshFsView(); assertFalse(roView.getLatestBaseFiles(partitionPath).anyMatch(dfile -> dfile.getFileId().equals(fileId)), @@ -677,7 +774,7 @@ public void testGetLatestDataFilesForFileId() throws IOException { // Do another commit, but not safe String commitTime2 = "2"; - String fileName2 = FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId); + String fileName2 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId); new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); refreshFsView(); assertEquals(fileName1, roView.getLatestBaseFiles(partitionPath) @@ -711,22 +808,22 @@ public void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly) th String fileId3 = UUID.randomUUID().toString(); String fileId4 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) .createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) .createNewFile(); @@ -776,9 +873,9 @@ private void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly, S for (HoodieBaseFile status : dataFileList) { filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, 
fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3))); filenames = new HashSet<>(); List logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4, true) @@ -805,12 +902,12 @@ private void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly, S } if (!isLatestFileSliceOnly) { assertEquals(3, dataFiles.size()); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3))); } else { assertEquals(1, dataFiles.size()); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); } logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3, true) @@ -836,13 +933,13 @@ protected void testStreamEveryVersionInPartition(boolean isLatestFileSliceOnly) String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); @@ -867,22 +964,22 @@ protected void testStreamEveryVersionInPartition(boolean 
isLatestFileSliceOnly) Set expFileNames = new HashSet<>(); if (fileId.equals(fileId1)) { if (!isLatestFileSliceOnly) { - expFileNames.add(FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)); } - expFileNames.add(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)); assertEquals(expFileNames, filenames); } else if (fileId.equals(fileId2)) { if (!isLatestFileSliceOnly) { - expFileNames.add(FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)); - expFileNames.add(FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)); } - expFileNames.add(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)); assertEquals(expFileNames, filenames); } else { if (!isLatestFileSliceOnly) { - expFileNames.add(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)); } - expFileNames.add(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)); assertEquals(expFileNames, filenames); } } @@ -905,21 +1002,21 @@ protected void testStreamLatestVersionInRange(boolean isLatestFileSliceOnly) thr String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new 
File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); @@ -942,10 +1039,10 @@ protected void testStreamLatestVersionInRange(boolean isLatestFileSliceOnly) thr filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId1))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); if (!isLatestFileSliceOnly) { - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3))); } List slices = @@ -986,13 +1083,13 @@ protected void testStreamLatestVersionsBefore(boolean isLatestFileSliceOnly) thr String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); @@ -1012,8 +1109,8 @@ protected void testStreamLatestVersionsBefore(boolean isLatestFileSliceOnly) thr for (HoodieBaseFile status : dataFiles) { filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime2, 
TEST_WRITE_TOKEN, fileId2))); } else { assertEquals(0, dataFiles.size()); } @@ -1037,30 +1134,30 @@ protected void testStreamLatestVersions(boolean isLatestFileSliceOnly) throws IO String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)) .createNewFile(); new File(fullPartitionPath + "/" + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)) .createNewFile(); new File(fullPartitionPath + "/" + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)) .createNewFile(); new File(fullPartitionPath + "/" + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)) .createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); @@ -1107,9 +1204,9 @@ protected void testStreamLatestVersions(boolean isLatestFileSliceOnly) throws IO for (HoodieBaseFile status : statuses1) { filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); - assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3))); } @Test @@ -1130,15 +1227,15 @@ public void testPendingCompactionWithDuplicateFileIdsAcrossPartitions() throws E String deltaInstantTime2 = "3"; String fileId = UUID.randomUUID().toString(); - String dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId); + String dataFileName = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId); new 
File(fullPartitionPath1 + dataFileName).createNewFile(); String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); new File(fullPartitionPath1 + fileName1).createNewFile(); - new File(fullPartitionPath2 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); + new File(fullPartitionPath2 + FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); new File(fullPartitionPath2 + fileName1).createNewFile(); - new File(fullPartitionPath3 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); + new File(fullPartitionPath3 + FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); new File(fullPartitionPath3 + fileName1).createNewFile(); HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); @@ -1177,7 +1274,7 @@ public void testPendingCompactionWithDuplicateFileIdsAcrossPartitions() throws E partitionFileSlicesPairs.add(Pair.of(partitionPath3, fileSlices.get(0))); String compactionRequestedTime = "2"; - String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); + String compactDataFileName = FSUtils.makeBaseFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty()); @@ -1294,8 +1391,8 @@ public void testReplaceWithTimeTravel() throws IOException { "No commit, should not find any data file"); // Only one commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); @@ -1311,8 +1408,8 @@ public void testReplaceWithTimeTravel() throws IOException { // create commit2 - fileId1 is replaced. new file groups fileId3,fileId4 are created. 
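// Editor's note (inferred from the assertions later in this test, not part of the original change): commit "2"
// below is expected to be a replace commit whose metadata marks fileId1 as replaced in partitionPath1; the
// getReplacedFileGroupsBefore and getAllReplacedFileGroups checks added further down read that replace metadata,
// e.g. fsView.getAllReplacedFileGroups(partitionPath1) should presumably return only the file group of fileId1.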
String fileId3 = UUID.randomUUID().toString(); String fileId4 = UUID.randomUUID().toString(); - String fileName3 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); - String fileName4 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); + String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName4).createNewFile(); @@ -1356,6 +1453,13 @@ public void testReplaceWithTimeTravel() throws IOException { List allReplaced = fsView.getReplacedFileGroupsBeforeOrOn("2", partitionPath1).collect(Collectors.toList()); assertEquals(1, allReplaced.size()); assertEquals(fileId1, allReplaced.get(0).getFileGroupId().getFileId()); + + allReplaced = fsView.getReplacedFileGroupsBefore("2", partitionPath1).collect(Collectors.toList()); + assertEquals(0, allReplaced.size()); + + allReplaced = fsView.getAllReplacedFileGroups(partitionPath1).collect(Collectors.toList()); + assertEquals(1, allReplaced.size()); + assertEquals(fileId1, allReplaced.get(0).getFileGroupId().getFileId()); } @Test @@ -1382,10 +1486,10 @@ public void testReplaceFileIdIsExcludedInView() throws IOException { // Only one commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); - String fileName3 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); - String fileName4 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); + String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile(); @@ -1442,9 +1546,9 @@ public void testPendingClusteringOperations() throws IOException { "No commit, should not find any data file"); // Only one commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); - String fileName3 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile(); @@ -1484,6 +1588,234 @@ public void testPendingClusteringOperations() throws IOException { assertFalse(fileIds.contains(fileId3)); } + /** + * + * create hoodie table like + * . 
+ * ├── .hoodie + * │   ├── .aux + * │   │   └── .bootstrap + * │   │   ├── .fileids + * │   │   └── .partitions + * │   ├── .temp + * │   ├── 1.commit + * │   ├── 1.commit.requested + * │   ├── 1.inflight + * │   ├── 2.replacecommit + * │   ├── 2.replacecommit.inflight + * │   ├── 2.replacecommit.requested + * │   ├── 3.commit + * │   ├── 3.commit.requested + * │   ├── 3.inflight + * │   ├── archived + * │   └── hoodie.properties + * └── 2020 + * └── 06 + * └── 27 + * ├── 5fe477d2-0150-46d4-833c-1e9cc8da9948_1-0-1_3.parquet + * ├── 7e3208c8-fdec-4254-9682-8fff1e51ee8d_1-0-1_2.parquet + * ├── e04b0e2d-1467-46b2-8ea6-f4fe950965a5_1-0-1_1.parquet + * └── f3936b66-b3db-4fc8-a6d0-b1a7559016e6_1-0-1_1.parquet + * + * First test fsView API with finished clustering: + * 1. getLatestBaseFilesBeforeOrOn + * 2. getBaseFileOn + * 3. getLatestBaseFilesInRange + * 4. getAllBaseFiles + * 5. getLatestBaseFiles + * + * Then remove 2.replacecommit, 1.commit, 1.commit.requested, 1.inflight to simulate + * pending clustering at the earliest position in the active timeline and test these APIs again. + * + * @throws IOException + */ + @Test + public void testHoodieTableFileSystemViewWithPendingClustering() throws IOException { + List latestBaseFilesBeforeOrOn; + Option baseFileOn; + List latestBaseFilesInRange; + List allBaseFiles; + List latestBaseFiles; + List latestBaseFilesPerPartition; + String partitionPath = "2020/06/27"; + new File(basePath + "/" + partitionPath).mkdirs(); + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + + // will create 5 fileId in partition. + // fileId1 and fileId2 will be replaced by fileID3 + // fileId4 and fileId5 will be committed after clustering finished. + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + String fileId4 = UUID.randomUUID().toString(); + String fileId5 = UUID.randomUUID().toString(); + + assertFalse(roView.getLatestBaseFiles(partitionPath) + .anyMatch(dfile -> dfile.getFileId().equals(fileId1) + || dfile.getFileId().equals(fileId2) + || dfile.getFileId().equals(fileId3) + || dfile.getFileId().equals(fileId4) + || dfile.getFileId().equals(fileId5)), + "No commit, should not find any data file"); + + // first insert commit + String commitTime1 = "1"; + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); + new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); + + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1); + + // build writeStats + HashMap> partitionToFile1 = new HashMap<>(); + ArrayList files1 = new ArrayList<>(); + files1.add(fileId1); + files1.add(fileId2); + partitionToFile1.put(partitionPath, files1); + List writeStats1 = buildWriteStats(partitionToFile1, commitTime1); + + HoodieCommitMetadata commitMetadata1 = + CommitUtils.buildMetadata(writeStats1, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION); + saveAsComplete(commitTimeline, instant1, Option.of(commitMetadata1.toJsonString().getBytes(StandardCharsets.UTF_8))); + commitTimeline.reload(); + + // replace commit + String commitTime2 = "2"; + String fileName3 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId3); + new File(basePath + "/" + partitionPath + "/" + 
fileName3).createNewFile(); + + HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, commitTime2); + Map> partitionToReplaceFileIds = new HashMap<>(); + List replacedFileIds = new ArrayList<>(); + replacedFileIds.add(fileId1); + replacedFileIds.add(fileId2); + partitionToReplaceFileIds.put(partitionPath, replacedFileIds); + + HashMap> partitionToFile2 = new HashMap<>(); + ArrayList files2 = new ArrayList<>(); + files2.add(fileId3); + partitionToFile2.put(partitionPath, files2); + List writeStats2 = buildWriteStats(partitionToFile2, commitTime2); + + HoodieCommitMetadata commitMetadata2 = + CommitUtils.buildMetadata(writeStats2, partitionToReplaceFileIds, Option.empty(), WriteOperationType.INSERT_OVERWRITE, "", HoodieTimeline.REPLACE_COMMIT_ACTION); + saveAsComplete(commitTimeline, instant2, Option.of(commitMetadata2.toJsonString().getBytes(StandardCharsets.UTF_8))); + + // another insert commit + String commitTime3 = "3"; + String fileName4 = FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId4); + new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile(); + HoodieInstant instant3 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime3); + + // build writeStats + HashMap> partitionToFile3 = new HashMap<>(); + ArrayList files3 = new ArrayList<>(); + files3.add(fileId4); + partitionToFile3.put(partitionPath, files3); + List writeStats3 = buildWriteStats(partitionToFile3, commitTime3); + HoodieCommitMetadata commitMetadata3 = + CommitUtils.buildMetadata(writeStats3, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION); + saveAsComplete(commitTimeline, instant3, Option.of(commitMetadata3.toJsonString().getBytes(StandardCharsets.UTF_8))); + + metaClient.reloadActiveTimeline(); + refreshFsView(); + + ArrayList commits = new ArrayList<>(); + commits.add(commitTime1); + commits.add(commitTime2); + commits.add(commitTime3); + + // do check + latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(2, latestBaseFilesBeforeOrOn.size()); + assertTrue(latestBaseFilesBeforeOrOn.contains(fileId3)); + assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4)); + + // could see fileId3 because clustering is committed. + baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3); + assertTrue(baseFileOn.isPresent()); + assertEquals(baseFileOn.get().getFileId(), fileId3); + + latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(2, latestBaseFilesInRange.size()); + assertTrue(latestBaseFilesInRange.contains(fileId3)); + assertTrue(latestBaseFilesInRange.contains(fileId4)); + + allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(2, allBaseFiles.size()); + assertTrue(allBaseFiles.contains(fileId3)); + assertTrue(allBaseFiles.contains(fileId4)); + + // could see fileId3 because clustering is committed. + latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(2, latestBaseFiles.size()); + assertTrue(allBaseFiles.contains(fileId3)); + assertTrue(allBaseFiles.contains(fileId4)); + + // could see fileId3 because clustering is committed. 
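// Editor's note: with the replace commit at instant "2" completed, the view is expected to hide the replaced
// file groups (fileId1, fileId2) and expose only the clustered file fileId3 plus fileId4 written by commit "3".
// For illustration only (not part of the change), fsView.getReplacedFileGroupsBeforeOrOn("3", partitionPath)
// should presumably list exactly fileId1 and fileId2 at this point.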
+ latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(2, latestBaseFiles.size()); + assertTrue(latestBaseFilesPerPartition.contains(fileId3)); + assertTrue(latestBaseFilesPerPartition.contains(fileId4)); + + HoodieWrapperFileSystem fs = metaClient.getFs(); + fs.delete(new Path(basePath + "/.hoodie", "1.commit"), false); + fs.delete(new Path(basePath + "/.hoodie", "1.inflight"), false); + fs.delete(new Path(basePath + "/.hoodie", "1.commit.requested"), false); + fs.delete(new Path(basePath + "/.hoodie", "2.replacecommit"), false); + + metaClient.reloadActiveTimeline(); + refreshFsView(); + // do check after delete some commit file + latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(3, latestBaseFilesBeforeOrOn.size()); + assertTrue(latestBaseFilesBeforeOrOn.contains(fileId1)); + assertTrue(latestBaseFilesBeforeOrOn.contains(fileId2)); + assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4)); + + // couldn't see fileId3 because clustering is not committed. + baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3); + assertFalse(baseFileOn.isPresent()); + + latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(3, latestBaseFilesInRange.size()); + assertTrue(latestBaseFilesInRange.contains(fileId1)); + assertTrue(latestBaseFilesInRange.contains(fileId2)); + assertTrue(latestBaseFilesInRange.contains(fileId4)); + + allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(3, allBaseFiles.size()); + assertTrue(allBaseFiles.contains(fileId1)); + assertTrue(allBaseFiles.contains(fileId2)); + assertTrue(allBaseFiles.contains(fileId4)); + + // couldn't see fileId3 because clustering is not committed. + latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(3, latestBaseFiles.size()); + assertTrue(allBaseFiles.contains(fileId1)); + assertTrue(allBaseFiles.contains(fileId2)); + assertTrue(allBaseFiles.contains(fileId4)); + + // couldn't see fileId3 because clustering is not committed. 
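// Editor's note: 2.replacecommit was deleted above while its .requested and .inflight files were kept, so the
// clustering now appears pending; the remaining checks therefore expect fileId3 to be hidden again while
// fileId1, fileId2 and fileId4 stay visible, matching the scenario described in this method's javadoc.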
+ latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList()); + assertEquals(3, latestBaseFiles.size()); + assertTrue(latestBaseFilesPerPartition.contains(fileId1)); + assertTrue(latestBaseFilesPerPartition.contains(fileId2)); + assertTrue(latestBaseFilesPerPartition.contains(fileId4)); + } + + + // Generate Hoodie WriteStat For Given Partition + private List buildWriteStats(HashMap> partitionToFileIds, String commitTime) { + HashMap>> maps = new HashMap<>(); + for (String partition : partitionToFileIds.keySet()) { + List> list = partitionToFileIds.get(partition).stream().map(fileId -> new ImmutablePair(fileId, 0)).collect(Collectors.toList()); + maps.put(partition, list); + } + return HoodieTestTable.generateHoodieWriteStatForPartition(maps, commitTime, false); + } + @Override protected HoodieTableType getTableType() { return HoodieTableType.MERGE_ON_READ; diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java index e4933cf4e983b..2f284c5befd1e 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.common.HoodieCleanStat; @@ -35,6 +36,7 @@ import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieInstant.State; @@ -322,7 +324,7 @@ public void testMultipleTransitions() throws IOException { instantsToFiles = testMultipleWriteSteps(view1, Collections.singletonList("11"), true, "11"); SyncableFileSystemView view2 = - getFileSystemView(new HoodieTableMetaClient(metaClient.getHadoopConf(), metaClient.getBasePath())); + getFileSystemView(HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()).build()); // Run 2 more ingestion on MOR table. 
View1 is not yet synced but View2 is instantsToFiles.putAll(testMultipleWriteSteps(view2, Arrays.asList("12", "13"), true, "11")); @@ -332,7 +334,7 @@ public void testMultipleTransitions() throws IOException { view2.sync(); SyncableFileSystemView view3 = - getFileSystemView(new HoodieTableMetaClient(metaClient.getHadoopConf(), metaClient.getBasePath())); + getFileSystemView(HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()).build()); view3.sync(); areViewsConsistent(view1, view2, partitions.size() * fileIdsPerPartition.size()); @@ -344,7 +346,7 @@ public void testMultipleTransitions() throws IOException { view1.sync(); areViewsConsistent(view1, view2, partitions.size() * fileIdsPerPartition.size()); SyncableFileSystemView view4 = - getFileSystemView(new HoodieTableMetaClient(metaClient.getHadoopConf(), metaClient.getBasePath())); + getFileSystemView(HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()).build()); view4.sync(); /* @@ -358,7 +360,7 @@ public void testMultipleTransitions() throws IOException { view1.sync(); areViewsConsistent(view1, view2, partitions.size() * fileIdsPerPartition.size() * 2); SyncableFileSystemView view5 = - getFileSystemView(new HoodieTableMetaClient(metaClient.getHadoopConf(), metaClient.getBasePath())); + getFileSystemView(HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()).build()); view5.sync(); /* @@ -381,7 +383,7 @@ public void testMultipleTransitions() throws IOException { view1.sync(); areViewsConsistent(view1, view2, partitions.size() * fileIdsPerPartition.size() * 2); SyncableFileSystemView view6 = - getFileSystemView(new HoodieTableMetaClient(metaClient.getHadoopConf(), metaClient.getBasePath())); + getFileSystemView(HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()).build()); view6.sync(); /* @@ -534,7 +536,7 @@ private void performClean(String instant, List files, String cleanInstan Map> partititonToFiles = deleteFiles(files); List cleanStats = partititonToFiles.entrySet().stream().map(e -> new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_COMMITS, e.getKey(), e.getValue(), e.getValue(), - new ArrayList<>(), Integer.toString(Integer.parseInt(instant) + 1))).collect(Collectors.toList()); + new ArrayList<>(), Integer.toString(Integer.parseInt(instant) + 1), "")).collect(Collectors.toList()); HoodieInstant cleanInflightInstant = new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, cleanInstant); metaClient.getActiveTimeline().createNewInstant(cleanInflightInstant); @@ -824,7 +826,7 @@ private List> generateDataForInstant(String baseIn File file = new File(basePath + "/" + p + "/" + (deltaCommit ? 
FSUtils.makeLogFileName(f, ".log", baseInstant, Integer.parseInt(instant), TEST_WRITE_TOKEN) - : FSUtils.makeDataFileName(instant, TEST_WRITE_TOKEN, f))); + : FSUtils.makeBaseFileName(instant, TEST_WRITE_TOKEN, f))); file.createNewFile(); HoodieWriteStat w = new HoodieWriteStat(); w.setFileId(f); @@ -857,11 +859,20 @@ private List addInstant(HoodieTableMetaClient metaClient, String instant private List addReplaceInstant(HoodieTableMetaClient metaClient, String instant, List> writeStats, Map> partitionToReplaceFileIds) throws IOException { + // created requested + HoodieInstant newRequestedInstant = new HoodieInstant(State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, instant); + HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder() + .setOperationType(WriteOperationType.UNKNOWN.name()).build(); + metaClient.getActiveTimeline().saveToPendingReplaceCommit(newRequestedInstant, + TimelineMetadataUtils.serializeRequestedReplaceMetadata(requestedReplaceMetadata)); + + metaClient.reloadActiveTimeline(); + // transition to inflight + HoodieInstant inflightInstant = metaClient.getActiveTimeline().transitionReplaceRequestedToInflight(newRequestedInstant, Option.empty()); + // transition to replacecommit HoodieReplaceCommitMetadata replaceCommitMetadata = new HoodieReplaceCommitMetadata(); writeStats.forEach(e -> replaceCommitMetadata.addWriteStat(e.getKey(), e.getValue())); replaceCommitMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds); - HoodieInstant inflightInstant = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, instant); - metaClient.getActiveTimeline().createNewInstant(inflightInstant); metaClient.getActiveTimeline().saveAsComplete(inflightInstant, Option.of(replaceCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); return writeStats.stream().map(e -> e.getValue().getPath()).collect(Collectors.toList()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java index bae74961180b8..9fa96216f4d53 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.testutils.MockHoodieTimeline; @@ -29,6 +30,12 @@ import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; +import org.apache.http.client.HttpResponseException; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.apache.logging.log4j.core.appender.AbstractAppender; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -36,12 +43,15 @@ import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.UUID; 
import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.reset; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -66,7 +76,8 @@ public class TestPriorityBasedFileSystemView { public void setUp() { fsView = new PriorityBasedFileSystemView(primary, secondary); testBaseFileStream = Stream.of(new HoodieBaseFile("test")); - testFileSliceStream = Stream.of(new FileSlice("2020-01-01", "20:20", "file0001.parquet")); + testFileSliceStream = Stream.of(new FileSlice("2020-01-01", "20:20", + "file0001" + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension())); } private void resetMocks() { @@ -100,6 +111,31 @@ public void testGetLatestBaseFiles() { }); } + @Test + public void testBadRequestExceptionWithPrimary() { + final TestLogAppender appender = new TestLogAppender(); + final Logger logger = (Logger) LogManager.getLogger(PriorityBasedFileSystemView.class); + try { + appender.start(); + logger.addAppender(appender); + Stream actual; + Stream expected = testBaseFileStream; + + resetMocks(); + when(primary.getLatestBaseFiles()).thenThrow(new RuntimeException(new HttpResponseException(400, "Bad Request"))); + when(secondary.getLatestBaseFiles()).thenReturn(testBaseFileStream); + actual = fsView.getLatestBaseFiles(); + assertEquals(expected, actual); + final List logs = appender.getLog(); + final LogEvent firstLogEntry = logs.get(0); + assertEquals(firstLogEntry.getLevel(), Level.WARN); + assertTrue((firstLogEntry.getMessage().getFormattedMessage()).contains("Got error running preferred function. Likely due to another " + + "concurrent writer in progress. 
Trying secondary")); + } finally { + logger.removeAppender(appender); + } + } + @Test public void testGetLatestBaseFilesWithPartitionPath() { Stream actual; @@ -589,8 +625,8 @@ public void testGetTimeline() { @Test public void testSync() { fsView.sync(); - verify(primary, times(1)).reset(); - verify(secondary, times(1)).reset(); + verify(primary, times(1)).sync(); + verify(secondary, times(1)).sync(); } @Test @@ -631,4 +667,21 @@ public void testGetPreferredView() { public void testGetSecondaryView() { assertEquals(secondary, fsView.getSecondaryView()); } + + class TestLogAppender extends AbstractAppender { + private final List log = new ArrayList<>(); + + protected TestLogAppender() { + super(UUID.randomUUID().toString(), null, null, false, null); + } + + @Override + public void append(LogEvent event) { + log.add(event); + } + + public List getLog() { + return new ArrayList(log); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedFileSystemView.java index 7f2e0dc297d72..8109249c19de8 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedFileSystemView.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedFileSystemView.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.table.view; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.table.timeline.HoodieTimeline; /** @@ -29,6 +30,7 @@ public class TestSpillableMapBasedFileSystemView extends TestHoodieTableFileSyst protected SyncableFileSystemView getFileSystemView(HoodieTimeline timeline) { return new SpillableMapBasedFileSystemView(metaClient, timeline, FileSystemViewStorageConfig.newBuilder() // pure disk base View - .withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).withMaxMemoryForView(0L).build()); + .withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).withMaxMemoryForView(0L).build(), + HoodieCommonConfig.newBuilder().build()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedIncrementalFSViewSync.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedIncrementalFSViewSync.java index d3478ce140b21..c678dd2e48c12 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedIncrementalFSViewSync.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestSpillableMapBasedIncrementalFSViewSync.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.table.view; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -29,6 +30,7 @@ public class TestSpillableMapBasedIncrementalFSViewSync extends TestIncrementalF @Override protected SyncableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline timeline) { return new SpillableMapBasedFileSystemView(metaClient, timeline, - FileSystemViewStorageConfig.newBuilder().withMaxMemoryForView(0L).withIncrementalTimelineSync(true).build()); + FileSystemViewStorageConfig.newBuilder().withMaxMemoryForView(0L).withIncrementalTimelineSync(true).build(), + HoodieCommonConfig.newBuilder().build()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java 
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java index ff862ee7b7f7f..edd1a05360231 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java @@ -44,7 +44,7 @@ public AvroBinaryTestPayload(Option record) { } @Override - public HoodieRecordPayload preCombine(HoodieRecordPayload another) { + public HoodieRecordPayload preCombine(HoodieRecordPayload oldValue) { return this; } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/CheckedFunction.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/CheckedFunction.java new file mode 100644 index 0000000000000..b0b3588c626a6 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/CheckedFunction.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.testutils; + +@FunctionalInterface +public interface CheckedFunction { + R apply(T t) throws Exception; +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/ClusteringTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/ClusteringTestUtils.java new file mode 100644 index 0000000000000..b142fe90b4cfa --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/ClusteringTestUtils.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.testutils; + +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.exception.HoodieException; + +import java.nio.file.Paths; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; + +import static org.apache.hudi.common.testutils.FileCreateUtils.baseFileName; +import static org.apache.hudi.common.testutils.FileCreateUtils.createBaseFile; +import static org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS; + +public class ClusteringTestUtils { + + public static HoodieClusteringPlan createClusteringPlan(HoodieTableMetaClient metaClient, String instantTime, String fileId) { + try { + String basePath = metaClient.getBasePath(); + String partition = DEFAULT_PARTITION_PATHS[0]; + createBaseFile(basePath, partition, instantTime, fileId, 1); + FileSlice slice = new FileSlice(partition, instantTime, fileId); + slice.setBaseFile(new CompactionTestUtils.DummyHoodieBaseFile(Paths.get(basePath, partition, + baseFileName(instantTime, fileId)).toString())); + List[] fileSliceGroups = new List[] {Collections.singletonList(slice)}; + HoodieClusteringPlan clusteringPlan = ClusteringUtils.createClusteringPlan("strategy", new HashMap<>(), + fileSliceGroups, Collections.emptyMap()); + return clusteringPlan; + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/CompactionTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/CompactionTestUtils.java index 44e3da059d86c..fb5f123e80234 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/CompactionTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/CompactionTestUtils.java @@ -110,7 +110,7 @@ public static Map> se } }); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), metaClient.getBasePath(), true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()).setLoadActiveTimelineOnLoad(true).build(); Map> pendingCompactionMap = CompactionUtils.getAllPendingCompactionOperations(metaClient); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java index bca91f8001677..8be78a3a96927 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java @@ -21,18 +21,30 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import 
org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; import java.io.RandomAccessFile; @@ -40,22 +52,37 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.attribute.FileTime; +import java.time.Instant; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCleanMetadata; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCleanerPlan; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCompactionPlan; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeRequestedReplaceMetadata; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeRestoreMetadata; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeRollbackMetadata; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeRollbackPlan; public class FileCreateUtils { + private static final Logger LOG = LogManager.getLogger(FileCreateUtils.class); + private static final String WRITE_TOKEN = "1-0-1"; + private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); + /** An empty byte array */ + public static final byte[] EMPTY_BYTES = new byte[0]; public static String baseFileName(String instantTime, String fileId) { - return baseFileName(instantTime, fileId, HoodieFileFormat.PARQUET.getFileExtension()); + return baseFileName(instantTime, fileId, BASE_FILE_EXTENSION); } public static String baseFileName(String instantTime, String fileId, String fileExtension) { - return FSUtils.makeDataFileName(instantTime, WRITE_TOKEN, fileId, fileExtension); + return FSUtils.makeBaseFileName(instantTime, WRITE_TOKEN, fileId, fileExtension); } public static String logFileName(String instantTime, String fileId, int version) { @@ -67,28 +94,54 @@ public static String logFileName(String instantTime, String fileId, int version, } public static String markerFileName(String instantTime, String fileId, IOType ioType) { - return markerFileName(instantTime, fileId, ioType, HoodieFileFormat.PARQUET.getFileExtension()); + return markerFileName(instantTime, fileId, ioType, BASE_FILE_EXTENSION); } public static String markerFileName(String instantTime, String fileId, IOType ioType, String fileExtension) { return String.format("%s_%s_%s%s%s.%s", fileId, WRITE_TOKEN, instantTime, fileExtension, HoodieTableMetaClient.MARKER_EXTN, ioType); } + private static void createMetaFile(String basePath, String instantTime, String suffix, FileSystem fs) throws IOException { + org.apache.hadoop.fs.Path parentPath = new org.apache.hadoop.fs.Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME); + if (!fs.exists(parentPath)) { + 
fs.create(parentPath).close(); + } + org.apache.hadoop.fs.Path metaFilePath = new org.apache.hadoop.fs.Path(parentPath, instantTime + suffix); + if (!fs.exists(metaFilePath)) { + fs.create(metaFilePath).close(); + } + } + private static void createMetaFile(String basePath, String instantTime, String suffix) throws IOException { + createMetaFile(basePath, instantTime, suffix, "".getBytes()); + } + + private static void createMetaFile(String basePath, String instantTime, String suffix, byte[] content) throws IOException { Path parentPath = Paths.get(basePath, HoodieTableMetaClient.METAFOLDER_NAME); Files.createDirectories(parentPath); Path metaFilePath = parentPath.resolve(instantTime + suffix); if (Files.notExists(metaFilePath)) { - Files.createFile(metaFilePath); + if (content.length == 0) { + Files.createFile(metaFilePath); + } else { + Files.write(metaFilePath, content); + } } } - private static void createMetaFile(String basePath, String instantTime, String suffix, byte[] content) throws IOException { + private static void deleteMetaFile(String basePath, String instantTime, String suffix, FileSystem fs) throws IOException { + org.apache.hadoop.fs.Path parentPath = new org.apache.hadoop.fs.Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME); + org.apache.hadoop.fs.Path metaFilePath = new org.apache.hadoop.fs.Path(parentPath, instantTime + suffix); + if (fs.exists(metaFilePath)) { + fs.delete(metaFilePath, true); + } + } + + private static void deleteMetaFile(String basePath, String instantTime, String suffix) throws IOException { Path parentPath = Paths.get(basePath, HoodieTableMetaClient.METAFOLDER_NAME); - Files.createDirectories(parentPath); Path metaFilePath = parentPath.resolve(instantTime + suffix); - if (Files.notExists(metaFilePath)) { - Files.write(metaFilePath, content); + if (Files.exists(metaFilePath)) { + Files.delete(metaFilePath); } } @@ -96,6 +149,23 @@ public static void createCommit(String basePath, String instantTime) throws IOEx createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION); } + public static void createCommit(String basePath, String instantTime, Option metadata) throws IOException { + if (metadata.isPresent()) { + createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION, + metadata.get().toJsonString().getBytes(StandardCharsets.UTF_8)); + } else { + createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION); + } + } + + public static void createSavepointCommit(String basePath, String instantTime, HoodieSavepointMetadata savepointMetadata) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.SAVEPOINT_EXTENSION, TimelineMetadataUtils.serializeSavepointMetadata(savepointMetadata).get()); + } + + public static void createCommit(String basePath, String instantTime, FileSystem fs) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION, fs); + } + public static void createRequestedCommit(String basePath, String instantTime) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_COMMIT_EXTENSION); } @@ -104,10 +174,18 @@ public static void createInflightCommit(String basePath, String instantTime) thr createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_COMMIT_EXTENSION); } + public static void createDeltaCommit(String basePath, String instantTime, HoodieCommitMetadata metadata) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION, 
metadata.toJsonString().getBytes(StandardCharsets.UTF_8)); + } + public static void createDeltaCommit(String basePath, String instantTime) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION); } + public static void createDeltaCommit(String basePath, String instantTime, FileSystem fs) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION, fs); + } + public static void createRequestedDeltaCommit(String basePath, String instantTime) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_DELTA_COMMIT_EXTENSION); } @@ -116,30 +194,82 @@ public static void createInflightDeltaCommit(String basePath, String instantTime createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_DELTA_COMMIT_EXTENSION); } + public static void createInflightReplaceCommit(String basePath, String instantTime) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_REPLACE_COMMIT_EXTENSION); + } + public static void createReplaceCommit(String basePath, String instantTime, HoodieReplaceCommitMetadata metadata) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.REPLACE_COMMIT_EXTENSION, metadata.toJsonString().getBytes(StandardCharsets.UTF_8)); } - public static void createRequestedReplaceCommit(String basePath, String instantTime) throws IOException { - createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_REPLACE_COMMIT_EXTENSION); + public static void createRequestedReplaceCommit(String basePath, String instantTime, Option requestedReplaceMetadata) throws IOException { + if (requestedReplaceMetadata.isPresent()) { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_REPLACE_COMMIT_EXTENSION, serializeRequestedReplaceMetadata(requestedReplaceMetadata.get()).get()); + } else { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_REPLACE_COMMIT_EXTENSION); + } } - public static void createInflightReplaceCommit(String basePath, String instantTime) throws IOException { - createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_REPLACE_COMMIT_EXTENSION); + public static void createInflightReplaceCommit(String basePath, String instantTime, Option inflightReplaceMetadata) throws IOException { + if (inflightReplaceMetadata.isPresent()) { + createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_REPLACE_COMMIT_EXTENSION, inflightReplaceMetadata.get().toJsonString().getBytes(StandardCharsets.UTF_8)); + } else { + createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_REPLACE_COMMIT_EXTENSION); + } + } + + public static void createRequestedCompactionCommit(String basePath, String instantTime, HoodieCompactionPlan requestedCompactionPlan) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_COMPACTION_EXTENSION, serializeCompactionPlan(requestedCompactionPlan).get()); } public static void createCleanFile(String basePath, String instantTime, HoodieCleanMetadata metadata) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.CLEAN_EXTENSION, serializeCleanMetadata(metadata).get()); } + public static void createCleanFile(String basePath, String instantTime, HoodieCleanMetadata metadata, boolean isEmpty) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.CLEAN_EXTENSION, isEmpty ? 
EMPTY_BYTES : serializeCleanMetadata(metadata).get()); + } + public static void createRequestedCleanFile(String basePath, String instantTime, HoodieCleanerPlan cleanerPlan) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_CLEAN_EXTENSION, serializeCleanerPlan(cleanerPlan).get()); } + public static void createRequestedCleanFile(String basePath, String instantTime, HoodieCleanerPlan cleanerPlan, boolean isEmpty) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_CLEAN_EXTENSION, isEmpty ? EMPTY_BYTES : serializeCleanerPlan(cleanerPlan).get()); + } + public static void createInflightCleanFile(String basePath, String instantTime, HoodieCleanerPlan cleanerPlan) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_CLEAN_EXTENSION, serializeCleanerPlan(cleanerPlan).get()); } + public static void createInflightCleanFile(String basePath, String instantTime, HoodieCleanerPlan cleanerPlan, boolean isEmpty) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_CLEAN_EXTENSION, isEmpty ? EMPTY_BYTES : serializeCleanerPlan(cleanerPlan).get()); + } + + public static void createRequestedRollbackFile(String basePath, String instantTime, HoodieRollbackPlan plan) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION, serializeRollbackPlan(plan).get()); + } + + public static void createRequestedRollbackFile(String basePath, String instantTime, byte[] content) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION, content); + } + + public static void createRequestedRollbackFile(String basePath, String instantTime) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION); + } + + public static void createInflightRollbackFile(String basePath, String instantTime) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_ROLLBACK_EXTENSION); + } + + public static void createRollbackFile(String basePath, String instantTime, HoodieRollbackMetadata hoodieRollbackMetadata, boolean isEmpty) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.ROLLBACK_EXTENSION, isEmpty ? 
EMPTY_BYTES : serializeRollbackMetadata(hoodieRollbackMetadata).get()); + } + + public static void createRestoreFile(String basePath, String instantTime, HoodieRestoreMetadata hoodieRestoreMetadata) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.RESTORE_ACTION, serializeRestoreMetadata(hoodieRestoreMetadata).get()); + } + private static void createAuxiliaryMetaFile(String basePath, String instantTime, String suffix) throws IOException { Path parentPath = Paths.get(basePath, HoodieTableMetaClient.AUXILIARYFOLDER_NAME); Files.createDirectories(parentPath); @@ -157,10 +287,18 @@ public static void createInflightCompaction(String basePath, String instantTime) createAuxiliaryMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_COMPACTION_EXTENSION); } + public static void createPendingInflightCompaction(String basePath, String instantTime) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_COMPACTION_EXTENSION); + } + + public static void createInflightSavepoint(String basePath, String instantTime) throws IOException { + createAuxiliaryMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_SAVEPOINT_EXTENSION); + } + public static void createPartitionMetaFile(String basePath, String partitionPath) throws IOException { Path parentPath = Paths.get(basePath, partitionPath); Files.createDirectories(parentPath); - Path metaFilePath = parentPath.resolve(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE); + Path metaFilePath = parentPath.resolve(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX); if (Files.notExists(metaFilePath)) { Files.createFile(metaFilePath); } @@ -168,18 +306,31 @@ public static void createPartitionMetaFile(String basePath, String partitionPath public static void createBaseFile(String basePath, String partitionPath, String instantTime, String fileId) throws Exception { - createBaseFile(basePath, partitionPath, instantTime, fileId, 0); + createBaseFile(basePath, partitionPath, instantTime, fileId, 1); } public static void createBaseFile(String basePath, String partitionPath, String instantTime, String fileId, long length) throws Exception { + createBaseFile(basePath, partitionPath, instantTime, fileId, length, Instant.now().toEpochMilli()); + } + + public static void createBaseFile(String basePath, String partitionPath, String instantTime, String fileId, long length, long lastModificationTimeMilli) + throws Exception { Path parentPath = Paths.get(basePath, partitionPath); Files.createDirectories(parentPath); Path baseFilePath = parentPath.resolve(baseFileName(instantTime, fileId)); if (Files.notExists(baseFilePath)) { Files.createFile(baseFilePath); } - new RandomAccessFile(baseFilePath.toFile(), "rw").setLength(length); + RandomAccessFile raf = new RandomAccessFile(baseFilePath.toFile(), "rw"); + raf.setLength(length); + raf.close(); + Files.setLastModifiedTime(baseFilePath, FileTime.fromMillis(lastModificationTimeMilli)); + } + + public static Path getBaseFilePath(String basePath, String partitionPath, String instantTime, String fileId) { + Path parentPath = Paths.get(basePath, partitionPath); + return parentPath.resolve(baseFileName(instantTime, fileId)); } public static void createLogFile(String basePath, String partitionPath, String instantTime, String fileId, int version) @@ -195,7 +346,9 @@ public static void createLogFile(String basePath, String partitionPath, String i if (Files.notExists(logFilePath)) { Files.createFile(logFilePath); } - new RandomAccessFile(logFilePath.toFile(), 
"rw").setLength(length); + RandomAccessFile raf = new RandomAccessFile(logFilePath.toFile(), "rw"); + raf.setLength(length); + raf.close(); } public static String createMarkerFile(String basePath, String partitionPath, String instantTime, String fileId, IOType ioType) @@ -209,6 +362,48 @@ public static String createMarkerFile(String basePath, String partitionPath, Str return markerFilePath.toAbsolutePath().toString(); } + private static void removeMetaFile(String basePath, String instantTime, String suffix) throws IOException { + Path parentPath = Paths.get(basePath, HoodieTableMetaClient.METAFOLDER_NAME); + Path metaFilePath = parentPath.resolve(instantTime + suffix); + if (Files.exists(metaFilePath)) { + Files.delete(metaFilePath); + } + } + + public static void deleteCommit(String basePath, String instantTime) throws IOException { + removeMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION); + } + + public static void deleteRequestedCommit(String basePath, String instantTime) throws IOException { + removeMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_COMMIT_EXTENSION); + } + + public static void deleteInflightCommit(String basePath, String instantTime) throws IOException { + removeMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_COMMIT_EXTENSION); + } + + public static void deleteDeltaCommit(String basePath, String instantTime) throws IOException { + removeMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION); + } + + public static void deleteReplaceCommit(String basePath, String instantTime) throws IOException { + removeMetaFile(basePath, instantTime, HoodieTimeline.REPLACE_COMMIT_EXTENSION); + } + + public static void deleteRollbackCommit(String basePath, String instantTime) throws IOException { + removeMetaFile(basePath, instantTime, HoodieTimeline.ROLLBACK_EXTENSION); + } + + public static Path renameFileToTemp(Path sourcePath, String instantTime) throws IOException { + Path dummyFilePath = sourcePath.getParent().resolve(instantTime + ".temp"); + Files.move(sourcePath, dummyFilePath); + return dummyFilePath; + } + + public static void renameTempToMetaFile(Path tempFilePath, Path destPath) throws IOException { + Files.move(tempFilePath, destPath); + } + public static long getTotalMarkerFileCount(String basePath, String partitionPath, String instantTime, IOType ioType) throws IOException { Path parentPath = Paths.get(basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTime, partitionPath); if (Files.notExists(parentPath)) { @@ -218,13 +413,32 @@ public static long getTotalMarkerFileCount(String basePath, String partitionPath .endsWith(String.format("%s.%s", HoodieTableMetaClient.MARKER_EXTN, ioType))).count(); } + public static List getPartitionPaths(Path basePath) throws IOException { + if (Files.notExists(basePath)) { + return Collections.emptyList(); + } + return Files.list(basePath).filter(entry -> !entry.getFileName().toString().equals(HoodieTableMetaClient.METAFOLDER_NAME) + && !isBaseOrLogFilename(entry.getFileName().toString()) + && !entry.getFileName().toString().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) + .collect(Collectors.toList()); + } + + public static boolean isBaseOrLogFilename(String filename) { + for (HoodieFileFormat format : HoodieFileFormat.values()) { + if (filename.contains(format.getFileExtension())) { + return true; + } + } + return false; + } + /** * Find total basefiles for passed in paths. */ public static Map getBaseFileCountsForPaths(String basePath, FileSystem fs, String... 
paths) { Map toReturn = new HashMap<>(); try { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); for (String path : paths) { TableFileSystemView.BaseFileOnlyView fileSystemView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new org.apache.hadoop.fs.Path(path))); @@ -235,4 +449,13 @@ public static Map getBaseFileCountsForPaths(String basePath, FileS throw new HoodieException("Error reading hoodie table as a dataframe", e); } } + + public static void deleteDeltaCommit(String basePath, String instantTime, FileSystem fs) throws IOException { + deleteMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION, fs); + } + + public static void deleteSavepointCommit(String basePath, String instantTime, FileSystem fs) throws IOException { + deleteMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_SAVEPOINT_EXTENSION, fs); + deleteMetaFile(basePath, instantTime, HoodieTimeline.SAVEPOINT_EXTENSION, fs); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java index 76fdf18d4a820..95188bb0b68d9 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java @@ -75,7 +75,11 @@ public static void deleteFile(File fileToDelete) throws IOException { } public static List listRecursive(FileSystem fs, Path path) throws IOException { - RemoteIterator itr = fs.listFiles(path, true); + return listFiles(fs, path, true); + } + + public static List listFiles(FileSystem fs, Path path, boolean recursive) throws IOException { + RemoteIterator itr = fs.listFiles(path, recursive); List statuses = new ArrayList<>(); while (itr.hasNext()) { statuses.add(itr.next()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HadoopMapRedUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HadoopMapRedUtils.java new file mode 100644 index 0000000000000..a06039b5fba35 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HadoopMapRedUtils.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.testutils; + +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hudi.common.util.Option; + +import java.util.concurrent.ConcurrentHashMap; + +public class HadoopMapRedUtils { + + /** + * Creates instance of {@link Reporter} to collect reported counters + */ + public static Reporter createTestReporter() { + class TestReporter implements Reporter { + private final ConcurrentHashMap counters = + new ConcurrentHashMap<>(); + + @Override + public void setStatus(String status) { + // not-supported + } + + @Override + public Counters.Counter getCounter(Enum name) { + return counters.computeIfAbsent(name.name(), (ignored) -> new Counters.Counter()); + } + + @Override + public Counters.Counter getCounter(String group, String name) { + return counters.computeIfAbsent(getKey(group, name), (ignored) -> new Counters.Counter()); + } + + @Override + public void incrCounter(Enum key, long amount) { + Option.ofNullable(counters.get(key)) + .ifPresent(c -> c.increment(amount)); + } + + @Override + public void incrCounter(String group, String counter, long amount) { + Option.ofNullable(counters.get(getKey(group, counter))) + .ifPresent(c -> c.increment(amount)); + } + + @Override + public InputSplit getInputSplit() throws UnsupportedOperationException { + throw new UnsupportedOperationException("not supported"); + } + + @Override + public float getProgress() { + return -1; + } + + @Override + public void progress() { + // not-supported + } + + private String getKey(String group, String name) { + return String.format("%s:%s", group, name); + } + } + + return new TestReporter(); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java index 96a00da6f2b32..dc64856d3c76c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java @@ -34,11 +34,17 @@ */ public class HoodieCommonTestHarness { + protected String tableName = null; protected String basePath = null; + protected transient HoodieTestDataGenerator dataGen = null; protected transient HoodieTableMetaClient metaClient; @TempDir public java.nio.file.Path tempDir; + protected void setTableName(String tableName) { + this.tableName = tableName; + } + /** * Initializes basePath. */ @@ -52,6 +58,28 @@ protected void initPath() { } } + /** + * Initializes a test data generator which used to generate test datas. + * + */ + protected void initTestDataGenerator() { + dataGen = new HoodieTestDataGenerator(); + } + + protected void initTestDataGenerator(String[] partitionPaths) { + dataGen = new HoodieTestDataGenerator(partitionPaths); + } + + /** + * Cleanups test data generator. + * + */ + protected void cleanupTestDataGenerator() { + if (dataGen != null) { + dataGen = null; + } + } + /** * Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by * {@code getTableType()}. 
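For readers skimming the harness changes above, here is a minimal usage sketch of the new data-generator lifecycle hooks together with the seeded HoodieTestDataGenerator constructor introduced in this patch. It is illustrative only: the test class name, JUnit 5 wiring, and generic type parameters are assumptions, not part of the diff.

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.List;

class ExampleHarnessUsageTest extends HoodieCommonTestHarness {

  @BeforeEach
  void setUp() throws IOException {
    initPath();
    initMetaClient();
    initTestDataGenerator();    // populates the protected dataGen field for this test
  }

  @AfterEach
  void tearDown() {
    cleanupTestDataGenerator(); // drops the generator so each test starts clean
  }

  @Test
  void seededGeneratorIsReproducible() throws IOException {
    // The new seed-based constructor makes record generation repeatable across runs;
    // the generator is AutoCloseable after this patch, so try-with-resources applies.
    try (HoodieTestDataGenerator seeded = new HoodieTestDataGenerator(42L)) {
      List<HoodieRecord> inserts = seeded.generateInsertsForPartition(
          "001", 10, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH);
      List<HoodieRecord> updates = seeded.generateUpdates("002", inserts);
      assertEquals(inserts.size(), updates.size());
    }
  }
}
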
@@ -64,7 +92,7 @@ protected void initMetaClient() throws IOException { } protected void refreshFsView() throws IOException { - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); } protected SyncableFileSystemView getFileSystemView(HoodieTimeline timeline) throws IOException { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java index 17e93feca95d2..8614060126dfa 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java @@ -23,6 +23,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodiePartitionMetadata; @@ -31,7 +32,9 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.AvroOrcUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Conversions; @@ -47,13 +50,18 @@ import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.orc.TypeDescription; import java.io.IOException; import java.io.Serializable; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; import java.math.BigDecimal; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; -import java.sql.Date; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -74,13 +82,14 @@ *

    * Test data uses a toy Uber trips, data model. */ -public class HoodieTestDataGenerator { +public class HoodieTestDataGenerator implements AutoCloseable { // based on examination of sample file, the schema produces the following per record size public static final int BYTES_PER_RECORD = (int) (1.2 * 1024); // with default bloom filter with 60,000 entries and 0.000000001 FPRate public static final int BLOOM_FILTER_BYTES = 323495; private static Logger logger = LogManager.getLogger(HoodieTestDataGenerator.class); + public static final String NO_PARTITION_PATH = ""; public static final String DEFAULT_FIRST_PARTITION_PATH = "2016/03/15"; public static final String DEFAULT_SECOND_PARTITION_PATH = "2015/03/16"; public static final String DEFAULT_THIRD_PARTITION_PATH = "2015/03/17"; @@ -88,8 +97,10 @@ public class HoodieTestDataGenerator { public static final String[] DEFAULT_PARTITION_PATHS = {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH}; public static final int DEFAULT_PARTITION_DEPTH = 3; + public static final String TRIP_SCHEMA_PREFIX = "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ " + "{\"name\": \"timestamp\",\"type\": \"long\"}," + "{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"partition_path\", \"type\": [\"null\", \"string\"], \"default\": null }," + "{\"name\": \"rider\", \"type\": \"string\"}," + "{\"name\": \"driver\", \"type\": \"string\"}," + "{\"name\": \"begin_lat\", \"type\": \"double\"}," + "{\"name\": \"begin_lon\", \"type\": \"double\"}," + "{\"name\": \"end_lat\", \"type\": \"double\"}," + "{\"name\": \"end_lon\", \"type\": \"double\"},"; @@ -98,8 +109,8 @@ public class HoodieTestDataGenerator { + "{\"name\": \"amount\",\"type\": \"double\"},{\"name\": \"currency\", \"type\": \"string\"}]}},"; public static final String FARE_FLATTENED_SCHEMA = "{\"name\": \"fare\", \"type\": \"double\"}," + "{\"name\": \"currency\", \"type\": \"string\"},"; - public static final String TIP_NESTED_SCHEMA = "{\"name\": \"tip_history\", \"default\": null, \"type\": {\"type\": " - + "\"array\", \"items\": {\"type\": \"record\", \"default\": null, \"name\": \"tip_history\", \"fields\": [" + public static final String TIP_NESTED_SCHEMA = "{\"name\": \"tip_history\", \"default\": [], \"type\": {\"type\": " + + "\"array\", \"default\": [], \"items\": {\"type\": \"record\", \"default\": null, \"name\": \"tip_history\", \"fields\": [" + "{\"name\": \"amount\", \"type\": \"double\"}, {\"name\": \"currency\", \"type\": \"string\"}]}}},"; public static final String MAP_TYPE_SCHEMA = "{\"name\": \"city_to_state\", \"type\": {\"type\": \"map\", \"values\": \"string\"}},"; public static final String EXTRA_TYPE_SCHEMA = "{\"name\": \"distance_in_meters\", \"type\": \"int\"}," @@ -123,18 +134,20 @@ public class HoodieTestDataGenerator { + "{\"name\":\"driver\",\"type\":\"string\"},{\"name\":\"fare\",\"type\":\"double\"},{\"name\": \"_hoodie_is_deleted\", \"type\": \"boolean\", \"default\": false}]}"; public static final String NULL_SCHEMA = Schema.create(Schema.Type.NULL).toString(); - public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,double,double,double,double,int,bigint,float,binary,int,bigint,decimal(10,6)," + public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,string,double,double,double,double,int,bigint,float,binary,int,bigint,decimal(10,6)," + "map,struct,array>,boolean"; public static final Schema AVRO_SCHEMA = new 
Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); + public static final TypeDescription ORC_SCHEMA = AvroOrcUtils.createOrcSchema(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); public static final Schema AVRO_SCHEMA_WITH_METADATA_FIELDS = HoodieAvroUtils.addMetadataFields(AVRO_SCHEMA); public static final Schema AVRO_SHORT_TRIP_SCHEMA = new Schema.Parser().parse(SHORT_TRIP_SCHEMA); public static final Schema AVRO_TRIP_SCHEMA = new Schema.Parser().parse(TRIP_SCHEMA); + public static final TypeDescription ORC_TRIP_SCHEMA = AvroOrcUtils.createOrcSchema(new Schema.Parser().parse(TRIP_SCHEMA)); public static final Schema FLATTENED_AVRO_SCHEMA = new Schema.Parser().parse(TRIP_FLATTENED_SCHEMA); - private static final Random RAND = new Random(46474747); + private final Random rand; //Maintains all the existing keys schema wise private final Map> existingKeysBySchema; @@ -142,28 +155,60 @@ public class HoodieTestDataGenerator { //maintains the count of existing keys schema wise private Map numKeysBySchema; + public HoodieTestDataGenerator(long seed) { + this(seed, DEFAULT_PARTITION_PATHS, new HashMap<>()); + } + + public HoodieTestDataGenerator(long seed, String[] partitionPaths, Map keyPartitionMap) { + this.rand = new Random(seed); + this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length); + this.existingKeysBySchema = new HashMap<>(); + this.existingKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap); + this.numKeysBySchema = new HashMap<>(); + this.numKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap.size()); + + logger.info(String.format("Test DataGenerator's seed (%s)", seed)); + } + + ////////////////////////////////////////////////////////////////////////////////// + // DEPRECATED API + ////////////////////////////////////////////////////////////////////////////////// + + @Deprecated public HoodieTestDataGenerator(String[] partitionPaths) { this(partitionPaths, new HashMap<>()); } + @Deprecated public HoodieTestDataGenerator() { this(DEFAULT_PARTITION_PATHS); } + @Deprecated public HoodieTestDataGenerator(String[] partitionPaths, Map keyPartitionMap) { - this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length); - this.existingKeysBySchema = new HashMap<>(); - existingKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap); - numKeysBySchema = new HashMap<>(); + // NOTE: This used as a workaround to make sure that new instantiations of the generator + // always return "new" random values. + // Caveat is that if 2 successive invocations are made w/in the timespan that is smaller + // than the resolution of {@code nanoTime}, then this will produce identical results + this(System.nanoTime(), partitionPaths, keyPartitionMap); } + /** + * @deprecated please use non-static version + */ + public static void writePartitionMetadataDeprecated(FileSystem fs, String[] partitionPaths, String basePath) { + new HoodieTestDataGenerator().writePartitionMetadata(fs, partitionPaths, basePath); + } + + ////////////////////////////////////////////////////////////////////////////////// + /** * @implNote {@link HoodieTestDataGenerator} is supposed to just generate records with schemas. Leave HoodieTable files (metafile, basefile, logfile, etc) to {@link HoodieTestTable}. * @deprecated Use {@link HoodieTestTable#withPartitionMetaFiles(java.lang.String...)} instead. 
*/ - public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) { + public void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) { for (String partitionPath : partitionPaths) { - new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0); + new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath), Option.empty()).trySave(0); } } @@ -191,7 +236,7 @@ public RawTripTestPayload generateRandomValueAsPerSchema(String schemaStr, Hoodi * @param instantTime Instant time to use. * @return Raw paylaod of a test record. */ - public static RawTripTestPayload generateRandomValue(HoodieKey key, String instantTime) throws IOException { + public RawTripTestPayload generateRandomValue(HoodieKey key, String instantTime) throws IOException { return generateRandomValue(key, instantTime, false); } @@ -205,10 +250,15 @@ public static RawTripTestPayload generateRandomValue(HoodieKey key, String insta * @return Raw paylaod of a test record. * @throws IOException */ - public static RawTripTestPayload generateRandomValue( + private RawTripTestPayload generateRandomValue( HoodieKey key, String instantTime, boolean isFlattened) throws IOException { + return generateRandomValue(key, instantTime, isFlattened, 0); + } + + private RawTripTestPayload generateRandomValue( + HoodieKey key, String instantTime, boolean isFlattened, int ts) throws IOException { GenericRecord rec = generateGenericRecord( - key.getRecordKey(), "rider-" + instantTime, "driver-" + instantTime, 0, + key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, ts, false, isFlattened); return new RawTripTestPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA); } @@ -229,52 +279,53 @@ public RawTripTestPayload generatePayloadForShortTripSchema(HoodieKey key, Strin /** * Generates a new avro record of the above schema format for a delete. */ - public static RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String instantTime) throws IOException { - GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + instantTime, "driver-" + instantTime, 0, + private RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String instantTime) throws IOException { + GenericRecord rec = generateGenericRecord(key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, 0, true, false); - return new RawTripTestPayload(Option.of(rec.toString()), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA, true); + return new RawTripTestPayload(Option.of(rec.toString()), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA, true, 0L); } /** * Generates a new avro record of the above schema format, retaining the key if optionally provided. 
*/ - public static HoodieAvroPayload generateAvroPayload(HoodieKey key, String instantTime) { - GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + instantTime, "driver-" + instantTime, 0); + private HoodieAvroPayload generateAvroPayload(HoodieKey key, String instantTime) { + GenericRecord rec = generateGenericRecord(key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, 0); return new HoodieAvroPayload(Option.of(rec)); } - public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName, - long timestamp) { - return generateGenericRecord(rowKey, riderName, driverName, timestamp, false, false); + public GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName, + long timestamp) { + return generateGenericRecord(rowKey, partitionPath, riderName, driverName, timestamp, false, false); } - public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName, + public GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName, long timestamp, boolean isDeleteRecord, boolean isFlattened) { GenericRecord rec = new GenericData.Record(isFlattened ? FLATTENED_AVRO_SCHEMA : AVRO_SCHEMA); rec.put("_row_key", rowKey); rec.put("timestamp", timestamp); + rec.put("partition_path", partitionPath); rec.put("rider", riderName); rec.put("driver", driverName); - rec.put("begin_lat", RAND.nextDouble()); - rec.put("begin_lon", RAND.nextDouble()); - rec.put("end_lat", RAND.nextDouble()); - rec.put("end_lon", RAND.nextDouble()); + rec.put("begin_lat", rand.nextDouble()); + rec.put("begin_lon", rand.nextDouble()); + rec.put("end_lat", rand.nextDouble()); + rec.put("end_lon", rand.nextDouble()); if (isFlattened) { - rec.put("fare", RAND.nextDouble() * 100); + rec.put("fare", rand.nextDouble() * 100); rec.put("currency", "USD"); } else { - rec.put("distance_in_meters", RAND.nextInt()); - rec.put("seconds_since_epoch", RAND.nextLong()); - rec.put("weight", RAND.nextFloat()); + rec.put("distance_in_meters", rand.nextInt()); + rec.put("seconds_since_epoch", rand.nextLong()); + rec.put("weight", rand.nextFloat()); byte[] bytes = "Canada".getBytes(); rec.put("nation", ByteBuffer.wrap(bytes)); - long currentTimeMillis = System.currentTimeMillis(); - Date date = new Date(currentTimeMillis); - rec.put("current_date", (int) date.toLocalDate().toEpochDay()); - rec.put("current_ts", currentTimeMillis); + long randomMillis = genRandomTimeMillis(rand); + Instant instant = Instant.ofEpochMilli(randomMillis); + rec.put("current_date", (int) LocalDateTime.ofInstant(instant, ZoneOffset.UTC).toLocalDate().toEpochDay()); + rec.put("current_ts", randomMillis); - BigDecimal bigDecimal = new BigDecimal(String.format("%5f", RAND.nextFloat())); + BigDecimal bigDecimal = new BigDecimal(String.format("%5f", rand.nextFloat())); Schema decimalSchema = AVRO_SCHEMA.getField("height").schema(); Conversions.DecimalConversion decimalConversions = new Conversions.DecimalConversion(); GenericFixed genericFixed = decimalConversions.toFixed(bigDecimal, decimalSchema, LogicalTypes.decimal(10, 6)); @@ -283,14 +334,14 @@ public static GenericRecord generateGenericRecord(String rowKey, String riderNam rec.put("city_to_state", Collections.singletonMap("LA", "CA")); GenericRecord fareRecord = new GenericData.Record(AVRO_SCHEMA.getField("fare").schema()); - fareRecord.put("amount", RAND.nextDouble() * 100); + fareRecord.put("amount", 
rand.nextDouble() * 100); fareRecord.put("currency", "USD"); rec.put("fare", fareRecord); GenericArray tipHistoryArray = new GenericData.Array<>(1, AVRO_SCHEMA.getField("tip_history").schema()); Schema tipSchema = new Schema.Parser().parse(AVRO_SCHEMA.getField("tip_history").schema().toString()).getElementType(); GenericRecord tipRecord = new GenericData.Record(tipSchema); - tipRecord.put("amount", RAND.nextDouble() * 100); + tipRecord.put("amount", rand.nextDouble() * 100); tipRecord.put("currency", "USD"); tipHistoryArray.add(tipRecord); rec.put("tip_history", tipHistoryArray); @@ -313,7 +364,7 @@ public GenericRecord generateRecordForTripSchema(String rowKey, String riderName rec.put("timestamp", timestamp); rec.put("rider", riderName); rec.put("driver", driverName); - rec.put("fare", RAND.nextDouble() * 100); + rec.put("fare", rand.nextDouble() * 100); rec.put("_hoodie_is_deleted", false); return rec; } @@ -324,7 +375,7 @@ public GenericRecord generateRecordForShortTripSchema(String rowKey, String ride rec.put("timestamp", timestamp); rec.put("rider", riderName); rec.put("driver", driverName); - rec.put("fare", RAND.nextDouble() * 100); + rec.put("fare", rand.nextDouble() * 100); rec.put("_hoodie_is_deleted", false); return rec; } @@ -334,13 +385,21 @@ public static void createCommitFile(String basePath, String instantTime, Configu createCommitFile(basePath, instantTime, configuration, commitMetadata); } - public static void createCommitFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { + private static void createCommitFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { Arrays.asList(HoodieTimeline.makeCommitFileName(instantTime), HoodieTimeline.makeInflightCommitFileName(instantTime), HoodieTimeline.makeRequestedCommitFileName(instantTime)) .forEach(f -> createMetadataFile(f, basePath, configuration, commitMetadata)); } private static void createMetadataFile(String f, String basePath, Configuration configuration, HoodieCommitMetadata commitMetadata) { + try { + createMetadataFile(f, basePath, configuration, commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + } + + private static void createMetadataFile(String f, String basePath, Configuration configuration, byte[] content) { Path commitFile = new Path( basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + f); FSDataOutputStream os = null; @@ -348,7 +407,7 @@ private static void createMetadataFile(String f, String basePath, Configuration FileSystem fs = FSUtils.getFs(basePath, configuration); os = fs.create(commitFile, true); // Write empty commit metadata - os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + os.write(content); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } finally { @@ -362,23 +421,21 @@ private static void createMetadataFile(String f, String basePath, Configuration } } - public static void createReplaceFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { - Arrays.asList(HoodieTimeline.makeReplaceFileName(instantTime), HoodieTimeline.makeInflightReplaceFileName(instantTime), + private static void createPendingReplaceFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { + 
Arrays.asList(HoodieTimeline.makeInflightReplaceFileName(instantTime), HoodieTimeline.makeRequestedReplaceFileName(instantTime)) .forEach(f -> createMetadataFile(f, basePath, configuration, commitMetadata)); } - public static void createEmptyCleanRequestedFile(String basePath, String instantTime, Configuration configuration) - throws IOException { - Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" - + HoodieTimeline.makeRequestedCleanerFileName(instantTime)); - createEmptyFile(basePath, commitFile, configuration); + public static void createPendingReplaceFile(String basePath, String instantTime, Configuration configuration) { + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + createPendingReplaceFile(basePath, instantTime, configuration, commitMetadata); } - public static void createCompactionRequestedFile(String basePath, String instantTime, Configuration configuration) + public static void createEmptyCleanRequestedFile(String basePath, String instantTime, Configuration configuration) throws IOException { Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" - + HoodieTimeline.makeRequestedCompactionFileName(instantTime)); + + HoodieTimeline.makeRequestedCleanerFileName(instantTime)); createEmptyFile(basePath, commitFile, configuration); } @@ -452,13 +509,13 @@ public List generateInsertsContainsAllPartitions(String instantTim } public List generateInsertsForPartition(String instantTime, Integer n, String partition) { - return generateInsertsStream(instantTime, n, false, TRIP_EXAMPLE_SCHEMA, false, () -> partition, () -> UUID.randomUUID().toString()).collect(Collectors.toList()); + return generateInsertsStream(instantTime, n, false, TRIP_EXAMPLE_SCHEMA, false, () -> partition, () -> genPseudoRandomUUID(rand).toString()).collect(Collectors.toList()); } public Stream generateInsertsStream(String commitTime, Integer n, boolean isFlattened, String schemaStr, boolean containsAllPartitions) { return generateInsertsStream(commitTime, n, isFlattened, schemaStr, containsAllPartitions, - () -> partitionPaths[RAND.nextInt(partitionPaths.length)], - () -> UUID.randomUUID().toString()); + () -> partitionPaths[rand.nextInt(partitionPaths.length)], + () -> genPseudoRandomUUID(rand).toString()); } /** @@ -479,7 +536,7 @@ public Stream generateInsertsStream(String instantTime, Integer n, populateKeysBySchema(schemaStr, currSize + i, kp); incrementNumExistingKeysBySchema(schemaStr); try { - return new HoodieRecord(key, generateRandomValueAsPerSchema(schemaStr, key, instantTime, isFlattened)); + return new HoodieAvroRecord(key, generateRandomValueAsPerSchema(schemaStr, key, instantTime, isFlattened)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -510,7 +567,7 @@ public List generateSameKeyInserts(String instantTime, List copy = new ArrayList<>(); for (HoodieRecord r : origin) { HoodieKey key = r.getKey(); - HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, instantTime)); + HoodieRecord record = new HoodieAvroRecord(key, generateRandomValue(key, instantTime)); copy.add(record); } return copy; @@ -520,9 +577,9 @@ public List generateInsertsWithHoodieAvroPayload(String instantTim List inserts = new ArrayList<>(); int currSize = getNumExistingKeys(TRIP_EXAMPLE_SCHEMA); for (int i = 0; i < limit; i++) { - String partitionPath = partitionPaths[RAND.nextInt(partitionPaths.length)]; - HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); - 
HoodieRecord record = new HoodieRecord(key, generateAvroPayload(key, instantTime)); + String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)]; + HoodieKey key = new HoodieKey(genPseudoRandomUUID(rand).toString(), partitionPath); + HoodieRecord record = new HoodieAvroRecord(key, generateAvroPayload(key, instantTime)); inserts.add(record); KeyPartition kp = new KeyPartition(); @@ -537,7 +594,7 @@ public List generateInsertsWithHoodieAvroPayload(String instantTim public List generateUpdatesWithHoodieAvroPayload(String instantTime, List baseRecords) { List updates = new ArrayList<>(); for (HoodieRecord baseRecord : baseRecords) { - HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateAvroPayload(baseRecord.getKey(), instantTime)); + HoodieRecord record = new HoodieAvroRecord(baseRecord.getKey(), generateAvroPayload(baseRecord.getKey(), instantTime)); updates.add(record); } return updates; @@ -564,12 +621,12 @@ public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOE public HoodieRecord generateDeleteRecord(HoodieKey key) throws IOException { RawTripTestPayload payload = - new RawTripTestPayload(Option.empty(), key.getRecordKey(), key.getPartitionPath(), null, true); - return new HoodieRecord(key, payload); + new RawTripTestPayload(Option.empty(), key.getRecordKey(), key.getPartitionPath(), null, true, 0L); + return new HoodieAvroRecord(key, payload); } public HoodieRecord generateUpdateRecord(HoodieKey key, String instantTime) throws IOException { - return new HoodieRecord(key, generateRandomValue(key, instantTime)); + return new HoodieAvroRecord(key, generateRandomValue(key, instantTime)); } public List generateUpdates(String instantTime, List baseRecords) throws IOException { @@ -581,6 +638,16 @@ public List generateUpdates(String instantTime, List return updates; } + public List generateUpdatesWithTS(String instantTime, List baseRecords, int ts) throws IOException { + List updates = new ArrayList<>(); + for (HoodieRecord baseRecord : baseRecords) { + HoodieRecord record = new HoodieAvroRecord(baseRecord.getKey(), + generateRandomValue(baseRecord.getKey(), instantTime, false, ts)); + updates.add(record); + } + return updates; + } + public List generateUpdatesWithDiffPartition(String instantTime, List baseRecords) throws IOException { List updates = new ArrayList<>(); @@ -612,7 +679,7 @@ public List generateUpdates(String instantTime, Integer n) throws for (int i = 0; i < n; i++) { Map existingKeys = existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); - KeyPartition kp = existingKeys.get(RAND.nextInt(numExistingKeys - 1)); + KeyPartition kp = existingKeys.get(rand.nextInt(numExistingKeys - 1)); HoodieRecord record = generateUpdateRecord(kp.key, instantTime); updates.add(record); } @@ -684,7 +751,7 @@ public Stream generateUniqueUpdatesStream(String instantTime, Inte } return IntStream.range(0, n).boxed().map(i -> { - int index = numExistingKeys == 1 ? 0 : RAND.nextInt(numExistingKeys - 1); + int index = numExistingKeys == 1 ? 0 : rand.nextInt(numExistingKeys - 1); KeyPartition kp = existingKeys.get(index); // Find the available keyPartition starting from randomly chosen one. 
while (used.contains(kp)) { @@ -694,7 +761,7 @@ public Stream generateUniqueUpdatesStream(String instantTime, Inte logger.debug("key getting updated: " + kp.key.getRecordKey()); used.add(kp); try { - return new HoodieRecord(kp.key, generateRandomValueAsPerSchema(schemaStr, kp.key, instantTime, false)); + return new HoodieAvroRecord(kp.key, generateRandomValueAsPerSchema(schemaStr, kp.key, instantTime, false)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -717,7 +784,7 @@ public Stream generateUniqueDeleteStream(Integer n) { List result = new ArrayList<>(); for (int i = 0; i < n; i++) { - int index = RAND.nextInt(numExistingKeys); + int index = rand.nextInt(numExistingKeys); while (!existingKeys.containsKey(index)) { index = (index + 1) % numExistingKeys; } @@ -749,7 +816,7 @@ public Stream generateUniqueDeleteRecordStream(String instantTime, List result = new ArrayList<>(); for (int i = 0; i < n; i++) { - int index = RAND.nextInt(numExistingKeys); + int index = rand.nextInt(numExistingKeys); while (!existingKeys.containsKey(index)) { index = (index + 1) % numExistingKeys; } @@ -760,7 +827,7 @@ public Stream generateUniqueDeleteRecordStream(String instantTime, numExistingKeys--; used.add(kp); try { - result.add(new HoodieRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime))); + result.add(new HoodieAvroRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime))); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -796,12 +863,14 @@ public boolean deleteExistingKeyIfPresent(HoodieKey key) { return false; } + public GenericRecord generateGenericRecord() { + return generateGenericRecord(genPseudoRandomUUID(rand).toString(), "0", + genPseudoRandomUUID(rand).toString(), genPseudoRandomUUID(rand).toString(), rand.nextLong()); + } + public List generateGenericRecords(int numRecords) { List list = new ArrayList<>(); - IntStream.range(0, numRecords).forEach(i -> { - list.add(generateGenericRecord(UUID.randomUUID().toString(), UUID.randomUUID().toString(), UUID.randomUUID() - .toString(), RAND.nextLong())); - }); + IntStream.range(0, numRecords).forEach(i -> list.add(generateGenericRecord())); return list; } @@ -815,11 +884,39 @@ public int getNumExistingKeys(String schemaStr) { public static class KeyPartition implements Serializable { - HoodieKey key; - String partitionPath; + public HoodieKey key; + public String partitionPath; } + @Override public void close() { existingKeysBySchema.clear(); } + + private static long genRandomTimeMillis(Random r) { + // Fri Feb 13 15:31:30 PST 2009 + long anchorTs = 1234567890L; + // NOTE: To provide for certainty and not generate overly random dates, we will limit + // dispersion to be w/in +/- 3 days from the anchor date + return anchorTs + r.nextLong() % 259200000L; + } + + public static UUID genPseudoRandomUUID(Random r) { + byte[] bytes = new byte[16]; + r.nextBytes(bytes); + + bytes[6] &= 0x0f; + bytes[6] |= 0x40; + bytes[8] &= 0x3f; + bytes[8] |= 0x80; + + try { + Constructor ctor = UUID.class.getDeclaredConstructor(byte[].class); + ctor.setAccessible(true); + return ctor.newInstance((Object) bytes); + } catch (InvocationTargetException | InstantiationException | IllegalAccessException | NoSuchMethodException e) { + logger.info("Failed to generate pseudo-random UUID!"); + throw new HoodieException(e); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java 
index 3663917a54d75..c2531d47c1c8b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java @@ -19,42 +19,76 @@ package org.apache.hudi.common.testutils; +import org.apache.hudi.avro.model.HoodieActionInstant; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; +import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.avro.model.HoodieRollbackPlan; +import org.apache.hudi.avro.model.HoodieSavepointMetadata; +import org.apache.hudi.avro.model.HoodieSavepointPartitionMetadata; +import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler; import org.apache.hudi.common.util.CompactionUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Paths; import java.time.Instant; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Random; import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; import static java.time.temporal.ChronoUnit.SECONDS; -import static org.apache.hudi.common.table.timeline.HoodieActiveTimeline.COMMIT_FORMATTER; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.apache.hudi.common.model.WriteOperationType.CLUSTER; +import static org.apache.hudi.common.model.WriteOperationType.COMPACT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; +import static 
org.apache.hudi.common.table.timeline.HoodieTimeline.CLEAN_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION; import static org.apache.hudi.common.testutils.FileCreateUtils.baseFileName; import static org.apache.hudi.common.testutils.FileCreateUtils.createCleanFile; import static org.apache.hudi.common.testutils.FileCreateUtils.createCommit; @@ -64,6 +98,8 @@ import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightCompaction; import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightDeltaCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightReplaceCommit; +import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightRollbackFile; +import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightSavepoint; import static org.apache.hudi.common.testutils.FileCreateUtils.createMarkerFile; import static org.apache.hudi.common.testutils.FileCreateUtils.createReplaceCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedCleanFile; @@ -71,10 +107,29 @@ import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedCompaction; import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedDeltaCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedReplaceCommit; +import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedRollbackFile; +import static org.apache.hudi.common.testutils.FileCreateUtils.createRestoreFile; +import static org.apache.hudi.common.testutils.FileCreateUtils.createRollbackFile; +import static org.apache.hudi.common.testutils.FileCreateUtils.createSavepointCommit; +import static org.apache.hudi.common.testutils.FileCreateUtils.deleteSavepointCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.logFileName; +import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata; +import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; +import static org.apache.hudi.common.util.CommitUtils.buildMetadata; +import static org.apache.hudi.common.util.CommitUtils.getCommitActionType; +import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; public class HoodieTestTable { + public static final String PHONY_TABLE_SCHEMA = + "{\"namespace\": \"org.apache.hudi.avro.model\", \"type\": \"record\", \"name\": \"PhonyRecord\", \"fields\": []}"; + + private static final Logger LOG = LogManager.getLogger(HoodieTestTable.class); + private static final Random RANDOM = new Random(); + + protected static HoodieTestTableState testTableState; + private final List inflightCommits = new ArrayList<>(); + protected final String basePath; protected final FileSystem fs; protected HoodieTableMetaClient metaClient; @@ -86,14 +141,16 @@ protected HoodieTestTable(String basePath, FileSystem fs, HoodieTableMetaClient this.basePath = basePath; this.fs = fs; this.metaClient = metaClient; + testTableState = HoodieTestTableState.of(); } public static HoodieTestTable of(HoodieTableMetaClient metaClient) { + testTableState = HoodieTestTableState.of(); return new HoodieTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient); } - public static String makeNewCommitTime(int sequence) { - return String.format("%09d", sequence); + public static String makeNewCommitTime(int sequence, String instantFormat) { + return String.format(instantFormat, sequence); } public static String makeNewCommitTime() { @@ 
-101,56 +158,88 @@ public static String makeNewCommitTime() { } public static String makeNewCommitTime(Instant dateTime) { - return COMMIT_FORMATTER.format(Date.from(dateTime)); - } - - public static List makeIncrementalCommitTimes(int num) { - return makeIncrementalCommitTimes(num, 1); + return HoodieActiveTimeline.formatDate(Date.from(dateTime)); } - public static List makeIncrementalCommitTimes(int num, int firstOffsetSeconds) { + public static List makeIncrementalCommitTimes(int num, int firstOffsetSeconds, int deltaSecs) { final Instant now = Instant.now(); return IntStream.range(0, num) - .mapToObj(i -> makeNewCommitTime(now.plus(firstOffsetSeconds + i, SECONDS))) + .mapToObj(i -> makeNewCommitTime(now.plus(deltaSecs == 0 ? (firstOffsetSeconds + i) : (i == 0 ? (firstOffsetSeconds) : (i * deltaSecs) + i), SECONDS))) .collect(Collectors.toList()); } public HoodieTestTable addRequestedCommit(String instantTime) throws Exception { createRequestedCommit(basePath, instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); - return this; - } - - public HoodieTestTable addRequestedDeltaCommit(String instantTime) throws Exception { - createRequestedDeltaCommit(basePath, instantTime); - currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } public HoodieTestTable addInflightCommit(String instantTime) throws Exception { createRequestedCommit(basePath, instantTime); createInflightCommit(basePath, instantTime); + inflightCommits.add(instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } public HoodieTestTable addInflightDeltaCommit(String instantTime) throws Exception { createRequestedDeltaCommit(basePath, instantTime); createInflightDeltaCommit(basePath, instantTime); + inflightCommits.add(instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } public HoodieTestTable addCommit(String instantTime) throws Exception { + return addCommit(instantTime, Option.empty()); + } + + public HoodieTestTable addCommit(String instantTime, Option metadata) throws Exception { createRequestedCommit(basePath, instantTime); createInflightCommit(basePath, instantTime); - createCommit(basePath, instantTime); + createCommit(basePath, instantTime, metadata); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addSavepointCommit(String instantTime, HoodieSavepointMetadata savepointMetadata) throws IOException { + createInflightSavepoint(basePath, instantTime); + createSavepointCommit(basePath, instantTime, savepointMetadata); + return this; + } + + public HoodieCommitMetadata createCommitMetadata(WriteOperationType operationType, String commitTime, + HoodieTestTableState testTableState) { + String actionType = getCommitActionType(operationType, metaClient.getTableType()); + return createCommitMetadata(operationType, commitTime, Collections.emptyMap(), testTableState, false, actionType); + } + + public HoodieCommitMetadata createCommitMetadata(WriteOperationType operationType, String commitTime, + HoodieTestTableState testTableState, boolean bootstrap) { + String actionType = getCommitActionType(operationType, metaClient.getTableType()); + return createCommitMetadata(operationType, commitTime, Collections.emptyMap(), testTableState, bootstrap, + actionType); + } + + public HoodieCommitMetadata createCommitMetadata(WriteOperationType operationType, String commitTime, + 
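For orientation, a minimal sketch (not part of this patch) of how the revised instant-time helpers and commit-staging methods might be exercised from a test; the class name, sequence number, format string, and offsets below are illustrative values.

import java.util.List;

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieTestTable;

public class CommitTimeSketch {
  // Stages a requested commit and an inflight commit on the test timeline.
  public static void stageCommits(HoodieTableMetaClient metaClient) throws Exception {
    // Zero-padded, sequence-based instant time, e.g. "000000005".
    String requestedInstant = HoodieTestTable.makeNewCommitTime(5, "%09d");
    // Three instant times: the first 10s from now, later ones roughly 60s apart.
    List<String> instants = HoodieTestTable.makeIncrementalCommitTimes(3, 10, 60);

    HoodieTestTable testTable = HoodieTestTable.of(metaClient);
    testTable.addRequestedCommit(requestedInstant); // only <instant>.commit.requested
    testTable.addInflightCommit(instants.get(0));   // requested + inflight, tracked via inflightCommits()
  }
}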
Map> partitionToReplaceFileIds, + HoodieTestTableState testTableState, boolean bootstrap, String action) { + List writeStats = generateHoodieWriteStatForPartition(testTableState.getPartitionToBaseFileInfoMap(commitTime), commitTime, bootstrap); + if (MERGE_ON_READ.equals(metaClient.getTableType()) && UPSERT.equals(operationType)) { + writeStats.addAll(generateHoodieWriteStatForPartitionLogFiles(testTableState.getPartitionToLogFileInfoMap(commitTime), commitTime, bootstrap)); + } + Map extraMetadata = createImmutableMap("test", "test"); + return buildMetadata(writeStats, partitionToReplaceFileIds, Option.of(extraMetadata), operationType, PHONY_TABLE_SCHEMA, action); + } + + public HoodieTestTable moveInflightCommitToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + createCommit(basePath, instantTime, Option.of(metadata)); + } else { + createDeltaCommit(basePath, instantTime, metadata); + } + inflightCommits.remove(instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } @@ -159,16 +248,45 @@ public HoodieTestTable addDeltaCommit(String instantTime) throws Exception { createInflightDeltaCommit(basePath, instantTime); createDeltaCommit(basePath, instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } - public HoodieTestTable addReplaceCommit(String instantTime, HoodieReplaceCommitMetadata metadata) throws Exception { - createRequestedReplaceCommit(basePath, instantTime); - createInflightReplaceCommit(basePath, instantTime); - createReplaceCommit(basePath, instantTime, metadata); + public HoodieTestTable addDeltaCommit(String instantTime, HoodieCommitMetadata metadata) throws Exception { + createRequestedDeltaCommit(basePath, instantTime); + createInflightDeltaCommit(basePath, instantTime); + createDeltaCommit(basePath, instantTime, metadata); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addReplaceCommit( + String instantTime, + Option requestedReplaceMetadata, + Option inflightReplaceMetadata, + HoodieReplaceCommitMetadata completeReplaceMetadata) throws Exception { + createRequestedReplaceCommit(basePath, instantTime, requestedReplaceMetadata); + createInflightReplaceCommit(basePath, instantTime, inflightReplaceMetadata); + createReplaceCommit(basePath, instantTime, completeReplaceMetadata); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addPendingReplace(String instantTime, Option requestedReplaceMetadata, Option inflightReplaceMetadata) throws Exception { + createRequestedReplaceCommit(basePath, instantTime, requestedReplaceMetadata); + createInflightReplaceCommit(basePath, instantTime, inflightReplaceMetadata); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addRequestedReplace(String instantTime, Option requestedReplaceMetadata) throws Exception { + createRequestedReplaceCommit(basePath, instantTime, requestedReplaceMetadata); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addInflightReplace(String instantTime, Option inflightReplaceMetadata) throws Exception { + createInflightReplaceCommit(basePath, instantTime, inflightReplaceMetadata); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } @@ -176,23 +294,139 @@ public HoodieTestTable addInflightClean(String instantTime, 
HoodieCleanerPlan cl createRequestedCleanFile(basePath, instantTime, cleanerPlan); createInflightCleanFile(basePath, instantTime, cleanerPlan); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } public HoodieTestTable addClean(String instantTime, HoodieCleanerPlan cleanerPlan, HoodieCleanMetadata metadata) throws IOException { - createRequestedCleanFile(basePath, instantTime, cleanerPlan); - createInflightCleanFile(basePath, instantTime, cleanerPlan); - createCleanFile(basePath, instantTime, metadata); + return addClean(instantTime, cleanerPlan, metadata, false, false); + } + + public HoodieTestTable addClean(String instantTime, HoodieCleanerPlan cleanerPlan, HoodieCleanMetadata metadata, boolean isEmptyForAll, boolean isEmptyCompleted) throws IOException { + createRequestedCleanFile(basePath, instantTime, cleanerPlan, isEmptyForAll); + createInflightCleanFile(basePath, instantTime, cleanerPlan, isEmptyForAll); + createCleanFile(basePath, instantTime, metadata, isEmptyCompleted); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } + public HoodieTestTable addClean(String instantTime) throws IOException { + HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant(EMPTY_STRING, EMPTY_STRING, EMPTY_STRING), + EMPTY_STRING, EMPTY_STRING, new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>(), new ArrayList<>()); + HoodieCleanStat cleanStats = new HoodieCleanStat( + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, + HoodieTestUtils.DEFAULT_PARTITION_PATHS[RANDOM.nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)], + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList(), + instantTime, + ""); + HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats)); + return HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata); + } + + public Pair getHoodieCleanMetadata(String commitTime, HoodieTestTableState testTableState) { + HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant(commitTime, CLEAN_ACTION, EMPTY_STRING), + EMPTY_STRING, EMPTY_STRING, new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>(), new ArrayList<>()); + List cleanStats = new ArrayList<>(); + for (Map.Entry> entry : testTableState.getPartitionToFileIdMapForCleaner(commitTime).entrySet()) { + cleanStats.add(new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, + entry.getKey(), entry.getValue(), entry.getValue(), Collections.emptyList(), commitTime, "")); + } + return Pair.of(cleanerPlan, convertCleanMetadata(commitTime, Option.of(0L), cleanStats)); + } + + public HoodieTestTable addRequestedRollback(String instantTime, HoodieRollbackPlan plan) throws IOException { + createRequestedRollbackFile(basePath, instantTime, plan); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addInflightRollback(String instantTime) throws IOException { + createInflightRollbackFile(basePath, instantTime); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addRollback(String instantTime, HoodieRollbackMetadata rollbackMetadata) throws IOException { + return addRollback(instantTime, rollbackMetadata, false); + } + + public HoodieTestTable addRollback(String instantTime, HoodieRollbackMetadata rollbackMetadata, boolean isEmpty) throws IOException { + 
createRequestedRollbackFile(basePath, instantTime); + createInflightRollbackFile(basePath, instantTime); + createRollbackFile(basePath, instantTime, rollbackMetadata, isEmpty); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addRestore(String instantTime, HoodieRestoreMetadata restoreMetadata) throws IOException { + createRestoreFile(basePath, instantTime, restoreMetadata); + currentInstantTime = instantTime; + return this; + } + + public HoodieRollbackMetadata getRollbackMetadata(String instantTimeToDelete, Map> partitionToFilesMeta) throws Exception { + HoodieRollbackMetadata rollbackMetadata = new HoodieRollbackMetadata(); + rollbackMetadata.setCommitsRollback(Collections.singletonList(instantTimeToDelete)); + rollbackMetadata.setStartRollbackTime(instantTimeToDelete); + Map partitionMetadataMap = new HashMap<>(); + for (Map.Entry> entry : partitionToFilesMeta.entrySet()) { + HoodieRollbackPartitionMetadata rollbackPartitionMetadata = new HoodieRollbackPartitionMetadata(); + rollbackPartitionMetadata.setPartitionPath(entry.getKey()); + rollbackPartitionMetadata.setSuccessDeleteFiles(entry.getValue()); + rollbackPartitionMetadata.setFailedDeleteFiles(new ArrayList<>()); + long rollbackLogFileSize = 50 + RANDOM.nextInt(500); + String fileId = UUID.randomUUID().toString(); + String logFileName = logFileName(instantTimeToDelete, fileId, 0); + FileCreateUtils.createLogFile(basePath, entry.getKey(), instantTimeToDelete, fileId, 0, (int) rollbackLogFileSize); + rollbackPartitionMetadata.setRollbackLogFiles(createImmutableMap(logFileName, rollbackLogFileSize)); + partitionMetadataMap.put(entry.getKey(), rollbackPartitionMetadata); + } + rollbackMetadata.setPartitionMetadata(partitionMetadataMap); + rollbackMetadata.setInstantsRollback(Collections.singletonList(new HoodieInstantInfo(instantTimeToDelete, HoodieTimeline.ROLLBACK_ACTION))); + return rollbackMetadata; + } + + /** + * Return a map of log file name to file size that were expected to be rolled back in that partition. 
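A sketch (again, not part of the patch) of wiring a rollback onto the test timeline with the new helpers; the partition path, file name, instant times, and class name are made up for illustration.

import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieTestTable;

public class RollbackSketch {
  public static void rollbackInstant(HoodieTableMetaClient metaClient) throws Exception {
    HoodieTestTable testTable = HoodieTestTable.of(metaClient);
    // Files per partition that the rollback of instant "001" is expected to delete.
    Map<String, List<String>> partitionToFiles = Collections.singletonMap(
        "2016/03/15", Collections.singletonList("some-base-file.parquet"));
    HoodieRollbackMetadata rollbackMetadata = testTable.getRollbackMetadata("001", partitionToFiles);
    // Writes <instant>.rollback.requested, .rollback.inflight and the completed .rollback file.
    testTable.addRollback("002", rollbackMetadata);
  }
}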
+ */ + private Map getWrittenLogFiles(String instant, Map.Entry> entry) { + Map writtenLogFiles = new HashMap<>(); + for (String fileName : entry.getValue()) { + if (FSUtils.isLogFile(new Path(fileName))) { + if (testTableState.getPartitionToLogFileInfoMap(instant) != null + && testTableState.getPartitionToLogFileInfoMap(instant).containsKey(entry.getKey())) { + List> fileInfos = testTableState.getPartitionToLogFileInfoMap(instant).get(entry.getKey()); + for (Pair fileInfo : fileInfos) { + if (fileName.equals(logFileName(instant, fileInfo.getLeft(), fileInfo.getRight()[0]))) { + writtenLogFiles.put(fileName, Long.valueOf(fileInfo.getRight()[1])); + } + } + } + } + } + return writtenLogFiles; + } + + public HoodieSavepointMetadata getSavepointMetadata(String instant, Map> partitionToFilesMeta) { + HoodieSavepointMetadata savepointMetadata = new HoodieSavepointMetadata(); + savepointMetadata.setSavepointedAt(12345L); + Map partitionMetadataMap = new HashMap<>(); + for (Map.Entry> entry : partitionToFilesMeta.entrySet()) { + HoodieSavepointPartitionMetadata savepointPartitionMetadata = new HoodieSavepointPartitionMetadata(); + savepointPartitionMetadata.setPartitionPath(entry.getKey()); + savepointPartitionMetadata.setSavepointDataFile(entry.getValue()); + partitionMetadataMap.put(entry.getKey(), savepointPartitionMetadata); + } + savepointMetadata.setPartitionMetadata(partitionMetadataMap); + savepointMetadata.setSavepointedBy("test"); + savepointMetadata.setComments("test_comment"); + return savepointMetadata; + } + public HoodieTestTable addRequestedCompaction(String instantTime) throws IOException { createRequestedCompaction(basePath, instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); return this; } @@ -210,11 +444,41 @@ public HoodieTestTable addRequestedCompaction(String instantTime, FileSlice... 
f return addRequestedCompaction(instantTime, plan); } - public HoodieTestTable addCompaction(String instantTime) throws IOException { + public HoodieTestTable addInflightCompaction(String instantTime, HoodieCommitMetadata commitMetadata) throws Exception { + List fileSlices = new ArrayList<>(); + for (Map.Entry> entry : commitMetadata.getPartitionToWriteStats().entrySet()) { + for (HoodieWriteStat stat: entry.getValue()) { + fileSlices.add(new FileSlice(entry.getKey(), instantTime, stat.getPath())); + } + } + this.addRequestedCompaction(instantTime, fileSlices.toArray(new FileSlice[0])); + createInflightCompaction(basePath, instantTime); + inflightCommits.add(instantTime); + currentInstantTime = instantTime; + return this; + } + + public HoodieTestTable addCompaction(String instantTime, HoodieCommitMetadata commitMetadata) throws Exception { createRequestedCompaction(basePath, instantTime); createInflightCompaction(basePath, instantTime); + return addCommit(instantTime, Option.of(commitMetadata)); + } + + public HoodieTestTable moveInflightCompactionToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException { + createCommit(basePath, instantTime, Option.of(metadata)); + inflightCommits.remove(instantTime); currentInstantTime = instantTime; - metaClient = HoodieTableMetaClient.reload(metaClient); + return this; + } + + public HoodieTestTable addSavepoint(String instantTime, HoodieSavepointMetadata savepointMetadata) throws IOException { + createInflightSavepoint(basePath, instantTime); + createSavepointCommit(basePath, instantTime, savepointMetadata); + return this; + } + + public HoodieTestTable deleteSavepoint(String instantTime) throws IOException { + deleteSavepointCommit(basePath, instantTime, fs); return this; } @@ -228,7 +492,7 @@ public HoodieTestTable forDeltaCommit(String instantTime) { return this; } - public HoodieTestTable forCompaction(String instantTime) { + public HoodieTestTable forReplaceCommit(String instantTime) { currentInstantTime = instantTime; return this; } @@ -240,10 +504,6 @@ public HoodieTestTable withPartitionMetaFiles(String... partitionPaths) throws I return this; } - public HoodieTestTable withMarkerFile(String partitionPath, IOType ioType) throws IOException { - return withMarkerFile(partitionPath, UUID.randomUUID().toString(), ioType); - } - public HoodieTestTable withMarkerFile(String partitionPath, String fileId, IOType ioType) throws IOException { createMarkerFile(basePath, partitionPath, currentInstantTime, fileId, ioType); return this; @@ -298,6 +558,13 @@ public HoodieTestTable withBaseFilesInPartition(String partition, int... lengths return this; } + public HoodieTestTable withBaseFilesInPartition(String partition, List> fileInfos) throws Exception { + for (Pair fileInfo : fileInfos) { + FileCreateUtils.createBaseFile(basePath, partition, currentInstantTime, fileInfo.getKey(), fileInfo.getValue()); + } + return this; + } + public String getFileIdWithLogFile(String partitionPath) throws Exception { String fileId = UUID.randomUUID().toString(); withLogFile(partitionPath, fileId); @@ -315,8 +582,11 @@ public HoodieTestTable withLogFile(String partitionPath, String fileId, int... v return this; } - public boolean inflightCommitsExist(String... 
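A sketch of seeding data files for an instant through the fluent with* helpers; the partition path, file lengths, and class name are illustrative.

import java.util.UUID;

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieTestTable;

public class FileLayoutSketch {
  public static void seedFiles(HoodieTableMetaClient metaClient) throws Exception {
    HoodieTestTable.of(metaClient)
        .addCommit("001")                                            // requested + inflight + completed commit
        .withPartitionMetaFiles("2016/03/15")                        // .hoodie_partition_metadata
        .withBaseFilesInPartition("2016/03/15", 10, 20)              // two base files of the given lengths
        .withLogFile("2016/03/15", UUID.randomUUID().toString(), 1); // one log file at version 1
  }
}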
instantTime) { - return Arrays.stream(instantTime).allMatch(this::inflightCommitExists); + public HoodieTestTable withLogFilesInPartition(String partition, List> fileInfos) throws Exception { + for (Pair fileInfo : fileInfos) { + FileCreateUtils.createLogFile(basePath, partition, currentInstantTime, fileInfo.getKey(), fileInfo.getValue()[0], fileInfo.getValue()[1]); + } + return this; } public boolean inflightCommitExists(String instantTime) { @@ -327,10 +597,6 @@ public boolean inflightCommitExists(String instantTime) { } } - public boolean commitsExist(String... instantTime) { - return Arrays.stream(instantTime).allMatch(this::commitExists); - } - public boolean commitExists(String instantTime) { try { return fs.exists(getCommitFilePath(instantTime)); @@ -347,10 +613,6 @@ public boolean baseFilesExist(Map partitionAndFileId, String ins }); } - public boolean baseFilesExist(String partition, String instantTime, String... fileIds) { - return Arrays.stream(fileIds).allMatch(f -> baseFileExists(partition, instantTime, f)); - } - public boolean baseFileExists(String partition, String instantTime, String fileId) { try { return fs.exists(new Path(Paths.get(basePath, partition, baseFileName(instantTime, fileId)).toString())); @@ -387,6 +649,11 @@ public Path getPartitionPath(String partition) { return new Path(Paths.get(basePath, partition).toUri()); } + public List getAllPartitionPaths() throws IOException { + java.nio.file.Path basePathPath = Paths.get(basePath); + return FileCreateUtils.getPartitionPaths(basePathPath); + } + public Path getBaseFilePath(String partition, String fileId) { return new Path(Paths.get(basePath, partition, getBaseFileNameById(fileId)).toUri()); } @@ -395,8 +662,26 @@ public String getBaseFileNameById(String fileId) { return baseFileName(currentInstantTime, fileId); } + public Path getLogFilePath(String partition, String fileId, int version) { + return new Path(Paths.get(basePath, partition, getLogFileNameById(fileId, version)).toString()); + } + + public String getLogFileNameById(String fileId, int version) { + return logFileName(currentInstantTime, fileId, version); + } + + public List getEarliestFilesInPartition(String partition, int count) throws IOException { + List fileStatuses = Arrays.asList(listAllFilesInPartition(partition)); + fileStatuses.sort(Comparator.comparing(FileStatus::getModificationTime)); + return fileStatuses.subList(0, count).stream().map(entry -> entry.getPath().getName()).collect(Collectors.toList()); + } + + public List inflightCommits() { + return this.inflightCommits; + } + public FileStatus[] listAllBaseFiles() throws IOException { - return listAllBaseFiles(HoodieFileFormat.PARQUET.getFileExtension()); + return listAllBaseFiles(HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()); } public FileStatus[] listAllBaseFiles(String fileExtension) throws IOException { @@ -420,16 +705,526 @@ public FileStatus[] listAllBaseAndLogFiles() throws IOException { } public FileStatus[] listAllFilesInPartition(String partitionPath) throws IOException { - return FileSystemTestUtils.listRecursive(fs, new Path(Paths.get(basePath, partitionPath).toString())).toArray(new FileStatus[0]); + return FileSystemTestUtils.listRecursive(fs, new Path(Paths.get(basePath, partitionPath).toString())).stream() + .filter(entry -> { + boolean toReturn = true; + String filePath = entry.getPath().toString(); + String fileName = entry.getPath().getName(); + if (fileName.startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX) + || 
!FileCreateUtils.isBaseOrLogFilename(fileName) + || filePath.contains("metadata")) { + toReturn = false; + } else { + for (String inflight : inflightCommits) { + if (fileName.contains(inflight)) { + toReturn = false; + break; + } + } + } + return toReturn; + }).toArray(FileStatus[]::new); } public FileStatus[] listAllFilesInTempFolder() throws IOException { return FileSystemTestUtils.listRecursive(fs, new Path(Paths.get(basePath, HoodieTableMetaClient.TEMPFOLDER_NAME).toString())).toArray(new FileStatus[0]); } + public void deleteFilesInPartition(String partitionPath, List filesToDelete) throws IOException { + FileStatus[] allFiles = listAllFilesInPartition(partitionPath); + Arrays.stream(allFiles).filter(entry -> filesToDelete.contains(entry.getPath().getName())).forEach(entry -> { + try { + Files.delete(Paths.get(basePath, partitionPath, entry.getPath().getName())); + } catch (IOException e) { + throw new HoodieTestTableException(e); + } + }); + } + + public HoodieTestTable doRollback(String commitTimeToRollback, String commitTime) throws Exception { + metaClient = HoodieTableMetaClient.reload(metaClient); + Option commitMetadata = getMetadataForInstant(commitTimeToRollback); + if (!commitMetadata.isPresent()) { + throw new IllegalArgumentException("Instant to rollback not present in timeline: " + commitTimeToRollback); + } + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + HoodieRollbackMetadata rollbackMetadata = getRollbackMetadata(commitTimeToRollback, partitionFiles); + for (Map.Entry> entry : partitionFiles.entrySet()) { + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + return addRollback(commitTime, rollbackMetadata); + } + + public HoodieTestTable doRollbackWithExtraFiles(String commitTimeToRollback, String commitTime, Map> extraFiles) throws Exception { + metaClient = HoodieTableMetaClient.reload(metaClient); + Option commitMetadata = getMetadataForInstant(commitTimeToRollback); + if (!commitMetadata.isPresent()) { + throw new IllegalArgumentException("Instant to rollback not present in timeline: " + commitTimeToRollback); + } + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + for (Map.Entry> entry : partitionFiles.entrySet()) { + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + for (Map.Entry> entry: extraFiles.entrySet()) { + if (partitionFiles.containsKey(entry.getKey())) { + partitionFiles.get(entry.getKey()).addAll(entry.getValue()); + } + } + HoodieRollbackMetadata rollbackMetadata = getRollbackMetadata(commitTimeToRollback, partitionFiles); + return addRollback(commitTime, rollbackMetadata); + } + + public HoodieTestTable doRestore(String commitToRestoreTo, String restoreTime) throws Exception { + metaClient = HoodieTableMetaClient.reload(metaClient); + List commitsToRollback = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().findInstantsAfter(commitToRestoreTo).getReverseOrderedInstants().collect(Collectors.toList()); + Map> rollbackMetadataMap = new HashMap<>(); + for (HoodieInstant commitInstantToRollback: commitsToRollback) { + Option commitMetadata = getCommitMeta(commitInstantToRollback); + if (!commitMetadata.isPresent()) { + throw new IllegalArgumentException("Instant to rollback not present in timeline: " + commitInstantToRollback.getTimestamp()); + } + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + rollbackMetadataMap.put(commitInstantToRollback.getTimestamp(), + Collections.singletonList(getRollbackMetadata(commitInstantToRollback.getTimestamp(), 
partitionFiles))); + for (Map.Entry> entry : partitionFiles.entrySet()) { + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + } + + HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.convertRestoreMetadata(restoreTime,1000L, + commitsToRollback, rollbackMetadataMap); + return addRestore(restoreTime, restoreMetadata); + } + + public HoodieReplaceCommitMetadata doCluster(String commitTime, Map> partitionToReplaceFileIds, List partitions, int filesPerPartition) throws Exception { + HoodieTestTableState testTableState = getTestTableStateWithPartitionFileInfo(CLUSTER, metaClient.getTableType(), commitTime, partitions, filesPerPartition); + this.currentInstantTime = commitTime; + Map>> partitionToReplaceFileIdsWithLength = new HashMap<>(); + for (Map.Entry> entry : partitionToReplaceFileIds.entrySet()) { + String partition = entry.getKey(); + partitionToReplaceFileIdsWithLength.put(entry.getKey(), new ArrayList<>()); + for (String fileId : entry.getValue()) { + int length = 100 + RANDOM.nextInt(500); + partitionToReplaceFileIdsWithLength.get(partition).add(Pair.of(fileId, length)); + } + } + List writeStats = generateHoodieWriteStatForPartition(testTableState.getPartitionToBaseFileInfoMap(commitTime), commitTime, false); + for (String partition : testTableState.getPartitionToBaseFileInfoMap(commitTime).keySet()) { + this.withBaseFilesInPartition(partition, testTableState.getPartitionToBaseFileInfoMap(commitTime).get(partition)); + } + HoodieReplaceCommitMetadata replaceMetadata = + (HoodieReplaceCommitMetadata) buildMetadata(writeStats, partitionToReplaceFileIds, Option.empty(), CLUSTER, PHONY_TABLE_SCHEMA, + REPLACE_COMMIT_ACTION); + addReplaceCommit(commitTime, Option.empty(), Option.empty(), replaceMetadata); + return replaceMetadata; + } + + public HoodieCleanMetadata doClean(String commitTime, Map partitionFileCountsToDelete) throws IOException { + Map> partitionFilesToDelete = new HashMap<>(); + for (Map.Entry entry : partitionFileCountsToDelete.entrySet()) { + partitionFilesToDelete.put(entry.getKey(), getEarliestFilesInPartition(entry.getKey(), entry.getValue())); + } + HoodieTestTableState testTableState = new HoodieTestTableState(); + for (Map.Entry> entry : partitionFilesToDelete.entrySet()) { + testTableState = testTableState.createTestTableStateForCleaner(commitTime, entry.getKey(), entry.getValue()); + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + Pair cleanerMeta = getHoodieCleanMetadata(commitTime, testTableState); + addClean(commitTime, cleanerMeta.getKey(), cleanerMeta.getValue()); + return cleanerMeta.getValue(); + } + + public HoodieCleanMetadata doCleanBasedOnCommits(String cleanCommitTime, List commitsToClean) throws IOException { + Map partitionFileCountsToDelete = new HashMap<>(); + for (String commitTime : commitsToClean) { + Option commitMetadata = getMetadataForInstant(commitTime); + if (commitMetadata.isPresent()) { + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + for (String partition : partitionFiles.keySet()) { + partitionFileCountsToDelete.put(partition, partitionFiles.get(partition).size() + partitionFileCountsToDelete.getOrDefault(partition, 0)); + } + } + } + return doClean(cleanCommitTime, partitionFileCountsToDelete); + } + + public HoodieSavepointMetadata doSavepoint(String commitTime) throws IOException { + Option commitMetadata = getMetadataForInstant(commitTime); + if (!commitMetadata.isPresent()) { + throw new IllegalArgumentException("Instant to rollback not present in timeline: " + 
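A sketch of the higher-level do* helpers for clustering and cleaning; the replaced file ids, partition, counts, instant times, and class name are illustrative.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieTestTable;

public class ClusterAndCleanSketch {
  public static void clusterThenClean(HoodieTableMetaClient metaClient) throws Exception {
    HoodieTestTable testTable = HoodieTestTable.of(metaClient);
    // Replace two (pre-existing) file groups in one partition with two new base files.
    Map<String, List<String>> replacedFileIds = Collections.singletonMap("2016/03/15",
        Arrays.asList(UUID.randomUUID().toString(), UUID.randomUUID().toString()));
    HoodieReplaceCommitMetadata replaceMetadata =
        testTable.doCluster("005", replacedFileIds, Collections.singletonList("2016/03/15"), 2);
    // Clean the single earliest file in the same partition as instant "006".
    HoodieCleanMetadata cleanMetadata =
        testTable.doClean("006", Collections.singletonMap("2016/03/15", 1));
  }
}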
commitTime); + } + Map> partitionFiles = getPartitionFiles(commitMetadata.get()); + HoodieSavepointMetadata savepointMetadata = getSavepointMetadata(commitTime, partitionFiles); + for (Map.Entry> entry : partitionFiles.entrySet()) { + deleteFilesInPartition(entry.getKey(), entry.getValue()); + } + return savepointMetadata; + } + + public HoodieCommitMetadata doCompaction(String commitTime, List partitions) throws Exception { + return doCompaction(commitTime, partitions, false); + } + + public HoodieCommitMetadata doCompaction(String commitTime, List partitions, boolean inflight) throws Exception { + this.currentInstantTime = commitTime; + if (partitions.isEmpty()) { + partitions = Collections.singletonList(EMPTY_STRING); + } + HoodieTestTableState testTableState = getTestTableStateWithPartitionFileInfo(COMPACT, metaClient.getTableType(), commitTime, partitions, 1); + HoodieCommitMetadata commitMetadata = createCommitMetadata(COMPACT, commitTime, testTableState); + for (String partition : partitions) { + this.withBaseFilesInPartition(partition, testTableState.getPartitionToBaseFileInfoMap(commitTime).get(partition)); + } + if (inflight) { + this.addInflightCompaction(commitTime, commitMetadata); + } else { + this.addCompaction(commitTime, commitMetadata); + } + return commitMetadata; + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List partitions, int filesPerPartition) throws Exception { + return doWriteOperation(commitTime, operationType, Collections.emptyList(), partitions, filesPerPartition, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, List partitions, + int filesPerPartition) throws Exception { + return doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitions, filesPerPartition, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, List partitions, + int filesPerPartition, boolean bootstrap) throws Exception { + return doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitions, filesPerPartition, bootstrap, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List partitions, int filesPerPartition, boolean bootstrap) throws Exception { + return doWriteOperation(commitTime, operationType, Collections.emptyList(), partitions, filesPerPartition, + bootstrap, false); + } + + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, List partitions, + int filesPerPartition, boolean bootstrap, + boolean createInflightCommit) throws Exception { + if (partitions.isEmpty()) { + partitions = Collections.singletonList(EMPTY_STRING); + } + + Map>> partitionToFilesNameLengthMap = getPartitionFiles(partitions, + filesPerPartition); + return doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitionToFilesNameLengthMap, bootstrap, + createInflightCommit); + } + + /** + * Add commits to the requested partitions. 
+ * + * @param commitTime - Commit time for the operation + * @param operationType - Operation type + * @param newPartitionsToAdd - New partitions to add for the operation + * @param partitionToFilesNameLengthMap - Map of partition names to its list of files name and length pair + * @param bootstrap - Whether bootstrapping needed for the operation + * @param createInflightCommit - Whether in flight commit needed for the operation + * @return Commit metadata for the commit operation performed. + * @throws Exception + */ + public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType, + List newPartitionsToAdd, + Map>> partitionToFilesNameLengthMap, + boolean bootstrap, boolean createInflightCommit) throws Exception { + if (partitionToFilesNameLengthMap.isEmpty()) { + partitionToFilesNameLengthMap = Collections.singletonMap(EMPTY_STRING, Collections.EMPTY_LIST); + } + HoodieTestTableState testTableState = getTestTableStateWithPartitionFileInfo(operationType, + metaClient.getTableType(), commitTime, partitionToFilesNameLengthMap); + HoodieCommitMetadata commitMetadata = createCommitMetadata(operationType, commitTime, testTableState, bootstrap); + for (String str : newPartitionsToAdd) { + this.withPartitionMetaFiles(str); + } + if (createInflightCommit) { + if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + this.addInflightCommit(commitTime); + } else { + this.addInflightDeltaCommit(commitTime); + } + } else { + if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) { + this.addCommit(commitTime, Option.of(commitMetadata)); + } else { + this.addDeltaCommit(commitTime, commitMetadata); + } + } + for (Map.Entry>> entry : partitionToFilesNameLengthMap.entrySet()) { + String partition = entry.getKey(); + this.withBaseFilesInPartition(partition, testTableState.getPartitionToBaseFileInfoMap(commitTime).get(partition)); + if (MERGE_ON_READ.equals(metaClient.getTableType()) && UPSERT.equals(operationType)) { + this.withLogFilesInPartition(partition, testTableState.getPartitionToLogFileInfoMap(commitTime).get(partition)); + } + } + return commitMetadata; + } + + private Option getMetadataForInstant(String instantTime) { + metaClient = HoodieTableMetaClient.reload(metaClient); + Option hoodieInstant = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants().filter(i -> i.getTimestamp().equals(instantTime)).firstInstant(); + try { + if (hoodieInstant.isPresent()) { + return getCommitMeta(hoodieInstant.get()); + } else { + return Option.empty(); + } + } catch (IOException io) { + throw new HoodieIOException("Unable to read metadata for instant " + hoodieInstant.get(), io); + } + } + + private Option getCommitMeta(HoodieInstant hoodieInstant) throws IOException { + switch (hoodieInstant.getAction()) { + case HoodieTimeline.REPLACE_COMMIT_ACTION: + HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieReplaceCommitMetadata.class); + return Option.of(replaceCommitMetadata); + case HoodieTimeline.DELTA_COMMIT_ACTION: + case HoodieTimeline.COMMIT_ACTION: + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(metaClient.getActiveTimeline().getInstantDetails(hoodieInstant).get(), HoodieCommitMetadata.class); + return Option.of(commitMetadata); + default: + throw new IllegalArgumentException("Unknown instant action" + hoodieInstant.getAction()); + } + } + + private static Map> 
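A sketch of the simplest doWriteOperation overload followed by a rollback; the operation type chosen here (UPSERT), the partition names, file counts, instant times, and class name are illustrative.

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieTestTable;

public class WriteAndRollbackSketch {
  public static void writeThenRollback(HoodieTableMetaClient metaClient) throws Exception {
    HoodieTestTable testTable = HoodieTestTable.of(metaClient);
    List<String> partitions = Arrays.asList("2016/03/15", "2016/03/16");
    // Write two base files (plus log files for MOR upserts) into each partition as "001".
    HoodieCommitMetadata commitMetadata =
        testTable.doWriteOperation("001", WriteOperationType.UPSERT, partitions, 2);
    // Delete those files again and record the rollback as instant "002".
    testTable.doRollback("001", "002");
  }
}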
getPartitionFiles(HoodieCommitMetadata commitMetadata) { + Map> partitionFilesToDelete = new HashMap<>(); + Map> partitionToWriteStats = commitMetadata.getPartitionToWriteStats(); + for (Map.Entry> entry : partitionToWriteStats.entrySet()) { + partitionFilesToDelete.put(entry.getKey(), new ArrayList<>()); + entry.getValue().forEach(writeStat -> partitionFilesToDelete.get(entry.getKey()).add(writeStat.getFileId())); + } + return partitionFilesToDelete; + } + + /** + * Generate partition files names and length details. + * + * @param partitions - List of partition for which file details need to be generated + * @param filesPerPartition - File count per partition + * @return Map of partition to its collection of files name and length pair + */ + protected static Map>> getPartitionFiles(List partitions, + int filesPerPartition) { + Map>> partitionToFilesNameLengthMap = new HashMap<>(); + for (String partition : partitions) { + Stream fileLengths = IntStream.range(0, filesPerPartition).map(i -> 100 + RANDOM.nextInt(500)).boxed(); + List> fileNameAndLengthList = + fileLengths.map(len -> Pair.of(UUID.randomUUID().toString(), len)).collect(Collectors.toList()); + partitionToFilesNameLengthMap.put(partition, fileNameAndLengthList); + } + return partitionToFilesNameLengthMap; + } + + /** + * Get Test table state for the requested partitions and file count. + * + * @param operationType - Table write operation type + * @param tableType - Hudi table type + * @param commitTime - Write commit time + * @param partitions - List of partition names + * @param filesPerPartition - Total file count per partition + * @return Test table state for the requested partitions and file count + */ + private static HoodieTestTableState getTestTableStateWithPartitionFileInfo(WriteOperationType operationType, + HoodieTableType tableType, + String commitTime, + List partitions, + int filesPerPartition) { + Map>> partitionToFilesNameLengthMap = getPartitionFiles(partitions, + filesPerPartition); + return getTestTableStateWithPartitionFileInfo(operationType, tableType, commitTime, partitionToFilesNameLengthMap); + } + + /** + * Get Test table state for the requested partitions and files. 
+ * + * @param operationType - Table write operation type + * @param tableType - Hudi table type + * @param commitTime - Write commit time + * @param partitionToFilesNameLengthMap - Map of partition names to its list of files and their lengths + * @return Test table state for the requested partitions and files + */ + private static HoodieTestTableState getTestTableStateWithPartitionFileInfo(WriteOperationType operationType, + HoodieTableType tableType, + String commitTime, + Map>> partitionToFilesNameLengthMap) { + for (Map.Entry>> partitionEntry : partitionToFilesNameLengthMap.entrySet()) { + String partitionName = partitionEntry.getKey(); + List> fileNameAndLengthList = partitionEntry.getValue(); + if (MERGE_ON_READ.equals(tableType) && UPSERT.equals(operationType)) { + List> fileVersionAndLength = + fileNameAndLengthList.stream().map(nameLengthPair -> Pair.of(0, nameLengthPair.getRight())).collect(Collectors.toList()); + testTableState = testTableState.createTestTableStateForBaseAndLogFiles(commitTime, partitionName, + fileVersionAndLength); + } else { + testTableState = testTableState.createTestTableStateForBaseFilesOnly(commitTime, partitionName, + fileNameAndLengthList); + } + } + return testTableState; + } + + public static List generateHoodieWriteStatForPartition(Map>> partitionToFileIdMap, + String commitTime, boolean bootstrap) { + List writeStats = new ArrayList<>(); + for (Map.Entry>> entry : partitionToFileIdMap.entrySet()) { + String partition = entry.getKey(); + for (Pair fileIdInfo : entry.getValue()) { + HoodieWriteStat writeStat = new HoodieWriteStat(); + String fileName = bootstrap ? fileIdInfo.getKey() : + FileCreateUtils.baseFileName(commitTime, fileIdInfo.getKey()); + writeStat.setFileId(fileName); + writeStat.setPartitionPath(partition); + writeStat.setPath(StringUtils.isNullOrEmpty(partition) ? fileName : partition + "/" + fileName); + writeStat.setTotalWriteBytes(fileIdInfo.getValue()); + writeStat.setFileSizeInBytes(fileIdInfo.getValue()); + writeStats.add(writeStat); + } + } + return writeStats; + } + + /** + * Returns the write stats for log files in the partition. Since log file has version associated with it, the {@param partitionToFileIdMap} + * contains list of Pair where the Integer[] array has both file version and file size. + */ + private static List generateHoodieWriteStatForPartitionLogFiles(Map>> partitionToFileIdMap, String commitTime, boolean bootstrap) { + List writeStats = new ArrayList<>(); + if (partitionToFileIdMap == null) { + return writeStats; + } + for (Map.Entry>> entry : partitionToFileIdMap.entrySet()) { + String partition = entry.getKey(); + for (Pair fileIdInfo : entry.getValue()) { + HoodieWriteStat writeStat = new HoodieWriteStat(); + String fileName = bootstrap ? fileIdInfo.getKey() : + FileCreateUtils.logFileName(commitTime, fileIdInfo.getKey(), fileIdInfo.getValue()[0]); + writeStat.setFileId(fileName); + writeStat.setPartitionPath(partition); + writeStat.setPath(StringUtils.isNullOrEmpty(partition) ? fileName : partition + "/" + fileName); + writeStat.setTotalWriteBytes(fileIdInfo.getValue()[1]); + writeStat.setFileSizeInBytes(fileIdInfo.getValue()[1]); + writeStats.add(writeStat); + } + } + return writeStats; + } + public static class HoodieTestTableException extends RuntimeException { public HoodieTestTableException(Throwable t) { super(t); } } + + static class HoodieTestTableState { + /** + * Map>> + * Used in building CLEAN metadata. 
+ */ + Map>> commitsToPartitionToFileIdForCleaner = new HashMap<>(); + /** + * Map>>> + * Used to build commit metadata for base files for several write operations. + */ + Map>>> commitsToPartitionToBaseFileInfoStats = new HashMap<>(); + /** + * Map>>> + * Used to build commit metadata for log files for several write operations. + */ + Map>>> commitsToPartitionToLogFileInfoStats = new HashMap<>(); + + HoodieTestTableState() { + } + + static HoodieTestTableState of() { + return new HoodieTestTableState(); + } + + HoodieTestTableState createTestTableStateForCleaner(String commitTime, String partitionPath, List filesToClean) { + if (!commitsToPartitionToFileIdForCleaner.containsKey(commitTime)) { + commitsToPartitionToFileIdForCleaner.put(commitTime, new HashMap<>()); + } + if (!this.commitsToPartitionToFileIdForCleaner.get(commitTime).containsKey(partitionPath)) { + this.commitsToPartitionToFileIdForCleaner.get(commitTime).put(partitionPath, new ArrayList<>()); + } + + this.commitsToPartitionToFileIdForCleaner.get(commitTime).get(partitionPath).addAll(filesToClean); + return this; + } + + Map> getPartitionToFileIdMapForCleaner(String commitTime) { + return this.commitsToPartitionToFileIdForCleaner.get(commitTime); + } + + HoodieTestTableState createTestTableStateForBaseFileLengthsOnly(String commitTime, String partitionPath, + List lengths) { + List> fileNameLengthList = new ArrayList<>(); + for (int length : lengths) { + fileNameLengthList.add(Pair.of(UUID.randomUUID().toString(), length)); + } + return createTestTableStateForBaseFilesOnly(commitTime, partitionPath, fileNameLengthList); + } + + HoodieTestTableState createTestTableStateForBaseFilesOnly(String commitTime, String partitionPath, + List> fileNameAndLengthList) { + if (!commitsToPartitionToBaseFileInfoStats.containsKey(commitTime)) { + commitsToPartitionToBaseFileInfoStats.put(commitTime, new HashMap<>()); + } + if (!this.commitsToPartitionToBaseFileInfoStats.get(commitTime).containsKey(partitionPath)) { + this.commitsToPartitionToBaseFileInfoStats.get(commitTime).put(partitionPath, new ArrayList<>()); + } + + this.commitsToPartitionToBaseFileInfoStats.get(commitTime).get(partitionPath).addAll(fileNameAndLengthList); + return this; + } + + HoodieTestTableState createTestTableStateForBaseAndLogFiles(String commitTime, String partitionPath, + List> versionsAndLengths) { + if (!commitsToPartitionToBaseFileInfoStats.containsKey(commitTime)) { + createTestTableStateForBaseFileLengthsOnly(commitTime, partitionPath, + versionsAndLengths.stream().map(Pair::getRight).collect(Collectors.toList())); + } + if (!this.commitsToPartitionToBaseFileInfoStats.get(commitTime).containsKey(partitionPath)) { + createTestTableStateForBaseFileLengthsOnly(commitTime, partitionPath, + versionsAndLengths.stream().map(Pair::getRight).collect(Collectors.toList())); + } + if (!commitsToPartitionToLogFileInfoStats.containsKey(commitTime)) { + commitsToPartitionToLogFileInfoStats.put(commitTime, new HashMap<>()); + } + if (!this.commitsToPartitionToLogFileInfoStats.get(commitTime).containsKey(partitionPath)) { + this.commitsToPartitionToLogFileInfoStats.get(commitTime).put(partitionPath, new ArrayList<>()); + } + + List> fileInfos = new ArrayList<>(); + for (int i = 0; i < versionsAndLengths.size(); i++) { + Pair versionAndLength = versionsAndLengths.get(i); + String fileId = FSUtils.getFileId(commitsToPartitionToBaseFileInfoStats.get(commitTime).get(partitionPath).get(i).getLeft()); + fileInfos.add(Pair.of(fileId, new Integer[] {versionAndLength.getLeft(), 
versionAndLength.getRight()})); + } + this.commitsToPartitionToLogFileInfoStats.get(commitTime).get(partitionPath).addAll(fileInfos); + return this; + } + + Map>> getPartitionToBaseFileInfoMap(String commitTime) { + return this.commitsToPartitionToBaseFileInfoStats.get(commitTime); + } + + Map>> getPartitionToLogFileInfoMap(String commitTime) { + return this.commitsToPartitionToLogFileInfoStats.get(commitTime); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java index d94f41f91c41b..d3c1de56773b3 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.testutils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieTableType; @@ -25,6 +26,7 @@ import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.metadata.HoodieTableMetadata; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; @@ -38,6 +40,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.Properties; import java.util.UUID; @@ -46,6 +49,7 @@ */ public class HoodieTestUtils { + public static final String HOODIE_DATABASE = "test_incremental"; public static final String RAW_TRIPS_TEST_NAME = "raw_trips"; public static final String DEFAULT_WRITE_TOKEN = "1-0-1"; public static final int DEFAULT_LOG_VERSION = 1; @@ -63,9 +67,14 @@ public static HoodieTableMetaClient init(String basePath, HoodieTableType tableT return init(getDefaultHadoopConf(), basePath, tableType); } - public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath) throws IOException { + public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, Properties properties) throws IOException { + return init(getDefaultHadoopConf(), basePath, tableType, properties); + } + + public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath, boolean bootstrapIndexEnable) throws IOException { Properties props = new Properties(); - props.setProperty(HoodieTableConfig.HOODIE_BOOTSTRAP_BASE_PATH, bootstrapBasePath); + props.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), bootstrapBasePath); + props.put(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE.key(), bootstrapIndexEnable); return init(getDefaultHadoopConf(), basePath, tableType, props); } @@ -86,29 +95,69 @@ public static HoodieTableMetaClient init(Configuration hadoopConf, String basePa String tableName) throws IOException { Properties properties = new Properties(); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName); + properties.setProperty(HoodieTableConfig.NAME.key(), tableName); return init(hadoopConf, basePath, tableType, properties); } public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType, - HoodieFileFormat baseFileFormat) + HoodieFileFormat baseFileFormat, String databaseName) + throws IOException { + Properties 
properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), baseFileFormat.toString()); + return init(hadoopConf, basePath, tableType, properties, databaseName); + } + + public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType, + HoodieFileFormat baseFileFormat) throws IOException { + return init(hadoopConf, basePath, tableType, baseFileFormat, false, null, true); + } + + public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType, + HoodieFileFormat baseFileFormat, boolean setKeyGen, String keyGenerator, boolean populateMetaFields) throws IOException { Properties properties = new Properties(); - properties.setProperty(HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP_NAME, baseFileFormat.toString()); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), baseFileFormat.toString()); + if (setKeyGen) { + properties.setProperty("hoodie.datasource.write.keygenerator.class", keyGenerator); + } + properties.setProperty("hoodie.populate.meta.fields", Boolean.toString(populateMetaFields)); return init(hadoopConf, basePath, tableType, properties); } public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType, - Properties properties) + Properties properties) throws IOException { + return init(hadoopConf, basePath, tableType, properties, null); + } + + public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType, + Properties properties, String databaseName) throws IOException { - properties.putIfAbsent(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME); - properties.putIfAbsent(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); - properties.putIfAbsent(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, HoodieAvroPayload.class.getName()); - return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties); + HoodieTableMetaClient.PropertyBuilder builder = + HoodieTableMetaClient.withPropertyBuilder() + .setDatabaseName(databaseName) + .setTableName(RAW_TRIPS_TEST_NAME) + .setTableType(tableType) + .setPayloadClass(HoodieAvroPayload.class); + + String keyGen = properties.getProperty("hoodie.datasource.write.keygenerator.class"); + if (!Objects.equals(keyGen, "org.apache.hudi.keygen.NonpartitionedKeyGenerator")) { + builder.setPartitionFields("some_nonexistent_field"); + } + + Properties processedProperties = builder.fromProperties(properties).build(); + + return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, processedProperties); + } + + public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath, HoodieFileFormat baseFileFormat) throws IOException { + Properties props = new Properties(); + props.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), bootstrapBasePath); + props.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), baseFileFormat.name()); + return init(getDefaultHadoopConf(), basePath, tableType, props); } public static T serializeDeserialize(T object, Class clazz) { - // Using Kyro as the default serializer in Spark Jobs + // Using Kryo as the default serializer in Spark Jobs Kryo kryo = new Kryo(); kryo.register(HoodieTableMetaClient.class, new JavaSerializer()); @@ -118,9 +167,9 @@ public static T serializeDeserialize(T object, Class output.close(); Input input = new Input(new 
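A sketch of initializing test meta clients through the reworked init overloads; the base paths and the class name are illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieTestUtils;

public class InitSketch {
  public static void initTables(Configuration hadoopConf, String basePath) throws IOException {
    // Simplest form: COPY_ON_WRITE table with defaults.
    HoodieTableMetaClient cowClient = HoodieTestUtils.init(basePath + "/cow", HoodieTableType.COPY_ON_WRITE);
    // Explicit base file format, no custom key generator, meta fields populated.
    HoodieTableMetaClient morClient = HoodieTestUtils.init(
        hadoopConf, basePath + "/mor", HoodieTableType.MERGE_ON_READ,
        HoodieFileFormat.PARQUET, false, null, true);
  }
}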
ByteArrayInputStream(baos.toByteArray())); - T deseralizedObject = kryo.readObject(input, clazz); + T deserializedObject = kryo.readObject(input, clazz); input.close(); - return deseralizedObject; + return deserializedObject; } public static List generateFakeHoodieWriteStat(int limit) { @@ -143,4 +192,17 @@ public static List generateFakeHoodieWriteStat(int limit) { } return writeStatList; } + + public static void createCompactionCommitInMetadataTable( + Configuration hadoopConf, HoodieWrapperFileSystem wrapperFs, String basePath, + String instantTime) throws IOException { + // This is to simulate a completed compaction commit in metadata table timeline, + // so that the commits on data table timeline can be archived + // Note that, if metadata table is enabled, instants in data table timeline, + // which are more recent than the last compaction on the metadata table, + // are not archived (HoodieTimelineArchiveLog::getInstantsToArchive) + String metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + HoodieTestUtils.init(hadoopConf, metadataTableBasePath, HoodieTableType.MERGE_ON_READ); + HoodieTestDataGenerator.createCommitFile(metadataTableBasePath, instantTime + "001", wrapperFs.getConf()); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/MockHoodieTimeline.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/MockHoodieTimeline.java index 5da6b325f3ef2..4014531809846 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/MockHoodieTimeline.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/MockHoodieTimeline.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import java.util.Comparator; +import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -38,4 +39,9 @@ public MockHoodieTimeline(Stream completed, Stream inflights) { inflights.map(s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))) .sorted(Comparator.comparing(HoodieInstant::getFileName)).collect(Collectors.toList())); } + + public MockHoodieTimeline(List instants) { + super(); + this.setInstants(instants); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java index 8442aff084a49..c052b63ab544b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java @@ -20,6 +20,7 @@ package org.apache.hudi.common.testutils; import org.apache.hudi.avro.MercifulJsonConverter; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.FileIOUtils; @@ -52,9 +53,10 @@ public class RawTripTestPayload implements HoodieRecordPayload jsonData, String rowKey, String partitionPath, String schemaStr, - Boolean isDeleted) throws IOException { + Boolean isDeleted, Comparable orderingVal) throws IOException { if (jsonData.isPresent()) { this.jsonDataCompressed = compressData(jsonData.get()); this.dataSize = jsonData.get().length(); @@ -62,10 +64,11 @@ public RawTripTestPayload(Option jsonData, String rowKey, String partiti this.rowKey = rowKey; this.partitionPath = partitionPath; this.isDeleted = isDeleted; + this.orderingVal = orderingVal; } public RawTripTestPayload(String jsonData, String 
rowKey, String partitionPath, String schemaStr) throws IOException { - this(Option.of(jsonData), rowKey, partitionPath, schemaStr, false); + this(Option.of(jsonData), rowKey, partitionPath, schemaStr, false, 0L); } public RawTripTestPayload(String jsonData) throws IOException { @@ -77,6 +80,23 @@ public RawTripTestPayload(String jsonData) throws IOException { this.isDeleted = false; } + /** + * @deprecated PLEASE READ THIS CAREFULLY + * + * Converting properly typed schemas into JSON leads to inevitable information loss, since JSON + * encodes only representation of the record (with no schema accompanying it), therefore occasionally + * losing nuances of the original data-types provided by the schema (for ex, with 1.23 literal it's + * impossible to tell whether original type was Double or Decimal). + * + * Multiplied by the fact that Spark 2 JSON schema inference has substantial gaps in it (see below), + * it's **NOT RECOMMENDED** to use this method. Instead please consider using {@link AvroConversionUtils#createDataframe()} + * method accepting list of {@link HoodieRecord} (as produced by the {@link HoodieTestDataGenerator} + * to create Spark's {@code Dataframe}s directly. + * + * REFs + * https://medium.com/swlh/notes-about-json-schema-handling-in-spark-sql-be1e7f13839d + */ + @Deprecated public static List recordsToStrings(List records) { return records.stream().map(RawTripTestPayload::recordToString).filter(Option::isPresent).map(Option::get) .collect(Collectors.toList()); @@ -94,13 +114,23 @@ public static Option recordToString(HoodieRecord record) { } } + public static List deleteRecordsToStrings(List records) { + return records.stream().map(record -> "{\"_row_key\": \"" + record.getRecordKey() + "\",\"partition\": \"" + record.getPartitionPath() + "\"}") + .collect(Collectors.toList()); + } + public String getPartitionPath() { return partitionPath; } @Override - public RawTripTestPayload preCombine(RawTripTestPayload another) { - return another; + public RawTripTestPayload preCombine(RawTripTestPayload oldValue) { + if (oldValue.orderingVal.compareTo(orderingVal) > 0) { + // pick the payload with greatest ordering value + return oldValue; + } else { + return this; + } } @Override diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SampleTestRecord.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SampleTestRecord.java index f8e6252a27fc5..c4a3d4031d5f0 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SampleTestRecord.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SampleTestRecord.java @@ -75,11 +75,17 @@ class TestNestedRecord implements Serializable { private String[] stringArray; public SampleTestRecord(String instantTime, int recordNumber, String fileId) { - this._hoodie_commit_time = instantTime; - this._hoodie_record_key = "key" + recordNumber; - this._hoodie_partition_path = instantTime; - this._hoodie_file_name = fileId; - this._hoodie_commit_seqno = instantTime + recordNumber; + this(instantTime, recordNumber, fileId, true); + } + + public SampleTestRecord(String instantTime, int recordNumber, String fileId, boolean populateMetaFields) { + if (populateMetaFields) { + this._hoodie_commit_time = instantTime; + this._hoodie_record_key = "key" + recordNumber; + this._hoodie_partition_path = instantTime; + this._hoodie_file_name = fileId; + this._hoodie_commit_seqno = instantTime + recordNumber; + } String commitTimeSuffix = "@" + instantTime; int commitHashCode = instantTime.hashCode(); diff 
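A sketch of the new ordering-value-aware preCombine; the JSON payloads, keys, null schema argument, and class name are illustrative.

import java.io.IOException;

import org.apache.hudi.common.testutils.RawTripTestPayload;
import org.apache.hudi.common.util.Option;

public class PreCombineSketch {
  public static RawTripTestPayload pickLatest() throws IOException {
    RawTripTestPayload older = new RawTripTestPayload(
        Option.of("{\"_row_key\": \"r1\"}"), "r1", "2016/03/15", null, false, 1L);
    RawTripTestPayload newer = new RawTripTestPayload(
        Option.of("{\"_row_key\": \"r1\"}"), "r1", "2016/03/15", null, false, 2L);
    // preCombine keeps the payload with the greater ordering value, i.e. "newer" here.
    return newer.preCombine(older);
  }
}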
--git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java index d680b12683c42..70d5a1bb3e9d6 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java @@ -21,6 +21,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.MercifulJsonConverter; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -28,23 +29,30 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Schema; +import org.apache.avro.generic.GenericArray; +import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.avro.io.DecoderFactory; +import org.apache.avro.util.Utf8; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; +import java.nio.ByteBuffer; import java.nio.file.FileSystem; import java.nio.file.FileSystemNotFoundException; import java.nio.file.FileSystems; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -64,17 +72,24 @@ public static List generateTestRecords(int from, int limit) throw return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit); } - private static List toRecords(Schema writerSchema, Schema readerSchema, int from, int limit) - throws IOException, URISyntaxException { - GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema); - // Required to register the necessary JAR:// file system - URI resource = SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI(); - Path dataPath; - if (resource.toString().contains("!")) { - dataPath = uriToPath(resource); - } else { - dataPath = Paths.get(SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI()); + public static List generateTestGenericRecords(int from, int limit) throws IOException, URISyntaxException { + return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit); + } + + public static List generateTestJsonRecords(int from, int limit) throws IOException, URISyntaxException { + Path dataPath = initializeSampleDataPath(); + + try (Stream stream = Files.lines(dataPath)) { + return stream.skip(from).limit(limit).collect(Collectors.toList()); + } catch (IOException e) { + throw new HoodieIOException("Could not read data from " + RESOURCE_SAMPLE_DATA, e); } + } + + private static List toRecords(Schema writerSchema, Schema readerSchema, int from, int limit) + throws IOException, URISyntaxException { + GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema); + Path dataPath = initializeSampleDataPath(); try (Stream stream = Files.lines(dataPath)) { return stream.skip(from).limit(limit).map(s -> { @@ -89,6 +104,21 @@ private static List toRecords(Schema writerSchema, Schema readerS } } + /** + * Required to register the necessary JAR:// file system. 
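+   * When the sample-data resource is bundled inside a jar, its URI contains a "!" separator and
+   * {@link #uriToPath(URI)} is used to register the corresponding jar file system before the path
+   * can be resolved; otherwise the resource resolves to a regular file path on disk.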
+ * @return Path to the sample data in the resource file. + * @throws IOException + * @throws URISyntaxException + */ + private static Path initializeSampleDataPath() throws IOException, URISyntaxException { + URI resource = SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI(); + if (resource.toString().contains("!")) { + return uriToPath(resource); + } else { + return Paths.get(SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI()); + } + } + public static Path uriToPath(URI uri) throws IOException { final Map env = new HashMap<>(); final String[] array = uri.toString().split("!"); @@ -123,7 +153,7 @@ public static List generateHoodieTestRecords(int from, int limit, } private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) { - return new HoodieRecord<>(new HoodieKey(key, partitionPath), + return new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) iRecord))); } @@ -143,7 +173,7 @@ public static List generateHoodieTestRecordsWithoutHoodieMetadata( throws IOException, URISyntaxException { List iRecords = generateTestRecords(from, limit); - return iRecords.stream().map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + return iRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new HoodieAvroPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); } @@ -151,9 +181,9 @@ public static List updateHoodieTestRecordsWithoutHoodieMetadata(Li Schema schema, String fieldNameToUpdate, String newValue) { return oldRecords.stream().map(r -> { try { - GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get(); + GenericRecord rec = (GenericRecord) ((HoodieAvroRecord) r).getData().getInsertValue(schema).get(); rec.put(fieldNameToUpdate, newValue); - return new HoodieRecord<>(r.getKey(), new HoodieAvroPayload(Option.of(rec))); + return new HoodieAvroRecord<>(r.getKey(), new HoodieAvroPayload(Option.of(rec))); } catch (IOException io) { throw new HoodieIOException("unable to get data from hoodie record", io); } @@ -164,6 +194,10 @@ public static Schema getEvolvedSchema() throws IOException { return new Schema.Parser().parse(SchemaTestUtil.class.getResourceAsStream("/simple-test-evolved.avsc")); } + public static Schema getEvolvedCompatibleSchema() throws IOException { + return new Schema.Parser().parse(SchemaTestUtil.class.getResourceAsStream("/simple-test-evolved-compatible.avsc")); + } + public static List generateEvolvedTestRecords(int from, int limit) throws IOException, URISyntaxException { return toRecords(getSimpleSchema(), getEvolvedSchema(), from, limit); @@ -177,9 +211,18 @@ public static Schema getTimestampEvolvedSchema() throws IOException { return new Schema.Parser().parse(SchemaTestUtil.class.getResourceAsStream("/timestamp-test-evolved.avsc")); } + public static Schema getTimestampWithLogicalTypeSchema() throws IOException { + return new Schema.Parser().parse(SchemaTestUtil.class.getResourceAsStream("/timestamp-logical-type.avsc")); + } + + public static GenericRecord generateAvroRecordFromJson(Schema schema, int recordNumber, String instantTime, + String fileId) throws IOException { + return generateAvroRecordFromJson(schema, recordNumber, instantTime, fileId, true); + } + public static GenericRecord generateAvroRecordFromJson(Schema schema, int recordNumber, String instantTime, - String fileId) throws IOException { - SampleTestRecord record = new 
SampleTestRecord(instantTime, recordNumber, fileId); + String fileId, boolean populateMetaFields) throws IOException { + SampleTestRecord record = new SampleTestRecord(instantTime, recordNumber, fileId, populateMetaFields); MercifulJsonConverter converter = new MercifulJsonConverter(); return converter.convert(record.toJsonString(), schema); } @@ -196,4 +239,123 @@ public static Schema getSchemaFromResource(Class clazz, String name, boolean public static Schema getSchemaFromResource(Class clazz, String name) { return getSchemaFromResource(clazz, name, false); } + + public static List generateTestRecordsForSchema(Schema schema) { + RandomData generator = new RandomData(schema, 1000); + List records = new ArrayList<>(); + for (Object o : generator) { + IndexedRecord record = (IndexedRecord) o; + records.add(record); + } + return records; + } + + //Taken from test pkg 1.8.2 avro. This is available as a util class in latest versions. When we upgrade avro we can remove this + static class RandomData implements Iterable { + private final Schema root; + private final long seed; + private final int count; + + public RandomData(Schema schema, int count) { + this(schema, count, System.currentTimeMillis()); + } + + public RandomData(Schema schema, int count, long seed) { + this.root = schema; + this.seed = seed; + this.count = count; + } + + @SuppressWarnings(value = "unchecked") + private static Object generate(Schema schema, Random random, int d) { + switch (schema.getType()) { + case RECORD: + GenericRecord record = new GenericData.Record(schema); + for (Schema.Field field : schema.getFields()) { + record.put(field.name(), generate(field.schema(), random, d + 1)); + } + return record; + case ENUM: + List symbols = schema.getEnumSymbols(); + return new GenericData.EnumSymbol(schema, symbols.get(random.nextInt(symbols.size()))); + case ARRAY: + int length = (random.nextInt(5) + 2) - d; + GenericArray array = + new GenericData.Array(length <= 0 ? 0 : length, schema); + for (int i = 0; i < length; i++) { + array.add(generate(schema.getElementType(), random, d + 1)); + } + return array; + case MAP: + length = (random.nextInt(5) + 2) - d; + Map map = new HashMap(length <= 0 ? 0 : length); + for (int i = 0; i < length; i++) { + map.put(randomUtf8(random, 40), + generate(schema.getValueType(), random, d + 1)); + } + return map; + case UNION: + List types = schema.getTypes(); + //Dropping the null at the end. 
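+          // Assumption baked into this generator: for nullable unions the null branch is declared
+          // last (e.g. ["int", "null"]), so sampling over the first (size - 1) branches always
+          // produces a non-null value.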
+ return generate(types.get(random.nextInt(types.size() - 1)), random, d); + case FIXED: + byte[] bytes = new byte[schema.getFixedSize()]; + random.nextBytes(bytes); + return new GenericData.Fixed(schema, bytes); + case STRING: + return randomUtf8(random, 40); + case BYTES: + return randomBytes(random, 40); + case INT: + return random.nextInt(); + case LONG: + return random.nextLong(); + case FLOAT: + return random.nextFloat(); + case DOUBLE: + return random.nextDouble(); + case BOOLEAN: + return random.nextBoolean(); + case NULL: + return null; + default: + throw new RuntimeException("Unknown type: " + schema); + } + } + + private static Utf8 randomUtf8(Random rand, int maxLength) { + Utf8 utf8 = new Utf8().setLength(rand.nextInt(maxLength)); + for (int i = 0; i < utf8.getLength(); i++) { + utf8.getBytes()[i] = (byte) ('a' + rand.nextInt('z' - 'a')); + } + return utf8; + } + + private static ByteBuffer randomBytes(Random rand, int maxLength) { + ByteBuffer bytes = ByteBuffer.allocate(rand.nextInt(maxLength)); + bytes.limit(bytes.capacity()); + rand.nextBytes(bytes.array()); + return bytes; + } + + public Iterator iterator() { + return new Iterator() { + private int n; + private Random random = new Random(seed); + + public boolean hasNext() { + return n < count; + } + + public Object next() { + n++; + return generate(root, random, 0); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java index 89155904ec605..2e450660b5a4c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.testutils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -48,7 +49,7 @@ public static List upsertRecords(List iRecords, String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); recordKeys.add(key); HoodieRecord record = - new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); + new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); record.unseal(); record.setCurrentLocation(new HoodieRecordLocation("DUMMY_COMMIT_TIME", "DUMMY_FILE_ID")); record.seal(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/HdfsTestService.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/HdfsTestService.java index 44af4ecea7c71..0766c61c67b39 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/HdfsTestService.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/HdfsTestService.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.testutils.minicluster; -import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.NetworkTestUtils; import org.apache.hudi.common.util.FileIOUtils; @@ -45,7 +44,7 @@ public class HdfsTestService { /** * Configuration settings. 
*/ - private Configuration hadoopConf; + private final Configuration hadoopConf; private final String workDir; /** @@ -54,6 +53,7 @@ public class HdfsTestService { private MiniDFSCluster miniDfsCluster; public HdfsTestService() throws IOException { + hadoopConf = new Configuration(); workDir = Files.createTempDirectory("temp").toAbsolutePath().toString(); } @@ -63,7 +63,6 @@ public Configuration getHadoopConf() { public MiniDFSCluster start(boolean format) throws IOException { Objects.requireNonNull(workDir, "The work dir must be set before starting cluster."); - hadoopConf = HoodieTestUtils.getDefaultHadoopConf(); // If clean, then remove the work dir so we can start fresh. String localDFSLocation = getDFSLocation(workDir); @@ -103,9 +102,10 @@ public MiniDFSCluster start(boolean format) throws IOException { public void stop() { LOG.info("HDFS Minicluster service being shut down."); - miniDfsCluster.shutdown(); + if (miniDfsCluster != null) { + miniDfsCluster.shutdown(true, true); + } miniDfsCluster = null; - hadoopConf = null; } /** @@ -121,9 +121,9 @@ private static String getDFSLocation(String baseFsLocation) { /** * Configure the DFS Cluster before launching it. * - * @param config The already created Hadoop configuration we'll further configure for HDFS + * @param config The already created Hadoop configuration we'll further configure for HDFS * @param localDFSLocation The location on the local filesystem where cluster data is stored - * @param bindIP An IP address we want to force the datanode and namenode to bind to. + * @param bindIP An IP address we want to force the datanode and namenode to bind to. * @return The updated Configuration object. */ private static Configuration configureDFSCluster(Configuration config, String localDFSLocation, String bindIP, @@ -144,6 +144,8 @@ private static Configuration configureDFSCluster(Configuration config, String lo String user = System.getProperty("user.name"); config.set("hadoop.proxyuser." + user + ".groups", "*"); config.set("hadoop.proxyuser." 
+ user + ".hosts", "*"); + config.setBoolean("dfs.permissions", false); + config.set("dfs.blocksize","16777216"); return config; } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/MiniClusterUtil.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/MiniClusterUtil.java index ecc00e5f68311..135d875e43842 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/MiniClusterUtil.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/MiniClusterUtil.java @@ -50,10 +50,10 @@ public static void setUp() throws IOException, InterruptedException { public static void shutdown() { if (dfsCluster != null) { - dfsCluster.shutdown(); + dfsCluster.shutdown(true, true); } if (zkServer != null) { - zkServer.shutdown(); + zkServer.shutdown(true); } } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/ZookeeperTestService.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/ZookeeperTestService.java index c278770081904..e5c228f40432b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/ZookeeperTestService.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/minicluster/ZookeeperTestService.java @@ -124,14 +124,14 @@ public ZooKeeperServer start() throws IOException, InterruptedException { return zooKeeperServer; } - public void stop() throws IOException { + public void stop() throws RuntimeException { if (!started) { return; } standaloneServerFactory.shutdown(); if (!waitForServerDown(clientPort, CONNECTION_TIMEOUT)) { - throw new IOException("Waiting for shutdown of standalone server"); + throw new RuntimeException("Waiting for shutdown of standalone server"); } // clear everything @@ -232,4 +232,8 @@ private static boolean waitForServerUp(String hostname, int port, long timeout) } return false; } + + public String connectString() { + return bindIP + ":" + clientPort; + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestAvroOrcUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestAvroOrcUtils.java new file mode 100644 index 0000000000000..692aa1ed14e19 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestAvroOrcUtils.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.util; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.AVRO_SCHEMA; + +import java.util.Arrays; +import java.util.List; +import org.apache.avro.Schema; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.orc.TypeDescription; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestAvroOrcUtils extends HoodieCommonTestHarness { + + public static List testCreateOrcSchemaArgs() { + // the ORC schema is constructed in the order as AVRO_SCHEMA: + // TRIP_SCHEMA_PREFIX, EXTRA_TYPE_SCHEMA, MAP_TYPE_SCHEMA, FARE_NESTED_SCHEMA, TIP_NESTED_SCHEMA, TRIP_SCHEMA_SUFFIX + // The following types are tested: + // DATE, DECIMAL, LONG, INT, BYTES, ARRAY, RECORD, MAP, STRING, FLOAT, DOUBLE + TypeDescription orcSchema = TypeDescription.fromString("struct<" + + "timestamp:bigint,_row_key:string,partition_path:string,rider:string,driver:string,begin_lat:double," + + "begin_lon:double,end_lat:double,end_lon:double," + + "distance_in_meters:int,seconds_since_epoch:bigint,weight:float,nation:binary," + + "current_date:date,current_ts:bigint,height:decimal(10,6)," + + "city_to_state:map," + + "fare:struct," + + "tip_history:array>," + + "_hoodie_is_deleted:boolean>"); + + // Tests the types FIXED, UNION + String structField = "{\"type\":\"record\", \"name\":\"fare\",\"fields\": " + + "[{\"name\": \"amount\",\"type\": \"double\"},{\"name\": \"currency\", \"type\": \"string\"}]}"; + Schema avroSchemaWithMoreTypes = new Schema.Parser().parse( + "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ " + + "{\"name\" : \"age\", \"type\":{\"type\": \"fixed\", \"size\": 16, \"name\": \"fixedField\" }}," + + "{\"name\" : \"height\", \"type\": [\"int\", \"null\"] }," + + "{\"name\" : \"id\", \"type\": [\"int\", \"string\"] }," + + "{\"name\" : \"fare\", \"type\": [" + structField + ", \"null\"] }]}"); + TypeDescription orcSchemaWithMoreTypes = TypeDescription.fromString( + "struct,fare:struct>"); + + return Arrays.asList( + Arguments.of(AVRO_SCHEMA, orcSchema), + Arguments.of(avroSchemaWithMoreTypes, orcSchemaWithMoreTypes) + ); + } + + @ParameterizedTest + @MethodSource("testCreateOrcSchemaArgs") + public void testCreateOrcSchema(Schema avroSchema, TypeDescription orcSchema) { + TypeDescription convertedSchema = AvroOrcUtils.createOrcSchema(avroSchema); + assertEquals(orcSchema, convertedSchema); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestBinaryUtil.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestBinaryUtil.java new file mode 100644 index 0000000000000..1efe5a06865d8 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestBinaryUtil.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestBinaryUtil { + + @Test + public void testIntConvert() { + // test Int + int[] testInt = new int[] {-1, 1, -2, 10000, -100000, 2, Integer.MAX_VALUE, Integer.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testInt.length; i++) { + valueWrappers.add(new OrginValueWrapper<>(i, testInt[i])); + convertResultWrappers.add(new ConvertResultWrapper<>(i, BinaryUtil.intTo8Byte(testInt[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> BinaryUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testInt.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testLongConvert() { + // test Long + long[] testLong = new long[] {-1L, 1L, -2L, 10000L, -100000L, 2L, Long.MAX_VALUE, Long.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testLong.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((long)i, testLong[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((long)i, BinaryUtil.longTo8Byte(testLong[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> BinaryUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testLong.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testDoubleConvert() { + // test Long + double[] testDouble = new double[] {-1.00d, 1.05d, -2.3d, 10000.002d, -100000.7d, 2.9d, Double.MAX_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < testDouble.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((Double)(i * 1.0), testDouble[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((Double)(i * 1.0), BinaryUtil.doubleTo8Byte(testDouble[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> BinaryUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testDouble.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + @Test + public void testFloatConvert() { + // test Long + float[] testDouble = new float[] {-1.00f, 1.05f, -2.3f, 10000.002f, -100000.7f, 2.9f, Float.MAX_VALUE, Float.MIN_VALUE}; + List> valueWrappers = new ArrayList<>(); + List> convertResultWrappers = new ArrayList<>(); + for (int i = 0; i < 
testDouble.length; i++) { + valueWrappers.add(new OrginValueWrapper<>((float)(i * 1.0), testDouble[i])); + convertResultWrappers.add(new ConvertResultWrapper<>((float)(i * 1.0), BinaryUtil.doubleTo8Byte((double) testDouble[i]))); + } + + Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue))); + + Collections.sort(convertResultWrappers, ((o1, o2) -> BinaryUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length))); + + for (int i = 0; i < testDouble.length; i++) { + assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index); + } + } + + private class ConvertResultWrapper { + T index; + byte[] result; + public ConvertResultWrapper(T index, byte[] result) { + this.index = index; + this.result = result; + } + } + + private class OrginValueWrapper { + T index; + T originValue; + public OrginValueWrapper(T index, T originValue) { + this.index = index; + this.originValue = originValue; + } + } + + @Test + public void testConvertBytesToLong() { + long[] tests = new long[] {Long.MIN_VALUE, -1L, 0, 1L, Long.MAX_VALUE}; + for (int i = 0; i < tests.length; i++) { + assertEquals(BinaryUtil.convertBytesToLong(convertLongToBytes(tests[i])), tests[i]); + } + } + + @Test + public void testConvertBytesToLongWithPadding() { + byte[] bytes = new byte[2]; + bytes[0] = 2; + bytes[1] = 127; + assertEquals(BinaryUtil.convertBytesToLong(bytes), 2 * 256 + 127); + } + + private byte[] convertLongToBytes(long num) { + byte[] byteNum = new byte[8]; + for (int i = 0; i < 8; i++) { + int offset = 64 - (i + 1) * 8; + byteNum[i] = (byte) ((num >> offset) & 0xff); + } + return byteNum; + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java index 5d82bbce734ca..a5d45d1184f9b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java @@ -98,6 +98,22 @@ public void testClusteringPlanMultipleInstants() throws Exception { validateClusteringInstant(fileIds3, partitionPath1, clusterTime, fileGroupToInstantMap); } + // replacecommit.inflight doesnt have clustering plan. + // Verify that getClusteringPlan fetches content from corresponding requested file. 
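+  // Since the inflight instant below is transitioned with Option.empty() content, getClusteringPlan
+  // is expected to fall back to the corresponding requested instant's metadata, so the plan resolved
+  // via the inflight instant must equal the plan resolved via the requested instant.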
+ @Test + public void testClusteringPlanInflight() throws Exception { + String partitionPath1 = "partition1"; + List fileIds1 = new ArrayList<>(); + fileIds1.add(UUID.randomUUID().toString()); + fileIds1.add(UUID.randomUUID().toString()); + String clusterTime1 = "1"; + HoodieInstant requestedInstant = createRequestedReplaceInstant(partitionPath1, clusterTime1, fileIds1); + HoodieInstant inflightInstant = metaClient.getActiveTimeline().transitionReplaceRequestedToInflight(requestedInstant, Option.empty()); + HoodieClusteringPlan requestedClusteringPlan = ClusteringUtils.getClusteringPlan(metaClient, requestedInstant).get().getRight(); + HoodieClusteringPlan inflightClusteringPlan = ClusteringUtils.getClusteringPlan(metaClient, inflightInstant).get().getRight(); + assertEquals(requestedClusteringPlan, inflightClusteringPlan); + } + private void validateClusteringInstant(List fileIds, String partitionPath, String expectedInstantTime, Map fileGroupToInstantMap) { for (String fileId : fileIds) { @@ -131,7 +147,7 @@ private HoodieInstant createRequestedReplaceInstant(String partitionPath1, Strin private FileSlice generateFileSlice(String partitionPath, String fileId, String baseInstant) { FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant); - fs.setBaseFile(new HoodieBaseFile(FSUtils.makeDataFileName(baseInstant, "1-0-1", fileId))); + fs.setBaseFile(new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId))); return fs; } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCollectionUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCollectionUtils.java new file mode 100644 index 0000000000000..53ca9b2bebc1f --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCollectionUtils.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.util; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.apache.hudi.common.util.CollectionUtils.batches; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +class TestCollectionUtils { + + @Test + void getBatchesFromList() { + assertThrows(IllegalArgumentException.class, () -> { + batches(Collections.emptyList(), -1); + }); + + assertThrows(IllegalArgumentException.class, () -> { + batches(Collections.emptyList(), 0); + }); + + assertEquals(Collections.emptyList(), batches(Collections.emptyList(), 1)); + + List> intsBatches1 = batches(Arrays.asList(1, 2, 3, 4, 5, 6), 3); + assertEquals(2, intsBatches1.size()); + assertEquals(Arrays.asList(1, 2, 3), intsBatches1.get(0)); + assertEquals(Arrays.asList(4, 5, 6), intsBatches1.get(1)); + + List> intsBatches2 = batches(Arrays.asList(1, 2, 3, 4, 5, 6), 5); + assertEquals(2, intsBatches2.size()); + assertEquals(Arrays.asList(1, 2, 3, 4, 5), intsBatches2.get(0)); + assertEquals(Collections.singletonList(6), intsBatches2.get(1)); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCommitUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCommitUtils.java index 98535e95238e6..c55e34acfa877 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCommitUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCommitUtils.java @@ -93,4 +93,4 @@ private HoodieWriteStat createWriteStat(String partition, String fileId) { writeStat1.setFileId(fileId); return writeStat1; } -} +} \ No newline at end of file diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java index 35ff4cba08997..30abe48cb4e19 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCompactionUtils.java @@ -27,6 +27,9 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.compaction.CompactionPlanMigrator; import org.apache.hudi.common.testutils.CompactionTestUtils.DummyHoodieBaseFile; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; @@ -35,15 +38,20 @@ import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; +import java.util.stream.Stream; import static org.apache.hudi.common.testutils.CompactionTestUtils.createCompactionPlan; import static org.apache.hudi.common.testutils.CompactionTestUtils.scheduleCompaction; @@ -97,6 +105,8 @@ public void testUpgradeDowngrade() { @Test public void 
testBuildFromFileSlice() { + String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); + // Empty File-Slice with no data and log files FileSlice emptyFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "empty1"); HoodieCompactionOperation op = @@ -106,7 +116,7 @@ public void testBuildFromFileSlice() { // File Slice with data-file but no log files FileSlice noLogFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noLog1"); - noLogFileSlice.setBaseFile(new DummyHoodieBaseFile("/tmp/noLog_1_000.parquet")); + noLogFileSlice.setBaseFile(new DummyHoodieBaseFile("/tmp/noLog_1_000" + extension)); op = CompactionUtils.buildFromFileSlice(DEFAULT_PARTITION_PATHS[0], noLogFileSlice, Option.of(metricsCaptureFn)); testFileSliceCompactionOpEquality(noLogFileSlice, op, DEFAULT_PARTITION_PATHS[0], LATEST_COMPACTION_METADATA_VERSION); @@ -122,7 +132,7 @@ public void testBuildFromFileSlice() { // File Slice with data-file and log files present FileSlice fileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1"); - fileSlice.setBaseFile(new DummyHoodieBaseFile("/tmp/noLog_1_000.parquet")); + fileSlice.setBaseFile(new DummyHoodieBaseFile("/tmp/noLog_1_000" + extension)); fileSlice.addLogFile( new HoodieLogFile(new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))); fileSlice.addLogFile( @@ -135,16 +145,18 @@ public void testBuildFromFileSlice() { * Generate input for compaction plan tests. */ private Pair>, HoodieCompactionPlan> buildCompactionPlan() { + String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); + Path fullPartitionPath = new Path(new Path(metaClient.getBasePath()), DEFAULT_PARTITION_PATHS[0]); FileSlice emptyFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "empty1"); FileSlice fileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1"); - fileSlice.setBaseFile(new DummyHoodieBaseFile(fullPartitionPath.toString() + "/data1_1_000.parquet")); + fileSlice.setBaseFile(new DummyHoodieBaseFile(fullPartitionPath.toString() + "/data1_1_000" + extension)); fileSlice.addLogFile(new HoodieLogFile( new Path(fullPartitionPath, new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN))))); fileSlice.addLogFile(new HoodieLogFile( new Path(fullPartitionPath, new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN))))); FileSlice noLogFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noLog1"); - noLogFileSlice.setBaseFile(new DummyHoodieBaseFile(fullPartitionPath.toString() + "/noLog_1_000.parquet")); + noLogFileSlice.setBaseFile(new DummyHoodieBaseFile(fullPartitionPath.toString() + "/noLog_1_000" + extension)); FileSlice noDataFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1"); noDataFileSlice.addLogFile(new HoodieLogFile( new Path(fullPartitionPath, new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN))))); @@ -188,7 +200,7 @@ public void testGetAllPendingCompactionOperationsWithDupFileId() throws IOExcept // schedule similar plan again so that there will be duplicates plan1.getOperations().get(0).setDataFilePath("bla"); scheduleCompaction(metaClient, "005", plan1); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); assertThrows(IllegalStateException.class, () -> { 
CompactionUtils.getAllPendingCompactionOperations(metaClient); }); @@ -203,7 +215,7 @@ public void testGetAllPendingCompactionOperationsWithFullDupFileId() throws IOEx scheduleCompaction(metaClient, "003", plan2); // schedule same plan again so that there will be duplicates. It should not fail as it is a full duplicate scheduleCompaction(metaClient, "005", plan1); - metaClient = new HoodieTableMetaClient(metaClient.getHadoopConf(), basePath, true); + metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); Map> res = CompactionUtils.getAllPendingCompactionOperations(metaClient); } @@ -226,11 +238,95 @@ public void testGetAllPendingCompactionOperationsForEmptyCompactions() throws IO setupAndValidateCompactionOperations(metaClient, false, 0, 0, 0, 0); } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testGetDeltaCommitsSinceLatestCompaction(boolean hasCompletedCompaction) { + HoodieActiveTimeline timeline = prepareTimeline(hasCompletedCompaction); + Pair actual = + CompactionUtils.getDeltaCommitsSinceLatestCompaction(timeline).get(); + if (hasCompletedCompaction) { + Stream instants = actual.getLeft().getInstants(); + assertEquals( + Stream.of( + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "07"), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "08"), + new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "09")) + .collect(Collectors.toList()), + actual.getLeft().getInstants().collect(Collectors.toList())); + assertEquals( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "06"), + actual.getRight()); + } else { + assertEquals( + Stream.of( + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "01"), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "02"), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "03"), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "04"), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "05"), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "07"), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "08"), + new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "09")) + .collect(Collectors.toList()), + actual.getLeft().getInstants().collect(Collectors.toList())); + assertEquals( + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "01"), + actual.getRight()); + } + } + + @Test + public void testGetDeltaCommitsSinceLatestCompactionWithEmptyDeltaCommits() { + HoodieActiveTimeline timeline = new MockHoodieActiveTimeline(); + assertEquals(Option.empty(), CompactionUtils.getDeltaCommitsSinceLatestCompaction(timeline)); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testGetOldestInstantToKeepForCompaction(boolean hasCompletedCompaction) { + HoodieActiveTimeline timeline = prepareTimeline(hasCompletedCompaction); + Option actual = CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 20); + + if (hasCompletedCompaction) { + assertEquals(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "06"), actual.get()); + } else { + assertEquals(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "01"), actual.get()); + } + + actual = CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 3); + assertEquals(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "07"), actual.get()); + + actual = 
CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 2); + assertEquals(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "08"), actual.get()); + } + + @Test + public void testGetOldestInstantToKeepForCompactionWithEmptyDeltaCommits() { + HoodieActiveTimeline timeline = new MockHoodieActiveTimeline(); + assertEquals(Option.empty(), CompactionUtils.getOldestInstantToRetainForCompaction(timeline, 20)); + } + + private HoodieActiveTimeline prepareTimeline(boolean hasCompletedCompaction) { + if (hasCompletedCompaction) { + return new MockHoodieActiveTimeline( + Stream.of("01", "02", "03", "04", "05", "07", "08"), + Stream.of("06"), + Stream.of(Pair.of("09", HoodieTimeline.DELTA_COMMIT_ACTION))); + } else { + return new MockHoodieActiveTimeline( + Stream.of("01", "02", "03", "04", "05", "07", "08"), + Stream.empty(), + Stream.of( + Pair.of("06", HoodieTimeline.COMMIT_ACTION), + Pair.of("09", HoodieTimeline.DELTA_COMMIT_ACTION))); + } + } + /** * Validates if generated compaction plan matches with input file-slices. * * @param input File Slices with partition-path - * @param plan Compaction Plan + * @param plan Compaction Plan */ private void testFileSlicesCompactionPlanEquality(List> input, HoodieCompactionPlan plan) { assertEquals(input.size(), plan.getOperations().size(), "All file-slices present"); @@ -241,12 +337,12 @@ private void testFileSlicesCompactionPlanEquality(List> /** * Validates if generated compaction operation matches with input file slice and partition path. * - * @param slice File Slice - * @param op HoodieCompactionOperation + * @param slice File Slice + * @param op HoodieCompactionOperation * @param expPartitionPath Partition path */ private void testFileSliceCompactionOpEquality(FileSlice slice, HoodieCompactionOperation op, String expPartitionPath, - int version) { + int version) { assertEquals(expPartitionPath, op.getPartitionPath(), "Partition path is correct"); assertEquals(slice.getBaseInstantTime(), op.getBaseInstantTime(), "Same base-instant"); assertEquals(slice.getFileId(), op.getFileId(), "Same file-id"); @@ -266,4 +362,24 @@ private void testFileSliceCompactionOpEquality(FileSlice slice, HoodieCompaction protected HoodieTableType getTableType() { return HoodieTableType.MERGE_ON_READ; } + + class MockHoodieActiveTimeline extends HoodieActiveTimeline { + + public MockHoodieActiveTimeline() { + super(); + this.setInstants(new ArrayList<>()); + } + + public MockHoodieActiveTimeline( + Stream completedDeltaCommits, + Stream completedCompactionCommits, + Stream> inflights) { + super(); + this.setInstants(Stream.concat( + Stream.concat(completedDeltaCommits.map(s -> new HoodieInstant(false, DELTA_COMMIT_ACTION, s)), + completedCompactionCommits.map(s -> new HoodieInstant(false, COMMIT_ACTION, s))), + inflights.map(s -> new HoodieInstant(true, s.getRight(), s.getLeft()))) + .sorted(Comparator.comparing(HoodieInstant::getFileName)).collect(Collectors.toList())); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestCustomizedThreadFactory.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCustomizedThreadFactory.java new file mode 100644 index 0000000000000..797643a2c994f --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestCustomizedThreadFactory.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.locks.LockSupport; + +public class TestCustomizedThreadFactory { + + @Test + public void testThreadPrefix() throws ExecutionException, InterruptedException { + int nThreads = 100; + String threadNamePrefix = "consumer"; + ExecutorService executorService = Executors.newFixedThreadPool(nThreads, new CustomizedThreadFactory(threadNamePrefix)); + for (int i = 0; i < nThreads; i++) { + Future resultFuture = executorService.submit(() -> { + LockSupport.parkNanos(10000000L); + String name = Thread.currentThread().getName(); + return name.startsWith(threadNamePrefix); + }); + Boolean result = resultFuture.get(); + Assertions.assertTrue(result); + } + } + + @Test + public void testDefaultThreadPrefix() throws ExecutionException, InterruptedException { + int nThreads = 100; + String defaultThreadNamePrefix = "pool-1"; + ExecutorService executorService = Executors.newFixedThreadPool(nThreads, new CustomizedThreadFactory()); + for (int i = 0; i < nThreads; i++) { + Future resultFuture = executorService.submit(() -> { + LockSupport.parkNanos(10000000L); + String name = Thread.currentThread().getName(); + return name.startsWith(defaultThreadNamePrefix); + }); + Boolean result = resultFuture.get(); + Assertions.assertTrue(result); + } + } + + @Test + public void testDaemonThread() throws ExecutionException, InterruptedException { + int nThreads = 100; + String threadNamePrefix = "consumer"; + ExecutorService executorService = Executors.newFixedThreadPool(nThreads, new CustomizedThreadFactory(threadNamePrefix, true)); + for (int i = 0; i < nThreads; i++) { + Future resultFuture = executorService.submit(() -> { + LockSupport.parkNanos(10000000L); + String name = Thread.currentThread().getName(); + boolean daemon = Thread.currentThread().isDaemon(); + return name.startsWith(threadNamePrefix) && daemon; + }); + Boolean result = resultFuture.get(); + Assertions.assertTrue(result); + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java index ddce3216b0d6a..73d10d73d2f73 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java @@ -18,17 +18,24 @@ package org.apache.hudi.common.util; +import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.config.DFSPropertiesConfiguration; import 
org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hudi.exception.HoodieIOException; +import org.junit.Rule; +import org.junit.contrib.java.lang.system.EnvironmentVariables; import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import java.io.File; import java.io.IOException; import java.io.PrintStream; @@ -47,6 +54,10 @@ public class TestDFSPropertiesConfiguration { private static MiniDFSCluster dfsCluster; private static DistributedFileSystem dfs; + @Rule + public static final EnvironmentVariables ENVIRONMENT_VARIABLES + = new EnvironmentVariables(); + @BeforeAll public static void initClass() throws Exception { hdfsTestService = new HdfsTestService(); @@ -73,12 +84,17 @@ public static void initClass() throws Exception { } @AfterAll - public static void cleanupClass() throws Exception { + public static void cleanupClass() { if (hdfsTestService != null) { hdfsTestService.stop(); } } + @AfterEach + public void cleanupGlobalConfig() { + DFSPropertiesConfiguration.clearGlobalProps(); + } + private static void writePropertiesFile(Path path, String[] lines) throws IOException { PrintStream out = new PrintStream(dfs.create(path, true)); for (String line : lines) { @@ -90,8 +106,8 @@ private static void writePropertiesFile(Path path, String[] lines) throws IOExce @Test public void testParsing() { - DFSPropertiesConfiguration cfg = new DFSPropertiesConfiguration(dfs, new Path(dfsBasePath + "/t1.props")); - TypedProperties props = cfg.getConfig(); + DFSPropertiesConfiguration cfg = new DFSPropertiesConfiguration(dfs.getConf(), new Path(dfsBasePath + "/t1.props")); + TypedProperties props = cfg.getProps(); assertEquals(5, props.size()); assertThrows(IllegalArgumentException.class, () -> { props.getString("invalid.key"); @@ -118,8 +134,8 @@ public void testParsing() { @Test public void testIncludes() { - DFSPropertiesConfiguration cfg = new DFSPropertiesConfiguration(dfs, new Path(dfsBasePath + "/t3.props")); - TypedProperties props = cfg.getConfig(); + DFSPropertiesConfiguration cfg = new DFSPropertiesConfiguration(dfs.getConf(), new Path(dfsBasePath + "/t3.props")); + TypedProperties props = cfg.getProps(); assertEquals(123, props.getInteger("int.prop")); assertEquals(243.4, props.getDouble("double.prop"), 0.001); @@ -127,7 +143,59 @@ public void testIncludes() { assertEquals("t3.value", props.getString("string.prop")); assertEquals(1354354354, props.getLong("long.prop")); assertThrows(IllegalStateException.class, () -> { - new DFSPropertiesConfiguration(dfs, new Path(dfsBasePath + "/t4.props")); + cfg.addPropsFromFile(new Path(dfsBasePath + "/t4.props")); }, "Should error out on a self-included file."); } + + @Test + public void testLocalFileSystemLoading() throws IOException { + DFSPropertiesConfiguration cfg = new DFSPropertiesConfiguration(dfs.getConf(), new Path(dfsBasePath + "/t1.props")); + + cfg.addPropsFromFile( + new Path( + String.format( + "file:%s", + getClass().getClassLoader() + .getResource("props/test.properties") + .getPath() + ) + )); + + TypedProperties props = cfg.getProps(); + + assertEquals(123, props.getInteger("int.prop")); + assertEquals(113.4, props.getDouble("double.prop"), 0.001); + 
assertTrue(props.getBoolean("boolean.prop")); + assertEquals("str", props.getString("string.prop")); + assertEquals(1354354354, props.getLong("long.prop")); + assertEquals(123, props.getInteger("some.random.prop")); + } + + @Test + public void testNoGlobalConfFileConfigured() { + ENVIRONMENT_VARIABLES.clear(DFSPropertiesConfiguration.CONF_FILE_DIR_ENV_NAME); + DFSPropertiesConfiguration.refreshGlobalProps(); + try { + if (!FSUtils.getFs(DFSPropertiesConfiguration.DEFAULT_PATH, new Configuration()).exists(DFSPropertiesConfiguration.DEFAULT_PATH)) { + assertEquals(0, DFSPropertiesConfiguration.getGlobalProps().size()); + } + } catch (IOException e) { + throw new HoodieIOException("Cannot check if the default config file exist: " + DFSPropertiesConfiguration.DEFAULT_PATH); + } + } + + @Test + public void testLoadGlobalConfFile() { + // set HUDI_CONF_DIR + String testPropsFilePath = new File("src/test/resources/external-config").getAbsolutePath(); + ENVIRONMENT_VARIABLES.set(DFSPropertiesConfiguration.CONF_FILE_DIR_ENV_NAME, testPropsFilePath); + + DFSPropertiesConfiguration.refreshGlobalProps(); + assertEquals(5, DFSPropertiesConfiguration.getGlobalProps().size()); + assertEquals("jdbc:hive2://localhost:10000", DFSPropertiesConfiguration.getGlobalProps().get("hoodie.datasource.hive_sync.jdbcurl")); + assertEquals("true", DFSPropertiesConfiguration.getGlobalProps().get("hoodie.datasource.hive_sync.use_jdbc")); + assertEquals("false", DFSPropertiesConfiguration.getGlobalProps().get("hoodie.datasource.hive_sync.support_timestamp")); + assertEquals("BLOOM", DFSPropertiesConfiguration.getGlobalProps().get("hoodie.index.type")); + assertEquals("true", DFSPropertiesConfiguration.getGlobalProps().get("hoodie.metadata.enable")); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestDateTimeUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestDateTimeUtils.java new file mode 100644 index 0000000000000..996c8ba6cc1ac --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestDateTimeUtils.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.util; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.time.format.DateTimeParseException; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestDateTimeUtils { + + @ParameterizedTest + @ValueSource(strings = {"0", "1612542030000", "2020-01-01T01:01:00Z", "1970-01-01T00:00:00.123456Z"}) + public void testParseStringIntoInstant(String s) { + assertDoesNotThrow(() -> { + DateTimeUtils.parseDateTime(s); + }); + } + + @ParameterizedTest + @ValueSource(strings = {"#", "0L", ""}) + public void testParseDateTimeThrowsException(String s) { + assertThrows(DateTimeParseException.class, () -> { + DateTimeUtils.parseDateTime(s); + }); + } + + @Test + public void testParseDateTimeWithNull() { + assertThrows(IllegalArgumentException.class, () -> { + DateTimeUtils.parseDateTime(null); + }); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestFileIOUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestFileIOUtils.java index 0c2ac7cf6ea34..762aad704609b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestFileIOUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestFileIOUtils.java @@ -26,6 +26,9 @@ import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -63,4 +66,12 @@ public void testInputStreamReads() throws IOException { inputStream = new ByteArrayInputStream(msg.getBytes(StandardCharsets.UTF_8)); assertEquals(msg.length(), FileIOUtils.readAsByteArray(inputStream).length); } + + @Test + public void testReadAsUTFStringLines() { + String content = "a\nb\nc"; + List expectedLines = Arrays.stream(new String[]{"a", "b", "c"}).collect(Collectors.toList()); + ByteArrayInputStream inputStream = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)); + assertEquals(expectedLines, FileIOUtils.readAsUTFStringLines(inputStream)); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestObjectSizeCalculator.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestObjectSizeCalculator.java new file mode 100644 index 0000000000000..625e30198404a --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestObjectSizeCalculator.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.util; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieRecord; + +import org.apache.avro.Schema; +import org.junit.jupiter.api.Test; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.hudi.common.util.ObjectSizeCalculator.getObjectSize; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestObjectSizeCalculator { + + @Test + public void testGetObjectSize() { + EmptyClass emptyClass = new EmptyClass(); + StringClass stringClass = new StringClass(); + PayloadClass payloadClass = new PayloadClass(); + String emptyString = ""; + String string = "hello"; + String[] stringArray = {emptyString, string, " world"}; + String[] anotherStringArray = new String[100]; + List stringList = new ArrayList<>(); + StringBuilder stringBuilder = new StringBuilder(100); + int maxIntPrimitive = Integer.MAX_VALUE; + int minIntPrimitive = Integer.MIN_VALUE; + Integer maxInteger = Integer.MAX_VALUE; + Integer minInteger = Integer.MIN_VALUE; + long zeroLong = 0L; + double zeroDouble = 0.0; + boolean booleanField = true; + Object object = new Object(); + String name = "Alice Bob"; + Person person = new Person(name); + + assertEquals(40, getObjectSize(emptyString)); + assertEquals(56, getObjectSize(string)); + assertEquals(184, getObjectSize(stringArray)); + assertEquals(416, getObjectSize(anotherStringArray)); + assertEquals(40, getObjectSize(stringList)); + assertEquals(240, getObjectSize(stringBuilder)); + assertEquals(16, getObjectSize(maxIntPrimitive)); + assertEquals(16, getObjectSize(minIntPrimitive)); + assertEquals(16, getObjectSize(maxInteger)); + assertEquals(16, getObjectSize(minInteger)); + assertEquals(24, getObjectSize(zeroLong)); + assertEquals(24, getObjectSize(zeroDouble)); + assertEquals(16, getObjectSize(booleanField)); + assertEquals(80, getObjectSize(DayOfWeek.TUESDAY)); + assertEquals(16, getObjectSize(object)); + assertEquals(32, getObjectSize(emptyClass)); + assertEquals(40, getObjectSize(stringClass)); + assertEquals(40, getObjectSize(payloadClass)); + // Since avro 1.9, Schema use ConcurrentHashMap instead of LinkedHashMap to + // implement props, which will change the size of the object. + assertEquals(HoodieAvroUtils.gteqAvro1_9() ? 1320 : 1240, + getObjectSize(Schema.create(Schema.Type.STRING))); + assertEquals(104, getObjectSize(person)); + } + + class EmptyClass { + } + + class StringClass { + private String s; + } + + class PayloadClass implements Serializable { + private HoodieRecord record; + } + + class Person { + private String name; + + public Person(String name) { + this.name = name; + } + } + + public enum DayOfWeek { + MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY, SUNDAY + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestOrcReaderIterator.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestOrcReaderIterator.java new file mode 100644 index 0000000000000..b55995c0c2d55 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestOrcReaderIterator.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.LongColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; + +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestOrcReaderIterator { + private final Path filePath = new Path(System.getProperty("java.io.tmpdir") + "/f1_1-0-1_000.orc"); + + @BeforeEach + @AfterEach + public void clearTempFile() { + File file = new File(filePath.toString()); + if (file.exists()) { + file.delete(); + } + } + + @Test + public void testOrcIteratorReadData() throws Exception { + final Configuration conf = new Configuration(); + Schema avroSchema = getSchemaFromResource(TestOrcReaderIterator.class, "/simple-test.avsc"); + TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(avroSchema); + OrcFile.WriterOptions options = OrcFile.writerOptions(conf).setSchema(orcSchema).compress(CompressionKind.ZLIB); + Writer writer = OrcFile.createWriter(filePath, options); + VectorizedRowBatch batch = orcSchema.createRowBatch(); + BytesColumnVector nameColumns = (BytesColumnVector) batch.cols[0]; + LongColumnVector numberColumns = (LongColumnVector) batch.cols[1]; + BytesColumnVector colorColumns = (BytesColumnVector) batch.cols[2]; + for (int r = 0; r < 5; ++r) { + int row = batch.size++; + byte[] name = ("name" + r).getBytes(StandardCharsets.UTF_8); + nameColumns.setVal(row, name); + byte[] color = ("color" + r).getBytes(StandardCharsets.UTF_8); + colorColumns.setVal(row, color); + numberColumns.vector[row] = r; + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf)); + RecordReader recordReader = reader.rows(new Reader.Options(conf).schema(orcSchema)); + Iterator iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema); + int recordCount = 0; + while (iterator.hasNext()) { + GenericRecord record = iterator.next(); + assertEquals("name" + recordCount, record.get("name").toString()); + assertEquals("color" + recordCount, record.get("favorite_color").toString()); + assertEquals(recordCount, record.get("favorite_number")); + recordCount++; + } + assertEquals(5, recordCount); + } +} diff --git 
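For readers less familiar with the ORC vectorized API exercised in the test above, the following is a rough, self-contained sketch of the batch-to-row reading pattern that an Avro-producing iterator wraps. The file path and column ordering are assumptions for illustration; this is not Hudi's OrcReaderIterator.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;

public class OrcBatchReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hypothetical path; the test above writes a similar file under java.io.tmpdir.
    Path path = new Path("/tmp/f1_1-0-1_000.orc");
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    TypeDescription schema = reader.getSchema();
    RecordReader rows = reader.rows(new Reader.Options(conf).schema(schema));
    VectorizedRowBatch batch = schema.createRowBatch();
    // nextBatch() refills the column vectors; a row-level iterator keeps a cursor
    // into the current batch and fetches the next batch when it runs out.
    while (rows.nextBatch(batch)) {
      BytesColumnVector names = (BytesColumnVector) batch.cols[0];  // assumes "name" is column 0
      LongColumnVector numbers = (LongColumnVector) batch.cols[1];  // assumes "favorite_number" is column 1
      for (int r = 0; r < batch.size; r++) {
        String name = new String(names.vector[r], names.start[r], names.length[r], StandardCharsets.UTF_8);
        System.out.println(name + " -> " + numbers.vector[r]);
      }
    }
    rows.close();
  }
}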
a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java index 799ed248b1d8a..37fead4928b43 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetReaderIterator.java @@ -18,7 +18,7 @@ package org.apache.hudi.common.util; -import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieException; import org.apache.parquet.hadoop.ParquetReader; import org.junit.jupiter.api.Test; @@ -30,6 +30,8 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; public class TestParquetReaderIterator { @@ -59,6 +61,7 @@ public void testParquetIterator() throws IOException { assertEquals(1, iterator.next()); // no more entries to iterate on assertFalse(iterator.hasNext()); - assertThrows(HoodieIOException.class, iterator::next, "should throw an exception since there is only 1 record"); + assertThrows(HoodieException.class, iterator::next, "should throw an exception since there is only 1 record"); + verify(reader, times(1)).close(); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java index 2bcbcbdab67a3..d5b769190c2a5 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java @@ -23,11 +23,14 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.avro.JsonProperties; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; @@ -50,6 +53,7 @@ import java.util.Set; import java.util.UUID; +import static org.apache.hudi.avro.HoodieAvroUtils.METADATA_FIELD_SCHEMA; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -58,6 +62,8 @@ */ public class TestParquetUtils extends HoodieCommonTestHarness { + private ParquetUtils parquetUtils; + public static List bloomFilterTypeCodes() { return Arrays.asList( Arguments.of(BloomFilterTypeCode.SIMPLE.name()), @@ -68,6 +74,7 @@ public static List bloomFilterTypeCodes() { @BeforeEach public void setup() { initPath(); + parquetUtils = new ParquetUtils(); } @ParameterizedTest @@ -78,18 +85,18 @@ public void testHoodieWriteSupport(String typeCode) throws Exception { rowKeys.add(UUID.randomUUID().toString()); } - String filePath = Paths.get(basePath, "test.parquet").toString(); + String filePath = Paths.get(basePath, "test.parquet").toUri().toString(); writeParquetFile(typeCode, filePath, rowKeys); // Read and verify List rowKeysInFile = new ArrayList<>( - 
ParquetUtils.readRowKeysFromParquet(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath))); + parquetUtils.readRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath))); Collections.sort(rowKeysInFile); Collections.sort(rowKeys); assertEquals(rowKeys, rowKeysInFile, "Did not read back the expected list of keys"); BloomFilter filterInFile = - ParquetUtils.readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)); + parquetUtils.readBloomFilterFromMetadata(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)); for (String rowKey : rowKeys) { assertTrue(filterInFile.mightContain(rowKey), "key should be found in bloom filter"); } @@ -108,12 +115,12 @@ public void testFilterParquetRowKeys(String typeCode) throws Exception { } } - String filePath = Paths.get(basePath, "test.parquet").toString(); + String filePath = Paths.get(basePath, "test.parquet").toUri().toString(); writeParquetFile(typeCode, filePath, rowKeys); // Read and verify Set filtered = - ParquetUtils.filterParquetRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), filter); + parquetUtils.filterRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), filter); assertEquals(filter.size(), filtered.size(), "Filtered count does not match"); @@ -134,13 +141,40 @@ public void testFetchRecordKeyPartitionPathFromParquet(String typeCode) throws E expected.add(new HoodieKey(rowKey, partitionPath)); } - String filePath = basePath + "/test.parquet"; + String filePath = Paths.get(basePath, "test.parquet").toUri().toString(); Schema schema = HoodieAvroUtils.getRecordKeyPartitionPathSchema(); writeParquetFile(typeCode, filePath, rowKeys, schema, true, partitionPath); // Read and verify List fetchedRows = - ParquetUtils.fetchRecordKeyPartitionPathFromParquet(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)); + parquetUtils.fetchHoodieKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)); + assertEquals(rowKeys.size(), fetchedRows.size(), "Total count does not match"); + + for (HoodieKey entry : fetchedRows) { + assertTrue(expected.contains(entry), "Record key must be in the given filter"); + } + } + + @Test + public void testFetchRecordKeyPartitionPathVirtualKeysFromParquet() throws Exception { + List rowKeys = new ArrayList<>(); + List expected = new ArrayList<>(); + String partitionPath = "path1"; + for (int i = 0; i < 1000; i++) { + String rowKey = UUID.randomUUID().toString(); + rowKeys.add(rowKey); + expected.add(new HoodieKey(rowKey, partitionPath)); + } + + String filePath = Paths.get(basePath, "test.parquet").toUri().toString(); + Schema schema = getSchemaWithFields(Arrays.asList(new String[]{"abc", "def"})); + writeParquetFile(BloomFilterTypeCode.SIMPLE.name(), filePath, rowKeys, schema, true, partitionPath, + false, "abc", "def"); + + // Read and verify + List fetchedRows = + parquetUtils.fetchHoodieKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), + Option.of(new TestBaseKeyGen("abc","def"))); assertEquals(rowKeys.size(), fetchedRows.size(), "Total count does not match"); for (HoodieKey entry : fetchedRows) { @@ -150,14 +184,14 @@ public void testFetchRecordKeyPartitionPathFromParquet(String typeCode) throws E @Test public void testReadCounts() throws Exception { - String filePath = basePath + "/test.parquet"; + String filePath = Paths.get(basePath, "test.parquet").toUri().toString(); List rowKeys = new ArrayList<>(); for (int i = 0; i < 123; i++) { rowKeys.add(UUID.randomUUID().toString()); } 
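The refactor above turns the static helpers (readRowKeysFromParquet, readBloomFilterFromParquetMetadata, fetchRecordKeyPartitionPathFromParquet) into instance methods on ParquetUtils (readRowKeys, readBloomFilterFromMetadata, fetchHoodieKeys). As a rough idea of what reading record keys back from such a file involves, a hedged sketch using the plain parquet-avro reader; the real utility may push down a column projection instead of materializing full records:

import java.util.ArrayList;
import java.util.List;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

public class ReadRowKeysSketch {

  // Reads the "_hoodie_record_key" column of every row; assumes the file was
  // written with Hudi meta fields, like the test files above.
  public static List<String> readRowKeys(Path parquetFile) throws Exception {
    List<String> keys = new ArrayList<>();
    try (ParquetReader<GenericRecord> reader =
             AvroParquetReader.<GenericRecord>builder(parquetFile).build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        keys.add(record.get("_hoodie_record_key").toString());
      }
    }
    return keys;
  }
}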
writeParquetFile(BloomFilterTypeCode.SIMPLE.name(), filePath, rowKeys); - assertEquals(123, ParquetUtils.getRowCount(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath))); + assertEquals(123, parquetUtils.getRowCount(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath))); } private void writeParquetFile(String typeCode, String filePath, List rowKeys) throws Exception { @@ -165,22 +199,73 @@ private void writeParquetFile(String typeCode, String filePath, List row } private void writeParquetFile(String typeCode, String filePath, List rowKeys, Schema schema, boolean addPartitionPathField, String partitionPath) throws Exception { + writeParquetFile(typeCode, filePath, rowKeys, schema, addPartitionPathField, partitionPath, + true, null, null); + } + + private void writeParquetFile(String typeCode, String filePath, List rowKeys, Schema schema, boolean addPartitionPathField, String partitionPathValue, + boolean useMetaFields, String recordFieldName, String partitionFieldName) throws Exception { // Write out a parquet file BloomFilter filter = BloomFilterFactory .createBloomFilter(1000, 0.0001, 10000, typeCode); HoodieAvroWriteSupport writeSupport = - new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); + new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter)); ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE); for (String rowKey : rowKeys) { GenericRecord rec = new GenericData.Record(schema); - rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey); + rec.put(useMetaFields ? HoodieRecord.RECORD_KEY_METADATA_FIELD : recordFieldName, rowKey); if (addPartitionPathField) { - rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath); + rec.put(useMetaFields ? 
HoodieRecord.PARTITION_PATH_METADATA_FIELD : partitionFieldName, partitionPathValue); } writer.write(rec); writeSupport.add(rowKey); } writer.close(); } + + private static Schema getSchemaWithFields(List fields) { + List toBeAddedFields = new ArrayList<>(); + Schema recordSchema = Schema.createRecord("HoodieRecordKey", "", "", false); + + for (String field: fields) { + Schema.Field schemaField = + new Schema.Field(field, METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE); + toBeAddedFields.add(schemaField); + } + recordSchema.setFields(toBeAddedFields); + return recordSchema; + } + + class TestBaseKeyGen extends BaseKeyGenerator { + + private String recordKeyField; + private String partitionField; + + public TestBaseKeyGen(String recordKeyField, String partitionField) { + super(new TypedProperties()); + this.recordKeyField = recordKeyField; + this.partitionField = partitionField; + } + + @Override + public String getRecordKey(GenericRecord record) { + return record.get(recordKeyField).toString(); + } + + @Override + public String getPartitionPath(GenericRecord record) { + return record.get(partitionField).toString(); + } + + @Override + public List getRecordKeyFieldNames() { + return Arrays.asList(new String[]{recordKeyField}); + } + + @Override + public List getPartitionPathFields() { + return Arrays.asList(new String[]{partitionField}); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestRatelimiter.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestRatelimiter.java index c2e939c3b9854..c712543c7280d 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestRatelimiter.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestRatelimiter.java @@ -20,6 +20,7 @@ import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.Test; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializableSchema.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializableSchema.java new file mode 100644 index 0000000000000..03421a3005f04 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializableSchema.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.util; + +import org.apache.avro.Schema; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests serializable schema. + */ +public class TestSerializableSchema { + + @Test + public void testSerDeser() throws IOException { + verifySchema(HoodieTestDataGenerator.AVRO_TRIP_SCHEMA); + verifySchema(HoodieAvroUtils.addMetadataFields(HoodieTestDataGenerator.AVRO_TRIP_SCHEMA)); + verifySchema(HoodieTestDataGenerator.AVRO_SHORT_TRIP_SCHEMA); + verifySchema(HoodieAvroUtils.addMetadataFields(HoodieTestDataGenerator.AVRO_SHORT_TRIP_SCHEMA)); + verifySchema(HoodieTestDataGenerator.FLATTENED_AVRO_SCHEMA); + verifySchema(HoodieAvroUtils.addMetadataFields(HoodieTestDataGenerator.FLATTENED_AVRO_SCHEMA)); + verifySchema(HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS); + } + + @Test + public void testLargeSchema() throws IOException { + verifySchema(new Schema.Parser().parse(generateLargeSchema())); + } + + private void verifySchema(Schema schema) throws IOException { + SerializableSchema serializableSchema = new SerializableSchema(schema); + assertEquals(schema, serializableSchema.get()); + assertTrue(schema != serializableSchema.get()); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos); + serializableSchema.writeObjectTo(oos); + oos.flush(); + oos.close(); + + byte[] bytesWritten = baos.toByteArray(); + SerializableSchema newSchema = new SerializableSchema(); + newSchema.readObjectFrom(new ObjectInputStream(new ByteArrayInputStream(bytesWritten))); + assertEquals(schema, newSchema.get()); + } + + // generate large schemas (>64K which is limitation of ObjectOutputStream#writeUTF) to validate it can be serialized + private String generateLargeSchema() { + StringBuilder schema = new StringBuilder(); + schema.append(HoodieTestDataGenerator.TRIP_SCHEMA_PREFIX); + int fieldNum = 1; + while (schema.length() < 80 * 1024) { + String fieldName = "field" + fieldNum; + schema.append("{\"name\": \"" + fieldName + "\",\"type\": {\"type\":\"record\", \"name\":\"" + fieldName + "\",\"fields\": [" + + "{\"name\": \"amount\",\"type\": \"double\"},{\"name\": \"currency\", \"type\": \"string\"}]}},"); + fieldNum++; + } + + schema.append(HoodieTestDataGenerator.TRIP_SCHEMA_SUFFIX); + return schema.toString(); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java index 9d6c1b81b044c..f2714aaf9a268 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSerializationUtils.java @@ -19,15 +19,21 @@ package org.apache.hudi.common.util; import org.apache.avro.util.Utf8; +import org.apache.hudi.common.model.DeleteRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.nio.ByteBuffer; 
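The large-schema case above exists because ObjectOutputStream#writeUTF cannot encode strings whose modified-UTF-8 form exceeds 65535 bytes, so writing a big Avro schema as a single UTF string fails. A small standalone demonstration of that limit and of the usual length-prefixed byte[] workaround (illustrative only; not necessarily how SerializableSchema implements writeObjectTo/readObjectFrom):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.UTFDataFormatException;
import java.nio.charset.StandardCharsets;

public class WriteUtfLimitSketch {
  public static void main(String[] args) throws IOException {
    // Build a string comfortably over the 65535-byte modified-UTF-8 limit.
    StringBuilder sb = new StringBuilder();
    while (sb.length() < 80 * 1024) {
      sb.append("{\"name\":\"field\",\"type\":\"string\"},");
    }
    String large = sb.toString();

    // writeUTF rejects it outright.
    try (ObjectOutputStream oos = new ObjectOutputStream(new ByteArrayOutputStream())) {
      oos.writeUTF(large);
    } catch (UTFDataFormatException e) {
      System.out.println("writeUTF failed as expected: " + e);
    }

    // A length-prefixed byte array round-trips regardless of size.
    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
         ObjectOutputStream oos = new ObjectOutputStream(baos)) {
      byte[] bytes = large.getBytes(StandardCharsets.UTF_8);
      oos.writeInt(bytes.length);
      oos.write(bytes);
    }
  }
}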
import java.util.Arrays; +import java.util.Collections; import java.util.LinkedList; import java.util.Objects; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotSame; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -52,12 +58,33 @@ public void testSerDeser() throws IOException { verifyObject(new LinkedList<>(Arrays.asList(2, 3, 5))); } + @Test + public void testAvroUtf8SerDe() throws IOException { + byte[] firstBytes = SerializationUtils.serialize(new Utf8("test")); + // 4 byte string + 3 bytes length (Kryo uses variable-length encoding) + assertEquals(7, firstBytes.length); + } + + @Test + public void testClassFullyQualifiedNameSerialization() throws IOException { + DeleteRecord deleteRecord = DeleteRecord.create(new HoodieKey("key", "partition")); + HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(new DeleteRecord[]{deleteRecord}, Collections.emptyMap()); + + byte[] firstBytes = SerializationUtils.serialize(deleteBlock); + byte[] secondBytes = SerializationUtils.serialize(deleteBlock); + + assertNotSame(firstBytes, secondBytes); + // NOTE: Here we assert that Kryo doesn't optimize out the fully-qualified class-name + // and always writes it out + assertEquals(ByteBuffer.wrap(firstBytes), ByteBuffer.wrap(secondBytes)); + } + private void verifyObject(T expectedValue) throws IOException { byte[] serializedObject = SerializationUtils.serialize(expectedValue); assertNotNull(serializedObject); assertTrue(serializedObject.length > 0); - final T deserializedValue = SerializationUtils.deserialize(serializedObject); + final T deserializedValue = SerializationUtils.deserialize(serializedObject); if (expectedValue == null) { assertNull(deserializedValue); } else { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestStringUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestStringUtils.java index b402996fa78ae..bbaca74434dca 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestStringUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestStringUtils.java @@ -20,6 +20,11 @@ import org.junit.jupiter.api.Test; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -41,6 +46,14 @@ public void testStringJoin() { assertNotEquals(null, StringUtils.join(STRINGS)); } + @Test + public void testStringJoinWithJavaImpl() { + assertNull(StringUtils.join(",", null)); + assertEquals("", String.join(",", Collections.singletonList(""))); + assertEquals(",", String.join(",", Arrays.asList("", ""))); + assertEquals("a,", String.join(",", Arrays.asList("a", ""))); + } + @Test public void testStringNullToEmpty() { String str = "This is a test"; @@ -48,6 +61,20 @@ public void testStringNullToEmpty() { assertEquals("", StringUtils.nullToEmpty(null)); } + @Test + public void testStringObjToString() { + assertNull(StringUtils.objToString(null)); + assertEquals("Test String", StringUtils.objToString("Test String")); + + // assert byte buffer + ByteBuffer byteBuffer1 = ByteBuffer.wrap("1234".getBytes()); + ByteBuffer byteBuffer2 = ByteBuffer.wrap("5678".getBytes()); + // assert equal because ByteBuffer has 
overwritten the toString to return a summary string + assertEquals(byteBuffer1.toString(), byteBuffer2.toString()); + // assert not equal + assertNotEquals(StringUtils.objToString(byteBuffer1), StringUtils.objToString(byteBuffer2)); + } + @Test public void testStringEmptyToNull() { assertNull(StringUtils.emptyToNull("")); @@ -61,4 +88,12 @@ public void testStringNullOrEmpty() { assertNotEquals(null, StringUtils.isNullOrEmpty("this is not empty")); assertTrue(StringUtils.isNullOrEmpty("")); } + + @Test + public void testSplit() { + assertEquals(new ArrayList<>(), StringUtils.split(null, ",")); + assertEquals(new ArrayList<>(), StringUtils.split("", ",")); + assertEquals(Arrays.asList("a", "b", "c"), StringUtils.split("a,b, c", ",")); + assertEquals(Arrays.asList("a", "b", "c"), StringUtils.split("a,b,, c ", ",")); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestTablePathUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestTablePathUtils.java index 05031f035c30f..eae1cdce8399b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestTablePathUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestTablePathUtils.java @@ -17,14 +17,18 @@ package org.apache.hudi.common.util; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hudi.common.model.HoodiePartitionMetadata; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; import java.io.File; import java.io.IOException; @@ -36,9 +40,10 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public final class TestTablePathUtils { + private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); @TempDir - static File tempDir; + public File tempDir; private static FileSystem fs; private static Path tablePath; private static Path partitionPath1; @@ -46,9 +51,12 @@ public final class TestTablePathUtils { private static Path filePath1; private static Path filePath2; - @BeforeAll - static void setup() throws IOException { - URI tablePathURI = Paths.get(tempDir.getAbsolutePath(),"test_table").toUri(); + private void setup() throws IOException { + setup(Option.empty()); + } + + private void setup(Option partitionMetafileFormat) throws IOException { + URI tablePathURI = Paths.get(tempDir.getAbsolutePath(), "test_table").toUri(); tablePath = new Path(tablePathURI); fs = tablePath.getFileSystem(new Configuration()); @@ -66,16 +74,16 @@ static void setup() throws IOException { assertTrue(new File(partitionPathURI2).mkdirs()); HoodiePartitionMetadata partitionMetadata1 = new HoodiePartitionMetadata(fs, Instant.now().toString(), tablePath, - partitionPath1); + partitionPath1, partitionMetafileFormat); partitionMetadata1.trySave(1); HoodiePartitionMetadata partitionMetadata2 = new HoodiePartitionMetadata(fs, Instant.now().toString(), tablePath, - partitionPath2); + partitionPath2, partitionMetafileFormat); partitionMetadata2.trySave(2); // Create files - URI filePathURI1 = 
Paths.get(partitionPathURI1.getPath(), "data1.parquet").toUri(); + URI filePathURI1 = Paths.get(partitionPathURI1.getPath(), "data1" + BASE_FILE_EXTENSION).toUri(); filePath1 = new Path(filePathURI1); - URI filePathURI2 = Paths.get(partitionPathURI2.getPath(), "data2.parquet").toUri(); + URI filePathURI2 = Paths.get(partitionPathURI2.getPath(), "data2" + BASE_FILE_EXTENSION).toUri(); filePath2 = new Path(filePathURI2); assertTrue(new File(filePathURI1).createNewFile()); @@ -84,30 +92,43 @@ static void setup() throws IOException { @Test void getTablePathFromTablePath() throws IOException { + setup(); Option inferredTablePath = TablePathUtils.getTablePath(fs, tablePath); assertEquals(tablePath, inferredTablePath.get()); } @Test void getTablePathFromMetadataFolderPath() throws IOException { - Path metadataFolder = new Path(tablePath, HoodieTableMetaClient.METAFOLDER_NAME); - Option inferredTablePath = TablePathUtils.getTablePath(fs, metadataFolder); + setup(); + Path metaFolder = new Path(tablePath, HoodieTableMetaClient.METAFOLDER_NAME); + Option inferredTablePath = TablePathUtils.getTablePath(fs, metaFolder); assertEquals(tablePath, inferredTablePath.get()); } @Test void getTablePathFromMetadataSubFolderPath() throws IOException { + setup(); Path auxFolder = new Path(tablePath, HoodieTableMetaClient.AUXILIARYFOLDER_NAME); - Option inferredTablePath = TablePathUtils.getTablePath(fs, auxFolder); - assertEquals(tablePath, inferredTablePath.get()); + assertEquals(tablePath, TablePathUtils.getTablePath(fs, auxFolder).get()); Path bootstrapIndexFolder = new Path(tablePath, HoodieTableMetaClient.BOOTSTRAP_INDEX_ROOT_FOLDER_PATH); - inferredTablePath = TablePathUtils.getTablePath(fs, bootstrapIndexFolder); - assertEquals(tablePath, inferredTablePath.get()); + assertEquals(tablePath, TablePathUtils.getTablePath(fs, bootstrapIndexFolder).get()); + + Path metadataTableFolder = new Path(tablePath, HoodieTableMetaClient.METADATA_TABLE_FOLDER_PATH); + Path metadataTableMetaFolder = new Path(metadataTableFolder, HoodieTableMetaClient.METAFOLDER_NAME); + assertTrue(new File(metadataTableMetaFolder.toUri()).mkdirs()); + + assertEquals(metadataTableFolder, TablePathUtils.getTablePath(fs, metadataTableFolder).get()); + + Path metadataTablePartitionFolder = new Path(metadataTableFolder, "column_stats"); + assertTrue(new File(metadataTablePartitionFolder.toUri()).mkdir()); + assertEquals(metadataTableFolder, TablePathUtils.getTablePath(fs, metadataTablePartitionFolder).get()); } - @Test - void getTablePathFromPartitionFolderPath() throws IOException { + @ParameterizedTest + @EnumSource(value = HoodieFileFormat.class, names = {"PARQUET", "ORC"}) + void getTablePathFromPartitionFolderPath(HoodieFileFormat partitionMetafileFormat) throws IOException { + setup(Option.of(partitionMetafileFormat)); Option inferredTablePath = TablePathUtils.getTablePath(fs, partitionPath1); assertEquals(tablePath, inferredTablePath.get()); @@ -117,6 +138,7 @@ void getTablePathFromPartitionFolderPath() throws IOException { @Test void getTablePathFromFilePath() throws IOException { + setup(); Option inferredTablePath = TablePathUtils.getTablePath(fs, filePath1); assertEquals(tablePath, inferredTablePath.get()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestBitCaskDiskMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestBitCaskDiskMap.java new file mode 100755 index 0000000000000..9bbe4277162e0 --- /dev/null +++ 
b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestBitCaskDiskMap.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util.collection; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.testutils.AvroBinaryTestPayload; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.testutils.SpillableMapTestUtils; +import org.apache.hudi.common.util.HoodieRecordSizeEstimator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SpillableMapUtils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests disk based map {@link BitCaskDiskMap}.
+ */ +public class TestBitCaskDiskMap extends HoodieCommonTestHarness { + + @BeforeEach + public void setup() { + initPath(); + } + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testSimpleInsert(boolean isCompressionEnabled) throws IOException, URISyntaxException { + BitCaskDiskMap records = new BitCaskDiskMap<>(basePath, isCompressionEnabled); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); + + Map originalRecords = iRecords.stream() + .collect(Collectors.toMap(k -> ((GenericRecord) k).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), v -> v)); + + // make sure records have spilled to disk + assertTrue(records.sizeOfFileOnDiskInBytes() > 0); + Iterator> itr = records.iterator(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + assert recordKeys.contains(rec.getRecordKey()); + IndexedRecord originalRecord = originalRecords.get(rec.getRecordKey()); + HoodieAvroPayload payload = (HoodieAvroPayload) rec.getData(); + Option value = payload.getInsertValue(HoodieAvroUtils.addMetadataFields(getSimpleSchema())); + assertEquals(originalRecord, value.get()); + } + + verifyCleanup(records); + } + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testSimpleInsertWithoutHoodieMetadata(boolean isCompressionEnabled) throws IOException, URISyntaxException { + BitCaskDiskMap records = new BitCaskDiskMap<>(basePath, isCompressionEnabled); + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 1000); + Set recordKeys = new HashSet<>(); + // insert generated records into the map + hoodieRecords.forEach(r -> { + records.put(r.getRecordKey(), r); + recordKeys.add(r.getRecordKey()); + }); + // make sure records have spilled to disk + assertTrue(records.sizeOfFileOnDiskInBytes() > 0); + Iterator> itr = records.iterator(); + List oRecords = new ArrayList<>(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + oRecords.add(rec); + assert recordKeys.contains(rec.getRecordKey()); + } + + verifyCleanup(records); + } + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testSimpleUpsert(boolean isCompressionEnabled) throws IOException, URISyntaxException { + Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + + BitCaskDiskMap records = new BitCaskDiskMap<>(basePath, isCompressionEnabled); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + + // perform some inserts + List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); + + long fileSize = records.sizeOfFileOnDiskInBytes(); + // make sure records have spilled to disk + assertTrue(fileSize > 0); + + // generate updates from inserts + List updatedRecords = SchemaTestUtil.updateHoodieTestRecords(recordKeys, + SchemaTestUtil.generateHoodieTestRecords(0, 100), HoodieActiveTimeline.createNewInstantTime()); + String newCommitTime = + ((GenericRecord) updatedRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + + // perform upserts + recordKeys = SpillableMapTestUtils.upsertRecords(updatedRecords, records); + + // upserts should be appended to the existing file, hence increasing the sizeOfFile on disk + assertTrue(records.sizeOfFileOnDiskInBytes() > fileSize); + + // Upserted records (on disk) should have the latest commit time + Iterator> itr = records.iterator(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + assert 
recordKeys.contains(rec.getRecordKey()); + try { + IndexedRecord indexedRecord = (IndexedRecord) rec.getData().getInsertValue(schema).get(); + String latestCommitTime = + ((GenericRecord) indexedRecord).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + assertEquals(latestCommitTime, newCommitTime); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + } + verifyCleanup(records); + } + + @Test + public void testSizeEstimator() throws IOException, URISyntaxException { + Schema schema = SchemaTestUtil.getSimpleSchema(); + + // Test sizeEstimator without hoodie metadata fields + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + + long payloadSize = + SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Test sizeEstimator with hoodie metadata fields + schema = HoodieAvroUtils.addMetadataFields(schema); + hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Following tests payloads without an Avro Schema in the Record + + // Test sizeEstimator without hoodie metadata fields and without schema object in the payload + schema = SchemaTestUtil.getSimpleSchema(); + List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); + hoodieRecords = + indexedRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Test sizeEstimator with hoodie metadata fields and without schema object in the payload + final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); + indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); + hoodieRecords = indexedRecords.stream() + .map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new AvroBinaryTestPayload( + Option.of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))) + .collect(Collectors.toList()); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + } + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testPutAll(boolean isCompressionEnabled) throws IOException, URISyntaxException { + BitCaskDiskMap records = new BitCaskDiskMap<>(basePath, isCompressionEnabled); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + Map recordMap = new HashMap<>(); + iRecords.forEach(r -> { + String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieRecord value = new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); + recordMap.put(key, value); + }); + + records.putAll(recordMap); + // make sure records have spilled to disk + assertTrue(records.sizeOfFileOnDiskInBytes() > 0); + + // make sure all added records are present + for (Map.Entry entry : records.entrySet()) { + 
assertTrue(recordMap.containsKey(entry.getKey())); + } + } + + /** + * @na: Leaving this test here for a quick performance test + */ + @Disabled + @Test + public void testSizeEstimatorPerformance() throws IOException, URISyntaxException { + // Test sizeEstimatorPerformance with simpleSchema + Schema schema = SchemaTestUtil.getSimpleSchema(); + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + HoodieRecordSizeEstimator sizeEstimator = new HoodieRecordSizeEstimator<>(schema); + HoodieRecord record = hoodieRecords.remove(0); + long startTime = System.currentTimeMillis(); + SpillableMapUtils.computePayloadSize(record, sizeEstimator); + long timeTaken = System.currentTimeMillis() - startTime; + System.out.println("Time taken :" + timeTaken); + assertTrue(timeTaken < 100); + } + + private void verifyCleanup(BitCaskDiskMap records) { + File basePathDir = new File(basePath); + assert Objects.requireNonNull(basePathDir.list()).length > 0; + records.close(); + assertEquals(Objects.requireNonNull(basePathDir.list()).length, 0); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestDiskBasedMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestDiskBasedMap.java deleted file mode 100755 index e3cc886568ffb..0000000000000 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestDiskBasedMap.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.common.util.collection; - -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.model.HoodieAvroPayload; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; -import org.apache.hudi.common.testutils.AvroBinaryTestPayload; -import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.testutils.SchemaTestUtil; -import org.apache.hudi.common.testutils.SpillableMapTestUtils; -import org.apache.hudi.common.util.HoodieRecordSizeEstimator; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.SpillableMapUtils; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; - -import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -/** - * Tests dis based map {@link DiskBasedMap}. - */ -public class TestDiskBasedMap extends HoodieCommonTestHarness { - - @BeforeEach - public void setup() { - initPath(); - } - - @Test - public void testSimpleInsert() throws IOException, URISyntaxException { - DiskBasedMap records = new DiskBasedMap<>(basePath); - List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); - ((GenericRecord) iRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); - List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); - - // make sure records have spilled to disk - assertTrue(records.sizeOfFileOnDiskInBytes() > 0); - Iterator> itr = records.iterator(); - List oRecords = new ArrayList<>(); - while (itr.hasNext()) { - HoodieRecord rec = itr.next(); - oRecords.add(rec); - assert recordKeys.contains(rec.getRecordKey()); - } - } - - @Test - public void testSimpleInsertWithoutHoodieMetadata() throws IOException, URISyntaxException { - DiskBasedMap records = new DiskBasedMap<>(basePath); - List hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 1000); - Set recordKeys = new HashSet<>(); - // insert generated records into the map - hoodieRecords.forEach(r -> { - records.put(r.getRecordKey(), r); - recordKeys.add(r.getRecordKey()); - }); - // make sure records have spilled to disk - assertTrue(records.sizeOfFileOnDiskInBytes() > 0); - Iterator> itr = records.iterator(); - List oRecords = new ArrayList<>(); - while (itr.hasNext()) { - HoodieRecord rec = itr.next(); - oRecords.add(rec); - assert recordKeys.contains(rec.getRecordKey()); - } - } - - @Test - public void testSimpleUpsert() throws IOException, URISyntaxException { - Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); - - DiskBasedMap records = new DiskBasedMap<>(basePath); - List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); - - // perform some inserts - List recordKeys = 
SpillableMapTestUtils.upsertRecords(iRecords, records); - - long fileSize = records.sizeOfFileOnDiskInBytes(); - // make sure records have spilled to disk - assertTrue(fileSize > 0); - - // generate updates from inserts - List updatedRecords = SchemaTestUtil.updateHoodieTestRecords(recordKeys, - SchemaTestUtil.generateHoodieTestRecords(0, 100), HoodieActiveTimeline.createNewInstantTime()); - String newCommitTime = - ((GenericRecord) updatedRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); - - // perform upserts - recordKeys = SpillableMapTestUtils.upsertRecords(updatedRecords, records); - - // upserts should be appended to the existing file, hence increasing the sizeOfFile on disk - assertTrue(records.sizeOfFileOnDiskInBytes() > fileSize); - - // Upserted records (on disk) should have the latest commit time - Iterator> itr = records.iterator(); - while (itr.hasNext()) { - HoodieRecord rec = itr.next(); - assert recordKeys.contains(rec.getRecordKey()); - try { - IndexedRecord indexedRecord = (IndexedRecord) rec.getData().getInsertValue(schema).get(); - String latestCommitTime = - ((GenericRecord) indexedRecord).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); - assertEquals(latestCommitTime, newCommitTime); - } catch (IOException io) { - throw new UncheckedIOException(io); - } - } - } - - @Test - public void testSizeEstimator() throws IOException, URISyntaxException { - Schema schema = SchemaTestUtil.getSimpleSchema(); - - // Test sizeEstimator without hoodie metadata fields - List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); - - long payloadSize = - SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); - assertTrue(payloadSize > 0); - - // Test sizeEstimator with hoodie metadata fields - schema = HoodieAvroUtils.addMetadataFields(schema); - hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); - payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); - assertTrue(payloadSize > 0); - - // Following tests payloads without an Avro Schema in the Record - - // Test sizeEstimator without hoodie metadata fields and without schema object in the payload - schema = SchemaTestUtil.getSimpleSchema(); - List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); - hoodieRecords = - indexedRecords.stream().map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), - new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); - payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); - assertTrue(payloadSize > 0); - - // Test sizeEstimator with hoodie metadata fields and without schema object in the payload - final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); - indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); - hoodieRecords = indexedRecords.stream() - .map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), - new AvroBinaryTestPayload( - Option.of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))) - .collect(Collectors.toList()); - payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); - assertTrue(payloadSize > 0); - } - - @Test - public void testPutAll() throws IOException, URISyntaxException { - 
DiskBasedMap records = new DiskBasedMap<>(basePath); - List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); - Map recordMap = new HashMap<>(); - iRecords.forEach(r -> { - String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord value = new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); - recordMap.put(key, value); - }); - - records.putAll(recordMap); - // make sure records have spilled to disk - assertTrue(records.sizeOfFileOnDiskInBytes() > 0); - - // make sure all added records are present - for (Map.Entry entry : records.entrySet()) { - assertTrue(recordMap.containsKey(entry.getKey())); - } - } - - /** - * @na: Leaving this test here for a quick performance test - */ - @Disabled - @Test - public void testSizeEstimatorPerformance() throws IOException, URISyntaxException { - // Test sizeEstimatorPerformance with simpleSchema - Schema schema = SchemaTestUtil.getSimpleSchema(); - List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); - HoodieRecordSizeEstimator sizeEstimator = new HoodieRecordSizeEstimator<>(schema); - HoodieRecord record = hoodieRecords.remove(0); - long startTime = System.currentTimeMillis(); - SpillableMapUtils.computePayloadSize(record, sizeEstimator); - long timeTaken = System.currentTimeMillis() - startTime; - System.out.println("Time taken :" + timeTaken); - assertTrue(timeTaken < 100); - } -} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java index b240c8dd14aa0..e33baf1493a93 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -38,6 +39,9 @@ import org.junit.jupiter.api.MethodOrderer.Alphanumeric; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestMethodOrder; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.io.IOException; import java.io.UncheckedIOException; @@ -45,11 +49,15 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.params.provider.Arguments.arguments; /** * Tests external spillable map {@link ExternalSpillableMap}. 
@@ -65,32 +73,47 @@ public void setUp() { failureOutputPath = basePath + "/test_fail"; } - @Test - public void simpleInsertTest() throws IOException, URISyntaxException { + @ParameterizedTest + @MethodSource("testArguments") + public void simpleInsertTest(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); - String payloadClazz = HoodieAvroPayload.class.getName(); + ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); assert (recordKeys.size() == 100); + + // Test iterator Iterator> itr = records.iterator(); - List oRecords = new ArrayList<>(); + int cntSize = 0; while (itr.hasNext()) { HoodieRecord rec = itr.next(); - oRecords.add(rec); + cntSize++; assert recordKeys.contains(rec.getRecordKey()); } + assertEquals(recordKeys.size(), cntSize); + + // Test value stream + List> values = records.valueStream().collect(Collectors.toList()); + cntSize = 0; + for (HoodieRecord value : values) { + assert recordKeys.contains(value.getRecordKey()); + cntSize++; + } + assertEquals(recordKeys.size(), cntSize); } - @Test - public void testSimpleUpsert() throws IOException, URISyntaxException { - + @ParameterizedTest + @MethodSource("testArguments") + public void testSimpleUpsert(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); @@ -113,21 +136,23 @@ public void testSimpleUpsert() throws IOException, URISyntaxException { updatedRecords.forEach(record -> { HoodieRecord rec = records.get(((GenericRecord) record).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); try { - assertEquals(rec.getData().getInsertValue(schema).get(), record); + assertEquals(((HoodieAvroRecord) rec).getData().getInsertValue(schema).get(), record); } catch (IOException io) { throw new UncheckedIOException(io); } }); } - @Test - public void testAllMapOperations() throws IOException, URISyntaxException { + @ParameterizedTest + @MethodSource("testArguments") + public void testAllMapOperations(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); String payloadClazz = HoodieAvroPayload.class.getName(); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(schema), diskMapType, 
isCompressionEnabled); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); // insert a bunch of records so that values spill to disk too @@ -135,13 +160,13 @@ public void testAllMapOperations() throws IOException, URISyntaxException { IndexedRecord inMemoryRecord = iRecords.get(0); String ikey = ((GenericRecord) inMemoryRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String iPartitionPath = ((GenericRecord) inMemoryRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord inMemoryHoodieRecord = new HoodieRecord<>(new HoodieKey(ikey, iPartitionPath), + HoodieRecord inMemoryHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(ikey, iPartitionPath), new HoodieAvroPayload(Option.of((GenericRecord) inMemoryRecord))); IndexedRecord onDiskRecord = iRecords.get(99); String dkey = ((GenericRecord) onDiskRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String dPartitionPath = ((GenericRecord) onDiskRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord onDiskHoodieRecord = new HoodieRecord<>(new HoodieKey(dkey, dPartitionPath), + HoodieRecord onDiskHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(dkey, dPartitionPath), new HoodieAvroPayload(Option.of((GenericRecord) onDiskRecord))); // assert size assert records.size() == 100; @@ -176,12 +201,14 @@ public void testAllMapOperations() throws IOException, URISyntaxException { assertTrue(records.size() == 0); } - @Test - public void simpleTestWithException() throws IOException, URISyntaxException { + @ParameterizedTest + @MethodSource("testArguments") + public void simpleTestWithException(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException, URISyntaxException { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); ExternalSpillableMap> records = new ExternalSpillableMap<>(16L, - failureOutputPath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B + failureOutputPath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled); // 16B List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); @@ -194,13 +221,17 @@ public void simpleTestWithException() throws IOException, URISyntaxException { }); } - @Test - public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk() throws IOException, URISyntaxException { + @ParameterizedTest + @MethodSource("testArguments") + public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled) throws IOException, + URISyntaxException { Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled); // 16B List recordKeys = new ArrayList<>(); // Ensure we spill to disk @@ -211,7 +242,7 @@ public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk() throws IOExcept // Get a record from the in-Memory map String key = recordKeys.get(0); - HoodieRecord record = records.get(key); + HoodieAvroRecord record = (HoodieAvroRecord) records.get(key); List recordsToUpdate = new ArrayList<>(); 
recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get()); @@ -229,7 +260,7 @@ public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk() throws IOExcept // Get a record from the disk based map key = recordKeys.get(recordKeys.size() - 1); - record = records.get(key); + record = (HoodieAvroRecord) records.get(key); recordsToUpdate = new ArrayList<>(); recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get()); @@ -245,13 +276,17 @@ record = records.get(key); assert newCommitTime.contentEquals(gRecord.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); } - @Test - public void testDataCorrectnessWithoutHoodieMetadata() throws IOException, URISyntaxException { + @ParameterizedTest + @MethodSource("testArguments") + public void testDataCorrectnessWithoutHoodieMetadata(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled) throws IOException, + URISyntaxException { Schema schema = SchemaTestUtil.getSimpleSchema(); ExternalSpillableMap> records = - new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema)); // 16B + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled); // 16B List recordKeys = new ArrayList<>(); // Ensure we spill to disk @@ -308,7 +343,42 @@ record = records.get(key); assertEquals(gRecord.get(fieldName).toString(), newValue); } - // TODO : come up with a performance eval test for spillableMap @Test - public void testLargeInsertUpsert() {} + public void testEstimationWithEmptyMap() throws IOException, URISyntaxException { + final ExternalSpillableMap.DiskMapType diskMapType = ExternalSpillableMap.DiskMapType.BITCASK; + final boolean isCompressionEnabled = false; + final Schema schema = SchemaTestUtil.getSimpleSchema(); + + ExternalSpillableMap> records = + new ExternalSpillableMap<>(16L, basePath, new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(schema), diskMapType, isCompressionEnabled); + + List recordKeys = new ArrayList<>(); + + // Put a single record. Payload size estimation happens as part of this initial put. + HoodieRecord seedRecord = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 1).get(0); + records.put(seedRecord.getRecordKey(), seedRecord); + + // Remove the key immediately to make the map empty again. + records.remove(seedRecord.getRecordKey()); + + // Verify payload size re-estimation does not throw exception + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 250); + hoodieRecords.stream().forEach(hoodieRecord -> { + assertDoesNotThrow(() -> { + records.put(hoodieRecord.getRecordKey(), hoodieRecord); + }, "ExternalSpillableMap put() should not throw exception!"); + recordKeys.add(hoodieRecord.getRecordKey()); + }); + } + + private static Stream testArguments() { + // Arguments : 1. Disk Map Type 2. 
isCompressionEnabled for BitCaskMap + return Stream.of( + arguments(ExternalSpillableMap.DiskMapType.BITCASK, false), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false), + arguments(ExternalSpillableMap.DiskMapType.UNKNOWN, false), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, true) + ); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbBasedMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbBasedMap.java index 1111d10e43f37..5b71b5ec24235 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbBasedMap.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbBasedMap.java @@ -38,7 +38,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Tests RocksDB based map {@link RocksDBBasedMap}. + * Tests RocksDB based map {@link RocksDbDiskMap}. */ public class TestRocksDbBasedMap extends HoodieCommonTestHarness { @@ -49,7 +49,7 @@ public void setUp() { @Test public void testSimple() throws IOException, URISyntaxException { - RocksDBBasedMap records = new RocksDBBasedMap(basePath); + RocksDbDiskMap records = new RocksDbDiskMap(basePath); List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); ((GenericRecord) iRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbDiskMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbDiskMap.java new file mode 100644 index 0000000000000..31daaab213604 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbDiskMap.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.util.collection; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.testutils.SpillableMapTestUtils; +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.URISyntaxException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test the rocksDb based Map {@link RocksDbDiskMap} + * that is used by {@link ExternalSpillableMap}. + */ +public class TestRocksDbDiskMap extends HoodieCommonTestHarness { + + @BeforeEach + public void setUp() { + initPath(); + } + + @Test + public void testSimpleInsertSequential() throws IOException, URISyntaxException { + RocksDbDiskMap> rocksDBBasedMap = new RocksDbDiskMap<>(basePath); + List recordKeys = setupMapWithRecords(rocksDBBasedMap, 100); + + Iterator> itr = rocksDBBasedMap.iterator(); + int cntSize = 0; + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + cntSize++; + assert recordKeys.contains(rec.getRecordKey()); + } + assertEquals(recordKeys.size(), cntSize); + + // Test value stream + long currentTimeMs = System.currentTimeMillis(); + List> values = + rocksDBBasedMap.valueStream().collect(Collectors.toList()); + cntSize = 0; + for (HoodieRecord value : values) { + assert recordKeys.contains(value.getRecordKey()); + cntSize++; + } + assertEquals(recordKeys.size(), cntSize); + } + + @Test + public void testSimpleInsertRandomAccess() throws IOException, URISyntaxException { + RocksDbDiskMap rocksDBBasedMap = new RocksDbDiskMap<>(basePath); + List recordKeys = setupMapWithRecords(rocksDBBasedMap, 100); + + Random random = new Random(); + for (int i = 0; i < recordKeys.size(); i++) { + String key = recordKeys.get(random.nextInt(recordKeys.size())); + assert rocksDBBasedMap.get(key) != null; + } + } + + @Test + public void testSimpleInsertWithoutHoodieMetadata() throws IOException, URISyntaxException { + RocksDbDiskMap rocksDBBasedMap = new RocksDbDiskMap<>(basePath); + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 1000); + Set recordKeys = new HashSet<>(); + // insert generated records into the map + hoodieRecords.forEach(r -> { + rocksDBBasedMap.put(r.getRecordKey(), r); + recordKeys.add(r.getRecordKey()); + }); + // make sure records have spilled to disk + assertTrue(rocksDBBasedMap.sizeOfFileOnDiskInBytes() > 0); + Iterator> itr = rocksDBBasedMap.iterator(); + int cntSize = 0; + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + 
cntSize++; + assert recordKeys.contains(rec.getRecordKey()); + } + assertEquals(recordKeys.size(), cntSize); + } + + @Test + public void testSimpleUpsert() throws IOException, URISyntaxException { + Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + + RocksDbDiskMap rocksDBBasedMap = new RocksDbDiskMap<>(basePath); + List insertedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + List recordKeys = SpillableMapTestUtils.upsertRecords(insertedRecords, rocksDBBasedMap); + String oldCommitTime = + ((GenericRecord) insertedRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + + // generate updates from inserts for first 50 keys / subset of keys + List updatedRecords = SchemaTestUtil.updateHoodieTestRecords(recordKeys.subList(0, 50), + SchemaTestUtil.generateHoodieTestRecords(0, 50), HoodieActiveTimeline.createNewInstantTime()); + String newCommitTime = + ((GenericRecord) updatedRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + + // perform upserts + List updatedRecordKeys = SpillableMapTestUtils.upsertRecords(updatedRecords, rocksDBBasedMap); + + // Upserted records (on disk) should have the latest commit time + Iterator> itr = rocksDBBasedMap.iterator(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + try { + IndexedRecord indexedRecord = (IndexedRecord) rec.getData().getInsertValue(schema).get(); + String latestCommitTime = + ((GenericRecord) indexedRecord).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + assert recordKeys.contains(rec.getRecordKey()) || updatedRecordKeys.contains(rec.getRecordKey()); + assertEquals(latestCommitTime, updatedRecordKeys.contains(rec.getRecordKey()) ? newCommitTime : oldCommitTime); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + } + } + + @Test + public void testPutAll() throws IOException, URISyntaxException { + RocksDbDiskMap rocksDBBasedMap = new RocksDbDiskMap<>(basePath); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + Map recordMap = new HashMap<>(); + iRecords.forEach(r -> { + String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieRecord value = new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); + recordMap.put(key, value); + }); + + rocksDBBasedMap.putAll(recordMap); + // make sure records have spilled to disk + assertTrue(rocksDBBasedMap.sizeOfFileOnDiskInBytes() > 0); + + // make sure all added records are present + for (Map.Entry entry : rocksDBBasedMap.entrySet()) { + assertTrue(recordMap.containsKey(entry.getKey())); + } + } + + @Test + public void testSimpleRemove() throws IOException, URISyntaxException { + RocksDbDiskMap rocksDBBasedMap = new RocksDbDiskMap<>(basePath); + List recordKeys = setupMapWithRecords(rocksDBBasedMap, 100); + + List deleteKeys = recordKeys.subList(0, 10); + for (String deleteKey : deleteKeys) { + assert rocksDBBasedMap.remove(deleteKey) != null; + assert rocksDBBasedMap.get(deleteKey) == null; + } + } + + private List setupMapWithRecords(RocksDbDiskMap rocksDBBasedMap, int numRecords) throws IOException, URISyntaxException { + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, numRecords); + List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, rocksDBBasedMap); + // Ensure the number of records is correct + assertEquals(rocksDBBasedMap.size(), 
recordKeys.size()); + // make sure records have spilled to disk + assertTrue(rocksDBBasedMap.sizeOfFileOnDiskInBytes() > 0); + return recordKeys; + } +} \ No newline at end of file diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java new file mode 100644 index 0000000000000..de0424f42580a --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import javax.xml.bind.DatatypeConverter; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHashID { + + /** + * Test HashID of all sizes for ByteArray type input message. + */ + @ParameterizedTest + @EnumSource(HashID.Size.class) + public void testHashForByteInput(HashID.Size size) { + final int count = 8; + Random random = new Random(); + for (int i = 0; i < count; i++) { + final String message = random.ints(50, 120) + .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97)) + .limit((32 + (i * 4))) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); + final byte[] originalData = message.getBytes(StandardCharsets.UTF_8); + final byte[] hashBytes = HashID.hash(originalData, size); + assertEquals(hashBytes.length, size.byteSize()); + } + } + + /** + * Test HashID of all sizes for String type input message. + */ + @ParameterizedTest + @EnumSource(HashID.Size.class) + public void testHashForStringInput(HashID.Size size) { + final int count = 8; + Random random = new Random(); + for (int i = 0; i < count; i++) { + final String message = random.ints(50, 120) + .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97)) + .limit((32 + (i * 4))) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); + final byte[] hashBytes = HashID.hash(message, size); + assertEquals(hashBytes.length, size.byteSize()); + } + } + + /** + * Test expected hash values for all bit sizes. 
+ */ + @Test + public void testHashValues() { + Map<HashID.Size, Map<String, String>> expectedValuesMap = new HashMap<HashID.Size, Map<String, String>>(); + Map<String, String> hash32ExpectedValues = new HashMap<String, String>() { + { + put("Hudi", "FB6A3F92"); + put("Data lake", "99913A4D"); + put("Data Lake", "6F7DAD6A"); + put("Col1", "B4393B9A"); + put("A", "CDD946CE"); + put("2021/10/28/", "BBD4FDB2"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_32, hash32ExpectedValues); + + Map<String, String> hash64ExpectedValues = new HashMap<String, String>() { + { + put("Hudi", "F7727B9A28379071"); + put("Data lake", "52BC72D592EBCAE5"); + put("Data Lake", "5ED19AF9FD746E3E"); + put("Col1", "22FB1DD2F4784D31"); + put("A", "EBF88350484B5AA7"); + put("2021/10/28/", "2A9399AF6E7C8B12"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_64, hash64ExpectedValues); + + Map<String, String> hash128ExpectedValues = new HashMap<String, String>() { + { + put("Hudi", "09DAB749F255311C1C9EF6DD7B790170"); + put("Data lake", "7F2FC1EA445FC81F67CAA25EC9089C08"); + put("Data Lake", "9D2CEF0D61B02848C528A070ED75C570"); + put("Col1", "EC0FFE21E704DE2A580661C59A81D453"); + put("A", "7FC56270E7A70FA81A5935B72EACBE29"); + put("2021/10/28/", "1BAE8F04F44CB7ACF2458EF5219742DC"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_128, hash128ExpectedValues); + + for (Map.Entry<HashID.Size, Map<String, String>> allSizeEntries : expectedValuesMap.entrySet()) { + for (Map.Entry<String, String> sizeEntry : allSizeEntries.getValue().entrySet()) { + final byte[] actualHashBytes = HashID.hash(sizeEntry.getKey(), allSizeEntries.getKey()); + final byte[] expectedHashBytes = DatatypeConverter.parseHexBinary(sizeEntry.getValue()); + assertTrue(Arrays.equals(expectedHashBytes, actualHashBytes)); + } + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/io/TestByteBufferBackedInputStream.java b/hudi-common/src/test/java/org/apache/hudi/common/util/io/TestByteBufferBackedInputStream.java new file mode 100644 index 0000000000000..87bd2eea2ebe5 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/io/TestByteBufferBackedInputStream.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hudi.common.util.io; + +import org.junit.jupiter.api.Test; + +import java.nio.ByteBuffer; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestByteBufferBackedInputStream { + + @Test + public void testConstructor() { + byte[] bytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xE, 0xE, 0xD }; + ByteBuffer byteBuf = ByteBuffer.wrap(bytes, 0, 1); + ByteBuffer byteBufClone = byteBuf.duplicate(); + + // ByteBuffer ctor + ByteBufferBackedInputStream first = new ByteBufferBackedInputStream(byteBuf); + + assertEquals(first.read(), 0xD); + assertThrows(IllegalArgumentException.class, first::read); + // Make sure that the original buffer stays intact + assertEquals(byteBufClone, byteBuf); + + // byte[] ctor + ByteBufferBackedInputStream second = new ByteBufferBackedInputStream(bytes); + + assertEquals(second.read(), 0xD); + + // byte[] ctor (w/ offset) + ByteBufferBackedInputStream third = new ByteBufferBackedInputStream(bytes, 1, 1); + + assertEquals(third.read(), 0xE); + assertThrows(IllegalArgumentException.class, third::read); + } + + @Test + public void testRead() { + byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xE, 0xE, 0xD }; + + ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes); + + int firstByte = stream.read(); + assertEquals(firstByte, 0xD); + + byte[] readBytes = new byte[4]; + int read = stream.read(readBytes, 1, 3); + + assertEquals(3, read); + assertArrayEquals(new byte[]{0, 0xE, 0xA, 0xD}, readBytes); + assertEquals(4, stream.getPosition()); + } + + @Test + public void testSeek() { + byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xA, 0xE, 0xD }; + + ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes, 1, 7); + + // Seek to the 2nd byte in the stream (the 3rd in the original buffer) + stream.seek(1); + int firstRead = stream.read(); + assertEquals(0xA, firstRead); + + // Seek to the 5th byte in the stream (the 6th in the original buffer) + stream.seek(5); + int secondRead = stream.read(); + assertEquals(0xE, secondRead); + + // Try to seek past the stream boundary + assertThrows(IllegalArgumentException.class, () -> stream.seek(8)); + } + + @Test + public void testCopyFrom() { + byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xA, 0xE, 0xD }; + + ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes); + + int firstByte = stream.read(); + assertEquals(firstByte, 0xD); + + // Copy 5 bytes from the stream (while keeping the stream's state intact) + byte[] targetBytes = new byte[5]; + stream.copyFrom(2, targetBytes, 0, targetBytes.length); + + assertArrayEquals(new byte[] { 0xA, 0xD, 0xD, 0xA, 0xE }, targetBytes); + + // Continue reading the stream from where we left off (before copying) + int secondByte = stream.read(); + assertEquals(secondByte, 0xE); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/TestSerDeHelper.java b/hudi-common/src/test/java/org/apache/hudi/internal/schema/TestSerDeHelper.java new file mode 100644 index 0000000000000..5a337004812ca --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/TestSerDeHelper.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema; + +import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; +import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Assertions; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.TreeMap; + +public class TestSerDeHelper { + + @Test + public void testComplexSchema2Json() { + InternalSchema internalSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(7, false, "feature1", + Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(13, true, + Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, false, "y", Types.LongType.get())))), + Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())), + Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get())) + ); + // test schema2json + String result = SerDeHelper.toJson(internalSchema); + InternalSchema convertedSchema = SerDeHelper.fromJson(result).get(); + Assertions.assertEquals(internalSchema, convertedSchema); + // test schemas2json + String results = SerDeHelper.toJson(Arrays.asList(internalSchema)); + TreeMap convertedSchemas = SerDeHelper.parseSchemas(results); + Assertions.assertEquals(1, convertedSchemas.size()); + } + + @Test + public void testPrimitive2Json() { + Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "bool", Types.BooleanType.get()), + Types.Field.get(1, "int", Types.IntType.get()), + Types.Field.get(2, "long", Types.LongType.get()), + Types.Field.get(3, "float", Types.FloatType.get()), + Types.Field.get(4, "double", Types.DoubleType.get()), + Types.Field.get(5, "date", Types.DateType.get()), + Types.Field.get(6, "time", Types.TimeType.get()), + Types.Field.get(7, "timestamp", Types.TimestampType.get()), + Types.Field.get(8, "string", Types.StringType.get()), + Types.Field.get(9, "uuid", Types.UUIDType.get()), + Types.Field.get(10, "fixed", Types.FixedType.getFixed(10)), + Types.Field.get(11, "binary", Types.BinaryType.get()), + Types.Field.get(12, "decimal", Types.DecimalType.get(10, 2)) + })); + InternalSchema internalSchema = new InternalSchema(record.fields()); + String result = SerDeHelper.toJson(internalSchema); + InternalSchema convertedSchema = SerDeHelper.fromJson(result).get(); + 
Assertions.assertEquals(internalSchema, convertedSchema); + } + + @Test + public void testSearchSchema() { + List schemas = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + schemas.add(new InternalSchema(i * 10, + Arrays.asList(Types.Field.get(1, true, "schema" + i * 10, Types.LongType.get())))); + } + + Assertions.assertEquals(InternalSchemaUtils.searchSchema(0, schemas).getRecord().fields().get(0), + Types.Field.get(1, true, "schema" + 0, Types.LongType.get())); + + Assertions.assertEquals(InternalSchemaUtils.searchSchema(9, schemas).getRecord().fields().get(0), + Types.Field.get(1, true, "schema" + 0, Types.LongType.get())); + + Assertions.assertEquals(InternalSchemaUtils.searchSchema(99, schemas).getRecord().fields().get(0), + Types.Field.get(1, true, "schema" + 90, Types.LongType.get())); + + Assertions.assertEquals(InternalSchemaUtils.searchSchema(9999, schemas).getRecord().fields().get(0), + Types.Field.get(1, true, "schema" + 990, Types.LongType.get())); + } + + @Test + public void testInheritSchemas() { + List schemas = new ArrayList<>(); + for (int i = 0; i < 2; i++) { + schemas.add(new InternalSchema(i, + Arrays.asList(Types.Field.get(1, true, "schema" + i, Types.LongType.get())))); + } + String oldSchemas = SerDeHelper.toJson(schemas); + InternalSchema newSchema = new InternalSchema(3, + Arrays.asList(Types.Field.get(1, true, "schema" + 3, Types.LongType.get()))); + + String finalResult = SerDeHelper.inheritSchemas(newSchema, oldSchemas); + // convert back + Assertions.assertEquals(SerDeHelper.parseSchemas(finalResult).size(), 3); + } +} + diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/action/TestMergeSchema.java b/hudi-common/src/test/java/org/apache/hudi/internal/schema/action/TestMergeSchema.java new file mode 100644 index 0000000000000..3118e143870e3 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/action/TestMergeSchema.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.internal.schema.action; + +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Types; + +import org.apache.hudi.internal.schema.utils.SchemaChangeUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; + +public class TestMergeSchema { + + @Test + public void testPrimitiveMerge() { + Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "col1", Types.BooleanType.get()), + Types.Field.get(1, "col2", Types.IntType.get()), + Types.Field.get(2, "col3", Types.LongType.get()), + Types.Field.get(3, "col4", Types.FloatType.get())})); + + InternalSchema oldSchema = new InternalSchema(record.fields()); + // add c1 after 'col1', and c2 before 'col3' + TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema); + addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1"); + addChange.addPositionChange("c1", "col1", "after"); + addChange.addColumns("c2", Types.IntType.get(), "add c2 before col3"); + addChange.addPositionChange("c2", "col3", "before"); + InternalSchema newAddSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange); + TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(newAddSchema); + deleteChange.deleteColumn("col1"); + deleteChange.deleteColumn("col3"); + InternalSchema newDeleteSchema = SchemaChangeUtils.applyTableChanges2Schema(newAddSchema, deleteChange); + + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(newDeleteSchema); + updateChange.updateColumnType("col2", Types.LongType.get()) + .updateColumnComment("col2", "alter col2 comments") + .renameColumn("col2", "colx").addPositionChange("col2", + "col4", "after"); + InternalSchema updateSchema = SchemaChangeUtils.applyTableChanges2Schema(newDeleteSchema, updateChange); + + // add col1 again + TableChanges.ColumnAddChange addChange1 = TableChanges.ColumnAddChange.get(updateSchema); + addChange1.addColumns("col1", Types.BooleanType.get(), "add new col1"); + InternalSchema finalSchema = SchemaChangeUtils.applyTableChanges2Schema(updateSchema, addChange1); + // merge schema by using columnType from query schema + InternalSchema mergeSchema = new InternalSchemaMerger(oldSchema, finalSchema, true, false).mergeSchema(); + + InternalSchema checkedSchema = new InternalSchema(Arrays.asList(new Types.Field[] { + Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"), + Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"), + Types.Field.get(3, true, "col4", Types.FloatType.get()), + Types.Field.get(1, true, "col2", Types.LongType.get(), "alter col2 comments"), + Types.Field.get(6, true, "col1suffix", Types.BooleanType.get(), "add new col1") + })); + Assertions.assertEquals(mergeSchema, checkedSchema); + + // merge schema by using columnType from file schema + InternalSchema mergeSchema1 = new InternalSchemaMerger(oldSchema, finalSchema, true, true).mergeSchema(); + InternalSchema checkedSchema1 = new InternalSchema(Arrays.asList(new Types.Field[] { + Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"), + Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"), + Types.Field.get(3, true, "col4", Types.FloatType.get()), + Types.Field.get(1, true, "col2", Types.IntType.get(), "alter col2 comments"), + Types.Field.get(6, true, "col1suffix", Types.BooleanType.get(), 
"add new col1") + })); + Assertions.assertEquals(mergeSchema1, checkedSchema1); + } +} + diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/action/TestTableChanges.java b/hudi-common/src/test/java/org/apache/hudi/internal/schema/action/TestTableChanges.java new file mode 100644 index 0000000000000..f8f5a4dc0293a --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/action/TestTableChanges.java @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.action; + +import org.apache.hudi.internal.schema.HoodieSchemaException; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; + +import org.apache.hudi.internal.schema.Types.StringType; +import org.apache.hudi.internal.schema.action.TableChange.ColumnPositionChange.ColumnPositionType; +import org.apache.hudi.internal.schema.utils.SchemaChangeUtils; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Assertions; + +import java.util.Arrays; + +public class TestTableChanges { + + @Test + public void testPrimitiveAdd() { + Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "col1", Types.BooleanType.get()), + Types.Field.get(1, "col2", Types.IntType.get()), + Types.Field.get(2, "col3", Types.LongType.get()), + Types.Field.get(3, "col4", Types.FloatType.get())})); + + Types.RecordType checkRecord = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "col1", Types.BooleanType.get()), + Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"), + Types.Field.get(1, "col2", Types.IntType.get()), + Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"), + Types.Field.get(2, "col3", Types.LongType.get()), + Types.Field.get(3, "col4", Types.FloatType.get())})); + + InternalSchema oldSchema = new InternalSchema(record.fields()); + // add c1 after 'col1', and c2 before 'col3' + TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema); + addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1"); + // check repeated add. 
+ Assertions.assertThrows(HoodieSchemaException.class, () -> addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1")); + addChange.addPositionChange("c1", "col1", "after"); + addChange.addColumns("c2", Types.IntType.get(), "add c2 before col3"); + addChange.addPositionChange("c2", "col3", "before"); + InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange); + Assertions.assertEquals(newSchema.getRecord(), checkRecord); + } + + @Test + public void testNestAdd() { + InternalSchema oldSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(7, false, "feature1", + Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(13, true, + Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, false, "y", Types.LongType.get())))), + Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())), + Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get())) + ); + + TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema); + // add c1 first + addChange.addColumns("c1", Types.StringType.get(), "add c1 first"); + addChange.addPositionChange("c1", "id", "before"); + //add preferences.cx before preferences.feature2 + addChange.addColumns("preferences", "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2"); + // check repeated add. 
+ Assertions.assertThrows(HoodieSchemaException.class, () -> addChange.addColumns("preferences", "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2")); + addChange.addPositionChange("preferences.cx", "preferences.feature2", "before"); + // add locations.value.lax before locations.value.long + addChange.addColumns("locations.value", "lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long"); + addChange.addPositionChange("locations.value.lax", "locations.value.long", "before"); + // + // add points.element.z after points.element.y + addChange.addColumns("points.element", "z", Types.BooleanType.get(), "add points.element.z after points.element.y"); + addChange.addPositionChange("points.element.z", "points.element.y", "after"); + InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange); + InternalSchema checkedSchema = new InternalSchema( + Types.Field.get(19, true, "c1", Types.StringType.get(), "add c1 first"), + Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(7, false, "feature1", Types.BooleanType.get()), + Types.Field.get(20, true, "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2"), + Types.Field.get(8, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), + Types.Field.get(21, true, "lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long"), + Types.Field.get(12, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(13, true, + Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), + Types.Field.get(15, false, "y", Types.LongType.get()), + Types.Field.get(22, true, "z", Types.BooleanType.get(), "add points.element.z after points.element.y")))), + Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())), + Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get())) + ); + Assertions.assertEquals(newSchema.getRecord(), checkedSchema.getRecord()); + } + + @Test + public void testPrimitiveDelete() { + Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "col1", Types.BooleanType.get()), + Types.Field.get(1, "col2", Types.IntType.get()), + Types.Field.get(2, "col3", Types.LongType.get()), + Types.Field.get(3, "col4", Types.FloatType.get())})); + InternalSchema oldSchema = new InternalSchema(record.fields()); + TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(oldSchema); + deleteChange.deleteColumn("col1"); + // check repeated delete. + // deletechange can handle deleting the same column multiple times, only keep one operation. 
+ deleteChange.deleteColumn("col1"); + deleteChange.deleteColumn("col3"); + InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, deleteChange); + Types.RecordType checkRecord = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(1, "col2", Types.IntType.get()), + Types.Field.get(3, "col4", Types.FloatType.get())})); + Assertions.assertEquals(newSchema.getRecord(), checkRecord); + } + + @Test + public void testNestDelete() { + InternalSchema oldSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(5, false, "feature1", + Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(9, false, "lat", Types.FloatType.get()), Types.Field.get(10, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(11, true, + Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()), Types.Field.get(13, false, "y", Types.LongType.get())))) + ); + TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(oldSchema); + deleteChange.deleteColumn("data"); + deleteChange.deleteColumn("preferences.feature2"); + deleteChange.deleteColumn("preferences.feature2"); + deleteChange.deleteColumn("locations.value.lat"); + deleteChange.deleteColumn("points.element.y"); + InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, deleteChange); + InternalSchema checkedSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(5, false, "feature1", + Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(10, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(11, true, + Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get())))) + ); + Assertions.assertEquals(newSchema.getRecord(), checkedSchema.getRecord()); + } + + @Test + public void testPrimitiveUpdate() { + Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "col1", Types.BooleanType.get()), + Types.Field.get(1, "col2", Types.IntType.get()), + Types.Field.get(2, "col3", Types.LongType.get()), + Types.Field.get(3, "col4", Types.FloatType.get())})); + InternalSchema oldSchema = new InternalSchema(record.fields()); + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(oldSchema); + updateChange.updateColumnType("col2", Types.LongType.get()) + .updateColumnComment("col2", "alter col2 comments") + .renameColumn("col2", "colx").addPositionChange("col2", "col4", "after"); + InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, updateChange); + Types.RecordType checkedRecord = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "col1", Types.BooleanType.get()), + Types.Field.get(2, "col3", Types.LongType.get()), + Types.Field.get(3, "col4", Types.FloatType.get()), + Types.Field.get(1, true, "colx", Types.LongType.get(), "alter col2 comments")})); + Assertions.assertEquals(newSchema.getRecord(), 
checkedRecord); + } + + @Test + public void testNestUpdate() { + InternalSchema oldSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(5, false, "feature1", + Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(9, false, "lat", Types.FloatType.get()), Types.Field.get(10, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(11, true, + Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()), Types.Field.get(13, false, "y", Types.LongType.get())))) + ); + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(oldSchema); + updateChange + .updateColumnNullability("id", true) + .renameColumn("id", "idx") + .addPositionChange("data", "points", "after"); + updateChange + .updateColumnComment("preferences.feature1", "add feature1 comment") + .renameColumn("preferences.feature1", "f1") + .addPositionChange("preferences.feature1", "preferences.feature1", "first"); + updateChange.updateColumnComment("locations.value.lat", "add lat comment") + .renameColumn("locations.value.lat", "lax") + .addPositionChange("locations.value.lat", "locations.value.lat", "first"); + updateChange.renameColumn("points.element.x", "z") + .addPositionChange("points.element.x", "points.element.y", "after"); + InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, updateChange); + InternalSchema checkSchema = new InternalSchema(Types.Field.get(0, true, "idx", Types.IntType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(5, false, "f1", + Types.BooleanType.get(), "add feature1 comment"), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(9, false, "lax", Types.FloatType.get(), "add lat comment"), Types.Field.get(10, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(11, true, + Types.RecordType.get(Types.Field.get(13, false, "y", Types.LongType.get()), Types.Field.get(12, false, "z", Types.LongType.get())))), + Types.Field.get(1, true, "data", Types.StringType.get()) + ); + Assertions.assertEquals(newSchema.getRecord(), checkSchema.getRecord()); + } + + @Test + public void testChangeApplier() { + // We add test here to verify the logic of applyAddChange and applyReOrderColPositionChange + InternalSchema oldSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(7, false, "feature1", + Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(13, true, + Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, 
false, "y", Types.LongType.get())))), + Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())), + Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get())) + ); + + // add c1 first + InternalSchema newSchema = addOperationForSchemaChangeApplier(oldSchema, "c1", StringType.get(), "add c1 first", + "id", ColumnPositionType.BEFORE); + //add preferences.cx before preferences.feature2 + newSchema = addOperationForSchemaChangeApplier(newSchema, "preferences.cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2", + "preferences.feature2", ColumnPositionType.BEFORE); + // check repeated add. + InternalSchema currSchema = newSchema; + Assertions.assertThrows(HoodieSchemaException.class, () -> addOperationForSchemaChangeApplier(currSchema, "preferences.cx", Types.BooleanType.get(), + "add preferences.cx before preferences.feature2")); + // add locations.value.lax before locations.value.long + newSchema = addOperationForSchemaChangeApplier(newSchema, "locations.value.lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long"); + newSchema = reOrderOperationForSchemaChangeApplier(newSchema, "locations.value.lax", "locations.value.long", ColumnPositionType.BEFORE); + // + // add points.element.z after points.element.y + newSchema = addOperationForSchemaChangeApplier(newSchema, "points.element.z", Types.BooleanType.get(), "add points.element.z after points.element.y", "points.element.y", ColumnPositionType.AFTER); + InternalSchema checkedSchema = new InternalSchema( + Types.Field.get(19, true, "c1", Types.StringType.get(), "add c1 first"), + Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(7, false, "feature1", Types.BooleanType.get()), + Types.Field.get(20, true, "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2"), + Types.Field.get(8, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), + Types.Field.get(21, true, "lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long"), + Types.Field.get(12, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(13, true, + Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), + Types.Field.get(15, false, "y", Types.LongType.get()), + Types.Field.get(22, true, "z", Types.BooleanType.get(), "add points.element.z after points.element.y")))), + Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())), + Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get())) + ); + Assertions.assertEquals(newSchema.getRecord(), checkedSchema.getRecord()); + } + + private static InternalSchema addOperationForSchemaChangeApplier( + InternalSchema schema, + String colName, + Type colType, + String doc, + String position, + TableChange.ColumnPositionChange.ColumnPositionType positionType) { + InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schema); + return applier.applyAddChange(colName, colType, doc, position, positionType); + } + + private static InternalSchema reOrderOperationForSchemaChangeApplier( 
+ InternalSchema schema, + String colName, + String position, + TableChange.ColumnPositionChange.ColumnPositionType positionType) { + InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schema); + return applier.applyReOrderColPositionChange(colName, position, positionType); + } + + private static InternalSchema addOperationForSchemaChangeApplier( + InternalSchema schema, + String colName, + Type colType, + String doc) { + return addOperationForSchemaChangeApplier(schema, colName, colType, doc, "", + ColumnPositionType.NO_OPERATION); + } +} + diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/io/TestFileBasedInternalSchemaStorageManager.java b/hudi-common/src/test/java/org/apache/hudi/internal/schema/io/TestFileBasedInternalSchemaStorageManager.java new file mode 100644 index 0000000000000..9db05b31e3e80 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/io/TestFileBasedInternalSchemaStorageManager.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.io; + +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests {@link FileBasedInternalSchemaStorageManager}. + */ +public class TestFileBasedInternalSchemaStorageManager extends HoodieCommonTestHarness { + private HoodieActiveTimeline timeline; + + @BeforeEach + public void setUp() throws Exception { + initMetaClient(); + } + + @Test + public void testPersistAndReadHistorySchemaStr() throws IOException { + timeline = new HoodieActiveTimeline(metaClient); + FileBasedInternalSchemaStorageManager fm = new FileBasedInternalSchemaStorageManager(metaClient); + InternalSchema currentSchema = getSimpleSchema(); + currentSchema.setSchemaId(0L); + // save first schema. + fm.persistHistorySchemaStr("0000", SerDeHelper.inheritSchemas(currentSchema, "")); + // Simulate commit. 
+    simulateCommit("0000");
+    metaClient.reloadActiveTimeline();
+    // try to read schema
+    InternalSchema readSchema = fm.getSchemaByKey("0").get();
+    assertEquals(currentSchema, readSchema);
+    // save history schema again
+    InternalSchema secondSchema = getSimpleSchema();
+    secondSchema.setSchemaId(1L);
+    fm.persistHistorySchemaStr("0001", SerDeHelper.inheritSchemas(secondSchema, fm.getHistorySchemaStr()));
+    // Simulate commit.
+    simulateCommit("0001");
+    metaClient.reloadActiveTimeline();
+    // try to read schema
+    assertEquals(secondSchema, fm.getSchemaByKey("1").get());
+
+    // test a failed write and cleanup of the residual file.
+    InternalSchema thirdSchema = getSimpleSchema();
+    thirdSchema.setSchemaId(2L);
+    fm.persistHistorySchemaStr("0002", SerDeHelper.inheritSchemas(thirdSchema, fm.getHistorySchemaStr()));
+    // do not simulate commit "0002", so the file saved above becomes a residual file.
+    // try the 4th persist
+    InternalSchema lastSchema = getSimpleSchema();
+    lastSchema.setSchemaId(3L);
+    fm.persistHistorySchemaStr("0004", SerDeHelper.inheritSchemas(lastSchema, fm.getHistorySchemaStr()));
+    simulateCommit("0004");
+    metaClient.reloadActiveTimeline();
+    // now the residual file created by the 3rd persist should be removed.
+    File f = new File(metaClient.getSchemaFolderName() + File.separator + "0002.schemacommit");
+    assertTrue(!f.exists());
+    assertEquals(lastSchema, fm.getSchemaByKey("3").get());
+  }
+
+  private void simulateCommit(String commitTime) {
+    if (timeline == null) {
+      timeline = new HoodieActiveTimeline(metaClient);
+    }
+    HoodieInstant instant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMMIT_ACTION, commitTime);
+    timeline.createNewInstant(instant);
+    timeline.transitionRequestedToInflight(instant, Option.empty());
+    timeline.saveAsComplete(new HoodieInstant(true, instant.getAction(), instant.getTimestamp()),
+        Option.empty());
+  }
+
+  private InternalSchema getSimpleSchema() {
+    Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
+        Types.Field.get(0, "bool", Types.BooleanType.get()),
+        Types.Field.get(1, "int", Types.IntType.get()),
+    }));
+    return new InternalSchema(record.fields());
+  }
+}
+
diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java
new file mode 100644
index 0000000000000..6126c479c6154
--- /dev/null
+++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java
@@ -0,0 +1,474 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hudi.internal.schema.utils; + +import org.apache.avro.JsonProperties; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.InternalSchemaBuilder; +import org.apache.hudi.internal.schema.Type; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.internal.schema.action.TableChanges; +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Assertions; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +public class TestAvroSchemaEvolutionUtils { + + String schemaStr = "{\"type\":\"record\",\"name\":\"newTableName\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"},{\"name\":\"data\"," + + "\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"preferences\",\"type\":[\"null\"," + + "{\"type\":\"record\",\"name\":\"newTableName_preferences\",\"fields\":[{\"name\":\"feature1\"," + + "\"type\":\"boolean\"},{\"name\":\"feature2\",\"type\":[\"null\",\"boolean\"],\"default\":null}]}]," + + "\"default\":null},{\"name\":\"locations\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"record\"," + + "\"name\":\"newTableName_locations\",\"fields\":[{\"name\":\"lat\",\"type\":\"float\"},{\"name\":\"long\"," + + "\"type\":\"float\"}]}}},{\"name\":\"points\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"null\"," + + "{\"type\":\"record\",\"name\":\"newTableName_points\",\"fields\":[{\"name\":\"x\",\"type\":\"long\"}," + + "{\"name\":\"y\",\"type\":\"long\"}]}]}],\"default\":null},{\"name\":\"doubles\",\"type\":{\"type\":\"array\",\"items\":\"double\"}}," + + "{\"name\":\"properties\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"null\",\"string\"]}],\"default\":null}]}"; + + @Test + public void testPrimitiveTypes() { + Schema[] avroPrimitives = new Schema[] { + Schema.create(Schema.Type.BOOLEAN), + Schema.create(Schema.Type.INT), + Schema.create(Schema.Type.LONG), + Schema.create(Schema.Type.FLOAT), + Schema.create(Schema.Type.DOUBLE), + LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)), + LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)), + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), + Schema.create(Schema.Type.STRING), + LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)), + Schema.createFixed("fixed_12", null, null, 12), + Schema.create(Schema.Type.BYTES), + LogicalTypes.decimal(9, 4).addToSchema(Schema.createFixed("decimal_9_4", null, null, 4))}; + + Type[] primitiveTypes = new Type[] { + Types.BooleanType.get(), + Types.IntType.get(), + Types.LongType.get(), + Types.FloatType.get(), + Types.DoubleType.get(), + Types.DateType.get(), + Types.TimeType.get(), + Types.TimestampType.get(), + Types.StringType.get(), + Types.UUIDType.get(), + Types.FixedType.getFixed(12), + Types.BinaryType.get(), + Types.DecimalType.get(9, 4) + }; + + for (int i = 0; i < primitiveTypes.length; i++) { + Type convertPrimitiveResult = 
AvroInternalSchemaConverter.convertToField(avroPrimitives[i]); + Assertions.assertEquals(convertPrimitiveResult, primitiveTypes[i]); + Schema convertResult = AvroInternalSchemaConverter.convert(primitiveTypes[i], "t1"); + Assertions.assertEquals(convertResult, avroPrimitives[i]); + } + } + + @Test + public void testRecordAndPrimitiveTypes() { + Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "bool", Types.BooleanType.get()), + Types.Field.get(1, "int", Types.IntType.get()), + Types.Field.get(2, "long", Types.LongType.get()), + Types.Field.get(3, "float", Types.FloatType.get()), + Types.Field.get(4, "double", Types.DoubleType.get()), + Types.Field.get(5, "date", Types.DateType.get()), + Types.Field.get(6, "time", Types.TimeType.get()), + Types.Field.get(7, "timestamp", Types.TimestampType.get()), + Types.Field.get(8, "string", Types.StringType.get()), + Types.Field.get(9, "uuid", Types.UUIDType.get()), + Types.Field.get(10, "fixed", Types.FixedType.getFixed(10)), + Types.Field.get(11, "binary", Types.BinaryType.get()), + Types.Field.get(12, "decimal", Types.DecimalType.get(10, 2)) + })); + + Schema schema = create("t1", + new Schema.Field("bool", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BOOLEAN)), null, JsonProperties.NULL_VALUE), + new Schema.Field("int", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.INT)), null, JsonProperties.NULL_VALUE), + new Schema.Field("long", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE), + new Schema.Field("float", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.FLOAT)), null, JsonProperties.NULL_VALUE), + new Schema.Field("double", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.DOUBLE)), null, JsonProperties.NULL_VALUE), + new Schema.Field("date", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), null, JsonProperties.NULL_VALUE), + new Schema.Field("time", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG))), null, JsonProperties.NULL_VALUE), + new Schema.Field("timestamp", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))), null, JsonProperties.NULL_VALUE), + new Schema.Field("string", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.STRING)), null, JsonProperties.NULL_VALUE), + new Schema.Field("uuid", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16))), null, JsonProperties.NULL_VALUE), + new Schema.Field("fixed", AvroInternalSchemaConverter.nullableSchema(Schema.createFixed("fixed_10", null, null, 10)), null, JsonProperties.NULL_VALUE), + new Schema.Field("binary", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BYTES)), null, JsonProperties.NULL_VALUE), + new Schema.Field("decimal", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.decimal(10, 2) + .addToSchema(Schema.createFixed("decimal_10_2", null, null, 5))), null, JsonProperties.NULL_VALUE)); + Schema convertedSchema = AvroInternalSchemaConverter.convert(record, "t1"); + Assertions.assertEquals(convertedSchema, schema); + Types.RecordType convertedRecord = AvroInternalSchemaConverter.convert(schema).getRecord(); + Assertions.assertEquals(convertedRecord, record); + } + + private Schema 
create(String name, Schema.Field... fields) { + return Schema.createRecord(name, null, null, false, Arrays.asList(fields)); + } + + @Test + public void testArrayType() { + Type arrayNestRecordType = Types.ArrayType.get(1, false, + Types.RecordType.get(Arrays.asList(Types.Field.get(2, false, "a", Types.FloatType.get()), + Types.Field.get(3, false, "b", Types.FloatType.get())))); + + Schema schema = SchemaBuilder.array().items(create("t1", + new Schema.Field("a", Schema.create(Schema.Type.FLOAT), null, null), + new Schema.Field("b", Schema.create(Schema.Type.FLOAT), null, null))); + Schema convertedSchema = AvroInternalSchemaConverter.convert(arrayNestRecordType, "t1"); + Assertions.assertEquals(convertedSchema, schema); + Types.ArrayType convertedRecord = (Types.ArrayType) AvroInternalSchemaConverter.convertToField(schema); + Assertions.assertEquals(convertedRecord, arrayNestRecordType); + } + + @Test + public void testComplexConvert() { + Schema schema = new Schema.Parser().parse(schemaStr); + + InternalSchema internalSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(7, false, "feature1", + Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)), + Types.Field.get(4, true, "points", Types.ArrayType.get(13, true, + Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, false, "y", Types.LongType.get())))), + Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())), + Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get())) + ); + + Type convertRecord = AvroInternalSchemaConverter.convert(schema).getRecord(); + Assertions.assertEquals(convertRecord, internalSchema.getRecord()); + Assertions.assertEquals(schema, AvroInternalSchemaConverter.convert(internalSchema, "newTableName")); + } + + @Test + public void testRefreshNewId() { + Types.RecordType record = Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(4, false, "feature1", + Types.BooleanType.get()), Types.Field.get(5, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false, "locations", Types.MapType.get(6, 7, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(8, false, "lat", Types.FloatType.get()), Types.Field.get(9, false, "long", Types.FloatType.get())), false)) + ); + AtomicInteger newId = new AtomicInteger(100); + Types.RecordType recordWithNewId = (Types.RecordType) InternalSchemaBuilder.getBuilder().refreshNewId(record, newId); + + Types.RecordType newRecord = Types.RecordType.get(Types.Field.get(100, false, "id", Types.IntType.get()), + Types.Field.get(101, true, "data", Types.StringType.get()), + Types.Field.get(102, true, "preferences", + Types.RecordType.get(Types.Field.get(104, false, "feature1", + Types.BooleanType.get()), Types.Field.get(105, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(103, false, "locations", Types.MapType.get(106, 107, 
Types.StringType.get(), + Types.RecordType.get(Types.Field.get(108, false, "lat", Types.FloatType.get()), Types.Field.get(109, false, "long", Types.FloatType.get())), false)) + ); + Assertions.assertEquals(newRecord, recordWithNewId); + } + + /** + * test record data type changes. + * int => long/float/double/string + * long => float/double/string + * float => double/String + * double => String/Decimal + * Decimal => Decimal/String + * String => date/decimal + * date => String + */ + @Test + public void testReWriteRecordWithTypeChanged() { + Schema avroSchema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"h0_record\",\"namespace\":\"hoodie.h0\",\"fields\"" + + ":[{\"name\":\"id\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"comb\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"com1\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"col0\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"col1\",\"type\":[\"null\",\"long\"],\"default\":null}," + + "{\"name\":\"col11\",\"type\":[\"null\",\"long\"],\"default\":null}," + + "{\"name\":\"col12\",\"type\":[\"null\",\"long\"],\"default\":null}," + + "{\"name\":\"col2\",\"type\":[\"null\",\"float\"],\"default\":null}," + + "{\"name\":\"col21\",\"type\":[\"null\",\"float\"],\"default\":null}," + + "{\"name\":\"col3\",\"type\":[\"null\",\"double\"],\"default\":null}," + + "{\"name\":\"col31\",\"type\":[\"null\",\"double\"],\"default\":null}," + + "{\"name\":\"col4\",\"type\":[\"null\",{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"hoodie.h0.h0_record.col4\"," + + "\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":4}],\"default\":null}," + + "{\"name\":\"col41\",\"type\":[\"null\",{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"hoodie.h0.h0_record.col41\"," + + "\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":4}],\"default\":null}," + + "{\"name\":\"col5\",\"type\":[\"null\",\"string\"],\"default\":null}," + + "{\"name\":\"col51\",\"type\":[\"null\",\"string\"],\"default\":null}," + + "{\"name\":\"col6\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null}," + + "{\"name\":\"col7\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}],\"default\":null}," + + "{\"name\":\"col8\",\"type\":[\"null\",\"boolean\"],\"default\":null}," + + "{\"name\":\"col9\",\"type\":[\"null\",\"bytes\"],\"default\":null},{\"name\":\"par\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null}]}"); + // create a test record with avroSchema + GenericData.Record avroRecord = new GenericData.Record(avroSchema); + avroRecord.put("id", 1); + avroRecord.put("comb", 100); + avroRecord.put("com1", -100); + avroRecord.put("col0", 256); + avroRecord.put("col1", 1000L); + avroRecord.put("col11", -100L); + avroRecord.put("col12", 2000L); + avroRecord.put("col2", -5.001f); + avroRecord.put("col21", 5.001f); + avroRecord.put("col3", 12.999d); + avroRecord.put("col31", 9999.999d); + Schema currentDecimalType = avroSchema.getField("col4").schema().getTypes().get(1); + BigDecimal bd = new BigDecimal("123.456").setScale(((LogicalTypes.Decimal) currentDecimalType.getLogicalType()).getScale()); + avroRecord.put("col4", HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd, currentDecimalType, currentDecimalType.getLogicalType())); + Schema currentDecimalType1 = avroSchema.getField("col41").schema().getTypes().get(1); + BigDecimal bd1 = new BigDecimal("7890.456").setScale(((LogicalTypes.Decimal) 
currentDecimalType1.getLogicalType()).getScale()); + avroRecord.put("col41", HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd1, currentDecimalType1, currentDecimalType1.getLogicalType())); + + avroRecord.put("col5", "2011-01-01"); + avroRecord.put("col51", "199.342"); + avroRecord.put("col6", 18987); + avroRecord.put("col7", 1640491505000000L); + avroRecord.put("col8", false); + ByteBuffer bb = ByteBuffer.wrap(new byte[] {97, 48, 53}); + avroRecord.put("col9", bb); + Assertions.assertEquals(GenericData.get().validate(avroSchema, avroRecord), true); + InternalSchema internalSchema = AvroInternalSchemaConverter.convert(avroSchema); + // do change type operation + TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(internalSchema); + updateChange + .updateColumnType("id", Types.LongType.get()) + .updateColumnType("comb", Types.FloatType.get()) + .updateColumnType("com1", Types.DoubleType.get()) + .updateColumnType("col0", Types.StringType.get()) + .updateColumnType("col1", Types.FloatType.get()) + .updateColumnType("col11", Types.DoubleType.get()) + .updateColumnType("col12", Types.StringType.get()) + .updateColumnType("col2", Types.DoubleType.get()) + .updateColumnType("col21", Types.StringType.get()) + .updateColumnType("col3", Types.StringType.get()) + .updateColumnType("col31", Types.DecimalType.get(18, 9)) + .updateColumnType("col4", Types.DecimalType.get(18, 9)) + .updateColumnType("col41", Types.StringType.get()) + .updateColumnType("col5", Types.DateType.get()) + .updateColumnType("col51", Types.DecimalType.get(18, 9)) + .updateColumnType("col6", Types.StringType.get()); + InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(internalSchema, updateChange); + Schema newAvroSchema = AvroInternalSchemaConverter.convert(newSchema, avroSchema.getName()); + GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, Collections.emptyMap()); + + Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newRecord), true); + } + + @Test + public void testReWriteNestRecord() { + Types.RecordType record = Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(5, false, "feature1", + Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())), + Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(10, false, "lat", Types.FloatType.get()), Types.Field.get(11, false, "long", Types.FloatType.get())), false)) + ); + Schema schema = AvroInternalSchemaConverter.convert(record, "test1"); + GenericData.Record avroRecord = new GenericData.Record(schema); + GenericData.get().validate(schema, avroRecord); + avroRecord.put("id", 2); + avroRecord.put("data", "xs"); + // fill record type + GenericData.Record preferencesRecord = new GenericData.Record(AvroInternalSchemaConverter.convert(record.fieldType("preferences"), "test1_preferences")); + preferencesRecord.put("feature1", false); + preferencesRecord.put("feature2", true); + Assertions.assertEquals(GenericData.get().validate(AvroInternalSchemaConverter.convert(record.fieldType("preferences"), "test1_preferences"), preferencesRecord), true); + avroRecord.put("preferences", preferencesRecord); + // fill mapType + Map 
locations = new HashMap<>();
+    Schema mapSchema = AvroInternalSchemaConverter.convert(((Types.MapType)record.field("locations").type()).valueType(), "test1_locations");
+    GenericData.Record locationsValue = new GenericData.Record(mapSchema);
+    locationsValue.put("lat", 1.2f);
+    locationsValue.put("long", 1.4f);
+    GenericData.Record locationsValue1 = new GenericData.Record(mapSchema);
+    locationsValue1.put("lat", 2.2f);
+    locationsValue1.put("long", 2.4f);
+    locations.put("key1", locationsValue);
+    locations.put("key2", locationsValue1);
+    avroRecord.put("locations", locations);
+
+    List<Double> doubles = new ArrayList<>();
+    doubles.add(2.0d);
+    doubles.add(3.0d);
+    avroRecord.put("doubles", doubles);
+
+    // validate the fully populated record against the original schema
+    Assertions.assertEquals(GenericData.get().validate(schema, avroRecord), true);
+    // create newSchema
+    Types.RecordType newRecord = Types.RecordType.get(
+        Types.Field.get(0, false, "id", Types.IntType.get()),
+        Types.Field.get(1, true, "data", Types.StringType.get()),
+        Types.Field.get(2, true, "preferences",
+            Types.RecordType.get(
+                Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
+                Types.Field.get(5, true, "featurex", Types.BooleanType.get()),
+                Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
+        Types.Field.get(3, false, "doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
+        Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
+            Types.RecordType.get(
+                Types.Field.get(10, true, "laty", Types.FloatType.get()),
+                Types.Field.get(11, false, "long", Types.FloatType.get())), false)
+        )
+    );
+
+    Schema newAvroSchema = AvroInternalSchemaConverter.convert(newRecord, schema.getName());
+    GenericRecord newAvroRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, Collections.emptyMap());
+    // test the correctness of the rewrite
+    Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newAvroRecord), true);
+
+    // test rewrite with rename
+    InternalSchema internalSchema = AvroInternalSchemaConverter.convert(schema);
+    // apply rename operations
+    TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(internalSchema);
+    updateChange
+        .renameColumn("id", "idx")
+        .renameColumn("data", "datax")
+        .renameColumn("preferences.feature1", "f1")
+        .renameColumn("preferences.feature2", "f2")
+        .renameColumn("locations.value.lat", "lt");
+    InternalSchema internalSchemaRename = SchemaChangeUtils.applyTableChanges2Schema(internalSchema, updateChange);
+    Schema avroSchemaRename = AvroInternalSchemaConverter.convert(internalSchemaRename, schema.getName());
+    Map<String, String> renameCols = InternalSchemaUtils.collectRenameCols(internalSchema, internalSchemaRename);
+    GenericRecord avroRecordRename = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, avroSchemaRename, renameCols);
+    // test the correctness of the rewrite
+    Assertions.assertEquals(GenericData.get().validate(avroSchemaRename, avroRecordRename), true);
+  }
+
+  @Test
+  public void testEvolutionSchemaFromNewAvroSchema() {
+    Types.RecordType oldRecord = Types.RecordType.get(
+        Types.Field.get(0, false, "id", Types.IntType.get()),
+        Types.Field.get(1, true, "data", Types.StringType.get()),
+        Types.Field.get(2, true, "preferences",
+            Types.RecordType.get(
+                Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
+                Types.Field.get(6, true, "featurex", Types.BooleanType.get()),
+                Types.Field.get(7, true, "feature2", Types.BooleanType.get()))),
+        Types.Field.get(3, false, "doubles", Types.ArrayType.get(8, false,
Types.DoubleType.get())), + Types.Field.get(4, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get( + Types.Field.get(11, false, "laty", Types.FloatType.get()), + Types.Field.get(12, false, "long", Types.FloatType.get())), false) + ) + ); + InternalSchema oldSchema = new InternalSchema(oldRecord.fields()); + Types.RecordType evolvedRecord = Types.RecordType.get( + Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get( + Types.Field.get(5, false, "feature1", Types.BooleanType.get()), + Types.Field.get(5, true, "featurex", Types.BooleanType.get()), + Types.Field.get(6, true, "feature2", Types.BooleanType.get()), + Types.Field.get(5, true, "feature3", Types.BooleanType.get()))), + Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())), + Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(), + Types.RecordType.get( + Types.Field.get(10, false, "laty", Types.FloatType.get()), + Types.Field.get(11, false, "long", Types.FloatType.get())), false) + ), + Types.Field.get(0, false, "add1", Types.IntType.get()), + Types.Field.get(2, true, "addStruct", + Types.RecordType.get( + Types.Field.get(5, false, "nest1", Types.BooleanType.get()), + Types.Field.get(5, true, "nest2", Types.BooleanType.get()))) + ); + evolvedRecord = (Types.RecordType)InternalSchemaBuilder.getBuilder().refreshNewId(evolvedRecord, new AtomicInteger(0)); + Schema evolvedAvroSchema = AvroInternalSchemaConverter.convert(evolvedRecord, "test1"); + InternalSchema result = AvroSchemaEvolutionUtils.reconcileSchema(evolvedAvroSchema, oldSchema); + Types.RecordType checkedRecord = Types.RecordType.get( + Types.Field.get(0, false, "id", Types.IntType.get()), + Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get( + Types.Field.get(5, false, "feature1", Types.BooleanType.get()), + Types.Field.get(6, true, "featurex", Types.BooleanType.get()), + Types.Field.get(7, true, "feature2", Types.BooleanType.get()), + Types.Field.get(17, true, "feature3", Types.BooleanType.get()))), + Types.Field.get(3, false,"doubles", Types.ArrayType.get(8, false, Types.DoubleType.get())), + Types.Field.get(4, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(), + Types.RecordType.get( + Types.Field.get(11, false, "laty", Types.FloatType.get()), + Types.Field.get(12, false, "long", Types.FloatType.get())), false) + ), + Types.Field.get(13, true, "add1", Types.IntType.get()), + Types.Field.get(14, true, "addStruct", + Types.RecordType.get( + Types.Field.get(15, false, "nest1", Types.BooleanType.get()), + Types.Field.get(16, true, "nest2", Types.BooleanType.get()))) + ); + Assertions.assertEquals(result.getRecord(), checkedRecord); + } + + @Test + public void testReconcileSchema() { + // simple schema test + // a: boolean, b: int, c: long, d: date + Schema schema = create("simple", + new Schema.Field("a", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BOOLEAN)), null, JsonProperties.NULL_VALUE), + new Schema.Field("b", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.INT)), null, JsonProperties.NULL_VALUE), + new Schema.Field("c", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE), + new Schema.Field("d", 
AvroInternalSchemaConverter.nullableSchema(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), null, JsonProperties.NULL_VALUE));
+    // incoming: a: boolean, a1: long, c: long, c1: long, c2: long, d: date, d1: date, d2: date
+    Schema incomingSchema = create("simpleIncoming",
+        new Schema.Field("a", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BOOLEAN)), null, JsonProperties.NULL_VALUE),
+        new Schema.Field("a1", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE),
+        new Schema.Field("c", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE),
+        new Schema.Field("c1", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE),
+        new Schema.Field("c2", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE),
+        new Schema.Field("d", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), null, JsonProperties.NULL_VALUE),
+        new Schema.Field("d1", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), null, JsonProperties.NULL_VALUE),
+        new Schema.Field("d2", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), null, JsonProperties.NULL_VALUE));
+
+    Schema simpleCheckSchema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"simpleReconcileSchema\",\"fields\":[{\"name\":\"a\",\"type\":[\"null\",\"boolean\"],\"default\":null},"
+        + "{\"name\":\"b\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"a1\",\"type\":[\"null\",\"long\"],\"default\":null},"
+        + "{\"name\":\"c\",\"type\":[\"null\",\"long\"],\"default\":null},"
+        + "{\"name\":\"c1\",\"type\":[\"null\",\"long\"],\"default\":null},{\"name\":\"c2\",\"type\":[\"null\",\"long\"],\"default\":null},"
+        + "{\"name\":\"d\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null},"
+        + "{\"name\":\"d1\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null},"
+        + "{\"name\":\"d2\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null}]}");
+
+    Schema simpleReconcileSchema = AvroInternalSchemaConverter.convert(AvroSchemaEvolutionUtils
+        .reconcileSchema(incomingSchema, AvroInternalSchemaConverter.convert(schema)), "simpleReconcileSchema");
+    Assertions.assertEquals(simpleReconcileSchema, simpleCheckSchema);
+  }
+}
diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestInternalSchemaUtils.java b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestInternalSchemaUtils.java
new file mode 100644
index 0000000000000..27482f4c5019d
--- /dev/null
+++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestInternalSchemaUtils.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal.schema.utils; + +import org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.InternalSchemaBuilder; +import org.apache.hudi.internal.schema.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Assertions; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class TestInternalSchemaUtils { + @Test + public void testPruneSchema() { + Types.RecordType record = getSimpleRecordType(); + InternalSchema originSchema = new InternalSchema(record.fields()); + List prunedCols = new ArrayList<>(); + prunedCols.add(4); + prunedCols.add(3); + prunedCols.add(0); + prunedCols.add(2); + InternalSchema prunedSchema = InternalSchemaUtils.pruneInternalSchemaByID(originSchema, prunedCols, null); + InternalSchema checkedSchema = new InternalSchema(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "bool", Types.BooleanType.get()), + Types.Field.get(2, "long", Types.LongType.get()), + Types.Field.get(3, "float", Types.FloatType.get()), + Types.Field.get(4, "double", Types.DoubleType.get()) + })); + Assertions.assertEquals(prunedSchema, checkedSchema); + + // nest schema + Types.RecordType nestRecord = getNestRecordType(); + InternalSchema originNestSchema = new InternalSchema(nestRecord.fields()); + List prunedNestCols = new ArrayList<>(); + prunedNestCols.add(0); + prunedNestCols.add(1); + prunedNestCols.add(5); + prunedNestCols.add(11); + InternalSchema prunedNestSchema = InternalSchemaUtils.pruneInternalSchemaByID(originNestSchema, prunedNestCols, null); + } + + @Test + public void testInternalSchemaVisitor() { + Types.RecordType nestRecord = getNestRecordType(); + Map result = InternalSchemaBuilder.getBuilder().buildNameToId(nestRecord); + Assertions.assertEquals(result.size(), 12); + Assertions.assertEquals(result.get("locations.value.long"), 11); + Assertions.assertEquals(result.get("locations.value.lat"), 10); + Assertions.assertEquals(result.get("locations.value"), 9); + Assertions.assertEquals(result.get("locations.key"), 8); + Assertions.assertEquals(result.get("doubles.element"), 7); + + Types.RecordType simpleRecord = getSimpleRecordType(); + Map result1 = InternalSchemaBuilder.getBuilder().buildNameToId(simpleRecord); + Assertions.assertEquals(result1.size(), 5); + Assertions.assertEquals(result1.get("double"), 4); + } + + @Test + public void testIntTypeEqualsAfterDeserialization() throws Exception { + Types.IntType intType = Types.IntType.get(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + new ObjectOutputStream(baos).writeObject(intType); + Types.IntType deserializedIntType = (Types.IntType) + new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray())).readObject(); + Assertions.assertEquals(intType, deserializedIntType); + } + + public Types.RecordType getNestRecordType() { + return Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()), 
+ Types.Field.get(1, true, "data", Types.StringType.get()), + Types.Field.get(2, true, "preferences", + Types.RecordType.get(Types.Field.get(5, false, "feature1", + Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))), + Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())), + Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(), + Types.RecordType.get(Types.Field.get(10, false, "lat", Types.FloatType.get()), Types.Field.get(11, false, "long", Types.FloatType.get())), false)) + ); + } + + public Types.RecordType getSimpleRecordType() { + return Types.RecordType.get(Arrays.asList(new Types.Field[] { + Types.Field.get(0, "bool", Types.BooleanType.get()), + Types.Field.get(1, "int", Types.IntType.get()), + Types.Field.get(2, "long", Types.LongType.get()), + Types.Field.get(3, "float", Types.FloatType.get()), + Types.Field.get(4, "double", Types.DoubleType.get()) + })); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieFileReaderFactory.java b/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieFileReaderFactory.java index 13971d5f6b644..ec334bde1e437 100644 --- a/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieFileReaderFactory.java +++ b/hudi-common/src/test/java/org/apache/hudi/io/storage/TestHoodieFileReaderFactory.java @@ -44,11 +44,16 @@ public void testGetFileReader() throws IOException { HoodieFileReader parquetReader = HoodieFileReaderFactory.getFileReader(hadoopConf, parquetPath); assertTrue(parquetReader instanceof HoodieParquetReader); - // other file format exception. + // log file format. final Path logPath = new Path("/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1"); final Throwable thrown = assertThrows(UnsupportedOperationException.class, () -> { HoodieFileReader logWriter = HoodieFileReaderFactory.getFileReader(hadoopConf, logPath); }, "should fail since log storage reader is not supported yet."); assertTrue(thrown.getMessage().contains("format not supported yet.")); + + // Orc file format. + final Path orcPath = new Path("/partition/path/f1_1-0-1_000.orc"); + HoodieFileReader orcReader = HoodieFileReaderFactory.getFileReader(hadoopConf, orcPath); + assertTrue(orcReader instanceof HoodieOrcReader); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java new file mode 100644 index 0000000000000..9ec793daa0127 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness { + + private static final String DEFAULT_PARTITION = ""; + private static final List DATE_PARTITIONS = Arrays.asList("2019/01/01", "2020/01/02", "2021/03/01"); + private static final List ONE_LEVEL_PARTITIONS = Arrays.asList("2019", "2020", "2021"); + private static final List MULTI_LEVEL_PARTITIONS = Arrays.asList("2019/01", "2020/01", "2021/01"); + private static HoodieTestTable hoodieTestTable; + + @BeforeEach + public void setUp() throws IOException { + initMetaClient(); + hoodieTestTable = HoodieTestTable.of(metaClient); + } + + @AfterEach + public void tearDown() throws IOException { + metaClient.getFs().delete(new Path(metaClient.getBasePath()), true); + } + + /** + * Test non partition hoodie table. + * @throws Exception + */ + @Test + public void testNonPartitionedTable() throws Exception { + // Generate 10 files under basepath + hoodieTestTable.addCommit("100").withBaseFilesInPartition(DEFAULT_PARTITION, IntStream.range(0, 10).toArray()); + HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); + FileSystemBackedTableMetadata fileSystemBackedTableMetadata = + new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false); + Assertions.assertEquals(0, fileSystemBackedTableMetadata.getAllPartitionPaths().size()); + Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath)).length); + Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartitions( + Collections.singletonList(basePath)).get(basePath).length); + } + + /** + * Test listing of partitions result for date based partitions. 
+ * @throws Exception + */ + @Test + public void testDatePartitionedTable() throws Exception { + String instant = "100"; + hoodieTestTable = hoodieTestTable.addCommit(instant); + // Generate 10 files under each partition + DATE_PARTITIONS.stream().forEach(p -> { + try { + hoodieTestTable = hoodieTestTable.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); + FileSystemBackedTableMetadata fileSystemBackedTableMetadata = + new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, true); + Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size()); + Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + DATE_PARTITIONS.get(0))).length); + + List fullPartitionPaths = DATE_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList()); + Map partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths); + for (String p : fullPartitionPaths) { + Assertions.assertEquals(10, partitionToFilesMap.get(p).length); + } + } + + /** + * Test listing of partitions result for date based partitions with assumeDataPartitioning = false. + * @throws Exception + */ + @Test + public void testDatePartitionedTableWithAssumeDateIsFalse() throws Exception { + String instant = "100"; + hoodieTestTable = hoodieTestTable.addCommit(instant); + // Generate 10 files under each partition + DATE_PARTITIONS.stream().forEach(p -> { + try { + hoodieTestTable = hoodieTestTable + .withPartitionMetaFiles(p) + .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); + FileSystemBackedTableMetadata fileSystemBackedTableMetadata = + new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false); + Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size()); + + List fullPartitionPaths = DATE_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList()); + Map partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths); + for (String p : fullPartitionPaths) { + Assertions.assertEquals(10, partitionToFilesMap.get(p).length); + } + } + + @Test + public void testOneLevelPartitionedTable() throws Exception { + String instant = "100"; + hoodieTestTable = hoodieTestTable.addCommit(instant); + // Generate 10 files under each partition + ONE_LEVEL_PARTITIONS.stream().forEach(p -> { + try { + hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p) + .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); + FileSystemBackedTableMetadata fileSystemBackedTableMetadata = + new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false); + Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size()); + Assertions.assertEquals(10, 
fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + ONE_LEVEL_PARTITIONS.get(0))).length); + + List fullPartitionPaths = ONE_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList()); + Map partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths); + for (String p : fullPartitionPaths) { + Assertions.assertEquals(10, partitionToFilesMap.get(p).length); + } + } + + @Test + public void testMultiLevelPartitionedTable() throws Exception { + String instant = "100"; + hoodieTestTable = hoodieTestTable.addCommit(instant); + // Generate 10 files under each partition + MULTI_LEVEL_PARTITIONS.stream().forEach(p -> { + try { + hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p) + .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); + FileSystemBackedTableMetadata fileSystemBackedTableMetadata = + new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false); + Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size()); + Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length); + + List fullPartitionPaths = MULTI_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList()); + Map partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths); + for (String p : fullPartitionPaths) { + Assertions.assertEquals(10, partitionToFilesMap.get(p).length); + } + } + + @Test + public void testMultiLevelEmptyPartitionTable() throws Exception { + String instant = "100"; + hoodieTestTable = hoodieTestTable.addCommit(instant); + // Generate 10 files under each partition + MULTI_LEVEL_PARTITIONS.stream().forEach(p -> { + try { + hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); + FileSystemBackedTableMetadata fileSystemBackedTableMetadata = + new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false); + Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size()); + Assertions.assertEquals(0, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length); + + List fullPartitionPaths = MULTI_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList()); + Map partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths); + for (String p : fullPartitionPaths) { + Assertions.assertEquals(0, partitionToFilesMap.get(p).length); + } + } + +} diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java new file mode 100644 index 0000000000000..7b4d432b3f80e --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metadata; + +import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.common.util.CollectionUtils.createImmutableMap; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { + + @Test + public void testFileSystemMetadataPayloadMerging() { + String partitionName = "2022/10/01"; + + Map firstCommitAddedFiles = createImmutableMap( + Pair.of("file1.parquet", 1000L), + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3000L) + ); + + HoodieRecord firstPartitionFilesRecord = + HoodieMetadataPayload.createPartitionFilesRecord(partitionName, Option.of(firstCommitAddedFiles), Option.empty()); + + Map secondCommitAddedFiles = createImmutableMap( + // NOTE: This is an append + Pair.of("file3.parquet", 3333L), + Pair.of("file4.parquet", 4000L), + Pair.of("file5.parquet", 5000L) + ); + + List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); + + HoodieRecord secondPartitionFilesRecord = + HoodieMetadataPayload.createPartitionFilesRecord(partitionName, Option.of(secondCommitAddedFiles), Option.of(secondCommitDeletedFiles)); + + HoodieMetadataPayload combinedPartitionFilesRecordPayload = + secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); + + HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = + HoodieMetadataPayload.createPartitionFilesRecord(partitionName, + Option.of( + createImmutableMap( + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3333L), + Pair.of("file4.parquet", 4000L), + Pair.of("file5.parquet", 5000L) + ) + ), + Option.empty() + ).getData(); + + assertEquals(expectedCombinedPartitionedFilesRecordPayload, combinedPartitionFilesRecordPayload); + } + + @Test + public void testColumnStatsPayloadMerging() throws IOException { + String partitionPath = "2022/10/01"; + String fileName = "file.parquet"; + String targetColName = "c1"; + + HoodieColumnRangeMetadata c1Metadata = + HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456); + + HoodieRecord columnStatsRecord = + HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1Metadata), false) + .findFirst().get(); + + //////////////////////////////////////////////////////////////////////// + // Case 1: Combining proper (non-deleted) records + 
//////////////////////////////////////////////////////////////////////// + + // NOTE: Column Stats record will only be merged in case existing file will be modified, + // which could only happen on storages schemes supporting appends + HoodieColumnRangeMetadata c1AppendedBlockMetadata = + HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345); + + HoodieRecord updatedColumnStatsRecord = + HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1AppendedBlockMetadata), false) + .findFirst().get(); + + HoodieMetadataPayload combinedMetadataPayload = + columnStatsRecord.getData().preCombine(updatedColumnStatsRecord.getData()); + + HoodieColumnRangeMetadata expectedColumnRangeMetadata = + HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801); + + HoodieRecord expectedColumnStatsRecord = + HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(expectedColumnRangeMetadata), false) + .findFirst().get(); + + // Assert combined payload + assertEquals(combinedMetadataPayload, expectedColumnStatsRecord.getData()); + + Option alternativelyCombinedMetadataPayloadAvro = + columnStatsRecord.getData().combineAndGetUpdateValue(updatedColumnStatsRecord.getData().getInsertValue(null).get(), null); + + // Assert that using legacy API yields the same value + assertEquals(combinedMetadataPayload.getInsertValue(null), alternativelyCombinedMetadataPayloadAvro); + + //////////////////////////////////////////////////////////////////////// + // Case 2: Combining w/ deleted records + //////////////////////////////////////////////////////////////////////// + + HoodieColumnRangeMetadata c1StubbedMetadata = + HoodieColumnRangeMetadata.stub(fileName, targetColName); + + HoodieRecord deletedColumnStatsRecord = + HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1StubbedMetadata), true) + .findFirst().get(); + + // NOTE: In this case, deleted (or tombstone) record will be therefore deleting + // previous state of the record + HoodieMetadataPayload deletedCombinedMetadataPayload = + deletedColumnStatsRecord.getData().preCombine(columnStatsRecord.getData()); + + assertEquals(deletedColumnStatsRecord.getData(), deletedCombinedMetadataPayload); + + // NOTE: In this case, proper incoming record will be overwriting previously deleted + // record + HoodieMetadataPayload overwrittenCombinedMetadataPayload = + columnStatsRecord.getData().preCombine(deletedColumnStatsRecord.getData()); + + assertEquals(columnStatsRecord.getData(), overwrittenCombinedMetadataPayload); + } +} diff --git a/hudi-common/src/test/resources/complex.schema.avsc b/hudi-common/src/test/resources/complex.schema.avsc new file mode 100644 index 0000000000000..1672415bd4c4c --- /dev/null +++ b/hudi-common/src/test/resources/complex.schema.avsc @@ -0,0 +1,1882 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +{ + "type": "record", + "name": "field_20", + "namespace": "hoodie.complex", + "fields": [ + { + "name": "field_24", + "type": [ + "string", + "null" + ], + "default": "null" + }, + { + "name": "field_31", + "type": [ + { + "type": "record", + "name": "field_35", + "namespace": "hoodie.complex.complex_record", + "fields": [ + { + "name": "field_39", + "type": [ + { + "type": "record", + "name": "field_43", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_47", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_54", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_61", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_68", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_75", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_82", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_89", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_96", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_103", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_110", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_117", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_124", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_131", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_138", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_145", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_152", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_165", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_172", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_184", + "type": [ + { + "type": "record", + "name": "field_188", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_192", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_199", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_206", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_213", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_220", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_227", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_234", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_241", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_248", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_255", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_262", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_269", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_276", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_283", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_290", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_297", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_310", + 
"type": [ + "double", + "null" + ] + }, + { + "name": "field_317", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_329", + "type": [ + { + "type": "record", + "name": "field_333", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_337", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_344", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_351", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_358", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_365", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_372", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_379", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_386", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_393", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_400", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_407", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_414", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_421", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_428", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_435", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_442", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_455", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_462", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_474", + "type": [ + { + "type": "record", + "name": "field_478", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_482", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_489", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_496", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_503", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_510", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_517", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_524", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_531", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_538", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_545", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_552", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_559", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_566", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_573", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_580", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_587", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_600", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_607", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_619", + "type": [ + { + "type": "record", + "name": "field_623", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_627", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_634", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_641", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_648", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_655", + "type": [ + "string", + 
"null" + ] + }, + { + "name": "field_662", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_669", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_676", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_683", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_690", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_697", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_704", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_711", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_718", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_725", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_732", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_745", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_752", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_764", + "type": [ + { + "type": "record", + "name": "field_768", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_772", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_779", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_786", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_793", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_800", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_807", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_814", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_821", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_828", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_835", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_842", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_849", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_856", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_863", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_870", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_877", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_890", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_897", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_909", + "type": [ + { + "type": "record", + "name": "field_913", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_917", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_924", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_931", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_938", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_945", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_952", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_959", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_966", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_973", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_980", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_987", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_994", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1001", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1008", + "type": [ + "string", + 
"null" + ] + }, + { + "name": "field_1015", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1022", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_1035", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_1042", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_1054", + "type": [ + { + "type": "record", + "name": "field_1058", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_1062", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1069", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1076", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1083", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1090", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1097", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1104", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1111", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1118", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1125", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1132", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1139", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1146", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1153", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1160", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1167", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_1180", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_1187", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_1199", + "type": [ + { + "type": "record", + "name": "field_1203", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_1207", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1214", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1221", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1228", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1235", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1242", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1249", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1256", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1263", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1270", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1277", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1284", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1291", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1298", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1305", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1312", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_1325", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_1332", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_1344", + "type": [ + { + "type": "record", + "name": "field_1348", + "namespace": "hoodie.complex.complex_record.metadata", + "fields": [ + { + "name": "field_1352", + "type": [ + 
"long", + "null" + ] + }, + { + "name": "field_1359", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1366", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1373", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1380", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1387", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1394", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1401", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1408", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1415", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1422", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1429", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1436", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1443", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1450", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1457", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_1470", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_1477", + "type": [ + "long", + "null" + ] + } + ] + }, + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_1494", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1501", + "type": [ + "double", + "null" + ] + }, + { + "name": "field_1508", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1515", + "type": [ + "int", + "null" + ] + }, + { + "name": "field_1522", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1529", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1536", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1543", + "type": [ + { + "type": "record", + "name": "field_1547", + "namespace": "hoodie.complex.complex_record", + "fields": [ + { + "name": "field_1551", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1558", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1565", + "type": [ + "long", + "null" + ] + }, + { + "name": "field_1572", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1579", + "type": [ + "string", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_1591", + "type": [ + { + "type": "record", + "name": "field_1595", + "namespace": "hoodie.complex.complex_record", + "fields": [ + { + "name": "field_1599", + "type": [ + "boolean", + "null" + ] + }, + { + "name": "field_1606", + "type": [ + { + "type": "record", + "name": "field_1610", + "namespace": "hoodie.complex.complex_record.FLAGS", + "fields": [ + { + "name": "field_1614", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1621", + "type": [ + "string", + "null" + ] + } + ] + }, + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_1638", + "type": [ + { + "type": "record", + "name": "field_1642", + "namespace": "hoodie.complex.complex_record", + "fields": [ + { + "name": "field_1646", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1653", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_1666", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1673", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1680", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1687", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1694", + "type": [ 
+ "string", + "null" + ] + }, + { + "name": "field_1701", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1708", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1715", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1722", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1729", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1736", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1743", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1750", + "type": [ + "string", + "null" + ] + } + ] + }, + "null" + ] + }, + { + "name": "field_1762", + "type": [ + { + "type": "record", + "name": "field_1766", + "namespace": "hoodie.complex.complex_record", + "fields": [ + { + "name": "field_1770", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1777", + "type": [ + { + "type": "array", + "items": [ + "string", + "null" + ] + }, + "null" + ] + }, + { + "name": "field_1790", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1797", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1804", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1811", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1818", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1825", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1832", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1839", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1846", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1853", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1860", + "type": [ + "string", + "null" + ] + }, + { + "name": "field_1867", + "type": [ + "string", + "null" + ] + } + ] + }, + "null" + ] + } + ] +} diff --git a/hudi-common/src/test/resources/external-config/hudi-defaults.conf b/hudi-common/src/test/resources/external-config/hudi-defaults.conf new file mode 100644 index 0000000000000..1133adb4d7735 --- /dev/null +++ b/hudi-common/src/test/resources/external-config/hudi-defaults.conf @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running Hudi jobs. +# This is useful for setting default environmental settings. 
+ +# Example: +hoodie.datasource.hive_sync.jdbcurl jdbc:hive2://localhost:10000 +hoodie.datasource.hive_sync.use_jdbc true +hoodie.datasource.hive_sync.support_timestamp false +hoodie.index.type BLOOM +hoodie.metadata.enable true diff --git a/hudi-common/src/test/resources/log4j-surefire-quiet.properties b/hudi-common/src/test/resources/log4j-surefire-quiet.properties deleted file mode 100644 index ca0a50c84270c..0000000000000 --- a/hudi-common/src/test/resources/log4j-surefire-quiet.properties +++ /dev/null @@ -1,30 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# CONSOLE is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# CONSOLE uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-common/src/test/resources/log4j-surefire.properties b/hudi-common/src/test/resources/log4j-surefire.properties deleted file mode 100644 index c5bdf75ae2ae3..0000000000000 --- a/hudi-common/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,31 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache=INFO -log4j.logger.org.apache.hudi=DEBUG -log4j.logger.org.apache.hadoop.hbase=ERROR - -# A1 is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. 
-log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL \ No newline at end of file diff --git a/hudi-common/src/test/resources/props/test.properties b/hudi-common/src/test/resources/props/test.properties new file mode 100644 index 0000000000000..8e848aff79d33 --- /dev/null +++ b/hudi-common/src/test/resources/props/test.properties @@ -0,0 +1,18 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +some.random.prop=123 \ No newline at end of file diff --git a/hudi-common/src/test/resources/simple-test-doced.avsc b/hudi-common/src/test/resources/simple-test-doced.avsc new file mode 100644 index 0000000000000..f6b53aff8ee8a --- /dev/null +++ b/hudi-common/src/test/resources/simple-test-doced.avsc @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ +"namespace": "example.avro", + "type": "record", + "name": "User", + "fields": [ + {"name": "name", "type": "string","doc":"name_comment"}, + {"name": "favorite_number", "type": "int","doc":"favorite_number_comment"}, + {"name": "favorite_color", "type": "string"} + ] +} diff --git a/hudi-common/src/test/resources/simple-test-evolved-compatible.avsc b/hudi-common/src/test/resources/simple-test-evolved-compatible.avsc new file mode 100644 index 0000000000000..09463fa310937 --- /dev/null +++ b/hudi-common/src/test/resources/simple-test-evolved-compatible.avsc @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ +"namespace": "example.avro", + "type": "record", + "name": "User", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "favorite_number", "type": "int"}, + {"name": "favorite_color", "type": "string"}, + {"name": "field1", "type": ["null", "string"], "default": null}, + {"name": "field2", "type": ["null", "string"], "default": null} + ] +} diff --git a/hudi-common/src/test/resources/timestamp-logical-type.avsc b/hudi-common/src/test/resources/timestamp-logical-type.avsc new file mode 100644 index 0000000000000..6720523be9927 --- /dev/null +++ b/hudi-common/src/test/resources/timestamp-logical-type.avsc @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "namespace": "example.avro", + "type": "record", + "name": "User", + "fields": [ + {"name": "field1", "type": ["null", "string"], "default": null}, + {"name": "createTime", "type": ["null", {"type" : "long", "logicalType" : "timestamp-micros"}], "default": null} + ] +} \ No newline at end of file diff --git a/hudi-common/src/test/resources/timestamp-test-evolved.avsc b/hudi-common/src/test/resources/timestamp-test-evolved.avsc index beb36329eabac..7a52ca6f245e1 100644 --- a/hudi-common/src/test/resources/timestamp-test-evolved.avsc +++ b/hudi-common/src/test/resources/timestamp-test-evolved.avsc @@ -20,7 +20,43 @@ "type": "record", "name": "User", "fields": [ - {"name": "field1", "type": ["null", "string"], "default": null}, - {"name": "createTime", "type": ["null", "long"], "default": null} + { + "name": "field1", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "createTime", + "type": [ + "null", + "long" + ], + "default": null + }, + { + "name": "createTimeString", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "createTimeDecimal", + "type": [ + "null", + { + "name": "decimalFixed", + "type": "fixed", + "logicalType": "decimal", + "precision": 20, + "scale": 4, + "size": 10 + } + ] + } ] } \ No newline at end of file diff --git a/hudi-examples/README.md b/hudi-examples/README.md index dfaf5d788ed10..5c228b6825081 100644 --- a/hudi-examples/README.md +++ b/hudi-examples/README.md @@ -36,10 +36,10 @@ To run the demo: 5.3 Run `bin/kafka-delta-streamer-example.sh` - 5.4 continuously write source data to the Kafka topic your configured with `hoodie.deltastreamer.source.kafka.topic` in `kafka-source.properties` + 5.4 Continuously write source data to the Kafka topic you configured with `hoodie.deltastreamer.source.kafka.topic` in `kafka-source.properties` 6. Some notes delta streamer demo: 6.1 The configuration files we provided is just the simplest demo, you can change it according to your specific needs.
- 6.2 You could also use Intellij to run the example directly by configuring parameters as "Program arguments" + 6.2 You could also use IntelliJ IDEA to run the example directly by configuring parameters as "Program arguments" diff --git a/hudi-examples/bin/hudi-delta-streamer b/hudi-examples/bin/hudi-delta-streamer index 9accd7174ae4b..a1e9ee18804f0 100755 --- a/hudi-examples/bin/hudi-delta-streamer +++ b/hudi-examples/bin/hudi-delta-streamer @@ -32,7 +32,6 @@ exec "${SPARK_HOME}"/bin/spark-submit \ --conf spark.kryoserializer.buffer.max=128m \ --conf spark.yarn.queue=root.default \ --conf spark.yarn.submit.waitAppCompletion=false \ ---packages org.apache.spark:spark-avro_2.11:2.4.4 \ --jars ${EXAMPLES_JARS} \ --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \ "${JAR_FILE}" \ diff --git a/hudi-examples/hudi-examples-common/pom.xml b/hudi-examples/hudi-examples-common/pom.xml new file mode 100644 index 0000000000000..fa63168257ee5 --- /dev/null +++ b/hudi-examples/hudi-examples-common/pom.xml @@ -0,0 +1,137 @@ + + + + + hudi-examples + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-examples-common + + + ${project.parent.basedir} + true + + + + + + src/main/resources + + + + + + net.alchim31.maven + scala-maven-plugin + + + scala-compile-first + process-resources + + add-source + compile + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + compile + + compile + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + + + + + org.apache.hudi + hudi-common + ${project.version} + + + + + org.apache.avro + avro + + + + org.apache.parquet + parquet-avro + + + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + + diff --git a/hudi-examples/hudi-examples-common/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java b/hudi-examples/hudi-examples-common/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java new file mode 100644 index 0000000000000..004271a329d36 --- /dev/null +++ b/hudi-examples/hudi-examples-common/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.common; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** + * Class to be used to generate test data. + */ +public class HoodieExampleDataGenerator> { + + public static final String DEFAULT_FIRST_PARTITION_PATH = "2020/01/01"; + public static final String DEFAULT_SECOND_PARTITION_PATH = "2020/01/02"; + public static final String DEFAULT_THIRD_PARTITION_PATH = "2020/01/03"; + + public static final String[] DEFAULT_PARTITION_PATHS = + {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH}; + public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"triprec\",\"fields\": [ " + + "{\"name\": \"ts\",\"type\": \"long\"},{\"name\": \"uuid\", \"type\": \"string\"}," + + "{\"name\": \"rider\", \"type\": \"string\"},{\"name\": \"driver\", \"type\": \"string\"}," + + "{\"name\": \"begin_lat\", \"type\": \"double\"},{\"name\": \"begin_lon\", \"type\": \"double\"}," + + "{\"name\": \"end_lat\", \"type\": \"double\"},{\"name\": \"end_lon\", \"type\": \"double\"}," + + "{\"name\":\"fare\",\"type\": \"double\"}]}"; + public static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); + + private static final Random RAND = new Random(46474747); + + private final Map existingKeys; + private final String[] partitionPaths; + private int numExistingKeys; + + public HoodieExampleDataGenerator(String[] partitionPaths) { + this(partitionPaths, new HashMap<>()); + } + + public HoodieExampleDataGenerator() { + this(DEFAULT_PARTITION_PATHS); + } + + public HoodieExampleDataGenerator(String[] partitionPaths, Map keyPartitionMap) { + this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length); + this.existingKeys = keyPartitionMap; + } + + /** + * Generates a new avro record of the above schema format, retaining the key if optionally provided. + */ + @SuppressWarnings("unchecked") + public T generateRandomValue(HoodieKey key, String commitTime) { + GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, "driver-" + commitTime, 0); + return (T) new HoodieAvroPayload(Option.of(rec)); + } + + public GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName, + long timestamp) { + GenericRecord rec = new GenericData.Record(avroSchema); + rec.put("uuid", rowKey); + rec.put("ts", timestamp); + rec.put("rider", riderName); + rec.put("driver", driverName); + rec.put("begin_lat", RAND.nextDouble()); + rec.put("begin_lon", RAND.nextDouble()); + rec.put("end_lat", RAND.nextDouble()); + rec.put("end_lon", RAND.nextDouble()); + rec.put("fare", RAND.nextDouble() * 100); + return rec; + } + + /** + * Generates new inserts, uniformly across the partition paths above. 
It also updates the list of existing keys. + */ + public List> generateInserts(String commitTime, Integer n) { + return generateInsertsStream(commitTime, n).collect(Collectors.toList()); + } + + /** + * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys. + */ + public Stream> generateInsertsStream(String commitTime, Integer n) { + int currSize = getNumExistingKeys(); + + return IntStream.range(0, n).boxed().map(i -> { + String partitionPath = partitionPaths[RAND.nextInt(partitionPaths.length)]; + HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); + KeyPartition kp = new KeyPartition(); + kp.key = key; + kp.partitionPath = partitionPath; + existingKeys.put(currSize + i, kp); + numExistingKeys++; + return new HoodieAvroRecord<>(key, generateRandomValue(key, commitTime)); + }); + } + + /** + * Generates new inserts, across a single partition path. It also updates the list of existing keys. + */ + public List> generateInsertsOnPartition(String commitTime, Integer n, String partitionPath) { + return generateInsertsStreamOnPartition(commitTime, n, partitionPath).collect(Collectors.toList()); + } + + /** + * Generates new inserts, across a single partition path. It also updates the list of existing keys. + */ + public Stream> generateInsertsStreamOnPartition(String commitTime, Integer n, String partitionPath) { + int currSize = getNumExistingKeys(); + + return IntStream.range(0, n).boxed().map(i -> { + HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); + KeyPartition kp = new KeyPartition(); + kp.key = key; + kp.partitionPath = partitionPath; + existingKeys.put(currSize + i, kp); + numExistingKeys++; + return new HoodieAvroRecord<>(key, generateRandomValue(key, commitTime)); + }); + } + + /** + * Generates new updates, randomly distributed across the keys above. 
There can be duplicates within the returned + * list + * + * @param commitTime Commit Timestamp + * @param n Number of updates (including dups) + * @return list of hoodie record updates + */ + public List> generateUpdates(String commitTime, Integer n) { + List> updates = new ArrayList<>(); + for (int i = 0; i < n; i++) { + KeyPartition kp = existingKeys.get(RAND.nextInt(numExistingKeys - 1)); + HoodieRecord record = generateUpdateRecord(kp.key, commitTime); + updates.add(record); + } + return updates; + } + + /** + * Generates new updates, one for each of the keys above + * + * @param commitTime Commit Timestamp + * @return list of hoodie record updates + */ + public List> generateUniqueUpdates(String commitTime) { + List> updates = new ArrayList<>(); + for (int i = 0; i < numExistingKeys; i++) { + KeyPartition kp = existingKeys.get(i); + HoodieRecord record = generateUpdateRecord(kp.key, commitTime); + updates.add(record); + } + return updates; + } + + public HoodieRecord generateUpdateRecord(HoodieKey key, String commitTime) { + return new HoodieAvroRecord<>(key, generateRandomValue(key, commitTime)); + } + + private Option convertToString(HoodieRecord record) { + try { + String str = HoodieAvroUtils + .bytesToAvro(((HoodieAvroPayload) record.getData()).getRecordBytes(), avroSchema) + .toString(); + str = "{" + str.substring(str.indexOf("\"ts\":")); + return Option.of(str.replaceAll("}", ", \"partitionpath\": \"" + record.getPartitionPath() + "\"}")); + } catch (IOException e) { + return Option.empty(); + } + } + + public List convertToStringList(List> records) { + return records.stream().map(this::convertToString).filter(Option::isPresent).map(Option::get) + .collect(Collectors.toList()); + } + + public int getNumExistingKeys() { + return numExistingKeys; + } + + public static class KeyPartition implements Serializable { + + HoodieKey key; + String partitionPath; + } + + public void close() { + existingKeys.clear(); + } + +} diff --git a/hudi-examples/hudi-examples-dbt/.gitignore b/hudi-examples/hudi-examples-dbt/.gitignore new file mode 100644 index 0000000000000..0eb3fd035dbc7 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/.gitignore @@ -0,0 +1,8 @@ +target/ +dbt_modules/ +logs/ +.tox/ +.idea/ +.DS_Store +.vscode +*.log diff --git a/hudi-examples/hudi-examples-dbt/README.md b/hudi-examples/hudi-examples-dbt/README.md new file mode 100644 index 0000000000000..8fe796d37c521 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/README.md @@ -0,0 +1,134 @@ + +## Testing dbt project: `hudi_examples_dbt` + +This dbt project demonstrates hudi integration with dbt. It has a few models to demonstrate the different ways in which you can create hudi datasets using dbt. + +### What is this repo? +What this repo _is_: +- A self-contained playground dbt project, useful for testing out scripts, and communicating some of the core dbt concepts. + +### Running this project +To get up and running with this project: +1. Install dbt using [these instructions](https://docs.getdbt.com/docs/installation). + +2. Install [dbt-spark](https://github.com/dbt-labs/dbt-spark) package: +```bash +pip install dbt-spark +``` + +3. Clone this repo and change into the `hudi-examples-dbt` directory from the command line: +```bash +cd hudi-examples/hudi-examples-dbt +``` + +4. Set up a profile called `spark` to connect to a spark cluster by following [these instructions](https://docs.getdbt.com/reference/warehouse-profiles/spark-profile).
If you have access to a data warehouse, you can use those credentials – we recommend setting your [target schema](https://docs.getdbt.com/docs/configure-your-profile#section-populating-your-profile) to be a new schema (dbt will create the schema for you, as long as you have the right privileges). If you don't have access to an existing data warehouse, you can also setup a local postgres database and connect to it in your profile. + +> **NOTE:** You need to include the hudi spark bundle to the spark cluster, the latest supported version is 0.10.1. + +5. Ensure your profile is setup correctly from the command line: +```bash +dbt debug +``` + +Output of the above command should show this text at the end of the output: +```bash +All checks passed! +``` + +6. Run the models: +```bash +dbt run +``` + +Output should look like this: +```bash +05:47:28 Running with dbt=1.0.0 +05:47:28 Found 5 models, 10 tests, 0 snapshots, 0 analyses, 0 macros, 0 operations, 0 seed files, 0 sources, 0 exposures, 0 metrics +05:47:28 +05:47:29 Concurrency: 1 threads (target='local') +05:47:29 +05:47:29 1 of 5 START incremental model analytics.hudi_insert_table...................... [RUN] +05:47:31 1 of 5 OK created incremental model analytics.hudi_insert_table................. [OK in 2.61s] +05:47:31 2 of 5 START incremental model analytics.hudi_insert_overwrite_table............ [RUN] +05:47:34 2 of 5 OK created incremental model analytics.hudi_insert_overwrite_table....... [OK in 3.19s] +05:47:34 3 of 5 START incremental model analytics.hudi_upsert_table...................... [RUN] +05:47:37 3 of 5 OK created incremental model analytics.hudi_upsert_table................. [OK in 2.68s] +05:47:37 4 of 5 START incremental model analytics.hudi_upsert_partitioned_cow_table...... [RUN] +05:47:40 4 of 5 OK created incremental model analytics.hudi_upsert_partitioned_cow_table. [OK in 2.60s] +05:47:40 5 of 5 START incremental model analytics.hudi_upsert_partitioned_mor_table...... [RUN] +05:47:42 5 of 5 OK created incremental model analytics.hudi_upsert_partitioned_mor_table. [OK in 2.53s] +05:47:42 +05:47:42 Finished running 5 incremental models in 14.70s. +05:47:42 +05:47:42 Completed successfully +``` +7. Test the output of the models: +```bash +dbt test +``` +Output should look like this: +```bash +05:48:17 Running with dbt=1.0.0 +05:48:17 Found 5 models, 10 tests, 0 snapshots, 0 analyses, 0 macros, 0 operations, 0 seed files, 0 sources, 0 exposures, 0 metrics +05:48:17 +05:48:19 Concurrency: 1 threads (target='local') +05:48:19 +05:48:19 1 of 10 START test not_null_hudi_insert_overwrite_table_id...................... [RUN] +05:48:19 1 of 10 PASS not_null_hudi_insert_overwrite_table_id............................ [PASS in 0.50s] +05:48:19 2 of 10 START test not_null_hudi_insert_overwrite_table_name.................... [RUN] +05:48:20 2 of 10 PASS not_null_hudi_insert_overwrite_table_name.......................... [PASS in 0.45s] +05:48:20 3 of 10 START test not_null_hudi_insert_overwrite_table_ts...................... [RUN] +05:48:20 3 of 10 PASS not_null_hudi_insert_overwrite_table_ts............................ [PASS in 0.47s] +05:48:20 4 of 10 START test not_null_hudi_insert_table_id................................ [RUN] +05:48:20 4 of 10 PASS not_null_hudi_insert_table_id...................................... [PASS in 0.44s] +05:48:20 5 of 10 START test not_null_hudi_upsert_table_id................................ [RUN] +05:48:21 5 of 10 PASS not_null_hudi_upsert_table_id...................................... 
[PASS in 0.38s] +05:48:21 6 of 10 START test not_null_hudi_upsert_table_name.............................. [RUN] +05:48:21 6 of 10 PASS not_null_hudi_upsert_table_name.................................... [PASS in 0.40s] +05:48:21 7 of 10 START test not_null_hudi_upsert_table_ts................................ [RUN] +05:48:22 7 of 10 PASS not_null_hudi_upsert_table_ts...................................... [PASS in 0.38s] +05:48:22 8 of 10 START test unique_hudi_insert_overwrite_table_id........................ [RUN] +05:48:23 8 of 10 PASS unique_hudi_insert_overwrite_table_id.............................. [PASS in 1.32s] +05:48:23 9 of 10 START test unique_hudi_insert_table_id.................................. [RUN] +05:48:24 9 of 10 PASS unique_hudi_insert_table_id........................................ [PASS in 1.26s] +05:48:24 10 of 10 START test unique_hudi_upsert_table_id................................. [RUN] +05:48:25 10 of 10 PASS unique_hudi_upsert_table_id....................................... [PASS in 1.29s] +05:48:26 +05:48:26 Finished running 10 tests in 8.23s. +05:48:26 +05:48:26 Completed successfully +05:48:26 +05:48:26 Done. PASS=10 WARN=0 ERROR=0 SKIP=0 TOTAL=10 +``` + +8. Generate documentation for the project: +```bash +dbt docs generate +``` + +9. View the [documentation](http://127.0.0.1:8080/#!/overview) for the project after running the following command: +```bash +dbt docs serve +``` + +--- +For more information on dbt: +- Read the [introduction to dbt](https://docs.getdbt.com/docs/introduction). +- Read the [dbt viewpoint](https://docs.getdbt.com/docs/about/viewpoint). +- Join the [dbt community](http://community.getdbt.com/). +--- diff --git a/hudi-examples/hudi-examples-dbt/dbt_project.yml b/hudi-examples/hudi-examples-dbt/dbt_project.yml new file mode 100644 index 0000000000000..dc5f5593d64d3 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/dbt_project.yml @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Name your project! Project names should contain only lowercase characters +# and underscores. A good package name should reflect your organization's +# name or the intended use of these models +name: 'hudi_examples_dbt' +version: '1.0.0' +config-version: 2 + +# This setting configures which "profile" dbt uses for this project. +profile: 'spark' + +# These configurations specify where dbt should look for different types of files. +# The `source-paths` config, for example, states that models in this project can be +# found in the "models/" directory. You probably won't need to change these! 
+model-paths: ["models"] + +target-path: "target" # directory which will store compiled SQL files +clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_modules" + +# Configuring models +# Full documentation: https://docs.getdbt.com/docs/configuring-models + +# In this example config, we tell dbt to build all models in the example/ directory +# as tables. These settings can be overridden in the individual model files +# using the `{{ config(...) }}` macro. +models: + +file_format: hudi + hudi_examples_dbt: + # Applies to all files under models/example/ + example: + materialized: table diff --git a/hudi-examples/hudi-examples-dbt/models/example/hudi_insert_overwrite_table.sql b/hudi-examples/hudi-examples-dbt/models/example/hudi_insert_overwrite_table.sql new file mode 100644 index 0000000000000..e0afa5a456cf6 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/models/example/hudi_insert_overwrite_table.sql @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +/* + Example of an insert_overwrite for a non-partitioned table with incremental materialization. + */ +{{ config( + materialized='incremental', + file_format='hudi', + incremental_strategy='insert_overwrite', + options={ + 'type': 'cow', + 'precombineKey': 'ts', + }, + unique_key='id' + ) +}} + +select id, cast(rand() as string) as name, current_timestamp() as ts +from {{ ref('hudi_insert_table') }} \ No newline at end of file diff --git a/hudi-examples/hudi-examples-dbt/models/example/hudi_insert_table.sql b/hudi-examples/hudi-examples-dbt/models/example/hudi_insert_table.sql new file mode 100644 index 0000000000000..a77bf796cad28 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/models/example/hudi_insert_table.sql @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +/* + Example of an insert for a non-partitioned table with incremental materialization. 
+ */ +{{ + config( + materialized='incremental', + file_format='hudi', + unique_key='id' + ) +}} + +with source_data as ( + + select format_number(rand()*1000, 0) as id + union all + select null as id + + ) + +select * +from source_data +where id is not null \ No newline at end of file diff --git a/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_partitioned_cow_table.sql b/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_partitioned_cow_table.sql new file mode 100644 index 0000000000000..caedcbc5fd055 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_partitioned_cow_table.sql @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +/* + Example of an upsert for a partitioned copy on write table with incremental materialization using merge strategy. + */ +{{ config( + materialized='incremental', + file_format='hudi', + incremental_strategy='merge', + options={ + 'type': 'cow', + 'primaryKey': 'id', + 'precombineKey': 'ts', + }, + unique_key='id', + partition_by='datestr', + pre_hook=["set spark.sql.datetime.java8API.enabled=false;"], + ) +}} + +select id, name, current_timestamp() as ts, current_date as datestr +from {{ ref('hudi_upsert_table') }} \ No newline at end of file diff --git a/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_partitioned_mor_table.sql b/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_partitioned_mor_table.sql new file mode 100644 index 0000000000000..2beab7c4ae466 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_partitioned_mor_table.sql @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +/* + Example of an upsert for a partitioned merge on read table with incremental materialization using merge strategy. 
+ */ +{{ config( + materialized='incremental', + file_format='hudi', + incremental_strategy='merge', + options={ + 'type': 'mor', + 'primaryKey': 'id', + 'precombineKey': 'ts', + }, + unique_key='id', + partition_by='datestr', + pre_hook=["set spark.sql.datetime.java8API.enabled=false;"], + ) +}} + +select id, name, current_timestamp() as ts, current_date as datestr +from {{ ref('hudi_upsert_table') }} \ No newline at end of file diff --git a/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_table.sql b/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_table.sql new file mode 100644 index 0000000000000..b8ee5b3ed444b --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/models/example/hudi_upsert_table.sql @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +/* + Example of an upsert for a non-partitioned table with incremental materialization using merge strategy. + */ +{{ config( + materialized='incremental', + file_format='hudi', + incremental_strategy='merge', + options={ + 'type': 'cow', + 'primaryKey': 'id', + 'precombineKey': 'ts', + }, + unique_key='id' + ) +}} + +select id, name, current_timestamp() as ts +from {{ ref('hudi_insert_overwrite_table') }} \ No newline at end of file diff --git a/hudi-examples/hudi-examples-dbt/models/example/schema.yml b/hudi-examples/hudi-examples-dbt/models/example/schema.yml new file mode 100644 index 0000000000000..64ae9099bdd97 --- /dev/null +++ b/hudi-examples/hudi-examples-dbt/models/example/schema.yml @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +version: 2 + +models: + - name: hudi_insert_table + description: "Hudi insert non-partitioned table with incremental materialization" + columns: + - name: id + description: "The primary key for this table" + tests: + - unique + - not_null + + - name: hudi_insert_overwrite_table + description: "Hudi insert overwrite non-partitioned table with incremental materialization" + columns: + - name: id + description: "The primary key for this table" + tests: + - unique + - not_null + - name: name + description: "Employee name" + tests: + - not_null + - name: ts + description: "Created timestamp" + tests: + - not_null + + - name: hudi_upsert_table + description: "Hudi upsert non-partitioned table with incremental materialization" + columns: + - name: id + description: "The primary key for this table" + tests: + - unique + - not_null + - name: name + description: "Employee name" + tests: + - not_null + - name: ts + description: "Created timestamp" + tests: + - not_null + + - name: hudi_upsert_paritioned_cow_table + description: "Hudi upsert partitioned copy-on-write table with incremental materialization using merge strategy" + columns: + - name: id + description: "The primary key for this table" + tests: + - unique + - not_null + - name: name + description: "Employee name" + tests: + - not_null + - name: ts + description: "Created timestamp" + tests: + - not_null + - name: datestr + description: "Partition date string column" + tests: + - not_null + + - name: hudi_upsert_paritioned_mor_table + description: "Hudi upsert partitioned merge-on-read table with incremental materialization using merge strategy" + columns: + - name: id + description: "The primary key for this table" + tests: + - unique + - not_null + - name: name + description: "Employee name" + tests: + - not_null + - name: ts + description: "Created timestamp" + tests: + - not_null + - name: datestr + description: "Partition date string column" + tests: + - not_null diff --git a/hudi-examples/hudi-examples-flink/pom.xml b/hudi-examples/hudi-examples-flink/pom.xml new file mode 100644 index 0000000000000..09530dea1f7fe --- /dev/null +++ b/hudi-examples/hudi-examples-flink/pom.xml @@ -0,0 +1,361 @@ + + + + + hudi-examples + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-examples-flink + + + ${project.parent.basedir} + true + 1.11.1 + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-jar-plugin + 3.1.2 + + + + test-jar + + + + + + org.apache.rat + apache-rat-plugin + + + + + + src/main/resources + + + src/test/resources + + + + + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-client-common + ${project.version} + + + org.apache.hudi + hudi-flink-client + ${project.version} + + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + + + org.apache.hudi + hudi-hive-sync + ${project.version} + + + org.apache.hudi + hudi-sync-common + ${project.version} + + + + org.apache.hudi + hudi-flink + ${project.version} + compile + + + + + org.apache.flink + ${flink.streaming.java.artifactId} + compile + + + org.apache.flink + ${flink.clients.artifactId} + compile + + + com.esotericsoftware.kryo + kryo + + + com.esotericsoftware.minlog + minlog + + + + + org.apache.flink + ${flink.connector.kafka.artifactId} + compile + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + org.apache.flink + ${flink.hadoop.compatibility.artifactId} + ${flink.version} + + + org.apache.flink + 
${flink.parquet.artifactId} + ${flink.version} + provided + + + org.apache.flink + flink-json + ${flink.version} + provided + + + org.apache.flink + flink-table-common + ${flink.version} + provided + + + org.apache.flink + ${flink.table.runtime.artifactId} + ${flink.version} + provided + + + org.apache.flink + ${flink.table.planner.artifactId} + ${flink.version} + provided + + + org.apache.flink + ${flink.statebackend.rocksdb.artifactId} + ${flink.version} + provided + + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + org.xerial.snappy + snappy-java + + + + + + + org.apache.avro + avro + ${flink.avro.version} + compile + + + + + org.apache.hadoop + hadoop-mapreduce-client-core + compile + + + org.slf4j + slf4j-log4j12 + + + + + + com.beust + jcommander + compile + + + com.twitter + bijection-avro_${scala.binary.version} + 0.9.7 + + + joda-time + joda-time + 2.5 + + + + ${hive.groupid} + hive-exec + ${hive.version} + ${hive.exec.classifier} + + + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.junit.platform + junit-platform-launcher + ${junit.platform.version} + test + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-client-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-flink-client + ${project.version} + tests + test-jar + test + + + + + org.apache.flink + ${flink.test.utils.artifactId} + ${flink.version} + test + + + org.apache.flink + ${flink.runtime.artifactId} + ${flink.version} + test + test-jar + + + org.apache.flink + ${flink.streaming.java.artifactId} + ${flink.version} + test + test-jar + + + org.apache.flink + ${flink.table.runtime.artifactId} + ${flink.version} + test + test-jar + + + org.apache.flink + flink-csv + ${flink.version} + test + + + + + org.apache.parquet + parquet-avro + ${parquet.version} + test + + + diff --git a/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/HoodieFlinkQuickstart.java b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/HoodieFlinkQuickstart.java new file mode 100644 index 0000000000000..b3e105015a58c --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/HoodieFlinkQuickstart.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.quickstart; + +import static org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations.sql; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.types.Row; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory; +import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations; +import org.jetbrains.annotations.NotNull; + +public final class HoodieFlinkQuickstart { + private EnvironmentSettings settings = null; + private TableEnvironment streamTableEnv = null; + + private String tableName; + + private HoodieFlinkQuickstart() { + } + + public static HoodieFlinkQuickstart instance() { + return new HoodieFlinkQuickstart(); + } + + public static void main(String[] args) throws TableNotExistException, InterruptedException { + if (args.length < 3) { + System.err.println("Usage: HoodieWriteClientExample "); + System.exit(1); + } + String tablePath = args[0]; + String tableName = args[1]; + String tableType = args[2]; + + HoodieFlinkQuickstart flinkQuickstart = instance(); + flinkQuickstart.initEnv(); + + // create filesystem table named source + flinkQuickstart.createFileSource(); + + // create hudi table + flinkQuickstart.createHudiTable(tablePath, tableName, HoodieTableType.valueOf(tableType)); + + // insert data + flinkQuickstart.insertData(); + + // query data + flinkQuickstart.queryData(); + + // update data + flinkQuickstart.updateData(); + } + + public void initEnv() { + if (this.streamTableEnv == null) { + settings = EnvironmentSettings.newInstance().build(); + TableEnvironment streamTableEnv = TableEnvironmentImpl.create(settings); + streamTableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1); + Configuration execConf = streamTableEnv.getConfig().getConfiguration(); + execConf.setString("execution.checkpointing.interval", "2s"); + // configure not to retry after failure + execConf.setString("restart-strategy", "fixed-delay"); + execConf.setString("restart-strategy.fixed-delay.attempts", "0"); + this.streamTableEnv = streamTableEnv; + } + } + + public TableEnvironment getStreamTableEnv() { + return streamTableEnv; + } + + public TableEnvironment getBatchTableEnv() { + Configuration conf = new Configuration(); + // for batch upsert use cases: current suggestion is to disable these 2 options, + // from 1.14, flink runtime execution mode has switched from streaming + // to batch for batch execution mode(before that, both streaming and batch use streaming execution mode), + // current batch 
execution mode has these limitations: + // + // 1. the keyed stream default to always sort the inputs by key; + // 2. the batch state-backend requires the inputs sort by state key + // + // For our hudi batch pipeline upsert case, we rely on the consuming sequence for index records and data records, + // the index records must be loaded first before data records for BucketAssignFunction to keep upsert semantics correct, + // so we suggest disabling these 2 options to use streaming state-backend for batch execution mode + // to keep the strategy before 1.14. + conf.setBoolean("execution.sorted-inputs.enabled", false); + conf.setBoolean("execution.batch-state-backend.enabled", false); + StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(conf); + settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment batchTableEnv = StreamTableEnvironment.create(execEnv, settings); + batchTableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1); + return batchTableEnv; + } + + public void createHudiTable(String tablePath, String tableName, + HoodieTableType tableType) { + this.tableName = tableName; + + // create hudi table + String hoodieTableDDL = sql(tableName) + .option(FlinkOptions.PATH, tablePath) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + } + + public void createFileSource() { + // create filesystem table named source + String createSource = QuickstartConfigurations.getFileSourceDDL("source"); + streamTableEnv.executeSql(createSource); + } + + @NotNull List insertData() throws InterruptedException, TableNotExistException { + // insert data + String insertInto = String.format("insert into %s select * from source", tableName); + execInsertSql(streamTableEnv, insertInto); + return queryData(); + } + + List queryData() throws InterruptedException, TableNotExistException { + // query data + // reading from the latest commit instance. + return execSelectSql(streamTableEnv, String.format("select * from %s", tableName), 10); + } + + @NotNull List updateData() throws InterruptedException, TableNotExistException { + // update data + String insertInto = String.format("insert into %s select * from source", tableName); + execInsertSql(getStreamTableEnv(), insertInto); + return queryData(); + } + + public static void execInsertSql(TableEnvironment tEnv, String insert) { + TableResult tableResult = tEnv.executeSql(insert); + // wait to finish + try { + tableResult.getJobClient().get().getJobExecutionResult().get(); + } catch (InterruptedException | ExecutionException ex) { + // ignored + } + } + + public static List execSelectSql(TableEnvironment tEnv, String select, long timeout) + throws InterruptedException, TableNotExistException { + return execSelectSql(tEnv, select, timeout, null); + } + + public static List execSelectSql(TableEnvironment tEnv, String select, long timeout, String sourceTable) + throws InterruptedException, TableNotExistException { + final String sinkDDL; + if (sourceTable != null) { + // use the source table schema as the sink schema if the source table was specified, . 
+ ObjectPath objectPath = new ObjectPath(tEnv.getCurrentDatabase(), sourceTable); + TableSchema schema = tEnv.getCatalog(tEnv.getCurrentCatalog()).get().getTable(objectPath).getSchema(); + sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink", schema); + } else { + sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink"); + } + return execSelectSql(tEnv, select, sinkDDL, timeout); + } + + public static List execSelectSql(TableEnvironment tEnv, String select, String sinkDDL, long timeout) + throws InterruptedException { + tEnv.executeSql("DROP TABLE IF EXISTS sink"); + tEnv.executeSql(sinkDDL); + TableResult tableResult = tEnv.executeSql("insert into sink " + select); + // wait for the timeout then cancels the job + TimeUnit.SECONDS.sleep(timeout); + tableResult.getJobClient().ifPresent(JobClient::cancel); + tEnv.executeSql("DROP TABLE IF EXISTS sink"); + return CollectSinkTableFactory.RESULT.values().stream() + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } +} diff --git a/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/factory/CollectSinkTableFactory.java b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/factory/CollectSinkTableFactory.java new file mode 100644 index 0000000000000..5687a7c146720 --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/factory/CollectSinkTableFactory.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.quickstart.factory; + +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.sink.SinkFunctionProvider; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.factories.DynamicTableSinkFactory; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Factory for CollectTableSink. + * + *
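+ * Illustrative usage (an assumption, mirroring {@code QuickstartConfigurations#getCollectSinkDDL}): a sink table backed by this
+ * factory can be declared as {@code create table sink(uuid varchar(20), name varchar(10), age int, ts timestamp(3),
+ * `partition` varchar(20)) with ('connector' = 'collect')}.
+ *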

    Note: The CollectTableSink collects all the data of a table into a global collection {@code RESULT}, + * so the tests should executed in single thread and the table name should be the same. + */ +public class CollectSinkTableFactory implements DynamicTableSinkFactory { + public static final String FACTORY_ID = "collect"; + + // global results to collect and query + public static final Map> RESULT = new HashMap<>(); + + @Override + public DynamicTableSink createDynamicTableSink(Context context) { + FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); + helper.validate(); + + TableSchema schema = context.getCatalogTable().getSchema(); + RESULT.clear(); + return new CollectTableSink(schema, context.getObjectIdentifier().getObjectName()); + } + + @Override + public String factoryIdentifier() { + return FACTORY_ID; + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + return Collections.emptySet(); + } + + // -------------------------------------------------------------------------------------------- + // Table sinks + // -------------------------------------------------------------------------------------------- + + /** + * Values {@link DynamicTableSink} for testing. + */ + private static class CollectTableSink implements DynamicTableSink { + + private final TableSchema schema; + private final String tableName; + + private CollectTableSink( + TableSchema schema, + String tableName) { + this.schema = schema; + this.tableName = tableName; + } + + @Override + public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { + return ChangelogMode.newBuilder() + .addContainedKind(RowKind.INSERT) + .addContainedKind(RowKind.DELETE) + .addContainedKind(RowKind.UPDATE_AFTER) + .build(); + } + + @Override + public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { + final DataType rowType = schema.toPhysicalRowDataType(); + final RowTypeInfo rowTypeInfo = (RowTypeInfo) TypeConversions.fromDataTypeToLegacyInfo(rowType); + DataStructureConverter converter = context.createDataStructureConverter(schema.toPhysicalRowDataType()); + return SinkFunctionProvider.of(new CollectSinkFunction(converter, rowTypeInfo)); + } + + @Override + public DynamicTableSink copy() { + return new CollectTableSink(schema, tableName); + } + + @Override + public String asSummaryString() { + return "CollectSink"; + } + } + + static class CollectSinkFunction extends RichSinkFunction implements CheckpointedFunction { + + private static final long serialVersionUID = 1L; + private final DynamicTableSink.DataStructureConverter converter; + private final RowTypeInfo rowTypeInfo; + + protected transient ListState resultState; + protected transient List localResult; + + private int taskID; + + protected CollectSinkFunction(DynamicTableSink.DataStructureConverter converter, RowTypeInfo rowTypeInfo) { + this.converter = converter; + this.rowTypeInfo = rowTypeInfo; + } + + @Override + public void invoke(RowData value, Context context) { + Row row = (Row) converter.toExternal(value); + assert row != null; + row.setKind(value.getRowKind()); + RESULT.get(taskID).add(row); + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + this.resultState = context.getOperatorStateStore().getListState( + new ListStateDescriptor<>("sink-results", rowTypeInfo)); + this.localResult = new ArrayList<>(); + if (context.isRestored()) { + for (Row value : 
resultState.get()) { + localResult.add(value); + } + } + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + synchronized (CollectSinkTableFactory.class) { + RESULT.put(taskID, localResult); + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + resultState.clear(); + resultState.addAll(RESULT.get(taskID)); + } + } +} diff --git a/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/factory/ContinuousFileSourceFactory.java b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/factory/ContinuousFileSourceFactory.java new file mode 100644 index 0000000000000..834fa9f252fd5 --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/factory/ContinuousFileSourceFactory.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.examples.quickstart.factory; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.factories.FactoryUtil; + +import java.util.Collections; +import java.util.Set; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.examples.quickstart.source.ContinuousFileSource; + +/** + * Factory for ContinuousFileSource. 
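+ *
+ * Illustrative DDL (an assumption, mirroring {@code QuickstartConfigurations#getFileSourceDDL}; the path is a placeholder):
+ * {@code create table source(uuid varchar(20), name varchar(10), age int, ts timestamp(3), `partition` varchar(20))
+ * with ('connector' = 'continuous-file-source', 'path' = 'file:///path/to/source-file.json', 'checkpoints' = '2')}.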
+ */ +public class ContinuousFileSourceFactory implements DynamicTableSourceFactory { + public static final String FACTORY_ID = "continuous-file-source"; + + public static final ConfigOption CHECKPOINTS = ConfigOptions + .key("checkpoints") + .intType() + .defaultValue(2) + .withDescription("Number of checkpoints to write the data set as, default 2"); + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); + helper.validate(); + + Configuration conf = (Configuration) helper.getOptions(); + Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() -> + new ValidationException("Option [path] should be not empty."))); + return new ContinuousFileSource(context.getCatalogTable().getResolvedSchema(), path, conf); + } + + @Override + public String factoryIdentifier() { + return FACTORY_ID; + } + + @Override + public Set> requiredOptions() { + return Collections.singleton(FlinkOptions.PATH); + } + + @Override + public Set> optionalOptions() { + return Collections.singleton(CHECKPOINTS); + } +} diff --git a/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/source/ContinuousFileSource.java b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/source/ContinuousFileSource.java new file mode 100644 index 0000000000000..f1696f332824f --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/source/ContinuousFileSource.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.quickstart.source; + +import org.apache.hudi.adapter.DataStreamScanProviderAdapter; + +import org.apache.flink.api.common.state.CheckpointListener; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.common.TimestampFormat; +import org.apache.flink.formats.json.JsonRowDataDeserializationSchema; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory.CHECKPOINTS; + +/** + * A continuous file source that can trigger checkpoints continuously. + * + *

    It loads the data from the specified file and splits it into the given number of checkpoint batches. + * For example, if you want 4 checkpoints and there are 8 records in the file, the emit strategy is: + * + *

    + *   | 2 records | 2 records | 2 records | 2 records |
    + *   | cp1       | cp2       | cp3       | cp4       |
    + * 
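+ *
+ * Put differently (illustrative): each batch emits dataBuffer.size() / checkpoints records, so 8 records with 'checkpoints' = 4
+ * yield 4 batches of 2 records, and the source waits for a checkpoint to complete before emitting the next batch.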
    + * + *

    If all the data are flushed out, it waits for the next checkpoint to finish and tear down the source. + */ +public class ContinuousFileSource implements ScanTableSource { + + private final ResolvedSchema tableSchema; + private final Path path; + private final Configuration conf; + + public ContinuousFileSource( + ResolvedSchema tableSchema, + Path path, + Configuration conf) { + this.tableSchema = tableSchema; + this.path = path; + this.conf = conf; + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { + return new DataStreamScanProviderAdapter() { + + @Override + public boolean isBounded() { + return false; + } + + @Override + public DataStream produceDataStream(StreamExecutionEnvironment execEnv) { + final RowType rowType = (RowType) tableSchema.toSourceRowDataType().getLogicalType(); + JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema( + rowType, + InternalTypeInfo.of(rowType), + false, + true, + TimestampFormat.ISO_8601); + + return execEnv.addSource(new BoundedSourceFunction(path, conf.getInteger(CHECKPOINTS))) + .name("continuous_file_source") + .setParallelism(1) + .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8)), + InternalTypeInfo.of(rowType)); + } + }; + } + + @Override + public ChangelogMode getChangelogMode() { + return ChangelogMode.insertOnly(); + } + + @Override + public DynamicTableSource copy() { + return new ContinuousFileSource(this.tableSchema, this.path, this.conf); + } + + @Override + public String asSummaryString() { + return "ContinuousFileSource"; + } + + /** + * Source function that partition the data into given number checkpoints batches. + */ + public static class BoundedSourceFunction implements SourceFunction, CheckpointListener { + private final Path path; + private List dataBuffer; + + private final int checkpoints; + private final AtomicInteger currentCP = new AtomicInteger(0); + + private volatile boolean isRunning = true; + + public BoundedSourceFunction(Path path, int checkpoints) { + this.path = path; + this.checkpoints = checkpoints; + } + + @Override + public void run(SourceContext context) throws Exception { + if (this.dataBuffer == null) { + loadDataBuffer(); + } + int oldCP = this.currentCP.get(); + boolean finish = false; + while (isRunning) { + int batchSize = this.dataBuffer.size() / this.checkpoints; + int start = batchSize * oldCP; + synchronized (context.getCheckpointLock()) { + for (int i = start; i < start + batchSize; i++) { + if (i >= this.dataBuffer.size()) { + finish = true; + break; + // wait for the next checkpoint and exit + } + context.collect(this.dataBuffer.get(i)); + } + } + oldCP++; + while (this.currentCP.get() < oldCP) { + synchronized (context.getCheckpointLock()) { + context.getCheckpointLock().wait(10); + } + } + if (finish || !isRunning) { + return; + } + } + } + + @Override + public void cancel() { + this.isRunning = false; + } + + private void loadDataBuffer() { + try { + this.dataBuffer = Files.readAllLines(Paths.get(this.path.toUri())); + } catch (IOException e) { + throw new RuntimeException("Read file " + this.path + " error", e); + } + } + + @Override + public void notifyCheckpointComplete(long l) { + this.currentCP.incrementAndGet(); + } + } +} diff --git a/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/utils/QuickstartConfigurations.java 
b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/utils/QuickstartConfigurations.java new file mode 100644 index 0000000000000..8dfd9df9eb479 --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/utils/QuickstartConfigurations.java @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.examples.quickstart.utils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory; +import org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory; +import org.apache.hudi.streamer.FlinkStreamerConfig; + +/** + * Configurations for the test. 
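+ *
+ * For illustration (an assumption based on {@code getCreateHoodieTableDDL}; the path is a placeholder):
+ * {@code sql("t1").option(FlinkOptions.PATH, "/tmp/t1").end()} yields a statement similar to
+ * {@code create table t1(`uuid` VARCHAR(20), `name` VARCHAR(10), `age` INT, `ts` TIMESTAMP(3), `partition` VARCHAR(10),
+ * PRIMARY KEY(uuid) NOT ENFORCED) PARTITIONED BY (`partition`) with ('connector' = 'hudi', 'path' = '/tmp/t1')}.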
+ */ +public class QuickstartConfigurations { + private QuickstartConfigurations() { + } + + public static final DataType ROW_DATA_TYPE = DataTypes.ROW( + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field + DataTypes.FIELD("partition", DataTypes.VARCHAR(10))) + .notNull(); + + public static final RowType ROW_TYPE = (RowType) ROW_DATA_TYPE.getLogicalType(); + + public static final ResolvedSchema TABLE_SCHEMA = SchemaBuilder.instance() + .fields(ROW_TYPE.getFieldNames(), ROW_DATA_TYPE.getChildren()) + .build(); + + private static final List FIELDS = ROW_TYPE.getFields().stream() + .map(RowType.RowField::asSummaryString).collect(Collectors.toList()); + + public static final DataType ROW_DATA_TYPE_WIDER = DataTypes.ROW( + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("salary", DataTypes.DOUBLE()), + DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field + DataTypes.FIELD("partition", DataTypes.VARCHAR(10))) + .notNull(); + + public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType(); + + public static String getCreateHoodieTableDDL(String tableName, Map options) { + return getCreateHoodieTableDDL(tableName, options, true, "partition"); + } + + public static String getCreateHoodieTableDDL( + String tableName, + Map options, + boolean havePartition, + String partitionField) { + return getCreateHoodieTableDDL(tableName, FIELDS, options, havePartition, "uuid", partitionField); + } + + public static String getCreateHoodieTableDDL( + String tableName, + List fields, + Map options, + boolean havePartition, + String pkField, + String partitionField) { + StringBuilder builder = new StringBuilder(); + builder.append("create table ").append(tableName).append("(\n"); + for (String field : fields) { + builder.append(" ").append(field).append(",\n"); + } + builder.append(" PRIMARY KEY(").append(pkField).append(") NOT ENFORCED\n") + .append(")\n"); + if (havePartition) { + builder.append("PARTITIONED BY (`").append(partitionField).append("`)\n"); + } + final String connector = options.computeIfAbsent("connector", k -> "hudi"); + builder.append("with (\n" + + " 'connector' = '").append(connector).append("'"); + options.forEach((k, v) -> builder.append(",\n") + .append(" '").append(k).append("' = '").append(v).append("'")); + builder.append("\n)"); + return builder.toString(); + } + + public static String getCreateHudiCatalogDDL(final String catalogName, final String catalogPath) { + StringBuilder builder = new StringBuilder(); + builder.append("create catalog ").append(catalogName).append(" with (\n"); + builder.append(" 'type' = 'hudi',\n" + + " 'catalog.path' = '").append(catalogPath).append("'"); + builder.append("\n)"); + return builder.toString(); + } + + public static String getFileSourceDDL(String tableName) { + return getFileSourceDDL(tableName, "source-file.json"); + } + + public static String getFileSourceDDL(String tableName, int checkpoints) { + return getFileSourceDDL(tableName, "source-file.json", checkpoints); + } + + public static String getFileSourceDDL(String tableName, String fileName) { + return getFileSourceDDL(tableName, fileName, 2); + } + + public static String getFileSourceDDL(String tableName, String fileName, int checkpoints) { + String 
sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource(fileName)).toString(); + return "create table " + tableName + "(\n" + + " uuid varchar(20),\n" + + " name varchar(10),\n" + + " age int,\n" + + " ts timestamp(3),\n" + + " `partition` varchar(20)\n" + + ") with (\n" + + " 'connector' = '" + ContinuousFileSourceFactory.FACTORY_ID + "',\n" + + " 'path' = '" + sourcePath + "',\n" + + " 'checkpoints' = '" + checkpoints + "'\n" + + ")"; + } + + public static String getCollectSinkDDL(String tableName) { + return "create table " + tableName + "(\n" + + " uuid varchar(20),\n" + + " name varchar(10),\n" + + " age int,\n" + + " ts timestamp(3),\n" + + " `partition` varchar(20)\n" + + ") with (\n" + + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'" + + ")"; + } + + public static String getCollectSinkDDL(String tableName, TableSchema tableSchema) { + final StringBuilder builder = new StringBuilder("create table " + tableName + "(\n"); + String[] fieldNames = tableSchema.getFieldNames(); + DataType[] fieldTypes = tableSchema.getFieldDataTypes(); + for (int i = 0; i < fieldNames.length; i++) { + builder.append(" `") + .append(fieldNames[i]) + .append("` ") + .append(fieldTypes[i].toString()); + if (i != fieldNames.length - 1) { + builder.append(","); + } + builder.append("\n"); + } + final String withProps = "" + + ") with (\n" + + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'\n" + + ")"; + builder.append(withProps); + return builder.toString(); + } + + public static String getCsvSourceDDL(String tableName, String fileName) { + String sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource(fileName)).toString(); + return "create table " + tableName + "(\n" + + " uuid varchar(20),\n" + + " name varchar(10),\n" + + " age int,\n" + + " ts timestamp(3),\n" + + " `partition` varchar(20)\n" + + ") with (\n" + + " 'connector' = 'filesystem',\n" + + " 'path' = '" + sourcePath + "',\n" + + " 'format' = 'csv'\n" + + ")"; + } + + public static final RowDataSerializer SERIALIZER = new RowDataSerializer(ROW_TYPE); + + public static Configuration getDefaultConf(String tablePath) { + Configuration conf = new Configuration(); + conf.setString(FlinkOptions.PATH, tablePath); + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH, + Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_read_schema.avsc")).toString()); + conf.setString(FlinkOptions.TABLE_NAME, "TestHoodieTable"); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition"); + return conf; + } + + public static FlinkStreamerConfig getDefaultStreamerConf(String tablePath) { + FlinkStreamerConfig streamerConf = new FlinkStreamerConfig(); + streamerConf.targetBasePath = tablePath; + streamerConf.sourceAvroSchemaPath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_read_schema.avsc")).toString(); + streamerConf.targetTableName = "TestHoodieTable"; + streamerConf.partitionPathField = "partition"; + streamerConf.tableType = "COPY_ON_WRITE"; + streamerConf.checkpointInterval = 4000L; + return streamerConf; + } + + /** + * Creates the tool to build hoodie table DDL. 
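+ *
+ * Illustrative usage (as in {@code HoodieFlinkQuickstart#createHudiTable}):
+ * {@code sql(tableName).option(FlinkOptions.PATH, tablePath).option(FlinkOptions.READ_AS_STREAMING, true)
+ * .option(FlinkOptions.TABLE_TYPE, tableType).end()} returns a CREATE TABLE statement that can be passed to
+ * {@code TableEnvironment#executeSql}.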
+ */ + public static Sql sql(String tableName) { + return new Sql(tableName); + } + + public static Catalog catalog(String catalogName) { + return new Catalog(catalogName); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + /** + * Tool to build hoodie table DDL with schema {@link #TABLE_SCHEMA}. + */ + public static class Sql { + private final Map options; + private final String tableName; + private List fields = new ArrayList<>(); + private boolean withPartition = true; + private String pkField = "uuid"; + private String partitionField = "partition"; + + public Sql(String tableName) { + options = new HashMap<>(); + this.tableName = tableName; + } + + public Sql option(ConfigOption option, Object val) { + this.options.put(option.key(), val.toString()); + return this; + } + + public Sql option(String key, Object val) { + this.options.put(key, val.toString()); + return this; + } + + public Sql options(Map options) { + this.options.putAll(options); + return this; + } + + public Sql noPartition() { + this.withPartition = false; + return this; + } + + public Sql pkField(String pkField) { + this.pkField = pkField; + return this; + } + + public Sql partitionField(String partitionField) { + this.partitionField = partitionField; + return this; + } + + public Sql field(String fieldSchema) { + fields.add(fieldSchema); + return this; + } + + public String end() { + if (this.fields.size() == 0) { + this.fields = FIELDS; + } + return QuickstartConfigurations.getCreateHoodieTableDDL(this.tableName, this.fields, options, + this.withPartition, this.pkField, this.partitionField); + } + } + + public static class Catalog { + private final String catalogName; + private String catalogPath = "."; + + public Catalog(String catalogName) { + this.catalogName = catalogName; + } + + public Catalog catalogPath(String catalogPath) { + this.catalogPath = catalogPath; + return this; + } + + public String end() { + return QuickstartConfigurations.getCreateHudiCatalogDDL(catalogName, catalogPath); + } + } +} diff --git a/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/utils/SchemaBuilder.java b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/utils/SchemaBuilder.java new file mode 100644 index 0000000000000..76306f780646d --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/java/org/apache/hudi/examples/quickstart/utils/SchemaBuilder.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.quickstart.utils; + +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.flink.table.catalog.WatermarkSpec; +import org.apache.flink.table.types.DataType; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Builder for {@link ResolvedSchema}. + */ +public class SchemaBuilder { + private List columns; + private List watermarkSpecs; + private UniqueConstraint constraint; + + public static SchemaBuilder instance() { + return new SchemaBuilder(); + } + + private SchemaBuilder() { + this.columns = new ArrayList<>(); + this.watermarkSpecs = new ArrayList<>(); + } + + public SchemaBuilder field(String name, DataType type) { + this.columns.add(Column.physical(name, type)); + return this; + } + + public SchemaBuilder fields(List names, List types) { + List columns = IntStream.range(0, names.size()) + .mapToObj(idx -> Column.physical(names.get(idx), types.get(idx))) + .collect(Collectors.toList()); + this.columns.addAll(columns); + return this; + } + + public SchemaBuilder primaryKey(String... columns) { + this.constraint = UniqueConstraint.primaryKey("pk", Arrays.asList(columns)); + return this; + } + + public ResolvedSchema build() { + return new ResolvedSchema(columns, watermarkSpecs, constraint); + } +} diff --git a/hudi-examples/hudi-examples-flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/hudi-examples/hudi-examples-flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory new file mode 100644 index 0000000000000..27a137292b388 --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory +org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory diff --git a/hudi-examples/hudi-examples-flink/src/main/resources/source-file.json b/hudi-examples/hudi-examples-flink/src/main/resources/source-file.json new file mode 100644 index 0000000000000..2f628e29c535b --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/main/resources/source-file.json @@ -0,0 +1,8 @@ +{"uuid": "id1", "name": "Danny", "age": 23, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id2", "name": "Stephen", "age": 33, "ts": "1970-01-01T00:00:02", "partition": "par1"} +{"uuid": "id3", "name": "Julian", "age": 53, "ts": "1970-01-01T00:00:03", "partition": "par2"} +{"uuid": "id4", "name": "Fabian", "age": 31, "ts": "1970-01-01T00:00:04", "partition": "par2"} +{"uuid": "id5", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"} +{"uuid": "id6", "name": "Emma", "age": 20, "ts": "1970-01-01T00:00:06", "partition": "par3"} +{"uuid": "id7", "name": "Bob", "age": 44, "ts": "1970-01-01T00:00:07", "partition": "par4"} +{"uuid": "id8", "name": "Han", "age": 56, "ts": "1970-01-01T00:00:08", "partition": "par4"} diff --git a/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestHoodieFlinkQuickstart.java b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestHoodieFlinkQuickstart.java new file mode 100644 index 0000000000000..368f7f372cfe7 --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestHoodieFlinkQuickstart.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.examples.quickstart; + +import org.apache.flink.test.util.AbstractTestBase; +import org.apache.flink.types.Row; +import org.apache.hudi.common.model.HoodieTableType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.io.File; +import java.util.List; + +import static org.apache.hudi.examples.quickstart.TestQuickstartData.assertRowsEquals; + +/** + * IT cases for Hoodie table source and sink. 
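+ *
+ * Illustrative invocation (an assumption, not part of the original patch): once {@code @Disabled} is removed, the cases can
+ * typically be run with something like {@code mvn test -pl hudi-examples/hudi-examples-flink -am -Dtest=TestHoodieFlinkQuickstart}.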
+ */ +public class TestHoodieFlinkQuickstart extends AbstractTestBase { + private final HoodieFlinkQuickstart flinkQuickstart = HoodieFlinkQuickstart.instance(); + + @BeforeEach + void beforeEach() { + flinkQuickstart.initEnv(); + } + + @TempDir + File tempFile; + + @Disabled + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testHoodieFlinkQuickstart(HoodieTableType tableType) throws Exception { + // create filesystem table named source + flinkQuickstart.createFileSource(); + + // create hudi table + flinkQuickstart.createHudiTable(tempFile.getAbsolutePath(), "t1", tableType); + + // insert data + List rows = flinkQuickstart.insertData(); + assertRowsEquals(rows, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT); + + // query data + List rows1 = flinkQuickstart.queryData(); + assertRowsEquals(rows1, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT); + + // update data + List rows2 = flinkQuickstart.updateData(); + assertRowsEquals(rows2, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT); + } +} diff --git a/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java new file mode 100644 index 0000000000000..67691a3ec7bd1 --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java @@ -0,0 +1,423 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.quickstart; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.binary.BinaryRowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.data.writer.BinaryRowWriter; +import org.apache.flink.table.data.writer.BinaryWriter; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations; +import org.apache.parquet.Strings; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.hadoop.ParquetReader; + +import java.io.File; +import java.io.FileFilter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static junit.framework.TestCase.assertEquals; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +/** + * Data set for testing, also some utilities to check the results. + */ +public class TestQuickstartData { + + public static List DATA_SET_INSERT_DUPLICATES = new ArrayList<>(); + + static { + IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_DUPLICATES.add( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")))); + } + + public static List DATA_SET_INSERT_SAME_KEY = new ArrayList<>(); + + static { + IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_SAME_KEY.add( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(i), StringData.fromString("par1")))); + } + + // data set of source-file.json latest commit. 
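+  // Illustrative note (not in the original file): with the file source's default 'checkpoints' = '2', the 8 records in
+  // source-file.json are emitted as two batches of 4, so the latest commit is expected to hold only id5..id8 below.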
+ public static List DATA_SET_SOURCE_INSERT_LATEST_COMMIT = Arrays.asList( + insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5000), StringData.fromString("par3")), + insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20, + TimestampData.fromEpochMillis(6000), StringData.fromString("par3")), + insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44, + TimestampData.fromEpochMillis(7000), StringData.fromString("par4")), + insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56, + TimestampData.fromEpochMillis(8000), StringData.fromString("par4")) + ); + + public static List DATA_SET_DISORDER_UPDATE_DELETE = Arrays.asList( + // DISORDER UPDATE + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21, + TimestampData.fromEpochMillis(3), StringData.fromString("par1")), + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22, + TimestampData.fromEpochMillis(4), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21, + TimestampData.fromEpochMillis(3), StringData.fromString("par1")), + // DISORDER DELETE + deleteRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")) + ); + + public static List dataSetInsert(int... ids) { + List inserts = new ArrayList<>(); + Arrays.stream(ids).forEach(i -> inserts.add( + insertRow(StringData.fromString("id" + i), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(i), StringData.fromString("par1")))); + return inserts; + } + + private static Integer toIdSafely(Object id) { + if (id == null) { + return -1; + } + final String idStr = id.toString(); + if (idStr.startsWith("id")) { + return Integer.parseInt(idStr.substring(2)); + } + return -1; + } + + /** + * Returns string format of a list of RowData. + */ + public static String rowDataToString(List rows) { + DataStructureConverter converter = + DataStructureConverters.getConverter(QuickstartConfigurations.ROW_DATA_TYPE); + return rows.stream() + .sorted(Comparator.comparing(o -> toIdSafely(o.getString(0)))) + .map(row -> converter.toExternal(row).toString()) + .collect(Collectors.toList()).toString(); + } + + private static String toStringSafely(Object obj) { + return obj == null ? "null" : obj.toString(); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected string {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + */ + public static void assertRowsEquals(List rows, String expected) { + assertRowsEquals(rows, expected, false); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected string {@code expected}. 
+ * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + * @param withChangeFlag Whether compares with change flags + */ + public static void assertRowsEquals(List rows, String expected, boolean withChangeFlag) { + String rowsString = rows.stream() + .sorted(Comparator.comparing(o -> toStringSafely(o.getField(0)))) + .map(row -> { + final String rowStr = row.toString(); + if (withChangeFlag) { + return row.getKind().shortString() + "(" + rowStr + ")"; + } else { + return rowStr; + } + }) + .collect(Collectors.toList()).toString(); + assertThat(rowsString, is(expected)); + } + + /** + * Sort the {@code rows} using field at index {@code orderingPos} and asserts + * it equals with the expected string {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + * @param orderingPos Field position for ordering + */ + public static void assertRowsEquals(List rows, String expected, int orderingPos) { + String rowsString = rows.stream() + .sorted(Comparator.comparing(o -> toStringSafely(o.getField(orderingPos)))) + .collect(Collectors.toList()).toString(); + assertThat(rowsString, is(expected)); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected row data list {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected row data list + */ + public static void assertRowsEquals(List rows, List expected) { + String rowsString = rows.stream() + .sorted(Comparator.comparing(o -> toIdSafely(o.getField(0)))) + .collect(Collectors.toList()).toString(); + assertThat(rowsString, is(rowDataToString(expected))); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected string {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + */ + public static void assertRowDataEquals(List rows, String expected) { + String rowsString = rowDataToString(rows); + assertThat(rowsString, is(expected)); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected row data list {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected row data list + */ + public static void assertRowDataEquals(List rows, List expected) { + String rowsString = rowDataToString(rows); + assertThat(rowsString, is(rowDataToString(expected))); + } + + /** + * Checks the source data set are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * and value should be values list with the key partition + */ + public static void checkWrittenData(File baseFile, Map expected) throws IOException { + checkWrittenData(baseFile, expected, 4); + } + + /** + * Checks the source data set are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * and value should be values list with the key partition + * @param partitions The expected partition number + */ + public static void checkWrittenData( + File baseFile, + Map expected, + int partitions) throws IOException { + assert baseFile.isDirectory(); + FileFilter filter = file -> !file.getName().startsWith("."); + File[] partitionDirs = baseFile.listFiles(filter); + assertNotNull(partitionDirs); + assertThat(partitionDirs.length, is(partitions)); + for (File partitionDir : partitionDirs) { + File[] dataFiles = partitionDir.listFiles(filter); + assertNotNull(dataFiles); + File latestDataFile = Arrays.stream(dataFiles) + .max(Comparator.comparing(f -> FSUtils.getCommitTime(f.getName()))) + .orElse(dataFiles[0]); + ParquetReader reader = AvroParquetReader + .builder(new Path(latestDataFile.getAbsolutePath())).build(); + List readBuffer = new ArrayList<>(); + GenericRecord nextRecord = reader.read(); + while (nextRecord != null) { + readBuffer.add(filterOutVariables(nextRecord)); + nextRecord = reader.read(); + } + readBuffer.sort(Comparator.naturalOrder()); + assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName()))); + } + } + + /** + * Checks the MERGE_ON_READ source data are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param fs The file system + * @param latestInstant The latest committed instant of current table + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * @param partitions The expected partition number + * @param schema The read schema + */ + public static void checkWrittenDataMOR( + FileSystem fs, + String latestInstant, + File baseFile, + Map expected, + int partitions, + Schema schema) { + assert baseFile.isDirectory() : "Base path should be a directory"; + FileFilter partitionFilter = file -> !file.getName().startsWith("."); + File[] partitionDirs = baseFile.listFiles(partitionFilter); + assertNotNull(partitionDirs); + assertThat(partitionDirs.length, is(partitions)); + for (File partitionDir : partitionDirs) { + File[] dataFiles = partitionDir.listFiles(file -> + file.getName().contains(".log.") && !file.getName().startsWith("..")); + assertNotNull(dataFiles); + HoodieMergedLogRecordScanner scanner = getScanner( + fs, baseFile.getPath(), Arrays.stream(dataFiles).map(File::getAbsolutePath) + .sorted(Comparator.naturalOrder()).collect(Collectors.toList()), + schema, latestInstant); + List readBuffer = scanner.getRecords().values().stream() + .map(hoodieRecord -> { + try { + // in case it is a delete + GenericRecord record = (GenericRecord) hoodieRecord.getData() + .getInsertValue(schema, new Properties()) + .orElse(null); + return record == null ? (String) null : filterOutVariables(record); + } catch (IOException e) { + throw new RuntimeException(e); + } + }) + .filter(Objects::nonNull) + .sorted(Comparator.naturalOrder()) + .collect(Collectors.toList()); + assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName()))); + } + } + + /** + * Returns the scanner to read avro log files. + */ + private static HoodieMergedLogRecordScanner getScanner( + FileSystem fs, + String basePath, + List logPaths, + Schema readSchema, + String instant) { + return HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(basePath) + .withLogFilePaths(logPaths) + .withReaderSchema(readSchema) + .withLatestInstantTime(instant) + .withReadBlocksLazily(false) + .withReverseReader(false) + .withBufferSize(16 * 1024 * 1024) + .withMaxMemorySizeInBytes(1024 * 1024L) + .withSpillableMapBasePath("/tmp/") + .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .build(); + } + + /** + * Filter out the variables like file name. + */ + private static String filterOutVariables(GenericRecord genericRecord) { + List fields = new ArrayList<>(); + fields.add(genericRecord.get("_hoodie_record_key").toString()); + fields.add(genericRecord.get("_hoodie_partition_path").toString()); + fields.add(genericRecord.get("uuid").toString()); + fields.add(genericRecord.get("name").toString()); + fields.add(genericRecord.get("age").toString()); + fields.add(genericRecord.get("ts").toString()); + fields.add(genericRecord.get("partition").toString()); + return Strings.join(fields, ","); + } + + public static BinaryRowData insertRow(Object... fields) { + return insertRow(QuickstartConfigurations.ROW_TYPE, fields); + } + + public static BinaryRowData insertRow(RowType rowType, Object... 
fields) { + LogicalType[] types = rowType.getFields().stream().map(RowType.RowField::getType) + .toArray(LogicalType[]::new); + assertEquals( + "Filed count inconsistent with type information", + fields.length, + types.length); + BinaryRowData row = new BinaryRowData(fields.length); + BinaryRowWriter writer = new BinaryRowWriter(row); + writer.reset(); + for (int i = 0; i < fields.length; i++) { + Object field = fields[i]; + if (field == null) { + writer.setNullAt(i); + } else { + BinaryWriter.write(writer, i, field, types[i], InternalSerializers.create(types[i])); + } + } + writer.complete(); + return row; + } + + private static BinaryRowData deleteRow(Object... fields) { + BinaryRowData rowData = insertRow(fields); + rowData.setRowKind(RowKind.DELETE); + return rowData; + } + + private static BinaryRowData updateBeforeRow(Object... fields) { + BinaryRowData rowData = insertRow(fields); + rowData.setRowKind(RowKind.UPDATE_BEFORE); + return rowData; + } + + private static BinaryRowData updateAfterRow(Object... fields) { + BinaryRowData rowData = insertRow(fields); + rowData.setRowKind(RowKind.UPDATE_AFTER); + return rowData; + } +} diff --git a/hudi-examples/hudi-examples-flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/hudi-examples/hudi-examples-flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory new file mode 100644 index 0000000000000..27a137292b388 --- /dev/null +++ b/hudi-examples/hudi-examples-flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
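+# Flink SQL table factories used by the quickstart tests; they are discovered through Java's ServiceLoader.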
+ +org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory +org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory diff --git a/hudi-examples/hudi-examples-java/pom.xml b/hudi-examples/hudi-examples-java/pom.xml new file mode 100644 index 0000000000000..50fcd08118d44 --- /dev/null +++ b/hudi-examples/hudi-examples-java/pom.xml @@ -0,0 +1,192 @@ + + + + + hudi-examples + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-examples-java + + + ${project.parent.basedir} + true + + + + + + src/main/resources + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + prepare-package + + copy-dependencies + + + ${project.build.directory}/lib + true + true + true + + + + + + net.alchim31.maven + scala-maven-plugin + + + scala-compile-first + process-resources + + add-source + compile + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + compile + + compile + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + + + org.apache.hudi + hudi-examples-common + ${project.version} + + + + org.apache.hudi + hudi-client-common + ${project.version} + + + + org.apache.hudi + hudi-java-client + ${project.version} + + + + org.xerial.snappy + snappy-java + provided + + + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + org.xerial.snappy + snappy-java + + + + + org.apache.parquet + parquet-avro + ${parquet.version} + provided + + + diff --git a/hudi-examples/hudi-examples-java/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java b/hudi-examples/hudi-examples-java/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java new file mode 100644 index 0000000000000..6e20ee1190661 --- /dev/null +++ b/hudi-examples/hudi-examples-java/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.java; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.examples.common.HoodieExampleDataGenerator; +import org.apache.hudi.index.HoodieIndex; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + + +/** + * Simple examples of #{@link HoodieJavaWriteClient}. + * + * Usage: HoodieJavaWriteClientExample + * and describe root path of hudi and table name + * for example, `HoodieJavaWriteClientExample file:///tmp/hoodie/sample-table hoodie_rt` + */ +public class HoodieJavaWriteClientExample { + + private static final Logger LOG = LogManager.getLogger(HoodieJavaWriteClientExample.class); + + private static String tableType = HoodieTableType.COPY_ON_WRITE.name(); + + public static void main(String[] args) throws Exception { + if (args.length < 2) { + System.err.println("Usage: HoodieJavaWriteClientExample "); + System.exit(1); + } + String tablePath = args[0]; + String tableName = args[1]; + + // Generator of some records to be loaded in. 
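+    // The generated trip records follow HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA, the same schema configured on the write client below.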
+ HoodieExampleDataGenerator dataGen = new HoodieExampleDataGenerator<>(); + + Configuration hadoopConf = new Configuration(); + // initialize the table, if not done already + Path path = new Path(tablePath); + FileSystem fs = FSUtils.getFs(tablePath, hadoopConf); + if (!fs.exists(path)) { + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(tableName) + .setPayloadClassName(HoodieAvroPayload.class.getName()) + .initTable(hadoopConf, tablePath); + } + + // Create the write client to write some records in + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) + .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withDeleteParallelism(2).forTable(tableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(20, 30).build()).build(); + HoodieJavaWriteClient client = + new HoodieJavaWriteClient<>(new HoodieJavaEngineContext(hadoopConf), cfg); + + // inserts + String newCommitTime = client.startCommit(); + LOG.info("Starting commit " + newCommitTime); + + List> records = dataGen.generateInserts(newCommitTime, 10); + List> recordsSoFar = new ArrayList<>(records); + List> writeRecords = + recordsSoFar.stream().map(r -> new HoodieAvroRecord(r)).collect(Collectors.toList()); + client.insert(writeRecords, newCommitTime); + + // updates + newCommitTime = client.startCommit(); + LOG.info("Starting commit " + newCommitTime); + List> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2); + records.addAll(toBeUpdated); + recordsSoFar.addAll(toBeUpdated); + writeRecords = + recordsSoFar.stream().map(r -> new HoodieAvroRecord(r)).collect(Collectors.toList()); + client.upsert(writeRecords, newCommitTime); + + // Delete + newCommitTime = client.startCommit(); + LOG.info("Starting commit " + newCommitTime); + // just delete half of the records + int numToDelete = recordsSoFar.size() / 2; + List toBeDeleted = + recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList()); + client.delete(toBeDeleted, newCommitTime); + + client.close(); + } +} diff --git a/hudi-examples/hudi-examples-spark/pom.xml b/hudi-examples/hudi-examples-spark/pom.xml new file mode 100644 index 0000000000000..42da0da222c39 --- /dev/null +++ b/hudi-examples/hudi-examples-spark/pom.xml @@ -0,0 +1,304 @@ + + + + + hudi-examples + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-examples-spark + + + ${project.parent.basedir} + true + + + + + + src/main/resources + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + prepare-package + + copy-dependencies + + + ${project.build.directory}/lib + true + true + true + + + + + + net.alchim31.maven + scala-maven-plugin + + + scala-compile-first + process-resources + + add-source + compile + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + compile + + compile + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + + + + + + org.scala-lang + scala-library + ${scala.version} + + + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + + + + + org.apache.logging.log4j + log4j-1.2-api + + + + + org.apache.hudi + hudi-examples-common + ${project.version} + + + * + * + + + + + + org.apache.hudi + hudi-client-common + ${project.version} + + + + org.apache.hudi + hudi-java-client + 
${project.version} + + + + org.apache.hudi + hudi-spark-client + ${project.version} + + + + org.apache.hudi + hudi-utilities_${scala.binary.version} + ${project.version} + + + + org.apache.hudi + hudi-spark_${scala.binary.version} + ${project.version} + + + + org.apache.hudi + hudi-spark-common_${scala.binary.version} + ${project.version} + + + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + + + + org.apache.hudi + hudi-timeline-service + ${project.version} + + + + + org.apache.spark + spark-core_${scala.binary.version} + + + org.apache.spark + spark-sql_${scala.binary.version} + + + + + org.apache.hadoop + hadoop-auth + + + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + + + org.apache.avro + avro + + + + org.apache.parquet + parquet-avro + + + + + ${hive.groupid} + hive-common + + + ${hive.groupid} + hive-exec + ${hive.version} + provided + ${hive.exec.classifier} + + + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + + + + + org.junit.platform + junit-platform-launcher + ${junit.platform.version} + test + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + org.apache.hudi + hudi-client-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-spark-client + ${project.version} + tests + test-jar + test + + + diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/common/ExampleDataSchemaProvider.java b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/ExampleDataSchemaProvider.java similarity index 99% rename from hudi-examples/src/main/java/org/apache/hudi/examples/common/ExampleDataSchemaProvider.java rename to hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/ExampleDataSchemaProvider.java index 4486a4286c43f..c974d9ad73313 100644 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/common/ExampleDataSchemaProvider.java +++ b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/ExampleDataSchemaProvider.java @@ -23,7 +23,6 @@ import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.spark.api.java.JavaSparkContext; - /** * the example SchemaProvider of example json data from uber. 
*/ diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleSparkUtils.java b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/HoodieExampleSparkUtils.java similarity index 95% rename from hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleSparkUtils.java rename to hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/HoodieExampleSparkUtils.java index 3e7b0e837aa25..fcdc2a813ab66 100644 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleSparkUtils.java +++ b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/HoodieExampleSparkUtils.java @@ -33,6 +33,7 @@ private static Map defaultConf() { Map additionalConfigs = new HashMap<>(); additionalConfigs.put("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); additionalConfigs.put("spark.kryoserializer.buffer.max", "512m"); + additionalConfigs.put("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"); return additionalConfigs; } diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/common/IdentityTransformer.java b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/IdentityTransformer.java similarity index 100% rename from hudi-examples/src/main/java/org/apache/hudi/examples/common/IdentityTransformer.java rename to hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/IdentityTransformer.java diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/common/RandomJsonSource.java b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/RandomJsonSource.java similarity index 100% rename from hudi-examples/src/main/java/org/apache/hudi/examples/common/RandomJsonSource.java rename to hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/common/RandomJsonSource.java diff --git a/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/quickstart/HoodieSparkQuickstart.java b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/quickstart/HoodieSparkQuickstart.java new file mode 100644 index 0000000000000..9c6293fe4471e --- /dev/null +++ b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/quickstart/HoodieSparkQuickstart.java @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.quickstart; + +import org.apache.hudi.QuickstartUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.examples.common.HoodieExampleDataGenerator; +import org.apache.hudi.examples.common.HoodieExampleSparkUtils; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +import java.util.List; + +import static org.apache.hudi.config.HoodieWriteConfig.TBL_NAME; +import static org.apache.spark.sql.SaveMode.Append; +import static org.apache.spark.sql.SaveMode.Overwrite; + +public final class HoodieSparkQuickstart { + + private HoodieSparkQuickstart() { + } + + public static void main(String[] args) { + if (args.length < 2) { + System.err.println("Usage: HoodieWriteClientExample "); + System.exit(1); + } + String tablePath = args[0]; + String tableName = args[1]; + + SparkSession spark = HoodieExampleSparkUtils.defaultSparkSession("Hudi Spark basic example"); + SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example"); + + try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) { + runQuickstart(jsc, spark, tableName, tablePath); + } + } + + /** + * Visible for testing + */ + public static void runQuickstart(JavaSparkContext jsc, SparkSession spark, String tableName, String tablePath) { + final HoodieExampleDataGenerator dataGen = new HoodieExampleDataGenerator<>(); + + String snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table"; + + Dataset insertDf = insertData(spark, jsc, tablePath, tableName, dataGen); + queryData(spark, jsc, tablePath, tableName, dataGen); + assert insertDf.except(spark.sql(snapshotQuery)).count() == 0; + + Dataset snapshotBeforeUpdate = spark.sql(snapshotQuery); + Dataset updateDf = updateData(spark, jsc, tablePath, tableName, dataGen); + queryData(spark, jsc, tablePath, tableName, dataGen); + Dataset snapshotAfterUpdate = spark.sql(snapshotQuery); + assert snapshotAfterUpdate.intersect(updateDf).count() == updateDf.count(); + assert snapshotAfterUpdate.except(updateDf).except(snapshotBeforeUpdate).count() == 0; + + incrementalQuery(spark, tablePath, tableName); + pointInTimeQuery(spark, tablePath, tableName); + + Dataset snapshotBeforeDelete = snapshotAfterUpdate; + Dataset deleteDf = delete(spark, tablePath, tableName); + queryData(spark, jsc, tablePath, tableName, dataGen); + Dataset snapshotAfterDelete = spark.sql(snapshotQuery); + assert snapshotAfterDelete.intersect(deleteDf).count() == 0; + assert snapshotBeforeDelete.except(deleteDf).except(snapshotAfterDelete).count() == 0; + + Dataset snapshotBeforeOverwrite = snapshotAfterDelete; + Dataset overwriteDf = insertOverwriteData(spark, jsc, tablePath, tableName, dataGen); + queryData(spark, jsc, tablePath, tableName, dataGen); + Dataset withoutThirdPartitionDf = snapshotBeforeOverwrite.filter("partitionpath != '" + HoodieExampleDataGenerator.DEFAULT_THIRD_PARTITION_PATH + "'"); + Dataset expectedDf = withoutThirdPartitionDf.union(overwriteDf); + Dataset snapshotAfterOverwrite = spark.sql(snapshotQuery); + assert snapshotAfterOverwrite.except(expectedDf).count() == 0; + + + 
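+    // Finally, delete an entire partition and verify that only rows from that partition are removed.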
Dataset snapshotBeforeDeleteByPartition = snapshotAfterOverwrite; + deleteByPartition(spark, tablePath, tableName); + queryData(spark, jsc, tablePath, tableName, dataGen); + Dataset snapshotAfterDeleteByPartition = spark.sql(snapshotQuery); + assert snapshotAfterDeleteByPartition.intersect(snapshotBeforeDeleteByPartition.filter("partitionpath == '" + HoodieExampleDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "'")).count() == 0; + assert snapshotAfterDeleteByPartition.count() == snapshotBeforeDeleteByPartition.filter("partitionpath != '" + HoodieExampleDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "'").count(); + } + + /** + * Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below. + */ + public static Dataset insertData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName, + HoodieExampleDataGenerator dataGen) { + String commitTime = Long.toString(System.currentTimeMillis()); + List inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20)); + Dataset df = spark.read().json(jsc.parallelize(inserts, 1)); + + df.write().format("org.apache.hudi") + .options(QuickstartUtils.getQuickstartWriteConfigs()) + .option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts") + .option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid") + .option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath") + .option(TBL_NAME.key(), tableName) + .mode(Overwrite) + .save(tablePath); + return df; + } + + /** + * Generate new records, load them into a {@link Dataset} and insert-overwrite it into the Hudi dataset + */ + public static Dataset insertOverwriteData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName, + HoodieExampleDataGenerator dataGen) { + String commitTime = Long.toString(System.currentTimeMillis()); + List inserts = dataGen.convertToStringList(dataGen.generateInsertsOnPartition(commitTime, 20, HoodieExampleDataGenerator.DEFAULT_THIRD_PARTITION_PATH)); + Dataset df = spark.read().json(jsc.parallelize(inserts, 1)); + + df.write().format("org.apache.hudi") + .options(QuickstartUtils.getQuickstartWriteConfigs()) + .option("hoodie.datasource.write.operation", WriteOperationType.INSERT_OVERWRITE.name()) + .option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts") + .option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid") + .option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath") + .option(TBL_NAME.key(), tableName) + .mode(Append) + .save(tablePath); + return df; + } + + /** + * Load the data files into a DataFrame. + */ + public static void queryData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName, + HoodieExampleDataGenerator dataGen) { + Dataset roViewDF = spark + .read() + .format("org.apache.hudi") + .load(tablePath + "/*/*/*/*"); + + roViewDF.createOrReplaceTempView("hudi_ro_table"); + + spark.sql("select fare, begin_lon, begin_lat, ts from hudi_ro_table where fare > 20.0").show(); + // +-----------------+-------------------+-------------------+---+ + // | fare| begin_lon| begin_lat| ts| + // +-----------------+-------------------+-------------------+---+ + // |98.88075495133515|0.39556048623031603|0.17851135255091155|0.0| + // ... 
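+    // Also surface Hudi's metadata columns (_hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path) alongside the data columns.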
+    spark.sql(
+        "select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_ro_table")
+        .show();
+    // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+
+    // |_hoodie_commit_time| _hoodie_record_key|_hoodie_partition_path| rider| driver| fare|
+    // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+
+    // | 20191231181501|31cafb9f-0196-4b1...| 2020/01/02|rider-1577787297889|driver-1577787297889| 98.88075495133515|
+    // ...
+  }
+
+  /**
+   * This is similar to inserting new data. Generate updates to existing trips using the data generator,
+   * load them into a DataFrame and write the DataFrame into the Hudi dataset.
+   */
+  public static Dataset<Row> updateData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName,
+                                        HoodieExampleDataGenerator<HoodieAvroPayload> dataGen) {
+
+    String commitTime = Long.toString(System.currentTimeMillis());
+    List<String> updates = dataGen.convertToStringList(dataGen.generateUniqueUpdates(commitTime));
+    Dataset<Row> df = spark.read().json(jsc.parallelize(updates, 1));
+    df.write().format("org.apache.hudi")
+        .options(QuickstartUtils.getQuickstartWriteConfigs())
+        .option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts")
+        .option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid")
+        .option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath")
+        .option(TBL_NAME.key(), tableName)
+        .mode(Append)
+        .save(tablePath);
+    return df;
+  }
+
+  /**
+   * Delete data based on the record keys of previously written rows.
+   */
+  public static Dataset<Row> delete(SparkSession spark, String tablePath, String tableName) {
+
+    Dataset<Row> roViewDF = spark.read().format("org.apache.hudi").load(tablePath + "/*/*/*/*");
+    roViewDF.createOrReplaceTempView("hudi_ro_table");
+    Dataset<Row> toBeDeletedDf = spark.sql("SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_ro_table limit 2");
+    Dataset<Row> df = toBeDeletedDf.select("uuid", "partitionpath", "ts");
+
+    df.write().format("org.apache.hudi")
+        .options(QuickstartUtils.getQuickstartWriteConfigs())
+        .option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts")
+        .option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid")
+        .option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath")
+        .option(TBL_NAME.key(), tableName)
+        .option("hoodie.datasource.write.operation", WriteOperationType.DELETE.value())
+        .mode(Append)
+        .save(tablePath);
+    return toBeDeletedDf;
+  }
+
+  /**
+   * Delete the data of the first partition.
+   */
+  public static void deleteByPartition(SparkSession spark, String tablePath, String tableName) {
+    Dataset<Row> df = spark.emptyDataFrame();
+    df.write().format("org.apache.hudi")
+        .options(QuickstartUtils.getQuickstartWriteConfigs())
+        .option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts")
+        .option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid")
+        .option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath")
+        .option(TBL_NAME.key(), tableName)
+        .option("hoodie.datasource.write.operation", WriteOperationType.DELETE_PARTITION.value())
+        .option("hoodie.datasource.write.partitions.to.delete", HoodieExampleDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
+        .mode(Append)
+        .save(tablePath);
+  }
+
+  /**
+   * Hudi also provides the capability to obtain a stream of records that changed since a given commit timestamp.
+ * This can be achieved using Hudi’s incremental view and providing a begin time from which changes need to be streamed. + * We do not need to specify endTime, if we want all changes after the given commit (as is the common case). + */ + public static void incrementalQuery(SparkSession spark, String tablePath, String tableName) { + List commits = + spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime") + .toJavaRDD() + .map((Function) row -> row.getString(0)) + .take(50); + + String beginTime = commits.get(commits.size() - 1); // commit time we are interested in + + // incrementally query data + Dataset incViewDF = spark + .read() + .format("org.apache.hudi") + .option("hoodie.datasource.query.type", "incremental") + .option("hoodie.datasource.read.begin.instanttime", beginTime) + .load(tablePath); + + incViewDF.createOrReplaceTempView("hudi_incr_table"); + spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0") + .show(); + } + + /** + * Lets look at how to query data as of a specific time. + * The specific time can be represented by pointing endTime to a specific commit time + * and beginTime to “000” (denoting earliest possible commit time). + */ + public static void pointInTimeQuery(SparkSession spark, String tablePath, String tableName) { + List commits = + spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime") + .toJavaRDD() + .map((Function) row -> row.getString(0)) + .take(50); + String beginTime = "000"; // Represents all commits > this time. + String endTime = commits.get(commits.size() - 1); // commit time we are interested in + + //incrementally query data + Dataset incViewDF = spark.read().format("org.apache.hudi") + .option("hoodie.datasource.query.type", "incremental") + .option("hoodie.datasource.read.begin.instanttime", beginTime) + .option("hoodie.datasource.read.end.instanttime", endTime) + .load(tablePath); + + incViewDF.createOrReplaceTempView("hudi_incr_table"); + spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0") + .show(); + } +} diff --git a/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/spark/HoodieSparkBootstrapExample.java b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/spark/HoodieSparkBootstrapExample.java new file mode 100644 index 0000000000000..518f095993bc7 --- /dev/null +++ b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/spark/HoodieSparkBootstrapExample.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.examples.spark; + +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.config.HoodieBootstrapConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.examples.common.HoodieExampleSparkUtils; +import org.apache.hudi.keygen.NonpartitionedKeyGenerator; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.Dataset; + +public class HoodieSparkBootstrapExample { + + private static String tableType = HoodieTableType.MERGE_ON_READ.name(); + + public static void main(String[] args) throws Exception { + if (args.length < 5) { + System.err.println("Usage: HoodieSparkBootstrapExample "); + System.exit(1); + } + String recordKey = args[0]; + String tableName = args[1]; + String partitionPath = args[2]; + String preCombineField = args[3]; + String basePath = args[4]; + + SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example"); + + SparkSession spark = SparkSession + .builder() + .appName("Java Spark SQL basic example") + .config("spark.some.config.option", "some-value") + .enableHiveSupport() + .getOrCreate(); + + Dataset df = spark.emptyDataFrame(); + + df.write().format("hudi").option(HoodieWriteConfig.TBL_NAME.key(), tableName) + .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL()) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey) + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), partitionPath) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), preCombineField) + .option(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieFileFormat.ORC.name()) + .option(HoodieBootstrapConfig.BASE_PATH.key(), basePath) + .option(HoodieBootstrapConfig.KEYGEN_CLASS_NAME.key(), NonpartitionedKeyGenerator.class.getCanonicalName()) + .mode(SaveMode.Overwrite).save("/hudi/" + tableName); + + df.count(); + } +} diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java similarity index 78% rename from hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java rename to hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java index b606c527b0306..299fe992fa00a 100644 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java +++ b/hudi-examples/hudi-examples-spark/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java @@ -27,8 +27,9 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieArchivalConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.examples.common.HoodieExampleDataGenerator; @@ -37,6 +38,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import 
org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.SparkConf; @@ -47,7 +50,6 @@ import java.util.List; import java.util.stream.Collectors; - /** * Simple examples of #{@link SparkRDDWriteClient}. * @@ -85,8 +87,11 @@ public static void main(String[] args) throws Exception { Path path = new Path(tablePath); FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration()); if (!fs.exists(path)) { - HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), tablePath, HoodieTableType.valueOf(tableType), - tableName, HoodieAvroPayload.class.getName()); + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setTableName(tableName) + .setPayloadClass(HoodieAvroPayload.class) + .initTable(jsc.hadoopConfiguration(), tablePath); } // Create the write client to write some records in @@ -94,7 +99,7 @@ public static void main(String[] args) throws Exception { .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withDeleteParallelism(2).forTable(tableName) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build()).build(); + .withArchivalConfig(HoodieArchivalConfig.newBuilder().archiveCommitsWith(20, 30).build()).build(); SparkRDDWriteClient client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg); // inserts @@ -104,7 +109,7 @@ public static void main(String[] args) throws Exception { List> records = dataGen.generateInserts(newCommitTime, 10); List> recordsSoFar = new ArrayList<>(records); JavaRDD> writeRecords = jsc.parallelize(records, 1); - client.upsert(writeRecords, newCommitTime); + client.insert(writeRecords, newCommitTime); // updates newCommitTime = client.startCommit(); @@ -124,11 +129,21 @@ public static void main(String[] args) throws Exception { JavaRDD deleteRecords = jsc.parallelize(toBeDeleted, 1); client.delete(deleteRecords, newCommitTime); + // Delete by partition + newCommitTime = client.startCommit(); + client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION); + LOG.info("Starting commit " + newCommitTime); + // The partition where the data needs to be deleted + List partitionList = toBeDeleted.stream().map(s -> s.getPartitionPath()).distinct().collect(Collectors.toList()); + List deleteList = recordsSoFar.stream().filter(f -> !partitionList.contains(f.getPartitionPath())) + .map(m -> m.getKey().getPartitionPath()).distinct().collect(Collectors.toList()); + client.deletePartitions(deleteList, newCommitTime); + // compaction if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) { Option instant = client.scheduleCompaction(Option.empty()); - JavaRDD writeStatues = client.compact(instant.get()); - client.commitCompaction(instant.get(), writeStatues, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client.compact(instant.get()); + client.commitCompaction(instant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty()); } } diff --git a/hudi-examples/src/main/resources/delta-streamer-config/dfs/source-file.json b/hudi-examples/hudi-examples-spark/src/main/resources/delta-streamer-config/dfs/source-file.json similarity index 100% rename from hudi-examples/src/main/resources/delta-streamer-config/dfs/source-file.json rename to hudi-examples/hudi-examples-spark/src/main/resources/delta-streamer-config/dfs/source-file.json 
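The HoodieWriteClientExample hunk above switches compaction over to the HoodieWriteMetadata-based API. As a standalone illustration, here is a minimal sketch of that schedule/compact/commit flow; the wrapper method and the generic parameters are reconstructed (the patch text drops angle-bracketed types), so treat the exact signatures as assumptions that may vary across Hudi versions.

    import org.apache.hudi.client.SparkRDDWriteClient;
    import org.apache.hudi.client.WriteStatus;
    import org.apache.hudi.common.model.HoodieAvroPayload;
    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.table.action.HoodieWriteMetadata;
    import org.apache.spark.api.java.JavaRDD;

    final class CompactionFlowSketch {
      // Schedule one compaction, run it, and commit it on a MERGE_ON_READ table.
      // Assumes `client` is already configured, as in the example above.
      static void compactOnce(SparkRDDWriteClient<HoodieAvroPayload> client) {
        Option<String> instant = client.scheduleCompaction(Option.empty());
        if (instant.isPresent()) {
          // compact() returns HoodieWriteMetadata instead of a bare RDD of write statuses
          HoodieWriteMetadata<JavaRDD<WriteStatus>> metadata = client.compact(instant.get());
          // commit using the commit metadata produced by the compaction
          client.commitCompaction(instant.get(), metadata.getCommitMetadata().get(), Option.empty());
        }
      }
    }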
diff --git a/hudi-examples/src/main/resources/delta-streamer-config/kafka/kafka-source.properties b/hudi-examples/hudi-examples-spark/src/main/resources/delta-streamer-config/kafka/kafka-source.properties similarity index 100% rename from hudi-examples/src/main/resources/delta-streamer-config/kafka/kafka-source.properties rename to hudi-examples/hudi-examples-spark/src/main/resources/delta-streamer-config/kafka/kafka-source.properties diff --git a/hudi-examples/hudi-examples-spark/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala b/hudi-examples/hudi-examples-spark/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala new file mode 100644 index 0000000000000..33c085cba3eb6 --- /dev/null +++ b/hudi-examples/hudi-examples-spark/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.examples.spark + +import org.apache.hudi.DataSourceReadOptions.{BEGIN_INSTANTTIME, END_INSTANTTIME, QUERY_TYPE, QUERY_TYPE_INCREMENTAL_OPT_VAL} +import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, PARTITIONS_TO_DELETE, OPERATION, DELETE_PARTITION_OPERATION_OPT_VAL, DELETE_OPERATION_OPT_VAL} +import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs +import org.apache.hudi.common.model.HoodieAvroPayload +import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME +import org.apache.hudi.examples.common.{HoodieExampleDataGenerator, HoodieExampleSparkUtils} +import org.apache.spark.sql.SaveMode.{Append, Overwrite} +import org.apache.spark.sql.SparkSession + +import scala.collection.JavaConversions._ + +/** + * Simple examples of [[org.apache.hudi.DefaultSource]] + * + * To run this example, you should + * 1. For running in IDE, set VM options `-Dspark.master=local[2]` + * 2. For running in shell, using `spark-submit` + * + * Usage: HoodieWriteClientExample . 
+ * and describe root path of hudi and table name + * for example, `HoodieDataSourceExample file:///tmp/hoodie/hudi_cow_table hudi_cow_table` + */ +object HoodieDataSourceExample { + + def main(args: Array[String]): Unit = { + + if (args.length < 2) { + System.err.println("Usage: HoodieDataSourceExample ") + System.exit(1) + } + val tablePath = args(0) + val tableName = args(1) + + val spark = HoodieExampleSparkUtils.defaultSparkSession("Hudi Spark basic example") + + val dataGen = new HoodieExampleDataGenerator[HoodieAvroPayload] + insertData(spark, tablePath, tableName, dataGen) + updateData(spark, tablePath, tableName, dataGen) + queryData(spark, tablePath, tableName, dataGen) + + incrementalQuery(spark, tablePath, tableName) + pointInTimeQuery(spark, tablePath, tableName) + + delete(spark, tablePath, tableName) + deleteByPartition(spark, tablePath, tableName) + + spark.stop() + } + + /** + * Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below. + */ + def insertData(spark: SparkSession, tablePath: String, tableName: String, dataGen: HoodieExampleDataGenerator[HoodieAvroPayload]): Unit = { + + val commitTime: String = System.currentTimeMillis().toString + val inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20)) + val df = spark.read.json(spark.sparkContext.parallelize(inserts, 1)) + df.write.format("org.apache.hudi"). + options(getQuickstartWriteConfigs). + option(PRECOMBINE_FIELD.key, "ts"). + option(RECORDKEY_FIELD.key, "uuid"). + option(PARTITIONPATH_FIELD.key, "partitionpath"). + option(TBL_NAME.key, tableName). + mode(Overwrite). + save(tablePath) + } + + /** + * Load the data files into a DataFrame. + */ + def queryData(spark: SparkSession, tablePath: String, tableName: String, dataGen: HoodieExampleDataGenerator[HoodieAvroPayload]): Unit = { + val roViewDF = spark. + read. + format("org.apache.hudi"). + load(tablePath + "/*/*/*/*") + + roViewDF.createOrReplaceTempView("hudi_ro_table") + + spark.sql("select fare, begin_lon, begin_lat, ts from hudi_ro_table where fare > 20.0").show() + // +-----------------+-------------------+-------------------+---+ + // | fare| begin_lon| begin_lat| ts| + // +-----------------+-------------------+-------------------+---+ + // |98.88075495133515|0.39556048623031603|0.17851135255091155|0.0| + // ... + + spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_ro_table").show() + // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+ + // |_hoodie_commit_time| _hoodie_record_key|_hoodie_partition_path| rider| driver| fare| + // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+ + // | 20191231181501|31cafb9f-0196-4b1...| 2020/01/02|rider-1577787297889|driver-1577787297889| 98.88075495133515| + // ... + } + + /** + * This is similar to inserting new data. Generate updates to existing trips using the data generator, + * load into a DataFrame and write DataFrame into the hudi dataset. 
+   */
+  def updateData(spark: SparkSession, tablePath: String, tableName: String, dataGen: HoodieExampleDataGenerator[HoodieAvroPayload]): Unit = {
+
+    val commitTime: String = System.currentTimeMillis().toString
+    val updates = dataGen.convertToStringList(dataGen.generateUpdates(commitTime, 10))
+    val df = spark.read.json(spark.sparkContext.parallelize(updates, 1))
+    df.write.format("org.apache.hudi").
+      options(getQuickstartWriteConfigs).
+      option(PRECOMBINE_FIELD.key, "ts").
+      option(RECORDKEY_FIELD.key, "uuid").
+      option(PARTITIONPATH_FIELD.key, "partitionpath").
+      option(TBL_NAME.key, tableName).
+      mode(Append).
+      save(tablePath)
+  }
+
+  /**
+   * Delete data based on the record keys of previously written rows.
+   */
+  def delete(spark: SparkSession, tablePath: String, tableName: String): Unit = {
+
+    val roViewDF = spark.read.format("org.apache.hudi").load(tablePath + "/*/*/*/*")
+    roViewDF.createOrReplaceTempView("hudi_ro_table")
+    val df = spark.sql("select uuid, partitionpath, ts from hudi_ro_table limit 2")
+
+    df.write.format("org.apache.hudi").
+      options(getQuickstartWriteConfigs).
+      option(PRECOMBINE_FIELD.key, "ts").
+      option(RECORDKEY_FIELD.key, "uuid").
+      option(PARTITIONPATH_FIELD.key, "partitionpath").
+      option(TBL_NAME.key, tableName).
+      option(OPERATION.key, DELETE_OPERATION_OPT_VAL).
+      mode(Append).
+      save(tablePath)
+  }
+
+  /**
+   * Delete the data of a single or multiple partitions.
+   */
+  def deleteByPartition(spark: SparkSession, tablePath: String, tableName: String): Unit = {
+    val df = spark.emptyDataFrame
+    df.write.format("org.apache.hudi").
+      options(getQuickstartWriteConfigs).
+      option(PRECOMBINE_FIELD.key, "ts").
+      option(RECORDKEY_FIELD.key, "uuid").
+      option(PARTITIONPATH_FIELD.key, "partitionpath").
+      option(TBL_NAME.key, tableName).
+      option(OPERATION.key, DELETE_PARTITION_OPERATION_OPT_VAL).
+      option(PARTITIONS_TO_DELETE.key(), HoodieExampleDataGenerator.DEFAULT_PARTITION_PATHS.mkString(",")).
+      mode(Append).
+      save(tablePath)
+  }
+
+  /**
+   * Hudi also provides the capability to obtain a stream of records that changed since a given commit timestamp.
+   * This can be achieved using Hudi’s incremental view and providing a begin time from which changes need to be streamed.
+   * We do not need to specify endTime if we want all changes after the given commit (which is the common case).
+   */
+  def incrementalQuery(spark: SparkSession, tablePath: String, tableName: String): Unit = {
+    import spark.implicits._
+    val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50)
+    val beginTime = commits(commits.length - 2) // commit time we are interested in
+
+    // incrementally query data
+    val incViewDF = spark.
+      read.
+      format("org.apache.hudi").
+      option(QUERY_TYPE.key, QUERY_TYPE_INCREMENTAL_OPT_VAL).
+      option(BEGIN_INSTANTTIME.key, beginTime).
+      load(tablePath)
+    incViewDF.createOrReplaceTempView("hudi_incr_table")
+    spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0").show()
+  }
+
+  /**
+   * Let's look at how to query data as of a specific time.
+   * The specific time can be represented by pointing endTime to a specific commit time
+   * and beginTime to “000” (denoting the earliest possible commit time).
+ */ + def pointInTimeQuery(spark: SparkSession, tablePath: String, tableName: String): Unit = { + import spark.implicits._ + val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50) + val beginTime = "000" // Represents all commits > this time. + val endTime = commits(commits.length - 2) // commit time we are interested in + + //incrementally query data + val incViewDF = spark.read.format("org.apache.hudi"). + option(QUERY_TYPE.key, QUERY_TYPE_INCREMENTAL_OPT_VAL). + option(BEGIN_INSTANTTIME.key, beginTime). + option(END_INSTANTTIME.key, endTime). + load(tablePath) + incViewDF.createOrReplaceTempView("hudi_incr_table") + spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0").show() + } +} diff --git a/hudi-examples/hudi-examples-spark/src/main/scala/org/apache/hudi/examples/spark/HoodieMorCompactionJob.scala b/hudi-examples/hudi-examples-spark/src/main/scala/org/apache/hudi/examples/spark/HoodieMorCompactionJob.scala new file mode 100644 index 0000000000000..8a2c8715b30eb --- /dev/null +++ b/hudi-examples/hudi-examples-spark/src/main/scala/org/apache/hudi/examples/spark/HoodieMorCompactionJob.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.examples.spark + +import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, TABLE_TYPE} +import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.model.{HoodieAvroPayload, HoodieRecordPayload, HoodieTableType} +import org.apache.hudi.common.util.Option +import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME +import org.apache.hudi.config.{HoodieCompactionConfig, HoodieWriteConfig} +import org.apache.hudi.examples.common.{HoodieExampleDataGenerator, HoodieExampleSparkUtils} +import org.apache.spark.sql.SaveMode.{Append, Overwrite} +import org.apache.spark.sql.SparkSession + +import scala.collection.JavaConverters._ + +/** + * Simple example to run a compaction job for MOR table. + * To run this example, you should: + * 1. For running in IDE, set VM options `-Dspark.master=local[2]` + * 2. For running in shell, using `spark-submit` + * + * Usage: HoodieMorCompactionJob . 
+ * and describe root path of hudi and table name + * for example, `HoodieMorCompactionJob file:///tmp/hoodie/hudi_mor_table hudi_mor_table` + */ +object HoodieMorCompactionJob { + + def main(args: Array[String]): Unit = { + if (args.length < 2) { + System.err.println("Usage: HoodieMorCompactionJob ") + System.exit(1) + } + + val spark = HoodieExampleSparkUtils.defaultSparkSession("Hudi MOR table compaction via Spark example") + val dataGen = new HoodieExampleDataGenerator[HoodieAvroPayload] + val tablePath = args(0) + val tableName = args(1) + + insertData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name()) + updateData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name()) + val cfg = HoodieWriteConfig.newBuilder() + .withPath(tablePath) + .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2) + .forTable(tableName) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withInlineCompaction(true) + .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .build() + val client = new SparkRDDWriteClient[HoodieRecordPayload[Nothing]](new HoodieSparkEngineContext(spark.sparkContext), cfg) + try { + val instant = client.scheduleCompaction(Option.empty()) + client.compact(instant.get()) + client.clean() + } catch { + case e: Exception => System.err.println(s"Compaction failed due to", e) + } finally { + client.close() + spark.stop() + } + } + + def insertData(spark: SparkSession, tablePath: String, tableName: String, + dataGen: HoodieExampleDataGenerator[HoodieAvroPayload], tableType: String): Unit = { + val commitTime: String = System.currentTimeMillis().toString + val inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20)) + val df = spark.read.json(spark.sparkContext.parallelize(inserts.asScala, 1)) + df.write.format("org.apache.hudi"). + options(getQuickstartWriteConfigs). + option(PRECOMBINE_FIELD.key, "ts"). + option(RECORDKEY_FIELD.key, "uuid"). + option(PARTITIONPATH_FIELD.key, "partitionpath"). + option(TBL_NAME.key, tableName). + option(TABLE_TYPE.key, tableType). + mode(Overwrite). + save(tablePath) + } + + def updateData(spark: SparkSession, tablePath: String, tableName: String, + dataGen: HoodieExampleDataGenerator[HoodieAvroPayload], tableType: String): Unit = { + val commitTime: String = System.currentTimeMillis().toString + val updates = dataGen.convertToStringList(dataGen.generateUpdates(commitTime, 10)) + val df = spark.read.json(spark.sparkContext.parallelize(updates.asScala, 1)) + df.write.format("org.apache.hudi"). + options(getQuickstartWriteConfigs). + option(PRECOMBINE_FIELD.key, "ts"). + option(RECORDKEY_FIELD.key, "uuid"). + option(PARTITIONPATH_FIELD.key, "partitionpath"). + option(TBL_NAME.key, tableName). + option(TABLE_TYPE.key, tableType). + mode(Append). + save(tablePath) + } +} diff --git a/hudi-examples/hudi-examples-spark/src/test/java/org/apache/hudi/examples/quickstart/TestHoodieSparkQuickstart.java b/hudi-examples/hudi-examples-spark/src/test/java/org/apache/hudi/examples/quickstart/TestHoodieSparkQuickstart.java new file mode 100644 index 0000000000000..c23db7f8e7106 --- /dev/null +++ b/hudi-examples/hudi-examples-spark/src/test/java/org/apache/hudi/examples/quickstart/TestHoodieSparkQuickstart.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.examples.quickstart; + +import org.apache.hudi.client.SparkRDDReadClient; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.examples.common.HoodieExampleDataGenerator; +import org.apache.hudi.testutils.providers.SparkProvider; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.Utils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.nio.file.Paths; + +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.delete; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.deleteByPartition; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.incrementalQuery; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.insertData; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.insertOverwriteData; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.pointInTimeQuery; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.queryData; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.runQuickstart; +import static org.apache.hudi.examples.quickstart.HoodieSparkQuickstart.updateData; + +public class TestHoodieSparkQuickstart implements SparkProvider { + protected static HoodieSparkEngineContext context; + + private static SparkSession spark; + private static SQLContext sqlContext; + private static JavaSparkContext jsc; + + /** + * An indicator of the initialization status. 
+ */ + protected boolean initialized = false; + @TempDir + protected java.nio.file.Path tempDir; + + @Override + public SparkSession spark() { + return spark; + } + + @Override + public SQLContext sqlContext() { + return sqlContext; + } + + @Override + public JavaSparkContext jsc() { + return jsc; + } + + @Override + public HoodieSparkEngineContext context() { + return context; + } + + public String basePath() { + return tempDir.toAbsolutePath().toString(); + } + + public String tablePath(String tableName) { + return Paths.get(basePath(), tableName).toString(); + } + + @BeforeEach + public synchronized void runBeforeEach() { + initialized = spark != null; + if (!initialized) { + SparkConf sparkConf = conf(); + SparkRDDWriteClient.registerClasses(sparkConf); + SparkRDDReadClient.addHoodieSupport(sparkConf); + spark = SparkSession.builder().config(sparkConf).getOrCreate(); + sqlContext = spark.sqlContext(); + jsc = new JavaSparkContext(spark.sparkContext()); + context = new HoodieSparkEngineContext(jsc); + } + } + + @Test + public void testHoodieSparkQuickstart() { + String tableName = "spark_quick_start"; + String tablePath = tablePath(tableName); + + try { + runQuickstart(jsc, spark, tableName, tablePath); + } finally { + Utils.deleteRecursively(new File(tablePath)); + } + } +} diff --git a/hudi-examples/hudi-examples-spark/src/test/python/HoodiePySparkQuickstart.py b/hudi-examples/hudi-examples-spark/src/test/python/HoodiePySparkQuickstart.py new file mode 100644 index 0000000000000..c3be6a176c9b7 --- /dev/null +++ b/hudi-examples/hudi-examples-spark/src/test/python/HoodiePySparkQuickstart.py @@ -0,0 +1,266 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
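# A minimal sketch (not from the patch itself) of the upsert + snapshot-read
# pattern the quickstart script below builds on, shown in isolation because the
# full script exercises many operations at once. It assumes an existing
# SparkSession `spark` with a Hudi bundle on the classpath and a DataFrame `df`
# containing `uuid`, `partitionpath`, and `ts` columns; the function and
# argument names here are illustrative placeholders, while the option keys are
# the same Hudi datasource options used throughout the quickstart.
def _minimal_upsert_sketch(spark, df, table_name, base_path):
    hudi_options = {
        'hoodie.table.name': table_name,
        'hoodie.datasource.write.recordkey.field': 'uuid',
        'hoodie.datasource.write.partitionpath.field': 'partitionpath',
        'hoodie.datasource.write.precombine.field': 'ts',
        'hoodie.datasource.write.operation': 'upsert',
    }
    # mode("append") upserts into an existing table; mode("overwrite") (re)creates it.
    df.write.format("hudi").options(**hudi_options).mode("append").save(base_path)
    # Snapshot query: read back the latest view of the table that was just written.
    return spark.read.format("hudi").load(base_path)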
+ + +import sys +import os +from pyspark import sql +import random +from pyspark.sql.functions import lit +from functools import reduce +import tempfile +import argparse + + +class ExamplePySpark: + def __init__(self, spark: sql.SparkSession, tableName: str, basePath: str): + self.spark = spark + self.tableName = tableName + self.basePath = basePath + "/" + tableName + self.hudi_options = { + 'hoodie.table.name': tableName, + 'hoodie.datasource.write.recordkey.field': 'uuid', + 'hoodie.datasource.write.partitionpath.field': 'partitionpath', + 'hoodie.datasource.write.table.name': tableName, + 'hoodie.datasource.write.operation': 'upsert', + 'hoodie.datasource.write.precombine.field': 'ts', + 'hoodie.upsert.shuffle.parallelism': 2, + 'hoodie.insert.shuffle.parallelism': 2 + } + + self.dataGen = spark._jvm.org.apache.hudi.QuickstartUtils.DataGenerator() + self.snapshotQuery = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM hudi_trips_snapshot" + return + + def runQuickstart(self): + + def snap(): + return self.spark.sql(self.snapshotQuery) + insertDf = self.insertData() + self.queryData() + assert len(insertDf.exceptAll(snap()).collect()) == 0 + + snapshotBeforeUpdate = snap() + updateDf = self.updateData() + self.queryData() + assert len(snap().intersect(updateDf).collect()) == len(updateDf.collect()) + assert len(snap().exceptAll(updateDf).exceptAll(snapshotBeforeUpdate).collect()) == 0 + + + self.timeTravelQuery() + self.incrementalQuery() + self.pointInTimeQuery() + + self.softDeletes() + self.queryData() + + snapshotBeforeDelete = snap() + deletesDf = self.hardDeletes() + self.queryData() + assert len(snap().select(["uuid", "partitionpath", "ts"]).intersect(deletesDf).collect()) == 0 + assert len(snapshotBeforeDelete.exceptAll(snap()).exceptAll(snapshotBeforeDelete).collect()) == 0 + + snapshotBeforeInsertOverwrite = snap() + insertOverwriteDf = self.insertOverwrite() + self.queryData() + withoutSanFran = snapshotBeforeInsertOverwrite.filter("partitionpath != 'americas/united_states/san_francisco'") + expectedDf = withoutSanFran.union(insertOverwriteDf) + assert len(snap().exceptAll(expectedDf).collect()) == 0 + return + + def insertData(self): + print("Insert Data") + inserts = self.spark._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(self.dataGen.generateInserts(10)) + df = self.spark.read.json(self.spark.sparkContext.parallelize(inserts, 2)) + df.write.format("hudi").options(**self.hudi_options).mode("overwrite").save(self.basePath) + return df + + def updateData(self): + print("Update Data") + updates = self.spark._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(self.dataGen.generateUniqueUpdates(5)) + df = self.spark.read.json(spark.sparkContext.parallelize(updates, 2)) + df.write.format("hudi").options(**self.hudi_options).mode("append").save(self.basePath) + return df + + def queryData(self): + print("Query Data") + tripsSnapshotDF = self.spark.read.format("hudi").load(self.basePath) + tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot") + self.spark.sql("SELECT fare, begin_lon, begin_lat, ts FROM hudi_trips_snapshot WHERE fare > 20.0").show() + self.spark.sql("SELECT _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare FROM hudi_trips_snapshot").show() + return + + def timeTravelQuery(self): + query = "SELECT begin_lat, begin_lon, driver, end_lat, end_lon, fare, partitionpath, rider, ts, uuid FROM time_travel_query" + print("Time Travel Query") + 
self.spark.read.format("hudi").option("as.of.instant", "20210728141108").load(self.basePath).createOrReplaceTempView("time_travel_query") + self.spark.sql(query) + self.spark.read.format("hudi").option("as.of.instant", "2021-07-28 14:11:08.000").load(self.basePath).createOrReplaceTempView("time_travel_query") + self.spark.sql(query) + self.spark.read.format("hudi").option("as.of.instant", "2021-07-28").load(self.basePath).createOrReplaceTempView("time_travel_query") + self.spark.sql(query) + return + + def incrementalQuery(self): + print("Incremental Query") + self.spark.read.format("hudi").load(self.basePath).createOrReplaceTempView("hudi_trips_snapshot") + self.commits = list(map(lambda row: row[0], self.spark.sql("SELECT DISTINCT(_hoodie_commit_time) AS commitTime FROM hudi_trips_snapshot ORDER BY commitTime").limit(50).collect())) + beginTime = self.commits[len(self.commits) - 2] + incremental_read_options = { + 'hoodie.datasource.query.type': 'incremental', + 'hoodie.datasource.read.begin.instanttime': beginTime, + } + tripsIncrementalDF = self.spark.read.format("hudi").options(**incremental_read_options).load(self.basePath) + tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental") + self.spark.sql("SELECT `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts FROM hudi_trips_incremental WHERE fare > 20.0").show() + + def pointInTimeQuery(self): + print("Point-in-time Query") + beginTime = "000" + endTime = self.commits[len(self.commits) - 2] + point_in_time_read_options = { + 'hoodie.datasource.query.type': 'incremental', + 'hoodie.datasource.read.end.instanttime': endTime, + 'hoodie.datasource.read.begin.instanttime': beginTime + } + + tripsPointInTimeDF = self.spark.read.format("hudi").options(**point_in_time_read_options).load(self.basePath) + tripsPointInTimeDF.createOrReplaceTempView("hudi_trips_point_in_time") + self.spark.sql("SELECT `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts FROM hudi_trips_point_in_time WHERE fare > 20.0").show() + + def softDeletes(self): + print("Soft Deletes") + spark.read.format("hudi").load(self.basePath).createOrReplaceTempView("hudi_trips_snapshot") + + # fetch total records count + trip_count = spark.sql("SELECT uuid, partitionpath FROM hudi_trips_snapshot").count() + non_null_rider_count = spark.sql("SELECT uuid, partitionpath FROM hudi_trips_snapshot WHERE rider IS NOT null").count() + print(f"trip count: {trip_count}, non null rider count: {non_null_rider_count}") + # fetch two records for soft deletes + soft_delete_ds = spark.sql("SELECT * FROM hudi_trips_snapshot").limit(2) + # prepare the soft deletes by ensuring the appropriate fields are nullified + meta_columns = ["_hoodie_commit_time", "_hoodie_commit_seqno", "_hoodie_record_key", + "_hoodie_partition_path", "_hoodie_file_name"] + excluded_columns = meta_columns + ["ts", "uuid", "partitionpath"] + nullify_columns = list(filter(lambda field: field[0] not in excluded_columns, \ + list(map(lambda field: (field.name, field.dataType), soft_delete_ds.schema.fields)))) + + hudi_soft_delete_options = { + 'hoodie.table.name': self.tableName, + 'hoodie.datasource.write.recordkey.field': 'uuid', + 'hoodie.datasource.write.partitionpath.field': 'partitionpath', + 'hoodie.datasource.write.table.name': self.tableName, + 'hoodie.datasource.write.operation': 'upsert', + 'hoodie.datasource.write.precombine.field': 'ts', + 'hoodie.upsert.shuffle.parallelism': 2, + 'hoodie.insert.shuffle.parallelism': 2 + } + + soft_delete_df = reduce(lambda df,col: df.withColumn(col[0], 
lit(None).cast(col[1])), \ + nullify_columns, reduce(lambda df,col: df.drop(col[0]), meta_columns, soft_delete_ds)) + + # simply upsert the table after setting these fields to null + soft_delete_df.write.format("hudi").options(**hudi_soft_delete_options).mode("append").save(self.basePath) + + # reload data + self.spark.read.format("hudi").load(self.basePath).createOrReplaceTempView("hudi_trips_snapshot") + + # This should return the same total count as before + trip_count = self.spark.sql("SELECT uuid, partitionpath FROM hudi_trips_snapshot").count() + # This should return (total - 2) count as two records are updated with nulls + non_null_rider_count = self.spark.sql("SELECT uuid, partitionpath FROM hudi_trips_snapshot WHERE rider IS NOT null").count() + print(f"trip count: {trip_count}, non null rider count: {non_null_rider_count}") + + def hardDeletes(self): + print("Hard Deletes") + # fetch total records count + total_count = self.spark.sql("SELECT uuid, partitionpath FROM hudi_trips_snapshot").count() + print(f"total count: {total_count}") + # fetch two records to be deleted + ds = self.spark.sql("SELECT uuid, partitionpath FROM hudi_trips_snapshot").limit(2) + + # issue deletes + hudi_hard_delete_options = { + 'hoodie.table.name': self.tableName, + 'hoodie.datasource.write.recordkey.field': 'uuid', + 'hoodie.datasource.write.partitionpath.field': 'partitionpath', + 'hoodie.datasource.write.table.name': self.tableName, + 'hoodie.datasource.write.operation': 'delete', + 'hoodie.datasource.write.precombine.field': 'ts', + 'hoodie.upsert.shuffle.parallelism': 2, + 'hoodie.insert.shuffle.parallelism': 2 + } + + deletes = list(map(lambda row: (row[0], row[1]), ds.collect())) + hard_delete_df = self.spark.sparkContext.parallelize(deletes).toDF(['uuid', 'partitionpath']).withColumn('ts', lit(0.0)) + hard_delete_df.write.format("hudi").options(**hudi_hard_delete_options).mode("append").save(self.basePath) + + # run the same read query as above. 
+ roAfterDeleteViewDF = self.spark.read.format("hudi").load(self.basePath)
+ roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")
+ # fetch should return (total - 2) records
+ total_count = self.spark.sql("SELECT uuid, partitionpath FROM hudi_trips_snapshot").count()
+ print(f"total count: {total_count}")
+ return hard_delete_df
+
+ def insertOverwrite(self):
+ print("Insert Overwrite")
+ self.spark.read.format("hudi").load(self.basePath).select(["uuid","partitionpath"]).sort(["partitionpath", "uuid"]).show(n=100,truncate=False)
+ inserts = self.spark._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(self.dataGen.generateInserts(10))
+ df = self.spark.read.json(self.spark.sparkContext.parallelize(inserts, 2)).filter("partitionpath = 'americas/united_states/san_francisco'")
+ hudi_insert_overwrite_options = {
+ 'hoodie.table.name': self.tableName,
+ 'hoodie.datasource.write.recordkey.field': 'uuid',
+ 'hoodie.datasource.write.partitionpath.field': 'partitionpath',
+ 'hoodie.datasource.write.table.name': self.tableName,
+ 'hoodie.datasource.write.operation': 'insert_overwrite',
+ 'hoodie.datasource.write.precombine.field': 'ts',
+ 'hoodie.upsert.shuffle.parallelism': 2,
+ 'hoodie.insert.shuffle.parallelism': 2
+ }
+ df.write.format("hudi").options(**hudi_insert_overwrite_options).mode("append").save(self.basePath)
+ self.spark.read.format("hudi").load(self.basePath).select(["uuid","partitionpath"]).sort(["partitionpath", "uuid"]).show(n=100,truncate=False)
+ return df
+
+if __name__ == "__main__":
+ random.seed(46474747)
+ parser = argparse.ArgumentParser(description="Examples of various operations to perform on Hudi with PySpark",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument("-t", "--table", action="store", required=True, help="the name of the table to create")
+ group = parser.add_mutually_exclusive_group(required=True)
+ group.add_argument("-p", "--package", action="store", help="the name of the hudi-spark-bundle package\n eg. \"org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.0\"")
+ group.add_argument("-j", "--jar", action="store", help="the full path to hudi-spark-bundle .jar file\n eg. \"[HUDI_BASE_PATH]/packaging/hudi-spark-bundle/target/hudi-spark-bundle[VERSION].jar\"")
+ args = vars(parser.parse_args())
+ package = args["package"]
+ jar = args["jar"]
+ if package is not None:
+ os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {package} pyspark-shell"
+ elif jar is not None:
+ os.environ["PYSPARK_SUBMIT_ARGS"] = f"--jars {jar} pyspark-shell"
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ spark = sql.SparkSession \
+ .builder \
+ .appName("Hudi Spark basic example") \
+ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
+ .config("spark.kryoserializer.buffer.max", "512m") \
+ .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
+ .getOrCreate()
+ ps = ExamplePySpark(spark,args["table"],tmpdirname)
+ ps.runQuickstart()
+
+
+
+
diff --git a/hudi-examples/hudi-examples-spark/src/test/python/README.md b/hudi-examples/hudi-examples-spark/src/test/python/README.md
new file mode 100644
index 0000000000000..71382fd979d67
--- /dev/null
+++ b/hudi-examples/hudi-examples-spark/src/test/python/README.md
@@ -0,0 +1,42 @@
+
+# Requirements
+Python is required to run this.
Pyspark 2.4.7 does not work with the latest versions of python (python 3.8+) so if you want to use a later version (in the example below 3.3) you can build Hudi by using the command: +```bash +cd $HUDI_DIR +mvn clean install -DskipTests -Dspark3.3 -Dscala2.12 +``` +Various python packages may also need to be installed so you should get pip and then use **pip install \** to get them +# How to Run +1. [Download pyspark](https://spark.apache.org/downloads) +2. Extract it where you want it to be installed and note that location +3. Run(or add to .bashrc) the following and make sure that you put in the correct path for SPARK_HOME +```bash +export SPARK_HOME=/path/to/spark/home +export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin +export PYSPARK_SUBMIT_ARGS="--master local[*]" +export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH +export PYTHONPATH=$SPARK_HOME/python/lib/*.zip:$PYTHONPATH +``` +4. Identify the Hudi Spark Bundle .jar or package that you wish to use: +A package will be in the format **org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.0** +A jar will be in the format **\[HUDI_BASE_PATH\]/packaging/hudi-spark-bundle/target/hudi-spark-bundle\[VERSION\].jar** +5. Go to the hudi directory and run the quickstart examples using the commands below, using the -t flag for the table name and the -p flag or -j flag for your package or jar respectively. +```bash +cd $HUDI_DIR +python3 hudi-examples/hudi-examples-spark/src/test/python/HoodiePySparkQuickstart.py [-h] -t TABLE (-p PACKAGE | -j JAR) +``` \ No newline at end of file diff --git a/hudi-examples/pom.xml b/hudi-examples/pom.xml index ba132904e19f1..97750899b2526 100644 --- a/hudi-examples/pom.xml +++ b/hudi-examples/pom.xml @@ -20,186 +20,18 @@ hudi org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 hudi-examples - jar + pom - - ${project.parent.basedir} - true - + + hudi-examples-common + hudi-examples-spark + hudi-examples-flink + hudi-examples-java + - - - - src/main/resources - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - prepare-package - - copy-dependencies - - - ${project.build.directory}/lib - true - true - true - - - - - - net.alchim31.maven - scala-maven-plugin - - - scala-compile-first - process-resources - - add-source - compile - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - - compile - - compile - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - test-jar - - test-compile - - - - false - - - - org.apache.rat - apache-rat-plugin - - - - - - - - org.scala-lang - scala-library - ${scala.version} - - - - org.apache.hudi - hudi-common - ${project.version} - - - - org.apache.hudi - hudi-cli - ${project.version} - - - - org.apache.hudi - hudi-client-common - ${project.version} - - - - org.apache.hudi - hudi-spark-client - ${project.version} - - - - org.apache.hudi - hudi-utilities_${scala.binary.version} - ${project.version} - - - - org.apache.hudi - hudi-spark_${scala.binary.version} - ${project.version} - - - - org.apache.hudi - hudi-hadoop-mr - ${project.version} - - - - org.apache.hudi - hudi-timeline-service - ${project.version} - - - - - org.apache.spark - spark-core_${scala.binary.version} - - - org.apache.spark - spark-sql_${scala.binary.version} - - - org.apache.spark - spark-avro_${scala.binary.version} - - - - - org.apache.parquet - parquet-hadoop - ${parquet.version} - - - - - org.apache.avro - avro - - - - org.apache.parquet - parquet-avro - - - - - ${hive.groupid} - hive-common - - - diff --git 
a/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java b/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java deleted file mode 100644 index 4a9868bd39fea..0000000000000 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.examples.common; - -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.model.HoodieAvroPayload; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.UUID; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; - - -/** - * Class to be used to generate test data. 
- */ -public class HoodieExampleDataGenerator> { - - public static final String DEFAULT_FIRST_PARTITION_PATH = "2020/01/01"; - public static final String DEFAULT_SECOND_PARTITION_PATH = "2020/01/02"; - public static final String DEFAULT_THIRD_PARTITION_PATH = "2020/01/03"; - - public static final String[] DEFAULT_PARTITION_PATHS = - {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH}; - public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"triprec\",\"fields\": [ " - + "{\"name\": \"ts\",\"type\": \"double\"},{\"name\": \"uuid\", \"type\": \"string\"}," - + "{\"name\": \"rider\", \"type\": \"string\"},{\"name\": \"driver\", \"type\": \"string\"}," - + "{\"name\": \"begin_lat\", \"type\": \"double\"},{\"name\": \"begin_lon\", \"type\": \"double\"}," - + "{\"name\": \"end_lat\", \"type\": \"double\"},{\"name\": \"end_lon\", \"type\": \"double\"}," - + "{\"name\":\"fare\",\"type\": \"double\"}]}"; - public static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); - - private static Random rand = new Random(46474747); - - private final Map existingKeys; - private final String[] partitionPaths; - private int numExistingKeys; - - public HoodieExampleDataGenerator(String[] partitionPaths) { - this(partitionPaths, new HashMap<>()); - } - - public HoodieExampleDataGenerator() { - this(DEFAULT_PARTITION_PATHS); - } - - public HoodieExampleDataGenerator(String[] partitionPaths, Map keyPartitionMap) { - this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length); - this.existingKeys = keyPartitionMap; - } - - /** - * Generates a new avro record of the above schema format, retaining the key if optionally provided. - */ - @SuppressWarnings("unchecked") - public T generateRandomValue(HoodieKey key, String commitTime) { - GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, "driver-" + commitTime, 0); - return (T) new HoodieAvroPayload(Option.of(rec)); - } - - public GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName, - long timestamp) { - GenericRecord rec = new GenericData.Record(avroSchema); - rec.put("uuid", rowKey); - rec.put("ts", timestamp); - rec.put("rider", riderName); - rec.put("driver", driverName); - rec.put("begin_lat", rand.nextDouble()); - rec.put("begin_lon", rand.nextDouble()); - rec.put("end_lat", rand.nextDouble()); - rec.put("end_lon", rand.nextDouble()); - rec.put("fare", rand.nextDouble() * 100); - return rec; - } - - /** - * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys. - */ - public List> generateInserts(String commitTime, Integer n) { - return generateInsertsStream(commitTime, n).collect(Collectors.toList()); - } - - /** - * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys. - */ - public Stream> generateInsertsStream(String commitTime, Integer n) { - int currSize = getNumExistingKeys(); - - return IntStream.range(0, n).boxed().map(i -> { - String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)]; - HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); - KeyPartition kp = new KeyPartition(); - kp.key = key; - kp.partitionPath = partitionPath; - existingKeys.put(currSize + i, kp); - numExistingKeys++; - return new HoodieRecord<>(key, generateRandomValue(key, commitTime)); - }); - } - - /** - * Generates new updates, randomly distributed across the keys above. 
There can be duplicates within the returned - * list - * - * @param commitTime Commit Timestamp - * @param n Number of updates (including dups) - * @return list of hoodie record updates - */ - public List> generateUpdates(String commitTime, Integer n) { - List> updates = new ArrayList<>(); - for (int i = 0; i < n; i++) { - KeyPartition kp = existingKeys.get(rand.nextInt(numExistingKeys - 1)); - HoodieRecord record = generateUpdateRecord(kp.key, commitTime); - updates.add(record); - } - return updates; - } - - public HoodieRecord generateUpdateRecord(HoodieKey key, String commitTime) { - return new HoodieRecord<>(key, generateRandomValue(key, commitTime)); - } - - private Option convertToString(HoodieRecord record) { - try { - String str = HoodieAvroUtils - .bytesToAvro(((HoodieAvroPayload)record.getData()).getRecordBytes(), avroSchema) - .toString(); - str = "{" + str.substring(str.indexOf("\"ts\":")); - return Option.of(str.replaceAll("}", ", \"partitionpath\": \"" + record.getPartitionPath() + "\"}")); - } catch (IOException e) { - return Option.empty(); - } - } - - public List convertToStringList(List> records) { - return records.stream().map(this::convertToString).filter(Option::isPresent).map(Option::get) - .collect(Collectors.toList()); - } - - public int getNumExistingKeys() { - return numExistingKeys; - } - - public static class KeyPartition implements Serializable { - - HoodieKey key; - String partitionPath; - } - - public void close() { - existingKeys.clear(); - } - -} diff --git a/hudi-examples/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala b/hudi-examples/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala deleted file mode 100644 index 27accadebf1a9..0000000000000 --- a/hudi-examples/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.examples.spark - -import org.apache.hudi.DataSourceReadOptions.{BEGIN_INSTANTTIME_OPT_KEY, END_INSTANTTIME_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL, QUERY_TYPE_OPT_KEY} -import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD_OPT_KEY, PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY} -import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs -import org.apache.hudi.common.model.HoodieAvroPayload -import org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME -import org.apache.hudi.examples.common.{HoodieExampleDataGenerator, HoodieExampleSparkUtils} -import org.apache.spark.sql.SaveMode.{Append, Overwrite} -import org.apache.spark.sql.SparkSession - -import scala.collection.JavaConversions._ - -/** - * Simple examples of [[org.apache.hudi.DefaultSource]] - * - * To run this example, you should - * 1. For running in IDE, set VM options `-Dspark.master=local[2]` - * 2. For running in shell, using `spark-submit` - * - * Usage: HoodieWriteClientExample . - * and describe root path of hudi and table name - * for example, `HoodieDataSourceExample file:///tmp/hoodie/hudi_cow_table hudi_cow_table` - */ -object HoodieDataSourceExample { - - def main(args: Array[String]): Unit = { - - if (args.length < 2) { - System.err.println("Usage: HoodieDataSourceExample ") - System.exit(1) - } - val tablePath = args(0) - val tableName = args(1) - - val spark = HoodieExampleSparkUtils.defaultSparkSession("Hudi Spark basic example") - - val dataGen = new HoodieExampleDataGenerator[HoodieAvroPayload] - insertData(spark, tablePath, tableName, dataGen) - updateData(spark, tablePath, tableName, dataGen) - queryData(spark, tablePath, tableName, dataGen) - - incrementalQuery(spark, tablePath, tableName) - pointInTimeQuery(spark, tablePath, tableName) - - spark.stop() - } - - - /** - * Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below. - */ - def insertData(spark: SparkSession, tablePath: String, tableName: String, dataGen: HoodieExampleDataGenerator[HoodieAvroPayload]): Unit = { - - val commitTime: String = System.currentTimeMillis().toString - val inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20)) - spark.sparkContext.parallelize(inserts, 2) - val df = spark.read.json(spark.sparkContext.parallelize(inserts, 1)) - df.write.format("org.apache.hudi"). - options(getQuickstartWriteConfigs). - option(PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(RECORDKEY_FIELD_OPT_KEY, "uuid"). - option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). - option(TABLE_NAME, tableName). - mode(Overwrite). - save(tablePath) - } - - /** - * Load the data files into a DataFrame. - */ - def queryData(spark: SparkSession, tablePath: String, tableName: String, dataGen: HoodieExampleDataGenerator[HoodieAvroPayload]): Unit = { - val roViewDF = spark. - read. - format("org.apache.hudi"). - load(tablePath + "/*/*/*/*") - - roViewDF.createOrReplaceTempView("hudi_ro_table") - - spark.sql("select fare, begin_lon, begin_lat, ts from hudi_ro_table where fare > 20.0").show() - // +-----------------+-------------------+-------------------+---+ - // | fare| begin_lon| begin_lat| ts| - // +-----------------+-------------------+-------------------+---+ - // |98.88075495133515|0.39556048623031603|0.17851135255091155|0.0| - // ... 
- - spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_ro_table").show() - // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+ - // |_hoodie_commit_time| _hoodie_record_key|_hoodie_partition_path| rider| driver| fare| - // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+ - // | 20191231181501|31cafb9f-0196-4b1...| 2020/01/02|rider-1577787297889|driver-1577787297889| 98.88075495133515| - // ... - } - - /** - * This is similar to inserting new data. Generate updates to existing trips using the data generator, - * load into a DataFrame and write DataFrame into the hudi dataset. - */ - def updateData(spark: SparkSession, tablePath: String, tableName: String, dataGen: HoodieExampleDataGenerator[HoodieAvroPayload]): Unit = { - - val commitTime: String = System.currentTimeMillis().toString - val updates = dataGen.convertToStringList(dataGen.generateUpdates(commitTime, 10)) - val df = spark.read.json(spark.sparkContext.parallelize(updates, 1)) - df.write.format("org.apache.hudi"). - options(getQuickstartWriteConfigs). - option(PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(RECORDKEY_FIELD_OPT_KEY, "uuid"). - option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). - option(TABLE_NAME, tableName). - mode(Append). - save(tablePath) - } - - /** - * Hudi also provides capability to obtain a stream of records that changed since given commit timestamp. - * This can be achieved using Hudi’s incremental view and providing a begin time from which changes need to be streamed. - * We do not need to specify endTime, if we want all changes after the given commit (as is the common case). - */ - def incrementalQuery(spark: SparkSession, tablePath: String, tableName: String) { - import spark.implicits._ - val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50) - val beginTime = commits(commits.length - 2) // commit time we are interested in - - // incrementally query data - val incViewDF = spark. - read. - format("org.apache.hudi"). - option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL). - option(BEGIN_INSTANTTIME_OPT_KEY, beginTime). - load(tablePath) - incViewDF.createOrReplaceTempView("hudi_incr_table") - spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0").show() - } - - /** - * Lets look at how to query data as of a specific time. - * The specific time can be represented by pointing endTime to a specific commit time - * and beginTime to “000” (denoting earliest possible commit time). - */ - def pointInTimeQuery(spark: SparkSession, tablePath: String, tableName: String) { - import spark.implicits._ - val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50) - val beginTime = "000" // Represents all commits > this time. - val endTime = commits(commits.length - 2) // commit time we are interested in - - //incrementally query data - val incViewDF = spark.read.format("org.apache.hudi"). - option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL). - option(BEGIN_INSTANTTIME_OPT_KEY, beginTime). - option(END_INSTANTTIME_OPT_KEY, endTime). 
- load(tablePath) - incViewDF.createOrReplaceTempView("hudi_incr_table") - spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0").show() - } -} diff --git a/hudi-flink-datasource/hudi-flink/pom.xml b/hudi-flink-datasource/hudi-flink/pom.xml new file mode 100644 index 0000000000000..15a6bccac116f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/pom.xml @@ -0,0 +1,436 @@ + + + + + + + hudi-flink-datasource + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-flink + 0.12.2-dt-SNAPSHOT + jar + + + ${project.parent.parent.basedir} + ${flink.format.parquet.version} + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-jar-plugin + 3.1.2 + + + + test-jar + + + + + + org.apache.rat + apache-rat-plugin + + + + + + src/main/resources + + + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-slf4j-impl + + + org.slf4j + slf4j-api + + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-client-common + ${project.version} + + + org.apache.hudi + hudi-aws + ${project.version} + provided + + + org.apache.hudi + hudi-flink-client + ${project.version} + + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + + + org.apache.hudi + hudi-hive-sync + ${project.version} + + + org.apache.hudi + hudi-sync-common + ${project.version} + + + org.apache.hudi + ${hudi.flink.module} + ${project.version} + + + + + org.apache.flink + ${flink.streaming.java.artifactId} + ${flink.version} + compile + + + org.apache.flink + ${flink.clients.artifactId} + compile + + + com.esotericsoftware.kryo + kryo + + + com.esotericsoftware.minlog + minlog + + + + + org.apache.flink + ${flink.connector.kafka.artifactId} + compile + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + org.apache.flink + ${flink.hadoop.compatibility.artifactId} + ${flink.version} + + + org.apache.flink + ${flink.parquet.artifactId} + ${flink.version} + provided + + + org.apache.flink + flink-json + ${flink.version} + provided + + + org.apache.flink + flink-table-common + ${flink.version} + provided + + + org.apache.flink + ${flink.table.runtime.artifactId} + ${flink.version} + provided + + + org.apache.flink + ${flink.table.planner.artifactId} + ${flink.version} + provided + + + org.apache.flink + ${flink.statebackend.rocksdb.artifactId} + ${flink.version} + provided + + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + org.xerial.snappy + snappy-java + + + + + + + org.apache.parquet + parquet-avro + ${parquet.version} + test + + + + + org.apache.avro + avro + ${flink.avro.version} + compile + + + + + org.apache.hadoop + hadoop-mapreduce-client-core + compile + + + org.slf4j + slf4j-log4j12 + + + + + + com.beust + jcommander + compile + + + joda-time + joda-time + 2.5 + + + + ${hive.groupid} + hive-exec + ${hive.version} + ${hive.exec.classifier} + + + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + + + ${hive.groupid} + hive-metastore + ${hive.version} + provided + + + javax.transaction + jta + + + javax.transaction + javax.transaction-api + + + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + 
junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + org.junit.platform + junit-platform-runner + test + + + org.junit.platform + junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-client-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-flink-client + ${project.version} + tests + test-jar + test + + + org.apache.hudi + ${hudi.flink.module} + ${project.version} + tests + test-jar + test + + + + org.apache.flink + ${flink.test.utils.artifactId} + ${flink.version} + test + + + org.apache.flink + ${flink.runtime.artifactId} + ${flink.version} + test + test-jar + + + org.apache.flink + ${flink.streaming.java.artifactId} + ${flink.version} + test + test-jar + + + org.apache.flink + ${flink.table.runtime.artifactId} + ${flink.version} + test + test-jar + + + org.apache.flink + flink-csv + ${flink.version} + test + + + org.apache.flink + flink-connector-files + ${flink.version} + test + + + + javax.transaction + jta + 1.1 + test + + + javax.transaction + javax.transaction-api + 1.3 + test + + + diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java new file mode 100644 index 0000000000000..bcd4f407d6240 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -0,0 +1,917 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.configuration; + +import org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.model.EventTimeAvroPayload; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.hive.ddl.HiveSyncMode; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.keygen.constant.KeyGeneratorType; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.config.HoodieClusteringConfig.DAYBASED_LOOKBACK_PARTITIONS; +import static org.apache.hudi.config.HoodieClusteringConfig.PARTITION_FILTER_BEGIN_PARTITION; +import static org.apache.hudi.config.HoodieClusteringConfig.PARTITION_FILTER_END_PARTITION; +import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST; + +/** + * Hoodie Flink config options. + * + *

    It has the options for Hoodie table read and write. It also defines some utilities. + */ +@ConfigClassProperty(name = "Flink Options", + groupName = ConfigGroups.Names.FLINK_SQL, + description = "Flink jobs using the SQL can be configured through the options in WITH clause." + + " The actual datasource level configs are listed below.") +public class FlinkOptions extends HoodieConfig { + private FlinkOptions() { + } + + // ------------------------------------------------------------------------ + // Base Options + // ------------------------------------------------------------------------ + + public static final ConfigOption PATH = ConfigOptions + .key("path") + .stringType() + .noDefaultValue() + .withDescription("Base path for the target hoodie table.\n" + + "The path would be created if it does not exist,\n" + + "otherwise a Hoodie table expects to be initialized successfully"); + + // ------------------------------------------------------------------------ + // Common Options + // ------------------------------------------------------------------------ + + public static final ConfigOption TABLE_NAME = ConfigOptions + .key(HoodieWriteConfig.TBL_NAME.key()) + .stringType() + .noDefaultValue() + .withDescription("Table name to register to Hive metastore"); + + public static final String TABLE_TYPE_COPY_ON_WRITE = HoodieTableType.COPY_ON_WRITE.name(); + public static final String TABLE_TYPE_MERGE_ON_READ = HoodieTableType.MERGE_ON_READ.name(); + public static final ConfigOption TABLE_TYPE = ConfigOptions + .key("table.type") + .stringType() + .defaultValue(TABLE_TYPE_COPY_ON_WRITE) + .withDescription("Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ"); + + public static final String NO_PRE_COMBINE = "no_precombine"; + public static final ConfigOption PRECOMBINE_FIELD = ConfigOptions + .key("precombine.field") + .stringType() + .defaultValue("ts") + .withFallbackKeys("write.precombine.field") + .withDescription("Field used in preCombining before actual write. When two records have the same\n" + + "key value, we will pick the one with the largest value for the precombine field,\n" + + "determined by Object.compareTo(..)"); + + public static final ConfigOption PAYLOAD_CLASS_NAME = ConfigOptions + .key("payload.class") + .stringType() + .defaultValue(EventTimeAvroPayload.class.getName()) + .withFallbackKeys("write.payload.class") + .withDescription("Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting.\n" + + "This will render any value set for the option in-effective"); + + public static final ConfigOption PARTITION_DEFAULT_NAME = ConfigOptions + .key("partition.default_name") + .stringType() + .defaultValue(DEFAULT_PARTITION_PATH) // keep sync with hoodie style + .withDescription("The default partition name in case the dynamic partition" + + " column value is null/empty string"); + + public static final ConfigOption CHANGELOG_ENABLED = ConfigOptions + .key("changelog.enabled") + .booleanType() + .defaultValue(false) + .withDescription("Whether to keep all the intermediate changes, " + + "we try to keep all the changes of a record when enabled:\n" + + "1). The sink accept the UPDATE_BEFORE message;\n" + + "2). 
The source try to emit every changes of a record.\n" + + "The semantics is best effort because the compaction job would finally merge all changes of a record into one.\n" + + " default false to have UPSERT semantics"); + + // ------------------------------------------------------------------------ + // Metadata table Options + // ------------------------------------------------------------------------ + + public static final ConfigOption METADATA_ENABLED = ConfigOptions + .key("metadata.enabled") + .booleanType() + .defaultValue(false) + .withDescription("Enable the internal metadata table which serves table metadata like level file listings, default disabled"); + + public static final ConfigOption METADATA_COMPACTION_DELTA_COMMITS = ConfigOptions + .key("metadata.compaction.delta_commits") + .intType() + .defaultValue(10) + .withDescription("Max delta commits for metadata table to trigger compaction, default 10"); + + // ------------------------------------------------------------------------ + // Index Options + // ------------------------------------------------------------------------ + + public static final ConfigOption INDEX_TYPE = ConfigOptions + .key("index.type") + .stringType() + .defaultValue(HoodieIndex.IndexType.FLINK_STATE.name()) + .withDescription("Index type of Flink write job, default is using state backed index."); + + public static final ConfigOption INDEX_BOOTSTRAP_ENABLED = ConfigOptions + .key("index.bootstrap.enabled") + .booleanType() + .defaultValue(false) + .withDescription("Whether to bootstrap the index state from existing hoodie table, default false"); + + public static final ConfigOption INDEX_STATE_TTL = ConfigOptions + .key("index.state.ttl") + .doubleType() + .defaultValue(0D) + .withDescription("Index state ttl in days, default stores the index permanently"); + + public static final ConfigOption INDEX_GLOBAL_ENABLED = ConfigOptions + .key("index.global.enabled") + .booleanType() + .defaultValue(true) + .withDescription("Whether to update index for the old partition path\n" + + "if same key record with different partition path came in, default true"); + + public static final ConfigOption INDEX_PARTITION_REGEX = ConfigOptions + .key("index.partition.regex") + .stringType() + .defaultValue(".*") + .withDescription("Whether to load partitions in state if partition path matching, default `*`"); + + // ------------------------------------------------------------------------ + // Read Options + // ------------------------------------------------------------------------ + + public static final ConfigOption READ_TASKS = ConfigOptions + .key("read.tasks") + .intType() + .noDefaultValue() + .withDescription("Parallelism of tasks that do actual read, default is the parallelism of the execution environment"); + + public static final ConfigOption SOURCE_AVRO_SCHEMA_PATH = ConfigOptions + .key("source.avro-schema.path") + .stringType() + .noDefaultValue() + .withDescription("Source avro schema file path, the parsed schema is used for deserialization"); + + public static final ConfigOption SOURCE_AVRO_SCHEMA = ConfigOptions + .key("source.avro-schema") + .stringType() + .noDefaultValue() + .withDescription("Source avro schema string, the parsed schema is used for deserialization"); + + public static final String QUERY_TYPE_SNAPSHOT = "snapshot"; + public static final String QUERY_TYPE_READ_OPTIMIZED = "read_optimized"; + public static final String QUERY_TYPE_INCREMENTAL = "incremental"; + public static final ConfigOption QUERY_TYPE = ConfigOptions + 
.key("hoodie.datasource.query.type") + .stringType() + .defaultValue(QUERY_TYPE_SNAPSHOT) + .withDescription("Decides how data files need to be read, in\n" + + "1) Snapshot mode (obtain latest view, based on row & columnar data);\n" + + "2) incremental mode (new data since an instantTime);\n" + + "3) Read Optimized mode (obtain latest view, based on columnar data)\n." + + "Default: snapshot"); + + public static final String REALTIME_SKIP_MERGE = "skip_merge"; + public static final String REALTIME_PAYLOAD_COMBINE = "payload_combine"; + public static final ConfigOption MERGE_TYPE = ConfigOptions + .key("hoodie.datasource.merge.type") + .stringType() + .defaultValue(REALTIME_PAYLOAD_COMBINE) + .withDescription("For Snapshot query on merge on read table. Use this key to define how the payloads are merged, in\n" + + "1) skip_merge: read the base file records plus the log file records;\n" + + "2) payload_combine: read the base file records first, for each record in base file, checks whether the key is in the\n" + + " log file records(combines the two records with same key for base and log file records), then read the left log file records"); + + public static final ConfigOption UTC_TIMEZONE = ConfigOptions + .key("read.utc-timezone") + .booleanType() + .defaultValue(true) + .withDescription("Use UTC timezone or local timezone to the conversion between epoch" + + " time and LocalDateTime. Hive 0.x/1.x/2.x use local timezone. But Hive 3.x" + + " use UTC timezone, by default true"); + + public static final ConfigOption READ_AS_STREAMING = ConfigOptions + .key("read.streaming.enabled") + .booleanType() + .defaultValue(false)// default read as batch + .withDescription("Whether to read as streaming source, default false"); + + public static final ConfigOption READ_STREAMING_CHECK_INTERVAL = ConfigOptions + .key("read.streaming.check-interval") + .intType() + .defaultValue(60)// default 1 minute + .withDescription("Check interval for streaming read of SECOND, default 1 minute"); + + // this option is experimental + public static final ConfigOption READ_STREAMING_SKIP_COMPACT = ConfigOptions + .key("read.streaming.skip_compaction") + .booleanType() + .defaultValue(false)// default read as batch + .withDescription("Whether to skip compaction instants for streaming read,\n" + + "there are two cases that this option can be used to avoid reading duplicates:\n" + + "1) you are definitely sure that the consumer reads faster than any compaction instants, " + + "usually with delta time compaction strategy that is long enough, for e.g, one week;\n" + + "2) changelog mode is enabled, this option is a solution to keep data integrity"); + + public static final String START_COMMIT_EARLIEST = "earliest"; + public static final ConfigOption READ_START_COMMIT = ConfigOptions + .key("read.start-commit") + .stringType() + .noDefaultValue() + .withDescription("Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', " + + "by default reading from the latest instant for streaming read"); + + public static final ConfigOption READ_END_COMMIT = ConfigOptions + .key("read.end-commit") + .stringType() + .noDefaultValue() + .withDescription("End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'"); + + public static final ConfigOption READ_DATA_SKIPPING_ENABLED = ConfigOptions + .key("read.data.skipping.enabled") + .booleanType() + .defaultValue(false) + .withDescription("Enables data-skipping allowing queries to leverage indexes to reduce the search space by" + + "skipping 
over files"); + +
// ------------------------------------------------------------------------ + // Write Options + // ------------------------------------------------------------------------ + +
public static final ConfigOption INSERT_CLUSTER = ConfigOptions + .key("write.insert.cluster") + .booleanType() + .defaultValue(false) + .withDescription("Whether to merge small files for insert mode, " + + "if true, the write throughput will decrease because of the read/write of existing small files, " + + "only valid for COW table, default false"); + +
public static final ConfigOption OPERATION = ConfigOptions + .key("write.operation") + .stringType() + .defaultValue(WriteOperationType.UPSERT.value()) + .withDescription("The write operation that this write should do"); + +
/** + * Flag to indicate whether to drop duplicates before insert/upsert. + * By default false to gain extra performance. + */ + public static final ConfigOption PRE_COMBINE = ConfigOptions + .key("write.precombine") + .booleanType() + .defaultValue(false) + .withDescription("Flag to indicate whether to drop duplicates before insert/upsert.\n" + + "By default these cases accept duplicates to gain extra performance:\n" + + "1) insert operation;\n" + + "2) upsert for MOR table, the MOR table deduplicates on reading"); + +
public static final ConfigOption RETRY_TIMES = ConfigOptions + .key("write.retry.times") + .intType() + .defaultValue(3) + .withDescription("Number of times the streaming job should retry a failed checkpoint batch.\n" + + "By default 3"); + +
public static final ConfigOption RETRY_INTERVAL_MS = ConfigOptions + .key("write.retry.interval.ms") + .longType() + .defaultValue(2000L) + .withDescription("How long (in milliseconds) to wait before a retry is issued for a failed checkpoint batch.\n" + + "By default 2000, and the interval is doubled on every retry"); + +
public static final ConfigOption IGNORE_FAILED = ConfigOptions + .key("write.ignore.failed") + .booleanType() + .defaultValue(false) + .withDescription("Flag to indicate whether to ignore any non-exception error (e.g. writestatus error) within a checkpoint batch.\n" + + "By default false. Turning this on could hide write status errors while the Flink checkpoint moves ahead,\n" + + " so users are recommended to use this with caution."); + +
public static final ConfigOption RECORD_KEY_FIELD = ConfigOptions + .key(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()) + .stringType() + .defaultValue("uuid") + .withDescription("Record key field. Value to be used as the `recordKey` component of `HoodieKey`.\n" + + "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using " + + "the dot notation eg: `a.b.c`"); + +
public static final ConfigOption INDEX_KEY_FIELD = ConfigOptions + .key(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()) + .stringType() + .defaultValue("") + .withDescription("Index key field. Value to be hashed to find the bucket ID. Should be a subset of or equal to the recordKey fields.\n" + + "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using " + + "the dot notation eg: `a.b.c`"); + +
public static final ConfigOption BUCKET_INDEX_NUM_BUCKETS = ConfigOptions + .key(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key()) + .intType() + .defaultValue(4) // default 4 buckets per partition + .withDescription("Hudi bucket number per partition. 
Only affected if using Hudi bucket index."); + + public static final ConfigOption PARTITION_PATH_FIELD = ConfigOptions + .key(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()) + .stringType() + .defaultValue("") + .withDescription("Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`.\n" + + "Actual value obtained by invoking .toString(), default ''"); + + public static final ConfigOption URL_ENCODE_PARTITIONING = ConfigOptions + .key(KeyGeneratorOptions.URL_ENCODE_PARTITIONING.key()) + .booleanType() + .defaultValue(false) + .withDescription("Whether to encode the partition path url, default false"); + + public static final ConfigOption HIVE_STYLE_PARTITIONING = ConfigOptions + .key(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key()) + .booleanType() + .defaultValue(false) + .withDescription("Whether to use Hive style partitioning.\n" + + "If set true, the names of partition folders follow = format.\n" + + "By default false (the names of partition folders are only partition values)"); + + public static final ConfigOption KEYGEN_CLASS_NAME = ConfigOptions + .key(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key()) + .stringType() + .noDefaultValue() + .withDescription("Key generator class, that implements will extract the key out of incoming record"); + + public static final ConfigOption KEYGEN_TYPE = ConfigOptions + .key(HoodieWriteConfig.KEYGENERATOR_TYPE.key()) + .stringType() + .defaultValue(KeyGeneratorType.SIMPLE.name()) + .withDescription("Key generator type, that implements will extract the key out of incoming record"); + + public static final String PARTITION_FORMAT_HOUR = "yyyyMMddHH"; + public static final String PARTITION_FORMAT_DAY = "yyyyMMdd"; + public static final String PARTITION_FORMAT_DASHED_DAY = "yyyy-MM-dd"; + public static final ConfigOption PARTITION_FORMAT = ConfigOptions + .key("write.partition.format") + .stringType() + .noDefaultValue() + .withDescription("Partition path format, only valid when 'write.datetime.partitioning' is true, default is:\n" + + "1) 'yyyyMMddHH' for timestamp(3) WITHOUT TIME ZONE, LONG, FLOAT, DOUBLE, DECIMAL;\n" + + "2) 'yyyyMMdd' for DATE and INT."); + + public static final ConfigOption INDEX_BOOTSTRAP_TASKS = ConfigOptions + .key("write.index_bootstrap.tasks") + .intType() + .noDefaultValue() + .withDescription("Parallelism of tasks that do index bootstrap, default same as the write task parallelism"); + + public static final ConfigOption BUCKET_ASSIGN_TASKS = ConfigOptions + .key("write.bucket_assign.tasks") + .intType() + .noDefaultValue() + .withDescription("Parallelism of tasks that do bucket assign, default same as the write task parallelism"); + + public static final ConfigOption WRITE_TASKS = ConfigOptions + .key("write.tasks") + .intType() + .noDefaultValue() + .withDescription("Parallelism of tasks that do actual write, default is the parallelism of the execution environment"); + + public static final ConfigOption WRITE_TASK_MAX_SIZE = ConfigOptions + .key("write.task.max.size") + .doubleType() + .defaultValue(1024D) // 1GB + .withDescription("Maximum memory in MB for a write task, when the threshold hits,\n" + + "it flushes the max size data bucket to avoid OOM, default 1GB"); + + public static final ConfigOption WRITE_RATE_LIMIT = ConfigOptions + .key("write.rate.limit") + .longType() + .defaultValue(0L) // default no limit + .withDescription("Write record rate limit per second to prevent traffic jitter and improve stability, default 0 (no limit)"); + + public static final ConfigOption 
WRITE_BATCH_SIZE = ConfigOptions + .key("write.batch.size") + .doubleType() + .defaultValue(256D) // 256MB + .withDescription("Batch buffer size in MB to flush data into the underneath filesystem, default 256MB"); + + public static final ConfigOption WRITE_LOG_BLOCK_SIZE = ConfigOptions + .key("write.log_block.size") + .intType() + .defaultValue(128) + .withDescription("Max log block size in MB for log file, default 128MB"); + + public static final ConfigOption WRITE_LOG_MAX_SIZE = ConfigOptions + .key("write.log.max.size") + .longType() + .defaultValue(1024L) + .withDescription("Maximum size allowed in MB for a log file before it is rolled over to the next version, default 1GB"); + + public static final ConfigOption WRITE_PARQUET_BLOCK_SIZE = ConfigOptions + .key("write.parquet.block.size") + .intType() + .defaultValue(120) + .withDescription("Parquet RowGroup size. It's recommended to make this large enough that scan costs can be" + + " amortized by packing enough column values into a single row group."); + + public static final ConfigOption WRITE_PARQUET_MAX_FILE_SIZE = ConfigOptions + .key("write.parquet.max.file.size") + .intType() + .defaultValue(120) + .withDescription("Target size for parquet files produced by Hudi write phases. " + + "For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance."); + + public static final ConfigOption WRITE_PARQUET_PAGE_SIZE = ConfigOptions + .key("write.parquet.page.size") + .intType() + .defaultValue(1) + .withDescription("Parquet page size. Page is the unit of read within a parquet file. " + + "Within a block, pages are compressed separately."); + + public static final ConfigOption WRITE_MERGE_MAX_MEMORY = ConfigOptions + .key("write.merge.max_memory") + .intType() + .defaultValue(100) // default 100 MB + .withDescription("Max memory in MB for merge, default 100MB"); + + // this is only for internal use + public static final ConfigOption WRITE_COMMIT_ACK_TIMEOUT = ConfigOptions + .key("write.commit.ack.timeout") + .longType() + .defaultValue(-1L) // default at least once + .withDescription("Timeout limit for a writer task after it finishes a checkpoint and\n" + + "waits for the instant commit success, only for internal use"); + + public static final ConfigOption WRITE_BULK_INSERT_SHUFFLE_INPUT = ConfigOptions + .key("write.bulk_insert.shuffle_input") + .booleanType() + .defaultValue(true) + .withDescription("Whether to shuffle the inputs by specific fields for bulk insert tasks, default true"); + + public static final ConfigOption WRITE_BULK_INSERT_SORT_INPUT = ConfigOptions + .key("write.bulk_insert.sort_input") + .booleanType() + .defaultValue(true) + .withDescription("Whether to sort the inputs by specific fields for bulk insert tasks, default true"); + + public static final ConfigOption WRITE_SORT_MEMORY = ConfigOptions + .key("write.sort.memory") + .intType() + .defaultValue(128) + .withDescription("Sort memory in MB, default 128MB"); + + // ------------------------------------------------------------------------ + // Compaction Options + // ------------------------------------------------------------------------ + + public static final ConfigOption COMPACTION_SCHEDULE_ENABLED = ConfigOptions + .key("compaction.schedule.enabled") + .booleanType() + .defaultValue(true) // default true for MOR write + .withDescription("Schedule the compaction plan, enabled by default for MOR"); + + public static final ConfigOption COMPACTION_ASYNC_ENABLED = ConfigOptions + .key("compaction.async.enabled") + 
.booleanType() + .defaultValue(true) // default true for MOR write + .withDescription("Async Compaction, enabled by default for MOR"); + + public static final ConfigOption COMPACTION_TASKS = ConfigOptions + .key("compaction.tasks") + .intType() + .noDefaultValue() + .withDescription("Parallelism of tasks that do actual compaction, default same as the write task parallelism"); + + public static final String NUM_COMMITS = "num_commits"; + public static final String TIME_ELAPSED = "time_elapsed"; + public static final String NUM_AND_TIME = "num_and_time"; + public static final String NUM_OR_TIME = "num_or_time"; + public static final ConfigOption COMPACTION_TRIGGER_STRATEGY = ConfigOptions + .key("compaction.trigger.strategy") + .stringType() + .defaultValue(NUM_COMMITS) // default true for MOR write + .withDescription("Strategy to trigger compaction, options are 'num_commits': trigger compaction when reach N delta commits;\n" + + "'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction;\n" + + "'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied;\n" + + "'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied.\n" + + "Default is 'num_commits'"); + + public static final ConfigOption COMPACTION_DELTA_COMMITS = ConfigOptions + .key("compaction.delta_commits") + .intType() + .defaultValue(5) + .withDescription("Max delta commits needed to trigger compaction, default 5 commits"); + + public static final ConfigOption COMPACTION_DELTA_SECONDS = ConfigOptions + .key("compaction.delta_seconds") + .intType() + .defaultValue(3600) // default 1 hour + .withDescription("Max delta seconds time needed to trigger compaction, default 1 hour"); + + public static final ConfigOption COMPACTION_TIMEOUT_SECONDS = ConfigOptions + .key("compaction.timeout.seconds") + .intType() + .defaultValue(1200) // default 20 minutes + .withDescription("Max timeout time in seconds for online compaction to rollback, default 20 minutes"); + + public static final ConfigOption COMPACTION_MAX_MEMORY = ConfigOptions + .key("compaction.max_memory") + .intType() + .defaultValue(100) // default 100 MB + .withDescription("Max memory in MB for compaction spillable map, default 100MB"); + + public static final ConfigOption COMPACTION_TARGET_IO = ConfigOptions + .key("compaction.target_io") + .longType() + .defaultValue(500 * 1024L) // default 500 GB + .withDescription("Target IO in MB for per compaction (both read and write), default 500 GB"); + + public static final ConfigOption CLEAN_ASYNC_ENABLED = ConfigOptions + .key("clean.async.enabled") + .booleanType() + .defaultValue(true) + .withDescription("Whether to cleanup the old commits immediately on new commits, enabled by default"); + + public static final ConfigOption CLEAN_POLICY = ConfigOptions + .key("clean.policy") + .stringType() + .defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name()) + .withDescription("Clean policy to manage the Hudi table. Available option: KEEP_LATEST_COMMITS, KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_BY_HOURS." + + "Default is KEEP_LATEST_COMMITS."); + + public static final ConfigOption CLEAN_RETAIN_COMMITS = ConfigOptions + .key("clean.retain_commits") + .intType() + .defaultValue(30)// default 30 commits + .withDescription("Number of commits to retain. 
So data will be retained for num_of_commits * time_between_commits (scheduled).\n" + + "This also directly translates into how much you can incrementally pull on this table, default 30"); + + public static final ConfigOption CLEAN_RETAIN_HOURS = ConfigOptions + .key("clean.retain_hours") + .intType() + .defaultValue(24)// default 24 hours + .withDescription("Number of hours for which commits need to be retained. This config provides a more flexible option as" + + "compared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group," + + " corresponding to commits with commit times older than the configured number of hours to be retained are cleaned."); + + public static final ConfigOption CLEAN_RETAIN_FILE_VERSIONS = ConfigOptions + .key("clean.retain_file_versions") + .intType() + .defaultValue(5)// default 5 version + .withDescription("Number of file versions to retain. default 5"); + + public static final ConfigOption ARCHIVE_MAX_COMMITS = ConfigOptions + .key("archive.max_commits") + .intType() + .defaultValue(50)// default max 50 commits + .withDescription("Max number of commits to keep before archiving older commits into a sequential log, default 50"); + + public static final ConfigOption ARCHIVE_MIN_COMMITS = ConfigOptions + .key("archive.min_commits") + .intType() + .defaultValue(40)// default min 40 commits + .withDescription("Min number of commits to keep before archiving older commits into a sequential log, default 40"); + + // ------------------------------------------------------------------------ + // Clustering Options + // ------------------------------------------------------------------------ + + public static final ConfigOption CLUSTERING_SCHEDULE_ENABLED = ConfigOptions + .key("clustering.schedule.enabled") + .booleanType() + .defaultValue(false) // default false for pipeline + .withDescription("Schedule the cluster plan, default false"); + + public static final ConfigOption CLUSTERING_ASYNC_ENABLED = ConfigOptions + .key("clustering.async.enabled") + .booleanType() + .defaultValue(false) // default false for pipeline + .withDescription("Async Clustering, default false"); + + public static final ConfigOption CLUSTERING_DELTA_COMMITS = ConfigOptions + .key("clustering.delta_commits") + .intType() + .defaultValue(4) + .withDescription("Max delta commits needed to trigger clustering, default 4 commits"); + + public static final ConfigOption CLUSTERING_TASKS = ConfigOptions + .key("clustering.tasks") + .intType() + .noDefaultValue() + .withDescription("Parallelism of tasks that do actual clustering, default same as the write task parallelism"); + + public static final ConfigOption CLUSTERING_TARGET_PARTITIONS = ConfigOptions + .key("clustering.plan.strategy.daybased.lookback.partitions") + .intType() + .defaultValue(2) + .withDescription("Number of partitions to list to create ClusteringPlan, default is 2"); + + public static final ConfigOption CLUSTERING_PLAN_STRATEGY_CLASS = ConfigOptions + .key("clustering.plan.strategy.class") + .stringType() + .defaultValue(FlinkSizeBasedClusteringPlanStrategy.class.getName()) + .withDescription("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan " + + "i.e select what file groups are being clustered. 
Default strategy, looks at the last N (determined by " + + CLUSTERING_TARGET_PARTITIONS.key() + ") day based partitions picks the small file slices within those partitions."); + + public static final ConfigOption CLUSTERING_PLAN_PARTITION_FILTER_MODE_NAME = ConfigOptions + .key("clustering.plan.partition.filter.mode") + .stringType() + .defaultValue(ClusteringPlanPartitionFilterMode.NONE.name()) + .withDescription("Partition filter mode used in the creation of clustering plan. Available values are - " + + "NONE: do not filter table partition and thus the clustering plan will include all partitions that have clustering candidate." + + "RECENT_DAYS: keep a continuous range of partitions, worked together with configs '" + DAYBASED_LOOKBACK_PARTITIONS.key() + "' and '" + + PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST.key() + "." + + "SELECTED_PARTITIONS: keep partitions that are in the specified range ['" + PARTITION_FILTER_BEGIN_PARTITION.key() + "', '" + + PARTITION_FILTER_END_PARTITION.key() + "']."); + + public static final ConfigOption CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES = ConfigOptions + .key("clustering.plan.strategy.target.file.max.bytes") + .longType() + .defaultValue(1024 * 1024 * 1024L) // default 1 GB + .withDescription("Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB"); + + public static final ConfigOption CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT = ConfigOptions + .key("clustering.plan.strategy.small.file.limit") + .longType() + .defaultValue(600L) // default 600 MB + .withDescription("Files smaller than the size specified here are candidates for clustering, default 600 MB"); + + public static final ConfigOption CLUSTERING_PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST = ConfigOptions + .key("clustering.plan.strategy.daybased.skipfromlatest.partitions") + .intType() + .defaultValue(0) + .withDescription("Number of partitions to skip from latest when choosing partitions to create ClusteringPlan"); + + public static final ConfigOption CLUSTERING_SORT_COLUMNS = ConfigOptions + .key("clustering.plan.strategy.sort.columns") + .stringType() + .defaultValue("") + .withDescription("Columns to sort the data by when clustering"); + + public static final ConfigOption CLUSTERING_MAX_NUM_GROUPS = ConfigOptions + .key("clustering.plan.strategy.max.num.groups") + .intType() + .defaultValue(30) + .withDescription("Maximum number of groups to create as part of ClusteringPlan. 
Increasing groups will increase parallelism, default is 30"); + + // ------------------------------------------------------------------------ + // Hive Sync Options + // ------------------------------------------------------------------------ + + public static final ConfigOption HIVE_SYNC_ENABLED = ConfigOptions + .key("hive_sync.enabled") + .booleanType() + .defaultValue(false) + .withFallbackKeys("hive_sync.enable") + .withDescription("Asynchronously sync Hive meta to HMS, default false"); + + public static final ConfigOption HIVE_SYNC_DB = ConfigOptions + .key("hive_sync.db") + .stringType() + .defaultValue("default") + .withDescription("Database name for hive sync, default 'default'"); + + public static final ConfigOption HIVE_SYNC_TABLE = ConfigOptions + .key("hive_sync.table") + .stringType() + .defaultValue("unknown") + .withDescription("Table name for hive sync, default 'unknown'"); + + public static final ConfigOption HIVE_SYNC_FILE_FORMAT = ConfigOptions + .key("hive_sync.file_format") + .stringType() + .defaultValue("PARQUET") + .withDescription("File format for hive sync, default 'PARQUET'"); + + public static final ConfigOption HIVE_SYNC_MODE = ConfigOptions + .key("hive_sync.mode") + .stringType() + .defaultValue(HiveSyncMode.HMS.name()) + .withDescription("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql, default 'hms'"); + + public static final ConfigOption HIVE_SYNC_USERNAME = ConfigOptions + .key("hive_sync.username") + .stringType() + .defaultValue("hive") + .withDescription("Username for hive sync, default 'hive'"); + + public static final ConfigOption HIVE_SYNC_PASSWORD = ConfigOptions + .key("hive_sync.password") + .stringType() + .defaultValue("hive") + .withDescription("Password for hive sync, default 'hive'"); + + public static final ConfigOption HIVE_SYNC_JDBC_URL = ConfigOptions + .key("hive_sync.jdbc_url") + .stringType() + .defaultValue("jdbc:hive2://localhost:10000") + .withDescription("Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000'"); + + public static final ConfigOption HIVE_SYNC_METASTORE_URIS = ConfigOptions + .key("hive_sync.metastore.uris") + .stringType() + .defaultValue("") + .withDescription("Metastore uris for hive sync, default ''"); + + public static final ConfigOption HIVE_SYNC_PARTITION_FIELDS = ConfigOptions + .key("hive_sync.partition_fields") + .stringType() + .defaultValue("") + .withDescription("Partition fields for hive sync, default ''"); + + public static final ConfigOption HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME = ConfigOptions + .key("hive_sync.partition_extractor_class") + .stringType() + .defaultValue(MultiPartKeysValueExtractor.class.getName()) + .withDescription("Tool to extract the partition value from HDFS path, " + + "default 'MultiPartKeysValueExtractor'"); + + public static final ConfigOption HIVE_SYNC_ASSUME_DATE_PARTITION = ConfigOptions + .key("hive_sync.assume_date_partitioning") + .booleanType() + .defaultValue(false) + .withDescription("Assume partitioning is yyyy/mm/dd, default false"); + + public static final ConfigOption HIVE_SYNC_USE_JDBC = ConfigOptions + .key("hive_sync.use_jdbc") + .booleanType() + .defaultValue(true) + .withDescription("Use JDBC when hive synchronization is enabled, default true"); + + public static final ConfigOption HIVE_SYNC_AUTO_CREATE_DB = ConfigOptions + .key("hive_sync.auto_create_db") + .booleanType() + .defaultValue(true) + .withDescription("Auto create hive database if it does not exists, default true"); + + public static final ConfigOption 
HIVE_SYNC_IGNORE_EXCEPTIONS = ConfigOptions + .key("hive_sync.ignore_exceptions") + .booleanType() + .defaultValue(false) + .withDescription("Ignore exceptions during hive synchronization, default false"); + + public static final ConfigOption HIVE_SYNC_SKIP_RO_SUFFIX = ConfigOptions + .key("hive_sync.skip_ro_suffix") + .booleanType() + .defaultValue(false) + .withDescription("Skip the _ro suffix for Read optimized table when registering, default false"); + + public static final ConfigOption HIVE_SYNC_SUPPORT_TIMESTAMP = ConfigOptions + .key("hive_sync.support_timestamp") + .booleanType() + .defaultValue(true) + .withDescription("INT64 with original type TIMESTAMP_MICROS is converted to hive timestamp type.\n" + + "Disabled by default for backward compatibility."); + + public static final ConfigOption HIVE_SYNC_TABLE_PROPERTIES = ConfigOptions + .key("hive_sync.table_properties") + .stringType() + .noDefaultValue() + .withDescription("Additional properties to store with table, the data format is k1=v1\nk2=v2"); + + public static final ConfigOption HIVE_SYNC_TABLE_SERDE_PROPERTIES = ConfigOptions + .key("hive_sync.serde_properties") + .stringType() + .noDefaultValue() + .withDescription("Serde properties to hive table, the data format is k1=v1\nk2=v2"); + + public static final ConfigOption HIVE_SYNC_CONF_DIR = ConfigOptions + .key("hive_sync.conf.dir") + .stringType() + .noDefaultValue() + .withDescription("The hive configuration directory, where the hive-site.xml lies in, the file should be put on the client machine"); + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + // Prefix for Hoodie specific properties. + private static final String PROPERTIES_PREFIX = "properties."; + + /** + * Collects the config options that start with specified prefix {@code prefix} into a 'key'='value' list. + */ + public static Map getPropertiesWithPrefix(Map options, String prefix) { + final Map hoodieProperties = new HashMap<>(); + if (hasPropertyOptions(options, prefix)) { + options.keySet().stream() + .filter(key -> key.startsWith(prefix)) + .forEach(key -> { + final String value = options.get(key); + final String subKey = key.substring(prefix.length()); + hoodieProperties.put(subKey, value); + }); + } + return hoodieProperties; + } + + /** + * Collects all the config options, the 'properties.' prefix would be removed if the option key starts with it. + */ + public static Configuration flatOptions(Configuration conf) { + final Map propsMap = new HashMap<>(); + + conf.toMap().forEach((key, value) -> { + final String subKey = key.startsWith(PROPERTIES_PREFIX) + ? key.substring((PROPERTIES_PREFIX).length()) + : key; + propsMap.put(subKey, value); + }); + return fromMap(propsMap); + } + + private static boolean hasPropertyOptions(Map options, String prefix) { + return options.keySet().stream().anyMatch(k -> k.startsWith(prefix)); + } + + /** + * Creates a new configuration that is initialized with the options of the given map. + */ + public static Configuration fromMap(Map map) { + final Configuration configuration = new Configuration(); + for (Map.Entry entry : map.entrySet()) { + configuration.setString(entry.getKey().trim(), entry.getValue()); + } + return configuration; + } + + /** + * Returns whether the given conf defines default value for the option {@code option}. 
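A minimal usage sketch, assuming the FlinkOptions class above (the table path and property values are illustrative, not part of the patch): options are set on a standard Flink Configuration, and anything under the 'properties.' prefix is passed through to Hudi via getPropertiesWithPrefix / flatOptions.

    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PATH, "file:///tmp/hudi/t1");                 // hypothetical table path
    conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
    conf.setString(FlinkOptions.OPERATION, WriteOperationType.UPSERT.value());
    // keys under 'properties.' are forwarded to Hudi with the prefix stripped
    conf.setString("properties.hoodie.cleaner.commits.retained", "20");
    Map<String, String> hoodieProps =
        FlinkOptions.getPropertiesWithPrefix(conf.toMap(), "properties.");    // {hoodie.cleaner.commits.retained=20}
    Configuration flat = FlinkOptions.flatOptions(conf);                      // same options, prefix removed
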
+ */ + public static boolean isDefaultValueDefined(Configuration conf, ConfigOption option) { + return !conf.getOptional(option).isPresent() + || conf.get(option).equals(option.defaultValue()); + } + + /** + * Returns all the optional config options. + */ + public static Set> optionalOptions() { + Set> options = new HashSet<>(allOptions()); + options.remove(PATH); + return options; + } + + /** + * Returns all the config options. + */ + public static List> allOptions() { + Field[] declaredFields = FlinkOptions.class.getDeclaredFields(); + List> options = new ArrayList<>(); + for (Field field : declaredFields) { + if (java.lang.reflect.Modifier.isStatic(field.getModifiers()) + && field.getType().equals(ConfigOption.class)) { + try { + options.add((ConfigOption) field.get(ConfigOption.class)); + } catch (IllegalAccessException e) { + throw new HoodieException("Error while fetching static config option", e); + } + } + } + return options; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/HadoopConfigurations.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/HadoopConfigurations.java new file mode 100644 index 0000000000000..7a9e18b9b6d9a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/HadoopConfigurations.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.configuration; + +import org.apache.hudi.util.FlinkClientUtil; + +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.Path; + +import java.util.Map; + +/** + * Utilities for fetching hadoop configurations. + */ +public class HadoopConfigurations { + private static final String HADOOP_PREFIX = "hadoop."; + private static final String PARQUET_PREFIX = "parquet."; + + /** + * Creates a merged hadoop configuration with given flink configuration and hadoop configuration. + */ + public static org.apache.hadoop.conf.Configuration getParquetConf( + org.apache.flink.configuration.Configuration options, + org.apache.hadoop.conf.Configuration hadoopConf) { + org.apache.hadoop.conf.Configuration copy = new org.apache.hadoop.conf.Configuration(hadoopConf); + Map parquetOptions = FlinkOptions.getPropertiesWithPrefix(options.toMap(), PARQUET_PREFIX); + parquetOptions.forEach((k, v) -> copy.set(PARQUET_PREFIX + k, v)); + return copy; + } + + /** + * Creates a new hadoop configuration that is initialized with the given flink configuration. 
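A rough companion sketch, assuming the helpers defined here (keys and values are illustrative): 'hadoop.' prefixed options end up in the Hadoop configuration and 'parquet.' prefixed options in the Parquet writer configuration.

    Configuration flinkConf = new Configuration();
    flinkConf.setString("hadoop.dfs.replication", "2");
    flinkConf.setString("parquet.compression", "SNAPPY");
    org.apache.hadoop.conf.Configuration hadoopConf = HadoopConfigurations.getHadoopConf(flinkConf);
    org.apache.hadoop.conf.Configuration parquetConf = HadoopConfigurations.getParquetConf(flinkConf, hadoopConf);
    // parquetConf carries 'parquet.compression=SNAPPY' on top of the base Hadoop settings
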
+ */ + public static org.apache.hadoop.conf.Configuration getHadoopConf(Configuration conf) { + org.apache.hadoop.conf.Configuration hadoopConf = FlinkClientUtil.getHadoopConf(); + Map options = FlinkOptions.getPropertiesWithPrefix(conf.toMap(), HADOOP_PREFIX); + options.forEach(hadoopConf::set); + return hadoopConf; + } + + /** + * Creates a Hive configuration with configured dir path or empty if no Hive conf dir is set. + */ + public static org.apache.hadoop.conf.Configuration getHiveConf(Configuration conf) { + String explicitDir = conf.getString(FlinkOptions.HIVE_SYNC_CONF_DIR, System.getenv("HIVE_CONF_DIR")); + org.apache.hadoop.conf.Configuration hadoopConf = new org.apache.hadoop.conf.Configuration(); + if (explicitDir != null) { + hadoopConf.addResource(new Path(explicitDir, "hive-site.xml")); + } + return hadoopConf; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsInference.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsInference.java new file mode 100644 index 0000000000000..3e02d23732703 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsInference.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.configuration; + +import org.apache.flink.configuration.Configuration; + +/** + * Tool helping to infer the flink options {@link FlinkOptions}. + */ +public class OptionsInference { + + /** + * Sets up the default source task parallelism if it is not specified. + * + * @param conf The configuration + * @param envTasks The parallelism of the execution env + */ + public static void setupSourceTasks(Configuration conf, int envTasks) { + if (!conf.contains(FlinkOptions.READ_TASKS)) { + conf.setInteger(FlinkOptions.READ_TASKS, envTasks); + } + } + + /** + * Sets up the default sink tasks parallelism if it is not specified. 
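A minimal sketch of how a pipeline is expected to call these inference helpers (the environment setup is illustrative):

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    Configuration conf = new Configuration();
    OptionsInference.setupSourceTasks(conf, env.getParallelism());
    OptionsInference.setupSinkTasks(conf, env.getParallelism());
    // read.tasks, write.tasks, write.bucket_assign.tasks, compaction.tasks and clustering.tasks
    // now fall back to the environment parallelism unless they were set explicitly
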
+ * + * @param conf The configuration + * @param envTasks The parallelism of the execution env + */ + public static void setupSinkTasks(Configuration conf, int envTasks) { + // write task number, default same as execution env tasks + if (!conf.contains(FlinkOptions.WRITE_TASKS)) { + conf.setInteger(FlinkOptions.WRITE_TASKS, envTasks); + } + int writeTasks = conf.getInteger(FlinkOptions.WRITE_TASKS); + // bucket assign tasks, default same as write tasks + if (!conf.contains(FlinkOptions.BUCKET_ASSIGN_TASKS)) { + conf.setInteger(FlinkOptions.BUCKET_ASSIGN_TASKS, writeTasks); + } + // compaction tasks, default same as write tasks + if (!conf.contains(FlinkOptions.COMPACTION_TASKS)) { + conf.setInteger(FlinkOptions.COMPACTION_TASKS, writeTasks); + } + // clustering tasks, default same as write tasks + if (!conf.contains(FlinkOptions.CLUSTERING_TASKS)) { + conf.setInteger(FlinkOptions.CLUSTERING_TASKS, writeTasks); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java new file mode 100644 index 0000000000000..cbd942616fead --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.configuration; + +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.format.FilePathUtils; + +import org.apache.flink.configuration.Configuration; + +import java.util.Locale; +import java.util.Map; + +/** + * Tool helping to resolve the flink options {@link FlinkOptions}. + */ +public class OptionsResolver { + /** + * Returns whether insert clustering is allowed with given configuration {@code conf}. + */ + public static boolean insertClustering(Configuration conf) { + return isCowTable(conf) && isInsertOperation(conf) && conf.getBoolean(FlinkOptions.INSERT_CLUSTER); + } + + /** + * Returns whether the insert is clustering disabled with given configuration {@code conf}. + */ + public static boolean isAppendMode(Configuration conf) { + // 1. inline clustering is supported for COW table; + // 2. async clustering is supported for both COW and MOR table + return isCowTable(conf) && isInsertOperation(conf) && !conf.getBoolean(FlinkOptions.INSERT_CLUSTER) + || needsScheduleClustering(conf); + } + + /** + * Returns whether the table operation is 'insert'. 
+ */ + public static boolean isInsertOperation(Configuration conf) { + WriteOperationType operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)); + return operationType == WriteOperationType.INSERT; + } + + /** + * Returns whether it is a MERGE_ON_READ table. + */ + public static boolean isMorTable(Configuration conf) { + return conf.getString(FlinkOptions.TABLE_TYPE) + .toUpperCase(Locale.ROOT) + .equals(FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + } + + /** + * Returns whether it is a MERGE_ON_READ table. + */ + public static boolean isMorTable(Map options) { + return options.getOrDefault(FlinkOptions.TABLE_TYPE.key(), + FlinkOptions.TABLE_TYPE.defaultValue()).equalsIgnoreCase(FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + } + + /** + * Returns whether it is a COPY_ON_WRITE table. + */ + public static boolean isCowTable(Configuration conf) { + return conf.getString(FlinkOptions.TABLE_TYPE) + .toUpperCase(Locale.ROOT) + .equals(FlinkOptions.TABLE_TYPE_COPY_ON_WRITE); + } + + /** + * Returns whether the payload clazz is {@link DefaultHoodieRecordPayload}. + */ + public static boolean isDefaultHoodieRecordPayloadClazz(Configuration conf) { + return conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME).contains(DefaultHoodieRecordPayload.class.getSimpleName()); + } + + /** + * Returns the preCombine field + * or null if the value is set as {@link FlinkOptions#NO_PRE_COMBINE}. + */ + public static String getPreCombineField(Configuration conf) { + final String preCombineField = conf.getString(FlinkOptions.PRECOMBINE_FIELD); + return preCombineField.equals(FlinkOptions.NO_PRE_COMBINE) ? null : preCombineField; + } + + /** + * Returns whether the compaction strategy is based on elapsed delta time. + */ + public static boolean isDeltaTimeCompaction(Configuration conf) { + final String strategy = conf.getString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY).toLowerCase(Locale.ROOT); + return FlinkOptions.TIME_ELAPSED.equals(strategy) || FlinkOptions.NUM_OR_TIME.equals(strategy); + } + + /** + * Returns whether the table is partitioned. + */ + public static boolean isPartitionedTable(Configuration conf) { + return FilePathUtils.extractPartitionKeys(conf).length > 0; + } + + public static boolean isBucketIndexType(Configuration conf) { + return conf.getString(FlinkOptions.INDEX_TYPE).equalsIgnoreCase(HoodieIndex.IndexType.BUCKET.name()); + } + + /** + * Returns whether the source should emit changelog. + * + * @return true if the source is read as streaming with changelog mode enabled + */ + public static boolean emitChangelog(Configuration conf) { + return conf.getBoolean(FlinkOptions.READ_AS_STREAMING) + && conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED); + } + + /** + * Returns whether there is need to schedule the async compaction. + * + * @param conf The flink configuration. + */ + public static boolean needsAsyncCompaction(Configuration conf) { + return OptionsResolver.isMorTable(conf) + && conf.getBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED); + } + + /** + * Returns whether there is need to schedule the compaction plan. + * + * @param conf The flink configuration. + */ + public static boolean needsScheduleCompaction(Configuration conf) { + return OptionsResolver.isMorTable(conf) + && conf.getBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED); + } + + /** + * Returns whether there is need to schedule the async clustering. + * + * @param conf The flink configuration. 
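A minimal sketch of how a writer pipeline might branch on these resolver checks (the configuration values are illustrative):

    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
    if (OptionsResolver.needsAsyncCompaction(conf)) {
      // wire the async compaction operators into the pipeline
    } else if (OptionsResolver.needsScheduleCompaction(conf)) {
      // only schedule the compaction plan; a separate offline job executes it
    }
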
+ */ + public static boolean needsAsyncClustering(Configuration conf) { + return isInsertOperation(conf) && conf.getBoolean(FlinkOptions.CLUSTERING_ASYNC_ENABLED); + } + + /** + * Returns whether there is need to schedule the clustering plan. + * + * @param conf The flink configuration. + */ + public static boolean needsScheduleClustering(Configuration conf) { + return isInsertOperation(conf) && conf.getBoolean(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED); + } + + /** + * Returns whether the clustering sort is enabled. + */ + public static boolean sortClusteringEnabled(Configuration conf) { + return !StringUtils.isNullOrEmpty(conf.getString(FlinkOptions.CLUSTERING_SORT_COLUMNS)); + } + + /** + * Returns whether the operation is INSERT OVERWRITE (table or partition). + */ + public static boolean isInsertOverwrite(Configuration conf) { + return conf.getString(FlinkOptions.OPERATION).equals(WriteOperationType.INSERT_OVERWRITE_TABLE.value()) + || conf.getString(FlinkOptions.OPERATION).equals(WriteOperationType.INSERT_OVERWRITE.value()); + } + + /** + * Returns whether the read start commit is specific commit timestamp. + */ + public static boolean isSpecificStartCommit(Configuration conf) { + return conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent() + && !conf.get(FlinkOptions.READ_START_COMMIT).equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST); + } + + /** + * Returns true if there are no explicit start and end commits. + */ + public static boolean hasNoSpecificReadCommits(Configuration conf) { + return !conf.contains(FlinkOptions.READ_START_COMMIT) && !conf.contains(FlinkOptions.READ_END_COMMIT); + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/schema/FilebasedSchemaProvider.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/FilebasedSchemaProvider.java similarity index 76% rename from hudi-flink/src/main/java/org/apache/hudi/schema/FilebasedSchemaProvider.java rename to hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/FilebasedSchemaProvider.java index 82699d978141b..a349314b7a111 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/schema/FilebasedSchemaProvider.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/FilebasedSchemaProvider.java @@ -20,10 +20,13 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.util.StreamerUtil; import org.apache.avro.Schema; +import org.apache.flink.configuration.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -43,16 +46,14 @@ public static class Config { private static final String TARGET_SCHEMA_FILE_PROP = "hoodie.deltastreamer.schemaprovider.target.schema.file"; } - private final FileSystem fs; - private final Schema sourceSchema; private Schema targetSchema; + @Deprecated public FilebasedSchemaProvider(TypedProperties props) { - super(props); StreamerUtil.checkRequiredProperties(props, Collections.singletonList(Config.SOURCE_SCHEMA_FILE_PROP)); - this.fs = FSUtils.getFs(props.getString(Config.SOURCE_SCHEMA_FILE_PROP), StreamerUtil.getHadoopConf()); + FileSystem fs = FSUtils.getFs(props.getString(Config.SOURCE_SCHEMA_FILE_PROP), HadoopConfigurations.getHadoopConf(new Configuration())); try { this.sourceSchema = new Schema.Parser().parse(fs.open(new 
Path(props.getString(Config.SOURCE_SCHEMA_FILE_PROP)))); if (props.containsKey(Config.TARGET_SCHEMA_FILE_PROP)) { @@ -64,6 +65,16 @@ public FilebasedSchemaProvider(TypedProperties props) { } } + public FilebasedSchemaProvider(Configuration conf) { + final String sourceSchemaPath = conf.getString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH); + final FileSystem fs = FSUtils.getFs(sourceSchemaPath, HadoopConfigurations.getHadoopConf(conf)); + try { + this.sourceSchema = new Schema.Parser().parse(fs.open(new Path(sourceSchemaPath))); + } catch (IOException ioe) { + throw new HoodieIOException("Error reading schema", ioe); + } + } + @Override public Schema getSourceSchema() { return sourceSchema; diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaProvider.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaProvider.java new file mode 100644 index 0000000000000..5def413b5029e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaProvider.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.schema; + +import org.apache.avro.Schema; + +import java.io.Serializable; + +/** + * Class to provide schema for reading data and also writing into a Hoodie table. + */ +public abstract class SchemaProvider implements Serializable { + + public abstract Schema getSourceSchema(); + + public Schema getTargetSchema() { + // by default, use source schema as target for hoodie table as well + return getSourceSchema(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaRegistryProvider.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaRegistryProvider.java new file mode 100644 index 0000000000000..c302c1db0d133 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaRegistryProvider.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
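A minimal sketch of resolving the source schema through the file based provider above (the schema path is illustrative):

    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH, "file:///tmp/source-schema.avsc");
    Schema sourceSchema = new FilebasedSchemaProvider(conf).getSourceSchema();
    // getTargetSchema() falls back to the source schema unless a provider overrides it
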
+ */ + +package org.apache.hudi.schema; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.util.StreamerUtil; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.avro.Schema; + +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Base64; +import java.util.Collections; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Obtains latest schema from the Confluent/Kafka schema-registry. + *

    + * https://github.com/confluentinc/schema-registry + */ +public class SchemaRegistryProvider extends SchemaProvider { + + private final TypedProperties config; + + + /** + * Configs supported. + */ + public static class Config { + + private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; + private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = + "hoodie.deltastreamer.schemaprovider.registry.targetUrl"; + } + + /** + * The method takes the provided url {@code registryUrl} and gets the schema from the schema registry using that url. + * If the caller provides userInfo credentials in the url (e.g "https://foo:bar@schemaregistry.org") then the credentials + * are extracted the url using the Matcher and the extracted credentials are set on the request as an Authorization + * header. + * + * @param registryUrl + * @return the Schema in String form. + * @throws IOException + */ + public String fetchSchemaFromRegistry(String registryUrl) throws IOException { + URL registry; + HttpURLConnection connection; + Matcher matcher = Pattern.compile("://(.*?)@").matcher(registryUrl); + if (matcher.find()) { + String creds = matcher.group(1); + String urlWithoutCreds = registryUrl.replace(creds + "@", ""); + registry = new URL(urlWithoutCreds); + connection = (HttpURLConnection) registry.openConnection(); + setAuthorizationHeader(matcher.group(1), connection); + } else { + registry = new URL(registryUrl); + connection = (HttpURLConnection) registry.openConnection(); + } + ObjectMapper mapper = new ObjectMapper(); + JsonNode node = mapper.readTree(getStream(connection)); + return node.get("schema").asText(); + } + + protected void setAuthorizationHeader(String creds, HttpURLConnection connection) { + String encodedAuth = Base64.getEncoder().encodeToString(creds.getBytes(StandardCharsets.UTF_8)); + connection.setRequestProperty("Authorization", "Basic " + encodedAuth); + } + + protected InputStream getStream(HttpURLConnection connection) throws IOException { + return connection.getInputStream(); + } + + public SchemaRegistryProvider(TypedProperties props) { + this.config = props; + StreamerUtil.checkRequiredProperties(props, Collections.singletonList(Config.SRC_SCHEMA_REGISTRY_URL_PROP)); + } + + private Schema getSchema(String registryUrl) throws IOException { + return new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl)); + } + + @Override + public Schema getSourceSchema() { + String registryUrl = config.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP); + try { + return getSchema(registryUrl); + } catch (IOException ioe) { + throw new HoodieIOException("Error reading source schema from registry :" + registryUrl, ioe); + } + } + + @Override + public Schema getTargetSchema() { + String registryUrl = config.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP); + String targetRegistryUrl = config.getString(Config.TARGET_SCHEMA_REGISTRY_URL_PROP, registryUrl); + try { + return getSchema(targetRegistryUrl); + } catch (IOException ioe) { + throw new HoodieIOException("Error reading target schema from registry :" + registryUrl, ioe); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/CleanFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/CleanFunction.java new file mode 100644 index 0000000000000..638fe9fdab286 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/CleanFunction.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.sink.utils.NonThrownExecutor; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.flink.api.common.functions.AbstractRichFunction; +import org.apache.flink.api.common.state.CheckpointListener; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.sink.SinkFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Sink function that cleans the old commits. + * + *

    It starts a cleaning task on new checkpoints, there is only one cleaning task + * at a time, a new task can not be scheduled until the last task finished(fails or normally succeed). + * The cleaning task never expects to throw but only log. + */ +public class CleanFunction extends AbstractRichFunction + implements SinkFunction, CheckpointedFunction, CheckpointListener { + private static final Logger LOG = LoggerFactory.getLogger(CleanFunction.class); + + private final Configuration conf; + + protected HoodieFlinkWriteClient writeClient; + + private NonThrownExecutor executor; + + private volatile boolean isCleaning; + + public CleanFunction(Configuration conf) { + this.conf = conf; + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext()); + this.executor = NonThrownExecutor.builder(LOG).waitForTasksFinish(true).build(); + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + LOG.info(String.format("exec clean with instant time %s...", instantTime)); + executor.execute(() -> writeClient.clean(instantTime), "wait for cleaning finish"); + } + + @Override + public void notifyCheckpointComplete(long l) throws Exception { + if (conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED) && isCleaning) { + executor.execute(() -> { + try { + this.writeClient.waitForCleaningFinish(); + } finally { + // ensure to switch the isCleaning flag + this.isCleaning = false; + } + }, "wait for cleaning finish"); + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + if (conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED) && !isCleaning) { + try { + this.writeClient.startAsyncCleaning(); + this.isCleaning = true; + } catch (Throwable throwable) { + // catch the exception to not affect the normal checkpointing + LOG.warn("Error while start async cleaning", throwable); + } + } + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + // no operation + } + + @Override + public void close() throws Exception { + if (executor != null) { + executor.close(); + } + + if (this.writeClient != null) { + this.writeClient.close(); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java new file mode 100644 index 0000000000000..2748af5290646 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
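A minimal sketch of attaching the clean function as a sink (the upstream stream and its element type are placeholders; the real pipeline feeds it commit events, and CleanFunction is assumed to be generic over the element type):

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    Configuration conf = new Configuration();
    conf.setBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED, true);
    env.fromElements("commit-placeholder")        // hypothetical upstream of commit events
        .addSink(new CleanFunction<>(conf))
        .name("clean_commits");
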
+ */ + +package org.apache.hudi.sink; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.ObjectSizeCalculator; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.sink.common.AbstractStreamWriteFunction; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.table.action.commit.FlinkWriteHelper; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.function.BiFunction; +import java.util.stream.Collectors; + +/** + * Sink function to write the data to the underneath filesystem. + * + *

    Work Flow

    + * + *

    The function firstly buffers the data as a batch of {@link HoodieRecord}s, + * It flushes(write) the records batch when the batch size exceeds the configured size {@link FlinkOptions#WRITE_BATCH_SIZE} + * or the total buffer size exceeds the configured size {@link FlinkOptions#WRITE_TASK_MAX_SIZE} + * or a Flink checkpoint starts. After a batch has been written successfully, + * the function notifies its operator coordinator {@link StreamWriteOperatorCoordinator} to mark a successful write. + * + *
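<p>For example, the two flush thresholds could be tuned as follows; the option keys are the ones referenced above, while the numbers are purely illustrative (not defaults): + * <pre>{@code + *   Configuration conf = new Configuration(); + *   // flush a single bucket once it buffers roughly 64MB of records + *   conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 64.0D); + *   // cap the whole write task buffer at roughly 512MB + *   conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 512.0D); + * }</pre> + * + * <p>Note that the usable buffer is smaller than {@link FlinkOptions#WRITE_TASK_MAX_SIZE}: as computed in {@code TotalSizeTracer} below, + * a constant 100MB of merge reader memory and {@link FlinkOptions#WRITE_MERGE_MAX_MEMORY} are subtracted first, + * e.g. 512 - 100 - 100 = 312MB with the values above and a 100MB merge memory setting. + * + *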

    The Semantics

    + * + *

    The task implements exactly-once semantics by buffering the data between checkpoints. The operator coordinator + * starts a new instant on the timeline when a checkpoint triggers, the coordinator checkpoints always + * start before its operator, so when this function starts a checkpoint, a REQUESTED instant already exists. + * + *
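<p>A rough sketch of one checkpoint cycle, simplified from the code in this class and in {@link StreamWriteOperatorCoordinator}: + * <pre> + *   1. the coordinator checkpoint runs first, so a REQUESTED/INFLIGHT instant is already on the timeline; + *   2. this function flushes its buffered records and sends a {@link WriteMetadataEvent} to the coordinator; + *   3. when the checkpoint completes, the coordinator commits the instant and starts a new one; + *   4. the function resumes buffering and flushing against the new instant. + * </pre> + * + *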

    The function process thread blocks data buffering after the checkpoint thread finishes flushing the existing data buffer until + * the current checkpoint succeed and the coordinator starts a new instant. Any error triggers the job failure during the metadata committing, + * when the job recovers from a failure, the write function re-send the write metadata to the coordinator to see if these metadata + * can re-commit, thus if unexpected error happens during the instant committing, the coordinator would retry to commit when the job + * recovers. + * + *

    Fault Tolerance

    + * + *

    The operator coordinator checks and commits the last instant then starts a new one after a checkpoint finished successfully. + * It rolls back any inflight instant before it starts a new instant, this means one hoodie instant only span one checkpoint, + * the write function blocks data buffer flushing for the configured checkpoint timeout + * before it throws exception, any checkpoint failure would finally trigger the job failure. + * + *

    Note: The function task requires the input stream be shuffled by the file IDs. + * + * @param Type of the input record + * @see StreamWriteOperatorCoordinator + */ +public class StreamWriteFunction extends AbstractStreamWriteFunction { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(StreamWriteFunction.class); + + /** + * Write buffer as buckets for a checkpoint. The key is bucket ID. + */ + private transient Map buckets; + + private transient BiFunction, String, List> writeFunction; + + /** + * Total size tracer. + */ + private transient TotalSizeTracer tracer; + + /** + * Constructs a StreamingSinkFunction. + * + * @param config The config options + */ + public StreamWriteFunction(Configuration config) { + super(config); + } + + @Override + public void open(Configuration parameters) throws IOException { + this.tracer = new TotalSizeTracer(this.config); + initBuffer(); + initWriteFunction(); + } + + @Override + public void snapshotState() { + // Based on the fact that the coordinator starts the checkpoint first, + // it would check the validity. + // wait for the buffer data flush out and request a new instant + flushRemaining(false); + } + + @Override + public void processElement(I value, ProcessFunction.Context ctx, Collector out) throws Exception { + bufferRecord((HoodieRecord) value); + } + + @Override + public void close() { + if (this.writeClient != null) { + this.writeClient.cleanHandlesGracefully(); + this.writeClient.close(); + } + } + + /** + * End input action for batch source. + */ + public void endInput() { + super.endInput(); + flushRemaining(true); + this.writeClient.cleanHandles(); + this.writeStatuses.clear(); + } + + // ------------------------------------------------------------------------- + // Getter/Setter + // ------------------------------------------------------------------------- + @VisibleForTesting + @SuppressWarnings("rawtypes") + public Map> getDataBuffer() { + Map> ret = new HashMap<>(); + for (Map.Entry entry : buckets.entrySet()) { + ret.put(entry.getKey(), entry.getValue().writeBuffer()); + } + return ret; + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void initBuffer() { + this.buckets = new LinkedHashMap<>(); + } + + private void initWriteFunction() { + final String writeOperation = this.config.get(FlinkOptions.OPERATION); + switch (WriteOperationType.fromValue(writeOperation)) { + case INSERT: + this.writeFunction = (records, instantTime) -> this.writeClient.insert(records, instantTime); + break; + case UPSERT: + this.writeFunction = (records, instantTime) -> this.writeClient.upsert(records, instantTime); + break; + case INSERT_OVERWRITE: + this.writeFunction = (records, instantTime) -> this.writeClient.insertOverwrite(records, instantTime); + break; + case INSERT_OVERWRITE_TABLE: + this.writeFunction = (records, instantTime) -> this.writeClient.insertOverwriteTable(records, instantTime); + break; + default: + throw new RuntimeException("Unsupported write operation : " + writeOperation); + } + } + + /** + * Represents a data item in the buffer, this is needed to reduce the + * memory footprint. + * + *

    A {@link HoodieRecord} was firstly transformed into a {@link DataItem} + * for buffering, it then transforms back to the {@link HoodieRecord} before flushing. + */ + private static class DataItem { + private final String key; // record key + private final String instant; // 'U' or 'I' + private final HoodieRecordPayload data; // record payload + private final HoodieOperation operation; // operation + + private DataItem(String key, String instant, HoodieRecordPayload data, HoodieOperation operation) { + this.key = key; + this.instant = instant; + this.data = data; + this.operation = operation; + } + + public static DataItem fromHoodieRecord(HoodieRecord record) { + return new DataItem( + record.getRecordKey(), + record.getCurrentLocation().getInstantTime(), + ((HoodieAvroRecord) record).getData(), + record.getOperation()); + } + + public HoodieRecord toHoodieRecord(String partitionPath) { + HoodieKey hoodieKey = new HoodieKey(this.key, partitionPath); + HoodieRecord record = new HoodieAvroRecord<>(hoodieKey, data, operation); + HoodieRecordLocation loc = new HoodieRecordLocation(instant, null); + record.setCurrentLocation(loc); + return record; + } + } + + /** + * Data bucket. + */ + private static class DataBucket { + private final List records; + private final BufferSizeDetector detector; + private final String partitionPath; + private final String fileID; + + private DataBucket(Double batchSize, HoodieRecord hoodieRecord) { + this.records = new ArrayList<>(); + this.detector = new BufferSizeDetector(batchSize); + this.partitionPath = hoodieRecord.getPartitionPath(); + this.fileID = hoodieRecord.getCurrentLocation().getFileId(); + } + + /** + * Prepare the write data buffer: patch up all the records with correct partition path. + */ + public List writeBuffer() { + // rewrite all the records with new record key + return records.stream() + .map(record -> record.toHoodieRecord(partitionPath)) + .collect(Collectors.toList()); + } + + /** + * Sets up before flush: patch up the first record with correct partition path and fileID. + * + *

    Note: the method may modify the given records {@code records}. + */ + public void preWrite(List records) { + // rewrite the first record with expected fileID + HoodieRecord first = records.get(0); + HoodieRecord record = new HoodieAvroRecord<>(first.getKey(), (HoodieRecordPayload) first.getData(), first.getOperation()); + HoodieRecordLocation newLoc = new HoodieRecordLocation(first.getCurrentLocation().getInstantTime(), fileID); + record.setCurrentLocation(newLoc); + + records.set(0, record); + } + + public void reset() { + this.records.clear(); + this.detector.reset(); + } + } + + /** + * Tool to detect if to flush out the existing buffer. + * Sampling the record to compute the size with 0.01 percentage. + */ + private static class BufferSizeDetector { + private final Random random = new Random(47); + private static final int DENOMINATOR = 100; + + private final double batchSizeBytes; + + private long lastRecordSize = -1L; + private long totalSize = 0L; + + BufferSizeDetector(double batchSizeMb) { + this.batchSizeBytes = batchSizeMb * 1024 * 1024; + } + + boolean detect(Object record) { + if (lastRecordSize == -1 || sampling()) { + lastRecordSize = ObjectSizeCalculator.getObjectSize(record); + } + totalSize += lastRecordSize; + return totalSize > this.batchSizeBytes; + } + + boolean sampling() { + // 0.01 sampling percentage + return random.nextInt(DENOMINATOR) == 1; + } + + void reset() { + this.lastRecordSize = -1L; + this.totalSize = 0L; + } + } + + /** + * Tool to trace the total buffer size. It computes the maximum buffer size, + * if current buffer size is greater than the maximum buffer size, the data bucket + * flush triggers. + */ + private static class TotalSizeTracer { + private long bufferSize = 0L; + private final double maxBufferSize; + + TotalSizeTracer(Configuration conf) { + long mergeReaderMem = 100; // constant 100MB + long mergeMapMaxMem = conf.getInteger(FlinkOptions.WRITE_MERGE_MAX_MEMORY); + this.maxBufferSize = (conf.getDouble(FlinkOptions.WRITE_TASK_MAX_SIZE) - mergeReaderMem - mergeMapMaxMem) * 1024 * 1024; + final String errMsg = String.format("'%s' should be at least greater than '%s' plus merge reader memory(constant 100MB now)", + FlinkOptions.WRITE_TASK_MAX_SIZE.key(), FlinkOptions.WRITE_MERGE_MAX_MEMORY.key()); + ValidationUtils.checkState(this.maxBufferSize > 0, errMsg); + } + + /** + * Trace the given record size {@code recordSize}. + * + * @param recordSize The record size + * @return true if the buffer size exceeds the maximum buffer size + */ + boolean trace(long recordSize) { + this.bufferSize += recordSize; + return this.bufferSize > this.maxBufferSize; + } + + void countDown(long size) { + this.bufferSize -= size; + } + + public void reset() { + this.bufferSize = 0; + } + } + + /** + * Returns the bucket ID with the given value {@code value}. + */ + private String getBucketID(HoodieRecord record) { + final String fileId = record.getCurrentLocation().getFileId(); + return StreamerUtil.generateBucketKey(record.getPartitionPath(), fileId); + } + + /** + * Buffers the given record. + * + *

    Flush the data bucket first if the bucket records size is greater than + * the configured value {@link FlinkOptions#WRITE_BATCH_SIZE}. + * + *

    Flush the max size data bucket if the total buffer size exceeds the configured + * threshold {@link FlinkOptions#WRITE_TASK_MAX_SIZE}. + * + * @param value HoodieRecord + */ + protected void bufferRecord(HoodieRecord value) { + final String bucketID = getBucketID(value); + + DataBucket bucket = this.buckets.computeIfAbsent(bucketID, + k -> new DataBucket(this.config.getDouble(FlinkOptions.WRITE_BATCH_SIZE), value)); + final DataItem item = DataItem.fromHoodieRecord(value); + + bucket.records.add(item); + + boolean flushBucket = bucket.detector.detect(item); + boolean flushBuffer = this.tracer.trace(bucket.detector.lastRecordSize); + if (flushBucket) { + if (flushBucket(bucket)) { + this.tracer.countDown(bucket.detector.totalSize); + bucket.reset(); + } + } else if (flushBuffer) { + // find the max size bucket and flush it out + List sortedBuckets = this.buckets.values().stream() + .sorted((b1, b2) -> Long.compare(b2.detector.totalSize, b1.detector.totalSize)) + .collect(Collectors.toList()); + final DataBucket bucketToFlush = sortedBuckets.get(0); + if (flushBucket(bucketToFlush)) { + this.tracer.countDown(bucketToFlush.detector.totalSize); + bucketToFlush.reset(); + } else { + LOG.warn("The buffer size hits the threshold {}, but still flush the max size data bucket failed!", this.tracer.maxBufferSize); + } + } + } + + private boolean hasData() { + return this.buckets.size() > 0 + && this.buckets.values().stream().anyMatch(bucket -> bucket.records.size() > 0); + } + + @SuppressWarnings("unchecked, rawtypes") + private boolean flushBucket(DataBucket bucket) { + String instant = instantToWrite(true); + + if (instant == null) { + // in case there are empty checkpoints that has no input data + LOG.info("No inflight instant when flushing data, skip."); + return false; + } + + List records = bucket.writeBuffer(); + ValidationUtils.checkState(records.size() > 0, "Data bucket to flush has no buffering records"); + if (config.getBoolean(FlinkOptions.PRE_COMBINE)) { + records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1); + } + bucket.preWrite(records); + final List writeStatus = new ArrayList<>(writeFunction.apply(records, instant)); + records.clear(); + final WriteMetadataEvent event = WriteMetadataEvent.builder() + .taskID(taskID) + .instantTime(instant) // the write instant may shift but the event still use the currentInstant. + .writeStatus(writeStatus) + .lastBatch(false) + .endInput(false) + .build(); + + this.eventGateway.sendEventToCoordinator(event); + writeStatuses.addAll(writeStatus); + return true; + } + + @SuppressWarnings("unchecked, rawtypes") + private void flushRemaining(boolean endInput) { + this.currentInstant = instantToWrite(hasData()); + if (this.currentInstant == null) { + // in case there are empty checkpoints that has no input data + throw new HoodieException("No inflight instant when flushing data!"); + } + final List writeStatus; + if (buckets.size() > 0) { + writeStatus = new ArrayList<>(); + this.buckets.values() + // The records are partitioned by the bucket ID and each batch sent to + // the writer belongs to one bucket. 
+ .forEach(bucket -> { + List records = bucket.writeBuffer(); + if (records.size() > 0) { + if (config.getBoolean(FlinkOptions.PRE_COMBINE)) { + records = FlinkWriteHelper.newInstance().deduplicateRecords(records, (HoodieIndex) null, -1); + } + bucket.preWrite(records); + writeStatus.addAll(writeFunction.apply(records, currentInstant)); + records.clear(); + bucket.reset(); + } + }); + } else { + LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, currentInstant); + writeStatus = Collections.emptyList(); + } + final WriteMetadataEvent event = WriteMetadataEvent.builder() + .taskID(taskID) + .instantTime(currentInstant) + .writeStatus(writeStatus) + .lastBatch(true) + .endInput(endInput) + .build(); + + this.eventGateway.sendEventToCoordinator(event); + this.buckets.clear(); + this.tracer.reset(); + this.writeClient.cleanHandles(); + this.writeStatuses.addAll(writeStatus); + // blocks flushing until the coordinator starts a new instant + this.confirming = true; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperator.java new file mode 100644 index 0000000000000..9e39e3f26a70a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperator.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.hudi.sink.common.AbstractWriteOperator; +import org.apache.hudi.sink.common.WriteOperatorFactory; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.operators.StreamSink; + +/** + * Operator for {@link StreamSink}. + * + * @param The input type + */ +public class StreamWriteOperator extends AbstractWriteOperator { + + public StreamWriteOperator(Configuration conf) { + super(new StreamWriteFunction<>(conf)); + } + + public static WriteOperatorFactory getFactory(Configuration conf) { + return WriteOperatorFactory.instance(conf, new StreamWriteOperator<>(conf)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java new file mode 100644 index 0000000000000..17b789e2f0dcc --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java @@ -0,0 +1,637 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.event.CommitAckEvent; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.sink.meta.CkpMetadata; +import org.apache.hudi.sink.utils.HiveSyncContext; +import org.apache.hudi.sink.utils.NonThrownExecutor; +import org.apache.hudi.util.ClusteringUtil; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.TaskNotRunningException; +import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.stream.Collectors; + +import static org.apache.hudi.util.StreamerUtil.initTableIfNotExists; + +/** + * {@link OperatorCoordinator} for {@link StreamWriteFunction}. + * + *

    This coordinator starts a new instant when a new checkpoint starts. It commits the instant when all the + * operator tasks write the buffer successfully for a round of checkpoint. + * + *
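<p>The coordinator is not constructed directly; the Flink runtime instantiates it through the nested {@link Provider}, roughly as sketched below + * ({@code operatorId}, {@code conf} and {@code context} are supplied by the runtime): + * <pre>{@code + *   OperatorCoordinator.Provider provider = new StreamWriteOperatorCoordinator.Provider(operatorId, conf); + *   OperatorCoordinator coordinator = provider.create(context); // called by the Flink runtime + * }</pre> + * + *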

    If there is no data for a round of checkpointing, it resets the events buffer and returns early. + * + * @see StreamWriteFunction for the work flow and semantics + */ +public class StreamWriteOperatorCoordinator + implements OperatorCoordinator { + private static final Logger LOG = LoggerFactory.getLogger(StreamWriteOperatorCoordinator.class); + + /** + * Config options. + */ + private final Configuration conf; + + /** + * Hive config options. + */ + private final SerializableConfiguration hiveConf; + + /** + * Coordinator context. + */ + private final Context context; + + /** + * Gateways for sending events to sub tasks. + */ + private transient SubtaskGateway[] gateways; + + /** + * Write client. + */ + private transient HoodieFlinkWriteClient writeClient; + + /** + * Meta client. + */ + private transient HoodieTableMetaClient metaClient; + + /** + * Current REQUESTED instant, for validation. + */ + private volatile String instant = WriteMetadataEvent.BOOTSTRAP_INSTANT; + + /** + * Event buffer for one round of checkpointing. When all the elements are non-null and have the same + * write instant, then the instant succeed and we can commit it. + */ + private transient WriteMetadataEvent[] eventBuffer; + + /** + * Task number of the operator. + */ + private final int parallelism; + + /** + * A single-thread executor to handle all the asynchronous jobs of the coordinator. + */ + private NonThrownExecutor executor; + + /** + * A single-thread executor to handle asynchronous hive sync. + */ + private NonThrownExecutor hiveSyncExecutor; + + /** + * Context that holds variables for asynchronous hive sync. + */ + private HiveSyncContext hiveSyncContext; + + /** + * The table state. + */ + private transient TableState tableState; + + /** + * The checkpoint metadata. + */ + private CkpMetadata ckpMetadata; + + /** + * Constructs a StreamingSinkOperatorCoordinator. + * + * @param conf The config options + * @param context The coordinator context + */ + public StreamWriteOperatorCoordinator( + Configuration conf, + Context context) { + this.conf = conf; + this.context = context; + this.parallelism = context.currentParallelism(); + this.hiveConf = new SerializableConfiguration(HadoopConfigurations.getHiveConf(conf)); + } + + @Override + public void start() throws Exception { + // setup classloader for APIs that use reflection without taking ClassLoader param + // reference: https://stackoverflow.com/questions/1771679/difference-between-threads-context-class-loader-and-normal-classloader + Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); + // initialize event buffer + reset(); + this.gateways = new SubtaskGateway[this.parallelism]; + // init table, create if not exists. 
+ this.metaClient = initTableIfNotExists(this.conf); + this.ckpMetadata = initCkpMetadata(this.metaClient); + // the write client must create after the table creation + this.writeClient = FlinkWriteClients.createWriteClient(conf); + initMetadataTable(this.writeClient); + this.tableState = TableState.create(conf); + // start the executor + this.executor = NonThrownExecutor.builder(LOG) + .exceptionHook((errMsg, t) -> this.context.failJob(new HoodieException(errMsg, t))) + .waitForTasksFinish(true).build(); + // start the executor if required + if (tableState.syncHive) { + initHiveSync(); + } + } + + @Override + public void close() throws Exception { + // teardown the resource + if (executor != null) { + executor.close(); + } + if (hiveSyncExecutor != null) { + hiveSyncExecutor.close(); + } + // the write client must close after the executor service + // because the task in the service may send requests to the embedded timeline service. + if (writeClient != null) { + writeClient.close(); + } + this.eventBuffer = null; + if (this.ckpMetadata != null) { + this.ckpMetadata.close(); + } + } + + @Override + public void checkpointCoordinator(long checkpointId, CompletableFuture result) { + executor.execute( + () -> { + try { + result.complete(new byte[0]); + } catch (Throwable throwable) { + // when a checkpoint fails, throws directly. + result.completeExceptionally( + new CompletionException( + String.format("Failed to checkpoint Instant %s for source %s", + this.instant, this.getClass().getSimpleName()), throwable)); + } + }, "taking checkpoint %d", checkpointId + ); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) { + executor.execute( + () -> { + // The executor thread inherits the classloader of the #notifyCheckpointComplete + // caller, which is a AppClassLoader. + Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); + // for streaming mode, commits the ever received events anyway, + // the stream write task snapshot and flush the data buffer synchronously in sequence, + // so a successful checkpoint subsumes the old one(follows the checkpoint subsuming contract) + final boolean committed = commitInstant(this.instant, checkpointId); + + if (tableState.scheduleCompaction) { + // if async compaction is on, schedule the compaction + CompactionUtil.scheduleCompaction(metaClient, writeClient, tableState.isDeltaTimeCompaction, committed); + } + + if (tableState.scheduleClustering) { + // if async clustering is on, schedule the clustering + ClusteringUtil.scheduleClustering(conf, writeClient, committed); + } + + if (committed) { + // start new instant. 
+ startInstant(); + // sync Hive if is enabled + syncHiveAsync(); + } + }, "commits the instant %s", this.instant + ); + } + + @Override + public void resetToCheckpoint(long checkpointID, byte[] checkpointData) { + // no operation + } + + @Override + public void handleEventFromOperator(int i, OperatorEvent operatorEvent) { + ValidationUtils.checkState(operatorEvent instanceof WriteMetadataEvent, + "The coordinator can only handle WriteMetaEvent"); + WriteMetadataEvent event = (WriteMetadataEvent) operatorEvent; + + if (event.isEndInput()) { + // handle end input event synchronously + // wrap handleEndInputEvent in executeSync to preserve the order of events + executor.executeSync(() -> handleEndInputEvent(event), "handle end input event for instant %s", this.instant); + } else { + executor.execute( + () -> { + if (event.isBootstrap()) { + handleBootstrapEvent(event); + } else { + handleWriteMetaEvent(event); + } + }, "handle write metadata event for instant %s", this.instant + ); + } + } + + @Override + public void subtaskFailed(int i, @Nullable Throwable throwable) { + // reset the event + this.eventBuffer[i] = null; + LOG.warn("Reset the event for task [" + i + "]", throwable); + } + + @Override + public void subtaskReset(int i, long l) { + // no operation + } + + @Override + public void subtaskReady(int i, SubtaskGateway subtaskGateway) { + this.gateways[i] = subtaskGateway; + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void initHiveSync() { + this.hiveSyncExecutor = NonThrownExecutor.builder(LOG).waitForTasksFinish(true).build(); + this.hiveSyncContext = HiveSyncContext.create(conf, this.hiveConf); + } + + private void syncHiveAsync() { + if (tableState.syncHive) { + this.hiveSyncExecutor.execute(this::doSyncHive, "sync hive metadata for instant %s", this.instant); + } + } + + private void syncHive() { + if (tableState.syncHive) { + doSyncHive(); + LOG.info("Sync hive metadata for instant {} success!", this.instant); + } + } + + /** + * Sync hoodie table metadata to Hive metastore. + */ + public void doSyncHive() { + hiveSyncContext.hiveSyncTool().syncHoodieTable(); + } + + private static void initMetadataTable(HoodieFlinkWriteClient writeClient) { + writeClient.initMetadataTable(); + } + + private static CkpMetadata initCkpMetadata(HoodieTableMetaClient metaClient) throws IOException { + CkpMetadata ckpMetadata = CkpMetadata.getInstance(metaClient.getFs(), metaClient.getBasePath()); + ckpMetadata.bootstrap(); + return ckpMetadata; + } + + private void reset() { + this.eventBuffer = new WriteMetadataEvent[this.parallelism]; + } + + /** + * Checks the buffer is ready to commit. + */ + private boolean allEventsReceived() { + return Arrays.stream(eventBuffer) + // we do not use event.isReady to check the instant + // because the write task may send an event eagerly for empty + // data set, the even may have a timestamp of last committed instant. + .allMatch(event -> event != null && event.isLastBatch()); + } + + private void addEventToBuffer(WriteMetadataEvent event) { + if (this.eventBuffer[event.getTaskID()] != null) { + this.eventBuffer[event.getTaskID()].mergeWith(event); + } else { + this.eventBuffer[event.getTaskID()] = event; + } + } + + private void startInstant() { + // put the assignment in front of metadata generation, + // because the instant request from write task is asynchronous. 
+ this.instant = this.writeClient.startCommit(tableState.commitAction, this.metaClient); + this.metaClient.getActiveTimeline().transitionRequestedToInflight(tableState.commitAction, this.instant); + this.ckpMetadata.startInstant(this.instant); + LOG.info("Create instant [{}] for table [{}] with type [{}]", this.instant, + this.conf.getString(FlinkOptions.TABLE_NAME), conf.getString(FlinkOptions.TABLE_TYPE)); + } + + /** + * Initializes the instant. + * + *

    Recommits the last inflight instant if the write metadata checkpoint successfully + * but was not committed due to some rare cases. + * + *
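<p>Concretely, the bootstrap events sent by the write tasks carry the last pending instant; if that instant is not found on the + * completed timeline, the buffered write metadata is committed again before a fresh instant is started. + * + *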

    Starts a new instant, a writer can not flush data buffer + * until it finds a new inflight instant on the timeline. + */ + private void initInstant(String instant) { + HoodieTimeline completedTimeline = + StreamerUtil.createMetaClient(conf).getActiveTimeline().filterCompletedInstants(); + executor.execute(() -> { + if (instant.equals("") || completedTimeline.containsInstant(instant)) { + // the last instant committed successfully + reset(); + } else { + LOG.info("Recommit instant {}", instant); + commitInstant(instant); + } + // starts a new instant + startInstant(); + // upgrade downgrade + this.writeClient.upgradeDowngrade(this.instant, this.metaClient); + }, "initialize instant %s", instant); + } + + private void handleBootstrapEvent(WriteMetadataEvent event) { + this.eventBuffer[event.getTaskID()] = event; + if (Arrays.stream(eventBuffer).allMatch(evt -> evt != null && evt.isBootstrap())) { + // start to initialize the instant. + initInstant(event.getInstantTime()); + } + } + + private void handleEndInputEvent(WriteMetadataEvent event) { + addEventToBuffer(event); + if (allEventsReceived()) { + // start to commit the instant. + boolean committed = commitInstant(this.instant); + if (committed) { + // The executor thread inherits the classloader of the #handleEventFromOperator + // caller, which is a AppClassLoader. + Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); + // sync Hive synchronously if it is enabled in batch mode. + syncHive(); + // schedules the compaction plan in batch execution mode + if (tableState.scheduleCompaction) { + // if async compaction is on, schedule the compaction + CompactionUtil.scheduleCompaction(metaClient, writeClient, tableState.isDeltaTimeCompaction, true); + } + } + } + } + + private void handleWriteMetaEvent(WriteMetadataEvent event) { + // the write task does not block after checkpointing(and before it receives a checkpoint success event), + // if it checkpoints succeed then flushes the data buffer again before this coordinator receives a checkpoint + // success event, the data buffer would flush with an older instant time. + ValidationUtils.checkState( + HoodieTimeline.compareTimestamps(this.instant, HoodieTimeline.GREATER_THAN_OR_EQUALS, event.getInstantTime()), + String.format("Receive an unexpected event for instant %s from task %d", + event.getInstantTime(), event.getTaskID())); + + addEventToBuffer(event); + } + + /** + * The coordinator reuses the instant if there is no data for this round of checkpoint, + * sends the commit ack events to unblock the flushing. + */ + private void sendCommitAckEvents(long checkpointId) { + CompletableFuture[] futures = Arrays.stream(this.gateways).filter(Objects::nonNull) + .map(gw -> gw.sendEvent(CommitAckEvent.getInstance(checkpointId))) + .toArray(CompletableFuture[]::new); + CompletableFuture.allOf(futures).whenComplete((resp, error) -> { + if (!sendToFinishedTasks(error)) { + throw new HoodieException("Error while waiting for the commit ack events to finish sending", error); + } + }); + } + + /** + * Decides whether the given exception is caused by sending events to FINISHED tasks. + * + *

    Ugly impl: the exception may change in the future. + */ + private static boolean sendToFinishedTasks(Throwable throwable) { + return throwable.getCause() instanceof TaskNotRunningException + || throwable.getCause().getMessage().contains("running"); + } + + /** + * Commits the instant. + */ + private boolean commitInstant(String instant) { + return commitInstant(instant, -1); + } + + /** + * Commits the instant. + * + * @return true if the write statuses are committed successfully. + */ + private boolean commitInstant(String instant, long checkpointId) { + if (Arrays.stream(eventBuffer).allMatch(Objects::isNull)) { + // The last checkpoint finished successfully. + return false; + } + + List writeResults = Arrays.stream(eventBuffer) + .filter(Objects::nonNull) + .map(WriteMetadataEvent::getWriteStatuses) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + + if (writeResults.size() == 0) { + // No data has written, reset the buffer and returns early + reset(); + // Send commit ack event to the write function to unblock the flushing + // If this checkpoint has no inputs while the next checkpoint has inputs, + // the 'isConfirming' flag should be switched with the ack event. + sendCommitAckEvents(checkpointId); + return false; + } + doCommit(instant, writeResults); + return true; + } + + /** + * Performs the actual commit action. + */ + @SuppressWarnings("unchecked") + private void doCommit(String instant, List writeResults) { + // commit or rollback + long totalErrorRecords = writeResults.stream().map(WriteStatus::getTotalErrorRecords).reduce(Long::sum).orElse(0L); + long totalRecords = writeResults.stream().map(WriteStatus::getTotalRecords).reduce(Long::sum).orElse(0L); + boolean hasErrors = totalErrorRecords > 0; + + if (!hasErrors || this.conf.getBoolean(FlinkOptions.IGNORE_FAILED)) { + HashMap checkpointCommitMetadata = new HashMap<>(); + if (hasErrors) { + LOG.warn("Some records failed to merge but forcing commit since commitOnErrors set to true. Errors/Total=" + + totalErrorRecords + "/" + totalRecords); + } + + final Map> partitionToReplacedFileIds = tableState.isOverwrite + ? writeClient.getPartitionToReplacedFileIds(tableState.operationType, writeResults) + : Collections.emptyMap(); + boolean success = writeClient.commit(instant, writeResults, Option.of(checkpointCommitMetadata), + tableState.commitAction, partitionToReplacedFileIds); + if (success) { + reset(); + this.ckpMetadata.commitInstant(instant); + LOG.info("Commit instant [{}] success!", instant); + } else { + throw new HoodieException(String.format("Commit instant [%s] failed!", instant)); + } + } else { + LOG.error("Error when writing. 
Errors/Total=" + totalErrorRecords + "/" + totalRecords); + LOG.error("The first 100 error messages"); + writeResults.stream().filter(WriteStatus::hasErrors).limit(100).forEach(ws -> { + LOG.error("Global error for partition path {} and fileID {}: {}", + ws.getGlobalError(), ws.getPartitionPath(), ws.getFileId()); + if (ws.getErrors().size() > 0) { + ws.getErrors().forEach((key, value) -> LOG.trace("Error for key:" + key + " and value " + value)); + } + }); + // Rolls back instant + writeClient.rollback(instant); + throw new HoodieException(String.format("Commit instant [%s] failed and rolled back !", instant)); + } + } + + @VisibleForTesting + public WriteMetadataEvent[] getEventBuffer() { + return eventBuffer; + } + + @VisibleForTesting + public String getInstant() { + return instant; + } + + @VisibleForTesting + public Context getContext() { + return context; + } + + @VisibleForTesting + public void setExecutor(NonThrownExecutor executor) throws Exception { + if (this.executor != null) { + this.executor.close(); + } + this.executor = executor; + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Provider for {@link StreamWriteOperatorCoordinator}. + */ + public static class Provider implements OperatorCoordinator.Provider { + private final OperatorID operatorId; + private final Configuration conf; + + public Provider(OperatorID operatorId, Configuration conf) { + this.operatorId = operatorId; + this.conf = conf; + } + + @Override + public OperatorID getOperatorId() { + return this.operatorId; + } + + @Override + public OperatorCoordinator create(Context context) { + return new StreamWriteOperatorCoordinator(this.conf, context); + } + } + + /** + * Remember some table state variables. 
+ */ + private static class TableState implements Serializable { + private static final long serialVersionUID = 1L; + + final WriteOperationType operationType; + final String commitAction; + final boolean isOverwrite; + final boolean scheduleCompaction; + final boolean scheduleClustering; + final boolean syncHive; + final boolean syncMetadata; + final boolean isDeltaTimeCompaction; + + private TableState(Configuration conf) { + this.operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)); + this.commitAction = CommitUtils.getCommitActionType(this.operationType, + HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE).toUpperCase(Locale.ROOT))); + this.isOverwrite = WriteOperationType.isOverwrite(this.operationType); + this.scheduleCompaction = OptionsResolver.needsScheduleCompaction(conf); + this.scheduleClustering = OptionsResolver.needsScheduleClustering(conf); + this.syncHive = conf.getBoolean(FlinkOptions.HIVE_SYNC_ENABLED); + this.syncMetadata = conf.getBoolean(FlinkOptions.METADATA_ENABLED); + this.isDeltaTimeCompaction = OptionsResolver.isDeltaTimeCompaction(conf); + } + + public static TableState create(Configuration conf) { + return new TableState(conf); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java new file mode 100644 index 0000000000000..e1db125731cec --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.append; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; +import org.apache.hudi.sink.bulk.BulkInsertWriterHelper; +import org.apache.hudi.sink.common.AbstractStreamWriteFunction; +import org.apache.hudi.sink.event.WriteMetadataEvent; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.Collector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.List; + +/** + * Sink function to write the data to the underneath filesystem. + * + *

    The function writes base files directly for each checkpoint, + * the file may roll over when it’s size hits the configured threshold. + * + * @param Type of the input record + * @see StreamWriteOperatorCoordinator + */ +public class AppendWriteFunction extends AbstractStreamWriteFunction { + private static final Logger LOG = LoggerFactory.getLogger(AppendWriteFunction.class); + + private static final long serialVersionUID = 1L; + + /** + * Helper class for log mode. + */ + private transient BulkInsertWriterHelper writerHelper; + + /** + * Table row type. + */ + private final RowType rowType; + + /** + * Constructs an AppendWriteFunction. + * + * @param config The config options + */ + public AppendWriteFunction(Configuration config, RowType rowType) { + super(config); + this.rowType = rowType; + } + + @Override + public void snapshotState() { + // Based on the fact that the coordinator starts the checkpoint first, + // it would check the validity. + // wait for the buffer data flush out and request a new instant + flushData(false); + } + + @Override + public void processElement(I value, Context ctx, Collector out) throws Exception { + if (this.writerHelper == null) { + initWriterHelper(); + } + this.writerHelper.write((RowData) value); + } + + /** + * End input action for batch source. + */ + public void endInput() { + super.endInput(); + flushData(true); + this.writeStatuses.clear(); + } + + // ------------------------------------------------------------------------- + // GetterSetter + // ------------------------------------------------------------------------- + @VisibleForTesting + public BulkInsertWriterHelper getWriterHelper() { + return this.writerHelper; + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + private void initWriterHelper() { + final String instant = instantToWrite(true); + if (instant == null) { + // in case there are empty checkpoints that has no input data + throw new HoodieException("No inflight instant when flushing data!"); + } + this.writerHelper = new BulkInsertWriterHelper(this.config, this.writeClient.getHoodieTable(), this.writeClient.getConfig(), + instant, this.taskID, getRuntimeContext().getNumberOfParallelSubtasks(), getRuntimeContext().getAttemptNumber(), + this.rowType); + } + + private void flushData(boolean endInput) { + final List writeStatus; + if (this.writerHelper != null) { + writeStatus = this.writerHelper.getWriteStatuses(this.taskID); + this.currentInstant = this.writerHelper.getInstantTime(); + } else { + writeStatus = Collections.emptyList(); + this.currentInstant = instantToWrite(false); + LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, this.currentInstant); + } + final WriteMetadataEvent event = WriteMetadataEvent.builder() + .taskID(taskID) + .instantTime(this.currentInstant) + .writeStatus(writeStatus) + .lastBatch(true) + .endInput(endInput) + .build(); + this.eventGateway.sendEventToCoordinator(event); + // nullify the write helper for next ckp + this.writerHelper = null; + this.writeStatuses.addAll(writeStatus); + // blocks flushing until the coordinator starts a new instant + this.confirming = true; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteOperator.java new file mode 100644 index 0000000000000..ad1a00203e819 --- 
/dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteOperator.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.append; + +import org.apache.hudi.sink.common.AbstractWriteOperator; +import org.apache.hudi.sink.common.WriteOperatorFactory; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.types.logical.RowType; + +/** + * Operator for {@link AppendWriteFunction}. + * + * @param The input type + */ +public class AppendWriteOperator extends AbstractWriteOperator { + + public AppendWriteOperator(Configuration conf, RowType rowType) { + super(new AppendWriteFunction<>(conf, rowType)); + } + + public static WriteOperatorFactory getFactory(Configuration conf, RowType rowType) { + return WriteOperatorFactory.instance(conf, new AppendWriteOperator<>(conf, rowType)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java new file mode 100644 index 0000000000000..3eaa47e3b6278 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.bootstrap; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordGlobalLocation; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction; +import org.apache.hudi.sink.meta.CkpMetadata; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.format.FormatUtils; +import org.apache.hudi.util.FlinkTables; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.avro.Schema; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.KeyGroupRangeAssignment; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; + +import static java.util.stream.Collectors.toList; +import static org.apache.hudi.util.StreamerUtil.isValidFile; + +/** + * The operator to load index from existing hoodieTable. + * + *

    Each subtask of the function triggers the index bootstrap when the first element came in, + * the record cannot be sent until all the index records have been sent. + * + *
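<p>The partitions to load are filtered by {@link FlinkOptions#INDEX_PARTITION_REGEX}; for example (the pattern below is purely illustrative): + * <pre>{@code + *   conf.setString(FlinkOptions.INDEX_PARTITION_REGEX, "2022-.*"); + * }</pre> + * + *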

    The output records should then shuffle by the recordKey and thus do scalable write. + */ +public class BootstrapOperator> + extends AbstractStreamOperator implements OneInputStreamOperator { + + private static final Logger LOG = LoggerFactory.getLogger(BootstrapOperator.class); + + protected HoodieTable hoodieTable; + + private CkpMetadata ckpMetadata; + + protected final Configuration conf; + + protected transient org.apache.hadoop.conf.Configuration hadoopConf; + protected transient HoodieWriteConfig writeConfig; + + private transient GlobalAggregateManager aggregateManager; + + private transient ListState instantState; + private final Pattern pattern; + private String lastInstantTime; + + public BootstrapOperator(Configuration conf) { + this.conf = conf; + this.pattern = Pattern.compile(conf.getString(FlinkOptions.INDEX_PARTITION_REGEX)); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + lastInstantTime = this.ckpMetadata.lastPendingInstant(); + instantState.update(Collections.singletonList(lastInstantTime)); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + ListStateDescriptor instantStateDescriptor = new ListStateDescriptor<>( + "instantStateDescriptor", + Types.STRING + ); + instantState = context.getOperatorStateStore().getListState(instantStateDescriptor); + + if (context.isRestored()) { + Iterator instantIterator = instantState.get().iterator(); + if (instantIterator.hasNext()) { + lastInstantTime = instantIterator.next(); + } + } + + this.hadoopConf = HadoopConfigurations.getHadoopConf(this.conf); + this.writeConfig = FlinkWriteClients.getHoodieClientConfig(this.conf, true); + this.hoodieTable = FlinkTables.createTable(writeConfig, hadoopConf, getRuntimeContext()); + this.ckpMetadata = CkpMetadata.getInstance(hoodieTable.getMetaClient().getFs(), this.writeConfig.getBasePath()); + this.aggregateManager = getRuntimeContext().getGlobalAggregateManager(); + + preLoadIndexRecords(); + } + + /** + * Load the index records before {@link #processElement}. + */ + protected void preLoadIndexRecords() throws Exception { + String basePath = hoodieTable.getMetaClient().getBasePath(); + int taskID = getRuntimeContext().getIndexOfThisSubtask(); + LOG.info("Start loading records in table {} into the index state, taskId = {}", basePath, taskID); + for (String partitionPath : FSUtils.getAllFoldersWithPartitionMetaFile(FSUtils.getFs(basePath, hadoopConf), basePath)) { + if (pattern.matcher(partitionPath).matches()) { + loadRecords(partitionPath); + } + } + + LOG.info("Finish sending index records, taskId = {}.", getRuntimeContext().getIndexOfThisSubtask()); + + // wait for the other bootstrap tasks finish bootstrapping. + waitForBootstrapReady(getRuntimeContext().getIndexOfThisSubtask()); + } + + /** + * Wait for other bootstrap tasks to finish the index bootstrap. 
+ */ + private void waitForBootstrapReady(int taskID) { + int taskNum = getRuntimeContext().getNumberOfParallelSubtasks(); + int readyTaskNum = 1; + while (taskNum != readyTaskNum) { + try { + readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME + conf.getString(FlinkOptions.TABLE_NAME), taskID, new BootstrapAggFunction()); + LOG.info("Waiting for other bootstrap tasks to complete, taskId = {}.", taskID); + + TimeUnit.SECONDS.sleep(5); + } catch (Exception e) { + LOG.warn("Update global task bootstrap summary error", e); + } + } + } + + @Override + @SuppressWarnings("unchecked") + public void processElement(StreamRecord element) throws Exception { + output.collect((StreamRecord) element); + } + + /** + * Loads all the indices of give partition path into the backup state. + * + * @param partitionPath The partition path + */ + @SuppressWarnings("unchecked") + protected void loadRecords(String partitionPath) throws Exception { + long start = System.currentTimeMillis(); + + final int parallelism = getRuntimeContext().getNumberOfParallelSubtasks(); + final int maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks(); + final int taskID = getRuntimeContext().getIndexOfThisSubtask(); + + HoodieTimeline commitsTimeline = this.hoodieTable.getMetaClient().getCommitsTimeline(); + if (!StringUtils.isNullOrEmpty(lastInstantTime)) { + commitsTimeline = commitsTimeline.findInstantsAfter(lastInstantTime); + } + Option latestCommitTime = commitsTimeline.filterCompletedInstants().lastInstant(); + + if (latestCommitTime.isPresent()) { + BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat()); + Schema schema = new TableSchemaResolver(this.hoodieTable.getMetaClient()).getTableAvroSchema(); + + List fileSlices = this.hoodieTable.getSliceView() + .getLatestMergedFileSlicesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp()) + .collect(toList()); + + for (FileSlice fileSlice : fileSlices) { + if (!shouldLoadFile(fileSlice.getFileId(), maxParallelism, parallelism, taskID)) { + continue; + } + LOG.info("Load records from {}.", fileSlice); + + // load parquet records + fileSlice.getBaseFile().ifPresent(baseFile -> { + // filter out crushed files + if (!isValidFile(baseFile.getFileStatus())) { + return; + } + try (ClosableIterator iterator = fileUtils.getHoodieKeyIterator(this.hadoopConf, new Path(baseFile.getPath()))) { + iterator.forEachRemaining(hoodieKey -> { + output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice)))); + }); + } + }); + + // load avro log records + List logPaths = fileSlice.getLogFiles() + .sorted(HoodieLogFile.getLogFileComparator()) + // filter out crushed files + .filter(logFile -> isValidFile(logFile.getFileStatus())) + .map(logFile -> logFile.getPath().toString()) + .collect(toList()); + HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(logPaths, schema, latestCommitTime.get().getTimestamp(), + writeConfig, hadoopConf); + + try { + for (String recordKey : scanner.getRecords().keySet()) { + output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(new HoodieKey(recordKey, partitionPath), fileSlice)))); + } + } catch (Exception e) { + throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e); + } finally { + scanner.close(); + } + } + } + + long cost = System.currentTimeMillis() - start; + LOG.info("Task [{}}:{}}] finish loading the index under partition {} and sending them to downstream, time cost: {} 
milliseconds.", + this.getClass().getSimpleName(), taskID, partitionPath, cost); + } + + @SuppressWarnings("unchecked") + public static HoodieRecord generateHoodieRecord(HoodieKey hoodieKey, FileSlice fileSlice) { + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, null); + hoodieRecord.setCurrentLocation(new HoodieRecordGlobalLocation(hoodieKey.getPartitionPath(), fileSlice.getBaseInstantTime(), fileSlice.getFileId())); + hoodieRecord.seal(); + + return hoodieRecord; + } + + protected boolean shouldLoadFile(String fileId, + int maxParallelism, + int parallelism, + int taskID) { + return KeyGroupRangeAssignment.assignKeyToParallelOperator( + fileId, maxParallelism, parallelism) == taskID; + } + + @VisibleForTesting + public boolean isAlreadyBootstrap() throws Exception { + return instantState.get().iterator().hasNext(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/IndexRecord.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/IndexRecord.java new file mode 100644 index 0000000000000..edae0389b8aca --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/IndexRecord.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bootstrap; + +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; + +/** + * The index record. + */ +public class IndexRecord extends HoodieAvroRecord { + private static final long serialVersionUID = 1L; + + public IndexRecord(HoodieRecord record) { + super(record); + } + + @Override + public HoodieRecord newInstance() { + return new IndexRecord<>(this); + } +} \ No newline at end of file diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAccumulator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAccumulator.java new file mode 100644 index 0000000000000..14630a1f89b72 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAccumulator.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bootstrap.aggregate; + +import java.io.Serializable; +import java.util.HashSet; +import java.util.Set; + +/** + * Bootstrap ready task id accumulator. + */ +public class BootstrapAccumulator implements Serializable { + private static final long serialVersionUID = 1L; + + private final Set readyTaskSet; + + public BootstrapAccumulator() { + this.readyTaskSet = new HashSet<>(); + } + + public void update(int taskId) { + readyTaskSet.add(taskId); + } + + public int readyTaskNum() { + return readyTaskSet.size(); + } + + public BootstrapAccumulator merge(BootstrapAccumulator acc) { + if (acc == null) { + return this; + } + + readyTaskSet.addAll(acc.readyTaskSet); + return this; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAggFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAggFunction.java new file mode 100644 index 0000000000000..8c42fe903ad3c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/aggregate/BootstrapAggFunction.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bootstrap.aggregate; + +import org.apache.flink.api.common.functions.AggregateFunction; + +/** + * Aggregate function that accumulates the loaded task number of + * function {@link org.apache.hudi.sink.bootstrap.BootstrapOperator}. 
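The aggregate above is what waitForBootstrapReady polls through the runtime's updateGlobalAggregate call shown earlier: every subtask reports its own id, the accumulator keeps the distinct set, and the operator spins until the ready count equals the parallelism. A minimal illustration of that convergence, built only from the two classes in this patch (the snippet itself is not part of the patch):

```java
import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAccumulator;
import org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction;

public class BootstrapBarrierExample {
  public static void main(String[] args) {
    BootstrapAggFunction agg = new BootstrapAggFunction();
    BootstrapAccumulator acc = agg.createAccumulator();
    acc = agg.add(0, acc);  // subtask 0 reports that it finished loading its file slices
    acc = agg.add(1, acc);  // subtask 1 reports
    acc = agg.add(1, acc);  // duplicate reports are absorbed by the accumulator's Set
    // waitForBootstrapReady() keeps polling until this value equals the operator parallelism
    System.out.println(agg.getResult(acc)); // prints 2
  }
}
```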
+ */ +public class BootstrapAggFunction implements AggregateFunction { + public static final String NAME = BootstrapAggFunction.class.getSimpleName(); + + @Override + public BootstrapAccumulator createAccumulator() { + return new BootstrapAccumulator(); + } + + @Override + public BootstrapAccumulator add(Integer taskId, BootstrapAccumulator bootstrapAccumulator) { + bootstrapAccumulator.update(taskId); + return bootstrapAccumulator; + } + + @Override + public Integer getResult(BootstrapAccumulator bootstrapAccumulator) { + return bootstrapAccumulator.readyTaskNum(); + } + + @Override + public BootstrapAccumulator merge(BootstrapAccumulator bootstrapAccumulator, BootstrapAccumulator acc) { + return bootstrapAccumulator.merge(acc); + } +} \ No newline at end of file diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/batch/BatchBootstrapOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/batch/BatchBootstrapOperator.java new file mode 100644 index 0000000000000..ead00d40a936d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/batch/BatchBootstrapOperator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bootstrap.batch; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.sink.bootstrap.BootstrapOperator; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; + +import java.util.HashSet; +import java.util.Set; + +/** + * The operator to load index from existing hoodieTable. + * + *

    This function should only be used with a bounded source. + * + *

    When a record comes in, the function first checks whether the record's partition path has already been loaded; + * if not, it loads the entire partition and sends the index records to downstream operators + * before forwarding the input record; if the partition is already loaded, it forwards the input record directly. + * + *
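This lazy, load-on-first-sight scheme relies on all records of one partition reaching the same subtask, so each partition is loaded exactly once. A rough sketch of that upstream keying, with `readHoodieRecords()` as a purely hypothetical source (the real pipeline wiring lives elsewhere in Hudi):

```java
// Hypothetical sketch only: key the input by partition path so that a single subtask
// sees all records of a given partition and triggers the index load once.
DataStream<HoodieRecord> input = readHoodieRecords();   // assumed source, not part of this patch
KeyedStream<HoodieRecord, String> byPartition =
    input.keyBy(record -> record.getKey().getPartitionPath());
```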

    The input records should shuffle by the partition path to avoid repeated loading. + */ +public class BatchBootstrapOperator> + extends BootstrapOperator { + + private Set partitionPathSet; + private boolean haveSuccessfulCommits; + + public BatchBootstrapOperator(Configuration conf) { + super(conf); + } + + @Override + public void open() throws Exception { + super.open(); + this.partitionPathSet = new HashSet<>(); + this.haveSuccessfulCommits = StreamerUtil.haveSuccessfulCommits(hoodieTable.getMetaClient()); + } + + @Override + protected void preLoadIndexRecords() { + // no operation + } + + @Override + @SuppressWarnings("unchecked") + public void processElement(StreamRecord element) throws Exception { + final HoodieRecord record = (HoodieRecord) element.getValue(); + final String partitionPath = record.getKey().getPartitionPath(); + + if (haveSuccessfulCommits && !partitionPathSet.contains(partitionPath)) { + loadRecords(partitionPath); + partitionPathSet.add(partitionPath); + } + + // send the trigger record + output.collect((StreamRecord) element); + } + + @Override + protected boolean shouldLoadFile(String fileId, int maxParallelism, int parallelism, int taskID) { + // load all the file groups in the partition + return true; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketBulkInsertWriterHelper.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketBulkInsertWriterHelper.java new file mode 100644 index 0000000000000..7d1400cb5c1d2 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketBulkInsertWriterHelper.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bucket; + +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.bucket.BucketIdentifier; +import org.apache.hudi.io.storage.row.HoodieRowDataCreateHandle; +import org.apache.hudi.sink.bulk.BulkInsertWriterHelper; +import org.apache.hudi.sink.bulk.RowDataKeyGen; +import org.apache.hudi.sink.bulk.sort.SortOperatorGen; +import org.apache.hudi.table.HoodieTable; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Map; + +/** + * Helper class for bucket index bulk insert used by Flink. 
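The helper below expects every incoming row to be pre-wrapped with its target file group id, so that sorting on the synthetic `_fg` column groups rows of one file group together and, when the input is sorted, finished handles can be closed eagerly. A rough sketch of that wiring using only the static helpers defined in this class, with `rowType`, `keyGen`, `row`, `indexKeyFields` and `numBuckets` as assumed inputs:

```java
// Illustrative sketch: wrap each row with its file group id, then sort by that column.
Map<String, String> bucketIdToFileId = new HashMap<>();
RowType typeWithFileId = BucketBulkInsertWriterHelper.rowTypeWithFileId(rowType);   // (_fg STRING, record ROW<...>)
RowData wrapped = BucketBulkInsertWriterHelper.rowWithFileId(
    bucketIdToFileId, keyGen, row, indexKeyFields, numBuckets);                     // GenericRowData.of(fileId, row)
SortOperatorGen sorterGen = BucketBulkInsertWriterHelper.getFileIdSorterGen(typeWithFileId); // sorts on "_fg"
```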
+ */ +public class BucketBulkInsertWriterHelper extends BulkInsertWriterHelper { + private static final Logger LOG = LoggerFactory.getLogger(BucketBulkInsertWriterHelper.class); + public static final String FILE_GROUP_META_FIELD = "_fg"; + + private final int recordArity; + + private String lastFileId; // for efficient code path + + public BucketBulkInsertWriterHelper(Configuration conf, HoodieTable hoodieTable, HoodieWriteConfig writeConfig, + String instantTime, int taskPartitionId, long taskId, long taskEpochId, RowType rowType) { + super(conf, hoodieTable, writeConfig, instantTime, taskPartitionId, taskId, taskEpochId, rowType); + this.recordArity = rowType.getFieldCount(); + } + + public void write(RowData tuple) throws IOException { + try { + RowData record = tuple.getRow(1, this.recordArity); + String recordKey = keyGen.getRecordKey(record); + String partitionPath = keyGen.getPartitionPath(record); + String fileId = tuple.getString(0).toString(); + if ((lastFileId == null) || !lastFileId.equals(fileId)) { + LOG.info("Creating new file for partition path " + partitionPath); + handle = getRowCreateHandle(partitionPath, fileId); + lastFileId = fileId; + } + handle.write(recordKey, partitionPath, record); + } catch (Throwable throwable) { + IOException ioException = new IOException("Exception happened when bulk insert.", throwable); + LOG.error("Global error thrown while trying to write records in HoodieRowDataCreateHandle", ioException); + throw ioException; + } + } + + private HoodieRowDataCreateHandle getRowCreateHandle(String partitionPath, String fileId) throws IOException { + if (!handles.containsKey(fileId)) { // if there is no handle corresponding to the fileId + if (this.isInputSorted) { + // if records are sorted, we can close all existing handles + close(); + } + HoodieRowDataCreateHandle rowCreateHandle = new HoodieRowDataCreateHandle(hoodieTable, writeConfig, partitionPath, fileId, + instantTime, taskPartitionId, taskId, taskEpochId, rowType, preserveHoodieMetadata); + handles.put(fileId, rowCreateHandle); + } + return handles.get(fileId); + } + + public static SortOperatorGen getFileIdSorterGen(RowType rowType) { + return new SortOperatorGen(rowType, new String[] {FILE_GROUP_META_FIELD}); + } + + private static String getFileId(Map bucketIdToFileId, RowDataKeyGen keyGen, RowData record, String indexKeys, int numBuckets) { + String recordKey = keyGen.getRecordKey(record); + String partition = keyGen.getPartitionPath(record); + final int bucketNum = BucketIdentifier.getBucketId(recordKey, indexKeys, numBuckets); + String bucketId = partition + bucketNum; + return bucketIdToFileId.computeIfAbsent(bucketId, k -> BucketIdentifier.newBucketFileIdPrefix(bucketNum)); + } + + public static RowData rowWithFileId(Map bucketIdToFileId, RowDataKeyGen keyGen, RowData record, String indexKeys, int numBuckets) { + final String fileId = getFileId(bucketIdToFileId, keyGen, record, indexKeys, numBuckets); + return GenericRowData.of(StringData.fromString(fileId), record); + } + + public static RowType rowTypeWithFileId(RowType rowType) { + LogicalType[] types = new LogicalType[] {DataTypes.STRING().getLogicalType(), rowType}; + String[] names = new String[] {FILE_GROUP_META_FIELD, "record"}; + return RowType.of(types, names); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java new file mode 100644 index 
0000000000000..1ccfe91dbc0a9 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bucket; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.index.bucket.BucketIdentifier; +import org.apache.hudi.sink.StreamWriteFunction; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * A stream write function with bucket hash index. + * + *

    The task holds a fresh new local index: {(partition + bucket number) &rarr fileId} mapping, this index + * is used for deciding whether the incoming records in an UPDATE or INSERT. + * The index is local because different partition paths have separate items in the index. + * + * @param the input type + */ +public class BucketStreamWriteFunction extends StreamWriteFunction { + + private static final Logger LOG = LoggerFactory.getLogger(BucketStreamWriteFunction.class); + + private int parallelism; + + private int bucketNum; + + private String indexKeyFields; + + /** + * BucketID to file group mapping in each partition. + * Map(partition -> Map(bucketId, fileID)). + */ + private Map> bucketIndex; + + /** + * Incremental bucket index of the current checkpoint interval, + * it is needed because the bucket type('I' or 'U') should be decided based on the committed files view, + * all the records in one bucket should have the same bucket type. + */ + private Set incBucketIndex; + + /** + * Constructs a BucketStreamWriteFunction. + * + * @param config The config options + */ + public BucketStreamWriteFunction(Configuration config) { + super(config); + } + + @Override + public void open(Configuration parameters) throws IOException { + super.open(parameters); + this.bucketNum = config.getInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS); + this.indexKeyFields = config.getString(FlinkOptions.INDEX_KEY_FIELD); + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.parallelism = getRuntimeContext().getNumberOfParallelSubtasks(); + this.bucketIndex = new HashMap<>(); + this.incBucketIndex = new HashSet<>(); + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + super.initializeState(context); + } + + @Override + public void snapshotState() { + super.snapshotState(); + this.incBucketIndex.clear(); + } + + @Override + public void processElement(I i, ProcessFunction.Context context, Collector collector) throws Exception { + HoodieRecord record = (HoodieRecord) i; + final HoodieKey hoodieKey = record.getKey(); + final String partition = hoodieKey.getPartitionPath(); + final HoodieRecordLocation location; + + bootstrapIndexIfNeed(partition); + Map bucketToFileId = bucketIndex.computeIfAbsent(partition, p -> new HashMap<>()); + final int bucketNum = BucketIdentifier.getBucketId(hoodieKey, indexKeyFields, this.bucketNum); + final String bucketId = partition + bucketNum; + + if (incBucketIndex.contains(bucketId)) { + location = new HoodieRecordLocation("I", bucketToFileId.get(bucketNum)); + } else if (bucketToFileId.containsKey(bucketNum)) { + location = new HoodieRecordLocation("U", bucketToFileId.get(bucketNum)); + } else { + String newFileId = BucketIdentifier.newBucketFileIdPrefix(bucketNum); + location = new HoodieRecordLocation("I", newFileId); + bucketToFileId.put(bucketNum, newFileId); + incBucketIndex.add(bucketId); + } + record.unseal(); + record.setCurrentLocation(location); + record.seal(); + bufferRecord(record); + } + + /** + * Determine whether the current fileID belongs to the current task. + * (partition + curBucket) % numPartitions == this taskID belongs to this task. + */ + public boolean isBucketToLoad(int bucketNumber, String partition) { + int globalHash = ((partition + bucketNumber).hashCode()) & Integer.MAX_VALUE; + return BucketIdentifier.mod(globalHash, parallelism) == taskID; + } + + /** + * Get partition_bucket -> fileID mapping from the existing hudi table. 
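Put differently, the bucket index reduces the lookup to arithmetic on the record key: the bucket number is derived from the key, a brand-new bucket gets a deterministic file id prefix (and is tagged 'I'), and bucket ownership is a modulo over the operator parallelism. A compressed sketch of that logic, with `hoodieKey`, `indexKeyFields`, `numBuckets`, `parallelism` and `taskID` as assumed inputs:

```java
// Illustrative restatement of processElement / isBucketToLoad above, not additional behavior.
int bucketNumber = BucketIdentifier.getBucketId(hoodieKey, indexKeyFields, numBuckets);
String newFileId = BucketIdentifier.newBucketFileIdPrefix(bucketNumber);  // first record of a new bucket -> 'I'
// Each subtask only bootstraps the buckets it owns:
int globalHash = (hoodieKey.getPartitionPath() + bucketNumber).hashCode() & Integer.MAX_VALUE;
boolean ownedByThisTask = BucketIdentifier.mod(globalHash, parallelism) == taskID;
```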
+ * This is a required operation for each restart to avoid having duplicate file ids for one bucket. + */ + private void bootstrapIndexIfNeed(String partition) { + if (bucketIndex.containsKey(partition)) { + return; + } + LOG.info(String.format("Loading Hoodie Table %s, with path %s", this.metaClient.getTableConfig().getTableName(), + this.metaClient.getBasePath() + "/" + partition)); + + // Load existing fileID belongs to this task + Map bucketToFileIDMap = new HashMap<>(); + this.writeClient.getHoodieTable().getFileSystemView().getAllFileGroups(partition).forEach(fileGroup -> { + String fileID = fileGroup.getFileGroupId().getFileId(); + int bucketNumber = BucketIdentifier.bucketIdFromFileId(fileID); + if (isBucketToLoad(bucketNumber, partition)) { + LOG.info(String.format("Should load this partition bucket %s with fileID %s", bucketNumber, fileID)); + if (bucketToFileIDMap.containsKey(bucketNumber)) { + throw new RuntimeException(String.format("Duplicate fileID %s from bucket %s of partition %s found " + + "during the BucketStreamWriteFunction index bootstrap.", fileID, bucketNumber, partition)); + } else { + LOG.info(String.format("Adding fileID %s to the bucket %s of partition %s.", fileID, bucketNumber, partition)); + bucketToFileIDMap.put(bucketNumber, fileID); + } + } + }); + bucketIndex.put(partition, bucketToFileIDMap); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteOperator.java new file mode 100644 index 0000000000000..a48ea44ddc44a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteOperator.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bucket; + +import org.apache.hudi.sink.common.AbstractWriteOperator; +import org.apache.hudi.sink.common.WriteOperatorFactory; + +import org.apache.flink.configuration.Configuration; + +/** + * Operator for {@link BucketStreamWriteFunction}. 
+ * + * @param The input type + */ +public class BucketStreamWriteOperator extends AbstractWriteOperator { + + public BucketStreamWriteOperator(Configuration conf) { + super(new BucketStreamWriteFunction<>(conf)); + } + + public static WriteOperatorFactory getFactory(Configuration conf) { + return WriteOperatorFactory.instance(conf, new BucketStreamWriteOperator<>(conf)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteFunction.java new file mode 100644 index 0000000000000..9fbdbdd8e1afc --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteFunction.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bulk; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; +import org.apache.hudi.sink.common.AbstractWriteFunction; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.sink.meta.CkpMetadata; +import org.apache.hudi.sink.utils.TimeWait; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.Collector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +/** + * Sink function to write the data to the underneath filesystem. + * + *

    The function should only be used with the {@link WriteOperationType#BULK_INSERT} operation type. + * + *
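For context, a job opts into this code path by selecting the bulk insert write operation. The sketch below assumes `FlinkOptions.OPERATION` is the option that carries that choice (option names can differ between releases); `WRITE_BULK_INSERT_SORT_INPUT` is the sort toggle the writer helper reads later in this patch:

```java
// Hedged sketch of the configuration, not part of the patch.
Configuration conf = new Configuration();
conf.setString(FlinkOptions.OPERATION, WriteOperationType.BULK_INSERT.value());
// When the input is pre-sorted, the writer helper can close finished handles eagerly.
conf.setBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_INPUT, true);
```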

    Note: The function task requires the input stream be shuffled by partition path. + * + * @param Type of the input record + * @see StreamWriteOperatorCoordinator + */ +public class BulkInsertWriteFunction + extends AbstractWriteFunction { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(BulkInsertWriteFunction.class); + + /** + * Helper class for bulk insert mode. + */ + private transient BulkInsertWriterHelper writerHelper; + + /** + * Config options. + */ + private final Configuration config; + + /** + * Table row type. + */ + private final RowType rowType; + + /** + * Id of current subtask. + */ + private int taskID; + + /** + * Write Client. + */ + private transient HoodieFlinkWriteClient writeClient; + + /** + * The initial inflight instant when start up. + */ + private volatile String initInstant; + + /** + * Gateway to send operator events to the operator coordinator. + */ + private transient OperatorEventGateway eventGateway; + + /** + * Checkpoint metadata. + */ + private CkpMetadata ckpMetadata; + + /** + * Constructs a StreamingSinkFunction. + * + * @param config The config options + */ + public BulkInsertWriteFunction(Configuration config, RowType rowType) { + this.config = config; + this.rowType = rowType; + } + + @Override + public void open(Configuration parameters) throws IOException { + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.writeClient = FlinkWriteClients.createWriteClient(this.config, getRuntimeContext()); + this.ckpMetadata = CkpMetadata.getInstance(config); + this.initInstant = lastPendingInstant(); + sendBootstrapEvent(); + initWriterHelper(); + } + + @Override + public void processElement(I value, Context ctx, Collector out) throws IOException { + this.writerHelper.write((RowData) value); + } + + @Override + public void close() { + if (this.writeClient != null) { + this.writeClient.cleanHandlesGracefully(); + this.writeClient.close(); + } + } + + /** + * End input action for batch source. 
+ */ + public void endInput() { + final List writeStatus = this.writerHelper.getWriteStatuses(this.taskID); + + final WriteMetadataEvent event = WriteMetadataEvent.builder() + .taskID(taskID) + .instantTime(this.writerHelper.getInstantTime()) + .writeStatus(writeStatus) + .lastBatch(true) + .endInput(true) + .build(); + this.eventGateway.sendEventToCoordinator(event); + } + + @Override + public void handleOperatorEvent(OperatorEvent event) { + // no operation + } + + // ------------------------------------------------------------------------- + // Getter/Setter + // ------------------------------------------------------------------------- + + public void setOperatorEventGateway(OperatorEventGateway operatorEventGateway) { + this.eventGateway = operatorEventGateway; + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void initWriterHelper() { + String instant = instantToWrite(); + this.writerHelper = WriterHelpers.getWriterHelper(this.config, this.writeClient.getHoodieTable(), this.writeClient.getConfig(), + instant, this.taskID, getRuntimeContext().getNumberOfParallelSubtasks(), getRuntimeContext().getAttemptNumber(), + this.rowType); + } + + private void sendBootstrapEvent() { + WriteMetadataEvent event = WriteMetadataEvent.builder() + .taskID(taskID) + .writeStatus(Collections.emptyList()) + .instantTime("") + .bootstrap(true) + .build(); + this.eventGateway.sendEventToCoordinator(event); + LOG.info("Send bootstrap write metadata event to coordinator, task[{}].", taskID); + } + + /** + * Returns the last pending instant time. + */ + protected String lastPendingInstant() { + return this.ckpMetadata.lastPendingInstant(); + } + + private String instantToWrite() { + String instant = lastPendingInstant(); + // if exactly-once semantics turns on, + // waits for the checkpoint notification until the checkpoint timeout threshold hits. + TimeWait timeWait = TimeWait.builder() + .timeout(config.getLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT)) + .action("instant initialize") + .build(); + while (instant == null || instant.equals(this.initInstant)) { + // wait condition: + // 1. there is no inflight instant + // 2. the inflight instant does not change + // sleep for a while + timeWait.waitFor(); + // refresh the inflight instant + instant = lastPendingInstant(); + } + return instant; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteOperator.java new file mode 100644 index 0000000000000..16fb87fb3931a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriteOperator.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bulk; + +import org.apache.hudi.sink.common.AbstractWriteOperator; +import org.apache.hudi.sink.common.WriteOperatorFactory; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.table.types.logical.RowType; + +/** + * Operator for bulk insert mode sink. + * + * @param The input type + */ +public class BulkInsertWriteOperator + extends AbstractWriteOperator + implements BoundedOneInput { + + public BulkInsertWriteOperator(Configuration conf, RowType rowType) { + super(new BulkInsertWriteFunction<>(conf, rowType)); + } + + @Override + public void handleOperatorEvent(OperatorEvent event) { + // no operation + } + + public static WriteOperatorFactory getFactory(Configuration conf, RowType rowType) { + return WriteOperatorFactory.instance(conf, new BulkInsertWriteOperator<>(conf, rowType)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriterHelper.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriterHelper.java new file mode 100644 index 0000000000000..abd2d09f78e28 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/BulkInsertWriterHelper.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.bulk; + +import org.apache.hudi.client.HoodieInternalWriteStatus; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.storage.row.HoodieRowDataCreateHandle; +import org.apache.hudi.table.HoodieTable; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; + +/** + * Helper class for bulk insert used by Flink. + */ +public class BulkInsertWriterHelper { + + private static final Logger LOG = LogManager.getLogger(BulkInsertWriterHelper.class); + + protected final String instantTime; + protected final int taskPartitionId; + protected final long taskId; + protected final long taskEpochId; + protected final HoodieTable hoodieTable; + protected final HoodieWriteConfig writeConfig; + protected final RowType rowType; + protected final boolean preserveHoodieMetadata; + protected final Boolean isInputSorted; + private final List writeStatusList = new ArrayList<>(); + protected HoodieRowDataCreateHandle handle; + private String lastKnownPartitionPath = null; + private final String fileIdPrefix; + private int numFilesWritten = 0; + protected final Map handles = new HashMap<>(); + @Nullable protected final RowDataKeyGen keyGen; + + public BulkInsertWriterHelper(Configuration conf, HoodieTable hoodieTable, HoodieWriteConfig writeConfig, + String instantTime, int taskPartitionId, long taskId, long taskEpochId, RowType rowType) { + this(conf, hoodieTable, writeConfig, instantTime, taskPartitionId, taskId, taskEpochId, rowType, false); + } + + public BulkInsertWriterHelper(Configuration conf, HoodieTable hoodieTable, HoodieWriteConfig writeConfig, + String instantTime, int taskPartitionId, long taskId, long taskEpochId, RowType rowType, + boolean preserveHoodieMetadata) { + this.hoodieTable = hoodieTable; + this.writeConfig = writeConfig; + this.instantTime = instantTime; + this.taskPartitionId = taskPartitionId; + this.taskId = taskId; + this.taskEpochId = taskEpochId; + this.rowType = preserveHoodieMetadata ? rowType : addMetadataFields(rowType, writeConfig.allowOperationMetadataField()); // patch up with metadata fields + this.preserveHoodieMetadata = preserveHoodieMetadata; + this.isInputSorted = conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_INPUT); + this.fileIdPrefix = UUID.randomUUID().toString(); + this.keyGen = preserveHoodieMetadata ? null : RowDataKeyGen.instance(conf, rowType); + } + + /** + * Returns the write instant time. + */ + public String getInstantTime() { + return this.instantTime; + } + + public void write(RowData record) throws IOException { + try { + String recordKey = preserveHoodieMetadata + ? record.getString(HoodieRecord.RECORD_KEY_META_FIELD_ORD).toString() + : keyGen.getRecordKey(record); + String partitionPath = preserveHoodieMetadata + ? 
record.getString(HoodieRecord.PARTITION_PATH_META_FIELD_ORD).toString() + : keyGen.getPartitionPath(record); + + if ((lastKnownPartitionPath == null) || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) { + LOG.info("Creating new file for partition path " + partitionPath); + handle = getRowCreateHandle(partitionPath); + lastKnownPartitionPath = partitionPath; + } + handle.write(recordKey, partitionPath, record); + } catch (Throwable t) { + IOException ioException = new IOException("Exception happened when bulk insert.", t); + LOG.error("Global error thrown while trying to write records in HoodieRowCreateHandle ", ioException); + throw new IOException(ioException); + } + } + + public List getHoodieWriteStatuses() throws IOException { + close(); + return writeStatusList; + } + + private HoodieRowDataCreateHandle getRowCreateHandle(String partitionPath) throws IOException { + if (!handles.containsKey(partitionPath)) { // if there is no handle corresponding to the partition path + // if records are sorted, we can close all existing handles + if (isInputSorted) { + close(); + } + HoodieRowDataCreateHandle rowCreateHandle = new HoodieRowDataCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(), + instantTime, taskPartitionId, taskId, taskEpochId, rowType, preserveHoodieMetadata); + handles.put(partitionPath, rowCreateHandle); + } else if (!handles.get(partitionPath).canWrite()) { + // even if there is a handle to the partition path, it could have reached its max size threshold. So, we close the handle here and + // create a new one. + writeStatusList.add(handles.remove(partitionPath).close()); + HoodieRowDataCreateHandle rowCreateHandle = new HoodieRowDataCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(), + instantTime, taskPartitionId, taskId, taskEpochId, rowType, preserveHoodieMetadata); + handles.put(partitionPath, rowCreateHandle); + } + return handles.get(partitionPath); + } + + public void close() throws IOException { + for (HoodieRowDataCreateHandle rowCreateHandle : handles.values()) { + writeStatusList.add(rowCreateHandle.close()); + } + handles.clear(); + handle = null; + } + + private String getNextFileId() { + return String.format("%s-%d", fileIdPrefix, numFilesWritten++); + } + + /** + * Adds the Hoodie metadata fields to the given row type. 
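A small, illustrative call (not part of the patch) showing the resulting column order for a two-field user schema; the metadata columns are prepended ahead of the user fields:

```java
// Uses org.apache.flink.table.types.logical.{RowType, IntType, VarCharType, LogicalType}.
RowType dataType = RowType.of(
    new LogicalType[] {new IntType(), new VarCharType(VarCharType.MAX_LENGTH)},
    new String[] {"id", "name"});
RowType writeType = BulkInsertWriterHelper.addMetadataFields(dataType, false);
// writeType fields: _hoodie_commit_time, _hoodie_commit_seqno, _hoodie_record_key,
//                   _hoodie_partition_path, _hoodie_file_name, id, name
```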
+ */ + public static RowType addMetadataFields(RowType rowType, boolean withOperationField) { + List mergedFields = new ArrayList<>(); + + LogicalType metadataFieldType = DataTypes.STRING().getLogicalType(); + RowType.RowField commitTimeField = + new RowType.RowField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, metadataFieldType, "commit time"); + RowType.RowField commitSeqnoField = + new RowType.RowField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, metadataFieldType, "commit seqno"); + RowType.RowField recordKeyField = + new RowType.RowField(HoodieRecord.RECORD_KEY_METADATA_FIELD, metadataFieldType, "record key"); + RowType.RowField partitionPathField = + new RowType.RowField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, metadataFieldType, "partition path"); + RowType.RowField fileNameField = + new RowType.RowField(HoodieRecord.FILENAME_METADATA_FIELD, metadataFieldType, "field name"); + + mergedFields.add(commitTimeField); + mergedFields.add(commitSeqnoField); + mergedFields.add(recordKeyField); + mergedFields.add(partitionPathField); + mergedFields.add(fileNameField); + + if (withOperationField) { + RowType.RowField operationField = + new RowType.RowField(HoodieRecord.OPERATION_METADATA_FIELD, metadataFieldType, "operation"); + mergedFields.add(operationField); + } + + mergedFields.addAll(rowType.getFields()); + + return new RowType(false, mergedFields); + } + + public List getWriteStatuses(int taskID) { + try { + return getHoodieWriteStatuses().stream() + .map(BulkInsertWriterHelper::toWriteStatus).collect(Collectors.toList()); + } catch (IOException e) { + throw new HoodieException("Error collect the write status for task [" + taskID + "]", e); + } + } + + /** + * Tool to convert {@link HoodieInternalWriteStatus} into {@link WriteStatus}. + */ + private static WriteStatus toWriteStatus(HoodieInternalWriteStatus internalWriteStatus) { + WriteStatus writeStatus = new WriteStatus(false, 0.1); + writeStatus.setStat(internalWriteStatus.getStat()); + writeStatus.setFileId(internalWriteStatus.getFileId()); + writeStatus.setGlobalError(internalWriteStatus.getGlobalError()); + writeStatus.setTotalRecords(internalWriteStatus.getTotalRecords()); + writeStatus.setTotalErrorRecords(internalWriteStatus.getTotalErrorRecords()); + return writeStatus; + } +} + diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/RowDataKeyGen.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/RowDataKeyGen.java new file mode 100644 index 0000000000000..a2414abc3de21 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/RowDataKeyGen.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.bulk; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieKeyException; +import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; +import org.apache.hudi.util.RowDataProjection; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; + +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.escapePathName; + +/** + * Key generator for {@link RowData}. + */ +public class RowDataKeyGen implements Serializable { + private static final long serialVersionUID = 1L; + + // reference: NonpartitionedAvroKeyGenerator + private static final String EMPTY_PARTITION = ""; + + // reference: org.apache.hudi.keygen.KeyGenUtils + private static final String NULL_RECORDKEY_PLACEHOLDER = "__null__"; + private static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__"; + + private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/"; + + private final String[] recordKeyFields; + private final String[] partitionPathFields; + + private final RowDataProjection recordKeyProjection; + private final RowDataProjection partitionPathProjection; + + private final boolean hiveStylePartitioning; + private final boolean encodePartitionPath; + + private final Option keyGenOpt; + + // efficient code path + private boolean simpleRecordKey = false; + private RowData.FieldGetter recordKeyFieldGetter; + + private boolean simplePartitionPath = false; + private RowData.FieldGetter partitionPathFieldGetter; + + private boolean nonPartitioned; + + private RowDataKeyGen( + String recordKeys, + String partitionFields, + RowType rowType, + boolean hiveStylePartitioning, + boolean encodePartitionPath, + Option keyGenOpt) { + this.recordKeyFields = recordKeys.split(","); + this.partitionPathFields = partitionFields.split(","); + List fieldNames = rowType.getFieldNames(); + List fieldTypes = rowType.getChildren(); + + this.hiveStylePartitioning = hiveStylePartitioning; + this.encodePartitionPath = encodePartitionPath; + if (this.recordKeyFields.length == 1) { + // efficient code path + this.simpleRecordKey = true; + int recordKeyIdx = fieldNames.indexOf(this.recordKeyFields[0]); + this.recordKeyFieldGetter = RowData.createFieldGetter(fieldTypes.get(recordKeyIdx), recordKeyIdx); + this.recordKeyProjection = null; + } else { + this.recordKeyProjection = getProjection(this.recordKeyFields, fieldNames, fieldTypes); + } + if (this.partitionPathFields.length == 1) { + // efficient code path + if (this.partitionPathFields[0].equals("")) { + this.nonPartitioned = true; + } else { + this.simplePartitionPath = true; + int partitionPathIdx = fieldNames.indexOf(this.partitionPathFields[0]); + this.partitionPathFieldGetter = RowData.createFieldGetter(fieldTypes.get(partitionPathIdx), partitionPathIdx); + } + this.partitionPathProjection = null; + } else { + this.partitionPathProjection = getProjection(this.partitionPathFields, fieldNames, fieldTypes); + } + 
this.keyGenOpt = keyGenOpt; + } + + public static RowDataKeyGen instance(Configuration conf, RowType rowType) { + Option keyGeneratorOpt = Option.empty(); + if (TimestampBasedAvroKeyGenerator.class.getName().equals(conf.getString(FlinkOptions.KEYGEN_CLASS_NAME))) { + try { + keyGeneratorOpt = Option.of(new TimestampBasedAvroKeyGenerator(StreamerUtil.flinkConf2TypedProperties(conf))); + } catch (IOException e) { + throw new HoodieKeyException("Initialize TimestampBasedAvroKeyGenerator error", e); + } + } + return new RowDataKeyGen(conf.getString(FlinkOptions.RECORD_KEY_FIELD), conf.getString(FlinkOptions.PARTITION_PATH_FIELD), + rowType, conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING), conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING), + keyGeneratorOpt); + } + + public HoodieKey getHoodieKey(RowData rowData) { + return new HoodieKey(getRecordKey(rowData), getPartitionPath(rowData)); + } + + public String getRecordKey(RowData rowData) { + if (this.simpleRecordKey) { + return getRecordKey(recordKeyFieldGetter.getFieldOrNull(rowData), this.recordKeyFields[0]); + } else { + Object[] keyValues = this.recordKeyProjection.projectAsValues(rowData); + return getRecordKey(keyValues, this.recordKeyFields); + } + } + + public String getPartitionPath(RowData rowData) { + if (this.simplePartitionPath) { + return getPartitionPath(partitionPathFieldGetter.getFieldOrNull(rowData), + this.partitionPathFields[0], this.hiveStylePartitioning, this.encodePartitionPath, this.keyGenOpt); + } else if (this.nonPartitioned) { + return EMPTY_PARTITION; + } else { + Object[] partValues = this.partitionPathProjection.projectAsValues(rowData); + return getRecordPartitionPath(partValues, this.partitionPathFields, this.hiveStylePartitioning, this.encodePartitionPath); + } + } + + // reference: org.apache.hudi.keygen.KeyGenUtils.getRecordPartitionPath + private static String getRecordKey(Object[] keyValues, String[] keyFields) { + boolean keyIsNullEmpty = true; + StringBuilder recordKey = new StringBuilder(); + for (int i = 0; i < keyValues.length; i++) { + String recordKeyField = keyFields[i]; + String recordKeyValue = StringUtils.objToString(keyValues[i]); + if (recordKeyValue == null) { + recordKey.append(recordKeyField).append(":").append(NULL_RECORDKEY_PLACEHOLDER).append(","); + } else if (recordKeyValue.isEmpty()) { + recordKey.append(recordKeyField).append(":").append(EMPTY_RECORDKEY_PLACEHOLDER).append(","); + } else { + recordKey.append(recordKeyField).append(":").append(recordKeyValue).append(","); + keyIsNullEmpty = false; + } + } + recordKey.deleteCharAt(recordKey.length() - 1); + if (keyIsNullEmpty) { + throw new HoodieKeyException("recordKey values: \"" + recordKey + "\" for fields: " + + Arrays.toString(keyFields) + " cannot be entirely null or empty."); + } + return recordKey.toString(); + } + + // reference: org.apache.hudi.keygen.KeyGenUtils.getRecordPartitionPath + private static String getRecordPartitionPath( + Object[] partValues, + String[] partFields, + boolean hiveStylePartitioning, + boolean encodePartitionPath) { + StringBuilder partitionPath = new StringBuilder(); + for (int i = 0; i < partFields.length; i++) { + String partField = partFields[i]; + String partValue = StringUtils.objToString(partValues[i]); + if (partValue == null || partValue.isEmpty()) { + partitionPath.append(hiveStylePartitioning ? 
partField + "=" + DEFAULT_PARTITION_PATH + : DEFAULT_PARTITION_PATH); + } else { + if (encodePartitionPath) { + partValue = escapePathName(partValue); + } + partitionPath.append(hiveStylePartitioning ? partField + "=" + partValue : partValue); + } + partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR); + } + partitionPath.deleteCharAt(partitionPath.length() - 1); + return partitionPath.toString(); + } + + // reference: org.apache.hudi.keygen.KeyGenUtils.getRecordKey + public static String getRecordKey(Object recordKeyValue, String recordKeyField) { + String recordKey = StringUtils.objToString(recordKeyValue); + if (recordKey == null || recordKey.isEmpty()) { + throw new HoodieKeyException("recordKey value: \"" + recordKey + "\" for field: \"" + recordKeyField + "\" cannot be null or empty."); + } + return recordKey; + } + + // reference: org.apache.hudi.keygen.KeyGenUtils.getPartitionPath + public static String getPartitionPath( + Object partValue, + String partField, + boolean hiveStylePartitioning, + boolean encodePartitionPath, + Option keyGenOpt) { + if (keyGenOpt.isPresent()) { + TimestampBasedAvroKeyGenerator keyGenerator = keyGenOpt.get(); + return keyGenerator.getPartitionPath(toEpochMilli(partValue, keyGenerator)); + } + String partitionPath = StringUtils.objToString(partValue); + if (partitionPath == null || partitionPath.isEmpty()) { + partitionPath = DEFAULT_PARTITION_PATH; + } + if (encodePartitionPath) { + partitionPath = escapePathName(partitionPath); + } + if (hiveStylePartitioning) { + partitionPath = partField + "=" + partitionPath; + } + return partitionPath; + } + + private static Object toEpochMilli(Object val, TimestampBasedAvroKeyGenerator keyGenerator) { + if (val instanceof TimestampData) { + return ((TimestampData) val).toInstant().toEpochMilli(); + } + if (val == null) { + // should match the default partition path when STRING partition path re-format is supported + return keyGenerator.getDefaultPartitionVal(); + } + return val; + } + + /** + * Returns the row data projection for the given field names and table schema. + * + * @param fields The projected field names + * @param schemaFields The table schema names + * @param schemaTypes The table schema types + * @return the row data projection for the fields + */ + private static RowDataProjection getProjection(String[] fields, List schemaFields, List schemaTypes) { + int[] positions = getFieldPositions(fields, schemaFields); + LogicalType[] types = Arrays.stream(positions).mapToObj(schemaTypes::get).toArray(LogicalType[]::new); + return RowDataProjection.instance(types, positions); + } + + /** + * Returns the field positions of the given fields {@code fields} among all the fields {@code allFields}. + */ + private static int[] getFieldPositions(String[] fields, List allFields) { + return Arrays.stream(fields).mapToInt(allFields::indexOf).toArray(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/WriterHelpers.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/WriterHelpers.java new file mode 100644 index 0000000000000..99a9ae114cd8e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/WriterHelpers.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
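To make the key and partition formatting rules above concrete, here is a hedged example (not part of the patch) for a single record key field and hive-style partitioning; the option names are the ones `instance` reads:

```java
Configuration conf = new Configuration();
conf.setString(FlinkOptions.RECORD_KEY_FIELD, "uuid");
conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "dt");
conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, true);

RowType rowType = RowType.of(
    new LogicalType[] {new VarCharType(VarCharType.MAX_LENGTH), new VarCharType(VarCharType.MAX_LENGTH)},
    new String[] {"uuid", "dt"});
RowDataKeyGen keyGen = RowDataKeyGen.instance(conf, rowType);

RowData row = GenericRowData.of(StringData.fromString("id1"), StringData.fromString("2022-01-01"));
keyGen.getRecordKey(row);     // "id1"            (single key field -> raw value; composite keys use "f1:v1,f2:v2")
keyGen.getPartitionPath(row); // "dt=2022-01-01"  (hive-style partitioning)
```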
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bulk; + +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.sink.bucket.BucketBulkInsertWriterHelper; +import org.apache.hudi.table.HoodieTable; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.types.logical.RowType; + +/** + * Factory clazz to generate bulk insert writer helpers. + */ +public class WriterHelpers { + public static BulkInsertWriterHelper getWriterHelper(Configuration conf, HoodieTable hoodieTable, HoodieWriteConfig writeConfig, + String instantTime, int taskPartitionId, long taskId, long taskEpochId, RowType rowType) { + return OptionsResolver.isBucketIndexType(conf) + ? new BucketBulkInsertWriterHelper(conf, hoodieTable, writeConfig, instantTime, taskPartitionId, taskId, taskEpochId, rowType) + : new BulkInsertWriterHelper(conf, hoodieTable, writeConfig, instantTime, taskPartitionId, taskId, taskEpochId, rowType); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperator.java new file mode 100644 index 0000000000000..aa6224057946c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperator.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.bulk.sort; + +import org.apache.flink.metrics.Gauge; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.binary.BinaryRowData; +import org.apache.flink.table.runtime.generated.GeneratedNormalizedKeyComputer; +import org.apache.flink.table.runtime.generated.GeneratedRecordComparator; +import org.apache.flink.table.runtime.generated.NormalizedKeyComputer; +import org.apache.flink.table.runtime.generated.RecordComparator; +import org.apache.flink.table.runtime.operators.TableStreamOperator; +import org.apache.flink.table.runtime.operators.sort.BinaryExternalSorter; +import org.apache.flink.table.runtime.typeutils.AbstractRowDataSerializer; +import org.apache.flink.table.runtime.typeutils.BinaryRowDataSerializer; +import org.apache.flink.table.runtime.util.StreamRecordCollector; +import org.apache.flink.util.MutableObjectIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Operator for batch sort. + * + *

    Copied from org.apache.flink.table.runtime.operators.sort.SortOperator to change the annotation. + */ +public class SortOperator extends TableStreamOperator + implements OneInputStreamOperator, BoundedOneInput { + + private static final Logger LOG = LoggerFactory.getLogger(SortOperator.class); + + private GeneratedNormalizedKeyComputer gComputer; + private GeneratedRecordComparator gComparator; + + private transient BinaryExternalSorter sorter; + private transient StreamRecordCollector collector; + private transient BinaryRowDataSerializer binarySerializer; + + public SortOperator( + GeneratedNormalizedKeyComputer gComputer, GeneratedRecordComparator gComparator) { + this.gComputer = gComputer; + this.gComparator = gComparator; + } + + @Override + public void open() throws Exception { + super.open(); + LOG.info("Opening SortOperator"); + + ClassLoader cl = getContainingTask().getUserCodeClassLoader(); + + AbstractRowDataSerializer inputSerializer = + (AbstractRowDataSerializer) + getOperatorConfig().getTypeSerializerIn1(getUserCodeClassloader()); + this.binarySerializer = new BinaryRowDataSerializer(inputSerializer.getArity()); + + NormalizedKeyComputer computer = gComputer.newInstance(cl); + RecordComparator comparator = gComparator.newInstance(cl); + gComputer = null; + gComparator = null; + + MemoryManager memManager = getContainingTask().getEnvironment().getMemoryManager(); + this.sorter = + new BinaryExternalSorter( + this.getContainingTask(), + memManager, + computeMemorySize(), + this.getContainingTask().getEnvironment().getIOManager(), + inputSerializer, + binarySerializer, + computer, + comparator, + getContainingTask().getJobConfiguration()); + this.sorter.startThreads(); + + collector = new StreamRecordCollector<>(output); + + // register the the metrics. + getMetricGroup().gauge("memoryUsedSizeInBytes", (Gauge) sorter::getUsedMemoryInBytes); + getMetricGroup().gauge("numSpillFiles", (Gauge) sorter::getNumSpillFiles); + getMetricGroup().gauge("spillInBytes", (Gauge) sorter::getSpillInBytes); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + this.sorter.write(element.getValue()); + } + + @Override + public void endInput() throws Exception { + BinaryRowData row = binarySerializer.createInstance(); + MutableObjectIterator iterator = sorter.getIterator(); + while ((row = iterator.next(row)) != null) { + collector.collect(row); + } + } + + @Override + public void close() throws Exception { + LOG.info("Closing SortOperator"); + super.close(); + if (sorter != null) { + sorter.close(); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperatorGen.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperatorGen.java new file mode 100644 index 0000000000000..e6821e667e831 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperatorGen.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
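As a usage note, the sort operator above is normally not instantiated directly; it is produced by the `SortOperatorGen` helper that follows, for example to sort bucket bulk-insert input on the synthetic `_fg` column. A brief sketch with `rowTypeWithFileId` as an assumed `RowType` that contains that field:

```java
// Illustrative sketch only; see BucketBulkInsertWriterHelper.getFileIdSorterGen for the in-tree usage.
SortOperatorGen sorterGen = new SortOperatorGen(rowTypeWithFileId, new String[] {"_fg"});
OneInputStreamOperator<RowData, RowData> sortOperator = sorterGen.createSortOperator();
```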
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bulk.sort; + +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.table.api.TableConfig; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.planner.codegen.sort.SortCodeGenerator; +import org.apache.flink.table.planner.plan.nodes.exec.spec.SortSpec; +import org.apache.flink.table.types.logical.RowType; + +import java.util.Arrays; + +/** + * Tools to generate the sort operator. + */ +public class SortOperatorGen { + private final int[] sortIndices; + private final RowType rowType; + private final TableConfig tableConfig = new TableConfig(); + + public SortOperatorGen(RowType rowType, String[] sortFields) { + this.sortIndices = Arrays.stream(sortFields).mapToInt(rowType::getFieldIndex).toArray(); + this.rowType = rowType; + } + + public OneInputStreamOperator createSortOperator() { + SortCodeGenerator codeGen = createSortCodeGenerator(); + return new SortOperator( + codeGen.generateNormalizedKeyComputer("SortComputer"), + codeGen.generateRecordComparator("SortComparator")); + } + + public SortCodeGenerator createSortCodeGenerator() { + SortSpec.SortSpecBuilder builder = SortSpec.builder(); + for (int sortIndex : sortIndices) { + builder.addField(sortIndex, true, true); + } + return new SortCodeGenerator(tableConfig, rowType, builder.build()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringCommitEvent.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringCommitEvent.java new file mode 100644 index 0000000000000..46a15a62648bf --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringCommitEvent.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.client.WriteStatus; + +import java.io.Serializable; +import java.util.List; + +/** + * Represents a commit event from the clustering task {@link ClusteringOperator}. + */ +public class ClusteringCommitEvent implements Serializable { + private static final long serialVersionUID = 1L; + + /** + * The clustering commit instant time. + */ + private String instant; + /** + * The write statuses. 
+ */ + private List writeStatuses; + /** + * The clustering task identifier. + */ + private int taskID; + + public ClusteringCommitEvent() { + } + + public ClusteringCommitEvent(String instant, List writeStatuses, int taskID) { + this.instant = instant; + this.writeStatuses = writeStatuses; + this.taskID = taskID; + } + + public ClusteringCommitEvent(String instant, int taskID) { + this(instant, null, taskID); + } + + public void setInstant(String instant) { + this.instant = instant; + } + + public void setWriteStatuses(List writeStatuses) { + this.writeStatuses = writeStatuses; + } + + public void setTaskID(int taskID) { + this.taskID = taskID; + } + + public String getInstant() { + return instant; + } + + public List getWriteStatuses() { + return writeStatuses; + } + + public int getTaskID() { + return taskID; + } + + public boolean isFailed() { + return this.writeStatuses == null; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringCommitSink.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringCommitSink.java new file mode 100644 index 0000000000000..5a46dcf8f3360 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringCommitSink.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.TableServiceType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.sink.CleanFunction; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.configuration.Configuration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Function to check and commit the clustering action. + * + *

    Each time a clustering commit event {@link ClusteringCommitEvent} is received, + * it loads and checks the clustering plan {@link org.apache.hudi.avro.model.HoodieClusteringPlan}; + * if all the clustering operations {@link org.apache.hudi.common.model.ClusteringOperation} + * of the plan have finished, it tries to commit the clustering action. + * + *

    It also inherits the {@link CleanFunction} cleaning ability. This is needed because + * the SQL API does not allow multiple sinks in one table sink provider. + */ +public class ClusteringCommitSink extends CleanFunction { + private static final Logger LOG = LoggerFactory.getLogger(ClusteringCommitSink.class); + + /** + * Config options. + */ + private final Configuration conf; + + private transient HoodieFlinkTable table; + + /** + * Buffer to collect the event from each clustering task {@code ClusteringFunction}. + * The key is the instant time. + */ + private transient Map> commitBuffer; + + public ClusteringCommitSink(Configuration conf) { + super(conf); + this.conf = conf; + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + if (writeClient == null) { + this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext()); + } + this.commitBuffer = new HashMap<>(); + this.table = writeClient.getHoodieTable(); + } + + @Override + public void invoke(ClusteringCommitEvent event, Context context) throws Exception { + final String instant = event.getInstant(); + commitBuffer.computeIfAbsent(instant, k -> new ArrayList<>()) + .add(event); + commitIfNecessary(instant, commitBuffer.get(instant)); + } + + /** + * Condition to commit: the commit buffer has equal size with the clustering plan operations + * and all the clustering commit event {@link ClusteringCommitEvent} has the same clustering instant time. + * + * @param instant Clustering commit instant time + * @param events Commit events ever received for the instant + */ + private void commitIfNecessary(String instant, List events) { + HoodieInstant clusteringInstant = HoodieTimeline.getReplaceCommitInflightInstant(instant); + Option> clusteringPlanOption = ClusteringUtils.getClusteringPlan( + StreamerUtil.createMetaClient(this.conf), clusteringInstant); + HoodieClusteringPlan clusteringPlan = clusteringPlanOption.get().getRight(); + boolean isReady = clusteringPlan.getInputGroups().size() == events.size(); + if (!isReady) { + return; + } + + if (events.stream().anyMatch(ClusteringCommitEvent::isFailed)) { + try { + // handle failure case + CompactionUtil.rollbackCompaction(table, instant); + } finally { + // remove commitBuffer to avoid obsolete metadata commit + reset(instant); + } + return; + } + + try { + doCommit(instant, clusteringPlan, events); + } catch (Throwable throwable) { + // make it fail-safe + LOG.error("Error while committing clustering instant: " + instant, throwable); + } finally { + // reset the status + reset(instant); + } + } + + private void doCommit(String instant, HoodieClusteringPlan clusteringPlan, List events) { + List statuses = events.stream() + .map(ClusteringCommitEvent::getWriteStatuses) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + + HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); + writeMetadata.setWriteStatuses(statuses); + writeMetadata.setWriteStats(statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList())); + writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata)); + validateWriteResult(clusteringPlan, instant, writeMetadata); + if (!writeMetadata.getCommitMetadata().isPresent()) { + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata( + writeMetadata.getWriteStats().get(), + writeMetadata.getPartitionToReplaceFileIds(), + Option.empty(), + WriteOperationType.CLUSTER, + this.writeClient.getConfig().getSchema(), + 
HoodieTimeline.REPLACE_COMMIT_ACTION); + writeMetadata.setCommitMetadata(Option.of(commitMetadata)); + } + // commit the clustering + this.table.getMetaClient().reloadActiveTimeline(); + this.writeClient.completeTableService( + TableServiceType.CLUSTER, writeMetadata.getCommitMetadata().get(), table, instant); + + // whether to clean up the input base parquet files used for clustering + if (!conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) { + LOG.info("Running inline clean"); + this.writeClient.clean(); + } + } + + private void reset(String instant) { + this.commitBuffer.remove(instant); + } + + /** + * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written. + * But we can extend this to add more validation. E.g. number of records read = number of records written etc. + * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions. + */ + private static void validateWriteResult(HoodieClusteringPlan clusteringPlan, String instantTime, HoodieWriteMetadata> writeMetadata) { + if (writeMetadata.getWriteStatuses().isEmpty()) { + throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime + + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least " + + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum() + + " write statuses"); + } + } + + private static Map> getPartitionToReplacedFileIds( + HoodieClusteringPlan clusteringPlan, + HoodieWriteMetadata> writeMetadata) { + Set newFilesWritten = writeMetadata.getWriteStats().get().stream() + .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet()); + return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan) + .filter(fg -> !newFilesWritten.contains(fg)) + .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList()))); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringOperator.java new file mode 100644 index 0000000000000..e7bde41ca8b0a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringOperator.java @@ -0,0 +1,388 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
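To make the buffering rule in commitIfNecessary concrete, here is a small, self-contained Java sketch of the same idea using plain collections: one event is expected per clustering input group, the instant is ready to commit only when all of them have arrived and none has failed, and the per-instant buffer is reset either way. The Event class and method names are simplified stand-ins, not the Hudi types used in the patch.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch: per-instant buffering of clustering commit events, simplified.
public class CommitBufferSketch {

  // Minimal event: an instant time and a failure flag (stand-in for ClusteringCommitEvent).
  static final class Event {
    final String instant;
    final boolean failed;
    Event(String instant, boolean failed) {
      this.instant = instant;
      this.failed = failed;
    }
  }

  private final Map<String, List<Event>> commitBuffer = new HashMap<>();

  // Buffers the event and returns true only when the instant can be committed:
  // one event per clustering input group has been received and none of them failed.
  boolean onEvent(Event event, int numInputGroups) {
    List<Event> events = commitBuffer.computeIfAbsent(event.instant, k -> new ArrayList<>());
    events.add(event);
    if (events.size() < numInputGroups) {
      return false; // still waiting for the remaining clustering tasks
    }
    boolean anyFailed = events.stream().anyMatch(e -> e.failed);
    commitBuffer.remove(event.instant); // reset the buffer either way, like reset(instant)
    return !anyFailed;                  // a failed task means rollback instead of commit
  }

  public static void main(String[] args) {
    CommitBufferSketch sink = new CommitBufferSketch();
    System.out.println(sink.onEvent(new Event("20240101000000", false), 2)); // false: 1 of 2 received
    System.out.println(sink.onEvent(new Event("20240101000000", false), 2)); // true: ready to commit
  }
}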
+ */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.utils.ConcatenatingIterator; +import org.apache.hudi.common.model.ClusteringGroupInfo; +import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.log.HoodieFileSliceReader; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.IOUtils; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.sink.bulk.BulkInsertWriterHelper; +import org.apache.hudi.sink.bulk.sort.SortOperatorGen; +import org.apache.hudi.sink.utils.NonThrownExecutor; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.AvroToRowDataConverters; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.binary.BinaryRowData; +import org.apache.flink.table.planner.codegen.sort.SortCodeGenerator; +import org.apache.flink.table.runtime.generated.NormalizedKeyComputer; +import org.apache.flink.table.runtime.generated.RecordComparator; +import org.apache.flink.table.runtime.operators.TableStreamOperator; +import org.apache.flink.table.runtime.operators.sort.BinaryExternalSorter; +import org.apache.flink.table.runtime.typeutils.AbstractRowDataSerializer; +import org.apache.flink.table.runtime.typeutils.BinaryRowDataSerializer; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.runtime.util.StreamRecordCollector; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import static 
org.apache.hudi.table.format.FormatUtils.buildAvroRecordBySchema; + +/** + * Operator to execute the actual clustering task assigned by the clustering plan task. + * In order to execute scalable, the input should shuffle by the clustering event {@link ClusteringPlanEvent}. + */ +public class ClusteringOperator extends TableStreamOperator implements + OneInputStreamOperator, BoundedOneInput { + private static final Logger LOG = LoggerFactory.getLogger(ClusteringOperator.class); + + private final Configuration conf; + private final boolean preserveHoodieMetadata; + private final RowType rowType; + private int taskID; + private transient HoodieWriteConfig writeConfig; + private transient HoodieFlinkTable table; + private transient Schema schema; + private transient Schema readerSchema; + private transient int[] requiredPos; + private transient AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter; + private transient HoodieFlinkWriteClient writeClient; + private transient StreamRecordCollector collector; + private transient BinaryRowDataSerializer binarySerializer; + + /** + * Whether to execute clustering asynchronously. + */ + private final boolean asyncClustering; + + /** + * Whether the clustering sort is enabled. + */ + private final boolean sortClusteringEnabled; + + /** + * Executor service to execute the clustering task. + */ + private transient NonThrownExecutor executor; + + public ClusteringOperator(Configuration conf, RowType rowType) { + this.conf = conf; + this.preserveHoodieMetadata = conf.getBoolean(HoodieClusteringConfig.PRESERVE_COMMIT_METADATA.key(), HoodieClusteringConfig.PRESERVE_COMMIT_METADATA.defaultValue()); + this.rowType = this.preserveHoodieMetadata + ? BulkInsertWriterHelper.addMetadataFields(rowType, false) + : rowType; + this.asyncClustering = OptionsResolver.needsAsyncClustering(conf); + this.sortClusteringEnabled = OptionsResolver.sortClusteringEnabled(conf); + + // override max parquet file size in conf + this.conf.setLong(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(), + this.conf.getLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES)); + } + + @Override + public void open() throws Exception { + super.open(); + + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.writeConfig = FlinkWriteClients.getHoodieClientConfig(this.conf); + this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext()); + this.table = writeClient.getHoodieTable(); + + this.schema = AvroSchemaConverter.convertToSchema(rowType); + this.readerSchema = this.preserveHoodieMetadata ? this.schema : HoodieAvroUtils.addMetadataFields(this.schema); + this.requiredPos = getRequiredPositions(); + + this.avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(rowType); + this.binarySerializer = new BinaryRowDataSerializer(rowType.getFieldCount()); + + if (this.asyncClustering) { + this.executor = NonThrownExecutor.builder(LOG).build(); + } + + this.collector = new StreamRecordCollector<>(output); + } + + @Override + public void processWatermark(Watermark mark) { + // no need to propagate the watermark + } + + @Override + public void processElement(StreamRecord element) throws Exception { + ClusteringPlanEvent event = element.getValue(); + final String instantTime = event.getClusteringInstantTime(); + if (this.asyncClustering) { + // executes the compaction task asynchronously to not block the checkpoint barrier propagate. 
+ executor.execute( + () -> doClustering(instantTime, event), + (errMsg, t) -> collector.collect(new ClusteringCommitEvent(instantTime, taskID)), + "Execute clustering for instant %s from task %d", instantTime, taskID); + } else { + // executes the clustering task synchronously for batch mode. + LOG.info("Execute clustering for instant {} from task {}", instantTime, taskID); + doClustering(instantTime, event); + } + } + + @Override + public void close() throws Exception { + if (null != this.executor) { + this.executor.close(); + } + if (this.writeClient != null) { + this.writeClient.cleanHandlesGracefully(); + this.writeClient.close(); + this.writeClient = null; + } + } + + /** + * End input action for batch source. + */ + public void endInput() { + // no operation + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void doClustering(String instantTime, ClusteringPlanEvent event) throws Exception { + final ClusteringGroupInfo clusteringGroupInfo = event.getClusteringGroupInfo(); + + BulkInsertWriterHelper writerHelper = new BulkInsertWriterHelper(this.conf, this.table, this.writeConfig, + instantTime, this.taskID, getRuntimeContext().getNumberOfParallelSubtasks(), getRuntimeContext().getAttemptNumber(), + this.rowType, this.preserveHoodieMetadata); + + List clusteringOps = clusteringGroupInfo.getOperations(); + boolean hasLogFiles = clusteringOps.stream().anyMatch(op -> op.getDeltaFilePaths().size() > 0); + + Iterator iterator; + if (hasLogFiles) { + // if there are log files, we read all records into memory for a file group and apply updates. + iterator = readRecordsForGroupWithLogs(clusteringOps, instantTime); + } else { + // We want to optimize reading records for case there are no log files. + iterator = readRecordsForGroupBaseFiles(clusteringOps); + } + + RowDataSerializer rowDataSerializer = new RowDataSerializer(rowType); + + if (this.sortClusteringEnabled) { + BinaryExternalSorter sorter = initSorter(); + while (iterator.hasNext()) { + RowData rowData = iterator.next(); + BinaryRowData binaryRowData = rowDataSerializer.toBinaryRow(rowData).copy(); + sorter.write(binaryRowData); + } + + BinaryRowData row = binarySerializer.createInstance(); + while ((row = sorter.getIterator().next(row)) != null) { + writerHelper.write(row); + } + sorter.close(); + } else { + while (iterator.hasNext()) { + writerHelper.write(iterator.next()); + } + } + + List writeStatuses = writerHelper.getWriteStatuses(this.taskID); + collector.collect(new ClusteringCommitEvent(instantTime, writeStatuses, this.taskID)); + writerHelper.close(); + } + + /** + * Read records from baseFiles, apply updates and convert to Iterator. + */ + @SuppressWarnings("unchecked") + private Iterator readRecordsForGroupWithLogs(List clusteringOps, String instantTime) { + List> recordIterators = new ArrayList<>(); + + long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new FlinkTaskContextSupplier(null), writeConfig); + LOG.info("MaxMemoryPerCompaction run as part of clustering => " + maxMemoryPerCompaction); + + for (ClusteringOperation clusteringOp : clusteringOps) { + try { + Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath()) + ? 
Option.empty() + : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()))); + HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(table.getMetaClient().getFs()) + .withBasePath(table.getMetaClient().getBasePath()) + .withLogFilePaths(clusteringOp.getDeltaFilePaths()) + .withReaderSchema(readerSchema) + .withLatestInstantTime(instantTime) + .withMaxMemorySizeInBytes(maxMemoryPerCompaction) + .withReadBlocksLazily(writeConfig.getCompactionLazyBlockReadEnabled()) + .withReverseReader(writeConfig.getCompactionReverseLogReadEnabled()) + .withBufferSize(writeConfig.getMaxDFSStreamBufferSize()) + .withSpillableMapBasePath(writeConfig.getSpillableMapBasePath()) + .withDiskMapType(writeConfig.getCommonConfig().getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .build(); + + HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); + HoodieFileSliceReader hoodieFileSliceReader = HoodieFileSliceReader.getFileSliceReader(baseFileReader, scanner, readerSchema, + tableConfig.getPayloadClass(), + tableConfig.getPreCombineField(), + tableConfig.populateMetaFields() ? Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), + tableConfig.getPartitionFieldProp()))); + + recordIterators.add(StreamSupport.stream(Spliterators.spliteratorUnknownSize(hoodieFileSliceReader, Spliterator.NONNULL), false).map(hoodieRecord -> { + try { + return this.transform((IndexedRecord) hoodieRecord.getData().getInsertValue(readerSchema).get()); + } catch (IOException e) { + throw new HoodieIOException("Failed to read next record", e); + } + }).iterator()); + } catch (IOException e) { + throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + + " and " + clusteringOp.getDeltaFilePaths(), e); + } + } + + return new ConcatenatingIterator<>(recordIterators); + } + + /** + * Read records from baseFiles and get iterator. + */ + private Iterator readRecordsForGroupBaseFiles(List clusteringOps) { + List> iteratorsForPartition = clusteringOps.stream().map(clusteringOp -> { + Iterable indexedRecords = () -> { + try { + return HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath())).getRecordIterator(readerSchema); + } catch (IOException e) { + throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + + " and " + clusteringOp.getDeltaFilePaths(), e); + } + }; + + return StreamSupport.stream(indexedRecords.spliterator(), false).map(this::transform).iterator(); + }).collect(Collectors.toList()); + + return new ConcatenatingIterator<>(iteratorsForPartition); + } + + /** + * Transform IndexedRecord into HoodieRecord. + */ + private RowData transform(IndexedRecord indexedRecord) { + GenericRecord record = this.preserveHoodieMetadata + ? 
(GenericRecord) indexedRecord + : buildAvroRecordBySchema(indexedRecord, schema, requiredPos, new GenericRecordBuilder(schema)); + return (RowData) avroToRowDataConverter.convert(record); + } + + private int[] getRequiredPositions() { + final List fieldNames = readerSchema.getFields().stream().map(Schema.Field::name).collect(Collectors.toList()); + return schema.getFields().stream() + .map(field -> fieldNames.indexOf(field.name())) + .mapToInt(i -> i) + .toArray(); + } + + private BinaryExternalSorter initSorter() { + ClassLoader cl = getContainingTask().getUserCodeClassLoader(); + NormalizedKeyComputer computer = createSortCodeGenerator().generateNormalizedKeyComputer("SortComputer").newInstance(cl); + RecordComparator comparator = createSortCodeGenerator().generateRecordComparator("SortComparator").newInstance(cl); + + MemoryManager memManager = getContainingTask().getEnvironment().getMemoryManager(); + BinaryExternalSorter sorter = + new BinaryExternalSorter( + this.getContainingTask(), + memManager, + computeMemorySize(), + this.getContainingTask().getEnvironment().getIOManager(), + (AbstractRowDataSerializer) binarySerializer, + binarySerializer, + computer, + comparator, + getContainingTask().getJobConfiguration()); + sorter.startThreads(); + + // register the metrics. + getMetricGroup().gauge("memoryUsedSizeInBytes", (Gauge) sorter::getUsedMemoryInBytes); + getMetricGroup().gauge("numSpillFiles", (Gauge) sorter::getNumSpillFiles); + getMetricGroup().gauge("spillInBytes", (Gauge) sorter::getSpillInBytes); + return sorter; + } + + private SortCodeGenerator createSortCodeGenerator() { + SortOperatorGen sortOperatorGen = new SortOperatorGen(rowType, + conf.getString(FlinkOptions.CLUSTERING_SORT_COLUMNS).split(",")); + return sortOperatorGen.createSortCodeGenerator(); + } + + @VisibleForTesting + public void setExecutor(NonThrownExecutor executor) { + this.executor = executor; + } + + @VisibleForTesting + public void setOutput(Output> output) { + this.output = output; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanEvent.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanEvent.java new file mode 100644 index 0000000000000..c82075877bcf3 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanEvent.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.common.model.ClusteringGroupInfo; + +import java.io.Serializable; +import java.util.Map; + +/** + * Represents a cluster command from the clustering plan task {@link ClusteringPlanSourceFunction}. 
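The getRequiredPositions helper in ClusteringOperator maps every field of the write schema to its index in the reader schema, which additionally carries the Hoodie metadata columns in front. Below is a tiny stand-alone sketch of the same index mapping over plain field-name lists; the concrete field names are only examples, and only a subset of the metadata columns is shown.

import java.util.Arrays;
import java.util.List;

// Sketch: map each write-schema field to its position in the reader schema.
public class RequiredPositionsSketch {

  // Returns, for each field of the write schema, its index in the reader schema.
  static int[] requiredPositions(List<String> readerFields, List<String> writeFields) {
    return writeFields.stream()
        .mapToInt(readerFields::indexOf)
        .toArray();
  }

  public static void main(String[] args) {
    // The reader schema carries Hoodie metadata columns before the data columns (subset shown).
    List<String> readerFields = Arrays.asList(
        "_hoodie_commit_time", "_hoodie_record_key", "_hoodie_partition_path", "uuid", "age");
    List<String> writeFields = Arrays.asList("uuid", "age");
    System.out.println(Arrays.toString(requiredPositions(readerFields, writeFields))); // [3, 4]
  }
}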
+ */ +public class ClusteringPlanEvent implements Serializable { + private static final long serialVersionUID = 1L; + + private String clusteringInstantTime; + + private ClusteringGroupInfo clusteringGroupInfo; + + private Map strategyParams; + + public ClusteringPlanEvent() { + } + + public ClusteringPlanEvent( + String instantTime, + ClusteringGroupInfo clusteringGroupInfo, + Map strategyParams) { + this.clusteringInstantTime = instantTime; + this.clusteringGroupInfo = clusteringGroupInfo; + this.strategyParams = strategyParams; + } + + public void setClusteringInstantTime(String clusteringInstantTime) { + this.clusteringInstantTime = clusteringInstantTime; + } + + public void setClusteringGroupInfo(ClusteringGroupInfo clusteringGroupInfo) { + this.clusteringGroupInfo = clusteringGroupInfo; + } + + public void setStrategyParams(Map strategyParams) { + this.strategyParams = strategyParams; + } + + public String getClusteringInstantTime() { + return clusteringInstantTime; + } + + public ClusteringGroupInfo getClusteringGroupInfo() { + return clusteringGroupInfo; + } + + public Map getStrategyParams() { + return strategyParams; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanOperator.java new file mode 100644 index 0000000000000..48b2a9becd436 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanOperator.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.common.model.ClusteringGroupInfo; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.ClusteringUtil; +import org.apache.hudi.util.FlinkTables; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; + +/** + * Operator that generates the clustering plan with pluggable strategies on finished checkpoints. + * + *

    It should be singleton to avoid conflicts. + */ +public class ClusteringPlanOperator extends AbstractStreamOperator + implements OneInputStreamOperator { + + /** + * Config options. + */ + private final Configuration conf; + + /** + * Meta Client. + */ + @SuppressWarnings("rawtypes") + private transient HoodieFlinkTable table; + + public ClusteringPlanOperator(Configuration conf) { + this.conf = conf; + } + + @Override + public void open() throws Exception { + super.open(); + this.table = FlinkTables.createTable(conf, getRuntimeContext()); + // when starting up, rolls back all the inflight clustering instants if there exists, + // these instants are in priority for scheduling task because the clustering instants are + // scheduled from earliest(FIFO sequence). + ClusteringUtil.rollbackClustering(table, FlinkWriteClients.createWriteClient(conf, getRuntimeContext())); + } + + @Override + public void processElement(StreamRecord streamRecord) { + // no operation + } + + @Override + public void notifyCheckpointComplete(long checkpointId) { + try { + table.getMetaClient().reloadActiveTimeline(); + scheduleClustering(table, checkpointId); + } catch (Throwable throwable) { + // make it fail-safe + LOG.error("Error while scheduling clustering plan for checkpoint: " + checkpointId, throwable); + } + } + + private void scheduleClustering(HoodieFlinkTable table, long checkpointId) { + // the first instant takes the highest priority. + Option firstRequested = Option.fromJavaOptional( + ClusteringUtils.getPendingClusteringInstantTimes(table.getMetaClient()).stream() + .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED).findFirst()); + if (!firstRequested.isPresent()) { + // do nothing. + LOG.info("No clustering plan for checkpoint " + checkpointId); + return; + } + + String clusteringInstantTime = firstRequested.get().getTimestamp(); + + // generate clustering plan + // should support configurable commit metadata + HoodieInstant clusteringInstant = HoodieTimeline.getReplaceCommitRequestedInstant(clusteringInstantTime); + Option> clusteringPlanOption = ClusteringUtils.getClusteringPlan( + table.getMetaClient(), clusteringInstant); + + if (!clusteringPlanOption.isPresent()) { + // do nothing. + LOG.info("No clustering plan scheduled"); + return; + } + + HoodieClusteringPlan clusteringPlan = clusteringPlanOption.get().getRight(); + + if (clusteringPlan == null || (clusteringPlan.getInputGroups() == null) + || (clusteringPlan.getInputGroups().isEmpty())) { + // do nothing. 
+ LOG.info("Empty clustering plan for instant " + clusteringInstantTime); + } else { + // Mark instant as clustering inflight + table.getActiveTimeline().transitionReplaceRequestedToInflight(clusteringInstant, Option.empty()); + table.getMetaClient().reloadActiveTimeline(); + + for (HoodieClusteringGroup clusteringGroup : clusteringPlan.getInputGroups()) { + LOG.info("Execute clustering plan for instant {} as {} file slices", clusteringInstantTime, clusteringGroup.getSlices().size()); + output.collect(new StreamRecord<>( + new ClusteringPlanEvent(clusteringInstantTime, ClusteringGroupInfo.create(clusteringGroup), clusteringPlan.getStrategy().getStrategyParams()) + )); + } + } + } + + @VisibleForTesting + public void setOutput(Output> output) { + this.output = output; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanSourceFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanSourceFunction.java new file mode 100644 index 0000000000000..fafaf9a1ce963 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/ClusteringPlanSourceFunction.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.common.model.ClusteringGroupInfo; +import org.apache.hudi.common.model.ClusteringOperation; + +import org.apache.flink.api.common.functions.AbstractRichFunction; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Flink hudi clustering source function. + * + *

    This function reads the clustering plan as {@link ClusteringOperation}s and then assigns the clustering task + * events {@link ClusteringPlanEvent} to the downstream operators. + * + *

    The clustering instant time is specified explicitly with strategies: + * + * • If the timeline has no inflight instants, use {@link org.apache.hudi.common.table.timeline.HoodieActiveTimeline#createNewInstantTime()} as the instant time; + * • If the timeline has inflight instants, use the median instant time between [last complete instant time, earliest inflight instant time] as the instant time.
    + */ +public class ClusteringPlanSourceFunction extends AbstractRichFunction implements SourceFunction { + + protected static final Logger LOG = LoggerFactory.getLogger(ClusteringPlanSourceFunction.class); + + /** + * The clustering plan. + */ + private final HoodieClusteringPlan clusteringPlan; + + /** + * Clustering instant time. + */ + private final String clusteringInstantTime; + + public ClusteringPlanSourceFunction(String clusteringInstantTime, HoodieClusteringPlan clusteringPlan) { + this.clusteringInstantTime = clusteringInstantTime; + this.clusteringPlan = clusteringPlan; + } + + @Override + public void open(Configuration parameters) throws Exception { + // no operation + } + + @Override + public void run(SourceContext sourceContext) throws Exception { + for (HoodieClusteringGroup clusteringGroup : clusteringPlan.getInputGroups()) { + LOG.info("Execute clustering plan for instant {} as {} file slices", clusteringInstantTime, clusteringGroup.getSlices().size()); + sourceContext.collect(new ClusteringPlanEvent(this.clusteringInstantTime, ClusteringGroupInfo.create(clusteringGroup), clusteringPlan.getStrategy().getStrategyParams())); + } + } + + @Override + public void close() throws Exception { + // no operation + } + + @Override + public void cancel() { + // no operation + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/FlinkClusteringConfig.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/FlinkClusteringConfig.java new file mode 100644 index 0000000000000..3bbae38e00e5e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/FlinkClusteringConfig.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy; +import org.apache.hudi.common.config.DFSPropertiesConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.util.StreamerUtil; + +import com.beust.jcommander.Parameter; +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.Path; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Configurations for Hoodie Flink clustering. 
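Taken together, the plan source function, the clustering operator and the commit sink form one short pipeline per scheduled plan. The sketch below shows one plausible way to wire them with the DataStream API; the operator names and the parallelism choice are assumptions, and details such as managed memory for the sort stage and rebalancing of plan events are omitted.

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.types.logical.RowType;

import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.sink.clustering.ClusteringCommitEvent;
import org.apache.hudi.sink.clustering.ClusteringCommitSink;
import org.apache.hudi.sink.clustering.ClusteringOperator;
import org.apache.hudi.sink.clustering.ClusteringPlanSourceFunction;

// Sketch: wire the clustering stages for a single scheduled clustering plan.
public class ClusteringPipelineSketch {

  public static void wire(
      StreamExecutionEnvironment env,
      Configuration conf,
      RowType rowType,
      String clusteringInstantTime,
      HoodieClusteringPlan clusteringPlan) {
    env.addSource(new ClusteringPlanSourceFunction(clusteringInstantTime, clusteringPlan))
        .name("clustering_source")
        // one ClusteringCommitEvent is produced per clustering input group
        .transform("clustering_task",
            TypeInformation.of(ClusteringCommitEvent.class),
            new ClusteringOperator(conf, rowType))
        .setParallelism(clusteringPlan.getInputGroups().size())
        // the commit sink runs as a singleton so it can collect every event of the instant
        .addSink(new ClusteringCommitSink(conf))
        .name("clustering_commit")
        .setParallelism(1);
  }
}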
+ */ +public class FlinkClusteringConfig extends Configuration { + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + // ------------------------------------------------------------------------ + // Hudi Write Options + // ------------------------------------------------------------------------ + + @Parameter(names = {"--path"}, description = "Base path for the target hoodie table.", required = true) + public String path; + + // ------------------------------------------------------------------------ + // Clustering Options + // ------------------------------------------------------------------------ + @Parameter(names = {"--clustering-delta-commits"}, description = "Max delta commits needed to trigger clustering, default 4 commits", required = false) + public Integer clusteringDeltaCommits = 1; + + @Parameter(names = {"--clustering-tasks"}, description = "Parallelism of tasks that do actual clustering, default is -1", required = false) + public Integer clusteringTasks = -1; + + @Parameter(names = {"--compaction-max-memory"}, description = "Max memory in MB for compaction spillable map, default 100MB.", required = false) + public Integer compactionMaxMemory = 100; + + @Parameter(names = {"--clean-retain-commits"}, + description = "Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled).\n" + + "This also directly translates into how much you can incrementally pull on this table, default 10", + required = false) + public Integer cleanRetainCommits = 10; + + @Parameter(names = {"--archive-min-commits"}, + description = "Min number of commits to keep before archiving older commits into a sequential log, default 20.", + required = false) + public Integer archiveMinCommits = 20; + + @Parameter(names = {"--archive-max-commits"}, + description = "Max number of commits to keep before archiving older commits into a sequential log, default 30.", + required = false) + public Integer archiveMaxCommits = 30; + + @Parameter(names = {"--schedule", "-sc"}, description = "Schedule the clustering plan in this job.\n" + + "Default is false", required = false) + public Boolean schedule = false; + + @Parameter(names = {"--instant-time", "-it"}, description = "Clustering Instant time") + public String clusteringInstantTime = null; + + @Parameter(names = {"--clean-async-enabled"}, description = "Whether to cleanup the old commits immediately on new commits, disabled by default", required = false) + public Boolean cleanAsyncEnable = false; + + @Parameter(names = {"--plan-strategy-class"}, description = "Config to provide a strategy class to generator clustering plan", required = false) + public String planStrategyClass = FlinkSizeBasedClusteringPlanStrategy.class.getName(); + + @Parameter(names = {"--plan-partition-filter-mode"}, description = "Partition filter mode used in the creation of clustering plan", required = false) + public String planPartitionFilterMode = "NONE"; + + @Parameter(names = {"--target-file-max-bytes"}, description = "Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB", required = false) + public Long targetFileMaxBytes = 1024 * 1024 * 1024L; + + @Parameter(names = {"--small-file-limit"}, description = "Files smaller than the size specified here are candidates for clustering, default 600 MB", required = false) + public Long smallFileLimit = 600L; + + @Parameter(names = {"--skip-from-latest-partitions"}, description = "Number of partitions to 
skip from latest when choosing partitions to create ClusteringPlan, default 0", required = false) + public Integer skipFromLatestPartitions = 0; + + @Parameter(names = {"--sort-columns"}, description = "Columns to sort the data by when clustering.", required = false) + public String sortColumns = ""; + + @Parameter(names = {"--max-num-groups"}, description = "Maximum number of groups to create as part of ClusteringPlan. Increasing groups will increase parallelism. default 30", required = false) + public Integer maxNumGroups = 30; + + @Parameter(names = {"--target-partitions"}, description = "Number of partitions to list to create ClusteringPlan, default 2", required = false) + public Integer targetPartitions = 2; + + public static final String SEQ_FIFO = "FIFO"; + public static final String SEQ_LIFO = "LIFO"; + @Parameter(names = {"--seq"}, description = "Clustering plan execution sequence, two options are supported:\n" + + "1). FIFO: execute the oldest plan first;\n" + + "2). LIFO: execute the latest plan first, by default FIFO", required = false) + public String clusteringSeq = SEQ_FIFO; + + @Parameter(names = {"--service"}, description = "Flink Clustering runs in service mode, disable by default") + public Boolean serviceMode = false; + + @Parameter(names = {"--min-clustering-interval-seconds"}, + description = "Min clustering interval of async clustering service, default 10 minutes") + public Integer minClusteringIntervalSeconds = 600; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--props\") can also be passed through command line using this parameter.") + public List configs = new ArrayList<>(); + + @Parameter(names = {"--props"}, description = "Path to properties file on localfs or dfs, with configurations for " + + "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are " + + "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. For sources, refer" + + "to individual classes, for supported properties.") + public String propsFilePath = ""; + + public static TypedProperties buildProperties(List props) { + TypedProperties properties = DFSPropertiesConfiguration.getGlobalProps(); + props.forEach(x -> { + String[] kv = x.split("="); + ValidationUtils.checkArgument(kv.length == 2); + properties.setProperty(kv[0], kv[1]); + }); + return properties; + } + + public static TypedProperties getProps(FlinkClusteringConfig cfg) { + return cfg.propsFilePath.isEmpty() + ? buildProperties(cfg.configs) + : StreamerUtil.readConfig(HadoopConfigurations.getHadoopConf(cfg), + new Path(cfg.propsFilePath), cfg.configs).getProps(); + } + + /** + * Transforms a {@code FlinkClusteringConfig.config} into {@code Configuration}. + * The latter is more suitable for the table APIs. It reads all the properties + * in the properties file (set by `--props` option) and cmd line options + * (set by `--hoodie-conf` option). 
+ */ + public static Configuration toFlinkConfig(FlinkClusteringConfig config) { + Map propsMap = new HashMap((Map) getProps(config)); + org.apache.flink.configuration.Configuration conf = fromMap(propsMap); + + conf.setString(FlinkOptions.PATH, config.path); + conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, config.archiveMaxCommits); + conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, config.archiveMinCommits); + conf.setInteger(FlinkOptions.CLEAN_RETAIN_COMMITS, config.cleanRetainCommits); + conf.setInteger(FlinkOptions.COMPACTION_MAX_MEMORY, config.compactionMaxMemory); + conf.setInteger(FlinkOptions.CLUSTERING_DELTA_COMMITS, config.clusteringDeltaCommits); + conf.setInteger(FlinkOptions.CLUSTERING_TASKS, config.clusteringTasks); + conf.setString(FlinkOptions.CLUSTERING_PLAN_STRATEGY_CLASS, config.planStrategyClass); + conf.setString(FlinkOptions.CLUSTERING_PLAN_PARTITION_FILTER_MODE_NAME, config.planPartitionFilterMode); + conf.setLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES, config.targetFileMaxBytes); + conf.setLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT, config.smallFileLimit); + conf.setInteger(FlinkOptions.CLUSTERING_PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST, config.skipFromLatestPartitions); + conf.setString(FlinkOptions.CLUSTERING_SORT_COLUMNS, config.sortColumns); + conf.setInteger(FlinkOptions.CLUSTERING_MAX_NUM_GROUPS, config.maxNumGroups); + conf.setInteger(FlinkOptions.CLUSTERING_TARGET_PARTITIONS, config.targetPartitions); + conf.setBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED, config.cleanAsyncEnable); + + // use synchronous clustering always + conf.setBoolean(FlinkOptions.CLUSTERING_ASYNC_ENABLED, false); + conf.setBoolean(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED, config.schedule); + + // bulk insert conf + HoodieTableConfig tableConfig = StreamerUtil.createMetaClient(conf).getTableConfig(); + conf.setBoolean(FlinkOptions.URL_ENCODE_PARTITIONING, Boolean.parseBoolean(tableConfig.getUrlEncodePartitioning())); + conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, Boolean.parseBoolean(tableConfig.getHiveStylePartitioningEnable())); + + return conf; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/HoodieFlinkClusteringJob.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/HoodieFlinkClusteringJob.java new file mode 100644 index 0000000000000..1942b1ce29e71 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/clustering/HoodieFlinkClusteringJob.java @@ -0,0 +1,380 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
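For the --hoodie-conf handling above, each argument is expected as a key=value pair that gets folded into the typed properties. A minimal, dependency-free sketch of the same parsing with java.util.Properties follows; the option keys in the example are only illustrative Hudi configs.

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

// Sketch: fold "--hoodie-conf key=value" arguments into a Properties object.
public class HoodieConfArgsSketch {

  static Properties toProperties(List<String> keyValues) {
    Properties props = new Properties();
    for (String pair : keyValues) {
      String[] kv = pair.split("=");
      if (kv.length != 2) {
        throw new IllegalArgumentException("Expected key=value but got: " + pair);
      }
      props.setProperty(kv[0], kv[1]);
    }
    return props;
  }

  public static void main(String[] args) {
    Properties props = toProperties(Arrays.asList(
        "hoodie.parquet.compression.codec=snappy",
        "hoodie.clustering.plan.strategy.sort.columns=uuid,age"));
    System.out.println(props);
  }
}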
+ */ + +package org.apache.hudi.sink.clustering; + +import org.apache.hudi.async.HoodieAsyncTableService; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.compact.HoodieFlinkCompactor; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.ClusteringUtil; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; + +import com.beust.jcommander.JCommander; +import org.apache.avro.Schema; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.client.deployment.application.ApplicationExecutionException; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Flink hudi clustering program that can be executed manually. + */ +public class HoodieFlinkClusteringJob { + + protected static final Logger LOG = LoggerFactory.getLogger(HoodieFlinkClusteringJob.class); + + private static final String NO_EXECUTE_KEYWORD = "no execute"; + + /** + * Flink Execution Environment. + */ + private final AsyncClusteringService clusteringScheduleService; + + public HoodieFlinkClusteringJob(AsyncClusteringService service) { + this.clusteringScheduleService = service; + } + + public static void main(String[] args) throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + FlinkClusteringConfig cfg = getFlinkClusteringConfig(args); + Configuration conf = FlinkClusteringConfig.toFlinkConfig(cfg); + + AsyncClusteringService service = new AsyncClusteringService(cfg, conf, env); + + new HoodieFlinkClusteringJob(service).start(cfg.serviceMode); + } + + /** + * Main method to start clustering service. 
+ */ + public void start(boolean serviceMode) throws Exception { + if (serviceMode) { + clusteringScheduleService.start(null); + try { + clusteringScheduleService.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } finally { + LOG.info("Shut down hoodie flink clustering"); + } + } else { + LOG.info("Hoodie Flink Clustering running only single round"); + try { + clusteringScheduleService.cluster(); + } catch (ApplicationExecutionException aee) { + if (aee.getMessage().contains(NO_EXECUTE_KEYWORD)) { + LOG.info("Clustering is not performed"); + } else { + LOG.error("Got error trying to perform clustering. Shutting down", aee); + throw aee; + } + } catch (Exception e) { + LOG.error("Got error running delta sync once. Shutting down", e); + throw e; + } finally { + LOG.info("Shut down hoodie flink clustering"); + } + } + } + + public static FlinkClusteringConfig getFlinkClusteringConfig(String[] args) { + FlinkClusteringConfig cfg = new FlinkClusteringConfig(); + JCommander cmd = new JCommander(cfg, null, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + return cfg; + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Schedules clustering in service. + */ + public static class AsyncClusteringService extends HoodieAsyncTableService { + + private static final long serialVersionUID = 1L; + + /** + * Flink Clustering Config. + */ + private final FlinkClusteringConfig cfg; + + /** + * Flink Config. + */ + private final Configuration conf; + + /** + * Meta Client. + */ + private final HoodieTableMetaClient metaClient; + + /** + * Write Client. + */ + private final HoodieFlinkWriteClient writeClient; + + /** + * The hoodie table. + */ + private final HoodieFlinkTable table; + + /** + * Flink Execution Environment. + */ + private final StreamExecutionEnvironment env; + + /** + * Executor Service. 
+ */ + private final ExecutorService executor; + + public AsyncClusteringService(FlinkClusteringConfig cfg, Configuration conf, StreamExecutionEnvironment env) throws Exception { + this.cfg = cfg; + this.conf = conf; + this.env = env; + this.executor = Executors.newFixedThreadPool(1); + + // create metaClient + this.metaClient = StreamerUtil.createMetaClient(conf); + + // set table name + conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); + + // set table type + conf.setString(FlinkOptions.TABLE_TYPE, metaClient.getTableConfig().getTableType().name()); + + // set record key field + conf.setString(FlinkOptions.RECORD_KEY_FIELD, metaClient.getTableConfig().getRecordKeyFieldProp()); + + // set partition field + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, metaClient.getTableConfig().getPartitionFieldProp()); + + // set table schema + CompactionUtil.setAvroSchema(conf, metaClient); + + this.writeClient = FlinkWriteClients.createWriteClientV2(conf); + this.writeConfig = writeClient.getConfig(); + this.table = writeClient.getHoodieTable(); + } + + @Override + protected Pair startService() { + return Pair.of(CompletableFuture.supplyAsync(() -> { + boolean error = false; + + try { + while (!isShutdownRequested()) { + try { + cluster(); + Thread.sleep(cfg.minClusteringIntervalSeconds * 1000); + } catch (ApplicationExecutionException aee) { + if (aee.getMessage().contains(NO_EXECUTE_KEYWORD)) { + LOG.info("Clustering is not performed."); + } else { + throw new HoodieException(aee.getMessage(), aee); + } + } catch (Exception e) { + LOG.error("Shutting down clustering service due to exception", e); + error = true; + throw new HoodieException(e.getMessage(), e); + } + } + } finally { + shutdownAsyncService(error); + } + return true; + }, executor), executor); + } + + /** + * Follows the same execution methodology of HoodieFlinkCompactor, where only one clustering job is allowed to be + * executed at any point in time. + *

+ * <p>If there is an inflight clustering job, it will be rolled back and re-attempted. + *

    + * A clustering plan will be generated if `schedule` is true. + * + * @throws Exception + * @see HoodieFlinkCompactor + */ + private void cluster() throws Exception { + table.getMetaClient().reloadActiveTimeline(); + + if (cfg.schedule) { + // create a clustering plan on the timeline + ClusteringUtil.validateClusteringScheduling(conf); + + String clusteringInstantTime = cfg.clusteringInstantTime != null ? cfg.clusteringInstantTime + : HoodieActiveTimeline.createNewInstantTime(); + + LOG.info("Creating a clustering plan for instant [" + clusteringInstantTime + "]"); + boolean scheduled = writeClient.scheduleClusteringAtInstant(clusteringInstantTime, Option.empty()); + if (!scheduled) { + // do nothing. + LOG.info("No clustering plan for this job"); + return; + } + table.getMetaClient().reloadActiveTimeline(); + } + + // fetch the instant based on the configured execution sequence + List instants = ClusteringUtils.getPendingClusteringInstantTimes(table.getMetaClient()); + if (instants.isEmpty()) { + // do nothing. + LOG.info("No clustering plan scheduled, turns on the clustering plan schedule with --schedule option"); + return; + } + + final HoodieInstant clusteringInstant; + if (cfg.clusteringInstantTime != null) { + clusteringInstant = instants.stream() + .filter(i -> i.getTimestamp().equals(cfg.clusteringInstantTime)) + .findFirst() + .orElseThrow(() -> new HoodieException("Clustering instant [" + cfg.clusteringInstantTime + "] not found")); + } else { + // check for inflight clustering plans and roll them back if required + clusteringInstant = + CompactionUtil.isLIFO(cfg.clusteringSeq) ? instants.get(instants.size() - 1) : instants.get(0); + } + + HoodieInstant inflightInstant = HoodieTimeline.getReplaceCommitInflightInstant( + clusteringInstant.getTimestamp()); + if (table.getMetaClient().getActiveTimeline().containsInstant(inflightInstant)) { + LOG.info("Rollback inflight clustering instant: [" + clusteringInstant + "]"); + table.rollbackInflightClustering(inflightInstant, + commitToRollback -> writeClient.getPendingRollbackInfo(table.getMetaClient(), commitToRollback, false)); + table.getMetaClient().reloadActiveTimeline(); + } + + // generate clustering plan + // should support configurable commit metadata + Option> clusteringPlanOption = ClusteringUtils.getClusteringPlan( + table.getMetaClient(), clusteringInstant); + + if (!clusteringPlanOption.isPresent()) { + // do nothing. + LOG.info("No clustering plan scheduled, turns on the clustering plan schedule with --schedule option"); + return; + } + + HoodieClusteringPlan clusteringPlan = clusteringPlanOption.get().getRight(); + + if (clusteringPlan == null || (clusteringPlan.getInputGroups() == null) + || (clusteringPlan.getInputGroups().isEmpty())) { + // no clustering plan, do nothing and return. + LOG.info("No clustering plan for instant " + clusteringInstant.getTimestamp()); + return; + } + + HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(clusteringInstant.getTimestamp()); + HoodieTimeline pendingClusteringTimeline = table.getActiveTimeline().filterPendingReplaceTimeline(); + if (!pendingClusteringTimeline.containsInstant(instant)) { + // this means that the clustering plan was written to auxiliary path(.tmp) + // but not the meta path(.hoodie), this usually happens when the job crush + // exceptionally. + + // clean the clustering plan in auxiliary path and cancels the clustering. 
+ LOG.warn("The clustering plan was fetched through the auxiliary path(.tmp) but not the meta path(.hoodie).\n" + + "Clean the clustering plan in auxiliary path and cancels the clustering"); + CompactionUtil.cleanInstant(table.getMetaClient(), instant); + return; + } + + // get clusteringParallelism. + int clusteringParallelism = conf.getInteger(FlinkOptions.CLUSTERING_TASKS) == -1 + ? clusteringPlan.getInputGroups().size() : conf.getInteger(FlinkOptions.CLUSTERING_TASKS); + + // Mark instant as clustering inflight + table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty()); + + final Schema tableAvroSchema = StreamerUtil.getTableAvroSchema(table.getMetaClient(), false); + final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema); + final RowType rowType = (RowType) rowDataType.getLogicalType(); + + // setup configuration + long ckpTimeout = env.getCheckpointConfig().getCheckpointTimeout(); + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout); + + DataStream dataStream = env.addSource(new ClusteringPlanSourceFunction(clusteringInstant.getTimestamp(), clusteringPlan)) + .name("clustering_source") + .uid("uid_clustering_source") + .rebalance() + .transform("clustering_task", + TypeInformation.of(ClusteringCommitEvent.class), + new ClusteringOperator(conf, rowType)) + .setParallelism(clusteringParallelism); + + ExecNodeUtil.setManagedMemoryWeight(dataStream.getTransformation(), + conf.getInteger(FlinkOptions.WRITE_SORT_MEMORY) * 1024L * 1024L); + + dataStream + .addSink(new ClusteringCommitSink(conf)) + .name("clustering_commit") + .uid("uid_clustering_commit") + .setParallelism(1); + + env.execute("flink_hudi_clustering_" + clusteringInstant.getTimestamp()); + } + + /** + * Shutdown async services like compaction/clustering as DeltaSync is shutdown. + */ + public void shutdownAsyncService(boolean error) { + LOG.info("Gracefully shutting down clustering job. Error ?" + error); + executor.shutdown(); + writeClient.close(); + } + + @VisibleForTesting + public void shutDown() { + shutdownAsyncService(false); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java new file mode 100644 index 0000000000000..f8438a4eb245c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.common; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; +import org.apache.hudi.sink.event.CommitAckEvent; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.sink.meta.CkpMetadata; +import org.apache.hudi.sink.utils.TimeWait; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * Base infrastructures for streaming writer function. + * + * @param Type of the input record + * @see StreamWriteOperatorCoordinator + */ +public abstract class AbstractStreamWriteFunction + extends AbstractWriteFunction + implements CheckpointedFunction { + + private static final Logger LOG = LoggerFactory.getLogger(AbstractStreamWriteFunction.class); + + /** + * Config options. + */ + protected final Configuration config; + + /** + * Id of current subtask. + */ + protected int taskID; + + /** + * Meta Client. + */ + protected transient HoodieTableMetaClient metaClient; + + /** + * Write Client. + */ + protected transient HoodieFlinkWriteClient writeClient; + + /** + * The REQUESTED instant we write the data. + */ + protected volatile String currentInstant; + + /** + * Gateway to send operator events to the operator coordinator. + */ + protected transient OperatorEventGateway eventGateway; + + /** + * Flag saying whether the write task is waiting for the checkpoint success notification + * after it finished a checkpoint. + * + *

+ * <p>The flag is needed because the write task does not block during the waiting time interval, + * so some data buckets may still flush out with the old instant time. There are two cases where the flush may produce + * corrupted files if the old instant is committed successfully: + * 1) the write handle was writing data but got interrupted, leaving a corrupted parquet file; + * 2) the write handle finished the write but was not closed, leaving an empty parquet file. + * + *
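A sketch of how a concrete write task is expected to consume this guard through instantToWrite(...), defined later in this class; bufferIsEmpty() and flushBucket(...) are hypothetical helper names used only for illustration.

    // Hypothetical flush path in a subclass: block on the ack handshake described
    // above, then tag the buffered data with the resolved instant.
    private void flushIfPossible() {
      String instant = instantToWrite(!bufferIsEmpty()); // may wait via TimeWait
      if (instant != null && !bufferIsEmpty()) {
        this.currentInstant = instant; // the REQUESTED instant to write with
        flushBucket(instant);          // hypothetical: write the bucket, collect WriteStatus
      }
    }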

    To solve, when this flag was set to true, we block the data flushing thus the #processElement method, + * the flag was reset to false if the task receives the checkpoint success event or the latest inflight instant + * time changed(the last instant committed successfully). + */ + protected volatile boolean confirming = false; + + /** + * List state of the write metadata events. + */ + private transient ListState writeMetadataState; + + /** + * Write status list for the current checkpoint. + */ + protected List writeStatuses; + + /** + * The checkpoint metadata. + */ + private transient CkpMetadata ckpMetadata; + + /** + * Since flink 1.15, the streaming job with bounded source triggers one checkpoint + * after calling #endInput, use this flag to avoid unnecessary data flush. + */ + private transient boolean inputEnded; + + /** + * Constructs a StreamWriteFunctionBase. + * + * @param config The config options + */ + public AbstractStreamWriteFunction(Configuration config) { + this.config = config; + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.metaClient = StreamerUtil.createMetaClient(this.config); + this.writeClient = FlinkWriteClients.createWriteClient(this.config, getRuntimeContext()); + this.writeStatuses = new ArrayList<>(); + this.writeMetadataState = context.getOperatorStateStore().getListState( + new ListStateDescriptor<>( + "write-metadata-state", + TypeInformation.of(WriteMetadataEvent.class) + )); + + this.ckpMetadata = CkpMetadata.getInstance(this.metaClient.getFs(), this.metaClient.getBasePath()); + this.currentInstant = lastPendingInstant(); + if (context.isRestored()) { + restoreWriteMetadata(); + } else { + sendBootstrapEvent(); + } + // blocks flushing until the coordinator starts a new instant + this.confirming = true; + } + + @Override + public void snapshotState(FunctionSnapshotContext functionSnapshotContext) throws Exception { + if (inputEnded) { + return; + } + snapshotState(); + // Reload the snapshot state as the current state. 
+ reloadWriteMetaState(); + } + + public abstract void snapshotState(); + + @Override + public void endInput() { + this.inputEnded = true; + } + + // ------------------------------------------------------------------------- + // Getter/Setter + // ------------------------------------------------------------------------- + @VisibleForTesting + public boolean isConfirming() { + return this.confirming; + } + + public void setOperatorEventGateway(OperatorEventGateway operatorEventGateway) { + this.eventGateway = operatorEventGateway; + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void restoreWriteMetadata() throws Exception { + boolean eventSent = false; + for (WriteMetadataEvent event : this.writeMetadataState.get()) { + if (Objects.equals(this.currentInstant, event.getInstantTime())) { + // Reset taskID for event + event.setTaskID(taskID); + // The checkpoint succeed but the meta does not commit, + // re-commit the inflight instant + this.eventGateway.sendEventToCoordinator(event); + LOG.info("Send uncommitted write metadata event to coordinator, task[{}].", taskID); + eventSent = true; + } + } + if (!eventSent) { + sendBootstrapEvent(); + } + } + + private void sendBootstrapEvent() { + int attemptId = getRuntimeContext().getAttemptNumber(); + if (attemptId > 0) { + // either a partial or global failover, reuses the current inflight instant + if (this.currentInstant != null) { + LOG.info("Recover task[{}] for instant [{}] with attemptId [{}]", taskID, this.currentInstant, attemptId); + this.currentInstant = null; + } + return; + } + this.eventGateway.sendEventToCoordinator(WriteMetadataEvent.emptyBootstrap(taskID)); + LOG.info("Send bootstrap write metadata event to coordinator, task[{}].", taskID); + } + + /** + * Reload the write metadata state as the current checkpoint. + */ + private void reloadWriteMetaState() throws Exception { + this.writeMetadataState.clear(); + WriteMetadataEvent event = WriteMetadataEvent.builder() + .taskID(taskID) + .instantTime(currentInstant) + .writeStatus(new ArrayList<>(writeStatuses)) + .bootstrap(true) + .build(); + this.writeMetadataState.add(event); + writeStatuses.clear(); + } + + public void handleOperatorEvent(OperatorEvent event) { + ValidationUtils.checkArgument(event instanceof CommitAckEvent, + "The write function can only handle CommitAckEvent"); + this.confirming = false; + } + + /** + * Returns the last pending instant time. + */ + protected String lastPendingInstant() { + return this.ckpMetadata.lastPendingInstant(); + } + + /** + * Prepares the instant time to write with for next checkpoint. + * + * @param hasData Whether the task has buffering data + * @return The instant time + */ + protected String instantToWrite(boolean hasData) { + String instant = lastPendingInstant(); + // if exactly-once semantics turns on, + // waits for the checkpoint notification until the checkpoint timeout threshold hits. + TimeWait timeWait = TimeWait.builder() + .timeout(config.getLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT)) + .action("instant initialize") + .build(); + while (confirming) { + // wait condition: + // 1. there is no inflight instant + // 2. 
the inflight instant does not change and the checkpoint has buffering data + if (instant == null || invalidInstant(instant, hasData)) { + // sleep for a while + timeWait.waitFor(); + // refresh the inflight instant + instant = lastPendingInstant(); + } else { + // the pending instant changed, that means the last instant was committed + // successfully. + confirming = false; + } + } + return instant; + } + + /** + * Returns whether the pending instant is invalid to write with. + */ + private boolean invalidInstant(String instant, boolean hasData) { + return instant.equals(this.currentInstant) && hasData; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractWriteFunction.java new file mode 100644 index 0000000000000..9e131ff91e1ea --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractWriteFunction.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.common; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.streaming.api.operators.BoundedOneInput; + +/** + * Base class for write function. + * + * @param the input type + */ +public abstract class AbstractWriteFunction extends ProcessFunction implements BoundedOneInput { + /** + * Sets up the event gateway. + */ + public abstract void setOperatorEventGateway(OperatorEventGateway operatorEventGateway); + + /** + * Invoked when bounded source ends up. + */ + public abstract void endInput(); + + /** + * Handles the operator event sent by the coordinator. + * + * @param event The event + */ + public abstract void handleOperatorEvent(OperatorEvent event); +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractWriteOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractWriteOperator.java new file mode 100644 index 0000000000000..e339ccb0b791c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractWriteOperator.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.common; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.OperatorEventHandler; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.ProcessOperator; + +/** + * Base class for write operator. + * + * @param the input type + */ +public abstract class AbstractWriteOperator + extends ProcessOperator + implements OperatorEventHandler, BoundedOneInput { + private final AbstractWriteFunction function; + + public AbstractWriteOperator(AbstractWriteFunction function) { + super(function); + this.function = function; + } + + public void setOperatorEventGateway(OperatorEventGateway operatorEventGateway) { + this.function.setOperatorEventGateway(operatorEventGateway); + } + + @Override + public void endInput() { + this.function.endInput(); + } + + @Override + public void handleOperatorEvent(OperatorEvent evt) { + this.function.handleOperatorEvent(evt); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/WriteOperatorFactory.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/WriteOperatorFactory.java new file mode 100644 index 0000000000000..01a28debc7a38 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/WriteOperatorFactory.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.common; + +import org.apache.hudi.sink.StreamWriteOperator; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEventDispatcher; +import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.SimpleUdfStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; + +/** + * Factory class for {@link StreamWriteOperator}. + */ +public class WriteOperatorFactory + extends SimpleUdfStreamOperatorFactory + implements CoordinatedOperatorFactory, OneInputStreamOperatorFactory { + private static final long serialVersionUID = 1L; + + private final AbstractWriteOperator operator; + private final Configuration conf; + + public WriteOperatorFactory(Configuration conf, AbstractWriteOperator operator) { + super(operator); + this.operator = operator; + this.conf = conf; + } + + public static WriteOperatorFactory instance(Configuration conf, AbstractWriteOperator operator) { + return new WriteOperatorFactory<>(conf, operator); + } + + @Override + @SuppressWarnings("unchecked") + public > T createStreamOperator(StreamOperatorParameters parameters) { + final OperatorID operatorID = parameters.getStreamConfig().getOperatorID(); + final OperatorEventDispatcher eventDispatcher = parameters.getOperatorEventDispatcher(); + + this.operator.setOperatorEventGateway(eventDispatcher.getOperatorEventGateway(operatorID)); + this.operator.setup(parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + this.operator.setProcessingTimeService(this.processingTimeService); + eventDispatcher.registerEventHandler(operatorID, operator); + return (T) operator; + } + + @Override + public OperatorCoordinator.Provider getCoordinatorProvider(String s, OperatorID operatorID) { + return new StreamWriteOperatorCoordinator.Provider(operatorID, this.conf); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactOperator.java new file mode 100644 index 0000000000000..c5a59376b5821 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactOperator.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.CompactionOperation; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.sink.utils.NonThrownExecutor; +import org.apache.hudi.table.HoodieFlinkCopyOnWriteTable; +import org.apache.hudi.table.action.compact.HoodieFlinkMergeOnReadTableCompactor; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.runtime.operators.TableStreamOperator; +import org.apache.flink.table.runtime.util.StreamRecordCollector; +import org.apache.flink.util.Collector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; + +/** + * Operator to execute the actual compaction task assigned by the compaction plan task. + * In order to execute scalable, the input should shuffle by the compact event {@link CompactionPlanEvent}. + */ +public class CompactOperator extends TableStreamOperator + implements OneInputStreamOperator { + private static final Logger LOG = LoggerFactory.getLogger(CompactOperator.class); + + /** + * Config options. + */ + private final Configuration conf; + + /** + * Write Client. + */ + private transient HoodieFlinkWriteClient writeClient; + + /** + * Whether to execute compaction asynchronously. + */ + private final boolean asyncCompaction; + + /** + * Id of current subtask. + */ + private int taskID; + + /** + * Executor service to execute the compaction task. + */ + private transient NonThrownExecutor executor; + + /** + * Output records collector. + */ + private transient StreamRecordCollector collector; + + public CompactOperator(Configuration conf) { + this.conf = conf; + this.asyncCompaction = OptionsResolver.needsAsyncCompaction(conf); + } + + @Override + public void open() throws Exception { + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext()); + if (this.asyncCompaction) { + this.executor = NonThrownExecutor.builder(LOG).build(); + } + this.collector = new StreamRecordCollector<>(output); + } + + @Override + public void processWatermark(Watermark mark) { + // no need to propagate the watermark + } + + @Override + public void processElement(StreamRecord record) throws Exception { + final CompactionPlanEvent event = record.getValue(); + final String instantTime = event.getCompactionInstantTime(); + final CompactionOperation compactionOperation = event.getOperation(); + if (asyncCompaction) { + // executes the compaction task asynchronously to not block the checkpoint barrier propagate. + executor.execute( + () -> doCompaction(instantTime, compactionOperation, collector, reloadWriteConfig()), + (errMsg, t) -> collector.collect(new CompactionCommitEvent(instantTime, compactionOperation.getFileId(), taskID)), + "Execute compaction for instant %s from task %d", instantTime, taskID); + } else { + // executes the compaction task synchronously for batch mode. 
+ LOG.info("Execute compaction for instant {} from task {}", instantTime, taskID); + doCompaction(instantTime, compactionOperation, collector, writeClient.getConfig()); + } + } + + private void doCompaction(String instantTime, + CompactionOperation compactionOperation, + Collector collector, + HoodieWriteConfig writeConfig) throws IOException { + HoodieFlinkMergeOnReadTableCompactor compactor = new HoodieFlinkMergeOnReadTableCompactor<>(); + List writeStatuses = compactor.compact( + new HoodieFlinkCopyOnWriteTable<>( + writeConfig, + writeClient.getEngineContext(), + writeClient.getHoodieTable().getMetaClient()), + writeClient.getHoodieTable().getMetaClient(), + writeClient.getConfig(), + compactionOperation, + instantTime, + writeClient.getHoodieTable().getTaskContextSupplier()); + collector.collect(new CompactionCommitEvent(instantTime, compactionOperation.getFileId(), writeStatuses, taskID)); + } + + private HoodieWriteConfig reloadWriteConfig() throws Exception { + HoodieWriteConfig writeConfig = writeClient.getConfig(); + CompactionUtil.setAvroSchema(writeConfig, writeClient.getHoodieTable().getMetaClient()); + return writeConfig; + } + + @VisibleForTesting + public void setExecutor(NonThrownExecutor executor) { + this.executor = executor; + } + + @Override + public void close() throws Exception { + if (null != this.executor) { + this.executor.close(); + } + if (null != this.writeClient) { + this.writeClient.cleanHandlesGracefully(); + this.writeClient.close(); + this.writeClient = null; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitEvent.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitEvent.java new file mode 100644 index 0000000000000..faad4c2338d1b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitEvent.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.client.WriteStatus; + +import java.io.Serializable; +import java.util.List; + +/** + * Represents a commit event from the compaction task {@link CompactOperator}. + */ +public class CompactionCommitEvent implements Serializable { + private static final long serialVersionUID = 1L; + + /** + * The compaction commit instant time. + */ + private String instant; + + /** + * The file ID. + */ + private String fileId; + + /** + * The write statuses. + */ + private List writeStatuses; + /** + * The compaction task identifier. + */ + private int taskID; + + public CompactionCommitEvent() { + } + + /** + * An event with NULL write statuses that represents a failed compaction. 
+ */ + public CompactionCommitEvent(String instant, String fileId, int taskID) { + this(instant, fileId, null, taskID); + } + + public CompactionCommitEvent(String instant, String fileId, List writeStatuses, int taskID) { + this.instant = instant; + this.fileId = fileId; + this.writeStatuses = writeStatuses; + this.taskID = taskID; + } + + public boolean isFailed() { + return this.writeStatuses == null; + } + + // ------------------------------------------------------------------------- + // Getter/Setter + // ------------------------------------------------------------------------- + + public void setInstant(String instant) { + this.instant = instant; + } + + public void setFileId(String fileId) { + this.fileId = fileId; + } + + public void setWriteStatuses(List writeStatuses) { + this.writeStatuses = writeStatuses; + } + + public void setTaskID(int taskID) { + this.taskID = taskID; + } + + public String getInstant() { + return instant; + } + + public String getFileId() { + return fileId; + } + + public List getWriteStatuses() { + return writeStatuses; + } + + public int getTaskID() { + return taskID; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java new file mode 100644 index 0000000000000..ef182241e4f17 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieListData; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.CleanFunction; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.action.compact.CompactHelpers; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.flink.configuration.Configuration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Function to check and commit the compaction action. + * + *

+ * <p>Each time after receiving a compaction commit event {@link CompactionCommitEvent}, + * it loads and checks the compaction plan {@link HoodieCompactionPlan}; + * if all the compaction operations {@link org.apache.hudi.common.model.CompactionOperation} + * of the plan are finished, it tries to commit the compaction action. + * + *

    It also inherits the {@link CleanFunction} cleaning ability. This is needed because + * the SQL API does not allow multiple sinks in one table sink provider. + */ +public class CompactionCommitSink extends CleanFunction { + private static final Logger LOG = LoggerFactory.getLogger(CompactionCommitSink.class); + + /** + * Config options. + */ + private final Configuration conf; + + /** + * Buffer to collect the event from each compact task {@code CompactFunction}. + * + *

    Stores the mapping of instant_time -> file_id -> event. Use a map to collect the + * events because the rolling back of intermediate compaction tasks generates corrupt + * events. + */ + private transient Map> commitBuffer; + + /** + * Cache to store compaction plan for each instant. + * Stores the mapping of instant_time -> compactionPlan. + */ + private transient Map compactionPlanCache; + + /** + * The hoodie table. + */ + private transient HoodieFlinkTable table; + + public CompactionCommitSink(Configuration conf) { + super(conf); + this.conf = conf; + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + if (writeClient == null) { + this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext()); + } + this.commitBuffer = new HashMap<>(); + this.compactionPlanCache = new HashMap<>(); + this.table = this.writeClient.getHoodieTable(); + } + + @Override + public void invoke(CompactionCommitEvent event, Context context) throws Exception { + final String instant = event.getInstant(); + commitBuffer.computeIfAbsent(instant, k -> new HashMap<>()) + .put(event.getFileId(), event); + commitIfNecessary(instant, commitBuffer.get(instant).values()); + } + + /** + * Condition to commit: the commit buffer has equal size with the compaction plan operations + * and all the compact commit event {@link CompactionCommitEvent} has the same compaction instant time. + * + * @param instant Compaction commit instant time + * @param events Commit events ever received for the instant + */ + private void commitIfNecessary(String instant, Collection events) throws IOException { + HoodieCompactionPlan compactionPlan = compactionPlanCache.computeIfAbsent(instant, k -> { + try { + return CompactionUtils.getCompactionPlan( + this.writeClient.getHoodieTable().getMetaClient(), instant); + } catch (IOException e) { + throw new HoodieException(e); + } + }); + + boolean isReady = compactionPlan.getOperations().size() == events.size(); + if (!isReady) { + return; + } + + if (events.stream().anyMatch(CompactionCommitEvent::isFailed)) { + try { + // handle failure case + CompactionUtil.rollbackCompaction(table, instant); + } finally { + // remove commitBuffer to avoid obsolete metadata commit + reset(instant); + } + return; + } + + try { + doCommit(instant, events); + } catch (Throwable throwable) { + // make it fail-safe + LOG.error("Error while committing compaction instant: " + instant, throwable); + } finally { + // reset the status + reset(instant); + } + } + + @SuppressWarnings("unchecked") + private void doCommit(String instant, Collection events) throws IOException { + List statuses = events.stream() + .map(CompactionCommitEvent::getWriteStatuses) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + + HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata( + table, instant, HoodieListData.eager(statuses), writeClient.getConfig().getSchema()); + + // commit the compaction + this.writeClient.commitCompaction(instant, metadata, Option.empty()); + + // Whether to clean up the old log file when compaction + if (!conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) { + this.writeClient.clean(); + } + } + + private void reset(String instant) { + this.commitBuffer.remove(instant); + this.compactionPlanCache.remove(instant); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanEvent.java 
b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanEvent.java new file mode 100644 index 0000000000000..4fd09d477f0ec --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanEvent.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.common.model.CompactionOperation; + +import java.io.Serializable; + +/** + * Represents a compact command from the compaction plan task {@link CompactionPlanOperator}. + */ +public class CompactionPlanEvent implements Serializable { + private static final long serialVersionUID = 1L; + + private String compactionInstantTime; + + private CompactionOperation operation; + + public CompactionPlanEvent() { + } + + public CompactionPlanEvent(String instantTime, CompactionOperation operation) { + this.compactionInstantTime = instantTime; + this.operation = operation; + } + + public void setCompactionInstantTime(String compactionInstantTime) { + this.compactionInstantTime = compactionInstantTime; + } + + public void setOperation(CompactionOperation operation) { + this.operation = operation; + } + + public String getCompactionInstantTime() { + return compactionInstantTime; + } + + public CompactionOperation getOperation() { + return operation; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java new file mode 100644 index 0000000000000..d7446c9bfab29 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.common.model.CompactionOperation; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkTables; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; + +import java.io.IOException; +import java.util.List; + +import static java.util.stream.Collectors.toList; + +/** + * Operator that generates the compaction plan with pluggable strategies on finished checkpoints. + * + *

    It should be singleton to avoid conflicts. + */ +public class CompactionPlanOperator extends AbstractStreamOperator + implements OneInputStreamOperator, BoundedOneInput { + + /** + * Config options. + */ + private final Configuration conf; + + /** + * Meta Client. + */ + @SuppressWarnings("rawtypes") + private transient HoodieFlinkTable table; + + public CompactionPlanOperator(Configuration conf) { + this.conf = conf; + } + + @Override + public void open() throws Exception { + super.open(); + this.table = FlinkTables.createTable(conf, getRuntimeContext()); + // when starting up, rolls back all the inflight compaction instants if there exists, + // these instants are in priority for scheduling task because the compaction instants are + // scheduled from earliest(FIFO sequence). + CompactionUtil.rollbackCompaction(table); + } + + @Override + public void processElement(StreamRecord streamRecord) { + // no operation + } + + @Override + public void notifyCheckpointComplete(long checkpointId) { + try { + table.getMetaClient().reloadActiveTimeline(); + // There is no good way to infer when the compaction task for an instant crushed + // or is still undergoing. So we use a configured timeout threshold to control the rollback: + // {@code FlinkOptions.COMPACTION_TIMEOUT_SECONDS}, + // when the earliest inflight instant has timed out, assumes it has failed + // already and just rolls it back. + + // comment out: do we really need the timeout rollback ? + // CompactionUtil.rollbackEarliestCompaction(table, conf); + scheduleCompaction(table, checkpointId); + } catch (Throwable throwable) { + // make it fail-safe + LOG.error("Error while scheduling compaction plan for checkpoint: " + checkpointId, throwable); + } + } + + private void scheduleCompaction(HoodieFlinkTable table, long checkpointId) throws IOException { + // the first instant takes the highest priority. + Option firstRequested = table.getActiveTimeline().filterPendingCompactionTimeline() + .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED).firstInstant(); + if (!firstRequested.isPresent()) { + // do nothing. + LOG.info("No compaction plan for checkpoint " + checkpointId); + return; + } + + String compactionInstantTime = firstRequested.get().getTimestamp(); + + // generate compaction plan + // should support configurable commit metadata + HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan( + table.getMetaClient(), compactionInstantTime); + + if (compactionPlan == null || (compactionPlan.getOperations() == null) + || (compactionPlan.getOperations().isEmpty())) { + // do nothing. 
+ LOG.info("Empty compaction plan for instant " + compactionInstantTime); + } else { + HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); + // Mark instant as compaction inflight + table.getActiveTimeline().transitionCompactionRequestedToInflight(instant); + table.getMetaClient().reloadActiveTimeline(); + + List operations = compactionPlan.getOperations().stream() + .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList()); + LOG.info("Execute compaction plan for instant {} as {} file groups", compactionInstantTime, operations.size()); + WriteMarkersFactory + .get(table.getConfig().getMarkersType(), table, compactionInstantTime) + .deleteMarkerDir(table.getContext(), table.getConfig().getMarkersDeleteParallelism()); + for (CompactionOperation operation : operations) { + output.collect(new StreamRecord<>(new CompactionPlanEvent(compactionInstantTime, operation))); + } + } + } + + @VisibleForTesting + public void setOutput(Output> output) { + this.output = output; + } + + @Override + public void endInput() throws Exception { + // Called when the input data ends, only used in batch mode. + notifyCheckpointComplete(-1); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanSourceFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanSourceFunction.java new file mode 100644 index 0000000000000..883ba8bd114cd --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanSourceFunction.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.common.model.CompactionOperation; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.flink.api.common.functions.AbstractRichFunction; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * Flink hudi compaction source function. + * + *

+ * <p>This function reads the compaction plan as {@link CompactionOperation}s and then assigns the compaction task + * event {@link CompactionPlanEvent} to downstream operators. + * + *
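A sketch (not part of the patch) of how this source composes with the CompactOperator and CompactionCommitSink defined above, mirroring the clustering pipeline wired earlier in this patch. Here conf is an already-built Flink Configuration for the target table, loadPendingPlans() is a hypothetical helper that reads pending plans from the timeline, and parallelism, operator names and imports are illustrative or omitted.

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    List<Pair<String, HoodieCompactionPlan>> plans = loadPendingPlans(); // hypothetical helper
    env.addSource(new CompactionPlanSourceFunction(plans))
        .name("compaction_source")
        .uid("uid_compaction_source")
        .rebalance()
        .transform("compact_task",
            TypeInformation.of(CompactionCommitEvent.class),
            new CompactOperator(conf))
        .setParallelism(4) // illustrative parallelism
        .addSink(new CompactionCommitSink(conf))
        .name("compaction_commit")
        .uid("uid_compaction_commit")
        .setParallelism(1); // single sink instance so it sees every event for an instant
    env.execute("flink_hudi_compaction");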

+ * <p>The compaction instant time is specified explicitly with strategies:
+ *
+ * <ul>
+ *   <li>If the timeline has no inflight instants,
+ *   use {@link org.apache.hudi.common.table.timeline.HoodieActiveTimeline#createNewInstantTime()}
+ *   as the instant time;</li>
+ *   <li>If the timeline has inflight instants,
+ *   use the median instant time between [last complete instant time, earliest inflight instant time]
+ *   as the instant time.</li>
+ * </ul>
    + */ +public class CompactionPlanSourceFunction extends AbstractRichFunction implements SourceFunction { + + protected static final Logger LOG = LoggerFactory.getLogger(CompactionPlanSourceFunction.class); + + /** + * compaction plan instant -> compaction plan + */ + private final List> compactionPlans; + + public CompactionPlanSourceFunction(List> compactionPlans) { + this.compactionPlans = compactionPlans; + } + + @Override + public void open(Configuration parameters) throws Exception { + // no operation + } + + @Override + public void run(SourceContext sourceContext) throws Exception { + for (Pair pair : compactionPlans) { + HoodieCompactionPlan compactionPlan = pair.getRight(); + List operations = compactionPlan.getOperations().stream() + .map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList()); + LOG.info("CompactionPlanFunction compacting " + operations + " files"); + for (CompactionOperation operation : operations) { + sourceContext.collect(new CompactionPlanEvent(pair.getLeft(), operation)); + } + } + } + + @Override + public void close() throws Exception { + // no operation + } + + @Override + public void cancel() { + // no operation + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/FlinkCompactionConfig.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/FlinkCompactionConfig.java new file mode 100644 index 0000000000000..449b06846156c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/FlinkCompactionConfig.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.config.HoodieMemoryConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.sink.compact.strategy.CompactionPlanStrategy; + +import com.beust.jcommander.Parameter; +import org.apache.flink.configuration.Configuration; + +/** + * Configurations for Hoodie Flink compaction. 
+ */ +public class FlinkCompactionConfig extends Configuration { + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + // ------------------------------------------------------------------------ + // Hudi Write Options + // ------------------------------------------------------------------------ + + @Parameter(names = {"--path"}, description = "Base path for the target hoodie table.", required = true) + public String path; + + // ------------------------------------------------------------------------ + // Compaction Options + // ------------------------------------------------------------------------ + + public static final String NUM_COMMITS = "num_commits"; + public static final String TIME_ELAPSED = "time_elapsed"; + public static final String NUM_AND_TIME = "num_and_time"; + public static final String NUM_OR_TIME = "num_or_time"; + @Parameter(names = {"--compaction-trigger-strategy"}, + description = "Strategy to trigger compaction, options are 'num_commits': trigger compaction when reach N delta commits;\n" + + "'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction;\n" + + "'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied;\n" + + "'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied.\n" + + "Default is 'num_commits'", + required = false) + public String compactionTriggerStrategy = NUM_COMMITS; + + @Parameter(names = {"--compaction-delta-commits"}, description = "Max delta commits needed to trigger compaction, default 5 commits", required = false) + public Integer compactionDeltaCommits = 1; + + @Parameter(names = {"--compaction-delta-seconds"}, description = "Max delta seconds time needed to trigger compaction, default 1 hour", required = false) + public Integer compactionDeltaSeconds = 3600; + + @Parameter(names = {"--clean-async-enabled"}, description = "Whether to cleanup the old commits immediately on new commits, enabled by default", required = false) + public Boolean cleanAsyncEnable = false; + + @Parameter(names = {"--clean-retain-commits"}, + description = "Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled).\n" + + "This also directly translates into how much you can incrementally pull on this table, default 10", + required = false) + public Integer cleanRetainCommits = 10; + + @Parameter(names = {"--archive-min-commits"}, + description = "Min number of commits to keep before archiving older commits into a sequential log, default 20.", + required = false) + public Integer archiveMinCommits = 20; + + @Parameter(names = {"--archive-max-commits"}, + description = "Max number of commits to keep before archiving older commits into a sequential log, default 30.", + required = false) + public Integer archiveMaxCommits = 30; + + @Parameter(names = {"--compaction-max-memory"}, description = "Max memory in MB for compaction spillable map, default 100MB.", required = false) + public Integer compactionMaxMemory = 100; + + @Parameter(names = {"--compaction-target-io"}, description = "Target IO per compaction (both read and write) for batching compaction, default 512000M.", required = false) + public Long compactionTargetIo = 512000L; + + @Parameter(names = {"--compaction-tasks"}, description = "Parallelism of tasks that do actual compaction, default is -1", required = false) + public Integer compactionTasks = -1; + + @Parameter(names = {"--schedule", "-sc"}, description = "Not recommended. 
Schedule the compaction plan in this job.\n" + + "There is a risk of losing data when scheduling compaction outside the writer job.\n" + + "Scheduling compaction in the writer job and only let this job do the compaction execution is recommended.\n" + + "Default is false", required = false) + public Boolean schedule = false; + + public static final String SEQ_FIFO = "FIFO"; + public static final String SEQ_LIFO = "LIFO"; + @Parameter(names = {"--seq"}, description = "Compaction plan execution sequence, two options are supported:\n" + + "1). FIFO: execute the oldest plan first;\n" + + "2). LIFO: execute the latest plan first, by default LIFO", required = false) + public String compactionSeq = SEQ_FIFO; + + @Parameter(names = {"--service"}, description = "Flink Compaction runs in service mode, disable by default") + public Boolean serviceMode = false; + + @Parameter(names = {"--min-compaction-interval-seconds"}, + description = "Min compaction interval of async compaction service, default 10 minutes") + public Integer minCompactionIntervalSeconds = 600; + + @Parameter(names = {"--plan-select-strategy"}, description = "The strategy define how to select compaction plan to compact.\n" + + "1). num_instants: select plans by specific number of instants, it's the default strategy with 1 instant at a time;\n" + + "3). all: Select all pending compaction plan;\n" + + "4). instants: Select the compaction plan by specific instants") + public String compactionPlanSelectStrategy = CompactionPlanStrategy.NUM_INSTANTS; + + @Parameter(names = {"--max-num-plans"}, description = "Max number of compaction plan would be selected in compaction." + + "It's only effective for MultiCompactionPlanSelectStrategy.") + public Integer maxNumCompactionPlans = 1; + + @Parameter(names = {"--target-instants"}, description = "Specify the compaction plan instants to compact,\n" + + "Multiple instants are supported by comma separated instant time.\n" + + "It's only effective for 'instants' plan selection strategy.") + public String compactionPlanInstant; + + @Parameter(names = {"--spillable_map_path"}, description = "Default file path prefix for spillable map.", required = false) + public String spillableMapPath = HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue(); + + /** + * Transforms a {@code HoodieFlinkCompaction.config} into {@code Configuration}. + * The latter is more suitable for the table APIs. It reads all the properties + * in the properties file (set by `--props` option) and cmd line options + * (set by `--hoodie-conf` option). 
+ */ + public static org.apache.flink.configuration.Configuration toFlinkConfig(FlinkCompactionConfig config) { + org.apache.flink.configuration.Configuration conf = new Configuration(); + + conf.setString(FlinkOptions.PATH, config.path); + conf.setString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY, config.compactionTriggerStrategy); + conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, config.archiveMaxCommits); + conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, config.archiveMinCommits); + conf.setInteger(FlinkOptions.CLEAN_RETAIN_COMMITS, config.cleanRetainCommits); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, config.compactionDeltaCommits); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_SECONDS, config.compactionDeltaSeconds); + conf.setInteger(FlinkOptions.COMPACTION_MAX_MEMORY, config.compactionMaxMemory); + conf.setLong(FlinkOptions.COMPACTION_TARGET_IO, config.compactionTargetIo); + conf.setInteger(FlinkOptions.COMPACTION_TASKS, config.compactionTasks); + conf.setBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED, config.cleanAsyncEnable); + // use synchronous compaction always + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); + conf.setBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED, config.schedule); + // Map memory + conf.setString(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.key(), config.spillableMapPath); + + return conf; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java new file mode 100644 index 0000000000000..b97306ff29874 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java @@ -0,0 +1,342 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
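As a rough usage sketch of the config class above, the CLI arguments are parsed with JCommander and then converted into the Flink Configuration consumed by the write pipeline; the argument values below are made up for illustration:

```java
import com.beust.jcommander.JCommander;

import org.apache.flink.configuration.Configuration;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.sink.compact.FlinkCompactionConfig;

public class CompactionConfigExample {
  public static void main(String[] args) {
    // Hypothetical CLI arguments; --path is the only required option.
    String[] cliArgs = {"--path", "hdfs:///tmp/hoodie_table", "--compaction-tasks", "4", "--seq", "LIFO"};

    FlinkCompactionConfig cfg = new FlinkCompactionConfig();
    new JCommander(cfg, null, cliArgs); // same parsing call HoodieFlinkCompactor#getFlinkCompactionConfig uses

    // Convert the CLI view into the Flink Configuration understood by the table/write pipeline.
    Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg);
    System.out.println(conf.getInteger(FlinkOptions.COMPACTION_TASKS)); // 4
  }
}
```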
+ */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.async.HoodieAsyncTableService; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.compact.strategy.CompactionPlanStrategies; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; + +import com.beust.jcommander.JCommander; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.client.deployment.application.ApplicationExecutionException; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.stream.Collectors; + +/** + * Flink hudi compaction program that can be executed manually. + */ +public class HoodieFlinkCompactor { + + protected static final Logger LOG = LoggerFactory.getLogger(HoodieFlinkCompactor.class); + + private static final String NO_EXECUTE_KEYWORD = "no execute"; + + /** + * Flink Execution Environment. + */ + private final AsyncCompactionService compactionScheduleService; + + public HoodieFlinkCompactor(AsyncCompactionService service) { + this.compactionScheduleService = service; + } + + public static void main(String[] args) throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + FlinkCompactionConfig cfg = getFlinkCompactionConfig(args); + Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg); + + AsyncCompactionService service = new AsyncCompactionService(cfg, conf, env); + + new HoodieFlinkCompactor(service).start(cfg.serviceMode); + } + + /** + * Main method to start compaction service. + */ + public void start(boolean serviceMode) throws Exception { + if (serviceMode) { + compactionScheduleService.start(null); + try { + compactionScheduleService.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } finally { + LOG.info("Shut down hoodie flink compactor"); + } + } else { + LOG.info("Hoodie Flink Compactor running only single round"); + try { + compactionScheduleService.compact(); + } catch (ApplicationExecutionException aee) { + if (aee.getMessage().contains(NO_EXECUTE_KEYWORD)) { + LOG.info("Compaction is not performed"); + } else { + throw aee; + } + } catch (Exception e) { + LOG.error("Got error running delta sync once. 
Shutting down", e); + throw e; + } finally { + LOG.info("Shut down hoodie flink compactor"); + } + } + } + + public static FlinkCompactionConfig getFlinkCompactionConfig(String[] args) { + FlinkCompactionConfig cfg = new FlinkCompactionConfig(); + JCommander cmd = new JCommander(cfg, null, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + return cfg; + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Schedules compaction in service. + */ + public static class AsyncCompactionService extends HoodieAsyncTableService { + + private static final long serialVersionUID = 1L; + + /** + * Flink Compaction Config. + */ + private final FlinkCompactionConfig cfg; + + /** + * Flink Config. + */ + private final Configuration conf; + + /** + * Meta Client. + */ + private final HoodieTableMetaClient metaClient; + + /** + * Write Client. + */ + private final HoodieFlinkWriteClient writeClient; + + /** + * The hoodie table. + */ + private final HoodieFlinkTable table; + + /** + * Flink Execution Environment. + */ + private final StreamExecutionEnvironment env; + + /** + * Executor Service. + */ + private final ExecutorService executor; + + public AsyncCompactionService(FlinkCompactionConfig cfg, Configuration conf, StreamExecutionEnvironment env) throws Exception { + this.cfg = cfg; + this.conf = conf; + this.env = env; + this.executor = Executors.newFixedThreadPool(1); + + // create metaClient + this.metaClient = StreamerUtil.createMetaClient(conf); + + // get the table name + conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); + + // set table schema + CompactionUtil.setAvroSchema(conf, metaClient); + + CompactionUtil.setPreCombineField(conf, metaClient); + + // infer changelog mode + CompactionUtil.inferChangelogMode(conf, metaClient); + + this.writeClient = FlinkWriteClients.createWriteClientV2(conf); + this.writeConfig = writeClient.getConfig(); + this.table = writeClient.getHoodieTable(); + } + + @Override + protected Pair startService() { + return Pair.of(CompletableFuture.supplyAsync(() -> { + boolean error = false; + + try { + while (!isShutdownRequested()) { + try { + compact(); + Thread.sleep(cfg.minCompactionIntervalSeconds * 1000); + } catch (ApplicationExecutionException aee) { + if (aee.getMessage().contains(NO_EXECUTE_KEYWORD)) { + LOG.info("Compaction is not performed."); + } else { + throw new HoodieException(aee.getMessage(), aee); + } + } catch (Exception e) { + LOG.error("Shutting down compaction service due to exception", e); + error = true; + throw new HoodieException(e.getMessage(), e); + } + } + } finally { + shutdownAsyncService(error); + } + return true; + }, executor), executor); + } + + private void compact() throws Exception { + table.getMetaClient().reloadActiveTimeline(); + + // checks the compaction plan and do compaction. + if (cfg.schedule) { + Option compactionInstantTimeOption = CompactionUtil.getCompactionInstantTime(metaClient); + if (compactionInstantTimeOption.isPresent()) { + boolean scheduled = writeClient.scheduleCompactionAtInstant(compactionInstantTimeOption.get(), Option.empty()); + if (!scheduled) { + // do nothing. 
+ LOG.info("No compaction plan for this job "); + return; + } + table.getMetaClient().reloadActiveTimeline(); + } + } + + // fetch the instant based on the configured execution sequence + HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); + List requested = CompactionPlanStrategies.getStrategy(cfg).select(pendingCompactionTimeline); + if (requested.isEmpty()) { + // do nothing. + LOG.info("No compaction plan scheduled, turns on the compaction plan schedule with --schedule option"); + return; + } + + List compactionInstantTimes = requested.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + compactionInstantTimes.forEach(timestamp -> { + HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(timestamp); + if (pendingCompactionTimeline.containsInstant(inflightInstant)) { + LOG.info("Rollback inflight compaction instant: [" + timestamp + "]"); + table.rollbackInflightCompaction(inflightInstant); + table.getMetaClient().reloadActiveTimeline(); + } + }); + + // generate timestamp and compaction plan pair + // should support configurable commit metadata + List> compactionPlans = compactionInstantTimes.stream() + .map(timestamp -> { + try { + return Pair.of(timestamp, CompactionUtils.getCompactionPlan(table.getMetaClient(), timestamp)); + } catch (IOException e) { + throw new HoodieException("Get compaction plan at instant " + timestamp + " error", e); + } + }) + // reject empty compaction plan + .filter(pair -> validCompactionPlan(pair.getRight())) + .collect(Collectors.toList()); + + if (compactionPlans.isEmpty()) { + // No compaction plan, do nothing and return. + LOG.info("No compaction plan for instant " + String.join(",", compactionInstantTimes)); + return; + } + + List instants = compactionInstantTimes.stream().map(HoodieTimeline::getCompactionRequestedInstant).collect(Collectors.toList()); + for (HoodieInstant instant : instants) { + if (!pendingCompactionTimeline.containsInstant(instant)) { + // this means that the compaction plan was written to auxiliary path(.tmp) + // but not the meta path(.hoodie), this usually happens when the job crush + // exceptionally. + // clean the compaction plan in auxiliary path and cancels the compaction. + LOG.warn("The compaction plan was fetched through the auxiliary path(.tmp) but not the meta path(.hoodie).\n" + + "Clean the compaction plan in auxiliary path and cancels the compaction"); + CompactionUtil.cleanInstant(table.getMetaClient(), instant); + return; + } + } + + // get compactionParallelism. + int compactionParallelism = conf.getInteger(FlinkOptions.COMPACTION_TASKS) == -1 + ? 
Math.toIntExact(compactionPlans.stream().mapToLong(pair -> pair.getRight().getOperations().size()).sum()) + : conf.getInteger(FlinkOptions.COMPACTION_TASKS); + + LOG.info("Start to compaction for instant " + compactionInstantTimes); + + // Mark instant as compaction inflight + for (HoodieInstant instant : instants) { + table.getActiveTimeline().transitionCompactionRequestedToInflight(instant); + } + table.getMetaClient().reloadActiveTimeline(); + + env.addSource(new CompactionPlanSourceFunction(compactionPlans)) + .name("compaction_source") + .uid("uid_compaction_source") + .rebalance() + .transform("compact_task", + TypeInformation.of(CompactionCommitEvent.class), + new CompactOperator(conf)) + .setParallelism(compactionParallelism) + .addSink(new CompactionCommitSink(conf)) + .name("compaction_commit") + .uid("uid_compaction_commit") + .setParallelism(1); + + env.execute("flink_hudi_compaction_" + String.join(",", compactionInstantTimes)); + } + + /** + * Shutdown async services like compaction/clustering as DeltaSync is shutdown. + */ + public void shutdownAsyncService(boolean error) { + LOG.info("Gracefully shutting down compactor. Error ?" + error); + executor.shutdown(); + writeClient.close(); + } + + @VisibleForTesting + public void shutDown() { + shutdownAsyncService(false); + } + } + + private static boolean validCompactionPlan(HoodieCompactionPlan plan) { + return plan != null && plan.getOperations() != null && plan.getOperations().size() > 0; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/strategy/CompactionPlanStrategies.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/strategy/CompactionPlanStrategies.java new file mode 100644 index 0000000000000..662dcabda3220 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/strategy/CompactionPlanStrategies.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact.strategy; + +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.sink.compact.FlinkCompactionConfig; +import org.apache.hudi.util.CompactionUtil; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.stream.Collectors; + +/** + * Factory clazz for CompactionPlanStrategy. 
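Before moving on to the plan-selection strategies, a hedged sketch of launching the compactor above programmatically, mirroring its main method; the base path is hypothetical and an existing Hudi table is assumed at that location:

```java
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.hudi.sink.compact.FlinkCompactionConfig;
import org.apache.hudi.sink.compact.HoodieFlinkCompactor;
import org.apache.hudi.sink.compact.HoodieFlinkCompactor.AsyncCompactionService;

public class RunCompactorOnce {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    FlinkCompactionConfig cfg = new FlinkCompactionConfig();
    cfg.path = "hdfs:///tmp/hoodie_table"; // hypothetical base path of an existing MOR table
    cfg.schedule = false;                  // assume the writer job schedules the plans (recommended)
    cfg.serviceMode = false;               // run a single compaction round instead of a long-running service

    Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg);
    AsyncCompactionService service = new AsyncCompactionService(cfg, conf, env);
    new HoodieFlinkCompactor(service).start(cfg.serviceMode);
  }
}
```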
+ */ +public class CompactionPlanStrategies { + private static final Logger LOG = LoggerFactory.getLogger(CompactionPlanStrategies.class); + + private CompactionPlanStrategies() { + } + + public static CompactionPlanStrategy getStrategy(FlinkCompactionConfig config) { + switch (config.compactionPlanSelectStrategy.toLowerCase(Locale.ROOT)) { + case CompactionPlanStrategy.ALL: + return pendingCompactionTimeline -> pendingCompactionTimeline.getInstants().collect(Collectors.toList()); + case CompactionPlanStrategy.INSTANTS: + return pendingCompactionTimeline -> { + if (StringUtils.isNullOrEmpty(config.compactionPlanInstant)) { + LOG.warn("None instant is selected"); + return Collections.emptyList(); + } + List instants = Arrays.asList(config.compactionPlanInstant.split(",")); + return pendingCompactionTimeline.getInstants() + .filter(instant -> instants.contains(instant.getTimestamp())) + .collect(Collectors.toList()); + }; + case CompactionPlanStrategy.NUM_INSTANTS: + return pendingCompactionTimeline -> { + List pendingCompactionPlanInstants = pendingCompactionTimeline.getInstants().collect(Collectors.toList()); + if (CompactionUtil.isLIFO(config.compactionSeq)) { + Collections.reverse(pendingCompactionPlanInstants); + } + int range = Math.min(config.maxNumCompactionPlans, pendingCompactionPlanInstants.size()); + return pendingCompactionPlanInstants.subList(0, range); + }; + default: + throw new UnsupportedOperationException("Unknown compaction plan strategy: " + + config.compactionPlanSelectStrategy + + ", supported strategies:[num_instants,instants,all]"); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/strategy/CompactionPlanStrategy.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/strategy/CompactionPlanStrategy.java new file mode 100644 index 0000000000000..e209ff53391fc --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/strategy/CompactionPlanStrategy.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact.strategy; + +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; + +import java.util.List; + +/** + * Compaction plan selection strategy. + */ +public interface CompactionPlanStrategy { + String ALL = "all"; + String INSTANTS = "instants"; + String NUM_INSTANTS = "num_instants"; + + /** + * Define how to select compaction plan to compact. 
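The factory above maps the --plan-select-strategy value onto a selection function over the pending compaction timeline. A small sketch of asking it for two specific instants; the instant times are made up:

```java
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.sink.compact.FlinkCompactionConfig;
import org.apache.hudi.sink.compact.strategy.CompactionPlanStrategies;
import org.apache.hudi.sink.compact.strategy.CompactionPlanStrategy;

import java.util.List;

public class StrategySelectionExample {
  // Picks only the two named pending compaction instants from the timeline.
  public static List<HoodieInstant> selectNamedInstants(HoodieTimeline pendingCompactionTimeline) {
    FlinkCompactionConfig cfg = new FlinkCompactionConfig();
    cfg.compactionPlanSelectStrategy = CompactionPlanStrategy.INSTANTS;
    cfg.compactionPlanInstant = "20230101000000,20230102000000"; // hypothetical instant times
    return CompactionPlanStrategies.getStrategy(cfg).select(pendingCompactionTimeline);
  }
}
```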
+ */ + List select(HoodieTimeline pendingCompactionTimeline); +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/event/CommitAckEvent.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/event/CommitAckEvent.java new file mode 100644 index 0000000000000..84274f0e2eb42 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/event/CommitAckEvent.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.event; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +/** + * An operator event to mark successful instant commit. + */ +public class CommitAckEvent implements OperatorEvent { + private static final long serialVersionUID = 1L; + + private long checkpointId; + + public CommitAckEvent(long checkpointId) { + this.checkpointId = checkpointId; + } + + // default constructor for efficient serialization + public CommitAckEvent() { + } + + public long getCheckpointId() { + return checkpointId; + } + + public void setCheckpointId(long checkpointId) { + this.checkpointId = checkpointId; + } + + public static CommitAckEvent getInstance(long checkpointId) { + return new CommitAckEvent(checkpointId); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/event/WriteMetadataEvent.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/event/WriteMetadataEvent.java new file mode 100644 index 0000000000000..0eb06bdd822f7 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/event/WriteMetadataEvent.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
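Because CompactionPlanStrategy above is a single-method interface, a custom selection rule can also be expressed as a lambda, for example in tests. Note that the CLI only recognizes the num_instants, instants and all strategies, so this is purely an illustrative sketch with a made-up cut-off rule:

```java
import org.apache.hudi.sink.compact.strategy.CompactionPlanStrategy;

import java.util.stream.Collectors;

public class CutoffStrategyExample {
  // Selects every pending compaction instant strictly before a cut-off timestamp (hypothetical rule).
  public static CompactionPlanStrategy before(String cutoffInstant) {
    return pendingCompactionTimeline -> pendingCompactionTimeline.getInstants()
        .filter(instant -> instant.getTimestamp().compareTo(cutoffInstant) < 0)
        .collect(Collectors.toList());
  }
}
```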
+ */ + +package org.apache.hudi.sink.event; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.ValidationUtils; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +/** + * An operator event to mark successful checkpoint batch write. + */ +public class WriteMetadataEvent implements OperatorEvent { + private static final long serialVersionUID = 1L; + + public static final String BOOTSTRAP_INSTANT = ""; + + private List writeStatuses; + private int taskID; + private String instantTime; + private boolean lastBatch; + + /** + * Flag saying whether the event comes from the end of input, e.g. the source + * is bounded, there are two cases in which this flag should be set to true: + * 1. batch execution mode + * 2. bounded stream source such as VALUES + */ + private boolean endInput; + + /** + * Flag saying whether the event comes from bootstrap of a write function. + */ + private boolean bootstrap; + + /** + * Creates an event. + * + * @param taskID The task ID + * @param instantTime The instant time under which to write the data + * @param writeStatuses The write statues list + * @param lastBatch Whether the event reports the last batch + * within an checkpoint interval, + * if true, the whole data set of the checkpoint + * has been flushed successfully + * @param bootstrap Whether the event comes from the bootstrap + */ + private WriteMetadataEvent( + int taskID, + String instantTime, + List writeStatuses, + boolean lastBatch, + boolean endInput, + boolean bootstrap) { + this.taskID = taskID; + this.instantTime = instantTime; + this.writeStatuses = new ArrayList<>(writeStatuses); + this.lastBatch = lastBatch; + this.endInput = endInput; + this.bootstrap = bootstrap; + } + + // default constructor for efficient serialization + public WriteMetadataEvent() { + } + + /** + * Returns the builder for {@link WriteMetadataEvent}. + */ + public static Builder builder() { + return new Builder(); + } + + public List getWriteStatuses() { + return writeStatuses; + } + + public void setWriteStatuses(List writeStatuses) { + this.writeStatuses = writeStatuses; + } + + public int getTaskID() { + return taskID; + } + + public void setTaskID(int taskID) { + this.taskID = taskID; + } + + public String getInstantTime() { + return instantTime; + } + + public void setInstantTime(String instantTime) { + this.instantTime = instantTime; + } + + public boolean isEndInput() { + return endInput; + } + + public void setEndInput(boolean endInput) { + this.endInput = endInput; + } + + public boolean isBootstrap() { + return bootstrap; + } + + public void setBootstrap(boolean bootstrap) { + this.bootstrap = bootstrap; + } + + public boolean isLastBatch() { + return lastBatch; + } + + public void setLastBatch(boolean lastBatch) { + this.lastBatch = lastBatch; + } + + /** + * Merges this event with given {@link WriteMetadataEvent} {@code other}. 
+ * + * @param other The event to be merged + */ + public void mergeWith(WriteMetadataEvent other) { + ValidationUtils.checkArgument(this.taskID == other.taskID); + // the instant time could be monotonically increasing + this.instantTime = other.instantTime; + this.lastBatch |= other.lastBatch; // true if one of the event lastBatch is true + List statusList = new ArrayList<>(); + statusList.addAll(this.writeStatuses); + statusList.addAll(other.writeStatuses); + this.writeStatuses = statusList; + } + + /** + * Returns whether the event is ready to commit. + */ + public boolean isReady(String currentInstant) { + return lastBatch && this.instantTime.equals(currentInstant); + } + + @Override + public String toString() { + return "WriteMetadataEvent{" + + "writeStatusesSize=" + writeStatuses.size() + + ", taskID=" + taskID + + ", instantTime='" + instantTime + '\'' + + ", lastBatch=" + lastBatch + + ", endInput=" + endInput + + ", bootstrap=" + bootstrap + + '}'; + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + /** + * Creates empty bootstrap event for task {@code taskId}. + * + *
    The event indicates that the new instant can start directly, + * there is no old instant write statuses to recover. + */ + public static WriteMetadataEvent emptyBootstrap(int taskId) { + return WriteMetadataEvent.builder() + .taskID(taskId) + .instantTime(BOOTSTRAP_INSTANT) + .writeStatus(Collections.emptyList()) + .bootstrap(true) + .build(); + } + + // ------------------------------------------------------------------------- + // Builder + // ------------------------------------------------------------------------- + + /** + * Builder for {@link WriteMetadataEvent}. + */ + public static class Builder { + private List writeStatus; + private Integer taskID; + private String instantTime; + private boolean lastBatch = false; + private boolean endInput = false; + private boolean bootstrap = false; + + public WriteMetadataEvent build() { + Objects.requireNonNull(taskID); + Objects.requireNonNull(instantTime); + Objects.requireNonNull(writeStatus); + return new WriteMetadataEvent(taskID, instantTime, writeStatus, lastBatch, endInput, bootstrap); + } + + public Builder taskID(int taskID) { + this.taskID = taskID; + return this; + } + + public Builder instantTime(String instantTime) { + this.instantTime = instantTime; + return this; + } + + public Builder writeStatus(List writeStatus) { + this.writeStatus = writeStatus; + return this; + } + + public Builder lastBatch(boolean lastBatch) { + this.lastBatch = lastBatch; + return this; + } + + public Builder endInput(boolean endInput) { + this.endInput = endInput; + return this; + } + + public Builder bootstrap(boolean bootstrap) { + this.bootstrap = bootstrap; + return this; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/meta/CkpMessage.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/meta/CkpMessage.java new file mode 100644 index 0000000000000..1a27ae05c5190 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/meta/CkpMessage.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.meta; + +import org.apache.hudi.common.util.ValidationUtils; + +import org.apache.hadoop.fs.FileStatus; +import org.jetbrains.annotations.NotNull; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +/** + * A checkpoint message. 
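On the coordinator side, per-task events for the same instant are accumulated by merging; a short sketch of the semantics implemented by mergeWith and isReady in WriteMetadataEvent above (write statuses are concatenated, lastBatch is OR-ed, the newer instant time wins). The instant time is a placeholder:

```java
import org.apache.hudi.sink.event.WriteMetadataEvent;

import java.util.Collections;

public class MergeEventsExample {
  public static void main(String[] args) {
    WriteMetadataEvent first = WriteMetadataEvent.builder()
        .taskID(0).instantTime("20230101000000") // hypothetical instant time
        .writeStatus(Collections.emptyList()).lastBatch(false).build();

    WriteMetadataEvent second = WriteMetadataEvent.builder()
        .taskID(0).instantTime("20230101000000")
        .writeStatus(Collections.emptyList()).lastBatch(true).build();

    first.mergeWith(second);
    // true: the merged event now reports the last batch for the current instant
    System.out.println(first.isReady("20230101000000"));
  }
}
```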
+ */ +public class CkpMessage implements Serializable, Comparable { + private static final long serialVersionUID = 1L; + + public static final Comparator COMPARATOR = Comparator.comparing(CkpMessage::getInstant) + .thenComparing(CkpMessage::getState); + + private final String instant; // the instant time + private final State state; // the checkpoint state + + public CkpMessage(String instant, String state) { + this.instant = instant; + this.state = State.valueOf(state); + } + + public CkpMessage(FileStatus fileStatus) { + String fileName = fileStatus.getPath().getName(); + String[] nameAndExt = fileName.split("\\."); + ValidationUtils.checkState(nameAndExt.length == 2); + String name = nameAndExt[0]; + String ext = nameAndExt[1]; + + this.instant = name; + this.state = State.valueOf(ext); + } + + public String getInstant() { + return instant; + } + + public State getState() { + return state; + } + + public boolean isAborted() { + return State.ABORTED == this.state; + } + + public boolean isComplete() { + return State.COMPLETED == this.state; + } + + public boolean isInflight() { + return State.INFLIGHT == this.state; + } + + public static String getFileName(String instant, State state) { + return instant + "." + state.name(); + } + + public static List getAllFileNames(String instant) { + return Arrays.stream(State.values()) + .map(state -> getFileName(instant, state)) + .collect(Collectors.toList()); + } + + @Override + public int compareTo(@NotNull CkpMessage o) { + return COMPARATOR.compare(this, o); + } + + /** + * Instant State. + */ + public enum State { + // Inflight instant + INFLIGHT, + // Aborted instant + // An instant can be aborted then be reused again, so it has lower priority + // than COMPLETED + ABORTED, + // Committed instant + COMPLETED + } + + @Override + public String toString() { + return "Ckp{" + "instant='" + instant + '\'' + ", state='" + state + '\'' + '}'; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/meta/CkpMetadata.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/meta/CkpMetadata.java new file mode 100644 index 0000000000000..c20b263fa36da --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/meta/CkpMetadata.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
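A short sketch of the file-name convention and state ordering that the CkpMessage class above relies on; the instant time is hypothetical:

```java
import org.apache.hudi.sink.meta.CkpMessage;

public class CkpMessageExample {
  public static void main(String[] args) {
    // File names encode "<instant>.<STATE>", e.g. 20230101000000.INFLIGHT
    System.out.println(CkpMessage.getFileName("20230101000000", CkpMessage.State.INFLIGHT));

    // COMPLETED sorts after INFLIGHT and ABORTED, so when several messages exist for the
    // same instant, the reduce in CkpMetadata (below) keeps the completed one.
    CkpMessage inflight = new CkpMessage("20230101000000", "INFLIGHT");
    CkpMessage completed = new CkpMessage("20230101000000", "COMPLETED");
    System.out.println(CkpMessage.COMPARATOR.compare(completed, inflight) > 0); // true
  }
}
```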
+ */ + +package org.apache.hudi.sink.meta; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.exception.HoodieException; + +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +/** + * The checkpoint metadata for bookkeeping the checkpoint messages. + * + *
    Each time the driver starts a new instant, it writes a commit message into the metadata; the write tasks + * then consume the message and unblock the data flushing. + * + *
    Why use a DFS based message queue instead of sending + * the {@link org.apache.flink.runtime.operators.coordination.OperatorEvent}? + * The write task handles the operator event using the main mailbox executor, which has the lowest priority for mails + * and is also used to process the inputs. When the write task blocks and waits for the operator event to ack the valid instant to write, + * it actually blocks all the subsequent events in the mailbox; the operator event would never be consumed, which causes a deadlock. + * + *
    The checkpoint metadata is also more lightweight than the active timeline. + * + *
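To make the driver/write-task interplay concrete, a minimal sketch of the write and read sides of this DFS-backed message log; the base path and instant time are hypothetical, and the methods used are the ones defined on the class below:

```java
import org.apache.flink.configuration.Configuration;

import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.sink.meta.CkpMetadata;

public class CkpMetadataExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PATH, "hdfs:///tmp/hoodie_table"); // hypothetical base path

    CkpMetadata ckpMeta = CkpMetadata.getInstance(conf);

    // Driver side: clean the message directory, then announce a new instant.
    ckpMeta.bootstrap();
    ckpMeta.startInstant("20230101000000"); // hypothetical instant time

    // Write-task side: poll for the latest pending instant to write to.
    String pendingInstant = ckpMeta.lastPendingInstant(); // "20230101000000"

    // Driver side again: mark the instant as committed once the checkpoint succeeds.
    ckpMeta.commitInstant(pendingInstant);
  }
}
```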
    NOTE: should be removed in the future if we have good manner to handle the async notifications from driver. + */ +public class CkpMetadata implements Serializable { + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(CkpMetadata.class); + + protected static final int MAX_RETAIN_CKP_NUM = 3; + + // the ckp metadata directory + private static final String CKP_META = "ckp_meta"; + + private final FileSystem fs; + protected final Path path; + + private List messages; + private List instantCache; + + private CkpMetadata(Configuration config) { + this(FSUtils.getFs(config.getString(FlinkOptions.PATH), HadoopConfigurations.getHadoopConf(config)), config.getString(FlinkOptions.PATH)); + } + + private CkpMetadata(FileSystem fs, String basePath) { + this.fs = fs; + this.path = new Path(ckpMetaPath(basePath)); + } + + public void close() { + this.instantCache = null; + } + + // ------------------------------------------------------------------------- + // WRITE METHODS + // ------------------------------------------------------------------------- + + /** + * Initialize the message bus, would clean all the messages + * + *
    This expects to be called by the driver. + */ + public void bootstrap() throws IOException { + fs.delete(path, true); + fs.mkdirs(path); + } + + public void startInstant(String instant) { + Path path = fullPath(CkpMessage.getFileName(instant, CkpMessage.State.INFLIGHT)); + try { + fs.createNewFile(path); + } catch (IOException e) { + throw new HoodieException("Exception while adding checkpoint start metadata for instant: " + instant, e); + } + // cleaning + clean(instant); + } + + private void clean(String newInstant) { + if (this.instantCache == null) { + this.instantCache = new ArrayList<>(); + } + this.instantCache.add(newInstant); + if (instantCache.size() > MAX_RETAIN_CKP_NUM) { + final String instant = instantCache.get(0); + boolean[] error = new boolean[1]; + CkpMessage.getAllFileNames(instant).stream().map(this::fullPath).forEach(path -> { + try { + fs.delete(path, false); + } catch (IOException e) { + error[0] = true; + LOG.warn("Exception while cleaning the checkpoint meta file: " + path); + } + }); + if (!error[0]) { + instantCache.remove(0); + } + } + } + + /** + * Add a checkpoint commit message. + * + * @param instant The committed instant + */ + public void commitInstant(String instant) { + Path path = fullPath(CkpMessage.getFileName(instant, CkpMessage.State.COMPLETED)); + try { + fs.createNewFile(path); + } catch (IOException e) { + throw new HoodieException("Exception while adding checkpoint commit metadata for instant: " + instant, e); + } + } + + /** + * Add an aborted checkpoint message. + */ + public void abortInstant(String instant) { + Path path = fullPath(CkpMessage.getFileName(instant, CkpMessage.State.ABORTED)); + try { + fs.createNewFile(path); + } catch (IOException e) { + throw new HoodieException("Exception while adding checkpoint abort metadata for instant: " + instant); + } + } + + // ------------------------------------------------------------------------- + // READ METHODS + // ------------------------------------------------------------------------- + + private void load() { + try { + this.messages = scanCkpMetadata(this.path); + } catch (IOException e) { + throw new HoodieException("Exception while scanning the checkpoint meta files under path: " + this.path, e); + } + } + + @Nullable + public String lastPendingInstant() { + load(); + if (this.messages.size() > 0) { + CkpMessage ckpMsg = this.messages.get(this.messages.size() - 1); + // consider 'aborted' as pending too to reuse the instant + if (!ckpMsg.isComplete()) { + return ckpMsg.getInstant(); + } + } + return null; + } + + public List getMessages() { + load(); + return messages; + } + + public boolean isAborted(String instant) { + ValidationUtils.checkState(this.messages != null, "The checkpoint metadata should #load first"); + return this.messages.stream().anyMatch(ckpMsg -> instant.equals(ckpMsg.getInstant()) && ckpMsg.isAborted()); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + public static CkpMetadata getInstance(Configuration config) { + return new CkpMetadata(config); + } + + public static CkpMetadata getInstance(FileSystem fs, String basePath) { + return new CkpMetadata(fs, basePath); + } + + protected static String ckpMetaPath(String basePath) { + return basePath + Path.SEPARATOR + HoodieTableMetaClient.AUXILIARYFOLDER_NAME + Path.SEPARATOR + CKP_META; + } + + private Path fullPath(String fileName) { + return new Path(path, fileName); + } + + private 
List scanCkpMetadata(Path ckpMetaPath) throws IOException { + return Arrays.stream(this.fs.listStatus(ckpMetaPath)).map(CkpMessage::new) + .collect(Collectors.groupingBy(CkpMessage::getInstant)).values().stream() + .map(messages -> messages.stream().reduce((x, y) -> { + // Pick the one with the highest state + if (x.getState().compareTo(y.getState()) >= 0) { + return x; + } + return y; + }).get()) + .sorted().collect(Collectors.toList()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java new file mode 100644 index 0000000000000..676c03f41c97d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.partitioner; + +import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.model.BaseAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordGlobalLocation; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.sink.bootstrap.IndexRecord; +import org.apache.hudi.sink.utils.PayloadCreation; +import org.apache.hudi.table.action.commit.BucketInfo; +import org.apache.hudi.util.FlinkWriteClients; + +import org.apache.flink.api.common.state.CheckpointListener; +import org.apache.flink.api.common.state.StateTtlConfig; +import org.apache.flink.api.common.state.ValueState; +import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.api.common.time.Time; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.KeyedProcessFunction; +import org.apache.flink.util.Collector; + +import java.util.Objects; + +/** + * The function to build the write profile incrementally 
for records within a checkpoint; + * it then assigns a bucket ID to each record using the {@link BucketAssigner}. + * + *
    All the records are tagged with a HoodieRecordLocation instead of a real instant time: + * an INSERT record uses "I" and an UPSERT record uses "U" as the instant time. There is no need to keep + * the "real" instant time for each record; the bucket ID (partition path & fileID) actually decides + * where the record should be written. The "I" and "U" tags are only used downstream to decide whether + * the data bucket is an INSERT or an UPSERT; we should factor the tags out once the underlying writer + * supports specifying the bucket type explicitly. + * + *
    The output records should then shuffle by the bucket ID and thus do scalable write. + * + * @see BucketAssigner + */ +public class BucketAssignFunction> + extends KeyedProcessFunction + implements CheckpointedFunction, CheckpointListener { + + /** + * Index cache(speed-up) state for the underneath file based(BloomFilter) indices. + * When a record came in, we do these check: + * + *
      + *   - Try to load all the records in the partition path where the record belongs to
      + *   - Checks whether the state contains the record key
      + *   - If it does, tag the record with the location
      + *   - If it does not, use the {@link BucketAssigner} to generate a new bucket ID
      + *
    + */ + private ValueState indexState; + + /** + * Bucket assigner to assign new bucket IDs or reuse existing ones. + */ + private BucketAssigner bucketAssigner; + + private final Configuration conf; + + private final boolean isChangingRecords; + + /** + * Used to create DELETE payload. + */ + private PayloadCreation payloadCreation; + + /** + * If the index is global, update the index for the old partition path + * if same key record with different partition path came in. + */ + private final boolean globalIndex; + + public BucketAssignFunction(Configuration conf) { + this.conf = conf; + this.isChangingRecords = WriteOperationType.isChangingRecords( + WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION))); + this.globalIndex = conf.getBoolean(FlinkOptions.INDEX_GLOBAL_ENABLED) + && !conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED); + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + HoodieWriteConfig writeConfig = FlinkWriteClients.getHoodieClientConfig(this.conf, true); + HoodieFlinkEngineContext context = new HoodieFlinkEngineContext( + new SerializableConfiguration(HadoopConfigurations.getHadoopConf(this.conf)), + new FlinkTaskContextSupplier(getRuntimeContext())); + this.bucketAssigner = BucketAssigners.create( + getRuntimeContext().getIndexOfThisSubtask(), + getRuntimeContext().getMaxNumberOfParallelSubtasks(), + getRuntimeContext().getNumberOfParallelSubtasks(), + ignoreSmallFiles(), + HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE)), + context, + writeConfig); + this.payloadCreation = PayloadCreation.instance(this.conf); + } + + private boolean ignoreSmallFiles() { + WriteOperationType operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)); + return WriteOperationType.isOverwrite(operationType); + } + + @Override + public void snapshotState(FunctionSnapshotContext context) { + this.bucketAssigner.reset(); + } + + @Override + public void initializeState(FunctionInitializationContext context) { + ValueStateDescriptor indexStateDesc = + new ValueStateDescriptor<>( + "indexState", + TypeInformation.of(HoodieRecordGlobalLocation.class)); + double ttl = conf.getDouble(FlinkOptions.INDEX_STATE_TTL) * 24 * 60 * 60 * 1000; + if (ttl > 0) { + indexStateDesc.enableTimeToLive(StateTtlConfig.newBuilder(Time.milliseconds((long) ttl)).build()); + } + indexState = context.getKeyedStateStore().getState(indexStateDesc); + } + + @Override + public void processElement(I value, Context ctx, Collector out) throws Exception { + if (value instanceof IndexRecord) { + IndexRecord indexRecord = (IndexRecord) value; + this.indexState.update((HoodieRecordGlobalLocation) indexRecord.getCurrentLocation()); + } else { + processRecord((HoodieRecord) value, out); + } + } + + @SuppressWarnings("unchecked") + private void processRecord(HoodieRecord record, Collector out) throws Exception { + // 1. put the record into the BucketAssigner; + // 2. look up the state for location, if the record has a location, just send it out; + // 3. if it is an INSERT, decide the location using the BucketAssigner then send it out. + final HoodieKey hoodieKey = record.getKey(); + final String recordKey = hoodieKey.getRecordKey(); + final String partitionPath = hoodieKey.getPartitionPath(); + final HoodieRecordLocation location; + + // Only changing records need looking up the index for the location, + // append only records are always recognized as INSERT. 
+ HoodieRecordGlobalLocation oldLoc = indexState.value(); + if (isChangingRecords && oldLoc != null) { + // Set up the instant time as "U" to mark the bucket as an update bucket. + if (!Objects.equals(oldLoc.getPartitionPath(), partitionPath)) { + if (globalIndex) { + // if partition path changes, emit a delete record for old partition path, + // then update the index state using location with new partition path. + HoodieRecord deleteRecord = new HoodieAvroRecord<>(new HoodieKey(recordKey, oldLoc.getPartitionPath()), + payloadCreation.createDeletePayload((BaseAvroPayload) record.getData())); + deleteRecord.setCurrentLocation(oldLoc.toLocal("U")); + deleteRecord.seal(); + out.collect((O) deleteRecord); + } + location = getNewRecordLocation(partitionPath); + } else { + location = oldLoc.toLocal("U"); + this.bucketAssigner.addUpdate(partitionPath, location.getFileId()); + } + } else { + location = getNewRecordLocation(partitionPath); + } + // always refresh the index + if (isChangingRecords) { + updateIndexState(partitionPath, location); + } + record.setCurrentLocation(location); + out.collect((O) record); + } + + private HoodieRecordLocation getNewRecordLocation(String partitionPath) { + final BucketInfo bucketInfo = this.bucketAssigner.addInsert(partitionPath); + final HoodieRecordLocation location; + switch (bucketInfo.getBucketType()) { + case INSERT: + // This is an insert bucket, use HoodieRecordLocation instant time as "I". + // Downstream operators can then check the instant time to know whether + // a record belongs to an insert bucket. + location = new HoodieRecordLocation("I", bucketInfo.getFileIdPrefix()); + break; + case UPDATE: + location = new HoodieRecordLocation("U", bucketInfo.getFileIdPrefix()); + break; + default: + throw new AssertionError(); + } + return location; + } + + private void updateIndexState( + String partitionPath, + HoodieRecordLocation localLoc) throws Exception { + this.indexState.update(HoodieRecordGlobalLocation.fromLocal(partitionPath, localLoc)); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) { + // Refresh the table state when there are new commits. + this.bucketAssigner.reload(checkpointId); + } + + @Override + public void close() throws Exception { + this.bucketAssigner.close(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigner.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigner.java new file mode 100644 index 0000000000000..ebb47a91eb573 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigner.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
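Downstream operators distinguish insert and update buckets purely by the synthetic instant time that the function above stamps on the record location. A hedged sketch of that check; the helper class is made up and it assumes HoodieRecordLocation exposes the instant time via getInstantTime():

```java
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;

public class BucketTagCheck {
  // Hypothetical helper: true when BucketAssignFunction tagged the record with an INSERT bucket.
  public static boolean isInsertBucket(HoodieRecord<?> record) {
    HoodieRecordLocation location = record.getCurrentLocation();
    // "I" marks an insert bucket, "U" marks an update bucket; neither is a real instant time.
    return location != null && "I".equals(location.getInstantTime());
  }
}
```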
+ */ + +package org.apache.hudi.sink.partitioner; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.sink.partitioner.profile.WriteProfile; +import org.apache.hudi.sink.partitioner.profile.WriteProfiles; +import org.apache.hudi.table.action.commit.BucketInfo; +import org.apache.hudi.table.action.commit.BucketType; +import org.apache.hudi.table.action.commit.SmallFile; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.runtime.state.KeyGroupRangeAssignment; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Bucket assigner that assigns the data buffer of one checkpoint into buckets. + * + *
    This assigner assigns the records one by one. + * If the record is an update, it checks for and reuses an existing UPDATE bucket or generates a new one; + * if the record is an insert, it first checks the record's partition for small files and tries to find a small file + * that has space to append new records, reusing that small file's data bucket; if + * there is no such small file (or no space left for new records), it generates an INSERT bucket. + * + *
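A rough sketch of the per-record flow this paragraph describes, as driven by the assign function above; the partition and file id values are placeholders and the assigner construction (task id, parallelism, write profile, config) is omitted. The class itself follows below:

```java
import org.apache.hudi.sink.partitioner.BucketAssigner;
import org.apache.hudi.table.action.commit.BucketInfo;

public class BucketAssignFlowSketch {
  public static void assignSome(BucketAssigner assigner) {
    // An update to a known file group reuses or records an UPDATE bucket keyed by {partition}_{fileId}.
    BucketInfo update = assigner.addUpdate("2023/01/01", "fileId-0001"); // hypothetical ids

    // An insert first tries the partition's small files, otherwise it opens an INSERT bucket.
    BucketInfo insert = assigner.addInsert("2023/01/01");

    System.out.println(update.getBucketType() + " / " + insert.getFileIdPrefix());
  }
}
```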
    Use {partition}_{fileId} as the bucket identifier, so that the bucket is unique + * within and among partitions. + */ +public class BucketAssigner implements AutoCloseable { + private static final Logger LOG = LogManager.getLogger(BucketAssigner.class); + + /** + * Task ID. + */ + private final int taskID; + + /** + * The max parallelism. + */ + private final int maxParallelism; + + /** + * Number of tasks. + */ + private final int numTasks; + + /** + * Remembers what type each bucket is for later. + */ + private final HashMap bucketInfoMap; + + /** + * The write config. + */ + protected final HoodieWriteConfig config; + + /** + * Write profile. + */ + private final WriteProfile writeProfile; + + /** + * Partition path to small file assign mapping. + */ + private final Map smallFileAssignMap; + + /** + * Bucket ID(partition + fileId) -> new file assign state. + */ + private final Map newFileAssignStates; + + /** + * Num of accumulated successful checkpoints, used for cleaning the new file assign state. + */ + private int accCkp = 0; + + public BucketAssigner( + int taskID, + int maxParallelism, + int numTasks, + WriteProfile profile, + HoodieWriteConfig config) { + this.taskID = taskID; + this.maxParallelism = maxParallelism; + this.numTasks = numTasks; + this.config = config; + this.writeProfile = profile; + + this.bucketInfoMap = new HashMap<>(); + this.smallFileAssignMap = new HashMap<>(); + this.newFileAssignStates = new HashMap<>(); + } + + /** + * Reset the states of this assigner, should do once for each checkpoint, + * all the states are accumulated within one checkpoint interval. + */ + public void reset() { + bucketInfoMap.clear(); + } + + public BucketInfo addUpdate(String partitionPath, String fileIdHint) { + final String key = StreamerUtil.generateBucketKey(partitionPath, fileIdHint); + if (!bucketInfoMap.containsKey(key)) { + BucketInfo bucketInfo = new BucketInfo(BucketType.UPDATE, fileIdHint, partitionPath); + bucketInfoMap.put(key, bucketInfo); + } + // else do nothing because the bucket already exists. + return bucketInfoMap.get(key); + } + + public BucketInfo addInsert(String partitionPath) { + // for new inserts, compute buckets depending on how many records we have for each partition + SmallFileAssign smallFileAssign = getSmallFileAssign(partitionPath); + + // first try packing this into one of the smallFiles + if (smallFileAssign != null && smallFileAssign.assign()) { + return new BucketInfo(BucketType.UPDATE, smallFileAssign.getFileId(), partitionPath); + } + + // if we have anything more, create new insert buckets, like normal + if (newFileAssignStates.containsKey(partitionPath)) { + NewFileAssignState newFileAssignState = newFileAssignStates.get(partitionPath); + if (newFileAssignState.canAssign()) { + newFileAssignState.assign(); + final String key = StreamerUtil.generateBucketKey(partitionPath, newFileAssignState.fileId); + if (bucketInfoMap.containsKey(key)) { + // the newFileAssignStates is cleaned asynchronously when received the checkpoint success notification, + // the records processed within the time range: + // (start checkpoint, checkpoint success(and instant committed)) + // should still be assigned to the small buckets of last checkpoint instead of new one. + + // the bucketInfoMap is cleaned when checkpoint starts. + + // A promotion: when the HoodieRecord can record whether it is an UPDATE or INSERT, + // we can always return an UPDATE BucketInfo here, and there is no need to record the + // UPDATE bucket through calling #addUpdate. 
+ return bucketInfoMap.get(key); + } + return new BucketInfo(BucketType.UPDATE, newFileAssignState.fileId, partitionPath); + } + } + BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, createFileIdOfThisTask(), partitionPath); + final String key = StreamerUtil.generateBucketKey(partitionPath, bucketInfo.getFileIdPrefix()); + bucketInfoMap.put(key, bucketInfo); + NewFileAssignState newFileAssignState = new NewFileAssignState(bucketInfo.getFileIdPrefix(), writeProfile.getRecordsPerBucket()); + newFileAssignState.assign(); + newFileAssignStates.put(partitionPath, newFileAssignState); + return bucketInfo; + } + + private synchronized SmallFileAssign getSmallFileAssign(String partitionPath) { + if (smallFileAssignMap.containsKey(partitionPath)) { + return smallFileAssignMap.get(partitionPath); + } + List smallFiles = smallFilesOfThisTask(writeProfile.getSmallFiles(partitionPath)); + if (smallFiles.size() > 0) { + LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles); + SmallFileAssignState[] states = smallFiles.stream() + .map(smallFile -> new SmallFileAssignState(config.getParquetMaxFileSize(), smallFile, writeProfile.getAvgSize())) + .toArray(SmallFileAssignState[]::new); + SmallFileAssign assign = new SmallFileAssign(states); + smallFileAssignMap.put(partitionPath, assign); + return assign; + } + smallFileAssignMap.put(partitionPath, null); + return null; + } + + /** + * Refresh the table state like TableFileSystemView and HoodieTimeline. + */ + public synchronized void reload(long checkpointId) { + this.accCkp += 1; + if (this.accCkp > 1) { + // do not clean the new file assignment state for the first checkpoint, + // this #reload calling is triggered by checkpoint success event, the coordinator + // also relies on the checkpoint success event to commit the inflight instant, + // and very possibly this component receives the notification before the coordinator, + // if we do the cleaning, the records processed within the time range: + // (start checkpoint, checkpoint success(and instant committed)) + // would be assigned to a fresh new data bucket which is not the right behavior. + this.newFileAssignStates.clear(); + this.accCkp = 0; + } + this.smallFileAssignMap.clear(); + this.writeProfile.reload(checkpointId); + } + + private boolean fileIdOfThisTask(String fileId) { + // the file id can shuffle to this task + return KeyGroupRangeAssignment.assignKeyToParallelOperator(fileId, maxParallelism, numTasks) == taskID; + } + + @VisibleForTesting + public String createFileIdOfThisTask() { + String newFileIdPfx = FSUtils.createNewFileIdPfx(); + while (!fileIdOfThisTask(newFileIdPfx)) { + newFileIdPfx = FSUtils.createNewFileIdPfx(); + } + return newFileIdPfx; + } + + @VisibleForTesting + public List smallFilesOfThisTask(List smallFiles) { + // computes the small files to write inserts for this task. + return smallFiles.stream() + .filter(smallFile -> fileIdOfThisTask(smallFile.location.getFileId())) + .collect(Collectors.toList()); + } + + public void close() { + reset(); + WriteProfiles.clean(config.getBasePath()); + } + + /** + * Assigns the record to one of the small files under one partition. + * + *

    The tool is initialized with an array of {@link SmallFileAssignState}s.
 + * A pointer points to the current small file that is ready for assignment;
 + * if the current small file cannot take any more records (fully assigned), the pointer
 + * moves to the next small file.
 + * <pre>
 + *       |  ->
 + *       V
 + *   | smallFile_1 | smallFile_2 | smallFile_3 | ... | smallFile_N |
 + * </pre>
 + * + *

    If all the small files are full assigned, a flag {@code noSpace} was marked to true, and + * we can return early for future check. + */ + private static class SmallFileAssign { + final SmallFileAssignState[] states; + int assignIdx = 0; + boolean noSpace = false; + + SmallFileAssign(SmallFileAssignState[] states) { + this.states = states; + } + + public boolean assign() { + if (noSpace) { + return false; + } + SmallFileAssignState state = states[assignIdx]; + while (!state.canAssign()) { + assignIdx += 1; + if (assignIdx >= states.length) { + noSpace = true; + return false; + } + // move to next slot if possible + state = states[assignIdx]; + } + state.assign(); + return true; + } + + public String getFileId() { + return states[assignIdx].fileId; + } + } + + /** + * Candidate bucket state for small file. It records the total number of records + * that the bucket can append and the current number of assigned records. + */ + private static class SmallFileAssignState { + long assigned; + long totalUnassigned; + final String fileId; + + SmallFileAssignState(long parquetMaxFileSize, SmallFile smallFile, long averageRecordSize) { + this.assigned = 0; + this.totalUnassigned = (parquetMaxFileSize - smallFile.sizeBytes) / averageRecordSize; + this.fileId = smallFile.location.getFileId(); + } + + public boolean canAssign() { + return this.totalUnassigned > 0 && this.totalUnassigned > this.assigned; + } + + /** + * Remembers to invoke {@link #canAssign()} first. + */ + public void assign() { + this.assigned++; + } + } + + /** + * Candidate bucket state for a new file. It records the total number of records + * that the bucket can append and the current number of assigned records. + */ + private static class NewFileAssignState { + long assigned; + long totalUnassigned; + final String fileId; + + NewFileAssignState(String fileId, long insertRecordsPerBucket) { + this.fileId = fileId; + this.assigned = 0; + this.totalUnassigned = insertRecordsPerBucket; + } + + public boolean canAssign() { + return this.totalUnassigned > 0 && this.totalUnassigned > this.assigned; + } + + /** + * Remembers to invoke {@link #canAssign()} first. + */ + public void assign() { + this.assigned++; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigners.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigners.java new file mode 100644 index 0000000000000..13d4587602de0 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssigners.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.partitioner; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.sink.partitioner.profile.WriteProfile; +import org.apache.hudi.sink.partitioner.profile.WriteProfiles; + +/** + * Utilities for {@code BucketAssigner}. + */ +public abstract class BucketAssigners { + + private BucketAssigners() { + } + + /** + * Creates a {@code BucketAssigner}. + * + * @param taskID The task ID + * @param maxParallelism The max parallelism + * @param numTasks The number of tasks + * @param ignoreSmallFiles Whether to ignore the small files + * @param tableType The table type + * @param context The engine context + * @param config The configuration + * @return the bucket assigner instance + */ + public static BucketAssigner create( + int taskID, + int maxParallelism, + int numTasks, + boolean ignoreSmallFiles, + HoodieTableType tableType, + HoodieFlinkEngineContext context, + HoodieWriteConfig config) { + boolean delta = tableType.equals(HoodieTableType.MERGE_ON_READ); + WriteProfile writeProfile = WriteProfiles.singleton(ignoreSmallFiles, delta, config, context); + return new BucketAssigner(taskID, maxParallelism, numTasks, writeProfile, config); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java new file mode 100644 index 0000000000000..5fa3d1ab9a0a2 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.partitioner; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.index.bucket.BucketIdentifier; + +import org.apache.flink.api.common.functions.Partitioner; + +/** + * Bucket index input partitioner. + * The fields to hash can be a subset of the primary key fields. 
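 + * <p>A possible wiring on a Flink stream (illustrative only; the surrounding stream and variable
 + * names are assumptions):
 + * <pre>{@code
 + *   DataStream<HoodieRecord<?>> partitioned = recordStream.partitionCustom(
 + *       new BucketIndexPartitioner<>(bucketNum, indexKeyFields),
 + *       HoodieRecord::getKey);
 + * }</pre>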
+ * + * @param The type of obj to hash + */ +public class BucketIndexPartitioner implements Partitioner { + + private final int bucketNum; + private final String indexKeyFields; + + public BucketIndexPartitioner(int bucketNum, String indexKeyFields) { + this.bucketNum = bucketNum; + this.indexKeyFields = indexKeyFields; + } + + @Override + public int partition(HoodieKey key, int numPartitions) { + int curBucket = BucketIdentifier.getBucketId(key, indexKeyFields, bucketNum); + int globalHash = (key.getPartitionPath() + curBucket).hashCode() & Integer.MAX_VALUE; + return BucketIdentifier.mod(globalHash, numPartitions); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java new file mode 100644 index 0000000000000..d63696effba4a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.partitioner.profile; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.SyncableFileSystemView; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.action.commit.SmallFile; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +/** + * WriteProfile for MERGE_ON_READ table type, this allows auto correction of small parquet files to larger ones + * without the need for an index in the logFile. + * + *

    Note: assumes the index can always index log files for Flink write. + */ +public class DeltaWriteProfile extends WriteProfile { + public DeltaWriteProfile(HoodieWriteConfig config, HoodieFlinkEngineContext context) { + super(config, context); + } + + @Override + protected List smallFilesProfile(String partitionPath) { + // smallFiles only for partitionPath + List smallFileLocations = new ArrayList<>(); + + // Init here since this class (and member variables) might not have been initialized + HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); + + // Find out all eligible small file slices + if (!commitTimeline.empty()) { + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); + // find the smallest file in partition and append to it + List allSmallFileSlices = new ArrayList<>(); + // If we can index log files, we can add more inserts to log files for fileIds including those under + // pending compaction. + List allFileSlices = fsView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()) + .collect(Collectors.toList()); + for (FileSlice fileSlice : allFileSlices) { + if (isSmallFile(fileSlice)) { + allSmallFileSlices.add(fileSlice); + } + } + // Create SmallFiles from the eligible file slices + for (FileSlice smallFileSlice : allSmallFileSlices) { + SmallFile sf = new SmallFile(); + if (smallFileSlice.getBaseFile().isPresent()) { + // TODO : Move logic of file name, file id, base commit time handling inside file slice + String filename = smallFileSlice.getBaseFile().get().getFileName(); + sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)); + sf.sizeBytes = getTotalFileSize(smallFileSlice); + smallFileLocations.add(sf); + } else { + smallFileSlice.getLogFiles().findFirst().ifPresent(logFile -> { + // in case there is something error, and the file slice has no log file + sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()), + FSUtils.getFileIdFromLogPath(logFile.getPath())); + sf.sizeBytes = getTotalFileSize(smallFileSlice); + smallFileLocations.add(sf); + }); + } + } + } + return smallFileLocations; + } + + protected SyncableFileSystemView getFileSystemView() { + return (SyncableFileSystemView) getTable().getSliceView(); + } + + private long getTotalFileSize(FileSlice fileSlice) { + if (!fileSlice.getBaseFile().isPresent()) { + return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList())); + } else { + return fileSlice.getBaseFile().get().getFileSize() + + convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList())); + } + } + + private boolean isSmallFile(FileSlice fileSlice) { + long totalSize = getTotalFileSize(fileSlice); + return totalSize < config.getParquetMaxFileSize(); + } + + // TODO (NA) : Make this static part of utility + public long convertLogFilesSizeToExpectedParquetSize(List hoodieLogFiles) { + long totalSizeOfLogFiles = hoodieLogFiles.stream().map(HoodieLogFile::getFileSize) + .filter(size -> size > 0).reduce(Long::sum).orElse(0L); + // Here we assume that if there is no base parquet file, all log files contain only inserts. 
+ // We can then just get the parquet equivalent size of these log files, compare that with + // {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows + return (long) (totalSizeOfLogFiles * config.getLogFileToParquetCompressionRatio()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/EmptyWriteProfile.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/EmptyWriteProfile.java new file mode 100644 index 0000000000000..e0a6fc1f4a336 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/EmptyWriteProfile.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.partitioner.profile; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.action.commit.SmallFile; + +import java.util.Collections; +import java.util.List; + +/** + * WriteProfile that always return empty small files. + * + *

    This write profile is used for INSERT OVERWRITE and INSERT OVERWRITE TABLE operations; + * the existing small files are ignored because of the 'OVERWRITE' semantics. + * + *

    Note: assumes the index can always index log files for Flink write. + */ +public class EmptyWriteProfile extends WriteProfile { + public EmptyWriteProfile(HoodieWriteConfig config, HoodieFlinkEngineContext context) { + super(config, context); + } + + @Override + protected List smallFilesProfile(String partitionPath) { + return Collections.emptyList(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java new file mode 100644 index 0000000000000..db13a6c2ae584 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.partitioner.profile; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.SyncableFileSystemView; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.sink.partitioner.BucketAssigner; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.SmallFile; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.core.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Profiling of write statistics for {@link BucketAssigner}, + * such as the average record size and small files. + * + *
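 + * <p>A rough sketch of how the average record size is estimated below (see
 + * {@code averageBytesPerRecord()}); this mirrors the code rather than adding new behavior:
 + * <pre>{@code
 + *   // start from the configured copy-on-write record size estimate
 + *   long avgSize = config.getCopyOnWriteRecordSizeEstimate();
 + *   // then, scanning completed commits from newest to oldest, refine it from the first commit
 + *   // whose written bytes exceed the estimation threshold:
 + *   avgSize = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten);
 + * }</pre>
 + *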

    The profile is re-constructed when there are new commits on the timeline. + */ +public class WriteProfile { + private static final Logger LOG = LoggerFactory.getLogger(WriteProfile.class); + + /** + * The write config. + */ + protected final HoodieWriteConfig config; + + /** + * Table base path. + */ + private final Path basePath; + + /** + * The meta client. + */ + protected final HoodieTableMetaClient metaClient; + + /** + * The average record size. + */ + private long avgSize = -1L; + + /** + * Total records to write for each bucket based on + * the config option {@link org.apache.hudi.config.HoodieStorageConfig#PARQUET_MAX_FILE_SIZE}. + */ + private long recordsPerBucket; + + /** + * Partition path to small files mapping. + */ + private final Map> smallFilesMap; + + /** + * Checkpoint id to avoid redundant reload. + */ + private long reloadedCheckpointId; + + /** + * The file system view cache for one checkpoint interval. + */ + protected SyncableFileSystemView fsView; + + /** + * Metadata cache to reduce IO of metadata files. + */ + private final Map metadataCache; + + /** + * The engine context. + */ + private final HoodieFlinkEngineContext context; + + public WriteProfile(HoodieWriteConfig config, HoodieFlinkEngineContext context) { + this.config = config; + this.context = context; + this.basePath = new Path(config.getBasePath()); + this.smallFilesMap = new HashMap<>(); + this.recordsPerBucket = config.getCopyOnWriteInsertSplitSize(); + this.metaClient = StreamerUtil.createMetaClient(config.getBasePath(), context.getHadoopConf().get()); + this.metadataCache = new HashMap<>(); + this.fsView = getFileSystemView(); + // profile the record statistics on construction + recordProfile(); + } + + public long getAvgSize() { + return avgSize; + } + + public long getRecordsPerBucket() { + return recordsPerBucket; + } + + public HoodieTableMetaClient getMetaClient() { + return this.metaClient; + } + + protected HoodieTable getTable() { + return HoodieFlinkTable.create(config, context); + } + + /** + * Obtains the average record size based on records written during previous commits. Used for estimating how many + * records pack into one file. + */ + private long averageBytesPerRecord() { + long avgSize = config.getCopyOnWriteRecordSizeEstimate(); + long fileSizeThreshold = (long) (config.getRecordSizeEstimationThreshold() * config.getParquetSmallFileLimit()); + HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); + if (!commitTimeline.empty()) { + // Go over the reverse ordered commits to get a more recent estimate of average record size. + Iterator instants = commitTimeline.getReverseOrderedInstants().iterator(); + while (instants.hasNext()) { + HoodieInstant instant = instants.next(); + final HoodieCommitMetadata commitMetadata = + this.metadataCache.computeIfAbsent( + instant.getTimestamp(), + k -> WriteProfiles.getCommitMetadataSafely(config.getTableName(), basePath, instant, commitTimeline) + .orElse(null)); + if (commitMetadata == null) { + continue; + } + long totalBytesWritten = commitMetadata.fetchTotalBytesWritten(); + long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten(); + if (totalBytesWritten > fileSizeThreshold && totalRecordsWritten > 0) { + avgSize = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten); + break; + } + } + } + LOG.info("Refresh average bytes per record => " + avgSize); + return avgSize; + } + + /** + * Returns a list of small files in the given partition path. + * + *

    Note: This method should be thread safe. + */ + public synchronized List getSmallFiles(String partitionPath) { + // lookup the cache first + if (smallFilesMap.containsKey(partitionPath)) { + return smallFilesMap.get(partitionPath); + } + + List smallFiles = new ArrayList<>(); + if (config.getParquetSmallFileLimit() <= 0) { + this.smallFilesMap.put(partitionPath, smallFiles); + return smallFiles; + } + + smallFiles = smallFilesProfile(partitionPath); + this.smallFilesMap.put(partitionPath, smallFiles); + return smallFiles; + } + + /** + * Returns a list of small files in the given partition path from the latest filesystem view. + */ + protected List smallFilesProfile(String partitionPath) { + // smallFiles only for partitionPath + List smallFileLocations = new ArrayList<>(); + + HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); + + if (!commitTimeline.empty()) { // if we have some commits + HoodieInstant latestCommitTime = commitTimeline.lastInstant().get(); + List allFiles = fsView + .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList()); + + for (HoodieBaseFile file : allFiles) { + // filter out the corrupted files. + if (file.getFileSize() < config.getParquetSmallFileLimit() && file.getFileSize() > 0) { + String filename = file.getFileName(); + SmallFile sf = new SmallFile(); + sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)); + sf.sizeBytes = file.getFileSize(); + smallFileLocations.add(sf); + } + } + } + + return smallFileLocations; + } + + protected SyncableFileSystemView getFileSystemView() { + return (SyncableFileSystemView) getTable().getBaseFileOnlyView(); + } + + /** + * Remove the overdue metadata from the cache + * whose instant does not belong to the given instants {@code instants}. + */ + private void cleanMetadataCache(Stream instants) { + Set timestampSet = instants.map(HoodieInstant::getTimestamp).collect(Collectors.toSet()); + this.metadataCache.keySet().retainAll(timestampSet); + } + + private void recordProfile() { + this.avgSize = averageBytesPerRecord(); + if (config.shouldAllowMultiWriteOnSameInstant()) { + this.recordsPerBucket = config.getParquetMaxFileSize() / avgSize; + LOG.info("Refresh insert records per bucket => " + recordsPerBucket); + } + } + + /** + * Reload the write profile, should do once for each checkpoint. + * + *

    It does three things: i) reloads the timeline; ii) re-constructs the record profile; + * iii) cleans the small files cache. + * + *

    Note: This method should be thread safe. + */ + public synchronized void reload(long checkpointId) { + if (this.reloadedCheckpointId >= checkpointId) { + // already reloaded + return; + } + this.metaClient.reloadActiveTimeline(); + // release the old fs view and create a new one + SyncableFileSystemView oldFsView = this.fsView; + this.fsView = getFileSystemView(); + oldFsView.close(); + + recordProfile(); + cleanMetadataCache(this.metaClient.getCommitsTimeline().filterCompletedInstants().getInstants()); + this.smallFilesMap.clear(); + this.reloadedCheckpointId = checkpointId; + } + + @VisibleForTesting + public Map getMetadataCache() { + return this.metadataCache; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java new file mode 100644 index 0000000000000..90c58687db28a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.partitioner.profile; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.core.fs.Path; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Factory for {@link WriteProfile}. 
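 + *
 + * <p>A minimal usage sketch (illustrative only), mirroring how the profile is obtained and released
 + * elsewhere in this module:
 + * <pre>{@code
 + *   WriteProfile profile = WriteProfiles.singleton(ignoreSmallFiles, delta, writeConfig, flinkEngineContext);
 + *   // ... use the profile for bucket assignment ...
 + *   WriteProfiles.clean(writeConfig.getBasePath()); // release the cached profile on close
 + * }</pre>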
+ */ +public class WriteProfiles { + private static final Logger LOG = LoggerFactory.getLogger(WriteProfiles.class); + + private static final Map PROFILES = new HashMap<>(); + + private WriteProfiles() { + } + + public static synchronized WriteProfile singleton( + boolean ignoreSmallFiles, + boolean delta, + HoodieWriteConfig config, + HoodieFlinkEngineContext context) { + return PROFILES.computeIfAbsent(config.getBasePath(), + k -> getWriteProfile(ignoreSmallFiles, delta, config, context)); + } + + private static WriteProfile getWriteProfile( + boolean ignoreSmallFiles, + boolean delta, + HoodieWriteConfig config, + HoodieFlinkEngineContext context) { + if (ignoreSmallFiles) { + return new EmptyWriteProfile(config, context); + } else if (delta) { + return new DeltaWriteProfile(config, context); + } else { + return new WriteProfile(config, context); + } + } + + public static void clean(String path) { + PROFILES.remove(path); + } + + /** + * Returns all the incremental write file statuses with the given commits metadata. + * + *

    Different with {@link #getWritePathsOfInstants}, the files are not filtered by + * existence. + * + * @param basePath Table base path + * @param hadoopConf The hadoop conf + * @param metadataList The commits metadata + * @param tableType The table type + * @return the file status array + */ + public static FileStatus[] getRawWritePathsOfInstants( + Path basePath, + Configuration hadoopConf, + List metadataList, + HoodieTableType tableType) { + Map uniqueIdToFileStatus = new HashMap<>(); + metadataList.forEach(metadata -> + uniqueIdToFileStatus.putAll(getFilesToReadOfInstant(basePath, metadata, hadoopConf, tableType))); + return uniqueIdToFileStatus.values().toArray(new FileStatus[0]); + } + + /** + * Returns all the incremental write file statuses with the given commits metadata. + * + * @param basePath Table base path + * @param hadoopConf The hadoop conf + * @param metadataList The commits metadata + * @param tableType The table type + * @return the file status array + */ + public static FileStatus[] getWritePathsOfInstants( + Path basePath, + Configuration hadoopConf, + List metadataList, + HoodieTableType tableType) { + FileSystem fs = FSUtils.getFs(basePath.toString(), hadoopConf); + Map uniqueIdToFileStatus = new HashMap<>(); + metadataList.forEach(metadata -> + uniqueIdToFileStatus.putAll(getFilesToReadOfInstant(basePath, metadata, fs, tableType))); + return uniqueIdToFileStatus.values().toArray(new FileStatus[0]); + } + + /** + * Returns the commit file status info with given metadata. + * + * @param basePath Table base path + * @param metadata The metadata + * @param hadoopConf The filesystem + * @param tableType The table type + * @return the commit file status info grouping by specific ID + */ + private static Map getFilesToReadOfInstant( + Path basePath, + HoodieCommitMetadata metadata, + Configuration hadoopConf, + HoodieTableType tableType) { + return getFilesToRead(hadoopConf, metadata, basePath.toString(), tableType).entrySet().stream() + .filter(entry -> StreamerUtil.isValidFile(entry.getValue())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + /** + * Returns the commit file status info with given metadata. + * + * @param basePath Table base path + * @param metadata The metadata + * @param fs The filesystem + * @param tableType The table type + * @return the commit file status info grouping by specific ID + */ + private static Map getFilesToReadOfInstant( + Path basePath, + HoodieCommitMetadata metadata, + FileSystem fs, + HoodieTableType tableType) { + return getFilesToRead(fs.getConf(), metadata, basePath.toString(), tableType).entrySet().stream() + // filter out the file paths that does not exist, some files may be cleaned by + // the cleaner. 
+ .filter(entry -> { + try { + return fs.exists(entry.getValue().getPath()); + } catch (IOException e) { + LOG.error("Checking exists of path: {} error", entry.getValue().getPath()); + throw new HoodieException(e); + } + }) + .filter(entry -> StreamerUtil.isValidFile(entry.getValue())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + private static Map getFilesToRead( + Configuration hadoopConf, + HoodieCommitMetadata metadata, + String basePath, + HoodieTableType tableType + ) { + switch (tableType) { + case COPY_ON_WRITE: + return metadata.getFileIdToFileStatus(hadoopConf, basePath); + case MERGE_ON_READ: + return metadata.getFullPathToFileStatus(hadoopConf, basePath); + default: + throw new AssertionError(); + } + } + + /** + * Returns the commit metadata of the given instant safely. + * + * @param tableName The table name + * @param basePath The table base path + * @param instant The hoodie instant + * @param timeline The timeline + * @return the commit metadata or empty if any error occurs + */ + public static Option getCommitMetadataSafely( + String tableName, + Path basePath, + HoodieInstant instant, + HoodieTimeline timeline) { + try { + byte[] data = timeline.getInstantDetails(instant).get(); + return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class)); + } catch (FileNotFoundException fe) { + // make this fail safe. + LOG.warn("Instant {} was deleted by the cleaner, ignore", instant.getTimestamp()); + return Option.empty(); + } catch (Throwable throwable) { + LOG.error("Get write metadata for table {} with instant {} and path: {} error", + tableName, instant.getTimestamp(), basePath); + return Option.empty(); + } + } + + /** + * Returns the commit metadata of the given instant. + * + * @param tableName The table name + * @param basePath The table base path + * @param instant The hoodie instant + * @param timeline The timeline + * @return the commit metadata + */ + public static HoodieCommitMetadata getCommitMetadata( + String tableName, + Path basePath, + HoodieInstant instant, + HoodieTimeline timeline) { + try { + return HoodieInputFormatUtils.getCommitMetadata(instant, timeline); + } catch (IOException e) { + LOG.error("Get write metadata for table {} with instant {} and path: {} error", + tableName, instant.getTimestamp(), basePath); + throw new HoodieException(e); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/ChainedTransformer.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/ChainedTransformer.java new file mode 100644 index 0000000000000..2fe2867b75463 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/ChainedTransformer.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.transform; + +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.data.RowData; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * A {@link Transformer} to chain other {@link Transformer}s and apply sequentially. + */ +public class ChainedTransformer implements Transformer { + + private List transformers; + + public ChainedTransformer(List transformers) { + this.transformers = transformers; + } + + public List getTransformersNames() { + return transformers.stream().map(t -> t.getClass().getName()).collect(Collectors.toList()); + } + + @Override + public DataStream apply(DataStream source) { + DataStream dataStream = source; + for (Transformer t : transformers) { + dataStream = t.apply(dataStream); + } + + return dataStream; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java new file mode 100644 index 0000000000000..bfc7d7d62ad45 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.transform; + +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; +import org.apache.hudi.sink.utils.PayloadCreation; +import org.apache.hudi.util.RowDataToAvroConverters; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; + +import java.io.IOException; + +import static org.apache.hudi.util.StreamerUtil.flinkConf2TypedProperties; + +/** + * Function that transforms RowData to HoodieRecord. + */ +public class RowDataToHoodieFunction + extends RichMapFunction { + /** + * Row type of the input. + */ + private final RowType rowType; + + /** + * Avro schema of the input. + */ + private transient Schema avroSchema; + + /** + * RowData to Avro record converter. 
+ */ + private transient RowDataToAvroConverters.RowDataToAvroConverter converter; + + /** + * HoodieKey generator. + */ + private transient KeyGenerator keyGenerator; + + /** + * Utilities to create hoodie pay load instance. + */ + private transient PayloadCreation payloadCreation; + + /** + * Config options. + */ + private final Configuration config; + + public RowDataToHoodieFunction(RowType rowType, Configuration config) { + this.rowType = rowType; + this.config = config; + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + this.avroSchema = StreamerUtil.getSourceSchema(this.config); + this.converter = RowDataToAvroConverters.createConverter(this.rowType); + this.keyGenerator = + HoodieAvroKeyGeneratorFactory + .createKeyGenerator(flinkConf2TypedProperties(this.config)); + this.payloadCreation = PayloadCreation.instance(config); + } + + @SuppressWarnings("unchecked") + @Override + public O map(I i) throws Exception { + return (O) toHoodieRecord(i); + } + + /** + * Converts the give record to a {@link HoodieRecord}. + * + * @param record The input record + * @return HoodieRecord based on the configuration + * @throws IOException if error occurs + */ + @SuppressWarnings("rawtypes") + private HoodieRecord toHoodieRecord(I record) throws Exception { + GenericRecord gr = (GenericRecord) this.converter.convert(this.avroSchema, record); + final HoodieKey hoodieKey = keyGenerator.getKey(gr); + + HoodieRecordPayload payload = payloadCreation.createPayload(gr); + HoodieOperation operation = HoodieOperation.fromValue(record.getRowKind().toByteValue()); + return new HoodieAvroRecord<>(hoodieKey, payload, operation); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctionWithRateLimit.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctionWithRateLimit.java new file mode 100644 index 0000000000000..fc9c2177e7c0b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctionWithRateLimit.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.transform; + +import org.apache.hudi.adapter.RateLimiterAdapter; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.configuration.FlinkOptions; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; + +/** + * Function that transforms RowData to a HoodieRecord with RateLimit. 
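 + *
 + * <p>Roughly, each subtask takes an equal share of the configured limit; a sketch of the logic in
 + * this class (illustrative, not additional behavior):
 + * <pre>{@code
 + *   double perTaskRate = conf.getLong(FlinkOptions.WRITE_RATE_LIMIT)
 + *       / (double) getRuntimeContext().getNumberOfParallelSubtasks();
 + *   rateLimiter = RateLimiterAdapter.create(perTaskRate);
 + *   // map() acquires one permit from the rate limiter before converting each record
 + * }</pre>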
+ */ +public class RowDataToHoodieFunctionWithRateLimit + extends RowDataToHoodieFunction { + /** + * Total rate limit per second for this job. + */ + private final double totalLimit; + + /** + * Rate limit per second for per task. + */ + private transient RateLimiterAdapter rateLimiter; + + public RowDataToHoodieFunctionWithRateLimit(RowType rowType, Configuration config) { + super(rowType, config); + this.totalLimit = config.getLong(FlinkOptions.WRITE_RATE_LIMIT); + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + this.rateLimiter = + RateLimiterAdapter.create(totalLimit / getRuntimeContext().getNumberOfParallelSubtasks()); + } + + @Override + public O map(I i) throws Exception { + rateLimiter.acquire(); + return super.map(i); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctions.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctions.java new file mode 100644 index 0000000000000..0007fd1e5055a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctions.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.transform; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.configuration.FlinkOptions; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; + +/** + * Utilities for {@link RowDataToHoodieFunction}. + */ +public abstract class RowDataToHoodieFunctions { + private RowDataToHoodieFunctions() { + } + + /** + * Creates a {@link RowDataToHoodieFunction} instance based on the given configuration. + */ + @SuppressWarnings("rawtypes") + public static RowDataToHoodieFunction create(RowType rowType, Configuration conf) { + if (conf.getLong(FlinkOptions.WRITE_RATE_LIMIT) > 0) { + return new RowDataToHoodieFunctionWithRateLimit<>(rowType, conf); + } else { + return new RowDataToHoodieFunction<>(rowType, conf); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/Transformer.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/Transformer.java new file mode 100644 index 0000000000000..282cca7cd5955 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/Transformer.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.transform; + +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.data.RowData; + +/** + * Transform source stream to target stream before writing. + */ +public interface Transformer { + + /** + * Transform source DataStream to target DataStream. + * + * @param source + */ + DataStream apply(DataStream source); + +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java new file mode 100644 index 0000000000000..b1c8457c1ac1d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.aws.sync.AwsGlueCatalogSyncTool; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.hive.HiveSyncTool; +import org.apache.hudi.hive.ddl.HiveSyncMode; +import org.apache.hudi.table.format.FilePathUtils; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; + +import java.util.Properties; + +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_JDBC; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT; +import static org.apache.hudi.hive.HiveSyncConfigHolder.METASTORE_URIS; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DECODE_PARTITION; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA; + +/** + * Hive synchronization context. + * + *
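 + * <p>A typical usage sketch (illustrative; the surrounding wiring and variable names are
 + * assumptions):
 + * <pre>{@code
 + *   HiveSyncContext syncContext = HiveSyncContext.create(flinkConf, serializableHadoopConf);
 + *   syncContext.hiveSyncTool().syncHoodieTable();
 + * }</pre>
 + *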

    Use this context to create the {@link HiveSyncTool} for synchronization. + */ +public class HiveSyncContext { + + private final Properties props; + private final HiveConf hiveConf; + + private HiveSyncContext(Properties props, HiveConf hiveConf) { + this.props = props; + this.hiveConf = hiveConf; + } + + public HiveSyncTool hiveSyncTool() { + HiveSyncMode syncMode = HiveSyncMode.of(props.getProperty(HIVE_SYNC_MODE.key())); + if (syncMode == HiveSyncMode.GLUE) { + return new AwsGlueCatalogSyncTool(props, hiveConf); + } + return new HiveSyncTool(props, hiveConf); + } + + public static HiveSyncContext create(Configuration conf, SerializableConfiguration serConf) { + Properties props = buildSyncConfig(conf); + org.apache.hadoop.conf.Configuration hadoopConf = HadoopConfigurations.getHadoopConf(conf); + HiveConf hiveConf = new HiveConf(); + hiveConf.addResource(serConf.get()); + if (!FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.HIVE_SYNC_METASTORE_URIS)) { + hadoopConf.set(HiveConf.ConfVars.METASTOREURIS.varname, conf.getString(FlinkOptions.HIVE_SYNC_METASTORE_URIS)); + } + hiveConf.addResource(hadoopConf); + return new HiveSyncContext(props, hiveConf); + } + + @VisibleForTesting + public static Properties buildSyncConfig(Configuration conf) { + TypedProperties props = StreamerUtil.flinkConf2TypedProperties(conf); + props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), conf.getString(FlinkOptions.PATH)); + props.setPropertyIfNonNull(META_SYNC_BASE_FILE_FORMAT.key(), conf.getString(FlinkOptions.HIVE_SYNC_FILE_FORMAT)); + props.setPropertyIfNonNull(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false"); + props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), conf.getString(FlinkOptions.HIVE_SYNC_DB)); + props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), conf.getString(FlinkOptions.HIVE_SYNC_TABLE)); + props.setPropertyIfNonNull(HIVE_SYNC_MODE.key(), conf.getString(FlinkOptions.HIVE_SYNC_MODE)); + props.setPropertyIfNonNull(HIVE_USER.key(), conf.getString(FlinkOptions.HIVE_SYNC_USERNAME)); + props.setPropertyIfNonNull(HIVE_PASS.key(), conf.getString(FlinkOptions.HIVE_SYNC_PASSWORD)); + props.setPropertyIfNonNull(HIVE_URL.key(), conf.getString(FlinkOptions.HIVE_SYNC_JDBC_URL)); + props.setPropertyIfNonNull(METASTORE_URIS.key(), conf.getString(FlinkOptions.HIVE_SYNC_METASTORE_URIS)); + props.setPropertyIfNonNull(HIVE_TABLE_PROPERTIES.key(), conf.getString(FlinkOptions.HIVE_SYNC_TABLE_PROPERTIES)); + props.setPropertyIfNonNull(HIVE_TABLE_SERDE_PROPERTIES.key(), conf.getString(FlinkOptions.HIVE_SYNC_TABLE_SERDE_PROPERTIES)); + props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), String.join(",", FilePathUtils.extractHivePartitionFields(conf))); + props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME)); + props.setPropertyIfNonNull(HIVE_USE_JDBC.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_USE_JDBC))); + props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(conf.getBoolean(FlinkOptions.METADATA_ENABLED))); + props.setPropertyIfNonNull(HIVE_IGNORE_EXCEPTIONS.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_IGNORE_EXCEPTIONS))); + props.setPropertyIfNonNull(HIVE_SUPPORT_TIMESTAMP_TYPE.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP))); + props.setPropertyIfNonNull(HIVE_AUTO_CREATE_DATABASE.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_AUTO_CREATE_DB))); + 
props.setPropertyIfNonNull(META_SYNC_DECODE_PARTITION.key(), String.valueOf(conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING))); + props.setPropertyIfNonNull(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_SKIP_RO_SUFFIX))); + props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), String.valueOf(conf.getBoolean(FlinkOptions.HIVE_SYNC_ASSUME_DATE_PARTITION))); + return props; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/NonThrownExecutor.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/NonThrownExecutor.java new file mode 100644 index 0000000000000..4ed1716545f9c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/NonThrownExecutor.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.util.ExceptionUtils; +import org.apache.flink.util.function.ThrowingRunnable; +import org.slf4j.Logger; + +import javax.annotation.Nullable; + +import java.util.Objects; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; + +/** + * An executor service that catches all the throwable with logging. + * + *
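 + * <p>A minimal usage sketch (illustrative; {@code doCommit} and {@code instantTime} are assumed
 + * placeholders):
 + * <pre>{@code
 + *   NonThrownExecutor executor = NonThrownExecutor.builder(LOG)
 + *       .exceptionHook((errMsg, t) -> LOG.error(errMsg, t))
 + *       .waitForTasksFinish(true)
 + *       .build();
 + *   executor.execute(() -> doCommit(instantTime), "commit instant %s", instantTime);
 + * }</pre>
 + *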

    A post-exception hook {@link ExceptionHook} can be defined on construction + * or on each execution. + */ +public class NonThrownExecutor implements AutoCloseable { + private final Logger logger; + + /** + * A single-thread executor to handle all the asynchronous jobs. + */ + private final ExecutorService executor; + + /** + * Exception hook for post-exception handling. + */ + @VisibleForTesting + protected final ExceptionHook exceptionHook; + + /** + * Flag saying whether to wait for the tasks finish on #close. + */ + private final boolean waitForTasksFinish; + + @VisibleForTesting + protected NonThrownExecutor(Logger logger, @Nullable ExceptionHook exceptionHook, boolean waitForTasksFinish) { + this.executor = Executors.newSingleThreadExecutor(); + this.logger = logger; + this.exceptionHook = exceptionHook; + this.waitForTasksFinish = waitForTasksFinish; + } + + public static Builder builder(Logger logger) { + return new Builder(logger); + } + + /** + * Run the action in a loop. + */ + public void execute( + final ThrowingRunnable action, + final String actionName, + final Object... actionParams) { + execute(action, this.exceptionHook, actionName, actionParams); + } + + /** + * Run the action in a loop. + */ + public void execute( + final ThrowingRunnable action, + final ExceptionHook hook, + final String actionName, + final Object... actionParams) { + executor.execute(wrapAction(action, hook, actionName, actionParams)); + } + + /** + * Run the action in a loop and wait for completion. + */ + public void executeSync(ThrowingRunnable action, String actionName, Object... actionParams) { + try { + executor.submit(wrapAction(action, this.exceptionHook, actionName, actionParams)).get(); + } catch (InterruptedException e) { + handleException(e, this.exceptionHook, getActionString(actionName, actionParams)); + } catch (ExecutionException e) { + // nonfatal exceptions are handled by wrapAction + ExceptionUtils.rethrowIfFatalErrorOrOOM(e.getCause()); + } + } + + @Override + public void close() throws Exception { + if (executor != null) { + if (waitForTasksFinish) { + executor.shutdown(); + } else { + executor.shutdownNow(); + } + // We do not expect this to actually block for long. At this point, there should + // be very few task running in the executor, if any. + executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); + } + } + + private Runnable wrapAction( + final ThrowingRunnable action, + final ExceptionHook hook, + final String actionName, + final Object... actionParams) { + + return () -> { + final Supplier actionString = getActionString(actionName, actionParams); + try { + action.run(); + logger.info("Executor executes action [{}] success!", actionString.get()); + } catch (Throwable t) { + handleException(t, hook, actionString); + } + }; + } + + private void handleException(Throwable t, ExceptionHook hook, Supplier actionString) { + final String errMsg = String.format("Executor executes action [%s] error", actionString.get()); + logger.error(errMsg, t); + if (hook != null) { + hook.apply(errMsg, t); + } + // if we have a JVM critical error, promote it immediately, there is a good + // chance the + // logging or job failing will not succeed any more + ExceptionUtils.rethrowIfFatalErrorOrOOM(t); + } + + private Supplier getActionString(String actionName, Object... 
actionParams) { + // avoid String.format before OOM rethrown + return () -> String.format(actionName, actionParams); + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * The exception hook. + */ + public interface ExceptionHook { + void apply(String errMsg, Throwable t); + } + + /** + * Builder for {@link NonThrownExecutor}. + */ + public static class Builder { + private final Logger logger; + private ExceptionHook exceptionHook; + private boolean waitForTasksFinish = false; + + private Builder(Logger logger) { + this.logger = Objects.requireNonNull(logger); + } + + public NonThrownExecutor build() { + return new NonThrownExecutor(logger, exceptionHook, waitForTasksFinish); + } + + public Builder exceptionHook(ExceptionHook exceptionHook) { + this.exceptionHook = exceptionHook; + return this; + } + + public Builder waitForTasksFinish(boolean waitForTasksFinish) { + this.waitForTasksFinish = waitForTasksFinish; + return this; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/PayloadCreation.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/PayloadCreation.java new file mode 100644 index 0000000000000..fb850bace7d48 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/PayloadCreation.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.BaseAvroPayload; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; + +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.configuration.Configuration; + +import javax.annotation.Nullable; + +import java.io.Serializable; +import java.lang.reflect.Constructor; + +/** + * Util to create hoodie pay load instance. 
+ */ +public class PayloadCreation implements Serializable { + private static final long serialVersionUID = 1L; + + private final boolean shouldCombine; + private final Constructor constructor; + private final String preCombineField; + + private PayloadCreation( + boolean shouldCombine, + Constructor constructor, + @Nullable String preCombineField) { + this.shouldCombine = shouldCombine; + this.constructor = constructor; + this.preCombineField = preCombineField; + } + + public static PayloadCreation instance(Configuration conf) throws Exception { + String preCombineField = OptionsResolver.getPreCombineField(conf); + boolean needCombine = conf.getBoolean(FlinkOptions.PRE_COMBINE) + || WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)) == WriteOperationType.UPSERT; + boolean shouldCombine = needCombine && preCombineField != null; + + final Class[] argTypes; + final Constructor constructor; + if (shouldCombine) { + argTypes = new Class[] {GenericRecord.class, Comparable.class}; + } else { + argTypes = new Class[] {Option.class}; + } + final String clazz = conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME); + constructor = ReflectionUtils.getClass(clazz).getConstructor(argTypes); + return new PayloadCreation(shouldCombine, constructor, preCombineField); + } + + public HoodieRecordPayload createPayload(GenericRecord record) throws Exception { + if (shouldCombine) { + ValidationUtils.checkState(preCombineField != null); + Comparable orderingVal = (Comparable) HoodieAvroUtils.getNestedFieldVal(record, + preCombineField, false, false); + return (HoodieRecordPayload) constructor.newInstance(record, orderingVal); + } else { + return (HoodieRecordPayload) this.constructor.newInstance(Option.of(record)); + } + } + + public HoodieRecordPayload createDeletePayload(BaseAvroPayload payload) throws Exception { + if (shouldCombine) { + return (HoodieRecordPayload) constructor.newInstance(null, payload.orderingVal); + } else { + return (HoodieRecordPayload) this.constructor.newInstance(Option.empty()); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java new file mode 100644 index 0000000000000..d17213dcc0493 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java @@ -0,0 +1,452 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.sink.CleanFunction; +import org.apache.hudi.sink.StreamWriteOperator; +import org.apache.hudi.sink.append.AppendWriteOperator; +import org.apache.hudi.sink.bootstrap.BootstrapOperator; +import org.apache.hudi.sink.bootstrap.batch.BatchBootstrapOperator; +import org.apache.hudi.sink.bucket.BucketBulkInsertWriterHelper; +import org.apache.hudi.sink.bucket.BucketStreamWriteOperator; +import org.apache.hudi.sink.bulk.BulkInsertWriteOperator; +import org.apache.hudi.sink.bulk.RowDataKeyGen; +import org.apache.hudi.sink.bulk.sort.SortOperatorGen; +import org.apache.hudi.sink.clustering.ClusteringCommitEvent; +import org.apache.hudi.sink.clustering.ClusteringCommitSink; +import org.apache.hudi.sink.clustering.ClusteringOperator; +import org.apache.hudi.sink.clustering.ClusteringPlanEvent; +import org.apache.hudi.sink.clustering.ClusteringPlanOperator; +import org.apache.hudi.sink.common.WriteOperatorFactory; +import org.apache.hudi.sink.compact.CompactOperator; +import org.apache.hudi.sink.compact.CompactionCommitEvent; +import org.apache.hudi.sink.compact.CompactionCommitSink; +import org.apache.hudi.sink.compact.CompactionPlanEvent; +import org.apache.hudi.sink.compact.CompactionPlanOperator; +import org.apache.hudi.sink.partitioner.BucketAssignFunction; +import org.apache.hudi.sink.partitioner.BucketIndexPartitioner; +import org.apache.hudi.sink.transform.RowDataToHoodieFunctions; +import org.apache.hudi.table.format.FilePathUtils; + +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.KeyGroupRangeAssignment; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.functions.sink.SinkFunction; +import org.apache.flink.streaming.api.operators.KeyedProcessOperator; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; + +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Utilities to generate all kinds of sub-pipelines. + */ +public class Pipelines { + + /** + * Bulk insert the input dataset at once. + * + *

+   * <p>By default, the input dataset is shuffled by the partition path first and then
+   * sorted by the partition path before being passed to the write function.
+   * The whole pipeline looks like the following:
+   *
+   * <pre>
+   *      | input1 | ===\     /=== |sorter| === | task1 | (p1, p2)
+   *                   shuffle
+   *      | input2 | ===/     \=== |sorter| === | task2 | (p3, p4)
+   *
+   *      Note: Both input1 and input2's dataset come from partitions: p1, p2, p3, p4
+   * </pre>
+   *
+   * <p>The write task switches to a new file handle each time it receives a record
+   * from a different partition path, so the shuffle and sort help reduce small files.
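+   * <p>A usage sketch (hedged): {@code conf}, {@code rowType} and {@code dataStream} below
+   * stand for the caller's own Flink configuration, schema and bounded {@code RowData} stream:
+   * <pre>{@code
+   *   // run in batch execution mode with a bounded input
+   *   Pipelines.bulkInsert(conf, rowType, dataStream);
+   * }</pre>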

    The bulk insert should be run in batch execution mode. + * + * @param conf The configuration + * @param rowType The input row type + * @param dataStream The input data stream + * @return the bulk insert data stream sink + */ + public static DataStreamSink bulkInsert(Configuration conf, RowType rowType, DataStream dataStream) { + WriteOperatorFactory operatorFactory = BulkInsertWriteOperator.getFactory(conf, rowType); + if (OptionsResolver.isBucketIndexType(conf)) { + String indexKeys = conf.getString(FlinkOptions.INDEX_KEY_FIELD); + int numBuckets = conf.getInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS); + + BucketIndexPartitioner partitioner = new BucketIndexPartitioner<>(numBuckets, indexKeys); + RowDataKeyGen keyGen = RowDataKeyGen.instance(conf, rowType); + RowType rowTypeWithFileId = BucketBulkInsertWriterHelper.rowTypeWithFileId(rowType); + InternalTypeInfo typeInfo = InternalTypeInfo.of(rowTypeWithFileId); + + Map bucketIdToFileId = new HashMap<>(); + dataStream = dataStream.partitionCustom(partitioner, keyGen::getHoodieKey) + .map(record -> BucketBulkInsertWriterHelper.rowWithFileId(bucketIdToFileId, keyGen, record, indexKeys, numBuckets), typeInfo) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); // same parallelism as write task to avoid shuffle + if (conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_INPUT)) { + SortOperatorGen sortOperatorGen = BucketBulkInsertWriterHelper.getFileIdSorterGen(rowTypeWithFileId); + dataStream = dataStream.transform("file_sorter", typeInfo, sortOperatorGen.createSortOperator()) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); // same parallelism as write task to avoid shuffle + ExecNodeUtil.setManagedMemoryWeight(dataStream.getTransformation(), + conf.getInteger(FlinkOptions.WRITE_SORT_MEMORY) * 1024L * 1024L); + } + return dataStream + .transform(opName("bucket_bulk_insert", conf), TypeInformation.of(Object.class), operatorFactory) + .uid(opUID("bucket_bulk_insert", conf)) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)) + .addSink(DummySink.INSTANCE) + .name("dummy"); + } + + final String[] partitionFields = FilePathUtils.extractPartitionKeys(conf); + if (partitionFields.length > 0) { + RowDataKeyGen rowDataKeyGen = RowDataKeyGen.instance(conf, rowType); + if (conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_SHUFFLE_INPUT)) { + + // shuffle by partition keys + // use #partitionCustom instead of #keyBy to avoid duplicate sort operations, + // see BatchExecutionUtils#applyBatchExecutionSettings for details. 
+ Partitioner partitioner = (key, channels) -> KeyGroupRangeAssignment.assignKeyToParallelOperator(key, + KeyGroupRangeAssignment.computeDefaultMaxParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)), channels); + dataStream = dataStream.partitionCustom(partitioner, rowDataKeyGen::getPartitionPath); + } + if (conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_INPUT)) { + SortOperatorGen sortOperatorGen = new SortOperatorGen(rowType, partitionFields); + // sort by partition keys + dataStream = dataStream + .transform("partition_key_sorter", + InternalTypeInfo.of(rowType), + sortOperatorGen.createSortOperator()) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); + ExecNodeUtil.setManagedMemoryWeight(dataStream.getTransformation(), + conf.getInteger(FlinkOptions.WRITE_SORT_MEMORY) * 1024L * 1024L); + } + } + return dataStream + .transform(opName("hoodie_bulk_insert_write", conf), + TypeInformation.of(Object.class), + operatorFactory) + // follow the parallelism of upstream operators to avoid shuffle + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)) + .addSink(DummySink.INSTANCE) + .name("dummy"); + } + + /** + * Insert the dataset with append mode(no upsert or deduplication). + * + *

+   * <p>The input dataset would be rebalanced among the write tasks:
+   *
+   * <pre>
+   *      | input1 | ===\     /=== | task1 | (p1, p2, p3, p4)
+   *                   shuffle
+   *      | input2 | ===/     \=== | task2 | (p1, p2, p3, p4)
+   *
+   *      Note: Both input1 and input2's dataset come from partitions: p1, p2, p3, p4
+   * </pre>
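+   * <p>A usage sketch (hedged; {@code conf}, {@code rowType} and {@code dataStream} are the
+   * caller's own objects, and the last flag marks whether the input is bounded):
+   * <pre>{@code
+   *   // unbounded input: bounded = false, which also disables the bulk-insert sort
+   *   Pipelines.clean(conf, Pipelines.append(conf, rowType, dataStream, false));
+   * }</pre>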

    The write task switches to new file handle each time it receives a record + * from the different partition path, so there may be many small files. + * + * @param conf The configuration + * @param rowType The input row type + * @param dataStream The input data stream + * @param bounded Whether the input stream is bounded + * @return the appending data stream sink + */ + public static DataStream append( + Configuration conf, + RowType rowType, + DataStream dataStream, + boolean bounded) { + if (!bounded) { + // In principle, the config should be immutable, but the boundedness + // is only visible when creating the sink pipeline. + conf.setBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_INPUT, false); + } + WriteOperatorFactory operatorFactory = AppendWriteOperator.getFactory(conf, rowType); + + return dataStream + .transform(opName("hoodie_append_write", conf), TypeInformation.of(Object.class), operatorFactory) + .uid(opUID("hoodie_stream_write", conf)) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); + } + + /** + * Constructs bootstrap pipeline as streaming. + * The bootstrap operator loads the existing data index (primary key to file id mapping), + * then sends the indexing data set to subsequent operator(usually the bucket assign operator). + */ + public static DataStream bootstrap( + Configuration conf, + RowType rowType, + DataStream dataStream) { + return bootstrap(conf, rowType, dataStream, false, false); + } + + /** + * Constructs bootstrap pipeline. + * The bootstrap operator loads the existing data index (primary key to file id mapping), + * then send the indexing data set to subsequent operator(usually the bucket assign operator). + * + * @param conf The configuration + * @param rowType The row type + * @param dataStream The data stream + * @param bounded Whether the source is bounded + * @param overwrite Whether it is insert overwrite + */ + public static DataStream bootstrap( + Configuration conf, + RowType rowType, + DataStream dataStream, + boolean bounded, + boolean overwrite) { + final boolean globalIndex = conf.getBoolean(FlinkOptions.INDEX_GLOBAL_ENABLED); + if (overwrite || OptionsResolver.isBucketIndexType(conf)) { + return rowDataToHoodieRecord(conf, rowType, dataStream); + } else if (bounded && !globalIndex && OptionsResolver.isPartitionedTable(conf)) { + return boundedBootstrap(conf, rowType, dataStream); + } else { + return streamBootstrap(conf, rowType, dataStream, bounded); + } + } + + private static DataStream streamBootstrap( + Configuration conf, + RowType rowType, + DataStream dataStream, + boolean bounded) { + DataStream dataStream1 = rowDataToHoodieRecord(conf, rowType, dataStream); + + if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED) || bounded) { + dataStream1 = dataStream1 + .transform( + "index_bootstrap", + TypeInformation.of(HoodieRecord.class), + new BootstrapOperator<>(conf)) + .setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(dataStream1.getParallelism())) + .uid(opUID("index_bootstrap", conf)); + } + + return dataStream1; + } + + /** + * Constructs bootstrap pipeline for batch execution mode. + * The indexing data set is loaded before the actual data write + * in order to support batch UPSERT. 
+ */ + private static DataStream boundedBootstrap( + Configuration conf, + RowType rowType, + DataStream dataStream) { + final RowDataKeyGen rowDataKeyGen = RowDataKeyGen.instance(conf, rowType); + // shuffle by partition keys + dataStream = dataStream + .keyBy(rowDataKeyGen::getPartitionPath); + + return rowDataToHoodieRecord(conf, rowType, dataStream) + .transform( + "batch_index_bootstrap", + TypeInformation.of(HoodieRecord.class), + new BatchBootstrapOperator<>(conf)) + .setParallelism(conf.getOptional(FlinkOptions.INDEX_BOOTSTRAP_TASKS).orElse(dataStream.getParallelism())) + .uid(opUID("batch_index_bootstrap", conf)); + } + + /** + * Transforms the row data to hoodie records. + */ + public static DataStream rowDataToHoodieRecord(Configuration conf, RowType rowType, DataStream dataStream) { + return dataStream.map(RowDataToHoodieFunctions.create(rowType, conf), TypeInformation.of(HoodieRecord.class)) + .setParallelism(dataStream.getParallelism()).name("row_data_to_hoodie_record"); + } + + /** + * The streaming write pipeline. + * + *

+   * <p>The input dataset is shuffled by the primary key first and then
+   * shuffled by the file group ID before being passed to the write function.
+   * The whole pipeline looks like the following:
+   *
+   * <pre>
+   *      | input1 | ===\     /=== | bucket assigner | ===\     /=== | task1 |
+   *                   shuffle(by PK)                    shuffle(by bucket ID)
+   *      | input2 | ===/     \=== | bucket assigner | ===/     \=== | task2 |
+   *
+   *      Note: a file group must be handled by one write task to avoid write conflict.
+   * </pre>
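+   * <p>A usage sketch (hedged; the names are placeholders for the caller's own objects):
+   * <pre>{@code
+   *   DataStream<HoodieRecord> records = Pipelines.bootstrap(conf, rowType, dataStream);
+   *   DataStream<Object> pipeline = Pipelines.hoodieStreamWrite(conf, records);
+   *   Pipelines.clean(conf, pipeline);
+   * }</pre>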

    The bucket assigner assigns the inputs to suitable file groups, the write task caches + * and flushes the data set to disk. + * + * @param conf The configuration + * @param dataStream The input data stream + * @return the stream write data stream pipeline + */ + public static DataStream hoodieStreamWrite(Configuration conf, DataStream dataStream) { + if (OptionsResolver.isBucketIndexType(conf)) { + WriteOperatorFactory operatorFactory = BucketStreamWriteOperator.getFactory(conf); + int bucketNum = conf.getInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS); + String indexKeyFields = conf.getString(FlinkOptions.INDEX_KEY_FIELD); + BucketIndexPartitioner partitioner = new BucketIndexPartitioner<>(bucketNum, indexKeyFields); + return dataStream.partitionCustom(partitioner, HoodieRecord::getKey) + .transform(opName("bucket_write", conf), TypeInformation.of(Object.class), operatorFactory) + .uid(opUID("bucket_write", conf)) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); + } else { + WriteOperatorFactory operatorFactory = StreamWriteOperator.getFactory(conf); + return dataStream + // Key-by record key, to avoid multiple subtasks write to a bucket at the same time + .keyBy(HoodieRecord::getRecordKey) + .transform( + "bucket_assigner", + TypeInformation.of(HoodieRecord.class), + new KeyedProcessOperator<>(new BucketAssignFunction<>(conf))) + .uid(opUID("bucket_assigner", conf)) + .setParallelism(conf.getInteger(FlinkOptions.BUCKET_ASSIGN_TASKS)) + // shuffle by fileId(bucket id) + .keyBy(record -> record.getCurrentLocation().getFileId()) + .transform(opName("stream_write", conf), TypeInformation.of(Object.class), operatorFactory) + .uid(opUID("stream_write", conf)) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); + } + } + + /** + * The compaction tasks pipeline. + * + *

+   * <p>The compaction plan operator monitors new compaction plans on the timeline,
+   * then distributes the sub-plans to the compaction tasks. Each compaction task then
+   * hands the metadata over to the commit task, which commits the compaction transaction.
+   * The whole pipeline looks like the following:
+   *
+   * <pre>
+   *                                     /=== | task1 | ===\
+   *      | plan generation | ===> hash                      | commit |
+   *                                     \=== | task2 | ===/
+   *
+   *      Note: both the compaction plan generation task and the commit task are singletons.
+   * </pre>
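+   * <p>A usage sketch (hedged; {@code pipeline} stands for the stream produced by the
+   * upstream write, e.g. {@link #hoodieStreamWrite}):
+   * <pre>{@code
+   *   Pipelines.compact(conf, pipeline);
+   * }</pre>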
    + * + * @param conf The configuration + * @param dataStream The input data stream + * @return the compaction pipeline + */ + public static DataStreamSink compact(Configuration conf, DataStream dataStream) { + return dataStream.transform("compact_plan_generate", + TypeInformation.of(CompactionPlanEvent.class), + new CompactionPlanOperator(conf)) + .setParallelism(1) // plan generate must be singleton + // make the distribution strategy deterministic to avoid concurrent modifications + // on the same bucket files + .keyBy(plan -> plan.getOperation().getFileGroupId().getFileId()) + .transform("compact_task", + TypeInformation.of(CompactionCommitEvent.class), + new CompactOperator(conf)) + .setParallelism(conf.getInteger(FlinkOptions.COMPACTION_TASKS)) + .addSink(new CompactionCommitSink(conf)) + .name("compact_commit") + .setParallelism(1); // compaction commit should be singleton + } + + /** + * The clustering tasks pipeline. + * + *

+   * <p>The clustering plan operator monitors new clustering plans on the timeline,
+   * then distributes the sub-plans to the clustering tasks. Each clustering task then
+   * hands the metadata over to the commit task, which commits the clustering transaction.
+   * The whole pipeline looks like the following:
+   *
+   * <pre>
+   *                                     /=== | task1 | ===\
+   *      | plan generation | ===> hash                      | commit |
+   *                                     \=== | task2 | ===/
+   *
+   *      Note: both the clustering plan generation task and the commit task are singletons.
+   * </pre>
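+   * <p>A usage sketch (hedged; {@code pipeline} stands for the stream produced by the
+   * upstream write task):
+   * <pre>{@code
+   *   Pipelines.cluster(conf, rowType, pipeline);
+   * }</pre>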
    + * + * @param conf The configuration + * @param rowType The input row type + * @param dataStream The input data stream + * @return the clustering pipeline + */ + public static DataStreamSink cluster(Configuration conf, RowType rowType, DataStream dataStream) { + DataStream clusteringStream = dataStream.transform("cluster_plan_generate", + TypeInformation.of(ClusteringPlanEvent.class), + new ClusteringPlanOperator(conf)) + .setParallelism(1) // plan generate must be singleton + .keyBy(plan -> + // make the distribution strategy deterministic to avoid concurrent modifications + // on the same bucket files + plan.getClusteringGroupInfo().getOperations() + .stream().map(ClusteringOperation::getFileId).collect(Collectors.joining())) + .transform("clustering_task", + TypeInformation.of(ClusteringCommitEvent.class), + new ClusteringOperator(conf, rowType)) + .setParallelism(conf.getInteger(FlinkOptions.CLUSTERING_TASKS)); + if (OptionsResolver.sortClusteringEnabled(conf)) { + ExecNodeUtil.setManagedMemoryWeight(clusteringStream.getTransformation(), + conf.getInteger(FlinkOptions.WRITE_SORT_MEMORY) * 1024L * 1024L); + } + return clusteringStream.addSink(new ClusteringCommitSink(conf)) + .name("clustering_commit") + .setParallelism(1); // compaction commit should be singleton + } + + public static DataStreamSink clean(Configuration conf, DataStream dataStream) { + return dataStream.addSink(new CleanFunction<>(conf)) + .setParallelism(1) + .name("clean_commits"); + } + + public static DataStreamSink dummySink(DataStream dataStream) { + return dataStream.addSink(Pipelines.DummySink.INSTANCE) + .setParallelism(1) + .name("dummy"); + } + + public static String opName(String operatorN, Configuration conf) { + return operatorN + ": " + conf.getString(FlinkOptions.TABLE_NAME); + } + + public static String opUID(String operatorN, Configuration conf) { + return "uid_" + operatorN + "_" + conf.getString(FlinkOptions.TABLE_NAME); + } + + /** + * Dummy sink that does nothing. + */ + public static class DummySink implements SinkFunction { + private static final long serialVersionUID = 1L; + public static DummySink INSTANCE = new DummySink(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/TimeWait.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/TimeWait.java new file mode 100644 index 0000000000000..0441673c33d12 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/TimeWait.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.exception.HoodieException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Objects; +import java.util.concurrent.TimeUnit; + +/** + * Tool used for time waiting. + */ +public class TimeWait { + private static final Logger LOG = LoggerFactory.getLogger(TimeWait.class); + + private final long timeout; // timeout in SECONDS + private final long interval; // interval in MILLISECONDS + private final String action; // action to report error message + + private long waitingTime = 0L; + + private TimeWait(long timeout, long interval, String action) { + this.timeout = timeout; + this.interval = interval; + this.action = action; + } + + public static Builder builder() { + return new Builder(); + } + + /** + * Wait for an interval time. + */ + public void waitFor() { + try { + if (waitingTime > timeout) { + throw new HoodieException("Timeout(" + waitingTime + "ms) while waiting for " + action); + } + TimeUnit.MILLISECONDS.sleep(interval); + waitingTime += interval; + } catch (InterruptedException e) { + throw new HoodieException("Error while waiting for " + action, e); + } + } + + /** + * Builder. + */ + public static class Builder { + private long timeout = 5 * 60 * 1000L; // default 5 minutes + private long interval = 1000; + private String action; + + private Builder() { + } + + public Builder timeout(long timeout) { + if (timeout > 0) { + this.timeout = timeout; + } + return this; + } + + public Builder interval(long interval) { + this.interval = interval; + return this; + } + + public Builder action(String action) { + this.action = action; + return this; + } + + public TimeWait build() { + Objects.requireNonNull(this.action); + return new TimeWait(this.timeout, this.interval, this.action); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java new file mode 100644 index 0000000000000..7c9aae101c669 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/FileIndex.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.source.stats.ColumnStatsIndices; +import org.apache.hudi.source.stats.ExpressionEvaluator; +import org.apache.hudi.util.DataTypeUtils; +import org.apache.hudi.util.ExpressionUtils; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * A file index which supports listing files efficiently through metadata table. + * + *

    It caches the partition paths to avoid redundant look up. + */ +public class FileIndex { + private static final Logger LOG = LoggerFactory.getLogger(FileIndex.class); + + private final Path path; + private final RowType rowType; + private final HoodieMetadataConfig metadataConfig; + private final boolean dataSkippingEnabled; + private List partitionPaths; // cache of partition paths + private List filters; // push down filters + private final boolean tableExists; + + private FileIndex(Path path, Configuration conf, RowType rowType) { + this.path = path; + this.rowType = rowType; + this.metadataConfig = metadataConfig(conf); + this.dataSkippingEnabled = conf.getBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED); + this.tableExists = StreamerUtil.tableExists(path.toString(), HadoopConfigurations.getHadoopConf(conf)); + } + + public static FileIndex instance(Path path, Configuration conf, RowType rowType) { + return new FileIndex(path, conf, rowType); + } + + /** + * Returns the partition path key and values as a list of map, each map item in the list + * is a mapping of the partition key name to its actual partition value. For example, say + * there is a file path with partition keys [key1, key2, key3]: + * + *

+   * <pre>
+   *   -- file:/// ... key1=val1/key2=val2/key3=val3
+   *   -- file:/// ... key1=val4/key2=val5/key3=val6
+   * </pre>
+   *
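+   * <p>A hypothetical invocation over such a layout (sketch; {@code conf} and {@code rowType}
+   * are the caller's own objects and the default partition name is only illustrative):
+   * <pre>{@code
+   *   FileIndex fileIndex = FileIndex.instance(new Path("file:///tmp/hudi_table"), conf, rowType);
+   *   List<Map<String, String>> partitions =
+   *       fileIndex.getPartitions(Arrays.asList("key1", "key2", "key3"), "default", true);
+   * }</pre>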

    The return list should be [{key1:val1, key2:val2, key3:val3}, {key1:val4, key2:val5, key3:val6}]. + * + * @param partitionKeys The partition key list + * @param defaultParName The default partition name for nulls + * @param hivePartition Whether the partition path is in Hive style + */ + public List> getPartitions( + List partitionKeys, + String defaultParName, + boolean hivePartition) { + if (partitionKeys.size() == 0) { + // non partitioned table + return Collections.emptyList(); + } + List partitionPaths = getOrBuildPartitionPaths(); + if (partitionPaths.size() == 1 && partitionPaths.get(0).isEmpty()) { + return Collections.emptyList(); + } + List> partitions = new ArrayList<>(); + for (String partitionPath : partitionPaths) { + String[] paths = partitionPath.split(Path.SEPARATOR); + Map partitionMapping = new LinkedHashMap<>(); + if (hivePartition) { + Arrays.stream(paths).forEach(p -> { + String[] kv = p.split("="); + if (kv.length == 2) { + partitionMapping.put(kv[0], defaultParName.equals(kv[1]) ? null : kv[1]); + } + }); + } else { + for (int i = 0; i < partitionKeys.size(); i++) { + partitionMapping.put(partitionKeys.get(i), defaultParName.equals(paths[i]) ? null : paths[i]); + } + } + partitions.add(partitionMapping); + } + return partitions; + } + + /** + * Returns all the file statuses under the table base path. + */ + public FileStatus[] getFilesInPartitions() { + if (!tableExists) { + return new FileStatus[0]; + } + String[] partitions = getOrBuildPartitionPaths().stream().map(p -> fullPartitionPath(path, p)).toArray(String[]::new); + FileStatus[] allFileStatus = FSUtils.getFilesInPartitions(HoodieFlinkEngineContext.DEFAULT, metadataConfig, path.toString(), + partitions, "/tmp/") + .values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new); + Set candidateFiles = candidateFilesInMetadataTable(allFileStatus); + if (candidateFiles == null) { + // no need to filter by col stats or error occurs. + return allFileStatus; + } + return Arrays.stream(allFileStatus).parallel() + .filter(fileStatus -> candidateFiles.contains(fileStatus.getPath().getName())) + .toArray(FileStatus[]::new); + } + + /** + * Returns the full partition path. + * + * @param basePath The base path. + * @param partitionPath The relative partition path, may be empty if the table is non-partitioned. + * @return The full partition path string + */ + private static String fullPartitionPath(Path basePath, String partitionPath) { + if (partitionPath.isEmpty()) { + return basePath.toString(); + } + return new Path(basePath, partitionPath).toString(); + } + + /** + * Reset the state of the file index. + */ + @VisibleForTesting + public void reset() { + this.partitionPaths = null; + } + + // ------------------------------------------------------------------------- + // Getter/Setter + // ------------------------------------------------------------------------- + + /** + * Sets up explicit partition paths for pruning. + */ + public void setPartitionPaths(@Nullable Set partitionPaths) { + if (partitionPaths != null) { + this.partitionPaths = new ArrayList<>(partitionPaths); + } + } + + /** + * Sets up pushed down filters. 
+ */ + public void setFilters(List filters) { + if (filters.size() > 0) { + this.filters = new ArrayList<>(filters); + } + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + /** + * Computes pruned list of candidate base-files' names based on provided list of data filters. + * conditions, by leveraging Metadata Table's Column Statistics index (hereon referred as ColStats for brevity) + * bearing "min", "max", "num_nulls" statistics for all columns. + * + *

+   * <p>NOTE: This method has to return a complete set of candidate files, since only the provided candidates will
+   * ultimately be scanned as part of query execution. Hence, this method has to maintain the
+   * invariant of conservatively including every base-file name that is NOT referenced in its index.
+   *
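+   * <p>Data skipping only takes effect when both the metadata table and the data skipping
+   * flag are enabled; a configuration sketch (hedged):
+   * <pre>{@code
+   *   conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
+   *   conf.setBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true);
+   * }</pre>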

    The {@code filters} must all be simple. + * + * @return list of pruned (data-skipped) candidate base-files' names + */ + @Nullable + private Set candidateFilesInMetadataTable(FileStatus[] allFileStatus) { + // NOTE: Data Skipping is only effective when it references columns that are indexed w/in + // the Column Stats Index (CSI). Following cases could not be effectively handled by Data Skipping: + // - Expressions on top-level column's fields (ie, for ex filters like "struct.field > 0", since + // CSI only contains stats for top-level columns, in this case for "struct") + // - Any expression not directly referencing top-level column (for ex, sub-queries, since there's + // nothing CSI in particular could be applied for) + if (!metadataConfig.enabled() || !dataSkippingEnabled) { + validateConfig(); + return null; + } + if (this.filters == null || this.filters.size() == 0) { + return null; + } + String[] referencedCols = ExpressionUtils.referencedColumns(filters); + if (referencedCols.length == 0) { + return null; + } + try { + final List colStats = ColumnStatsIndices.readColumnStatsIndex(path.toString(), metadataConfig, referencedCols); + final Pair, String[]> colStatsTable = ColumnStatsIndices.transposeColumnStatsIndex(colStats, referencedCols, rowType); + List transposedColStats = colStatsTable.getLeft(); + String[] queryCols = colStatsTable.getRight(); + if (queryCols.length == 0) { + // the indexed columns have no intersection with the referenced columns, returns early + return null; + } + RowType.RowField[] queryFields = DataTypeUtils.projectRowFields(rowType, queryCols); + + Set allIndexedFileNames = transposedColStats.stream().parallel() + .map(row -> row.getString(0).toString()) + .collect(Collectors.toSet()); + Set candidateFileNames = transposedColStats.stream().parallel() + .filter(row -> ExpressionEvaluator.filterExprs(filters, row, queryFields)) + .map(row -> row.getString(0).toString()) + .collect(Collectors.toSet()); + + // NOTE: Col-Stats Index isn't guaranteed to have complete set of statistics for every + // base-file: since it's bound to clustering, which could occur asynchronously + // at arbitrary point in time, and is not likely to be touching all the base files. + // + // To close that gap, we manually compute the difference b/w all indexed (by col-stats-index) + // files and all outstanding base-files, and make sure that all base files not + // represented w/in the index are included in the output of this method + Set nonIndexedFileNames = Arrays.stream(allFileStatus) + .map(fileStatus -> fileStatus.getPath().getName()).collect(Collectors.toSet()); + nonIndexedFileNames.removeAll(allIndexedFileNames); + + candidateFileNames.addAll(nonIndexedFileNames); + return candidateFileNames; + } catch (Throwable throwable) { + LOG.warn("Read column stats for data skipping error", throwable); + return null; + } + } + + private void validateConfig() { + if (dataSkippingEnabled && !metadataConfig.enabled()) { + LOG.warn("Data skipping requires Metadata Table to be enabled! " + + "isMetadataTableEnabled = {}", metadataConfig.enabled()); + } + } + + /** + * Returns all the relative partition paths. + * + *

    The partition paths are cached once invoked. + */ + public List getOrBuildPartitionPaths() { + if (this.partitionPaths != null) { + return this.partitionPaths; + } + this.partitionPaths = this.tableExists + ? FSUtils.getAllPartitionPaths(HoodieFlinkEngineContext.DEFAULT, metadataConfig, path.toString()) + : Collections.emptyList(); + return this.partitionPaths; + } + + private static HoodieMetadataConfig metadataConfig(org.apache.flink.configuration.Configuration conf) { + Properties properties = new Properties(); + + // set up metadata.enabled=true in table DDL to enable metadata listing + properties.put(HoodieMetadataConfig.ENABLE.key(), conf.getBoolean(FlinkOptions.METADATA_ENABLED)); + + return HoodieMetadataConfig.newBuilder().fromProperties(properties).build(); + } + + @VisibleForTesting + public List getFilters() { + return filters; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java new file mode 100644 index 0000000000000..2dd86d652869f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.log.InstantRange; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.sink.partitioner.profile.WriteProfiles; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import scala.Serializable; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; + +/** + * Utilities to generate incremental input splits {@link MergeOnReadInputSplit}. + * The input splits are used for streaming and incremental read. + * + *

+ * <p>How to generate the input splits:
+ * <ol>
+ *   <li>first fetch all the commit metadata for the incremental instants;</li>
+ *   <li>resolve the incremental commit file paths;</li>
+ *   <li>filter the full file paths by required partitions;</li>
+ *   <li>use the file paths from #step 3 as the back-up of the filesystem view.</li>
+ * </ol>
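+ * <p>A usage sketch (hedged; {@code splitsProvider} stands for an instance assembled through
+ * {@link #builder()}, and {@code issuedInstant} is the last instant delivered downstream):
+ * <pre>{@code
+ *   // streaming read: only instants after the issued instant are considered
+ *   Result result = splitsProvider.inputSplits(metaClient, hadoopConf, issuedInstant);
+ *   // batch/incremental read: the commit range comes from the read start/end options
+ *   Result batchResult = splitsProvider.inputSplits(metaClient, hadoopConf);
+ * }</pre>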
    + */ +public class IncrementalInputSplits implements Serializable { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(IncrementalInputSplits.class); + private final Configuration conf; + private final Path path; + private final RowType rowType; + private final long maxCompactionMemoryInBytes; + // for partition pruning + private final Set requiredPartitions; + // skip compaction + private final boolean skipCompaction; + + private IncrementalInputSplits( + Configuration conf, + Path path, + RowType rowType, + long maxCompactionMemoryInBytes, + @Nullable Set requiredPartitions, + boolean skipCompaction) { + this.conf = conf; + this.path = path; + this.rowType = rowType; + this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes; + this.requiredPartitions = requiredPartitions; + this.skipCompaction = skipCompaction; + } + + /** + * Returns the builder. + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Returns the incremental input splits. + * + * @param metaClient The meta client + * @param hadoopConf The hadoop configuration + * @return The list of incremental input splits or empty if there are no new instants + */ + public Result inputSplits( + HoodieTableMetaClient metaClient, + org.apache.hadoop.conf.Configuration hadoopConf) { + HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(); + if (commitTimeline.empty()) { + LOG.warn("No splits found for the table under path " + path); + return Result.EMPTY; + } + + final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT); + final String endCommit = this.conf.getString(FlinkOptions.READ_END_COMMIT); + final boolean startFromEarliest = FlinkOptions.START_COMMIT_EARLIEST.equalsIgnoreCase(startCommit); + final boolean startOutOfRange = startCommit != null && commitTimeline.isBeforeTimelineStarts(startCommit); + final boolean endOutOfRange = endCommit != null && commitTimeline.isBeforeTimelineStarts(endCommit); + boolean fullTableScan = startFromEarliest || startOutOfRange || endOutOfRange; + + // Step1: find out the files to read, tries to read the files from the commit metadata first, + // fallback to full table scan if any of the following conditions matches: + // 1. there are files in metadata be deleted; + // 2. read from earliest + // 3. the start commit is archived + // 4. the end commit is archived + Set readPartitions; + final FileStatus[] fileStatuses; + List instants = filterInstantsWithRange(commitTimeline, null); + if (fullTableScan) { + // scans the partitions and files directly. 
+ FileIndex fileIndex = getFileIndex(); + readPartitions = new HashSet<>(fileIndex.getOrBuildPartitionPaths()); + if (readPartitions.size() == 0) { + LOG.warn("No partitions found for reading in user provided path."); + return Result.EMPTY; + } + fileStatuses = fileIndex.getFilesInPartitions(); + } else { + if (instants.size() == 0) { + LOG.info("No new instant found for the table under path " + path + ", skip reading"); + return Result.EMPTY; + } + String tableName = conf.getString(FlinkOptions.TABLE_NAME); + List metadataList = instants.stream() + .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline)).collect(Collectors.toList()); + readPartitions = getReadPartitions(metadataList); + if (readPartitions.size() == 0) { + LOG.warn("No partitions found for reading in user provided path."); + return Result.EMPTY; + } + FileStatus[] files = WriteProfiles.getRawWritePathsOfInstants(path, hadoopConf, metadataList, metaClient.getTableType()); + FileSystem fs = FSUtils.getFs(path.toString(), hadoopConf); + if (Arrays.stream(files).anyMatch(fileStatus -> !StreamerUtil.fileExists(fs, fileStatus.getPath()))) { + LOG.warn("Found deleted files in metadata, fall back to full table scan."); + // fallback to full table scan + fullTableScan = true; + // reading from the earliest, scans the partitions and files directly. + FileIndex fileIndex = getFileIndex(); + readPartitions = new HashSet<>(fileIndex.getOrBuildPartitionPaths()); + if (readPartitions.size() == 0) { + LOG.warn("No partitions found for reading in user provided path."); + return Result.EMPTY; + } + fileStatuses = fileIndex.getFilesInPartitions(); + } else { + fileStatuses = files; + } + } + + if (fileStatuses.length == 0) { + LOG.warn("No files found for reading in user provided path."); + return Result.EMPTY; + } + + // Step2: generates the instant range + // if the specified end commit is archived, still uses the specified timestamp, + // else uses the latest filtered instant time + // (would be the latest instant time if the specified end commit is greater than the latest instant time) + final String rangeEnd = endOutOfRange ? endCommit : instants.get(instants.size() - 1).getTimestamp(); + // keep the same semantics with streaming read, default start from the latest commit + final String rangeStart = startFromEarliest ? null : (startCommit == null ? rangeEnd : startCommit); + final InstantRange instantRange; + if (!fullTableScan) { + instantRange = InstantRange.builder().startInstant(rangeStart).endInstant(rangeEnd) + .rangeType(InstantRange.RangeType.CLOSE_CLOSE).build(); + } else if (startFromEarliest && endCommit == null) { + // short-cut for snapshot read + instantRange = null; + } else { + instantRange = InstantRange.builder().startInstant(rangeStart).endInstant(rangeEnd) + .rangeType(InstantRange.RangeType.CLOSE_CLOSE).nullableBoundary(true).build(); + } + + // Step3: decides the read end commit + final String endInstant = fullTableScan + ? commitTimeline.lastInstant().get().getTimestamp() + : instants.get(instants.size() - 1).getTimestamp(); + + List inputSplits = getInputSplits(metaClient, commitTimeline, + fileStatuses, readPartitions, endInstant, instantRange); + + return Result.instance(inputSplits, endInstant); + } + + /** + * Returns the incremental input splits. 
+ * + * @param metaClient The meta client + * @param hadoopConf The hadoop configuration + * @param issuedInstant The last issued instant, only valid in streaming read + * @return The list of incremental input splits or empty if there are no new instants + */ + public Result inputSplits( + HoodieTableMetaClient metaClient, + org.apache.hadoop.conf.Configuration hadoopConf, + String issuedInstant) { + metaClient.reloadActiveTimeline(); + HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(); + if (commitTimeline.empty()) { + LOG.warn("No splits found for the table under path " + path); + return Result.EMPTY; + } + List instants = filterInstantsWithRange(commitTimeline, issuedInstant); + // get the latest instant that satisfies condition + final HoodieInstant instantToIssue = instants.size() == 0 ? null : instants.get(instants.size() - 1); + final InstantRange instantRange; + if (instantToIssue != null) { + if (issuedInstant != null) { + // the streaming reader may record the last issued instant, if the issued instant is present, + // the instant range should be: (issued instant, the latest instant]. + instantRange = InstantRange.builder().startInstant(issuedInstant).endInstant(instantToIssue.getTimestamp()) + .rangeType(InstantRange.RangeType.OPEN_CLOSE).build(); + } else if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent()) { + // first time consume and has a start commit + final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT); + instantRange = startCommit.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST) + ? null + : InstantRange.builder().startInstant(startCommit).endInstant(instantToIssue.getTimestamp()) + .rangeType(InstantRange.RangeType.CLOSE_CLOSE).build(); + } else { + // first time consume and no start commit, consumes the latest incremental data set. + instantRange = InstantRange.builder().startInstant(instantToIssue.getTimestamp()).endInstant(instantToIssue.getTimestamp()) + .rangeType(InstantRange.RangeType.CLOSE_CLOSE).build(); + } + } else { + LOG.info("No new instant found for the table under path " + path + ", skip reading"); + return Result.EMPTY; + } + + String tableName = conf.getString(FlinkOptions.TABLE_NAME); + + Set readPartitions; + final FileStatus[] fileStatuses; + + if (instantRange == null) { + // reading from the earliest, scans the partitions and files directly. 
+ FileIndex fileIndex = getFileIndex(); + readPartitions = new HashSet<>(fileIndex.getOrBuildPartitionPaths()); + if (readPartitions.size() == 0) { + LOG.warn("No partitions found for reading in user provided path."); + return Result.EMPTY; + } + fileStatuses = fileIndex.getFilesInPartitions(); + } else { + List activeMetadataList = instants.stream() + .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline)).collect(Collectors.toList()); + List archivedMetadataList = getArchivedMetadata(metaClient, instantRange, commitTimeline, tableName); + if (archivedMetadataList.size() > 0) { + LOG.warn("\n" + + "--------------------------------------------------------------------------------\n" + + "---------- caution: the reader has fall behind too much from the writer,\n" + + "---------- tweak 'read.tasks' option to add parallelism of read tasks.\n" + + "--------------------------------------------------------------------------------"); + } + List metadataList = archivedMetadataList.size() > 0 + // IMPORTANT: the merged metadata list must be in ascending order by instant time + ? mergeList(archivedMetadataList, activeMetadataList) + : activeMetadataList; + + readPartitions = getReadPartitions(metadataList); + if (readPartitions.size() == 0) { + LOG.warn("No partitions found for reading in user provided path."); + return Result.EMPTY; + } + fileStatuses = WriteProfiles.getWritePathsOfInstants(path, hadoopConf, metadataList, metaClient.getTableType()); + } + + if (fileStatuses.length == 0) { + LOG.warn("No files found for reading in user provided path."); + return Result.EMPTY; + } + + final String endInstant = instantToIssue.getTimestamp(); + List inputSplits = getInputSplits(metaClient, commitTimeline, + fileStatuses, readPartitions, endInstant, instantRange); + + return Result.instance(inputSplits, endInstant); + } + + private List getInputSplits( + HoodieTableMetaClient metaClient, + HoodieTimeline commitTimeline, + FileStatus[] fileStatuses, + Set readPartitions, + String endInstant, + InstantRange instantRange) { + final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses); + final AtomicInteger cnt = new AtomicInteger(0); + final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE); + return readPartitions.stream() + .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant) + .map(fileSlice -> { + Option> logPaths = Option.ofNullable(fileSlice.getLogFiles() + .sorted(HoodieLogFile.getLogFileComparator()) + .map(logFile -> logFile.getPath().toString()) + .collect(Collectors.toList())); + String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null); + return new MergeOnReadInputSplit(cnt.getAndAdd(1), + basePath, logPaths, endInstant, + metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange, fileSlice.getFileId()); + }).collect(Collectors.toList())) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } + + private FileIndex getFileIndex() { + FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(path.toUri()), conf, rowType); + if (this.requiredPartitions != null) { + // apply partition push down + fileIndex.setPartitionPaths(this.requiredPartitions); + } + return fileIndex; + } + + /** + * Returns the partitions to read with given metadata list. + * The partitions would be filtered by the pushed down required partitions. 
+ * + * @param metadataList The metadata list + * @return the set of read partitions + */ + private Set getReadPartitions(List metadataList) { + Set partitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList); + // apply partition push down + if (this.requiredPartitions != null) { + return partitions.stream() + .filter(this.requiredPartitions::contains).collect(Collectors.toSet()); + } + return partitions; + } + + /** + * Returns the archived metadata in case the reader consumes untimely or it wants + * to read from the earliest. + * + *

    Note: should improve it with metadata table when the metadata table is stable enough. + * + * @param metaClient The meta client + * @param instantRange The instant range to filter the timeline instants + * @param commitTimeline The commit timeline + * @param tableName The table name + * @return the list of archived metadata, or empty if there is no need to read the archived timeline + */ + private List getArchivedMetadata( + HoodieTableMetaClient metaClient, + InstantRange instantRange, + HoodieTimeline commitTimeline, + String tableName) { + if (commitTimeline.isBeforeTimelineStarts(instantRange.getStartInstant())) { + // read the archived metadata if the start instant is archived. + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(instantRange.getStartInstant()); + HoodieTimeline archivedCompleteTimeline = archivedTimeline.getCommitsTimeline().filterCompletedInstants(); + if (!archivedCompleteTimeline.empty()) { + Stream instantStream = archivedCompleteTimeline.getInstants(); + return maySkipCompaction(instantStream) + .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, archivedTimeline)).collect(Collectors.toList()); + } + } + return Collections.emptyList(); + } + + /** + * Returns the instants with a given issuedInstant to start from. + * + * @param commitTimeline The completed commits timeline + * @param issuedInstant The last issued instant that has already been delivered to downstream + * @return the filtered hoodie instants + */ + @VisibleForTesting + public List filterInstantsWithRange( + HoodieTimeline commitTimeline, + final String issuedInstant) { + HoodieTimeline completedTimeline = commitTimeline.filterCompletedInstants(); + if (issuedInstant != null) { + // returns early for streaming mode + return maySkipCompaction(completedTimeline.getInstants()) + .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN, issuedInstant)) + .collect(Collectors.toList()); + } + + Stream instantStream = completedTimeline.getInstants(); + + if (OptionsResolver.hasNoSpecificReadCommits(this.conf)) { + // by default read from the latest commit + List instants = completedTimeline.getInstants().collect(Collectors.toList()); + if (instants.size() > 1) { + return Collections.singletonList(instants.get(instants.size() - 1)); + } + return instants; + } + + if (OptionsResolver.isSpecificStartCommit(this.conf)) { + final String startCommit = this.conf.get(FlinkOptions.READ_START_COMMIT); + instantStream = instantStream + .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, startCommit)); + } + if (this.conf.getOptional(FlinkOptions.READ_END_COMMIT).isPresent()) { + final String endCommit = this.conf.get(FlinkOptions.READ_END_COMMIT); + instantStream = instantStream.filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), LESSER_THAN_OR_EQUALS, endCommit)); + } + return maySkipCompaction(instantStream).collect(Collectors.toList()); + } + + private Stream maySkipCompaction(Stream instants) { + return this.skipCompaction + ? 
instants.filter(instant -> !instant.getAction().equals(HoodieTimeline.COMMIT_ACTION)) + : instants; + } + + private static List mergeList(List list1, List list2) { + List merged = new ArrayList<>(list1); + merged.addAll(list2); + return merged; + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * Represents a result of calling {@link #inputSplits}. + */ + public static class Result { + private final List inputSplits; // input splits + private final String endInstant; // end instant to consume to + + public static final Result EMPTY = instance(Collections.emptyList(), ""); + + public boolean isEmpty() { + return this.inputSplits.size() == 0; + } + + public List getInputSplits() { + return this.inputSplits; + } + + public String getEndInstant() { + return this.endInstant; + } + + private Result(List inputSplits, String endInstant) { + this.inputSplits = inputSplits; + this.endInstant = endInstant; + } + + public static Result instance(List inputSplits, String endInstant) { + return new Result(inputSplits, endInstant); + } + } + + /** + * Builder for {@link IncrementalInputSplits}. + */ + public static class Builder { + private Configuration conf; + private Path path; + private RowType rowType; + private long maxCompactionMemoryInBytes; + // for partition pruning + private Set requiredPartitions; + // skip compaction + private boolean skipCompaction = false; + + public Builder() { + } + + public Builder conf(Configuration conf) { + this.conf = conf; + return this; + } + + public Builder path(Path path) { + this.path = path; + return this; + } + + public Builder rowType(RowType rowType) { + this.rowType = rowType; + return this; + } + + public Builder maxCompactionMemoryInBytes(long maxCompactionMemoryInBytes) { + this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes; + return this; + } + + public Builder requiredPartitions(@Nullable Set requiredPartitions) { + this.requiredPartitions = requiredPartitions; + return this; + } + + public Builder skipCompaction(boolean skipCompaction) { + this.skipCompaction = skipCompaction; + return this; + } + + public IncrementalInputSplits build() { + return new IncrementalInputSplits( + Objects.requireNonNull(this.conf), Objects.requireNonNull(this.path), Objects.requireNonNull(this.rowType), + this.maxCompactionMemoryInBytes, this.requiredPartitions, this.skipCompaction); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java new file mode 100644 index 0000000000000..3318cecf10369 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeutils.base.StringSerializer; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.source.RichSourceFunction; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.table.types.logical.RowType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +/** + * This is the single (non-parallel) monitoring task which takes a {@link MergeOnReadInputSplit} + * , it is responsible for: + * + *

      + *
1. Monitoring a user-provided hoodie table path.
+ * 2. Deciding which files (or splits) should be further read and processed.
+ * 3. Creating the {@link MergeOnReadInputSplit splits} corresponding to those files.
+ * 4. Assigning them to downstream tasks for further processing.
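+ *
+ * A rough usage sketch, for illustration only: {@code env}, {@code typeInfo}, the input format
+ * instance and the parallelism values below are placeholders, not part of this class.
+ *
+ *   StreamReadMonitoringFunction monitor = new StreamReadMonitoringFunction(
+ *       conf, path, rowType, maxCompactionMemoryInBytes, requiredPartitionPaths);
+ *   env.addSource(monitor)
+ *       .setParallelism(1)  // the monitoring task must stay non-parallel
+ *       .transform("split_reader", typeInfo, StreamReadOperator.factory(mergeOnReadInputFormat))
+ *       .setParallelism(4); // split readers may run in parallel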
    + * + *

    The splits to be read are forwarded to the downstream {@link StreamReadOperator} + * which can have parallelism greater than one. + * + *

    IMPORTANT NOTE: Splits are forwarded downstream for reading in ascending instant commits time order, + * in each downstream task, the splits are also read in receiving sequence. We do not ensure split consuming sequence + * among the downstream tasks. + */ +public class StreamReadMonitoringFunction + extends RichSourceFunction implements CheckpointedFunction { + private static final Logger LOG = LoggerFactory.getLogger(StreamReadMonitoringFunction.class); + + private static final long serialVersionUID = 1L; + + /** + * The path to monitor. + */ + private final Path path; + + /** + * The interval between consecutive path scans. + */ + private final long interval; + + private transient Object checkpointLock; + + private volatile boolean isRunning = true; + + private String issuedInstant; + + private transient ListState instantState; + + private final Configuration conf; + + private transient org.apache.hadoop.conf.Configuration hadoopConf; + + private HoodieTableMetaClient metaClient; + + private final IncrementalInputSplits incrementalInputSplits; + + public StreamReadMonitoringFunction( + Configuration conf, + Path path, + RowType rowType, + long maxCompactionMemoryInBytes, + @Nullable Set requiredPartitionPaths) { + this.conf = conf; + this.path = path; + this.interval = conf.getInteger(FlinkOptions.READ_STREAMING_CHECK_INTERVAL); + this.incrementalInputSplits = IncrementalInputSplits.builder() + .conf(conf) + .path(path) + .rowType(rowType) + .maxCompactionMemoryInBytes(maxCompactionMemoryInBytes) + .requiredPartitions(requiredPartitionPaths) + .skipCompaction(conf.getBoolean(FlinkOptions.READ_STREAMING_SKIP_COMPACT)) + .build(); + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + + ValidationUtils.checkState(this.instantState == null, + "The " + getClass().getSimpleName() + " has already been initialized."); + + this.instantState = context.getOperatorStateStore().getListState( + new ListStateDescriptor<>( + "file-monitoring-state", + StringSerializer.INSTANCE + ) + ); + + if (context.isRestored()) { + LOG.info("Restoring state for the class {} with table {} and base path {}.", + getClass().getSimpleName(), conf.getString(FlinkOptions.TABLE_NAME), path); + + List retrievedStates = new ArrayList<>(); + for (String entry : this.instantState.get()) { + retrievedStates.add(entry); + } + + ValidationUtils.checkArgument(retrievedStates.size() <= 1, + getClass().getSimpleName() + " retrieved invalid state."); + + if (retrievedStates.size() == 1 && issuedInstant != null) { + // this is the case where we have both legacy and new state. + // the two should be mutually exclusive for the operator, thus we throw the exception. 
+ + throw new IllegalArgumentException( + "The " + getClass().getSimpleName() + " has already restored from a previous Flink version."); + + } else if (retrievedStates.size() == 1) { + this.issuedInstant = retrievedStates.get(0); + if (LOG.isDebugEnabled()) { + LOG.debug("{} retrieved a issued instant of time {} for table {} with path {}.", + getClass().getSimpleName(), issuedInstant, conf.get(FlinkOptions.TABLE_NAME), path); + } + } + } + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + this.hadoopConf = HadoopConfigurations.getHadoopConf(conf); + } + + @Override + public void run(SourceFunction.SourceContext context) throws Exception { + checkpointLock = context.getCheckpointLock(); + while (isRunning) { + synchronized (checkpointLock) { + monitorDirAndForwardSplits(context); + } + TimeUnit.SECONDS.sleep(interval); + } + } + + @Nullable + private HoodieTableMetaClient getOrCreateMetaClient() { + if (this.metaClient != null) { + return this.metaClient; + } + if (StreamerUtil.tableExists(this.path.toString(), hadoopConf)) { + this.metaClient = StreamerUtil.createMetaClient(this.path.toString(), hadoopConf); + return this.metaClient; + } + // fallback + return null; + } + + @VisibleForTesting + public void monitorDirAndForwardSplits(SourceContext context) { + HoodieTableMetaClient metaClient = getOrCreateMetaClient(); + if (metaClient == null) { + // table does not exist + return; + } + IncrementalInputSplits.Result result = + incrementalInputSplits.inputSplits(metaClient, this.hadoopConf, this.issuedInstant); + if (result.isEmpty()) { + // no new instants, returns early + return; + } + + for (MergeOnReadInputSplit split : result.getInputSplits()) { + context.collect(split); + } + // update the issues instant time + this.issuedInstant = result.getEndInstant(); + LOG.info("\n" + + "------------------------------------------------------------\n" + + "---------- consumed to instant: {}\n" + + "------------------------------------------------------------", + this.issuedInstant); + } + + @Override + public void close() throws Exception { + super.close(); + + if (checkpointLock != null) { + synchronized (checkpointLock) { + issuedInstant = null; + isRunning = false; + } + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Closed File Monitoring Source for path: " + path + "."); + } + } + + @Override + public void cancel() { + if (checkpointLock != null) { + // this is to cover the case where cancel() is called before the run() + synchronized (checkpointLock) { + issuedInstant = null; + isRunning = false; + } + } else { + issuedInstant = null; + isRunning = false; + } + } + + // ------------------------------------------------------------------------- + // Checkpointing + // ------------------------------------------------------------------------- + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + this.instantState.clear(); + if (this.issuedInstant != null) { + this.instantState.add(this.issuedInstant); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java new file mode 100644 index 0000000000000..bf6dc98f4de06 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source; + +import org.apache.hudi.adapter.AbstractStreamOperatorAdapter; +import org.apache.hudi.adapter.AbstractStreamOperatorFactoryAdapter; +import org.apache.hudi.adapter.MailboxExecutorAdapter; +import org.apache.hudi.adapter.Utils; +import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; + +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.runtime.state.JavaSerializer; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Queue; +import java.util.concurrent.LinkedBlockingDeque; + +/** + * The operator that reads the {@link MergeOnReadInputSplit splits} received from the preceding {@link + * StreamReadMonitoringFunction}. Contrary to the {@link StreamReadMonitoringFunction} which has a parallelism of 1, + * this operator can have multiple parallelism. + * + *

    As soon as an input split {@link MergeOnReadInputSplit} is received, it is put into a queue, + * the {@code MailboxExecutor} read the actual data of the split. + * This architecture allows the separation of split reading from processing the checkpoint barriers, + * thus removing any potential back-pressure. + */ +public class StreamReadOperator extends AbstractStreamOperatorAdapter + implements OneInputStreamOperator { + + private static final Logger LOG = LoggerFactory.getLogger(StreamReadOperator.class); + + private static final int MINI_BATCH_SIZE = 2048; + + // It's the same thread that runs this operator and checkpoint actions. Use this executor to schedule only + // splits for subsequent reading, so that a new checkpoint could be triggered without blocking a long time + // for exhausting all scheduled split reading tasks. + private final MailboxExecutorAdapter executor; + + private MergeOnReadInputFormat format; + + private transient SourceFunction.SourceContext sourceContext; + + private transient ListState inputSplitsState; + + private transient Queue splits; + + // Splits are read by the same thread that calls #processElement. Each read task is submitted to that thread by adding + // them to the executor. This state is used to ensure that only one read task is in that splits queue at a time, so that + // read tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this is set to RUNNING. + // When there are no more files to read, this will be set to IDLE. + private transient volatile SplitState currentSplitState; + + private StreamReadOperator(MergeOnReadInputFormat format, ProcessingTimeService timeService, + MailboxExecutorAdapter mailboxExecutor) { + this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); + this.processingTimeService = timeService; + this.executor = Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + + // TODO Replace Java serialization with Avro approach to keep state compatibility. + inputSplitsState = context.getOperatorStateStore().getListState( + new ListStateDescriptor<>("splits", new JavaSerializer<>())); + + // Initialize the current split state to IDLE. + currentSplitState = SplitState.IDLE; + + // Recover splits state from flink state backend if possible. + splits = new LinkedBlockingDeque<>(); + if (context.isRestored()) { + int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask(); + LOG.info("Restoring state for operator {} (task ID: {}).", getClass().getSimpleName(), subtaskIdx); + + for (MergeOnReadInputSplit split : inputSplitsState.get()) { + splits.add(split); + } + } + + this.sourceContext = Utils.getSourceContext( + getOperatorConfig().getTimeCharacteristic(), + getProcessingTimeService(), + getContainingTask(), + output, + getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval()); + + // Enqueue to process the recovered input splits. 
+ enqueueProcessSplits(); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + super.snapshotState(context); + + inputSplitsState.clear(); + inputSplitsState.addAll(new ArrayList<>(splits)); + } + + @Override + public void processElement(StreamRecord element) { + splits.add(element.getValue()); + enqueueProcessSplits(); + } + + private void enqueueProcessSplits() { + if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) { + currentSplitState = SplitState.RUNNING; + executor.execute(this::processSplits, "process input split"); + } + } + + private void processSplits() throws IOException { + MergeOnReadInputSplit split = splits.peek(); + if (split == null) { + currentSplitState = SplitState.IDLE; + return; + } + + // 1. open a fresh new input split and start reading as mini-batch + // 2. if the input split has remaining records to read, switches to another runnable to handle + // 3. if the input split reads to the end, close the format and remove the split from the queue #splits + // 4. for each runnable, reads at most #MINI_BATCH_SIZE number of records + if (format.isClosed()) { + // This log is important to indicate the consuming process, + // there is only one log message for one data bucket. + LOG.info("Processing input split : {}", split); + format.open(split); + } + try { + consumeAsMiniBatch(split); + } finally { + currentSplitState = SplitState.IDLE; + } + + // Re-schedule to process the next split. + enqueueProcessSplits(); + } + + /** + * Consumes at most {@link #MINI_BATCH_SIZE} number of records + * for the given input split {@code split}. + * + *
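+ * For example, a split holding 5000 records is drained over three mailbox runs of at most
+ * {@code MINI_BATCH_SIZE} (2048) records each; because each run re-enqueues itself through
+ * {@code enqueueProcessSplits()} instead of looping, checkpoint barriers can be processed
+ * between the runs.
+ *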

    Note: close the input format and remove the input split for the queue {@link #splits} + * if the split reads to the end. + * + * @param split The input split + */ + private void consumeAsMiniBatch(MergeOnReadInputSplit split) throws IOException { + for (int i = 0; i < MINI_BATCH_SIZE; i++) { + if (!format.reachedEnd()) { + sourceContext.collect(format.nextRecord(null)); + split.consume(); + } else { + // close the input format + format.close(); + // remove the split + splits.poll(); + break; + } + } + } + + @Override + public void processWatermark(Watermark mark) { + // we do nothing because we emit our own watermarks if needed. + } + + @Override + public void close() throws Exception { + super.close(); + + if (format != null) { + format.close(); + format.closeInputFormat(); + format = null; + } + + sourceContext = null; + } + + @Override + public void finish() throws Exception { + super.finish(); + output.close(); + if (sourceContext != null) { + sourceContext.emitWatermark(Watermark.MAX_WATERMARK); + sourceContext.close(); + sourceContext = null; + } + } + + public static OneInputStreamOperatorFactory factory(MergeOnReadInputFormat format) { + return new OperatorFactory(format); + } + + private enum SplitState { + IDLE, RUNNING + } + + private static class OperatorFactory extends AbstractStreamOperatorFactoryAdapter + implements OneInputStreamOperatorFactory { + + private final MergeOnReadInputFormat format; + + private OperatorFactory(MergeOnReadInputFormat format) { + this.format = format; + } + + @SuppressWarnings("unchecked") + @Override + public > O createStreamOperator(StreamOperatorParameters parameters) { + StreamReadOperator operator = new StreamReadOperator(format, processingTimeService, getMailboxExecutorAdapter()); + operator.setup(parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + return (O) operator; + } + + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return StreamReadOperator.class; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/ColumnStatsIndices.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/ColumnStatsIndices.java new file mode 100644 index 0000000000000..428661844eac4 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/ColumnStatsIndices.java @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source.stats; + +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.common.util.hash.ColumnIndexID; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.metadata.HoodieMetadataPayload; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.AvroToRowDataConverters; +import org.apache.hudi.util.RowDataProjection; + +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; + +/** + * Utilities for abstracting away heavy-lifting of interactions with Metadata Table's Column Stats Index, + * providing convenient interfaces to read it, transpose, etc. + */ +public class ColumnStatsIndices { + private static final DataType METADATA_DATA_TYPE = getMetadataDataType(); + private static final DataType COL_STATS_DATA_TYPE = getColStatsDataType(); + private static final int[] COL_STATS_TARGET_POS = getColStatsTargetPos(); + + // the column schema: + // |- file_name: string + // |- min_val: row + // |- max_val: row + // |- null_cnt: long + // |- val_cnt: long + // |- column_name: string + private static final int ORD_FILE_NAME = 0; + private static final int ORD_MIN_VAL = 1; + private static final int ORD_MAX_VAL = 2; + private static final int ORD_NULL_CNT = 3; + private static final int ORD_VAL_CNT = 4; + private static final int ORD_COL_NAME = 5; + + private ColumnStatsIndices() { + } + + public static List readColumnStatsIndex(String basePath, HoodieMetadataConfig metadataConfig, String[] targetColumns) { + // NOTE: If specific columns have been provided, we can considerably trim down amount of data fetched + // by only fetching Column Stats Index records pertaining to the requested columns. 
+ // Otherwise, we fall back to read whole Column Stats Index + ValidationUtils.checkArgument(targetColumns.length > 0, + "Column stats is only valid when push down filters have referenced columns"); + final List metadataRows = readColumnStatsIndexByColumns(basePath, targetColumns, metadataConfig); + return projectNestedColStatsColumns(metadataRows); + } + + private static List projectNestedColStatsColumns(List rows) { + int pos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos(); + RowDataProjection projection = RowDataProjection.instanceV2((RowType) COL_STATS_DATA_TYPE.getLogicalType(), COL_STATS_TARGET_POS); + return rows.stream().parallel() + .map(row -> { + RowData columnStatsField = row.getRow(pos, 9); + return projection.project(columnStatsField); + }).collect(Collectors.toList()); + } + + /** + * Transposes and converts the raw table format of the Column Stats Index representation, + * where each row/record corresponds to individual (column, file) pair, into the table format + * where each row corresponds to single file with statistic for individual columns collated + * w/in such row: + *

    + * Metadata Table Column Stats Index format: + * + *

    +   *  +---------------------------+------------+------------+------------+-------------+
    +   *  |        fileName           | columnName |  minValue  |  maxValue  |  num_nulls  |
    +   *  +---------------------------+------------+------------+------------+-------------+
    +   *  | one_base_file.parquet     |          A |          1 |         10 |           0 |
    +   *  | another_base_file.parquet |          A |        -10 |          0 |           5 |
    +   *  +---------------------------+------------+------------+------------+-------------+
    +   * 
    + *

    + * Returned table format + * + *

    +   *  +---------------------------+------------+------------+-------------+
    +   *  |          file             | A_minValue | A_maxValue | A_nullCount |
    +   *  +---------------------------+------------+------------+-------------+
    +   *  | one_base_file.parquet     |          1 |         10 |           0 |
    +   *  | another_base_file.parquet |        -10 |          0 |           5 |
    +   *  +---------------------------+------------+------------+-------------+
    +   * 
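+ *
+ * Concretely, each transposed row is a {@code GenericRowData} with {@code 2 + 3 * N} fields for
+ * {@code N} (sorted) target columns, laid out as follows (using column A above as an example):
+ *
+ *   row[0] = file name
+ *   row[1] = value count
+ *   row[2], row[3], row[4] = A_minValue, A_maxValue, A_nullCount
+ *   (each further column adds another min/max/nullCount triplet)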
    + *

    + * NOTE: Column Stats Index might potentially contain statistics for many columns (if not all), while + * query at hand might only be referencing a handful of those. As such, we collect all the + * column references from the filtering expressions, and only transpose records corresponding to the + * columns referenced in those + * + * @param colStats RowData list bearing raw Column Stats Index table + * @param queryColumns target columns to be included into the final table + * @param tableSchema schema of the source data table + * @return reshaped table according to the format outlined above + */ + public static Pair, String[]> transposeColumnStatsIndex(List colStats, String[] queryColumns, RowType tableSchema) { + + Map tableFieldTypeMap = tableSchema.getFields().stream() + .collect(Collectors.toMap(RowType.RowField::getName, RowType.RowField::getType)); + + // NOTE: We have to collect list of indexed columns to make sure we properly align the rows + // w/in the transposed dataset: since some files might not have all the columns indexed + // either due to the Column Stats Index config changes, schema evolution, etc. we have + // to make sure that all the rows w/in transposed data-frame are properly padded (with null + // values) for such file-column combinations + Set indexedColumns = colStats.stream().map(row -> row.getString(ORD_COL_NAME) + .toString()).collect(Collectors.toSet()); + + // NOTE: We're sorting the columns to make sure final index schema matches layout + // of the transposed table + TreeSet sortedTargetColumns = Arrays.stream(queryColumns).sorted() + .filter(indexedColumns::contains) + .collect(Collectors.toCollection(TreeSet::new)); + + final Map converters = new ConcurrentHashMap<>(); + Map> fileNameToRows = colStats.stream().parallel() + .filter(row -> sortedTargetColumns.contains(row.getString(ORD_COL_NAME).toString())) + .map(row -> { + if (row.isNullAt(ORD_MIN_VAL) && row.isNullAt(ORD_MAX_VAL)) { + // Corresponding row could be null in either of the 2 cases + // - Column contains only null values (in that case both min/max have to be nulls) + // - This is a stubbed Column Stats record (used as a tombstone) + return row; + } else { + String colName = row.getString(ORD_COL_NAME).toString(); + LogicalType colType = tableFieldTypeMap.get(colName); + return unpackMinMaxVal(row, colType, converters); + } + }).collect(Collectors.groupingBy(rowData -> rowData.getString(ORD_FILE_NAME))); + + return Pair.of(foldRowsByFiles(sortedTargetColumns, fileNameToRows), sortedTargetColumns.toArray(new String[0])); + } + + private static List foldRowsByFiles( + TreeSet sortedTargetColumns, + Map> fileNameToRows) { + return fileNameToRows.values().stream().parallel().map(rows -> { + // Rows seq is always non-empty (otherwise it won't be grouped into) + StringData fileName = rows.get(0).getString(ORD_FILE_NAME); + long valueCount = rows.get(0).getLong(ORD_VAL_CNT); + + // To properly align individual rows (corresponding to a file) w/in the transposed projection, we need + // to align existing column-stats for individual file with the list of expected ones for the + // whole transposed projection (a superset of all files) + Map columnRowsMap = rows.stream() + .collect(Collectors.toMap(row -> row.getString(ORD_COL_NAME).toString(), row -> row)); + SortedMap alignedColumnRowsMap = new TreeMap<>(); + sortedTargetColumns.forEach(col -> alignedColumnRowsMap.put(col, columnRowsMap.get(col))); + + List columnStats = alignedColumnRowsMap.values().stream().map(row -> { + if (row == null) { + // 
NOTE: Since we're assuming missing column to essentially contain exclusively + // null values, we set null-count to be equal to value-count (this behavior is + // consistent with reading non-existent columns from Parquet) + return Tuple3.of(null, null, valueCount); + } else { + GenericRowData gr = (GenericRowData) row; + return Tuple3.of(gr.getField(ORD_MIN_VAL), gr.getField(ORD_MAX_VAL), gr.getField(ORD_NULL_CNT)); + } + }).collect(Collectors.toList()); + GenericRowData foldedRow = new GenericRowData(2 + 3 * columnStats.size()); + foldedRow.setField(0, fileName); + foldedRow.setField(1, valueCount); + for (int i = 0; i < columnStats.size(); i++) { + Tuple3 stats = columnStats.get(i); + int startPos = 2 + 3 * i; + foldedRow.setField(startPos, stats.f0); + foldedRow.setField(startPos + 1, stats.f1); + foldedRow.setField(startPos + 2, stats.f2); + } + return foldedRow; + }).collect(Collectors.toList()); + } + + private static RowData unpackMinMaxVal( + RowData row, + LogicalType colType, + Map converters) { + + RowData minValueStruct = row.getRow(ORD_MIN_VAL, 1); + RowData maxValueStruct = row.getRow(ORD_MAX_VAL, 1); + + checkState(minValueStruct != null && maxValueStruct != null, + "Invalid Column Stats record: either both min/max have to be null, or both have to be non-null"); + + Object minValue = tryUnpackNonNullVal(minValueStruct, colType, converters); + Object maxValue = tryUnpackNonNullVal(maxValueStruct, colType, converters); + + // the column schema: + // |- file_name: string + // |- min_val: row + // |- max_val: row + // |- null_cnt: long + // |- val_cnt: long + // |- column_name: string + + GenericRowData unpackedRow = new GenericRowData(row.getArity()); + unpackedRow.setField(0, row.getString(0)); + unpackedRow.setField(1, minValue); + unpackedRow.setField(2, maxValue); + unpackedRow.setField(3, row.getLong(3)); + unpackedRow.setField(4, row.getLong(4)); + unpackedRow.setField(5, row.getString(5)); + + return unpackedRow; + } + + private static Object tryUnpackNonNullVal( + RowData rowData, + LogicalType colType, + Map converters) { + for (int i = 0; i < rowData.getArity(); i++) { + // row data converted from avro is definitely generic. 
+ Object nested = ((GenericRowData) rowData).getField(i); + if (nested != null) { + return doUnpack(nested, colType, converters); + } + } + return null; + } + + private static Object doUnpack( + Object rawVal, + LogicalType logicalType, + Map converters) { + AvroToRowDataConverters.AvroToRowDataConverter converter = + converters.computeIfAbsent(logicalType, k -> AvroToRowDataConverters.createConverter(logicalType)); + return converter.convert(rawVal); + } + + private static List readColumnStatsIndexByColumns( + String basePath, + String[] targetColumns, + HoodieMetadataConfig metadataConfig) { + + // Read Metadata Table's Column Stats Index into Flink's RowData list by + // - Fetching the records from CSI by key-prefixes (encoded column names) + // - Deserializing fetched records into [[RowData]]s + HoodieTableMetadata metadataTable = HoodieTableMetadata.create( + HoodieFlinkEngineContext.DEFAULT, + metadataConfig, basePath, + FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue()); + + // TODO encoding should be done internally w/in HoodieBackedTableMetadata + List encodedTargetColumnNames = Arrays.stream(targetColumns) + .map(colName -> new ColumnIndexID(colName).asBase64EncodedString()).collect(Collectors.toList()); + + HoodieData> records = + metadataTable.getRecordsByKeyPrefixes(encodedTargetColumnNames, HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS, false); + + org.apache.hudi.util.AvroToRowDataConverters.AvroToRowDataConverter converter = + AvroToRowDataConverters.createRowConverter((RowType) METADATA_DATA_TYPE.getLogicalType()); + return records.collectAsList().stream().parallel().map(record -> { + // schema and props are ignored for generating metadata record from the payload + // instead, the underlying file system, or bloom filter, or columns stats metadata (part of payload) are directly used + GenericRecord genericRecord; + try { + genericRecord = (GenericRecord) record.getData().getInsertValue(null, null).orElse(null); + } catch (IOException e) { + throw new HoodieException("Exception while getting insert value from metadata payload"); + } + return (RowData) converter.convert(genericRecord); + } + ).collect(Collectors.toList()); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + private static class Tuple3 { + public Object f0; + public Object f1; + public Object f2; + + private Tuple3(Object f0, Object f1, Object f2) { + this.f0 = f0; + this.f1 = f1; + this.f2 = f2; + } + + public static Tuple3 of(Object f0, Object f1, Object f2) { + return new Tuple3(f0, f1, f2); + } + } + + private static DataType getMetadataDataType() { + return AvroSchemaConverter.convertToDataType(HoodieMetadataRecord.SCHEMA$); + } + + private static DataType getColStatsDataType() { + int pos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos(); + return METADATA_DATA_TYPE.getChildren().get(pos); + } + + // the column schema: + // |- file_name: string + // |- min_val: row + // |- max_val: row + // |- null_cnt: long + // |- val_cnt: long + // |- column_name: string + private static int[] getColStatsTargetPos() { + RowType colStatsRowType = (RowType) COL_STATS_DATA_TYPE.getLogicalType(); + return Stream.of( + HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, + HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, + HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, + HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, + 
HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, + HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME) + .mapToInt(colStatsRowType::getFieldIndex) + .toArray(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/ExpressionEvaluator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/ExpressionEvaluator.java new file mode 100644 index 0000000000000..08ded144e0a4b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/ExpressionEvaluator.java @@ -0,0 +1,556 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source.stats; + +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.util.ExpressionUtils; + +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.expressions.ValueLiteralExpression; +import org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.flink.table.functions.FunctionDefinition; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; + +import javax.validation.constraints.NotNull; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Tool to evaluate the {@link org.apache.flink.table.expressions.ResolvedExpression}s. + */ +public class ExpressionEvaluator { + private static final int IN_PREDICATE_LIMIT = 200; + + /** + * Filter the index row with specific data filters and query fields. + * + * @param filters The pushed down data filters + * @param indexRow The index row + * @param queryFields The query fields referenced by the filters + * @return true if the index row should be considered as a candidate + */ + public static boolean filterExprs(List filters, RowData indexRow, RowType.RowField[] queryFields) { + for (ResolvedExpression filter : filters) { + if (!Evaluator.bindCall((CallExpression) filter, indexRow, queryFields).eval()) { + return false; + } + } + return true; + } + + /** + * Used for deciding whether the literal values match the column stats. + * The evaluator can be nested. 
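+ *
+ * For example (illustrative), a pushed down filter {@code a > 5 AND b IS NULL} is bound to
+ * {@code And(GreaterThan(a, 5), IsNull(b))}, and {@link ExpressionEvaluator#filterExprs} keeps a
+ * file only when every bound evaluator returns {@code true} against that file's index row:
+ *
+ *   boolean candidate = ExpressionEvaluator.filterExprs(filters, indexRow, queryFields);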
+ */ + public abstract static class Evaluator { + // the constant literal value + protected Object val; + + // column stats + protected Object minVal; + protected Object maxVal; + protected long nullCnt = 0; + + // referenced field type + protected LogicalType type; + + /** + * Binds the evaluator with specific call expression. + * + *

    Three steps to bind the call: + * 1. map the evaluator instance; + * 2. bind the field reference; + * 3. bind the column stats. + * + *
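+ * For example (illustrative), binding {@code price = 100} resolves an {@link EqualTo} evaluator,
+ * binds it to the {@code price} field reference and the literal {@code 100}, and reads that
+ * column's min/max/null-count slots from the index row.
+ *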

    Normalize the expression to simplify the subsequent decision logic: + * always put the literal expression in the RHS. + */ + public static Evaluator bindCall(CallExpression call, RowData indexRow, RowType.RowField[] queryFields) { + FunctionDefinition funDef = call.getFunctionDefinition(); + List childExprs = call.getChildren(); + + boolean normalized = childExprs.get(0) instanceof FieldReferenceExpression; + final Evaluator evaluator; + + if (BuiltInFunctionDefinitions.NOT.equals(funDef)) { + evaluator = Not.getInstance(); + Evaluator childEvaluator = bindCall((CallExpression) childExprs.get(0), indexRow, queryFields); + return ((Not) evaluator).bindEvaluator(childEvaluator); + } + + if (BuiltInFunctionDefinitions.AND.equals(funDef)) { + evaluator = And.getInstance(); + Evaluator evaluator1 = bindCall((CallExpression) childExprs.get(0), indexRow, queryFields); + Evaluator evaluator2 = bindCall((CallExpression) childExprs.get(1), indexRow, queryFields); + return ((And) evaluator).bindEvaluator(evaluator1, evaluator2); + } + + if (BuiltInFunctionDefinitions.OR.equals(funDef)) { + evaluator = Or.getInstance(); + Evaluator evaluator1 = bindCall((CallExpression) childExprs.get(0), indexRow, queryFields); + Evaluator evaluator2 = bindCall((CallExpression) childExprs.get(1), indexRow, queryFields); + return ((Or) evaluator).bindEvaluator(evaluator1, evaluator2); + } + + // handle IN specifically + if (BuiltInFunctionDefinitions.IN.equals(funDef)) { + ValidationUtils.checkState(normalized, "The IN expression expects to be normalized"); + evaluator = In.getInstance(); + FieldReferenceExpression rExpr = (FieldReferenceExpression) childExprs.get(0); + evaluator.bindFieldReference(rExpr); + ((In) evaluator).bindVals(getInLiteralVals(childExprs)); + return evaluator.bindColStats(indexRow, queryFields, rExpr); + } + + // handle unary operators + if (BuiltInFunctionDefinitions.IS_NULL.equals(funDef)) { + FieldReferenceExpression rExpr = (FieldReferenceExpression) childExprs.get(0); + return IsNull.getInstance() + .bindFieldReference(rExpr) + .bindColStats(indexRow, queryFields, rExpr); + } else if (BuiltInFunctionDefinitions.IS_NOT_NULL.equals(funDef)) { + FieldReferenceExpression rExpr = (FieldReferenceExpression) childExprs.get(0); + return IsNotNull.getInstance() + .bindFieldReference(rExpr) + .bindColStats(indexRow, queryFields, rExpr); + } + + // handle binary operators + if (BuiltInFunctionDefinitions.EQUALS.equals(funDef)) { + evaluator = EqualTo.getInstance(); + } else if (BuiltInFunctionDefinitions.NOT_EQUALS.equals(funDef)) { + evaluator = NotEqualTo.getInstance(); + } else if (BuiltInFunctionDefinitions.LESS_THAN.equals(funDef)) { + evaluator = normalized ? LessThan.getInstance() : GreaterThan.getInstance(); + } else if (BuiltInFunctionDefinitions.GREATER_THAN.equals(funDef)) { + evaluator = normalized ? GreaterThan.getInstance() : LessThan.getInstance(); + } else if (BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL.equals(funDef)) { + evaluator = normalized ? LessThanOrEqual.getInstance() : GreaterThanOrEqual.getInstance(); + } else if (BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL.equals(funDef)) { + evaluator = normalized ? GreaterThanOrEqual.getInstance() : LessThanOrEqual.getInstance(); + } else { + throw new AssertionError("Unexpected function definition " + funDef); + } + FieldReferenceExpression rExpr = normalized + ? (FieldReferenceExpression) childExprs.get(0) + : (FieldReferenceExpression) childExprs.get(1); + ValueLiteralExpression vExpr = normalized + ? 
(ValueLiteralExpression) childExprs.get(1) + : (ValueLiteralExpression) childExprs.get(0); + evaluator + .bindFieldReference(rExpr) + .bindVal(vExpr) + .bindColStats(indexRow, queryFields, rExpr); + return evaluator; + } + + public Evaluator bindColStats( + RowData indexRow, + RowType.RowField[] queryFields, + FieldReferenceExpression expr) { + int colPos = -1; + for (int i = 0; i < queryFields.length; i++) { + if (expr.getName().equals(queryFields[i].getName())) { + colPos = i; + } + } + ValidationUtils.checkState(colPos != -1, "Can not find column " + expr.getName()); + int startPos = 2 + colPos * 3; + LogicalType colType = queryFields[colPos].getType(); + Object minVal = indexRow.isNullAt(startPos) ? null : getValAsJavaObj(indexRow, startPos, colType); + Object maxVal = indexRow.isNullAt(startPos + 1) ? null : getValAsJavaObj(indexRow, startPos + 1, colType); + long nullCnt = indexRow.getLong(startPos + 2); + + this.minVal = minVal; + this.maxVal = maxVal; + this.nullCnt = nullCnt; + return this; + } + + public Evaluator bindVal(ValueLiteralExpression vExpr) { + this.val = ExpressionUtils.getValueFromLiteral(vExpr); + return this; + } + + public Evaluator bindFieldReference(FieldReferenceExpression expr) { + this.type = expr.getOutputDataType().getLogicalType(); + return this; + } + + public abstract boolean eval(); + } + + /** + * To evaluate = expr. + */ + public static class EqualTo extends Evaluator { + + public static EqualTo getInstance() { + return new EqualTo(); + } + + @Override + public boolean eval() { + if (this.minVal == null || this.maxVal == null || this.val == null) { + return false; + } + if (compare(this.minVal, this.val, this.type) > 0) { + return false; + } + return compare(this.maxVal, this.val, this.type) >= 0; + } + } + + /** + * To evaluate <> expr. + */ + public static class NotEqualTo extends Evaluator { + public static NotEqualTo getInstance() { + return new NotEqualTo(); + } + + @Override + public boolean eval() { + // because the bounds are not necessarily a min or max value, this cannot be answered using + // them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + return true; + } + } + + /** + * To evaluate IS NULL expr. + */ + public static class IsNull extends Evaluator { + public static IsNull getInstance() { + return new IsNull(); + } + + @Override + public boolean eval() { + return this.nullCnt > 0; + } + } + + /** + * To evaluate IS NOT NULL expr. + */ + public static class IsNotNull extends Evaluator { + public static IsNotNull getInstance() { + return new IsNotNull(); + } + + @Override + public boolean eval() { + // should consider FLOAT/DOUBLE & NAN + return this.minVal != null || this.nullCnt <= 0; + } + } + + /** + * To evaluate < expr. + */ + public static class LessThan extends Evaluator { + public static LessThan getInstance() { + return new LessThan(); + } + + @Override + public boolean eval() { + if (this.minVal == null) { + return false; + } + return compare(this.minVal, this.val, this.type) < 0; + } + } + + /** + * To evaluate > expr. + */ + public static class GreaterThan extends Evaluator { + public static GreaterThan getInstance() { + return new GreaterThan(); + } + + @Override + public boolean eval() { + if (this.maxVal == null) { + return false; + } + return compare(this.maxVal, this.val, this.type) > 0; + } + } + + /** + * To evaluate <= expr. 
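+ * For example, with column stats {@code minVal = 10}, the predicate {@code col <= 5} cannot match
+ * any record in the file, so {@code eval()} returns {@code false} and the file is pruned; with
+ * {@code minVal = 3} it returns {@code true}, since the file may contain matching records.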
+ */ + public static class LessThanOrEqual extends Evaluator { + public static LessThanOrEqual getInstance() { + return new LessThanOrEqual(); + } + + @Override + public boolean eval() { + if (this.minVal == null) { + return false; + } + return compare(this.minVal, this.val, this.type) <= 0; + } + } + + /** + * To evaluate >= expr. + */ + public static class GreaterThanOrEqual extends Evaluator { + public static GreaterThanOrEqual getInstance() { + return new GreaterThanOrEqual(); + } + + @Override + public boolean eval() { + if (this.maxVal == null) { + return false; + } + return compare(this.maxVal, this.val, this.type) >= 0; + } + } + + /** + * To evaluate IN expr. + */ + public static class In extends Evaluator { + public static In getInstance() { + return new In(); + } + + private Object[] vals; + + @Override + public boolean eval() { + if (this.minVal == null) { + return false; // values are all null and literalSet cannot contain null. + } + + if (vals.length > IN_PREDICATE_LIMIT) { + // skip evaluating the predicate if the number of values is too big + return true; + } + + vals = Arrays.stream(vals).filter(v -> compare(this.minVal, v, this.type) <= 0).toArray(); + if (vals.length == 0) { // if all values are less than lower bound, rows cannot match. + return false; + } + + vals = Arrays.stream(vals).filter(v -> compare(this.maxVal, v, this.type) >= 0).toArray(); + if (vals.length == 0) { // if all remaining values are greater than upper bound, rows cannot match. + return false; + } + + return true; + } + + public void bindVals(Object... vals) { + this.vals = vals; + } + } + + // component predicate + + /** + * To evaluate NOT expr. + */ + public static class Not extends Evaluator { + public static Not getInstance() { + return new Not(); + } + + private Evaluator evaluator; + + @Override + public boolean eval() { + return !this.evaluator.eval(); + } + + public Evaluator bindEvaluator(Evaluator evaluator) { + this.evaluator = evaluator; + return this; + } + } + + /** + * To evaluate AND expr. + */ + public static class And extends Evaluator { + public static And getInstance() { + return new And(); + } + + private Evaluator[] evaluators; + + @Override + public boolean eval() { + for (Evaluator evaluator : evaluators) { + if (!evaluator.eval()) { + return false; + } + } + return true; + } + + public Evaluator bindEvaluator(Evaluator... evaluators) { + this.evaluators = evaluators; + return this; + } + } + + /** + * To evaluate OR expr. + */ + public static class Or extends Evaluator { + public static Or getInstance() { + return new Or(); + } + + private Evaluator[] evaluators; + + @Override + public boolean eval() { + for (Evaluator evaluator : evaluators) { + if (evaluator.eval()) { + return true; + } + } + return false; + } + + public Evaluator bindEvaluator(Evaluator... 
evaluators) { + this.evaluators = evaluators; + return this; + } + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private static int compare(@NotNull Object val1, @NotNull Object val2, LogicalType logicalType) { + switch (logicalType.getTypeRoot()) { + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIME_WITHOUT_TIME_ZONE: + case DATE: + return ((Long) val1).compareTo((Long) val2); + case BOOLEAN: + return ((Boolean) val1).compareTo((Boolean) val2); + case TINYINT: + case SMALLINT: + case INTEGER: + return ((Integer) val1).compareTo((Integer) val2); + case FLOAT: + return ((Float) val1).compareTo((Float) val2); + case DOUBLE: + return ((Double) val1).compareTo((Double) val2); + case BINARY: + case VARBINARY: + return compareBytes((byte[]) val1, (byte[]) val2); + case CHAR: + case VARCHAR: + return ((String) val1).compareTo((String) val2); + case DECIMAL: + return ((BigDecimal) val1).compareTo((BigDecimal) val2); + default: + throw new UnsupportedOperationException("Unsupported type: " + logicalType); + } + } + + private static int compareBytes(byte[] v1, byte[] v2) { + int len1 = v1.length; + int len2 = v2.length; + int lim = Math.min(len1, len2); + + int k = 0; + while (k < lim) { + byte c1 = v1[k]; + byte c2 = v2[k]; + if (c1 != c2) { + return c1 - c2; + } + k++; + } + return len1 - len2; + } + + /** + * Returns the IN expression literal values. + */ + private static Object[] getInLiteralVals(List childExprs) { + List vals = new ArrayList<>(); + for (int i = 1; i < childExprs.size(); i++) { + vals.add(ExpressionUtils.getValueFromLiteral((ValueLiteralExpression) childExprs.get(i))); + } + return vals.toArray(); + } + + /** + * Returns the value as Java object at position {@code pos} of row {@code indexRow}. + */ + private static Object getValAsJavaObj(RowData indexRow, int pos, LogicalType colType) { + switch (colType.getTypeRoot()) { + // NOTE: Since we can't rely on Avro's "date", and "timestamp-micros" logical-types, we're + // manually encoding corresponding values as int and long w/in the Column Stats Index and + // here we have to decode those back into corresponding logical representation. 
+ case TIMESTAMP_WITHOUT_TIME_ZONE: + TimestampType tsType = (TimestampType) colType; + return indexRow.getTimestamp(pos, tsType.getPrecision()).getMillisecond(); + case TIME_WITHOUT_TIME_ZONE: + case DATE: + return indexRow.getLong(pos); + // NOTE: All integral types of size less than Int are encoded as Ints in MT + case BOOLEAN: + return indexRow.getBoolean(pos); + case TINYINT: + case SMALLINT: + case INTEGER: + return indexRow.getInt(pos); + case FLOAT: + return indexRow.getFloat(pos); + case DOUBLE: + return indexRow.getDouble(pos); + case BINARY: + case VARBINARY: + return indexRow.getBinary(pos); + case CHAR: + case VARCHAR: + return indexRow.getString(pos).toString(); + case DECIMAL: + DecimalType decimalType = (DecimalType) colType; + return indexRow.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); + default: + throw new UnsupportedOperationException("Unsupported type: " + colType); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java new file mode 100644 index 0000000000000..f022b04ea1acc --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java @@ -0,0 +1,430 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.streamer; + +import org.apache.hudi.client.utils.OperationConverter; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.keygen.constant.KeyGeneratorType; +import org.apache.hudi.util.FlinkStateBackendConverter; +import org.apache.hudi.util.StreamerUtil; + +import com.beust.jcommander.Parameter; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.StateBackend; +import org.apache.flink.runtime.state.hashmap.HashMapStateBackend; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.configuration.FlinkOptions.PARTITION_FORMAT_DAY; + +/** + * Configurations for Hoodie Flink streamer. 
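+ *
+ * A minimal parsing sketch, for illustration; the surrounding entry point is a placeholder rather
+ * than necessarily the streamer's actual main class:
+ *
+ *   FlinkStreamerConfig config = new FlinkStreamerConfig();
+ *   JCommander.newBuilder().addObject(config).build().parse(args);
+ *   // fields such as config.kafkaTopic and config.targetBasePath are now populated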
+ */ +public class FlinkStreamerConfig extends Configuration { + @Parameter(names = {"--kafka-topic"}, description = "Kafka topic name.", required = true) + public String kafkaTopic; + + @Parameter(names = {"--kafka-group-id"}, description = "Kafka consumer group id.", required = true) + public String kafkaGroupId; + + @Parameter(names = {"--kafka-bootstrap-servers"}, description = "Kafka bootstrap.servers.", required = true) + public String kafkaBootstrapServers; + + @Parameter(names = {"--flink-checkpoint-path"}, description = "Flink checkpoint path.") + public String flinkCheckPointPath; + + @Parameter(names = {"--flink-state-backend-type"}, description = "Flink state backend type, support only hashmap and rocksdb by now," + + " default hashmap.", converter = FlinkStateBackendConverter.class) + public StateBackend stateBackend = new HashMapStateBackend(); + + @Parameter(names = {"--instant-retry-times"}, description = "Times to retry when latest instant has not completed.") + public String instantRetryTimes = "10"; + + @Parameter(names = {"--instant-retry-interval"}, description = "Seconds between two tries when latest instant has not completed.") + public String instantRetryInterval = "1"; + + @Parameter(names = {"--target-base-path"}, + description = "Base path for the target hoodie table. " + + "(Will be created if did not exist first time around. If exists, expected to be a hoodie table).", + required = true) + public String targetBasePath; + + @Parameter(names = {"--target-table"}, description = "Name of the target table in Hive.", required = true) + public String targetTableName; + + @Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ.", required = true) + public String tableType; + + @Parameter(names = {"--insert-cluster"}, description = "Whether to merge small files for insert mode, " + + "if true, the write throughput will decrease because the read/write of existing small file, default false.") + public Boolean insertCluster = false; + + @Parameter(names = {"--props"}, description = "Path to properties file on localfs or dfs, with configurations for " + + "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are " + + "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. For sources, refer" + + "to individual classes, for supported properties.") + public String propsFilePath = ""; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--props\") can also be passed command line using this parameter.") + public List configs = new ArrayList<>(); + + @Parameter(names = {"--record-key-field"}, description = "Record key field. Value to be used as the `recordKey` component of `HoodieKey`.\n" + + "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using " + + "the dot notation eg: `a.b.c`. By default `uuid`.") + public String recordKeyField = "uuid"; + + @Parameter(names = {"--partition-path-field"}, description = "Partition path field. Value to be used at \n" + + "the `partitionPath` component of `HoodieKey`. Actual value obtained by invoking .toString(). 
By default `partitionpath`.") + public String partitionPathField = "partitionpath"; + + @Parameter(names = {"--keygen-class"}, description = "Key generator class, that implements will extract the key out of incoming record.") + public String keygenClass; + + @Parameter(names = {"--keygen-type"}, description = "Key generator type, that implements will extract the key out of incoming record \n" + + "By default `SIMPLE`.") + public String keygenType = KeyGeneratorType.SIMPLE.name(); + + @Parameter(names = {"--source-ordering-field"}, description = "Field within source record to decide how" + + " to break ties between records with same key in input data. Default: 'ts' holding unix timestamp of record.") + public String sourceOrderingField = "ts"; + + @Parameter(names = {"--payload-class"}, description = "Subclass of HoodieRecordPayload, that works off " + + "a GenericRecord. Implement your own, if you want to do something other than overwriting existing value.") + public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); + + @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input " + + "is purely new data/inserts to gain speed).", converter = OperationConverter.class) + public WriteOperationType operation = WriteOperationType.UPSERT; + + @Parameter(names = {"--filter-dupes"}, + description = "Should duplicate records from source be dropped/filtered out before insert/bulk-insert.") + public Boolean preCombine = false; + + @Parameter(names = {"--commit-on-errors"}, description = "Commit even when some records failed to be written.") + public Boolean commitOnErrors = false; + + @Parameter(names = {"--transformer-class"}, + description = "A subclass or a list of subclasses of org.apache.hudi.sink.transform.Transformer" + + ". Allows transforming raw source DataStream to a target DataStream (conforming to target schema) before " + + "writing. Default : Not set. Pass a comma-separated list of subclass names to chain the transformations.") + public List transformerClassNames = null; + + @Parameter(names = {"--metadata-enabled"}, description = "Enable the internal metadata table which serves table metadata like level file listings, default false.") + public Boolean metadataEnabled = false; + + @Parameter(names = {"--metadata-compaction-delta_commits"}, description = "Max delta commits for metadata table to trigger compaction, default 10.") + public Integer metadataCompactionDeltaCommits = 10; + + @Parameter(names = {"--write-partition-format"}, description = "Partition path format, default is 'yyyyMMdd'.") + public String writePartitionFormat = PARTITION_FORMAT_DAY; + + @Parameter(names = {"--write-rate-limit"}, description = "Write record rate limit per second to prevent traffic jitter and improve stability, default 0 (no limit).") + public Long writeRateLimit = 0L; + + @Parameter(names = {"--write-parquet-block-size"}, description = "Parquet RowGroup size. It's recommended to make this large enough that scan costs can be" + + " amortized by packing enough column values into a single row group.") + public Integer writeParquetBlockSize = 120; + + @Parameter(names = {"--write-parquet-max-file-size"}, description = "Target size for parquet files produced by Hudi write phases. " + + "For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.") + public Integer writeParquetMaxFileSize = 120; + + @Parameter(names = {"--parquet-page-size"}, description = "Parquet page size. 
Page is the unit of read within a parquet file. " + + "Within a block, pages are compressed separately.") + public Integer parquetPageSize = 1; + + /** + * Flink checkpoint interval. + */ + @Parameter(names = {"--checkpoint-interval"}, description = "Flink checkpoint interval.") + public Long checkpointInterval = 1000 * 5L; + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + @Parameter(names = {"--index-bootstrap-num"}, description = "Parallelism of tasks that do bucket assign, default is 4.") + public Integer indexBootstrapNum = 4; + + @Parameter(names = {"--bucket-assign-num"}, description = "Parallelism of tasks that do bucket assign, default is 4.") + public Integer bucketAssignNum = 4; + + @Parameter(names = {"--write-task-num"}, description = "Parallelism of tasks that do actual write, default is 4.") + public Integer writeTaskNum = 4; + + @Parameter(names = {"--partition-default-name"}, + description = "The default partition name in case the dynamic partition column value is null/empty string") + public String partitionDefaultName = DEFAULT_PARTITION_PATH; + + @Parameter(names = {"--index-bootstrap-enabled"}, + description = "Whether to bootstrap the index state from existing hoodie table, default false") + public Boolean indexBootstrapEnabled = false; + + @Parameter(names = {"--index-state-ttl"}, description = "Index state ttl in days, default stores the index permanently") + public Double indexStateTtl = 0D; + + @Parameter(names = {"--index-global-enabled"}, description = "Whether to update index for the old partition path " + + "if same key record with different partition path came in, default true") + public Boolean indexGlobalEnabled = true; + + @Parameter(names = {"--index-partition-regex"}, + description = "Whether to load partitions in state if partition path matching, default *") + public String indexPartitionRegex = ".*"; + + @Parameter(names = {"--source-avro-schema-path"}, description = "Source avro schema file path, the parsed schema is used for deserialization") + public String sourceAvroSchemaPath = ""; + + @Parameter(names = {"--source-avro-schema"}, description = "Source avro schema string, the parsed schema is used for deserialization") + public String sourceAvroSchema = ""; + + @Parameter(names = {"--utc-timezone"}, description = "Use UTC timezone or local timezone to the conversion between epoch" + + " time and LocalDateTime. Hive 0.x/1.x/2.x use local timezone. 
But Hive 3.x" + + " use UTC timezone, by default true") + public Boolean utcTimezone = true; + + @Parameter(names = {"--write-partition-url-encode"}, description = "Whether to encode the partition path url, default false") + public Boolean writePartitionUrlEncode = false; + + @Parameter(names = {"--hive-style-partitioning"}, description = "Whether to use Hive style partitioning.\n" + + "If set true, the names of partition folders follow = format.\n" + + "By default false (the names of partition folders are only partition values)") + public Boolean hiveStylePartitioning = false; + + @Parameter(names = {"--write-task-max-size"}, description = "Maximum memory in MB for a write task, when the threshold hits,\n" + + "it flushes the max size data bucket to avoid OOM, default 1GB") + public Double writeTaskMaxSize = 1024D; + + @Parameter(names = {"--write-batch-size"}, + description = "Batch buffer size in MB to flush data into the underneath filesystem, default 256MB") + public Double writeBatchSize = 256D; + + @Parameter(names = {"--write-log-block-size"}, description = "Max log block size in MB for log file, default 128MB") + public Integer writeLogBlockSize = 128; + + @Parameter(names = {"--write-log-max-size"}, + description = "Maximum size allowed in MB for a log file before it is rolled over to the next version, default 1GB") + public Integer writeLogMaxSize = 1024; + + @Parameter(names = {"--write-merge-max-memory"}, description = "Max memory in MB for merge, default 100MB") + public Integer writeMergeMaxMemory = 100; + + @Parameter(names = {"--compaction-async-enabled"}, description = "Async Compaction, enabled by default for MOR") + public Boolean compactionAsyncEnabled = true; + + @Parameter(names = {"--compaction-tasks"}, description = "Parallelism of tasks that do actual compaction, default is 10") + public Integer compactionTasks = 10; + + @Parameter(names = {"--compaction-trigger-strategy"}, + description = "Strategy to trigger compaction, options are 'num_commits': trigger compaction when reach N delta commits;\n" + + "'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction;\n" + + "'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied;\n" + + "'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied.\n" + + "Default is 'num_commits'") + public String compactionTriggerStrategy = FlinkOptions.NUM_COMMITS; + + @Parameter(names = {"--compaction-delta-commits"}, description = "Max delta commits needed to trigger compaction, default 5 commits") + public Integer compactionDeltaCommits = 5; + + @Parameter(names = {"--compaction-delta-seconds"}, description = "Max delta seconds time needed to trigger compaction, default 1 hour") + public Integer compactionDeltaSeconds = 3600; + + @Parameter(names = {"--compaction-max-memory"}, description = "Max memory in MB for compaction spillable map, default 100MB") + public Integer compactionMaxMemory = 100; + + @Parameter(names = {"--compaction-target-io"}, description = "Target IO per compaction (both read and write), default 500 GB") + public Long compactionTargetIo = 512000L; + + @Parameter(names = {"--clean-async-enabled"}, description = "Whether to cleanup the old commits immediately on new commits, enabled by default") + public Boolean cleanAsyncEnabled = true; + + @Parameter(names = {"--clean-policy"}, + description = "Clean policy to manage the Hudi table. Available option: KEEP_LATEST_COMMITS, KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_BY_HOURS." 
+ + "Default is KEEP_LATEST_COMMITS.") + public String cleanPolicy = HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name(); + + @Parameter(names = {"--clean-retain-commits"}, + description = "Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled).\n" + + "This also directly translates into how much you can incrementally pull on this table, default 10") + public Integer cleanRetainCommits = 10; + + @Parameter(names = {"--clean-retain-file-versions"}, + description = "Number of file versions to retain. Each file group will be retained for this number of version. default 5") + public Integer cleanRetainFileVersions = 5; + + @Parameter(names = {"--archive-max-commits"}, + description = "Max number of commits to keep before archiving older commits into a sequential log, default 30") + public Integer archiveMaxCommits = 30; + + @Parameter(names = {"--archive-min-commits"}, + description = "Min number of commits to keep before archiving older commits into a sequential log, default 20") + public Integer archiveMinCommits = 20; + + @Parameter(names = {"--hive-sync-enable"}, description = "Asynchronously sync Hive meta to HMS, default false") + public Boolean hiveSyncEnabled = false; + + @Parameter(names = {"--hive-sync-db"}, description = "Database name for hive sync, default 'default'") + public String hiveSyncDb = "default"; + + @Parameter(names = {"--hive-sync-table"}, description = "Table name for hive sync, default 'unknown'") + public String hiveSyncTable = "unknown"; + + @Parameter(names = {"--hive-sync-file-format"}, description = "File format for hive sync, default 'PARQUET'") + public String hiveSyncFileFormat = "PARQUET"; + + @Parameter(names = {"--hive-sync-mode"}, description = "Mode to choose for Hive ops. 
Valid values are hms, jdbc and hiveql, default 'jdbc'") + public String hiveSyncMode = "jdbc"; + + @Parameter(names = {"--hive-sync-username"}, description = "Username for hive sync, default 'hive'") + public String hiveSyncUsername = "hive"; + + @Parameter(names = {"--hive-sync-password"}, description = "Password for hive sync, default 'hive'") + public String hiveSyncPassword = "hive"; + + @Parameter(names = {"--hive-sync-jdbc-url"}, description = "Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000'") + public String hiveSyncJdbcUrl = "jdbc:hive2://localhost:10000"; + + @Parameter(names = {"--hive-sync-metastore-uris"}, description = "Metastore uris for hive sync, default ''") + public String hiveSyncMetastoreUri = ""; + + @Parameter(names = {"--hive-sync-partition-fields"}, description = "Partition fields for hive sync, default ''") + public String hiveSyncPartitionFields = ""; + + @Parameter(names = {"--hive-sync-partition-extractor-class"}, description = "Tool to extract the partition value from HDFS path, " + + "default 'MultiPartKeysValueExtractor'") + public String hiveSyncPartitionExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName(); + + @Parameter(names = {"--hive-sync-assume-date-partitioning"}, description = "Assume partitioning is yyyy/mm/dd, default false") + public Boolean hiveSyncAssumeDatePartition = false; + + @Parameter(names = {"--hive-sync-use-jdbc"}, description = "Use JDBC when hive synchronization is enabled, default true") + public Boolean hiveSyncUseJdbc = true; + + @Parameter(names = {"--hive-sync-auto-create-db"}, description = "Auto create hive database if it does not exists, default true") + public Boolean hiveSyncAutoCreateDb = true; + + @Parameter(names = {"--hive-sync-ignore-exceptions"}, description = "Ignore exceptions during hive synchronization, default false") + public Boolean hiveSyncIgnoreExceptions = false; + + @Parameter(names = {"--hive-sync-skip-ro-suffix"}, description = "Skip the _ro suffix for Read optimized table when registering, default false") + public Boolean hiveSyncSkipRoSuffix = false; + + @Parameter(names = {"--hive-sync-support-timestamp"}, description = "INT64 with original type TIMESTAMP_MICROS is converted to hive timestamp type.\n" + + "Disabled by default for backward compatibility.") + public Boolean hiveSyncSupportTimestamp = false; + + + /** + * Transforms a {@code HoodieFlinkStreamer.Config} into {@code Configuration}. + * The latter is more suitable for the table APIs. It reads all the properties + * in the properties file (set by `--props` option) and cmd line options + * (set by `--hoodie-conf` option). 
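+   *
+   * <p>A minimal sketch of the intended call path, mirroring {@code HoodieFlinkStreamer#main} (illustrative only):
+   * <pre>{@code
+   *   FlinkStreamerConfig cfg = new FlinkStreamerConfig();
+   *   new JCommander(cfg, null, args);                              // bind the CLI flags declared above
+   *   Configuration conf = FlinkStreamerConfig.toFlinkConfig(cfg);  // translate them into FlinkOptions entries
+   * }</pre>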
+ */ + @SuppressWarnings("unchecked, rawtypes") + public static org.apache.flink.configuration.Configuration toFlinkConfig(FlinkStreamerConfig config) { + Map propsMap = new HashMap((Map) StreamerUtil.getProps(config)); + org.apache.flink.configuration.Configuration conf = fromMap(propsMap); + + conf.setString(FlinkOptions.PATH, config.targetBasePath); + conf.setString(FlinkOptions.TABLE_NAME, config.targetTableName); + // copy_on_write works same as COPY_ON_WRITE + conf.setString(FlinkOptions.TABLE_TYPE, config.tableType.toUpperCase()); + conf.setBoolean(FlinkOptions.INSERT_CLUSTER, config.insertCluster); + conf.setString(FlinkOptions.OPERATION, config.operation.value()); + conf.setString(FlinkOptions.PRECOMBINE_FIELD, config.sourceOrderingField); + conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, config.payloadClassName); + conf.setBoolean(FlinkOptions.PRE_COMBINE, config.preCombine); + conf.setInteger(FlinkOptions.RETRY_TIMES, Integer.parseInt(config.instantRetryTimes)); + conf.setLong(FlinkOptions.RETRY_INTERVAL_MS, Long.parseLong(config.instantRetryInterval)); + conf.setBoolean(FlinkOptions.IGNORE_FAILED, config.commitOnErrors); + conf.setString(FlinkOptions.RECORD_KEY_FIELD, config.recordKeyField); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, config.partitionPathField); + conf.setBoolean(FlinkOptions.METADATA_ENABLED, config.metadataEnabled); + conf.setInteger(FlinkOptions.METADATA_COMPACTION_DELTA_COMMITS, config.metadataCompactionDeltaCommits); + conf.setString(FlinkOptions.PARTITION_FORMAT, config.writePartitionFormat); + conf.setLong(FlinkOptions.WRITE_RATE_LIMIT, config.writeRateLimit); + conf.setInteger(FlinkOptions.WRITE_PARQUET_BLOCK_SIZE, config.writeParquetBlockSize); + conf.setInteger(FlinkOptions.WRITE_PARQUET_MAX_FILE_SIZE, config.writeParquetMaxFileSize); + conf.setInteger(FlinkOptions.WRITE_PARQUET_PAGE_SIZE, config.parquetPageSize); + if (!StringUtils.isNullOrEmpty(config.keygenClass)) { + conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, config.keygenClass); + } else { + conf.setString(FlinkOptions.KEYGEN_TYPE, config.keygenType); + } + conf.setInteger(FlinkOptions.INDEX_BOOTSTRAP_TASKS, config.indexBootstrapNum); + conf.setInteger(FlinkOptions.BUCKET_ASSIGN_TASKS, config.bucketAssignNum); + conf.setInteger(FlinkOptions.WRITE_TASKS, config.writeTaskNum); + conf.setString(FlinkOptions.PARTITION_DEFAULT_NAME, config.partitionDefaultName); + conf.setBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED, config.indexBootstrapEnabled); + conf.setDouble(FlinkOptions.INDEX_STATE_TTL, config.indexStateTtl); + conf.setBoolean(FlinkOptions.INDEX_GLOBAL_ENABLED, config.indexGlobalEnabled); + conf.setString(FlinkOptions.INDEX_PARTITION_REGEX, config.indexPartitionRegex); + if (!StringUtils.isNullOrEmpty(config.sourceAvroSchemaPath)) { + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH, config.sourceAvroSchemaPath); + } + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, config.sourceAvroSchema); + conf.setBoolean(FlinkOptions.UTC_TIMEZONE, config.utcTimezone); + conf.setBoolean(FlinkOptions.URL_ENCODE_PARTITIONING, config.writePartitionUrlEncode); + conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, config.hiveStylePartitioning); + conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, config.writeTaskMaxSize); + conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, config.writeBatchSize); + conf.setInteger(FlinkOptions.WRITE_LOG_BLOCK_SIZE, config.writeLogBlockSize); + conf.setLong(FlinkOptions.WRITE_LOG_MAX_SIZE, config.writeLogMaxSize); + conf.setInteger(FlinkOptions.WRITE_MERGE_MAX_MEMORY, 
config.writeMergeMaxMemory); + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, config.compactionAsyncEnabled); + conf.setInteger(FlinkOptions.COMPACTION_TASKS, config.compactionTasks); + conf.setString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY, config.compactionTriggerStrategy); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, config.compactionDeltaCommits); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_SECONDS, config.compactionDeltaSeconds); + conf.setInteger(FlinkOptions.COMPACTION_MAX_MEMORY, config.compactionMaxMemory); + conf.setLong(FlinkOptions.COMPACTION_TARGET_IO, config.compactionTargetIo); + conf.setBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED, config.cleanAsyncEnabled); + conf.setString(FlinkOptions.CLEAN_POLICY, config.cleanPolicy); + conf.setInteger(FlinkOptions.CLEAN_RETAIN_COMMITS, config.cleanRetainCommits); + conf.setInteger(FlinkOptions.CLEAN_RETAIN_FILE_VERSIONS, config.cleanRetainFileVersions); + conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, config.archiveMaxCommits); + conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, config.archiveMinCommits); + conf.setBoolean(FlinkOptions.HIVE_SYNC_ENABLED, config.hiveSyncEnabled); + conf.setString(FlinkOptions.HIVE_SYNC_DB, config.hiveSyncDb); + conf.setString(FlinkOptions.HIVE_SYNC_TABLE, config.hiveSyncTable); + conf.setString(FlinkOptions.HIVE_SYNC_FILE_FORMAT, config.hiveSyncFileFormat); + conf.setString(FlinkOptions.HIVE_SYNC_MODE, config.hiveSyncMode); + conf.setString(FlinkOptions.HIVE_SYNC_USERNAME, config.hiveSyncUsername); + conf.setString(FlinkOptions.HIVE_SYNC_PASSWORD, config.hiveSyncPassword); + conf.setString(FlinkOptions.HIVE_SYNC_JDBC_URL, config.hiveSyncJdbcUrl); + conf.setString(FlinkOptions.HIVE_SYNC_METASTORE_URIS, config.hiveSyncMetastoreUri); + conf.setString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS, config.hiveSyncPartitionFields); + conf.setString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME, config.hiveSyncPartitionExtractorClass); + conf.setBoolean(FlinkOptions.HIVE_SYNC_ASSUME_DATE_PARTITION, config.hiveSyncAssumeDatePartition); + conf.setBoolean(FlinkOptions.HIVE_SYNC_USE_JDBC, config.hiveSyncUseJdbc); + conf.setBoolean(FlinkOptions.HIVE_SYNC_AUTO_CREATE_DB, config.hiveSyncAutoCreateDb); + conf.setBoolean(FlinkOptions.HIVE_SYNC_IGNORE_EXCEPTIONS, config.hiveSyncIgnoreExceptions); + conf.setBoolean(FlinkOptions.HIVE_SYNC_SKIP_RO_SUFFIX, config.hiveSyncSkipRoSuffix); + conf.setBoolean(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP, config.hiveSyncSupportTimestamp); + return conf; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java new file mode 100644 index 0000000000000..b153b2273cf6b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.streamer; + +import org.apache.hudi.common.config.DFSPropertiesConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsInference; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.sink.transform.Transformer; +import org.apache.hudi.sink.utils.Pipelines; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.StreamerUtil; + +import com.beust.jcommander.JCommander; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.formats.common.TimestampFormat; +import org.apache.flink.formats.json.JsonRowDataDeserializationSchema; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; + +/** + * A utility which can incrementally consume data from Kafka and apply it to the target table. + * It has the similar functionality with SQL data source except that the source is bind to Kafka + * and the format is bind to JSON. + */ +public class HoodieFlinkStreamer { + public static void main(String[] args) throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + final FlinkStreamerConfig cfg = new FlinkStreamerConfig(); + JCommander cmd = new JCommander(cfg, null, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + env.enableCheckpointing(cfg.checkpointInterval); + env.getConfig().setGlobalJobParameters(cfg); + // We use checkpoint to trigger write operation, including instant generating and committing, + // There can only be one checkpoint at one time. 
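+    // (The interval comes from --checkpoint-interval, 5 seconds by default; a shorter interval therefore
+    //  means more frequent Hudi commits.)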
+ env.getCheckpointConfig().setMaxConcurrentCheckpoints(1); + + env.setStateBackend(cfg.stateBackend); + if (cfg.flinkCheckPointPath != null) { + env.getCheckpointConfig().setCheckpointStorage(cfg.flinkCheckPointPath); + } + + TypedProperties kafkaProps = DFSPropertiesConfiguration.getGlobalProps(); + kafkaProps.putAll(StreamerUtil.appendKafkaProps(cfg)); + + Configuration conf = FlinkStreamerConfig.toFlinkConfig(cfg); + // Read from kafka source + RowType rowType = + (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf)) + .getLogicalType(); + + long ckpTimeout = env.getCheckpointConfig().getCheckpointTimeout(); + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout); + + DataStream dataStream = env.addSource(new FlinkKafkaConsumer<>( + cfg.kafkaTopic, + new JsonRowDataDeserializationSchema( + rowType, + InternalTypeInfo.of(rowType), + false, + true, + TimestampFormat.ISO_8601 + ), kafkaProps)) + .name("kafka_source") + .uid("uid_kafka_source"); + + if (cfg.transformerClassNames != null && !cfg.transformerClassNames.isEmpty()) { + Option transformer = StreamerUtil.createTransformer(cfg.transformerClassNames); + if (transformer.isPresent()) { + dataStream = transformer.get().apply(dataStream); + } + } + + OptionsInference.setupSinkTasks(conf, env.getParallelism()); + DataStream hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, dataStream); + DataStream pipeline = Pipelines.hoodieStreamWrite(conf, hoodieRecordDataStream); + if (OptionsResolver.needsAsyncCompaction(conf)) { + Pipelines.compact(conf, pipeline); + } else { + Pipelines.clean(conf, pipeline); + } + + env.execute(cfg.targetTableName); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java new file mode 100644 index 0000000000000..1cf66ea3437ef --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table; + +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.ComplexAvroKeyGenerator; +import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; +import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.DataTypeUtils; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.api.constraints.UniqueConstraint; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.factories.DynamicTableSinkFactory; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; + +/** + * Hoodie data source/sink factory. 
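+ *
+ * <p>Registered under the identifier {@code hudi}, with {@code path} as the only required option (see
+ * {@link #requiredOptions()}). An illustrative registration from SQL, where {@code tableEnv} is any
+ * {@code TableEnvironment} and option keys other than {@code connector} and {@code path} should be checked
+ * against {@link FlinkOptions} for the running version:
+ * <pre>{@code
+ *   tableEnv.executeSql(
+ *       "CREATE TABLE t1 (uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED, name VARCHAR(10), ts TIMESTAMP(3)) "
+ *           + "WITH ('connector' = 'hudi', 'path' = 'file:///tmp/t1', 'table.type' = 'MERGE_ON_READ')");
+ * }</pre>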
+ */ +public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTableSinkFactory { + private static final Logger LOG = LoggerFactory.getLogger(HoodieTableFactory.class); + + public static final String FACTORY_ID = "hudi"; + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions()); + ResolvedSchema schema = context.getCatalogTable().getResolvedSchema(); + sanityCheck(conf, schema); + setupConfOptions(conf, context.getObjectIdentifier(), context.getCatalogTable(), schema); + + Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() -> + new ValidationException("Option [path] should not be empty."))); + return new HoodieTableSource( + schema, + path, + context.getCatalogTable().getPartitionKeys(), + conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME), + conf); + } + + @Override + public DynamicTableSink createDynamicTableSink(Context context) { + Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions()); + checkArgument(!StringUtils.isNullOrEmpty(conf.getString(FlinkOptions.PATH)), + "Option [path] should not be empty."); + ResolvedSchema schema = context.getCatalogTable().getResolvedSchema(); + sanityCheck(conf, schema); + setupConfOptions(conf, context.getObjectIdentifier(), context.getCatalogTable(), schema); + return new HoodieTableSink(conf, schema); + } + + @Override + public String factoryIdentifier() { + return FACTORY_ID; + } + + @Override + public Set> requiredOptions() { + return Collections.singleton(FlinkOptions.PATH); + } + + @Override + public Set> optionalOptions() { + return FlinkOptions.optionalOptions(); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + /** + * The sanity check. + * + * @param conf The table options + * @param schema The table schema + */ + private void sanityCheck(Configuration conf, ResolvedSchema schema) { + List fields = schema.getColumnNames(); + + // validate record key in pk absence. 
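+    // i.e. when no PRIMARY KEY is declared, every field named by the record key option must exist in the
+    // schema, and the implicit default is only accepted if a column with that exact name is present.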
+ if (!schema.getPrimaryKey().isPresent()) { + String[] recordKeys = conf.get(FlinkOptions.RECORD_KEY_FIELD).split(","); + if (recordKeys.length == 1 + && FlinkOptions.RECORD_KEY_FIELD.defaultValue().equals(recordKeys[0]) + && !fields.contains(recordKeys[0])) { + throw new HoodieValidationException("Primary key definition is required, use either PRIMARY KEY syntax " + + "or option '" + FlinkOptions.RECORD_KEY_FIELD.key() + "' to specify."); + } + + Arrays.stream(recordKeys) + .filter(field -> !fields.contains(field)) + .findAny() + .ifPresent(f -> { + throw new HoodieValidationException("Field '" + f + "' specified in option " + + "'" + FlinkOptions.RECORD_KEY_FIELD.key() + "' does not exist in the table schema."); + }); + } + + // validate pre_combine key + String preCombineField = conf.get(FlinkOptions.PRECOMBINE_FIELD); + if (!fields.contains(preCombineField)) { + if (OptionsResolver.isDefaultHoodieRecordPayloadClazz(conf)) { + throw new HoodieValidationException("Option '" + FlinkOptions.PRECOMBINE_FIELD.key() + + "' is required for payload class: " + DefaultHoodieRecordPayload.class.getName()); + } + if (preCombineField.equals(FlinkOptions.PRECOMBINE_FIELD.defaultValue())) { + conf.setString(FlinkOptions.PRECOMBINE_FIELD, FlinkOptions.NO_PRE_COMBINE); + } else if (!preCombineField.equals(FlinkOptions.NO_PRE_COMBINE)) { + throw new HoodieValidationException("Field " + preCombineField + " does not exist in the table schema." + + "Please check '" + FlinkOptions.PRECOMBINE_FIELD.key() + "' option."); + } + } + } + + /** + * Sets up the config options based on the table definition, for e.g, the table name, primary key. + * + * @param conf The configuration to set up + * @param tablePath The table path + * @param table The catalog table + * @param schema The physical schema + */ + private static void setupConfOptions( + Configuration conf, + ObjectIdentifier tablePath, + CatalogTable table, + ResolvedSchema schema) { + // table name + conf.setString(FlinkOptions.TABLE_NAME.key(), tablePath.getObjectName()); + // hoodie key about options + setupHoodieKeyOptions(conf, table); + // compaction options + setupCompactionOptions(conf); + // hive options + setupHiveOptions(conf, tablePath); + // read options + setupReadOptions(conf); + // write options + setupWriteOptions(conf); + // infer avro schema from physical DDL schema + inferAvroSchema(conf, schema.toPhysicalRowDataType().notNull().getLogicalType()); + } + + /** + * Sets up the hoodie key options (e.g. record key and partition key) from the table definition. 
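+   *
+   * <p>The PRIMARY KEY and PARTITIONED BY clauses take precedence over the corresponding options, and the key
+   * generator class may be adjusted afterwards (non-partitioned, timestamp based or complex) based on the
+   * resulting record key and partition fields.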
+ */ + private static void setupHoodieKeyOptions(Configuration conf, CatalogTable table) { + List pkColumns = table.getSchema().getPrimaryKey() + .map(UniqueConstraint::getColumns).orElse(Collections.emptyList()); + if (pkColumns.size() > 0) { + // the PRIMARY KEY syntax always has higher priority than option FlinkOptions#RECORD_KEY_FIELD + String recordKey = String.join(",", pkColumns); + conf.setString(FlinkOptions.RECORD_KEY_FIELD, recordKey); + } + List partitionKeys = table.getPartitionKeys(); + if (partitionKeys.size() > 0) { + // the PARTITIONED BY syntax always has higher priority than option FlinkOptions#PARTITION_PATH_FIELD + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, String.join(",", partitionKeys)); + } + // set index key for bucket index if not defined + if (conf.getString(FlinkOptions.INDEX_TYPE).equals(HoodieIndex.IndexType.BUCKET.name())) { + if (conf.getString(FlinkOptions.INDEX_KEY_FIELD).isEmpty()) { + conf.setString(FlinkOptions.INDEX_KEY_FIELD, conf.getString(FlinkOptions.RECORD_KEY_FIELD)); + } else { + Set recordKeySet = + Arrays.stream(conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")).collect(Collectors.toSet()); + Set indexKeySet = + Arrays.stream(conf.getString(FlinkOptions.INDEX_KEY_FIELD).split(",")).collect(Collectors.toSet()); + if (!recordKeySet.containsAll(indexKeySet)) { + throw new HoodieValidationException( + FlinkOptions.INDEX_KEY_FIELD + " should be a subset of or equal to the recordKey fields"); + } + } + } + + // tweak the key gen class if possible + final String[] partitions = conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(","); + final String[] pks = conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","); + if (partitions.length == 1) { + final String partitionField = partitions[0]; + if (partitionField.isEmpty()) { + conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, NonpartitionedAvroKeyGenerator.class.getName()); + LOG.info("Table option [{}] is reset to {} because this is a non-partitioned table", + FlinkOptions.KEYGEN_CLASS_NAME.key(), NonpartitionedAvroKeyGenerator.class.getName()); + return; + } + DataType partitionFieldType = table.getSchema().getFieldDataType(partitionField) + .orElseThrow(() -> new HoodieValidationException("Field " + partitionField + " does not exist")); + if (pks.length <= 1 && DataTypeUtils.isDatetimeType(partitionFieldType)) { + // timestamp based key gen only supports simple primary key + setupTimestampKeygenOptions(conf, partitionFieldType); + return; + } + } + boolean complexHoodieKey = pks.length > 1 || partitions.length > 1; + if (complexHoodieKey && FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.KEYGEN_CLASS_NAME)) { + conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, ComplexAvroKeyGenerator.class.getName()); + LOG.info("Table option [{}] is reset to {} because record key or partition path has two or more fields", + FlinkOptions.KEYGEN_CLASS_NAME.key(), ComplexAvroKeyGenerator.class.getName()); + } + } + + /** + * Sets up the keygen options when the partition path is datetime type. + * + *
<p>
    The UTC timezone is used as default. + */ + public static void setupTimestampKeygenOptions(Configuration conf, DataType fieldType) { + if (conf.contains(FlinkOptions.KEYGEN_CLASS_NAME)) { + // the keygen clazz has been set up explicitly, skipping + return; + } + + conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, TimestampBasedAvroKeyGenerator.class.getName()); + LOG.info("Table option [{}] is reset to {} because datetime partitioning turns on", + FlinkOptions.KEYGEN_CLASS_NAME.key(), TimestampBasedAvroKeyGenerator.class.getName()); + if (DataTypeUtils.isTimestampType(fieldType)) { + int precision = DataTypeUtils.precision(fieldType.getLogicalType()); + if (precision == 0) { + // seconds + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, + TimestampBasedAvroKeyGenerator.TimestampType.UNIX_TIMESTAMP.name()); + } else if (precision == 3) { + // milliseconds + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, + TimestampBasedAvroKeyGenerator.TimestampType.EPOCHMILLISECONDS.name()); + } + String outputPartitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_HOUR); + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, outputPartitionFormat); + } else { + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, + TimestampBasedAvroKeyGenerator.TimestampType.SCALAR.name()); + conf.setString(KeyGeneratorOptions.Config.INPUT_TIME_UNIT, TimeUnit.DAYS.toString()); + + String outputPartitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_DAY); + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, outputPartitionFormat); + // the option is actually useless, it only works for validation + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, FlinkOptions.PARTITION_FORMAT_DAY); + } + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "UTC"); + } + + /** + * Sets up the compaction options from the table definition. + */ + private static void setupCompactionOptions(Configuration conf) { + int commitsToRetain = conf.getInteger(FlinkOptions.CLEAN_RETAIN_COMMITS); + int minCommitsToKeep = conf.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS); + if (commitsToRetain >= minCommitsToKeep) { + LOG.info("Table option [{}] is reset to {} to be greater than {}={},\n" + + "to avoid risk of missing data from few instants in incremental pull", + FlinkOptions.ARCHIVE_MIN_COMMITS.key(), commitsToRetain + 10, + FlinkOptions.CLEAN_RETAIN_COMMITS.key(), commitsToRetain); + conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, commitsToRetain + 10); + conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, commitsToRetain + 20); + } + } + + /** + * Sets up the hive options from the table definition. + */ + private static void setupHiveOptions(Configuration conf, ObjectIdentifier tablePath) { + if (!conf.contains(FlinkOptions.HIVE_SYNC_DB)) { + conf.setString(FlinkOptions.HIVE_SYNC_DB, tablePath.getDatabaseName()); + } + if (!conf.contains(FlinkOptions.HIVE_SYNC_TABLE)) { + conf.setString(FlinkOptions.HIVE_SYNC_TABLE, tablePath.getObjectName()); + } + } + + /** + * Sets up the read options from the table definition. 
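+   *
+   * <p>For a batch (non-streaming) read, specifying a start or end commit switches the query type to incremental.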
+ */ + private static void setupReadOptions(Configuration conf) { + if (!conf.getBoolean(FlinkOptions.READ_AS_STREAMING) + && (conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent() || conf.getOptional(FlinkOptions.READ_END_COMMIT).isPresent())) { + conf.setString(FlinkOptions.QUERY_TYPE, FlinkOptions.QUERY_TYPE_INCREMENTAL); + } + } + + /** + * Sets up the write options from the table definition. + */ + private static void setupWriteOptions(Configuration conf) { + if (FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.OPERATION) + && OptionsResolver.isCowTable(conf)) { + conf.setBoolean(FlinkOptions.PRE_COMBINE, true); + } + } + + /** + * Inferences the deserialization Avro schema from the table schema (e.g. the DDL) + * if both options {@link FlinkOptions#SOURCE_AVRO_SCHEMA_PATH} and + * {@link FlinkOptions#SOURCE_AVRO_SCHEMA} are not specified. + * + * @param conf The configuration + * @param rowType The specified table row type + */ + private static void inferAvroSchema(Configuration conf, LogicalType rowType) { + if (!conf.getOptional(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH).isPresent() + && !conf.getOptional(FlinkOptions.SOURCE_AVRO_SCHEMA).isPresent()) { + String inferredSchema = AvroSchemaConverter.convertToSchema(rowType).toString(); + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, inferredSchema); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java new file mode 100644 index 0000000000000..f8799d3ac940a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table; + +import org.apache.hudi.adapter.DataStreamSinkProviderAdapter; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsInference; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.sink.utils.Pipelines; +import org.apache.hudi.util.ChangelogModes; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; +import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; +import org.apache.flink.table.types.logical.RowType; + +import java.util.Map; + +/** + * Hoodie table sink. + */ +public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { + + private final Configuration conf; + private final ResolvedSchema schema; + private boolean overwrite = false; + + public HoodieTableSink(Configuration conf, ResolvedSchema schema) { + this.conf = conf; + this.schema = schema; + } + + public HoodieTableSink(Configuration conf, ResolvedSchema schema, boolean overwrite) { + this.conf = conf; + this.schema = schema; + this.overwrite = overwrite; + } + + @Override + public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { + return (DataStreamSinkProviderAdapter) dataStream -> { + + // setup configuration + long ckpTimeout = dataStream.getExecutionEnvironment() + .getCheckpointConfig().getCheckpointTimeout(); + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout); + // set up default parallelism + OptionsInference.setupSinkTasks(conf, dataStream.getExecutionConfig().getParallelism()); + + RowType rowType = (RowType) schema.toSinkRowDataType().notNull().getLogicalType(); + + // bulk_insert mode + final String writeOperation = this.conf.get(FlinkOptions.OPERATION); + if (WriteOperationType.fromValue(writeOperation) == WriteOperationType.BULK_INSERT) { + return Pipelines.bulkInsert(conf, rowType, dataStream); + } + + // Append mode + if (OptionsResolver.isAppendMode(conf)) { + DataStream pipeline = Pipelines.append(conf, rowType, dataStream, context.isBounded()); + if (OptionsResolver.needsAsyncClustering(conf)) { + return Pipelines.cluster(conf, rowType, pipeline); + } else { + return Pipelines.dummySink(pipeline); + } + } + + DataStream pipeline; + // bootstrap + final DataStream hoodieRecordDataStream = + Pipelines.bootstrap(conf, rowType, dataStream, context.isBounded(), overwrite); + // write pipeline + pipeline = Pipelines.hoodieStreamWrite(conf, hoodieRecordDataStream); + // compaction + if (OptionsResolver.needsAsyncCompaction(conf)) { + // use synchronous compaction for bounded source. 
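+          // (the flag is flipped off so that Pipelines#compact executes the compaction within this bounded
+          //  job instead of leaving it to the asynchronous compaction service)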
+ if (context.isBounded()) { + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); + } + return Pipelines.compact(conf, pipeline); + } else { + return Pipelines.clean(conf, pipeline); + } + }; + } + + @VisibleForTesting + public Configuration getConf() { + return this.conf; + } + + @Override + public ChangelogMode getChangelogMode(ChangelogMode changelogMode) { + if (conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)) { + return ChangelogModes.FULL; + } else { + return ChangelogModes.UPSERT; + } + } + + @Override + public DynamicTableSink copy() { + return new HoodieTableSink(this.conf, this.schema, this.overwrite); + } + + @Override + public String asSummaryString() { + return "HoodieTableSink"; + } + + @Override + public void applyStaticPartition(Map partitions) { + // #applyOverwrite should have been invoked. + if (this.overwrite && partitions.size() > 0) { + this.conf.setString(FlinkOptions.OPERATION, WriteOperationType.INSERT_OVERWRITE.value()); + } + } + + @Override + public void applyOverwrite(boolean overwrite) { + this.overwrite = overwrite; + // set up the operation as INSERT_OVERWRITE_TABLE first, + // if there are explicit partitions, #applyStaticPartition would overwrite the option. + this.conf.setString(FlinkOptions.OPERATION, WriteOperationType.INSERT_OVERWRITE_TABLE.value()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java new file mode 100644 index 0000000000000..4ea14c413cca1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java @@ -0,0 +1,524 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table; + +import org.apache.hudi.adapter.DataStreamScanProviderAdapter; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.configuration.OptionsInference; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.utils.Pipelines; +import org.apache.hudi.source.FileIndex; +import org.apache.hudi.source.IncrementalInputSplits; +import org.apache.hudi.source.StreamReadMonitoringFunction; +import org.apache.hudi.source.StreamReadOperator; +import org.apache.hudi.table.format.FilePathUtils; +import org.apache.hudi.table.format.cow.CopyOnWriteInputFormat; +import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.hudi.table.format.mor.MergeOnReadTableState; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.ChangelogModes; +import org.apache.hudi.util.ExpressionUtils; +import org.apache.hudi.util.InputFormats; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.avro.Schema; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSource; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsPartitionPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.runtime.types.TypeInfoDataTypeConverter; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.fs.FileStatus; +import 
org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.apache.hudi.configuration.HadoopConfigurations.getParquetConf; + +/** + * Hoodie batch table source that always read the latest snapshot of the underneath table. + */ +public class HoodieTableSource implements + ScanTableSource, + SupportsPartitionPushDown, + SupportsProjectionPushDown, + SupportsLimitPushDown, + SupportsFilterPushDown { + private static final Logger LOG = LoggerFactory.getLogger(HoodieTableSource.class); + + private static final int NO_LIMIT_CONSTANT = -1; + + private final transient org.apache.hadoop.conf.Configuration hadoopConf; + private final transient HoodieTableMetaClient metaClient; + private final long maxCompactionMemoryInBytes; + + private final ResolvedSchema schema; + private final RowType tableRowType; + private final Path path; + private final List partitionKeys; + private final String defaultPartName; + private final Configuration conf; + private final FileIndex fileIndex; + + private int[] requiredPos; + private long limit; + + private List> requiredPartitions; + + public HoodieTableSource( + ResolvedSchema schema, + Path path, + List partitionKeys, + String defaultPartName, + Configuration conf) { + this(schema, path, partitionKeys, defaultPartName, conf, null, null, null, null, null); + } + + public HoodieTableSource( + ResolvedSchema schema, + Path path, + List partitionKeys, + String defaultPartName, + Configuration conf, + @Nullable FileIndex fileIndex, + @Nullable List> requiredPartitions, + @Nullable int[] requiredPos, + @Nullable Long limit, + @Nullable HoodieTableMetaClient metaClient) { + this.schema = schema; + this.tableRowType = (RowType) schema.toPhysicalRowDataType().notNull().getLogicalType(); + this.path = path; + this.partitionKeys = partitionKeys; + this.defaultPartName = defaultPartName; + this.conf = conf; + this.fileIndex = fileIndex == null + ? FileIndex.instance(this.path, this.conf, this.tableRowType) + : fileIndex; + this.requiredPartitions = requiredPartitions; + this.requiredPos = requiredPos == null + ? IntStream.range(0, this.tableRowType.getFieldCount()).toArray() + : requiredPos; + this.limit = limit == null ? NO_LIMIT_CONSTANT : limit; + this.hadoopConf = HadoopConfigurations.getHadoopConf(conf); + this.metaClient = metaClient == null ? 
StreamerUtil.metaClientForReader(conf, hadoopConf) : metaClient; + this.maxCompactionMemoryInBytes = StreamerUtil.getMaxCompactionMemoryInBytes(conf); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { + return new DataStreamScanProviderAdapter() { + + @Override + public boolean isBounded() { + return !conf.getBoolean(FlinkOptions.READ_AS_STREAMING); + } + + @Override + public DataStream produceDataStream(StreamExecutionEnvironment execEnv) { + @SuppressWarnings("unchecked") + TypeInformation typeInfo = + (TypeInformation) TypeInfoDataTypeConverter.fromDataTypeToTypeInfo(getProducedDataType()); + OptionsInference.setupSourceTasks(conf, execEnv.getParallelism()); + if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) { + StreamReadMonitoringFunction monitoringFunction = new StreamReadMonitoringFunction( + conf, FilePathUtils.toFlinkPath(path), tableRowType, maxCompactionMemoryInBytes, getRequiredPartitionPaths()); + InputFormat inputFormat = getInputFormat(true); + OneInputStreamOperatorFactory factory = StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat); + SingleOutputStreamOperator source = execEnv.addSource(monitoringFunction, getSourceOperatorName("split_monitor")) + .uid(Pipelines.opUID("split_monitor", conf)) + .setParallelism(1) + .keyBy(MergeOnReadInputSplit::getFileId) + .transform("split_reader", typeInfo, factory) + .uid(Pipelines.opUID("split_reader", conf)) + .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); + return new DataStreamSource<>(source); + } else { + InputFormatSourceFunction func = new InputFormatSourceFunction<>(getInputFormat(), typeInfo); + DataStreamSource source = execEnv.addSource(func, asSummaryString(), typeInfo); + return source.name(getSourceOperatorName("bounded_source")).setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); + } + } + }; + } + + @Override + public ChangelogMode getChangelogMode() { + // when read as streaming and changelog mode is enabled, emit as FULL mode; + // when all the changes are compacted or read as batch, emit as INSERT mode. + return OptionsResolver.emitChangelog(conf) ? ChangelogModes.FULL : ChangelogMode.insertOnly(); + } + + @Override + public DynamicTableSource copy() { + return new HoodieTableSource(schema, path, partitionKeys, defaultPartName, + conf, fileIndex, requiredPartitions, requiredPos, limit, metaClient); + } + + @Override + public String asSummaryString() { + return "HudiTableSource"; + } + + @Override + public Result applyFilters(List filters) { + List callExpressionFilters = filters.stream() + .filter(ExpressionUtils::isSimpleCallExpression) + .collect(Collectors.toList()); + this.fileIndex.setFilters(callExpressionFilters); + // refuse all the filters now + return SupportsFilterPushDown.Result.of(Collections.emptyList(), new ArrayList<>(filters)); + } + + @Override + public Optional>> listPartitions() { + List> partitions = this.fileIndex.getPartitions( + this.partitionKeys, defaultPartName, conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING)); + return Optional.of(partitions); + } + + @Override + public void applyPartitions(List> partitions) { + this.requiredPartitions = partitions; + } + + @Override + public boolean supportsNestedProjection() { + return false; + } + + @Override + public void applyProjection(int[][] projections) { + // nested projection is not supported. 
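+    // only the top-level field index of each projection path is kept, so nested columns are read in full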
+ this.requiredPos = Arrays.stream(projections).mapToInt(array -> array[0]).toArray(); + } + + @Override + public void applyLimit(long limit) { + this.limit = limit; + } + + private DataType getProducedDataType() { + String[] schemaFieldNames = this.schema.getColumnNames().toArray(new String[0]); + DataType[] schemaTypes = this.schema.getColumnDataTypes().toArray(new DataType[0]); + + return DataTypes.ROW(Arrays.stream(this.requiredPos) + .mapToObj(i -> DataTypes.FIELD(schemaFieldNames[i], schemaTypes[i])) + .toArray(DataTypes.Field[]::new)) + .bridgedTo(RowData.class); + } + + private String getSourceOperatorName(String operatorName) { + String[] schemaFieldNames = this.schema.getColumnNames().toArray(new String[0]); + List fields = Arrays.stream(this.requiredPos) + .mapToObj(i -> schemaFieldNames[i]) + .collect(Collectors.toList()); + StringBuilder sb = new StringBuilder(); + sb.append(operatorName) + .append("(") + .append("table=").append(Collections.singletonList(conf.getString(FlinkOptions.TABLE_NAME))) + .append(", ") + .append("fields=").append(fields) + .append(")"); + return sb.toString(); + } + + @Nullable + private Set getRequiredPartitionPaths() { + if (this.requiredPartitions == null) { + // returns null for non partition pruning + return null; + } + return FilePathUtils.toRelativePartitionPaths(this.partitionKeys, this.requiredPartitions, + conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING)); + } + + private List buildFileIndex() { + Set requiredPartitionPaths = getRequiredPartitionPaths(); + fileIndex.setPartitionPaths(requiredPartitionPaths); + List relPartitionPaths = fileIndex.getOrBuildPartitionPaths(); + if (relPartitionPaths.size() == 0) { + return Collections.emptyList(); + } + FileStatus[] fileStatuses = fileIndex.getFilesInPartitions(); + if (fileStatuses.length == 0) { + throw new HoodieException("No files found for reading in user provided path."); + } + + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, + // file-slice after pending compaction-requested instant-time is also considered valid + metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(), fileStatuses); + String latestCommit = fsView.getLastInstant().get().getTimestamp(); + final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE); + final AtomicInteger cnt = new AtomicInteger(0); + // generates one input split for each file group + return relPartitionPaths.stream() + .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit) + .map(fileSlice -> { + String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null); + Option> logPaths = Option.ofNullable(fileSlice.getLogFiles() + .sorted(HoodieLogFile.getLogFileComparator()) + .map(logFile -> logFile.getPath().toString()) + .collect(Collectors.toList())); + return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, latestCommit, + metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, null, fileSlice.getFileId()); + }).collect(Collectors.toList())) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } + + public InputFormat getInputFormat() { + return getInputFormat(false); + } + + @VisibleForTesting + public InputFormat getInputFormat(boolean isStreaming) { + return isStreaming ? 
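+    // Descriptive note (added for clarity; not part of the original code): streaming
+    // reads wrap MergeOnReadInputFormat around splits discovered at runtime by the
+    // split monitor, while batch reads enumerate the full split/file list up front;
+    // see getStreamInputFormat() and getBatchInputFormat() below.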
getStreamInputFormat() : getBatchInputFormat(); + } + + private InputFormat getBatchInputFormat() { + final Schema tableAvroSchema = getTableAvroSchema(); + final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema); + final RowType rowType = (RowType) rowDataType.getLogicalType(); + final RowType requiredRowType = (RowType) getProducedDataType().notNull().getLogicalType(); + + final String queryType = this.conf.getString(FlinkOptions.QUERY_TYPE); + switch (queryType) { + case FlinkOptions.QUERY_TYPE_SNAPSHOT: + final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE)); + switch (tableType) { + case MERGE_ON_READ: + final List inputSplits = buildFileIndex(); + if (inputSplits.size() == 0) { + // When there is no input splits, just return an empty source. + LOG.warn("No input splits generate for MERGE_ON_READ input format, returns empty collection instead"); + return InputFormats.EMPTY_INPUT_FORMAT; + } + return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, + rowDataType, inputSplits, false); + case COPY_ON_WRITE: + return baseFileOnlyInputFormat(); + default: + throw new HoodieException("Unexpected table type: " + this.conf.getString(FlinkOptions.TABLE_TYPE)); + } + case FlinkOptions.QUERY_TYPE_READ_OPTIMIZED: + return baseFileOnlyInputFormat(); + case FlinkOptions.QUERY_TYPE_INCREMENTAL: + IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder() + .conf(conf) + .path(FilePathUtils.toFlinkPath(path)) + .rowType(this.tableRowType) + .maxCompactionMemoryInBytes(maxCompactionMemoryInBytes) + .requiredPartitions(getRequiredPartitionPaths()).build(); + final IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf); + if (result.isEmpty()) { + // When there is no input splits, just return an empty source. + LOG.warn("No input splits generate for incremental read, returns empty collection instead"); + return InputFormats.EMPTY_INPUT_FORMAT; + } + return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, + rowDataType, result.getInputSplits(), false); + default: + String errMsg = String.format("Invalid query type : '%s', options ['%s', '%s', '%s'] are supported now", queryType, + FlinkOptions.QUERY_TYPE_SNAPSHOT, FlinkOptions.QUERY_TYPE_READ_OPTIMIZED, FlinkOptions.QUERY_TYPE_INCREMENTAL); + throw new HoodieException(errMsg); + } + } + + private InputFormat getStreamInputFormat() { + // if table does not exist or table data does not exist, use schema from the DDL + Schema tableAvroSchema = (this.metaClient == null || !tableDataExists()) ? 
inferSchemaFromDdl() : getTableAvroSchema(); + final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema); + final RowType rowType = (RowType) rowDataType.getLogicalType(); + final RowType requiredRowType = (RowType) getProducedDataType().notNull().getLogicalType(); + + final String queryType = this.conf.getString(FlinkOptions.QUERY_TYPE); + if (FlinkOptions.QUERY_TYPE_SNAPSHOT.equals(queryType)) { + final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE)); + boolean emitDelete = tableType == HoodieTableType.MERGE_ON_READ; + return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, + rowDataType, Collections.emptyList(), emitDelete); + } + String errMsg = String.format("Invalid query type : '%s', options ['%s'] are supported now", queryType, + FlinkOptions.QUERY_TYPE_SNAPSHOT); + throw new HoodieException(errMsg); + } + + /** + * Returns whether the hoodie table data exists . + */ + private boolean tableDataExists() { + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + Option> instantAndCommitMetadata = activeTimeline.getLastCommitMetadataWithValidData(); + return instantAndCommitMetadata.isPresent(); + } + + private MergeOnReadInputFormat mergeOnReadInputFormat( + RowType rowType, + RowType requiredRowType, + Schema tableAvroSchema, + DataType rowDataType, + List inputSplits, + boolean emitDelete) { + final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState( + rowType, + requiredRowType, + tableAvroSchema.toString(), + AvroSchemaConverter.convertToSchema(requiredRowType).toString(), + inputSplits, + conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")); + return MergeOnReadInputFormat.builder() + .config(this.conf) + .tableState(hoodieTableState) + // use the explicit fields' data type because the AvroSchemaConverter + // is not very stable. + .fieldTypes(rowDataType.getChildren()) + .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME)) + .limit(this.limit) + .emitDelete(emitDelete) + .build(); + } + + private InputFormat baseFileOnlyInputFormat() { + final FileStatus[] fileStatuses = getReadFiles(); + if (fileStatuses.length == 0) { + return InputFormats.EMPTY_INPUT_FORMAT; + } + + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, + metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants(), fileStatuses); + Path[] paths = fsView.getLatestBaseFiles() + .map(HoodieBaseFile::getFileStatus) + .map(FileStatus::getPath).toArray(Path[]::new); + + return new CopyOnWriteInputFormat( + FilePathUtils.toFlinkPaths(paths), + this.schema.getColumnNames().toArray(new String[0]), + this.schema.getColumnDataTypes().toArray(new DataType[0]), + this.requiredPos, + this.conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME), + this.limit == NO_LIMIT_CONSTANT ? 
Long.MAX_VALUE : this.limit, // ParquetInputFormat always uses the limit value + getParquetConf(this.conf, this.hadoopConf), + this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE) + ); + } + + private Schema inferSchemaFromDdl() { + Schema schema = AvroSchemaConverter.convertToSchema(this.tableRowType); + return HoodieAvroUtils.addMetadataFields(schema, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)); + } + + @VisibleForTesting + public Schema getTableAvroSchema() { + try { + TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); + return schemaResolver.getTableAvroSchema(); + } catch (Throwable e) { + // table exists but has no written data + LOG.warn("Get table avro schema error, use schema from the DDL instead", e); + return inferSchemaFromDdl(); + } + } + + @VisibleForTesting + public HoodieTableMetaClient getMetaClient() { + return this.metaClient; + } + + @VisibleForTesting + public Configuration getConf() { + return this.conf; + } + + /** + * Reset the state of the table source. + */ + @VisibleForTesting + public void reset() { + this.metaClient.reloadActiveTimeline(); + this.requiredPartitions = null; + this.fileIndex.reset(); + } + + /** + * Get the reader paths with partition path expanded. + */ + @VisibleForTesting + public FileStatus[] getReadFiles() { + Set requiredPartitionPaths = getRequiredPartitionPaths(); + fileIndex.setPartitionPaths(requiredPartitionPaths); + List relPartitionPaths = fileIndex.getOrBuildPartitionPaths(); + if (relPartitionPaths.size() == 0) { + return new FileStatus[0]; + } + return fileIndex.getFilesInPartitions(); + } + + @VisibleForTesting + FileIndex getFileIndex() { + return fileIndex; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/CatalogOptions.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/CatalogOptions.java new file mode 100644 index 0000000000000..58eb3171dad01 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/CatalogOptions.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.exception.HoodieException; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.catalog.CommonCatalogOptions; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Hoodie catalog options. 
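+ *
+ * <p>A minimal illustration, with assumed values (not from the original code), of the
+ * properties these options describe for a DFS-mode catalog:
+ * <pre>{@code
+ *   Map<String, String> props = new HashMap<>();
+ *   props.put("mode", "dfs");                             // CatalogOptions.MODE
+ *   props.put("catalog.path", "hdfs:///warehouse/hudi");  // CatalogOptions.CATALOG_PATH
+ *   props.put("default-database", "default");             // CatalogOptions.DEFAULT_DATABASE
+ *                                                         // (key assumed to be Flink's standard default-database key)
+ * }</pre>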
+ */ +public class CatalogOptions { + public static final String HIVE_SITE_FILE = "hive-site.xml"; + public static final String DEFAULT_DB = "default"; + + public static final ConfigOption CATALOG_PATH = + ConfigOptions.key("catalog.path") + .stringType() + .noDefaultValue() + .withDescription("Catalog base DFS path, used for inferring the sink table path. " + + "The default strategy for a table path is: ${catalog.path}/${db_name}/${table_name}"); + + public static final ConfigOption DEFAULT_DATABASE = + ConfigOptions.key(CommonCatalogOptions.DEFAULT_DATABASE_KEY) + .stringType() + .defaultValue("default"); + + public static final ConfigOption HIVE_CONF_DIR = ConfigOptions + .key("hive.conf.dir") + .stringType() + .noDefaultValue(); + + public static final ConfigOption MODE = ConfigOptions + .key("mode") + .stringType() + .defaultValue("dfs"); + + public static final ConfigOption TABLE_EXTERNAL = ConfigOptions + .key("table.external") + .booleanType() + .defaultValue(false) + .withDescription("Whether the table is external, default false"); + + /** + * Returns all the config options. + */ + public static List> allOptions() { + Field[] declaredFields = CatalogOptions.class.getDeclaredFields(); + List> options = new ArrayList<>(); + for (Field field : declaredFields) { + if (java.lang.reflect.Modifier.isStatic(field.getModifiers()) + && field.getType().equals(ConfigOption.class)) { + try { + options.add((ConfigOption) field.get(ConfigOption.class)); + } catch (IllegalAccessException e) { + throw new HoodieException("Error while fetching static config option", e); + } + } + } + return options; + } + + /** + * Returns all the common table options that can be shared. + * + * @param catalogOptions The catalog options + */ + public static Map tableCommonOptions(Configuration catalogOptions) { + Configuration copied = new Configuration(catalogOptions); + copied.removeConfig(DEFAULT_DATABASE); + copied.removeConfig(CATALOG_PATH); + return copied.toMap(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HiveSchemaUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HiveSchemaUtils.java new file mode 100644 index 0000000000000..4383b42e9f8d9 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HiveSchemaUtils.java @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Utilities for Hive field schema. + */ +public class HiveSchemaUtils { + /** + * Get field names from field schemas. + */ + public static List getFieldNames(List fieldSchemas) { + return fieldSchemas.stream().map(FieldSchema::getName).collect(Collectors.toList()); + } + + public static org.apache.flink.table.api.Schema convertTableSchema(Table hiveTable) { + List allCols = hiveTable.getSd().getCols().stream() + // filter out the metadata columns + .filter(s -> !HoodieAvroUtils.isMetadataField(s.getName())) + .collect(Collectors.toList()); + // need to refactor the partition key field positions: they are not always in the last + allCols.addAll(hiveTable.getPartitionKeys()); + + String pkConstraintName = hiveTable.getParameters().get(TableOptionProperties.PK_CONSTRAINT_NAME); + String pkColumnStr = hiveTable.getParameters().getOrDefault(FlinkOptions.RECORD_KEY_FIELD.key(), FlinkOptions.RECORD_KEY_FIELD.defaultValue()); + List pkColumns = StringUtils.split(pkColumnStr, ","); + + String[] colNames = new String[allCols.size()]; + DataType[] colTypes = new DataType[allCols.size()]; + + for (int i = 0; i < allCols.size(); i++) { + FieldSchema fs = allCols.get(i); + + colNames[i] = fs.getName(); + colTypes[i] = + toFlinkType(TypeInfoUtils.getTypeInfoFromTypeString(fs.getType())); + if (pkColumns.contains(colNames[i])) { + colTypes[i] = colTypes[i].notNull(); + } + } + + org.apache.flink.table.api.Schema.Builder builder = org.apache.flink.table.api.Schema.newBuilder().fromFields(colNames, colTypes); + if (!StringUtils.isNullOrEmpty(pkConstraintName)) { + builder.primaryKeyNamed(pkConstraintName, pkColumns); + } else { + builder.primaryKey(pkColumns); + } + + return builder.build(); + } + + /** + * Convert Hive data type to a Flink data type. 
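+   *
+   * <p>Illustrative mappings derived from the cases handled below (assumed inputs,
+   * not from the original code):
+   * <pre>{@code
+   *   toFlinkType(TypeInfoUtils.getTypeInfoFromTypeString("array<int>"));
+   *   // -> DataTypes.ARRAY(DataTypes.INT())
+   *   toFlinkType(TypeInfoUtils.getTypeInfoFromTypeString("map<string,double>"));
+   *   // -> DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE())
+   * }</pre>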
+ * + * @param hiveType a Hive data type + * @return the corresponding Flink data type + */ + public static DataType toFlinkType(TypeInfo hiveType) { + checkNotNull(hiveType, "hiveType cannot be null"); + + switch (hiveType.getCategory()) { + case PRIMITIVE: + return toFlinkPrimitiveType((PrimitiveTypeInfo) hiveType); + case LIST: + ListTypeInfo listTypeInfo = (ListTypeInfo) hiveType; + return DataTypes.ARRAY(toFlinkType(listTypeInfo.getListElementTypeInfo())); + case MAP: + MapTypeInfo mapTypeInfo = (MapTypeInfo) hiveType; + return DataTypes.MAP( + toFlinkType(mapTypeInfo.getMapKeyTypeInfo()), + toFlinkType(mapTypeInfo.getMapValueTypeInfo())); + case STRUCT: + StructTypeInfo structTypeInfo = (StructTypeInfo) hiveType; + + List names = structTypeInfo.getAllStructFieldNames(); + List typeInfos = structTypeInfo.getAllStructFieldTypeInfos(); + + DataTypes.Field[] fields = new DataTypes.Field[names.size()]; + + for (int i = 0; i < fields.length; i++) { + fields[i] = DataTypes.FIELD(names.get(i), toFlinkType(typeInfos.get(i))); + } + + return DataTypes.ROW(fields); + default: + throw new UnsupportedOperationException( + String.format("Flink doesn't support Hive data type %s yet.", hiveType)); + } + } + + private static DataType toFlinkPrimitiveType(PrimitiveTypeInfo hiveType) { + checkNotNull(hiveType, "hiveType cannot be null"); + + switch (hiveType.getPrimitiveCategory()) { + case CHAR: + return DataTypes.CHAR(((CharTypeInfo) hiveType).getLength()); + case VARCHAR: + return DataTypes.VARCHAR(((VarcharTypeInfo) hiveType).getLength()); + case STRING: + return DataTypes.STRING(); + case BOOLEAN: + return DataTypes.BOOLEAN(); + case BYTE: + return DataTypes.TINYINT(); + case SHORT: + return DataTypes.SMALLINT(); + case INT: + return DataTypes.INT(); + case LONG: + return DataTypes.BIGINT(); + case FLOAT: + return DataTypes.FLOAT(); + case DOUBLE: + return DataTypes.DOUBLE(); + case DATE: + return DataTypes.DATE(); + case TIMESTAMP: + // see org.apache.hudi.hive.util.HiveSchemaUtil#convertField for details. + return DataTypes.TIMESTAMP(6); + case BINARY: + return DataTypes.BYTES(); + case DECIMAL: + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) hiveType; + return DataTypes.DECIMAL( + decimalTypeInfo.getPrecision(), decimalTypeInfo.getScale()); + default: + throw new UnsupportedOperationException( + String.format( + "Flink doesn't support Hive primitive type %s yet", hiveType)); + } + } + + /** + * Create Hive field schemas from Flink table schema including the hoodie metadata fields. + */ + public static List toHiveFieldSchema(TableSchema schema, boolean withOperationField) { + List columns = new ArrayList<>(); + Collection metaFields = withOperationField + ? HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION // caution that the set may break sequence + : HoodieRecord.HOODIE_META_COLUMNS; + for (String metaField : metaFields) { + columns.add(new FieldSchema(metaField, "string", null)); + } + columns.addAll(createHiveColumns(schema)); + return columns; + } + + /** + * Create Hive columns from Flink table schema. 
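+   *
+   * <p>For example (illustrative, not from the original code): a Flink schema of
+   * {@code (name STRING, age INT)} produces Hive field schemas {@code name: string}
+   * and {@code age: int}, with the type names obtained through {@link #toHiveTypeInfo}.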
+ */ + private static List createHiveColumns(TableSchema schema) { + final DataType dataType = schema.toPersistedRowDataType(); + final RowType rowType = (RowType) dataType.getLogicalType(); + final String[] fieldNames = rowType.getFieldNames().toArray(new String[0]); + final DataType[] fieldTypes = dataType.getChildren().toArray(new DataType[0]); + + List columns = new ArrayList<>(fieldNames.length); + + for (int i = 0; i < fieldNames.length; i++) { + columns.add( + new FieldSchema( + fieldNames[i], + toHiveTypeInfo(fieldTypes[i]).getTypeName(), + null)); + } + + return columns; + } + + /** + * Convert Flink DataType to Hive TypeInfo. For types with a precision parameter, e.g. + * timestamp, the supported precisions in Hive and Flink can be different. Therefore the + * conversion will fail for those types if the precision is not supported by Hive and + * checkPrecision is true. + * + * @param dataType a Flink DataType + * @return the corresponding Hive data type + */ + public static TypeInfo toHiveTypeInfo(DataType dataType) { + checkNotNull(dataType, "type cannot be null"); + LogicalType logicalType = dataType.getLogicalType(); + return logicalType.accept(new TypeInfoLogicalTypeVisitor(dataType)); + } + + /** + * Split the field schemas by given partition keys. + * + * @param fieldSchemas The Hive field schemas. + * @param partitionKeys The partition keys. + * @return The pair of (regular columns, partition columns) schema fields + */ + public static Pair, List> splitSchemaByPartitionKeys( + List fieldSchemas, + List partitionKeys) { + List regularColumns = new ArrayList<>(); + List partitionColumns = new ArrayList<>(); + for (FieldSchema fieldSchema : fieldSchemas) { + if (partitionKeys.contains(fieldSchema.getName())) { + partitionColumns.add(fieldSchema); + } else { + regularColumns.add(fieldSchema); + } + } + return Pair.of(regularColumns, partitionColumns); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java new file mode 100644 index 0000000000000..956d61cc3c2a4 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java @@ -0,0 +1,511 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.avro.Schema; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.CatalogFunction; +import org.apache.flink.table.catalog.CatalogPartition; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogView; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.FunctionAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionAlreadyExistsException; +import org.apache.flink.table.catalog.exceptions.PartitionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.catalog.exceptions.TablePartitionedException; +import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; +import org.apache.flink.table.catalog.stats.CatalogTableStatistics; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.util.CollectionUtil; +import org.apache.flink.util.StringUtils; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.hudi.table.catalog.CatalogOptions.CATALOG_PATH; +import static org.apache.hudi.table.catalog.CatalogOptions.DEFAULT_DATABASE; + +/** + * Catalog that can set up common options for underneath table. 
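+ *
+ * <p>A minimal registration sketch with assumed option values (not from the original
+ * code); the factory identifier and option keys are the ones defined in
+ * HoodieCatalogFactory and CatalogOptions, and {@code tableEnv} is an assumed Flink
+ * {@code TableEnvironment}:
+ * <pre>{@code
+ *   tableEnv.executeSql(
+ *       "CREATE CATALOG hudi_catalog WITH ("
+ *           + " 'type' = 'hudi',"
+ *           + " 'mode' = 'dfs',"
+ *           + " 'catalog.path' = 'hdfs:///warehouse/hudi'"
+ *           + ")");
+ * }</pre>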
+ */ +public class HoodieCatalog extends AbstractCatalog { + private static final Logger LOG = LoggerFactory.getLogger(HoodieCatalog.class); + + private final org.apache.hadoop.conf.Configuration hadoopConf; + private final String catalogPathStr; + private final Map tableCommonOptions; + + private Path catalogPath; + private FileSystem fs; + + public HoodieCatalog(String name, Configuration options) { + super(name, options.get(DEFAULT_DATABASE)); + this.catalogPathStr = options.get(CATALOG_PATH); + this.hadoopConf = HadoopConfigurations.getHadoopConf(options); + this.tableCommonOptions = CatalogOptions.tableCommonOptions(options); + } + + @Override + public void open() throws CatalogException { + fs = FSUtils.getFs(catalogPathStr, hadoopConf); + catalogPath = new Path(catalogPathStr); + try { + if (!fs.exists(catalogPath)) { + throw new CatalogException(String.format("Catalog %s path %s does not exist.", getName(), catalogPathStr)); + } + } catch (IOException e) { + throw new CatalogException(String.format("Checking catalog path %s exists exception.", catalogPathStr), e); + } + } + + @Override + public void close() throws CatalogException { + try { + fs.close(); + } catch (IOException e) { + throw new CatalogException("Closing FileSystem exception.", e); + } + } + + // ------ databases ------ + + @Override + public List listDatabases() throws CatalogException { + try { + FileStatus[] fileStatuses = fs.listStatus(catalogPath); + return Arrays.stream(fileStatuses) + .filter(FileStatus::isDirectory) + .map(fileStatus -> fileStatus.getPath().getName()) + .collect(Collectors.toList()); + } catch (IOException e) { + throw new CatalogException("Listing database exception.", e); + } + } + + @Override + public CatalogDatabase getDatabase(String databaseName) throws DatabaseNotExistException, CatalogException { + if (databaseExists(databaseName)) { + return new CatalogDatabaseImpl(Collections.emptyMap(), null); + } else { + throw new DatabaseNotExistException(getName(), databaseName); + } + } + + @Override + public boolean databaseExists(String databaseName) throws CatalogException { + checkArgument(!StringUtils.isNullOrWhitespaceOnly(databaseName)); + + return listDatabases().contains(databaseName); + } + + @Override + public void createDatabase(String databaseName, CatalogDatabase catalogDatabase, boolean ignoreIfExists) + throws DatabaseAlreadyExistException, CatalogException { + if (databaseExists(databaseName)) { + if (ignoreIfExists) { + return; + } else { + throw new DatabaseAlreadyExistException(getName(), databaseName); + } + } + + if (!CollectionUtil.isNullOrEmpty(catalogDatabase.getProperties())) { + throw new CatalogException("Hudi catalog doesn't support to create database with options."); + } + + Path dbPath = new Path(catalogPath, databaseName); + try { + fs.mkdirs(dbPath); + } catch (IOException e) { + throw new CatalogException(String.format("Creating database %s exception.", databaseName), e); + } + } + + @Override + public void dropDatabase(String databaseName, boolean ignoreIfNotExists, boolean cascade) + throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { + if (!databaseExists(databaseName)) { + if (ignoreIfNotExists) { + return; + } else { + throw new DatabaseNotExistException(getName(), databaseName); + } + } + + List tables = listTables(databaseName); + if (!tables.isEmpty() && !cascade) { + throw new DatabaseNotEmptyException(getName(), databaseName); + } + + if (databaseName.equals(getDefaultDatabase())) { + throw new IllegalArgumentException( + 
"Hudi catalog doesn't support to drop the default database."); + } + + Path dbPath = new Path(catalogPath, databaseName); + try { + fs.delete(dbPath, true); + } catch (IOException e) { + throw new CatalogException(String.format("Dropping database %s exception.", databaseName), e); + } + } + + @Override + public void alterDatabase(String databaseName, CatalogDatabase catalogDatabase, boolean ignoreIfNotExists) + throws DatabaseNotExistException, CatalogException { + throw new UnsupportedOperationException("Altering database is not implemented."); + } + + // ------ tables ------ + + @Override + public List listTables(String databaseName) throws DatabaseNotExistException, CatalogException { + if (!databaseExists(databaseName)) { + throw new DatabaseNotExistException(getName(), databaseName); + } + + Path dbPath = new Path(catalogPath, databaseName); + try { + return Arrays.stream(fs.listStatus(dbPath)) + .filter(FileStatus::isDirectory) + .map(fileStatus -> fileStatus.getPath().getName()) + .collect(Collectors.toList()); + } catch (IOException e) { + throw new CatalogException(String.format("Listing table in database %s exception.", dbPath), e); + } + } + + @Override + public CatalogBaseTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException { + if (!tableExists(tablePath)) { + throw new TableNotExistException(getName(), tablePath); + } + + final String path = inferTablePath(catalogPathStr, tablePath); + Map options = TableOptionProperties.loadFromProperties(path, hadoopConf); + final Schema latestSchema = getLatestTableSchema(path); + if (latestSchema != null) { + org.apache.flink.table.api.Schema.Builder builder = org.apache.flink.table.api.Schema.newBuilder() + .fromRowDataType(AvroSchemaConverter.convertToDataType(latestSchema)); + final String pkConstraintName = TableOptionProperties.getPkConstraintName(options); + if (pkConstraintName != null) { + builder.primaryKeyNamed(pkConstraintName, TableOptionProperties.getPkColumns(options)); + } + final org.apache.flink.table.api.Schema schema = builder.build(); + return CatalogTable.of( + schema, + TableOptionProperties.getComment(options), + TableOptionProperties.getPartitionColumns(options), + TableOptionProperties.getTableOptions(options)); + } else { + throw new TableNotExistException(getName(), tablePath); + } + } + + @Override + public void createTable(ObjectPath tablePath, CatalogBaseTable catalogTable, boolean ignoreIfExists) + throws TableAlreadyExistException, DatabaseNotExistException, CatalogException { + if (!databaseExists(tablePath.getDatabaseName())) { + throw new DatabaseNotExistException(getName(), tablePath.getDatabaseName()); + } + if (tableExists(tablePath)) { + if (ignoreIfExists) { + return; + } else { + throw new TableAlreadyExistException(getName(), tablePath); + } + } + + if (catalogTable instanceof CatalogView) { + throw new UnsupportedOperationException( + "Hudi catalog doesn't support to CREATE VIEW."); + } + + ResolvedCatalogTable resolvedTable = (ResolvedCatalogTable) catalogTable; + final String tablePathStr = inferTablePath(catalogPathStr, tablePath); + Map options = applyOptionsHook(tablePathStr, catalogTable.getOptions()); + Configuration conf = Configuration.fromMap(options); + conf.setString(FlinkOptions.PATH, tablePathStr); + ResolvedSchema resolvedSchema = resolvedTable.getResolvedSchema(); + if (!resolvedSchema.getPrimaryKey().isPresent()) { + throw new CatalogException("Primary key definition is missing"); + } + final String avroSchema = 
AvroSchemaConverter.convertToSchema(resolvedSchema.toPhysicalRowDataType().getLogicalType()).toString(); + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, avroSchema); + + // stores two copies of options: + // - partition keys + // - primary keys + // because the HoodieTableMetaClient is a heavy impl, we try to avoid initializing it + // when calling #getTable. + + final String pkColumns = String.join(",", resolvedSchema.getPrimaryKey().get().getColumns()); + conf.setString(FlinkOptions.RECORD_KEY_FIELD, pkColumns); + options.put(TableOptionProperties.PK_CONSTRAINT_NAME, resolvedSchema.getPrimaryKey().get().getName()); + options.put(TableOptionProperties.PK_COLUMNS, pkColumns); + + if (resolvedTable.isPartitioned()) { + final String partitions = String.join(",", resolvedTable.getPartitionKeys()); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, partitions); + options.put(TableOptionProperties.PARTITION_COLUMNS, partitions); + } + conf.setString(FlinkOptions.TABLE_NAME, tablePath.getObjectName()); + try { + StreamerUtil.initTableIfNotExists(conf); + // prepare the non-table-options properties + if (!StringUtils.isNullOrWhitespaceOnly(resolvedTable.getComment())) { + options.put(TableOptionProperties.COMMENT, resolvedTable.getComment()); + } + TableOptionProperties.createProperties(tablePathStr, hadoopConf, options); + } catch (IOException e) { + throw new CatalogException(String.format("Initialize table path %s exception.", tablePathStr), e); + } + } + + @Override + public boolean tableExists(ObjectPath tablePath) throws CatalogException { + return StreamerUtil.tableExists(inferTablePath(catalogPathStr, tablePath), hadoopConf); + } + + @Override + public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + if (!tableExists(tablePath)) { + if (ignoreIfNotExists) { + return; + } else { + throw new TableNotExistException(getName(), tablePath); + } + } + + Path path = new Path(inferTablePath(catalogPathStr, tablePath)); + try { + this.fs.delete(path, true); + } catch (IOException e) { + throw new CatalogException(String.format("Dropping table %s exception.", tablePath), e); + } + } + + @Override + public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) + throws TableNotExistException, TableAlreadyExistException, CatalogException { + throw new UnsupportedOperationException("renameTable is not implemented."); + } + + @Override + public void alterTable(ObjectPath tablePath, CatalogBaseTable catalogBaseTable, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + throw new UnsupportedOperationException("alterTable is not implemented."); + } + + @Override + public List listViews(String databaseName) throws DatabaseNotExistException, CatalogException { + return Collections.emptyList(); + } + + @Override + public List listPartitions(ObjectPath tablePath) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + return Collections.emptyList(); + } + + @Override + public List listPartitions(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec) + throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, CatalogException { + return Collections.emptyList(); + } + + @Override + public List listPartitionsByFilter(ObjectPath tablePath, List filters) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogPartition 
getPartition(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec) + throws PartitionNotExistException, CatalogException { + throw new PartitionNotExistException(getName(), tablePath, catalogPartitionSpec); + } + + @Override + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec) throws CatalogException { + return false; + } + + @Override + public void createPartition(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec, CatalogPartition catalogPartition, boolean ignoreIfExists) + throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, PartitionAlreadyExistsException, CatalogException { + throw new UnsupportedOperationException("createPartition is not implemented."); + } + + @Override + public void dropPartition(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec, boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new UnsupportedOperationException("dropPartition is not implemented."); + } + + @Override + public void alterPartition(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec, CatalogPartition catalogPartition, boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new UnsupportedOperationException("alterPartition is not implemented."); + } + + @Override + public List listFunctions(String databaseName) throws DatabaseNotExistException, CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException, CatalogException { + throw new FunctionNotExistException(getName(), functionPath); + } + + @Override + public boolean functionExists(ObjectPath functionPath) throws CatalogException { + return false; + } + + @Override + public void createFunction(ObjectPath functionPath, CatalogFunction catalogFunction, boolean ignoreIfExists) + throws FunctionAlreadyExistException, DatabaseNotExistException, CatalogException { + throw new UnsupportedOperationException("createFunction is not implemented."); + } + + @Override + public void alterFunction(ObjectPath functionPath, CatalogFunction catalogFunction, boolean ignoreIfNotExists) + throws FunctionNotExistException, CatalogException { + throw new UnsupportedOperationException("alterFunction is not implemented."); + } + + @Override + public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) + throws FunctionNotExistException, CatalogException { + throw new UnsupportedOperationException("dropFunction is not implemented."); + } + + @Override + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public CatalogTableStatistics getPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec) + throws PartitionNotExistException, CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec) + throws PartitionNotExistException, CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public void alterTableStatistics(ObjectPath tablePath, 
CatalogTableStatistics catalogTableStatistics, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + throw new UnsupportedOperationException("alterTableStatistics is not implemented."); + } + + @Override + public void alterTableColumnStatistics(ObjectPath tablePath, CatalogColumnStatistics catalogColumnStatistics, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException, TablePartitionedException { + throw new UnsupportedOperationException("alterTableColumnStatistics is not implemented."); + } + + @Override + public void alterPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec, CatalogTableStatistics catalogTableStatistics, boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new UnsupportedOperationException("alterPartitionStatistics is not implemented."); + } + + @Override + public void alterPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec, CatalogColumnStatistics catalogColumnStatistics, boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new UnsupportedOperationException("alterPartitionColumnStatistics is not implemented."); + } + + private @Nullable Schema getLatestTableSchema(String path) { + if (path != null && StreamerUtil.tableExists(path, hadoopConf)) { + try { + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(path, hadoopConf); + return new TableSchemaResolver(metaClient).getTableAvroSchema(false); // change log mode is not supported now + } catch (Throwable throwable) { + LOG.warn("Error while resolving the latest table schema.", throwable); + // ignored + } + } + return null; + } + + private Map applyOptionsHook(String tablePath, Map options) { + Map newOptions = new HashMap<>(options); + newOptions.put("connector", "hudi"); + newOptions.computeIfAbsent(FlinkOptions.PATH.key(), k -> tablePath); + tableCommonOptions.forEach(newOptions::putIfAbsent); + return newOptions; + } + + private String inferTablePath(String catalogPath, ObjectPath tablePath) { + return String.format("%s/%s/%s", catalogPath, tablePath.getDatabaseName(), tablePath.getObjectName()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalogFactory.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalogFactory.java new file mode 100644 index 0000000000000..436b836eff468 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalogFactory.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.exception.HoodieCatalogException; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.factories.CatalogFactory; +import org.apache.flink.table.factories.FactoryUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +/** + * A catalog factory impl that creates {@link HoodieCatalog}. + */ +public class HoodieCatalogFactory implements CatalogFactory { + private static final Logger LOG = LoggerFactory.getLogger(HoodieCatalogFactory.class); + + public static final String IDENTIFIER = "hudi"; + + @Override + public String factoryIdentifier() { + return IDENTIFIER; + } + + @Override + public Catalog createCatalog(Context context) { + final FactoryUtil.CatalogFactoryHelper helper = + FactoryUtil.createCatalogFactoryHelper(this, context); + helper.validate(); + String mode = helper.getOptions().get(CatalogOptions.MODE); + switch (mode.toLowerCase(Locale.ROOT)) { + case "hms": + return new HoodieHiveCatalog( + context.getName(), + (Configuration) helper.getOptions()); + case "dfs": + return new HoodieCatalog( + context.getName(), + (Configuration) helper.getOptions()); + default: + throw new HoodieCatalogException(String.format("Invalid catalog mode: %s, supported modes: [hms, dfs].", mode)); + } + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + return new HashSet<>(CatalogOptions.allOptions()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalogUtil.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalogUtil.java new file mode 100644 index 0000000000000..3dc191afb4c50 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalogUtil.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; + +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.flink.util.StringUtils.isNullOrWhitespaceOnly; +import static org.apache.hudi.table.catalog.CatalogOptions.HIVE_SITE_FILE; + +/** + * Utilities for Hoodie Catalog. + */ +public class HoodieCatalogUtil { + private static final Logger LOG = LoggerFactory.getLogger(HoodieCatalogUtil.class); + + /** + * Returns a new {@code HiveConf}. + * + * @param hiveConfDir Hive conf directory path. + * @return A HiveConf instance. + */ + public static HiveConf createHiveConf(@Nullable String hiveConfDir) { + // create HiveConf from hadoop configuration with hadoop conf directory configured. + Configuration hadoopConf = HadoopConfigurations.getHadoopConf(new org.apache.flink.configuration.Configuration()); + + // ignore all the static conf file URLs that HiveConf may have set + HiveConf.setHiveSiteLocation(null); + HiveConf.setLoadMetastoreConfig(false); + HiveConf.setLoadHiveServer2Config(false); + HiveConf hiveConf = new HiveConf(hadoopConf, HiveConf.class); + + LOG.info("Setting hive conf dir as {}", hiveConfDir); + + if (hiveConfDir != null) { + Path hiveSite = new Path(hiveConfDir, HIVE_SITE_FILE); + if (!hiveSite.toUri().isAbsolute()) { + // treat relative URI as local file to be compatible with previous behavior + hiveSite = new Path(new File(hiveSite.toString()).toURI()); + } + try (InputStream inputStream = hiveSite.getFileSystem(hadoopConf).open(hiveSite)) { + hiveConf.addResource(inputStream, hiveSite.toString()); + // trigger a read from the conf so that the input stream is read + isEmbeddedMetastore(hiveConf); + } catch (IOException e) { + throw new CatalogException( + "Failed to load hive-site.xml from specified path:" + hiveSite, e); + } + } else { + // user doesn't provide hive conf dir, we try to find it in classpath + URL hiveSite = + Thread.currentThread().getContextClassLoader().getResource(HIVE_SITE_FILE); + if (hiveSite != null) { + LOG.info("Found {} in classpath: {}", HIVE_SITE_FILE, hiveSite); + hiveConf.addResource(hiveSite); + } + } + return hiveConf; + } + + /** + * Check whether the hive.metastore.uris is empty + */ + public static boolean isEmbeddedMetastore(HiveConf hiveConf) { + return isNullOrWhitespaceOnly(hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS)); + } + + /** + * Returns the partition key list with given table. 
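+   *
+   * <p>For example (illustrative, not from the original code): a table created with
+   * {@code PARTITIONED BY (dt)} yields {@code ["dt"]} regardless of any partition-path
+   * option, while a non-partitioned table whose partition-path option is set to
+   * {@code "dt,hh"} yields {@code ["dt", "hh"]}.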
+ */ + public static List getPartitionKeys(CatalogTable table) { + // the PARTITIONED BY syntax always has higher priority than option FlinkOptions#PARTITION_PATH_FIELD + if (table.isPartitioned()) { + return table.getPartitionKeys(); + } else if (table.getOptions().containsKey(FlinkOptions.PARTITION_PATH_FIELD.key())) { + return Arrays.stream(table.getOptions().get(FlinkOptions.PARTITION_PATH_FIELD.key()).split(",")) + .collect(Collectors.toList()); + } + return Collections.emptyList(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java new file mode 100644 index 0000000000000..8a34695ff3b29 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java @@ -0,0 +1,917 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieCatalogException; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.sync.common.util.ConfigUtils; +import org.apache.hudi.table.format.FilePathUtils; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.DataTypeUtils; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.avro.Schema; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabase; +import org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabaseOwner; +import org.apache.flink.sql.parser.hive.ddl.SqlCreateHiveDatabase; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.CatalogFunction; +import org.apache.flink.table.catalog.CatalogPartition; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.CatalogPropertiesUtil; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogView; +import org.apache.flink.table.catalog.ObjectPath; +import 
org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.FunctionAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionAlreadyExistsException; +import org.apache.flink.table.catalog.exceptions.PartitionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.catalog.exceptions.TablePartitionedException; +import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; +import org.apache.flink.table.catalog.stats.CatalogTableStatistics; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.types.DataType; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.InvalidOperationException; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.PrincipalType; +import org.apache.hadoop.hive.metastore.api.SerDeInfo; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.api.UnknownDBException; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabase.ALTER_DATABASE_OP; +import static org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabaseOwner.DATABASE_OWNER_NAME; +import static org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabaseOwner.DATABASE_OWNER_TYPE; +import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.util.StringUtils.isNullOrWhitespaceOnly; +import static org.apache.hudi.configuration.FlinkOptions.PATH; +import static org.apache.hudi.table.catalog.TableOptionProperties.COMMENT; +import static org.apache.hudi.table.catalog.TableOptionProperties.PK_CONSTRAINT_NAME; +import static org.apache.hudi.table.catalog.TableOptionProperties.SPARK_SOURCE_PROVIDER; + +/** + * A catalog implementation for Hoodie based on MetaStore. 
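+ *
+ * <p>A minimal registration sketch for hms mode, with assumed option values (not from
+ * the original code); a reachable Hive Metastore is required because embedded
+ * metastores are rejected when the catalog is created through the factory, and
+ * {@code tableEnv} is an assumed Flink {@code TableEnvironment}:
+ * <pre>{@code
+ *   tableEnv.executeSql(
+ *       "CREATE CATALOG hudi_hms_catalog WITH ("
+ *           + " 'type' = 'hudi',"
+ *           + " 'mode' = 'hms',"
+ *           + " 'hive.conf.dir' = '/etc/hive/conf'"
+ *           + ")");
+ * }</pre>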
+ */ +public class HoodieHiveCatalog extends AbstractCatalog { + private static final Logger LOG = LoggerFactory.getLogger(HoodieHiveCatalog.class); + + private final HiveConf hiveConf; + private IMetaStoreClient client; + + // optional catalog base path: used for db/table path inference. + private final String catalogPath; + private final boolean external; + + public HoodieHiveCatalog(String catalogName, Configuration options) { + this(catalogName, options, HoodieCatalogUtil.createHiveConf(options.getString(CatalogOptions.HIVE_CONF_DIR)), false); + } + + public HoodieHiveCatalog( + String catalogName, + Configuration options, + HiveConf hiveConf, + boolean allowEmbedded) { + super(catalogName, options.getString(CatalogOptions.DEFAULT_DATABASE)); + // fallback to hive.metastore.warehouse.dir if catalog path is not specified + this.hiveConf = hiveConf; + this.catalogPath = options.getString(CatalogOptions.CATALOG_PATH, hiveConf.getVar(HiveConf.ConfVars.METASTOREWAREHOUSE)); + this.external = options.getBoolean(CatalogOptions.TABLE_EXTERNAL); + if (!allowEmbedded) { + checkArgument( + !HoodieCatalogUtil.isEmbeddedMetastore(this.hiveConf), + "Embedded metastore is not allowed. Make sure you have set a valid value for " + + HiveConf.ConfVars.METASTOREURIS); + } + LOG.info("Created Hoodie Catalog '{}' in hms mode", catalogName); + } + + @Override + public void open() throws CatalogException { + if (this.client == null) { + try { + this.client = Hive.get(hiveConf).getMSC(); + } catch (Exception e) { + throw new HoodieCatalogException("Failed to create hive metastore client", e); + } + LOG.info("Connected to Hive metastore"); + } + if (!databaseExists(getDefaultDatabase())) { + LOG.info("{} does not exist, will be created.", getDefaultDatabase()); + CatalogDatabase database = new CatalogDatabaseImpl(Collections.emptyMap(), "default database"); + try { + createDatabase(getDefaultDatabase(), database, true); + } catch (DatabaseAlreadyExistException e) { + throw new HoodieCatalogException(getName(), e); + } + } + } + + @Override + public void close() throws CatalogException { + if (client != null) { + client.close(); + client = null; + LOG.info("Disconnect to hive metastore"); + } + } + + public HiveConf getHiveConf() { + return hiveConf; + } + + // ------ databases ------ + + @Override + public List listDatabases() throws CatalogException { + try { + return client.getAllDatabases(); + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to list all databases in %s", getName()), e); + } + } + + private Database getHiveDatabase(String databaseName) throws DatabaseNotExistException { + try { + return client.getDatabase(databaseName); + } catch (NoSuchObjectException e) { + throw new DatabaseNotExistException(getName(), databaseName); + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to get database %s from %s", databaseName, getName()), e); + } + } + + @Override + public CatalogDatabase getDatabase(String databaseName) + throws DatabaseNotExistException, CatalogException { + Database hiveDatabase = getHiveDatabase(databaseName); + + Map properties = new HashMap<>(hiveDatabase.getParameters()); + + properties.put(SqlCreateHiveDatabase.DATABASE_LOCATION_URI, hiveDatabase.getLocationUri()); + + return new CatalogDatabaseImpl(properties, hiveDatabase.getDescription()); + } + + @Override + public boolean databaseExists(String databaseName) throws CatalogException { + try { + return client.getDatabase(databaseName) != null; + } catch 
(NoSuchObjectException e) { + return false; + } catch (TException e) { + throw new HoodieCatalogException( + String.format( + "Failed to determine whether database %s exists or not", databaseName), + e); + } + } + + @Override + public void createDatabase( + String databaseName, CatalogDatabase database, boolean ignoreIfExists) + throws DatabaseAlreadyExistException, CatalogException { + checkArgument( + !isNullOrWhitespaceOnly(databaseName), "Database name can not null or empty"); + checkNotNull(database, "database cannot be null"); + + Map properties = database.getProperties(); + + String dbLocationUri = properties.remove(SqlCreateHiveDatabase.DATABASE_LOCATION_URI); + if (dbLocationUri == null && this.catalogPath != null) { + // infer default location uri + dbLocationUri = new Path(this.catalogPath, databaseName).toString(); + } + + Database hiveDatabase = + new Database(databaseName, database.getComment(), dbLocationUri, properties); + + try { + client.createDatabase(hiveDatabase); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new DatabaseAlreadyExistException(getName(), hiveDatabase.getName()); + } + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to create database %s", hiveDatabase.getName()), e); + } + } + + @Override + public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) + throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { + try { + client.dropDatabase(name, true, ignoreIfNotExists, cascade); + } catch (NoSuchObjectException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } catch (InvalidOperationException e) { + throw new DatabaseNotEmptyException(getName(), name); + } catch (TException e) { + throw new HoodieCatalogException(String.format("Failed to drop database %s", name), e); + } + } + + @Override + public void alterDatabase( + String databaseName, CatalogDatabase newDatabase, boolean ignoreIfNotExists) + throws DatabaseNotExistException, CatalogException { + checkArgument( + !isNullOrWhitespaceOnly(databaseName), "Database name cannot be null or empty"); + checkNotNull(newDatabase, "New database cannot be null"); + + // client.alterDatabase doesn't throw any exception if there is no existing database + Database hiveDB; + try { + hiveDB = getHiveDatabase(databaseName); + } catch (DatabaseNotExistException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), databaseName); + } + + return; + } + + try { + client.alterDatabase(databaseName, alterDatabase(hiveDB, newDatabase)); + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to alter database %s", databaseName), e); + } + } + + private static Database alterDatabase(Database hiveDB, CatalogDatabase newDatabase) { + Map newParams = newDatabase.getProperties(); + String opStr = newParams.remove(ALTER_DATABASE_OP); + if (opStr == null) { + // by default is to alter db properties + opStr = SqlAlterHiveDatabase.AlterHiveDatabaseOp.CHANGE_PROPS.name(); + } + String newLocation = newParams.remove(SqlCreateHiveDatabase.DATABASE_LOCATION_URI); + SqlAlterHiveDatabase.AlterHiveDatabaseOp op = + SqlAlterHiveDatabase.AlterHiveDatabaseOp.valueOf(opStr); + switch (op) { + case CHANGE_PROPS: + hiveDB.setParameters(newParams); + break; + case CHANGE_LOCATION: + hiveDB.setLocationUri(newLocation); + break; + case CHANGE_OWNER: + String ownerName = newParams.remove(DATABASE_OWNER_NAME); + String ownerType = 
newParams.remove(DATABASE_OWNER_TYPE); + hiveDB.setOwnerName(ownerName); + switch (ownerType) { + case SqlAlterHiveDatabaseOwner.ROLE_OWNER: + hiveDB.setOwnerType(PrincipalType.ROLE); + break; + case SqlAlterHiveDatabaseOwner.USER_OWNER: + hiveDB.setOwnerType(PrincipalType.USER); + break; + default: + throw new CatalogException("Unsupported database owner type: " + ownerType); + } + break; + default: + throw new CatalogException("Unsupported alter database op:" + opStr); + } + // is_generic is deprecated, remove it + if (hiveDB.getParameters() != null) { + hiveDB.getParameters().remove(CatalogPropertiesUtil.IS_GENERIC); + } + return hiveDB; + } + + // ------ tables ------ + + private Table isHoodieTable(Table hiveTable) { + if (!hiveTable.getParameters().getOrDefault(SPARK_SOURCE_PROVIDER, "").equalsIgnoreCase("hudi") + && !isFlinkHoodieTable(hiveTable)) { + throw new HoodieCatalogException(String.format("the %s is not hoodie table", hiveTable.getTableName())); + } + return hiveTable; + } + + private boolean isFlinkHoodieTable(Table hiveTable) { + return hiveTable.getParameters().getOrDefault(CONNECTOR.key(), "").equalsIgnoreCase("hudi"); + } + + @VisibleForTesting + public Table getHiveTable(ObjectPath tablePath) throws TableNotExistException { + try { + Table hiveTable = client.getTable(tablePath.getDatabaseName(), tablePath.getObjectName()); + return isHoodieTable(hiveTable); + } catch (NoSuchObjectException e) { + throw new TableNotExistException(getName(), tablePath); + } catch (TException e) { + throw new HoodieCatalogException(String.format("Failed to get table %s from Hive metastore", tablePath.getObjectName())); + } + } + + private Table translateSparkTable2Flink(ObjectPath tablePath, Table hiveTable) { + if (!isFlinkHoodieTable(hiveTable)) { + try { + Map parameters = hiveTable.getParameters(); + parameters.putAll(TableOptionProperties.translateSparkTableProperties2Flink(hiveTable)); + String path = hiveTable.getSd().getLocation(); + parameters.put(PATH.key(), path); + if (!parameters.containsKey(FlinkOptions.HIVE_STYLE_PARTITIONING.key())) { + Path hoodieTablePath = new Path(path); + boolean hiveStyle = Arrays.stream(FSUtils.getFs(hoodieTablePath, hiveConf).listStatus(hoodieTablePath)) + .map(fileStatus -> fileStatus.getPath().getName()) + .filter(f -> !f.equals(".hoodie") && !f.equals("default")) + .anyMatch(FilePathUtils::isHiveStylePartitioning); + parameters.put(FlinkOptions.HIVE_STYLE_PARTITIONING.key(), String.valueOf(hiveStyle)); + } + client.alter_table(tablePath.getDatabaseName(), tablePath.getObjectName(), hiveTable); + } catch (Exception e) { + throw new HoodieCatalogException("Failed to update table schema", e); + } + } + return hiveTable; + } + + @Override + public CatalogBaseTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException { + checkNotNull(tablePath, "Table path cannot be null"); + Table hiveTable = translateSparkTable2Flink(tablePath, getHiveTable(tablePath)); + String path = hiveTable.getSd().getLocation(); + Map parameters = hiveTable.getParameters(); + Schema latestTableSchema = StreamerUtil.getLatestTableSchema(path, hiveConf); + String pkColumnsStr = parameters.get(FlinkOptions.RECORD_KEY_FIELD.key()); + List pkColumns = StringUtils.isNullOrEmpty(pkColumnsStr) + ? null : StringUtils.split(pkColumnsStr, ","); + org.apache.flink.table.api.Schema schema; + if (latestTableSchema != null) { + // if the table is initialized from spark, the write schema is nullable for pk columns. 
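+      // force the primary key columns to be non-nullable so that a primary key constraint can be declared on the Flink schema.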
+ DataType tableDataType = DataTypeUtils.ensureColumnsAsNonNullable( + AvroSchemaConverter.convertToDataType(latestTableSchema), pkColumns); + org.apache.flink.table.api.Schema.Builder builder = org.apache.flink.table.api.Schema.newBuilder() + .fromRowDataType(tableDataType); + String pkConstraintName = parameters.get(PK_CONSTRAINT_NAME); + if (!StringUtils.isNullOrEmpty(pkConstraintName)) { + // pkColumns expect not to be null + builder.primaryKeyNamed(pkConstraintName, pkColumns); + } else if (pkColumns != null) { + builder.primaryKey(pkColumns); + } + schema = builder.build(); + } else { + LOG.warn("{} does not have any hoodie schema, and use hive table schema to infer the table schema", tablePath); + schema = HiveSchemaUtils.convertTableSchema(hiveTable); + } + Map options = supplementOptions(tablePath, parameters); + return CatalogTable.of(schema, parameters.get(COMMENT), + HiveSchemaUtils.getFieldNames(hiveTable.getPartitionKeys()), options); + } + + @Override + public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) + throws TableAlreadyExistException, DatabaseNotExistException, CatalogException { + checkNotNull(tablePath, "Table path cannot be null"); + checkNotNull(table, "Table cannot be null"); + + if (!databaseExists(tablePath.getDatabaseName())) { + throw new DatabaseNotExistException(getName(), tablePath.getDatabaseName()); + } + + if (!table.getOptions().getOrDefault(CONNECTOR.key(), "").equalsIgnoreCase("hudi")) { + throw new HoodieCatalogException(String.format("The %s is not hoodie table", tablePath.getObjectName())); + } + + if (table instanceof CatalogView) { + throw new HoodieCatalogException("CREATE VIEW is not supported."); + } + + try { + boolean isMorTable = OptionsResolver.isMorTable(table.getOptions()); + Table hiveTable = instantiateHiveTable(tablePath, table, inferTablePath(tablePath, table), isMorTable); + //create hive table + client.createTable(hiveTable); + //init hoodie metaClient + initTableIfNotExists(tablePath, (CatalogTable) table); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new TableAlreadyExistException(getName(), tablePath, e); + } + } catch (Exception e) { + throw new HoodieCatalogException( + String.format("Failed to create table %s", tablePath.getFullName()), e); + } + } + + private void initTableIfNotExists(ObjectPath tablePath, CatalogTable catalogTable) { + Configuration flinkConf = Configuration.fromMap(catalogTable.getOptions()); + final String avroSchema = AvroSchemaConverter.convertToSchema(catalogTable.getSchema().toPersistedRowDataType().getLogicalType()).toString(); + flinkConf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, avroSchema); + + // stores two copies of options: + // - partition keys + // - primary keys + // because the HoodieTableMetaClient is a heavy impl, we try to avoid initializing it + // when calling #getTable. 
+ + if (catalogTable.getUnresolvedSchema().getPrimaryKey().isPresent() + && !flinkConf.contains(FlinkOptions.RECORD_KEY_FIELD)) { + final String pkColumns = String.join(",", catalogTable.getUnresolvedSchema().getPrimaryKey().get().getColumnNames()); + flinkConf.setString(FlinkOptions.RECORD_KEY_FIELD, pkColumns); + } + + if (catalogTable.isPartitioned() && !flinkConf.contains(FlinkOptions.PARTITION_PATH_FIELD)) { + final String partitions = String.join(",", catalogTable.getPartitionKeys()); + flinkConf.setString(FlinkOptions.PARTITION_PATH_FIELD, partitions); + } + + if (!flinkConf.getOptional(PATH).isPresent()) { + flinkConf.setString(PATH, inferTablePath(tablePath, catalogTable)); + } + + flinkConf.setString(FlinkOptions.TABLE_NAME, tablePath.getObjectName()); + try { + StreamerUtil.initTableIfNotExists(flinkConf, hiveConf); + } catch (IOException e) { + throw new HoodieCatalogException("Initialize table exception.", e); + } + } + + private String inferTablePath(ObjectPath tablePath, CatalogBaseTable table) { + String location = table.getOptions().getOrDefault(PATH.key(), ""); + if (StringUtils.isNullOrEmpty(location)) { + try { + Path dbLocation = new Path(client.getDatabase(tablePath.getDatabaseName()).getLocationUri()); + location = new Path(dbLocation, tablePath.getObjectName()).toString(); + } catch (TException e) { + throw new HoodieCatalogException(String.format("Failed to infer hoodie table path for table %s", tablePath), e); + } + } + return location; + } + + private Table instantiateHiveTable(ObjectPath tablePath, CatalogBaseTable table, String location, boolean useRealTimeInputFormat) throws IOException { + // let Hive set default parameters for us, e.g. serialization.format + Table hiveTable = + org.apache.hadoop.hive.ql.metadata.Table.getEmptyTable( + tablePath.getDatabaseName(), tablePath.getObjectName()); + + hiveTable.setOwner(UserGroupInformation.getCurrentUser().getUserName()); + hiveTable.setCreateTime((int) (System.currentTimeMillis() / 1000)); + + Map properties = new HashMap<>(table.getOptions()); + + if (external) { + hiveTable.setTableType(TableType.EXTERNAL_TABLE.toString()); + properties.put("EXTERNAL", "TRUE"); + } + + // Table comment + if (table.getComment() != null) { + properties.put(COMMENT, table.getComment()); + } + + //set pk + if (table.getUnresolvedSchema().getPrimaryKey().isPresent() + && !properties.containsKey(FlinkOptions.RECORD_KEY_FIELD.key())) { + String pkColumns = String.join(",", table.getUnresolvedSchema().getPrimaryKey().get().getColumnNames()); + properties.put(PK_CONSTRAINT_NAME, table.getUnresolvedSchema().getPrimaryKey().get().getConstraintName()); + properties.put(FlinkOptions.RECORD_KEY_FIELD.key(), pkColumns); + } + + if (!properties.containsKey(FlinkOptions.PATH.key())) { + properties.put(FlinkOptions.PATH.key(), location); + } + + //set sd + StorageDescriptor sd = new StorageDescriptor(); + // the metadata fields should be included to keep sync with the hive sync tool, + // because since Hive 3.x, there is validation when altering table, + // when the metadata fields are synced through the hive sync tool, + // a compatability issue would be reported. 
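+    // the operation meta field is only included in the Hive schema when the changelog mode is enabled.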
+ boolean withOperationField = Boolean.parseBoolean(table.getOptions().getOrDefault(FlinkOptions.CHANGELOG_ENABLED.key(), "false")); + List allColumns = HiveSchemaUtils.toHiveFieldSchema(table.getSchema(), withOperationField); + + // Table columns and partition keys + CatalogTable catalogTable = (CatalogTable) table; + + final List partitionKeys = HoodieCatalogUtil.getPartitionKeys(catalogTable); + if (partitionKeys.size() > 0) { + Pair, List> splitSchemas = HiveSchemaUtils.splitSchemaByPartitionKeys(allColumns, partitionKeys); + List regularColumns = splitSchemas.getLeft(); + List partitionColumns = splitSchemas.getRight(); + + sd.setCols(regularColumns); + hiveTable.setPartitionKeys(partitionColumns); + } else { + sd.setCols(allColumns); + hiveTable.setPartitionKeys(Collections.emptyList()); + } + + HoodieFileFormat baseFileFormat = HoodieFileFormat.PARQUET; + //ignore uber input Format + String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat); + String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat); + String serDeClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat); + sd.setInputFormat(inputFormatClassName); + sd.setOutputFormat(outputFormatClassName); + Map serdeProperties = new HashMap<>(); + serdeProperties.put("path", location); + serdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(!useRealTimeInputFormat)); + serdeProperties.put("serialization.format", "1"); + + serdeProperties.putAll(TableOptionProperties.translateFlinkTableProperties2Spark(catalogTable, hiveConf, properties, partitionKeys)); + + sd.setSerdeInfo(new SerDeInfo(null, serDeClassName, serdeProperties)); + + sd.setLocation(location); + hiveTable.setSd(sd); + + hiveTable.setParameters(properties); + return hiveTable; + } + + @Override + public List listTables(String databaseName) + throws DatabaseNotExistException, CatalogException { + checkArgument( + !isNullOrWhitespaceOnly(databaseName), "Database name cannot be null or empty"); + + try { + return client.getAllTables(databaseName); + } catch (UnknownDBException e) { + throw new DatabaseNotExistException(getName(), databaseName); + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to list tables in database %s", databaseName), e); + } + } + + @Override + public List listViews(String databaseName) + throws DatabaseNotExistException, CatalogException { + throw new HoodieCatalogException("Hoodie catalog does not support to listViews"); + } + + @Override + public boolean tableExists(ObjectPath tablePath) throws CatalogException { + checkNotNull(tablePath, "Table path cannot be null"); + + try { + return client.tableExists(tablePath.getDatabaseName(), tablePath.getObjectName()); + } catch (UnknownDBException e) { + return false; + } catch (TException e) { + throw new CatalogException( + String.format( + "Failed to check whether table %s exists or not.", + tablePath.getFullName()), + e); + } + } + + @Override + public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + checkNotNull(tablePath, "Table path cannot be null"); + + try { + client.dropTable( + tablePath.getDatabaseName(), + tablePath.getObjectName(), + // Indicate whether associated data should be deleted. + // Set to 'true' for now because Flink tables shouldn't have data in Hive. 
Can + // be changed later if necessary + true, + ignoreIfNotExists); + } catch (NoSuchObjectException e) { + if (!ignoreIfNotExists) { + throw new TableNotExistException(getName(), tablePath); + } + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to drop table %s", tablePath.getFullName()), e); + } + } + + @Override + public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) + throws TableNotExistException, TableAlreadyExistException, CatalogException { + checkNotNull(tablePath, "Table path cannot be null"); + checkArgument( + !isNullOrWhitespaceOnly(newTableName), "New table name cannot be null or empty"); + + try { + // alter_table() doesn't throw a clear exception when target table doesn't exist. + // Thus, check the table existence explicitly + if (tableExists(tablePath)) { + ObjectPath newPath = new ObjectPath(tablePath.getDatabaseName(), newTableName); + // alter_table() doesn't throw a clear exception when new table already exists. + // Thus, check the table existence explicitly + if (tableExists(newPath)) { + throw new TableAlreadyExistException(getName(), newPath); + } else { + Table hiveTable = getHiveTable(tablePath); + + //update hoodie + StorageDescriptor sd = hiveTable.getSd(); + String location = sd.getLocation(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(location).setConf(hiveConf).build(); + //Init table with new name + HoodieTableMetaClient.withPropertyBuilder().fromProperties(metaClient.getTableConfig().getProps()) + .setTableName(newTableName) + .initTable(hiveConf, location); + + hiveTable.setTableName(newTableName); + client.alter_table( + tablePath.getDatabaseName(), tablePath.getObjectName(), hiveTable); + } + } else if (!ignoreIfNotExists) { + throw new TableNotExistException(getName(), tablePath); + } + } catch (Exception e) { + throw new HoodieCatalogException( + String.format("Failed to rename table %s", tablePath.getFullName()), e); + } + } + + private boolean sameOptions(Map existingOptions, Map newOptions, ConfigOption option) { + return existingOptions.getOrDefault(option.key(), String.valueOf(option.defaultValue())) + .equalsIgnoreCase(newOptions.getOrDefault(option.key(), String.valueOf(option.defaultValue()))); + } + + @Override + public void alterTable( + ObjectPath tablePath, CatalogBaseTable newCatalogTable, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + checkNotNull(tablePath, "Table path cannot be null"); + checkNotNull(newCatalogTable, "New catalog table cannot be null"); + + if (!newCatalogTable.getOptions().getOrDefault(CONNECTOR.key(), "").equalsIgnoreCase("hudi")) { + throw new HoodieCatalogException(String.format("The %s is not hoodie table", tablePath.getObjectName())); + } + if (newCatalogTable instanceof CatalogView) { + throw new HoodieCatalogException("Hoodie catalog does not support to ALTER VIEW"); + } + + try { + Table hiveTable = getHiveTable(tablePath); + if (!sameOptions(hiveTable.getParameters(), newCatalogTable.getOptions(), FlinkOptions.TABLE_TYPE) + || !sameOptions(hiveTable.getParameters(), newCatalogTable.getOptions(), FlinkOptions.INDEX_TYPE)) { + throw new HoodieCatalogException("Hoodie catalog does not support to alter table type and index type"); + } + } catch (TableNotExistException e) { + if (!ignoreIfNotExists) { + throw e; + } + return; + } + + try { + boolean isMorTable = OptionsResolver.isMorTable(newCatalogTable.getOptions()); + Table hiveTable = 
instantiateHiveTable(tablePath, newCatalogTable, inferTablePath(tablePath, newCatalogTable), isMorTable); + //alter hive table + client.alter_table(tablePath.getDatabaseName(), tablePath.getObjectName(), hiveTable); + } catch (Exception e) { + LOG.error("Failed to alter table {}", tablePath.getObjectName(), e); + throw new HoodieCatalogException(String.format("Failed to alter table %s", tablePath.getObjectName()), e); + } + } + + @Override + public List listPartitions(ObjectPath tablePath) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws TableNotExistException, TableNotPartitionedException, + PartitionSpecInvalidException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public List listPartitionsByFilter( + ObjectPath tablePath, List expressions) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws PartitionNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void createPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws TableNotExistException, TableNotPartitionedException, + PartitionSpecInvalidException, PartitionAlreadyExistsException, + CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public List listFunctions(String databaseName) + throws DatabaseNotExistException, CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogFunction getFunction(ObjectPath functionPath) + throws FunctionNotExistException, CatalogException { + throw new FunctionNotExistException(getName(), functionPath); + } + + @Override + public boolean functionExists(ObjectPath functionPath) throws CatalogException { + return false; + } + + @Override + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) + throws FunctionAlreadyExistException, DatabaseNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) + throws FunctionNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) + throws 
FunctionNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws PartitionNotExistException, CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws PartitionNotExistException, CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void alterTableColumnStatistics( + ObjectPath tablePath, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException, TablePartitionedException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + @Override + public void alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + throw new HoodieCatalogException("Not supported."); + } + + private Map supplementOptions( + ObjectPath tablePath, + Map options) { + if (HoodieCatalogUtil.isEmbeddedMetastore(hiveConf)) { + return options; + } else { + Map newOptions = new HashMap<>(options); + // set up hive sync options + newOptions.putIfAbsent(FlinkOptions.HIVE_SYNC_ENABLED.key(), "true"); + newOptions.putIfAbsent(FlinkOptions.HIVE_SYNC_METASTORE_URIS.key(), hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS)); + newOptions.putIfAbsent(FlinkOptions.HIVE_SYNC_MODE.key(), "hms"); + newOptions.putIfAbsent(FlinkOptions.HIVE_SYNC_SUPPORT_TIMESTAMP.key(), "true"); + newOptions.computeIfAbsent(FlinkOptions.HIVE_SYNC_DB.key(), k -> tablePath.getDatabaseName()); + newOptions.computeIfAbsent(FlinkOptions.HIVE_SYNC_TABLE.key(), k -> tablePath.getObjectName()); + return newOptions; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/TableOptionProperties.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/TableOptionProperties.java new file mode 100644 index 0000000000000..a0864bbf3773b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/TableOptionProperties.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.sync.common.util.SparkDataSourceTableUtils; +import org.apache.hudi.util.AvroSchemaConverter; + +import org.apache.avro.Schema; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; + +import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; +import static org.apache.hudi.common.table.HoodieTableMetaClient.AUXILIARYFOLDER_NAME; + +/** + * Helper class to read/write flink table options as a map. 
+ */ +public class TableOptionProperties { + private static final Logger LOG = LoggerFactory.getLogger(TableOptionProperties.class); + + public static final String SPARK_SOURCE_PROVIDER = "spark.sql.sources.provider"; + public static final String SPARK_VERSION = "spark.version"; + public static final String DEFAULT_SPARK_VERSION = "spark2.4.4"; + static final Map VALUE_MAPPING = new HashMap<>(); + static final Map KEY_MAPPING = new HashMap<>(); + + private static final String FILE_NAME = "table_option.properties"; + + public static final String PK_CONSTRAINT_NAME = "pk.constraint.name"; + public static final String PK_COLUMNS = "pk.columns"; + public static final String COMMENT = "comment"; + public static final String PARTITION_COLUMNS = "partition.columns"; + + public static final List NON_OPTION_KEYS = Arrays.asList(PK_CONSTRAINT_NAME, PK_COLUMNS, COMMENT, PARTITION_COLUMNS); + + static { + VALUE_MAPPING.put("mor", HoodieTableType.MERGE_ON_READ.name()); + VALUE_MAPPING.put("cow", HoodieTableType.COPY_ON_WRITE.name()); + + VALUE_MAPPING.put(HoodieTableType.MERGE_ON_READ.name(), "mor"); + VALUE_MAPPING.put(HoodieTableType.COPY_ON_WRITE.name(), "cow"); + + KEY_MAPPING.put("type", FlinkOptions.TABLE_TYPE.key()); + KEY_MAPPING.put("primaryKey", FlinkOptions.RECORD_KEY_FIELD.key()); + KEY_MAPPING.put("preCombineField", FlinkOptions.PRECOMBINE_FIELD.key()); + KEY_MAPPING.put("payloadClass", FlinkOptions.PAYLOAD_CLASS_NAME.key()); + KEY_MAPPING.put(SPARK_SOURCE_PROVIDER, CONNECTOR.key()); + KEY_MAPPING.put(FlinkOptions.KEYGEN_CLASS_NAME.key(), FlinkOptions.KEYGEN_CLASS_NAME.key()); + KEY_MAPPING.put(FlinkOptions.TABLE_TYPE.key(), "type"); + KEY_MAPPING.put(FlinkOptions.RECORD_KEY_FIELD.key(), "primaryKey"); + KEY_MAPPING.put(FlinkOptions.PRECOMBINE_FIELD.key(), "preCombineField"); + KEY_MAPPING.put(FlinkOptions.PAYLOAD_CLASS_NAME.key(), "payloadClass"); + } + + /** + * Initialize the {@link #FILE_NAME} meta file. + */ + public static void createProperties(String basePath, + Configuration hadoopConf, + Map options) throws IOException { + Path propertiesFilePath = getPropertiesFilePath(basePath); + FileSystem fs = FSUtils.getFs(basePath, hadoopConf); + try (FSDataOutputStream outputStream = fs.create(propertiesFilePath)) { + Properties properties = new Properties(); + properties.putAll(options); + properties.store(outputStream, + "Table option properties saved on " + new Date(System.currentTimeMillis())); + } + LOG.info(String.format("Create file %s success.", propertiesFilePath)); + } + + /** + * Read table options map from the given table base path. 
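+   * <p>The options are loaded from the 'table_option.properties' file under the table auxiliary metadata folder.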
+ */ + public static Map loadFromProperties(String basePath, Configuration hadoopConf) { + Path propertiesFilePath = getPropertiesFilePath(basePath); + Map options = new HashMap<>(); + Properties props = new Properties(); + + FileSystem fs = FSUtils.getFs(basePath, hadoopConf); + try (FSDataInputStream inputStream = fs.open(propertiesFilePath)) { + props.load(inputStream); + for (final String name : props.stringPropertyNames()) { + options.put(name, props.getProperty(name)); + } + } catch (IOException e) { + throw new HoodieIOException(String.format("Could not load table option properties from %s", propertiesFilePath), e); + } + LOG.info(String.format("Loading table option properties from %s success.", propertiesFilePath)); + return options; + } + + private static Path getPropertiesFilePath(String basePath) { + String auxPath = basePath + Path.SEPARATOR + AUXILIARYFOLDER_NAME; + return new Path(auxPath, FILE_NAME); + } + + public static String getPkConstraintName(Map options) { + return options.get(PK_CONSTRAINT_NAME); + } + + public static List getPkColumns(Map options) { + if (options.containsKey(PK_COLUMNS)) { + return Arrays.stream(options.get(PK_COLUMNS).split(",")).collect(Collectors.toList()); + } else { + return Collections.emptyList(); + } + } + + public static List getPartitionColumns(Map options) { + if (options.containsKey(PARTITION_COLUMNS)) { + return Arrays.stream(options.get(PARTITION_COLUMNS).split(",")).collect(Collectors.toList()); + } else { + return Collections.emptyList(); + } + } + + public static String getComment(Map options) { + return options.get(COMMENT); + } + + public static Map getTableOptions(Map options) { + Map copied = new HashMap<>(options); + NON_OPTION_KEYS.forEach(copied::remove); + return copied; + } + + public static Map translateFlinkTableProperties2Spark( + CatalogTable catalogTable, + Configuration hadoopConf, + Map properties, + List partitionKeys) { + Schema schema = AvroSchemaConverter.convertToSchema(catalogTable.getSchema().toPhysicalRowDataType().getLogicalType()); + MessageType messageType = TableSchemaResolver.convertAvroSchemaToParquet(schema, hadoopConf); + String sparkVersion = catalogTable.getOptions().getOrDefault(SPARK_VERSION, DEFAULT_SPARK_VERSION); + Map sparkTableProperties = SparkDataSourceTableUtils.getSparkTableProperties( + partitionKeys, + sparkVersion, + 4000, + messageType); + properties.putAll(sparkTableProperties); + return properties.entrySet().stream() + .filter(e -> KEY_MAPPING.containsKey(e.getKey()) && !catalogTable.getOptions().containsKey(KEY_MAPPING.get(e.getKey()))) + .collect(Collectors.toMap(e -> KEY_MAPPING.get(e.getKey()), + e -> e.getKey().equalsIgnoreCase(FlinkOptions.TABLE_TYPE.key()) ? VALUE_MAPPING.get(e.getValue()) : e.getValue())); + } + + public static Map translateSparkTableProperties2Flink(Map options) { + if (options.containsKey(CONNECTOR.key())) { + return options; + } + return options.entrySet().stream().filter(e -> KEY_MAPPING.containsKey(e.getKey())) + .collect(Collectors.toMap(e -> KEY_MAPPING.get(e.getKey()), + e -> e.getKey().equalsIgnoreCase("type") ? 
VALUE_MAPPING.get(e.getValue()) : e.getValue())); + } + + public static Map translateSparkTableProperties2Flink(Table hiveTable) { + return translateSparkTableProperties2Flink(hiveTable.getParameters()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/TypeInfoLogicalTypeVisitor.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/TypeInfoLogicalTypeVisitor.java new file mode 100644 index 0000000000000..e6b15788fe79e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/TypeInfoLogicalTypeVisitor.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.BigIntType; +import org.apache.flink.table.types.logical.BooleanType; +import org.apache.flink.table.types.logical.CharType; +import org.apache.flink.table.types.logical.DateType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.DoubleType; +import org.apache.flink.table.types.logical.FloatType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.NullType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.SmallIntType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.TinyIntType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.flink.table.types.logical.utils.LogicalTypeDefaultVisitor; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import java.util.ArrayList; +import java.util.List; + +/** + * Create a TypeInfoLogicalTypeVisitor for hoodie table. 
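+ * <p>Maps Flink logical types to Hive TypeInfo. Only Avro compatible types are supported: CHAR/VARCHAR map to STRING and TINYINT/SMALLINT map to INT.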
+ */ +public class TypeInfoLogicalTypeVisitor extends LogicalTypeDefaultVisitor { + private final LogicalType type; + + TypeInfoLogicalTypeVisitor(DataType dataType) { + this(dataType.getLogicalType()); + } + + TypeInfoLogicalTypeVisitor(LogicalType type) { + this.type = type; + } + + @Override + public TypeInfo visit(CharType charType) { + // hoodie only supports avro compatible data type + return TypeInfoFactory.stringTypeInfo; + } + + @Override + public TypeInfo visit(VarCharType varCharType) { + // hoodie only supports avro compatible data type + return TypeInfoFactory.stringTypeInfo; + } + + @Override + public TypeInfo visit(BooleanType booleanType) { + return TypeInfoFactory.booleanTypeInfo; + } + + @Override + public TypeInfo visit(VarBinaryType varBinaryType) { + // Flink's BytesType is defined as VARBINARY(Integer.MAX_VALUE) + // We don't have more information in LogicalTypeRoot to distinguish BytesType and a + // VARBINARY(Integer.MAX_VALUE) instance + // Thus always treat VARBINARY(Integer.MAX_VALUE) as BytesType + if (varBinaryType.getLength() == VarBinaryType.MAX_LENGTH) { + return TypeInfoFactory.binaryTypeInfo; + } + return defaultMethod(varBinaryType); + } + + @Override + public TypeInfo visit(DecimalType decimalType) { + // Flink and Hive share the same precision and scale range + // Flink already validates the type so we don't need to validate again here + return TypeInfoFactory.getDecimalTypeInfo( + decimalType.getPrecision(), decimalType.getScale()); + } + + @Override + public TypeInfo visit(TinyIntType tinyIntType) { + // hoodie only supports avro compatible data type + return TypeInfoFactory.intTypeInfo; + } + + @Override + public TypeInfo visit(SmallIntType smallIntType) { + // hoodie only supports avro compatible data type + return TypeInfoFactory.intTypeInfo; + } + + @Override + public TypeInfo visit(IntType intType) { + return TypeInfoFactory.intTypeInfo; + } + + @Override + public TypeInfo visit(BigIntType bigIntType) { + return TypeInfoFactory.longTypeInfo; + } + + @Override + public TypeInfo visit(FloatType floatType) { + return TypeInfoFactory.floatTypeInfo; + } + + @Override + public TypeInfo visit(DoubleType doubleType) { + return TypeInfoFactory.doubleTypeInfo; + } + + @Override + public TypeInfo visit(DateType dateType) { + return TypeInfoFactory.dateTypeInfo; + } + + @Override + public TypeInfo visit(TimestampType timestampType) { + int precision = timestampType.getPrecision(); + // see org.apache.hudi.hive.util.HiveSchemaUtil#convertField for details. 
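+    // precision 6 (micros) maps to the Hive TIMESTAMP type, other precisions fall back to BIGINT.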
+ // default supports timestamp + if (precision == 6) { + return TypeInfoFactory.timestampTypeInfo; + } else { + return TypeInfoFactory.longTypeInfo; + } + } + + @Override + public TypeInfo visit(ArrayType arrayType) { + LogicalType elementType = arrayType.getElementType(); + TypeInfo elementTypeInfo = elementType.accept(this); + if (null != elementTypeInfo) { + return TypeInfoFactory.getListTypeInfo(elementTypeInfo); + } else { + return defaultMethod(arrayType); + } + } + + @Override + public TypeInfo visit(MapType mapType) { + LogicalType keyType = mapType.getKeyType(); + LogicalType valueType = mapType.getValueType(); + TypeInfo keyTypeInfo = keyType.accept(this); + TypeInfo valueTypeInfo = valueType.accept(this); + if (null == keyTypeInfo || null == valueTypeInfo) { + return defaultMethod(mapType); + } else { + return TypeInfoFactory.getMapTypeInfo(keyTypeInfo, valueTypeInfo); + } + } + + @Override + public TypeInfo visit(RowType rowType) { + List names = rowType.getFieldNames(); + List typeInfos = new ArrayList<>(names.size()); + for (String name : names) { + TypeInfo typeInfo = rowType.getTypeAt(rowType.getFieldIndex(name)).accept(this); + if (null != typeInfo) { + typeInfos.add(typeInfo); + } else { + return defaultMethod(rowType); + } + } + return TypeInfoFactory.getStructTypeInfo(names, typeInfos); + } + + @Override + public TypeInfo visit(NullType nullType) { + return TypeInfoFactory.voidTypeInfo; + } + + @Override + protected TypeInfo defaultMethod(LogicalType logicalType) { + throw new UnsupportedOperationException( + String.format( + "Flink doesn't support converting type %s to Hive type yet.", + type.toString())); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java new file mode 100644 index 0000000000000..0da1aca0e243c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.configuration.FlinkOptions; + +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.table.api.TableException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Reference the Flink {@link org.apache.flink.table.utils.PartitionPathUtils} + * but supports simple partition path besides the Hive style. + */ +public class FilePathUtils { + + private static final Pattern HIVE_PARTITION_NAME_PATTERN = Pattern.compile("([^/]+)=([^/]+)"); + + private static final BitSet CHAR_TO_ESCAPE = new BitSet(128); + + static { + for (char c = 0; c < ' '; c++) { + CHAR_TO_ESCAPE.set(c); + } + + /* + * ASCII 01-1F are HTTP control characters that need to be escaped. + * \u000A and \u000D are \n and \r, respectively. + */ + char[] clist = new char[] {'\u0001', '\u0002', '\u0003', '\u0004', + '\u0005', '\u0006', '\u0007', '\u0008', '\u0009', '\n', '\u000B', + '\u000C', '\r', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', + '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', + '\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F', + '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '\u007F', '{', + '[', ']', '^'}; + + for (char c : clist) { + CHAR_TO_ESCAPE.set(c); + } + } + + private static boolean needsEscaping(char c) { + return c < CHAR_TO_ESCAPE.size() && CHAR_TO_ESCAPE.get(c); + } + + /** + * Make partition path from partition spec. + * + * @param partitionKVs The partition key value mapping + * @param hivePartition Whether the partition path is with Hive style, + * e.g. {partition key} = {partition value} + * @param sepSuffix Whether to append the path separator as suffix + * @return an escaped, valid partition name + */ + public static String generatePartitionPath( + LinkedHashMap partitionKVs, + boolean hivePartition, + boolean sepSuffix) { + if (partitionKVs.isEmpty()) { + return ""; + } + StringBuilder suffixBuf = new StringBuilder(); + int i = 0; + for (Map.Entry e : partitionKVs.entrySet()) { + if (i > 0) { + suffixBuf.append(Path.SEPARATOR); + } + if (hivePartition) { + suffixBuf.append(escapePathName(e.getKey())); + suffixBuf.append('='); + } + suffixBuf.append(escapePathName(e.getValue())); + i++; + } + if (sepSuffix) { + suffixBuf.append(Path.SEPARATOR); + } + return suffixBuf.toString(); + } + + /** + * Escapes a path name. + * + * @param path The path to escape. + * @return An escaped path name. + */ + private static String escapePathName(String path) { + if (path == null || path.length() == 0) { + throw new TableException("Path should not be null or empty: " + path); + } + + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < path.length(); i++) { + char c = path.charAt(i); + if (needsEscaping(c)) { + sb.append('%'); + sb.append(String.format("%1$02X", (int) c)); + } else { + sb.append(c); + } + } + return sb.toString(); + } + + /** + * Generates partition key value mapping from path. 
+ * + * @param currPath Partition file path + * @param hivePartition Whether the partition path is with Hive style + * @param partitionKeys Partition keys + * @return Sequential partition specs. + */ + public static LinkedHashMap extractPartitionKeyValues( + Path currPath, + boolean hivePartition, + String[] partitionKeys) { + LinkedHashMap fullPartSpec = new LinkedHashMap<>(); + if (partitionKeys.length == 0) { + return fullPartSpec; + } + List kvs = new ArrayList<>(); + int curDepth = 0; + do { + String component = currPath.getName(); + final String[] kv = new String[2]; + if (hivePartition) { + Matcher m = HIVE_PARTITION_NAME_PATTERN.matcher(component); + if (m.matches()) { + String k = unescapePathName(m.group(1)); + String v = unescapePathName(m.group(2)); + kv[0] = k; + kv[1] = v; + } + } else { + kv[0] = partitionKeys[partitionKeys.length - 1 - curDepth]; + kv[1] = unescapePathName(component); + } + kvs.add(kv); + currPath = currPath.getParent(); + curDepth++; + } while (currPath != null && !currPath.getName().isEmpty() && curDepth < partitionKeys.length); + + // reverse the list since we checked the part from leaf dir to table's base dir + for (int i = kvs.size(); i > 0; i--) { + fullPartSpec.put(kvs.get(i - 1)[0], kvs.get(i - 1)[1]); + } + + return fullPartSpec; + } + + public static String unescapePathName(String path) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < path.length(); i++) { + char c = path.charAt(i); + if (c == '%' && i + 2 < path.length()) { + int code = -1; + try { + code = Integer.parseInt(path.substring(i + 1, i + 3), 16); + } catch (Exception ignored) { + // do nothing + } + if (code >= 0) { + sb.append((char) code); + i += 2; + continue; + } + } + sb.append(c); + } + return sb.toString(); + } + + /** + * Search all partitions in this path. + * + * @param fs File system + * @param path Search path + * @param hivePartition Whether the partition path is with Hive style + * @param partitionKeys Partition keys + * @return all partition key value mapping in sequence of the given path + */ + public static List, Path>> searchPartKeyValueAndPaths( + FileSystem fs, + Path path, + boolean hivePartition, + String[] partitionKeys) { + // expectLevel start from 0, E.G. 
base_path/level0/level1/level2 + FileStatus[] generatedParts = getFileStatusRecursively(path, partitionKeys.length, fs); + List, Path>> ret = new ArrayList<>(); + for (FileStatus part : generatedParts) { + ret.add( + new Tuple2<>( + extractPartitionKeyValues(part.getPath(), hivePartition, partitionKeys), + part.getPath())); + } + return ret; + } + + public static FileStatus[] getFileStatusRecursively(Path path, int expectLevel, Configuration conf) { + return getFileStatusRecursively(path, expectLevel, FSUtils.getFs(path.toString(), conf)); + } + + public static FileStatus[] getFileStatusRecursively(Path path, int expectLevel, FileSystem fs) { + ArrayList result = new ArrayList<>(); + + try { + FileStatus fileStatus = fs.getFileStatus(path); + listStatusRecursively(fs, fileStatus, 0, expectLevel, result); + } catch (IOException ignore) { + return new FileStatus[0]; + } + + return result.toArray(new FileStatus[0]); + } + + private static void listStatusRecursively( + FileSystem fs, + FileStatus fileStatus, + int level, + int expectLevel, + List results) throws IOException { + if (expectLevel == level && !isHiddenFile(fileStatus)) { + results.add(fileStatus); + return; + } + + if (fileStatus.isDirectory() && !isHiddenFile(fileStatus)) { + for (FileStatus stat : fs.listStatus(fileStatus.getPath())) { + listStatusRecursively(fs, stat, level + 1, expectLevel, results); + } + } + } + + private static boolean isHiddenFile(FileStatus fileStatus) { + String name = fileStatus.getPath().getName(); + // the log files is hidden file + return name.startsWith("_") || (name.startsWith(".") && !name.contains(".log.")); + } + + /** + * Returns the partition path key and values as a list of map, each map item in the list + * is a mapping of the partition key name to its actual partition value. For example, say + * there is a file path with partition keys [key1, key2, key3]: + * + *

+   * <pre>
+   *   -- file:/// ... key1=val1/key2=val2/key3=val3
+   *   -- file:/// ... key1=val4/key2=val5/key3=val6
+   * </pre>
+   *
+   * <p>
    The return list should be [{key1:val1, key2:val2, key3:val3}, {key1:val4, key2:val5, key3:val6}]. + * + * @param path The base path + * @param hadoopConf The hadoop configuration + * @param partitionKeys The partition key list + * @param defaultParName The default partition name for nulls + * @param hivePartition Whether the partition path is in Hive style + */ + public static List> getPartitions( + Path path, + Configuration hadoopConf, + List partitionKeys, + String defaultParName, + boolean hivePartition) { + try { + return FilePathUtils + .searchPartKeyValueAndPaths( + FSUtils.getFs(path.toString(), hadoopConf), + path, + hivePartition, + partitionKeys.toArray(new String[0])) + .stream() + .map(tuple2 -> tuple2.f0) + .map(spec -> { + LinkedHashMap ret = new LinkedHashMap<>(); + spec.forEach((k, v) -> ret.put(k, defaultParName.equals(v) ? null : v)); + return ret; + }) + .collect(Collectors.toList()); + } catch (Exception e) { + throw new TableException("Fetch partitions fail.", e); + } + } + + /** + * Reorder the partition key value mapping based on the given partition keys sequence. + * + * @param partitionKVs The partition key and value mapping + * @param partitionKeys The partition key list + */ + public static LinkedHashMap validateAndReorderPartitions( + Map partitionKVs, + List partitionKeys) { + if (partitionKeys.size() == 0) { + // in case the partition fields are not in schema + return new LinkedHashMap<>(partitionKVs); + } + LinkedHashMap map = new LinkedHashMap<>(); + for (String k : partitionKeys) { + if (!partitionKVs.containsKey(k)) { + throw new TableException("Partition keys are: " + partitionKeys + + ", incomplete partition spec: " + partitionKVs); + } + map.put(k, partitionKVs.get(k)); + } + return map; + } + + /** + * Returns all the file paths that is the parents of the data files. + * + * @param path The base path + * @param conf The Flink configuration + * @param hadoopConf The hadoop configuration + * @param partitionKeys The partition key list + */ + public static Path[] getReadPaths( + Path path, + org.apache.flink.configuration.Configuration conf, + Configuration hadoopConf, + List partitionKeys) { + if (partitionKeys.isEmpty()) { + return new Path[] {path}; + } else { + final String defaultParName = conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME); + final boolean hivePartition = conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING); + List> partitionPaths = + getPartitions(path, hadoopConf, partitionKeys, defaultParName, hivePartition); + return partitionPath2ReadPath(path, partitionKeys, partitionPaths, hivePartition); + } + } + + /** + * Transforms the given partition key value mapping to read paths. + * + * @param path The base path + * @param partitionKeys The partition key list + * @param partitionPaths The partition key value mapping + * @param hivePartition Whether the partition path is in Hive style + * @see #getReadPaths + */ + public static Path[] partitionPath2ReadPath( + Path path, + List partitionKeys, + List> partitionPaths, + boolean hivePartition) { + return partitionPaths.stream() + .map(m -> validateAndReorderPartitions(m, partitionKeys)) + .map(kvs -> FilePathUtils.generatePartitionPath(kvs, hivePartition, true)) + .map(n -> new Path(path, n)) + .toArray(Path[]::new); + } + + /** + * Transforms the given partition key value mapping to relative partition paths. 
+ * + * @param partitionKeys The partition key list + * @param partitionPaths The partition key value mapping + * @param hivePartition Whether the partition path is in Hive style + * @see #getReadPaths + */ + public static Set toRelativePartitionPaths( + List partitionKeys, + List> partitionPaths, + boolean hivePartition) { + return partitionPaths.stream() + .map(m -> validateAndReorderPartitions(m, partitionKeys)) + .map(kvs -> FilePathUtils.generatePartitionPath(kvs, hivePartition, false)) + .collect(Collectors.toSet()); + } + + /** + * Transforms the array of Hadoop paths to Flink paths. + */ + public static org.apache.flink.core.fs.Path[] toFlinkPaths(Path[] paths) { + return Arrays.stream(paths) + .map(FilePathUtils::toFlinkPath) + .toArray(org.apache.flink.core.fs.Path[]::new); + } + + /** + * Transforms the Hadoop path to Flink path. + */ + public static org.apache.flink.core.fs.Path toFlinkPath(Path path) { + return new org.apache.flink.core.fs.Path(path.toUri()); + } + + /** + * Extracts the partition keys with given configuration. + * + * @param conf The flink configuration + * @return array of the partition fields + */ + public static String[] extractPartitionKeys(org.apache.flink.configuration.Configuration conf) { + if (FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.PARTITION_PATH_FIELD)) { + return new String[0]; + } + return conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(","); + } + + /** + * Extracts the hive sync partition fields with given configuration. + * + * @param conf The flink configuration + * @return array of the hive partition fields + */ + public static String[] extractHivePartitionFields(org.apache.flink.configuration.Configuration conf) { + if (FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.HIVE_SYNC_PARTITION_FIELDS)) { + return extractPartitionKeys(conf); + } + return conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS).split(","); + } + + public static boolean isHiveStylePartitioning(String path) { + return HIVE_PARTITION_NAME_PATTERN.matcher(path).matches(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java new file mode 100644 index 0000000000000..6357b898d49da --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; +import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.Functions; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; +import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; +import org.apache.hudi.common.util.queue.FunctionBasedQueueProducer; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.function.Function; + +/** + * Utilities for format. + */ +public class FormatUtils { + private FormatUtils() { + } + + /** + * Sets up the row kind to the row data {@code rowData} from the resolved operation. + */ + public static void setRowKind(RowData rowData, IndexedRecord record, int index) { + if (index == -1) { + return; + } + rowData.setRowKind(getRowKind(record, index)); + } + + /** + * Returns the RowKind of the given record, never null. + * Returns RowKind.INSERT when the given field value not found. + */ + private static RowKind getRowKind(IndexedRecord record, int index) { + Object val = record.get(index); + if (val == null) { + return RowKind.INSERT; + } + final HoodieOperation operation = HoodieOperation.fromName(val.toString()); + if (HoodieOperation.isInsert(operation)) { + return RowKind.INSERT; + } else if (HoodieOperation.isUpdateBefore(operation)) { + return RowKind.UPDATE_BEFORE; + } else if (HoodieOperation.isUpdateAfter(operation)) { + return RowKind.UPDATE_AFTER; + } else if (HoodieOperation.isDelete(operation)) { + return RowKind.DELETE; + } else { + throw new AssertionError(); + } + } + + /** + * Returns the RowKind of the given record, never null. + * Returns RowKind.INSERT when the given field value not found. + */ + public static RowKind getRowKindSafely(IndexedRecord record, int index) { + if (index == -1) { + return RowKind.INSERT; + } + return getRowKind(record, index); + } + + public static GenericRecord buildAvroRecordBySchema( + IndexedRecord record, + Schema requiredSchema, + int[] requiredPos, + GenericRecordBuilder recordBuilder) { + List requiredFields = requiredSchema.getFields(); + assert (requiredFields.size() == requiredPos.length); + Iterator positionIterator = Arrays.stream(requiredPos).iterator(); + requiredFields.forEach(f -> recordBuilder.set(f, getVal(record, positionIterator.next()))); + return recordBuilder.build(); + } + + private static Object getVal(IndexedRecord record, int pos) { + return pos == -1 ? 
null : record.get(pos); + } + + public static HoodieMergedLogRecordScanner logScanner( + MergeOnReadInputSplit split, + Schema logSchema, + org.apache.flink.configuration.Configuration flinkConf, + Configuration hadoopConf) { + HoodieWriteConfig writeConfig = FlinkWriteClients.getHoodieClientConfig(flinkConf); + FileSystem fs = FSUtils.getFs(split.getTablePath(), hadoopConf); + return HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(split.getTablePath()) + .withLogFilePaths(split.getLogPaths().get()) + .withReaderSchema(logSchema) + .withLatestInstantTime(split.getLatestCommit()) + .withReadBlocksLazily(writeConfig.getCompactionLazyBlockReadEnabled()) + .withReverseReader(false) + .withBufferSize(writeConfig.getMaxDFSStreamBufferSize()) + .withMaxMemorySizeInBytes(split.getMaxCompactionMemoryInBytes()) + .withDiskMapType(writeConfig.getCommonConfig().getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .withSpillableMapBasePath(writeConfig.getSpillableMapBasePath()) + .withInstantRange(split.getInstantRange()) + .withOperationField(flinkConf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)) + .build(); + } + + private static HoodieUnMergedLogRecordScanner unMergedLogScanner( + MergeOnReadInputSplit split, + Schema logSchema, + org.apache.flink.configuration.Configuration flinkConf, + Configuration hadoopConf, + HoodieUnMergedLogRecordScanner.LogRecordScannerCallback callback) { + FileSystem fs = FSUtils.getFs(split.getTablePath(), hadoopConf); + return HoodieUnMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(split.getTablePath()) + .withLogFilePaths(split.getLogPaths().get()) + .withReaderSchema(logSchema) + .withLatestInstantTime(split.getLatestCommit()) + .withReadBlocksLazily( + string2Boolean( + flinkConf.getString(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, + HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED))) + .withReverseReader(false) + .withBufferSize( + flinkConf.getInteger(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, + HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) + .withInstantRange(split.getInstantRange()) + .withLogRecordScannerCallback(callback) + .build(); + } + + /** + * Utility to read and buffer the records in the unMerged log record scanner. 
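+ *
+ * <p>Illustrative sketch only, not part of this patch; {@code split}, {@code logSchema},
+ * {@code hadoopConf} and {@code flinkConf} are assumed to be prepared by the caller:
+ * <pre>
+ *   FormatUtils.BoundedMemoryRecords records =
+ *       new FormatUtils.BoundedMemoryRecords(split, logSchema, hadoopConf, flinkConf);
+ *   records.getRecordsIterator().forEachRemaining(record -> handle(record)); // 'handle' is a placeholder
+ *   records.close(); // shuts down the bounded in-memory executor
+ * </pre>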
+ */ + public static class BoundedMemoryRecords { + // Log Record unmerged scanner + private final HoodieUnMergedLogRecordScanner scanner; + + // Executor that runs the above producers in parallel + private final BoundedInMemoryExecutor, HoodieRecord, ?> executor; + + // Iterator for the buffer consumer + private final Iterator> iterator; + + public BoundedMemoryRecords( + MergeOnReadInputSplit split, + Schema logSchema, + Configuration hadoopConf, + org.apache.flink.configuration.Configuration flinkConf) { + this.executor = new BoundedInMemoryExecutor<>( + StreamerUtil.getMaxCompactionMemoryInBytes(flinkConf), + getParallelProducers(), + Option.empty(), + Function.identity(), + new DefaultSizeEstimator<>(), + Functions.noop()); + // Consumer of this record reader + this.iterator = this.executor.getQueue().iterator(); + this.scanner = FormatUtils.unMergedLogScanner(split, logSchema, flinkConf, hadoopConf, + record -> executor.getQueue().insertRecord(record)); + // Start reading and buffering + this.executor.startProducers(); + } + + public Iterator> getRecordsIterator() { + return this.iterator; + } + + /** + * Setup log and parquet reading in parallel. Both write to central buffer. + */ + private List>> getParallelProducers() { + List>> producers = new ArrayList<>(); + producers.add(new FunctionBasedQueueProducer<>(buffer -> { + scanner.scan(); + return null; + })); + return producers; + } + + public void close() { + this.executor.shutdownNow(); + } + } + + public static HoodieMergedLogRecordScanner logScanner( + List logPaths, + Schema logSchema, + String latestInstantTime, + HoodieWriteConfig writeConfig, + Configuration hadoopConf) { + String basePath = writeConfig.getBasePath(); + return HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(FSUtils.getFs(basePath, hadoopConf)) + .withBasePath(basePath) + .withLogFilePaths(logPaths) + .withReaderSchema(logSchema) + .withLatestInstantTime(latestInstantTime) + .withReadBlocksLazily(writeConfig.getCompactionLazyBlockReadEnabled()) + .withReverseReader(false) + .withBufferSize(writeConfig.getMaxDFSStreamBufferSize()) + .withMaxMemorySizeInBytes(writeConfig.getMaxMemoryPerPartitionMerge()) + .withSpillableMapBasePath(writeConfig.getSpillableMapBasePath()) + .withDiskMapType(writeConfig.getCommonConfig().getSpillableDiskMapType()) + .withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .build(); + } + + private static Boolean string2Boolean(String s) { + return "true".equals(s.toLowerCase(Locale.ROOT)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/CopyOnWriteInputFormat.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/CopyOnWriteInputFormat.java new file mode 100644 index 0000000000000..c5ea3d4ab98f4 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/CopyOnWriteInputFormat.java @@ -0,0 +1,397 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow; + +import java.util.Comparator; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.table.format.cow.vector.reader.ParquetColumnarRowSplitReader; +import org.apache.hudi.util.DataTypeUtils; + +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.io.FilePathFilter; +import org.apache.flink.api.common.io.GlobFilePathFilter; +import org.apache.flink.api.common.io.compression.InflaterInputStreamFactory; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.SerializableConfiguration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.utils.PartitionPathUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Set; + +/** + * An implementation of {@link FileInputFormat} to read {@link RowData} records + * from Parquet files. + * + *

Note: Reference Flink release 1.11.2
+ * {@code org.apache.flink.formats.parquet.ParquetFileSystemFormatFactory.ParquetInputFormat}
+ * to support TIMESTAMP_MILLIS.
+ *
+ *
    Note: Override the {@link #createInputSplits} method from parent to rewrite the logic creating the FileSystem, + * use {@link FSUtils#getFs} to get a plugin filesystem. + * + * @see ParquetSplitReaderUtil + */ +public class CopyOnWriteInputFormat extends FileInputFormat { + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(CopyOnWriteInputFormat.class); + + private final String[] fullFieldNames; + private final DataType[] fullFieldTypes; + private final int[] selectedFields; + private final String partDefaultName; + private final boolean utcTimestamp; + private final SerializableConfiguration conf; + private final long limit; + + private transient ParquetColumnarRowSplitReader reader; + private transient long currentReadCount; + + /** + * Files filter for determining what files/directories should be included. + */ + private FilePathFilter localFilesFilter = new GlobFilePathFilter(); + + public CopyOnWriteInputFormat( + Path[] paths, + String[] fullFieldNames, + DataType[] fullFieldTypes, + int[] selectedFields, + String partDefaultName, + long limit, + Configuration conf, + boolean utcTimestamp) { + super.setFilePaths(paths); + this.limit = limit; + this.partDefaultName = partDefaultName; + this.fullFieldNames = fullFieldNames; + this.fullFieldTypes = fullFieldTypes; + this.selectedFields = selectedFields; + this.conf = new SerializableConfiguration(conf); + this.utcTimestamp = utcTimestamp; + } + + @Override + public void open(FileInputSplit fileSplit) throws IOException { + // generate partition specs. + List fieldNameList = Arrays.asList(fullFieldNames); + LinkedHashMap partSpec = PartitionPathUtils.extractPartitionSpecFromPath( + fileSplit.getPath()); + LinkedHashMap partObjects = new LinkedHashMap<>(); + partSpec.forEach((k, v) -> { + final int idx = fieldNameList.indexOf(k); + if (idx == -1) { + // for any rare cases that the partition field does not exist in schema, + // fallback to file read + return; + } + DataType fieldType = fullFieldTypes[idx]; + if (!DataTypeUtils.isDatetimeType(fieldType)) { + // date time type partition field is formatted specifically, + // read directly from the data file to avoid format mismatch or precision loss + partObjects.put(k, DataTypeUtils.resolvePartition(partDefaultName.equals(v) ? 
null : v, fieldType)); + } + }); + + this.reader = ParquetSplitReaderUtil.genPartColumnarRowReader( + utcTimestamp, + true, + conf.conf(), + fullFieldNames, + fullFieldTypes, + partObjects, + selectedFields, + 2048, + fileSplit.getPath(), + fileSplit.getStart(), + fileSplit.getLength()); + this.currentReadCount = 0L; + } + + @Override + public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException { + if (minNumSplits < 1) { + throw new IllegalArgumentException("Number of input splits has to be at least 1."); + } + + // take the desired number of splits into account + minNumSplits = Math.max(minNumSplits, this.numSplits); + + final List inputSplits = new ArrayList<>(minNumSplits); + + // get all the files that are involved in the splits + List files = new ArrayList<>(); + long totalLength = 0; + + for (Path path : getFilePaths()) { + final org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(path.toUri()); + final FileSystem fs = FSUtils.getFs(hadoopPath.toString(), this.conf.conf()); + final FileStatus pathFile = fs.getFileStatus(hadoopPath); + + if (pathFile.isDirectory()) { + totalLength += addFilesInDir(hadoopPath, files, true); + } else { + testForUnsplittable(pathFile); + + files.add(pathFile); + totalLength += pathFile.getLen(); + } + } + + // returns if unsplittable + if (unsplittable) { + int splitNum = 0; + for (final FileStatus file : files) { + final FileSystem fs = FSUtils.getFs(file.getPath().toString(), this.conf.conf()); + final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen()); + Set hosts = new HashSet<>(); + for (BlockLocation block : blocks) { + hosts.addAll(Arrays.asList(block.getHosts())); + } + long len = file.getLen(); + if (testForUnsplittable(file)) { + len = READ_WHOLE_SPLIT_FLAG; + } + FileInputSplit fis = new FileInputSplit(splitNum++, new Path(file.getPath().toUri()), 0, len, + hosts.toArray(new String[0])); + inputSplits.add(fis); + } + return inputSplits.toArray(new FileInputSplit[0]); + } + + + final long maxSplitSize = totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1); + + // now that we have the files, generate the splits + int splitNum = 0; + for (final FileStatus file : files) { + + final FileSystem fs = FSUtils.getFs(file.getPath().toString(), this.conf.conf()); + final long len = file.getLen(); + final long blockSize = file.getBlockSize(); + + final long minSplitSize; + if (this.minSplitSize <= blockSize) { + minSplitSize = this.minSplitSize; + } else { + if (LOG.isWarnEnabled()) { + LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " + + blockSize + ". 
Decreasing minimal split size to block size."); + } + minSplitSize = blockSize; + } + + final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize)); + final long halfSplit = splitSize >>> 1; + + final long maxBytesForLastSplit = (long) (splitSize * 1.1f); + + if (len > 0) { + + // get the block locations and make sure they are in order with respect to their offset + final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len); + Arrays.sort(blocks, Comparator.comparingLong(BlockLocation::getOffset)); + + long bytesUnassigned = len; + long position = 0; + + int blockIndex = 0; + + while (bytesUnassigned > maxBytesForLastSplit) { + // get the block containing the majority of the data + blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex); + // create a new split + FileInputSplit fis = new FileInputSplit(splitNum++, new Path(file.getPath().toUri()), position, splitSize, + blocks[blockIndex].getHosts()); + inputSplits.add(fis); + + // adjust the positions + position += splitSize; + bytesUnassigned -= splitSize; + } + + // assign the last split + if (bytesUnassigned > 0) { + blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex); + final FileInputSplit fis = new FileInputSplit(splitNum++, new Path(file.getPath().toUri()), position, + bytesUnassigned, blocks[blockIndex].getHosts()); + inputSplits.add(fis); + } + } else { + // special case with a file of zero bytes size + final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0); + String[] hosts; + if (blocks.length > 0) { + hosts = blocks[0].getHosts(); + } else { + hosts = new String[0]; + } + final FileInputSplit fis = new FileInputSplit(splitNum++, new Path(file.getPath().toUri()), 0, 0, hosts); + inputSplits.add(fis); + } + } + + return inputSplits.toArray(new FileInputSplit[0]); + } + + @Override + public boolean supportsMultiPaths() { + return true; + } + + @Override + public boolean reachedEnd() throws IOException { + if (currentReadCount >= limit) { + return true; + } else { + return reader.reachedEnd(); + } + } + + @Override + public RowData nextRecord(RowData reuse) { + currentReadCount++; + return reader.nextRecord(); + } + + @Override + public void close() throws IOException { + if (reader != null) { + this.reader.close(); + } + this.reader = null; + } + + /** + * Enumerate all files in the directory and recursive if enumerateNestedFiles is true. + * + * @return the total length of accepted files. 
+ */ + private long addFilesInDir(org.apache.hadoop.fs.Path path, List files, boolean logExcludedFiles) + throws IOException { + final org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(path.toUri()); + final FileSystem fs = FSUtils.getFs(hadoopPath.toString(), this.conf.conf()); + + long length = 0; + + for (FileStatus dir : fs.listStatus(hadoopPath)) { + if (dir.isDirectory()) { + if (acceptFile(dir) && enumerateNestedFiles) { + length += addFilesInDir(dir.getPath(), files, logExcludedFiles); + } else { + if (logExcludedFiles && LOG.isDebugEnabled()) { + LOG.debug("Directory " + dir.getPath().toString() + " did not pass the file-filter and is excluded."); + } + } + } else { + if (acceptFile(dir)) { + files.add(dir); + length += dir.getLen(); + testForUnsplittable(dir); + } else { + if (logExcludedFiles && LOG.isDebugEnabled()) { + LOG.debug("Directory " + dir.getPath().toString() + " did not pass the file-filter and is excluded."); + } + } + } + } + return length; + } + + @Override + public void setFilesFilter(FilePathFilter filesFilter) { + this.localFilesFilter = filesFilter; + super.setFilesFilter(filesFilter); + } + + /** + * A simple hook to filter files and directories from the input. + * The method may be overridden. Hadoop's FileInputFormat has a similar mechanism and applies the + * same filters by default. + * + * @param fileStatus The file status to check. + * @return true, if the given file or directory is accepted + */ + public boolean acceptFile(FileStatus fileStatus) { + final String name = fileStatus.getPath().getName(); + return !name.startsWith("_") + && !name.startsWith(".") + && !localFilesFilter.filterPath(new Path(fileStatus.getPath().toUri())); + } + + /** + * Retrieves the index of the BlockLocation that contains the part of the file described by the given + * offset. + * + * @param blocks The different blocks of the file. Must be ordered by their offset. + * @param offset The offset of the position in the file. + * @param startIndex The earliest index to look at. + * @return The index of the block containing the given position. 
+ */ + private int getBlockIndexForPosition(BlockLocation[] blocks, long offset, long halfSplitSize, int startIndex) { + // go over all indexes after the startIndex + for (int i = startIndex; i < blocks.length; i++) { + long blockStart = blocks[i].getOffset(); + long blockEnd = blockStart + blocks[i].getLength(); + + if (offset >= blockStart && offset < blockEnd) { + // got the block where the split starts + // check if the next block contains more than this one does + if (i < blocks.length - 1 && blockEnd - offset < halfSplitSize) { + return i + 1; + } else { + return i; + } + } + } + throw new IllegalArgumentException("The given offset is not contained in the any block."); + } + + private boolean testForUnsplittable(FileStatus pathFile) { + if (getInflaterInputStreamFactory(pathFile.getPath()) != null) { + unsplittable = true; + return true; + } + return false; + } + + private InflaterInputStreamFactory getInflaterInputStreamFactory(org.apache.hadoop.fs.Path path) { + String fileExtension = extractFileExtension(path.getName()); + if (fileExtension != null) { + return getInflaterInputStreamFactory(fileExtension); + } else { + return null; + } + } + +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java new file mode 100644 index 0000000000000..c9b6561bdef20 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java @@ -0,0 +1,836 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.mor; + +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.log.InstantRange; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.table.format.FilePathUtils; +import org.apache.hudi.table.format.FormatUtils; +import org.apache.hudi.table.format.cow.ParquetSplitReaderUtil; +import org.apache.hudi.table.format.cow.vector.reader.ParquetColumnarRowSplitReader; +import org.apache.hudi.util.AvroToRowDataConverters; +import org.apache.hudi.util.DataTypeUtils; +import org.apache.hudi.util.RowDataProjection; +import org.apache.hudi.util.RowDataToAvroConverters; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.util.StringToRowDataConverter; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.io.DefaultInputSplitAssigner; +import org.apache.flink.api.common.io.RichInputFormat; +import org.apache.flink.api.common.io.statistics.BaseStatistics; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.io.InputSplitAssigner; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.stream.IntStream; + +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_COMMIT_TIME_COL_POS; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS; +import static org.apache.hudi.table.format.FormatUtils.buildAvroRecordBySchema; + +/** + * The base InputFormat class to read from Hoodie data + log files. + * + *

    Use {@code ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * overrides {@link #createInputSplits(int)} and {@link #close()} to change the behaviors. + */ +public class MergeOnReadInputFormat + extends RichInputFormat { + + private static final long serialVersionUID = 1L; + + private final Configuration conf; + + private transient org.apache.hadoop.conf.Configuration hadoopConf; + + private final MergeOnReadTableState tableState; + + /** + * Uniform iterator view for the underneath records. + */ + private transient RecordIterator iterator; + + // for project push down + /** + * Full table names. + */ + private final List fieldNames; + + /** + * Full field data types. + */ + private final List fieldTypes; + + /** + * Default partition name when the field value is null. + */ + private final String defaultPartName; + + /** + * Required field positions. + */ + private final int[] requiredPos; + + // for limit push down + /** + * Limit for the reader, -1 when the reading is not limited. + */ + private final long limit; + + /** + * Recording the current read count for limit check. + */ + private long currentReadCount = 0; + + /** + * Flag saying whether to emit the deletes. In streaming read mode, downstream + * operators need the DELETE messages to retract the legacy accumulator. + */ + private boolean emitDelete; + + /** + * Flag saying whether the input format has been closed. + */ + private boolean closed = true; + + private MergeOnReadInputFormat( + Configuration conf, + MergeOnReadTableState tableState, + List fieldTypes, + String defaultPartName, + long limit, + boolean emitDelete) { + this.conf = conf; + this.tableState = tableState; + this.fieldNames = tableState.getRowType().getFieldNames(); + this.fieldTypes = fieldTypes; + this.defaultPartName = defaultPartName; + // Needs improvement: this requiredPos is only suitable for parquet reader, + // because we need to + this.requiredPos = tableState.getRequiredPositions(); + this.limit = limit; + this.emitDelete = emitDelete; + } + + /** + * Returns the builder for {@link MergeOnReadInputFormat}. 
+ */ + public static Builder builder() { + return new Builder(); + } + + @Override + public void open(MergeOnReadInputSplit split) throws IOException { + this.currentReadCount = 0L; + this.closed = false; + this.hadoopConf = HadoopConfigurations.getHadoopConf(this.conf); + if (!(split.getLogPaths().isPresent() && split.getLogPaths().get().size() > 0)) { + if (split.getInstantRange() != null) { + // base file only with commit time filtering + this.iterator = new BaseFileOnlyFilteringIterator( + split.getInstantRange(), + this.tableState.getRequiredRowType(), + getReader(split.getBasePath().get(), getRequiredPosWithCommitTime(this.requiredPos))); + } else { + // base file only + this.iterator = new BaseFileOnlyIterator(getRequiredSchemaReader(split.getBasePath().get())); + } + } else if (!split.getBasePath().isPresent()) { + // log files only + if (OptionsResolver.emitChangelog(conf)) { + this.iterator = new LogFileOnlyIterator(getUnMergedLogFileIterator(split)); + } else { + this.iterator = new LogFileOnlyIterator(getLogFileIterator(split)); + } + } else if (split.getMergeType().equals(FlinkOptions.REALTIME_SKIP_MERGE)) { + this.iterator = new SkipMergeIterator( + getRequiredSchemaReader(split.getBasePath().get()), + getLogFileIterator(split)); + } else if (split.getMergeType().equals(FlinkOptions.REALTIME_PAYLOAD_COMBINE)) { + this.iterator = new MergeIterator( + conf, + hadoopConf, + split, + this.tableState.getRowType(), + this.tableState.getRequiredRowType(), + new Schema.Parser().parse(this.tableState.getAvroSchema()), + new Schema.Parser().parse(this.tableState.getRequiredAvroSchema()), + this.requiredPos, + this.emitDelete, + this.tableState.getOperationPos(), + getFullSchemaReader(split.getBasePath().get())); + } else { + throw new HoodieException("Unable to select an Iterator to read the Hoodie MOR File Split for " + + "file path: " + split.getBasePath() + + "log paths: " + split.getLogPaths() + + "hoodie table path: " + split.getTablePath() + + "spark partition Index: " + split.getSplitNumber() + + "merge type: " + split.getMergeType()); + } + mayShiftInputSplit(split); + } + + @Override + public void configure(Configuration configuration) { + // no operation + // may support nested files in the future. + } + + @Override + public BaseStatistics getStatistics(BaseStatistics baseStatistics) { + // statistics not supported yet. + return null; + } + + @Override + public MergeOnReadInputSplit[] createInputSplits(int minNumSplits) { + return this.tableState.getInputSplits().toArray(new MergeOnReadInputSplit[0]); + } + + @Override + public InputSplitAssigner getInputSplitAssigner(MergeOnReadInputSplit[] mergeOnReadInputSplits) { + return new DefaultInputSplitAssigner(mergeOnReadInputSplits); + } + + @Override + public boolean reachedEnd() throws IOException { + if (limit > 0 && currentReadCount >= limit) { + return true; + } else { + // log file reaches end ? + return this.iterator.reachedEnd(); + } + } + + @Override + public RowData nextRecord(RowData o) { + currentReadCount++; + return this.iterator.nextRecord(); + } + + @Override + public void close() throws IOException { + if (this.iterator != null) { + this.iterator.close(); + } + this.iterator = null; + this.closed = true; + } + + public boolean isClosed() { + return this.closed; + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + /** + * Shifts the input split by its consumed records number. + * + *

    Note: This action is time-consuming. + */ + private void mayShiftInputSplit(MergeOnReadInputSplit split) throws IOException { + if (split.isConsumed()) { + // if the input split has been consumed before, + // shift the input split with consumed num of records first + for (long i = 0; i < split.getConsumed() && !reachedEnd(); i++) { + nextRecord(null); + } + } + } + + private ParquetColumnarRowSplitReader getFullSchemaReader(String path) throws IOException { + return getReader(path, IntStream.range(0, this.tableState.getRowType().getFieldCount()).toArray()); + } + + private ParquetColumnarRowSplitReader getRequiredSchemaReader(String path) throws IOException { + return getReader(path, this.requiredPos); + } + + private ParquetColumnarRowSplitReader getReader(String path, int[] requiredPos) throws IOException { + // generate partition specs. + LinkedHashMap partSpec = FilePathUtils.extractPartitionKeyValues( + new org.apache.hadoop.fs.Path(path).getParent(), + this.conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING), + FilePathUtils.extractPartitionKeys(this.conf)); + LinkedHashMap partObjects = new LinkedHashMap<>(); + partSpec.forEach((k, v) -> { + final int idx = fieldNames.indexOf(k); + if (idx == -1) { + // for any rare cases that the partition field does not exist in schema, + // fallback to file read + return; + } + DataType fieldType = fieldTypes.get(idx); + if (!DataTypeUtils.isDatetimeType(fieldType)) { + // date time type partition field is formatted specifically, + // read directly from the data file to avoid format mismatch or precision loss + partObjects.put(k, DataTypeUtils.resolvePartition(defaultPartName.equals(v) ? null : v, fieldType)); + } + }); + + return ParquetSplitReaderUtil.genPartColumnarRowReader( + this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE), + true, + HadoopConfigurations.getParquetConf(this.conf, hadoopConf), + fieldNames.toArray(new String[0]), + fieldTypes.toArray(new DataType[0]), + partObjects, + requiredPos, + 2048, + new org.apache.flink.core.fs.Path(path), + 0, + Long.MAX_VALUE); // read the whole file + } + + private ClosableIterator getLogFileIterator(MergeOnReadInputSplit split) { + final Schema tableSchema = new Schema.Parser().parse(tableState.getAvroSchema()); + final Schema requiredSchema = new Schema.Parser().parse(tableState.getRequiredAvroSchema()); + final GenericRecordBuilder recordBuilder = new GenericRecordBuilder(requiredSchema); + final AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter = + AvroToRowDataConverters.createRowConverter(tableState.getRequiredRowType()); + final HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(split, tableSchema, conf, hadoopConf); + final Iterator logRecordsKeyIterator = scanner.getRecords().keySet().iterator(); + final int[] pkOffset = tableState.getPkOffsetsInRequired(); + // flag saying whether the pk semantics has been dropped by user specified + // projections. For e.g, if the pk fields are [a, b] but user only select a, + // then the pk semantics is lost. + final boolean pkSemanticLost = Arrays.stream(pkOffset).anyMatch(offset -> offset == -1); + final LogicalType[] pkTypes = pkSemanticLost ? null : tableState.getPkTypes(pkOffset); + final StringToRowDataConverter converter = pkSemanticLost ? 
null : new StringToRowDataConverter(pkTypes); + + return new ClosableIterator() { + private RowData currentRecord; + + @Override + public boolean hasNext() { + while (logRecordsKeyIterator.hasNext()) { + String curAvroKey = logRecordsKeyIterator.next(); + Option curAvroRecord = null; + final HoodieAvroRecord hoodieRecord = (HoodieAvroRecord) scanner.getRecords().get(curAvroKey); + try { + curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema); + } catch (IOException e) { + throw new HoodieException("Get avro insert value error for key: " + curAvroKey, e); + } + if (!curAvroRecord.isPresent()) { + // delete record found + if (emitDelete && !pkSemanticLost) { + GenericRowData delete = new GenericRowData(tableState.getRequiredRowType().getFieldCount()); + + final String recordKey = hoodieRecord.getRecordKey(); + final String[] pkFields = KeyGenUtils.extractRecordKeys(recordKey); + final Object[] converted = converter.convert(pkFields); + for (int i = 0; i < pkOffset.length; i++) { + delete.setField(pkOffset[i], converted[i]); + } + delete.setRowKind(RowKind.DELETE); + + this.currentRecord = delete; + return true; + } + // skipping if the condition is unsatisfied + // continue; + } else { + final IndexedRecord avroRecord = curAvroRecord.get(); + final RowKind rowKind = FormatUtils.getRowKindSafely(avroRecord, tableState.getOperationPos()); + if (rowKind == RowKind.DELETE && !emitDelete) { + // skip the delete record + continue; + } + GenericRecord requiredAvroRecord = buildAvroRecordBySchema( + avroRecord, + requiredSchema, + requiredPos, + recordBuilder); + currentRecord = (RowData) avroToRowDataConverter.convert(requiredAvroRecord); + currentRecord.setRowKind(rowKind); + return true; + } + } + return false; + } + + @Override + public RowData next() { + return currentRecord; + } + + @Override + public void close() { + scanner.close(); + } + }; + } + + private ClosableIterator getUnMergedLogFileIterator(MergeOnReadInputSplit split) { + final Schema tableSchema = new Schema.Parser().parse(tableState.getAvroSchema()); + final Schema requiredSchema = new Schema.Parser().parse(tableState.getRequiredAvroSchema()); + final GenericRecordBuilder recordBuilder = new GenericRecordBuilder(requiredSchema); + final AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter = + AvroToRowDataConverters.createRowConverter(tableState.getRequiredRowType()); + final FormatUtils.BoundedMemoryRecords records = new FormatUtils.BoundedMemoryRecords(split, tableSchema, hadoopConf, conf); + final Iterator> recordsIterator = records.getRecordsIterator(); + + return new ClosableIterator() { + private RowData currentRecord; + + @Override + public boolean hasNext() { + while (recordsIterator.hasNext()) { + Option curAvroRecord = null; + final HoodieAvroRecord hoodieRecord = (HoodieAvroRecord) recordsIterator.next(); + try { + curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema); + } catch (IOException e) { + throw new HoodieException("Get avro insert value error for key: " + hoodieRecord.getRecordKey(), e); + } + if (curAvroRecord.isPresent()) { + final IndexedRecord avroRecord = curAvroRecord.get(); + GenericRecord requiredAvroRecord = buildAvroRecordBySchema( + avroRecord, + requiredSchema, + requiredPos, + recordBuilder); + currentRecord = (RowData) avroToRowDataConverter.convert(requiredAvroRecord); + FormatUtils.setRowKind(currentRecord, avroRecord, tableState.getOperationPos()); + return true; + } + } + return false; + } + + @Override + public RowData next() { + return 
currentRecord; + } + + @Override + public void close() { + records.close(); + } + }; + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + private interface RecordIterator { + boolean reachedEnd() throws IOException; + + RowData nextRecord(); + + void close() throws IOException; + } + + static class BaseFileOnlyIterator implements RecordIterator { + // base file reader + private final ParquetColumnarRowSplitReader reader; + + BaseFileOnlyIterator(ParquetColumnarRowSplitReader reader) { + this.reader = reader; + } + + @Override + public boolean reachedEnd() throws IOException { + return this.reader.reachedEnd(); + } + + @Override + public RowData nextRecord() { + return this.reader.nextRecord(); + } + + @Override + public void close() throws IOException { + if (this.reader != null) { + this.reader.close(); + } + } + } + + /** + * Similar with {@link BaseFileOnlyIterator} but with instant time filtering. + */ + static class BaseFileOnlyFilteringIterator implements RecordIterator { + // base file reader + private final ParquetColumnarRowSplitReader reader; + private final InstantRange instantRange; + private final RowDataProjection projection; + + private RowData currentRecord; + + BaseFileOnlyFilteringIterator( + Option instantRange, + RowType requiredRowType, + ParquetColumnarRowSplitReader reader) { + this.reader = reader; + this.instantRange = instantRange.orElse(null); + int[] positions = IntStream.range(1, 1 + requiredRowType.getFieldCount()).toArray(); + projection = RowDataProjection.instance(requiredRowType, positions); + } + + @Override + public boolean reachedEnd() throws IOException { + while (!this.reader.reachedEnd()) { + currentRecord = this.reader.nextRecord(); + if (instantRange != null) { + boolean isInRange = instantRange.isInRange(currentRecord.getString(HOODIE_COMMIT_TIME_COL_POS).toString()); + if (isInRange) { + return false; + } + } else { + return false; + } + } + return true; + } + + @Override + public RowData nextRecord() { + // can promote: no need to project with null instant range + return projection.project(currentRecord); + } + + @Override + public void close() throws IOException { + if (this.reader != null) { + this.reader.close(); + } + } + } + + static class LogFileOnlyIterator implements RecordIterator { + // iterator for log files + private final ClosableIterator iterator; + + LogFileOnlyIterator(ClosableIterator iterator) { + this.iterator = iterator; + } + + @Override + public boolean reachedEnd() { + return !this.iterator.hasNext(); + } + + @Override + public RowData nextRecord() { + return this.iterator.next(); + } + + @Override + public void close() { + if (this.iterator != null) { + this.iterator.close(); + } + } + } + + static class SkipMergeIterator implements RecordIterator { + // base file reader + private final ParquetColumnarRowSplitReader reader; + // iterator for log files + private final ClosableIterator iterator; + + // add the flag because the flink ParquetColumnarRowSplitReader is buggy: + // method #reachedEnd() returns false after it returns true. + // refactor it out once FLINK-22370 is resolved. 
+ private boolean readLogs = false; + + private RowData currentRecord; + + SkipMergeIterator(ParquetColumnarRowSplitReader reader, ClosableIterator iterator) { + this.reader = reader; + this.iterator = iterator; + } + + @Override + public boolean reachedEnd() throws IOException { + if (!readLogs && !this.reader.reachedEnd()) { + currentRecord = this.reader.nextRecord(); + return false; + } + readLogs = true; + if (this.iterator.hasNext()) { + currentRecord = this.iterator.next(); + return false; + } + return true; + } + + @Override + public RowData nextRecord() { + return currentRecord; + } + + @Override + public void close() throws IOException { + if (this.reader != null) { + this.reader.close(); + } + if (this.iterator != null) { + this.iterator.close(); + } + } + } + + static class MergeIterator implements RecordIterator { + // base file reader + private final ParquetColumnarRowSplitReader reader; + // log keys used for merging + private final Iterator logKeysIterator; + // scanner + private final HoodieMergedLogRecordScanner scanner; + + private final Schema tableSchema; + private final Schema requiredSchema; + private final int[] requiredPos; + private final boolean emitDelete; + private final int operationPos; + private final RowDataToAvroConverters.RowDataToAvroConverter rowDataToAvroConverter; + private final AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter; + private final GenericRecordBuilder recordBuilder; + + private final RowDataProjection projection; + + private final InstantRange instantRange; + + // add the flag because the flink ParquetColumnarRowSplitReader is buggy: + // method #reachedEnd() returns false after it returns true. + // refactor it out once FLINK-22370 is resolved. + private boolean readLogs = false; + + private final Set keyToSkip = new HashSet<>(); + + private final Properties payloadProps; + + private RowData currentRecord; + + MergeIterator( + Configuration flinkConf, + org.apache.hadoop.conf.Configuration hadoopConf, + MergeOnReadInputSplit split, + RowType tableRowType, + RowType requiredRowType, + Schema tableSchema, + Schema requiredSchema, + int[] requiredPos, + boolean emitDelete, + int operationPos, + ParquetColumnarRowSplitReader reader) { // the reader should be with full schema + this.tableSchema = tableSchema; + this.reader = reader; + this.scanner = FormatUtils.logScanner(split, tableSchema, flinkConf, hadoopConf); + this.payloadProps = StreamerUtil.getPayloadConfig(flinkConf).getProps(); + this.logKeysIterator = scanner.getRecords().keySet().iterator(); + this.requiredSchema = requiredSchema; + this.requiredPos = requiredPos; + this.emitDelete = emitDelete; + this.operationPos = operationPos; + this.recordBuilder = new GenericRecordBuilder(requiredSchema); + this.rowDataToAvroConverter = RowDataToAvroConverters.createConverter(tableRowType); + this.avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(requiredRowType); + this.projection = RowDataProjection.instance(requiredRowType, requiredPos); + this.instantRange = split.getInstantRange().orElse(null); + } + + @Override + public boolean reachedEnd() throws IOException { + while (!readLogs && !this.reader.reachedEnd()) { + currentRecord = this.reader.nextRecord(); + if (instantRange != null) { + boolean isInRange = instantRange.isInRange(currentRecord.getString(HOODIE_COMMIT_TIME_COL_POS).toString()); + if (!isInRange) { + // filter base file by instant range + continue; + } + } + final String curKey = 
currentRecord.getString(HOODIE_RECORD_KEY_COL_POS).toString(); + if (scanner.getRecords().containsKey(curKey)) { + keyToSkip.add(curKey); + Option mergedAvroRecord = mergeRowWithLog(currentRecord, curKey); + if (!mergedAvroRecord.isPresent()) { + // deleted + continue; + } else { + final RowKind rowKind = FormatUtils.getRowKindSafely(mergedAvroRecord.get(), this.operationPos); + if (!emitDelete && rowKind == RowKind.DELETE) { + // deleted + continue; + } + GenericRecord avroRecord = buildAvroRecordBySchema( + mergedAvroRecord.get(), + requiredSchema, + requiredPos, + recordBuilder); + this.currentRecord = (RowData) avroToRowDataConverter.convert(avroRecord); + this.currentRecord.setRowKind(rowKind); + return false; + } + } + // project the full record in base with required positions + currentRecord = projection.project(currentRecord); + return false; + } + // read the logs + readLogs = true; + while (logKeysIterator.hasNext()) { + final String curKey = logKeysIterator.next(); + if (!keyToSkip.contains(curKey)) { + Option insertAvroRecord = getInsertValue(curKey); + if (insertAvroRecord.isPresent()) { + // the record is a DELETE if insertAvroRecord not present, skipping + GenericRecord avroRecord = buildAvroRecordBySchema( + insertAvroRecord.get(), + requiredSchema, + requiredPos, + recordBuilder); + this.currentRecord = (RowData) avroToRowDataConverter.convert(avroRecord); + FormatUtils.setRowKind(this.currentRecord, insertAvroRecord.get(), this.operationPos); + return false; + } + } + } + return true; + } + + private Option getInsertValue(String curKey) throws IOException { + final HoodieAvroRecord record = (HoodieAvroRecord) scanner.getRecords().get(curKey); + if (!emitDelete && HoodieOperation.isDelete(record.getOperation())) { + return Option.empty(); + } + return record.getData().getInsertValue(tableSchema); + } + + @Override + public RowData nextRecord() { + return currentRecord; + } + + @Override + public void close() throws IOException { + if (this.reader != null) { + this.reader.close(); + } + if (this.scanner != null) { + this.scanner.close(); + } + } + + private Option mergeRowWithLog( + RowData curRow, + String curKey) throws IOException { + final HoodieAvroRecord record = (HoodieAvroRecord) scanner.getRecords().get(curKey); + GenericRecord historyAvroRecord = (GenericRecord) rowDataToAvroConverter.convert(tableSchema, curRow); + return record.getData().combineAndGetUpdateValue(historyAvroRecord, tableSchema, payloadProps); + } + } + + /** + * Builder for {@link MergeOnReadInputFormat}. 
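+ *
+ * <p>Illustrative usage sketch, not part of this patch; {@code conf}, {@code tableState} and
+ * {@code fieldTypes} are assumed to be prepared by the caller:
+ * <pre>
+ *   MergeOnReadInputFormat format = MergeOnReadInputFormat.builder()
+ *       .config(conf)
+ *       .tableState(tableState)
+ *       .fieldTypes(fieldTypes)
+ *       .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME))
+ *       .limit(-1)          // -1: reading is not limited
+ *       .emitDelete(false)  // do not emit DELETE rows
+ *       .build();
+ * </pre>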
+ */ + public static class Builder { + private Configuration conf; + private MergeOnReadTableState tableState; + private List fieldTypes; + private String defaultPartName; + private long limit = -1; + private boolean emitDelete = false; + + public Builder config(Configuration conf) { + this.conf = conf; + return this; + } + + public Builder tableState(MergeOnReadTableState tableState) { + this.tableState = tableState; + return this; + } + + public Builder fieldTypes(List fieldTypes) { + this.fieldTypes = fieldTypes; + return this; + } + + public Builder defaultPartName(String defaultPartName) { + this.defaultPartName = defaultPartName; + return this; + } + + public Builder limit(long limit) { + this.limit = limit; + return this; + } + + public Builder emitDelete(boolean emitDelete) { + this.emitDelete = emitDelete; + return this; + } + + public MergeOnReadInputFormat build() { + return new MergeOnReadInputFormat(conf, tableState, fieldTypes, + defaultPartName, limit, emitDelete); + } + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private static int[] getRequiredPosWithCommitTime(int[] requiredPos) { + int[] requiredPos2 = new int[requiredPos.length + 1]; + requiredPos2[0] = HOODIE_COMMIT_TIME_COL_POS; + System.arraycopy(requiredPos, 0, requiredPos2, 1, requiredPos.length); + return requiredPos2; + } + + @VisibleForTesting + public void isEmitDelete(boolean emitDelete) { + this.emitDelete = emitDelete; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputSplit.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputSplit.java new file mode 100644 index 0000000000000..cde646e41f035 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputSplit.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.mor; + +import org.apache.hudi.common.table.log.InstantRange; +import org.apache.hudi.common.util.Option; + +import org.apache.flink.core.io.InputSplit; + +import javax.annotation.Nullable; + +import java.util.List; + +/** + * Represents an input split of source, actually a data bucket. 
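+ *
+ * <p>Illustration only, not part of this patch: the consumption counter lets a streaming
+ * reader resume a split from where it stopped, e.g.
+ * <pre>
+ *   split.consume();                     // called once per emitted record
+ *   if (split.isConsumed()) {
+ *     long offset = split.getConsumed(); // number of records to skip on the next round of reading
+ *   }
+ * </pre>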
+ */ +public class MergeOnReadInputSplit implements InputSplit { + private static final long serialVersionUID = 1L; + + private static final long NUM_NO_CONSUMPTION = 0L; + + private final int splitNum; + private final Option basePath; + private final Option> logPaths; + private final String latestCommit; + private final String tablePath; + private final long maxCompactionMemoryInBytes; + private final String mergeType; + private final Option instantRange; + private String fileId; + + // for streaming reader to record the consumed offset, + // which is the start of next round reading. + private long consumed = NUM_NO_CONSUMPTION; + + public MergeOnReadInputSplit( + int splitNum, + @Nullable String basePath, + Option> logPaths, + String latestCommit, + String tablePath, + long maxCompactionMemoryInBytes, + String mergeType, + @Nullable InstantRange instantRange, + String fileId) { + this.splitNum = splitNum; + this.basePath = Option.ofNullable(basePath); + this.logPaths = logPaths; + this.latestCommit = latestCommit; + this.tablePath = tablePath; + this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes; + this.mergeType = mergeType; + this.instantRange = Option.ofNullable(instantRange); + this.fileId = fileId; + } + + public String getFileId() { + return fileId; + } + + public void setFileId(String fileId) { + this.fileId = fileId; + } + + public Option getBasePath() { + return basePath; + } + + public Option> getLogPaths() { + return logPaths; + } + + public String getLatestCommit() { + return latestCommit; + } + + public String getTablePath() { + return tablePath; + } + + public long getMaxCompactionMemoryInBytes() { + return maxCompactionMemoryInBytes; + } + + public String getMergeType() { + return mergeType; + } + + public Option getInstantRange() { + return this.instantRange; + } + + @Override + public int getSplitNumber() { + return this.splitNum; + } + + public void consume() { + this.consumed += 1L; + } + + public long getConsumed() { + return consumed; + } + + public boolean isConsumed() { + return this.consumed != NUM_NO_CONSUMPTION; + } + + @Override + public String toString() { + return "MergeOnReadInputSplit{" + + "splitNum=" + splitNum + + ", basePath=" + basePath + + ", logPaths=" + logPaths + + ", latestCommit='" + latestCommit + '\'' + + ", tablePath='" + tablePath + '\'' + + ", maxCompactionMemoryInBytes=" + maxCompactionMemoryInBytes + + ", mergeType='" + mergeType + '\'' + + ", instantRange=" + instantRange + + '}'; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadTableState.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadTableState.java new file mode 100644 index 0000000000000..36dfecbb79a5f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadTableState.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.mor; + +import org.apache.hudi.common.model.HoodieRecord; + +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; + +/** + * Statistics for merge on read table source. + */ +public class MergeOnReadTableState implements Serializable { + + private static final long serialVersionUID = 1L; + + private final RowType rowType; + private final RowType requiredRowType; + private final String avroSchema; + private final String requiredAvroSchema; + private final List inputSplits; + private final String[] pkFields; + private final int operationPos; + + public MergeOnReadTableState( + RowType rowType, + RowType requiredRowType, + String avroSchema, + String requiredAvroSchema, + List inputSplits, + String[] pkFields) { + this.rowType = rowType; + this.requiredRowType = requiredRowType; + this.avroSchema = avroSchema; + this.requiredAvroSchema = requiredAvroSchema; + this.inputSplits = inputSplits; + this.pkFields = pkFields; + this.operationPos = rowType.getFieldIndex(HoodieRecord.OPERATION_METADATA_FIELD); + } + + public RowType getRowType() { + return rowType; + } + + public RowType getRequiredRowType() { + return requiredRowType; + } + + public String getAvroSchema() { + return avroSchema; + } + + public String getRequiredAvroSchema() { + return requiredAvroSchema; + } + + public List getInputSplits() { + return inputSplits; + } + + public int getOperationPos() { + return operationPos; + } + + public int[] getRequiredPositions() { + final List fieldNames = rowType.getFieldNames(); + return requiredRowType.getFieldNames().stream() + .map(fieldNames::indexOf) + .mapToInt(i -> i) + .toArray(); + } + + /** + * Get the primary key positions in required row type. + */ + public int[] getPkOffsetsInRequired() { + final List fieldNames = requiredRowType.getFieldNames(); + return Arrays.stream(pkFields) + .map(fieldNames::indexOf) + .mapToInt(i -> i) + .toArray(); + } + + /** + * Returns the primary key fields logical type with given offsets. + * + * @param pkOffsets the pk offsets in required row type + * @return pk field logical types + * @see #getPkOffsetsInRequired() + */ + public LogicalType[] getPkTypes(int[] pkOffsets) { + final LogicalType[] requiredTypes = requiredRowType.getFields().stream() + .map(RowType.RowField::getType).toArray(LogicalType[]::new); + return Arrays.stream(pkOffsets).mapToObj(offset -> requiredTypes[offset]) + .toArray(LogicalType[]::new); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/AvroSchemaConverter.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/AvroSchemaConverter.java new file mode 100644 index 0000000000000..925819c871e57 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/AvroSchemaConverter.java @@ -0,0 +1,388 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.AtomicDataType; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.TypeInformationRawType; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * Converts an Avro schema into Flink's type information. It uses {@link org.apache.flink.api.java.typeutils.RowTypeInfo} for + * representing objects and converts Avro types into types that are compatible with Flink's Table & + * SQL API. + * + *
<p>
    Note: Changes in this class need to be kept in sync with the corresponding runtime classes + * {@code org.apache.flink.formats.avro.AvroRowDeserializationSchema} and {@code org.apache.flink.formats.avro.AvroRowSerializationSchema}. + * + *
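+ * <p>A round-trip sketch using the two public entry points of this class (the top-level
+ * record is named "record" by default, see {@code convertToSchema(LogicalType)}):
+ * <pre>{@code
+ *   // Flink row type -> Avro schema
+ *   DataType dataType = DataTypes.ROW(
+ *       DataTypes.FIELD("id", DataTypes.INT().notNull()),
+ *       DataTypes.FIELD("name", DataTypes.STRING()));
+ *   Schema avroSchema = AvroSchemaConverter.convertToSchema(dataType.getLogicalType());
+ *
+ *   // Avro schema -> Flink data type
+ *   DataType converted = AvroSchemaConverter.convertToDataType(avroSchema);
+ * }</pre>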
<p>
    NOTE: reference from Flink release 1.12.0, should remove when Flink version upgrade to that. + */ +public class AvroSchemaConverter { + + /** + * Converts an Avro schema {@code schema} into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param schema Avro schema definition + * @return data type matching the schema + */ + public static DataType convertToDataType(Schema schema) { + switch (schema.getType()) { + case RECORD: + final List schemaFields = schema.getFields(); + + final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()]; + for (int i = 0; i < schemaFields.size(); i++) { + final Schema.Field field = schemaFields.get(i); + fields[i] = DataTypes.FIELD(field.name(), convertToDataType(field.schema())); + } + return DataTypes.ROW(fields).notNull(); + case ENUM: + case STRING: + // convert Avro's Utf8/CharSequence to String + return DataTypes.STRING().notNull(); + case ARRAY: + return DataTypes.ARRAY(convertToDataType(schema.getElementType())).notNull(); + case MAP: + return DataTypes.MAP( + DataTypes.STRING().notNull(), + convertToDataType(schema.getValueType())) + .notNull(); + case UNION: + final Schema actualSchema; + final boolean nullable; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + nullable = true; + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + nullable = true; + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + nullable = false; + } else { + List nonNullTypes = schema.getTypes().stream() + .filter(s -> s.getType() != Schema.Type.NULL) + .collect(Collectors.toList()); + nullable = schema.getTypes().size() > nonNullTypes.size(); + + // use Kryo for serialization + DataType rawDataType = new AtomicDataType( + new TypeInformationRawType<>(false, Types.GENERIC(Object.class))) + .notNull(); + + if (recordTypesOfSameNumFields(nonNullTypes)) { + DataType converted = DataTypes.ROW( + DataTypes.FIELD("wrapper", rawDataType)) + .notNull(); + return nullable ? converted.nullable() : converted; + } + // use Kryo for serialization + return nullable ? rawDataType.nullable() : rawDataType; + } + DataType converted = convertToDataType(actualSchema); + return nullable ? 
converted.nullable() : converted; + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = + (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()) + .notNull(); + } + // convert fixed size binary data to primitive byte arrays + return DataTypes.VARBINARY(schema.getFixedSize()).notNull(); + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = + (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()) + .notNull(); + } + return DataTypes.BYTES().notNull(); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return DataTypes.DATE().notNull(); + } else if (logicalType == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } + return DataTypes.INT().notNull(); + case LONG: + // logical timestamp type + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } + return DataTypes.BIGINT().notNull(); + case FLOAT: + return DataTypes.FLOAT().notNull(); + case DOUBLE: + return DataTypes.DOUBLE().notNull(); + case BOOLEAN: + return DataTypes.BOOLEAN().notNull(); + case NULL: + return DataTypes.NULL(); + default: + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + } + + /** + * Returns true if all the types are RECORD type with same number of fields. + */ + private static boolean recordTypesOfSameNumFields(List types) { + if (types == null || types.size() == 0) { + return false; + } + if (types.stream().anyMatch(s -> s.getType() != Schema.Type.RECORD)) { + return false; + } + int numFields = types.get(0).getFields().size(); + return types.stream().allMatch(s -> s.getFields().size() == numFields); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *
<p>
    Use "record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, e.g. not a + * nested type + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema) { + return convertToSchema(schema, "record"); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *
<p>
    The "{rowName}." is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType logicalType, String rowName) { + int precision; + boolean nullable = logicalType.isNullable(); + switch (logicalType.getTypeRoot()) { + case NULL: + return SchemaBuilder.builder().nullType(); + case BOOLEAN: + Schema bool = SchemaBuilder.builder().booleanType(); + return nullable ? nullableSchema(bool) : bool; + case TINYINT: + case SMALLINT: + case INTEGER: + Schema integer = SchemaBuilder.builder().intType(); + return nullable ? nullableSchema(integer) : integer; + case BIGINT: + Schema bigint = SchemaBuilder.builder().longType(); + return nullable ? nullableSchema(bigint) : bigint; + case FLOAT: + Schema f = SchemaBuilder.builder().floatType(); + return nullable ? nullableSchema(f) : f; + case DOUBLE: + Schema d = SchemaBuilder.builder().doubleType(); + return nullable ? nullableSchema(d) : d; + case CHAR: + case VARCHAR: + Schema str = SchemaBuilder.builder().stringType(); + return nullable ? nullableSchema(str) : str; + case BINARY: + case VARBINARY: + Schema binary = SchemaBuilder.builder().bytesType(); + return nullable ? nullableSchema(binary) : binary; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // use long to represents Timestamp + final TimestampType timestampType = (TimestampType) logicalType; + precision = timestampType.getPrecision(); + org.apache.avro.LogicalType timestampLogicalType; + if (precision <= 3) { + timestampLogicalType = LogicalTypes.timestampMillis(); + } else if (precision <= 6) { + timestampLogicalType = LogicalTypes.timestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type with precision: " + + precision + + ", it only supports precision less than 6."); + } + Schema timestamp = timestampLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? nullableSchema(timestamp) : timestamp; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + // use long to represents LocalZonedTimestampType + final LocalZonedTimestampType localZonedTimestampType = (LocalZonedTimestampType) logicalType; + precision = localZonedTimestampType.getPrecision(); + org.apache.avro.LogicalType localZonedTimestampLogicalType; + if (precision <= 3) { + localZonedTimestampLogicalType = LogicalTypes.localTimestampMillis(); + } else if (precision <= 6) { + localZonedTimestampLogicalType = LogicalTypes.localTimestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support LOCAL TIMESTAMP type with precision: " + + precision + + ", it only supports precision less than 6."); + } + Schema localZonedTimestamp = localZonedTimestampLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? nullableSchema(localZonedTimestamp) : localZonedTimestamp; + case DATE: + // use int to represents Date + Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? 
nullableSchema(date) : date; + case TIME_WITHOUT_TIME_ZONE: + precision = ((TimeType) logicalType).getPrecision(); + if (precision > 3) { + throw new IllegalArgumentException( + "Avro does not support TIME type with precision: " + + precision + + ", it only supports precision less than 3."); + } + // use int to represents Time, we only support millisecond when deserialization + Schema time = + LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(time) : time; + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + // store BigDecimal as Fixed + // for spark compatibility. + Schema decimal = + LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale()) + .addToSchema(SchemaBuilder + .fixed(String.format("%s.fixed", rowName)) + .size(computeMinBytesForDecimlPrecision(decimalType.getPrecision()))); + return nullable ? nullableSchema(decimal) : decimal; + case ROW: + RowType rowType = (RowType) logicalType; + List fieldNames = rowType.getFieldNames(); + // we have to make sure the record name is different in a Schema + SchemaBuilder.FieldAssembler builder = + SchemaBuilder.builder().record(rowName).fields(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + String fieldName = fieldNames.get(i); + LogicalType fieldType = rowType.getTypeAt(i); + SchemaBuilder.GenericDefault fieldBuilder = + builder.name(fieldName) + .type(convertToSchema(fieldType, rowName + "." + fieldName)); + + if (fieldType.isNullable()) { + builder = fieldBuilder.withDefault(null); + } else { + builder = fieldBuilder.noDefault(); + } + } + Schema record = builder.endRecord(); + return nullable ? nullableSchema(record) : record; + case MULTISET: + case MAP: + Schema map = + SchemaBuilder.builder() + .map() + .values( + convertToSchema( + extractValueTypeToAvroMap(logicalType), rowName)); + return nullable ? nullableSchema(map) : map; + case ARRAY: + ArrayType arrayType = (ArrayType) logicalType; + Schema array = + SchemaBuilder.builder() + .array() + .items(convertToSchema(arrayType.getElementType(), rowName)); + return nullable ? nullableSchema(array) : array; + case RAW: + default: + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } + } + + public static LogicalType extractValueTypeToAvroMap(LogicalType type) { + LogicalType keyType; + LogicalType valueType; + if (type instanceof MapType) { + MapType mapType = (MapType) type; + keyType = mapType.getKeyType(); + valueType = mapType.getValueType(); + } else { + MultisetType multisetType = (MultisetType) type; + keyType = multisetType.getElementType(); + valueType = new IntType(); + } + if (!DataTypeUtils.isFamily(keyType, LogicalTypeFamily.CHARACTER_STRING)) { + throw new UnsupportedOperationException( + "Avro format doesn't support non-string as key type of map. " + + "The key type is: " + + keyType.asSummaryString()); + } + return valueType; + } + + /** + * Returns schema with nullable true. + */ + private static Schema nullableSchema(Schema schema) { + return schema.isNullable() + ? 
schema + : Schema.createUnion(SchemaBuilder.builder().nullType(), schema); + } + + private static int computeMinBytesForDecimlPrecision(int precision) { + int numBytes = 1; + while (Math.pow(2.0, 8 * numBytes - 1) < Math.pow(10.0, precision)) { + numBytes += 1; + } + return numBytes; + } +} + diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/AvroToRowDataConverters.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/AvroToRowDataConverters.java new file mode 100644 index 0000000000000..5c9988dc0b2ed --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/AvroToRowDataConverters.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.utils.LogicalTypeUtils; +import org.joda.time.DateTime; +import org.joda.time.DateTimeFieldType; + +import java.io.Serializable; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalTime; +import java.time.temporal.ChronoField; +import java.time.temporal.ChronoUnit; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Tool class used to convert from Avro {@link GenericRecord} to {@link RowData}. + * + *
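+ * <p>A minimal usage sketch, assuming {@code rowType} describes the same fields as the Avro
+ * record schema and {@code avroRecord} is a deserialized {@code IndexedRecord}:
+ * <pre>{@code
+ *   AvroToRowDataConverters.AvroToRowDataConverter converter =
+ *       AvroToRowDataConverters.createRowConverter(rowType);
+ *   RowData row = (RowData) converter.convert(avroRecord);
+ * }</pre>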
<p>
    NOTE: reference from Flink release 1.12.0, should remove when Flink version upgrade to that. + */ +@Internal +public class AvroToRowDataConverters { + + /** + * Runtime converter that converts Avro data structures into objects of Flink Table & SQL + * internal data structures. + */ + @FunctionalInterface + public interface AvroToRowDataConverter extends Serializable { + Object convert(Object object); + } + + // ------------------------------------------------------------------------------------- + // Runtime Converters + // ------------------------------------------------------------------------------------- + + public static AvroToRowDataConverter createRowConverter(RowType rowType) { + final AvroToRowDataConverter[] fieldConverters = + rowType.getFields().stream() + .map(RowType.RowField::getType) + .map(AvroToRowDataConverters::createNullableConverter) + .toArray(AvroToRowDataConverter[]::new); + final int arity = rowType.getFieldCount(); + + return avroObject -> { + IndexedRecord record = (IndexedRecord) avroObject; + GenericRowData row = new GenericRowData(arity); + for (int i = 0; i < arity; ++i) { + row.setField(i, fieldConverters[i].convert(record.get(i))); + } + return row; + }; + } + + /** + * Creates a runtime converter which is null safe. + */ + private static AvroToRowDataConverter createNullableConverter(LogicalType type) { + final AvroToRowDataConverter converter = createConverter(type); + return avroObject -> { + if (avroObject == null) { + return null; + } + return converter.convert(avroObject); + }; + } + + /** + * Creates a runtime converter which assuming input object is not null. + */ + public static AvroToRowDataConverter createConverter(LogicalType type) { + switch (type.getTypeRoot()) { + case NULL: + return avroObject -> null; + case TINYINT: + return avroObject -> ((Integer) avroObject).byteValue(); + case SMALLINT: + return avroObject -> ((Integer) avroObject).shortValue(); + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + case RAW: + return avroObject -> avroObject; + case DATE: + return AvroToRowDataConverters::convertToDate; + case TIME_WITHOUT_TIME_ZONE: + return AvroToRowDataConverters::convertToTime; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return createTimestampConverter(((LocalZonedTimestampType) type).getPrecision()); + case TIMESTAMP_WITHOUT_TIME_ZONE: + return createTimestampConverter(((TimestampType) type).getPrecision()); + case CHAR: + case VARCHAR: + return avroObject -> StringData.fromString(avroObject.toString()); + case BINARY: + case VARBINARY: + return AvroToRowDataConverters::convertToBytes; + case DECIMAL: + return createDecimalConverter((DecimalType) type); + case ARRAY: + return createArrayConverter((ArrayType) type); + case ROW: + return createRowConverter((RowType) type); + case MAP: + case MULTISET: + return createMapConverter(type); + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) { + final int precision = decimalType.getPrecision(); + final int scale = decimalType.getScale(); + return avroObject -> { + final byte[] bytes; + if (avroObject instanceof GenericFixed) { + bytes = ((GenericFixed) avroObject).bytes(); + } else if (avroObject instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) avroObject; + bytes = new byte[byteBuffer.remaining()]; + 
byteBuffer.get(bytes); + } else { + bytes = (byte[]) avroObject; + } + return DecimalData.fromUnscaledBytes(bytes, precision, scale); + }; + } + + private static AvroToRowDataConverter createArrayConverter(ArrayType arrayType) { + final AvroToRowDataConverter elementConverter = + createNullableConverter(arrayType.getElementType()); + final Class elementClass = + LogicalTypeUtils.toInternalConversionClass(arrayType.getElementType()); + + return avroObject -> { + final List list = (List) avroObject; + final int length = list.size(); + final Object[] array = (Object[]) Array.newInstance(elementClass, length); + for (int i = 0; i < length; ++i) { + array[i] = elementConverter.convert(list.get(i)); + } + return new GenericArrayData(array); + }; + } + + private static AvroToRowDataConverter createMapConverter(LogicalType type) { + final AvroToRowDataConverter keyConverter = + createConverter(DataTypes.STRING().getLogicalType()); + final AvroToRowDataConverter valueConverter = + createNullableConverter(AvroSchemaConverter.extractValueTypeToAvroMap(type)); + + return avroObject -> { + final Map map = (Map) avroObject; + Map result = new HashMap<>(); + for (Map.Entry entry : map.entrySet()) { + Object key = keyConverter.convert(entry.getKey()); + Object value = valueConverter.convert(entry.getValue()); + result.put(key, value); + } + return new GenericMapData(result); + }; + } + + private static AvroToRowDataConverter createTimestampConverter(int precision) { + final ChronoUnit chronoUnit; + if (precision <= 3) { + chronoUnit = ChronoUnit.MILLIS; + } else if (precision <= 6) { + chronoUnit = ChronoUnit.MICROS; + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type with precision: " + + precision + + ", it only supports precision less than 6."); + } + return avroObject -> { + final Instant instant; + if (avroObject instanceof Long) { + instant = Instant.EPOCH.plus((Long) avroObject, chronoUnit); + } else if (avroObject instanceof Instant) { + instant = (Instant) avroObject; + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + // joda time has only millisecond precision + instant = Instant.ofEpochMilli(jodaConverter.convertTimestamp(avroObject)); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIMESTAMP logical type. Received: " + avroObject); + } + } + return TimestampData.fromInstant(instant); + }; + } + + private static int convertToDate(Object object) { + if (object instanceof Integer) { + return (Integer) object; + } else if (object instanceof LocalDate) { + return (int) ((LocalDate) object).toEpochDay(); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return (int) jodaConverter.convertDate(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for DATE logical type. Received: " + object); + } + } + } + + private static int convertToTime(Object object) { + final int millis; + if (object instanceof Integer) { + millis = (Integer) object; + } else if (object instanceof LocalTime) { + millis = ((LocalTime) object).get(ChronoField.MILLI_OF_DAY); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + millis = jodaConverter.convertTime(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIME logical type. 
Received: " + object); + } + } + return millis; + } + + private static byte[] convertToBytes(Object object) { + if (object instanceof GenericFixed) { + return ((GenericFixed) object).bytes(); + } else if (object instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) object; + byte[] bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + return bytes; + } else { + return (byte[]) object; + } + } + + /** + * Encapsulates joda optional dependency. Instantiates this class only if joda is available on the + * classpath. + */ + static class JodaConverter { + + private static JodaConverter instance; + + public static JodaConverter getConverter() { + if (instance != null) { + return instance; + } + + try { + Class.forName( + "org.joda.time.DateTime", + false, + Thread.currentThread().getContextClassLoader()); + instance = new JodaConverter(); + } catch (ClassNotFoundException e) { + instance = null; + } + return instance; + } + + public long convertDate(Object object) { + final org.joda.time.LocalDate value = (org.joda.time.LocalDate) object; + return value.toDate().getTime(); + } + + public int convertTime(Object object) { + final org.joda.time.LocalTime value = (org.joda.time.LocalTime) object; + return value.get(DateTimeFieldType.millisOfDay()); + } + + public long convertTimestamp(Object object) { + final DateTime value = (DateTime) object; + return value.toDate().getTime(); + } + + private JodaConverter() { + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ChangelogModes.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ChangelogModes.java new file mode 100644 index 0000000000000..164815b4ac695 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ChangelogModes.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.types.RowKind; + +/** + * Utilities for all kinds of common {@link org.apache.flink.table.connector.ChangelogMode}s. + */ +public class ChangelogModes { + public static final ChangelogMode FULL = ChangelogMode.newBuilder() + .addContainedKind(RowKind.INSERT) + .addContainedKind(RowKind.UPDATE_BEFORE) + .addContainedKind(RowKind.UPDATE_AFTER) + .addContainedKind(RowKind.DELETE) + .build(); + + /** + * Change log mode that ignores UPDATE_BEFORE, e.g UPSERT. 
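+ * <p>For example, a sink that cannot consume UPDATE_BEFORE rows may declare it through the
+ * standard Flink connector hook (sketch):
+ * <pre>{@code
+ *   @Override
+ *   public ChangelogMode getChangelogMode(ChangelogMode requestedMode) {
+ *     return ChangelogModes.UPSERT;
+ *   }
+ * }</pre>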
+ */ + public static final ChangelogMode UPSERT = ChangelogMode.newBuilder() + .addContainedKind(RowKind.INSERT) + .addContainedKind(RowKind.UPDATE_AFTER) + .addContainedKind(RowKind.DELETE) + .build(); + + private ChangelogModes() { + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ClusteringUtil.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ClusteringUtil.java new file mode 100644 index 0000000000000..580dbacc4d344 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ClusteringUtil.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.table.HoodieFlinkTable; + +import org.apache.flink.configuration.Configuration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * Utilities for flink hudi clustering. + */ +public class ClusteringUtil { + + private static final Logger LOG = LoggerFactory.getLogger(ClusteringUtil.class); + + public static void validateClusteringScheduling(Configuration conf) { + if (OptionsResolver.isBucketIndexType(conf)) { + throw new UnsupportedOperationException("Clustering is not supported for bucket index."); + } + } + + /** + * Schedules clustering plan by condition. + * + * @param conf The configuration + * @param writeClient The write client + * @param committed Whether the instant was committed + */ + public static void scheduleClustering(Configuration conf, HoodieFlinkWriteClient writeClient, boolean committed) { + validateClusteringScheduling(conf); + if (committed) { + writeClient.scheduleClustering(Option.empty()); + } + } + + /** + * Force rolls back all the inflight clustering instants, especially for job failover restart. 
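+ * <p>Typically invoked on coordinator (re)start; a sketch where {@code conf} and
+ * {@code writeClient} are assumed to be in scope and the table comes from {@code FlinkTables}:
+ * <pre>{@code
+ *   HoodieFlinkTable<?> table = FlinkTables.createTable(conf);
+ *   ClusteringUtil.rollbackClustering(table, writeClient);
+ * }</pre>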
+ * + * @param table The hoodie table + * @param writeClient The write client + */ + public static void rollbackClustering(HoodieFlinkTable table, HoodieFlinkWriteClient writeClient) { + List inflightInstants = ClusteringUtils.getPendingClusteringInstantTimes(table.getMetaClient()) + .stream() + .filter(instant -> instant.getState() == HoodieInstant.State.INFLIGHT) + .collect(Collectors.toList()); + inflightInstants.forEach(inflightInstant -> { + LOG.info("Rollback the inflight clustering instant: " + inflightInstant + " for failover"); + table.rollbackInflightClustering(inflightInstant, + commitToRollback -> writeClient.getPendingRollbackInfo(table.getMetaClient(), commitToRollback, false)); + table.getMetaClient().reloadActiveTimeline(); + }); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java new file mode 100644 index 0000000000000..d64e3a4af790a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.sink.compact.FlinkCompactionConfig; +import org.apache.hudi.table.HoodieFlinkTable; + +import org.apache.avro.Schema; +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Locale; + +/** + * Utilities for flink hudi compaction. + */ +public class CompactionUtil { + + private static final Logger LOG = LoggerFactory.getLogger(CompactionUtil.class); + + /** + * Schedules a new compaction instant. 
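+ * <p>Call sketch, with {@code metaClient} and {@code writeClient} assumed to be in scope:
+ * when the last instant committed successfully and the trigger is not delta-time based,
+ * the call is simply
+ * <pre>{@code
+ *   CompactionUtil.scheduleCompaction(metaClient, writeClient, false, true);
+ * }</pre>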
+ * + * @param metaClient The metadata client + * @param writeClient The write client + * @param deltaTimeCompaction Whether the compaction is trigger by elapsed delta time + * @param committed Whether the last instant was committed successfully + */ + public static void scheduleCompaction( + HoodieTableMetaClient metaClient, + HoodieFlinkWriteClient writeClient, + boolean deltaTimeCompaction, + boolean committed) { + if (committed) { + writeClient.scheduleCompaction(Option.empty()); + } else if (deltaTimeCompaction) { + // if there are no new commits and the compaction trigger strategy is based on elapsed delta time, + // schedules the compaction anyway. + metaClient.reloadActiveTimeline(); + Option compactionInstantTime = CompactionUtil.getCompactionInstantTime(metaClient); + if (compactionInstantTime.isPresent()) { + writeClient.scheduleCompactionAtInstant(compactionInstantTime.get(), Option.empty()); + } + } + } + + /** + * Gets compaction Instant time. + */ + public static Option getCompactionInstantTime(HoodieTableMetaClient metaClient) { + Option firstPendingInstant = metaClient.getCommitsTimeline() + .filterPendingExcludingCompaction().firstInstant(); + Option lastCompleteInstant = metaClient.getActiveTimeline().getWriteTimeline() + .filterCompletedAndCompactionInstants().lastInstant(); + if (firstPendingInstant.isPresent() && lastCompleteInstant.isPresent()) { + String firstPendingTimestamp = firstPendingInstant.get().getTimestamp(); + String lastCompleteTimestamp = lastCompleteInstant.get().getTimestamp(); + // Committed and pending compaction instants should have strictly lower timestamps + return StreamerUtil.medianInstantTime(firstPendingTimestamp, lastCompleteTimestamp); + } else if (!lastCompleteInstant.isPresent()) { + LOG.info("No instants to schedule the compaction plan"); + return Option.empty(); + } else { + return Option.of(HoodieActiveTimeline.createNewInstantTime()); + } + } + + /** + * Sets up the avro schema string into the give configuration {@code conf} + * through reading from the hoodie table metadata. + * + * @param conf The configuration + */ + public static void setAvroSchema(Configuration conf, HoodieTableMetaClient metaClient) throws Exception { + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchema(false); + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, tableAvroSchema.toString()); + } + + /** + * Sets up the avro schema string into the HoodieWriteConfig {@code HoodieWriteConfig} + * through reading from the hoodie table metadata. + * + * @param writeConfig The HoodieWriteConfig + */ + public static void setAvroSchema(HoodieWriteConfig writeConfig, HoodieTableMetaClient metaClient) throws Exception { + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchema(false); + writeConfig.setSchema(tableAvroSchema.toString()); + } + + /** + * Sets up the preCombine field into the given configuration {@code conf} + * through reading from the hoodie table metadata. + *
<p>
    + * This value is non-null as compaction can only be performed on MOR tables. + * Of which, MOR tables will have non-null precombine fields. + * + * @param conf The configuration + */ + public static void setPreCombineField(Configuration conf, HoodieTableMetaClient metaClient) { + String preCombineField = metaClient.getTableConfig().getPreCombineField(); + conf.setString(FlinkOptions.PRECOMBINE_FIELD, preCombineField); + } + + /** + * Infers the changelog mode based on the data file schema(including metadata fields). + * + *
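+ * <p>Effect sketch, assuming {@code conf} and {@code metaClient} are in scope: when the data
+ * file schema carries the operation metadata field, the changelog flag is switched on for the
+ * given configuration.
+ * <pre>{@code
+ *   CompactionUtil.inferChangelogMode(conf, metaClient); // may throw if schema resolution fails
+ *   boolean changelogEnabled = conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED);
+ * }</pre>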
<p>
    We can improve the code if the changelog mode is set up as table config. + * + * @param conf The configuration + */ + public static void inferChangelogMode(Configuration conf, HoodieTableMetaClient metaClient) throws Exception { + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchemaFromDataFile(); + if (tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null) { + conf.setBoolean(FlinkOptions.CHANGELOG_ENABLED, true); + } + } + + /** + * Cleans the metadata file for given instant {@code instant}. + */ + public static void cleanInstant(HoodieTableMetaClient metaClient, HoodieInstant instant) { + Path commitFilePath = new Path(metaClient.getMetaAuxiliaryPath(), instant.getFileName()); + try { + if (metaClient.getFs().exists(commitFilePath)) { + boolean deleted = metaClient.getFs().delete(commitFilePath, false); + if (deleted) { + LOG.info("Removed instant " + instant); + } else { + throw new HoodieIOException("Could not delete instant " + instant); + } + } + } catch (IOException e) { + throw new HoodieIOException("Could not remove requested commit " + commitFilePath, e); + } + } + + public static void rollbackCompaction(HoodieFlinkTable table, String instantTime) { + HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(instantTime); + if (table.getMetaClient().reloadActiveTimeline().filterPendingCompactionTimeline().containsInstant(inflightInstant)) { + LOG.warn("Rollback failed compaction instant: [" + instantTime + "]"); + table.rollbackInflightCompaction(inflightInstant); + } + } + + /** + * Force rolls back all the inflight compaction instants, especially for job failover restart. + * + * @param table The hoodie table + */ + public static void rollbackCompaction(HoodieFlinkTable table) { + HoodieTimeline inflightCompactionTimeline = table.getActiveTimeline() + .filterPendingCompactionTimeline() + .filter(instant -> + instant.getState() == HoodieInstant.State.INFLIGHT); + inflightCompactionTimeline.getInstants().forEach(inflightInstant -> { + LOG.info("Rollback the inflight compaction instant: " + inflightInstant + " for failover"); + table.rollbackInflightCompaction(inflightInstant); + table.getMetaClient().reloadActiveTimeline(); + }); + } + + /** + * Rolls back the earliest compaction if there exists. + * + *
<p>
    Makes the strategy not that radical: firstly check whether there exists inflight compaction instants, + * rolls back the first inflight instant only if it has timed out. That means, if there are + * multiple timed out instants on the timeline, we only roll back the first one at a time. + */ + public static void rollbackEarliestCompaction(HoodieFlinkTable table, Configuration conf) { + Option earliestInflight = table.getActiveTimeline() + .filterPendingCompactionTimeline() + .filter(instant -> + instant.getState() == HoodieInstant.State.INFLIGHT).firstInstant(); + if (earliestInflight.isPresent()) { + HoodieInstant instant = earliestInflight.get(); + String currentTime = HoodieActiveTimeline.createNewInstantTime(); + int timeout = conf.getInteger(FlinkOptions.COMPACTION_TIMEOUT_SECONDS); + if (StreamerUtil.instantTimeDiffSeconds(currentTime, instant.getTimestamp()) >= timeout) { + LOG.info("Rollback the inflight compaction instant: " + instant + " for timeout(" + timeout + "s)"); + table.rollbackInflightCompaction(instant); + table.getMetaClient().reloadActiveTimeline(); + } + } + } + + /** + * Returns whether the execution sequence is LIFO. + */ + public static boolean isLIFO(String seq) { + return seq.toUpperCase(Locale.ROOT).equals(FlinkCompactionConfig.SEQ_LIFO); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/DataTypeUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/DataTypeUtils.java new file mode 100644 index 0000000000000..c772dc8539171 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/DataTypeUtils.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; + +import javax.annotation.Nullable; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Utilities for {@link org.apache.flink.table.types.DataType}. + */ +public class DataTypeUtils { + /** + * Returns whether the given type is TIMESTAMP type. 
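+ * <p>Only TIMESTAMP WITHOUT TIME ZONE matches, e.g. (sketch):
+ * <pre>{@code
+ *   DataTypeUtils.isTimestampType(DataTypes.TIMESTAMP(3));                      // true
+ *   DataTypeUtils.isTimestampType(DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)); // false
+ * }</pre>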
+ */ + public static boolean isTimestampType(DataType type) { + return type.getLogicalType().getTypeRoot() == LogicalTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE; + } + + /** + * Returns the precision of the given TIMESTAMP type. + */ + public static int precision(LogicalType logicalType) { + if (logicalType instanceof TimestampType) { + return ((TimestampType) logicalType).getPrecision(); + } else if (logicalType instanceof LocalZonedTimestampType) { + return ((LocalZonedTimestampType) logicalType).getPrecision(); + } else { + throw new AssertionError("Unexpected type: " + logicalType); + } + } + + /** + * Returns whether the given type is DATE type. + */ + public static boolean isDateType(DataType type) { + return type.getLogicalType().getTypeRoot() == LogicalTypeRoot.DATE; + } + + /** + * Returns whether the given type is DATETIME type. + */ + public static boolean isDatetimeType(DataType type) { + return isTimestampType(type) || isDateType(type); + } + + /** + * Projects the row fields with given names. + */ + public static RowType.RowField[] projectRowFields(RowType rowType, String[] names) { + int[] fieldIndices = Arrays.stream(names).mapToInt(rowType::getFieldIndex).toArray(); + return Arrays.stream(fieldIndices).mapToObj(i -> rowType.getFields().get(i)).toArray(RowType.RowField[]::new); + } + + /** + * Returns whether the given logical type belongs to the family. + */ + public static boolean isFamily(LogicalType logicalType, LogicalTypeFamily family) { + return logicalType.getTypeRoot().getFamilies().contains(family); + } + + /** + * Resolves the partition path string into value obj with given data type. + */ + public static Object resolvePartition(String partition, DataType type) { + if (partition == null) { + return null; + } + + LogicalTypeRoot typeRoot = type.getLogicalType().getTypeRoot(); + switch (typeRoot) { + case CHAR: + case VARCHAR: + return partition; + case BOOLEAN: + return Boolean.parseBoolean(partition); + case TINYINT: + return Integer.valueOf(partition).byteValue(); + case SMALLINT: + return Short.valueOf(partition); + case INTEGER: + return Integer.valueOf(partition); + case BIGINT: + return Long.valueOf(partition); + case FLOAT: + return Float.valueOf(partition); + case DOUBLE: + return Double.valueOf(partition); + case DATE: + return LocalDate.parse(partition); + case TIMESTAMP_WITHOUT_TIME_ZONE: + return LocalDateTime.parse(partition); + case DECIMAL: + return new BigDecimal(partition); + default: + throw new RuntimeException( + String.format( + "Can not convert %s to type %s for partition value", partition, type)); + } + } + + /** + * Ensures the give columns of the row data type are not nullable(for example, the primary keys). 
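+ * <p>Sketch: a nullable {@code id} declared as primary key gets tightened while the rest of
+ * the row is left untouched:
+ * <pre>{@code
+ *   DataType rowType = DataTypes.ROW(
+ *       DataTypes.FIELD("id", DataTypes.INT()),
+ *       DataTypes.FIELD("name", DataTypes.STRING()));
+ *   DataType fixed = DataTypeUtils.ensureColumnsAsNonNullable(
+ *       rowType, Collections.singletonList("id"));
+ *   // fixed is ROW<id INT NOT NULL, name STRING> NOT NULL
+ * }</pre>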
+ * + * @param dataType The row data type, datatype logicaltype must be rowtype + * @param pkColumns The primary keys + * @return a new row data type if any column nullability is tweaked or the original data type + */ + public static DataType ensureColumnsAsNonNullable(DataType dataType, @Nullable List pkColumns) { + if (pkColumns == null || pkColumns.isEmpty()) { + return dataType; + } + LogicalType dataTypeLogicalType = dataType.getLogicalType(); + if (!(dataTypeLogicalType instanceof RowType)) { + throw new RuntimeException("The datatype to be converted must be row type, but this type is :" + dataTypeLogicalType.getClass()); + } + RowType rowType = (RowType) dataTypeLogicalType; + List originalFieldTypes = dataType.getChildren(); + List fieldNames = rowType.getFieldNames(); + List fieldTypes = new ArrayList<>(); + boolean tweaked = false; + for (int i = 0; i < fieldNames.size(); i++) { + if (pkColumns.contains(fieldNames.get(i)) && rowType.getTypeAt(i).isNullable()) { + fieldTypes.add(originalFieldTypes.get(i).notNull()); + tweaked = true; + } else { + fieldTypes.add(originalFieldTypes.get(i)); + } + } + if (!tweaked) { + return dataType; + } + List fields = new ArrayList<>(); + for (int i = 0; i < fieldNames.size(); i++) { + fields.add(DataTypes.FIELD(fieldNames.get(i), fieldTypes.get(i))); + } + return DataTypes.ROW(fields.stream().toArray(DataTypes.Field[]::new)).notNull(); + } + +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ExpressionUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ExpressionUtils.java new file mode 100644 index 0000000000000..20473acdcda6b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ExpressionUtils.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.expressions.ValueLiteralExpression; +import org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.flink.table.functions.FunctionDefinition; +import org.apache.flink.table.types.logical.LogicalType; + +import javax.annotation.Nullable; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoField; +import java.util.Arrays; +import java.util.List; + +/** + * Utilities for expression resolving. 
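+ * <p>For example, a pushed-down filter like {@code age > 18 AND name = 'hudi'} consists of
+ * "simple" call expressions only (each conjunct compares one field reference against one
+ * literal), and {@link #referencedColumns} resolves it to {@code ["age", "name"]}.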
+ */ +public class ExpressionUtils { + + /** + * Collect the referenced columns with given expressions, + * only simple call expression is supported. + */ + public static String[] referencedColumns(List exprs) { + return exprs.stream() + .map(ExpressionUtils::getReferencedColumns) + .filter(columns -> columns.length > 0) + .flatMap(Arrays::stream) + .distinct() // deduplication + .toArray(String[]::new); + } + + /** + * Returns whether the given expression is simple call expression: + * a binary call with one operand as field reference and another operand + * as literal. + */ + public static boolean isSimpleCallExpression(Expression expr) { + if (!(expr instanceof CallExpression)) { + return false; + } + CallExpression callExpression = (CallExpression) expr; + FunctionDefinition funcDef = callExpression.getFunctionDefinition(); + // simple call list: + // NOT AND OR IN EQUALS NOT_EQUALS IS_NULL IS_NOT_NULL LESS_THAN GREATER_THAN + // LESS_THAN_OR_EQUAL GREATER_THAN_OR_EQUAL + + if (funcDef == BuiltInFunctionDefinitions.NOT + || funcDef == BuiltInFunctionDefinitions.AND + || funcDef == BuiltInFunctionDefinitions.OR) { + return callExpression.getChildren().stream() + .allMatch(ExpressionUtils::isSimpleCallExpression); + } + if (!(funcDef == BuiltInFunctionDefinitions.IN + || funcDef == BuiltInFunctionDefinitions.EQUALS + || funcDef == BuiltInFunctionDefinitions.NOT_EQUALS + || funcDef == BuiltInFunctionDefinitions.IS_NULL + || funcDef == BuiltInFunctionDefinitions.IS_NOT_NULL + || funcDef == BuiltInFunctionDefinitions.LESS_THAN + || funcDef == BuiltInFunctionDefinitions.GREATER_THAN + || funcDef == BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL + || funcDef == BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL)) { + return false; + } + // handle IN + if (funcDef == BuiltInFunctionDefinitions.IN) { + // In expression RHS operands are always literals + return true; + } + // handle unary operator + if (funcDef == BuiltInFunctionDefinitions.IS_NULL + || funcDef == BuiltInFunctionDefinitions.IS_NOT_NULL) { + return callExpression.getChildren().stream() + .allMatch(e -> e instanceof FieldReferenceExpression); + } + // handle binary operator + return isFieldReferenceAndLiteral(callExpression.getChildren()); + } + + private static boolean isFieldReferenceAndLiteral(List exprs) { + if (exprs.size() != 2) { + return false; + } + final Expression expr0 = exprs.get(0); + final Expression expr1 = exprs.get(1); + return expr0 instanceof FieldReferenceExpression && expr1 instanceof ValueLiteralExpression + || expr0 instanceof ValueLiteralExpression && expr1 instanceof FieldReferenceExpression; + } + + private static String[] getReferencedColumns(ResolvedExpression expression) { + CallExpression callExpr = (CallExpression) expression; + FunctionDefinition funcDef = callExpr.getFunctionDefinition(); + if (funcDef == BuiltInFunctionDefinitions.NOT + || funcDef == BuiltInFunctionDefinitions.AND + || funcDef == BuiltInFunctionDefinitions.OR) { + return callExpr.getChildren().stream() + .map(e -> getReferencedColumns((ResolvedExpression) e)) + .flatMap(Arrays::stream) + .toArray(String[]::new); + } + + return expression.getChildren().stream() + .filter(expr -> expr instanceof FieldReferenceExpression) + .map(expr -> ((FieldReferenceExpression) expr).getName()) + .toArray(String[]::new); + } + + /** + * Returns the value with given value literal expression. + * + *
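+ * <p>Conversion sketch: temporal literals are flattened to epoch-based numbers, e.g. a DATE
+ * literal becomes its epoch day, a TIME literal the millisecond of day, and a TIMESTAMP
+ * literal the epoch milliseconds interpreted in UTC.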
<p>
    Returns null if the value can not parse as the output data type correctly, + * should call {@code ValueLiteralExpression.isNull} first to decide whether + * the literal is NULL. + */ + @Nullable + public static Object getValueFromLiteral(ValueLiteralExpression expr) { + LogicalType logicalType = expr.getOutputDataType().getLogicalType(); + switch (logicalType.getTypeRoot()) { + case TIMESTAMP_WITHOUT_TIME_ZONE: + return expr.getValueAs(LocalDateTime.class) + .map(ldt -> ldt.toInstant(ZoneOffset.UTC).toEpochMilli()) + .orElse(null); + case TIME_WITHOUT_TIME_ZONE: + return expr.getValueAs(LocalTime.class) + .map(lt -> lt.get(ChronoField.MILLI_OF_DAY)) + .orElse(null); + case DATE: + return expr.getValueAs(LocalDate.class) + .map(LocalDate::toEpochDay) + .orElse(null); + // NOTE: All integral types of size less than Int are encoded as Ints in MT + case BOOLEAN: + return expr.getValueAs(Boolean.class).orElse(null); + case TINYINT: + case SMALLINT: + case INTEGER: + return expr.getValueAs(Integer.class).orElse(null); + case FLOAT: + return expr.getValueAs(Float.class).orElse(null); + case DOUBLE: + return expr.getValueAs(Double.class).orElse(null); + case BINARY: + case VARBINARY: + return expr.getValueAs(byte[].class).orElse(null); + case CHAR: + case VARCHAR: + return expr.getValueAs(String.class).orElse(null); + case DECIMAL: + return expr.getValueAs(BigDecimal.class).orElse(null); + default: + throw new UnsupportedOperationException("Unsupported type: " + logicalType); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkStateBackendConverter.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkStateBackendConverter.java new file mode 100644 index 0000000000000..c147f60c94d16 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkStateBackendConverter.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.exception.HoodieException; + +import com.beust.jcommander.IStringConverter; +import com.beust.jcommander.ParameterException; +import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend; +import org.apache.flink.runtime.state.StateBackend; +import org.apache.flink.runtime.state.hashmap.HashMapStateBackend; + +/** + * Converter that converts a string into Flink StateBackend. 
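+ * <p>Meant to be plugged into a JCommander option; a sketch with an illustrative flag name:
+ * <pre>{@code
+ *   @Parameter(names = {"--flink-state-backend"}, converter = FlinkStateBackendConverter.class,
+ *       description = "Supports hashmap and rocksdb")
+ *   public StateBackend stateBackend = new HashMapStateBackend();
+ * }</pre>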
+ */ +public class FlinkStateBackendConverter implements IStringConverter { + @Override + public StateBackend convert(String value) throws ParameterException { + switch (value) { + case "hashmap": + return new HashMapStateBackend(); + case "rocksdb": + return new EmbeddedRocksDBStateBackend(); + default: + throw new HoodieException(String.format("Unknown flink state backend %s. Supports only hashmap and rocksdb by now", value)); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkTables.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkTables.java new file mode 100644 index 0000000000000..ee164d3cda951 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkTables.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieFlinkTable; + +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.configuration.Configuration; + +import static org.apache.hudi.configuration.HadoopConfigurations.getHadoopConf; + +/** + * Utilities for {@link org.apache.hudi.table.HoodieFlinkTable}. + */ +public class FlinkTables { + private FlinkTables() { + } + + /** + * Creates the hoodie flink table. + * + *
    This expects to be used by client. + */ + public static HoodieFlinkTable createTable(Configuration conf, RuntimeContext runtimeContext) { + HoodieFlinkEngineContext context = new HoodieFlinkEngineContext( + new SerializableConfiguration(getHadoopConf(conf)), + new FlinkTaskContextSupplier(runtimeContext)); + HoodieWriteConfig writeConfig = FlinkWriteClients.getHoodieClientConfig(conf, true); + return HoodieFlinkTable.create(writeConfig, context); + } + + /** + * Creates the hoodie flink table. + * + *
    This expects to be used by client. + */ + public static HoodieFlinkTable createTable( + HoodieWriteConfig writeConfig, + org.apache.hadoop.conf.Configuration hadoopConf, + RuntimeContext runtimeContext) { + HoodieFlinkEngineContext context = new HoodieFlinkEngineContext( + new SerializableConfiguration(hadoopConf), + new FlinkTaskContextSupplier(runtimeContext)); + return HoodieFlinkTable.create(writeConfig, context); + } + + /** + * Creates the hoodie flink table. + * + *
    This expects to be used by driver. + */ + public static HoodieFlinkTable createTable(Configuration conf) { + HoodieWriteConfig writeConfig = FlinkWriteClients.getHoodieClientConfig(conf, true, false); + return HoodieFlinkTable.create(writeConfig, HoodieFlinkEngineContext.DEFAULT); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkWriteClients.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkWriteClients.java new file mode 100644 index 0000000000000..41712e8fb9829 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/FlinkWriteClients.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.client.transaction.lock.FileSystemBasedLockProvider; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieLockConfig; +import org.apache.hudi.config.HoodieMemoryConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; +import org.apache.hudi.table.action.compact.CompactionTriggerStrategy; + +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.configuration.Configuration; + +import java.io.IOException; +import java.util.Locale; + +import static org.apache.hudi.util.StreamerUtil.flinkConf2TypedProperties; +import static org.apache.hudi.util.StreamerUtil.getPayloadConfig; +import static org.apache.hudi.util.StreamerUtil.getSourceSchema; + +/** + * Utilities for {@link org.apache.hudi.client.HoodieFlinkWriteClient}. + */ +public class FlinkWriteClients { + + /** + * Creates the Flink write client. + * + *
    This expects to be used by the driver; the client can then send requests for files view. + * + *
    The task context supplier is a constant: the write token is always '0-1-0'. + */ + @SuppressWarnings("rawtypes") + public static HoodieFlinkWriteClient createWriteClient(Configuration conf) throws IOException { + HoodieWriteConfig writeConfig = getHoodieClientConfig(conf, true, false); + // build the write client to start the embedded timeline server + final HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient<>(new HoodieFlinkEngineContext(HadoopConfigurations.getHadoopConf(conf)), writeConfig); + writeClient.setOperationType(WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION))); + // create the filesystem view storage properties for client + final FileSystemViewStorageConfig viewStorageConfig = writeConfig.getViewStorageConfig(); + // rebuild the view storage config with simplified options. + FileSystemViewStorageConfig rebuilt = FileSystemViewStorageConfig.newBuilder() + .withStorageType(viewStorageConfig.getStorageType()) + .withRemoteServerHost(viewStorageConfig.getRemoteViewServerHost()) + .withRemoteServerPort(viewStorageConfig.getRemoteViewServerPort()) + .withRemoteTimelineClientTimeoutSecs(viewStorageConfig.getRemoteTimelineClientTimeoutSecs()) + .withRemoteTimelineClientRetry(viewStorageConfig.isRemoteTimelineClientRetryEnabled()) + .withRemoteTimelineClientMaxRetryNumbers(viewStorageConfig.getRemoteTimelineClientMaxRetryNumbers()) + .withRemoteTimelineInitialRetryIntervalMs(viewStorageConfig.getRemoteTimelineInitialRetryIntervalMs()) + .withRemoteTimelineClientMaxRetryIntervalMs(viewStorageConfig.getRemoteTimelineClientMaxRetryIntervalMs()) + .withRemoteTimelineClientRetryExceptions(viewStorageConfig.getRemoteTimelineClientRetryExceptions()) + .build(); + ViewStorageProperties.createProperties(conf.getString(FlinkOptions.PATH), rebuilt, conf); + return writeClient; + } + + /** + * Creates the Flink write client. + * + *
    This expects to be used by the driver; the client can then send requests for files view. + * + *
    The task context supplier is a constant: the write token is always '0-1-0'. + * + *
    Note: different with {@link #createWriteClient}, the fs view storage options are set into the given + * configuration {@code conf}. + */ + @SuppressWarnings("rawtypes") + public static HoodieFlinkWriteClient createWriteClientV2(Configuration conf) { + HoodieWriteConfig writeConfig = getHoodieClientConfig(conf, true, false); + // build the write client to start the embedded timeline server + final HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient<>(new HoodieFlinkEngineContext(HadoopConfigurations.getHadoopConf(conf)), writeConfig); + writeClient.setOperationType(WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION))); + // create the filesystem view storage properties for client + final FileSystemViewStorageConfig viewStorageConfig = writeConfig.getViewStorageConfig(); + conf.setString(FileSystemViewStorageConfig.VIEW_TYPE.key(), viewStorageConfig.getStorageType().name()); + conf.setString(FileSystemViewStorageConfig.REMOTE_HOST_NAME.key(), viewStorageConfig.getRemoteViewServerHost()); + conf.setInteger(FileSystemViewStorageConfig.REMOTE_PORT_NUM.key(), viewStorageConfig.getRemoteViewServerPort()); + return writeClient; + } + + /** + * Creates the Flink write client. + * + *
    This expects to be used by client, the driver should start an embedded timeline server. + */ + @SuppressWarnings("rawtypes") + public static HoodieFlinkWriteClient createWriteClient(Configuration conf, RuntimeContext runtimeContext) { + return createWriteClient(conf, runtimeContext, true); + } + + /** + * Creates the Flink write client. + * + *
    This expects to be used by client, set flag {@code loadFsViewStorageConfig} to use + * remote filesystem view storage config, or an in-memory filesystem view storage is used. + */ + @SuppressWarnings("rawtypes") + public static HoodieFlinkWriteClient createWriteClient(Configuration conf, RuntimeContext runtimeContext, boolean loadFsViewStorageConfig) { + HoodieFlinkEngineContext context = + new HoodieFlinkEngineContext( + new SerializableConfiguration(HadoopConfigurations.getHadoopConf(conf)), + new FlinkTaskContextSupplier(runtimeContext)); + + HoodieWriteConfig writeConfig = getHoodieClientConfig(conf, loadFsViewStorageConfig); + return new HoodieFlinkWriteClient<>(context, writeConfig); + } + + /** + * Mainly used for tests. + */ + public static HoodieWriteConfig getHoodieClientConfig(Configuration conf) { + return getHoodieClientConfig(conf, false, false); + } + + public static HoodieWriteConfig getHoodieClientConfig(Configuration conf, boolean loadFsViewStorageConfig) { + return getHoodieClientConfig(conf, false, loadFsViewStorageConfig); + } + + public static HoodieWriteConfig getHoodieClientConfig( + Configuration conf, + boolean enableEmbeddedTimelineService, + boolean loadFsViewStorageConfig) { + HoodieWriteConfig.Builder builder = + HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.FLINK) + .withPath(conf.getString(FlinkOptions.PATH)) + .combineInput(conf.getBoolean(FlinkOptions.PRE_COMBINE), true) + .withMergeAllowDuplicateOnInserts(OptionsResolver.insertClustering(conf)) + .withClusteringConfig( + HoodieClusteringConfig.newBuilder() + .withAsyncClustering(conf.getBoolean(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED)) + .withClusteringPlanStrategyClass(conf.getString(FlinkOptions.CLUSTERING_PLAN_STRATEGY_CLASS)) + .withClusteringPlanPartitionFilterMode( + ClusteringPlanPartitionFilterMode.valueOf(conf.getString(FlinkOptions.CLUSTERING_PLAN_PARTITION_FILTER_MODE_NAME))) + .withClusteringTargetPartitions(conf.getInteger(FlinkOptions.CLUSTERING_TARGET_PARTITIONS)) + .withClusteringMaxNumGroups(conf.getInteger(FlinkOptions.CLUSTERING_MAX_NUM_GROUPS)) + .withClusteringTargetFileMaxBytes(conf.getLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES)) + .withClusteringPlanSmallFileLimit(conf.getLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT) * 1024 * 1024L) + .withClusteringSkipPartitionsFromLatest(conf.getInteger(FlinkOptions.CLUSTERING_PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST)) + .withAsyncClusteringMaxCommits(conf.getInteger(FlinkOptions.CLUSTERING_DELTA_COMMITS)) + .build()) + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withAsyncClean(conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) + .retainCommits(conf.getInteger(FlinkOptions.CLEAN_RETAIN_COMMITS)) + .cleanerNumHoursRetained(conf.getInteger(FlinkOptions.CLEAN_RETAIN_HOURS)) + .retainFileVersions(conf.getInteger(FlinkOptions.CLEAN_RETAIN_FILE_VERSIONS)) + // override and hardcode to 20, + // actually Flink cleaning is always with parallelism 1 now + .withCleanerParallelism(20) + .withCleanerPolicy(HoodieCleaningPolicy.valueOf(conf.getString(FlinkOptions.CLEAN_POLICY))) + .build()) + .withArchivalConfig(HoodieArchivalConfig.newBuilder() + .archiveCommitsWith(conf.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS), conf.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS)) + .build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withTargetIOPerCompactionInMB(conf.getLong(FlinkOptions.COMPACTION_TARGET_IO)) + .withInlineCompactionTriggerStrategy( + 
CompactionTriggerStrategy.valueOf(conf.getString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY).toUpperCase(Locale.ROOT))) + .withMaxNumDeltaCommitsBeforeCompaction(conf.getInteger(FlinkOptions.COMPACTION_DELTA_COMMITS)) + .withMaxDeltaSecondsBeforeCompaction(conf.getInteger(FlinkOptions.COMPACTION_DELTA_SECONDS)) + .build()) + .withMemoryConfig( + HoodieMemoryConfig.newBuilder() + .withMaxMemoryMaxSize( + conf.getInteger(FlinkOptions.WRITE_MERGE_MAX_MEMORY) * 1024 * 1024L, + conf.getInteger(FlinkOptions.COMPACTION_MAX_MEMORY) * 1024 * 1024L + ).build()) + .forTable(conf.getString(FlinkOptions.TABLE_NAME)) + .withStorageConfig(HoodieStorageConfig.newBuilder() + .logFileDataBlockMaxSize(conf.getInteger(FlinkOptions.WRITE_LOG_BLOCK_SIZE) * 1024 * 1024) + .logFileMaxSize(conf.getLong(FlinkOptions.WRITE_LOG_MAX_SIZE) * 1024 * 1024) + .parquetBlockSize(conf.getInteger(FlinkOptions.WRITE_PARQUET_BLOCK_SIZE) * 1024 * 1024) + .parquetPageSize(conf.getInteger(FlinkOptions.WRITE_PARQUET_PAGE_SIZE) * 1024 * 1024) + .parquetMaxFileSize(conf.getInteger(FlinkOptions.WRITE_PARQUET_MAX_FILE_SIZE) * 1024 * 1024L) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(conf.getBoolean(FlinkOptions.METADATA_ENABLED)) + .withMaxNumDeltaCommitsBeforeCompaction(conf.getInteger(FlinkOptions.METADATA_COMPACTION_DELTA_COMMITS)) + .build()) + .withLockConfig(HoodieLockConfig.newBuilder() + .withLockProvider(FileSystemBasedLockProvider.class) + .withLockWaitTimeInMillis(2000L) // 2s + .withFileSystemLockExpire(1) // 1 minute + .withClientNumRetries(30) + .withFileSystemLockPath(StreamerUtil.getAuxiliaryPath(conf)) + .build()) + .withPayloadConfig(getPayloadConfig(conf)) + .withEmbeddedTimelineServerEnabled(enableEmbeddedTimelineService) + .withEmbeddedTimelineServerReuseEnabled(true) // make write client embedded timeline service singleton + .withAutoCommit(false) + .withAllowOperationMetadataField(conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)) + .withProps(flinkConf2TypedProperties(conf)) + .withSchema(getSourceSchema(conf).toString()); + + // do not configure cleaning strategy as LAZY until multi-writers is supported. + HoodieWriteConfig writeConfig = builder.build(); + if (loadFsViewStorageConfig && !conf.containsKey(FileSystemViewStorageConfig.REMOTE_HOST_NAME.key())) { + // do not use the builder to give a change for recovering the original fs view storage config + FileSystemViewStorageConfig viewStorageConfig = ViewStorageProperties.loadFromProperties(conf.getString(FlinkOptions.PATH), conf); + writeConfig.setViewStorageConfig(viewStorageConfig); + } + return writeConfig; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/HoodiePipeline.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/HoodiePipeline.java new file mode 100644 index 0000000000000..f95367c83613c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/HoodiePipeline.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.adapter.Utils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieTableFactory; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.connector.sink.DataStreamSinkProvider; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.flink.table.runtime.connector.sink.SinkRuntimeProviderContext; +import org.apache.flink.table.runtime.connector.source.ScanRuntimeProviderContext; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * A tool class to construct hoodie flink pipeline. + * + *
    How to use?
    + * Method {@link #builder(String)} returns a pipeline builder. The builder + * can then define the hudi table columns, primary keys and partitions. + * + *
    An example:
    + *
    + *    HoodiePipeline.Builder builder = HoodiePipeline.builder("myTable");
    + *    DataStreamSink sinkStream = builder
    + *        .column("f0 int")
    + *        .column("f1 varchar(10)")
    + *        .column("f2 varchar(20)")
    + *        .pk("f0,f1")
    + *        .partition("f2")
    + *        .sink(input, false);
    + *  
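    + *
    + *  A corresponding read sketch (assuming an existing StreamExecutionEnvironment named 'env';
    + *  the variable names are illustrative only):
    + *
    + *    DataStream<RowData> rowData = builder.source(env);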
    + */ +public class HoodiePipeline { + + private static final Logger LOG = LogManager.getLogger(HoodiePipeline.class); + + /** + * Returns the builder for hoodie pipeline construction. + */ + public static Builder builder(String tableName) { + return new Builder(tableName); + } + + /** + * Builder for hudi source/sink pipeline construction. + */ + public static class Builder { + private final String tableName; + private final List columns; + private final Map options; + + private String pk; + private List partitions; + + private Builder(String tableName) { + this.tableName = tableName; + this.columns = new ArrayList<>(); + this.options = new HashMap<>(); + this.partitions = new ArrayList<>(); + } + + /** + * Add a table column definition. + * + * @param column the column format should be in the form like 'f0 int' + */ + public Builder column(String column) { + this.columns.add(column); + return this; + } + + /** + * Add primary keys. + */ + public Builder pk(String... pks) { + this.pk = String.join(",", pks); + return this; + } + + /** + * Add partition fields. + */ + public Builder partition(String... partitions) { + this.partitions = new ArrayList<>(Arrays.asList(partitions)); + return this; + } + + /** + * Add a config option. + */ + public Builder option(ConfigOption option, Object val) { + this.options.put(option.key(), val.toString()); + return this; + } + + public Builder option(String key, Object val) { + this.options.put(key, val.toString()); + return this; + } + + public Builder options(Map options) { + this.options.putAll(options); + return this; + } + + public DataStreamSink sink(DataStream input, boolean bounded) { + TableDescriptor tableDescriptor = getTableDescriptor(); + return HoodiePipeline.sink(input, tableDescriptor.getTableId(), tableDescriptor.getResolvedCatalogTable(), bounded); + } + + public TableDescriptor getTableDescriptor() { + EnvironmentSettings environmentSettings = EnvironmentSettings + .newInstance() + .build(); + TableEnvironmentImpl tableEnv = TableEnvironmentImpl.create(environmentSettings); + String sql = getCreateHoodieTableDDL(this.tableName, this.columns, this.options, this.pk, this.partitions); + tableEnv.executeSql(sql); + String currentCatalog = tableEnv.getCurrentCatalog(); + ResolvedCatalogTable catalogTable = null; + String defaultDatabase = null; + try { + Catalog catalog = tableEnv.getCatalog(currentCatalog).get(); + defaultDatabase = catalog.getDefaultDatabase(); + catalogTable = (ResolvedCatalogTable) catalog.getTable(new ObjectPath(defaultDatabase, this.tableName)); + } catch (TableNotExistException e) { + throw new HoodieException("Create table " + this.tableName + " exception", e); + } + ObjectIdentifier tableId = ObjectIdentifier.of(currentCatalog, defaultDatabase, this.tableName); + return new TableDescriptor(tableId, catalogTable); + } + + public DataStream source(StreamExecutionEnvironment execEnv) { + TableDescriptor tableDescriptor = getTableDescriptor(); + return HoodiePipeline.source(execEnv, tableDescriptor.tableId, tableDescriptor.getResolvedCatalogTable()); + } + } + + private static String getCreateHoodieTableDDL( + String tableName, + List fields, + Map options, + String pkField, + List partitionField) { + StringBuilder builder = new StringBuilder(); + builder.append("create table ") + .append(tableName) + .append("(\n"); + for (String field : fields) { + builder.append(" ") + .append(field) + .append(",\n"); + } + builder.append(" PRIMARY KEY(") + .append(pkField) + .append(") NOT ENFORCED\n") + .append(")\n"); + if 
(!partitionField.isEmpty()) { + String partitons = partitionField + .stream() + .map(partitionName -> "`" + partitionName + "`") + .collect(Collectors.joining(",")); + builder.append("PARTITIONED BY (") + .append(partitons) + .append(")\n"); + } + builder.append("with ('connector' = 'hudi'"); + options.forEach((k, v) -> builder + .append(",\n") + .append(" '") + .append(k) + .append("' = '") + .append(v) + .append("'")); + builder.append("\n)"); + return builder.toString(); + } + + /** + * Returns the data stream sink with given catalog table. + * + * @param input The input datastream + * @param tablePath The table path to the hoodie table in the catalog + * @param catalogTable The hoodie catalog table + * @param isBounded A flag indicating whether the input data stream is bounded + */ + private static DataStreamSink sink(DataStream input, ObjectIdentifier tablePath, ResolvedCatalogTable catalogTable, boolean isBounded) { + FactoryUtil.DefaultDynamicTableContext context = Utils.getTableContext(tablePath, catalogTable, Configuration.fromMap(catalogTable.getOptions())); + HoodieTableFactory hoodieTableFactory = new HoodieTableFactory(); + return ((DataStreamSinkProvider) hoodieTableFactory.createDynamicTableSink(context) + .getSinkRuntimeProvider(new SinkRuntimeProviderContext(isBounded))) + .consumeDataStream(input); + } + + /** + * Returns the data stream source with given catalog table. + * + * @param execEnv The execution environment + * @param tablePath The table path to the hoodie table in the catalog + * @param catalogTable The hoodie catalog table + */ + private static DataStream source(StreamExecutionEnvironment execEnv, ObjectIdentifier tablePath, ResolvedCatalogTable catalogTable) { + FactoryUtil.DefaultDynamicTableContext context = Utils.getTableContext(tablePath, catalogTable, Configuration.fromMap(catalogTable.getOptions())); + HoodieTableFactory hoodieTableFactory = new HoodieTableFactory(); + DataStreamScanProvider dataStreamScanProvider = (DataStreamScanProvider) ((ScanTableSource) hoodieTableFactory + .createDynamicTableSource(context)) + .getScanRuntimeProvider(new ScanRuntimeProviderContext()); + return dataStreamScanProvider.produceDataStream(execEnv); + } + + /*** + * A POJO that contains tableId and resolvedCatalogTable. + */ + public static class TableDescriptor { + private final ObjectIdentifier tableId; + private final ResolvedCatalogTable resolvedCatalogTable; + + public TableDescriptor(ObjectIdentifier tableId, ResolvedCatalogTable resolvedCatalogTable) { + this.tableId = tableId; + this.resolvedCatalogTable = resolvedCatalogTable; + } + + public ObjectIdentifier getTableId() { + return tableId; + } + + public ResolvedCatalogTable getResolvedCatalogTable() { + return resolvedCatalogTable; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/InputFormats.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/InputFormats.java new file mode 100644 index 0000000000000..f193357e88809 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/InputFormats.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.api.java.io.CollectionInputFormat; +import org.apache.flink.table.data.RowData; + +import java.util.Collections; + +/** + * Utilities for all kinds of {@link org.apache.flink.api.common.io.InputFormat}s. + */ +public class InputFormats { + public static final InputFormat EMPTY_INPUT_FORMAT = + new CollectionInputFormat<>(Collections.emptyList(), null); +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataProjection.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataProjection.java new file mode 100644 index 0000000000000..8076d982b9919 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataProjection.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.common.util.ValidationUtils; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; + +/** + * Utilities to project the row data with given positions. 
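+ *
+ * A usage sketch (the 'rowType' and 'row' variables below are assumed to exist):
+ *
+ *    RowDataProjection projection = RowDataProjection.instance(rowType, new int[] {0, 2});
+ *    RowData projected = projection.project(row);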
+ */ +public class RowDataProjection implements Serializable { + private static final long serialVersionUID = 1L; + + private final RowData.FieldGetter[] fieldGetters; + + private RowDataProjection(LogicalType[] types, int[] positions) { + ValidationUtils.checkArgument(types.length == positions.length, + "types and positions should have the equal number"); + this.fieldGetters = new RowData.FieldGetter[types.length]; + for (int i = 0; i < types.length; i++) { + final LogicalType type = types[i]; + final int pos = positions[i]; + this.fieldGetters[i] = RowData.createFieldGetter(type, pos); + } + } + + public static RowDataProjection instance(RowType rowType, int[] positions) { + final LogicalType[] types = rowType.getChildren().toArray(new LogicalType[0]); + return new RowDataProjection(types, positions); + } + + public static RowDataProjection instanceV2(RowType rowType, int[] positions) { + List fieldTypes = rowType.getChildren(); + final LogicalType[] types = Arrays.stream(positions).mapToObj(fieldTypes::get).toArray(LogicalType[]::new); + return new RowDataProjection(types, positions); + } + + public static RowDataProjection instance(LogicalType[] types, int[] positions) { + return new RowDataProjection(types, positions); + } + + /** + * Returns the projected row data. + */ + public RowData project(RowData rowData) { + GenericRowData genericRowData = new GenericRowData(this.fieldGetters.length); + for (int i = 0; i < this.fieldGetters.length; i++) { + final Object val = this.fieldGetters[i].getFieldOrNull(rowData); + genericRowData.setField(i, val); + } + return genericRowData; + } + + /** + * Returns the projected values array. + */ + public Object[] projectAsValues(RowData rowData) { + Object[] values = new Object[this.fieldGetters.length]; + for (int i = 0; i < this.fieldGetters.length; i++) { + final Object val = this.fieldGetters[i].getFieldOrNull(rowData); + values[i] = val; + } + return values; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataToAvroConverters.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataToAvroConverters.java new file mode 100644 index 0000000000000..ecebd1adcdbc8 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataToAvroConverters.java @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.util; + +import org.apache.avro.Conversions; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; + +import java.io.Serializable; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Tool class used to convert from {@link RowData} to Avro {@link GenericRecord}. + * + *
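+ * A conversion sketch (assuming a RowType 'rowType', its matching Avro Schema 'schema' and a RowData 'row';
+ * row types convert to {@link GenericRecord}, hence the cast):
+ *
+ *    RowDataToAvroConverters.RowDataToAvroConverter converter = RowDataToAvroConverters.createConverter(rowType);
+ *    GenericRecord record = (GenericRecord) converter.convert(schema, row);
+ *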
    NOTE: reference from Flink release 1.12.0, should remove when Flink version upgrade to that. + */ +@Internal +public class RowDataToAvroConverters { + + private static Conversions.DecimalConversion decimalConversion = new Conversions.DecimalConversion(); + + // -------------------------------------------------------------------------------- + // Runtime Converters + // -------------------------------------------------------------------------------- + + /** + * Runtime converter that converts objects of Flink Table & SQL internal data structures to + * corresponding Avro data structures. + */ + @FunctionalInterface + public interface RowDataToAvroConverter extends Serializable { + Object convert(Schema schema, Object object); + } + + // -------------------------------------------------------------------------------- + // IMPORTANT! We use anonymous classes instead of lambdas for a reason here. It is + // necessary because the maven shade plugin cannot relocate classes in + // SerializedLambdas (MSHADE-260). On the other hand we want to relocate Avro for + // sql-client uber jars. + // -------------------------------------------------------------------------------- + + /** + * Creates a runtime converter according to the given logical type that converts objects of + * Flink Table & SQL internal data structures to corresponding Avro data structures. + */ + public static RowDataToAvroConverter createConverter(LogicalType type) { + final RowDataToAvroConverter converter; + switch (type.getTypeRoot()) { + case NULL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return null; + } + }; + break; + case TINYINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Byte) object).intValue(); + } + }; + break; + case SMALLINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Short) object).intValue(); + } + }; + break; + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + case TIME_WITHOUT_TIME_ZONE: // int + case DATE: // int + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return object; + } + }; + break; + case CHAR: + case VARCHAR: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return new Utf8(object.toString()); + } + }; + break; + case BINARY: + case VARBINARY: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap((byte[]) object); + } + }; + break; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + case TIMESTAMP_WITHOUT_TIME_ZONE: + final int precision = DataTypeUtils.precision(type); + if (precision <= 3) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((TimestampData) 
object).toInstant().toEpochMilli(); + } + }; + } else if (precision <= 6) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ChronoUnit.MICROS.between(Instant.EPOCH, ((TimestampData) object).toInstant()); + } + }; + } else { + throw new UnsupportedOperationException("Unsupported timestamp precision: " + precision); + } + break; + case DECIMAL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + BigDecimal javaDecimal = ((DecimalData) object).toBigDecimal(); + return decimalConversion.toFixed(javaDecimal, schema, schema.getLogicalType()); + } + }; + break; + case ARRAY: + converter = createArrayConverter((ArrayType) type); + break; + case ROW: + converter = createRowConverter((RowType) type); + break; + case MAP: + case MULTISET: + converter = createMapConverter(type); + break; + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + + // wrap into nullable converter + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + if (object == null) { + return null; + } + + // get actual schema if it is a nullable schema + Schema actualSchema; + if (schema.getType() == Schema.Type.UNION) { + List types = schema.getTypes(); + int size = types.size(); + if (size == 2 && types.get(1).getType() == Schema.Type.NULL) { + actualSchema = types.get(0); + } else if (size == 2 && types.get(0).getType() == Schema.Type.NULL) { + actualSchema = types.get(1); + } else { + throw new IllegalArgumentException( + "The Avro schema is not a nullable type: " + schema); + } + } else { + actualSchema = schema; + } + return converter.convert(actualSchema, object); + } + }; + } + + private static RowDataToAvroConverter createRowConverter(RowType rowType) { + final RowDataToAvroConverter[] fieldConverters = + rowType.getChildren().stream() + .map(RowDataToAvroConverters::createConverter) + .toArray(RowDataToAvroConverter[]::new); + final LogicalType[] fieldTypes = + rowType.getFields().stream() + .map(RowType.RowField::getType) + .toArray(LogicalType[]::new); + final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fieldTypes.length]; + for (int i = 0; i < fieldTypes.length; i++) { + fieldGetters[i] = RowData.createFieldGetter(fieldTypes[i], i); + } + final int length = rowType.getFieldCount(); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final RowData row = (RowData) object; + final List fields = schema.getFields(); + final GenericRecord record = new GenericData.Record(schema); + for (int i = 0; i < length; ++i) { + final Schema.Field schemaField = fields.get(i); + Object avroObject = + fieldConverters[i].convert( + schemaField.schema(), fieldGetters[i].getFieldOrNull(row)); + record.put(i, avroObject); + } + return record; + } + }; + } + + private static RowDataToAvroConverter createArrayConverter(ArrayType arrayType) { + LogicalType elementType = arrayType.getElementType(); + final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); + final RowDataToAvroConverter elementConverter = createConverter(arrayType.getElementType()); + + return new RowDataToAvroConverter() { + 
private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema elementSchema = schema.getElementType(); + ArrayData arrayData = (ArrayData) object; + List list = new ArrayList<>(); + for (int i = 0; i < arrayData.size(); ++i) { + list.add( + elementConverter.convert( + elementSchema, elementGetter.getElementOrNull(arrayData, i))); + } + return list; + } + }; + } + + private static RowDataToAvroConverter createMapConverter(LogicalType type) { + LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type); + final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType); + final RowDataToAvroConverter valueConverter = createConverter(valueType); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema valueSchema = schema.getValueType(); + final MapData mapData = (MapData) object; + final ArrayData keyArray = mapData.keyArray(); + final ArrayData valueArray = mapData.valueArray(); + final Map map = new HashMap<>(mapData.size()); + for (int i = 0; i < mapData.size(); ++i) { + final String key = keyArray.getString(i).toString(); + final Object value = + valueConverter.convert( + valueSchema, valueGetter.getElementOrNull(valueArray, i)); + map.put(key, value); + } + return map; + } + }; + } +} + diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java new file mode 100644 index 0000000000000..98664c6dc3bd2 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java @@ -0,0 +1,404 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.util; + +import org.apache.hudi.common.config.DFSPropertiesConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodiePayloadConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.keygen.SimpleAvroKeyGenerator; +import org.apache.hudi.schema.FilebasedSchemaProvider; +import org.apache.hudi.sink.transform.ChainedTransformer; +import org.apache.hudi.sink.transform.Transformer; +import org.apache.hudi.streamer.FlinkStreamerConfig; + +import org.apache.avro.Schema; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.util.Preconditions; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.orc.OrcFile; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.StringReader; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.List; +import java.util.Properties; + +import static org.apache.hudi.common.model.HoodieFileFormat.HOODIE_LOG; +import static org.apache.hudi.common.model.HoodieFileFormat.ORC; +import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET; +import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.common.table.HoodieTableMetaClient.AUXILIARYFOLDER_NAME; + +/** + * Utilities for Flink stream read and write. 
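+ *
+ * A usage sketch (assuming a Flink Configuration 'conf' that carries FlinkOptions.PATH and an Avro schema option):
+ *
+ *    Schema sourceSchema = StreamerUtil.getSourceSchema(conf);
+ *    HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf);
+ *    String lastPending = StreamerUtil.getLastPendingInstant(metaClient);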
+ */ +public class StreamerUtil { + + private static final Logger LOG = LoggerFactory.getLogger(StreamerUtil.class); + + public static TypedProperties appendKafkaProps(FlinkStreamerConfig config) { + TypedProperties properties = getProps(config); + properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, config.kafkaBootstrapServers); + properties.put(ConsumerConfig.GROUP_ID_CONFIG, config.kafkaGroupId); + return properties; + } + + public static TypedProperties getProps(FlinkStreamerConfig cfg) { + if (cfg.propsFilePath.isEmpty()) { + return new TypedProperties(); + } + return readConfig( + HadoopConfigurations.getHadoopConf(cfg), + new Path(cfg.propsFilePath), cfg.configs).getProps(); + } + + public static Schema getSourceSchema(org.apache.flink.configuration.Configuration conf) { + if (conf.getOptional(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH).isPresent()) { + return new FilebasedSchemaProvider(conf).getSourceSchema(); + } else if (conf.getOptional(FlinkOptions.SOURCE_AVRO_SCHEMA).isPresent()) { + final String schemaStr = conf.get(FlinkOptions.SOURCE_AVRO_SCHEMA); + return new Schema.Parser().parse(schemaStr); + } else { + final String errorMsg = String.format("Either option '%s' or '%s' " + + "should be specified for avro schema deserialization", + FlinkOptions.SOURCE_AVRO_SCHEMA_PATH.key(), FlinkOptions.SOURCE_AVRO_SCHEMA.key()); + throw new HoodieException(errorMsg); + } + } + + /** + * Read config from properties file (`--props` option) and cmd line (`--hoodie-conf` option). + */ + public static DFSPropertiesConfiguration readConfig(org.apache.hadoop.conf.Configuration hadoopConfig, Path cfgPath, List overriddenProps) { + DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(hadoopConfig, cfgPath); + try { + if (!overriddenProps.isEmpty()) { + LOG.info("Adding overridden properties to file properties."); + conf.addPropsFromStream(new BufferedReader(new StringReader(String.join("\n", overriddenProps)))); + } + } catch (IOException ioe) { + throw new HoodieIOException("Unexpected error adding config overrides", ioe); + } + + return conf; + } + + /** + * Returns the payload config with given configuration. + */ + public static HoodiePayloadConfig getPayloadConfig(Configuration conf) { + return HoodiePayloadConfig.newBuilder() + .withPayloadClass(conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME)) + .withPayloadOrderingField(conf.getString(FlinkOptions.PRECOMBINE_FIELD)) + .withPayloadEventTimeField(conf.getString(FlinkOptions.PRECOMBINE_FIELD)) + .build(); + } + + /** + * Converts the give {@link Configuration} to {@link TypedProperties}. + * The default values are also set up. + * + * @param conf The flink configuration + * @return a TypedProperties instance + */ + public static TypedProperties flinkConf2TypedProperties(Configuration conf) { + Configuration flatConf = FlinkOptions.flatOptions(conf); + Properties properties = new Properties(); + // put all the set options + flatConf.addAllToProperties(properties); + // put all the default options + for (ConfigOption option : FlinkOptions.optionalOptions()) { + if (!flatConf.contains(option) && option.hasDefaultValue()) { + properties.put(option.key(), option.defaultValue()); + } + } + return new TypedProperties(properties); + } + + public static void checkRequiredProperties(TypedProperties props, List checkPropNames) { + checkPropNames.forEach(prop -> + Preconditions.checkState(props.containsKey(prop), "Required property " + prop + " is missing")); + } + + /** + * Initialize the table if it does not exist. 
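+ *
+ * For example, a minimal sketch (assumes 'conf' provides at least FlinkOptions.PATH and FlinkOptions.TABLE_NAME):
+ *
+ *    HoodieTableMetaClient metaClient = StreamerUtil.initTableIfNotExists(conf);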
+ * + * @param conf the configuration + * @throws IOException if errors happens when writing metadata + */ + public static HoodieTableMetaClient initTableIfNotExists(Configuration conf) throws IOException { + return initTableIfNotExists(conf, HadoopConfigurations.getHadoopConf(conf)); + } + + /** + * Initialize the table if it does not exist. + * + * @param conf the configuration + * @throws IOException if errors happens when writing metadata + */ + public static HoodieTableMetaClient initTableIfNotExists( + Configuration conf, + org.apache.hadoop.conf.Configuration hadoopConf) throws IOException { + final String basePath = conf.getString(FlinkOptions.PATH); + if (!tableExists(basePath, hadoopConf)) { + HoodieTableMetaClient metaClient = HoodieTableMetaClient.withPropertyBuilder() + .setTableCreateSchema(conf.getString(FlinkOptions.SOURCE_AVRO_SCHEMA)) + .setTableType(conf.getString(FlinkOptions.TABLE_TYPE)) + .setTableName(conf.getString(FlinkOptions.TABLE_NAME)) + .setRecordKeyFields(conf.getString(FlinkOptions.RECORD_KEY_FIELD, null)) + .setPayloadClassName(conf.getString(FlinkOptions.PAYLOAD_CLASS_NAME)) + .setPreCombineField(OptionsResolver.getPreCombineField(conf)) + .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) + .setPartitionFields(conf.getString(FlinkOptions.PARTITION_PATH_FIELD, null)) + .setKeyGeneratorClassProp( + conf.getOptional(FlinkOptions.KEYGEN_CLASS_NAME).orElse(SimpleAvroKeyGenerator.class.getName())) + .setHiveStylePartitioningEnable(conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING)) + .setUrlEncodePartitioning(conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING)) + .setTimelineLayoutVersion(1) + .initTable(hadoopConf, basePath); + LOG.info("Table initialized under base path {}", basePath); + return metaClient; + } else { + LOG.info("Table [{}/{}] already exists, no need to initialize the table", + basePath, conf.getString(FlinkOptions.TABLE_NAME)); + return StreamerUtil.createMetaClient(basePath, hadoopConf); + } + // Do not close the filesystem in order to use the CACHE, + // some filesystems release the handles in #close method. + } + + /** + * Returns whether the hoodie table exists under given path {@code basePath}. + */ + public static boolean tableExists(String basePath, org.apache.hadoop.conf.Configuration hadoopConf) { + // Hadoop FileSystem + FileSystem fs = FSUtils.getFs(basePath, hadoopConf); + try { + return fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)); + } catch (IOException e) { + throw new HoodieException("Error while checking whether table exists under path:" + basePath, e); + } + } + + /** + * Generates the bucket ID using format {partition path}_{fileID}. + */ + public static String generateBucketKey(String partitionPath, String fileId) { + return String.format("%s_%s", partitionPath, fileId); + } + + /** + * Creates the meta client for reader. + * + *
    The streaming pipeline process is long-running, so empty table path is allowed, + * the reader would then check and refresh the meta client. + * + * @see org.apache.hudi.source.StreamReadMonitoringFunction + */ + public static HoodieTableMetaClient metaClientForReader( + Configuration conf, + org.apache.hadoop.conf.Configuration hadoopConf) { + final String basePath = conf.getString(FlinkOptions.PATH); + if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING) && !tableExists(basePath, hadoopConf)) { + return null; + } else { + return createMetaClient(basePath, hadoopConf); + } + } + + /** + * Creates the meta client. + */ + public static HoodieTableMetaClient createMetaClient(String basePath, org.apache.hadoop.conf.Configuration hadoopConf) { + return HoodieTableMetaClient.builder().setBasePath(basePath).setConf(hadoopConf).build(); + } + + /** + * Creates the meta client. + */ + public static HoodieTableMetaClient createMetaClient(Configuration conf) { + return createMetaClient(conf.getString(FlinkOptions.PATH), HadoopConfigurations.getHadoopConf(conf)); + } + + /** + * Returns the median instant time between the given two instant time. + */ + public static Option medianInstantTime(String highVal, String lowVal) { + try { + long high = HoodieActiveTimeline.parseDateFromInstantTime(highVal).getTime(); + long low = HoodieActiveTimeline.parseDateFromInstantTime(lowVal).getTime(); + ValidationUtils.checkArgument(high > low, + "Instant [" + highVal + "] should have newer timestamp than instant [" + lowVal + "]"); + long median = low + (high - low) / 2; + final String instantTime = HoodieActiveTimeline.formatDate(new Date(median)); + if (HoodieTimeline.compareTimestamps(lowVal, HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime) + || HoodieTimeline.compareTimestamps(highVal, HoodieTimeline.LESSER_THAN_OR_EQUALS, instantTime)) { + return Option.empty(); + } + return Option.of(instantTime); + } catch (ParseException e) { + throw new HoodieException("Get median instant time with interval [" + lowVal + ", " + highVal + "] error", e); + } + } + + /** + * Returns the time interval in seconds between the given instant time. + */ + public static long instantTimeDiffSeconds(String newInstantTime, String oldInstantTime) { + try { + long newTimestamp = HoodieActiveTimeline.parseDateFromInstantTime(newInstantTime).getTime(); + long oldTimestamp = HoodieActiveTimeline.parseDateFromInstantTime(oldInstantTime).getTime(); + return (newTimestamp - oldTimestamp) / 1000; + } catch (ParseException e) { + throw new HoodieException("Get instant time diff with interval [" + oldInstantTime + ", " + newInstantTime + "] error", e); + } + } + + public static Option createTransformer(List classNames) throws IOException { + try { + List transformers = new ArrayList<>(); + for (String className : Option.ofNullable(classNames).orElse(Collections.emptyList())) { + transformers.add(ReflectionUtils.loadClass(className)); + } + return transformers.isEmpty() ? Option.empty() : Option.of(new ChainedTransformer(transformers)); + } catch (Throwable e) { + throw new IOException("Could not load transformer class(es) " + classNames, e); + } + } + + /** + * Returns whether the give file is in valid hoodie format. + * For example, filtering out the empty or corrupt files. 
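+ *
+ * A filtering sketch (assuming a FileStatus[] named 'statuses' already listed from the file system):
+ *
+ *    FileStatus[] valid = Arrays.stream(statuses)
+ *        .filter(StreamerUtil::isValidFile)
+ *        .toArray(FileStatus[]::new);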
+ */ + public static boolean isValidFile(FileStatus fileStatus) { + final String extension = FSUtils.getFileExtension(fileStatus.getPath().toString()); + if (PARQUET.getFileExtension().equals(extension)) { + return fileStatus.getLen() > ParquetFileWriter.MAGIC.length; + } + + if (ORC.getFileExtension().equals(extension)) { + return fileStatus.getLen() > OrcFile.MAGIC.length(); + } + + if (HOODIE_LOG.getFileExtension().equals(extension)) { + return fileStatus.getLen() > HoodieLogFormat.MAGIC.length; + } + + return fileStatus.getLen() > 0; + } + + public static String getLastPendingInstant(HoodieTableMetaClient metaClient) { + return getLastPendingInstant(metaClient, true); + } + + public static String getLastPendingInstant(HoodieTableMetaClient metaClient, boolean reloadTimeline) { + if (reloadTimeline) { + metaClient.reloadActiveTimeline(); + } + return metaClient.getCommitsTimeline().filterPendingExcludingCompaction() + .lastInstant() + .map(HoodieInstant::getTimestamp) + .orElse(null); + } + + public static String getLastCompletedInstant(HoodieTableMetaClient metaClient) { + return metaClient.getCommitsTimeline().filterCompletedInstants() + .lastInstant() + .map(HoodieInstant::getTimestamp) + .orElse(null); + } + + /** + * Returns whether there are successful commits on the timeline. + * + * @param metaClient The meta client + * @return true if there is any successful commit + */ + public static boolean haveSuccessfulCommits(HoodieTableMetaClient metaClient) { + return !metaClient.getCommitsTimeline().filterCompletedInstants().empty(); + } + + /** + * Returns the max compaction memory in bytes with given conf. + */ + public static long getMaxCompactionMemoryInBytes(Configuration conf) { + return (long) conf.getInteger(FlinkOptions.COMPACTION_MAX_MEMORY) * 1024 * 1024; + } + + public static Schema getTableAvroSchema(HoodieTableMetaClient metaClient, boolean includeMetadataFields) throws Exception { + TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); + return schemaUtil.getTableAvroSchema(includeMetadataFields); + } + + public static Schema getLatestTableSchema(String path, org.apache.hadoop.conf.Configuration hadoopConf) { + if (StringUtils.isNullOrEmpty(path) || !StreamerUtil.tableExists(path, hadoopConf)) { + return null; + } + + try { + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(path, hadoopConf); + return getTableAvroSchema(metaClient, false); + } catch (Exception e) { + LOG.warn("Error while resolving the latest table schema", e); + } + return null; + } + + public static boolean fileExists(FileSystem fs, Path path) { + try { + return fs.exists(path); + } catch (IOException e) { + throw new HoodieException("Exception while checking file " + path + " existence", e); + } + } + + /** + * Returns the auxiliary path. + */ + public static String getAuxiliaryPath(Configuration conf) { + return conf.getString(FlinkOptions.PATH) + Path.SEPARATOR + AUXILIARYFOLDER_NAME; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StringToRowDataConverter.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StringToRowDataConverter.java new file mode 100644 index 0000000000000..216fa3f0f336f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StringToRowDataConverter.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.common.util.ValidationUtils; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.TimestampType; + +import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; +import java.time.Instant; +import java.time.LocalDate; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; + +/** + * A converter that converts a string array into internal row data fields. + * The converter is designed to be stateful(not pure stateless tool) + * in order to reuse the specific converters. + */ +@Internal +public class StringToRowDataConverter { + private final Converter[] converters; + + public StringToRowDataConverter(LogicalType[] fieldTypes) { + this.converters = Arrays.stream(fieldTypes) + .map(StringToRowDataConverter::getConverter) + .toArray(Converter[]::new); + } + + public Object[] convert(String[] fields) { + ValidationUtils.checkArgument(converters.length == fields.length, + "Field types and values should equal with number"); + + Object[] converted = new Object[fields.length]; + for (int i = 0; i < fields.length; i++) { + converted[i] = converters[i].convert(fields[i]); + } + return converted; + } + + private interface Converter { + Object convert(String field); + } + + private static Converter getConverter(LogicalType logicalType) { + switch (logicalType.getTypeRoot()) { + case NULL: + return field -> null; + case TINYINT: + return Byte::parseByte; + case SMALLINT: + return Short::parseShort; + case BOOLEAN: + return Boolean::parseBoolean; + case INTEGER: + case TIME_WITHOUT_TIME_ZONE: + return Integer::parseInt; + case BIGINT: + return Long::parseLong; + case FLOAT: + return Float::parseFloat; + case DOUBLE: + return Double::parseDouble; + case DATE: + // see HoodieAvroUtils#convertValueForAvroLogicalTypes + return field -> (int) LocalDate.parse(field).toEpochDay(); + case TIMESTAMP_WITHOUT_TIME_ZONE: + final int precision = ((TimestampType) logicalType).getPrecision(); + if (precision <= 3) { + return field -> TimestampData.fromInstant(Instant.EPOCH.plus(Long.parseLong(field), ChronoUnit.MILLIS)); + } else if (precision <= 6) { + return field -> TimestampData.fromInstant(Instant.EPOCH.plus(Long.parseLong(field), ChronoUnit.MICROS)); + } else { + throw new UnsupportedOperationException("Unsupported type: " + logicalType); + } + case CHAR: + case VARCHAR: + return StringData::fromString; + case BINARY: + case VARBINARY: + return field -> field.getBytes(StandardCharsets.UTF_8); + case DECIMAL: + DecimalType decimalType = 
(DecimalType) logicalType; + return field -> + DecimalData.fromBigDecimal( + new BigDecimal(field), + decimalType.getPrecision(), + decimalType.getScale()); + default: + throw new UnsupportedOperationException("Unsupported type: " + logicalType); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ViewStorageProperties.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ViewStorageProperties.java new file mode 100644 index 0000000000000..a5e9f31145618 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/ViewStorageProperties.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.util; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Date; +import java.util.Properties; + +import static org.apache.hudi.common.table.HoodieTableMetaClient.AUXILIARYFOLDER_NAME; + +/** + * Helper class to read/write {@link FileSystemViewStorageConfig}. + */ +public class ViewStorageProperties { + private static final Logger LOG = LoggerFactory.getLogger(ViewStorageProperties.class); + + private static final String FILE_NAME = "view_storage_conf.properties"; + + /** + * Initialize the {@link #FILE_NAME} meta file. + */ + public static void createProperties( + String basePath, + FileSystemViewStorageConfig config, + Configuration flinkConf) throws IOException { + Path propertyPath = getPropertiesFilePath(basePath); + FileSystem fs = FSUtils.getFs(basePath, HadoopConfigurations.getHadoopConf(flinkConf)); + fs.delete(propertyPath, false); + try (FSDataOutputStream outputStream = fs.create(propertyPath)) { + config.getProps().store(outputStream, + "Filesystem view storage properties saved on " + new Date(System.currentTimeMillis())); + } + } + + /** + * Read the {@link FileSystemViewStorageConfig} with given table base path. 
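+   *
+   * <p>A minimal usage sketch (the base path and builder values are illustrative only):
+   * <pre>{@code
+   *   // persist the view storage config under the table's auxiliary folder
+   *   ViewStorageProperties.createProperties(basePath, FileSystemViewStorageConfig.newBuilder().build(), flinkConf);
+   *   // read it back later, e.g. from a write task on another node
+   *   FileSystemViewStorageConfig config = ViewStorageProperties.loadFromProperties(basePath, flinkConf);
+   * }</pre>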
+ */ + public static FileSystemViewStorageConfig loadFromProperties(String basePath, Configuration conf) { + Path propertyPath = getPropertiesFilePath(basePath); + LOG.info("Loading filesystem view storage properties from " + propertyPath); + FileSystem fs = FSUtils.getFs(basePath, HadoopConfigurations.getHadoopConf(conf)); + Properties props = new Properties(); + try { + try (FSDataInputStream inputStream = fs.open(propertyPath)) { + props.load(inputStream); + } + return FileSystemViewStorageConfig.newBuilder().fromProperties(props).build(); + } catch (IOException e) { + throw new HoodieIOException("Could not load filesystem view storage properties from " + propertyPath, e); + } + } + + private static Path getPropertiesFilePath(String basePath) { + String auxPath = basePath + Path.SEPARATOR + AUXILIARYFOLDER_NAME; + return new Path(auxPath, FILE_NAME); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/hudi-flink-datasource/hudi-flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory new file mode 100644 index 0000000000000..47435c745c461 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.hudi.table.HoodieTableFactory +org.apache.hudi.table.catalog.HoodieCatalogFactory diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/ITTestDataStreamWrite.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/ITTestDataStreamWrite.java new file mode 100644 index 0000000000000..6ab4b1b6e0d48 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/ITTestDataStreamWrite.java @@ -0,0 +1,430 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink; + +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsInference; +import org.apache.hudi.sink.transform.ChainedTransformer; +import org.apache.hudi.sink.transform.Transformer; +import org.apache.hudi.sink.utils.Pipelines; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.HoodiePipeline; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.FlinkMiniCluster; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestUtils; +import org.apache.hudi.utils.source.ContinuousFileSource; + +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.api.common.io.FilePathFilter; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.io.TextInputFormat; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.common.TimestampFormat; +import org.apache.flink.formats.json.JsonRowDataDeserializationSchema; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.FileProcessingMode; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.TestLogger; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.TimeUnit; + +/** + * Integration test for Flink Hoodie stream sink. 
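+ *
+ * <p>The tests read the bounded JSON source {@code test_source.data}, build the write pipeline with
+ * the low-level {@link Pipelines} API (optionally with compaction or clustering), and validate the
+ * written table content against the expected partition-to-records mappings defined below.
+ *
+ * <p>The write wiring under test boils down to the following sketch (generic types elided, see the
+ * test bodies for the exact calls):
+ * <pre>{@code
+ *   OptionsInference.setupSinkTasks(conf, execEnv.getParallelism());
+ *   DataStream hoodieRecords = Pipelines.bootstrap(conf, rowType, dataStream);
+ *   DataStream pipeline = Pipelines.hoodieStreamWrite(conf, hoodieRecords);
+ *   execEnv.addOperator(pipeline.getTransformation());
+ * }</pre>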
+ */ +@ExtendWith(FlinkMiniCluster.class) +public class ITTestDataStreamWrite extends TestLogger { + + private static final Map> EXPECTED = new HashMap<>(); + private static final Map> EXPECTED_TRANSFORMER = new HashMap<>(); + private static final Map> EXPECTED_CHAINED_TRANSFORMER = new HashMap<>(); + + static { + EXPECTED.put("par1", Arrays.asList("id1,par1,id1,Danny,23,1000,par1", "id2,par1,id2,Stephen,33,2000,par1")); + EXPECTED.put("par2", Arrays.asList("id3,par2,id3,Julian,53,3000,par2", "id4,par2,id4,Fabian,31,4000,par2")); + EXPECTED.put("par3", Arrays.asList("id5,par3,id5,Sophia,18,5000,par3", "id6,par3,id6,Emma,20,6000,par3")); + EXPECTED.put("par4", Arrays.asList("id7,par4,id7,Bob,44,7000,par4", "id8,par4,id8,Han,56,8000,par4")); + + EXPECTED_TRANSFORMER.put("par1", Arrays.asList("id1,par1,id1,Danny,24,1000,par1", "id2,par1,id2,Stephen,34,2000,par1")); + EXPECTED_TRANSFORMER.put("par2", Arrays.asList("id3,par2,id3,Julian,54,3000,par2", "id4,par2,id4,Fabian,32,4000,par2")); + EXPECTED_TRANSFORMER.put("par3", Arrays.asList("id5,par3,id5,Sophia,19,5000,par3", "id6,par3,id6,Emma,21,6000,par3")); + EXPECTED_TRANSFORMER.put("par4", Arrays.asList("id7,par4,id7,Bob,45,7000,par4", "id8,par4,id8,Han,57,8000,par4")); + + EXPECTED_CHAINED_TRANSFORMER.put("par1", Arrays.asList("id1,par1,id1,Danny,25,1000,par1", "id2,par1,id2,Stephen,35,2000,par1")); + EXPECTED_CHAINED_TRANSFORMER.put("par2", Arrays.asList("id3,par2,id3,Julian,55,3000,par2", "id4,par2,id4,Fabian,33,4000,par2")); + EXPECTED_CHAINED_TRANSFORMER.put("par3", Arrays.asList("id5,par3,id5,Sophia,20,5000,par3", "id6,par3,id6,Emma,22,6000,par3")); + EXPECTED_CHAINED_TRANSFORMER.put("par4", Arrays.asList("id7,par4,id7,Bob,46,7000,par4", "id8,par4,id8,Han,58,8000,par4")); + } + + @TempDir + File tempFile; + + @ParameterizedTest + @ValueSource(strings = {"BUCKET", "FLINK_STATE"}) + public void testWriteCopyOnWrite(String indexType) throws Exception { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.toURI().toString()); + conf.setString(FlinkOptions.INDEX_TYPE, indexType); + conf.setInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS, 1); + conf.setString(FlinkOptions.INDEX_KEY_FIELD, "id"); + conf.setBoolean(FlinkOptions.PRE_COMBINE, true); + + testWriteToHoodie(conf, "cow_write", 2, EXPECTED); + } + + @Test + public void testWriteCopyOnWriteWithTransformer() throws Exception { + Transformer transformer = (ds) -> ds.map((rowdata) -> { + if (rowdata instanceof GenericRowData) { + GenericRowData genericRD = (GenericRowData) rowdata; + //update age field to age + 1 + genericRD.setField(2, genericRD.getInt(2) + 1); + return genericRD; + } else { + throw new RuntimeException("Unrecognized row type information: " + rowdata.getClass().getSimpleName()); + } + }); + + testWriteToHoodie(transformer, "cow_write_with_transformer", EXPECTED_TRANSFORMER); + } + + @Test + public void testWriteCopyOnWriteWithChainedTransformer() throws Exception { + Transformer t1 = (ds) -> ds.map(rowData -> { + if (rowData instanceof GenericRowData) { + GenericRowData genericRD = (GenericRowData) rowData; + //update age field to age + 1 + genericRD.setField(2, genericRD.getInt(2) + 1); + return genericRD; + } else { + throw new RuntimeException("Unrecognized row type : " + rowData.getClass().getSimpleName()); + } + }); + + ChainedTransformer chainedTransformer = new ChainedTransformer(Arrays.asList(t1, t1)); + + testWriteToHoodie(chainedTransformer, "cow_write_with_chained_transformer", EXPECTED_CHAINED_TRANSFORMER); + } + + @ParameterizedTest + 
@ValueSource(strings = {"BUCKET", "FLINK_STATE"}) + public void testWriteMergeOnReadWithCompaction(String indexType) throws Exception { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.toURI().toString()); + conf.setString(FlinkOptions.INDEX_TYPE, indexType); + conf.setInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS, 4); + conf.setString(FlinkOptions.INDEX_KEY_FIELD, "id"); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); + conf.setString(FlinkOptions.TABLE_TYPE, HoodieTableType.MERGE_ON_READ.name()); + + testWriteToHoodie(conf, "mor_write_with_compact", 1, EXPECTED); + } + + @Test + public void testWriteCopyOnWriteWithClustering() throws Exception { + testWriteCopyOnWriteWithClustering(false); + } + + @Test + public void testWriteCopyOnWriteWithSortClustering() throws Exception { + testWriteCopyOnWriteWithClustering(true); + } + + private void testWriteCopyOnWriteWithClustering(boolean sortClusteringEnabled) throws Exception { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.toURI().toString()); + conf.setBoolean(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED, true); + conf.setInteger(FlinkOptions.CLUSTERING_DELTA_COMMITS, 1); + conf.setString(FlinkOptions.OPERATION, "insert"); + if (sortClusteringEnabled) { + conf.setString(FlinkOptions.CLUSTERING_SORT_COLUMNS, "uuid"); + } + + testWriteToHoodieWithCluster(conf, "cow_write_with_cluster", 1, EXPECTED); + } + + private void testWriteToHoodie( + Transformer transformer, + String jobName, + Map> expected) throws Exception { + testWriteToHoodie(TestConfigurations.getDefaultConf(tempFile.toURI().toString()), + Option.of(transformer), jobName, 2, expected); + } + + private void testWriteToHoodie( + Configuration conf, + String jobName, + int checkpoints, + Map> expected) throws Exception { + testWriteToHoodie(conf, Option.empty(), jobName, checkpoints, expected); + } + + private void testWriteToHoodie( + Configuration conf, + Option transformer, + String jobName, + int checkpoints, + Map> expected) throws Exception { + + StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + execEnv.getConfig().disableObjectReuse(); + execEnv.setParallelism(4); + // set up checkpoint interval + execEnv.enableCheckpointing(4000, CheckpointingMode.EXACTLY_ONCE); + execEnv.getCheckpointConfig().setMaxConcurrentCheckpoints(1); + + // Read from file source + RowType rowType = + (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf)) + .getLogicalType(); + + JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema( + rowType, + InternalTypeInfo.of(rowType), + false, + true, + TimestampFormat.ISO_8601 + ); + String sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_source.data")).toString(); + + boolean isMor = conf.getString(FlinkOptions.TABLE_TYPE).equals(HoodieTableType.MERGE_ON_READ.name()); + + DataStream dataStream; + if (isMor) { + TextInputFormat format = new TextInputFormat(new Path(sourcePath)); + format.setFilesFilter(FilePathFilter.createDefaultFilter()); + TypeInformation typeInfo = BasicTypeInfo.STRING_TYPE_INFO; + format.setCharsetName("UTF-8"); + + dataStream = execEnv + // use PROCESS_CONTINUOUSLY mode to trigger checkpoint + .readFile(format, sourcePath, FileProcessingMode.PROCESS_CONTINUOUSLY, 1000, typeInfo) + .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8))) + .setParallelism(1); + } else { + dataStream = execEnv + 
// use continuous file source to trigger checkpoint + .addSource(new ContinuousFileSource.BoundedSourceFunction(new Path(sourcePath), checkpoints)) + .name("continuous_file_source") + .setParallelism(1) + .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8))) + .setParallelism(4); + } + + if (transformer.isPresent()) { + dataStream = transformer.get().apply(dataStream); + } + + OptionsInference.setupSinkTasks(conf, execEnv.getParallelism()); + DataStream hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, dataStream); + DataStream pipeline = Pipelines.hoodieStreamWrite(conf, hoodieRecordDataStream); + execEnv.addOperator(pipeline.getTransformation()); + + if (isMor) { + Pipelines.compact(conf, pipeline); + } + + execute(execEnv, isMor, jobName); + TestData.checkWrittenDataCOW(tempFile, expected); + } + + private void testWriteToHoodieWithCluster( + Configuration conf, + String jobName, + int checkpoints, + Map> expected) throws Exception { + + StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + execEnv.getConfig().disableObjectReuse(); + execEnv.setParallelism(4); + // set up checkpoint interval + execEnv.enableCheckpointing(4000, CheckpointingMode.EXACTLY_ONCE); + execEnv.getCheckpointConfig().setMaxConcurrentCheckpoints(1); + + // Read from file source + RowType rowType = + (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf)) + .getLogicalType(); + + JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema( + rowType, + InternalTypeInfo.of(rowType), + false, + true, + TimestampFormat.ISO_8601 + ); + String sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_source.data")).toString(); + + DataStream dataStream = execEnv + // use continuous file source to trigger checkpoint + .addSource(new ContinuousFileSource.BoundedSourceFunction(new Path(sourcePath), checkpoints)) + .name("continuous_file_source") + .setParallelism(1) + .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8))) + .setParallelism(4); + + OptionsInference.setupSinkTasks(conf, execEnv.getParallelism()); + DataStream pipeline = Pipelines.append(conf, rowType, dataStream, true); + execEnv.addOperator(pipeline.getTransformation()); + + Pipelines.cluster(conf, rowType, pipeline); + execute(execEnv, false, jobName); + + TestData.checkWrittenDataCOW(tempFile, expected); + } + + public void execute(StreamExecutionEnvironment execEnv, boolean isMor, String jobName) throws Exception { + if (isMor) { + JobClient client = execEnv.executeAsync(jobName); + if (client.getJobStatus().get() != JobStatus.FAILED) { + try { + TimeUnit.SECONDS.sleep(20); // wait long enough for the compaction to finish + client.cancel(); + } catch (Throwable var1) { + // ignored + } + } + } else { + // wait for the streaming job to finish + execEnv.execute(jobName); + } + } + + @Test + public void testHoodiePipelineBuilderSource() throws Exception { + //create a StreamExecutionEnvironment instance. 
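+    // The test first writes three batches with the regular write path, then uses the
+    // HoodiePipeline builder to stream-read starting from the latest completed commit,
+    // so only the last batch (records 5 and 6) is expected in the collected result.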
+ StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + execEnv.getConfig().disableObjectReuse(); + execEnv.setParallelism(1); + // set up checkpoint interval + execEnv.enableCheckpointing(4000, CheckpointingMode.EXACTLY_ONCE); + execEnv.getCheckpointConfig().setMaxConcurrentCheckpoints(1); + Configuration conf = TestConfigurations.getDefaultConf(tempFile.toURI().toString()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, "MERGE_ON_READ"); + + // write 3 batches of data set + TestData.writeData(TestData.dataSetInsert(1, 2), conf); + TestData.writeData(TestData.dataSetInsert(3, 4), conf); + TestData.writeData(TestData.dataSetInsert(5, 6), conf); + + String latestCommit = TestUtils.getLastCompleteInstant(tempFile.toURI().toString()); + + Map options = new HashMap<>(); + options.put(FlinkOptions.PATH.key(), tempFile.toURI().toString()); + options.put(FlinkOptions.READ_START_COMMIT.key(), latestCommit); + + //read a hoodie table use low-level source api. + HoodiePipeline.Builder builder = HoodiePipeline.builder("test_source") + .column("uuid string not null") + .column("name string") + .column("age int") + .column("`ts` timestamp(3)") + .column("`partition` string") + .pk("uuid") + .partition("partition") + .options(options); + DataStream rowDataDataStream = builder.source(execEnv); + List result = new ArrayList<>(); + rowDataDataStream.executeAndCollect().forEachRemaining(result::add); + TimeUnit.SECONDS.sleep(2);//sleep 2 second for collect data + TestData.assertRowDataEquals(result, TestData.dataSetInsert(5, 6)); + } + + @Test + public void testHoodiePipelineBuilderSink() throws Exception { + StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + Map options = new HashMap<>(); + execEnv.getConfig().disableObjectReuse(); + execEnv.setParallelism(4); + // set up checkpoint interval + execEnv.enableCheckpointing(4000, CheckpointingMode.EXACTLY_ONCE); + execEnv.getCheckpointConfig().setMaxConcurrentCheckpoints(1); + + options.put(FlinkOptions.PATH.key(), tempFile.toURI().toString()); + options.put(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH.key(), Objects.requireNonNull(Thread.currentThread().getContextClassLoader().getResource("test_read_schema.avsc")).toString()); + Configuration conf = Configuration.fromMap(options); + // Read from file source + RowType rowType = + (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf)) + .getLogicalType(); + + JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema( + rowType, + InternalTypeInfo.of(rowType), + false, + true, + TimestampFormat.ISO_8601 + ); + String sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_source.data")).toString(); + + TextInputFormat format = new TextInputFormat(new Path(sourcePath)); + format.setFilesFilter(FilePathFilter.createDefaultFilter()); + format.setCharsetName("UTF-8"); + + DataStream dataStream = execEnv + // use continuous file source to trigger checkpoint + .addSource(new ContinuousFileSource.BoundedSourceFunction(new Path(sourcePath), 2)) + .name("continuous_file_source") + .setParallelism(1) + .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8))) + .setParallelism(4); + + //sink to hoodie table use low-level sink api. 
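+    // The builder mirrors a SQL DDL: column, primary key and partition definitions plus the Flink
+    // options map; HoodiePipeline.Builder#sink then attaches the Hudi write pipeline to the
+    // prepared RowData stream.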
+ HoodiePipeline.Builder builder = HoodiePipeline.builder("test_sink") + .column("uuid string not null") + .column("name string") + .column("age int") + .column("`ts` timestamp(3)") + .column("`partition` string") + .pk("uuid") + .partition("partition") + .options(options); + + builder.sink(dataStream, false); + + execute(execEnv, false, "Api_Sink_Test"); + TestData.checkWrittenDataCOW(tempFile, EXPECTED); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestStreamWriteOperatorCoordinator.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestStreamWriteOperatorCoordinator.java new file mode 100644 index 0000000000000..07f560c14f1be --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestStreamWriteOperatorCoordinator.java @@ -0,0 +1,427 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.sink.utils.MockCoordinatorExecutor; +import org.apache.hudi.sink.utils.NonThrownExecutor; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestUtils; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.util.FileUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mockito; +import org.slf4j.Logger; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; + +import static org.hamcrest.CoreMatchers.instanceOf; +import static 
org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for StreamingSinkOperatorCoordinator. + */ +public class TestStreamWriteOperatorCoordinator { + private StreamWriteOperatorCoordinator coordinator; + + @TempDir + File tempFile; + + @BeforeEach + public void before() throws Exception { + OperatorCoordinator.Context context = new MockOperatorCoordinatorContext(new OperatorID(), 2); + coordinator = new StreamWriteOperatorCoordinator( + TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()), context); + coordinator.start(); + coordinator.setExecutor(new MockCoordinatorExecutor(context)); + + coordinator.handleEventFromOperator(0, WriteMetadataEvent.emptyBootstrap(0)); + coordinator.handleEventFromOperator(1, WriteMetadataEvent.emptyBootstrap(1)); + } + + @AfterEach + public void after() throws Exception { + coordinator.close(); + } + + @Test + void testInstantState() { + String instant = coordinator.getInstant(); + assertNotEquals("", instant); + + OperatorEvent event0 = createOperatorEvent(0, instant, "par1", true, 0.1); + OperatorEvent event1 = createOperatorEvent(1, instant, "par2", false, 0.2); + coordinator.handleEventFromOperator(0, event0); + coordinator.handleEventFromOperator(1, event1); + + coordinator.notifyCheckpointComplete(1); + String inflight = TestUtils.getLastPendingInstant(tempFile.getAbsolutePath()); + String lastCompleted = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + assertThat("Instant should be complete", lastCompleted, is(instant)); + assertNotEquals("", inflight, "Should start a new instant"); + assertNotEquals(instant, inflight, "Should start a new instant"); + } + + @Test + public void testTableInitialized() throws IOException { + final org.apache.hadoop.conf.Configuration hadoopConf = HadoopConfigurations.getHadoopConf(new Configuration()); + String basePath = tempFile.getAbsolutePath(); + try (FileSystem fs = FSUtils.getFs(basePath, hadoopConf)) { + assertTrue(fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME))); + } + } + + @Test + public void testCheckpointAndRestore() throws Exception { + CompletableFuture future = new CompletableFuture<>(); + coordinator.checkpointCoordinator(1, future); + coordinator.resetToCheckpoint(1, future.get()); + } + + @Test + public void testReceiveInvalidEvent() { + CompletableFuture future = new CompletableFuture<>(); + coordinator.checkpointCoordinator(1, future); + OperatorEvent event = WriteMetadataEvent.builder() + .taskID(0) + .instantTime("abc") + .writeStatus(Collections.emptyList()) + .build(); + + assertError(() -> coordinator.handleEventFromOperator(0, event), + "Receive an unexpected event for instant abc from task 0"); + } + + @Test + public void testCheckpointCompleteWithPartialEvents() { + final CompletableFuture future = new CompletableFuture<>(); + coordinator.checkpointCoordinator(1, future); + String instant = coordinator.getInstant(); + OperatorEvent event = WriteMetadataEvent.builder() + .taskID(0) + .instantTime(instant) + .writeStatus(Collections.emptyList()) + .build(); + coordinator.handleEventFromOperator(0, event); + + assertDoesNotThrow(() -> coordinator.notifyCheckpointComplete(1), + "Returns early for empty write results"); + String lastCompleted = 
TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + assertNull(lastCompleted, "Returns early for empty write results"); + assertNull(coordinator.getEventBuffer()[0]); + + OperatorEvent event1 = createOperatorEvent(1, instant, "par2", false, 0.2); + coordinator.handleEventFromOperator(1, event1); + assertDoesNotThrow(() -> coordinator.notifyCheckpointComplete(2), + "Commits the instant with partial events anyway"); + lastCompleted = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + assertThat("Commits the instant with partial events anyway", lastCompleted, is(instant)); + } + + @Test + public void testHiveSyncInvoked() throws Exception { + // reset + reset(); + // override the default configuration + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setBoolean(FlinkOptions.HIVE_SYNC_ENABLED, true); + OperatorCoordinator.Context context = new MockOperatorCoordinatorContext(new OperatorID(), 1); + coordinator = new StreamWriteOperatorCoordinator(conf, context); + coordinator.start(); + coordinator.setExecutor(new MockCoordinatorExecutor(context)); + + final WriteMetadataEvent event0 = WriteMetadataEvent.emptyBootstrap(0); + + coordinator.handleEventFromOperator(0, event0); + + String instant = mockWriteWithMetadata(); + assertNotEquals("", instant); + + // never throw for hive synchronization now + assertDoesNotThrow(() -> coordinator.notifyCheckpointComplete(1)); + } + + @Test + void testSyncMetadataTable() throws Exception { + // reset + reset(); + // override the default configuration + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setBoolean(FlinkOptions.METADATA_ENABLED, true); + conf.setInteger(FlinkOptions.METADATA_COMPACTION_DELTA_COMMITS, 5); + OperatorCoordinator.Context context = new MockOperatorCoordinatorContext(new OperatorID(), 1); + coordinator = new StreamWriteOperatorCoordinator(conf, context); + coordinator.start(); + coordinator.setExecutor(new MockCoordinatorExecutor(context)); + + final WriteMetadataEvent event0 = WriteMetadataEvent.emptyBootstrap(0); + + coordinator.handleEventFromOperator(0, event0); + + String instant = coordinator.getInstant(); + assertNotEquals("", instant); + + final String metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(tempFile.getAbsolutePath()); + HoodieTableMetaClient metadataTableMetaClient = StreamerUtil.createMetaClient(metadataTableBasePath, HadoopConfigurations.getHadoopConf(conf)); + HoodieTimeline completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(1L)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP)); + + // test metadata table compaction + // write another 4 commits + for (int i = 1; i < 5; i++) { + instant = mockWriteWithMetadata(); + metadataTableMetaClient.reloadActiveTimeline(); + completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(i + 1L)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(instant)); + } + // the 5th commit triggers the compaction + mockWriteWithMetadata(); + metadataTableMetaClient.reloadActiveTimeline(); + completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); + 
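+    // the bootstrap instant (SOLO_COMMIT_TIMESTAMP) + 5 delta commits + 1 compaction commit
+    // give 7 instants; the compaction instant time is a delta commit timestamp suffixed with "001"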
assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(7L)); + assertThat(completedTimeline.nthFromLastInstant(1).get().getTimestamp(), is(instant + "001")); + assertThat(completedTimeline.nthFromLastInstant(1).get().getAction(), is(HoodieTimeline.COMMIT_ACTION)); + // write another 2 commits + for (int i = 7; i < 8; i++) { + instant = mockWriteWithMetadata(); + metadataTableMetaClient.reloadActiveTimeline(); + completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(i + 1L)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(instant)); + } + + // write another commit to trigger clean + instant = mockWriteWithMetadata(); + metadataTableMetaClient.reloadActiveTimeline(); + completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(10L)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(instant + "002")); + assertThat(completedTimeline.lastInstant().get().getAction(), is(HoodieTimeline.CLEAN_ACTION)); + + // write another commit + mockWriteWithMetadata(); + // write another commit + instant = mockWriteWithMetadata(); + // write another commit to trigger compaction + mockWriteWithMetadata(); + metadataTableMetaClient.reloadActiveTimeline(); + completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(14L)); + assertThat(completedTimeline.nthFromLastInstant(1).get().getTimestamp(), is(instant + "001")); + assertThat(completedTimeline.nthFromLastInstant(1).get().getAction(), is(HoodieTimeline.COMMIT_ACTION)); + } + + @Test + void testSyncMetadataTableWithReusedInstant() throws Exception { + // reset + reset(); + // override the default configuration + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setBoolean(FlinkOptions.METADATA_ENABLED, true); + OperatorCoordinator.Context context = new MockOperatorCoordinatorContext(new OperatorID(), 1); + coordinator = new StreamWriteOperatorCoordinator(conf, context); + coordinator.start(); + coordinator.setExecutor(new MockCoordinatorExecutor(context)); + + final WriteMetadataEvent event0 = WriteMetadataEvent.emptyBootstrap(0); + + coordinator.handleEventFromOperator(0, event0); + + String instant = coordinator.getInstant(); + assertNotEquals("", instant); + + final String metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(tempFile.getAbsolutePath()); + HoodieTableMetaClient metadataTableMetaClient = StreamerUtil.createMetaClient(metadataTableBasePath, HadoopConfigurations.getHadoopConf(conf)); + HoodieTimeline completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(1L)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP)); + + // writes a normal commit + mockWriteWithMetadata(); + instant = coordinator.getInstant(); + // creates an inflight commit on the metadata timeline + metadataTableMetaClient.getActiveTimeline() + .createNewInstant(new HoodieInstant(HoodieInstant.State.REQUESTED, 
HoodieActiveTimeline.DELTA_COMMIT_ACTION, instant)); + metadataTableMetaClient.getActiveTimeline().transitionRequestedToInflight(HoodieActiveTimeline.DELTA_COMMIT_ACTION, instant); + metadataTableMetaClient.reloadActiveTimeline(); + + // write another commit with existing instant on the metadata timeline + instant = mockWriteWithMetadata(); + metadataTableMetaClient.reloadActiveTimeline(); + + completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.getInstants().count(), is(3L)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(instant)); + } + + @Test + public void testEndInputIsTheLastEvent() throws Exception { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + MockOperatorCoordinatorContext context = new MockOperatorCoordinatorContext(new OperatorID(), 1); + Logger logger = Mockito.mock(Logger.class); // avoid too many logs by executor + NonThrownExecutor executor = NonThrownExecutor.builder(logger).waitForTasksFinish(true).build(); + + try (StreamWriteOperatorCoordinator coordinator = new StreamWriteOperatorCoordinator(conf, context)) { + coordinator.start(); + coordinator.setExecutor(executor); + coordinator.handleEventFromOperator(0, WriteMetadataEvent.emptyBootstrap(0)); + TimeUnit.SECONDS.sleep(5); // wait for handled bootstrap event + + int eventCount = 20_000; // big enough to fill executor's queue + for (int i = 0; i < eventCount; i++) { + coordinator.handleEventFromOperator(0, createOperatorEvent(0, coordinator.getInstant(), "par1", true, 0.1)); + } + + WriteMetadataEvent endInput = WriteMetadataEvent.builder() + .taskID(0) + .instantTime(coordinator.getInstant()) + .writeStatus(Collections.emptyList()) + .endInput(true) + .build(); + coordinator.handleEventFromOperator(0, endInput); + + // wait for submitted events completed + executor.close(); + + // there should be no events after endInput + assertNull(coordinator.getEventBuffer()[0]); + } + } + + @Test + void testLockForMetadataTable() throws Exception { + // reset + reset(); + // override the default configuration + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setBoolean(FlinkOptions.METADATA_ENABLED, true); + + conf.setString(HoodieWriteConfig.WRITE_CONCURRENCY_MODE.key(), "optimistic_concurrency_control"); + conf.setInteger("hoodie.write.lock.client.num_retries", 1); + + OperatorCoordinator.Context context = new MockOperatorCoordinatorContext(new OperatorID(), 1); + coordinator = new StreamWriteOperatorCoordinator(conf, context); + coordinator.start(); + coordinator.setExecutor(new MockCoordinatorExecutor(context)); + + final WriteMetadataEvent event0 = WriteMetadataEvent.emptyBootstrap(0); + + coordinator.handleEventFromOperator(0, event0); + + String instant = coordinator.getInstant(); + assertNotEquals("", instant); + + final String metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(tempFile.getAbsolutePath()); + HoodieTableMetaClient metadataTableMetaClient = StreamerUtil.createMetaClient(metadataTableBasePath, HadoopConfigurations.getHadoopConf(conf)); + HoodieTimeline completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.countInstants(), is(1)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP)); + + instant = 
mockWriteWithMetadata(); + metadataTableMetaClient.reloadActiveTimeline(); + completedTimeline = metadataTableMetaClient.getActiveTimeline().filterCompletedInstants(); + assertThat("One instant need to sync to metadata table", completedTimeline.countInstants(), is(2)); + assertThat(completedTimeline.lastInstant().get().getTimestamp(), is(instant)); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private String mockWriteWithMetadata() { + final String instant = coordinator.getInstant(); + OperatorEvent event = createOperatorEvent(0, instant, "par1", true, 0.1); + + coordinator.handleEventFromOperator(0, event); + coordinator.notifyCheckpointComplete(0); + return instant; + } + + private static WriteMetadataEvent createOperatorEvent( + int taskId, + String instant, + String partitionPath, + boolean trackSuccessRecords, + double failureFraction) { + final WriteStatus writeStatus = new WriteStatus(trackSuccessRecords, failureFraction); + writeStatus.setPartitionPath(partitionPath); + + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setPartitionPath(partitionPath); + writeStat.setFileId("fileId123"); + writeStat.setPath("path123"); + writeStat.setFileSizeInBytes(123); + writeStat.setTotalWriteBytes(123); + writeStat.setNumWrites(1); + + writeStatus.setStat(writeStat); + + return WriteMetadataEvent.builder() + .taskID(taskId) + .instantTime(instant) + .writeStatus(Collections.singletonList(writeStatus)) + .lastBatch(true) + .build(); + } + + private void reset() throws Exception { + FileUtils.cleanDirectory(tempFile); + } + + private void assertError(Runnable runnable, String message) { + runnable.run(); + // wait a little while for the task to finish + assertThat(coordinator.getContext(), instanceOf(MockOperatorCoordinatorContext.class)); + MockOperatorCoordinatorContext context = (MockOperatorCoordinatorContext) coordinator.getContext(); + assertTrue(context.isJobFailed(), message); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java new file mode 100644 index 0000000000000..21dd6fd1d18ab --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java @@ -0,0 +1,463 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.sink.utils.TestWriteBase; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; + +import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +/** + * Test cases for stream write. + */ +public class TestWriteCopyOnWrite extends TestWriteBase { + + protected Configuration conf; + + @TempDir + File tempFile; + + @BeforeEach + public void before() { + conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_TYPE, getTableType().name()); + setUp(conf); + } + + /** + * Override to have custom configuration. + */ + protected void setUp(Configuration conf) { + // for sub-class extension + } + + @Test + public void testCheckpoint() throws Exception { + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + // no checkpoint, so the coordinator does not accept any events + .emptyEventBuffer() + .checkpoint(1) + .assertNextEvent(4, "par1,par2,par3,par4") + .checkpointComplete(1) + // checkpoint for next round, no data input, so after the checkpoint, + // there should not be REQUESTED Instant + // this triggers the data write and event send + .checkpoint(2) + .assertEmptyEvent() + .emptyCheckpoint(2) + .end(); + } + + @Test + public void testCheckpointFails() throws Exception { + // reset the config option + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, 1L); + preparePipeline(conf) + // no data written and triggers checkpoint fails, + // then we should revert the start instant + .checkpoint(1) + .assertEmptyEvent() + .checkpointFails(1) + .consume(TestData.DATA_SET_INSERT) + //.checkpointThrows(2, + // "Timeout(1000ms) while waiting for instant initialize") + // do not send the write event and fails the checkpoint, + // behaves like the last checkpoint is successful. 
+ .checkpointFails(2) + .end(); + } + + @Test + public void testSubtaskFails() throws Exception { + // open the function and ingest data + preparePipeline() + .checkpoint(1) + .assertEmptyEvent() + .subTaskFails(0) + .noCompleteInstant() + .end(); + } + + @Test + public void testInsert() throws Exception { + // open the function and ingest data + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED1) + .end(); + } + + @Test + public void testInsertDuplicates() throws Exception { + // reset the config option + conf.setBoolean(FlinkOptions.PRE_COMBINE, true); + preparePipeline(conf) + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED3, 1) + // insert duplicates again + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(EXPECTED3, 1) + .end(); + } + + @Test + public void testUpsert() throws Exception { + // open the function and ingest data + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + // upsert another data buffer + .consume(TestData.DATA_SET_UPDATE_INSERT) + // the data is not flushed yet + .checkWrittenData(EXPECTED1) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(EXPECTED2) + .end(); + } + + @Test + public void testUpsertWithDelete() throws Exception { + // open the function and ingest data + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .consume(TestData.DATA_SET_UPDATE_DELETE) + .checkWrittenData(EXPECTED1) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(getUpsertWithDeleteExpected()) + .end(); + } + + @Test + public void testInsertWithMiniBatches() throws Exception { + // reset the config option + conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.0008); // 839 bytes batch size + + Map expected = getMiniBatchExpected(); + + preparePipeline(conf) + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. + // so 3 records expect to trigger a mini-batch write + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(expected, 1) + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + .checkWrittenData(expected, 1) + .end(); + } + + @Test + public void testInsertWithDeduplication() throws Exception { + // reset the config option + conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.0008); // 839 bytes batch size + conf.setBoolean(FlinkOptions.PRE_COMBINE, true); + + Map expected = new HashMap<>(); + expected.put("par1", "[id1,par1,id1,Danny,23,4,par1]"); + + preparePipeline(conf) + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. 
+ // so 3 records expect to trigger a mini-batch write + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(expected, 1) + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + .checkWrittenData(expected, 1) + .end(); + } + + @Test + public void testInsertAppendMode() throws Exception { + prepareInsertPipeline() + // Each record is 208 bytes. so 4 records expect to trigger a mini-batch write + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenAllData(EXPECTED4, 1) + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenDataCOW(EXPECTED5) + .end(); + } + + /** + * The test is almost same with {@link #testInsertWithSmallBufferSize} except that + * it is with insert clustering mode. + */ + @Test + public void testInsertClustering() throws Exception { + // reset the config option + conf.setString(FlinkOptions.OPERATION, "insert"); + conf.setBoolean(FlinkOptions.INSERT_CLUSTER, true); + conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0008); // 839 bytes buffer size + + TestWriteMergeOnRead.TestHarness.instance() + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. + // so 3 records expect to trigger a mini-batch write + // flush the max size bucket once at a time. + .preparePipeline(tempFile, conf) + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(EXPECTED4, 1) + // insert duplicates again + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + .checkWrittenDataCOW(EXPECTED5) + .end(); + } + + @Test + public void testInsertAsyncClustering() throws Exception { + // reset the config option + conf.setString(FlinkOptions.OPERATION, "insert"); + conf.setBoolean(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED, true); + conf.setBoolean(FlinkOptions.CLUSTERING_ASYNC_ENABLED, true); + conf.setInteger(FlinkOptions.CLUSTERING_DELTA_COMMITS, 1); + + prepareInsertPipeline(conf) + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(1) + .handleEvents(1) + .checkpointComplete(1) + .checkWrittenData(EXPECTED4, 1) + // insert duplicates again + .consume(TestData.DATA_SET_INSERT_SAME_KEY) + .checkpoint(2) + .handleEvents(1) + .checkpointComplete(2) + .checkWrittenDataCOW(EXPECTED5) + .end(); + } + + @Test + public void testInsertWithSmallBufferSize() throws Exception { + // reset the config option + conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0008); // 839 bytes buffer size + + Map expected = getMiniBatchExpected(); + + preparePipeline(conf) + // record (operation: 'I') is 304 bytes and record (operation: 'U') is 352 bytes. + // so 3 records expect to trigger a mini-batch write + // flush the max size bucket once at a time. + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .assertDataBuffer(1, 2) + .checkpoint(1) + .allDataFlushed() + .handleEvents(2) + .checkpointComplete(1) + .checkWrittenData(expected, 1) + // insert duplicates again + .consume(TestData.DATA_SET_INSERT_DUPLICATES) + .checkpoint(2) + .handleEvents(2) + .checkpointComplete(2) + // Same the original base file content. 
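+          // (the duplicate records are merged into the existing records, so the expected data is unchanged)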
+ .checkWrittenData(expected, 1) + .end(); + } + + protected Map getMiniBatchExpected() { + Map expected = new HashMap<>(); + // the last 2 lines are merged + expected.put("par1", "[" + + "id1,par1,id1,Danny,23,1,par1, " + + "id1,par1,id1,Danny,23,1,par1, " + + "id1,par1,id1,Danny,23,1,par1" + "]"); + return expected; + } + + protected Map getUpsertWithDeleteExpected() { + Map expected = new HashMap<>(); + // id3, id5 were deleted and id9 is ignored + expected.put("par1", "[id1,par1,id1,Danny,24,1,par1, id2,par1,id2,Stephen,34,2,par1]"); + expected.put("par2", "[id4,par2,id4,Fabian,31,4,par2]"); + expected.put("par3", "[id6,par3,id6,Emma,20,6,par3]"); + expected.put("par4", "[id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); + return expected; + } + + protected Map getExpectedBeforeCheckpointComplete() { + return EXPECTED2; + } + + @Test + public void testIndexStateBootstrap() throws Exception { + // open the function and ingest data + preparePipeline() + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED1, 4) + .end(); + + // reset the config option + conf.setBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED, true); + validateIndexLoaded(); + } + + protected void validateIndexLoaded() throws Exception { + preparePipeline(conf) + .consume(TestData.DATA_SET_UPDATE_INSERT) + .checkIndexLoaded( + new HoodieKey("id1", "par1"), + new HoodieKey("id2", "par1"), + new HoodieKey("id3", "par2"), + new HoodieKey("id4", "par2"), + new HoodieKey("id5", "par3"), + new HoodieKey("id6", "par3"), + new HoodieKey("id7", "par4"), + new HoodieKey("id8", "par4"), + new HoodieKey("id9", "par3"), + new HoodieKey("id10", "par4"), + new HoodieKey("id11", "par4")) + .checkpoint(1) + .assertBootstrapped() + .assertNextEvent() + .checkWrittenData(getExpectedBeforeCheckpointComplete()) + .checkpointComplete(1) + .checkWrittenData(EXPECTED2) + .end(); + } + + @Test + public void testWriteExactlyOnce() throws Exception { + // reset the config option + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, 1L); + conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0006); // 630 bytes buffer size + preparePipeline(conf) + .consume(TestData.DATA_SET_INSERT) + .emptyEventBuffer() + .checkpoint(1) + .assertConfirming() + .handleEvents(4) + .checkpointComplete(1) + .consume(TestData.DATA_SET_INSERT) + .assertNotConfirming() + .checkpoint(2) + .assertConsumeThrows(TestData.DATA_SET_INSERT, + "Timeout(1000ms) while waiting for instant initialize") + .end(); + } + + @Test + public void testReuseEmbeddedServer() throws IOException { + conf.setInteger("hoodie.filesystem.view.remote.timeout.secs", 500); + HoodieFlinkWriteClient writeClient = FlinkWriteClients.createWriteClient(conf); + FileSystemViewStorageConfig viewStorageConfig = writeClient.getConfig().getViewStorageConfig(); + + assertSame(viewStorageConfig.getStorageType(), FileSystemViewStorageType.REMOTE_FIRST); + + // get another write client + writeClient = FlinkWriteClients.createWriteClient(conf); + assertSame(writeClient.getConfig().getViewStorageConfig().getStorageType(), FileSystemViewStorageType.REMOTE_FIRST); + assertEquals(viewStorageConfig.getRemoteViewServerPort(), writeClient.getConfig().getViewStorageConfig().getRemoteViewServerPort()); + assertEquals(viewStorageConfig.getRemoteTimelineClientTimeoutSecs(), 500); + } + + // ------------------------------------------------------------------------- + // Utilities + // 
------------------------------------------------------------------------- + + private TestHarness preparePipeline() throws Exception { + return preparePipeline(conf); + } + + protected TestHarness preparePipeline(Configuration conf) throws Exception { + return TestHarness.instance().preparePipeline(tempFile, conf); + } + + protected TestHarness prepareInsertPipeline() throws Exception { + return prepareInsertPipeline(conf); + } + + protected TestHarness prepareInsertPipeline(Configuration conf) throws Exception { + return TestHarness.instance().preparePipeline(tempFile, conf, true); + } + + protected HoodieTableType getTableType() { + return HoodieTableType.COPY_ON_WRITE; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnRead.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnRead.java new file mode 100644 index 0000000000000..df01fc9076f0c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnRead.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.hudi.common.model.EventTimeAvroPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.utils.TestData; + +import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.HashMap; +import java.util.Map; + +/** + * Test cases for delta stream write. + */ +public class TestWriteMergeOnRead extends TestWriteCopyOnWrite { + + @Override + protected void setUp(Configuration conf) { + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); + } + + @Test + public void testIndexStateBootstrapWithMultiFilesInOneSlice() throws Exception { + // open the function and ingest data + preparePipeline(conf) + .consume(TestData.filterOddRows(TestData.DATA_SET_INSERT)) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .consume(TestData.filterEvenRows(TestData.DATA_SET_INSERT)) + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(EXPECTED1, 4) + // write another commit but does not complete it + .consume(TestData.filterEvenRows(TestData.DATA_SET_INSERT)) + .checkpoint(3) + .assertNextEvent() + .end(); + + // reset the config option + conf.setBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED, true); + validateIndexLoaded(); + } + + @Test + public void testIndexStateBootstrapWithCompactionScheduled() throws Exception { + // sets up the delta commits as 1 to generate a new compaction plan. 
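+ // so a compaction is scheduled right after the first delta commit, and the index bootstrap below runs against a timeline with a pending compaction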
+ conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); + // open the function and ingest data + preparePipeline(conf) + .consume(TestData.DATA_SET_INSERT) + .assertEmptyDataFiles() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED1, 4) + .end(); + + // reset config options + conf.removeConfig(FlinkOptions.COMPACTION_DELTA_COMMITS); + // sets up index bootstrap + conf.setBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED, true); + validateIndexLoaded(); + } + + @Test + public void testEventTimeAvroPayloadMergeRead() throws Exception { + conf.set(FlinkOptions.COMPACTION_ASYNC_ENABLED, true); + conf.set(FlinkOptions.PATH, tempFile.getAbsolutePath()); + conf.set(FlinkOptions.TABLE_TYPE, HoodieTableType.MERGE_ON_READ.name()); + conf.set(FlinkOptions.OPERATION, "upsert"); + conf.set(FlinkOptions.CHANGELOG_ENABLED, false); + conf.set(FlinkOptions.COMPACTION_DELTA_COMMITS, 2); + conf.set(FlinkOptions.PRE_COMBINE, true); + conf.set(FlinkOptions.PRECOMBINE_FIELD, "ts"); + conf.set(FlinkOptions.PAYLOAD_CLASS_NAME, EventTimeAvroPayload.class.getName()); + HashMap mergedExpected = new HashMap<>(EXPECTED1); + mergedExpected.put("par1", "[id1,par1,id1,Danny,22,4,par1, id2,par1,id2,Stephen,33,2,par1]"); + TestHarness.instance().preparePipeline(tempFile, conf) + .consume(TestData.DATA_SET_INSERT) + .emptyEventBuffer() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED1, 4) + .consume(TestData.DATA_SET_DISORDER_INSERT) + .emptyEventBuffer() + .checkpoint(2) + .assertNextEvent() + .checkpointComplete(2) + .checkWrittenData(mergedExpected, 4) + .consume(TestData.DATA_SET_SINGLE_INSERT) + .emptyEventBuffer() + .checkpoint(3) + .assertNextEvent() + .checkpointComplete(3) + .checkWrittenData(mergedExpected, 4) + .end(); + } + + @ParameterizedTest + @ValueSource(ints = {1, 2}) + public void testOnlyBaseFileOrOnlyLogFileRead(int compactionDeltaCommits) throws Exception { + conf.set(FlinkOptions.COMPACTION_ASYNC_ENABLED, true); + conf.set(FlinkOptions.PATH, tempFile.getAbsolutePath()); + conf.set(FlinkOptions.TABLE_TYPE, HoodieTableType.MERGE_ON_READ.name()); + conf.set(FlinkOptions.OPERATION, "upsert"); + conf.set(FlinkOptions.CHANGELOG_ENABLED, false); + conf.set(FlinkOptions.COMPACTION_DELTA_COMMITS, compactionDeltaCommits); + TestHarness.instance().preparePipeline(tempFile, conf) + .consume(TestData.DATA_SET_INSERT) + .emptyEventBuffer() + .checkpoint(1) + .assertNextEvent() + .checkpointComplete(1) + .checkWrittenData(EXPECTED1, 4) + .end(); + } + + @Override + public void testInsertClustering() { + // insert clustering is only valid for cow table. + } + + @Override + protected Map getExpectedBeforeCheckpointComplete() { + return EXPECTED1; + } + + protected Map getMiniBatchExpected() { + Map expected = new HashMap<>(); + // MOR mode merges the messages with the same key. 
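+ // so a single merged row is expected for key 'id1', unlike the COW variant which expects three rows for the same key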
+ expected.put("par1", "[id1,par1,id1,Danny,23,1,par1]"); + return expected; + } + + @Override + protected HoodieTableType getTableType() { + return HoodieTableType.MERGE_ON_READ; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnReadWithCompact.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnReadWithCompact.java new file mode 100644 index 0000000000000..704d94caba395 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteMergeOnReadWithCompact.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.configuration.FlinkOptions; + +import org.apache.flink.configuration.Configuration; + +import java.util.HashMap; +import java.util.Map; + +/** + * Test cases for delta stream write with compaction. + */ +public class TestWriteMergeOnReadWithCompact extends TestWriteCopyOnWrite { + + @Override + protected void setUp(Configuration conf) { + // trigger the compaction for every finished checkpoint + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); + } + + @Override + public void testInsertClustering() { + // insert clustering is only valid for cow table. + } + + @Override + protected Map getExpectedBeforeCheckpointComplete() { + return EXPECTED1; + } + + protected Map getMiniBatchExpected() { + Map expected = new HashMap<>(); + // MOR mode merges the messages with the same key. + expected.put("par1", "[id1,par1,id1,Danny,23,1,par1]"); + return expected; + } + + @Override + protected HoodieTableType getTableType() { + return HoodieTableType.MERGE_ON_READ; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestBulkInsertWriteHelper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestBulkInsertWriteHelper.java new file mode 100644 index 0000000000000..80d38fe293ba2 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestBulkInsertWriteHelper.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bulk; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.FlinkTables; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; + +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for {@link BulkInsertWriterHelper}. + */ +public class TestBulkInsertWriteHelper { + protected Configuration conf; + + @TempDir + File tempFile; + + @BeforeEach + public void before() throws IOException { + conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + StreamerUtil.initTableIfNotExists(conf); + } + + @Test + void testWrite() throws Exception { + HoodieFlinkTable table = FlinkTables.createTable(conf); + String instant = HoodieActiveTimeline.createNewInstantTime(); + RowType rowType = TestConfigurations.ROW_TYPE; + BulkInsertWriterHelper writerHelper = new BulkInsertWriterHelper(conf, table, table.getConfig(), instant, + 1, 1, 0, rowType, false); + for (RowData row: TestData.DATA_SET_INSERT) { + writerHelper.write(row); + } + List writeStatusList = writerHelper.getWriteStatuses(1); + assertWriteStatus(writeStatusList); + + Map expected = new HashMap<>(); + expected.put("par1", "[id1,par1,id1,Danny,23,1,par1, id2,par1,id2,Stephen,33,2,par1]"); + expected.put("par2", "[id3,par2,id3,Julian,53,3,par2, id4,par2,id4,Fabian,31,4,par2]"); + expected.put("par3", "[id5,par3,id5,Sophia,18,5,par3, id6,par3,id6,Emma,20,6,par3]"); + expected.put("par4", "[id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); + + TestData.checkWrittenData(tempFile, expected); + + // set up preserveHoodieMetadata as true and check again + RowType rowType2 = BulkInsertWriterHelper.addMetadataFields(rowType, false); + BulkInsertWriterHelper writerHelper2 = new BulkInsertWriterHelper(conf, table, table.getConfig(), instant, + 1, 1, 0, rowType2, true); + for (RowData row: rowsWithMetadata(instant, TestData.DATA_SET_INSERT)) { + writerHelper.write(row); + } + List writeStatusList2 = writerHelper.getWriteStatuses(1); + assertWriteStatus(writeStatusList2); + + String expectRows = "[" + instant + ", " + instant + "]"; + Map expected2 = new HashMap<>(); + expected2.put("par1", expectRows); + expected2.put("par2", expectRows); + 
expected2.put("par3", expectRows); + expected2.put("par4", expectRows); + + TestData.checkWrittenData(tempFile, expected2, 4, TestBulkInsertWriteHelper::filterCommitTime); + } + + private void assertWriteStatus(List writeStatusList) { + String partitions = writeStatusList.stream() + .map(writeStatus -> StringUtils.nullToEmpty(writeStatus.getStat().getPartitionPath())) + .sorted() + .collect(Collectors.joining(",")); + assertThat(partitions, is("par1,par2,par3,par4")); + List files = writeStatusList.stream() + .map(writeStatus -> writeStatus.getStat().getPath()) + .collect(Collectors.toList()); + assertThat(files.size(), is(4)); + } + + private static List rowsWithMetadata(String instantTime, List rows) { + List rowsWithMetadata = new ArrayList<>(); + int seqNum = 0; + for (RowData row : rows) { + GenericRowData rebuilt = new GenericRowData(row.getArity() + 5); + rebuilt.setField(0, StringData.fromString(instantTime)); + rebuilt.setField(1, seqNum++); + rebuilt.setField(2, row.getString(0)); + rebuilt.setField(3, row.getString(4)); + rebuilt.setField(4, StringData.fromString("f" + seqNum)); + } + return rowsWithMetadata; + } + + private static String filterCommitTime(GenericRecord genericRecord) { + return genericRecord.get("_hoodie_commit_time").toString(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestRowDataKeyGen.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestRowDataKeyGen.java new file mode 100644 index 0000000000000..a66874c486414 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestRowDataKeyGen.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.bulk; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieKeyException; +import org.apache.hudi.table.HoodieTableFactory; +import org.apache.hudi.utils.TestConfigurations; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH; +import static org.apache.hudi.utils.TestData.insertRow; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * Test cases for {@link RowDataKeyGen}. 
+ */ +public class TestRowDataKeyGen { + @Test + void testSimpleKeyAndPartition() { + Configuration conf = TestConfigurations.getDefaultConf("path1"); + final RowData rowData1 = insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")); + final RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE); + assertThat(keyGen1.getRecordKey(rowData1), is("id1")); + assertThat(keyGen1.getPartitionPath(rowData1), is("par1")); + + // null record key and partition path + final RowData rowData2 = insertRow(TestConfigurations.ROW_TYPE, null, StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), null); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2)); + assertThat(keyGen1.getPartitionPath(rowData2), is(DEFAULT_PARTITION_PATH)); + // empty record key and partition path + final RowData rowData3 = insertRow(StringData.fromString(""), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("")); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3)); + assertThat(keyGen1.getPartitionPath(rowData3), is(DEFAULT_PARTITION_PATH)); + + // hive style partitioning + conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true); + final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE); + assertThat(keyGen2.getPartitionPath(rowData1), is(String.format("partition=%s", "par1"))); + assertThat(keyGen2.getPartitionPath(rowData2), is(String.format("partition=%s", DEFAULT_PARTITION_PATH))); + assertThat(keyGen2.getPartitionPath(rowData3), is(String.format("partition=%s", DEFAULT_PARTITION_PATH))); + } + + @Test + void testComplexKeyAndPartition() { + Configuration conf = TestConfigurations.getDefaultConf("path1"); + conf.set(FlinkOptions.RECORD_KEY_FIELD, "uuid,name"); + conf.set(FlinkOptions.PARTITION_PATH_FIELD, "partition,ts"); + RowData rowData1 = insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")); + RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE); + assertThat(keyGen1.getRecordKey(rowData1), is("uuid:id1,name:Danny")); + assertThat(keyGen1.getPartitionPath(rowData1), is("par1/1970-01-01T00:00:00.001")); + + // null record key and partition path + final RowData rowData2 = insertRow(TestConfigurations.ROW_TYPE, null, null, 23, null, null); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2)); + assertThat(keyGen1.getPartitionPath(rowData2), is(String.format("%s/%s", DEFAULT_PARTITION_PATH, DEFAULT_PARTITION_PATH))); + // empty record key and partition path + final RowData rowData3 = insertRow(StringData.fromString(""), StringData.fromString(""), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("")); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3)); + assertThat(keyGen1.getPartitionPath(rowData3), is(String.format("%s/1970-01-01T00:00:00.001", DEFAULT_PARTITION_PATH))); + + // hive style partitioning + conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true); + final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE); + assertThat(keyGen2.getPartitionPath(rowData1), is(String.format("partition=%s/ts=%s", "par1", "1970-01-01T00:00:00.001"))); + assertThat(keyGen2.getPartitionPath(rowData2), is(String.format("partition=%s/ts=%s", DEFAULT_PARTITION_PATH, 
DEFAULT_PARTITION_PATH))); + assertThat(keyGen2.getPartitionPath(rowData3), is(String.format("partition=%s/ts=%s", DEFAULT_PARTITION_PATH, "1970-01-01T00:00:00.001"))); + } + + @Test + void testTimestampBasedKeyGenerator() { + Configuration conf = TestConfigurations.getDefaultConf("path1"); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "ts"); + HoodieTableFactory.setupTimestampKeygenOptions(conf, DataTypes.TIMESTAMP(3)); + final RowData rowData1 = insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(7200000), StringData.fromString("par1")); + final RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE); + + assertThat(keyGen1.getRecordKey(rowData1), is("id1")); + assertThat(keyGen1.getPartitionPath(rowData1), is("1970010102")); + + // null record key and partition path + final RowData rowData2 = insertRow(TestConfigurations.ROW_TYPE, null, StringData.fromString("Danny"), 23, + null, StringData.fromString("par1")); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2)); + assertThat(keyGen1.getPartitionPath(rowData2), is("1970010100")); + // empty record key and partition path + final RowData rowData3 = insertRow(StringData.fromString(""), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3)); + assertThat(keyGen1.getPartitionPath(rowData3), is("1970010100")); + + // hive style partitioning + conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true); + final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE); + assertThat(keyGen2.getPartitionPath(rowData1), is("ts=1970010102")); + assertThat(keyGen2.getPartitionPath(rowData2), is("ts=1970010100")); + assertThat(keyGen2.getPartitionPath(rowData3), is("ts=1970010100")); + } + + @ParameterizedTest + @ValueSource(strings = {FlinkOptions.PARTITION_FORMAT_DASHED_DAY, FlinkOptions.PARTITION_FORMAT_DAY}) + void testDateBasedKeyGenerator(String partitionFormat) { + boolean dashed = partitionFormat.equals(FlinkOptions.PARTITION_FORMAT_DASHED_DAY); + Configuration conf = TestConfigurations.getDefaultConf("path1", TestConfigurations.ROW_DATA_TYPE_DATE); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "dt"); + conf.setString(FlinkOptions.PARTITION_FORMAT, partitionFormat); + HoodieTableFactory.setupTimestampKeygenOptions(conf, DataTypes.DATE()); + final RowData rowData1 = insertRow(TestConfigurations.ROW_TYPE_DATE, + StringData.fromString("id1"), StringData.fromString("Danny"), 23, 1); + final RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE_DATE); + + assertThat(keyGen1.getRecordKey(rowData1), is("id1")); + String expectedPartition1 = dashed ? "1970-01-02" : "19700102"; + assertThat(keyGen1.getPartitionPath(rowData1), is(expectedPartition1)); + + // null record key and partition path + final RowData rowData2 = insertRow(TestConfigurations.ROW_TYPE_DATE, null, StringData.fromString("Danny"), 23, null); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2)); + String expectedPartition2 = dashed ? "1970-01-02" : "19700102"; + assertThat(keyGen1.getPartitionPath(rowData2), is(expectedPartition2)); + + // empty record key + String expectedPartition3 = dashed ? 
"1970-01-03" : "19700103"; + final RowData rowData3 = insertRow(TestConfigurations.ROW_TYPE_DATE, StringData.fromString(""), StringData.fromString("Danny"), 23, 2); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3)); + assertThat(keyGen1.getPartitionPath(rowData3), is(expectedPartition3)); + + // hive style partitioning + conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true); + final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE_DATE); + assertThat(keyGen2.getPartitionPath(rowData1), is("dt=" + expectedPartition1)); + assertThat(keyGen2.getPartitionPath(rowData2), is("dt=" + expectedPartition2)); + assertThat(keyGen2.getPartitionPath(rowData3), is("dt=" + expectedPartition3)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/cluster/ITTestHoodieFlinkClustering.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/cluster/ITTestHoodieFlinkClustering.java new file mode 100644 index 0000000000000..f2273e40a26db --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/cluster/ITTestHoodieFlinkClustering.java @@ -0,0 +1,294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.cluster; + +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.sink.clustering.ClusteringCommitEvent; +import org.apache.hudi.sink.clustering.ClusteringCommitSink; +import org.apache.hudi.sink.clustering.ClusteringOperator; +import org.apache.hudi.sink.clustering.ClusteringPlanSourceFunction; +import org.apache.hudi.sink.clustering.FlinkClusteringConfig; +import org.apache.hudi.sink.clustering.HoodieFlinkClusteringJob; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.FlinkMiniCluster; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestSQL; + +import org.apache.avro.Schema; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * IT cases for {@link HoodieFlinkClusteringJob}. + */ +@ExtendWith(FlinkMiniCluster.class) +public class ITTestHoodieFlinkClustering { + + private static final Map EXPECTED = new HashMap<>(); + + static { + EXPECTED.put("par1", "[id1,par1,id1,Danny,23,1000,par1, id2,par1,id2,Stephen,33,2000,par1]"); + EXPECTED.put("par2", "[id3,par2,id3,Julian,53,3000,par2, id4,par2,id4,Fabian,31,4000,par2]"); + EXPECTED.put("par3", "[id5,par3,id5,Sophia,18,5000,par3, id6,par3,id6,Emma,20,6000,par3]"); + EXPECTED.put("par4", "[id7,par4,id7,Bob,44,7000,par4, id8,par4,id8,Han,56,8000,par4]"); + } + + @TempDir + File tempFile; + + @Test + public void testHoodieFlinkClustering() throws Exception { + // Create hoodie table and insert into data. 
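+ // seed the table via batch SQL in append mode (insert clustering disabled) before running the standalone clustering pipeline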
+ EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + tableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + Map options = new HashMap<>(); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + + // use append mode + options.put(FlinkOptions.OPERATION.key(), WriteOperationType.INSERT.value()); + options.put(FlinkOptions.INSERT_CLUSTER.key(), "false"); + + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(3); + + // Make configuration and setAvroSchema. + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + FlinkClusteringConfig cfg = new FlinkClusteringConfig(); + cfg.path = tempFile.getAbsolutePath(); + cfg.targetPartitions = 4; + Configuration conf = FlinkClusteringConfig.toFlinkConfig(cfg); + + // create metaClient + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf); + + // set the table name + conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); + + // set record key field + conf.setString(FlinkOptions.RECORD_KEY_FIELD, metaClient.getTableConfig().getRecordKeyFieldProp()); + // set partition field + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, metaClient.getTableConfig().getPartitionFieldProp()); + + long ckpTimeout = env.getCheckpointConfig().getCheckpointTimeout(); + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition"); + + // set table schema + CompactionUtil.setAvroSchema(conf, metaClient); + + // judge whether have operation + // To compute the clustering instant time and do clustering. 
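+ // i.e. create a new instant, schedule a clustering plan at it, then build and run a Flink pipeline that executes the plan and commits it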
+ String clusteringInstantTime = HoodieActiveTimeline.createNewInstantTime(); + + HoodieFlinkWriteClient writeClient = FlinkWriteClients.createWriteClient(conf); + HoodieFlinkTable table = writeClient.getHoodieTable(); + + boolean scheduled = writeClient.scheduleClusteringAtInstant(clusteringInstantTime, Option.empty()); + + assertTrue(scheduled, "The clustering plan should be scheduled"); + + // fetch the instant based on the configured execution sequence + table.getMetaClient().reloadActiveTimeline(); + HoodieTimeline timeline = table.getActiveTimeline().filterPendingReplaceTimeline() + .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED); + + // generate clustering plan + // should support configurable commit metadata + Option> clusteringPlanOption = ClusteringUtils.getClusteringPlan( + table.getMetaClient(), timeline.lastInstant().get()); + + HoodieClusteringPlan clusteringPlan = clusteringPlanOption.get().getRight(); + + // Mark instant as clustering inflight + HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(clusteringInstantTime); + table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty()); + + final Schema tableAvroSchema = StreamerUtil.getTableAvroSchema(table.getMetaClient(), false); + final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema); + final RowType rowType = (RowType) rowDataType.getLogicalType(); + + DataStream dataStream = env.addSource(new ClusteringPlanSourceFunction(clusteringInstantTime, clusteringPlan)) + .name("clustering_source") + .uid("uid_clustering_source") + .rebalance() + .transform("clustering_task", + TypeInformation.of(ClusteringCommitEvent.class), + new ClusteringOperator(conf, rowType)) + .setParallelism(clusteringPlan.getInputGroups().size()); + + ExecNodeUtil.setManagedMemoryWeight(dataStream.getTransformation(), + conf.getInteger(FlinkOptions.WRITE_SORT_MEMORY) * 1024L * 1024L); + + dataStream + .addSink(new ClusteringCommitSink(conf)) + .name("clustering_commit") + .uid("uid_clustering_commit") + .setParallelism(1); + + env.execute("flink_hudi_clustering"); + TestData.checkWrittenData(tempFile, EXPECTED, 4); + } + + @Test + public void testHoodieFlinkClusteringService() throws Exception { + // Create hoodie table and insert into data. + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + tableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + Map options = new HashMap<>(); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + + // use append mode + options.put(FlinkOptions.OPERATION.key(), WriteOperationType.INSERT.value()); + options.put(FlinkOptions.INSERT_CLUSTER.key(), "false"); + + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(3); + + // Make configuration and setAvroSchema. 
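+ // the async clustering service (schedule = true, 3s min interval) schedules and executes clustering plans in the background until it is shut down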
+ StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + FlinkClusteringConfig cfg = new FlinkClusteringConfig(); + cfg.path = tempFile.getAbsolutePath(); + cfg.minClusteringIntervalSeconds = 3; + cfg.schedule = true; + Configuration conf = FlinkClusteringConfig.toFlinkConfig(cfg); + + HoodieFlinkClusteringJob.AsyncClusteringService asyncClusteringService = new HoodieFlinkClusteringJob.AsyncClusteringService(cfg, conf, env); + asyncClusteringService.start(null); + + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(5); + + asyncClusteringService.shutDown(); + + TestData.checkWrittenData(tempFile, EXPECTED, 4); + } + + @Test + public void testHoodieFlinkClusteringSchedule() throws Exception { + // Create hoodie table and insert into data. + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + Map options = new HashMap<>(); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + + // use append mode + options.put(FlinkOptions.OPERATION.key(), WriteOperationType.INSERT.value()); + options.put(FlinkOptions.INSERT_CLUSTER.key(), "false"); + + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(3); + + // Make configuration and setAvroSchema. + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + FlinkClusteringConfig cfg = new FlinkClusteringConfig(); + cfg.path = tempFile.getAbsolutePath(); + Configuration conf = FlinkClusteringConfig.toFlinkConfig(cfg); + + // create metaClient + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf); + + // set the table name + conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); + + // set record key field + conf.setString(FlinkOptions.RECORD_KEY_FIELD, metaClient.getTableConfig().getRecordKeyFieldProp()); + // set partition field + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, metaClient.getTableConfig().getPartitionFieldProp()); + + long ckpTimeout = env.getCheckpointConfig().getCheckpointTimeout(); + conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition"); + conf.setInteger(FlinkOptions.CLUSTERING_DELTA_COMMITS, 2); + conf.setBoolean(FlinkOptions.CLUSTERING_ASYNC_ENABLED, false); + conf.setBoolean(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED, true); + + // set table schema + CompactionUtil.setAvroSchema(conf, metaClient); + + // To compute the clustering instant time. 
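+ // with CLUSTERING_DELTA_COMMITS = 2, scheduling should fail after the first delta commit and succeed once the second commit lands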
+ String clusteringInstantTime = HoodieActiveTimeline.createNewInstantTime(); + + HoodieFlinkWriteClient writeClient = FlinkWriteClients.createWriteClient(conf); + + boolean scheduled = writeClient.scheduleClusteringAtInstant(clusteringInstantTime, Option.empty()); + + assertFalse(scheduled, "1 delta commit, the clustering plan should not be scheduled"); + + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(3); + + clusteringInstantTime = HoodieActiveTimeline.createNewInstantTime(); + + scheduled = writeClient.scheduleClusteringAtInstant(clusteringInstantTime, Option.empty()); + + assertTrue(scheduled, "2 delta commits, the clustering plan should be scheduled"); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/compact/ITTestHoodieFlinkCompactor.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/compact/ITTestHoodieFlinkCompactor.java new file mode 100644 index 0000000000000..6157b5e901130 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/compact/ITTestHoodieFlinkCompactor.java @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CompactionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.FlinkMiniCluster; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestSQL; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * IT cases for {@link org.apache.hudi.common.model.HoodieRecord}. 
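+ * Covers the standalone compaction job {@link HoodieFlinkCompactor} and its async compaction service.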
+ */ +@ExtendWith(FlinkMiniCluster.class) +public class ITTestHoodieFlinkCompactor { + + protected static final Logger LOG = LoggerFactory.getLogger(ITTestHoodieFlinkCompactor.class); + + private static final Map> EXPECTED1 = new HashMap<>(); + + private static final Map> EXPECTED2 = new HashMap<>(); + + private static final Map> EXPECTED3 = new HashMap<>(); + + static { + EXPECTED1.put("par1", Arrays.asList("id1,par1,id1,Danny,23,1000,par1", "id2,par1,id2,Stephen,33,2000,par1")); + EXPECTED1.put("par2", Arrays.asList("id3,par2,id3,Julian,53,3000,par2", "id4,par2,id4,Fabian,31,4000,par2")); + EXPECTED1.put("par3", Arrays.asList("id5,par3,id5,Sophia,18,5000,par3", "id6,par3,id6,Emma,20,6000,par3")); + EXPECTED1.put("par4", Arrays.asList("id7,par4,id7,Bob,44,7000,par4", "id8,par4,id8,Han,56,8000,par4")); + + EXPECTED2.put("par1", Arrays.asList("id1,par1,id1,Danny,24,1000,par1", "id2,par1,id2,Stephen,34,2000,par1")); + EXPECTED2.put("par2", Arrays.asList("id3,par2,id3,Julian,54,3000,par2", "id4,par2,id4,Fabian,32,4000,par2")); + EXPECTED2.put("par3", Arrays.asList("id5,par3,id5,Sophia,18,5000,par3", "id6,par3,id6,Emma,20,6000,par3", "id9,par3,id9,Jane,19,6000,par3")); + EXPECTED2.put("par4", Arrays.asList("id7,par4,id7,Bob,44,7000,par4", "id8,par4,id8,Han,56,8000,par4", "id10,par4,id10,Ella,38,7000,par4", "id11,par4,id11,Phoebe,52,8000,par4")); + + EXPECTED3.put("par1", Arrays.asList("id1,par1,id1,Danny,23,1000,par1", "id2,par1,id2,Stephen,33,2000,par1")); + EXPECTED3.put("par2", Arrays.asList("id3,par2,id3,Julian,53,3000,par2", "id4,par2,id4,Fabian,31,4000,par2")); + EXPECTED3.put("par3", Arrays.asList("id5,par3,id5,Sophia,18,5000,par3", "id6,par3,id6,Emma,20,6000,par3")); + EXPECTED3.put("par4", Arrays.asList("id7,par4,id7,Bob,44,7000,par4", "id8,par4,id8,Han,56,8000,par4")); + EXPECTED3.put("par5", Arrays.asList("id12,par5,id12,Tony,27,9000,par5", "id13,par5,id13,Jenny,72,10000,par5")); + } + + @TempDir + File tempFile; + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testHoodieFlinkCompactor(boolean enableChangelog) throws Exception { + // Create hoodie table and insert into data. + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + tableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + Map options = new HashMap<>(); + options.put(FlinkOptions.COMPACTION_SCHEDULE_ENABLED.key(), "false"); + options.put(FlinkOptions.COMPACTION_ASYNC_ENABLED.key(), "false"); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + options.put(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), enableChangelog + ""); + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(3); + + // Make configuration and setAvroSchema. 
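+ // build the offline compactor config, schedule a compaction plan, then run it through CompactionPlanSourceFunction -> CompactOperator -> CompactionCommitSink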
+ StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + FlinkCompactionConfig cfg = new FlinkCompactionConfig(); + cfg.path = tempFile.getAbsolutePath(); + Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg); + conf.setString(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + + // create metaClient + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf); + + // set the table name + conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); + + // set table schema + CompactionUtil.setAvroSchema(conf, metaClient); + + // infer changelog mode + CompactionUtil.inferChangelogMode(conf, metaClient); + + HoodieFlinkWriteClient writeClient = FlinkWriteClients.createWriteClient(conf); + + String compactionInstantTime = scheduleCompactionPlan(metaClient, writeClient); + + HoodieFlinkTable table = writeClient.getHoodieTable(); + // generate compaction plan + // should support configurable commit metadata + HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan( + table.getMetaClient(), compactionInstantTime); + + HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); + // Mark instant as compaction inflight + table.getActiveTimeline().transitionCompactionRequestedToInflight(instant); + + env.addSource(new CompactionPlanSourceFunction(Collections.singletonList(Pair.of(compactionInstantTime, compactionPlan)))) + .name("compaction_source") + .uid("uid_compaction_source") + .rebalance() + .transform("compact_task", + TypeInformation.of(CompactionCommitEvent.class), + new CompactOperator(conf)) + .setParallelism(FlinkMiniCluster.DEFAULT_PARALLELISM) + .addSink(new CompactionCommitSink(conf)) + .name("clean_commits") + .uid("uid_clean_commits") + .setParallelism(1); + + env.execute("flink_hudi_compaction"); + writeClient.close(); + TestData.checkWrittenDataCOW(tempFile, EXPECTED1); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testHoodieFlinkCompactorService(boolean enableChangelog) throws Exception { + // Create hoodie table and insert into data. + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + tableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + Map options = new HashMap<>(); + options.put(FlinkOptions.COMPACTION_ASYNC_ENABLED.key(), "false"); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + options.put(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), enableChangelog + ""); + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + + // insert dataset + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + // update the dataset + tableEnv.executeSql(TestSQL.UPDATE_INSERT_T1).await(); + + // Make configuration and setAvroSchema. 
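+ // the async compaction service (schedule = true, 3s min interval) both schedules and executes compactions until it is shut down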
+ StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + FlinkCompactionConfig cfg = new FlinkCompactionConfig(); + cfg.path = tempFile.getAbsolutePath(); + cfg.minCompactionIntervalSeconds = 3; + cfg.schedule = true; + Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg); + conf.setString(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + conf.setInteger(FlinkOptions.COMPACTION_TASKS.key(), FlinkMiniCluster.DEFAULT_PARALLELISM); + + HoodieFlinkCompactor.AsyncCompactionService asyncCompactionService = new HoodieFlinkCompactor.AsyncCompactionService(cfg, conf, env); + asyncCompactionService.start(null); + + // wait for the asynchronous commit to finish + TimeUnit.SECONDS.sleep(10); + + asyncCompactionService.shutDown(); + + TestData.checkWrittenDataCOW(tempFile, EXPECTED2); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testHoodieFlinkCompactorWithPlanSelectStrategy(boolean enableChangelog) throws Exception { + // Create hoodie table and insert into data. + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + tableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + Map options = new HashMap<>(); + options.put(FlinkOptions.COMPACTION_ASYNC_ENABLED.key(), "false"); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + options.put(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), enableChangelog + ""); + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + FlinkCompactionConfig cfg = new FlinkCompactionConfig(); + cfg.path = tempFile.getAbsolutePath(); + Configuration conf = FlinkCompactionConfig.toFlinkConfig(cfg); + conf.setString(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(conf); + conf.setString(FlinkOptions.TABLE_NAME, metaClient.getTableConfig().getTableName()); + CompactionUtil.setAvroSchema(conf, metaClient); + CompactionUtil.inferChangelogMode(conf, metaClient); + + List compactionInstantTimeList = new ArrayList<>(2); + + HoodieFlinkWriteClient writeClient = FlinkWriteClients.createWriteClient(conf); + + compactionInstantTimeList.add(scheduleCompactionPlan(metaClient, writeClient)); + + // insert a new record to new partition, so that we can generate a new compaction plan + String insertT1ForNewPartition = "insert into t1 values\n" + + "('id12','Tony',27,TIMESTAMP '1970-01-01 00:00:09','par5'),\n" + + "('id13','Jenny',72,TIMESTAMP '1970-01-01 00:00:10','par5')"; + tableEnv.executeSql(insertT1ForNewPartition).await(); + + writeClient.close(); + // re-create the write client/fs view server + // or there is low probability that connection refused occurs then + // the reader metadata view is not complete + writeClient = FlinkWriteClients.createWriteClient(conf); + + metaClient.reloadActiveTimeline(); + compactionInstantTimeList.add(scheduleCompactionPlan(metaClient, writeClient)); + + HoodieFlinkTable table = writeClient.getHoodieTable(); + + List> compactionPlans = new ArrayList<>(2); + for (String compactionInstantTime : compactionInstantTimeList) { + HoodieCompactionPlan plan = 
CompactionUtils.getCompactionPlan(table.getMetaClient(), compactionInstantTime); + compactionPlans.add(Pair.of(compactionInstantTime, plan)); + } + + // Mark instant as compaction inflight + for (String compactionInstantTime : compactionInstantTimeList) { + HoodieInstant hoodieInstant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime); + table.getActiveTimeline().transitionCompactionRequestedToInflight(hoodieInstant); + } + table.getMetaClient().reloadActiveTimeline(); + + env.addSource(new CompactionPlanSourceFunction(compactionPlans)) + .name("compaction_source") + .uid("uid_compaction_source") + .rebalance() + .transform("compact_task", + TypeInformation.of(CompactionCommitEvent.class), + new CompactOperator(conf)) + .setParallelism(1) + .addSink(new CompactionCommitSink(conf)) + .name("compaction_commit") + .uid("uid_compaction_commit") + .setParallelism(1); + + env.execute("flink_hudi_compaction"); + writeClient.close(); + TestData.checkWrittenDataCOW(tempFile, EXPECTED3); + } + + @Test + public void testCompactionInBatchExecutionMode() throws Exception { + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + TableEnvironment tableEnv = TableEnvironmentImpl.create(settings); + tableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + Map options = new HashMap<>(); + options.put(FlinkOptions.COMPACTION_DELTA_COMMITS.key(), "2"); + options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath()); + options.put(FlinkOptions.TABLE_TYPE.key(), "MERGE_ON_READ"); + String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options); + tableEnv.executeSql(hoodieTableDDL); + tableEnv.executeSql(TestSQL.INSERT_T1).await(); + tableEnv.executeSql(TestSQL.UPDATE_INSERT_T1).await(); + TestData.checkWrittenDataCOW(tempFile, EXPECTED2); + } + + private String scheduleCompactionPlan(HoodieTableMetaClient metaClient, HoodieFlinkWriteClient writeClient) { + boolean scheduled = false; + // judge whether there are any compaction operations. + Option compactionInstantTimeOption = CompactionUtil.getCompactionInstantTime(metaClient); + if (compactionInstantTimeOption.isPresent()) { + scheduled = writeClient.scheduleCompactionAtInstant(compactionInstantTimeOption.get(), Option.empty()); + } + assertTrue(scheduled, "The compaction plan should be scheduled"); + return compactionInstantTimeOption.get(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/compact/TestCompactionPlanStrategy.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/compact/TestCompactionPlanStrategy.java new file mode 100644 index 0000000000000..edce1744cfa14 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/compact/TestCompactionPlanStrategy.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.compact; + +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.sink.compact.strategy.CompactionPlanStrategies; +import org.apache.hudi.sink.compact.strategy.CompactionPlanStrategy; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Test case for every {@link CompactionPlanStrategy} implements + */ +public class TestCompactionPlanStrategy { + private HoodieTimeline timeline; + private HoodieTimeline emptyTimeline; + private HoodieTimeline allCompleteTimeline; + + private static final HoodieInstant INSTANT_001 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "001"); + private static final HoodieInstant INSTANT_002 = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "002"); + private static final HoodieInstant INSTANT_003 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "003"); + private static final HoodieInstant INSTANT_004 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "004"); + private static final HoodieInstant INSTANT_005 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMPACTION_ACTION, "005"); + private static final HoodieInstant INSTANT_006 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "006"); + + @BeforeEach + public void beforeEach() { + timeline = new MockHoodieActiveTimeline(INSTANT_001, INSTANT_002, INSTANT_003, INSTANT_004, INSTANT_005, INSTANT_006); + emptyTimeline = new MockHoodieActiveTimeline(); + allCompleteTimeline = new MockHoodieActiveTimeline(INSTANT_001, INSTANT_005); + } + + @Test + void testSingleCompactionPlanSelectStrategy() { + HoodieTimeline pendingCompactionTimeline = this.timeline.filterPendingCompactionTimeline(); + FlinkCompactionConfig compactionConfig = new FlinkCompactionConfig(); + CompactionPlanStrategy strategy = CompactionPlanStrategies.getStrategy(compactionConfig); + + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_002}, strategy.select(pendingCompactionTimeline)); + + compactionConfig.compactionSeq = FlinkCompactionConfig.SEQ_LIFO; + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_006}, strategy.select(pendingCompactionTimeline)); + + HoodieTimeline emptyPendingCompactionTimeline = emptyTimeline.filterPendingCompactionTimeline(); + assertHoodieInstantsEquals(new HoodieInstant[] {}, strategy.select(emptyPendingCompactionTimeline)); + + HoodieTimeline allCompleteCompactionTimeline = allCompleteTimeline.filterPendingCompactionTimeline(); + assertHoodieInstantsEquals(new HoodieInstant[] {}, strategy.select(allCompleteCompactionTimeline)); + } + + @Test + void testMultiCompactionPlanSelectStrategy() { + HoodieTimeline pendingCompactionTimeline = 
this.timeline.filterPendingCompactionTimeline(); + FlinkCompactionConfig compactionConfig = new FlinkCompactionConfig(); + compactionConfig.maxNumCompactionPlans = 2; + + CompactionPlanStrategy strategy = CompactionPlanStrategies.getStrategy(compactionConfig); + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_002, INSTANT_003}, strategy.select(pendingCompactionTimeline)); + + compactionConfig.compactionSeq = FlinkCompactionConfig.SEQ_LIFO; + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_006, INSTANT_004}, strategy.select(pendingCompactionTimeline)); + + HoodieTimeline emptyPendingCompactionTimeline = emptyTimeline.filterPendingCompactionTimeline(); + assertHoodieInstantsEquals(new HoodieInstant[] {}, strategy.select(emptyPendingCompactionTimeline)); + + HoodieTimeline allCompleteCompactionTimeline = allCompleteTimeline.filterPendingCompactionTimeline(); + assertHoodieInstantsEquals(new HoodieInstant[] {}, strategy.select(allCompleteCompactionTimeline)); + } + + @Test + void testAllPendingCompactionPlanSelectStrategy() { + HoodieTimeline pendingCompactionTimeline = this.timeline.filterPendingCompactionTimeline(); + FlinkCompactionConfig compactionConfig = new FlinkCompactionConfig(); + compactionConfig.compactionPlanSelectStrategy = CompactionPlanStrategy.ALL; + CompactionPlanStrategy strategy = CompactionPlanStrategies.getStrategy(compactionConfig); + + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_002, INSTANT_003, INSTANT_004, INSTANT_006}, + strategy.select(pendingCompactionTimeline)); + + HoodieTimeline emptyPendingCompactionTimeline = emptyTimeline.filterPendingCompactionTimeline(); + assertHoodieInstantsEquals(new HoodieInstant[] {}, strategy.select(emptyPendingCompactionTimeline)); + + HoodieTimeline allCompleteCompactionTimeline = allCompleteTimeline.filterPendingCompactionTimeline(); + assertHoodieInstantsEquals(new HoodieInstant[] {}, strategy.select(allCompleteCompactionTimeline)); + } + + @Test + void testInstantCompactionPlanSelectStrategy() { + HoodieTimeline pendingCompactionTimeline = this.timeline.filterPendingCompactionTimeline(); + FlinkCompactionConfig compactionConfig = new FlinkCompactionConfig(); + + compactionConfig.compactionPlanSelectStrategy = CompactionPlanStrategy.INSTANTS; + CompactionPlanStrategy strategy = CompactionPlanStrategies.getStrategy(compactionConfig); + compactionConfig.compactionPlanInstant = "004"; + + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_004}, strategy.select(pendingCompactionTimeline)); + + compactionConfig.compactionPlanInstant = "002,003"; + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_002, INSTANT_003}, strategy.select(pendingCompactionTimeline)); + + compactionConfig.compactionPlanInstant = "002,005"; + assertHoodieInstantsEquals(new HoodieInstant[] {INSTANT_002}, strategy.select(pendingCompactionTimeline)); + + compactionConfig.compactionPlanInstant = "005"; + assertHoodieInstantsEquals(new HoodieInstant[] {}, strategy.select(pendingCompactionTimeline)); + } + + private void assertHoodieInstantsEquals(HoodieInstant[] expected, List actual) { + assertEquals(expected.length, actual.size()); + for (int index = 0; index < expected.length; index++) { + assertHoodieInstantEquals(expected[index], actual.get(index)); + } + } + + private void assertHoodieInstantEquals(HoodieInstant expected, HoodieInstant actual) { + assertEquals(expected.getState(), actual.getState()); + assertEquals(expected.getAction(), actual.getAction()); + assertEquals(expected.getTimestamp(), actual.getTimestamp()); + 
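The strategies exercised in these tests differ only in how they pick from the pending compaction timeline: the default strategy takes the earliest requested instant (or the latest with `SEQ_LIFO`), a bounded strategy takes up to `maxNumCompactionPlans`, `ALL` takes every pending instant, and `INSTANTS` takes an explicit list. The following self-contained model (plain Java, not Hudi code) reproduces the ordering behavior the assertions check.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Illustrative model of the selection behavior verified above: given pending compaction
// instants in timeline order, pick FIFO or LIFO up to a configured number of plans.
public class PlanSelectionSketch {

  static List<String> select(List<String> pendingInstants, boolean lifo, int maxPlans) {
    List<String> ordered = new ArrayList<>(pendingInstants);
    if (lifo) {
      Collections.reverse(ordered); // newest requested instant first
    }
    return ordered.subList(0, Math.min(maxPlans, ordered.size()));
  }

  public static void main(String[] args) {
    List<String> pending = Arrays.asList("002", "003", "004", "006");
    System.out.println(select(pending, false, 1)); // [002]       ~ default single-plan strategy
    System.out.println(select(pending, true, 2));  // [006, 004]  ~ LIFO with two plans
  }
}
```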
} + + private static final class MockHoodieActiveTimeline extends HoodieActiveTimeline { + public MockHoodieActiveTimeline(HoodieInstant... instants) { + super(); + setInstants(Arrays.asList(instants)); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/meta/TestCkpMetadata.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/meta/TestCkpMetadata.java new file mode 100644 index 0000000000000..fe7ce3f9478d6 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/meta/TestCkpMetadata.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.meta; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; + +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.util.stream.IntStream; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for {@link CkpMetadata}. 
+ */ +public class TestCkpMetadata { + + private CkpMetadata metadata; + + @TempDir + File tempFile; + + @BeforeEach + public void beforeEach() throws Exception { + String basePath = tempFile.getAbsolutePath(); + FileSystem fs = FSUtils.getFs(tempFile.getAbsolutePath(), HadoopConfigurations.getHadoopConf(new Configuration())); + + Configuration conf = TestConfigurations.getDefaultConf(basePath); + StreamerUtil.initTableIfNotExists(conf); + + this.metadata = CkpMetadata.getInstance(fs, basePath); + } + + @Test + void testWriteAndReadMessage() { + // start three instants; the latest one stays pending until it is committed + IntStream.range(0, 3).forEach(i -> metadata.startInstant(i + "")); + + assertThat(metadata.lastPendingInstant(), is("2")); + metadata.commitInstant("2"); + assertThat(metadata.lastPendingInstant(), equalTo(null)); + + // test cleaning + IntStream.range(3, 6).forEach(i -> metadata.startInstant(i + "")); + assertThat(metadata.getMessages().size(), is(3)); + // committing and aborting instants adds messages but does not trigger cleaning + metadata.commitInstant("6"); + metadata.abortInstant("7"); + assertThat(metadata.getMessages().size(), is(5)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/partitioner/TestBucketAssigner.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/partitioner/TestBucketAssigner.java new file mode 100644 index 0000000000000..07a3b7515a04f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/partitioner/TestBucketAssigner.java @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
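`TestCkpMetadata` above pins down the lifecycle contract of the checkpoint metadata: an instant is started, becomes the last pending instant, and is later committed or aborted, while older messages are cleaned up in the background. A short usage sketch of that contract, assuming `fs` and `basePath` are prepared as in `beforeEach()` and using an illustrative instant time:

```java
// Coordinator-side usage pattern exercised by testWriteAndReadMessage (sketch only).
CkpMetadata ckpMetadata = CkpMetadata.getInstance(fs, basePath);

ckpMetadata.startInstant("20230101000000");        // mark a new instant as pending
String pending = ckpMetadata.lastPendingInstant();  // -> "20230101000000"
ckpMetadata.commitInstant(pending);                 // checkpoint succeeded: instant committed
// on failure the coordinator would call ckpMetadata.abortInstant(pending) instead
```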
+ */ + +package org.apache.hudi.sink.partitioner; + +import org.apache.hudi.client.FlinkTaskContextSupplier; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.sink.partitioner.profile.WriteProfile; +import org.apache.hudi.table.action.commit.BucketInfo; +import org.apache.hudi.table.action.commit.BucketType; +import org.apache.hudi.table.action.commit.SmallFile; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; + +import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link BucketAssigner}. + */ +public class TestBucketAssigner { + private HoodieWriteConfig writeConfig; + private HoodieFlinkEngineContext context; + private Configuration conf; + + @TempDir + File tempFile; + + @BeforeEach + public void before() throws IOException { + final String basePath = tempFile.getAbsolutePath(); + conf = TestConfigurations.getDefaultConf(basePath); + + writeConfig = FlinkWriteClients.getHoodieClientConfig(conf); + context = new HoodieFlinkEngineContext( + new SerializableConfiguration(HadoopConfigurations.getHadoopConf(conf)), + new FlinkTaskContextSupplier(null)); + StreamerUtil.initTableIfNotExists(conf); + } + + /** + * Test that the file ids generated by a task are eventually shuffled back to that same task.
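The javadoc above describes the invariant checked by `testSmallFilesOfThisTask`: a file id created by a task must be routed back to the same task when small files are later redistributed. The sketch below is a plain-Java model of that idea, with a stand-in hash-based partitioner rather than Hudi's actual routing function.

```java
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;

// Illustrative model (not Hudi's exact code) of the invariant: a file id "created by" a task
// must map back to that same task under the routing function used for shuffling.
public class FileIdRoutingSketch {

  static int taskFor(String fileId, int numTasks) {
    return Math.floorMod(fileId.hashCode(), numTasks); // stand-in for the real partitioner
  }

  static String createFileIdOfTask(int taskId, int numTasks) {
    String fileId;
    do {
      fileId = UUID.randomUUID().toString();
    } while (taskFor(fileId, numTasks) != taskId);     // retry until the id maps to this task
    return fileId;
  }

  static List<String> fileIdsOfTask(List<String> allFileIds, int taskId, int numTasks) {
    return allFileIds.stream()
        .filter(id -> taskFor(id, numTasks) == taskId)
        .collect(Collectors.toList());
  }
}
```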
+ */ + @Test + void testSmallFilesOfThisTask() { + MockBucketAssigner mockBucketAssigner1 = new MockBucketAssigner(context, writeConfig); + String fileId1 = mockBucketAssigner1.createFileIdOfThisTask(); + SmallFile smallFile1 = new SmallFile(); + smallFile1.location = new HoodieRecordLocation("t0", fileId1); + smallFile1.sizeBytes = 123; + List smallFiles1 = mockBucketAssigner1.smallFilesOfThisTask(Collections.singletonList(smallFile1)); + assertThat(smallFiles1.size(), is(1)); + + // modify the parallelism and test again + MockBucketAssigner mockBucketAssigner2 = new MockBucketAssigner(123, 200, context, writeConfig, Collections.emptyMap()); + String fileId2 = mockBucketAssigner2.createFileIdOfThisTask(); + SmallFile smallFile2 = new SmallFile(); + smallFile2.location = new HoodieRecordLocation("t0", fileId2); + smallFile2.sizeBytes = 123; + + String fileId3 = mockBucketAssigner2.createFileIdOfThisTask(); + SmallFile smallFile3 = new SmallFile(); + smallFile3.location = new HoodieRecordLocation("t0", fileId3); + smallFile3.sizeBytes = 456; + + List smallFiles2 = mockBucketAssigner1.smallFilesOfThisTask(Arrays.asList(smallFile2, smallFile3)); + assertThat(smallFiles2.size(), is(2)); + } + + @Test + public void testAddUpdate() { + MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(context, writeConfig); + BucketInfo bucketInfo = mockBucketAssigner.addUpdate("par1", "file_id_0"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "file_id_0"); + + mockBucketAssigner.addUpdate("par1", "file_id_0"); + bucketInfo = mockBucketAssigner.addUpdate("par1", "file_id_0"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "file_id_0"); + + mockBucketAssigner.addUpdate("par1", "file_id_1"); + bucketInfo = mockBucketAssigner.addUpdate("par1", "file_id_1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "file_id_1"); + + bucketInfo = mockBucketAssigner.addUpdate("par2", "file_id_0"); + assertBucketEquals(bucketInfo, "par2", BucketType.UPDATE, "file_id_0"); + + bucketInfo = mockBucketAssigner.addUpdate("par3", "file_id_2"); + assertBucketEquals(bucketInfo, "par3", BucketType.UPDATE, "file_id_2"); + } + + @Test + public void testAddInsert() { + MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(context, writeConfig); + BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.INSERT); + + mockBucketAssigner.addInsert("par1"); + bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.INSERT); + + mockBucketAssigner.addInsert("par2"); + bucketInfo = mockBucketAssigner.addInsert("par2"); + assertBucketEquals(bucketInfo, "par2", BucketType.INSERT); + + bucketInfo = mockBucketAssigner.addInsert("par3"); + assertBucketEquals(bucketInfo, "par3", BucketType.INSERT); + + bucketInfo = mockBucketAssigner.addInsert("par3"); + assertBucketEquals(bucketInfo, "par3", BucketType.INSERT); + } + + @Test + public void testInsertOverBucketAssigned() { + conf.setInteger(HoodieCompactionConfig.COPY_ON_WRITE_INSERT_SPLIT_SIZE.key(), 2); + writeConfig = FlinkWriteClients.getHoodieClientConfig(conf); + + MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(context, writeConfig); + BucketInfo bucketInfo1 = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo1, "par1", BucketType.INSERT); + + BucketInfo bucketInfo2 = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo2, "par1", BucketType.INSERT); + + assertEquals(bucketInfo1, 
bucketInfo2); + + BucketInfo bucketInfo3 = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo3, "par1", BucketType.INSERT); + + assertNotEquals(bucketInfo1, bucketInfo3); + } + + @Test + public void testInsertWithSmallFiles() { + SmallFile f0 = new SmallFile(); + f0.location = new HoodieRecordLocation("t0", "f0"); + f0.sizeBytes = 12; + + SmallFile f1 = new SmallFile(); + f1.location = new HoodieRecordLocation("t0", "f1"); + f1.sizeBytes = 122879; // no left space to append new records to this bucket + + SmallFile f2 = new SmallFile(); + f2.location = new HoodieRecordLocation("t0", "f2"); + f2.sizeBytes = 56; + + Map> smallFilesMap = new HashMap<>(); + smallFilesMap.put("par1", Arrays.asList(f0, f1)); + smallFilesMap.put("par2", Collections.singletonList(f2)); + + MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(context, writeConfig, smallFilesMap); + BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner.addInsert("par1"); + bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner.addInsert("par2"); + bucketInfo = mockBucketAssigner.addInsert("par2"); + assertBucketEquals(bucketInfo, "par2", BucketType.UPDATE, "f2"); + + bucketInfo = mockBucketAssigner.addInsert("par3"); + assertBucketEquals(bucketInfo, "par3", BucketType.INSERT); + + bucketInfo = mockBucketAssigner.addInsert("par3"); + assertBucketEquals(bucketInfo, "par3", BucketType.INSERT); + } + + /** + * Test that only partial small files are assigned to the task. + */ + @Test + public void testInsertWithPartialSmallFiles() { + SmallFile f0 = new SmallFile(); + f0.location = new HoodieRecordLocation("t0", "f0"); + f0.sizeBytes = 12; + + SmallFile f1 = new SmallFile(); + f1.location = new HoodieRecordLocation("t0", "f1"); + f1.sizeBytes = 122879; // no left space to append new records to this bucket + + SmallFile f2 = new SmallFile(); + f2.location = new HoodieRecordLocation("t0", "f2"); + f2.sizeBytes = 56; + + Map> smallFilesMap = new HashMap<>(); + smallFilesMap.put("par1", Arrays.asList(f0, f1, f2)); + + MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(0, 2, context, writeConfig, smallFilesMap); + BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2"); + + mockBucketAssigner.addInsert("par1"); + bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2"); + + bucketInfo = mockBucketAssigner.addInsert("par3"); + assertBucketEquals(bucketInfo, "par3", BucketType.INSERT); + + bucketInfo = mockBucketAssigner.addInsert("par3"); + assertBucketEquals(bucketInfo, "par3", BucketType.INSERT); + + MockBucketAssigner mockBucketAssigner2 = new MockBucketAssigner(1, 2, context, writeConfig, smallFilesMap); + BucketInfo bucketInfo2 = mockBucketAssigner2.addInsert("par1"); + assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner2.addInsert("par1"); + bucketInfo2 = mockBucketAssigner2.addInsert("par1"); + assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0"); + + bucketInfo2 = mockBucketAssigner2.addInsert("par3"); + assertBucketEquals(bucketInfo2, "par3", BucketType.INSERT); + + bucketInfo2 = mockBucketAssigner2.addInsert("par3"); + assertBucketEquals(bucketInfo2, "par3", BucketType.INSERT); + } + + @Test + public void 
testUpdateAndInsertWithSmallFiles() { + SmallFile f0 = new SmallFile(); + f0.location = new HoodieRecordLocation("t0", "f0"); + f0.sizeBytes = 12; + + SmallFile f1 = new SmallFile(); + f1.location = new HoodieRecordLocation("t0", "f1"); + f1.sizeBytes = 122879; // no left space to append new records to this bucket + + SmallFile f2 = new SmallFile(); + f2.location = new HoodieRecordLocation("t0", "f2"); + f2.sizeBytes = 56; + + Map> smallFilesMap = new HashMap<>(); + smallFilesMap.put("par1", Arrays.asList(f0, f1)); + smallFilesMap.put("par2", Collections.singletonList(f2)); + + MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(context, writeConfig, smallFilesMap); + mockBucketAssigner.addUpdate("par1", "f0"); + + BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner.addInsert("par1"); + bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner.addUpdate("par1", "f2"); + + mockBucketAssigner.addInsert("par1"); + bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner.addUpdate("par2", "f0"); + + mockBucketAssigner.addInsert("par2"); + bucketInfo = mockBucketAssigner.addInsert("par2"); + assertBucketEquals(bucketInfo, "par2", BucketType.UPDATE, "f2"); + } + + /** + * Test that only partial small files are assigned to the task. + */ + @Test + public void testUpdateAndInsertWithPartialSmallFiles() { + SmallFile f0 = new SmallFile(); + f0.location = new HoodieRecordLocation("t0", "f0"); + f0.sizeBytes = 12; + + SmallFile f1 = new SmallFile(); + f1.location = new HoodieRecordLocation("t0", "f1"); + f1.sizeBytes = 122879; // no left space to append new records to this bucket + + SmallFile f2 = new SmallFile(); + f2.location = new HoodieRecordLocation("t0", "f2"); + f2.sizeBytes = 56; + + Map> smallFilesMap = new HashMap<>(); + smallFilesMap.put("par1", Arrays.asList(f0, f1, f2)); + + MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(0, 2, context, writeConfig, smallFilesMap); + mockBucketAssigner.addUpdate("par1", "f0"); + + BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2"); + + mockBucketAssigner.addInsert("par1"); + bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2"); + + mockBucketAssigner.addUpdate("par1", "f2"); + + mockBucketAssigner.addInsert("par1"); + bucketInfo = mockBucketAssigner.addInsert("par1"); + assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2"); + + + MockBucketAssigner mockBucketAssigner2 = new MockBucketAssigner(1, 2, context, writeConfig, smallFilesMap); + mockBucketAssigner2.addUpdate("par1", "f0"); + + BucketInfo bucketInfo2 = mockBucketAssigner2.addInsert("par1"); + assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner2.addInsert("par1"); + bucketInfo2 = mockBucketAssigner2.addInsert("par1"); + assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0"); + + mockBucketAssigner2.addUpdate("par1", "f2"); + + mockBucketAssigner2.addInsert("par1"); + bucketInfo2 = mockBucketAssigner2.addInsert("par1"); + assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0"); + } + + @Test + public void testWriteProfileReload() throws Exception { + WriteProfile writeProfile = new WriteProfile(writeConfig, 
context); + List smallFiles1 = writeProfile.getSmallFiles("par1"); + assertTrue(smallFiles1.isEmpty(), "Should have no small files"); + + TestData.writeData(TestData.DATA_SET_INSERT, conf); + String instantOption = getLastCompleteInstant(writeProfile); + assertNull(instantOption); + + writeProfile.reload(1); + String instant1 = getLastCompleteInstant(writeProfile); + assertNotNull(instant1); + List smallFiles2 = writeProfile.getSmallFiles("par1"); + assertThat("Should have 1 small file", smallFiles2.size(), is(1)); + assertThat("Small file should have same timestamp as last complete instant", + smallFiles2.get(0).location.getInstantTime(), is(instant1)); + + TestData.writeData(TestData.DATA_SET_INSERT, conf); + List smallFiles3 = writeProfile.getSmallFiles("par1"); + assertThat("Should have 1 small file", smallFiles3.size(), is(1)); + assertThat("Non-reloaded write profile has the same base file view as before", + smallFiles3.get(0).location.getInstantTime(), is(instant1)); + + writeProfile.reload(2); + String instant2 = getLastCompleteInstant(writeProfile); + assertNotEquals(instant2, instant1, "Should have new complete instant"); + List smallFiles4 = writeProfile.getSmallFiles("par1"); + assertThat("Should have 1 small file", smallFiles4.size(), is(1)); + assertThat("Small file should have same timestamp as last complete instant", + smallFiles4.get(0).location.getInstantTime(), is(instant2)); + } + + @Test + public void testWriteProfileMetadataCache() throws Exception { + WriteProfile writeProfile = new WriteProfile(writeConfig, context); + assertTrue(writeProfile.getMetadataCache().isEmpty(), "Empty table should no have any instant metadata"); + + // write 3 instants of data + for (int i = 0; i < 3; i++) { + TestData.writeData(TestData.DATA_SET_INSERT, conf); + } + // the record profile triggers the metadata loading + writeProfile.reload(1); + assertThat("Metadata cache should have same number entries as timeline instants", + writeProfile.getMetadataCache().size(), is(3)); + + writeProfile.getSmallFiles("par1"); + assertThat("The metadata should be reused", + writeProfile.getMetadataCache().size(), is(3)); + } + + private static String getLastCompleteInstant(WriteProfile profile) { + return StreamerUtil.getLastCompletedInstant(profile.getMetaClient()); + } + + private void assertBucketEquals( + BucketInfo bucketInfo, + String partition, + BucketType bucketType, + String fileId) { + BucketInfo actual = new BucketInfo(bucketType, fileId, partition); + assertThat(bucketInfo, is(actual)); + } + + private void assertBucketEquals( + BucketInfo bucketInfo, + String partition, + BucketType bucketType) { + assertThat(bucketInfo.getPartitionPath(), is(partition)); + assertThat(bucketInfo.getBucketType(), is(bucketType)); + } + + /** + * Mock BucketAssigner that can specify small files explicitly. + */ + static class MockBucketAssigner extends BucketAssigner { + + MockBucketAssigner( + HoodieFlinkEngineContext context, + HoodieWriteConfig config) { + this(context, config, Collections.emptyMap()); + } + + MockBucketAssigner( + HoodieFlinkEngineContext context, + HoodieWriteConfig config, + Map> smallFilesMap) { + this(0, 1, context, config, smallFilesMap); + } + + MockBucketAssigner( + int taskID, + int numTasks, + HoodieFlinkEngineContext context, + HoodieWriteConfig config, + Map> smallFilesMap) { + super(taskID, 1024, numTasks, new MockWriteProfile(config, context, smallFilesMap), config); + } + } + + /** + * Mock WriteProfile that can specify small files explicitly. 
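The `addInsert` assertions above all reduce to one rule: an insert is packed into an existing small file of the partition (reported as an `UPDATE` bucket) while that file still has capacity, and a new file group (`INSERT` bucket) is opened otherwise. A compact plain-Java model of that rule, with assumed size constants, is shown below.

```java
import java.util.List;

// Illustrative model of small-file packing (not Hudi's implementation): append to a small
// file while its projected size stays under the target, otherwise open a new file group.
public class SmallFilePackingSketch {
  static final long MAX_FILE_SIZE = 120 * 1024 * 1024L; // assumed target base file size
  static final long AVG_RECORD_SIZE = 1024L;            // assumed average record size

  static class CandidateFile {
    final String fileId;
    final long sizeBytes;
    long assignedRecords;

    CandidateFile(String fileId, long sizeBytes) {
      this.fileId = fileId;
      this.sizeBytes = sizeBytes;
    }
  }

  /** Returns the small file to append to, or null when a new INSERT bucket should be created. */
  static String assignInsert(List<CandidateFile> smallFiles) {
    for (CandidateFile file : smallFiles) {
      long projectedSize = file.sizeBytes + (file.assignedRecords + 1) * AVG_RECORD_SIZE;
      if (projectedSize <= MAX_FILE_SIZE) {
        file.assignedRecords++;
        return file.fileId;  // UPDATE bucket targeting the small file, as the tests assert
      }
    }
    return null;             // no capacity left: new INSERT bucket
  }
}
```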
+ */ + static class MockWriteProfile extends WriteProfile { + private final Map> smallFilesMap; + + public MockWriteProfile(HoodieWriteConfig config, HoodieFlinkEngineContext context, Map> smallFilesMap) { + super(config, context); + this.smallFilesMap = smallFilesMap; + } + + @Override + protected List smallFilesProfile(String partitionPath) { + if (this.smallFilesMap.containsKey(partitionPath)) { + return this.smallFilesMap.get(partitionPath); + } + return Collections.emptyList(); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/ClusteringFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/ClusteringFunctionWrapper.java new file mode 100644 index 0000000000000..e3b75cbf6379c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/ClusteringFunctionWrapper.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.sink.clustering.ClusteringCommitEvent; +import org.apache.hudi.sink.clustering.ClusteringCommitSink; +import org.apache.hudi.sink.clustering.ClusteringOperator; +import org.apache.hudi.sink.clustering.ClusteringPlanEvent; +import org.apache.hudi.sink.clustering.ClusteringPlanOperator; +import org.apache.hudi.utils.TestConfigurations; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.io.disk.iomanager.IOManager; +import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; +import org.apache.flink.streaming.api.graph.StreamConfig; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService; + +/** + * A wrapper class to manipulate the {@link ClusteringOperator} instance for testing. + */ +public class ClusteringFunctionWrapper { + private final Configuration conf; + + private final IOManager ioManager; + private final StreamingRuntimeContext runtimeContext; + + private final StreamTask streamTask; + private final StreamConfig streamConfig; + + /** + * Function that generates the {@code HoodieClusteringPlan}. + */ + private ClusteringPlanOperator clusteringPlanOperator; + /** + * Output to collect the clustering commit events. 
+ */ + private CollectorOutput commitEventOutput; + /** + * Function that executes the clustering task. + */ + private ClusteringOperator clusteringOperator; + /** + * Stream sink to handle clustering commits. + */ + private ClusteringCommitSink commitSink; + + public ClusteringFunctionWrapper(Configuration conf, StreamTask streamTask, StreamConfig streamConfig) { + this.conf = conf; + this.ioManager = new IOManagerAsync(); + MockEnvironment environment = new MockEnvironmentBuilder() + .setTaskName("mockTask") + .setManagedMemorySize(4 * MemoryManager.DEFAULT_PAGE_SIZE) + .setIOManager(ioManager) + .build(); + this.runtimeContext = new MockStreamingRuntimeContext(false, 1, 0, environment); + this.streamTask = streamTask; + this.streamConfig = streamConfig; + } + + public void openFunction() throws Exception { + clusteringPlanOperator = new ClusteringPlanOperator(conf); + clusteringPlanOperator.open(); + + clusteringOperator = new ClusteringOperator(conf, TestConfigurations.ROW_TYPE); + // CAUTION: deprecated API used. + clusteringOperator.setProcessingTimeService(new TestProcessingTimeService()); + commitEventOutput = new CollectorOutput<>(); + clusteringOperator.setup(streamTask, streamConfig, commitEventOutput); + clusteringOperator.open(); + final NonThrownExecutor syncExecutor = new MockCoordinatorExecutor( + new MockOperatorCoordinatorContext(new OperatorID(), 1)); + clusteringOperator.setExecutor(syncExecutor); + + commitSink = new ClusteringCommitSink(conf); + commitSink.setRuntimeContext(runtimeContext); + commitSink.open(conf); + } + + public void cluster(long checkpointID) throws Exception { + // collect the ClusteringPlanEvents. + CollectorOutput planOutput = new CollectorOutput<>(); + clusteringPlanOperator.setOutput(planOutput); + clusteringPlanOperator.notifyCheckpointComplete(checkpointID); + // collect the ClusteringCommitEvents + for (ClusteringPlanEvent event : planOutput.getRecords()) { + clusteringOperator.processElement(new StreamRecord<>(event)); + } + // handle and commit the clustering + for (ClusteringCommitEvent event : commitEventOutput.getRecords()) { + commitSink.invoke(event, null); + } + } + + public void close() throws Exception { + ioManager.close(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CollectorOutput.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CollectorOutput.java new file mode 100644 index 0000000000000..b18cfac51b44f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CollectorOutput.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
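`ClusteringFunctionWrapper` lets a test drive the clustering operators synchronously instead of running a full Flink job: `openFunction()` wires the plan operator, the clustering operator and the commit sink, and `cluster(checkpointId)` pushes one checkpoint through plan generation, execution and commit. A typical invocation from a test method that declares `throws Exception` might look like the following; the table setup, data writes and checkpointing are assumed to happen elsewhere.

```java
// Sketch only: `conf`, `streamTask` and `streamConfig` are assumed to be prepared as in the
// write-function test harnesses of this module.
ClusteringFunctionWrapper clustering = new ClusteringFunctionWrapper(conf, streamTask, streamConfig);
clustering.openFunction();   // builds ClusteringPlanOperator, ClusteringOperator and the commit sink
// ... write data and complete Flink checkpoint 1 on the write pipeline ...
clustering.cluster(1L);      // plan -> execute -> commit, all synchronously inside the test
clustering.close();
```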
+ */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.adapter.OutputAdapter; + +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.util.OutputTag; + +import java.util.ArrayList; +import java.util.List; + +/** + * Collecting {@link Output} for {@link StreamRecord}. + */ +public class CollectorOutput implements OutputAdapter> { + + private final List records; + + public CollectorOutput() { + this.records = new ArrayList<>(); + } + + public List getRecords() { + return this.records; + } + + @Override + public void emitWatermark(Watermark mark) { + // no operation + } + + @Override + public void emitLatencyMarker(LatencyMarker latencyMarker) { + // no operation + } + + @Override + public void collect(StreamRecord record) { + records.add(record.getValue()); + } + + @Override + public void collect(OutputTag outputTag, StreamRecord record) { + throw new UnsupportedOperationException("Side output not supported for CollectorOutput"); + } + + @Override + public void close() { + this.records.clear(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CompactFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CompactFunctionWrapper.java new file mode 100644 index 0000000000000..78a8305c9c51b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CompactFunctionWrapper.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
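`CollectorOutput` is the glue that makes these operator wrappers testable: instead of forwarding records downstream, it buffers the values of every `StreamRecord` so the test can hand them to the next operator or assert on them. A trivial usage sketch (JUnit `assertEquals`, `java.util.Arrays` and the usual Flink imports assumed):

```java
// Sketch: the output simply records every value pushed through it.
CollectorOutput<String> output = new CollectorOutput<>();
output.collect(new StreamRecord<>("a"));
output.collect(new StreamRecord<>("b"));
assertEquals(Arrays.asList("a", "b"), output.getRecords());
```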
+ */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.sink.compact.CompactOperator; +import org.apache.hudi.sink.compact.CompactionCommitEvent; +import org.apache.hudi.sink.compact.CompactionCommitSink; +import org.apache.hudi.sink.compact.CompactionPlanEvent; +import org.apache.hudi.sink.compact.CompactionPlanOperator; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.io.disk.iomanager.IOManager; +import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; +import org.apache.flink.streaming.api.graph.StreamConfig; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService; + +/** + * A wrapper class to manipulate the {@link CompactOperator} instance for testing. + */ +public class CompactFunctionWrapper { + private final Configuration conf; + + private final IOManager ioManager; + private final StreamingRuntimeContext runtimeContext; + + private final StreamTask streamTask; + private final StreamConfig streamConfig; + + /** + * Function that generates the {@link HoodieCompactionPlan}. + */ + private CompactionPlanOperator compactionPlanOperator; + /** + * Output to collect the compaction commit events. + */ + private CollectorOutput commitEventOutput; + /** + * Function that executes the compaction task. + */ + private CompactOperator compactOperator; + /** + * Stream sink to handle compaction commits. + */ + private CompactionCommitSink commitSink; + + public CompactFunctionWrapper(Configuration conf, StreamTask streamTask, StreamConfig streamConfig) { + this.conf = conf; + this.ioManager = new IOManagerAsync(); + MockEnvironment environment = new MockEnvironmentBuilder() + .setTaskName("mockTask") + .setManagedMemorySize(4 * MemoryManager.DEFAULT_PAGE_SIZE) + .setIOManager(ioManager) + .build(); + this.runtimeContext = new MockStreamingRuntimeContext(false, 1, 0, environment); + this.streamTask = streamTask; + this.streamConfig = streamConfig; + } + + public void openFunction() throws Exception { + compactionPlanOperator = new CompactionPlanOperator(conf); + compactionPlanOperator.open(); + + compactOperator = new CompactOperator(conf); + // CAUTION: deprecated API used. + compactOperator.setProcessingTimeService(new TestProcessingTimeService()); + commitEventOutput = new CollectorOutput<>(); + compactOperator.setup(streamTask, streamConfig, commitEventOutput); + compactOperator.open(); + final NonThrownExecutor syncExecutor = new MockCoordinatorExecutor( + new MockOperatorCoordinatorContext(new OperatorID(), 1)); + compactOperator.setExecutor(syncExecutor); + + commitSink = new CompactionCommitSink(conf); + commitSink.setRuntimeContext(runtimeContext); + commitSink.open(conf); + } + + public void compact(long checkpointID) throws Exception { + // collect the CompactEvents. 
+ CollectorOutput output = new CollectorOutput<>(); + compactionPlanOperator.setOutput(output); + compactionPlanOperator.notifyCheckpointComplete(checkpointID); + // collect the CompactCommitEvents + for (CompactionPlanEvent event : output.getRecords()) { + compactOperator.processElement(new StreamRecord<>(event)); + } + // handle and commit the compaction + for (CompactionCommitEvent event : commitEventOutput.getRecords()) { + commitSink.invoke(event, null); + } + } + + public void close() throws Exception { + ioManager.close(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java new file mode 100644 index 0000000000000..707fe45c47358 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; +import org.apache.hudi.sink.append.AppendWriteFunction; +import org.apache.hudi.sink.bulk.BulkInsertWriterHelper; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.io.disk.iomanager.IOManager; +import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; +import org.apache.flink.streaming.api.graph.StreamConfig; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.api.operators.collect.utils.MockFunctionSnapshotContext; +import org.apache.flink.streaming.api.operators.collect.utils.MockOperatorEventGateway; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.streaming.util.MockStreamTaskBuilder; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; + +import java.util.concurrent.CompletableFuture; + +/** + * A wrapper class to manipulate the {@link AppendWriteFunction} 
instance for testing. + * + * @param Input type + */ +public class InsertFunctionWrapper implements TestFunctionWrapper { + private final Configuration conf; + private final RowType rowType; + + private final StreamingRuntimeContext runtimeContext; + private final MockOperatorEventGateway gateway; + private final MockOperatorCoordinatorContext coordinatorContext; + private final StreamWriteOperatorCoordinator coordinator; + private final MockStateInitializationContext stateInitializationContext; + + private final boolean asyncClustering; + private ClusteringFunctionWrapper clusteringFunctionWrapper; + + /** + * Append write function. + */ + private AppendWriteFunction writeFunction; + + public InsertFunctionWrapper(String tablePath, Configuration conf) throws Exception { + IOManager ioManager = new IOManagerAsync(); + MockEnvironment environment = new MockEnvironmentBuilder() + .setTaskName("mockTask") + .setManagedMemorySize(4 * MemoryManager.DEFAULT_PAGE_SIZE) + .setIOManager(ioManager) + .build(); + this.runtimeContext = new MockStreamingRuntimeContext(false, 1, 0, environment); + this.gateway = new MockOperatorEventGateway(); + this.conf = conf; + this.rowType = (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf)).getLogicalType(); + // one function + this.coordinatorContext = new MockOperatorCoordinatorContext(new OperatorID(), 1); + this.coordinator = new StreamWriteOperatorCoordinator(conf, this.coordinatorContext); + this.stateInitializationContext = new MockStateInitializationContext(); + + this.asyncClustering = OptionsResolver.needsAsyncClustering(conf); + StreamConfig streamConfig = new StreamConfig(conf); + streamConfig.setOperatorID(new OperatorID()); + StreamTask streamTask = new MockStreamTaskBuilder(environment) + .setConfig(new StreamConfig(conf)) + .setExecutionConfig(new ExecutionConfig().enableObjectReuse()) + .build(); + this.clusteringFunctionWrapper = new ClusteringFunctionWrapper(this.conf, streamTask, streamConfig); + } + + public void openFunction() throws Exception { + this.coordinator.start(); + this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); + + setupWriteFunction(); + + if (asyncClustering) { + clusteringFunctionWrapper.openFunction(); + } + } + + public void invoke(I record) throws Exception { + writeFunction.processElement((RowData) record, null, null); + } + + public WriteMetadataEvent[] getEventBuffer() { + return this.coordinator.getEventBuffer(); + } + + public OperatorEvent getNextEvent() { + return this.gateway.getNextEvent(); + } + + public void checkpointFunction(long checkpointId) throws Exception { + // checkpoint the coordinator first + this.coordinator.checkpointCoordinator(checkpointId, new CompletableFuture<>()); + + writeFunction.snapshotState(new MockFunctionSnapshotContext(checkpointId)); + stateInitializationContext.getOperatorStateStore().checkpointBegin(checkpointId); + } + + public void checkpointComplete(long checkpointId) { + stateInitializationContext.getOperatorStateStore().checkpointSuccess(checkpointId); + coordinator.notifyCheckpointComplete(checkpointId); + if (asyncClustering) { + try { + clusteringFunctionWrapper.cluster(checkpointId); + } catch (Exception e) { + throw new HoodieException(e); + } + } + } + + public StreamWriteOperatorCoordinator getCoordinator() { + return coordinator; + } + + @Override + public void close() throws Exception { + this.coordinator.close(); + if (clusteringFunctionWrapper != null) { + clusteringFunctionWrapper.close(); + } + } + + 
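Taken together, the methods of `InsertFunctionWrapper` model one write round of the append path: open the function and coordinator, feed rows, snapshot state at a checkpoint, deliver the resulting `WriteMetadataEvent` to the coordinator, and complete the checkpoint so the instant is committed (and clustered when async clustering is enabled). A sketch of that flow, inside a test method that declares `throws Exception`; building a `RowData` that matches the table schema is elided.

```java
// Sketch of a typical append-write round driven through the wrapper above.
InsertFunctionWrapper<RowData> harness = new InsertFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
harness.openFunction();                       // starts the coordinator and the AppendWriteFunction
harness.invoke(rowData);                      // process one record (rowData construction elided)
harness.checkpointFunction(1L);               // snapshot; the function emits a WriteMetadataEvent
harness.getCoordinator().handleEventFromOperator(0, harness.getNextEvent());
harness.checkpointComplete(1L);               // coordinator commits the instant
harness.close();
```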
public BulkInsertWriterHelper getWriterHelper() { + return this.writeFunction.getWriterHelper(); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void setupWriteFunction() throws Exception { + writeFunction = new AppendWriteFunction<>(conf, rowType); + writeFunction.setRuntimeContext(runtimeContext); + writeFunction.setOperatorEventGateway(gateway); + writeFunction.initializeState(this.stateInitializationContext); + writeFunction.open(conf); + + // handle the bootstrap event + coordinator.handleEventFromOperator(0, getNextEvent()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockCoordinatorExecutor.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockCoordinatorExecutor.java new file mode 100644 index 0000000000000..7e84453aa375e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockCoordinatorExecutor.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.exception.HoodieException; + +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.util.ExceptionUtils; +import org.apache.flink.util.function.ThrowingRunnable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A mock {@link NonThrownExecutor} that executes the actions synchronously. + */ +public class MockCoordinatorExecutor extends NonThrownExecutor { + private static final Logger LOG = LoggerFactory.getLogger(MockCoordinatorExecutor.class); + + public MockCoordinatorExecutor(OperatorCoordinator.Context context) { + super(LOG, (errMsg, t) -> context.failJob(new HoodieException(errMsg, t)), true); + } + + @Override + public void execute( + ThrowingRunnable action, + ExceptionHook hook, + String actionName, + Object... 
actionParams) { + final String actionString = String.format(actionName, actionParams); + try { + action.run(); + LOG.info("Executor executes action [{}] success!", actionString); + } catch (Throwable t) { + // if we have a JVM critical error, promote it immediately, there is a good + // chance the + // logging or job failing will not succeed anymore + ExceptionUtils.rethrowIfFatalErrorOrOOM(t); + final String errMsg = String.format("Executor executes action [%s] error", actionString); + if (hook != null) { + hook.apply(errMsg, t); + } + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockMapState.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockMapState.java new file mode 100644 index 0000000000000..b0d1c285d76eb --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockMapState.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.flink.api.common.state.MapState; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +/** + * Mock map state for testing. + * + * @param Type of state key + * @param Type of state value + */ +public class MockMapState implements MapState { + private final Map backingMap = new HashMap<>(); + + @Override + public V get(K uk) { + return backingMap.get(uk); + } + + @Override + public void put(K uk, V uv) { + backingMap.put(uk, uv); + } + + @Override + public void putAll(Map map) { + backingMap.putAll(map); + } + + @Override + public void remove(K uk) { + backingMap.remove(uk); + } + + @Override + public boolean contains(K uk) { + return backingMap.containsKey(uk); + } + + @Override + public Iterable> entries() { + return backingMap.entrySet(); + } + + @Override + public Iterable keys() { + return backingMap.keySet(); + } + + @Override + public Iterable values() { + return backingMap.values(); + } + + @Override + public Iterator> iterator() { + return backingMap.entrySet().iterator(); + } + + @Override + public boolean isEmpty() { + return backingMap.isEmpty(); + } + + @Override + public void clear() { + backingMap.clear(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockOperatorStateStore.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockOperatorStateStore.java new file mode 100644 index 0000000000000..cc16be4f0eb60 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockOperatorStateStore.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.flink.api.common.state.AggregatingState; +import org.apache.flink.api.common.state.AggregatingStateDescriptor; +import org.apache.flink.api.common.state.BroadcastState; +import org.apache.flink.api.common.state.KeyedStateStore; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.state.MapState; +import org.apache.flink.api.common.state.MapStateDescriptor; +import org.apache.flink.api.common.state.OperatorStateStore; +import org.apache.flink.api.common.state.ReducingState; +import org.apache.flink.api.common.state.ReducingStateDescriptor; +import org.apache.flink.api.common.state.ValueState; +import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.streaming.api.functions.sink.filesystem.TestUtils; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * An {@link OperatorStateStore} for testing purpose. + */ +@SuppressWarnings("rawtypes") +public class MockOperatorStateStore implements KeyedStateStore, OperatorStateStore { + + private final Map> historyStateMap; + + private Map currentStateMap; + private Map lastSuccessStateMap; + + private MapState mapState; + private Map valueStateMap; + + public MockOperatorStateStore() { + this.historyStateMap = new HashMap<>(); + + this.currentStateMap = new HashMap<>(); + this.lastSuccessStateMap = new HashMap<>(); + + this.mapState = new MockMapState<>(); + this.valueStateMap = new HashMap<>(); + } + + @Override + public BroadcastState getBroadcastState(MapStateDescriptor stateDescriptor) throws Exception { + return null; + } + + @Override + public ValueState getState(ValueStateDescriptor valueStateDescriptor) { + String name = valueStateDescriptor.getName(); + valueStateMap.putIfAbsent(name, new MockValueState()); + return valueStateMap.get(name); + } + + @Override + @SuppressWarnings("unchecked") + public ListState getListState(ListStateDescriptor stateDescriptor) { + String name = stateDescriptor.getName(); + currentStateMap.putIfAbsent(name, new TestUtils.MockListState()); + return currentStateMap.get(name); + } + + @Override + public ReducingState getReducingState(ReducingStateDescriptor reducingStateDescriptor) { + return null; + } + + @Override + public AggregatingState getAggregatingState(AggregatingStateDescriptor aggregatingStateDescriptor) { + return null; + } + + @Override + @SuppressWarnings("unchecked") + public MapState getMapState(MapStateDescriptor mapStateDescriptor) { + return this.mapState; + } + + @Override + public ListState getUnionListState(ListStateDescriptor stateDescriptor) throws Exception { + throw new UnsupportedOperationException(); + } + + @Override + public Set 
getRegisteredStateNames() { + throw new UnsupportedOperationException(); + } + + @Override + public Set getRegisteredBroadcastStateNames() { + throw new UnsupportedOperationException(); + } + + public void checkpointBegin(long checkpointId) { + Map copiedStates = Collections.unmodifiableMap(copyStates(currentStateMap)); + historyStateMap.put(checkpointId, copiedStates); + } + + public void checkpointSuccess(long checkpointId) { + lastSuccessStateMap = historyStateMap.get(checkpointId); + } + + public void rollBackToLastSuccessCheckpoint() { + this.currentStateMap = copyStates(lastSuccessStateMap); + } + + @SuppressWarnings("unchecked") + private Map copyStates(Map stateMap) { + Map copiedStates = new HashMap<>(); + for (Map.Entry entry : stateMap.entrySet()) { + TestUtils.MockListState copiedState = new TestUtils.MockListState(); + copiedState.addAll(entry.getValue().getBackingList()); + copiedStates.put(entry.getKey(), copiedState); + } + return copiedStates; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStateInitializationContext.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStateInitializationContext.java new file mode 100644 index 0000000000000..945d1bbbe75f5 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStateInitializationContext.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.adapter.StateInitializationContextAdapter; + +import org.apache.flink.api.common.state.KeyedStateStore; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider; +import org.apache.flink.runtime.state.StatePartitionStreamProvider; + +/** + * A {@link FunctionInitializationContext} for testing purpose. 
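The interesting part of `MockOperatorStateStore` above is its checkpoint bookkeeping: `checkpointBegin` copies the current list states under a checkpoint id, `checkpointSuccess` marks that copy as the last successful one, and `rollBackToLastSuccessCheckpoint` restores it, which is what lets the harnesses simulate failure and recovery. A short sketch of that contract, inside a test method that declares `throws Exception`:

```java
// Sketch: snapshot, succeed, mutate, then roll back to the last successful snapshot.
MockOperatorStateStore stateStore = new MockOperatorStateStore();
ListState<String> instants = stateStore.getListState(new ListStateDescriptor<>("instants", String.class));

instants.add("instant-1");
stateStore.checkpointBegin(1L);                 // copy current states under checkpoint 1
stateStore.checkpointSuccess(1L);               // checkpoint 1 becomes the last successful one

instants.add("instant-2");                      // state mutated after the successful checkpoint
stateStore.rollBackToLastSuccessCheckpoint();   // current states reset to the checkpoint-1 copy
```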
+ */ +public class MockStateInitializationContext implements StateInitializationContextAdapter { + + private final MockOperatorStateStore operatorStateStore; + + public MockStateInitializationContext() { + operatorStateStore = new MockOperatorStateStore(); + } + + @Override + public boolean isRestored() { + return false; + } + + @Override + public MockOperatorStateStore getOperatorStateStore() { + return operatorStateStore; + } + + @Override + public KeyedStateStore getKeyedStateStore() { + return operatorStateStore; + } + + @Override + public Iterable getRawOperatorStateInputs() { + return null; + } + + @Override + public Iterable getRawKeyedStateInputs() { + return null; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStreamingRuntimeContext.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStreamingRuntimeContext.java new file mode 100644 index 0000000000000..7c5b79700e434 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStreamingRuntimeContext.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.adapter.StreamingRuntimeContextAdapter; + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.state.KeyedStateStore; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; +import org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService; + +import java.util.HashMap; + +/** + * Mock {@link StreamingRuntimeContext} to use in tests. + * + *
<p>
    NOTE: Adapted from Apache Flink, the MockStreamOperator is modified to support MapState. + */ +public class MockStreamingRuntimeContext extends StreamingRuntimeContextAdapter { + + private final boolean isCheckpointingEnabled; + + private final int numParallelSubtasks; + private final int subtaskIndex; + + public MockStreamingRuntimeContext( + boolean isCheckpointingEnabled, + int numParallelSubtasks, + int subtaskIndex) { + + this(isCheckpointingEnabled, numParallelSubtasks, subtaskIndex, new MockEnvironmentBuilder() + .setTaskName("mockTask") + .setManagedMemorySize(4 * MemoryManager.DEFAULT_PAGE_SIZE) + .build()); + } + + public MockStreamingRuntimeContext( + boolean isCheckpointingEnabled, + int numParallelSubtasks, + int subtaskIndex, + MockEnvironment environment) { + + super(new MockStreamOperator(), environment, new HashMap<>()); + + this.isCheckpointingEnabled = isCheckpointingEnabled; + this.numParallelSubtasks = numParallelSubtasks; + this.subtaskIndex = subtaskIndex; + } + + @Override + public boolean isCheckpointingEnabled() { + return isCheckpointingEnabled; + } + + @Override + public int getIndexOfThisSubtask() { + return subtaskIndex; + } + + @Override + public int getNumberOfParallelSubtasks() { + return numParallelSubtasks; + } + + private static class MockStreamOperator extends AbstractStreamOperator { + private static final long serialVersionUID = -1153976702711944427L; + + private transient TestProcessingTimeService testProcessingTimeService; + + private transient MockOperatorStateStore mockOperatorStateStore; + + @Override + public ExecutionConfig getExecutionConfig() { + return new ExecutionConfig(); + } + + @Override + public OperatorID getOperatorID() { + return new OperatorID(); + } + + @Override + public ProcessingTimeService getProcessingTimeService() { + if (testProcessingTimeService == null) { + testProcessingTimeService = new TestProcessingTimeService(); + } + return testProcessingTimeService; + } + + @Override + public KeyedStateStore getKeyedStateStore() { + if (mockOperatorStateStore == null) { + mockOperatorStateStore = new MockOperatorStateStore(); + } + return mockOperatorStateStore; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockValueState.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockValueState.java new file mode 100644 index 0000000000000..eac3e7a3db4b8 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockValueState.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.flink.api.common.state.ValueState; + +/** + * Mock value state for testing. 
+ * + * @param Type of state value + */ +public class MockValueState implements ValueState { + private V v = null; + + @Override + public V value() { + return v; + } + + @Override + public void update(V value) { + this.v = value; + } + + @Override + public void clear() { + v = null; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java new file mode 100644 index 0000000000000..db8ff36962b34 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.StreamWriteFunction; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; +import org.apache.hudi.sink.bootstrap.BootstrapOperator; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.sink.partitioner.BucketAssignFunction; +import org.apache.hudi.sink.transform.RowDataToHoodieFunction; +import org.apache.hudi.utils.TestConfigurations; + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.io.disk.iomanager.IOManager; +import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; +import org.apache.flink.streaming.api.graph.StreamConfig; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.api.operators.collect.utils.MockFunctionSnapshotContext; +import org.apache.flink.streaming.api.operators.collect.utils.MockOperatorEventGateway; +import org.apache.flink.streaming.util.MockStreamTask; +import org.apache.flink.streaming.util.MockStreamTaskBuilder; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Collector; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; + 
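// ---------------------------------------------------------------------------------
// Editor's illustration (not part of the patch): a minimal sketch, assuming the
// wrapper API defined below, of how a test could drive StreamWriteFunctionWrapper
// through one write/checkpoint cycle. openFunction/invoke/checkpointFunction/
// getCoordinator/getNextEvent/checkpointComplete/close are taken from this class;
// tempDir and the use of TestData.DATA_SET_INSERT as the RowData input are
// hypothetical placeholders borrowed from other tests in this patch.
//
//   StreamWriteFunctionWrapper<RowData> wrapper =
//       new StreamWriteFunctionWrapper<>(tempDir.getAbsolutePath());
//   wrapper.openFunction();                 // starts the coordinator, opens the chained functions
//   for (RowData row : TestData.DATA_SET_INSERT) {
//     wrapper.invoke(row);                  // RowData -> HoodieRecord -> bucket assignment -> write buffer
//   }
//   wrapper.checkpointFunction(1L);         // flushes buffered records, emits a WriteMetadataEvent
//   wrapper.getCoordinator().handleEventFromOperator(0, wrapper.getNextEvent());
//   wrapper.checkpointComplete(1L);         // the coordinator commits the instant
//   wrapper.close();
// ---------------------------------------------------------------------------------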
+/** + * A wrapper class to manipulate the {@link StreamWriteFunction} instance for testing. + * + * @param Input type + */ +public class StreamWriteFunctionWrapper implements TestFunctionWrapper { + private final Configuration conf; + + private final IOManager ioManager; + private final StreamingRuntimeContext runtimeContext; + private final MockOperatorEventGateway gateway; + private final MockOperatorCoordinatorContext coordinatorContext; + private final StreamWriteOperatorCoordinator coordinator; + private final MockStateInitializationContext stateInitializationContext; + + /** + * Function that converts row data to HoodieRecord. + */ + private RowDataToHoodieFunction> toHoodieFunction; + /** + * Function that load index in state. + */ + private BootstrapOperator, HoodieRecord> bootstrapOperator; + /** + * Function that assigns bucket ID. + */ + private BucketAssignFunction, HoodieRecord> bucketAssignerFunction; + /** + * BucketAssignOperator context. + **/ + private final MockBucketAssignFunctionContext bucketAssignFunctionContext; + /** + * Stream write function. + */ + private StreamWriteFunction> writeFunction; + + private CompactFunctionWrapper compactFunctionWrapper; + + private final MockStreamTask streamTask; + + private final StreamConfig streamConfig; + + private final boolean asyncCompaction; + + public StreamWriteFunctionWrapper(String tablePath) throws Exception { + this(tablePath, TestConfigurations.getDefaultConf(tablePath)); + } + + public StreamWriteFunctionWrapper(String tablePath, Configuration conf) throws Exception { + this.ioManager = new IOManagerAsync(); + MockEnvironment environment = new MockEnvironmentBuilder() + .setTaskName("mockTask") + .setManagedMemorySize(4 * MemoryManager.DEFAULT_PAGE_SIZE) + .setIOManager(ioManager) + .build(); + this.runtimeContext = new MockStreamingRuntimeContext(false, 1, 0, environment); + this.gateway = new MockOperatorEventGateway(); + this.conf = conf; + // one function + this.coordinatorContext = new MockOperatorCoordinatorContext(new OperatorID(), 1); + this.coordinator = new StreamWriteOperatorCoordinator(conf, this.coordinatorContext); + this.bucketAssignFunctionContext = new MockBucketAssignFunctionContext(); + this.stateInitializationContext = new MockStateInitializationContext(); + this.asyncCompaction = OptionsResolver.needsAsyncCompaction(conf); + this.streamConfig = new StreamConfig(conf); + streamConfig.setOperatorID(new OperatorID()); + this.streamTask = new MockStreamTaskBuilder(environment) + .setConfig(new StreamConfig(conf)) + .setExecutionConfig(new ExecutionConfig().enableObjectReuse()) + .build(); + this.compactFunctionWrapper = new CompactFunctionWrapper(this.conf, this.streamTask, this.streamConfig); + } + + public void openFunction() throws Exception { + this.coordinator.start(); + this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); + toHoodieFunction = new RowDataToHoodieFunction<>(TestConfigurations.ROW_TYPE, conf); + toHoodieFunction.setRuntimeContext(runtimeContext); + toHoodieFunction.open(conf); + + bucketAssignerFunction = new BucketAssignFunction<>(conf); + bucketAssignerFunction.setRuntimeContext(runtimeContext); + bucketAssignerFunction.open(conf); + bucketAssignerFunction.initializeState(this.stateInitializationContext); + + if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) { + bootstrapOperator = new BootstrapOperator<>(conf); + CollectorOutput> output = new CollectorOutput<>(); + bootstrapOperator.setup(streamTask, streamConfig, output); + 
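// Editor's note: initializeState() below triggers the index bootstrap; the records the
// operator emits into the CollectorOutput are then replayed through the bucket assigner
// so its keyed state learns which record keys already exist in the table.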
bootstrapOperator.initializeState(this.stateInitializationContext); + + Collector> collector = ScalaCollector.getInstance(); + for (HoodieRecord bootstrapRecord : output.getRecords()) { + bucketAssignerFunction.processElement(bootstrapRecord, null, collector); + bucketAssignFunctionContext.setCurrentKey(bootstrapRecord.getRecordKey()); + } + } + + setupWriteFunction(); + + if (asyncCompaction) { + compactFunctionWrapper.openFunction(); + } + } + + public void invoke(I record) throws Exception { + HoodieRecord hoodieRecord = toHoodieFunction.map((RowData) record); + ScalaCollector> collector = ScalaCollector.getInstance(); + bucketAssignerFunction.processElement(hoodieRecord, null, collector); + bucketAssignFunctionContext.setCurrentKey(hoodieRecord.getRecordKey()); + writeFunction.processElement(collector.getVal(), null, null); + } + + public WriteMetadataEvent[] getEventBuffer() { + return this.coordinator.getEventBuffer(); + } + + public OperatorEvent getNextEvent() { + return this.gateway.getNextEvent(); + } + + public Map> getDataBuffer() { + return this.writeFunction.getDataBuffer(); + } + + public void checkpointFunction(long checkpointId) throws Exception { + // checkpoint the coordinator first + this.coordinator.checkpointCoordinator(checkpointId, new CompletableFuture<>()); + if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) { + bootstrapOperator.snapshotState(null); + } + bucketAssignerFunction.snapshotState(null); + + writeFunction.snapshotState(new MockFunctionSnapshotContext(checkpointId)); + stateInitializationContext.getOperatorStateStore().checkpointBegin(checkpointId); + } + + public void endInput() { + writeFunction.endInput(); + } + + public void checkpointComplete(long checkpointId) { + stateInitializationContext.getOperatorStateStore().checkpointSuccess(checkpointId); + coordinator.notifyCheckpointComplete(checkpointId); + this.bucketAssignerFunction.notifyCheckpointComplete(checkpointId); + if (asyncCompaction) { + try { + compactFunctionWrapper.compact(checkpointId); + } catch (Exception e) { + throw new HoodieException(e); + } + } + } + + public void checkpointFails(long checkpointId) { + coordinator.notifyCheckpointAborted(checkpointId); + } + + public void subTaskFails(int taskID) throws Exception { + coordinator.subtaskFailed(taskID, new RuntimeException("Dummy exception")); + setupWriteFunction(); + } + + public void close() throws Exception { + coordinator.close(); + ioManager.close(); + bucketAssignerFunction.close(); + writeFunction.close(); + if (compactFunctionWrapper != null) { + compactFunctionWrapper.close(); + } + } + + public StreamWriteOperatorCoordinator getCoordinator() { + return coordinator; + } + + public MockOperatorCoordinatorContext getCoordinatorContext() { + return coordinatorContext; + } + + public boolean isKeyInState(HoodieKey hoodieKey) { + return this.bucketAssignFunctionContext.isKeyInState(hoodieKey.getRecordKey()); + } + + public boolean isConforming() { + return this.writeFunction.isConfirming(); + } + + public boolean isAlreadyBootstrap() throws Exception { + return this.bootstrapOperator.isAlreadyBootstrap(); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void setupWriteFunction() throws Exception { + writeFunction = new StreamWriteFunction<>(conf); + writeFunction.setRuntimeContext(runtimeContext); + writeFunction.setOperatorEventGateway(gateway); + 
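// Editor's note: initializeState()/open() below make the write task emit its bootstrap
// event, which is handed to the coordinator right after (see "handle the bootstrap
// event") so the wrapper is ready to accept records again, e.g. after subTaskFails().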
writeFunction.initializeState(this.stateInitializationContext); + writeFunction.open(conf); + + // handle the bootstrap event + coordinator.handleEventFromOperator(0, getNextEvent()); + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + private static class MockBucketAssignFunctionContext { + private final Set updateKeys = new HashSet<>(); + + public void setCurrentKey(Object key) { + this.updateKeys.add(key); + } + + public boolean isKeyInState(String key) { + return this.updateKeys.contains(key); + } + } + + private static class ScalaCollector implements Collector { + private T val; + + public static ScalaCollector getInstance() { + return new ScalaCollector<>(); + } + + @Override + public void collect(T t) { + this.val = t; + } + + @Override + public void close() { + this.val = null; + } + + public T getVal() { + return val; + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java new file mode 100644 index 0000000000000..d2fe8196502c3 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.sink.StreamWriteOperatorCoordinator; +import org.apache.hudi.sink.event.WriteMetadataEvent; + +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +import java.util.List; +import java.util.Map; + +/** + * Define the common interfaces for test function wrappers. + */ +public interface TestFunctionWrapper { + /** + * Open all the functions within this wrapper. + */ + void openFunction() throws Exception; + + /** + * Process the given input record {@code record}. + */ + void invoke(I record) throws Exception; + + /** + * Returns the event buffer sent by the write tasks. + */ + WriteMetadataEvent[] getEventBuffer(); + + /** + * Returns the next event. + */ + OperatorEvent getNextEvent(); + + /** + * Snapshot all the functions in the wrapper. + */ + void checkpointFunction(long checkpointId) throws Exception; + + /** + * Mark checkpoint with id {code checkpointId} as success. + */ + void checkpointComplete(long checkpointId); + + /** + * Returns the operator coordinator. 
+ */ + StreamWriteOperatorCoordinator getCoordinator(); + + /** + * Returns the data buffer of the write task. + */ + default Map> getDataBuffer() { + throw new UnsupportedOperationException(); + } + + /** + * Mark checkpoint with id {code checkpointId} as failed. + */ + default void checkpointFails(long checkpointId) { + throw new UnsupportedOperationException(); + } + + /** + * Returns the context of the coordinator. + */ + default MockOperatorCoordinatorContext getCoordinatorContext() { + throw new UnsupportedOperationException(); + } + + /** + * Mark sub-task with id {@code taskId} as failed. + */ + default void subTaskFails(int taskId) throws Exception { + throw new UnsupportedOperationException(); + } + + /** + * Returns whether the given key {@code key} is in the state store. + */ + default boolean isKeyInState(HoodieKey key) { + throw new UnsupportedOperationException(); + } + + /** + * Returns whether the bootstrap function already bootstrapped. + */ + default boolean isAlreadyBootstrap() throws Exception { + throw new UnsupportedOperationException(); + } + + /** + * Returns whether the write task is confirming. + */ + default boolean isConforming() { + throw new UnsupportedOperationException(); + } + + /** + * Close this function wrapper. + */ + void close() throws Exception; +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestHiveSyncContext.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestHiveSyncContext.java new file mode 100644 index 0000000000000..ae30b39906e0b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestHiveSyncContext.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.hive.HiveSyncConfig; + +import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.Test; + +import java.util.Properties; + +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link HiveSyncContext}. + */ +public class TestHiveSyncContext { + /** + * Test partition path fields sync. 
+ */ + @Test + void testSyncedPartitions() { + Configuration configuration1 = new Configuration(); + Configuration configuration2 = new Configuration(); + String hiveSyncPartitionField = "hiveSyncPartitionField"; + String partitionPathField = "partitionPathField"; + + configuration1.setString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS, hiveSyncPartitionField); + configuration1.setString(FlinkOptions.PARTITION_PATH_FIELD, partitionPathField); + + configuration2.setString(FlinkOptions.PARTITION_PATH_FIELD, partitionPathField); + + Properties props1 = HiveSyncContext.buildSyncConfig(configuration1); + Properties props2 = HiveSyncContext.buildSyncConfig(configuration2); + + assertEquals(hiveSyncPartitionField, props1.getProperty(META_SYNC_PARTITION_FIELDS.key())); + assertEquals(partitionPathField, props2.getProperty(META_SYNC_PARTITION_FIELDS.key())); + } + + /** + * Test an option that has no shortcut key. + */ + @Test + void testOptionWithoutShortcutKey() { + Configuration configuration3 = new Configuration(); + configuration3.setBoolean(HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE.key(), true); + Properties props3 = HiveSyncContext.buildSyncConfig(configuration3); + assertTrue(Boolean.parseBoolean(props3.getProperty(HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE.key(), "false"))); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java new file mode 100644 index 0000000000000..b6ae0767d68a3 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java @@ -0,0 +1,417 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.sink.utils; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.event.WriteMetadataEvent; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestUtils; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.fs.FileSystem; +import org.hamcrest.MatcherAssert; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.instanceOf; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Base class for write test cases. + */ +public class TestWriteBase { + protected static final Map EXPECTED1 = new HashMap<>(); + + protected static final Map EXPECTED2 = new HashMap<>(); + + protected static final Map EXPECTED3 = new HashMap<>(); + + protected static final Map EXPECTED4 = new HashMap<>(); + + protected static final Map> EXPECTED5 = new HashMap<>(); + + static { + EXPECTED1.put("par1", "[id1,par1,id1,Danny,23,1,par1, id2,par1,id2,Stephen,33,2,par1]"); + EXPECTED1.put("par2", "[id3,par2,id3,Julian,53,3,par2, id4,par2,id4,Fabian,31,4,par2]"); + EXPECTED1.put("par3", "[id5,par3,id5,Sophia,18,5,par3, id6,par3,id6,Emma,20,6,par3]"); + EXPECTED1.put("par4", "[id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); + + EXPECTED2.put("par1", "[id1,par1,id1,Danny,24,1,par1, id2,par1,id2,Stephen,34,2,par1]"); + EXPECTED2.put("par2", "[id3,par2,id3,Julian,54,3,par2, id4,par2,id4,Fabian,32,4,par2]"); + EXPECTED2.put("par3", "[id5,par3,id5,Sophia,18,5,par3, id6,par3,id6,Emma,20,6,par3, " + + "id9,par3,id9,Jane,19,6,par3]"); + EXPECTED2.put("par4", "[id10,par4,id10,Ella,38,7,par4, id11,par4,id11,Phoebe,52,8,par4, " + + "id7,par4,id7,Bob,44,7,par4, id8,par4,id8,Han,56,8,par4]"); + + EXPECTED3.put("par1", "[id1,par1,id1,Danny,23,1,par1]"); + + EXPECTED4.put("par1", "[" + + "id1,par1,id1,Danny,23,0,par1, " + + "id1,par1,id1,Danny,23,1,par1, " + + "id1,par1,id1,Danny,23,2,par1, " + + "id1,par1,id1,Danny,23,3,par1, " + + "id1,par1,id1,Danny,23,4,par1]"); + + EXPECTED5.put("par1", Arrays.asList( + "id1,par1,id1,Danny,23,0,par1", + "id1,par1,id1,Danny,23,0,par1", + "id1,par1,id1,Danny,23,1,par1", + "id1,par1,id1,Danny,23,1,par1", + "id1,par1,id1,Danny,23,2,par1", + "id1,par1,id1,Danny,23,2,par1", + "id1,par1,id1,Danny,23,3,par1", + "id1,par1,id1,Danny,23,3,par1", + "id1,par1,id1,Danny,23,4,par1", + "id1,par1,id1,Danny,23,4,par1")); + } + + // ------------------------------------------------------------------------- + // 
Inner Class + // ------------------------------------------------------------------------- + + /** + * Utils to composite the test stages. + */ + public static class TestHarness { + public static TestHarness instance() { + return new TestHarness(); + } + + private File baseFile; + private String basePath; + private Configuration conf; + private TestFunctionWrapper pipeline; + + private String lastPending; + private String lastComplete; + + public TestHarness preparePipeline(File basePath, Configuration conf) throws Exception { + preparePipeline(basePath, conf, false); + return this; + } + + public TestHarness preparePipeline(File basePath, Configuration conf, boolean append) throws Exception { + this.baseFile = basePath; + this.basePath = this.baseFile.getAbsolutePath(); + this.conf = conf; + this.pipeline = append + ? new InsertFunctionWrapper<>(this.basePath, conf) + : new StreamWriteFunctionWrapper<>(this.basePath, conf); + // open the function and ingest data + this.pipeline.openFunction(); + return this; + } + + public TestHarness consume(List inputs) throws Exception { + for (RowData rowData : inputs) { + this.pipeline.invoke(rowData); + } + return this; + } + + public TestHarness assertConsumeThrows(List inputs, String message) { + assertThrows(HoodieException.class, () -> consume(inputs), message); + return this; + } + + /** + * Assert the event buffer is empty. + */ + public TestHarness emptyEventBuffer() { + assertTrue( + this.pipeline.getEventBuffer().length == 1 + && this.pipeline.getEventBuffer()[0] == null, + "The coordinator events buffer expect to be empty"); + return this; + } + + /** + * Assert the next event exists and handle over it to the coordinator. + */ + public TestHarness assertNextEvent() { + final OperatorEvent nextEvent = this.pipeline.getNextEvent(); + MatcherAssert.assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); + this.pipeline.getCoordinator().handleEventFromOperator(0, nextEvent); + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Assert the next event exists and handle over it to the coordinator. + * + * @param numWriteStatus The expected write status num reported by the event + * @param partitions The written partitions reported by the event + */ + public TestHarness assertNextEvent(int numWriteStatus, String partitions) { + final OperatorEvent nextEvent = this.pipeline.getNextEvent(); + MatcherAssert.assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); + List writeStatuses = ((WriteMetadataEvent) nextEvent).getWriteStatuses(); + assertNotNull(writeStatuses); + MatcherAssert.assertThat(writeStatuses.size(), is(numWriteStatus)); + assertThat(writeStatuses.stream() + .map(WriteStatus::getPartitionPath).sorted(Comparator.naturalOrder()) + .collect(Collectors.joining(",")), + is(partitions)); + this.pipeline.getCoordinator().handleEventFromOperator(0, nextEvent); + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Assert the next event exists and handle over it to the coordinator. + * + *
<p>
    Validates that the write metadata reported by the event is empty. + */ + public TestHarness assertEmptyEvent() { + final OperatorEvent nextEvent = this.pipeline.getNextEvent(); + MatcherAssert.assertThat("The operator expect to send an event", nextEvent, instanceOf(WriteMetadataEvent.class)); + List writeStatuses = ((WriteMetadataEvent) nextEvent).getWriteStatuses(); + assertNotNull(writeStatuses); + MatcherAssert.assertThat(writeStatuses.size(), is(0)); + this.pipeline.getCoordinator().handleEventFromOperator(0, nextEvent); + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Assert the data buffer with given number of buckets and records. + */ + public TestHarness assertDataBuffer(int numBuckets, int numRecords) { + Map> dataBuffer = this.pipeline.getDataBuffer(); + assertThat("Should have " + numBuckets + " data bucket", dataBuffer.size(), is(numBuckets)); + assertThat(numRecords + " records expect to flush out as a mini-batch", + dataBuffer.values().stream().findFirst().map(List::size).orElse(-1), + is(numRecords)); + return this; + } + + /** + * Checkpoints the pipeline, which triggers the data write and event send. + */ + public TestHarness checkpoint(long checkpointId) throws Exception { + this.pipeline.checkpointFunction(checkpointId); + return this; + } + + public TestHarness allDataFlushed() { + Map> dataBuffer = this.pipeline.getDataBuffer(); + assertThat("All data should be flushed out", dataBuffer.size(), is(0)); + return this; + } + + /** + * Handle the next {@code numEvents} events and handle over them to the coordinator. + */ + public TestHarness handleEvents(int numEvents) { + for (int i = 0; i < numEvents; i++) { + final OperatorEvent event = this.pipeline.getNextEvent(); // remove the first event first + assertThat("The operator expect to send an event", event, instanceOf(WriteMetadataEvent.class)); + this.pipeline.getCoordinator().handleEventFromOperator(0, event); + } + assertNotNull(this.pipeline.getEventBuffer()[0], "The coordinator missed the event"); + return this; + } + + /** + * Mark the checkpoint with id {@code checkpointId} as finished. + */ + public TestHarness checkpointComplete(long checkpointId) { + this.lastPending = lastPendingInstant(); + this.pipeline.checkpointComplete(checkpointId); + // started a new instant already + checkInflightInstant(); + checkInstantState(HoodieInstant.State.COMPLETED, lastPending); + this.lastComplete = lastPending; + this.lastPending = lastPendingInstant(); // refresh last pending instant + return this; + } + + /** + * Mark the checkpoint finished with empty write metadata. + */ + public TestHarness emptyCheckpoint(long checkpointId) { + String lastPending = lastPendingInstant(); + this.pipeline.checkpointComplete(checkpointId); + // last pending instant was reused + assertEquals(this.lastPending, lastPending); + checkInstantState(HoodieInstant.State.COMPLETED, lastComplete); + return this; + } + + /** + * Mark the checkpoint with id {@code checkpointId} as failed. 
+ */ + public TestHarness checkpointFails(long checkpointId) { + this.pipeline.checkpointFails(checkpointId); + assertFalse(this.pipeline.getCoordinatorContext().isJobFailed(), + "The last checkpoint was aborted, ignore the events"); + // no complete instant + checkInstantState(HoodieInstant.State.COMPLETED, null); + return this; + } + + public TestHarness checkpointThrows(long checkpointId, String message) { + // this returns early because there is no inflight instant + assertThrows(HoodieException.class, () -> checkpoint(checkpointId), message); + return this; + } + + /** + * Mark the task with id {@code taskId} as failed. + */ + public TestHarness subTaskFails(int taskId) throws Exception { + // fails the subtask + String instant1 = lastPendingInstant(); + this.pipeline.subTaskFails(taskId); + + String instant2 = lastPendingInstant(); + assertNotEquals(instant2, instant1, "The previous instant should be rolled back when starting new instant"); + return this; + } + + public TestHarness noCompleteInstant() { + // no complete instant + checkInstantState(HoodieInstant.State.COMPLETED, null); + return this; + } + + /** + * Asserts the data files are empty. + */ + public TestHarness assertEmptyDataFiles() { + File[] dataFiles = baseFile.listFiles(file -> !file.getName().startsWith(".")); + assertNotNull(dataFiles); + assertThat(dataFiles.length, is(0)); + return this; + } + + public TestHarness checkWrittenData(Map expected) throws Exception { + checkWrittenData(expected, 4); + return this; + } + + public TestHarness checkWrittenData( + Map expected, + int partitions) throws Exception { + if (OptionsResolver.isCowTable(conf)) { + TestData.checkWrittenData(this.baseFile, expected, partitions); + } else { + checkWrittenDataMor(baseFile, expected, partitions); + } + return this; + } + + private void checkWrittenDataMor(File baseFile, Map expected, int partitions) throws Exception { + FileSystem fs = FSUtils.getFs(basePath, new org.apache.hadoop.conf.Configuration()); + TestData.checkWrittenDataMOR(fs, baseFile, expected, partitions); + } + + public TestHarness checkWrittenDataCOW(Map> expected) throws IOException { + TestData.checkWrittenDataCOW(this.baseFile, expected); + return this; + } + + public TestHarness checkWrittenAllData(Map expected, int partitions) throws IOException { + TestData.checkWrittenAllData(baseFile, expected, partitions); + return this; + } + + public TestHarness checkIndexLoaded(HoodieKey... 
keys) { + for (HoodieKey key : keys) { + assertTrue(this.pipeline.isKeyInState(key), + "Key: " + key + " assumes to be in the index state"); + } + return this; + } + + public TestHarness assertBootstrapped() throws Exception { + assertTrue(this.pipeline.isAlreadyBootstrap()); + return this; + } + + public TestHarness assertConfirming() { + assertTrue(this.pipeline.isConforming(), + "The write function should be waiting for the instant to commit"); + return this; + } + + public TestHarness assertNotConfirming() { + assertFalse(this.pipeline.isConforming(), + "The write function should finish waiting for the instant to commit"); + return this; + } + + public void end() throws Exception { + this.pipeline.close(); + } + + private String lastPendingInstant() { + return TestUtils.getLastPendingInstant(basePath); + } + + private void checkInflightInstant() { + final String instant = TestUtils.getLastPendingInstant(basePath); + assertNotNull(instant); + } + + private void checkInstantState(HoodieInstant.State state, String instantStr) { + final String instant; + switch (state) { + case REQUESTED: + instant = lastPendingInstant(); + break; + case COMPLETED: + instant = lastCompleteInstant(); + break; + default: + throw new AssertionError("Unexpected state"); + } + assertThat(instant, is(instantStr)); + } + + protected String lastCompleteInstant() { + return OptionsResolver.isMorTable(conf) + ? TestUtils.getLastDeltaCompleteInstant(basePath) + : TestUtils.getLastCompleteInstant(basePath, HoodieTimeline.COMMIT_ACTION); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java new file mode 100644 index 0000000000000..f4918a0fe42be --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestFileIndex.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source; + +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; + +import org.apache.flink.configuration.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.configuration.FlinkOptions.HIVE_STYLE_PARTITIONING; +import static org.apache.hudi.configuration.FlinkOptions.KEYGEN_CLASS_NAME; +import static org.apache.hudi.configuration.FlinkOptions.METADATA_ENABLED; +import static org.apache.hudi.configuration.FlinkOptions.PARTITION_DEFAULT_NAME; +import static org.apache.hudi.configuration.FlinkOptions.PARTITION_PATH_FIELD; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link FileIndex}. + */ +public class TestFileIndex { + @TempDir + File tempFile; + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testFileListingUsingMetadata(boolean hiveStylePartitioning) throws Exception { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setBoolean(METADATA_ENABLED, true); + conf.setBoolean(HIVE_STYLE_PARTITIONING, hiveStylePartitioning); + TestData.writeData(TestData.DATA_SET_INSERT, conf); + FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE); + List partitionKeys = Collections.singletonList("partition"); + List> partitions = fileIndex.getPartitions(partitionKeys, PARTITION_DEFAULT_NAME.defaultValue(), hiveStylePartitioning); + assertTrue(partitions.stream().allMatch(m -> m.size() == 1)); + String partitionPaths = partitions.stream() + .map(Map::values).flatMap(Collection::stream).sorted().collect(Collectors.joining(",")); + assertThat("should have 4 partitions", partitionPaths, is("par1,par2,par3,par4")); + + FileStatus[] fileStatuses = fileIndex.getFilesInPartitions(); + assertThat(fileStatuses.length, is(4)); + assertTrue(Arrays.stream(fileStatuses) + .allMatch(fileStatus -> fileStatus.getPath().toString().endsWith(HoodieFileFormat.PARQUET.getFileExtension()))); + } + + @Test + void testFileListingUsingMetadataNonPartitionedTable() throws Exception { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(PARTITION_PATH_FIELD, ""); + conf.setString(KEYGEN_CLASS_NAME, NonpartitionedAvroKeyGenerator.class.getName()); + conf.setBoolean(METADATA_ENABLED, true); + TestData.writeData(TestData.DATA_SET_INSERT, conf); + FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE); + List partitionKeys = Collections.singletonList(""); + List> partitions = fileIndex.getPartitions(partitionKeys, PARTITION_DEFAULT_NAME.defaultValue(), false); + assertThat(partitions.size(), is(0)); + + FileStatus[] fileStatuses = fileIndex.getFilesInPartitions(); + assertThat(fileStatuses.length, is(1)); + 
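// Editor's note: with no partition path field the data files sit directly under the
// table base path, so the single base file written above is expected to be a
// Parquet file (asserted next).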
assertTrue(fileStatuses[0].getPath().toString().endsWith(HoodieFileFormat.PARQUET.getFileExtension())); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testFileListingEmptyTable(boolean enableMetadata) { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setBoolean(METADATA_ENABLED, enableMetadata); + FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE); + List partitionKeys = Collections.singletonList("partition"); + List> partitions = fileIndex.getPartitions(partitionKeys, PARTITION_DEFAULT_NAME.defaultValue(), false); + assertThat(partitions.size(), is(0)); + + FileStatus[] fileStatuses = fileIndex.getFilesInPartitions(); + assertThat(fileStatuses.length, is(0)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java new file mode 100644 index 0000000000000..b42fd2c04a3c0 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source; + +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.utils.TestConfigurations; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; + +/** + * Test cases for {@link IncrementalInputSplits}. 
+ */ +public class TestIncrementalInputSplits extends HoodieCommonTestHarness { + + @BeforeEach + private void init() throws IOException { + initPath(); + initMetaClient(); + } + + @Test + void testFilterInstantsWithRange() { + HoodieActiveTimeline timeline = new HoodieActiveTimeline(metaClient, true); + Configuration conf = TestConfigurations.getDefaultConf(basePath); + IncrementalInputSplits iis = IncrementalInputSplits.builder() + .conf(conf) + .path(new Path(basePath)) + .rowType(TestConfigurations.ROW_TYPE) + .build(); + + HoodieInstant commit1 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "1"); + HoodieInstant commit2 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "2"); + HoodieInstant commit3 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "3"); + timeline.createNewInstant(commit1); + timeline.createNewInstant(commit2); + timeline.createNewInstant(commit3); + timeline = timeline.reload(); + + // previous read iteration read till instant time "1", next read iteration should return ["2", "3"] + List instantRange2 = iis.filterInstantsWithRange(timeline, "1"); + assertEquals(2, instantRange2.size()); + assertIterableEquals(Arrays.asList(commit2, commit3), instantRange2); + + // simulate first iteration cycle with read from LATEST commit + List instantRange1 = iis.filterInstantsWithRange(timeline, null); + assertEquals(1, instantRange1.size()); + assertIterableEquals(Collections.singletonList(commit3), instantRange1); + + // specifying a start and end commit + conf.set(FlinkOptions.READ_START_COMMIT, "1"); + conf.set(FlinkOptions.READ_END_COMMIT, "3"); + List instantRange3 = iis.filterInstantsWithRange(timeline, null); + assertEquals(3, instantRange3.size()); + assertIterableEquals(Arrays.asList(commit1, commit2, commit3), instantRange3); + } + +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadMonitoringFunction.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadMonitoringFunction.java new file mode 100644 index 0000000000000..541890f7b05f4 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadMonitoringFunction.java @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestUtils; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.StreamSource; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link StreamReadMonitoringFunction}. + */ +public class TestStreamReadMonitoringFunction { + private static final long WAIT_TIME_MILLIS = 5 * 1000L; + + private Configuration conf; + + @TempDir + File tempFile; + + @BeforeEach + public void before() throws Exception { + final String basePath = tempFile.getAbsolutePath(); + conf = TestConfigurations.getDefaultConf(basePath); + conf.setString(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + conf.setInteger(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2); // check every 2 seconds + + StreamerUtil.initTableIfNotExists(conf); + } + + @Test + public void testConsumeFromLatestCommit() throws Exception { + // write 2 commits first, and all the splits should come from the second commit. + TestData.writeData(TestData.DATA_SET_INSERT, conf); + TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); + StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + CountDownLatch latch = new CountDownLatch(4); + CollectingSourceContext sourceContext = new CollectingSourceContext(latch); + + runAsync(sourceContext, function); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + assertThat("Should produce the expected splits", + sourceContext.getPartitionPaths(), is("par1,par2,par3,par4")); + + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()), + "All the instants should have range limit"); + String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(latestCommit)), + "All the splits should be with latestCommit instant time"); + + // Stop the stream task. 
+ function.close(); + } + } + + @Test + public void testConsumeFromLastCommit() throws Exception { + TestData.writeData(TestData.DATA_SET_INSERT, conf); + StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + CountDownLatch latch = new CountDownLatch(4); + CollectingSourceContext sourceContext = new CollectingSourceContext(latch); + + runAsync(sourceContext, function); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + assertThat("Should produce the expected splits", + sourceContext.getPartitionPaths(), is("par1,par2,par3,par4")); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()), + "All instants should have range limit"); + + Thread.sleep(1000L); + + // reset the source context + latch = new CountDownLatch(4); + sourceContext.reset(latch); + + // write another instant and validate + TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + assertThat("Should produce the expected splits", + sourceContext.getPartitionPaths(), is("par1,par2,par3,par4")); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()), + "All the instants should have range limit"); + + // Stop the stream task. + function.close(); + } + } + + @Test + public void testConsumeFromSpecifiedCommit() throws Exception { + // write 2 commits first, use the second commit time as the specified start instant, + // all the splits should come from the second commit. + TestData.writeData(TestData.DATA_SET_INSERT, conf); + TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); + String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.READ_START_COMMIT, specifiedCommit); + StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + CountDownLatch latch = new CountDownLatch(4); + CollectingSourceContext sourceContext = new CollectingSourceContext(latch); + + runAsync(sourceContext, function); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + assertThat("Should produce the expected splits", + sourceContext.getPartitionPaths(), is("par1,par2,par3,par4")); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()), + "All the instants should have range limit"); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(specifiedCommit)), + "All the splits should be with specified instant time"); + + // Stop the stream task. + function.close(); + } + } + + @Test + public void testConsumeFromEarliestCommit() throws Exception { + // write 2 commits first, then specify the start commit as 'earliest', + // all the splits should come from the earliest commit. 
+ TestData.writeData(TestData.DATA_SET_INSERT, conf); + TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); + String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST); + StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + CountDownLatch latch = new CountDownLatch(4); + CollectingSourceContext sourceContext = new CollectingSourceContext(latch); + + runAsync(sourceContext, function); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + assertThat("Should produce the expected splits", + sourceContext.getPartitionPaths(), is("par1,par2,par3,par4")); + assertTrue(sourceContext.splits.stream().noneMatch(split -> split.getInstantRange().isPresent()), + "No instants should have range limit"); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(specifiedCommit)), + "All the splits should be with specified instant time"); + + // Stop the stream task. + function.close(); + } + } + + @Test + public void testCheckpointRestore() throws Exception { + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf); + OperatorSubtaskState state; + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + CountDownLatch latch = new CountDownLatch(4); + CollectingSourceContext sourceContext = new CollectingSourceContext(latch); + runAsync(sourceContext, function); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + Thread.sleep(1000L); + + state = harness.snapshot(1, 1); + + // Stop the stream task. + function.close(); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + assertThat("Should produce the expected splits", + sourceContext.getPartitionPaths(), is("par1,par2,par3,par4")); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()), + "All instants should have range limit"); + + } + + TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); + StreamReadMonitoringFunction function2 = TestUtils.getMonitorFunc(conf); + try (AbstractStreamOperatorTestHarness harness = createHarness(function2)) { + harness.setup(); + // Recover to process the remaining snapshots. + harness.initializeState(state); + harness.open(); + + CountDownLatch latch = new CountDownLatch(4); + CollectingSourceContext sourceContext = new CollectingSourceContext(latch); + runAsync(sourceContext, function2); + + // Stop the stream task. 
+ function.close(); + + assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation"); + assertThat("Should produce the expected splits", + sourceContext.getPartitionPaths(), is("par1,par2,par3,par4")); + assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()), + "All the instants should have range limit"); + } + } + + private AbstractStreamOperatorTestHarness createHarness( + StreamReadMonitoringFunction function) throws Exception { + StreamSource streamSource = new StreamSource<>(function); + return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); + } + + private void runAsync( + CollectingSourceContext sourceContext, + StreamReadMonitoringFunction function) { + Thread task = new Thread(() -> { + try { + function.run(sourceContext); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + task.start(); + } + + /** + * Source context that collects the outputs in to a list. + */ + private static class CollectingSourceContext implements SourceFunction.SourceContext { + private final List splits = new ArrayList<>(); + private final Object checkpointLock = new Object(); + private volatile CountDownLatch latch; + + CollectingSourceContext(CountDownLatch latch) { + this.latch = latch; + } + + @Override + public void collect(MergeOnReadInputSplit element) { + splits.add(element); + latch.countDown(); + } + + @Override + public void collectWithTimestamp(MergeOnReadInputSplit element, long timestamp) { + collect(element); + } + + @Override + public void emitWatermark(Watermark mark) { + + } + + @Override + public void markAsTemporarilyIdle() { + + } + + @Override + public Object getCheckpointLock() { + return checkpointLock; + } + + @Override + public void close() { + + } + + public void reset(CountDownLatch latch) { + this.latch = latch; + this.splits.clear(); + } + + public String getPartitionPaths() { + return this.splits.stream() + .map(TestUtils::getSplitPartitionPath) + .sorted(Comparator.naturalOrder()) + .collect(Collectors.joining(",")); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java new file mode 100644 index 0000000000000..63d5c1f6bdbf1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java @@ -0,0 +1,290 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.hudi.table.format.mor.MergeOnReadTableState; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestUtils; + +import org.apache.avro.Schema; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.TimeCharacteristic; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor; +import org.apache.flink.streaming.runtime.tasks.mailbox.MailboxDefaultAction; +import org.apache.flink.streaming.runtime.tasks.mailbox.SteppingMailboxProcessor; +import org.apache.flink.streaming.util.CollectingSourceContext; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.configuration.FlinkOptions.PARTITION_DEFAULT_NAME; +import static org.apache.hudi.configuration.FlinkOptions.TABLE_TYPE; +import static org.apache.hudi.configuration.FlinkOptions.TABLE_TYPE_MERGE_ON_READ; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link StreamReadOperator}. 
+ */
+public class TestStreamReadOperator {
+  private static final Map EXPECTED = new HashMap<>();
+
+  static {
+    EXPECTED.put("par1", "+I[id1, Danny, 23, 1970-01-01T00:00:00.001, par1], +I[id2, Stephen, 33, 1970-01-01T00:00:00.002, par1]");
+    EXPECTED.put("par2", "+I[id3, Julian, 53, 1970-01-01T00:00:00.003, par2], +I[id4, Fabian, 31, 1970-01-01T00:00:00.004, par2]");
+    EXPECTED.put("par3", "+I[id5, Sophia, 18, 1970-01-01T00:00:00.005, par3], +I[id6, Emma, 20, 1970-01-01T00:00:00.006, par3]");
+    EXPECTED.put("par4", "+I[id7, Bob, 44, 1970-01-01T00:00:00.007, par4], +I[id8, Han, 56, 1970-01-01T00:00:00.008, par4]");
+  }
+
+  private Configuration conf;
+
+  @TempDir
+  File tempFile;
+
+  @BeforeEach
+  public void before() throws Exception {
+    final String basePath = tempFile.getAbsolutePath();
+    conf = TestConfigurations.getDefaultConf(basePath);
+    conf.setString(TABLE_TYPE, TABLE_TYPE_MERGE_ON_READ);
+
+    StreamerUtil.initTableIfNotExists(conf);
+  }
+
+  @Test
+  void testWriteRecords() throws Exception {
+    TestData.writeData(TestData.DATA_SET_INSERT, conf);
+    try (OneInputStreamOperatorTestHarness harness = createReader()) {
+      harness.setup();
+      harness.open();
+
+      SteppingMailboxProcessor processor = createLocalMailbox(harness);
+      StreamReadMonitoringFunction func = TestUtils.getMonitorFunc(conf);
+
+      List splits = generateSplits(func);
+      assertThat("Should have 4 splits", splits.size(), is(4));
+      for (MergeOnReadInputSplit split : splits) {
+        // Process this element to enqueue to mail-box.
+        harness.processElement(split, -1);
+
+        // Run the mail-box once to read all records from the given split.
+        assertThat("Should process 1 split", processor.runMailboxStep());
+      }
+      // Assert the output has expected elements.
+      TestData.assertRowDataEquals(harness.extractOutputValues(), TestData.DATA_SET_INSERT);
+
+      TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
+      final List splits2 = generateSplits(func);
+      assertThat("Should have 4 splits", splits2.size(), is(4));
+      for (MergeOnReadInputSplit split : splits2) {
+        // Process this element to enqueue to mail-box.
+        harness.processElement(split, -1);
+
+        // Run the mail-box once to read all records from the given split.
+        assertThat("Should process 1 split", processor.runMailboxStep());
+      }
+      // The result set behaves like append-only: DATA_SET_INSERT + DATA_SET_UPDATE_INSERT.
+      List expected = new ArrayList<>(TestData.DATA_SET_INSERT);
+      expected.addAll(TestData.DATA_SET_UPDATE_INSERT);
+      TestData.assertRowDataEquals(harness.extractOutputValues(), expected);
+    }
+  }
+
+  @Test
+  public void testCheckpoint() throws Exception {
+    // Receives the emitted splits: split0, split1, split2, split3; the checkpoint request is triggered
+    // while reading records from split0.
+    TestData.writeData(TestData.DATA_SET_INSERT, conf);
+    long timestamp = 0;
+    try (OneInputStreamOperatorTestHarness harness = createReader()) {
+      harness.setup();
+      harness.open();
+
+      SteppingMailboxProcessor processor = createLocalMailbox(harness);
+      StreamReadMonitoringFunction func = TestUtils.getMonitorFunc(conf);
+
+      List splits = generateSplits(func);
+      assertThat("Should have 4 splits", splits.size(), is(4));
+
+      for (MergeOnReadInputSplit split : splits) {
+        harness.processElement(split, ++timestamp);
+      }
+
+      // Trigger the snapshot state; it will start to work once all records from split0 are read.
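+      // Each runMailboxStep() below processes a single mail from the operator's mailbox: either one split read or the snapshot action enqueued next.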
+      processor.getMainMailboxExecutor()
+          .execute(() -> harness.snapshot(1, 3), "Trigger snapshot");
+
+      assertTrue(processor.runMailboxStep(), "Should have processed the split0");
+      assertTrue(processor.runMailboxStep(), "Should have processed the snapshot state action");
+
+      assertThat(TestData.rowDataToString(harness.extractOutputValues()),
+          is(getSplitExpected(Collections.singletonList(splits.get(0)), EXPECTED)));
+
+      // Read records from split1.
+      assertTrue(processor.runMailboxStep(), "Should have processed the split1");
+
+      // Read records from split2.
+      assertTrue(processor.runMailboxStep(), "Should have processed the split2");
+
+      // Read records from split3.
+      assertTrue(processor.runMailboxStep(), "Should have processed the split3");
+
+      // Assert the output has expected elements.
+      TestData.assertRowDataEquals(harness.extractOutputValues(), TestData.DATA_SET_INSERT);
+    }
+  }
+
+  @Test
+  public void testCheckpointRestore() throws Exception {
+    TestData.writeData(TestData.DATA_SET_INSERT, conf);
+
+    OperatorSubtaskState state;
+    final List splits;
+    try (OneInputStreamOperatorTestHarness harness = createReader()) {
+      harness.setup();
+      harness.open();
+
+      StreamReadMonitoringFunction func = TestUtils.getMonitorFunc(conf);
+
+      splits = generateSplits(func);
+      assertThat("Should have 4 splits", splits.size(), is(4));
+
+      // Enqueue all the splits.
+      for (MergeOnReadInputSplit split : splits) {
+        harness.processElement(split, -1);
+      }
+
+      // Read all records from the first 2 splits.
+      SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
+      for (int i = 0; i < 2; i++) {
+        assertTrue(localMailbox.runMailboxStep(), "Should have processed the split#" + i);
+      }
+
+      assertThat(TestData.rowDataToString(harness.extractOutputValues()),
+          is(getSplitExpected(splits.subList(0, 2), EXPECTED)));
+
+      // Snapshot the state now; there are 2 splits left in the state.
+      state = harness.snapshot(1, 1);
+    }
+
+    try (OneInputStreamOperatorTestHarness harness = createReader()) {
+      harness.setup();
+      // Recover to process the remaining splits.
+      harness.initializeState(state);
+      harness.open();
+
+      SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
+
+      for (int i = 2; i < 4; i++) {
+        assertTrue(localMailbox.runMailboxStep(), "Should have processed the split#" + i);
+      }
+
+      // Expect to output the remaining data.
+      assertThat(TestData.rowDataToString(harness.extractOutputValues()),
+          is(getSplitExpected(splits.subList(2, 4), EXPECTED)));
+    }
+  }
+
+  private static String getSplitExpected(List splits, Map expected) {
+    return splits.stream()
+        .map(TestUtils::getSplitPartitionPath)
+        .map(expected::get)
+        .sorted(Comparator.naturalOrder())
+        .collect(Collectors.toList()).toString();
+  }
+
+  private List generateSplits(StreamReadMonitoringFunction func) throws Exception {
+    final List splits = new ArrayList<>();
+    func.open(conf);
+    func.monitorDirAndForwardSplits(new CollectingSourceContext<>(new Object(), splits));
+    return splits;
+  }
+
+  private OneInputStreamOperatorTestHarness createReader() throws Exception {
+    final String basePath = tempFile.getAbsolutePath();
+    final org.apache.hadoop.conf.Configuration hadoopConf = HadoopConfigurations.getHadoopConf(new Configuration());
+    final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
+        .setConf(hadoopConf).setBasePath(basePath).build();
+    final List partitionKeys = Collections.singletonList("partition");
+
+    // This input format is used to open the emitted splits.
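+    // The reader's row type and the MergeOnReadTableState below are derived from the table's Avro schema resolved via the meta client.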
+ TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); + final Schema tableAvroSchema; + try { + tableAvroSchema = schemaResolver.getTableAvroSchema(); + } catch (Exception e) { + throw new HoodieException("Get table avro schema error", e); + } + final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema); + final RowType rowType = (RowType) rowDataType.getLogicalType(); + final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState( + rowType, + TestConfigurations.ROW_TYPE, + tableAvroSchema.toString(), + AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE).toString(), + Collections.emptyList(), + new String[0]); + MergeOnReadInputFormat inputFormat = MergeOnReadInputFormat.builder() + .config(conf) + .tableState(hoodieTableState) + .fieldTypes(rowDataType.getChildren()) + .defaultPartName(PARTITION_DEFAULT_NAME.defaultValue()).limit(1000L) + .emitDelete(true) + .build(); + + OneInputStreamOperatorFactory factory = StreamReadOperator.factory(inputFormat); + OneInputStreamOperatorTestHarness harness = new OneInputStreamOperatorTestHarness<>( + factory, 1, 1, 0); + harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); + + return harness; + } + + private SteppingMailboxProcessor createLocalMailbox( + OneInputStreamOperatorTestHarness harness) { + return new SteppingMailboxProcessor( + MailboxDefaultAction.Controller::suspendDefaultAction, + harness.getTaskMailbox(), + StreamTaskActionExecutor.IMMEDIATE); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestColumnStatsIndices.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestColumnStatsIndices.java new file mode 100644 index 0000000000000..837f419248636 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestColumnStatsIndices.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source.stats; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * Test cases for {@link ColumnStatsIndices}. + */ +public class TestColumnStatsIndices { + @TempDir + File tempFile; + + @Test + void testTransposeColumnStatsIndex() throws Exception { + final String path = tempFile.getAbsolutePath(); + Configuration conf = TestConfigurations.getDefaultConf(path); + conf.setBoolean(FlinkOptions.METADATA_ENABLED, true); + conf.setBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true); + conf.setString("hoodie.metadata.index.column.stats.enable", "true"); + + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() + .enable(true) + .withMetadataIndexColumnStats(true) + .build(); + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + // explicit query columns + String[] queryColumns1 = {"uuid", "age"}; + List indexRows1 = ColumnStatsIndices.readColumnStatsIndex(path, metadataConfig, queryColumns1); + Pair, String[]> transposedIndexTable1 = ColumnStatsIndices + .transposeColumnStatsIndex(indexRows1, queryColumns1, TestConfigurations.ROW_TYPE); + assertThat("The schema columns should sort by natural order", + Arrays.toString(transposedIndexTable1.getRight()), is("[age, uuid]")); + List transposed1 = filterOutFileNames(transposedIndexTable1.getLeft()); + assertThat(transposed1.size(), is(4)); + final String expected = "[" + + "+I(2,18,20,0,id5,id6,0), " + + "+I(2,23,33,0,id1,id2,0), " + + "+I(2,31,53,0,id3,id4,0), " + + "+I(2,44,56,0,id7,id8,0)]"; + assertThat(transposed1.toString(), is(expected)); + + // no query columns, only for tests + assertThrows(IllegalArgumentException.class, + () -> ColumnStatsIndices.readColumnStatsIndex(path, metadataConfig, new String[0])); + } + + private static List filterOutFileNames(List indexRows) { + return indexRows.stream().map(row -> { + GenericRowData gr = (GenericRowData) row; + GenericRowData converted = new GenericRowData(gr.getArity() - 1); + for (int i = 1; i < gr.getArity(); i++) { + converted.setField(i - 1, gr.getField(i)); + } + return converted; + }) + // sort by age min values + .sorted(Comparator.comparingInt(r -> r.getInt(1))) + .collect(Collectors.toList()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestExpressionEvaluator.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestExpressionEvaluator.java new file mode 100644 index 0000000000000..4ad286b780aee --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/stats/TestExpressionEvaluator.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source.stats; + +import org.apache.hudi.utils.TestData; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ValueLiteralExpression; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link ExpressionEvaluator}. + */ +public class TestExpressionEvaluator { + private static final DataType ROW_DATA_TYPE = DataTypes.ROW( + DataTypes.FIELD("f_tinyint", DataTypes.TINYINT()), + DataTypes.FIELD("f_smallint", DataTypes.SMALLINT()), + DataTypes.FIELD("f_int", DataTypes.INT()), + DataTypes.FIELD("f_long", DataTypes.BIGINT()), + DataTypes.FIELD("f_float", DataTypes.FLOAT()), + DataTypes.FIELD("f_double", DataTypes.DOUBLE()), + DataTypes.FIELD("f_boolean", DataTypes.BOOLEAN()), + DataTypes.FIELD("f_decimal", DataTypes.DECIMAL(10, 2)), + DataTypes.FIELD("f_bytes", DataTypes.VARBINARY(10)), + DataTypes.FIELD("f_string", DataTypes.VARCHAR(10)), + DataTypes.FIELD("f_time", DataTypes.TIME(3)), + DataTypes.FIELD("f_date", DataTypes.DATE()), + DataTypes.FIELD("f_timestamp", DataTypes.TIMESTAMP(3)) + ).notNull(); + private static final DataType INDEX_ROW_DATA_TYPE = DataTypes.ROW( + DataTypes.FIELD("file_name", DataTypes.STRING()), + DataTypes.FIELD("value_cnt", DataTypes.BIGINT()), + DataTypes.FIELD("f_int_min", DataTypes.INT()), + DataTypes.FIELD("f_int_max", DataTypes.INT()), + DataTypes.FIELD("f_int_null_cnt", DataTypes.BIGINT()), + DataTypes.FIELD("f_string_min", DataTypes.VARCHAR(10)), + DataTypes.FIELD("f_string_max", DataTypes.VARCHAR(10)), + DataTypes.FIELD("f_string_null_cnt", DataTypes.BIGINT()), + DataTypes.FIELD("f_timestamp_min", DataTypes.TIMESTAMP(3)), + DataTypes.FIELD("f_timestamp_max", DataTypes.TIMESTAMP(3)), + DataTypes.FIELD("f_timestamp_null_cnt", DataTypes.BIGINT()) + ).notNull(); + + private static final RowType INDEX_ROW_TYPE = (RowType) INDEX_ROW_DATA_TYPE.getLogicalType(); + + @Test + void testEqualTo() { + ExpressionEvaluator.EqualTo equalTo = ExpressionEvaluator.EqualTo.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + ValueLiteralExpression vExpr = new ValueLiteralExpression(12); + + RowData indexRow1 = intIndexRow(11, 13); + equalTo.bindFieldReference(rExpr) + .bindVal(vExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(equalTo.eval(), "11 < 12 < 
13"); + + RowData indexRow2 = intIndexRow(12, 13); + equalTo.bindColStats(indexRow2, queryFields(2), rExpr); + assertTrue(equalTo.eval(), "12 <= 12 < 13"); + + RowData indexRow3 = intIndexRow(11, 12); + equalTo.bindColStats(indexRow3, queryFields(2), rExpr); + assertTrue(equalTo.eval(), "11 < 12 <= 12"); + + RowData indexRow4 = intIndexRow(10, 11); + equalTo.bindColStats(indexRow4, queryFields(2), rExpr); + assertFalse(equalTo.eval(), "11 < 12"); + + RowData indexRow5 = intIndexRow(13, 14); + equalTo.bindColStats(indexRow5, queryFields(2), rExpr); + assertFalse(equalTo.eval(), "12 < 13"); + + RowData indexRow6 = intIndexRow(null, null); + equalTo.bindColStats(indexRow6, queryFields(2), rExpr); + assertFalse(equalTo.eval(), "12 <> null"); + + equalTo.bindVal(new ValueLiteralExpression(null, DataTypes.INT())); + assertFalse(equalTo.eval(), "null <> null"); + } + + @Test + void testNotEqualTo() { + ExpressionEvaluator.NotEqualTo notEqualTo = ExpressionEvaluator.NotEqualTo.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + ValueLiteralExpression vExpr = new ValueLiteralExpression(12); + + RowData indexRow1 = intIndexRow(11, 13); + notEqualTo.bindFieldReference(rExpr) + .bindVal(vExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(notEqualTo.eval(), "11 <> 12 && 12 <> 13"); + + RowData indexRow2 = intIndexRow(12, 13); + notEqualTo.bindColStats(indexRow2, queryFields(2), rExpr); + assertTrue(notEqualTo.eval(), "12 <> 13"); + + RowData indexRow3 = intIndexRow(11, 12); + notEqualTo.bindColStats(indexRow3, queryFields(2), rExpr); + assertTrue(notEqualTo.eval(), "11 <> 12"); + + RowData indexRow4 = intIndexRow(10, 11); + notEqualTo.bindColStats(indexRow4, queryFields(2), rExpr); + assertTrue(notEqualTo.eval(), "10 <> 12 and 11 < 12"); + + RowData indexRow5 = intIndexRow(13, 14); + notEqualTo.bindColStats(indexRow5, queryFields(2), rExpr); + assertTrue(notEqualTo.eval(), "12 <> 13 and 12 <> 14"); + + RowData indexRow6 = intIndexRow(null, null); + notEqualTo.bindColStats(indexRow6, queryFields(2), rExpr); + assertTrue(notEqualTo.eval(), "12 <> null"); + + notEqualTo.bindVal(new ValueLiteralExpression(null, DataTypes.INT())); + assertTrue(notEqualTo.eval(), "null <> null"); + } + + @Test + void testIsNull() { + ExpressionEvaluator.IsNull isNull = ExpressionEvaluator.IsNull.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + + RowData indexRow1 = intIndexRow(11, 13); + isNull.bindFieldReference(rExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(isNull.eval(), "2 nulls"); + + RowData indexRow2 = intIndexRow(12, 13, 0L); + isNull.bindColStats(indexRow2, queryFields(2), rExpr); + assertFalse(isNull.eval(), "0 nulls"); + } + + @Test + void testIsNotNull() { + ExpressionEvaluator.IsNotNull isNotNull = ExpressionEvaluator.IsNotNull.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + + RowData indexRow1 = intIndexRow(11, 13); + isNotNull.bindFieldReference(rExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(isNotNull.eval(), "min 11 is not null"); + + RowData indexRow2 = intIndexRow(null, null, 0L); + isNotNull.bindColStats(indexRow2, queryFields(2), rExpr); + assertTrue(isNotNull.eval(), "min is null and 0 nulls"); + } + + @Test + void testLessThan() { + ExpressionEvaluator.LessThan lessThan = ExpressionEvaluator.LessThan.getInstance(); + FieldReferenceExpression rExpr 
= new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + ValueLiteralExpression vExpr = new ValueLiteralExpression(12); + + RowData indexRow1 = intIndexRow(11, 13); + lessThan.bindFieldReference(rExpr) + .bindVal(vExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(lessThan.eval(), "12 < 13"); + + RowData indexRow2 = intIndexRow(12, 13); + lessThan.bindColStats(indexRow2, queryFields(2), rExpr); + assertFalse(lessThan.eval(), "min 12 = 12"); + + RowData indexRow3 = intIndexRow(11, 12); + lessThan.bindColStats(indexRow3, queryFields(2), rExpr); + assertTrue(lessThan.eval(), "11 < 12"); + + RowData indexRow4 = intIndexRow(10, 11); + lessThan.bindColStats(indexRow4, queryFields(2), rExpr); + assertTrue(lessThan.eval(), "11 < 12"); + + RowData indexRow5 = intIndexRow(13, 14); + lessThan.bindColStats(indexRow5, queryFields(2), rExpr); + assertFalse(lessThan.eval(), "12 < min 13"); + + RowData indexRow6 = intIndexRow(null, null); + lessThan.bindColStats(indexRow6, queryFields(2), rExpr); + assertFalse(lessThan.eval(), "12 <> null"); + + lessThan.bindVal(new ValueLiteralExpression(null, DataTypes.INT())); + assertFalse(lessThan.eval(), "null <> null"); + } + + @Test + void testGreaterThan() { + ExpressionEvaluator.GreaterThan greaterThan = ExpressionEvaluator.GreaterThan.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + ValueLiteralExpression vExpr = new ValueLiteralExpression(12); + + RowData indexRow1 = intIndexRow(11, 13); + greaterThan.bindFieldReference(rExpr) + .bindVal(vExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(greaterThan.eval(), "12 < 13"); + + RowData indexRow2 = intIndexRow(12, 13); + greaterThan.bindColStats(indexRow2, queryFields(2), rExpr); + assertTrue(greaterThan.eval(), "12 < 13"); + + RowData indexRow3 = intIndexRow(11, 12); + greaterThan.bindColStats(indexRow3, queryFields(2), rExpr); + assertFalse(greaterThan.eval(), "max 12 = 12"); + + RowData indexRow4 = intIndexRow(10, 11); + greaterThan.bindColStats(indexRow4, queryFields(2), rExpr); + assertFalse(greaterThan.eval(), "max 11 < 12"); + + RowData indexRow5 = intIndexRow(13, 14); + greaterThan.bindColStats(indexRow5, queryFields(2), rExpr); + assertTrue(greaterThan.eval(), "12 < 13"); + + RowData indexRow6 = intIndexRow(null, null); + greaterThan.bindColStats(indexRow6, queryFields(2), rExpr); + assertFalse(greaterThan.eval(), "12 <> null"); + + greaterThan.bindVal(new ValueLiteralExpression(null, DataTypes.INT())); + assertFalse(greaterThan.eval(), "null <> null"); + } + + @Test + void testLessThanOrEqual() { + ExpressionEvaluator.LessThanOrEqual lessThanOrEqual = ExpressionEvaluator.LessThanOrEqual.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + ValueLiteralExpression vExpr = new ValueLiteralExpression(12); + + RowData indexRow1 = intIndexRow(11, 13); + lessThanOrEqual.bindFieldReference(rExpr) + .bindVal(vExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(lessThanOrEqual.eval(), "11 < 12"); + + RowData indexRow2 = intIndexRow(12, 13); + lessThanOrEqual.bindColStats(indexRow2, queryFields(2), rExpr); + assertTrue(lessThanOrEqual.eval(), "min 12 = 12"); + + RowData indexRow3 = intIndexRow(11, 12); + lessThanOrEqual.bindColStats(indexRow3, queryFields(2), rExpr); + assertTrue(lessThanOrEqual.eval(), "max 12 = 12"); + + RowData indexRow4 = intIndexRow(10, 11); + lessThanOrEqual.bindColStats(indexRow4, queryFields(2), rExpr); + 
assertTrue(lessThanOrEqual.eval(), "max 11 < 12"); + + RowData indexRow5 = intIndexRow(13, 14); + lessThanOrEqual.bindColStats(indexRow5, queryFields(2), rExpr); + assertFalse(lessThanOrEqual.eval(), "12 < 13"); + + RowData indexRow6 = intIndexRow(null, null); + lessThanOrEqual.bindColStats(indexRow6, queryFields(2), rExpr); + assertFalse(lessThanOrEqual.eval(), "12 <> null"); + + lessThanOrEqual.bindVal(new ValueLiteralExpression(null, DataTypes.INT())); + assertFalse(lessThanOrEqual.eval(), "null <> null"); + } + + @Test + void testGreaterThanOrEqual() { + ExpressionEvaluator.GreaterThanOrEqual greaterThanOrEqual = ExpressionEvaluator.GreaterThanOrEqual.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + ValueLiteralExpression vExpr = new ValueLiteralExpression(12); + + RowData indexRow1 = intIndexRow(11, 13); + greaterThanOrEqual.bindFieldReference(rExpr) + .bindVal(vExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + assertTrue(greaterThanOrEqual.eval(), "12 < 13"); + + RowData indexRow2 = intIndexRow(12, 13); + greaterThanOrEqual.bindColStats(indexRow2, queryFields(2), rExpr); + assertTrue(greaterThanOrEqual.eval(), "min 12 = 12"); + + RowData indexRow3 = intIndexRow(11, 12); + greaterThanOrEqual.bindColStats(indexRow3, queryFields(2), rExpr); + assertTrue(greaterThanOrEqual.eval(), "max 12 = 12"); + + RowData indexRow4 = intIndexRow(10, 11); + greaterThanOrEqual.bindColStats(indexRow4, queryFields(2), rExpr); + assertFalse(greaterThanOrEqual.eval(), "max 11 < 12"); + + RowData indexRow5 = intIndexRow(13, 14); + greaterThanOrEqual.bindColStats(indexRow5, queryFields(2), rExpr); + assertTrue(greaterThanOrEqual.eval(), "12 < 13"); + + RowData indexRow6 = intIndexRow(null, null); + greaterThanOrEqual.bindColStats(indexRow6, queryFields(2), rExpr); + assertFalse(greaterThanOrEqual.eval(), "12 <> null"); + + greaterThanOrEqual.bindVal(new ValueLiteralExpression(null, DataTypes.INT())); + assertFalse(greaterThanOrEqual.eval(), "null <> null"); + } + + @Test + void testIn() { + ExpressionEvaluator.In in = ExpressionEvaluator.In.getInstance(); + FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2); + + RowData indexRow1 = intIndexRow(11, 13); + in.bindFieldReference(rExpr) + .bindColStats(indexRow1, queryFields(2), rExpr); + in.bindVals(12); + assertTrue(in.eval(), "11 < 12 < 13"); + + RowData indexRow2 = intIndexRow(12, 13); + in.bindColStats(indexRow2, queryFields(2), rExpr); + assertTrue(in.eval(), "min 12 = 12"); + + RowData indexRow3 = intIndexRow(11, 12); + in.bindColStats(indexRow3, queryFields(2), rExpr); + assertTrue(in.eval(), "max 12 = 12"); + + RowData indexRow4 = intIndexRow(10, 11); + in.bindColStats(indexRow4, queryFields(2), rExpr); + assertFalse(in.eval(), "max 11 < 12"); + + RowData indexRow5 = intIndexRow(13, 14); + in.bindColStats(indexRow5, queryFields(2), rExpr); + assertFalse(in.eval(), "12 < 13"); + + RowData indexRow6 = intIndexRow(null, null); + in.bindColStats(indexRow6, queryFields(2), rExpr); + assertFalse(in.eval(), "12 <> null"); + + in.bindVals((Object) null); + assertFalse(in.eval(), "null <> null"); + } + + private static RowData intIndexRow(Integer minVal, Integer maxVal) { + return intIndexRow(minVal, maxVal, 2L); + } + + private static RowData intIndexRow(Integer minVal, Integer maxVal, Long nullCnt) { + return indexRow(StringData.fromString("f1"), 100L, + minVal, maxVal, nullCnt, + StringData.fromString("1"), StringData.fromString("100"), 5L, + 
TimestampData.fromEpochMillis(1), TimestampData.fromEpochMillis(100), 3L); + } + + private static RowData indexRow(Object... fields) { + return TestData.insertRow(INDEX_ROW_TYPE, fields); + } + + private static RowType.RowField[] queryFields(int... pos) { + List fields = ((RowType) ROW_DATA_TYPE.getLogicalType()).getFields(); + return Arrays.stream(pos).mapToObj(fields::get).toArray(RowType.RowField[]::new); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java new file mode 100644 index 0000000000000..afaeee34406db --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java @@ -0,0 +1,1618 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.adapter.TestTableEnvs; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.table.catalog.HoodieCatalogTestUtils; +import org.apache.hudi.table.catalog.HoodieHiveCatalog; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.FlinkMiniCluster; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; +import org.apache.hudi.utils.TestSQL; +import org.apache.hudi.utils.TestUtils; +import org.apache.hudi.utils.factory.CollectSinkTableFactory; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.flink.util.CollectionUtil; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +import 
java.io.File; +import java.time.ZoneId; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.utils.TestConfigurations.catalog; +import static org.apache.hudi.utils.TestConfigurations.sql; +import static org.apache.hudi.utils.TestData.assertRowsEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * IT cases for Hoodie table source and sink. + */ +@ExtendWith(FlinkMiniCluster.class) +public class ITTestHoodieDataSource { + private TableEnvironment streamTableEnv; + private TableEnvironment batchTableEnv; + + @BeforeEach + void beforeEach() { + EnvironmentSettings settings = EnvironmentSettings.newInstance().build(); + streamTableEnv = TableEnvironmentImpl.create(settings); + streamTableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + Configuration execConf = streamTableEnv.getConfig().getConfiguration(); + execConf.setString("execution.checkpointing.interval", "2s"); + // configure not to retry after failure + execConf.setString("restart-strategy", "fixed-delay"); + execConf.setString("restart-strategy.fixed-delay.attempts", "0"); + + batchTableEnv = TestTableEnvs.getBatchTableEnv(); + batchTableEnv.getConfig().getConfiguration() + .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 4); + } + + @TempDir + File tempFile; + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testStreamWriteAndReadFromSpecifiedCommit(HoodieTableType tableType) throws Exception { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source"); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + String firstCommit = TestUtils.getFirstCompleteInstant(tempFile.getAbsolutePath()); + streamTableEnv.executeSql("drop table t1"); + hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_START_COMMIT, firstCommit) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + List rows = execSelectSql(streamTableEnv, "select * from t1", 10); + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT); + + // insert another batch of data + execInsertSql(streamTableEnv, insertInto); + List rows2 = execSelectSql(streamTableEnv, "select * from t1", 10); + assertRowsEquals(rows2, TestData.DATA_SET_SOURCE_INSERT); + + streamTableEnv.getConfig().getConfiguration() + .setBoolean("table.dynamic-table-options.enabled", true); + // specify the start commit as earliest + List rows3 = execSelectSql(streamTableEnv, + "select * from t1/*+options('read.start-commit'='earliest')*/", 10); + assertRowsEquals(rows3, TestData.DATA_SET_SOURCE_INSERT); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void 
testStreamWriteAndRead(HoodieTableType tableType) throws Exception { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source"); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + // reading from the latest commit instance. + List rows = execSelectSql(streamTableEnv, "select * from t1", 10); + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT); + + // insert another batch of data + execInsertSql(streamTableEnv, insertInto); + List rows2 = execSelectSql(streamTableEnv, "select * from t1", 10); + assertRowsEquals(rows2, TestData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testStreamReadAppendData(HoodieTableType tableType) throws Exception { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source"); + String createSource2 = TestConfigurations.getFileSourceDDL("source2", "test_source_2.data"); + streamTableEnv.executeSql(createSource); + streamTableEnv.executeSql(createSource2); + + String createHoodieTable = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + streamTableEnv.executeSql(createHoodieTable); + String insertInto = "insert into t1 select * from source"; + // execute 2 times + execInsertSql(streamTableEnv, insertInto); + // remember the commit + String specifiedCommit = TestUtils.getFirstCompleteInstant(tempFile.getAbsolutePath()); + // another update batch + String insertInto2 = "insert into t1 select * from source2"; + execInsertSql(streamTableEnv, insertInto2); + // now we consume starting from the oldest commit + String createHoodieTable2 = sql("t2") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_START_COMMIT, specifiedCommit) + .end(); + streamTableEnv.executeSql(createHoodieTable2); + List rows = execSelectSql(streamTableEnv, "select * from t2", 10); + // all the data with same keys are appended within one data bucket and one log file, + // so when consume, the same keys are merged + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_MERGED); + } + + @Test + void testStreamWriteBatchRead() { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source"); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + List rows = CollectionUtil.iterableToList( + () -> streamTableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT); + } + + @Test + void testStreamWriteBatchReadOptimized() throws Exception { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source"); + 
streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + // read optimized is supported for both MOR and COR table, + // test MOR streaming write with compaction then reads as + // query type 'read_optimized'. + .option(FlinkOptions.QUERY_TYPE, FlinkOptions.QUERY_TYPE_READ_OPTIMIZED) + .option(FlinkOptions.COMPACTION_DELTA_COMMITS, 1) + .option(FlinkOptions.COMPACTION_TASKS, 1) + // disable the metadata table because + // the lock conflicts resolution takes time + .option(FlinkOptions.METADATA_ENABLED, false) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + // give some buffer time for finishing the async compaction tasks + TimeUnit.SECONDS.sleep(5); + List rows = CollectionUtil.iterableToList( + () -> streamTableEnv.sqlQuery("select * from t1").execute().collect()); + + // the test is flaky based on whether the first compaction is pending when + // scheduling the 2nd compaction. + // see details in CompactionPlanOperator#scheduleCompaction. + if (rows.size() < TestData.DATA_SET_SOURCE_INSERT.size()) { + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT_FIRST_COMMIT); + } else { + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT); + } + } + + @Test + void testStreamWriteReadSkippingCompaction() throws Exception { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source", 4); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, true) + .option(FlinkOptions.COMPACTION_DELTA_COMMITS, 1) + .option(FlinkOptions.COMPACTION_TASKS, 1) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + String instant = TestUtils.getNthCompleteInstant(tempFile.getAbsolutePath(), 2, true); + + streamTableEnv.getConfig().getConfiguration() + .setBoolean("table.dynamic-table-options.enabled", true); + final String query = String.format("select * from t1/*+ options('read.start-commit'='%s')*/", instant); + List rows = execSelectSql(streamTableEnv, query, 10); + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT); + } + + @Test + void testStreamWriteWithCleaning() { + // create filesystem table named source + + // the source generates 4 commits but the cleaning task + // would always try to keep the remaining commits number as 1 + String createSource = TestConfigurations.getFileSourceDDL( + "source", "test_source_3.data", 4); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.CLEAN_RETAIN_COMMITS, 1) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + Configuration defaultConf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + Map options1 = new HashMap<>(defaultConf.toMap()); + options1.put(FlinkOptions.TABLE_NAME.key(), "t1"); + Configuration conf = 
Configuration.fromMap(options1); + HoodieTimeline timeline = StreamerUtil.createMetaClient(conf).getActiveTimeline(); + assertTrue(timeline.filterCompletedInstants() + .getInstants().anyMatch(instant -> instant.getAction().equals("clean")), + "some commits should be cleaned"); + } + + @Test + void testStreamReadWithDeletes() throws Exception { + // create filesystem table named source + + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + conf.setBoolean(FlinkOptions.CHANGELOG_ENABLED, true); + + // write one commit + TestData.writeData(TestData.DATA_SET_INSERT, conf); + // write another commit with deletes + TestData.writeData(TestData.DATA_SET_UPDATE_DELETE, conf); + + String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2) + .option(FlinkOptions.READ_START_COMMIT, latestCommit) + .option(FlinkOptions.CHANGELOG_ENABLED, true) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + + final String sinkDDL = "create table sink(\n" + + " name varchar(20),\n" + + " age_sum int\n" + + ") with (\n" + + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'" + + ")"; + List result = execSelectSql(streamTableEnv, + "select name, sum(age) from t1 group by name", sinkDDL, 10); + final String expected = "[+I(+I[Danny, 24]), +I(+I[Stephen, 34])]"; + assertRowsEquals(result, expected, true); + } + + @ParameterizedTest + @MethodSource("tableTypeAndPartitioningParams") + void testStreamReadFilterByPartition(HoodieTableType tableType, boolean hiveStylePartitioning) throws Exception { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, tableType.name()); + conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning); + + // write one commit + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + + List result = execSelectSql(streamTableEnv, + "select * from t1 where `partition`='par1'", 10); + final String expected = "[" + + "+I(+I[id1, Danny, 23, 1970-01-01T00:00:00.001, par1]), " + + "+I(+I[id2, Stephen, 33, 1970-01-01T00:00:00.002, par1])]"; + assertRowsEquals(result, expected, true); + } + + @Test + void testStreamReadMorTableWithCompactionPlan() throws Exception { + String createSource = TestConfigurations.getFileSourceDDL("source"); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST) + .option(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2) + // close 
the async compaction + .option(FlinkOptions.COMPACTION_ASYNC_ENABLED, false) + // generate compaction plan for each commit + .option(FlinkOptions.COMPACTION_DELTA_COMMITS, 1) + .noPartition() + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + List result = execSelectSql(streamTableEnv, "select * from t1", 10); + final String expected = "[" + + "+I[id1, Danny, 23, 1970-01-01T00:00:01, par1], " + + "+I[id2, Stephen, 33, 1970-01-01T00:00:02, par1], " + + "+I[id3, Julian, 53, 1970-01-01T00:00:03, par2], " + + "+I[id4, Fabian, 31, 1970-01-01T00:00:04, par2], " + + "+I[id5, Sophia, 18, 1970-01-01T00:00:05, par3], " + + "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"; + assertRowsEquals(result, expected); + } + + @ParameterizedTest + @MethodSource("executionModeAndPartitioningParams") + void testWriteAndRead(ExecMode execMode, boolean hiveStylePartitioning) { + TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT); + // apply filters + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1 where uuid > 'id5'").execute().collect()); + assertRowsEquals(result2, "[" + + "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); + } + + @ParameterizedTest + @MethodSource("tableTypeAndPartitioningParams") + void testWriteAndReadWithProctimeSequence(HoodieTableType tableType, boolean hiveStylePartitioning) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .field("uuid varchar(20)") + .field("name varchar(10)") + .field("age int") + .field("tss timestamp(3)") // use a different field with default precombine field 'ts' + .field("`partition` varchar(10)") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_SAME_KEY_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, "[+I[id1, Danny, 23, 1970-01-01T00:00:01, par1]]"); + } + + @ParameterizedTest + @MethodSource("tableTypeAndPartitioningParams") + void testWriteAndReadWithProctimeSequenceWithTsColumnExisting(HoodieTableType tableType, boolean hiveStylePartitioning) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .field("uuid varchar(20)") + .field("name varchar(10)") + .field("age int") + .field("ts timestamp(3)") // use the default precombine field 'ts' + .field("`partition` varchar(10)") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + 
.option(FlinkOptions.PRECOMBINE_FIELD, FlinkOptions.NO_PRE_COMBINE) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_SAME_KEY_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, "[+I[id1, Danny, 23, 1970-01-01T00:00:01, par1]]"); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testBatchModeUpsertWithoutPartition(HoodieTableType tableType) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_NAME, tableType.name()) + .option("hoodie.parquet.small.file.limit", "0") // invalidate the small file strategy + .option("hoodie.parquet.max.file.size", "0") + .noPartition() + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT); + + // batchMode update + execInsertSql(tableEnv, TestSQL.UPDATE_INSERT_T1); + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result2, TestData.DATA_SET_SOURCE_MERGED); + } + + @ParameterizedTest + @MethodSource("tableTypeAndPartitioningParams") + void testBatchModeUpsert(HoodieTableType tableType, boolean hiveStylePartitioning) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_NAME, tableType) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT); + + // batchMode update + execInsertSql(tableEnv, TestSQL.UPDATE_INSERT_T1); + + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result2, TestData.DATA_SET_SOURCE_MERGED); + } + + @ParameterizedTest + @EnumSource(value = ExecMode.class) + void testWriteAndReadParMiddle(ExecMode execMode) throws Exception { + boolean streaming = execMode == ExecMode.STREAM; + String hoodieTableDDL = "create table t1(\n" + + " uuid varchar(20),\n" + + " name varchar(10),\n" + + " age int,\n" + + " `partition` varchar(20),\n" // test streaming read with partition field in the middle + + " ts timestamp(3),\n" + + " PRIMARY KEY(uuid) NOT ENFORCED\n" + + ")\n" + + "PARTITIONED BY (`partition`)\n" + + "with (\n" + + " 'connector' = 'hudi',\n" + + " 'path' = '" + tempFile.getAbsolutePath() + "',\n" + + " 'read.streaming.enabled' = '" + streaming + "'\n" + + ")"; + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 values\n" + + "('id1','Danny',23,'par1',TIMESTAMP '1970-01-01 00:00:01'),\n" + + "('id2','Stephen',33,'par1',TIMESTAMP '1970-01-01 00:00:02'),\n" + + "('id3','Julian',53,'par2',TIMESTAMP '1970-01-01 00:00:03'),\n" + + "('id4','Fabian',31,'par2',TIMESTAMP '1970-01-01 00:00:04'),\n" + + "('id5','Sophia',18,'par3',TIMESTAMP '1970-01-01 00:00:05'),\n" + + "('id6','Emma',20,'par3',TIMESTAMP '1970-01-01 00:00:06'),\n" + + 
"('id7','Bob',44,'par4',TIMESTAMP '1970-01-01 00:00:07'),\n" + + "('id8','Han',56,'par4',TIMESTAMP '1970-01-01 00:00:08')"; + execInsertSql(streamTableEnv, insertInto); + + final String expected = "[" + + "+I[id1, Danny, 23, par1, 1970-01-01T00:00:01], " + + "+I[id2, Stephen, 33, par1, 1970-01-01T00:00:02], " + + "+I[id3, Julian, 53, par2, 1970-01-01T00:00:03], " + + "+I[id4, Fabian, 31, par2, 1970-01-01T00:00:04], " + + "+I[id5, Sophia, 18, par3, 1970-01-01T00:00:05], " + + "+I[id6, Emma, 20, par3, 1970-01-01T00:00:06], " + + "+I[id7, Bob, 44, par4, 1970-01-01T00:00:07], " + + "+I[id8, Han, 56, par4, 1970-01-01T00:00:08]]"; + + List result = execSelectSql(streamTableEnv, "select * from t1", execMode); + + assertRowsEquals(result, expected); + + // insert another batch of data + execInsertSql(streamTableEnv, insertInto); + List result2 = execSelectSql(streamTableEnv, "select * from t1", execMode); + assertRowsEquals(result2, expected); + } + + @ParameterizedTest + @EnumSource(value = ExecMode.class) + void testWriteAndReadWithTimestampMicros(ExecMode execMode) throws Exception { + boolean streaming = execMode == ExecMode.STREAM; + String hoodieTableDDL = sql("t1") + .field("id int") + .field("name varchar(10)") + .field("ts timestamp(6)") + .pkField("id") + .noPartition() + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, streaming) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 values\n" + + "(1,'Danny',TIMESTAMP '2021-12-01 01:02:01.100001'),\n" + + "(2,'Stephen',TIMESTAMP '2021-12-02 03:04:02.200002'),\n" + + "(3,'Julian',TIMESTAMP '2021-12-03 13:14:03.300003'),\n" + + "(4,'Fabian',TIMESTAMP '2021-12-04 15:16:04.400004')"; + execInsertSql(streamTableEnv, insertInto); + + final String expected = "[" + + "+I[1, Danny, 2021-12-01T01:02:01.100001], " + + "+I[2, Stephen, 2021-12-02T03:04:02.200002], " + + "+I[3, Julian, 2021-12-03T13:14:03.300003], " + + "+I[4, Fabian, 2021-12-04T15:16:04.400004]]"; + + List result = execSelectSql(streamTableEnv, "select * from t1", execMode); + assertRowsEquals(result, expected); + + // insert another batch of data + execInsertSql(streamTableEnv, insertInto); + List result2 = execSelectSql(streamTableEnv, "select * from t1", execMode); + assertRowsEquals(result2, expected); + } + + @ParameterizedTest + @EnumSource(value = ExecMode.class) + void testInsertOverwrite(ExecMode execMode) { + TableEnvironment tableEnv = execMode == ExecMode.BATCH ? 
batchTableEnv : streamTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_T1); + + // overwrite partition 'par1' and increase in age by 1 + final String insertInto2 = "insert overwrite t1 partition(`partition`='par1') values\n" + + "('id1','Danny',24,TIMESTAMP '1970-01-01 00:00:01'),\n" + + "('id2','Stephen',34,TIMESTAMP '1970-01-01 00:00:02')\n"; + + execInsertSql(tableEnv, insertInto2); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT_OVERWRITE); + + // overwrite the whole table + final String insertInto3 = "insert overwrite t1 values\n" + + "('id1','Danny',24,TIMESTAMP '1970-01-01 00:00:01', 'par1'),\n" + + "('id2','Stephen',34,TIMESTAMP '1970-01-01 00:00:02', 'par2')\n"; + + execInsertSql(tableEnv, insertInto3); + + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[id1, Danny, 24, 1970-01-01T00:00:01, par1], " + + "+I[id2, Stephen, 34, 1970-01-01T00:00:02, par2]]"; + assertRowsEquals(result2, expected); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testStreamWriteAndReadWithMiniBatches(HoodieTableType tableType) throws Exception { + // create filesystem table named source + String createSource = TestConfigurations.getFileSourceDDL("source", 4); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_START_COMMIT, "earliest") + .option(FlinkOptions.WRITE_BATCH_SIZE, 0.00001) + .noPartition() + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into t1 select * from source"; + execInsertSql(streamTableEnv, insertInto); + + // reading from the earliest commit instance. + List rows = execSelectSql(streamTableEnv, "select * from t1", 20); + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT); + } + + @ParameterizedTest + @MethodSource("executionModeAndTableTypeParams") + void testBatchUpsertWithMiniBatches(ExecMode execMode, HoodieTableType tableType) { + TableEnvironment tableEnv = execMode == ExecMode.BATCH ? 
batchTableEnv : streamTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.WRITE_BATCH_SIZE, "0.001") + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + final String insertInto1 = "insert into t1 values\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')"; + + execInsertSql(tableEnv, insertInto1); + + final String insertInto2 = "insert into t1 values\n" + + "('id1','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n" + + "('id1','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par1'),\n" + + "('id1','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par1'),\n" + + "('id1','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par1')"; + + execInsertSql(tableEnv, insertInto2); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, "[+I[id1, Sophia, 18, 1970-01-01T00:00:05, par1]]"); + } + + @ParameterizedTest + @MethodSource("executionModeAndTableTypeParams") + void testBatchUpsertWithMiniBatchesGlobalIndex(ExecMode execMode, HoodieTableType tableType) { + TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.WRITE_BATCH_SIZE, "0.001") + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.INDEX_GLOBAL_ENABLED, true) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + final String insertInto1 = "insert into t1 values\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')"; + + execInsertSql(tableEnv, insertInto1); + + final String insertInto2 = "insert into t1 values\n" + + "('id1','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par2'),\n" + + "('id1','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par1'),\n" + + "('id1','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n" + + "('id1','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3')"; + + execInsertSql(tableEnv, insertInto2); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, "[+I[id1, Sophia, 18, 1970-01-01T00:00:05, par3]]"); + } + + @Test + void testUpdateWithDefaultHoodieRecordPayload() { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .field("id int") + .field("name string") + .field("price double") + .field("ts bigint") + .pkField("id") + .noPartition() + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.PAYLOAD_CLASS_NAME, DefaultHoodieRecordPayload.class.getName()) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + final String insertInto1 = "insert into t1 values\n" + + "(1,'a1',20,20)"; + execInsertSql(tableEnv, insertInto1); + + final String insertInto4 = "insert into t1 values\n" + + "(1,'a1',20,1)"; + execInsertSql(tableEnv, insertInto4); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, "[+I[1, a1, 20.0, 20]]"); + } + + @ParameterizedTest + @MethodSource("executionModeAndTableTypeParams") + void testWriteNonPartitionedTable(ExecMode execMode, HoodieTableType tableType) { + TableEnvironment tableEnv = execMode == ExecMode.BATCH ? 
batchTableEnv : streamTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .noPartition() + .end(); + tableEnv.executeSql(hoodieTableDDL); + + final String insertInto1 = "insert into t1 values\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')"; + + execInsertSql(tableEnv, insertInto1); + + final String insertInto2 = "insert into t1 values\n" + + "('id1','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par2'),\n" + + "('id1','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par3'),\n" + + "('id1','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par4'),\n" + + "('id1','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par5')"; + + execInsertSql(tableEnv, insertInto2); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, "[+I[id1, Sophia, 18, 1970-01-01T00:00:05, par5]]"); + } + + @Test + void testWriteGlobalIndex() { + // the source generates 4 commits + String createSource = TestConfigurations.getFileSourceDDL( + "source", "test_source_4.data", 4); + streamTableEnv.executeSql(createSource); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.INDEX_GLOBAL_ENABLED, true) + .option(FlinkOptions.PRE_COMBINE, true) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + + final String insertInto2 = "insert into t1 select * from source"; + + execInsertSql(streamTableEnv, insertInto2); + + List result = CollectionUtil.iterableToList( + () -> streamTableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, "[+I[id1, Phoebe, 52, 1970-01-01T00:00:08, par4]]"); + } + + @Test + void testWriteLocalIndex() { + // the source generates 4 commits + String createSource = TestConfigurations.getFileSourceDDL( + "source", "test_source_4.data", 4); + streamTableEnv.executeSql(createSource); + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.INDEX_GLOBAL_ENABLED, false) + .option(FlinkOptions.PRE_COMBINE, true) + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + + final String insertInto2 = "insert into t1 select * from source"; + + execInsertSql(streamTableEnv, insertInto2); + + List result = CollectionUtil.iterableToList( + () -> streamTableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[id1, Stephen, 34, 1970-01-01T00:00:02, par1], " + + "+I[id1, Fabian, 32, 1970-01-01T00:00:04, par2], " + + "+I[id1, Jane, 19, 1970-01-01T00:00:06, par3], " + + "+I[id1, Phoebe, 52, 1970-01-01T00:00:08, par4]]"; + assertRowsEquals(result, expected, 3); + } + + @Test + void testStreamReadEmptyTablePath() throws Exception { + // case1: table metadata path does not exist + // create a flink source table + String createHoodieTable = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, "true") + .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + .end(); + streamTableEnv.executeSql(createHoodieTable); + + // no exception is expected to be thrown + List rows1 = execSelectSql(streamTableEnv, "select * from t1", 10); + assertRowsEquals(rows1, "[]"); + + // case2: empty table without data files + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + StreamerUtil.initTableIfNotExists(conf); + + List rows2 = execSelectSql(streamTableEnv, "select 
* from t1", 10); + assertRowsEquals(rows2, "[]"); + } + + @Test + void testBatchReadEmptyTablePath() throws Exception { + // case1: table metadata path does not exist + // create a flink source table + String createHoodieTable = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + .end(); + batchTableEnv.executeSql(createHoodieTable); + + // an exception is expected to be thrown because the table path has no metadata + assertThrows(Exception.class, + () -> execSelectSql(batchTableEnv, "select * from t1", 10), + "Exception should be thrown when querying a non-existent table in batch mode"); + + // case2: empty table without data files + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + StreamerUtil.initTableIfNotExists(conf); + + List rows2 = CollectionUtil.iteratorToList(batchTableEnv.executeSql("select * from t1").collect()); + assertRowsEquals(rows2, "[]"); + } + + @ParameterizedTest + @EnumSource(value = ExecMode.class) + void testWriteAndReadDebeziumJson(ExecMode execMode) throws Exception { + String sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("debezium_json.data")).toString(); + String sourceDDL = "" + + "CREATE TABLE debezium_source(\n" + + " id INT NOT NULL PRIMARY KEY NOT ENFORCED,\n" + + " ts BIGINT,\n" + + " name STRING,\n" + + " description STRING,\n" + + " weight DOUBLE\n" + + ") WITH (\n" + + " 'connector' = 'filesystem',\n" + + " 'path' = '" + sourcePath + "',\n" + + " 'format' = 'debezium-json'\n" + + ")"; + streamTableEnv.executeSql(sourceDDL); + String hoodieTableDDL = sql("hoodie_sink") + .field("id INT NOT NULL") + .field("ts BIGINT") + .field("name STRING") + .field("weight DOUBLE") + .pkField("id") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.READ_AS_STREAMING, execMode == ExecMode.STREAM) + .option(FlinkOptions.PRE_COMBINE, true) + .noPartition() + .end(); + streamTableEnv.executeSql(hoodieTableDDL); + String insertInto = "insert into hoodie_sink select id, ts, name, weight from debezium_source"; + execInsertSql(streamTableEnv, insertInto); + + final String expected = "[" + + "+I[101, 1000, scooter, 3.140000104904175], " + + "+I[102, 2000, car battery, 8.100000381469727], " + + "+I[103, 3000, 12-pack drill bits, 0.800000011920929], " + + "+I[104, 4000, hammer, 0.75], " + + "+I[105, 5000, hammer, 0.875], " + + "+I[106, 10000, hammer, 1.0], " + + "+I[107, 11000, rocks, 5.099999904632568], " + + "+I[108, 8000, jacket, 0.10000000149011612], " + + "+I[109, 9000, spare tire, 22.200000762939453], " + + "+I[110, 14000, jacket, 0.5]]"; + + List result = execSelectSql(streamTableEnv, "select * from hoodie_sink", execMode); + + assertRowsEquals(result, expected); + } + + @ParameterizedTest + @MethodSource("indexAndPartitioningParams") + void testBulkInsert(String indexType, boolean hiveStylePartitioning) { + TableEnvironment tableEnv = batchTableEnv; + // csv source + String csvSourceDDL = TestConfigurations.getCsvSourceDDL("csv_source", "test_source_5.data"); + tableEnv.executeSql(csvSourceDDL); + + String hoodieTableDDL = sql("hoodie_sink") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, "bulk_insert") + .option(FlinkOptions.WRITE_BULK_INSERT_SHUFFLE_INPUT, true) + .option(FlinkOptions.INDEX_TYPE, indexType) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + String insertInto = "insert into 
hoodie_sink select * from csv_source"; + execInsertSql(tableEnv, insertInto); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from hoodie_sink").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT); + // apply filters + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from hoodie_sink where uuid > 'id5'").execute().collect()); + assertRowsEquals(result2, "[" + + "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); + } + + @Test + void testBulkInsertNonPartitionedTable() { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, "bulk_insert") + .noPartition() + .end(); + tableEnv.executeSql(hoodieTableDDL); + + final String insertInto1 = "insert into t1 values\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')"; + + execInsertSql(tableEnv, insertInto1); + + final String insertInto2 = "insert into t1 values\n" + + "('id1','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par2'),\n" + + "('id1','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par3'),\n" + + "('id1','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par4'),\n" + + "('id1','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par5')"; + + execInsertSql(tableEnv, insertInto2); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, "[" + + "+I[id1, Danny, 23, 1970-01-01T00:00:01, par1], " + + "+I[id1, Stephen, 33, 1970-01-01T00:00:02, par2], " + + "+I[id1, Julian, 53, 1970-01-01T00:00:03, par3], " + + "+I[id1, Fabian, 31, 1970-01-01T00:00:04, par4], " + + "+I[id1, Sophia, 18, 1970-01-01T00:00:05, par5]]", 3); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testAppendWrite(boolean clustering) { + TableEnvironment tableEnv = streamTableEnv; + // csv source + String sourceDDL = TestConfigurations.getFileSourceDDL("source"); + tableEnv.executeSql(sourceDDL); + + String hoodieTableDDL = sql("hoodie_sink") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, "insert") + .option(FlinkOptions.INSERT_CLUSTER, clustering) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + String insertInto = "insert into hoodie_sink select * from source"; + execInsertSql(tableEnv, insertInto); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from hoodie_sink").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT); + // apply filters + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from hoodie_sink where uuid > 'id5'").execute().collect()); + assertRowsEquals(result2, "[" + + "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); + } + + @ParameterizedTest + @MethodSource("executionModeAndPartitioningParams") + void testWriteAndReadWithTimestampPartitioning(ExecMode execMode, boolean hiveStylePartitioning) { + TableEnvironment tableEnv = execMode == ExecMode.BATCH ? 
batchTableEnv : streamTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) + .partitionField("ts") // use timestamp as partition path field + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT); + // apply filters + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1 where uuid > 'id5'").execute().collect()); + assertRowsEquals(result2, "[" + + "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); + } + + @Test + void testMergeOnReadCompactionWithTimestampPartitioning() { + TableEnvironment tableEnv = batchTableEnv; + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) + .option(FlinkOptions.COMPACTION_DELTA_COMMITS, 1) + .option(FlinkOptions.COMPACTION_TASKS, 1) + .partitionField("ts") + .end(); + tableEnv.executeSql(hoodieTableDDL); + execInsertSql(tableEnv, TestSQL.INSERT_T1); + + List rows = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + + assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT); + } + + @ParameterizedTest + @ValueSource(strings = {FlinkOptions.PARTITION_FORMAT_DAY, FlinkOptions.PARTITION_FORMAT_DASHED_DAY}) + void testWriteAndReadWithDatePartitioning(String partitionFormat) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .field("uuid varchar(20)") + .field("name varchar(10)") + .field("age int") + .field("ts date") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.PARTITION_FORMAT, partitionFormat) + .partitionField("ts") // use date as partition path field + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_DATE_PARTITION_T1); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + String expected = "[" + + "+I[id1, Danny, 23, 1970-01-01], " + + "+I[id2, Stephen, 33, 1970-01-01], " + + "+I[id3, Julian, 53, 1970-01-01], " + + "+I[id4, Fabian, 31, 1970-01-01], " + + "+I[id5, Sophia, 18, 1970-01-01], " + + "+I[id6, Emma, 20, 1970-01-01], " + + "+I[id7, Bob, 44, 1970-01-01], " + + "+I[id8, Han, 56, 1970-01-01]]"; + assertRowsEquals(result, expected); + } + + @ParameterizedTest + @ValueSource(strings = {"bulk_insert", "upsert"}) + void testWriteReadDecimals(String operation) { + TableEnvironment tableEnv = batchTableEnv; + String createTable = sql("decimals") + .field("f0 decimal(3, 2)") + .field("f1 decimal(10, 2)") + .field("f2 decimal(20, 2)") + .field("f3 decimal(38, 18)") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, operation) + .option(FlinkOptions.PRECOMBINE_FIELD, "f1") + .pkField("f0") + .noPartition() + .end(); + tableEnv.executeSql(createTable); + + String insertInto = "insert into decimals values\n" + + "(1.23, 12345678.12, 12345.12, 123456789.12345)"; + execInsertSql(tableEnv, insertInto); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from decimals").execute().collect()); 
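+ // f3 is declared as decimal(38, 18), so the value reads back padded to the full 18-digit scale, hence the trailing zeros below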
+ assertRowsEquals(result1, "[+I[1.23, 12345678.12, 12345.12, 123456789.123450000000000000]]"); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testIncrementalRead(HoodieTableType tableType) throws Exception { + TableEnvironment tableEnv = batchTableEnv; + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, tableType.name()); + + // write 3 batches of data set + TestData.writeData(TestData.dataSetInsert(1, 2), conf); + TestData.writeData(TestData.dataSetInsert(3, 4), conf); + TestData.writeData(TestData.dataSetInsert(5, 6), conf); + + String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath()); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_START_COMMIT, latestCommit) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, TestData.dataSetInsert(5, 6)); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testIncrementalReadArchivedCommits(HoodieTableType tableType) throws Exception { + TableEnvironment tableEnv = batchTableEnv; + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, tableType.name()); + conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, 3); + conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, 4); + conf.setInteger(FlinkOptions.CLEAN_RETAIN_COMMITS, 2); + conf.setString("hoodie.commits.archival.batch", "1"); + + // write 10 batches of data set + for (int i = 0; i < 20; i += 2) { + List dataset = TestData.dataSetInsert(i + 1, i + 2); + TestData.writeData(dataset, conf); + } + + String secondArchived = TestUtils.getNthArchivedInstant(tempFile.getAbsolutePath(), 1); + + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.READ_START_COMMIT, secondArchived) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result, TestData.dataSetInsert(3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testReadWithWiderSchema(HoodieTableType tableType) throws Exception { + TableEnvironment tableEnv = batchTableEnv; + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_NAME, "t1"); + conf.setString(FlinkOptions.TABLE_TYPE, tableType.name()); + + // write a batch of data set + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + String hoodieTableDDL = sql("t1") + .field("uuid varchar(20)") + .field("name varchar(10)") + .field("age int") + .field("salary double") + .field("ts timestamp(3)") + .field("`partition` varchar(10)") + .pkField("uuid") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final 
String expected = "[" + + "+I[id1, Danny, 23, null, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 33, null, 1970-01-01T00:00:00.002, par1], " + + "+I[id3, Julian, 53, null, 1970-01-01T00:00:00.003, par2], " + + "+I[id4, Fabian, 31, null, 1970-01-01T00:00:00.004, par2], " + + "+I[id5, Sophia, 18, null, 1970-01-01T00:00:00.005, par3], " + + "+I[id6, Emma, 20, null, 1970-01-01T00:00:00.006, par3], " + + "+I[id7, Bob, 44, null, 1970-01-01T00:00:00.007, par4], " + + "+I[id8, Han, 56, null, 1970-01-01T00:00:00.008, par4]]"; + assertRowsEquals(result, expected); + } + + @ParameterizedTest + @ValueSource(strings = {"insert", "upsert", "bulk_insert"}) + void testParquetComplexTypes(String operation) { + TableEnvironment tableEnv = batchTableEnv; + + String hoodieTableDDL = sql("t1") + .field("f_int int") + .field("f_array array") + .field("f_map map") + .field("f_row row(f_row_f0 int, f_row_f1 varchar(10))") + .pkField("f_int") + .noPartition() + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, operation) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.COMPLEX_TYPE_INSERT_T1); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[1, [abc1, def1], {abc1=1, def1=3}, +I[1, abc1]], " + + "+I[2, [abc2, def2], {def2=3, abc2=1}, +I[2, abc2]], " + + "+I[3, [abc3, def3], {def3=3, abc3=1}, +I[3, abc3]]]"; + assertRowsEquals(result, expected); + } + + @ParameterizedTest + @ValueSource(strings = {"insert", "upsert", "bulk_insert"}) + void testParquetComplexNestedRowTypes(String operation) { + TableEnvironment tableEnv = batchTableEnv; + + String hoodieTableDDL = sql("t1") + .field("f_int int") + .field("f_array array") + .field("int_array array") + .field("f_map map") + .field("f_row row(f_nested_array array, f_nested_row row(f_row_f0 int, f_row_f1 varchar(10)))") + .pkField("f_int") + .noPartition() + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, operation) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.COMPLEX_NESTED_ROW_TYPE_INSERT_T1); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[1, [abc1, def1], [1, 1], {abc1=1, def1=3}, +I[[abc1, def1], +I[1, abc1]]], " + + "+I[2, [abc2, def2], [2, 2], {def2=3, abc2=1}, +I[[abc2, def2], +I[2, abc2]]], " + + "+I[3, [abc3, def3], [3, 3], {def3=3, abc3=1}, +I[[abc3, def3], +I[3, abc3]]]]"; + assertRowsEquals(result, expected); + } + + @ParameterizedTest + @ValueSource(strings = {"insert", "upsert", "bulk_insert"}) + void testParquetNullChildColumnsRowTypes(String operation) { + TableEnvironment tableEnv = batchTableEnv; + + String hoodieTableDDL = sql("t1") + .field("f_int int") + .field("f_row row(f_row_f0 int, f_row_f1 varchar(10))") + .pkField("f_int") + .noPartition() + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.OPERATION, operation) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.NULL_CHILD_COLUMNS_ROW_TYPE_INSERT_T1); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[1, +I[null, abc1]], " + + "+I[2, +I[2, null]], " + + "+I[3, null]]"; + assertRowsEquals(result, expected); + } + + @ParameterizedTest + @ValueSource(strings = 
{"insert", "upsert", "bulk_insert"}) + void testBuiltinFunctionWithCatalog(String operation) { + TableEnvironment tableEnv = batchTableEnv; + + String hudiCatalogDDL = catalog("hudi_" + operation) + .catalogPath(tempFile.getAbsolutePath()) + .end(); + + tableEnv.executeSql(hudiCatalogDDL); + tableEnv.executeSql("use catalog " + ("hudi_" + operation)); + + String dbName = "hudi"; + tableEnv.executeSql("create database " + dbName); + tableEnv.executeSql("use " + dbName); + + String hoodieTableDDL = sql("t1") + .field("f_int int") + .field("f_date DATE") + .pkField("f_int") + .partitionField("f_int") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath() + "/" + dbName + "/" + operation) + .option(FlinkOptions.OPERATION, operation) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + String insertSql = "insert into t1 values (1, TO_DATE('2022-02-02')), (2, DATE '2022-02-02')"; + execInsertSql(tableEnv, insertSql); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[1, 2022-02-02], " + + "+I[2, 2022-02-02]]"; + assertRowsEquals(result, expected); + + List partitionResult = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1 where f_int = 1").execute().collect()); + assertRowsEquals(partitionResult, "[+I[1, 2022-02-02]]"); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testWriteAndReadWithDataSkipping(HoodieTableType tableType) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.METADATA_ENABLED, true) + .option("hoodie.metadata.index.column.stats.enable", true) + .option(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true) + .option(FlinkOptions.TABLE_TYPE, tableType) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_T1); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT); + // apply filters + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1 where uuid > 'id5' and age > 20").execute().collect()); + assertRowsEquals(result2, "[" + + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); + // filter by timestamp + List result3 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1 where ts > TIMESTAMP '1970-01-01 00:00:05'").execute().collect()); + assertRowsEquals(result3, "[" + + "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); + } + + @Test + void testBuiltinFunctionWithHMSCatalog() { + TableEnvironment tableEnv = batchTableEnv; + + HoodieHiveCatalog hoodieCatalog = HoodieCatalogTestUtils.createHiveCatalog("hudi_catalog"); + + tableEnv.registerCatalog("hudi_catalog", hoodieCatalog); + tableEnv.executeSql("use catalog hudi_catalog"); + + String dbName = "hudi"; + tableEnv.executeSql("create database " + dbName); + tableEnv.executeSql("use " + dbName); + + String hoodieTableDDL = sql("t1") + .field("f_int int") + .field("f_date DATE") + .field("f_par string") + .pkField("f_int") + .partitionField("f_par") + .option(FlinkOptions.RECORD_KEY_FIELD, "f_int") + .option(FlinkOptions.PRECOMBINE_FIELD, "f_date") + .end(); + tableEnv.executeSql(hoodieTableDDL); + + 
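+ // both the built-in TO_DATE function and the DATE literal should resolve to 2022-02-02 while the HMS-backed Hudi catalog is the current catalog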
String insertSql = "insert into t1 values (1, TO_DATE('2022-02-02'), '1'), (2, DATE '2022-02-02', '1')"; + execInsertSql(tableEnv, insertSql); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[1, 2022-02-02, 1], " + + "+I[2, 2022-02-02, 1]]"; + assertRowsEquals(result, expected); + + List partitionResult = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1 where f_int = 1").execute().collect()); + assertRowsEquals(partitionResult, "[+I[1, 2022-02-02, 1]]"); + } + + @Test + void testWriteReadWithComputedColumns() { + TableEnvironment tableEnv = batchTableEnv; + String createTable = sql("t1") + .field("f0 int") + .field("f1 varchar(10)") + .field("f2 bigint") + .field("f3 as f0 + f2") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.PRECOMBINE_FIELD, "f1") + .pkField("f0") + .noPartition() + .end(); + tableEnv.executeSql(createTable); + + String insertInto = "insert into t1 values\n" + + "(1, 'abc', 2)"; + execInsertSql(tableEnv, insertInto); + + List result1 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + assertRowsEquals(result1, "[+I[1, abc, 2, 3]]"); + + List result2 = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select f3 from t1").execute().collect()); + assertRowsEquals(result2, "[+I[3]]"); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testWriteReadWithLocalTimestamp(HoodieTableType tableType) { + TableEnvironment tableEnv = batchTableEnv; + tableEnv.getConfig().setLocalTimeZone(ZoneId.of("Asia/Shanghai")); + String createTable = sql("t1") + .field("f0 int") + .field("f1 varchar(10)") + .field("f2 TIMESTAMP_LTZ(3)") + .field("f4 TIMESTAMP_LTZ(6)") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.PRECOMBINE_FIELD, "f1") + .option(FlinkOptions.TABLE_TYPE, tableType) + .pkField("f0") + .noPartition() + .end(); + tableEnv.executeSql(createTable); + + String insertInto = "insert into t1 values\n" + + "(1, 'abc', TIMESTAMP '1970-01-01 08:00:01', TIMESTAMP '1970-01-01 08:00:02'),\n" + + "(2, 'def', TIMESTAMP '1970-01-01 08:00:03', TIMESTAMP '1970-01-01 08:00:04')"; + execInsertSql(tableEnv, insertInto); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[1, abc, 1970-01-01T00:00:01Z, 1970-01-01T00:00:02Z], " + + "+I[2, def, 1970-01-01T00:00:03Z, 1970-01-01T00:00:04Z]]"; + assertRowsEquals(result, expected); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + private enum ExecMode { + BATCH, STREAM + } + + /** + * Return test params => (execution mode, table type). + */ + private static Stream executionModeAndTableTypeParams() { + Object[][] data = + new Object[][] { + {ExecMode.BATCH, HoodieTableType.MERGE_ON_READ}, + {ExecMode.BATCH, HoodieTableType.COPY_ON_WRITE}, + {ExecMode.STREAM, HoodieTableType.MERGE_ON_READ}, + {ExecMode.STREAM, HoodieTableType.COPY_ON_WRITE}}; + return Stream.of(data).map(Arguments::of); + } + + /** + * Return test params => (execution mode, hive style partitioning). 
+ */ + private static Stream executionModeAndPartitioningParams() { + Object[][] data = + new Object[][] { + {ExecMode.BATCH, false}, + {ExecMode.BATCH, true}, + {ExecMode.STREAM, false}, + {ExecMode.STREAM, true}}; + return Stream.of(data).map(Arguments::of); + } + + /** + * Return test params => (HoodieTableType, hive style partitioning). + */ + private static Stream tableTypeAndPartitioningParams() { + Object[][] data = + new Object[][] { + {HoodieTableType.COPY_ON_WRITE, false}, + {HoodieTableType.COPY_ON_WRITE, true}, + {HoodieTableType.MERGE_ON_READ, false}, + {HoodieTableType.MERGE_ON_READ, true}}; + return Stream.of(data).map(Arguments::of); + } + + /** + * Return test params => (index type, hive style partitioning). + */ + private static Stream indexAndPartitioningParams() { + Object[][] data = + new Object[][] { + {"FLINK_STATE", false}, + {"FLINK_STATE", true}, + {"BUCKET", false}, + {"BUCKET", true}}; + return Stream.of(data).map(Arguments::of); + } + + private void execInsertSql(TableEnvironment tEnv, String insert) { + TableResult tableResult = tEnv.executeSql(insert); + // wait to finish + try { + tableResult.await(); + } catch (InterruptedException | ExecutionException ex) { + // ignored + } + } + + private List execSelectSql(TableEnvironment tEnv, String select, ExecMode execMode) + throws TableNotExistException, InterruptedException { + final String[] splits = select.split(" "); + final String tableName = splits[splits.length - 1]; + switch (execMode) { + case STREAM: + return execSelectSql(tEnv, select, 10, tableName); + case BATCH: + return CollectionUtil.iterableToList( + () -> tEnv.sqlQuery("select * from " + tableName).execute().collect()); + default: + throw new AssertionError(); + } + } + + private List execSelectSql(TableEnvironment tEnv, String select, long timeout) + throws InterruptedException, TableNotExistException { + return execSelectSql(tEnv, select, timeout, null); + } + + private List execSelectSql(TableEnvironment tEnv, String select, long timeout, String sourceTable) + throws InterruptedException, TableNotExistException { + final String sinkDDL; + if (sourceTable != null) { + // use the source table schema as the sink schema if the source table was specified. 
+ ObjectPath objectPath = new ObjectPath(tEnv.getCurrentDatabase(), sourceTable); + TableSchema schema = tEnv.getCatalog(tEnv.getCurrentCatalog()).get().getTable(objectPath).getSchema(); + sinkDDL = TestConfigurations.getCollectSinkDDL("sink", schema); + } else { + sinkDDL = TestConfigurations.getCollectSinkDDL("sink"); + } + return execSelectSql(tEnv, select, sinkDDL, timeout); + } + + private List execSelectSql(TableEnvironment tEnv, String select, String sinkDDL, long timeout) + throws InterruptedException { + tEnv.executeSql("DROP TABLE IF EXISTS sink"); + tEnv.executeSql(sinkDDL); + TableResult tableResult = tEnv.executeSql("insert into sink " + select); + // wait for the timeout then cancels the job + TimeUnit.SECONDS.sleep(timeout); + tableResult.getJobClient().ifPresent(JobClient::cancel); + tEnv.executeSql("DROP TABLE IF EXISTS sink"); + return CollectSinkTableFactory.RESULT.values().stream() + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java new file mode 100644 index 0000000000000..f7a35e57f2b09 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java @@ -0,0 +1,581 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table; + +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; +import org.apache.hudi.common.model.EventTimeAvroPayload; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.ComplexAvroKeyGenerator; +import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; +import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.SchemaBuilder; +import org.apache.hudi.utils.TestConfigurations; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.factories.DynamicTableFactory; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * Test cases for {@link HoodieTableFactory}. 
+ */ +public class TestHoodieTableFactory { + private static final String AVRO_SCHEMA_FILE_PATH = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_read_schema.avsc")).toString(); + private static final String INFERRED_SCHEMA = "{\"type\":\"record\"," + + "\"name\":\"record\"," + + "\"fields\":[" + + "{\"name\":\"uuid\",\"type\":[\"null\",\"string\"],\"default\":null}," + + "{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null}," + + "{\"name\":\"age\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}],\"default\":null}," + + "{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null}]}"; + + private Configuration conf; + + @TempDir + File tempFile; + + @BeforeEach + void beforeEach() throws IOException { + this.conf = new Configuration(); + this.conf.setString(FlinkOptions.PATH, tempFile.getAbsolutePath()); + this.conf.setString(FlinkOptions.TABLE_NAME, "t1"); + StreamerUtil.initTableIfNotExists(this.conf); + } + + @Test + void testRequiredOptionsForSource() { + // miss pk and precombine key will throw exception + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .build(); + final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext1)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext1)); + + // a non-exists precombine key will throw exception + ResolvedSchema schema2 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .build(); + this.conf.setString(FlinkOptions.PRECOMBINE_FIELD, "non_exist_field"); + final MockContext sourceContext2 = MockContext.getInstance(this.conf, schema2, "f2"); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext2)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext2)); + this.conf.setString(FlinkOptions.PRECOMBINE_FIELD, FlinkOptions.PRECOMBINE_FIELD.defaultValue()); + + // given the pk but miss the pre combine key will be ok + ResolvedSchema schema3 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + final MockContext sourceContext3 = MockContext.getInstance(this.conf, schema3, "f2"); + HoodieTableSource tableSource = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext3); + HoodieTableSink tableSink = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sourceContext3); + // the precombine field is overwritten + assertThat(tableSource.getConf().getString(FlinkOptions.PRECOMBINE_FIELD), is(FlinkOptions.NO_PRE_COMBINE)); + assertThat(tableSink.getConf().getString(FlinkOptions.PRECOMBINE_FIELD), is(FlinkOptions.NO_PRE_COMBINE)); + // precombine field not specified, use the default payload clazz + assertThat(tableSource.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(FlinkOptions.PAYLOAD_CLASS_NAME.defaultValue())); + 
assertThat(tableSink.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(FlinkOptions.PAYLOAD_CLASS_NAME.defaultValue())); + + // given pk but miss the pre combine key with DefaultHoodieRecordPayload should throw + this.conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, DefaultHoodieRecordPayload.class.getName()); + final MockContext sourceContext4 = MockContext.getInstance(this.conf, schema3, "f2"); + + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(sourceContext4)); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sourceContext4)); + this.conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, FlinkOptions.PAYLOAD_CLASS_NAME.defaultValue()); + + // given pk and pre combine key will be ok + ResolvedSchema schema4 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + final MockContext sourceContext5 = MockContext.getInstance(this.conf, schema4, "f2"); + + assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSource(sourceContext5)); + assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSink(sourceContext5)); + // precombine field specified(default ts), use DefaultHoodieRecordPayload as payload clazz + HoodieTableSource tableSource5 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext5); + HoodieTableSink tableSink5 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sourceContext5); + assertThat(tableSource5.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(EventTimeAvroPayload.class.getName())); + assertThat(tableSink5.getConf().getString(FlinkOptions.PAYLOAD_CLASS_NAME), is(EventTimeAvroPayload.class.getName())); + + // given pk and set pre combine key to no_precombine will be ok + ResolvedSchema schema5 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + this.conf.setString(FlinkOptions.PRECOMBINE_FIELD, FlinkOptions.NO_PRE_COMBINE); + final MockContext sourceContext6 = MockContext.getInstance(this.conf, schema5, "f2"); + + assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSource(sourceContext6)); + assertDoesNotThrow(() -> new HoodieTableFactory().createDynamicTableSink(sourceContext6)); + } + + @Test + void testInferAvroSchemaForSource() { + // infer the schema if not specified + final HoodieTableSource tableSource1 = + (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(MockContext.getInstance(this.conf)); + final Configuration conf1 = tableSource1.getConf(); + assertThat(conf1.get(FlinkOptions.SOURCE_AVRO_SCHEMA), is(INFERRED_SCHEMA)); + + // set up the explicit schema using the file path + this.conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH, AVRO_SCHEMA_FILE_PATH); + HoodieTableSource tableSource2 = + (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(MockContext.getInstance(this.conf)); + Configuration conf2 = tableSource2.getConf(); + assertNull(conf2.get(FlinkOptions.SOURCE_AVRO_SCHEMA), "expect schema string as null"); + } + + @Test + void testSetupHoodieKeyOptionsForSource() { + this.conf.setString(FlinkOptions.RECORD_KEY_FIELD, "dummyField"); + this.conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, 
"dummyKeyGenClass"); + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.BIGINT()) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource1 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext1); + final Configuration conf1 = tableSource1.getConf(); + assertThat(conf1.get(FlinkOptions.RECORD_KEY_FIELD), is("f0")); + assertThat(conf1.get(FlinkOptions.KEYGEN_CLASS_NAME), is("dummyKeyGenClass")); + + // definition with complex primary keys and partition paths + this.conf.removeConfig(FlinkOptions.KEYGEN_CLASS_NAME); + ResolvedSchema schema2 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20).notNull()) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0", "f1") + .build(); + final MockContext sourceContext2 = MockContext.getInstance(this.conf, schema2, "f2"); + final HoodieTableSource tableSource2 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext2); + final Configuration conf2 = tableSource2.getConf(); + assertThat(conf2.get(FlinkOptions.RECORD_KEY_FIELD), is("f0,f1")); + assertThat(conf2.get(FlinkOptions.KEYGEN_CLASS_NAME), is(ComplexAvroKeyGenerator.class.getName())); + + // definition with complex primary keys and empty partition paths + this.conf.removeConfig(FlinkOptions.KEYGEN_CLASS_NAME); + final MockContext sourceContext3 = MockContext.getInstance(this.conf, schema2, ""); + final HoodieTableSource tableSource3 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext3); + final Configuration conf3 = tableSource3.getConf(); + assertThat(conf3.get(FlinkOptions.RECORD_KEY_FIELD), is("f0,f1")); + assertThat(conf3.get(FlinkOptions.KEYGEN_CLASS_NAME), is(NonpartitionedAvroKeyGenerator.class.getName())); + } + + @Test + void testSetupHiveOptionsForSource() { + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + + final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource1 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext1); + final Configuration conf1 = tableSource1.getConf(); + assertThat(conf1.getString(FlinkOptions.HIVE_SYNC_DB), is("db1")); + assertThat(conf1.getString(FlinkOptions.HIVE_SYNC_TABLE), is("t1")); + assertThat(conf1.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME), is(MultiPartKeysValueExtractor.class.getName())); + + // set up hive style partitioning is true. 
+ this.conf.setString(FlinkOptions.HIVE_SYNC_DB, "db2"); + this.conf.setString(FlinkOptions.HIVE_SYNC_TABLE, "t2"); + this.conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, true); + + final MockContext sourceContext2 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource2 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext2); + final Configuration conf2 = tableSource2.getConf(); + assertThat(conf2.getString(FlinkOptions.HIVE_SYNC_DB), is("db2")); + assertThat(conf2.getString(FlinkOptions.HIVE_SYNC_TABLE), is("t2")); + assertThat(conf2.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME), is(MultiPartKeysValueExtractor.class.getName())); + } + + @Test + void testSetupCleaningOptionsForSource() { + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + // set up new retains commits that is less than min archive commits + this.conf.setString(FlinkOptions.CLEAN_RETAIN_COMMITS.key(), "11"); + + final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource1 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext1); + final Configuration conf1 = tableSource1.getConf(); + assertThat(conf1.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS), is(FlinkOptions.ARCHIVE_MIN_COMMITS.defaultValue())); + assertThat(conf1.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS), is(FlinkOptions.ARCHIVE_MAX_COMMITS.defaultValue())); + + // set up new retains commits that is greater than min archive commits + final int retainCommits = FlinkOptions.ARCHIVE_MIN_COMMITS.defaultValue() + 5; + this.conf.setInteger(FlinkOptions.CLEAN_RETAIN_COMMITS.key(), retainCommits); + + final MockContext sourceContext2 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource2 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext2); + final Configuration conf2 = tableSource2.getConf(); + assertThat(conf2.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS), is(retainCommits + 10)); + assertThat(conf2.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS), is(retainCommits + 20)); + } + + @Test + void testSetupReadOptionsForSource() { + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + // set up new retains commits that is less than min archive commits + this.conf.setString(FlinkOptions.READ_END_COMMIT, "123"); + + final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource1 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext1); + final Configuration conf1 = tableSource1.getConf(); + assertThat(conf1.getString(FlinkOptions.QUERY_TYPE), is(FlinkOptions.QUERY_TYPE_INCREMENTAL)); + + this.conf.removeConfig(FlinkOptions.READ_END_COMMIT); + this.conf.setString(FlinkOptions.READ_START_COMMIT, "123"); + final MockContext sourceContext2 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSource tableSource2 = (HoodieTableSource) 
new HoodieTableFactory().createDynamicTableSource(sourceContext2); + final Configuration conf2 = tableSource2.getConf(); + assertThat(conf2.getString(FlinkOptions.QUERY_TYPE), is(FlinkOptions.QUERY_TYPE_INCREMENTAL)); + } + + @Test + void testBucketIndexOptionForSink() { + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20).notNull()) + .field("f2", DataTypes.TIMESTAMP(3)) + .primaryKey("f0", "f1") + .build(); + + this.conf.setString(FlinkOptions.INDEX_TYPE, HoodieIndex.IndexType.BUCKET.name()); + + // default use recordKey fields + final MockContext context = MockContext.getInstance(this.conf, schema1, "f2"); + HoodieTableSink tableSink = (HoodieTableSink) (new HoodieTableFactory().createDynamicTableSink(context)); + final Configuration conf = tableSink.getConf(); + assertThat(conf.getString(FlinkOptions.INDEX_KEY_FIELD), is("f0,f1")); + + this.conf.setString(FlinkOptions.INDEX_KEY_FIELD, "f0"); + final MockContext context2 = MockContext.getInstance(this.conf, schema1, "f2"); + HoodieTableSink tableSink2 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(context2); + final Configuration conf2 = tableSink2.getConf(); + assertThat(conf2.getString(FlinkOptions.INDEX_KEY_FIELD), is("f0")); + + this.conf.setString(FlinkOptions.INDEX_KEY_FIELD, "f1"); + final MockContext context3 = MockContext.getInstance(this.conf, schema1, "f2"); + HoodieTableSink tableSink3 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(context3); + final Configuration conf3 = tableSink3.getConf(); + assertThat(conf3.getString(FlinkOptions.INDEX_KEY_FIELD), is("f1")); + + this.conf.setString(FlinkOptions.INDEX_KEY_FIELD, "f0,f1"); + final MockContext context4 = MockContext.getInstance(this.conf, schema1, "f2"); + HoodieTableSink tableSink4 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(context4); + final Configuration conf4 = tableSink4.getConf(); + assertThat(conf4.getString(FlinkOptions.INDEX_KEY_FIELD), is("f0,f1")); + + // index key field is not a subset of or equal to the recordKey fields, will throw exception + this.conf.setString(FlinkOptions.INDEX_KEY_FIELD, "f2"); + final MockContext context5 = MockContext.getInstance(this.conf, schema1, "f2"); + assertThrows(HoodieValidationException.class, () -> new HoodieTableFactory().createDynamicTableSource(context5)); + } + + @Test + void testInferAvroSchemaForSink() { + // infer the schema if not specified + final HoodieTableSink tableSink1 = + (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(MockContext.getInstance(this.conf)); + final Configuration conf1 = tableSink1.getConf(); + assertThat(conf1.get(FlinkOptions.SOURCE_AVRO_SCHEMA), is(INFERRED_SCHEMA)); + + // set up the explicit schema using the file path + this.conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH, AVRO_SCHEMA_FILE_PATH); + HoodieTableSink tableSink2 = + (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(MockContext.getInstance(this.conf)); + Configuration conf2 = tableSink2.getConf(); + assertNull(conf2.get(FlinkOptions.SOURCE_AVRO_SCHEMA), "expect schema string as null"); + } + + @Test + void testSetupHoodieKeyOptionsForSink() { + this.conf.setString(FlinkOptions.RECORD_KEY_FIELD, "dummyField"); + this.conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, "dummyKeyGenClass"); + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", 
DataTypes.VARCHAR(20)) + .field("f2", DataTypes.BIGINT()) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + final MockContext sinkContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSink tableSink1 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext1); + final Configuration conf1 = tableSink1.getConf(); + assertThat(conf1.get(FlinkOptions.RECORD_KEY_FIELD), is("f0")); + assertThat(conf1.get(FlinkOptions.KEYGEN_CLASS_NAME), is("dummyKeyGenClass")); + + // definition with complex primary keys and partition paths + this.conf.removeConfig(FlinkOptions.KEYGEN_CLASS_NAME); + ResolvedSchema schema2 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20).notNull()) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0", "f1") + .build(); + final MockContext sinkContext2 = MockContext.getInstance(this.conf, schema2, "f2"); + final HoodieTableSink tableSink2 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext2); + final Configuration conf2 = tableSink2.getConf(); + assertThat(conf2.get(FlinkOptions.RECORD_KEY_FIELD), is("f0,f1")); + assertThat(conf2.get(FlinkOptions.KEYGEN_CLASS_NAME), is(ComplexAvroKeyGenerator.class.getName())); + + // definition with complex primary keys and empty partition paths + this.conf.removeConfig(FlinkOptions.KEYGEN_CLASS_NAME); + final MockContext sinkContext3 = MockContext.getInstance(this.conf, schema2, ""); + final HoodieTableSink tableSink3 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext3); + final Configuration conf3 = tableSink3.getConf(); + assertThat(conf3.get(FlinkOptions.RECORD_KEY_FIELD), is("f0,f1")); + assertThat(conf3.get(FlinkOptions.KEYGEN_CLASS_NAME), is(NonpartitionedAvroKeyGenerator.class.getName())); + + // definition of bucket index + this.conf.setString(FlinkOptions.INDEX_TYPE, HoodieIndex.IndexType.BUCKET.name()); + final MockContext sinkContext4 = MockContext.getInstance(this.conf, schema2, ""); + final HoodieTableSink tableSink4 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext4); + final Configuration conf4 = tableSink4.getConf(); + assertThat(conf4.get(FlinkOptions.RECORD_KEY_FIELD), is("f0,f1")); + assertThat(conf4.get(FlinkOptions.INDEX_KEY_FIELD), is("f0,f1")); + assertThat(conf4.get(FlinkOptions.INDEX_TYPE), is(HoodieIndex.IndexType.BUCKET.name())); + assertThat(conf4.get(FlinkOptions.KEYGEN_CLASS_NAME), is(NonpartitionedAvroKeyGenerator.class.getName())); + } + + @Test + void testSetupHiveOptionsForSink() { + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + + final MockContext sinkContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSink tableSink1 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext1); + final Configuration conf1 = tableSink1.getConf(); + assertThat(conf1.getString(FlinkOptions.HIVE_SYNC_DB), is("db1")); + assertThat(conf1.getString(FlinkOptions.HIVE_SYNC_TABLE), is("t1")); + assertThat(conf1.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME), is(MultiPartKeysValueExtractor.class.getName())); + + // set up hive style partitioning is true. 
+ this.conf.setString(FlinkOptions.HIVE_SYNC_DB, "db2"); + this.conf.setString(FlinkOptions.HIVE_SYNC_TABLE, "t2"); + this.conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, true); + + final MockContext sinkContext2 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSink tableSink2 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext2); + final Configuration conf2 = tableSink2.getConf(); + assertThat(conf2.getString(FlinkOptions.HIVE_SYNC_DB), is("db2")); + assertThat(conf2.getString(FlinkOptions.HIVE_SYNC_TABLE), is("t2")); + assertThat(conf2.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME), is(MultiPartKeysValueExtractor.class.getName())); + } + + @Test + void testSetupCleaningOptionsForSink() { + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + // set up new retains commits that is less than min archive commits + this.conf.setString(FlinkOptions.CLEAN_RETAIN_COMMITS.key(), "11"); + + final MockContext sinkContext1 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSink tableSink1 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext1); + final Configuration conf1 = tableSink1.getConf(); + assertThat(conf1.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS), is(FlinkOptions.ARCHIVE_MIN_COMMITS.defaultValue())); + assertThat(conf1.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS), is(FlinkOptions.ARCHIVE_MAX_COMMITS.defaultValue())); + + // set up new retains commits that is greater than min archive commits + final int retainCommits = FlinkOptions.ARCHIVE_MIN_COMMITS.defaultValue() + 5; + this.conf.setInteger(FlinkOptions.CLEAN_RETAIN_COMMITS.key(), retainCommits); + + final MockContext sinkContext2 = MockContext.getInstance(this.conf, schema1, "f2"); + final HoodieTableSink tableSink2 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext2); + final Configuration conf2 = tableSink2.getConf(); + assertThat(conf2.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS), is(retainCommits + 10)); + assertThat(conf2.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS), is(retainCommits + 20)); + } + + @Test + void testSetupTimestampBasedKeyGenForSink() { + this.conf.setString(FlinkOptions.RECORD_KEY_FIELD, "dummyField"); + // definition with simple primary key and partition path + ResolvedSchema schema1 = SchemaBuilder.instance() + .field("f0", DataTypes.INT().notNull()) + .field("f1", DataTypes.VARCHAR(20)) + .field("f2", DataTypes.TIMESTAMP(3)) + .field("ts", DataTypes.TIMESTAMP(3)) + .primaryKey("f0") + .build(); + final MockContext sourceContext1 = MockContext.getInstance(this.conf, schema1, "ts"); + final HoodieTableSource tableSource1 = (HoodieTableSource) new HoodieTableFactory().createDynamicTableSource(sourceContext1); + final Configuration conf1 = tableSource1.getConf(); + assertThat(conf1.get(FlinkOptions.RECORD_KEY_FIELD), is("f0")); + assertThat(conf1.get(FlinkOptions.KEYGEN_CLASS_NAME), is(TimestampBasedAvroKeyGenerator.class.getName())); + assertThat(conf1.getString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, "dummy"), + is("EPOCHMILLISECONDS")); + assertThat(conf1.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "dummy"), + is(FlinkOptions.PARTITION_FORMAT_HOUR)); + 
    assertThat(conf1.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "dummy"),
+        is("UTC"));
+  }
+
+  @Test
+  void testSetupWriteOptionsForSink() {
+    final HoodieTableSink tableSink1 =
+        (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(MockContext.getInstance(this.conf));
+    final Configuration conf1 = tableSink1.getConf();
+    assertThat(conf1.get(FlinkOptions.PRE_COMBINE), is(true));
+
+    // set up operation as 'insert'
+    this.conf.setString(FlinkOptions.OPERATION, "insert");
+    HoodieTableSink tableSink2 =
+        (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(MockContext.getInstance(this.conf));
+    Configuration conf2 = tableSink2.getConf();
+    assertThat(conf2.get(FlinkOptions.PRE_COMBINE), is(false));
+  }
+
+  // -------------------------------------------------------------------------
+  // Inner Class
+  // -------------------------------------------------------------------------
+
+  /**
+   * Mock dynamic table factory context.
+   */
+  private static class MockContext implements DynamicTableFactory.Context {
+    private final Configuration conf;
+    private final ResolvedSchema schema;
+    private final List<String> partitions;
+
+    private MockContext(Configuration conf, ResolvedSchema schema, List<String> partitions) {
+      this.conf = conf;
+      this.schema = schema;
+      this.partitions = partitions;
+    }
+
+    static MockContext getInstance(Configuration conf) {
+      return getInstance(conf, TestConfigurations.TABLE_SCHEMA, Collections.singletonList("partition"));
+    }
+
+    static MockContext getInstance(Configuration conf, ResolvedSchema schema, String partition) {
+      return getInstance(conf, schema, Collections.singletonList(partition));
+    }
+
+    static MockContext getInstance(Configuration conf, ResolvedSchema schema, List<String> partitions) {
+      return new MockContext(conf, schema, partitions);
+    }
+
+    @Override
+    public ObjectIdentifier getObjectIdentifier() {
+      return ObjectIdentifier.of("hudi", "db1", "t1");
+    }
+
+    @Override
+    public ResolvedCatalogTable getCatalogTable() {
+      CatalogTable catalogTable = CatalogTable.of(Schema.newBuilder().fromResolvedSchema(schema).build(),
+          "mock source table", partitions, conf.toMap());
+      return new ResolvedCatalogTable(catalogTable, schema);
+    }
+
+    @Override
+    public ReadableConfig getConfiguration() {
+      return conf;
+    }
+
+    @Override
+    public ClassLoader getClassLoader() {
+      return null;
+    }
+
+    @Override
+    public boolean isTemporary() {
+      return false;
+    }
+  }
+}
diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableSource.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableSource.java
new file mode 100644
index 0000000000000..10a7e44373573
--- /dev/null
+++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableSource.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; + +import org.apache.avro.Schema; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.function.ThrowingSupplier; +import org.junit.jupiter.api.io.TempDir; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.instanceOf; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.core.Is.is; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; + +/** + * Test cases for HoodieTableSource. 
+ */
+public class TestHoodieTableSource {
+  private static final Logger LOG = LoggerFactory.getLogger(TestHoodieTableSource.class);
+
+  private Configuration conf;
+
+  @TempDir
+  File tempFile;
+
+  void beforeEach() throws Exception {
+    final String path = tempFile.getAbsolutePath();
+    conf = TestConfigurations.getDefaultConf(path);
+    TestData.writeData(TestData.DATA_SET_INSERT, conf);
+  }
+
+  @Test
+  void testGetReadPaths() throws Exception {
+    beforeEach();
+    HoodieTableSource tableSource = new HoodieTableSource(
+        TestConfigurations.TABLE_SCHEMA,
+        new Path(tempFile.getPath()),
+        Arrays.asList(conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",")),
+        "default-par",
+        conf);
+    FileStatus[] fileStatuses = tableSource.getReadFiles();
+    assertNotNull(fileStatuses);
+    assertThat(fileStatuses.length, is(4));
+    // apply partition pruning
+    Map<String, String> partitions = new HashMap<>();
+    partitions.put("partition", "par1");
+
+    tableSource.applyPartitions(Collections.singletonList(partitions));
+
+    FileStatus[] fileStatuses2 = tableSource.getReadFiles();
+    assertNotNull(fileStatuses2);
+    assertThat(fileStatuses2.length, is(1));
+  }
+
+  @Test
+  void testGetInputFormat() throws Exception {
+    beforeEach();
+    // write some data to let the TableSchemaResolver get the right instant
+    TestData.writeData(TestData.DATA_SET_INSERT, conf);
+
+    HoodieTableSource tableSource = new HoodieTableSource(
+        TestConfigurations.TABLE_SCHEMA,
+        new Path(tempFile.getPath()),
+        Arrays.asList(conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",")),
+        "default-par",
+        conf);
+    InputFormat<RowData, ?> inputFormat = tableSource.getInputFormat();
+    assertThat(inputFormat, is(instanceOf(FileInputFormat.class)));
+    conf.setString(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
+    inputFormat = tableSource.getInputFormat();
+    assertThat(inputFormat, is(instanceOf(MergeOnReadInputFormat.class)));
+    conf.setString(FlinkOptions.QUERY_TYPE.key(), FlinkOptions.QUERY_TYPE_INCREMENTAL);
+    assertDoesNotThrow(
+        (ThrowingSupplier<InputFormat<RowData, ?>>) tableSource::getInputFormat,
+        "Query type: 'incremental' should be supported");
+  }
+
+  @Test
+  void testGetTableAvroSchema() {
+    HoodieTableSource tableSource = getEmptyStreamingSource();
+    assertNull(tableSource.getMetaClient(), "Streaming source with empty table path is allowed");
+    final String schemaFields = tableSource.getTableAvroSchema().getFields().stream()
+        .map(Schema.Field::name)
+        .collect(Collectors.joining(","));
+    final String expected = "_hoodie_commit_time,"
+        + "_hoodie_commit_seqno,"
+        + "_hoodie_record_key,"
+        + "_hoodie_partition_path,"
+        + "_hoodie_file_name,"
+        + "uuid,name,age,ts,partition";
+    assertThat(schemaFields, is(expected));
+  }
+
+  @Test
+  void testDataSkippingFilterShouldBeNotNullWhenTableSourceIsCopied() {
+    HoodieTableSource tableSource = getEmptyStreamingSource();
+    ResolvedExpression mockExpression = new CallExpression(
+        BuiltInFunctionDefinitions.IN,
+        Collections.emptyList(),
+        TestConfigurations.ROW_DATA_TYPE);
+    List<ResolvedExpression> expectedFilters = Collections.singletonList(mockExpression);
+    tableSource.applyFilters(expectedFilters);
+    HoodieTableSource copiedSource = (HoodieTableSource) tableSource.copy();
+    List<ResolvedExpression> actualFilters = copiedSource.getFileIndex().getFilters();
+    assertEquals(expectedFilters, actualFilters);
+  }
+
+  @Test
+  void testHoodieSourceCachedMetaClient() {
+    HoodieTableSource tableSource = getEmptyStreamingSource();
+    HoodieTableMetaClient metaClient = tableSource.getMetaClient();
+    HoodieTableSource tableSourceCopy = (HoodieTableSource)
tableSource.copy(); + assertThat(metaClient, is(tableSourceCopy.getMetaClient())); + } + + private HoodieTableSource getEmptyStreamingSource() { + final String path = tempFile.getAbsolutePath(); + conf = TestConfigurations.getDefaultConf(path); + conf.setBoolean(FlinkOptions.READ_AS_STREAMING, true); + + return new HoodieTableSource( + TestConfigurations.TABLE_SCHEMA, + new Path(tempFile.getPath()), + Arrays.asList(conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",")), + "default-par", + conf); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/HoodieCatalogTestUtils.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/HoodieCatalogTestUtils.java new file mode 100644 index 0000000000000..c98b4ac0da297 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/HoodieCatalogTestUtils.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.hadoop.hive.conf.HiveConf; + +import java.io.IOException; + +/** + * Test utils for Hoodie catalog. + */ +public class HoodieCatalogTestUtils { + private static final String HIVE_WAREHOUSE_URI_FORMAT = + "jdbc:derby:;databaseName=%s;create=true"; + + private static final String TEST_CATALOG_NAME = "test_catalog"; + + private static final org.junit.rules.TemporaryFolder TEMPORARY_FOLDER = new org.junit.rules.TemporaryFolder(); + + /** + * Create a HiveCatalog with an embedded Hive Metastore. 
+ */ + public static HoodieHiveCatalog createHiveCatalog() { + return createHiveCatalog(TEST_CATALOG_NAME); + } + + public static HoodieHiveCatalog createHiveCatalog(String name) { + return createHiveCatalog(name, false); + } + + public static HoodieHiveCatalog createHiveCatalog(String name, boolean external) { + Configuration options = new Configuration(); + options.setBoolean(CatalogOptions.TABLE_EXTERNAL, external); + return new HoodieHiveCatalog( + name, + options, + createHiveConf(), + true); + } + + public static HiveConf createHiveConf() { + ClassLoader classLoader = HoodieCatalogTestUtils.class.getClassLoader(); + try { + TEMPORARY_FOLDER.create(); + String warehouseDir = TEMPORARY_FOLDER.newFolder().getAbsolutePath() + "/metastore_db"; + String warehouseUri = String.format(HIVE_WAREHOUSE_URI_FORMAT, warehouseDir); + + HiveConf.setHiveSiteLocation(classLoader.getResource(CatalogOptions.HIVE_SITE_FILE)); + HiveConf hiveConf = new HiveConf(); + hiveConf.setVar( + HiveConf.ConfVars.METASTOREWAREHOUSE, + TEMPORARY_FOLDER.newFolder("hive_warehouse").getAbsolutePath()); + hiveConf.setVar(HiveConf.ConfVars.METASTORECONNECTURLKEY, warehouseUri); + return hiveConf; + } catch (IOException e) { + throw new CatalogException("Failed to create test HiveConf to HiveCatalog.", e); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalog.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalog.java new file mode 100644 index 0000000000000..8e23ef9d63bcb --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalog.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.configuration.FlinkOptions; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.table.catalog.CatalogOptions.CATALOG_PATH; +import static org.apache.hudi.table.catalog.CatalogOptions.DEFAULT_DATABASE; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link HoodieCatalog}. + */ +public class TestHoodieCatalog { + + private static final String TEST_DEFAULT_DATABASE = "test_db"; + private static final String NONE_EXIST_DATABASE = "none_exist_database"; + private static final List CREATE_COLUMNS = Arrays.asList( + Column.physical("uuid", DataTypes.VARCHAR(20)), + Column.physical("name", DataTypes.VARCHAR(20)), + Column.physical("age", DataTypes.INT()), + Column.physical("tss", DataTypes.TIMESTAMP(3)), + Column.physical("partition", DataTypes.VARCHAR(10)) + ); + private static final UniqueConstraint CONSTRAINTS = UniqueConstraint.primaryKey("uuid", Arrays.asList("uuid")); + private static final ResolvedSchema CREATE_TABLE_SCHEMA = + new ResolvedSchema( + CREATE_COLUMNS, + Collections.emptyList(), + CONSTRAINTS); + + private static final List EXPECTED_TABLE_COLUMNS = + CREATE_COLUMNS.stream() + .map( + col -> { + // Flink char/varchar is transform to string in avro. 
+                if (col.getDataType()
+                    .getLogicalType()
+                    .getTypeRoot()
+                    .equals(LogicalTypeRoot.VARCHAR)) {
+                  return Column.physical(col.getName(), DataTypes.STRING());
+                } else {
+                  return col;
+                }
+              })
+          .collect(Collectors.toList());
+  private static final ResolvedSchema EXPECTED_TABLE_SCHEMA =
+      new ResolvedSchema(EXPECTED_TABLE_COLUMNS, Collections.emptyList(), CONSTRAINTS);
+
+  private static final Map<String, String> EXPECTED_OPTIONS = new HashMap<>();
+
+  static {
+    EXPECTED_OPTIONS.put(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
+    EXPECTED_OPTIONS.put(FlinkOptions.INDEX_GLOBAL_ENABLED.key(), "false");
+    EXPECTED_OPTIONS.put(FlinkOptions.PRE_COMBINE.key(), "true");
+  }
+
+  private static final ResolvedCatalogTable EXPECTED_CATALOG_TABLE = new ResolvedCatalogTable(
+      CatalogTable.of(
+          Schema.newBuilder().fromResolvedSchema(CREATE_TABLE_SCHEMA).build(),
+          "test",
+          Arrays.asList("partition"),
+          EXPECTED_OPTIONS),
+      CREATE_TABLE_SCHEMA
+  );
+
+  private TableEnvironment streamTableEnv;
+  private HoodieCatalog catalog;
+
+  @TempDir
+  File tempFile;
+
+  @BeforeEach
+  void beforeEach() {
+    EnvironmentSettings settings = EnvironmentSettings.newInstance().build();
+    streamTableEnv = TableEnvironmentImpl.create(settings);
+    streamTableEnv.getConfig().getConfiguration()
+        .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 2);
+    File testDb = new File(tempFile, TEST_DEFAULT_DATABASE);
+    testDb.mkdir();
+    Map<String, String> catalogOptions = new HashMap<>();
+    catalogOptions.put(CATALOG_PATH.key(), tempFile.getAbsolutePath());
+    catalogOptions.put(DEFAULT_DATABASE.key(), TEST_DEFAULT_DATABASE);
+    catalog = new HoodieCatalog("hudi", Configuration.fromMap(catalogOptions));
+    catalog.open();
+  }
+
+  @AfterEach
+  void afterEach() {
+    if (catalog != null) {
+      catalog.close();
+    }
+  }
+
+  @Test
+  public void testListDatabases() {
+    List<String> actual = catalog.listDatabases();
+    assertTrue(actual.contains(TEST_DEFAULT_DATABASE));
+    assertFalse(actual.contains(NONE_EXIST_DATABASE));
+  }
+
+  @Test
+  public void testDatabaseExists() {
+    assertTrue(catalog.databaseExists(TEST_DEFAULT_DATABASE));
+    assertFalse(catalog.databaseExists(NONE_EXIST_DATABASE));
+  }
+
+  @Test
+  public void testCreateAndDropDatabase() throws Exception {
+    CatalogDatabase expected = new CatalogDatabaseImpl(Collections.emptyMap(), null);
+    catalog.createDatabase("db1", expected, true);
+
+    CatalogDatabase actual = catalog.getDatabase("db1");
+    assertTrue(catalog.listDatabases().contains("db1"));
+    assertEquals(expected.getProperties(), actual.getProperties());
+
+    // create exist database
+    assertThrows(DatabaseAlreadyExistException.class,
+        () -> catalog.createDatabase("db1", expected, false));
+
+    // drop exist database
+    catalog.dropDatabase("db1", true);
+    assertFalse(catalog.listDatabases().contains("db1"));
+
+    // drop non-exist database
+    assertThrows(DatabaseNotExistException.class,
+        () -> catalog.dropDatabase(NONE_EXIST_DATABASE, false));
+  }
+
+  @Test
+  public void testCreateDatabaseWithOptions() {
+    Map<String, String> options = new HashMap<>();
+    options.put("k1", "v1");
+    options.put("k2", "v2");
+
+    assertThrows(
+        CatalogException.class,
+        () -> catalog.createDatabase("db1", new CatalogDatabaseImpl(options, null), true));
+  }
+
+  @Test
+  public void testCreateTable() throws Exception {
+    ObjectPath tablePath = new ObjectPath(TEST_DEFAULT_DATABASE, "tb1");
+    // test create table
+    catalog.createTable(tablePath, EXPECTED_CATALOG_TABLE, true);
+
+    // test table exist
+    assertTrue(catalog.tableExists(tablePath));
+
+    // test create
exist table + assertThrows(TableAlreadyExistException.class, + () -> catalog.createTable(tablePath, EXPECTED_CATALOG_TABLE, false)); + } + + @Test + public void testListTable() throws Exception { + ObjectPath tablePath1 = new ObjectPath(TEST_DEFAULT_DATABASE, "tb1"); + ObjectPath tablePath2 = new ObjectPath(TEST_DEFAULT_DATABASE, "tb2"); + + // create table + catalog.createTable(tablePath1, EXPECTED_CATALOG_TABLE, true); + catalog.createTable(tablePath2, EXPECTED_CATALOG_TABLE, true); + + // test list table + List tables = catalog.listTables(TEST_DEFAULT_DATABASE); + assertTrue(tables.contains(tablePath1.getObjectName())); + assertTrue(tables.contains(tablePath2.getObjectName())); + + // test list non-exist database table + assertThrows(DatabaseNotExistException.class, + () -> catalog.listTables(NONE_EXIST_DATABASE)); + } + + @Test + public void testGetTable() throws Exception { + ObjectPath tablePath = new ObjectPath(TEST_DEFAULT_DATABASE, "tb1"); + // create table + catalog.createTable(tablePath, EXPECTED_CATALOG_TABLE, true); + + Map expectedOptions = new HashMap<>(EXPECTED_OPTIONS); + expectedOptions.put(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + expectedOptions.put(FlinkOptions.INDEX_GLOBAL_ENABLED.key(), "false"); + expectedOptions.put(FlinkOptions.PRE_COMBINE.key(), "true"); + expectedOptions.put("connector", "hudi"); + expectedOptions.put( + FlinkOptions.PATH.key(), + String.format("%s/%s/%s", tempFile.getAbsolutePath(), tablePath.getDatabaseName(), tablePath.getObjectName())); + + // test get table + CatalogBaseTable actualTable = catalog.getTable(tablePath); + // validate schema + Schema actualSchema = actualTable.getUnresolvedSchema(); + Schema expectedSchema = Schema.newBuilder().fromResolvedSchema(EXPECTED_TABLE_SCHEMA).build(); + assertEquals(expectedSchema, actualSchema); + // validate options + Map actualOptions = actualTable.getOptions(); + assertEquals(expectedOptions, actualOptions); + // validate comment + assertEquals(EXPECTED_CATALOG_TABLE.getComment(), actualTable.getComment()); + // validate partition key + assertEquals(EXPECTED_CATALOG_TABLE.getPartitionKeys(), ((CatalogTable) actualTable).getPartitionKeys()); + } + + @Test + public void dropTable() throws Exception { + ObjectPath tablePath = new ObjectPath(TEST_DEFAULT_DATABASE, "tb1"); + // create table + catalog.createTable(tablePath, EXPECTED_CATALOG_TABLE, true); + + // test drop table + catalog.dropTable(tablePath, true); + assertFalse(catalog.tableExists(tablePath)); + + // drop non-exist table + assertThrows(TableNotExistException.class, + () -> catalog.dropTable(new ObjectPath(TEST_DEFAULT_DATABASE, "non_exist"), false)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalogFactory.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalogFactory.java new file mode 100644 index 0000000000000..6e7ee2e8f84bd --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalogFactory.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.adapter.TestTableEnvs; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.CommonCatalogOptions; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.hadoop.hive.conf.HiveConf; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.net.URL; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.hudi.table.catalog.CatalogOptions.CATALOG_PATH; +import static org.apache.hudi.table.catalog.CatalogOptions.DEFAULT_DATABASE; +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * Test cases for {@link HoodieCatalogFactory}. + */ +public class TestHoodieCatalogFactory { + private static final URL CONF_DIR = + Thread.currentThread().getContextClassLoader().getResource("test-catalog-factory-conf"); + + @TempDir + File tempFile; + + @Test + void testCreateCatalogThroughSQL() { + TableEnvironment tableEnv = TestTableEnvs.getBatchTableEnv(); + String catalogDDL = "" + + "create catalog hudi_catalog\n" + + " with(\n" + + " 'type' = 'hudi',\n" + + " 'catalog.path' = '" + tempFile.getAbsolutePath() + "/warehouse',\n" + + " 'mode' = 'hms',\n" + + " 'hive.conf.dir' = '" + CONF_DIR.getPath() + "',\n" + + " 'table.external' = 'true'\n" + + " )\n"; + RuntimeException exception = assertThrows(RuntimeException.class, () -> tableEnv.executeSql(catalogDDL)); + assertThat(exception.getMessage(), containsString("hive metastore")); + } + + @Test + void testCreateHMSCatalog() { + final String catalogName = "mycatalog"; + + final HoodieHiveCatalog expectedCatalog = HoodieCatalogTestUtils.createHiveCatalog(catalogName); + + final Map options = new HashMap<>(); + options.put(CommonCatalogOptions.CATALOG_TYPE.key(), HoodieCatalogFactory.IDENTIFIER); + options.put(CatalogOptions.HIVE_CONF_DIR.key(), CONF_DIR.getPath()); + options.put(CatalogOptions.MODE.key(), "hms"); + options.put(CatalogOptions.TABLE_EXTERNAL.key(), "false"); + + final Catalog actualCatalog = + FactoryUtil.createCatalog( + catalogName, options, null, Thread.currentThread().getContextClassLoader()); + + assertEquals( + ((HoodieHiveCatalog) actualCatalog) + .getHiveConf() + .getVar(HiveConf.ConfVars.METASTOREURIS), "dummy-hms"); + checkEquals(expectedCatalog, (HoodieHiveCatalog) actualCatalog); + } + + @Test + void testCreateDFSCatalog() { + final String catalogName = "mycatalog"; + + Map catalogOptions = new HashMap<>(); + catalogOptions.put(CATALOG_PATH.key(), tempFile.getAbsolutePath()); + catalogOptions.put(DEFAULT_DATABASE.key(), "test_db"); + HoodieCatalog expectedCatalog = new HoodieCatalog(catalogName, Configuration.fromMap(catalogOptions)); + + final Map options = new 
HashMap<>(); + options.put(CommonCatalogOptions.CATALOG_TYPE.key(), HoodieCatalogFactory.IDENTIFIER); + options.put(CATALOG_PATH.key(), tempFile.getAbsolutePath()); + options.put(DEFAULT_DATABASE.key(), "test_db"); + options.put(CatalogOptions.MODE.key(), "dfs"); + + final Catalog actualCatalog = + FactoryUtil.createCatalog( + catalogName, options, null, Thread.currentThread().getContextClassLoader()); + + checkEquals(expectedCatalog, (AbstractCatalog) actualCatalog); + } + + private static void checkEquals(AbstractCatalog c1, AbstractCatalog c2) { + // Only assert a few selected properties for now + assertEquals(c2.getName(), c1.getName()); + assertEquals(c2.getDefaultDatabase(), c1.getDefaultDatabase()); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java new file mode 100644 index 0000000000000..ffae71d6b2499 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieCatalogException; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogTableImpl; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.api.Table; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link HoodieHiveCatalog}. 
+ */ +public class TestHoodieHiveCatalog { + TableSchema schema = + TableSchema.builder() + .field("uuid", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING()) + .field("age", DataTypes.INT()) + .field("par1", DataTypes.STRING()) + .field("ts", DataTypes.BIGINT()) + .primaryKey("uuid") + .build(); + List partitions = Collections.singletonList("par1"); + private static HoodieHiveCatalog hoodieCatalog; + private final ObjectPath tablePath = new ObjectPath("default", "test"); + + @BeforeAll + public static void createCatalog() { + hoodieCatalog = HoodieCatalogTestUtils.createHiveCatalog(); + hoodieCatalog.open(); + } + + @AfterEach + public void dropTable() throws TableNotExistException { + hoodieCatalog.dropTable(tablePath, true); + } + + @AfterAll + public static void closeCatalog() { + if (hoodieCatalog != null) { + hoodieCatalog.close(); + } + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + public void testCreateAndGetHoodieTable(HoodieTableType tableType) throws Exception { + Map options = new HashMap<>(); + options.put(FactoryUtil.CONNECTOR.key(), "hudi"); + options.put(FlinkOptions.TABLE_TYPE.key(), tableType.toString()); + + CatalogTable table = + new CatalogTableImpl(schema, partitions, options, "hudi table"); + hoodieCatalog.createTable(tablePath, table, false); + + // validate hive table + Table hiveTable = hoodieCatalog.getHiveTable(tablePath); + String fieldSchema = hiveTable.getSd().getCols().stream() + .map(f -> f.getName() + ":" + f.getType()) + .collect(Collectors.joining(",")); + String expectedFieldSchema = "" + + "_hoodie_commit_time:string," + + "_hoodie_commit_seqno:string," + + "_hoodie_record_key:string," + + "_hoodie_partition_path:string," + + "_hoodie_file_name:string," + + "uuid:int," + + "name:string," + + "age:int," + + "ts:bigint"; + assertEquals(expectedFieldSchema, fieldSchema); + String partitionSchema = hiveTable.getPartitionKeys().stream() + .map(f -> f.getName() + ":" + f.getType()) + .collect(Collectors.joining(",")); + assertEquals("par1:string", partitionSchema); + + // validate catalog table + CatalogBaseTable table1 = hoodieCatalog.getTable(tablePath); + assertEquals("hudi", table1.getOptions().get(CONNECTOR.key())); + assertEquals(tableType.toString(), table1.getOptions().get(FlinkOptions.TABLE_TYPE.key())); + assertEquals("uuid", table1.getOptions().get(FlinkOptions.RECORD_KEY_FIELD.key())); + assertNull(table1.getOptions().get(FlinkOptions.PRECOMBINE_FIELD.key()), "preCombine key is not declared"); + String tableSchema = table1.getUnresolvedSchema().getColumns().stream() + .map(Schema.UnresolvedColumn::toString) + .collect(Collectors.joining(",")); + String expectedTableSchema = "`uuid` INT NOT NULL,`name` STRING,`age` INT,`par1` STRING,`ts` BIGINT"; + assertEquals(expectedTableSchema, tableSchema); + assertEquals(Collections.singletonList("uuid"), table1.getUnresolvedSchema().getPrimaryKey().get().getColumnNames()); + assertEquals(Collections.singletonList("par1"), ((CatalogTable) table1).getPartitionKeys()); + + // validate explicit primary key + options.put(FlinkOptions.RECORD_KEY_FIELD.key(), "id"); + table = new CatalogTableImpl(schema, partitions, options, "hudi table"); + hoodieCatalog.alterTable(tablePath, table, true); + + CatalogBaseTable table2 = hoodieCatalog.getTable(tablePath); + assertEquals("id", table2.getOptions().get(FlinkOptions.RECORD_KEY_FIELD.key())); + } + + @Test + public void testCreateExternalTable() throws TableAlreadyExistException, DatabaseNotExistException, TableNotExistException, 
IOException {
+    HoodieHiveCatalog catalog = HoodieCatalogTestUtils.createHiveCatalog("myCatalog", true);
+    catalog.open();
+    Map<String, String> originOptions = new HashMap<>();
+    originOptions.put(FactoryUtil.CONNECTOR.key(), "hudi");
+    CatalogTable table =
+        new CatalogTableImpl(schema, originOptions, "hudi table");
+    catalog.createTable(tablePath, table, false);
+    Table table1 = catalog.getHiveTable(tablePath);
+    assertTrue(Boolean.parseBoolean(table1.getParameters().get("EXTERNAL")));
+    assertEquals("EXTERNAL_TABLE", table1.getTableType());
+
+    catalog.dropTable(tablePath, false);
+    Path path = new Path(table1.getParameters().get(FlinkOptions.PATH.key()));
+    boolean created = StreamerUtil.fileExists(FSUtils.getFs(path, new Configuration()), path);
+    assertTrue(created, "Table should have been created");
+  }
+
+  @Test
+  public void testCreateNonHoodieTable() throws TableAlreadyExistException, DatabaseNotExistException {
+    CatalogTable table =
+        new CatalogTableImpl(schema, Collections.emptyMap(), "hudi table");
+    try {
+      hoodieCatalog.createTable(tablePath, table, false);
+    } catch (HoodieCatalogException e) {
+      assertEquals(String.format("The %s is not hoodie table", tablePath.getObjectName()), e.getMessage());
+    }
+  }
+
+  @Test
+  public void testAlterTable() throws Exception {
+    Map<String, String> originOptions = new HashMap<>();
+    originOptions.put(FactoryUtil.CONNECTOR.key(), "hudi");
+    CatalogTable originTable =
+        new CatalogTableImpl(schema, partitions, originOptions, "hudi table");
+    hoodieCatalog.createTable(tablePath, originTable, false);
+
+    Table hiveTable = hoodieCatalog.getHiveTable(tablePath);
+    Map<String, String> newOptions = hiveTable.getParameters();
+    newOptions.put("k", "v");
+    CatalogTable newTable = new CatalogTableImpl(schema, partitions, newOptions, "alter hudi table");
+    hoodieCatalog.alterTable(tablePath, newTable, false);
+
+    hiveTable = hoodieCatalog.getHiveTable(tablePath);
+    assertEquals(hiveTable.getParameters().get(CONNECTOR.key()), "hudi");
+    assertEquals(hiveTable.getParameters().get("k"), "v");
+  }
+
+  @Test
+  public void testRenameTable() throws Exception {
+    Map<String, String> originOptions = new HashMap<>();
+    originOptions.put(FactoryUtil.CONNECTOR.key(), "hudi");
+    CatalogTable originTable =
+        new CatalogTableImpl(schema, partitions, originOptions, "hudi table");
+    hoodieCatalog.createTable(tablePath, originTable, false);
+
+    hoodieCatalog.renameTable(tablePath, "test1", false);
+
+    assertEquals(hoodieCatalog.getHiveTable(new ObjectPath("default", "test1")).getTableName(), "test1");
+
+    hoodieCatalog.renameTable(new ObjectPath("default", "test1"), "test", false);
+  }
+}
diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/format/TestInputFormat.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/format/TestInputFormat.java
new file mode 100644
index 0000000000000..b76905ed8af07
--- /dev/null
+++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/format/TestInputFormat.java
@@ -0,0 +1,650 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format; + +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.model.EventTimeAvroPayload; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.table.HoodieTableSource; +import org.apache.hudi.table.format.cow.CopyOnWriteInputFormat; +import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; +import org.apache.hudi.utils.TestConfigurations; +import org.apache.hudi.utils.TestData; + +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.io.InputSplit; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.instanceOf; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for MergeOnReadInputFormat and ParquetInputFormat. 
+ */ +public class TestInputFormat { + + private HoodieTableSource tableSource; + private Configuration conf; + + @TempDir + File tempFile; + + void beforeEach(HoodieTableType tableType) throws IOException { + beforeEach(tableType, Collections.emptyMap()); + } + + void beforeEach(HoodieTableType tableType, Map options) throws IOException { + conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_TYPE, tableType.name()); + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); // close the async compaction + options.forEach((key, value) -> conf.setString(key, value)); + + StreamerUtil.initTableIfNotExists(conf); + this.tableSource = getTableSource(conf); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testRead(HoodieTableType tableType) throws Exception { + beforeEach(tableType); + + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + + List result = readData(inputFormat); + + String actual = TestData.rowDataToString(result); + String expected = TestData.rowDataToString(TestData.DATA_SET_INSERT); + assertThat(actual, is(expected)); + + // write another commit to read again + TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); + + // refresh the input format + this.tableSource.reset(); + inputFormat = this.tableSource.getInputFormat(); + + result = readData(inputFormat); + + actual = TestData.rowDataToString(result); + expected = "[" + + "+I[id1, Danny, 24, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 34, 1970-01-01T00:00:00.002, par1], " + + "+I[id3, Julian, 54, 1970-01-01T00:00:00.003, par2], " + + "+I[id4, Fabian, 32, 1970-01-01T00:00:00.004, par2], " + + "+I[id5, Sophia, 18, 1970-01-01T00:00:00.005, par3], " + + "+I[id6, Emma, 20, 1970-01-01T00:00:00.006, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:00.007, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:00.008, par4], " + + "+I[id9, Jane, 19, 1970-01-01T00:00:00.006, par3], " + + "+I[id10, Ella, 38, 1970-01-01T00:00:00.007, par4], " + + "+I[id11, Phoebe, 52, 1970-01-01T00:00:00.008, par4]]"; + assertThat(actual, is(expected)); + } + + @Test + void testReadBaseAndLogFiles() throws Exception { + beforeEach(HoodieTableType.MERGE_ON_READ); + + // write base first with compaction + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, true); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + + List result = readData(inputFormat); + + String actual = TestData.rowDataToString(result); + String expected = TestData.rowDataToString(TestData.DATA_SET_INSERT); + assertThat(actual, is(expected)); + + // write another commit using logs and read again + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); + TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf); + + // write another commit using logs with separate partition + // so the file group has only logs + TestData.writeData(TestData.DATA_SET_INSERT_SEPARATE_PARTITION, conf); + + // refresh the input format + this.tableSource.reset(); + inputFormat = this.tableSource.getInputFormat(); + + result = readData(inputFormat); + + actual = TestData.rowDataToString(result); + expected = "[" + + "+I[id1, Danny, 24, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 34, 1970-01-01T00:00:00.002, par1], " + + "+I[id3, Julian, 54, 1970-01-01T00:00:00.003, par2], " + + "+I[id4, Fabian, 
32, 1970-01-01T00:00:00.004, par2], " + + "+I[id5, Sophia, 18, 1970-01-01T00:00:00.005, par3], " + + "+I[id6, Emma, 20, 1970-01-01T00:00:00.006, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:00.007, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:00.008, par4], " + + "+I[id9, Jane, 19, 1970-01-01T00:00:00.006, par3], " + + "+I[id10, Ella, 38, 1970-01-01T00:00:00.007, par4], " + + "+I[id11, Phoebe, 52, 1970-01-01T00:00:00.008, par4], " + + "+I[id12, Monica, 27, 1970-01-01T00:00:00.009, par5], " + + "+I[id13, Phoebe, 31, 1970-01-01T00:00:00.010, par5], " + + "+I[id14, Rachel, 52, 1970-01-01T00:00:00.011, par6], " + + "+I[id15, Ross, 29, 1970-01-01T00:00:00.012, par6]]"; + assertThat(actual, is(expected)); + } + + @Test + void testReadBaseAndLogFilesWithDeletes() throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), "true"); + beforeEach(HoodieTableType.MERGE_ON_READ, options); + + // write base first with compaction. + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, true); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + // write another commit using logs and read again. + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); + TestData.writeData(TestData.DATA_SET_UPDATE_DELETE, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + assertThat(inputFormat, instanceOf(MergeOnReadInputFormat.class)); + + // when isEmitDelete is false. + List result1 = readData(inputFormat); + + final String actual1 = TestData.rowDataToString(result1); + final String expected1 = "[" + + "+I[id1, Danny, 24, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 34, 1970-01-01T00:00:00.002, par1], " + + "+I[id4, Fabian, 31, 1970-01-01T00:00:00.004, par2], " + + "+I[id6, Emma, 20, 1970-01-01T00:00:00.006, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:00.007, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:00.008, par4]]"; + assertThat(actual1, is(expected1)); + + // refresh the input format and set isEmitDelete to true. + this.tableSource.reset(); + inputFormat = this.tableSource.getInputFormat(); + ((MergeOnReadInputFormat) inputFormat).isEmitDelete(true); + + List result2 = readData(inputFormat); + + final String actual2 = TestData.rowDataToString(result2); + final String expected2 = "[" + + "+I[id1, Danny, 24, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 34, 1970-01-01T00:00:00.002, par1], " + + "-D[id3, Julian, 53, 1970-01-01T00:00:00.003, par2], " + + "+I[id4, Fabian, 31, 1970-01-01T00:00:00.004, par2], " + + "-D[id5, Sophia, 18, 1970-01-01T00:00:00.005, par3], " + + "+I[id6, Emma, 20, 1970-01-01T00:00:00.006, par3], " + + "+I[id7, Bob, 44, 1970-01-01T00:00:00.007, par4], " + + "+I[id8, Han, 56, 1970-01-01T00:00:00.008, par4], " + + "-D[id9, Jane, 19, 1970-01-01T00:00:00.006, par3]]"; + assertThat(actual2, is(expected2)); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testReadBaseAndLogFilesWithDisorderUpdateDelete(boolean compact) throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), "true"); + beforeEach(HoodieTableType.MERGE_ON_READ, options); + + // write base first with compaction. + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, true); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); + TestData.writeData(TestData.DATA_SET_SINGLE_INSERT, conf); + + // write another commit using logs and read again. 
+ conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, compact); + TestData.writeData(TestData.DATA_SET_DISORDER_UPDATE_DELETE, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + assertThat(inputFormat, instanceOf(MergeOnReadInputFormat.class)); + + // when isEmitDelete is false. + List result1 = readData(inputFormat); + + final String rowKind = compact ? "I" : "U"; + final String expected = "[+" + rowKind + "[id1, Danny, 22, 1970-01-01T00:00:00.004, par1]]"; + + final String actual1 = TestData.rowDataToString(result1); + assertThat(actual1, is(expected)); + + // refresh the input format and set isEmitDelete to true. + this.tableSource.reset(); + inputFormat = this.tableSource.getInputFormat(); + ((MergeOnReadInputFormat) inputFormat).isEmitDelete(true); + + List result2 = readData(inputFormat); + + final String actual2 = TestData.rowDataToString(result2); + assertThat(actual2, is(expected)); + } + + @Test + void testReadWithDeletesMOR() throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), "true"); + beforeEach(HoodieTableType.MERGE_ON_READ, options); + + // write another commit to read again + TestData.writeData(TestData.DATA_SET_UPDATE_DELETE, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + assertThat(inputFormat, instanceOf(MergeOnReadInputFormat.class)); + ((MergeOnReadInputFormat) inputFormat).isEmitDelete(true); + + List result = readData(inputFormat); + + final String actual = TestData.rowDataToString(result); + final String expected = "[" + + "+I[id1, Danny, 24, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 34, 1970-01-01T00:00:00.002, par1], " + + "-D[id3, Julian, 53, 1970-01-01T00:00:00.003, par2], " + + "-D[id5, Sophia, 18, 1970-01-01T00:00:00.005, par3], " + + "-D[id9, Jane, 19, 1970-01-01T00:00:00.006, par3]]"; + assertThat(actual, is(expected)); + } + + @Test + void testReadWithDeletesCOW() throws Exception { + beforeEach(HoodieTableType.COPY_ON_WRITE); + + // write another commit to read again + TestData.writeData(TestData.DATA_SET_UPDATE_DELETE, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + assertThat(inputFormat, instanceOf(CopyOnWriteInputFormat.class)); + + List result = readData(inputFormat); + + final String actual = TestData.rowDataToString(result); + final String expected = "[" + + "+I[id1, Danny, 24, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 34, 1970-01-01T00:00:00.002, par1]]"; + assertThat(actual, is(expected)); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testReadWithPartitionPrune(HoodieTableType tableType) throws Exception { + beforeEach(tableType); + + TestData.writeData(TestData.DATA_SET_INSERT, conf); + + Map prunedPartitions = new HashMap<>(); + prunedPartitions.put("partition", "par1"); + // prune to only be with partition 'par1' + tableSource.applyPartitions(Collections.singletonList(prunedPartitions)); + InputFormat inputFormat = tableSource.getInputFormat(); + + List result = readData(inputFormat); + + String actual = TestData.rowDataToString(result); + String expected = "[" + + "+I[id1, Danny, 23, 1970-01-01T00:00:00.001, par1], " + + "+I[id2, Stephen, 33, 1970-01-01T00:00:00.002, par1]]"; + assertThat(actual, is(expected)); + } + + @Test + void testReadChangesMergedMOR() throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), "true"); + beforeEach(HoodieTableType.MERGE_ON_READ, options); + + // write another commit to 
read again + TestData.writeData(TestData.DATA_SET_INSERT_UPDATE_DELETE, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + assertThat(inputFormat, instanceOf(MergeOnReadInputFormat.class)); + + List result1 = readData(inputFormat); + + final String actual1 = TestData.rowDataToString(result1); + // the data set is merged when the data source is bounded. + final String expected1 = "[]"; + assertThat(actual1, is(expected1)); + + // refresh the input format and set isEmitDelete to true. + this.tableSource.reset(); + inputFormat = this.tableSource.getInputFormat(); + ((MergeOnReadInputFormat) inputFormat).isEmitDelete(true); + + List result2 = readData(inputFormat); + + final String actual2 = TestData.rowDataToString(result2); + final String expected2 = "[-D[id1, Danny, 22, 1970-01-01T00:00:00.005, par1]]"; + assertThat(actual2, is(expected2)); + } + + @Test + void testReadChangesUnMergedMOR() throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.CHANGELOG_ENABLED.key(), "true"); + options.put(FlinkOptions.READ_AS_STREAMING.key(), "true"); + beforeEach(HoodieTableType.MERGE_ON_READ, options); + + // write another commit to read again + TestData.writeData(TestData.DATA_SET_INSERT_UPDATE_DELETE, conf); + + InputFormat inputFormat = this.tableSource.getInputFormat(); + assertThat(inputFormat, instanceOf(MergeOnReadInputFormat.class)); + + List result = readData(inputFormat); + + final String actual = TestData.rowDataToString(result); + // the changes are not merged for the streaming (unbounded) source, so all the intermediate changes are emitted. + final String expected = "[" + + "+I[id1, Danny, 19, 1970-01-01T00:00:00.001, par1], " + + "-U[id1, Danny, 19, 1970-01-01T00:00:00.001, par1], " + + "+U[id1, Danny, 20, 1970-01-01T00:00:00.002, par1], " + + "-U[id1, Danny, 20, 1970-01-01T00:00:00.002, par1], " + + "+U[id1, Danny, 21, 1970-01-01T00:00:00.003, par1], " + + "-U[id1, Danny, 21, 1970-01-01T00:00:00.003, par1], " + + "+U[id1, Danny, 22, 1970-01-01T00:00:00.004, par1], " + + "-D[id1, Danny, 22, 1970-01-01T00:00:00.005, par1]]"; + assertThat(actual, is(expected)); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testReadIncrementally(HoodieTableType tableType) throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.QUERY_TYPE.key(), FlinkOptions.QUERY_TYPE_INCREMENTAL); + beforeEach(tableType, options); + + // write another commit to read again + for (int i = 0; i < 6; i += 2) { + List dataset = TestData.dataSetInsert(i + 1, i + 2); + TestData.writeData(dataset, conf); + } + + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(tempFile.getAbsolutePath(), HadoopConfigurations.getHadoopConf(conf)); + List commits = metaClient.getCommitsTimeline().filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + + assertThat(commits.size(), is(3)); + + // only the start commit + conf.setString(FlinkOptions.READ_START_COMMIT, commits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat1 = this.tableSource.getInputFormat(); + assertThat(inputFormat1, instanceOf(MergeOnReadInputFormat.class)); + + List actual1 = readData(inputFormat1); + final List expected1 = TestData.dataSetInsert(3, 4, 5, 6); + TestData.assertRowDataEquals(actual1, expected1); + + // only the start commit: earliest + conf.setString(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST); + this.tableSource = getTableSource(conf); + InputFormat inputFormat2 = this.tableSource.getInputFormat(); +
assertThat(inputFormat2, instanceOf(MergeOnReadInputFormat.class)); + + List actual2 = readData(inputFormat2); + final List expected2 = TestData.dataSetInsert(1, 2, 3, 4, 5, 6); + TestData.assertRowDataEquals(actual2, expected2); + + // start and end commit: [start commit, end commit] + conf.setString(FlinkOptions.READ_START_COMMIT, commits.get(0)); + conf.setString(FlinkOptions.READ_END_COMMIT, commits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat3 = this.tableSource.getInputFormat(); + assertThat(inputFormat3, instanceOf(MergeOnReadInputFormat.class)); + + List actual3 = readData(inputFormat3); + final List expected3 = TestData.dataSetInsert(1, 2, 3, 4); + TestData.assertRowDataEquals(actual3, expected3); + + // only the end commit: point in time query + conf.removeConfig(FlinkOptions.READ_START_COMMIT); + conf.setString(FlinkOptions.READ_END_COMMIT, commits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat4 = this.tableSource.getInputFormat(); + assertThat(inputFormat4, instanceOf(MergeOnReadInputFormat.class)); + + List actual4 = readData(inputFormat4); + final List expected4 = TestData.dataSetInsert(3, 4); + TestData.assertRowDataEquals(actual4, expected4); + + // start and end commit: start commit out of range + conf.setString(FlinkOptions.READ_START_COMMIT, "000"); + conf.setString(FlinkOptions.READ_END_COMMIT, commits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat5 = this.tableSource.getInputFormat(); + assertThat(inputFormat5, instanceOf(MergeOnReadInputFormat.class)); + + List actual5 = readData(inputFormat5); + final List expected5 = TestData.dataSetInsert(1, 2, 3, 4); + TestData.assertRowDataEquals(actual5, expected5); + + // start and end commit: both are out of range + conf.setString(FlinkOptions.READ_START_COMMIT, "001"); + conf.setString(FlinkOptions.READ_END_COMMIT, "002"); + this.tableSource = getTableSource(conf); + InputFormat inputFormat6 = this.tableSource.getInputFormat(); + assertThat(inputFormat6, instanceOf(MergeOnReadInputFormat.class)); + + List actual6 = readData(inputFormat6); + TestData.assertRowDataEquals(actual6, Collections.emptyList()); + } + + @Test + void testMergeOnReadDisorderUpdateAfterCompaction() throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.PAYLOAD_CLASS_NAME.key(), EventTimeAvroPayload.class.getName()); + beforeEach(HoodieTableType.MERGE_ON_READ, options); + + // write base file first with compaction. + conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, true); + conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); + TestData.writeData(TestData.DATA_SET_DISORDER_INSERT, conf); + InputFormat inputFormat = this.tableSource.getInputFormat(); + final String baseResult = TestData.rowDataToString(readData(inputFormat)); + String expected = "[+I[id1, Danny, 22, 1970-01-01T00:00:00.004, par1]]"; + assertThat(baseResult, is(expected)); + + // write another commit using logs and read again.
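The incremental query exercised by testReadIncrementally above is driven entirely by FlinkOptions. A hedged sketch of assembling the same configuration outside the test harness (the table path and commit times are placeholders; this is not part of the patch):

```java
import org.apache.flink.configuration.Configuration;
import org.apache.hudi.configuration.FlinkOptions;

public class IncrementalReadConfSketch {
  /** Builds a configuration for an incremental read between two commits (both bounds inclusive). */
  public static Configuration incrementalConf(String tablePath, String startCommit, String endCommit) {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PATH, tablePath);
    conf.setString(FlinkOptions.QUERY_TYPE, FlinkOptions.QUERY_TYPE_INCREMENTAL);
    conf.setString(FlinkOptions.READ_START_COMMIT, startCommit); // or FlinkOptions.START_COMMIT_EARLIEST
    conf.setString(FlinkOptions.READ_END_COMMIT, endCommit);     // omit for an open-ended range
    return conf;
  }
}
```

The assertions above also cover the boundary cases: a too-early start commit still returns everything up to the end commit, while a start/end pair that matches no commits returns an empty result.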
+ conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false); + TestData.writeData(TestData.DATA_SET_SINGLE_INSERT, conf); + this.tableSource.reset(); + inputFormat = this.tableSource.getInputFormat(); + assertThat(inputFormat, instanceOf(MergeOnReadInputFormat.class)); + final String baseMergeLogFileResult = TestData.rowDataToString(readData(inputFormat)); + assertThat(baseMergeLogFileResult, is(expected)); + } + + @Test + void testReadArchivedCommitsIncrementally() throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.QUERY_TYPE.key(), FlinkOptions.QUERY_TYPE_INCREMENTAL); + options.put(FlinkOptions.ARCHIVE_MIN_COMMITS.key(), "3"); + options.put(FlinkOptions.ARCHIVE_MAX_COMMITS.key(), "4"); + options.put(FlinkOptions.CLEAN_RETAIN_COMMITS.key(), "2"); + // disable the metadata table to make the archiving behavior deterministic + options.put(FlinkOptions.METADATA_ENABLED.key(), "false"); + options.put("hoodie.commits.archival.batch", "1"); + beforeEach(HoodieTableType.COPY_ON_WRITE, options); + + // write 10 batches of data set + for (int i = 0; i < 20; i += 2) { + List dataset = TestData.dataSetInsert(i + 1, i + 2); + TestData.writeData(dataset, conf); + } + // cleaning + HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient<>( + HoodieFlinkEngineContext.DEFAULT, FlinkWriteClients.getHoodieClientConfig(conf)); + writeClient.clean(); + + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(tempFile.getAbsolutePath(), HadoopConfigurations.getHadoopConf(conf)); + List commits = metaClient.getCommitsTimeline().filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + + assertThat(commits.size(), is(4)); + + List archivedCommits = metaClient.getArchivedTimeline().getCommitsTimeline().filterCompletedInstants() + .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + + assertThat(archivedCommits.size(), is(6)); + + // start and end commit: both are archived and cleaned + conf.setString(FlinkOptions.READ_START_COMMIT, archivedCommits.get(0)); + conf.setString(FlinkOptions.READ_END_COMMIT, archivedCommits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat1 = this.tableSource.getInputFormat(); + assertThat(inputFormat1, instanceOf(MergeOnReadInputFormat.class)); + + List actual1 = readData(inputFormat1); + final List expected1 = TestData.dataSetInsert(1, 2, 3, 4); + TestData.assertRowDataEquals(actual1, expected1); + + // only the start commit: is archived and cleaned + conf.setString(FlinkOptions.READ_START_COMMIT, archivedCommits.get(1)); + conf.removeConfig(FlinkOptions.READ_END_COMMIT); + this.tableSource = getTableSource(conf); + InputFormat inputFormat2 = this.tableSource.getInputFormat(); + assertThat(inputFormat2, instanceOf(MergeOnReadInputFormat.class)); + + List actual2 = readData(inputFormat2); + final List expected2 = TestData.dataSetInsert(3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); + TestData.assertRowDataEquals(actual2, expected2); + + // only the end commit: is archived and cleaned + conf.removeConfig(FlinkOptions.READ_START_COMMIT); + conf.setString(FlinkOptions.READ_END_COMMIT, archivedCommits.get(1)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat3 = this.tableSource.getInputFormat(); + assertThat(inputFormat3, instanceOf(MergeOnReadInputFormat.class)); + + List actual3 = readData(inputFormat3); + final List expected3 = TestData.dataSetInsert(3, 4); + 
TestData.assertRowDataEquals(actual3, expected3); + + // start and end commit: start is archived and cleaned, end is active + conf.setString(FlinkOptions.READ_START_COMMIT, archivedCommits.get(1)); + conf.setString(FlinkOptions.READ_END_COMMIT, commits.get(0)); + this.tableSource = getTableSource(conf); + InputFormat inputFormat4 = this.tableSource.getInputFormat(); + assertThat(inputFormat4, instanceOf(MergeOnReadInputFormat.class)); + + List actual4 = readData(inputFormat4); + final List expected4 = TestData.dataSetInsert(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); + TestData.assertRowDataEquals(actual4, expected4); + } + + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testReadWithWiderSchema(HoodieTableType tableType) throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.SOURCE_AVRO_SCHEMA.key(), + AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE_WIDER).toString()); + beforeEach(tableType, options); + + TestData.writeData(TestData.DATA_SET_INSERT, conf); + InputFormat inputFormat = this.tableSource.getInputFormat(); + List result = readData(inputFormat); + TestData.assertRowDataEquals(result, TestData.DATA_SET_INSERT); + } + + /** + * Test reading file groups with compaction plan scheduled and delta logs. + * File-slice after pending compaction-requested instant-time should also be considered valid. + */ + @Test + void testReadMORWithCompactionPlanScheduled() throws Exception { + Map options = new HashMap<>(); + // compact for each commit + options.put(FlinkOptions.COMPACTION_DELTA_COMMITS.key(), "1"); + options.put(FlinkOptions.COMPACTION_ASYNC_ENABLED.key(), "false"); + beforeEach(HoodieTableType.MERGE_ON_READ, options); + + // write three commits + for (int i = 0; i < 6; i += 2) { + List dataset = TestData.dataSetInsert(i + 1, i + 2); + TestData.writeData(dataset, conf); + } + + InputFormat inputFormat1 = this.tableSource.getInputFormat(); + assertThat(inputFormat1, instanceOf(MergeOnReadInputFormat.class)); + + List actual = readData(inputFormat1); + final List expected = TestData.dataSetInsert(1, 2, 3, 4, 5, 6); + TestData.assertRowDataEquals(actual, expected); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private HoodieTableSource getTableSource(Configuration conf) { + return new HoodieTableSource( + TestConfigurations.TABLE_SCHEMA, + new Path(tempFile.getAbsolutePath()), + Collections.singletonList("partition"), + "default", + conf); + } + + @SuppressWarnings("unchecked, rawtypes") + private static List readData(InputFormat inputFormat) throws IOException { + InputSplit[] inputSplits = inputFormat.createInputSplits(1); + + List result = new ArrayList<>(); + + for (InputSplit inputSplit : inputSplits) { + inputFormat.open(inputSplit); + while (!inputFormat.reachedEnd()) { + result.add(TestConfigurations.SERIALIZER.copy((RowData) inputFormat.nextRecord(null))); // no reuse + } + inputFormat.close(); + } + return result; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/format/cow/TestBlockLocationSort.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/format/cow/TestBlockLocationSort.java new file mode 100644 index 0000000000000..d868dce4d9153 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/format/cow/TestBlockLocationSort.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.core.IsEqual.equalTo; + +import java.util.Arrays; +import java.util.Comparator; +import org.apache.hadoop.fs.BlockLocation; +import org.junit.jupiter.api.Test; + +public class TestBlockLocationSort { + + private static BlockLocation createBlockLocation(int offset, int length) { + return new BlockLocation(new String[0], new String[0], offset, length); + } + + @Test + void testBlockLocationSort() { + BlockLocation o1 = createBlockLocation(0, 5); + BlockLocation o2 = createBlockLocation(6, 4); + BlockLocation o3 = createBlockLocation(5, 5); + + BlockLocation[] blocks = {o1, o2, o3}; + BlockLocation[] sortedBlocks = {o1, o3, o2}; + + Arrays.sort(blocks, Comparator.comparingLong(BlockLocation::getOffset)); + assertThat(blocks, equalTo(sortedBlocks)); + + // Sort again to ensure idempotency + Arrays.sort(blocks, Comparator.comparingLong(BlockLocation::getOffset)); + assertThat(blocks, equalTo(sortedBlocks)); + } + +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/FlinkMiniCluster.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/FlinkMiniCluster.java new file mode 100644 index 0000000000000..96d07cd6565bd --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/FlinkMiniCluster.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utils; + +import org.apache.flink.runtime.client.JobStatusMessage; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.test.util.AbstractTestBase; +import org.apache.flink.test.util.MiniClusterWithClientResource; + +import org.junit.jupiter.api.extension.AfterAllCallback; +import org.junit.jupiter.api.extension.AfterEachCallback; +import org.junit.jupiter.api.extension.BeforeAllCallback; +import org.junit.jupiter.api.extension.ExtensionContext; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Class for tests that run multiple tests and want to reuse the same Flink cluster. + * Unlike {@link AbstractTestBase}, this class is designed to run with JUnit 5. + */ +public class FlinkMiniCluster implements BeforeAllCallback, AfterAllCallback, AfterEachCallback { + private static final Logger LOG = LoggerFactory.getLogger(FlinkMiniCluster.class); + + public static final int DEFAULT_PARALLELISM = 4; + + private static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = + new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) + .build()); + + @Override + public void beforeAll(ExtensionContext context) throws Exception { + MINI_CLUSTER_RESOURCE.before(); + } + + @Override + public void afterAll(ExtensionContext context) { + MINI_CLUSTER_RESOURCE.after(); + } + + @Override + public void afterEach(ExtensionContext context) throws Exception { + cleanupRunningJobs(); + } + + private void cleanupRunningJobs() throws Exception { + if (!MINI_CLUSTER_RESOURCE.getMiniCluster().isRunning()) { + // do nothing if the MiniCluster is not running + LOG.warn("Mini cluster is not running after the test!"); + return; + } + + for (JobStatusMessage path : MINI_CLUSTER_RESOURCE.getClusterClient().listJobs().get()) { + if (!path.getJobState().isTerminalState()) { + try { + MINI_CLUSTER_RESOURCE.getClusterClient().cancel(path.getJobId()).get(); + } catch (Exception ignored) { + // ignore exceptions when cancelling dangling jobs + } + } + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/SchemaBuilder.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/SchemaBuilder.java new file mode 100644 index 0000000000000..39dd6d659f7b7 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/SchemaBuilder.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
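The FlinkMiniCluster class above is registered through JUnit 5's extension mechanism rather than a JUnit 4 rule. A minimal usage sketch (the test class below is hypothetical and not part of the patch):

```java
import org.apache.hudi.utils.FlinkMiniCluster;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;

// All tests in the class share one MiniCluster; FlinkMiniCluster#afterEach cancels
// any job that is still running so the next test starts from a clean cluster.
@ExtendWith(FlinkMiniCluster.class)
public class ExampleITCase {

  @Test
  void submitsJobAgainstSharedCluster() {
    // jobs submitted from here run on the shared MiniCluster
  }
}
```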
+ */ + +package org.apache.hudi.utils; + +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.flink.table.catalog.WatermarkSpec; +import org.apache.flink.table.types.DataType; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Builder for {@link ResolvedSchema}. + */ +public class SchemaBuilder { + private List columns; + private List watermarkSpecs; + private UniqueConstraint constraint; + + public static SchemaBuilder instance() { + return new SchemaBuilder(); + } + + private SchemaBuilder() { + this.columns = new ArrayList<>(); + this.watermarkSpecs = new ArrayList<>(); + } + + public SchemaBuilder field(String name, DataType type) { + this.columns.add(Column.physical(name, type)); + return this; + } + + public SchemaBuilder fields(List names, List types) { + List columns = IntStream.range(0, names.size()) + .mapToObj(idx -> Column.physical(names.get(idx), types.get(idx))) + .collect(Collectors.toList()); + this.columns.addAll(columns); + return this; + } + + public SchemaBuilder primaryKey(String... columns) { + this.constraint = UniqueConstraint.primaryKey("pk", Arrays.asList(columns)); + return this; + } + + public ResolvedSchema build() { + return new ResolvedSchema(columns, watermarkSpecs, constraint); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestAvroSchemaConverter.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestAvroSchemaConverter.java new file mode 100644 index 0000000000000..b297b627ba3ab --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestAvroSchemaConverter.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.metadata.HoodieMetadataPayload; +import org.apache.hudi.util.AvroSchemaConverter; + +import org.apache.avro.Schema; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.DataType; +import org.junit.jupiter.api.Test; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for {@link org.apache.hudi.util.AvroSchemaConverter}. 
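The SchemaBuilder above keeps the test schema fixtures concise. An illustrative call (field names chosen arbitrarily; not part of the patch):

```java
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.hudi.utils.SchemaBuilder;

public class SchemaBuilderSketch {
  public static void main(String[] args) {
    // physical columns plus a named primary-key constraint ("pk")
    ResolvedSchema schema = SchemaBuilder.instance()
        .field("uuid", DataTypes.VARCHAR(20))
        .field("age", DataTypes.INT())
        .primaryKey("uuid")
        .build();
    System.out.println(schema);
  }
}
```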
+ */ +public class TestAvroSchemaConverter { + @Test + void testUnionSchemaWithMultipleRecordTypes() { + Schema schema = HoodieMetadataRecord.SCHEMA$; + DataType dataType = AvroSchemaConverter.convertToDataType(schema); + int pos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos(); + final String expected = "ROW<" + + "`fileName` STRING, " + + "`columnName` STRING, " + + "`minValue` ROW<`wrapper` RAW('java.lang.Object', ?) NOT NULL>, " + + "`maxValue` ROW<`wrapper` RAW('java.lang.Object', ?) NOT NULL>, " + + "`valueCount` BIGINT, " + + "`nullCount` BIGINT, " + + "`totalSize` BIGINT, " + + "`totalUncompressedSize` BIGINT, " + + "`isDeleted` BOOLEAN NOT NULL>"; + assertThat(dataType.getChildren().get(pos).toString(), is(expected)); + } + + @Test + void testLocalTimestampType() { + DataType dataType = DataTypes.ROW( + DataTypes.FIELD("f_localtimestamp_millis", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)), + DataTypes.FIELD("f_localtimestamp_micros", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6)) + ); + // convert to avro schema + Schema schema = AvroSchemaConverter.convertToSchema(dataType.getLogicalType()); + final String expectedSchema = "" + + "[ \"null\", {\n" + + " \"type\" : \"record\",\n" + + " \"name\" : \"record\",\n" + + " \"fields\" : [ {\n" + + " \"name\" : \"f_localtimestamp_millis\",\n" + + " \"type\" : [ \"null\", {\n" + + " \"type\" : \"long\",\n" + + " \"logicalType\" : \"local-timestamp-millis\"\n" + + " } ],\n" + + " \"default\" : null\n" + + " }, {\n" + + " \"name\" : \"f_localtimestamp_micros\",\n" + + " \"type\" : [ \"null\", {\n" + + " \"type\" : \"long\",\n" + + " \"logicalType\" : \"local-timestamp-micros\"\n" + + " } ],\n" + + " \"default\" : null\n" + + " } ]\n" + + "} ]"; + assertThat(schema.toString(true), is(expectedSchema)); + // convert it back + DataType convertedDataType = AvroSchemaConverter.convertToDataType(schema); + final String expectedDataType = "ROW<" + + "`f_localtimestamp_millis` TIMESTAMP_LTZ(3), " + + "`f_localtimestamp_micros` TIMESTAMP_LTZ(6)>"; + assertThat(convertedDataType.toString(), is(expectedDataType)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestCompactionUtil.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestCompactionUtil.java new file mode 100644 index 0000000000000..e8d2885854800 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestCompactionUtil.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
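TestAvroSchemaConverter above exercises both directions of the conversion. A small round-trip sketch along the same lines (the row layout is made up for illustration; not part of the patch):

```java
import org.apache.avro.Schema;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.types.DataType;
import org.apache.hudi.util.AvroSchemaConverter;

public class AvroSchemaRoundTripSketch {
  public static void main(String[] args) {
    DataType rowType = DataTypes.ROW(
        DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),
        DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)));

    // Flink logical type -> Avro schema
    Schema avroSchema = AvroSchemaConverter.convertToSchema(rowType.getLogicalType());
    // Avro schema -> Flink data type
    DataType back = AvroSchemaConverter.convertToDataType(avroSchema);

    System.out.println(avroSchema.toString(true));
    System.out.println(back);
  }
}
```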
+ */ + +package org.apache.hudi.utils; + +import org.apache.hudi.avro.model.HoodieCompactionOperation; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.HoodieFlinkWriteClient; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.metadata.FlinkHoodieBackedTableMetadataWriter; +import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.util.CompactionUtil; +import org.apache.hudi.util.FlinkTables; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link org.apache.hudi.util.CompactionUtil}. + */ +public class TestCompactionUtil { + + private HoodieFlinkTable table; + private HoodieTableMetaClient metaClient; + private Configuration conf; + + @TempDir + File tempFile; + + void beforeEach() throws IOException { + beforeEach(Collections.emptyMap()); + } + + void beforeEach(Map options) throws IOException { + this.conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + options.forEach((k, v) -> conf.setString(k, v)); + + StreamerUtil.initTableIfNotExists(conf); + + this.table = FlinkTables.createTable(conf); + this.metaClient = table.getMetaClient(); + // initialize the metadata table path + if (conf.getBoolean(FlinkOptions.METADATA_ENABLED)) { + FlinkHoodieBackedTableMetadataWriter.create(table.getHadoopConf(), table.getConfig(), + table.getContext(), Option.empty(), Option.empty()); + } + } + + @Test + void rollbackCompaction() throws Exception { + beforeEach(); + List oriInstants = IntStream.range(0, 3) + .mapToObj(i -> generateCompactionPlan()).collect(Collectors.toList()); + List instants = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline() + .filter(instant -> instant.getState() == HoodieInstant.State.INFLIGHT) + .getInstants() + .collect(Collectors.toList()); + assertThat("all the instants should be in pending state", instants.size(), is(3)); + CompactionUtil.rollbackCompaction(table); + boolean allRolledBack = metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants() + .allMatch(instant -> instant.getState() == HoodieInstant.State.REQUESTED); + assertTrue(allRolledBack, "all the instants should be rolled back"); + List actualInstants = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + assertThat(actualInstants, is(oriInstants)); + } + + @Test + void 
rollbackEarliestCompaction() throws Exception { + beforeEach(); + conf.setInteger(FlinkOptions.COMPACTION_TIMEOUT_SECONDS, 0); + List oriInstants = IntStream.range(0, 3) + .mapToObj(i -> generateCompactionPlan()).collect(Collectors.toList()); + List instants = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline() + .filter(instant -> instant.getState() == HoodieInstant.State.INFLIGHT) + .getInstants() + .collect(Collectors.toList()); + assertThat("all the instants should be in pending state", instants.size(), is(3)); + CompactionUtil.rollbackEarliestCompaction(table, conf); + long requestedCnt = metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants() + .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED).count(); + assertThat("Only the first instant expects to be rolled back", requestedCnt, is(1L)); + + String instantTime = metaClient.getActiveTimeline() + .filterPendingCompactionTimeline().filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED) + .firstInstant().get().getTimestamp(); + assertThat(instantTime, is(oriInstants.get(0))); + } + + @Test + void testScheduleCompaction() throws Exception { + Map options = new HashMap<>(); + options.put(FlinkOptions.COMPACTION_SCHEDULE_ENABLED.key(), "false"); + options.put(FlinkOptions.COMPACTION_TRIGGER_STRATEGY.key(), FlinkOptions.TIME_ELAPSED); + options.put(FlinkOptions.COMPACTION_DELTA_SECONDS.key(), "0"); + beforeEach(options); + + // write a commit with data first + TestData.writeDataAsBatch(TestData.DATA_SET_SINGLE_INSERT, conf); + + HoodieFlinkWriteClient writeClient = FlinkWriteClients.createWriteClient(conf); + CompactionUtil.scheduleCompaction(metaClient, writeClient, true, true); + + Option pendingCompactionInstant = metaClient.reloadActiveTimeline().filterPendingCompactionTimeline().lastInstant(); + assertTrue(pendingCompactionInstant.isPresent(), "A compaction plan expects to be scheduled"); + + // write another commit with data and start a new instant + TestData.writeDataAsBatch(TestData.DATA_SET_INSERT, conf); + TimeUnit.SECONDS.sleep(3); // in case the instant time interval is too close + writeClient.startCommit(); + + CompactionUtil.scheduleCompaction(metaClient, writeClient, true, false); + int numCompactionCommits = metaClient.reloadActiveTimeline().filterPendingCompactionTimeline().countInstants(); + assertThat("Two compaction plan expects to be scheduled", numCompactionCommits, is(2)); + } + + /** + * Generates a compaction plan on the timeline and returns its instant time. 
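testScheduleCompaction above disables scheduling in the write pipeline, configures time-elapsed triggering, and then schedules plans explicitly through CompactionUtil. A hedged sketch of the same option set (the table path is a placeholder; this is not part of the patch):

```java
import org.apache.flink.configuration.Configuration;
import org.apache.hudi.configuration.FlinkOptions;

public class CompactionConfSketch {
  /** Options mirroring what the test above sets for a MERGE_ON_READ table. */
  public static Configuration compactionConf(String tablePath) {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PATH, tablePath);
    conf.setString(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
    // trigger a new plan once the configured time has elapsed since the last compaction
    conf.setString(FlinkOptions.COMPACTION_TRIGGER_STRATEGY, FlinkOptions.TIME_ELAPSED);
    conf.setInteger(FlinkOptions.COMPACTION_DELTA_SECONDS, 0);
    // let CompactionUtil.scheduleCompaction(...) drive the planning instead of the write pipeline
    conf.setBoolean(FlinkOptions.COMPACTION_SCHEDULE_ENABLED, false);
    return conf;
  }
}
```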
+ */ + private String generateCompactionPlan() { + HoodieCompactionOperation operation = new HoodieCompactionOperation(); + HoodieCompactionPlan plan = new HoodieCompactionPlan(Collections.singletonList(operation), Collections.emptyMap(), 1); + String instantTime = HoodieActiveTimeline.createNewInstantTime(); + HoodieInstant compactionInstant = + new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime); + try { + metaClient.getActiveTimeline().saveToCompactionRequested(compactionInstant, + TimelineMetadataUtils.serializeCompactionPlan(plan)); + table.getActiveTimeline().transitionCompactionRequestedToInflight(compactionInstant); + } catch (IOException ioe) { + throw new HoodieIOException("Exception scheduling compaction", ioe); + } + metaClient.reloadActiveTimeline(); + return instantTime; + } +} + diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java new file mode 100644 index 0000000000000..a5b7e368a8856 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.streamer.FlinkStreamerConfig; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.utils.factory.CollectSinkTableFactory; +import org.apache.hudi.utils.factory.ContinuousFileSourceFactory; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Configurations for the test. 
+ */ +public class TestConfigurations { + private TestConfigurations() { + } + + public static final DataType ROW_DATA_TYPE = DataTypes.ROW( + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field + DataTypes.FIELD("partition", DataTypes.VARCHAR(10))) + .notNull(); + + public static final RowType ROW_TYPE = (RowType) ROW_DATA_TYPE.getLogicalType(); + + public static final ResolvedSchema TABLE_SCHEMA = SchemaBuilder.instance() + .fields(ROW_TYPE.getFieldNames(), ROW_DATA_TYPE.getChildren()) + .build(); + + private static final List FIELDS = ROW_TYPE.getFields().stream() + .map(RowType.RowField::asSummaryString).collect(Collectors.toList()); + + public static final DataType ROW_DATA_TYPE_WIDER = DataTypes.ROW( + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("salary", DataTypes.DOUBLE()), + DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field + DataTypes.FIELD("partition", DataTypes.VARCHAR(10))) + .notNull(); + + public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType(); + + public static final DataType ROW_DATA_TYPE_DATE = DataTypes.ROW( + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("dt", DataTypes.DATE())) + .notNull(); + + public static final RowType ROW_TYPE_DATE = (RowType) ROW_DATA_TYPE_DATE.getLogicalType(); + + public static String getCreateHoodieTableDDL(String tableName, Map options) { + return getCreateHoodieTableDDL(tableName, options, true, "partition"); + } + + public static String getCreateHoodieTableDDL( + String tableName, + Map options, + boolean havePartition, + String partitionField) { + return getCreateHoodieTableDDL(tableName, FIELDS, options, havePartition, "uuid", partitionField); + } + + public static String getCreateHoodieTableDDL( + String tableName, + List fields, + Map options, + boolean havePartition, + String pkField, + String partitionField) { + StringBuilder builder = new StringBuilder(); + builder.append("create table ").append(tableName).append("(\n"); + for (String field : fields) { + builder.append(" ").append(field).append(",\n"); + } + builder.append(" PRIMARY KEY(").append(pkField).append(") NOT ENFORCED\n") + .append(")\n"); + if (havePartition) { + builder.append("PARTITIONED BY (`").append(partitionField).append("`)\n"); + } + final String connector = options.computeIfAbsent("connector", k -> "hudi"); + builder.append("with (\n" + + " 'connector' = '").append(connector).append("'"); + options.forEach((k, v) -> builder.append(",\n") + .append(" '").append(k).append("' = '").append(v).append("'")); + builder.append("\n)"); + return builder.toString(); + } + + public static String getCreateHudiCatalogDDL(final String catalogName, final String catalogPath) { + StringBuilder builder = new StringBuilder(); + builder.append("create catalog ").append(catalogName).append(" with (\n"); + builder.append(" 'type' = 'hudi',\n" + + " 'catalog.path' = '").append(catalogPath).append("'"); + builder.append("\n)"); + return builder.toString(); + } + + public static String getFileSourceDDL(String tableName) { + return getFileSourceDDL(tableName, "test_source.data"); + } + + public static String 
getFileSourceDDL(String tableName, int checkpoints) { + return getFileSourceDDL(tableName, "test_source.data", checkpoints); + } + + public static String getFileSourceDDL(String tableName, String fileName) { + return getFileSourceDDL(tableName, fileName, 2); + } + + public static String getFileSourceDDL(String tableName, String fileName, int checkpoints) { + String sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource(fileName)).toString(); + return "create table " + tableName + "(\n" + + " uuid varchar(20),\n" + + " name varchar(10),\n" + + " age int,\n" + + " ts timestamp(3),\n" + + " `partition` varchar(20)\n" + + ") with (\n" + + " 'connector' = '" + ContinuousFileSourceFactory.FACTORY_ID + "',\n" + + " 'path' = '" + sourcePath + "',\n" + + " 'checkpoints' = '" + checkpoints + "'\n" + + ")"; + } + + public static String getCollectSinkDDL(String tableName) { + return "create table " + tableName + "(\n" + + " uuid varchar(20),\n" + + " name varchar(10),\n" + + " age int,\n" + + " ts timestamp(3),\n" + + " `partition` varchar(20)\n" + + ") with (\n" + + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'" + + ")"; + } + + public static String getCollectSinkDDL(String tableName, TableSchema tableSchema) { + final StringBuilder builder = new StringBuilder("create table " + tableName + "(\n"); + String[] fieldNames = tableSchema.getFieldNames(); + DataType[] fieldTypes = tableSchema.getFieldDataTypes(); + for (int i = 0; i < fieldNames.length; i++) { + builder.append(" `") + .append(fieldNames[i]) + .append("` ") + .append(fieldTypes[i].toString()); + if (i != fieldNames.length - 1) { + builder.append(","); + } + builder.append("\n"); + } + final String withProps = "" + + ") with (\n" + + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'\n" + + ")"; + builder.append(withProps); + return builder.toString(); + } + + public static String getCsvSourceDDL(String tableName, String fileName) { + String sourcePath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource(fileName)).toString(); + return "create table " + tableName + "(\n" + + " uuid varchar(20),\n" + + " name varchar(10),\n" + + " age int,\n" + + " ts timestamp(3),\n" + + " `partition` varchar(20)\n" + + ") with (\n" + + " 'connector' = 'filesystem',\n" + + " 'path' = '" + sourcePath + "',\n" + + " 'format' = 'csv'\n" + + ")"; + } + + public static final RowDataSerializer SERIALIZER = new RowDataSerializer(ROW_TYPE); + + public static Configuration getDefaultConf(String tablePath) { + Configuration conf = new Configuration(); + conf.setString(FlinkOptions.PATH, tablePath); + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH, + Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_read_schema.avsc")).toString()); + conf.setString(FlinkOptions.TABLE_NAME, "TestHoodieTable"); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition"); + return conf; + } + + public static Configuration getDefaultConf(String tablePath, DataType dataType) { + Configuration conf = new Configuration(); + conf.setString(FlinkOptions.PATH, tablePath); + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, AvroSchemaConverter.convertToSchema(dataType.getLogicalType()).toString()); + conf.setString(FlinkOptions.TABLE_NAME, "TestHoodieTable"); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition"); + return conf; + } + + public static FlinkStreamerConfig getDefaultStreamerConf(String tablePath) { + FlinkStreamerConfig 
streamerConf = new FlinkStreamerConfig(); + streamerConf.targetBasePath = tablePath; + streamerConf.sourceAvroSchemaPath = Objects.requireNonNull(Thread.currentThread() + .getContextClassLoader().getResource("test_read_schema.avsc")).toString(); + streamerConf.targetTableName = "TestHoodieTable"; + streamerConf.partitionPathField = "partition"; + streamerConf.tableType = "COPY_ON_WRITE"; + streamerConf.checkpointInterval = 4000L; + return streamerConf; + } + + /** + * Creates the tool to build hoodie table DDL. + */ + public static Sql sql(String tableName) { + return new Sql(tableName); + } + + public static Catalog catalog(String catalogName) { + return new Catalog(catalogName); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + /** + * Tool to build hoodie table DDL with schema {@link #TABLE_SCHEMA}. + */ + public static class Sql { + private final Map options; + private final String tableName; + private List fields = new ArrayList<>(); + private boolean withPartition = true; + private String pkField = "uuid"; + private String partitionField = "partition"; + + public Sql(String tableName) { + options = new HashMap<>(); + this.tableName = tableName; + } + + public Sql option(ConfigOption option, Object val) { + this.options.put(option.key(), val.toString()); + return this; + } + + public Sql option(String key, Object val) { + this.options.put(key, val.toString()); + return this; + } + + public Sql options(Map options) { + this.options.putAll(options); + return this; + } + + public Sql noPartition() { + this.withPartition = false; + return this; + } + + public Sql pkField(String pkField) { + this.pkField = pkField; + return this; + } + + public Sql partitionField(String partitionField) { + this.partitionField = partitionField; + return this; + } + + public Sql field(String fieldSchema) { + fields.add(fieldSchema); + return this; + } + + public String end() { + if (this.fields.size() == 0) { + this.fields = FIELDS; + } + return TestConfigurations.getCreateHoodieTableDDL(this.tableName, this.fields, options, + this.withPartition, this.pkField, this.partitionField); + } + } + + /** + * Tool to construct the catalog DDL. + */ + public static class Catalog { + private final String catalogName; + private String catalogPath = "."; + + public Catalog(String catalogName) { + this.catalogName = catalogName; + } + + public Catalog catalogPath(String catalogPath) { + this.catalogPath = catalogPath; + return this; + } + + public String end() { + return TestConfigurations.getCreateHudiCatalogDDL(catalogName, catalogPath); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java new file mode 100644 index 0000000000000..7ee745e0dcc0e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java @@ -0,0 +1,874 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.client.common.HoodieFlinkEngineContext; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.sink.utils.StreamWriteFunctionWrapper; +import org.apache.hudi.table.HoodieFlinkTable; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.binary.BinaryRowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.data.writer.BinaryRowWriter; +import org.apache.flink.table.data.writer.BinaryWriter; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.hadoop.ParquetReader; + +import java.io.File; +import java.io.FileFilter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static junit.framework.TestCase.assertEquals; +import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_PROPERTIES_FILE; +import static org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS; +import static org.apache.hudi.table.format.FormatUtils.buildAvroRecordBySchema; +import static org.hamcrest.CoreMatchers.is; +import static 
org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Data set for testing, also some utilities to check the results. + */ +public class TestData { + public static List DATA_SET_INSERT = Arrays.asList( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")), + insertRow(StringData.fromString("id2"), StringData.fromString("Stephen"), 33, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + insertRow(StringData.fromString("id3"), StringData.fromString("Julian"), 53, + TimestampData.fromEpochMillis(3), StringData.fromString("par2")), + insertRow(StringData.fromString("id4"), StringData.fromString("Fabian"), 31, + TimestampData.fromEpochMillis(4), StringData.fromString("par2")), + insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5), StringData.fromString("par3")), + insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20, + TimestampData.fromEpochMillis(6), StringData.fromString("par3")), + insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44, + TimestampData.fromEpochMillis(7), StringData.fromString("par4")), + insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56, + TimestampData.fromEpochMillis(8), StringData.fromString("par4")) + ); + + public static List DATA_SET_UPDATE_INSERT = Arrays.asList( + // advance the age by 1 + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 24, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")), + insertRow(StringData.fromString("id2"), StringData.fromString("Stephen"), 34, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + insertRow(StringData.fromString("id3"), StringData.fromString("Julian"), 54, + TimestampData.fromEpochMillis(3), StringData.fromString("par2")), + insertRow(StringData.fromString("id4"), StringData.fromString("Fabian"), 32, + TimestampData.fromEpochMillis(4), StringData.fromString("par2")), + // same with before + insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5), StringData.fromString("par3")), + // new data + insertRow(StringData.fromString("id9"), StringData.fromString("Jane"), 19, + TimestampData.fromEpochMillis(6), StringData.fromString("par3")), + insertRow(StringData.fromString("id10"), StringData.fromString("Ella"), 38, + TimestampData.fromEpochMillis(7), StringData.fromString("par4")), + insertRow(StringData.fromString("id11"), StringData.fromString("Phoebe"), 52, + TimestampData.fromEpochMillis(8), StringData.fromString("par4")) + ); + + public static List DATA_SET_INSERT_SEPARATE_PARTITION = Arrays.asList( + insertRow(StringData.fromString("id12"), StringData.fromString("Monica"), 27, + TimestampData.fromEpochMillis(9), StringData.fromString("par5")), + insertRow(StringData.fromString("id13"), StringData.fromString("Phoebe"), 31, + TimestampData.fromEpochMillis(10), StringData.fromString("par5")), + insertRow(StringData.fromString("id14"), StringData.fromString("Rachel"), 52, + TimestampData.fromEpochMillis(11), StringData.fromString("par6")), + insertRow(StringData.fromString("id15"), StringData.fromString("Ross"), 29, + TimestampData.fromEpochMillis(12), StringData.fromString("par6")) + ); + + public static List DATA_SET_INSERT_DUPLICATES = new ArrayList<>(); + + static { + 
IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_DUPLICATES.add( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")))); + } + + public static List DATA_SET_INSERT_SAME_KEY = new ArrayList<>(); + + static { + IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_SAME_KEY.add( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(i), StringData.fromString("par1")))); + } + + // data set of test_source.data + public static List DATA_SET_SOURCE_INSERT = Arrays.asList( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1000), StringData.fromString("par1")), + insertRow(StringData.fromString("id2"), StringData.fromString("Stephen"), 33, + TimestampData.fromEpochMillis(2000), StringData.fromString("par1")), + insertRow(StringData.fromString("id3"), StringData.fromString("Julian"), 53, + TimestampData.fromEpochMillis(3000), StringData.fromString("par2")), + insertRow(StringData.fromString("id4"), StringData.fromString("Fabian"), 31, + TimestampData.fromEpochMillis(4000), StringData.fromString("par2")), + insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5000), StringData.fromString("par3")), + insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20, + TimestampData.fromEpochMillis(6000), StringData.fromString("par3")), + insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44, + TimestampData.fromEpochMillis(7000), StringData.fromString("par4")), + insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56, + TimestampData.fromEpochMillis(8000), StringData.fromString("par4")) + ); + + // data set of test_source.data first commit. + public static List DATA_SET_SOURCE_INSERT_FIRST_COMMIT = Arrays.asList( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1000), StringData.fromString("par1")), + insertRow(StringData.fromString("id2"), StringData.fromString("Stephen"), 33, + TimestampData.fromEpochMillis(2000), StringData.fromString("par1")), + insertRow(StringData.fromString("id3"), StringData.fromString("Julian"), 53, + TimestampData.fromEpochMillis(3000), StringData.fromString("par2")), + insertRow(StringData.fromString("id4"), StringData.fromString("Fabian"), 31, + TimestampData.fromEpochMillis(4000), StringData.fromString("par2")) + ); + + // data set of test_source.data latest commit. 
+ public static List DATA_SET_SOURCE_INSERT_LATEST_COMMIT = Arrays.asList( + insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5000), StringData.fromString("par3")), + insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20, + TimestampData.fromEpochMillis(6000), StringData.fromString("par3")), + insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44, + TimestampData.fromEpochMillis(7000), StringData.fromString("par4")), + insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56, + TimestampData.fromEpochMillis(8000), StringData.fromString("par4")) + ); + + // merged data set of test_source.data and test_source_2.data + public static List DATA_SET_SOURCE_MERGED = Arrays.asList( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 24, + TimestampData.fromEpochMillis(1000), StringData.fromString("par1")), + insertRow(StringData.fromString("id2"), StringData.fromString("Stephen"), 34, + TimestampData.fromEpochMillis(2000), StringData.fromString("par1")), + insertRow(StringData.fromString("id3"), StringData.fromString("Julian"), 54, + TimestampData.fromEpochMillis(3000), StringData.fromString("par2")), + insertRow(StringData.fromString("id4"), StringData.fromString("Fabian"), 32, + TimestampData.fromEpochMillis(4000), StringData.fromString("par2")), + insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5000), StringData.fromString("par3")), + insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20, + TimestampData.fromEpochMillis(6000), StringData.fromString("par3")), + insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44, + TimestampData.fromEpochMillis(7000), StringData.fromString("par4")), + insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56, + TimestampData.fromEpochMillis(8000), StringData.fromString("par4")), + insertRow(StringData.fromString("id9"), StringData.fromString("Jane"), 19, + TimestampData.fromEpochMillis(6000), StringData.fromString("par3")), + insertRow(StringData.fromString("id10"), StringData.fromString("Ella"), 38, + TimestampData.fromEpochMillis(7000), StringData.fromString("par4")), + insertRow(StringData.fromString("id11"), StringData.fromString("Phoebe"), 52, + TimestampData.fromEpochMillis(8000), StringData.fromString("par4")) + ); + + // data set of test_source.data with partition 'par1' overwrite + public static List DATA_SET_SOURCE_INSERT_OVERWRITE = Arrays.asList( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 24, + TimestampData.fromEpochMillis(1000), StringData.fromString("par1")), + insertRow(StringData.fromString("id2"), StringData.fromString("Stephen"), 34, + TimestampData.fromEpochMillis(2000), StringData.fromString("par1")), + insertRow(StringData.fromString("id3"), StringData.fromString("Julian"), 53, + TimestampData.fromEpochMillis(3000), StringData.fromString("par2")), + insertRow(StringData.fromString("id4"), StringData.fromString("Fabian"), 31, + TimestampData.fromEpochMillis(4000), StringData.fromString("par2")), + insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5000), StringData.fromString("par3")), + insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20, + TimestampData.fromEpochMillis(6000), StringData.fromString("par3")), + insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44, 
+ TimestampData.fromEpochMillis(7000), StringData.fromString("par4")), + insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56, + TimestampData.fromEpochMillis(8000), StringData.fromString("par4")) + ); + + public static List DATA_SET_UPDATE_DELETE = Arrays.asList( + // this is update + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 24, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")), + insertRow(StringData.fromString("id2"), StringData.fromString("Stephen"), 34, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + // this is delete + deleteRow(StringData.fromString("id3"), StringData.fromString("Julian"), 53, + TimestampData.fromEpochMillis(3), StringData.fromString("par2")), + deleteRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18, + TimestampData.fromEpochMillis(5), StringData.fromString("par3")), + // delete a record that has no inserts + deleteRow(StringData.fromString("id9"), StringData.fromString("Jane"), 19, + TimestampData.fromEpochMillis(6), StringData.fromString("par3")) + ); + + public static List DATA_SET_INSERT_UPDATE_DELETE = Arrays.asList( + // INSERT + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 19, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")), + // UPDATE + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 19, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")), + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21, + TimestampData.fromEpochMillis(3), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21, + TimestampData.fromEpochMillis(3), StringData.fromString("par1")), + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22, + TimestampData.fromEpochMillis(4), StringData.fromString("par1")), + // DELETE + deleteRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22, + TimestampData.fromEpochMillis(5), StringData.fromString("par1")) + ); + + public static List DATA_SET_SINGLE_INSERT = Collections.singletonList( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1"))); + + public static List DATA_SET_DISORDER_INSERT = Arrays.asList( + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(3), StringData.fromString("par1")), + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22, + TimestampData.fromEpochMillis(4), StringData.fromString("par1")), + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")) + ); + + public static List DATA_SET_DISORDER_UPDATE_DELETE = Arrays.asList( + // DISORDER UPDATE + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21, + TimestampData.fromEpochMillis(3), StringData.fromString("par1")), + 
updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(1), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")), + updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22, + TimestampData.fromEpochMillis(4), StringData.fromString("par1")), + updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21, + TimestampData.fromEpochMillis(3), StringData.fromString("par1")), + // DISORDER DELETE + deleteRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22, + TimestampData.fromEpochMillis(2), StringData.fromString("par1")) + ); + + public static List dataSetInsert(int... ids) { + List inserts = new ArrayList<>(); + Arrays.stream(ids).forEach(i -> inserts.add( + insertRow(StringData.fromString("id" + i), StringData.fromString("Danny"), 23, + TimestampData.fromEpochMillis(i), StringData.fromString("par1")))); + return inserts; + } + + public static List filterOddRows(List rows) { + return filterRowsByIndexPredicate(rows, i -> i % 2 != 0); + } + + public static List filterEvenRows(List rows) { + return filterRowsByIndexPredicate(rows, i -> i % 2 == 0); + } + + private static List filterRowsByIndexPredicate(List rows, Predicate predicate) { + List filtered = new ArrayList<>(); + for (int i = 0; i < rows.size(); i++) { + if (predicate.test(i)) { + filtered.add(rows.get(i)); + } + } + return filtered; + } + + private static Integer toIdSafely(Object id) { + if (id == null) { + return -1; + } + final String idStr = id.toString(); + if (idStr.startsWith("id")) { + return Integer.parseInt(idStr.substring(2)); + } + return -1; + } + + /** + * Returns string format of a list of RowData. + */ + public static String rowDataToString(List rows) { + DataStructureConverter converter = + DataStructureConverters.getConverter(TestConfigurations.ROW_DATA_TYPE); + return rows.stream() + .sorted(Comparator.comparing(o -> toIdSafely(o.getString(0)))) + .map(row -> converter.toExternal(row).toString()) + .collect(Collectors.toList()).toString(); + } + + /** + * Write a list of row data with Hoodie format base on the given configuration. + * + * @param dataBuffer The data buffer to write + * @param conf The flink configuration + * @throws Exception if error occurs + */ + public static void writeData( + List dataBuffer, + Configuration conf) throws Exception { + StreamWriteFunctionWrapper funcWrapper = new StreamWriteFunctionWrapper<>( + conf.getString(FlinkOptions.PATH), + conf); + funcWrapper.openFunction(); + + for (RowData rowData : dataBuffer) { + funcWrapper.invoke(rowData); + } + + // this triggers the data write and event send + funcWrapper.checkpointFunction(1); + + final OperatorEvent nextEvent = funcWrapper.getNextEvent(); + funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); + funcWrapper.checkpointComplete(1); + + funcWrapper.close(); + } + + /** + * Write a list of row data with Hoodie format base on the given configuration. + * + *

    The difference with {@link #writeData} is that it flush data using #endInput, and it + * does not generate inflight instant. + * + * @param dataBuffer The data buffer to write + * @param conf The flink configuration + * @throws Exception if error occurs + */ + public static void writeDataAsBatch( + List dataBuffer, + Configuration conf) throws Exception { + StreamWriteFunctionWrapper funcWrapper = new StreamWriteFunctionWrapper<>( + conf.getString(FlinkOptions.PATH), + conf); + funcWrapper.openFunction(); + + for (RowData rowData : dataBuffer) { + funcWrapper.invoke(rowData); + } + + // this triggers the data write and event send + funcWrapper.endInput(); + + final OperatorEvent nextEvent = funcWrapper.getNextEvent(); + funcWrapper.getCoordinator().handleEventFromOperator(0, nextEvent); + + funcWrapper.close(); + } + + private static String toStringSafely(Object obj) { + return obj == null ? "null" : obj.toString(); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected string {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + */ + public static void assertRowsEquals(List rows, String expected) { + assertRowsEquals(rows, expected, false); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected string {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + * @param withChangeFlag Whether compares with change flags + */ + public static void assertRowsEquals(List rows, String expected, boolean withChangeFlag) { + String rowsString = rows.stream() + .sorted(Comparator.comparing(o -> toStringSafely(o.getField(0)))) + .map(row -> { + final String rowStr = row.toString(); + if (withChangeFlag) { + return row.getKind().shortString() + "(" + rowStr + ")"; + } else { + return rowStr; + } + }) + .collect(Collectors.toList()).toString(); + assertThat(rowsString, is(expected)); + } + + /** + * Sort the {@code rows} using field at index {@code orderingPos} and asserts + * it equals with the expected string {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + * @param orderingPos Field position for ordering + */ + public static void assertRowsEquals(List rows, String expected, int orderingPos) { + String rowsString = rows.stream() + .sorted(Comparator.comparing(o -> toStringSafely(o.getField(orderingPos)))) + .collect(Collectors.toList()).toString(); + assertThat(rowsString, is(expected)); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected row data list {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected row data list + */ + public static void assertRowsEquals(List rows, List expected) { + String rowsString = rows.stream() + .sorted(Comparator.comparing(o -> toIdSafely(o.getField(0)))) + .collect(Collectors.toList()).toString(); + assertThat(rowsString, is(rowDataToString(expected))); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected string {@code expected}. 
+ * + * @param rows Actual result rows + * @param expected Expected string of the sorted rows + */ + public static void assertRowDataEquals(List rows, String expected) { + String rowsString = rowDataToString(rows); + assertThat(rowsString, is(expected)); + } + + /** + * Sort the {@code rows} using field at index 0 and asserts + * it equals with the expected row data list {@code expected}. + * + * @param rows Actual result rows + * @param expected Expected row data list + */ + public static void assertRowDataEquals(List rows, List expected) { + String rowsString = rowDataToString(rows); + assertThat(rowsString, is(rowDataToString(expected))); + } + + /** + * Checks the source data set are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * and value should be values list with the key partition + */ + public static void checkWrittenData(File baseFile, Map expected) throws IOException { + checkWrittenData(baseFile, expected, 4); + } + + /** + * Checks the source data set are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * and value should be values list with the key partition + * @param partitions The expected partition number + */ + public static void checkWrittenData( + File baseFile, + Map expected, + int partitions) throws IOException { + checkWrittenData(baseFile, expected, partitions, TestData::filterOutVariables); + } + + /** + * Checks the source data set are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * and value should be values list with the key partition + * @param partitions The expected partition number + * @param extractor The fields extractor + */ + public static void checkWrittenData( + File baseFile, + Map expected, + int partitions, + Function extractor) throws IOException { + assert baseFile.isDirectory(); + FileFilter filter = file -> !file.getName().startsWith("."); + File[] partitionDirs = baseFile.listFiles(filter); + assertNotNull(partitionDirs); + assertThat(partitionDirs.length, is(partitions)); + for (File partitionDir : partitionDirs) { + File[] dataFiles = partitionDir.listFiles(filter); + assertNotNull(dataFiles); + File latestDataFile = Arrays.stream(dataFiles) + .max(Comparator.comparing(f -> FSUtils.getCommitTime(f.getName()))) + .orElse(dataFiles[0]); + ParquetReader reader = AvroParquetReader + .builder(new Path(latestDataFile.getAbsolutePath())).build(); + List readBuffer = new ArrayList<>(); + GenericRecord nextRecord = reader.read(); + while (nextRecord != null) { + readBuffer.add(extractor.apply(nextRecord)); + nextRecord = reader.read(); + } + readBuffer.sort(Comparator.naturalOrder()); + assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName()))); + } + } + + /** + * Checks the source data set are written as expected. + * Different with {@link #checkWrittenData}, it reads all the data files. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * and value should be values list with the key partition + * @param partitions The expected partition number + */ + public static void checkWrittenAllData( + File baseFile, + Map expected, + int partitions) throws IOException { + assert baseFile.isDirectory(); + FileFilter filter = file -> !file.getName().startsWith("."); + File[] partitionDirs = baseFile.listFiles(filter); + + assertNotNull(partitionDirs); + assertThat(partitionDirs.length, is(partitions)); + + for (File partitionDir : partitionDirs) { + File[] dataFiles = partitionDir.listFiles(filter); + assertNotNull(dataFiles); + + List readBuffer = new ArrayList<>(); + for (File dataFile : dataFiles) { + ParquetReader reader = AvroParquetReader + .builder(new Path(dataFile.getAbsolutePath())).build(); + GenericRecord nextRecord = reader.read(); + while (nextRecord != null) { + readBuffer.add(filterOutVariables(nextRecord)); + nextRecord = reader.read(); + } + } + + readBuffer.sort(Comparator.naturalOrder()); + assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName()))); + } + } + + /** + * Checks the source data are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param basePath The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + */ + public static void checkWrittenDataCOW( + File basePath, + Map> expected) throws IOException { + checkWrittenDataCOW(basePath, expected, TestData::filterOutVariables); + } + + /** + * Checks the source data are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param basePath The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * @param extractor The extractor to extract the required fields from the avro row + */ + public static void checkWrittenDataCOW( + File basePath, + Map> expected, + Function extractor) throws IOException { + + // 1. init flink table + HoodieTableMetaClient metaClient = HoodieTestUtils.init(basePath.toURI().toString()); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath.toURI().toString()).build(); + HoodieFlinkTable table = HoodieFlinkTable.create(config, HoodieFlinkEngineContext.DEFAULT, metaClient); + + // 2. check each partition data + expected.forEach((partition, partitionDataSet) -> { + + List readBuffer = new ArrayList<>(); + + table.getBaseFileOnlyView().getLatestBaseFiles(partition) + .forEach(baseFile -> { + String path = baseFile.getPath(); + try { + ParquetReader reader = AvroParquetReader.builder(new Path(path)).build(); + GenericRecord nextRecord = reader.read(); + while (nextRecord != null) { + readBuffer.add(extractor.apply(nextRecord)); + nextRecord = reader.read(); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + assertThat("Unexpected records number under partition: " + partition, + readBuffer.size(), is(partitionDataSet.size())); + for (String record : readBuffer) { + assertTrue(partitionDataSet.contains(record), "Unexpected record: " + record); + } + }); + + } + + /** + * Checks the MERGE_ON_READ source data are written as expected. + * + *

    Note: Replace it with the Flink reader when it is supported. + * + * @param fs The file system + * @param baseFile The file base to check, should be a directory + * @param expected The expected results mapping, the key should be the partition path + * @param partitions The expected partition number + */ + public static void checkWrittenDataMOR( + FileSystem fs, + File baseFile, + Map expected, + int partitions) throws Exception { + assert baseFile.isDirectory() : "Base path should be a directory"; + String basePath = baseFile.getAbsolutePath(); + File hoodiePropertiesFile = new File(baseFile + "/" + METAFOLDER_NAME + "/" + HOODIE_PROPERTIES_FILE); + assert hoodiePropertiesFile.exists(); + // 1. init flink table + HoodieWriteConfig config = HoodieWriteConfig.newBuilder() + .fromFile(hoodiePropertiesFile) + .withPath(basePath).build(); + HoodieTableMetaClient metaClient = HoodieTestUtils.init(basePath, HoodieTableType.MERGE_ON_READ, config.getProps()); + HoodieFlinkTable table = HoodieFlinkTable.create(config, HoodieFlinkEngineContext.DEFAULT, metaClient); + Schema schema = new TableSchemaResolver(metaClient).getTableAvroSchema(); + + String latestInstant = metaClient.getActiveTimeline().filterCompletedInstants() + .lastInstant().map(HoodieInstant::getTimestamp).orElse(null); + assertNotNull(latestInstant, "No completed commit under table path" + basePath); + + File[] partitionDirs = baseFile.listFiles(file -> !file.getName().startsWith(".") && file.isDirectory()); + assertNotNull(partitionDirs); + assertThat("The partitions number should be: " + partitions, partitionDirs.length, is(partitions)); + + // 2. check each partition data + final int[] requiredPos = IntStream.range(0, schema.getFields().size()).toArray(); + for (File partitionDir : partitionDirs) { + List readBuffer = new ArrayList<>(); + List fileSlices = table.getSliceView().getLatestMergedFileSlicesBeforeOrOn(partitionDir.getName(), latestInstant).collect(Collectors.toList()); + for (FileSlice fileSlice : fileSlices) { + HoodieMergedLogRecordScanner scanner = null; + List logPaths = fileSlice.getLogFiles() + .sorted(HoodieLogFile.getLogFileComparator()) + .map(logFile -> logFile.getPath().toString()) + .collect(Collectors.toList()); + if (logPaths.size() > 0) { + scanner = getScanner(fs, basePath, logPaths, schema, latestInstant); + } + String baseFilePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null); + Set keyToSkip = new HashSet<>(); + if (baseFilePath != null) { + // read the base file first + GenericRecordBuilder recordBuilder = new GenericRecordBuilder(schema); + ParquetReader reader = AvroParquetReader.builder(new Path(baseFilePath)).build(); + GenericRecord currentRecord = reader.read(); + while (currentRecord != null) { + String curKey = currentRecord.get(HOODIE_RECORD_KEY_COL_POS).toString(); + if (scanner != null && scanner.getRecords().containsKey(curKey)) { + keyToSkip.add(curKey); + // merge row with log. + final HoodieAvroRecord record = (HoodieAvroRecord) scanner.getRecords().get(curKey); + Option combineResult = record.getData().combineAndGetUpdateValue(currentRecord, schema, config.getProps()); + if (combineResult.isPresent()) { + GenericRecord avroRecord = buildAvroRecordBySchema(combineResult.get(), schema, requiredPos, recordBuilder); + readBuffer.add(filterOutVariables(avroRecord)); + } + } else { + readBuffer.add(filterOutVariables(currentRecord)); + } + currentRecord = reader.read(); + } + } + // read the remaining log data. 
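// Keys that were already merged with a base-file record above were collected in
// keyToSkip; whatever is still left in the scanner exists only in the log files,
// so it is materialized straight from the log block via getInsertValue below.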
+ if (scanner != null) { + for (String curKey : scanner.getRecords().keySet()) { + if (!keyToSkip.contains(curKey)) { + Option record = (Option) scanner.getRecords() + .get(curKey).getData() + .getInsertValue(schema, config.getProps()); + if (record.isPresent()) { + readBuffer.add(filterOutVariables(record.get())); + } + } + } + } + } + // Ensure that to write and read sequences are consistent. + readBuffer.sort(String::compareTo); + assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName()))); + } + } + + /** + * Returns the scanner to read avro log files. + */ + private static HoodieMergedLogRecordScanner getScanner( + FileSystem fs, + String basePath, + List logPaths, + Schema readSchema, + String instant) { + return HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(basePath) + .withLogFilePaths(logPaths) + .withReaderSchema(readSchema) + .withLatestInstantTime(instant) + .withReadBlocksLazily(false) + .withReverseReader(false) + .withBufferSize(16 * 1024 * 1024) + .withMaxMemorySizeInBytes(1024 * 1024L) + .withSpillableMapBasePath("/tmp/") + .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .build(); + } + + /** + * Filter out the variables like file name. + */ + private static String filterOutVariables(GenericRecord genericRecord) { + List fields = new ArrayList<>(); + fields.add(genericRecord.get("_hoodie_record_key").toString()); + fields.add(genericRecord.get("_hoodie_partition_path").toString()); + fields.add(genericRecord.get("uuid").toString()); + fields.add(genericRecord.get("name").toString()); + fields.add(genericRecord.get("age").toString()); + fields.add(genericRecord.get("ts").toString()); + fields.add(genericRecord.get("partition").toString()); + return String.join(",", fields); + } + + public static BinaryRowData insertRow(Object... fields) { + return insertRow(TestConfigurations.ROW_TYPE, fields); + } + + public static BinaryRowData insertRow(RowType rowType, Object... fields) { + LogicalType[] types = rowType.getFields().stream().map(RowType.RowField::getType) + .toArray(LogicalType[]::new); + assertEquals( + "Filed count inconsistent with type information", + fields.length, + types.length); + BinaryRowData row = new BinaryRowData(fields.length); + BinaryRowWriter writer = new BinaryRowWriter(row); + writer.reset(); + for (int i = 0; i < fields.length; i++) { + Object field = fields[i]; + if (field == null) { + writer.setNullAt(i); + } else { + BinaryWriter.write(writer, i, field, types[i], InternalSerializers.create(types[i])); + } + } + writer.complete(); + return row; + } + + private static BinaryRowData deleteRow(Object... fields) { + BinaryRowData rowData = insertRow(fields); + rowData.setRowKind(RowKind.DELETE); + return rowData; + } + + private static BinaryRowData updateBeforeRow(Object... fields) { + BinaryRowData rowData = insertRow(fields); + rowData.setRowKind(RowKind.UPDATE_BEFORE); + return rowData; + } + + private static BinaryRowData updateAfterRow(Object... 
fields) { + BinaryRowData rowData = insertRow(fields); + rowData.setRowKind(RowKind.UPDATE_AFTER); + return rowData; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestHoodieRowData.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestHoodieRowData.java new file mode 100644 index 0000000000000..10e7ca1b2c29e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestHoodieRowData.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.client.model.HoodieRowData; +import org.apache.hudi.common.model.HoodieRecord; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.junit.jupiter.api.Test; + +import java.math.BigDecimal; +import java.util.Random; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Unit tests {@link HoodieRowData}. 
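Stepping back to the TestData utilities above: the snippet below is an editorial sketch, not part of this patch, of how a typical Flink write test combines the fixtures with writeData and checkWrittenData. It assumes the JUnit 5 @TempDir setup and the TestConfigurations.getDefaultConf helper used by the other tests in this package, and the expected partition strings are left as placeholders rather than guessed.

import java.io.File;
import java.util.HashMap;
import java.util.Map;

import org.apache.flink.configuration.Configuration;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

// Hypothetical test class (same package as TestData), shown only to illustrate the helpers above.
class ExampleWriteRoundTrip {

  @TempDir
  File tempFile;

  @Test
  void writesAndChecksTheInsertFixture() throws Exception {
    Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath());

    // Push the eight fixture rows through the Flink write function wrapper and commit them.
    TestData.writeData(TestData.DATA_SET_INSERT, conf);

    // checkWrittenData expects one entry per partition; each value is the sorted,
    // comma-joined dump produced by filterOutVariables(). Concrete strings are elided here.
    Map<String, String> expected = new HashMap<>();
    expected.put("par1", "..."); // id1 and id2 rows
    expected.put("par2", "..."); // id3 and id4 rows
    expected.put("par3", "..."); // id5 and id6 rows
    expected.put("par4", "..."); // id7 and id8 rows
    TestData.checkWrittenData(tempFile, expected, 4);
  }
}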
+ */ +public class TestHoodieRowData { + private final int metaColumnsNum = HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION.size(); + private static final Random RANDOM = new Random(); + private static final int INTEGER_INDEX = 0; + private static final int STRING_INDEX = 1; + private static final int BOOLEAN_INDEX = 2; + private static final int SHORT_INDEX = 3; + private static final int BYTE_INDEX = 4; + private static final int LONG_INDEX = 5; + private static final int FLOAT_INDEX = 6; + private static final int DOUBLE_INDEX = 7; + private static final int DECIMAL_INDEX = 8; + private static final int BINARY_INDEX = 9; + private static final int ROW_INDEX = 10; + + private static final DataType BASIC_DATA_TYPE = DataTypes.ROW( + DataTypes.FIELD("integer", DataTypes.INT()), + DataTypes.FIELD("string", DataTypes.STRING()), + DataTypes.FIELD("boolean", DataTypes.BOOLEAN()), + DataTypes.FIELD("short", DataTypes.SMALLINT()), + DataTypes.FIELD("byte", DataTypes.TINYINT()), + DataTypes.FIELD("long", DataTypes.BIGINT()), + DataTypes.FIELD("float", DataTypes.FLOAT()), + DataTypes.FIELD("double", DataTypes.DOUBLE()), + DataTypes.FIELD("decimal", DataTypes.DECIMAL(10, 4)), + DataTypes.FIELD("binary", DataTypes.BYTES()), + DataTypes.FIELD("row", DataTypes.ROW())) + .notNull(); + private static final RowType ROW_TYPE = (RowType) BASIC_DATA_TYPE.getLogicalType(); + + @Test + public void testGet() { + Object[] values = getRandomValue(true); + RowData rowData = TestData.insertRow(ROW_TYPE, values); + + HoodieRowData hoodieRowData = new HoodieRowData("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", + rowData, true); + assertValues(hoodieRowData, "commitTime", "commitSeqNo", "recordKey", "partitionPath", + "fileName", values); + } + + /** + * Fetches a random Object[] of values for testing. + * + * @param haveRowType true if rowType need to be added as one of the elements in the Object[] + * @return the random Object[] thus generated + */ + private Object[] getRandomValue(boolean haveRowType) { + Object[] values = new Object[11]; + values[INTEGER_INDEX] = RANDOM.nextInt(); + values[STRING_INDEX] = StringData.fromString(UUID.randomUUID().toString()); + values[BOOLEAN_INDEX] = RANDOM.nextBoolean(); + values[SHORT_INDEX] = (short) RANDOM.nextInt(2); + byte[] bytes = new byte[1]; + RANDOM.nextBytes(bytes); + values[BYTE_INDEX] = bytes[0]; + values[LONG_INDEX] = RANDOM.nextLong(); + values[FLOAT_INDEX] = RANDOM.nextFloat(); + values[DOUBLE_INDEX] = RANDOM.nextDouble(); + values[DECIMAL_INDEX] = DecimalData.fromBigDecimal(new BigDecimal("1005.12313"), 10, 4); + bytes = new byte[20]; + RANDOM.nextBytes(bytes); + values[BINARY_INDEX] = bytes; + if (haveRowType) { + Object[] rowField = getRandomValue(false); + values[ROW_INDEX] = TestData.insertRow(ROW_TYPE, rowField); + } + return values; + } + + private void assertValues(HoodieRowData hoodieRowData, String commitTime, String commitSeqNo, String recordKey, String partitionPath, + String filename, Object[] values) { + assertEquals(commitTime, hoodieRowData.getString(0).toString()); + assertEquals(commitSeqNo, hoodieRowData.getString(1).toString()); + assertEquals(recordKey, hoodieRowData.getString(2).toString()); + assertEquals(partitionPath, hoodieRowData.getString(3).toString()); + assertEquals(filename, hoodieRowData.getString(4).toString()); + assertEquals("I", hoodieRowData.getString(5).toString()); + // row data. 
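// HoodieRowData prepends the Hoodie meta columns (commit time, sequence number, record key,
// partition path, file name, operation flag) ahead of the wrapped row, which is why every
// original field index below is shifted by metaColumnsNum.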
+ assertEquals(values[INTEGER_INDEX], hoodieRowData.getInt(INTEGER_INDEX + metaColumnsNum)); + assertEquals(values[STRING_INDEX], hoodieRowData.getString(STRING_INDEX + metaColumnsNum)); + assertEquals(values[BOOLEAN_INDEX], hoodieRowData.getBoolean(BOOLEAN_INDEX + metaColumnsNum)); + assertEquals(values[SHORT_INDEX], hoodieRowData.getShort(SHORT_INDEX + metaColumnsNum)); + assertEquals(values[BYTE_INDEX], hoodieRowData.getByte(BYTE_INDEX + metaColumnsNum)); + assertEquals(values[LONG_INDEX], hoodieRowData.getLong(LONG_INDEX + metaColumnsNum)); + assertEquals(values[FLOAT_INDEX], hoodieRowData.getFloat(FLOAT_INDEX + metaColumnsNum)); + assertEquals(values[DOUBLE_INDEX], hoodieRowData.getDouble(DOUBLE_INDEX + metaColumnsNum)); + assertEquals(values[DECIMAL_INDEX], hoodieRowData.getDecimal(DECIMAL_INDEX + metaColumnsNum, 10, 4)); + byte[] exceptBinary = (byte[]) values[BINARY_INDEX]; + byte[] binary = hoodieRowData.getBinary(BINARY_INDEX + metaColumnsNum); + for (int i = 0; i < exceptBinary.length; i++) { + assertEquals(exceptBinary[i], binary[i]); + } + assertEquals(values[ROW_INDEX], hoodieRowData.getRow(ROW_INDEX + metaColumnsNum, values.length)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java new file mode 100644 index 0000000000000..531847f3c87b0 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +/** + * Test sql statements. 
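The constants below are plain SQL strings. As a hedged illustration (not part of this patch), a test would typically run them against a Flink TableEnvironment on which a matching t1 table has already been registered, for example:

// Assumes an existing TableEnvironment `tableEnv` and a registered `t1` table.
tableEnv.executeSql(TestSQL.INSERT_T1).await();  // run the INSERT and wait for the job to finish
try (org.apache.flink.util.CloseableIterator<org.apache.flink.types.Row> it =
         tableEnv.executeSql("select * from t1").collect()) {
  it.forEachRemaining(System.out::println);      // inspect what was written
}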
+ */ +public class TestSQL { + private TestSQL() { + } + + public static final String INSERT_T1 = "insert into t1 values\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n" + + "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n" + + "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n" + + "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n" + + "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n" + + "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n" + + "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n" + + "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')"; + + public static final String INSERT_SAME_KEY_T1 = "insert into t1 values\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:05','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:04','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:03','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:02','par1'),\n" + + "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')"; + + public static final String UPDATE_INSERT_T1 = "insert into t1 values\n" + + "('id1','Danny',24,TIMESTAMP '1970-01-01 00:00:01','par1'),\n" + + "('id2','Stephen',34,TIMESTAMP '1970-01-01 00:00:02','par1'),\n" + + "('id3','Julian',54,TIMESTAMP '1970-01-01 00:00:03','par2'),\n" + + "('id4','Fabian',32,TIMESTAMP '1970-01-01 00:00:04','par2'),\n" + + "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n" + + "('id9','Jane',19,TIMESTAMP '1970-01-01 00:00:06','par3'),\n" + + "('id10','Ella',38,TIMESTAMP '1970-01-01 00:00:07','par4'),\n" + + "('id11','Phoebe',52,TIMESTAMP '1970-01-01 00:00:08','par4')"; + + public static final String COMPLEX_TYPE_INSERT_T1 = "insert into t1 values\n" + + "(1, array['abc1', 'def1'], map['abc1', 1, 'def1', 3], row(1, 'abc1')),\n" + + "(2, array['abc2', 'def2'], map['abc2', 1, 'def2', 3], row(2, 'abc2')),\n" + + "(3, array['abc3', 'def3'], map['abc3', 1, 'def3', 3], row(3, 'abc3'))"; + + public static final String COMPLEX_NESTED_ROW_TYPE_INSERT_T1 = "insert into t1 values\n" + + "(1, array['abc1', 'def1'], array[1, 1], map['abc1', 1, 'def1', 3], row(array['abc1', 'def1'], row(1, 'abc1'))),\n" + + "(2, array['abc2', 'def2'], array[2, 2], map['abc2', 1, 'def2', 3], row(array['abc2', 'def2'], row(2, 'abc2'))),\n" + + "(3, array['abc3', 'def3'], array[3, 3], map['abc3', 1, 'def3', 3], row(array['abc3', 'def3'], row(3, 'abc3')))"; + + public static final String NULL_CHILD_COLUMNS_ROW_TYPE_INSERT_T1 = "insert into t1 values\n" + + "(1, row(cast(null as int), 'abc1')),\n" + + "(2, row(2, cast(null as varchar))),\n" + + "(3, row(cast(null as int), cast(null as varchar)))"; + + public static final String INSERT_DATE_PARTITION_T1 = "insert into t1 values\n" + + "('id1','Danny',23,DATE '1970-01-01'),\n" + + "('id2','Stephen',33,DATE '1970-01-01'),\n" + + "('id3','Julian',53,DATE '1970-01-01'),\n" + + "('id4','Fabian',31,DATE '1970-01-01'),\n" + + "('id5','Sophia',18,DATE '1970-01-01'),\n" + + "('id6','Emma',20,DATE '1970-01-01'),\n" + + "('id7','Bob',44,DATE '1970-01-01'),\n" + + "('id8','Han',56,DATE '1970-01-01')"; +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestStreamerUtil.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestStreamerUtil.java new file mode 100644 index 0000000000000..a641811bb738a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestStreamerUtil.java @@ -0,0 +1,105 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.FileIOUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.keygen.SimpleAvroKeyGenerator; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link StreamerUtil}. + */ +public class TestStreamerUtil { + + @TempDir + File tempFile; + + @Test + void testInitTableIfNotExists() throws IOException { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + + // Test for partitioned table. + conf.setString(FlinkOptions.PRECOMBINE_FIELD, "ts"); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "p0,p1"); + StreamerUtil.initTableIfNotExists(conf); + + // Validate the partition fields & preCombineField in hoodie.properties. + HoodieTableMetaClient metaClient1 = HoodieTableMetaClient.builder() + .setBasePath(tempFile.getAbsolutePath()) + .setConf(new org.apache.hadoop.conf.Configuration()) + .build(); + assertTrue(metaClient1.getTableConfig().getPartitionFields().isPresent(), + "Missing partition columns in the hoodie.properties."); + assertArrayEquals(metaClient1.getTableConfig().getPartitionFields().get(), new String[] {"p0", "p1"}); + assertEquals(metaClient1.getTableConfig().getPreCombineField(), "ts"); + assertEquals(metaClient1.getTableConfig().getKeyGeneratorClassName(), SimpleAvroKeyGenerator.class.getName()); + + // Test for non-partitioned table. 
+ conf.removeConfig(FlinkOptions.PARTITION_PATH_FIELD); + FileIOUtils.deleteDirectory(tempFile); + StreamerUtil.initTableIfNotExists(conf); + HoodieTableMetaClient metaClient2 = HoodieTableMetaClient.builder() + .setBasePath(tempFile.getAbsolutePath()) + .setConf(new org.apache.hadoop.conf.Configuration()) + .build(); + assertFalse(metaClient2.getTableConfig().getPartitionFields().isPresent()); + assertEquals(metaClient2.getTableConfig().getKeyGeneratorClassName(), SimpleAvroKeyGenerator.class.getName()); + } + + @Test + void testMedianInstantTime() { + String higher = "20210705125921"; + String lower = "20210705125806"; + String expectedMedianInstant = "20210705125844499"; + String median1 = StreamerUtil.medianInstantTime(higher, lower).get(); + assertThat(median1, is(expectedMedianInstant)); + // test symmetry + assertThrows(IllegalArgumentException.class, + () -> StreamerUtil.medianInstantTime(lower, higher), + "The first argument should have newer instant time"); + // test very near instant time + assertFalse(StreamerUtil.medianInstantTime("20211116115634", "20211116115633").isPresent()); + } + + @Test + void testInstantTimeDiff() { + String higher = "20210705125921"; + String lower = "20210705125806"; + long diff = StreamerUtil.instantTimeDiffSeconds(higher, lower); + assertThat(diff, is(75L)); + } +} + diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestStringToRowDataConverter.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestStringToRowDataConverter.java new file mode 100644 index 0000000000000..8f7ecad1384cf --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestStringToRowDataConverter.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utils; + +import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.RowDataToAvroConverters; +import org.apache.hudi.util.StringToRowDataConverter; + +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.junit.jupiter.api.Test; + +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalTime; +import java.time.temporal.ChronoField; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +/** + * Test cases for {@link StringToRowDataConverter}. + */ +public class TestStringToRowDataConverter { + @Test + void testConvert() { + String[] fields = new String[] {"1.1", "3.4", "2021-03-30", "56669000", "1617119069000", "1617119069666111", "12345.67"}; + LogicalType[] fieldTypes = new LogicalType[] { + DataTypes.FLOAT().getLogicalType(), + DataTypes.DOUBLE().getLogicalType(), + DataTypes.DATE().getLogicalType(), + DataTypes.TIME(3).getLogicalType(), + DataTypes.TIMESTAMP(3).getLogicalType(), + DataTypes.TIMESTAMP(6).getLogicalType(), + DataTypes.DECIMAL(7, 2).getLogicalType() + }; + StringToRowDataConverter converter = new StringToRowDataConverter(fieldTypes); + Object[] converted = converter.convert(fields); + Object[] expected = new Object[] { + 1.1f, 3.4D, (int) LocalDate.parse("2021-03-30").toEpochDay(), + LocalTime.parse("15:44:29").get(ChronoField.MILLI_OF_DAY), + TimestampData.fromInstant(Instant.parse("2021-03-30T15:44:29Z")), + TimestampData.fromInstant(Instant.parse("2021-03-30T15:44:29.666111Z")), + DecimalData.fromBigDecimal(new BigDecimal("12345.67"), 7, 2) + }; + assertArrayEquals(expected, converted); + } + + @Test + void testRowDataToAvroStringToRowData() { + GenericRowData rowData = new GenericRowData(7); + rowData.setField(0, 1.1f); + rowData.setField(1, 3.4D); + rowData.setField(2, (int) LocalDate.parse("2021-03-30").toEpochDay()); + rowData.setField(3, LocalTime.parse("15:44:29").get(ChronoField.MILLI_OF_DAY)); + rowData.setField(4, TimestampData.fromInstant(Instant.parse("2021-03-30T15:44:29Z"))); + rowData.setField(5, TimestampData.fromInstant(Instant.parse("2021-03-30T15:44:29.666111Z"))); + rowData.setField(6, DecimalData.fromBigDecimal(new BigDecimal("12345.67"), 7, 2)); + + DataType dataType = DataTypes.ROW( + DataTypes.FIELD("f_float", DataTypes.FLOAT()), + DataTypes.FIELD("f_double", DataTypes.DOUBLE()), + DataTypes.FIELD("f_date", DataTypes.DATE()), + DataTypes.FIELD("f_time", DataTypes.TIME(3)), + DataTypes.FIELD("f_timestamp", DataTypes.TIMESTAMP(3)), + DataTypes.FIELD("f_timestamp_micros", DataTypes.TIMESTAMP(6)), + DataTypes.FIELD("f_decimal", DataTypes.DECIMAL(7, 2)) + ); + RowType rowType = (RowType) dataType.getLogicalType(); + RowDataToAvroConverters.RowDataToAvroConverter converter = + RowDataToAvroConverters.createConverter(rowType); + GenericRecord avroRecord = + (GenericRecord) converter.convert(AvroSchemaConverter.convertToSchema(rowType), rowData); + StringToRowDataConverter stringToRowDataConverter = + new 
StringToRowDataConverter(rowType.getChildren().toArray(new LogicalType[0])); + final String recordKey = KeyGenUtils.getRecordKey(avroRecord, rowType.getFieldNames(), false); + final String[] recordKeys = KeyGenUtils.extractRecordKeys(recordKey); + Object[] convertedKeys = stringToRowDataConverter.convert(recordKeys); + + GenericRowData converted = new GenericRowData(7); + for (int i = 0; i < 7; i++) { + converted.setField(i, convertedKeys[i]); + } + assertThat(converted, is(rowData)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestUtils.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestUtils.java new file mode 100644 index 0000000000000..4b3d87e387931 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestUtils.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.configuration.HadoopConfigurations; +import org.apache.hudi.source.StreamReadMonitoringFunction; +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; + +import javax.annotation.Nullable; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Common test utils. 
+ */ +public class TestUtils { + public static String getLastPendingInstant(String basePath) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(HadoopConfigurations.getHadoopConf(new Configuration())).setBasePath(basePath).build(); + return StreamerUtil.getLastPendingInstant(metaClient); + } + + public static String getLastCompleteInstant(String basePath) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(HadoopConfigurations.getHadoopConf(new Configuration())).setBasePath(basePath).build(); + return StreamerUtil.getLastCompletedInstant(metaClient); + } + + public static String getLastCompleteInstant(String basePath, String commitAction) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(HadoopConfigurations.getHadoopConf(new Configuration())).setBasePath(basePath).build(); + return metaClient.getCommitsTimeline().filterCompletedInstants() + .filter(instant -> commitAction.equals(instant.getAction())) + .lastInstant() + .map(HoodieInstant::getTimestamp) + .orElse(null); + } + + public static String getLastDeltaCompleteInstant(String basePath) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(HadoopConfigurations.getHadoopConf(new Configuration())).setBasePath(basePath).build(); + return metaClient.getCommitsTimeline().filterCompletedInstants() + .filter(hoodieInstant -> hoodieInstant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) + .lastInstant() + .map(HoodieInstant::getTimestamp) + .orElse(null); + } + + public static String getFirstCompleteInstant(String basePath) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(HadoopConfigurations.getHadoopConf(new Configuration())).setBasePath(basePath).build(); + return metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants().firstInstant() + .map(HoodieInstant::getTimestamp).orElse(null); + } + + @Nullable + public static String getNthCompleteInstant(String basePath, int n, boolean isDelta) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(HadoopConfigurations.getHadoopConf(new Configuration())).setBasePath(basePath).build(); + return metaClient.getActiveTimeline() + .filterCompletedInstants() + .filter(instant -> isDelta ? 
HoodieTimeline.DELTA_COMMIT_ACTION.equals(instant.getAction()) : HoodieTimeline.COMMIT_ACTION.equals(instant.getAction())) + .nthInstant(n).map(HoodieInstant::getTimestamp) + .orElse(null); + } + + @Nullable + public static String getNthArchivedInstant(String basePath, int n) { + final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(HadoopConfigurations.getHadoopConf(new Configuration())).setBasePath(basePath).build(); + return metaClient.getArchivedTimeline().getCommitsTimeline().filterCompletedInstants() + .nthInstant(n).map(HoodieInstant::getTimestamp).orElse(null); + } + + public static String getSplitPartitionPath(MergeOnReadInputSplit split) { + assertTrue(split.getLogPaths().isPresent()); + final String logPath = split.getLogPaths().get().get(0); + String[] paths = logPath.split(Path.SEPARATOR); + return paths[paths.length - 2]; + } + + public static StreamReadMonitoringFunction getMonitorFunc(Configuration conf) { + final String basePath = conf.getString(FlinkOptions.PATH); + return new StreamReadMonitoringFunction(conf, new Path(basePath), TestConfigurations.ROW_TYPE, 1024 * 1024L, null); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestViewStorageProperties.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestViewStorageProperties.java new file mode 100644 index 0000000000000..084f211e66098 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestViewStorageProperties.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils; + +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.util.FlinkWriteClients; +import org.apache.hudi.util.ViewStorageProperties; + +import org.apache.flink.configuration.Configuration; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.io.IOException; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for {@link ViewStorageProperties}. 
+ */ +public class TestViewStorageProperties { + @TempDir + File tempFile; + + @Test + void testReadWriteProperties() throws IOException { + String basePath = tempFile.getAbsolutePath(); + FileSystemViewStorageConfig config = FileSystemViewStorageConfig.newBuilder() + .withStorageType(FileSystemViewStorageType.SPILLABLE_DISK) + .withRemoteServerHost("host1") + .withRemoteServerPort(1234).build(); + Configuration flinkConfig = new Configuration(); + ViewStorageProperties.createProperties(basePath, config, flinkConfig); + ViewStorageProperties.createProperties(basePath, config, flinkConfig); + ViewStorageProperties.createProperties(basePath, config, flinkConfig); + + FileSystemViewStorageConfig readConfig = ViewStorageProperties.loadFromProperties(basePath, new Configuration()); + assertThat(readConfig.getStorageType(), is(FileSystemViewStorageType.SPILLABLE_DISK)); + assertThat(readConfig.getRemoteViewServerHost(), is("host1")); + assertThat(readConfig.getRemoteViewServerPort(), is(1234)); + } + + @Test + void testDumpRemoteViewStorageConfig() throws IOException { + Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + FlinkWriteClients.createWriteClient(conf); + FileSystemViewStorageConfig storageConfig = ViewStorageProperties.loadFromProperties(conf.getString(FlinkOptions.PATH), new Configuration()); + assertThat(storageConfig.getStorageType(), is(FileSystemViewStorageType.REMOTE_FIRST)); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/factory/CollectSinkTableFactory.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/factory/CollectSinkTableFactory.java new file mode 100644 index 0000000000000..33e9d376588e1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/factory/CollectSinkTableFactory.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utils.factory; + +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; +import org.apache.flink.streaming.api.functions.sink.SinkFunction; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.sink.SinkFunctionProvider; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.factories.DynamicTableSinkFactory; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Factory for CollectTableSink. + * + *

    Note: The CollectTableSink collects all the data of a table into a global collection {@code RESULT}, + * so the tests should executed in single thread and the table name should be the same. + */ +public class CollectSinkTableFactory implements DynamicTableSinkFactory { + public static final String FACTORY_ID = "collect"; + + // global results to collect and query + public static final Map> RESULT = new HashMap<>(); + + @Override + public DynamicTableSink createDynamicTableSink(Context context) { + FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); + helper.validate(); + + TableSchema schema = context.getCatalogTable().getSchema(); + RESULT.clear(); + return new CollectTableSink(schema, context.getObjectIdentifier().getObjectName()); + } + + @Override + public String factoryIdentifier() { + return FACTORY_ID; + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + return Collections.emptySet(); + } + + // -------------------------------------------------------------------------------------------- + // Table sinks + // -------------------------------------------------------------------------------------------- + + /** + * Values {@link DynamicTableSink} for testing. + */ + private static class CollectTableSink implements DynamicTableSink { + + private final TableSchema schema; + private final String tableName; + + private CollectTableSink( + TableSchema schema, + String tableName) { + this.schema = schema; + this.tableName = tableName; + } + + @Override + public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { + return ChangelogMode.newBuilder() + .addContainedKind(RowKind.INSERT) + .addContainedKind(RowKind.DELETE) + .addContainedKind(RowKind.UPDATE_AFTER) + .build(); + } + + @Override + public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { + final DataType rowType = schema.toPhysicalRowDataType(); + final RowTypeInfo rowTypeInfo = (RowTypeInfo) TypeConversions.fromDataTypeToLegacyInfo(rowType); + DataStructureConverter converter = context.createDataStructureConverter(schema.toPhysicalRowDataType()); + return SinkFunctionProvider.of(new CollectSinkFunction(converter, rowTypeInfo)); + } + + @Override + public DynamicTableSink copy() { + return new CollectTableSink(schema, tableName); + } + + @Override + public String asSummaryString() { + return "CollectSink"; + } + } + + static class CollectSinkFunction extends RichSinkFunction implements CheckpointedFunction { + + private static final long serialVersionUID = 1L; + private final DynamicTableSink.DataStructureConverter converter; + private final RowTypeInfo rowTypeInfo; + + protected transient ListState resultState; + protected transient List localResult; + + private int taskID; + + protected CollectSinkFunction(DynamicTableSink.DataStructureConverter converter, RowTypeInfo rowTypeInfo) { + this.converter = converter; + this.rowTypeInfo = rowTypeInfo; + } + + @Override + public void invoke(RowData value, SinkFunction.Context context) { + Row row = (Row) converter.toExternal(value); + assert row != null; + row.setKind(value.getRowKind()); + RESULT.get(taskID).add(row); + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + this.resultState = context.getOperatorStateStore().getListState( + new ListStateDescriptor<>("sink-results", rowTypeInfo)); + this.localResult = new ArrayList<>(); + if (context.isRestored()) { + for (Row value : 
resultState.get()) { + localResult.add(value); + } + } + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + synchronized (CollectSinkTableFactory.class) { + RESULT.put(taskID, localResult); + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + resultState.clear(); + resultState.addAll(RESULT.get(taskID)); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/factory/ContinuousFileSourceFactory.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/factory/ContinuousFileSourceFactory.java new file mode 100644 index 0000000000000..31b3ad5c7669d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/factory/ContinuousFileSourceFactory.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utils.factory; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.utils.source.ContinuousFileSource; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.factories.FactoryUtil; + +import java.util.Collections; +import java.util.Set; + +/** + * Factory for ContinuousFileSource. 
+ */ +public class ContinuousFileSourceFactory implements DynamicTableSourceFactory { + public static final String FACTORY_ID = "continuous-file-source"; + + public static final ConfigOption CHECKPOINTS = ConfigOptions + .key("checkpoints") + .intType() + .defaultValue(2) + .withDescription("Number of checkpoints to write the data set as, default 2"); + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); + helper.validate(); + + Configuration conf = (Configuration) helper.getOptions(); + Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() -> + new ValidationException("Option [path] should be not empty."))); + return new ContinuousFileSource(context.getCatalogTable().getResolvedSchema(), path, conf); + } + + @Override + public String factoryIdentifier() { + return FACTORY_ID; + } + + @Override + public Set> requiredOptions() { + return Collections.singleton(FlinkOptions.PATH); + } + + @Override + public Set> optionalOptions() { + return Collections.singleton(CHECKPOINTS); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/source/ContinuousFileSource.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/source/ContinuousFileSource.java new file mode 100644 index 0000000000000..2830eefef013f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/source/ContinuousFileSource.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utils.source; + +import org.apache.hudi.adapter.DataStreamScanProviderAdapter; + +import org.apache.flink.api.common.state.CheckpointListener; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.common.TimestampFormat; +import org.apache.flink.formats.json.JsonRowDataDeserializationSchema; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.apache.hudi.utils.factory.ContinuousFileSourceFactory.CHECKPOINTS; + +/** + * A continuous file source that can trigger checkpoints continuously. + * + *

    It loads the data in the specified file and splits the data into the given number of checkpoint batches. + * Say, if you want 4 checkpoints and there are 8 records in the file, the emit strategy is: + * + *

    + *   | 2 records | 2 records | 2 records | 2 records |
    + *   | cp1       | cp2       | cp3       | cp4       |
    + * 
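A minimal sketch of the split arithmetic the table above implies, assuming the same rule used by BoundedSourceFunction#run further below (batchSize = dataBuffer.size() / checkpoints, start = batchSize * checkpointIndex); the EmitStrategyExample class and batchStart method are hypothetical names used only for illustration and are not part of the patch:

class EmitStrategyExample {
  /** Start offset of the batch emitted during the given (0-based) checkpoint interval. */
  static int batchStart(int totalRecords, int checkpoints, int checkpointIndex) {
    int batchSize = totalRecords / checkpoints; // 8 records / 4 checkpoints = 2 records per batch
    return batchSize * checkpointIndex;         // indexes 0..3 -> starts 0, 2, 4, 6 (cp1..cp4 above)
  }
}

For the 8-record example this yields batches [0,2), [2,4), [4,6), [6,8), one batch per checkpoint interval.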
    + * + *

    If all the data are flushed out, it waits for the next checkpoint to finish and tear down the source. + */ +public class ContinuousFileSource implements ScanTableSource { + + private final ResolvedSchema tableSchema; + private final Path path; + private final Configuration conf; + + public ContinuousFileSource( + ResolvedSchema tableSchema, + Path path, + Configuration conf) { + this.tableSchema = tableSchema; + this.path = path; + this.conf = conf; + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { + return new DataStreamScanProviderAdapter() { + + @Override + public boolean isBounded() { + return false; + } + + @Override + public DataStream produceDataStream(StreamExecutionEnvironment execEnv) { + final RowType rowType = (RowType) tableSchema.toSourceRowDataType().getLogicalType(); + JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema( + rowType, + InternalTypeInfo.of(rowType), + false, + true, + TimestampFormat.ISO_8601); + + return execEnv.addSource(new BoundedSourceFunction(path, conf.getInteger(CHECKPOINTS))) + .name("continuous_file_source") + .setParallelism(1) + .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8)), + InternalTypeInfo.of(rowType)); + } + }; + } + + @Override + public ChangelogMode getChangelogMode() { + return ChangelogMode.insertOnly(); + } + + @Override + public DynamicTableSource copy() { + return new ContinuousFileSource(this.tableSchema, this.path, this.conf); + } + + @Override + public String asSummaryString() { + return "ContinuousFileSource"; + } + + /** + * Source function that partition the data into given number checkpoints batches. + */ + public static class BoundedSourceFunction implements SourceFunction, CheckpointListener { + private final Path path; + private List dataBuffer; + + private final int checkpoints; + private final AtomicInteger currentCP = new AtomicInteger(0); + + private volatile boolean isRunning = true; + + public BoundedSourceFunction(Path path, int checkpoints) { + this.path = path; + this.checkpoints = checkpoints; + } + + @Override + public void run(SourceContext context) throws Exception { + if (this.dataBuffer == null) { + loadDataBuffer(); + } + int oldCP = this.currentCP.get(); + boolean finish = false; + while (isRunning) { + int batchSize = this.dataBuffer.size() / this.checkpoints; + int start = batchSize * oldCP; + synchronized (context.getCheckpointLock()) { + for (int i = start; i < start + batchSize; i++) { + if (i >= this.dataBuffer.size()) { + finish = true; + break; + // wait for the next checkpoint and exit + } + context.collect(this.dataBuffer.get(i)); + } + } + oldCP++; + while (this.currentCP.get() < oldCP) { + synchronized (context.getCheckpointLock()) { + context.getCheckpointLock().wait(10); + } + } + if (finish || !isRunning) { + return; + } + } + } + + @Override + public void cancel() { + this.isRunning = false; + } + + private void loadDataBuffer() { + try { + this.dataBuffer = Files.readAllLines(Paths.get(this.path.toUri())); + } catch (IOException e) { + throw new RuntimeException("Read file " + this.path + " error", e); + } + } + + @Override + public void notifyCheckpointComplete(long l) { + this.currentCP.incrementAndGet(); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/hudi-flink-datasource/hudi-flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory new file 
mode 100644 index 0000000000000..19e43c4f81d82 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.hudi.utils.factory.ContinuousFileSourceFactory +org.apache.hudi.utils.factory.CollectSinkTableFactory diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/debezium_json.data b/hudi-flink-datasource/hudi-flink/src/test/resources/debezium_json.data new file mode 100644 index 0000000000000..d4c02e9e9ee7a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/debezium_json.data @@ -0,0 +1,16 @@ +{"before":null,"after":{"id":101,"ts":1000,"name":"scooter","description":"Small 2-wheel scooter","weight":3.140000104904175},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606100,"transaction":null} +{"before":null,"after":{"id":102,"ts":2000,"name":"car battery","description":"12V car battery","weight":8.100000381469727},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} +{"before":null,"after":{"id":103,"ts":3000,"name":"12-pack drill bits","description":"12-pack of drill bits with sizes ranging from #40 to #3","weight":0.800000011920929},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} +{"before":null,"after":{"id":104,"ts":4000,"name":"hammer","description":"12oz carpenter's hammer","weight":0.75},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} +{"before":null,"after":{"id":105,"ts":5000,"name":"hammer","description":"14oz carpenter's hammer","weight":0.875},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} 
+{"before":null,"after":{"id":106,"ts":6000,"name":"hammer","description":"16oz carpenter's hammer","weight":1},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} +{"before":null,"after":{"id":107,"ts":7000,"name":"rocks","description":"box of assorted rocks","weight":5.300000190734863},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} +{"before":null,"after":{"id":108,"ts":8000,"name":"jacket","description":"water resistent black wind breaker","weight":0.10000000149011612},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} +{"before":null,"after":{"id":109,"ts":9000,"name":"spare tire","description":"24 inch spare tire","weight":22.200000762939453},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":0,"snapshot":"true","db":"inventory","table":"products","server_id":0,"gtid":null,"file":"mysql-bin.000003","pos":154,"row":0,"thread":null,"query":null},"op":"c","ts_ms":1589355606101,"transaction":null} +{"before":{"id":106,"ts":6000,"name":"hammer","description":"16oz carpenter's hammer","weight":1},"after":{"id":106,"ts":10000,"name":"hammer","description":"18oz carpenter hammer","weight":1},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":1589361987000,"snapshot":"false","db":"inventory","table":"products","server_id":223344,"gtid":null,"file":"mysql-bin.000003","pos":362,"row":0,"thread":2,"query":null},"op":"u","ts_ms":1589361987936,"transaction":null} +{"before":{"id":107,"ts":7000,"name":"rocks","description":"box of assorted rocks","weight":5.300000190734863},"after":{"id":107,"ts":11000,"name":"rocks","description":"box of assorted rocks","weight":5.099999904632568},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":1589362099000,"snapshot":"false","db":"inventory","table":"products","server_id":223344,"gtid":null,"file":"mysql-bin.000003","pos":717,"row":0,"thread":2,"query":null},"op":"u","ts_ms":1589362099505,"transaction":null} +{"before":null,"after":{"id":110,"ts":12000,"name":"jacket","description":"water resistent white wind breaker","weight":0.20000000298023224},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":1589362210000,"snapshot":"false","db":"inventory","table":"products","server_id":223344,"gtid":null,"file":"mysql-bin.000003","pos":1068,"row":0,"thread":2,"query":null},"op":"c","ts_ms":1589362210230,"transaction":null} +{"before":null,"after":{"id":111,"ts":13000,"name":"scooter","description":"Big 2-wheel scooter ","weight":5.179999828338623},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":1589362243000,"snapshot":"false","db":"inventory","table":"products","server_id":223344,"gtid":null,"file":"mysql-bin.000003","pos":1394,"row":0,"thread":2,"query":null},"op":"c","ts_ms":1589362243428,"transaction":null} 
+{"before":{"id":110,"ts":12000,"name":"jacket","description":"water resistent white wind breaker","weight":0.20000000298023224},"after":{"id":110,"ts":14000,"name":"jacket","description":"new water resistent white wind breaker","weight":0.5},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":1589362293000,"snapshot":"false","db":"inventory","table":"products","server_id":223344,"gtid":null,"file":"mysql-bin.000003","pos":1707,"row":0,"thread":2,"query":null},"op":"u","ts_ms":1589362293539,"transaction":null} +{"before":{"id":111,"ts":13000,"name":"scooter","description":"Big 2-wheel scooter ","weight":5.179999828338623},"after":{"id":111,"ts":15000,"name":"scooter","description":"Big 2-wheel scooter ","weight":5.170000076293945},"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":1589362330000,"snapshot":"false","db":"inventory","table":"products","server_id":223344,"gtid":null,"file":"mysql-bin.000003","pos":2090,"row":0,"thread":2,"query":null},"op":"u","ts_ms":1589362330904,"transaction":null} +{"before":{"id":111,"ts":16000,"name":"scooter","description":"Big 2-wheel scooter ","weight":5.170000076293945},"after":null,"source":{"version":"1.1.1.Final","connector":"mysql","name":"dbserver1","ts_ms":1589362344000,"snapshot":"false","db":"inventory","table":"products","server_id":223344,"gtid":null,"file":"mysql-bin.000003","pos":2443,"row":0,"thread":2,"query":null},"op":"d","ts_ms":1589362344455,"transaction":null} diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/hive-site.xml b/hudi-flink-datasource/hudi-flink/src/test/resources/hive-site.xml new file mode 100644 index 0000000000000..5c486f756ef53 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/hive-site.xml @@ -0,0 +1,30 @@ + + + + + + + hive.metastore.schema.verification + false + + + + datanucleus.schema.autoCreateTables + true + + diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test-catalog-factory-conf/hive-site.xml b/hudi-flink-datasource/hudi-flink/src/test/resources/test-catalog-factory-conf/hive-site.xml new file mode 100644 index 0000000000000..8152695857b06 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/test-catalog-factory-conf/hive-site.xml @@ -0,0 +1,26 @@ + + + + + + + + hive.metastore.uris + dummy-hms + + diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test_read_schema.avsc b/hudi-flink-datasource/hudi-flink/src/test/resources/test_read_schema.avsc new file mode 100644 index 0000000000000..0cbb4e3d2c6c2 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/test_read_schema.avsc @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "type" : "record", + "name" : "record", + "fields" : [ { + "name" : "uuid", + "type" : [ "null", "string" ], + "default" : null + }, { + "name" : "name", + "type" : [ "null", "string" ], + "default" : null + }, { + "name" : "age", + "type" : [ "null", "int" ], + "default" : null + }, { + "name" : "ts", + "type" : [ "null", { + "type" : "long", + "logicalType" : "timestamp-millis" + } ], + "default" : null + }, { + "name" : "partition", + "type" : [ "null", "string" ], + "default" : null + } ] +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test_source.data b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source.data new file mode 100644 index 0000000000000..2f628e29c535b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source.data @@ -0,0 +1,8 @@ +{"uuid": "id1", "name": "Danny", "age": 23, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id2", "name": "Stephen", "age": 33, "ts": "1970-01-01T00:00:02", "partition": "par1"} +{"uuid": "id3", "name": "Julian", "age": 53, "ts": "1970-01-01T00:00:03", "partition": "par2"} +{"uuid": "id4", "name": "Fabian", "age": 31, "ts": "1970-01-01T00:00:04", "partition": "par2"} +{"uuid": "id5", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"} +{"uuid": "id6", "name": "Emma", "age": 20, "ts": "1970-01-01T00:00:06", "partition": "par3"} +{"uuid": "id7", "name": "Bob", "age": 44, "ts": "1970-01-01T00:00:07", "partition": "par4"} +{"uuid": "id8", "name": "Han", "age": 56, "ts": "1970-01-01T00:00:08", "partition": "par4"} diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_2.data b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_2.data new file mode 100644 index 0000000000000..ff8265d4b9eb1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_2.data @@ -0,0 +1,8 @@ +{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id2", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", "partition": "par1"} +{"uuid": "id3", "name": "Julian", "age": 54, "ts": "1970-01-01T00:00:03", "partition": "par2"} +{"uuid": "id4", "name": "Fabian", "age": 32, "ts": "1970-01-01T00:00:04", "partition": "par2"} +{"uuid": "id5", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"} +{"uuid": "id9", "name": "Jane", "age": 19, "ts": "1970-01-01T00:00:06", "partition": "par3"} +{"uuid": "id10", "name": "Ella", "age": 38, "ts": "1970-01-01T00:00:07", "partition": "par4"} +{"uuid": "id11", "name": "Phoebe", "age": 52, "ts": "1970-01-01T00:00:08", "partition": "par4"} diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_3.data b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_3.data new file mode 100644 index 0000000000000..18f0a9d0ed33d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_3.data @@ -0,0 +1,8 @@ +{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id2", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", "partition": "par1"} +{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id2", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", "partition": "par1"} +{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id2", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", 
"partition": "par1"} +{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id2", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", "partition": "par1"} diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_4.data b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_4.data new file mode 100644 index 0000000000000..1ed4d19fbfaa0 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_4.data @@ -0,0 +1,8 @@ +{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"} +{"uuid": "id1", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", "partition": "par1"} +{"uuid": "id1", "name": "Julian", "age": 54, "ts": "1970-01-01T00:00:03", "partition": "par2"} +{"uuid": "id1", "name": "Fabian", "age": 32, "ts": "1970-01-01T00:00:04", "partition": "par2"} +{"uuid": "id1", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"} +{"uuid": "id1", "name": "Jane", "age": 19, "ts": "1970-01-01T00:00:06", "partition": "par3"} +{"uuid": "id1", "name": "Ella", "age": 38, "ts": "1970-01-01T00:00:07", "partition": "par4"} +{"uuid": "id1", "name": "Phoebe", "age": 52, "ts": "1970-01-01T00:00:08", "partition": "par4"} diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_5.data b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_5.data new file mode 100644 index 0000000000000..19b6a25a77acc --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source_5.data @@ -0,0 +1,8 @@ +id1,Danny,23,1970-01-01 00:00:01,par1 +id2,Stephen,33,1970-01-01 00:00:02,par1 +id3,Julian,53,1970-01-01 00:00:03,par2 +id4,Fabian,31,1970-01-01 00:00:04,par2 +id5,Sophia,18,1970-01-01 00:00:05,par3 +id6,Emma,20,1970-01-01 00:00:06,par3 +id7,Bob,44,1970-01-01 00:00:07,par4 +id8,Han,56,1970-01-01 00:00:08,par4 diff --git a/hudi-flink-datasource/hudi-flink1.13.x/pom.xml b/hudi-flink-datasource/hudi-flink1.13.x/pom.xml new file mode 100644 index 0000000000000..17790960f36dc --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/pom.xml @@ -0,0 +1,177 @@ + + + + + hudi-flink-datasource + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-flink1.13.x + 0.12.2-dt-SNAPSHOT + jar + + + ${project.parent.parent.basedir} + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-slf4j-impl + + + org.slf4j + slf4j-api + + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + provided + + + org.apache.flink + flink-table-runtime-blink_${scala.binary.version} + ${flink1.13.version} + provided + + + org.apache.flink + flink-streaming-java_${scala.binary.version} + ${flink1.13.version} + provided + + + org.apache.flink + flink-core + ${flink1.13.version} + provided + + + org.apache.flink + flink-parquet_${scala.binary.version} + ${flink1.13.version} + provided + + + org.apache.flink + flink-json + ${flink1.13.version} + provided + + + org.apache.flink + flink-runtime_${scala.binary.version} + ${flink1.13.version} + test + test-jar + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + org.junit.platform + junit-platform-runner + 
test + + + org.junit.platform + junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java new file mode 100644 index 0000000000000..51c53f368fb9d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; + +/** + * Adapter clazz for {@code AbstractStreamOperator}. + */ +public abstract class AbstractStreamOperatorAdapter extends AbstractStreamOperator { + @Override + public void close() throws Exception { + super.dispose(); + } + + public void finish() throws Exception { + super.close(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java new file mode 100644 index 0000000000000..0ea0968f17585 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.MailboxExecutor; +import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Adapter clazz for {@link AbstractStreamOperatorFactory}. + */ +public abstract class AbstractStreamOperatorFactoryAdapter + extends AbstractStreamOperatorFactory implements YieldingOperatorFactory { + private transient MailboxExecutor mailboxExecutor; + + @Override + public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { + this.mailboxExecutor = mailboxExecutor; + } + + public MailboxExecutorAdapter getMailboxExecutorAdapter() { + return new MailboxExecutorAdapter(getMailboxExecutor()); + } + + /** + * Provides the mailbox executor iff this factory implements {@link YieldingOperatorFactory}. + */ + protected MailboxExecutor getMailboxExecutor() { + return checkNotNull( + mailboxExecutor, "Factory does not implement %s", YieldingOperatorFactory.class); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java new file mode 100644 index 0000000000000..867395c43f199 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.table.connector.source.DataStreamScanProvider; + +/** + * Adapter clazz for {@code DataStreamScanProvider}. + */ +public interface DataStreamScanProviderAdapter extends DataStreamScanProvider { +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java new file mode 100644 index 0000000000000..e8eaa3c62d441 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.table.connector.sink.DataStreamSinkProvider; + +/** + * Adapter clazz for {@code DataStreamSinkProvider}. + */ +public interface DataStreamSinkProviderAdapter extends DataStreamSinkProvider { +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java new file mode 100644 index 0000000000000..9ae3ca6912f65 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.MailboxExecutor; +import org.apache.flink.util.function.ThrowingRunnable; + +/** + * Adapter clazz for {@link MailboxExecutor}. + */ +public class MailboxExecutorAdapter { + private final MailboxExecutor executor; + + public MailboxExecutorAdapter(MailboxExecutor executor) { + this.executor = executor; + } + + public void execute(ThrowingRunnable command, String description) { + this.executor.execute(command, description); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java new file mode 100644 index 0000000000000..6d058de89bc55 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.shaded.guava18.com.google.common.util.concurrent.RateLimiter; + +/** + * Bridge class for shaded guava clazz {@code RateLimiter}. + */ +public class RateLimiterAdapter { + private final RateLimiter rateLimiter; + + private RateLimiterAdapter(double permitsPerSecond) { + this.rateLimiter = RateLimiter.create(permitsPerSecond); + } + + public static RateLimiterAdapter create(double permitsPerSecond) { + return new RateLimiterAdapter(permitsPerSecond); + } + + public void acquire() { + this.rateLimiter.acquire(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/Utils.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/Utils.java new file mode 100644 index 0000000000000..1f9ebb582394c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/adapter/Utils.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.TimeCharacteristic; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.api.operators.StreamSourceContexts; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.factories.FactoryUtil; + +/** + * Adapter utils. 
+ */ +public class Utils { + public static SourceFunction.SourceContext getSourceContext( + TimeCharacteristic timeCharacteristic, + ProcessingTimeService processingTimeService, + StreamTask streamTask, + Output> output, + long watermarkInterval) { + return StreamSourceContexts.getSourceContext( + timeCharacteristic, + processingTimeService, + new Object(), // no actual locking needed + streamTask.getStreamStatusMaintainer(), + output, + watermarkInterval, + -1); + } + + public static FactoryUtil.DefaultDynamicTableContext getTableContext( + ObjectIdentifier tablePath, + ResolvedCatalogTable catalogTable, + ReadableConfig conf) { + return new FactoryUtil.DefaultDynamicTableContext(tablePath, catalogTable, + conf, Thread.currentThread().getContextClassLoader(), false); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarArrayData.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarArrayData.java new file mode 100644 index 0000000000000..20c63d26f7492 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarArrayData.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.data; + +import org.apache.hudi.table.data.vector.MapColumnVector; +import org.apache.hudi.table.data.vector.RowColumnVector; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.binary.TypedSetters; +import org.apache.flink.table.data.vector.ArrayColumnVector; +import org.apache.flink.table.data.vector.BooleanColumnVector; +import org.apache.flink.table.data.vector.ByteColumnVector; +import org.apache.flink.table.data.vector.BytesColumnVector; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.DecimalColumnVector; +import org.apache.flink.table.data.vector.DoubleColumnVector; +import org.apache.flink.table.data.vector.FloatColumnVector; +import org.apache.flink.table.data.vector.IntColumnVector; +import org.apache.flink.table.data.vector.LongColumnVector; +import org.apache.flink.table.data.vector.ShortColumnVector; +import org.apache.flink.table.data.vector.TimestampColumnVector; + +import java.util.Arrays; + +/** + * Columnar array to support access to vector column data. + * + *

    References {@code org.apache.flink.table.data.ColumnarArrayData} to include FLINK-15390. + */ +public final class ColumnarArrayData implements ArrayData, TypedSetters { + + private final ColumnVector data; + private final int offset; + private final int numElements; + + public ColumnarArrayData(ColumnVector data, int offset, int numElements) { + this.data = data; + this.offset = offset; + this.numElements = numElements; + } + + @Override + public int size() { + return numElements; + } + + @Override + public boolean isNullAt(int pos) { + return data.isNullAt(offset + pos); + } + + @Override + public void setNullAt(int pos) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public boolean getBoolean(int pos) { + return ((BooleanColumnVector) data).getBoolean(offset + pos); + } + + @Override + public byte getByte(int pos) { + return ((ByteColumnVector) data).getByte(offset + pos); + } + + @Override + public short getShort(int pos) { + return ((ShortColumnVector) data).getShort(offset + pos); + } + + @Override + public int getInt(int pos) { + return ((IntColumnVector) data).getInt(offset + pos); + } + + @Override + public long getLong(int pos) { + return ((LongColumnVector) data).getLong(offset + pos); + } + + @Override + public float getFloat(int pos) { + return ((FloatColumnVector) data).getFloat(offset + pos); + } + + @Override + public double getDouble(int pos) { + return ((DoubleColumnVector) data).getDouble(offset + pos); + } + + @Override + public StringData getString(int pos) { + BytesColumnVector.Bytes byteArray = getByteArray(pos); + return StringData.fromBytes(byteArray.data, byteArray.offset, byteArray.len); + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return ((DecimalColumnVector) data).getDecimal(offset + pos, precision, scale); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return ((TimestampColumnVector) data).getTimestamp(offset + pos, precision); + } + + @Override + public RawValueData getRawValue(int pos) { + throw new UnsupportedOperationException("RawValueData is not supported."); + } + + @Override + public byte[] getBinary(int pos) { + BytesColumnVector.Bytes byteArray = getByteArray(pos); + if (byteArray.len == byteArray.data.length) { + return byteArray.data; + } else { + return Arrays.copyOfRange(byteArray.data, byteArray.offset, byteArray.len); + } + } + + @Override + public ArrayData getArray(int pos) { + return ((ArrayColumnVector) data).getArray(offset + pos); + } + + @Override + public MapData getMap(int pos) { + return ((MapColumnVector) data).getMap(offset + pos); + } + + @Override + public RowData getRow(int pos, int numFields) { + return ((RowColumnVector) data).getRow(offset + pos); + } + + @Override + public void setBoolean(int pos, boolean value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setByte(int pos, byte value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setShort(int pos, short value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setInt(int pos, int value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setLong(int pos, long value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setFloat(int pos, float value) { + throw new 
UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setDouble(int pos, double value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setDecimal(int pos, DecimalData value, int precision) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setTimestamp(int pos, TimestampData value, int precision) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public boolean[] toBooleanArray() { + boolean[] res = new boolean[numElements]; + for (int i = 0; i < numElements; i++) { + res[i] = getBoolean(i); + } + return res; + } + + @Override + public byte[] toByteArray() { + byte[] res = new byte[numElements]; + for (int i = 0; i < numElements; i++) { + res[i] = getByte(i); + } + return res; + } + + @Override + public short[] toShortArray() { + short[] res = new short[numElements]; + for (int i = 0; i < numElements; i++) { + res[i] = getShort(i); + } + return res; + } + + @Override + public int[] toIntArray() { + int[] res = new int[numElements]; + for (int i = 0; i < numElements; i++) { + res[i] = getInt(i); + } + return res; + } + + @Override + public long[] toLongArray() { + long[] res = new long[numElements]; + for (int i = 0; i < numElements; i++) { + res[i] = getLong(i); + } + return res; + } + + @Override + public float[] toFloatArray() { + float[] res = new float[numElements]; + for (int i = 0; i < numElements; i++) { + res[i] = getFloat(i); + } + return res; + } + + @Override + public double[] toDoubleArray() { + double[] res = new double[numElements]; + for (int i = 0; i < numElements; i++) { + res[i] = getDouble(i); + } + return res; + } + + private BytesColumnVector.Bytes getByteArray(int pos) { + return ((BytesColumnVector) data).getBytes(offset + pos); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarMapData.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarMapData.java new file mode 100644 index 0000000000000..bba462f404b35 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarMapData.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.data; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.vector.ColumnVector; + +/** + * Columnar map to support access to vector column data. + * + *
<p>
    Referenced from flink 1.14.0 {@code org.apache.flink.table.data.ColumnarMapData}. + */ +public final class ColumnarMapData implements MapData { + + private final ColumnVector keyColumnVector; + private final ColumnVector valueColumnVector; + private final int offset; + private final int numElements; + + public ColumnarMapData( + ColumnVector keyColumnVector, + ColumnVector valueColumnVector, + int offset, + int numElements) { + this.keyColumnVector = keyColumnVector; + this.valueColumnVector = valueColumnVector; + this.offset = offset; + this.numElements = numElements; + } + + @Override + public int size() { + return numElements; + } + + @Override + public ArrayData keyArray() { + return new ColumnarArrayData(keyColumnVector, offset, numElements); + } + + @Override + public ArrayData valueArray() { + return new ColumnarArrayData(valueColumnVector, offset, numElements); + } + + @Override + public boolean equals(Object o) { + throw new UnsupportedOperationException( + "ColumnarMapData do not support equals, please compare fields one by one!"); + } + + @Override + public int hashCode() { + throw new UnsupportedOperationException( + "ColumnarMapData do not support hashCode, please hash fields one by one!"); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarRowData.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarRowData.java new file mode 100644 index 0000000000000..9a95035b27038 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/ColumnarRowData.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.data; + +import org.apache.hudi.table.data.vector.VectorizedColumnBatch; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.binary.TypedSetters; +import org.apache.flink.table.data.vector.BytesColumnVector.Bytes; +import org.apache.flink.types.RowKind; + +/** + * Columnar row to support access to vector column data. + * It is a row view in {@link VectorizedColumnBatch}. + * + *
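<p>For illustration, a hypothetical access pattern, assuming {@code batch} is an already populated
+ * {@link VectorizedColumnBatch}:
+ * <pre>{@code
+ * ColumnarRowData row = new ColumnarRowData(batch);
+ * row.setRowId(5);             // point the reusable view at row 5
+ * long value = row.getLong(0); // read column 0 of that row
+ * }</pre>
+ * The object is only a view over the batch; no row data is copied.
+ *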
<p>
    References {@code org.apache.flink.table.data.ColumnarRowData} to include FLINK-15390. + */ +public final class ColumnarRowData implements RowData, TypedSetters { + + private RowKind rowKind = RowKind.INSERT; + private VectorizedColumnBatch vectorizedColumnBatch; + private int rowId; + + public ColumnarRowData() { + } + + public ColumnarRowData(VectorizedColumnBatch vectorizedColumnBatch) { + this(vectorizedColumnBatch, 0); + } + + public ColumnarRowData(VectorizedColumnBatch vectorizedColumnBatch, int rowId) { + this.vectorizedColumnBatch = vectorizedColumnBatch; + this.rowId = rowId; + } + + public void setVectorizedColumnBatch(VectorizedColumnBatch vectorizedColumnBatch) { + this.vectorizedColumnBatch = vectorizedColumnBatch; + this.rowId = 0; + } + + public void setRowId(int rowId) { + this.rowId = rowId; + } + + @Override + public RowKind getRowKind() { + return rowKind; + } + + @Override + public void setRowKind(RowKind kind) { + this.rowKind = kind; + } + + @Override + public int getArity() { + return vectorizedColumnBatch.getArity(); + } + + @Override + public boolean isNullAt(int pos) { + return vectorizedColumnBatch.isNullAt(rowId, pos); + } + + @Override + public boolean getBoolean(int pos) { + return vectorizedColumnBatch.getBoolean(rowId, pos); + } + + @Override + public byte getByte(int pos) { + return vectorizedColumnBatch.getByte(rowId, pos); + } + + @Override + public short getShort(int pos) { + return vectorizedColumnBatch.getShort(rowId, pos); + } + + @Override + public int getInt(int pos) { + return vectorizedColumnBatch.getInt(rowId, pos); + } + + @Override + public long getLong(int pos) { + return vectorizedColumnBatch.getLong(rowId, pos); + } + + @Override + public float getFloat(int pos) { + return vectorizedColumnBatch.getFloat(rowId, pos); + } + + @Override + public double getDouble(int pos) { + return vectorizedColumnBatch.getDouble(rowId, pos); + } + + @Override + public StringData getString(int pos) { + Bytes byteArray = vectorizedColumnBatch.getByteArray(rowId, pos); + return StringData.fromBytes(byteArray.data, byteArray.offset, byteArray.len); + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return vectorizedColumnBatch.getDecimal(rowId, pos, precision, scale); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return vectorizedColumnBatch.getTimestamp(rowId, pos, precision); + } + + @Override + public RawValueData getRawValue(int pos) { + throw new UnsupportedOperationException("RawValueData is not supported."); + } + + @Override + public byte[] getBinary(int pos) { + Bytes byteArray = vectorizedColumnBatch.getByteArray(rowId, pos); + if (byteArray.len == byteArray.data.length) { + return byteArray.data; + } else { + byte[] ret = new byte[byteArray.len]; + System.arraycopy(byteArray.data, byteArray.offset, ret, 0, byteArray.len); + return ret; + } + } + + @Override + public RowData getRow(int pos, int numFields) { + return vectorizedColumnBatch.getRow(rowId, pos); + } + + @Override + public ArrayData getArray(int pos) { + return vectorizedColumnBatch.getArray(rowId, pos); + } + + @Override + public MapData getMap(int pos) { + return vectorizedColumnBatch.getMap(rowId, pos); + } + + @Override + public void setNullAt(int pos) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setBoolean(int pos, boolean value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setByte(int 
pos, byte value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setShort(int pos, short value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setInt(int pos, int value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setLong(int pos, long value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setFloat(int pos, float value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setDouble(int pos, double value) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setDecimal(int pos, DecimalData value, int precision) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public void setTimestamp(int pos, TimestampData value, int precision) { + throw new UnsupportedOperationException("Not support the operation!"); + } + + @Override + public boolean equals(Object o) { + throw new UnsupportedOperationException( + "ColumnarRowData do not support equals, please compare fields one by one!"); + } + + @Override + public int hashCode() { + throw new UnsupportedOperationException( + "ColumnarRowData do not support hashCode, please hash fields one by one!"); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/MapColumnVector.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/MapColumnVector.java new file mode 100644 index 0000000000000..6bdf8782f4d3e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/MapColumnVector.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.data.vector; + +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.vector.ColumnVector; + +/** + * Map column vector. + */ +public interface MapColumnVector extends ColumnVector { + MapData getMap(int i); +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/RowColumnVector.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/RowColumnVector.java new file mode 100644 index 0000000000000..bd0e9bbe7de72 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/RowColumnVector.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.data.vector; + +import org.apache.hudi.table.data.ColumnarRowData; + +import org.apache.flink.table.data.vector.ColumnVector; + +/** + * Row column vector. + */ +public interface RowColumnVector extends ColumnVector { + ColumnarRowData getRow(int i); +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/VectorizedColumnBatch.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/VectorizedColumnBatch.java new file mode 100644 index 0000000000000..bccaec8fdcadf --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/data/vector/VectorizedColumnBatch.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.data.vector; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.vector.ArrayColumnVector; +import org.apache.flink.table.data.vector.BooleanColumnVector; +import org.apache.flink.table.data.vector.ByteColumnVector; +import org.apache.flink.table.data.vector.BytesColumnVector; +import org.apache.flink.table.data.vector.BytesColumnVector.Bytes; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.DecimalColumnVector; +import org.apache.flink.table.data.vector.DoubleColumnVector; +import org.apache.flink.table.data.vector.FloatColumnVector; +import org.apache.flink.table.data.vector.IntColumnVector; +import org.apache.flink.table.data.vector.LongColumnVector; +import org.apache.flink.table.data.vector.ShortColumnVector; +import org.apache.flink.table.data.vector.TimestampColumnVector; + +import java.io.Serializable; +import java.nio.charset.StandardCharsets; + +/** + * A VectorizedColumnBatch is a set of rows, organized with each column as a vector. 
It is the unit + * of query execution, organized to minimize the cost per row. + * + *
<p>
    {@code VectorizedColumnBatch}s are influenced by Apache Hive VectorizedRowBatch. + * + *
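<p>For illustration, a hypothetical batch assembled from two populated heap vectors, {@code idVector}
+ * (an int vector) and {@code nameVector} (a bytes vector):
+ * <pre>{@code
+ * VectorizedColumnBatch batch = new VectorizedColumnBatch(new ColumnVector[] {idVector, nameVector});
+ * batch.setNumRows(1024);
+ * int id = batch.getInt(0, 0);         // column 0 of row 0
+ * String name = batch.getString(0, 1); // column 1 of row 0
+ * }</pre>
+ *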
<p>
    References {@code org.apache.flink.table.data.vector.VectorizedColumnBatch} to include FLINK-15390. + */ +public class VectorizedColumnBatch implements Serializable { + private static final long serialVersionUID = 8180323238728166155L; + + /** + * This number is carefully chosen to minimize overhead and typically allows one + * VectorizedColumnBatch to fit in cache. + */ + public static final int DEFAULT_SIZE = 2048; + + private int numRows; + public final ColumnVector[] columns; + + public VectorizedColumnBatch(ColumnVector[] vectors) { + this.columns = vectors; + } + + public void setNumRows(int numRows) { + this.numRows = numRows; + } + + public int getNumRows() { + return numRows; + } + + public int getArity() { + return columns.length; + } + + public boolean isNullAt(int rowId, int colId) { + return columns[colId].isNullAt(rowId); + } + + public boolean getBoolean(int rowId, int colId) { + return ((BooleanColumnVector) columns[colId]).getBoolean(rowId); + } + + public byte getByte(int rowId, int colId) { + return ((ByteColumnVector) columns[colId]).getByte(rowId); + } + + public short getShort(int rowId, int colId) { + return ((ShortColumnVector) columns[colId]).getShort(rowId); + } + + public int getInt(int rowId, int colId) { + return ((IntColumnVector) columns[colId]).getInt(rowId); + } + + public long getLong(int rowId, int colId) { + return ((LongColumnVector) columns[colId]).getLong(rowId); + } + + public float getFloat(int rowId, int colId) { + return ((FloatColumnVector) columns[colId]).getFloat(rowId); + } + + public double getDouble(int rowId, int colId) { + return ((DoubleColumnVector) columns[colId]).getDouble(rowId); + } + + public Bytes getByteArray(int rowId, int colId) { + return ((BytesColumnVector) columns[colId]).getBytes(rowId); + } + + private byte[] getBytes(int rowId, int colId) { + Bytes byteArray = getByteArray(rowId, colId); + if (byteArray.len == byteArray.data.length) { + return byteArray.data; + } else { + return byteArray.getBytes(); + } + } + + public String getString(int rowId, int colId) { + Bytes byteArray = getByteArray(rowId, colId); + return new String(byteArray.data, byteArray.offset, byteArray.len, StandardCharsets.UTF_8); + } + + public DecimalData getDecimal(int rowId, int colId, int precision, int scale) { + return ((DecimalColumnVector) (columns[colId])).getDecimal(rowId, precision, scale); + } + + public TimestampData getTimestamp(int rowId, int colId, int precision) { + return ((TimestampColumnVector) (columns[colId])).getTimestamp(rowId, precision); + } + + public ArrayData getArray(int rowId, int colId) { + return ((ArrayColumnVector) columns[colId]).getArray(rowId); + } + + public RowData getRow(int rowId, int colId) { + return ((RowColumnVector) columns[colId]).getRow(rowId); + } + + public MapData getMap(int rowId, int colId) { + return ((MapColumnVector) columns[colId]).getMap(rowId); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java new file mode 100644 index 0000000000000..75cf3272d6611 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java @@ -0,0 +1,527 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow; + +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.table.data.vector.VectorizedColumnBatch; +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector; +import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector; +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; +import org.apache.hudi.table.format.cow.vector.reader.ArrayColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.FixedLenBytesColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.Int64TimestampColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.MapColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.ParquetColumnarRowSplitReader; +import org.apache.hudi.table.format.cow.vector.reader.RowColumnReader; + +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.vector.reader.BooleanColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ByteColumnReader; +import org.apache.flink.formats.parquet.vector.reader.BytesColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.formats.parquet.vector.reader.DoubleColumnReader; +import org.apache.flink.formats.parquet.vector.reader.FloatColumnReader; +import org.apache.flink.formats.parquet.vector.reader.IntColumnReader; +import org.apache.flink.formats.parquet.vector.reader.LongColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ShortColumnReader; +import org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.heap.HeapBooleanVector; +import org.apache.flink.table.data.vector.heap.HeapByteVector; +import org.apache.flink.table.data.vector.heap.HeapBytesVector; +import org.apache.flink.table.data.vector.heap.HeapDoubleVector; +import org.apache.flink.table.data.vector.heap.HeapFloatVector; +import org.apache.flink.table.data.vector.heap.HeapIntVector; +import org.apache.flink.table.data.vector.heap.HeapLongVector; +import org.apache.flink.table.data.vector.heap.HeapShortVector; +import org.apache.flink.table.data.vector.heap.HeapTimestampVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; 
+import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.util.Preconditions; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.ParquetRuntimeException; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.InvalidSchemaException; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; +import java.sql.Date; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.flink.table.runtime.functions.SqlDateTimeUtils.dateToInternal; +import static org.apache.parquet.Preconditions.checkArgument; + +/** + * Util for generating {@link ParquetColumnarRowSplitReader}. + * + *
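<p>For illustration, a hypothetical call shape (all non-literal arguments are assumed to be prepared by
+ * the caller); partition columns are not read from the file but filled from {@code partitionSpec} as
+ * constant vectors:
+ * <pre>{@code
+ * ParquetColumnarRowSplitReader reader = ParquetSplitReaderUtil.genPartColumnarRowReader(
+ *     true,            // utcTimestamp
+ *     true,            // caseSensitive
+ *     hadoopConf,      // Hadoop Configuration
+ *     fieldNames,      // full schema field names
+ *     fieldTypes,      // full schema DataTypes
+ *     partitionSpec,   // partition column -> constant value
+ *     selectedFields,  // projected field indexes
+ *     2048,            // batch size
+ *     path, splitStart, splitLength);
+ * }</pre>
+ *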
<p>
    NOTE: reference from Flink release 1.11.2 {@code ParquetSplitReaderUtil}, modify to support INT64 + * based TIMESTAMP_MILLIS as ConvertedType, should remove when Flink supports that. + */ +public class ParquetSplitReaderUtil { + + /** + * Util for generating partitioned {@link ParquetColumnarRowSplitReader}. + */ + public static ParquetColumnarRowSplitReader genPartColumnarRowReader( + boolean utcTimestamp, + boolean caseSensitive, + Configuration conf, + String[] fullFieldNames, + DataType[] fullFieldTypes, + Map partitionSpec, + int[] selectedFields, + int batchSize, + Path path, + long splitStart, + long splitLength) throws IOException { + List selNonPartNames = Arrays.stream(selectedFields) + .mapToObj(i -> fullFieldNames[i]) + .filter(n -> !partitionSpec.containsKey(n)) + .collect(Collectors.toList()); + + int[] selParquetFields = Arrays.stream(selectedFields) + .filter(i -> !partitionSpec.containsKey(fullFieldNames[i])) + .toArray(); + + ParquetColumnarRowSplitReader.ColumnBatchGenerator gen = readVectors -> { + // create and initialize the row batch + ColumnVector[] vectors = new ColumnVector[selectedFields.length]; + for (int i = 0; i < vectors.length; i++) { + String name = fullFieldNames[selectedFields[i]]; + LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType(); + vectors[i] = createVector(readVectors, selNonPartNames, name, type, partitionSpec, batchSize); + } + return new VectorizedColumnBatch(vectors); + }; + + return new ParquetColumnarRowSplitReader( + utcTimestamp, + caseSensitive, + conf, + Arrays.stream(selParquetFields) + .mapToObj(i -> fullFieldTypes[i].getLogicalType()) + .toArray(LogicalType[]::new), + selNonPartNames.toArray(new String[0]), + gen, + batchSize, + new org.apache.hadoop.fs.Path(path.toUri()), + splitStart, + splitLength); + } + + private static ColumnVector createVector( + ColumnVector[] readVectors, + List selNonPartNames, + String name, + LogicalType type, + Map partitionSpec, + int batchSize) { + if (partitionSpec.containsKey(name)) { + return createVectorFromConstant(type, partitionSpec.get(name), batchSize); + } + ColumnVector readVector = readVectors[selNonPartNames.indexOf(name)]; + if (readVector == null) { + // when the read vector is null, use a constant null vector instead + readVector = createVectorFromConstant(type, null, batchSize); + } + return readVector; + } + + private static ColumnVector createVectorFromConstant( + LogicalType type, + Object value, + int batchSize) { + switch (type.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + HeapBytesVector bsv = new HeapBytesVector(batchSize); + if (value == null) { + bsv.fillWithNulls(); + } else { + bsv.fill(value instanceof byte[] + ? 
(byte[]) value + : value.toString().getBytes(StandardCharsets.UTF_8)); + } + return bsv; + case BOOLEAN: + HeapBooleanVector bv = new HeapBooleanVector(batchSize); + if (value == null) { + bv.fillWithNulls(); + } else { + bv.fill((boolean) value); + } + return bv; + case TINYINT: + HeapByteVector byteVector = new HeapByteVector(batchSize); + if (value == null) { + byteVector.fillWithNulls(); + } else { + byteVector.fill(((Number) value).byteValue()); + } + return byteVector; + case SMALLINT: + HeapShortVector sv = new HeapShortVector(batchSize); + if (value == null) { + sv.fillWithNulls(); + } else { + sv.fill(((Number) value).shortValue()); + } + return sv; + case INTEGER: + HeapIntVector iv = new HeapIntVector(batchSize); + if (value == null) { + iv.fillWithNulls(); + } else { + iv.fill(((Number) value).intValue()); + } + return iv; + case BIGINT: + HeapLongVector lv = new HeapLongVector(batchSize); + if (value == null) { + lv.fillWithNulls(); + } else { + lv.fill(((Number) value).longValue()); + } + return lv; + case DECIMAL: + DecimalType decimalType = (DecimalType) type; + int precision = decimalType.getPrecision(); + int scale = decimalType.getScale(); + DecimalData decimal = value == null + ? null + : Preconditions.checkNotNull(DecimalData.fromBigDecimal((BigDecimal) value, precision, scale)); + ColumnVector internalVector = createVectorFromConstant( + new VarBinaryType(), + decimal == null ? null : decimal.toUnscaledBytes(), + batchSize); + return new ParquetDecimalVector(internalVector); + case FLOAT: + HeapFloatVector fv = new HeapFloatVector(batchSize); + if (value == null) { + fv.fillWithNulls(); + } else { + fv.fill(((Number) value).floatValue()); + } + return fv; + case DOUBLE: + HeapDoubleVector dv = new HeapDoubleVector(batchSize); + if (value == null) { + dv.fillWithNulls(); + } else { + dv.fill(((Number) value).doubleValue()); + } + return dv; + case DATE: + if (value instanceof LocalDate) { + value = Date.valueOf((LocalDate) value); + } + return createVectorFromConstant( + new IntType(), + value == null ? 
null : dateToInternal((Date) value), + batchSize); + case TIMESTAMP_WITHOUT_TIME_ZONE: + HeapTimestampVector tv = new HeapTimestampVector(batchSize); + if (value == null) { + tv.fillWithNulls(); + } else { + tv.fill(TimestampData.fromLocalDateTime((LocalDateTime) value)); + } + return tv; + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static List filterDescriptors(int depth, Type type, List columns) throws ParquetRuntimeException { + List filtered = new ArrayList<>(); + for (ColumnDescriptor descriptor : columns) { + if (depth >= descriptor.getPath().length) { + throw new InvalidSchemaException("Expect depth " + depth + " for schema: " + descriptor); + } + if (type.getName().equals(descriptor.getPath()[depth])) { + filtered.add(descriptor); + } + } + ValidationUtils.checkState(filtered.size() > 0, "Corrupted Parquet schema"); + return filtered; + } + + public static ColumnReader createColumnReader( + boolean utcTimestamp, + LogicalType fieldType, + Type physicalType, + List descriptors, + PageReadStore pages) throws IOException { + return createColumnReader(utcTimestamp, fieldType, physicalType, descriptors, + pages, 0); + } + + private static ColumnReader createColumnReader( + boolean utcTimestamp, + LogicalType fieldType, + Type physicalType, + List columns, + PageReadStore pages, + int depth) throws IOException { + List descriptors = filterDescriptors(depth, physicalType, columns); + ColumnDescriptor descriptor = descriptors.get(0); + PageReader pageReader = pages.getPageReader(descriptor); + switch (fieldType.getTypeRoot()) { + case BOOLEAN: + return new BooleanColumnReader(descriptor, pageReader); + case TINYINT: + return new ByteColumnReader(descriptor, pageReader); + case DOUBLE: + return new DoubleColumnReader(descriptor, pageReader); + case FLOAT: + return new FloatColumnReader(descriptor, pageReader); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + return new IntColumnReader(descriptor, pageReader); + case BIGINT: + return new LongColumnReader(descriptor, pageReader); + case SMALLINT: + return new ShortColumnReader(descriptor, pageReader); + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return new BytesColumnReader(descriptor, pageReader); + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT64: + int precision = fieldType instanceof TimestampType + ? 
((TimestampType) fieldType).getPrecision() + : ((LocalZonedTimestampType) fieldType).getPrecision(); + return new Int64TimestampColumnReader(utcTimestamp, descriptor, pageReader, precision); + case INT96: + return new TimestampColumnReader(utcTimestamp, descriptor, pageReader); + default: + throw new AssertionError(); + } + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return new IntColumnReader(descriptor, pageReader); + case INT64: + return new LongColumnReader(descriptor, pageReader); + case BINARY: + return new BytesColumnReader(descriptor, pageReader); + case FIXED_LEN_BYTE_ARRAY: + return new FixedLenBytesColumnReader( + descriptor, pageReader, ((DecimalType) fieldType).getPrecision()); + default: + throw new AssertionError(); + } + case ARRAY: + return new ArrayColumnReader( + descriptor, + pageReader, + utcTimestamp, + descriptor.getPrimitiveType(), + fieldType); + case MAP: + MapType mapType = (MapType) fieldType; + ArrayColumnReader keyReader = + new ArrayColumnReader( + descriptor, + pageReader, + utcTimestamp, + descriptor.getPrimitiveType(), + new ArrayType(mapType.getKeyType())); + ArrayColumnReader valueReader = + new ArrayColumnReader( + descriptors.get(1), + pages.getPageReader(descriptors.get(1)), + utcTimestamp, + descriptors.get(1).getPrimitiveType(), + new ArrayType(mapType.getValueType())); + return new MapColumnReader(keyReader, valueReader, fieldType); + case ROW: + RowType rowType = (RowType) fieldType; + GroupType groupType = physicalType.asGroupType(); + List fieldReaders = new ArrayList<>(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + fieldReaders.add( + createColumnReader( + utcTimestamp, + rowType.getTypeAt(i), + groupType.getType(i), + descriptors, + pages, + depth + 1)); + } + return new RowColumnReader(fieldReaders); + default: + throw new UnsupportedOperationException(fieldType + " is not supported now."); + } + } + + public static WritableColumnVector createWritableColumnVector( + int batchSize, + LogicalType fieldType, + Type physicalType, + List descriptors) { + return createWritableColumnVector(batchSize, fieldType, physicalType, descriptors, 0); + } + + private static WritableColumnVector createWritableColumnVector( + int batchSize, + LogicalType fieldType, + Type physicalType, + List columns, + int depth) { + List descriptors = filterDescriptors(depth, physicalType, columns); + PrimitiveType primitiveType = descriptors.get(0).getPrimitiveType(); + PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName(); + switch (fieldType.getTypeRoot()) { + case BOOLEAN: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.BOOLEAN, + "Unexpected type: %s", typeName); + return new HeapBooleanVector(batchSize); + case TINYINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapByteVector(batchSize); + case DOUBLE: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.DOUBLE, + "Unexpected type: %s", typeName); + return new HeapDoubleVector(batchSize); + case FLOAT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.FLOAT, + "Unexpected type: %s", typeName); + return new HeapFloatVector(batchSize); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapIntVector(batchSize); + case BIGINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT64, 
+ "Unexpected type: %s", typeName); + return new HeapLongVector(batchSize); + case SMALLINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapShortVector(batchSize); + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.BINARY, + "Unexpected type: %s", typeName); + return new HeapBytesVector(batchSize); + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + checkArgument(primitiveType.getOriginalType() != OriginalType.TIME_MICROS, + "TIME_MICROS original type is not "); + return new HeapTimestampVector(batchSize); + case DECIMAL: + checkArgument( + (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY + || typeName == PrimitiveType.PrimitiveTypeName.BINARY) + && primitiveType.getOriginalType() == OriginalType.DECIMAL, + "Unexpected type: %s", typeName); + return new HeapBytesVector(batchSize); + case ARRAY: + ArrayType arrayType = (ArrayType) fieldType; + return new HeapArrayVector( + batchSize, + createWritableColumnVector( + batchSize, + arrayType.getElementType(), + physicalType, + descriptors, + depth)); + case MAP: + MapType mapType = (MapType) fieldType; + GroupType repeatedType = physicalType.asGroupType().getType(0).asGroupType(); + // the map column has three level paths. + return new HeapMapColumnVector( + batchSize, + createWritableColumnVector( + batchSize, + mapType.getKeyType(), + repeatedType.getType(0), + descriptors, + depth + 2), + createWritableColumnVector( + batchSize, + mapType.getValueType(), + repeatedType.getType(1), + descriptors, + depth + 2)); + case ROW: + RowType rowType = (RowType) fieldType; + GroupType groupType = physicalType.asGroupType(); + WritableColumnVector[] columnVectors = + new WritableColumnVector[rowType.getFieldCount()]; + for (int i = 0; i < columnVectors.length; i++) { + columnVectors[i] = + createWritableColumnVector( + batchSize, + rowType.getTypeAt(i), + groupType.getType(i), + descriptors, + depth + 1); + } + return new HeapRowColumnVector(batchSize, columnVectors); + default: + throw new UnsupportedOperationException(fieldType + " is not supported now."); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java new file mode 100644 index 0000000000000..6d31d26b8d978 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.hudi.table.data.ColumnarArrayData; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.vector.ArrayColumnVector; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap array column vector. + */ +public class HeapArrayVector extends AbstractHeapVector + implements WritableColumnVector, ArrayColumnVector { + + public long[] offsets; + public long[] lengths; + public ColumnVector child; + private int size; + + public HeapArrayVector(int len) { + super(len); + offsets = new long[len]; + lengths = new long[len]; + } + + public HeapArrayVector(int len, ColumnVector vector) { + super(len); + offsets = new long[len]; + lengths = new long[len]; + this.child = vector; + } + + public int getSize() { + return size; + } + + public void setSize(int size) { + this.size = size; + } + + public int getLen() { + return this.isNull.length; + } + + @Override + public ArrayData getArray(int i) { + long offset = offsets[i]; + long length = lengths[i]; + return new ColumnarArrayData(child, (int) offset, (int) length); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java new file mode 100644 index 0000000000000..cf39fc981624a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.hudi.table.data.ColumnarMapData; +import org.apache.hudi.table.data.vector.MapColumnVector; + +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap map column vector. 
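+ *
+ * <p>For row {@code i}, {@code offsets[i]} and {@code lengths[i]} select the slice of the shared key
+ * and value child vectors that backs that row's map; {@link #getMap(int)} wraps that slice in a
+ * {@code ColumnarMapData} view without copying any data.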
+ */ +public class HeapMapColumnVector extends AbstractHeapVector + implements WritableColumnVector, MapColumnVector { + + private long[] offsets; + private long[] lengths; + private int size; + private ColumnVector keys; + private ColumnVector values; + + public HeapMapColumnVector(int len, ColumnVector keys, ColumnVector values) { + super(len); + size = 0; + offsets = new long[len]; + lengths = new long[len]; + this.keys = keys; + this.values = values; + } + + public void setOffsets(long[] offsets) { + this.offsets = offsets; + } + + public void setLengths(long[] lengths) { + this.lengths = lengths; + } + + public void setKeys(ColumnVector keys) { + this.keys = keys; + } + + public void setValues(ColumnVector values) { + this.values = values; + } + + public int getSize() { + return size; + } + + public void setSize(int size) { + this.size = size; + } + + @Override + public MapData getMap(int i) { + long offset = offsets[i]; + long length = lengths[i]; + return new ColumnarMapData(keys, values, (int) offset, (int) length); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java new file mode 100644 index 0000000000000..03da9205d313e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.hudi.table.data.ColumnarRowData; +import org.apache.hudi.table.data.vector.RowColumnVector; +import org.apache.hudi.table.data.vector.VectorizedColumnBatch; + +import org.apache.flink.table.data.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap row column vector. + */ +public class HeapRowColumnVector extends AbstractHeapVector + implements WritableColumnVector, RowColumnVector { + + public WritableColumnVector[] vectors; + + public HeapRowColumnVector(int len, WritableColumnVector... 
vectors) { + super(len); + this.vectors = vectors; + } + + @Override + public ColumnarRowData getRow(int i) { + ColumnarRowData columnarRowData = new ColumnarRowData(new VectorizedColumnBatch(vectors)); + columnarRowData.setRowId(i); + return columnarRowData; + } + + @Override + public void reset() { + super.reset(); + for (WritableColumnVector vector : vectors) { + vector.reset(); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java new file mode 100644 index 0000000000000..a2f6d5b0cd74c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.vector.BytesColumnVector; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.DecimalColumnVector; + +/** + * Parquet write decimal as int32 and int64 and binary, this class wrap the real vector to + * provide {@link DecimalColumnVector} interface. + * + *
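<p>{@link #getDecimal(int, int, int)} expects the wrapped vector to be a {@code BytesColumnVector}
+ * holding the unscaled bytes of each value and rebuilds the decimal with
+ * {@code DecimalData.fromUnscaledBytes}.
+ *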
<p>
    Reference Flink release 1.11.2 {@link org.apache.flink.formats.parquet.vector.ParquetDecimalVector} + * because it is not public. + */ +public class ParquetDecimalVector implements DecimalColumnVector { + + public final ColumnVector vector; + + public ParquetDecimalVector(ColumnVector vector) { + this.vector = vector; + } + + @Override + public DecimalData getDecimal(int i, int precision, int scale) { + return DecimalData.fromUnscaledBytes( + ((BytesColumnVector) vector).getBytes(i).getBytes(), + precision, + scale); + } + + @Override + public boolean isNullAt(int i) { + return vector.isNullAt(i); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java new file mode 100644 index 0000000000000..07416a371715c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.formats.parquet.vector.ParquetDictionary; +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.PrimitiveType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; + +/** + * Abstract {@link ColumnReader}. + * See {@link org.apache.parquet.column.impl.ColumnReaderImpl}, + * part of the code is referred from Apache Spark and Apache Parquet. + * + *
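<p>Each data page is decoded from two streams: a run-length stream of definition levels used for null
+ * tracking, plus either a dictionary-id stream or a plain data stream. Subclasses supply the
+ * type-specific {@code readBatch} and {@code readBatchFromDictionaryIds} implementations.
+ *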
<p>
    Note: Reference Flink release 1.11.2 {@link org.apache.flink.formats.parquet.vector.reader.AbstractColumnReader} + * because some of the package scope methods. + */ +public abstract class AbstractColumnReader + implements ColumnReader { + + private static final Logger LOG = LoggerFactory.getLogger(org.apache.flink.formats.parquet.vector.reader.AbstractColumnReader.class); + + private final PageReader pageReader; + + /** + * The dictionary, if this column has dictionary encoding. + */ + protected final Dictionary dictionary; + + /** + * Maximum definition level for this column. + */ + protected final int maxDefLevel; + + protected final ColumnDescriptor descriptor; + + /** + * Total number of values read. + */ + private long valuesRead; + + /** + * value that indicates the end of the current page. That is, if valuesRead == + * endOfPageValueCount, we are at the end of the page. + */ + private long endOfPageValueCount; + + /** + * If true, the current page is dictionary encoded. + */ + private boolean isCurrentPageDictionaryEncoded; + + /** + * Total values in the current page. + */ + private int pageValueCount; + + /* + * Input streams: + * 1.Run length encoder to encode every data, so we have run length stream to get + * run length information. + * 2.Data maybe is real data, maybe is dictionary ids which need be decode to real + * data from Dictionary. + * + * Run length stream ------> Data stream + * | + * ------> Dictionary ids stream + */ + + /** + * Run length decoder for data and dictionary. + */ + protected RunLengthDecoder runLenDecoder; + + /** + * Data input stream. + */ + ByteBufferInputStream dataInputStream; + + /** + * Dictionary decoder to wrap dictionary ids input stream. + */ + private RunLengthDecoder dictionaryIdsDecoder; + + public AbstractColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader) throws IOException { + this.descriptor = descriptor; + this.pageReader = pageReader; + this.maxDefLevel = descriptor.getMaxDefinitionLevel(); + + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + if (dictionaryPage != null) { + try { + this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage); + this.isCurrentPageDictionaryEncoded = true; + } catch (IOException e) { + throw new IOException("could not decode the dictionary for " + descriptor, e); + } + } else { + this.dictionary = null; + this.isCurrentPageDictionaryEncoded = false; + } + /* + * Total number of values in this column (in this row group). + */ + long totalValueCount = pageReader.getTotalValueCount(); + if (totalValueCount == 0) { + throw new IOException("totalValueCount == 0"); + } + } + + protected void checkTypeName(PrimitiveType.PrimitiveTypeName expectedName) { + PrimitiveType.PrimitiveTypeName actualName = descriptor.getPrimitiveType().getPrimitiveTypeName(); + Preconditions.checkArgument( + actualName == expectedName, + "Expected type name: %s, actual type name: %s", + expectedName, + actualName); + } + + /** + * Reads `total` values from this columnReader into column. + */ + @Override + public final void readToVector(int readNumber, V vector) throws IOException { + int rowId = 0; + WritableIntVector dictionaryIds = null; + if (dictionary != null) { + dictionaryIds = vector.reserveDictionaryIds(readNumber); + } + while (readNumber > 0) { + // Compute the number of values we want to read in this page. 
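+ // Each iteration refills the current page if it is exhausted, then reads
+ // min(readNumber, leftInPage) values, either through dictionary ids (decoded lazily while the
+ // whole vector can stay dictionary-backed) or directly from the plain-encoded data stream.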
+ int leftInPage = (int) (endOfPageValueCount - valuesRead); + if (leftInPage == 0) { + DataPage page = pageReader.readPage(); + if (page instanceof DataPageV1) { + readPageV1((DataPageV1) page); + } else if (page instanceof DataPageV2) { + readPageV2((DataPageV2) page); + } else { + throw new RuntimeException("Unsupported page type: " + page.getClass()); + } + leftInPage = (int) (endOfPageValueCount - valuesRead); + } + int num = Math.min(readNumber, leftInPage); + if (isCurrentPageDictionaryEncoded) { + // Read and decode dictionary ids. + runLenDecoder.readDictionaryIds( + num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder); + + if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) { + // Column vector supports lazy decoding of dictionary values so just set the dictionary. + // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some + // non-dictionary encoded values have already been added). + vector.setDictionary(new ParquetDictionary(dictionary)); + } else { + readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds); + } + } else { + if (vector.hasDictionary() && rowId != 0) { + // This batch already has dictionary encoded values but this new page is not. The batch + // does not support a mix of dictionary and not so we will decode the dictionary. + readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds()); + } + vector.setDictionary(null); + readBatch(rowId, num, vector); + } + + valuesRead += num; + rowId += num; + readNumber -= num; + } + } + + private void readPageV1(DataPageV1 page) throws IOException { + this.pageValueCount = page.getValueCount(); + ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); + + // Initialize the decoders. + if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) { + throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding()); + } + int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); + this.runLenDecoder = new RunLengthDecoder(bitWidth); + try { + BytesInput bytes = page.getBytes(); + ByteBufferInputStream in = bytes.toInputStream(); + rlReader.initFromPage(pageValueCount, in); + this.runLenDecoder.initFromStream(pageValueCount, in); + prepareNewPage(page.getValueEncoding(), in); + } catch (IOException e) { + throw new IOException("could not read page " + page + " in col " + descriptor, e); + } + } + + private void readPageV2(DataPageV2 page) throws IOException { + this.pageValueCount = page.getValueCount(); + + int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); + // do not read the length from the stream. v2 pages handle dividing the page bytes. 
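+ // For v2 pages the definition levels sit in their own section of the page, so the decoder is
+ // initialized directly from page.getDefinitionLevels() rather than from a length-prefixed stream.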
+ this.runLenDecoder = new RunLengthDecoder(bitWidth, false); + this.runLenDecoder.initFromStream( + this.pageValueCount, page.getDefinitionLevels().toInputStream()); + try { + prepareNewPage(page.getDataEncoding(), page.getData().toInputStream()); + } catch (IOException e) { + throw new IOException("could not read page " + page + " in col " + descriptor, e); + } + } + + private void prepareNewPage( + Encoding dataEncoding, + ByteBufferInputStream in) throws IOException { + this.endOfPageValueCount = valuesRead + pageValueCount; + if (dataEncoding.usesDictionary()) { + if (dictionary == null) { + throw new IOException("Could not read page in col " + + descriptor + + " as the dictionary was missing for encoding " + + dataEncoding); + } + @SuppressWarnings("deprecation") + Encoding plainDict = Encoding.PLAIN_DICTIONARY; // var to allow warning suppression + if (dataEncoding != plainDict && dataEncoding != Encoding.RLE_DICTIONARY) { + throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); + } + this.dataInputStream = null; + this.dictionaryIdsDecoder = new RunLengthDecoder(); + try { + this.dictionaryIdsDecoder.initFromStream(pageValueCount, in); + } catch (IOException e) { + throw new IOException("could not read dictionary in col " + descriptor, e); + } + this.isCurrentPageDictionaryEncoded = true; + } else { + if (dataEncoding != Encoding.PLAIN) { + throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); + } + this.dictionaryIdsDecoder = null; + LOG.debug("init from page at offset {} for length {}", in.position(), in.available()); + this.dataInputStream = in.remainingStream(); + this.isCurrentPageDictionaryEncoded = false; + } + + afterReadPage(); + } + + final ByteBuffer readDataBuffer(int length) { + try { + return dataInputStream.slice(length).order(ByteOrder.LITTLE_ENDIAN); + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes", e); + } + } + + /** + * After read a page, we may need some initialization. + */ + protected void afterReadPage() { + } + + /** + * Support lazy dictionary ids decode. See more in {@link ParquetDictionary}. + * If return false, we will decode all the data first. + */ + protected boolean supportLazyDecode() { + return true; + } + + /** + * Read batch from {@link #runLenDecoder} and {@link #dataInputStream}. + */ + protected abstract void readBatch(int rowId, int num, V column); + + /** + * Decode dictionary ids to data. + * From {@link #runLenDecoder} and {@link #dictionaryIdsDecoder}. + */ + protected abstract void readBatchFromDictionaryIds( + int rowId, + int num, + V column, + WritableIntVector dictionaryIds); +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java new file mode 100644 index 0000000000000..67dbb74902605 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.data.vector.VectorizedColumnBatch; +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.vector.heap.HeapBooleanVector; +import org.apache.flink.table.data.vector.heap.HeapByteVector; +import org.apache.flink.table.data.vector.heap.HeapBytesVector; +import org.apache.flink.table.data.vector.heap.HeapDoubleVector; +import org.apache.flink.table.data.vector.heap.HeapFloatVector; +import org.apache.flink.table.data.vector.heap.HeapIntVector; +import org.apache.flink.table.data.vector.heap.HeapLongVector; +import org.apache.flink.table.data.vector.heap.HeapShortVector; +import org.apache.flink.table.data.vector.heap.HeapTimestampVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Array {@link ColumnReader}. + */ +public class ArrayColumnReader extends BaseVectorizedColumnReader { + + // The value read in last time + private Object lastValue; + + // flag to indicate if there is no data in parquet data page + private boolean eof = false; + + // flag to indicate if it's the first time to read parquet data page with this instance + boolean isFirstRow = true; + + public ArrayColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader, + boolean isUtcTimestamp, + Type type, + LogicalType logicalType) + throws IOException { + super(descriptor, pageReader, isUtcTimestamp, type, logicalType); + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + HeapArrayVector lcv = (HeapArrayVector) vector; + // before readBatch, initial the size of offsets & lengths as the default value, + // the actual size will be assigned in setChildrenInfo() after reading complete. + lcv.offsets = new long[VectorizedColumnBatch.DEFAULT_SIZE]; + lcv.lengths = new long[VectorizedColumnBatch.DEFAULT_SIZE]; + // Because the length of ListColumnVector.child can't be known now, + // the valueList will save all data for ListColumnVector temporary. 
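As a concrete illustration of the layout being built here (values are illustrative, not taken from the patch), two array rows flatten into one child array plus per-row offsets and lengths, which is what the offsets/lengths arrays and the temporary valueList eventually become:

    // Illustration: [[0, 2, 3], [7, 8]] in the list-vector layout assembled above.
    int[]  child   = {0, 2, 3, 7, 8}; // elements of all rows, back to back
    long[] offsets = {0, 3};          // row i starts at child[offsets[i]]
    long[] lengths = {3, 2};          // row i spans lengths[i] elements
    // e.g. row 1 is child[3..4] -> {7, 8}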
+ List valueList = new ArrayList<>(); + + LogicalType category = ((ArrayType) logicalType).getElementType(); + + // read the first row in parquet data page, this will be only happened once for this + // instance + if (isFirstRow) { + if (!fetchNextValue(category)) { + return; + } + isFirstRow = false; + } + + int index = collectDataFromParquetPage(readNumber, lcv, valueList, category); + + // Convert valueList to array for the ListColumnVector.child + fillColumnVector(category, lcv, valueList, index); + } + + /** + * Reads a single value from parquet page, puts it into lastValue. Returns a boolean indicating + * if there is more values to read (true). + * + * @param category + * @return boolean + * @throws IOException + */ + private boolean fetchNextValue(LogicalType category) throws IOException { + int left = readPageIfNeed(); + if (left > 0) { + // get the values of repetition and definitionLevel + readRepetitionAndDefinitionLevels(); + // read the data if it isn't null + if (definitionLevel == maxDefLevel) { + if (isCurrentPageDictionaryEncoded) { + lastValue = dataColumn.readValueDictionaryId(); + } else { + lastValue = readPrimitiveTypedRow(category); + } + } else { + lastValue = null; + } + return true; + } else { + eof = true; + return false; + } + } + + private int readPageIfNeed() throws IOException { + // Compute the number of values we want to read in this page. + int leftInPage = (int) (endOfPageValueCount - valuesRead); + if (leftInPage == 0) { + // no data left in current page, load data from new page + readPage(); + leftInPage = (int) (endOfPageValueCount - valuesRead); + } + return leftInPage; + } + + // Need to be in consistent with that VectorizedPrimitiveColumnReader#readBatchHelper + // TODO Reduce the duplicated code + private Object readPrimitiveTypedRow(LogicalType category) { + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return dataColumn.readString(); + case BOOLEAN: + return dataColumn.readBoolean(); + case TIME_WITHOUT_TIME_ZONE: + case DATE: + case INTEGER: + return dataColumn.readInteger(); + case TINYINT: + return dataColumn.readTinyInt(); + case SMALLINT: + return dataColumn.readSmallInt(); + case BIGINT: + return dataColumn.readLong(); + case FLOAT: + return dataColumn.readFloat(); + case DOUBLE: + return dataColumn.readDouble(); + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return dataColumn.readInteger(); + case INT64: + return dataColumn.readLong(); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return dataColumn.readString(); + default: + throw new AssertionError(); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return dataColumn.readTimestamp(); + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } + + private Object dictionaryDecodeValue(LogicalType category, Integer dictionaryValue) { + if (dictionaryValue == null) { + return null; + } + + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return dictionary.readString(dictionaryValue); + case DATE: + case TIME_WITHOUT_TIME_ZONE: + case INTEGER: + return dictionary.readInteger(dictionaryValue); + case BOOLEAN: + return dictionary.readBoolean(dictionaryValue) ? 
1 : 0; + case DOUBLE: + return dictionary.readDouble(dictionaryValue); + case FLOAT: + return dictionary.readFloat(dictionaryValue); + case TINYINT: + return dictionary.readTinyInt(dictionaryValue); + case SMALLINT: + return dictionary.readSmallInt(dictionaryValue); + case BIGINT: + return dictionary.readLong(dictionaryValue); + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return dictionary.readInteger(dictionaryValue); + case INT64: + return dictionary.readLong(dictionaryValue); + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return dictionary.readString(dictionaryValue); + default: + throw new AssertionError(); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return dictionary.readTimestamp(dictionaryValue); + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } + + /** + * Collects data from a parquet page and returns the final row index where it stopped. The + * returned index can be equal to or less than total. + * + * @param total maximum number of rows to collect + * @param lcv column vector to do initial setup in data collection time + * @param valueList collection of values that will be fed into the vector later + * @param category + * @return int + * @throws IOException + */ + private int collectDataFromParquetPage( + int total, HeapArrayVector lcv, List valueList, LogicalType category) + throws IOException { + int index = 0; + /* + * Here is a nested loop for collecting all values from a parquet page. + * A column of array type can be considered as a list of lists, so the two loops are as below: + * 1. The outer loop iterates on rows (index is a row index, so points to a row in the batch), e.g.: + * [0, 2, 3] <- index: 0 + * [NULL, 3, 4] <- index: 1 + * + * 2. The inner loop iterates on values within a row (sets all data from parquet data page + * for an element in ListColumnVector), so fetchNextValue returns values one-by-one: + * 0, 2, 3, NULL, 3, 4 + * + * As described below, the repetition level (repetitionLevel != 0) + * can be used to decide when we'll start to read values for the next list. + */ + while (!eof && index < total) { + // add element to ListColumnVector one by one + lcv.offsets[index] = valueList.size(); + /* + * Let's collect all values for a single list. + * Repetition level = 0 means that a new list started there in the parquet page, + * in that case, let's exit from the loop, and start to collect value for a new list. + */ + do { + /* + * Definition level = 0 when a NULL value was returned instead of a list + * (this is not the same as a NULL value in of a list). + */ + if (definitionLevel == 0) { + lcv.setNullAt(index); + } + valueList.add( + isCurrentPageDictionaryEncoded + ? dictionaryDecodeValue(category, (Integer) lastValue) + : lastValue); + } while (fetchNextValue(category) && (repetitionLevel != 0)); + + lcv.lengths[index] = valueList.size() - lcv.offsets[index]; + index++; + } + return index; + } + + /** + * The lengths & offsets will be initialized as default size (1024), it should be set to the + * actual size according to the element number. 
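A simplified sketch of the grouping rule that collectDataFromParquetPage above relies on: repetition level 0 starts a new row, any other repetition level continues the current one. Exact level values depend on the schema, so this is a conceptual model with hypothetical inputs, not the Parquet API:

    // Sketch (assumes java.util imports): grouping a flat (repetitionLevel, value)
    // stream into rows, mirroring the outer/inner loops above.
    static List<List<Integer>> groupByRepetitionLevel(int[] repLevels, int[] values) {
        List<List<Integer>> rows = new ArrayList<>();
        for (int i = 0; i < values.length; i++) {
            if (repLevels[i] == 0) {
                rows.add(new ArrayList<>()); // repetition level 0 opens a new row
            }
            rows.get(rows.size() - 1).add(values[i]); // otherwise extend the current row
        }
        return rows;
    }
    // repLevels = {0, 1, 1, 0, 1}, values = {0, 2, 3, 7, 8} -> [[0, 2, 3], [7, 8]]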
+ */ + private void setChildrenInfo(HeapArrayVector lcv, int itemNum, int elementNum) { + lcv.setSize(itemNum); + long[] lcvLength = new long[elementNum]; + long[] lcvOffset = new long[elementNum]; + System.arraycopy(lcv.lengths, 0, lcvLength, 0, elementNum); + System.arraycopy(lcv.offsets, 0, lcvOffset, 0, elementNum); + lcv.lengths = lcvLength; + lcv.offsets = lcvOffset; + } + + private void fillColumnVector( + LogicalType category, HeapArrayVector lcv, List valueList, int elementNum) { + int total = valueList.size(); + setChildrenInfo(lcv, total, elementNum); + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + lcv.child = new HeapBytesVector(total); + ((HeapBytesVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + byte[] src = ((List) valueList).get(i); + if (src == null) { + ((HeapBytesVector) lcv.child).setNullAt(i); + } else { + ((HeapBytesVector) lcv.child).appendBytes(i, src, 0, src.length); + } + } + break; + case BOOLEAN: + lcv.child = new HeapBooleanVector(total); + ((HeapBooleanVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapBooleanVector) lcv.child).setNullAt(i); + } else { + ((HeapBooleanVector) lcv.child).vector[i] = + ((List) valueList).get(i); + } + } + break; + case TINYINT: + lcv.child = new HeapByteVector(total); + ((HeapByteVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapByteVector) lcv.child).setNullAt(i); + } else { + ((HeapByteVector) lcv.child).vector[i] = + (byte) ((List) valueList).get(i).intValue(); + } + } + break; + case SMALLINT: + lcv.child = new HeapShortVector(total); + ((HeapShortVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapShortVector) lcv.child).setNullAt(i); + } else { + ((HeapShortVector) lcv.child).vector[i] = + (short) ((List) valueList).get(i).intValue(); + } + } + break; + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + lcv.child = new HeapIntVector(total); + ((HeapIntVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapIntVector) lcv.child).setNullAt(i); + } else { + ((HeapIntVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case FLOAT: + lcv.child = new HeapFloatVector(total); + ((HeapFloatVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapFloatVector) lcv.child).setNullAt(i); + } else { + ((HeapFloatVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case BIGINT: + lcv.child = new HeapLongVector(total); + ((HeapLongVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapLongVector) lcv.child).setNullAt(i); + } else { + ((HeapLongVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case DOUBLE: + lcv.child = new HeapDoubleVector(total); + ((HeapDoubleVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapDoubleVector) lcv.child).setNullAt(i); + } else { + ((HeapDoubleVector) lcv.child).vector[i] = + ((List) valueList).get(i); + } + } + break; + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + lcv.child = new HeapTimestampVector(total); + ((HeapTimestampVector) lcv.child).reset(); + for (int i = 0; i < 
valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapTimestampVector) lcv.child).setNullAt(i); + } else { + ((HeapTimestampVector) lcv.child) + .setTimestamp(i, ((List) valueList).get(i)); + } + } + break; + case DECIMAL: + PrimitiveType.PrimitiveTypeName primitiveTypeName = + descriptor.getPrimitiveType().getPrimitiveTypeName(); + switch (primitiveTypeName) { + case INT32: + lcv.child = new ParquetDecimalVector(new HeapIntVector(total)); + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector) + .vector[i] = + ((List) valueList).get(i); + } + } + break; + case INT64: + lcv.child = new ParquetDecimalVector(new HeapLongVector(total)); + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector) + .vector[i] = + ((List) valueList).get(i); + } + } + break; + default: + lcv.child = new ParquetDecimalVector(new HeapBytesVector(total)); + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + byte[] src = ((List) valueList).get(i); + if (valueList.get(i) == null) { + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector) + .appendBytes(i, src, 0, src.length); + } + } + break; + } + break; + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java new file mode 100644 index 0000000000000..073c704c4b24f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import static org.apache.parquet.column.ValuesType.DEFINITION_LEVEL; +import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; +import static org.apache.parquet.column.ValuesType.VALUES; + +/** + * Abstract {@link ColumnReader}. part of the code is referred from Apache Hive and Apache Parquet. + */ +public abstract class BaseVectorizedColumnReader implements ColumnReader { + + private static final Logger LOG = LoggerFactory.getLogger(BaseVectorizedColumnReader.class); + + protected boolean isUtcTimestamp; + + /** + * Total number of values read. + */ + protected long valuesRead; + + /** + * value that indicates the end of the current page. That is, if valuesRead == + * endOfPageValueCount, we are at the end of the page. + */ + protected long endOfPageValueCount; + + /** + * The dictionary, if this column has dictionary encoding. + */ + protected final ParquetDataColumnReader dictionary; + + /** + * If true, the current page is dictionary encoded. + */ + protected boolean isCurrentPageDictionaryEncoded; + + /** + * Maximum definition level for this column. + */ + protected final int maxDefLevel; + + protected int definitionLevel; + protected int repetitionLevel; + + /** + * Repetition/Definition/Value readers. + */ + protected IntIterator repetitionLevelColumn; + + protected IntIterator definitionLevelColumn; + protected ParquetDataColumnReader dataColumn; + + /** + * Total values in the current page. 
+ */ + protected int pageValueCount; + + protected final PageReader pageReader; + protected final ColumnDescriptor descriptor; + protected final Type type; + protected final LogicalType logicalType; + + public BaseVectorizedColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader, + boolean isUtcTimestamp, + Type parquetType, + LogicalType logicalType) + throws IOException { + this.descriptor = descriptor; + this.type = parquetType; + this.pageReader = pageReader; + this.maxDefLevel = descriptor.getMaxDefinitionLevel(); + this.isUtcTimestamp = isUtcTimestamp; + this.logicalType = logicalType; + + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + if (dictionaryPage != null) { + try { + this.dictionary = + ParquetDataColumnReaderFactory.getDataColumnReaderByTypeOnDictionary( + parquetType.asPrimitiveType(), + dictionaryPage + .getEncoding() + .initDictionary(descriptor, dictionaryPage), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = true; + } catch (IOException e) { + throw new IOException("could not decode the dictionary for " + descriptor, e); + } + } else { + this.dictionary = null; + this.isCurrentPageDictionaryEncoded = false; + } + } + + protected void readRepetitionAndDefinitionLevels() { + repetitionLevel = repetitionLevelColumn.nextInt(); + definitionLevel = definitionLevelColumn.nextInt(); + valuesRead++; + } + + protected void readPage() throws IOException { + DataPage page = pageReader.readPage(); + + if (page == null) { + return; + } + + page.accept( + new DataPage.Visitor() { + @Override + public Void visit(DataPageV1 dataPageV1) { + readPageV1(dataPageV1); + return null; + } + + @Override + public Void visit(DataPageV2 dataPageV2) { + readPageV2(dataPageV2); + return null; + } + }); + } + + private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) + throws IOException { + this.pageValueCount = valueCount; + this.endOfPageValueCount = valuesRead + pageValueCount; + if (dataEncoding.usesDictionary()) { + this.dataColumn = null; + if (dictionary == null) { + throw new IOException( + "could not read page in col " + + descriptor + + " as the dictionary was missing for encoding " + + dataEncoding); + } + dataColumn = + ParquetDataColumnReaderFactory.getDataColumnReaderByType( + type.asPrimitiveType(), + dataEncoding.getDictionaryBasedValuesReader( + descriptor, VALUES, dictionary.getDictionary()), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = true; + } else { + dataColumn = + ParquetDataColumnReaderFactory.getDataColumnReaderByType( + type.asPrimitiveType(), + dataEncoding.getValuesReader(descriptor, VALUES), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = false; + } + + try { + dataColumn.initFromPage(pageValueCount, in); + } catch (IOException e) { + throw new IOException("could not read page in col " + descriptor, e); + } + } + + private void readPageV1(DataPageV1 page) { + ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); + ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL); + this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); + this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); + try { + BytesInput bytes = page.getBytes(); + LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records"); + ByteBufferInputStream in = bytes.toInputStream(); + LOG.debug("reading repetition levels at " + in.position()); + rlReader.initFromPage(pageValueCount, in); 
+ LOG.debug("reading definition levels at " + in.position()); + dlReader.initFromPage(pageValueCount, in); + LOG.debug("reading data at " + in.position()); + initDataReader(page.getValueEncoding(), in, page.getValueCount()); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read page " + page + " in col " + descriptor, e); + } + } + + private void readPageV2(DataPageV2 page) { + this.pageValueCount = page.getValueCount(); + this.repetitionLevelColumn = + newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels()); + this.definitionLevelColumn = + newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels()); + try { + LOG.debug( + "page data size " + + page.getData().size() + + " bytes and " + + pageValueCount + + " records"); + initDataReader( + page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount()); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read page " + page + " in col " + descriptor, e); + } + } + + private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) { + try { + if (maxLevel == 0) { + return new NullIntIterator(); + } + return new RLEIntIterator( + new RunLengthBitPackingHybridDecoder( + BytesUtils.getWidthFromMaxInt(maxLevel), + new ByteArrayInputStream(bytes.toByteArray()))); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read levels in page for col " + descriptor, e); + } + } + + /** + * Utility classes to abstract over different way to read ints with different encodings. + */ + abstract static class IntIterator { + abstract int nextInt(); + } + + /** + * read ints from {@link ValuesReader}. + */ + protected static final class ValuesReaderIntIterator extends IntIterator { + ValuesReader delegate; + + public ValuesReaderIntIterator(ValuesReader delegate) { + this.delegate = delegate; + } + + @Override + int nextInt() { + return delegate.readInteger(); + } + } + + /** + * read ints from {@link RunLengthBitPackingHybridDecoder}. + */ + protected static final class RLEIntIterator extends IntIterator { + RunLengthBitPackingHybridDecoder delegate; + + public RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) { + this.delegate = delegate; + } + + @Override + int nextInt() { + try { + return delegate.readInt(); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + } + + /** + * return zero. + */ + protected static final class NullIntIterator extends IntIterator { + @Override + int nextInt() { + return 0; + } + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java new file mode 100644 index 0000000000000..61461a728c3b8 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.vector.writable.WritableBytesVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Fixed length bytes {@code ColumnReader}, just for decimal. + * + *

    Note: Reference Flink release 1.13.2 + * {@code org.apache.flink.formats.parquet.vector.reader.FixedLenBytesColumnReader} + * to always write as legacy decimal format. + */ +public class FixedLenBytesColumnReader + extends AbstractColumnReader { + + public FixedLenBytesColumnReader( + ColumnDescriptor descriptor, PageReader pageReader, int precision) throws IOException { + super(descriptor, pageReader); + checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); + } + + @Override + protected void readBatch(int rowId, int num, V column) { + int bytesLen = descriptor.getPrimitiveType().getTypeLength(); + WritableBytesVector bytesVector = (WritableBytesVector) column; + for (int i = 0; i < num; i++) { + if (runLenDecoder.readInteger() == maxDefLevel) { + byte[] bytes = readDataBinary(bytesLen).getBytes(); + bytesVector.appendBytes(rowId + i, bytes, 0, bytes.length); + } else { + bytesVector.setNullAt(rowId + i); + } + } + } + + @Override + protected void readBatchFromDictionaryIds( + int rowId, int num, V column, WritableIntVector dictionaryIds) { + WritableBytesVector bytesVector = (WritableBytesVector) column; + for (int i = rowId; i < rowId + num; ++i) { + if (!bytesVector.isNullAt(i)) { + byte[] v = dictionary.decodeToBinary(dictionaryIds.getInt(i)).getBytes(); + bytesVector.appendBytes(i, v, 0, v.length); + } + } + } + + private Binary readDataBinary(int len) { + ByteBuffer buffer = readDataBuffer(len); + if (buffer.hasArray()) { + return Binary.fromConstantByteArray( + buffer.array(), buffer.arrayOffset() + buffer.position(), len); + } else { + byte[] bytes = new byte[len]; + buffer.get(bytes); + return Binary.fromConstantByteArray(bytes); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java new file mode 100644 index 0000000000000..555853bda6bd8 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.flink.table.data.vector.writable.WritableTimestampVector; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.time.Instant; +import java.time.temporal.ChronoUnit; + +/** + * Timestamp {@link org.apache.flink.formats.parquet.vector.reader.ColumnReader} that supports INT64 8 bytes, + * TIMESTAMP_MILLIS is the deprecated ConvertedType counterpart of a TIMESTAMP logical type + * that is UTC normalized and has MILLIS precision. + * + *
    See https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp + * TIMESTAMP_MILLIS and TIMESTAMP_MICROS are the deprecated ConvertedType. + */ +public class Int64TimestampColumnReader extends AbstractColumnReader { + + private final boolean utcTimestamp; + + private final ChronoUnit chronoUnit; + + public Int64TimestampColumnReader( + boolean utcTimestamp, + ColumnDescriptor descriptor, + PageReader pageReader, + int precision) throws IOException { + super(descriptor, pageReader); + this.utcTimestamp = utcTimestamp; + if (precision <= 3) { + this.chronoUnit = ChronoUnit.MILLIS; + } else if (precision <= 6) { + this.chronoUnit = ChronoUnit.MICROS; + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type with precision: " + + precision + + ", it only supports precision less than 6."); + } + checkTypeName(PrimitiveType.PrimitiveTypeName.INT64); + } + + @Override + protected boolean supportLazyDecode() { + return false; + } + + @Override + protected void readBatch(int rowId, int num, WritableTimestampVector column) { + for (int i = 0; i < num; i++) { + if (runLenDecoder.readInteger() == maxDefLevel) { + ByteBuffer buffer = readDataBuffer(8); + column.setTimestamp(rowId + i, int64ToTimestamp(utcTimestamp, buffer.getLong(), chronoUnit)); + } else { + column.setNullAt(rowId + i); + } + } + } + + @Override + protected void readBatchFromDictionaryIds( + int rowId, + int num, + WritableTimestampVector column, + WritableIntVector dictionaryIds) { + for (int i = rowId; i < rowId + num; ++i) { + if (!column.isNullAt(i)) { + column.setTimestamp(i, decodeInt64ToTimestamp( + utcTimestamp, dictionary, dictionaryIds.getInt(i), chronoUnit)); + } + } + } + + public static TimestampData decodeInt64ToTimestamp( + boolean utcTimestamp, + org.apache.parquet.column.Dictionary dictionary, + int id, + ChronoUnit unit) { + long value = dictionary.decodeToLong(id); + return int64ToTimestamp(utcTimestamp, value, unit); + } + + private static TimestampData int64ToTimestamp( + boolean utcTimestamp, + long interval, + ChronoUnit unit) { + final Instant instant = Instant.EPOCH.plus(interval, unit); + if (utcTimestamp) { + return TimestampData.fromInstant(instant); + } else { + // this applies the local timezone + return TimestampData.fromTimestamp(Timestamp.from(instant)); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java new file mode 100644 index 0000000000000..015a867c4f22d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
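For reference, a small self-contained sketch of the INT64 conversion performed by the Int64TimestampColumnReader above: the raw long is treated as a count of milliseconds (precision <= 3) or microseconds (precision 4-6) since the Unix epoch. Only the UTC path is shown and the method name is illustrative:

    import java.time.Instant;
    import java.time.temporal.ChronoUnit;

    // Sketch of int64ToTimestamp's arithmetic (UTC case only).
    static Instant int64ToInstant(long raw, int precision) {
        ChronoUnit unit = precision <= 3 ? ChronoUnit.MILLIS : ChronoUnit.MICROS;
        return Instant.EPOCH.plus(raw, unit);
    }
    // int64ToInstant(1_500L, 3) -> 1970-01-01T00:00:01.500Z
    // int64ToInstant(1_500L, 6) -> 1970-01-01T00:00:00.001500Z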
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; + +import java.io.IOException; + +/** + * Map {@link ColumnReader}. + */ +public class MapColumnReader implements ColumnReader { + + private final LogicalType logicalType; + private final ArrayColumnReader keyReader; + private final ArrayColumnReader valueReader; + + public MapColumnReader( + ArrayColumnReader keyReader, ArrayColumnReader valueReader, LogicalType logicalType) { + this.keyReader = keyReader; + this.valueReader = valueReader; + this.logicalType = logicalType; + } + + public void readBatch(int total, ColumnVector column) throws IOException { + HeapMapColumnVector mapColumnVector = (HeapMapColumnVector) column; + MapType mapType = (MapType) logicalType; + // initialize 2 ListColumnVector for keys and values + HeapArrayVector keyArrayColumnVector = new HeapArrayVector(total); + HeapArrayVector valueArrayColumnVector = new HeapArrayVector(total); + // read the keys and values + keyReader.readToVector(total, keyArrayColumnVector); + valueReader.readToVector(total, valueArrayColumnVector); + + // set the related attributes according to the keys and values + mapColumnVector.setKeys(keyArrayColumnVector.child); + mapColumnVector.setValues(valueArrayColumnVector.child); + mapColumnVector.setOffsets(keyArrayColumnVector.offsets); + mapColumnVector.setLengths(keyArrayColumnVector.lengths); + mapColumnVector.setSize(keyArrayColumnVector.getSize()); + for (int i = 0; i < keyArrayColumnVector.getLen(); i++) { + if (keyArrayColumnVector.isNullAt(i)) { + mapColumnVector.setNullAt(i); + } + } + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + readBatch(readNumber, vector); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java new file mode 100644 index 0000000000000..e47fb6036210c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
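To visualize what the MapColumnReader above assembles (illustrative values only): keys and values are read as two parallel flattened arrays, and the map vector reuses the offsets and lengths computed for the key array:

    // Illustration: {"a": 1, "b": 2} and {"c": 3} in the map-vector layout.
    String[] keys    = {"a", "b", "c"}; // flattened keys of all map rows
    int[]    values  = {1, 2, 3};       // flattened values, parallel to keys
    long[]   offsets = {0, 2};          // row i starts at offsets[i]
    long[]   lengths = {2, 1};          // row i holds lengths[i] entries
    // row 0 -> ("a", 1), ("b", 2); row 1 -> ("c", 3)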
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.data.ColumnarRowData; +import org.apache.hudi.table.data.vector.VectorizedColumnBatch; +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.stream.IntStream; + +import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createColumnReader; +import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createWritableColumnVector; +import static org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.range; +import static org.apache.parquet.hadoop.ParquetFileReader.readFooter; +import static org.apache.parquet.hadoop.ParquetInputFormat.getFilter; + +/** + * This reader is used to read a {@link VectorizedColumnBatch} from input split. + * + *
    Note: Reference Flink release 1.11.2 + * {@code org.apache.flink.formats.parquet.vector.ParquetColumnarRowSplitReader} + * because it is package scope. + */ +public class ParquetColumnarRowSplitReader implements Closeable { + + private final boolean utcTimestamp; + + private final MessageType fileSchema; + + private final LogicalType[] requestedTypes; + + private final MessageType requestedSchema; + + /** + * The total number of rows this RecordReader will eventually read. The sum of the rows of all + * the row groups. + */ + private final long totalRowCount; + + private final WritableColumnVector[] writableVectors; + + private final VectorizedColumnBatch columnarBatch; + + private final ColumnarRowData row; + + private final int batchSize; + + private ParquetFileReader reader; + + /** + * For each request column, the reader to read this column. This is NULL if this column is + * missing from the file, in which case we populate the attribute with NULL. + */ + private ColumnReader[] columnReaders; + + /** + * The number of rows that have been returned. + */ + private long rowsReturned; + + /** + * The number of rows that have been reading, including the current in flight row group. + */ + private long totalCountLoadedSoFar; + + // the index of the next row to return + private int nextRow; + + // the number of rows in the current batch + private int rowsInBatch; + + public ParquetColumnarRowSplitReader( + boolean utcTimestamp, + boolean caseSensitive, + Configuration conf, + LogicalType[] selectedTypes, + String[] selectedFieldNames, + ColumnBatchGenerator generator, + int batchSize, + Path path, + long splitStart, + long splitLength) throws IOException { + this.utcTimestamp = utcTimestamp; + this.batchSize = batchSize; + // then we need to apply the predicate push down filter + ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength)); + MessageType fileSchema = footer.getFileMetaData().getSchema(); + FilterCompat.Filter filter = getFilter(conf); + List blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); + + this.fileSchema = footer.getFileMetaData().getSchema(); + + Type[] types = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive); + int[] requestedIndices = IntStream.range(0, types.length).filter(i -> types[i] != null).toArray(); + Type[] readTypes = Arrays.stream(requestedIndices).mapToObj(i -> types[i]).toArray(Type[]::new); + + this.requestedTypes = Arrays.stream(requestedIndices).mapToObj(i -> selectedTypes[i]).toArray(LogicalType[]::new); + this.requestedSchema = Types.buildMessage().addFields(readTypes).named("flink-parquet"); + this.reader = new ParquetFileReader( + conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns()); + + long totalRowCount = 0; + for (BlockMetaData block : blocks) { + totalRowCount += block.getRowCount(); + } + this.totalRowCount = totalRowCount; + this.nextRow = 0; + this.rowsInBatch = 0; + this.rowsReturned = 0; + + checkSchema(); + + this.writableVectors = createWritableVectors(); + ColumnVector[] columnVectors = patchedVector(selectedFieldNames.length, createReadableVectors(), requestedIndices); + this.columnarBatch = generator.generate(columnVectors); + this.row = new ColumnarRowData(columnarBatch); + } + + /** + * Patches the given vectors with nulls. + * The vector position that is not requested (or read from file) is patched as null. 
+ * + * @param fields The total selected fields number + * @param vectors The readable vectors + * @param indices The requested indices from the selected fields + */ + private static ColumnVector[] patchedVector(int fields, ColumnVector[] vectors, int[] indices) { + ColumnVector[] patched = new ColumnVector[fields]; + for (int i = 0; i < indices.length; i++) { + patched[indices[i]] = vectors[i]; + } + return patched; + } + + /** + * Clips `parquetSchema` according to `fieldNames`. + */ + private static Type[] clipParquetSchema( + GroupType parquetSchema, String[] fieldNames, boolean caseSensitive) { + Type[] types = new Type[fieldNames.length]; + if (caseSensitive) { + for (int i = 0; i < fieldNames.length; ++i) { + String fieldName = fieldNames[i]; + types[i] = parquetSchema.containsField(fieldName) ? parquetSchema.getType(fieldName) : null; + } + } else { + Map caseInsensitiveFieldMap = new HashMap<>(); + for (Type type : parquetSchema.getFields()) { + caseInsensitiveFieldMap.compute(type.getName().toLowerCase(Locale.ROOT), + (key, previousType) -> { + if (previousType != null) { + throw new FlinkRuntimeException( + "Parquet with case insensitive mode should have no duplicate key: " + key); + } + return type; + }); + } + for (int i = 0; i < fieldNames.length; ++i) { + Type type = caseInsensitiveFieldMap.get(fieldNames[i].toLowerCase(Locale.ROOT)); + // TODO clip for array,map,row types. + types[i] = type; + } + } + + return types; + } + + private WritableColumnVector[] createWritableVectors() { + WritableColumnVector[] columns = new WritableColumnVector[requestedTypes.length]; + List types = requestedSchema.getFields(); + List descriptors = requestedSchema.getColumns(); + for (int i = 0; i < requestedTypes.length; i++) { + columns[i] = createWritableColumnVector( + batchSize, + requestedTypes[i], + types.get(i), + descriptors); + } + return columns; + } + + /** + * Create readable vectors from writable vectors. + * Especially for decimal, see {@link org.apache.flink.formats.parquet.vector.ParquetDecimalVector}. + */ + private ColumnVector[] createReadableVectors() { + ColumnVector[] vectors = new ColumnVector[writableVectors.length]; + for (int i = 0; i < writableVectors.length; i++) { + vectors[i] = requestedTypes[i].getTypeRoot() == LogicalTypeRoot.DECIMAL + ? new ParquetDecimalVector(writableVectors[i]) + : writableVectors[i]; + } + return vectors; + } + + private void checkSchema() throws IOException, UnsupportedOperationException { + /* + * Check that the requested schema is supported. + */ + for (int i = 0; i < requestedSchema.getFieldCount(); ++i) { + String[] colPath = requestedSchema.getPaths().get(i); + if (fileSchema.containsPath(colPath)) { + ColumnDescriptor fd = fileSchema.getColumnDescription(colPath); + if (!fd.equals(requestedSchema.getColumns().get(i))) { + throw new UnsupportedOperationException("Schema evolution not supported."); + } + } else { + if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) { + // Column is missing in data but the required data is non-nullable. This file is invalid. + throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath)); + } + } + } + } + + /** + * Method used to check if the end of the input is reached. + * + * @return True if the end is reached, otherwise false. + * @throws IOException Thrown, if an I/O error occurred. 
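A worked example (hypothetical field names, types, and vectors) of the selected-vs-read bookkeeping used by the constructor and patchedVector above: a selected field missing from the file clips to null, drops out of the requested indices, and its batch position is left as a null vector:

    // Selected fields: {"id", "name", "extra"}; the file only contains "id" and "name".
    Type[] clipped = {idType, nameType, null};          // illustrative clipParquetSchema result
    int[] requestedIndices = IntStream.range(0, clipped.length)
            .filter(i -> clipped[i] != null)
            .toArray();                                  // -> {0, 1}
    // patchedVector(3, readVectors, requestedIndices) then yields
    // {idVector, nameVector, null}, so the missing "extra" column surfaces as NULL.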
+ */ + public boolean reachedEnd() throws IOException { + return !ensureBatch(); + } + + public ColumnarRowData nextRecord() { + // return the next row + row.setRowId(this.nextRow++); + return row; + } + + /** + * Checks if there is at least one row left in the batch to return. If no more row are + * available, it reads another batch of rows. + * + * @return Returns true if there is one more row to return, false otherwise. + * @throws IOException throw if an exception happens while reading a batch. + */ + private boolean ensureBatch() throws IOException { + if (nextRow >= rowsInBatch) { + // No more rows available in the Rows array. + nextRow = 0; + // Try to read the next batch if rows from the file. + return nextBatch(); + } + // there is at least one Row left in the Rows array. + return true; + } + + /** + * Advances to the next batch of rows. Returns false if there are no more. + */ + private boolean nextBatch() throws IOException { + for (WritableColumnVector v : writableVectors) { + v.reset(); + } + columnarBatch.setNumRows(0); + if (rowsReturned >= totalRowCount) { + return false; + } + if (rowsReturned == totalCountLoadedSoFar) { + readNextRowGroup(); + } + + int num = (int) Math.min(batchSize, totalCountLoadedSoFar - rowsReturned); + for (int i = 0; i < columnReaders.length; ++i) { + //noinspection unchecked + columnReaders[i].readToVector(num, writableVectors[i]); + } + rowsReturned += num; + columnarBatch.setNumRows(num); + rowsInBatch = num; + return true; + } + + private void readNextRowGroup() throws IOException { + PageReadStore pages = reader.readNextRowGroup(); + if (pages == null) { + throw new IOException("expecting more rows but reached last block. Read " + + rowsReturned + " out of " + totalRowCount); + } + List types = requestedSchema.getFields(); + List columns = requestedSchema.getColumns(); + columnReaders = new ColumnReader[types.size()]; + for (int i = 0; i < types.size(); ++i) { + columnReaders[i] = createColumnReader( + utcTimestamp, + requestedTypes[i], + types.get(i), + columns, + pages); + } + totalCountLoadedSoFar += pages.getRowCount(); + } + + /** + * Seek to a particular row number. + */ + public void seekToRow(long rowCount) throws IOException { + if (totalCountLoadedSoFar != 0) { + throw new UnsupportedOperationException("Only support seek at first."); + } + + List blockMetaData = reader.getRowGroups(); + + for (BlockMetaData metaData : blockMetaData) { + if (metaData.getRowCount() > rowCount) { + break; + } else { + reader.skipNextRowGroup(); + rowsReturned += metaData.getRowCount(); + totalCountLoadedSoFar += metaData.getRowCount(); + rowsInBatch = (int) metaData.getRowCount(); + nextRow = (int) metaData.getRowCount(); + rowCount -= metaData.getRowCount(); + } + } + for (int i = 0; i < rowCount; i++) { + boolean end = reachedEnd(); + if (end) { + throw new RuntimeException("Seek to many rows."); + } + nextRecord(); + } + } + + @Override + public void close() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + } + + /** + * Interface to gen {@link VectorizedColumnBatch}. 
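A small arithmetic sketch of seekToRow's two phases, under an assumed layout of equally sized row groups (the group size is purely illustrative): whole row groups are skipped first, then the remaining rows are consumed one record at a time:

    long target = 230;             // rows to skip before the first returned record
    final long groupSize = 100;    // assumed rows per row group
    long groupsSkipped = 0;
    while (target >= groupSize) {  // mirrors the skipNextRowGroup() loop
        target -= groupSize;
        groupsSkipped++;
    }
    // groupsSkipped == 2, target == 30 rows still consumed via nextRecord()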
+ */ + public interface ColumnBatchGenerator { + VectorizedColumnBatch generate(ColumnVector[] readVectors); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java new file mode 100644 index 0000000000000..e96cf22d29ef1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.Dictionary; + +import java.io.IOException; + +/** + * The interface to wrap the underlying Parquet dictionary and non dictionary encoded page reader. + */ +public interface ParquetDataColumnReader { + + /** + * Initialize the reader by page data. 
+ * + * @param valueCount value count + * @param in page data + * @throws IOException + */ + void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException; + + /** + * @return the next Dictionary ID from the page + */ + int readValueDictionaryId(); + + /** + * @return the next Long from the page + */ + long readLong(); + + /** + * @return the next Integer from the page + */ + int readInteger(); + + /** + * @return the next SmallInt from the page + */ + int readSmallInt(); + + /** + * @return the next TinyInt from the page + */ + int readTinyInt(); + + /** + * @return the next Float from the page + */ + float readFloat(); + + /** + * @return the next Boolean from the page + */ + boolean readBoolean(); + + /** + * @return the next String from the page + */ + byte[] readString(); + + /** + * @return the next Varchar from the page + */ + byte[] readVarchar(); + + /** + * @return the next Char from the page + */ + byte[] readChar(); + + /** + * @return the next Bytes from the page + */ + byte[] readBytes(); + + /** + * @return the next Decimal from the page + */ + byte[] readDecimal(); + + /** + * @return the next Double from the page + */ + double readDouble(); + + /** + * @return the next TimestampData from the page + */ + TimestampData readTimestamp(); + + /** + * @return is data valid + */ + boolean isValid(); + + /** + * @return the underlying dictionary if current reader is dictionary encoded + */ + Dictionary getDictionary(); + + /** + * @param id in dictionary + * @return the Bytes from the dictionary by id + */ + byte[] readBytes(int id); + + /** + * @param id in dictionary + * @return the Float from the dictionary by id + */ + float readFloat(int id); + + /** + * @param id in dictionary + * @return the Double from the dictionary by id + */ + double readDouble(int id); + + /** + * @param id in dictionary + * @return the Integer from the dictionary by id + */ + int readInteger(int id); + + /** + * @param id in dictionary + * @return the Long from the dictionary by id + */ + long readLong(int id); + + /** + * @param id in dictionary + * @return the Small Int from the dictionary by id + */ + int readSmallInt(int id); + + /** + * @param id in dictionary + * @return the tiny int from the dictionary by id + */ + int readTinyInt(int id); + + /** + * @param id in dictionary + * @return the Boolean from the dictionary by id + */ + boolean readBoolean(int id); + + /** + * @param id in dictionary + * @return the Decimal from the dictionary by id + */ + byte[] readDecimal(int id); + + /** + * @param id in dictionary + * @return the TimestampData from the dictionary by id + */ + TimestampData readTimestamp(int id); + + /** + * @param id in dictionary + * @return the String from the dictionary by id + */ + byte[] readString(int id); + + /** + * @param id in dictionary + * @return the Varchar from the dictionary by id + */ + byte[] readVarchar(int id); + + /** + * @param id in dictionary + * @return the Char from the dictionary by id + */ + byte[] readChar(int id); +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java new file mode 100644 index 0000000000000..861d5cb00bbe7 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java @@ -0,0 +1,304 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.sql.Timestamp; + +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.JULIAN_EPOCH_OFFSET_DAYS; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.MILLIS_IN_DAY; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_MILLISECOND; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_SECOND; + +/** + * Parquet file has self-describing schema which may differ from the user required schema (e.g. + * schema evolution). This factory is used to retrieve user required typed data via corresponding + * reader which reads the underlying data. + */ +public final class ParquetDataColumnReaderFactory { + + private ParquetDataColumnReaderFactory() { + } + + /** + * default reader for {@link ParquetDataColumnReader}. + */ + public static class DefaultParquetDataColumnReader implements ParquetDataColumnReader { + protected ValuesReader valuesReader; + protected Dictionary dict; + + // After the data is read in the parquet type, isValid will be set to true if the data can + // be returned in the type defined in HMS. Otherwise isValid is set to false. 
+ boolean isValid = true; + + public DefaultParquetDataColumnReader(ValuesReader valuesReader) { + this.valuesReader = valuesReader; + } + + public DefaultParquetDataColumnReader(Dictionary dict) { + this.dict = dict; + } + + @Override + public void initFromPage(int i, ByteBufferInputStream in) throws IOException { + valuesReader.initFromPage(i, in); + } + + @Override + public boolean readBoolean() { + return valuesReader.readBoolean(); + } + + @Override + public boolean readBoolean(int id) { + return dict.decodeToBoolean(id); + } + + @Override + public byte[] readString(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readString() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar() { + // we need to enforce the size here even the types are the same + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readChar() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readChar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readBytes() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readBytes(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readDecimal() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readDecimal(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public float readFloat() { + return valuesReader.readFloat(); + } + + @Override + public float readFloat(int id) { + return dict.decodeToFloat(id); + } + + @Override + public double readDouble() { + return valuesReader.readDouble(); + } + + @Override + public double readDouble(int id) { + return dict.decodeToDouble(id); + } + + @Override + public TimestampData readTimestamp() { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public TimestampData readTimestamp(int id) { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public int readInteger() { + return valuesReader.readInteger(); + } + + @Override + public int readInteger(int id) { + return dict.decodeToInt(id); + } + + @Override + public boolean isValid() { + return isValid; + } + + @Override + public long readLong(int id) { + return dict.decodeToLong(id); + } + + @Override + public long readLong() { + return valuesReader.readLong(); + } + + @Override + public int readSmallInt() { + return valuesReader.readInteger(); + } + + @Override + public int readSmallInt(int id) { + return dict.decodeToInt(id); + } + + @Override + public int readTinyInt() { + return valuesReader.readInteger(); + } + + @Override + public int readTinyInt(int id) { + return dict.decodeToInt(id); + } + + @Override + public int readValueDictionaryId() { + return valuesReader.readValueDictionaryId(); + } + + public void skip() { + valuesReader.skip(); + } + + @Override + public Dictionary getDictionary() { + return dict; + } + } + + /** + * The reader who reads from the underlying Timestamp value value. 
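+ * <p>An INT96 value packs 8 bytes of nanos-of-day followed by a 4-byte Julian day in little-endian order; convert() decodes both fields and delegates to int96ToTimestamp() to build the TimestampData.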
+ */ + public static class TypesFromInt96PageReader extends DefaultParquetDataColumnReader { + private final boolean isUtcTimestamp; + + public TypesFromInt96PageReader(ValuesReader realReader, boolean isUtcTimestamp) { + super(realReader); + this.isUtcTimestamp = isUtcTimestamp; + } + + public TypesFromInt96PageReader(Dictionary dict, boolean isUtcTimestamp) { + super(dict); + this.isUtcTimestamp = isUtcTimestamp; + } + + private TimestampData convert(Binary binary) { + ByteBuffer buf = binary.toByteBuffer(); + buf.order(ByteOrder.LITTLE_ENDIAN); + long timeOfDayNanos = buf.getLong(); + int julianDay = buf.getInt(); + return int96ToTimestamp(isUtcTimestamp, timeOfDayNanos, julianDay); + } + + @Override + public TimestampData readTimestamp(int id) { + return convert(dict.decodeToBinary(id)); + } + + @Override + public TimestampData readTimestamp() { + return convert(valuesReader.readBytes()); + } + } + + private static ParquetDataColumnReader getDataColumnReaderByTypeHelper( + boolean isDictionary, + PrimitiveType parquetType, + Dictionary dictionary, + ValuesReader valuesReader, + boolean isUtcTimestamp) { + if (parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) { + return isDictionary + ? new TypesFromInt96PageReader(dictionary, isUtcTimestamp) + : new TypesFromInt96PageReader(valuesReader, isUtcTimestamp); + } else { + return isDictionary + ? new DefaultParquetDataColumnReader(dictionary) + : new DefaultParquetDataColumnReader(valuesReader); + } + } + + public static ParquetDataColumnReader getDataColumnReaderByTypeOnDictionary( + PrimitiveType parquetType, Dictionary realReader, boolean isUtcTimestamp) { + return getDataColumnReaderByTypeHelper(true, parquetType, realReader, null, isUtcTimestamp); + } + + public static ParquetDataColumnReader getDataColumnReaderByType( + PrimitiveType parquetType, ValuesReader realReader, boolean isUtcTimestamp) { + return getDataColumnReaderByTypeHelper( + false, parquetType, null, realReader, isUtcTimestamp); + } + + private static TimestampData int96ToTimestamp( + boolean utcTimestamp, long nanosOfDay, int julianDay) { + long millisecond = julianDayToMillis(julianDay) + (nanosOfDay / NANOS_PER_MILLISECOND); + + if (utcTimestamp) { + int nanoOfMillisecond = (int) (nanosOfDay % NANOS_PER_MILLISECOND); + return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + } else { + Timestamp timestamp = new Timestamp(millisecond); + timestamp.setNanos((int) (nanosOfDay % NANOS_PER_SECOND)); + return TimestampData.fromTimestamp(timestamp); + } + } + + private static long julianDayToMillis(int julianDay) { + return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY; + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java new file mode 100644 index 0000000000000..524c00f402d47 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +import java.io.IOException; +import java.util.List; + +/** + * Row {@link ColumnReader}. + */ +public class RowColumnReader implements ColumnReader { + + private final List fieldReaders; + + public RowColumnReader(List fieldReaders) { + this.fieldReaders = fieldReaders; + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + HeapRowColumnVector rowColumnVector = (HeapRowColumnVector) vector; + WritableColumnVector[] vectors = rowColumnVector.vectors; + // row vector null array + boolean[] isNulls = new boolean[readNumber]; + for (int i = 0; i < vectors.length; i++) { + fieldReaders.get(i).readToVector(readNumber, vectors[i]); + + for (int j = 0; j < readNumber; j++) { + if (i == 0) { + isNulls[j] = vectors[i].isNullAt(j); + } else { + isNulls[j] = isNulls[j] && vectors[i].isNullAt(j); + } + if (i == vectors.length - 1 && isNulls[j]) { + // rowColumnVector[j] is null only when all fields[j] of rowColumnVector[j] is + // null + rowColumnVector.setNullAt(j); + } + } + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java new file mode 100644 index 0000000000000..3266f835e4d1c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.values.bitpacking.BytePacker; +import org.apache.parquet.column.values.bitpacking.Packer; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.io.ParquetDecodingException; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Run length decoder for data and dictionary ids. + * See https://github.com/apache/parquet-format/blob/master/Encodings.md + * See {@link RunLengthBitPackingHybridDecoder}. + * + *
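+ * <p>Each group starts with a varint header: if the low bit is 0 the group is an RLE run of (header >>> 1) repetitions of a single value, otherwise it is (header >>> 1) bit-packed groups of 8 values, each value using bitWidth bits (see readNextGroup()).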
<p>
    Note: Reference Flink release 1.11.2 + * {@code org.apache.flink.formats.parquet.vector.reader.RunLengthDecoder} + * because it is package scope. + */ +final class RunLengthDecoder { + + /** + * If true, the bit width is fixed. This decoder is used in different places and this also + * controls if we need to read the bitwidth from the beginning of the data stream. + */ + private final boolean fixedWidth; + private final boolean readLength; + + // Encoded data. + private ByteBufferInputStream in; + + // bit/byte width of decoded data and utility to batch unpack them. + private int bitWidth; + private int bytesWidth; + private BytePacker packer; + + // Current decoding mode and values + MODE mode; + int currentCount; + int currentValue; + + // Buffer of decoded values if the values are PACKED. + int[] currentBuffer = new int[16]; + int currentBufferIdx = 0; + + RunLengthDecoder() { + this.fixedWidth = false; + this.readLength = false; + } + + RunLengthDecoder(int bitWidth) { + this.fixedWidth = true; + this.readLength = bitWidth != 0; + initWidthAndPacker(bitWidth); + } + + RunLengthDecoder(int bitWidth, boolean readLength) { + this.fixedWidth = true; + this.readLength = readLength; + initWidthAndPacker(bitWidth); + } + + /** + * Init from input stream. + */ + void initFromStream(int valueCount, ByteBufferInputStream in) throws IOException { + this.in = in; + if (fixedWidth) { + // initialize for repetition and definition levels + if (readLength) { + int length = readIntLittleEndian(); + this.in = in.sliceStream(length); + } + } else { + // initialize for values + if (in.available() > 0) { + initWidthAndPacker(in.read()); + } + } + if (bitWidth == 0) { + // 0 bit width, treat this as an RLE run of valueCount number of 0's. + this.mode = MODE.RLE; + this.currentCount = valueCount; + this.currentValue = 0; + } else { + this.currentCount = 0; + } + } + + /** + * Initializes the internal state for decoding ints of `bitWidth`. + */ + private void initWidthAndPacker(int bitWidth) { + Preconditions.checkArgument(bitWidth >= 0 && bitWidth <= 32, "bitWidth must be >= 0 and <= 32"); + this.bitWidth = bitWidth; + this.bytesWidth = BytesUtils.paddedByteCountFromBits(bitWidth); + this.packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth); + } + + int readInteger() { + if (this.currentCount == 0) { + this.readNextGroup(); + } + + this.currentCount--; + switch (mode) { + case RLE: + return this.currentValue; + case PACKED: + return this.currentBuffer[currentBufferIdx++]; + default: + throw new AssertionError(); + } + } + + /** + * Decoding for dictionary ids. The IDs are populated into `values` and the nullability is + * populated into `nulls`. + */ + void readDictionaryIds( + int total, + WritableIntVector values, + WritableColumnVector nulls, + int rowId, + int level, + RunLengthDecoder data) { + int left = total; + while (left > 0) { + if (this.currentCount == 0) { + this.readNextGroup(); + } + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + if (currentValue == level) { + data.readDictionaryIdData(n, values, rowId); + } else { + nulls.setNulls(rowId, n); + } + break; + case PACKED: + for (int i = 0; i < n; ++i) { + if (currentBuffer[currentBufferIdx++] == level) { + values.setInt(rowId + i, data.readInteger()); + } else { + nulls.setNullAt(rowId + i); + } + } + break; + default: + throw new AssertionError(); + } + rowId += n; + left -= n; + currentCount -= n; + } + } + + /** + * It is used to decode dictionary IDs. 
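+ * <p>An RLE run writes the current id into n consecutive slots, while a PACKED run copies n ids out of currentBuffer; both paths then advance rowId and decrement the remaining counts.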
+ */ + private void readDictionaryIdData(int total, WritableIntVector c, int rowId) { + int left = total; + while (left > 0) { + if (this.currentCount == 0) { + this.readNextGroup(); + } + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + c.setInts(rowId, n, currentValue); + break; + case PACKED: + c.setInts(rowId, n, currentBuffer, currentBufferIdx); + currentBufferIdx += n; + break; + default: + throw new AssertionError(); + } + rowId += n; + left -= n; + currentCount -= n; + } + } + + /** + * Reads the next varint encoded int. + */ + private int readUnsignedVarInt() throws IOException { + int value = 0; + int shift = 0; + int b; + do { + b = in.read(); + value |= (b & 0x7F) << shift; + shift += 7; + } while ((b & 0x80) != 0); + return value; + } + + /** + * Reads the next 4 byte little endian int. + */ + private int readIntLittleEndian() throws IOException { + int ch4 = in.read(); + int ch3 = in.read(); + int ch2 = in.read(); + int ch1 = in.read(); + return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + ch4); + } + + /** + * Reads the next byteWidth little endian int. + */ + private int readIntLittleEndianPaddedOnBitWidth() throws IOException { + switch (bytesWidth) { + case 0: + return 0; + case 1: + return in.read(); + case 2: { + int ch2 = in.read(); + int ch1 = in.read(); + return (ch1 << 8) + ch2; + } + case 3: { + int ch3 = in.read(); + int ch2 = in.read(); + int ch1 = in.read(); + return (ch1 << 16) + (ch2 << 8) + ch3; + } + case 4: { + return readIntLittleEndian(); + } + default: + throw new RuntimeException("Unreachable"); + } + } + + /** + * Reads the next group. + */ + void readNextGroup() { + try { + int header = readUnsignedVarInt(); + this.mode = (header & 1) == 0 ? MODE.RLE : MODE.PACKED; + switch (mode) { + case RLE: + this.currentCount = header >>> 1; + this.currentValue = readIntLittleEndianPaddedOnBitWidth(); + return; + case PACKED: + int numGroups = header >>> 1; + this.currentCount = numGroups * 8; + + if (this.currentBuffer.length < this.currentCount) { + this.currentBuffer = new int[this.currentCount]; + } + currentBufferIdx = 0; + int valueIndex = 0; + while (valueIndex < this.currentCount) { + // values are bit packed 8 at a time, so reading bitWidth will always work + ByteBuffer buffer = in.slice(bitWidth); + this.packer.unpack8Values(buffer, buffer.position(), this.currentBuffer, valueIndex); + valueIndex += 8; + } + return; + default: + throw new ParquetDecodingException("not a valid mode " + this.mode); + } + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read from input stream", e); + } + } + + enum MODE { + RLE, + PACKED + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java new file mode 100644 index 0000000000000..18686b811c400 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.Output; + +/** + * Adapter clazz for {@link Output}. + */ +public interface OutputAdapter extends Output { +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java new file mode 100644 index 0000000000000..8563d2422b648 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.runtime.state.StateInitializationContext; + +/** + * Adapter clazz for {@link StateInitializationContext}. + */ +public interface StateInitializationContextAdapter extends StateInitializationContext { +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java new file mode 100644 index 0000000000000..176783e8108c6 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.adapter; + +import org.apache.flink.api.common.accumulators.Accumulator; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.execution.Environment; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; + +import java.util.Map; + +/** + * Adapter clazz for {@link StreamingRuntimeContext}. + */ +public class StreamingRuntimeContextAdapter extends StreamingRuntimeContext { + + public StreamingRuntimeContextAdapter(AbstractStreamOperator operator, Environment env, + Map> accumulators) { + super(operator, env, accumulators); + } + + @Override + public MetricGroup getMetricGroup() { + return new UnregisteredMetricsGroup(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java new file mode 100644 index 0000000000000..e3088356709f1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.13.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java @@ -0,0 +1,34 @@ +package org.apache.hudi.adapter; + +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * TableEnv for test goals. 
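+ * <p>getBatchTableEnv() creates a TableEnvironmentImpl from EnvironmentSettings configured for batch execution mode.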
+ */ +public class TestTableEnvs { + + public static TableEnvironment getBatchTableEnv() { + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + return TableEnvironmentImpl.create(settings); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/pom.xml b/hudi-flink-datasource/hudi-flink1.14.x/pom.xml new file mode 100644 index 0000000000000..ec44b9fc9265a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/pom.xml @@ -0,0 +1,195 @@ + + + + + hudi-flink-datasource + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-flink1.14.x + 0.12.2-dt-SNAPSHOT + jar + + + ${project.parent.parent.basedir} + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-slf4j-impl + + + org.slf4j + slf4j-api + + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + provided + + + org.apache.flink + flink-table-api-java + ${flink1.14.version} + provided + + + org.apache.flink + flink-table-api-java-bridge_${scala.binary.version} + ${flink1.14.version} + provided + + + org.apache.flink + flink-table-runtime_${scala.binary.version} + ${flink1.14.version} + provided + + + org.apache.flink + flink-shaded-guava + 30.1.1-jre-14.0 + provided + + + org.apache.flink + flink-core + ${flink1.14.version} + provided + + + org.apache.flink + flink-streaming-java_${scala.binary.version} + ${flink1.14.version} + provided + + + org.apache.flink + flink-parquet_${scala.binary.version} + ${flink1.14.version} + provided + + + org.apache.flink + flink-json + ${flink1.14.version} + provided + + + org.apache.flink + flink-runtime + ${flink1.14.version} + test + test-jar + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + org.junit.platform + junit-platform-runner + test + + + org.junit.platform + junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java new file mode 100644 index 0000000000000..d4c6bc3a8f4da --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; + +/** + * Adapter clazz for {@code AbstractStreamOperator}. + */ +public abstract class AbstractStreamOperatorAdapter extends AbstractStreamOperator { +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java new file mode 100644 index 0000000000000..6dcfe71ccfd9d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; + +/** + * Adapter clazz for {@link AbstractStreamOperatorFactory}. + */ +public abstract class AbstractStreamOperatorFactoryAdapter + extends AbstractStreamOperatorFactory implements YieldingOperatorFactory { + + public MailboxExecutorAdapter getMailboxExecutorAdapter() { + return new MailboxExecutorAdapter(getMailboxExecutor()); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java new file mode 100644 index 0000000000000..867395c43f199 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.table.connector.source.DataStreamScanProvider; + +/** + * Adapter clazz for {@code DataStreamScanProvider}. + */ +public interface DataStreamScanProviderAdapter extends DataStreamScanProvider { +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java new file mode 100644 index 0000000000000..e8eaa3c62d441 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.table.connector.sink.DataStreamSinkProvider; + +/** + * Adapter clazz for {@code DataStreamSinkProvider}. + */ +public interface DataStreamSinkProviderAdapter extends DataStreamSinkProvider { +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java new file mode 100644 index 0000000000000..0c836f3db391b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.adapter; + +import org.apache.flink.api.common.operators.MailboxExecutor; +import org.apache.flink.util.function.ThrowingRunnable; + +/** + * Adapter clazz for {@link MailboxExecutor}. + */ +public class MailboxExecutorAdapter { + private final MailboxExecutor executor; + + public MailboxExecutorAdapter(MailboxExecutor executor) { + this.executor = executor; + } + + public void execute(ThrowingRunnable command, String description) { + this.executor.execute(command, description); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java new file mode 100644 index 0000000000000..865c0c81d4d9d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.shaded.guava30.com.google.common.util.concurrent.RateLimiter; + +/** + * Bridge class for shaded guava clazz {@code RateLimiter}. + */ +public class RateLimiterAdapter { + private final RateLimiter rateLimiter; + + private RateLimiterAdapter(double permitsPerSecond) { + this.rateLimiter = RateLimiter.create(permitsPerSecond); + } + + public static RateLimiterAdapter create(double permitsPerSecond) { + return new RateLimiterAdapter(permitsPerSecond); + } + + public void acquire() { + this.rateLimiter.acquire(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/Utils.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/Utils.java new file mode 100644 index 0000000000000..30c6a22bfd8ea --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/adapter/Utils.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.adapter; + +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.TimeCharacteristic; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.api.operators.StreamSourceContexts; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.factories.FactoryUtil; + +/** + * Adapter utils. + */ +public class Utils { + public static SourceFunction.SourceContext getSourceContext( + TimeCharacteristic timeCharacteristic, + ProcessingTimeService processingTimeService, + StreamTask streamTask, + Output> output, + long watermarkInterval) { + return StreamSourceContexts.getSourceContext( + timeCharacteristic, + processingTimeService, + new Object(), // no actual locking needed + output, + watermarkInterval, + -1, + true); + } + + public static FactoryUtil.DefaultDynamicTableContext getTableContext( + ObjectIdentifier tablePath, + ResolvedCatalogTable catalogTable, + ReadableConfig conf) { + return new FactoryUtil.DefaultDynamicTableContext(tablePath, catalogTable, + conf, Thread.currentThread().getContextClassLoader(), false); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java new file mode 100644 index 0000000000000..dc59abe460212 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java @@ -0,0 +1,527 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow; + +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector; +import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector; +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; +import org.apache.hudi.table.format.cow.vector.reader.ArrayColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.FixedLenBytesColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.Int64TimestampColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.MapColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.ParquetColumnarRowSplitReader; +import org.apache.hudi.table.format.cow.vector.reader.RowColumnReader; + +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.vector.reader.BooleanColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ByteColumnReader; +import org.apache.flink.formats.parquet.vector.reader.BytesColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.formats.parquet.vector.reader.DoubleColumnReader; +import org.apache.flink.formats.parquet.vector.reader.FloatColumnReader; +import org.apache.flink.formats.parquet.vector.reader.IntColumnReader; +import org.apache.flink.formats.parquet.vector.reader.LongColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ShortColumnReader; +import org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.vector.heap.HeapBooleanVector; +import org.apache.flink.table.data.vector.heap.HeapByteVector; +import org.apache.flink.table.data.vector.heap.HeapBytesVector; +import org.apache.flink.table.data.vector.heap.HeapDoubleVector; +import org.apache.flink.table.data.vector.heap.HeapFloatVector; +import org.apache.flink.table.data.vector.heap.HeapIntVector; +import org.apache.flink.table.data.vector.heap.HeapLongVector; +import org.apache.flink.table.data.vector.heap.HeapShortVector; +import org.apache.flink.table.data.vector.heap.HeapTimestampVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.util.Preconditions; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.ParquetRuntimeException; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.InvalidSchemaException; +import org.apache.parquet.schema.OriginalType; +import 
org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; +import java.sql.Date; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.flink.table.runtime.functions.SqlDateTimeUtils.dateToInternal; +import static org.apache.parquet.Preconditions.checkArgument; + +/** + * Util for generating {@link ParquetColumnarRowSplitReader}. + * + *
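+ * <p>Partition columns that are absent from the Parquet file are materialized as constant vectors built from the partition spec, while the remaining selected fields are read from the file (see createVector and createVectorFromConstant).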
<p>
    NOTE: reference from Flink release 1.11.2 {@code ParquetSplitReaderUtil}, modify to support INT64 + * based TIMESTAMP_MILLIS as ConvertedType, should remove when Flink supports that. + */ +public class ParquetSplitReaderUtil { + + /** + * Util for generating partitioned {@link ParquetColumnarRowSplitReader}. + */ + public static ParquetColumnarRowSplitReader genPartColumnarRowReader( + boolean utcTimestamp, + boolean caseSensitive, + Configuration conf, + String[] fullFieldNames, + DataType[] fullFieldTypes, + Map partitionSpec, + int[] selectedFields, + int batchSize, + Path path, + long splitStart, + long splitLength) throws IOException { + List selNonPartNames = Arrays.stream(selectedFields) + .mapToObj(i -> fullFieldNames[i]) + .filter(n -> !partitionSpec.containsKey(n)) + .collect(Collectors.toList()); + + int[] selParquetFields = Arrays.stream(selectedFields) + .filter(i -> !partitionSpec.containsKey(fullFieldNames[i])) + .toArray(); + + ParquetColumnarRowSplitReader.ColumnBatchGenerator gen = readVectors -> { + // create and initialize the row batch + ColumnVector[] vectors = new ColumnVector[selectedFields.length]; + for (int i = 0; i < vectors.length; i++) { + String name = fullFieldNames[selectedFields[i]]; + LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType(); + vectors[i] = createVector(readVectors, selNonPartNames, name, type, partitionSpec, batchSize); + } + return new VectorizedColumnBatch(vectors); + }; + + return new ParquetColumnarRowSplitReader( + utcTimestamp, + caseSensitive, + conf, + Arrays.stream(selParquetFields) + .mapToObj(i -> fullFieldTypes[i].getLogicalType()) + .toArray(LogicalType[]::new), + selNonPartNames.toArray(new String[0]), + gen, + batchSize, + new org.apache.hadoop.fs.Path(path.toUri()), + splitStart, + splitLength); + } + + private static ColumnVector createVector( + ColumnVector[] readVectors, + List selNonPartNames, + String name, + LogicalType type, + Map partitionSpec, + int batchSize) { + if (partitionSpec.containsKey(name)) { + return createVectorFromConstant(type, partitionSpec.get(name), batchSize); + } + ColumnVector readVector = readVectors[selNonPartNames.indexOf(name)]; + if (readVector == null) { + // when the read vector is null, use a constant null vector instead + readVector = createVectorFromConstant(type, null, batchSize); + } + return readVector; + } + + private static ColumnVector createVectorFromConstant( + LogicalType type, + Object value, + int batchSize) { + switch (type.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + HeapBytesVector bsv = new HeapBytesVector(batchSize); + if (value == null) { + bsv.fillWithNulls(); + } else { + bsv.fill(value instanceof byte[] + ? 
(byte[]) value + : value.toString().getBytes(StandardCharsets.UTF_8)); + } + return bsv; + case BOOLEAN: + HeapBooleanVector bv = new HeapBooleanVector(batchSize); + if (value == null) { + bv.fillWithNulls(); + } else { + bv.fill((boolean) value); + } + return bv; + case TINYINT: + HeapByteVector byteVector = new HeapByteVector(batchSize); + if (value == null) { + byteVector.fillWithNulls(); + } else { + byteVector.fill(((Number) value).byteValue()); + } + return byteVector; + case SMALLINT: + HeapShortVector sv = new HeapShortVector(batchSize); + if (value == null) { + sv.fillWithNulls(); + } else { + sv.fill(((Number) value).shortValue()); + } + return sv; + case INTEGER: + HeapIntVector iv = new HeapIntVector(batchSize); + if (value == null) { + iv.fillWithNulls(); + } else { + iv.fill(((Number) value).intValue()); + } + return iv; + case BIGINT: + HeapLongVector lv = new HeapLongVector(batchSize); + if (value == null) { + lv.fillWithNulls(); + } else { + lv.fill(((Number) value).longValue()); + } + return lv; + case DECIMAL: + DecimalType decimalType = (DecimalType) type; + int precision = decimalType.getPrecision(); + int scale = decimalType.getScale(); + DecimalData decimal = value == null + ? null + : Preconditions.checkNotNull(DecimalData.fromBigDecimal((BigDecimal) value, precision, scale)); + ColumnVector internalVector = createVectorFromConstant( + new VarBinaryType(), + decimal == null ? null : decimal.toUnscaledBytes(), + batchSize); + return new ParquetDecimalVector(internalVector); + case FLOAT: + HeapFloatVector fv = new HeapFloatVector(batchSize); + if (value == null) { + fv.fillWithNulls(); + } else { + fv.fill(((Number) value).floatValue()); + } + return fv; + case DOUBLE: + HeapDoubleVector dv = new HeapDoubleVector(batchSize); + if (value == null) { + dv.fillWithNulls(); + } else { + dv.fill(((Number) value).doubleValue()); + } + return dv; + case DATE: + if (value instanceof LocalDate) { + value = Date.valueOf((LocalDate) value); + } + return createVectorFromConstant( + new IntType(), + value == null ? 
null : dateToInternal((Date) value), + batchSize); + case TIMESTAMP_WITHOUT_TIME_ZONE: + HeapTimestampVector tv = new HeapTimestampVector(batchSize); + if (value == null) { + tv.fillWithNulls(); + } else { + tv.fill(TimestampData.fromLocalDateTime((LocalDateTime) value)); + } + return tv; + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static List filterDescriptors(int depth, Type type, List columns) throws ParquetRuntimeException { + List filtered = new ArrayList<>(); + for (ColumnDescriptor descriptor : columns) { + if (depth >= descriptor.getPath().length) { + throw new InvalidSchemaException("Expect depth " + depth + " for schema: " + descriptor); + } + if (type.getName().equals(descriptor.getPath()[depth])) { + filtered.add(descriptor); + } + } + ValidationUtils.checkState(filtered.size() > 0, "Corrupted Parquet schema"); + return filtered; + } + + public static ColumnReader createColumnReader( + boolean utcTimestamp, + LogicalType fieldType, + Type physicalType, + List descriptors, + PageReadStore pages) throws IOException { + return createColumnReader(utcTimestamp, fieldType, physicalType, descriptors, + pages, 0); + } + + private static ColumnReader createColumnReader( + boolean utcTimestamp, + LogicalType fieldType, + Type physicalType, + List columns, + PageReadStore pages, + int depth) throws IOException { + List descriptors = filterDescriptors(depth, physicalType, columns); + ColumnDescriptor descriptor = descriptors.get(0); + PageReader pageReader = pages.getPageReader(descriptor); + switch (fieldType.getTypeRoot()) { + case BOOLEAN: + return new BooleanColumnReader(descriptor, pageReader); + case TINYINT: + return new ByteColumnReader(descriptor, pageReader); + case DOUBLE: + return new DoubleColumnReader(descriptor, pageReader); + case FLOAT: + return new FloatColumnReader(descriptor, pageReader); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + return new IntColumnReader(descriptor, pageReader); + case BIGINT: + return new LongColumnReader(descriptor, pageReader); + case SMALLINT: + return new ShortColumnReader(descriptor, pageReader); + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return new BytesColumnReader(descriptor, pageReader); + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT64: + int precision = fieldType instanceof TimestampType + ? 
((TimestampType) fieldType).getPrecision() + : ((LocalZonedTimestampType) fieldType).getPrecision(); + return new Int64TimestampColumnReader(utcTimestamp, descriptor, pageReader, precision); + case INT96: + return new TimestampColumnReader(utcTimestamp, descriptor, pageReader); + default: + throw new AssertionError(); + } + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return new IntColumnReader(descriptor, pageReader); + case INT64: + return new LongColumnReader(descriptor, pageReader); + case BINARY: + return new BytesColumnReader(descriptor, pageReader); + case FIXED_LEN_BYTE_ARRAY: + return new FixedLenBytesColumnReader( + descriptor, pageReader, ((DecimalType) fieldType).getPrecision()); + default: + throw new AssertionError(); + } + case ARRAY: + return new ArrayColumnReader( + descriptor, + pageReader, + utcTimestamp, + descriptor.getPrimitiveType(), + fieldType); + case MAP: + MapType mapType = (MapType) fieldType; + ArrayColumnReader keyReader = + new ArrayColumnReader( + descriptor, + pageReader, + utcTimestamp, + descriptor.getPrimitiveType(), + new ArrayType(mapType.getKeyType())); + ArrayColumnReader valueReader = + new ArrayColumnReader( + descriptors.get(1), + pages.getPageReader(descriptors.get(1)), + utcTimestamp, + descriptors.get(1).getPrimitiveType(), + new ArrayType(mapType.getValueType())); + return new MapColumnReader(keyReader, valueReader, fieldType); + case ROW: + RowType rowType = (RowType) fieldType; + GroupType groupType = physicalType.asGroupType(); + List fieldReaders = new ArrayList<>(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + fieldReaders.add( + createColumnReader( + utcTimestamp, + rowType.getTypeAt(i), + groupType.getType(i), + descriptors, + pages, + depth + 1)); + } + return new RowColumnReader(fieldReaders); + default: + throw new UnsupportedOperationException(fieldType + " is not supported now."); + } + } + + public static WritableColumnVector createWritableColumnVector( + int batchSize, + LogicalType fieldType, + Type physicalType, + List descriptors) { + return createWritableColumnVector(batchSize, fieldType, physicalType, descriptors, 0); + } + + private static WritableColumnVector createWritableColumnVector( + int batchSize, + LogicalType fieldType, + Type physicalType, + List columns, + int depth) { + List descriptors = filterDescriptors(depth, physicalType, columns); + PrimitiveType primitiveType = descriptors.get(0).getPrimitiveType(); + PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName(); + switch (fieldType.getTypeRoot()) { + case BOOLEAN: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.BOOLEAN, + "Unexpected type: %s", typeName); + return new HeapBooleanVector(batchSize); + case TINYINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapByteVector(batchSize); + case DOUBLE: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.DOUBLE, + "Unexpected type: %s", typeName); + return new HeapDoubleVector(batchSize); + case FLOAT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.FLOAT, + "Unexpected type: %s", typeName); + return new HeapFloatVector(batchSize); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapIntVector(batchSize); + case BIGINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT64, 
+ "Unexpected type: %s", typeName); + return new HeapLongVector(batchSize); + case SMALLINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapShortVector(batchSize); + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.BINARY, + "Unexpected type: %s", typeName); + return new HeapBytesVector(batchSize); + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + checkArgument(primitiveType.getOriginalType() != OriginalType.TIME_MICROS, + "TIME_MICROS original type is not "); + return new HeapTimestampVector(batchSize); + case DECIMAL: + checkArgument( + (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY + || typeName == PrimitiveType.PrimitiveTypeName.BINARY) + && primitiveType.getOriginalType() == OriginalType.DECIMAL, + "Unexpected type: %s", typeName); + return new HeapBytesVector(batchSize); + case ARRAY: + ArrayType arrayType = (ArrayType) fieldType; + return new HeapArrayVector( + batchSize, + createWritableColumnVector( + batchSize, + arrayType.getElementType(), + physicalType, + descriptors, + depth)); + case MAP: + MapType mapType = (MapType) fieldType; + GroupType repeatedType = physicalType.asGroupType().getType(0).asGroupType(); + // the map column has three level paths. + return new HeapMapColumnVector( + batchSize, + createWritableColumnVector( + batchSize, + mapType.getKeyType(), + repeatedType.getType(0), + descriptors, + depth + 2), + createWritableColumnVector( + batchSize, + mapType.getValueType(), + repeatedType.getType(1), + descriptors, + depth + 2)); + case ROW: + RowType rowType = (RowType) fieldType; + GroupType groupType = physicalType.asGroupType(); + WritableColumnVector[] columnVectors = + new WritableColumnVector[rowType.getFieldCount()]; + for (int i = 0; i < columnVectors.length; i++) { + columnVectors[i] = + createWritableColumnVector( + batchSize, + rowType.getTypeAt(i), + groupType.getType(i), + descriptors, + depth + 1); + } + return new HeapRowColumnVector(batchSize, columnVectors); + default: + throw new UnsupportedOperationException(fieldType + " is not supported now."); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java new file mode 100644 index 0000000000000..edd90714c87a7 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.ColumnarArrayData; +import org.apache.flink.table.data.vector.ArrayColumnVector; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap array column vector. + */ +public class HeapArrayVector extends AbstractHeapVector + implements WritableColumnVector, ArrayColumnVector { + + public long[] offsets; + public long[] lengths; + public ColumnVector child; + private int size; + + public HeapArrayVector(int len) { + super(len); + offsets = new long[len]; + lengths = new long[len]; + } + + public HeapArrayVector(int len, ColumnVector vector) { + super(len); + offsets = new long[len]; + lengths = new long[len]; + this.child = vector; + } + + public int getSize() { + return size; + } + + public void setSize(int size) { + this.size = size; + } + + public int getLen() { + return this.isNull.length; + } + + @Override + public ArrayData getArray(int i) { + long offset = offsets[i]; + long length = lengths[i]; + return new ColumnarArrayData(child, (int) offset, (int) length); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java new file mode 100644 index 0000000000000..2b34a02f116b3 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.ColumnarMapData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.MapColumnVector; +import org.apache.flink.table.data.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap map column vector. 
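The HeapArrayVector above represents nested arrays as one flat child vector plus per-row offsets and lengths. A minimal standalone sketch of that layout (data and names are made up, not part of the patch):

public class OffsetsLengthsSketch {
  public static void main(String[] args) {
    int[] child = {0, 2, 3, 3, 4};   // all elements of all rows, concatenated
    long[] offsets = {0, 3};         // start index of each row inside `child`
    long[] lengths = {3, 2};         // number of elements per row
    for (int row = 0; row < offsets.length; row++) {
      StringBuilder sb = new StringBuilder("row " + row + ": [");
      for (int j = 0; j < lengths[row]; j++) {
        if (j > 0) {
          sb.append(", ");
        }
        sb.append(child[(int) offsets[row] + j]);
      }
      System.out.println(sb.append("]")); // row 0: [0, 2, 3] / row 1: [3, 4]
    }
  }
}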
+ */ +public class HeapMapColumnVector extends AbstractHeapVector + implements WritableColumnVector, MapColumnVector { + + private long[] offsets; + private long[] lengths; + private int size; + private ColumnVector keys; + private ColumnVector values; + + public HeapMapColumnVector(int len, ColumnVector keys, ColumnVector values) { + super(len); + size = 0; + offsets = new long[len]; + lengths = new long[len]; + this.keys = keys; + this.values = values; + } + + public void setOffsets(long[] offsets) { + this.offsets = offsets; + } + + public void setLengths(long[] lengths) { + this.lengths = lengths; + } + + public void setKeys(ColumnVector keys) { + this.keys = keys; + } + + public void setValues(ColumnVector values) { + this.values = values; + } + + public int getSize() { + return size; + } + + public void setSize(int size) { + this.size = size; + } + + @Override + public MapData getMap(int i) { + long offset = offsets[i]; + long length = lengths[i]; + return new ColumnarMapData(keys, values, (int) offset, (int) length); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java new file mode 100644 index 0000000000000..53a1eee68cd2a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.ColumnarRowData; +import org.apache.flink.table.data.vector.RowColumnVector; +import org.apache.flink.table.data.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap row column vector. + */ +public class HeapRowColumnVector extends AbstractHeapVector + implements WritableColumnVector, RowColumnVector { + + public WritableColumnVector[] vectors; + + public HeapRowColumnVector(int len, WritableColumnVector... 
vectors) { + super(len); + this.vectors = vectors; + } + + @Override + public ColumnarRowData getRow(int i) { + ColumnarRowData columnarRowData = new ColumnarRowData(new VectorizedColumnBatch(vectors)); + columnarRowData.setRowId(i); + return columnarRowData; + } + + @Override + public void reset() { + super.reset(); + for (WritableColumnVector vector : vectors) { + vector.reset(); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java new file mode 100644 index 0000000000000..a2f6d5b0cd74c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.vector.BytesColumnVector; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.DecimalColumnVector; + +/** + * Parquet write decimal as int32 and int64 and binary, this class wrap the real vector to + * provide {@link DecimalColumnVector} interface. + * + *

    Reference Flink release 1.11.2 {@link org.apache.flink.formats.parquet.vector.ParquetDecimalVector} + * because it is not public. + */ +public class ParquetDecimalVector implements DecimalColumnVector { + + public final ColumnVector vector; + + public ParquetDecimalVector(ColumnVector vector) { + this.vector = vector; + } + + @Override + public DecimalData getDecimal(int i, int precision, int scale) { + return DecimalData.fromUnscaledBytes( + ((BytesColumnVector) vector).getBytes(i).getBytes(), + precision, + scale); + } + + @Override + public boolean isNullAt(int i) { + return vector.isNullAt(i); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java new file mode 100644 index 0000000000000..07416a371715c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.formats.parquet.vector.ParquetDictionary; +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.PrimitiveType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; + +/** + * Abstract {@link ColumnReader}. + * See {@link org.apache.parquet.column.impl.ColumnReaderImpl}, + * part of the code is referred from Apache Spark and Apache Parquet. + * + *
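ParquetDecimalVector above rebuilds DecimalData from the unscaled bytes held in the wrapped vector. A small illustration of the same idea using only java.math; the byte layout assumed here is big-endian two's complement, as produced by BigInteger:

import java.math.BigDecimal;
import java.math.BigInteger;

public class UnscaledDecimalSketch {
  public static void main(String[] args) {
    // e.g. the bytes that would come back from a BytesColumnVector for one row
    byte[] unscaled = new BigInteger("1234567").toByteArray();
    int scale = 2;
    BigDecimal value = new BigDecimal(new BigInteger(unscaled), scale);
    System.out.println(value); // 12345.67
  }
}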

    Note: Reference Flink release 1.11.2 {@link org.apache.flink.formats.parquet.vector.reader.AbstractColumnReader} + * because some of the package scope methods. + */ +public abstract class AbstractColumnReader + implements ColumnReader { + + private static final Logger LOG = LoggerFactory.getLogger(org.apache.flink.formats.parquet.vector.reader.AbstractColumnReader.class); + + private final PageReader pageReader; + + /** + * The dictionary, if this column has dictionary encoding. + */ + protected final Dictionary dictionary; + + /** + * Maximum definition level for this column. + */ + protected final int maxDefLevel; + + protected final ColumnDescriptor descriptor; + + /** + * Total number of values read. + */ + private long valuesRead; + + /** + * value that indicates the end of the current page. That is, if valuesRead == + * endOfPageValueCount, we are at the end of the page. + */ + private long endOfPageValueCount; + + /** + * If true, the current page is dictionary encoded. + */ + private boolean isCurrentPageDictionaryEncoded; + + /** + * Total values in the current page. + */ + private int pageValueCount; + + /* + * Input streams: + * 1.Run length encoder to encode every data, so we have run length stream to get + * run length information. + * 2.Data maybe is real data, maybe is dictionary ids which need be decode to real + * data from Dictionary. + * + * Run length stream ------> Data stream + * | + * ------> Dictionary ids stream + */ + + /** + * Run length decoder for data and dictionary. + */ + protected RunLengthDecoder runLenDecoder; + + /** + * Data input stream. + */ + ByteBufferInputStream dataInputStream; + + /** + * Dictionary decoder to wrap dictionary ids input stream. + */ + private RunLengthDecoder dictionaryIdsDecoder; + + public AbstractColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader) throws IOException { + this.descriptor = descriptor; + this.pageReader = pageReader; + this.maxDefLevel = descriptor.getMaxDefinitionLevel(); + + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + if (dictionaryPage != null) { + try { + this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage); + this.isCurrentPageDictionaryEncoded = true; + } catch (IOException e) { + throw new IOException("could not decode the dictionary for " + descriptor, e); + } + } else { + this.dictionary = null; + this.isCurrentPageDictionaryEncoded = false; + } + /* + * Total number of values in this column (in this row group). + */ + long totalValueCount = pageReader.getTotalValueCount(); + if (totalValueCount == 0) { + throw new IOException("totalValueCount == 0"); + } + } + + protected void checkTypeName(PrimitiveType.PrimitiveTypeName expectedName) { + PrimitiveType.PrimitiveTypeName actualName = descriptor.getPrimitiveType().getPrimitiveTypeName(); + Preconditions.checkArgument( + actualName == expectedName, + "Expected type name: %s, actual type name: %s", + expectedName, + actualName); + } + + /** + * Reads `total` values from this columnReader into column. + */ + @Override + public final void readToVector(int readNumber, V vector) throws IOException { + int rowId = 0; + WritableIntVector dictionaryIds = null; + if (dictionary != null) { + dictionaryIds = vector.reserveDictionaryIds(readNumber); + } + while (readNumber > 0) { + // Compute the number of values we want to read in this page. 
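The stream layout described in the comment above means a data page is decoded either from plain values or from dictionary ids that index a per-column dictionary. A minimal sketch of the dictionary path, with made-up data and names, not taken from the patch:

public class DictionaryDecodeSketch {
  public static void main(String[] args) {
    String[] dictionary = {"alpha", "beta", "gamma"};
    int[] dictionaryIds = {0, 0, 2, 1};            // what the dictionary-ids stream yields
    String[] decoded = new String[dictionaryIds.length];
    for (int i = 0; i < dictionaryIds.length; i++) {
      // eager decode; lazy decode would instead keep the ids and attach the dictionary to the vector
      decoded[i] = dictionary[dictionaryIds[i]];
    }
    System.out.println(String.join(",", decoded)); // alpha,alpha,gamma,beta
  }
}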
+ int leftInPage = (int) (endOfPageValueCount - valuesRead); + if (leftInPage == 0) { + DataPage page = pageReader.readPage(); + if (page instanceof DataPageV1) { + readPageV1((DataPageV1) page); + } else if (page instanceof DataPageV2) { + readPageV2((DataPageV2) page); + } else { + throw new RuntimeException("Unsupported page type: " + page.getClass()); + } + leftInPage = (int) (endOfPageValueCount - valuesRead); + } + int num = Math.min(readNumber, leftInPage); + if (isCurrentPageDictionaryEncoded) { + // Read and decode dictionary ids. + runLenDecoder.readDictionaryIds( + num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder); + + if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) { + // Column vector supports lazy decoding of dictionary values so just set the dictionary. + // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some + // non-dictionary encoded values have already been added). + vector.setDictionary(new ParquetDictionary(dictionary)); + } else { + readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds); + } + } else { + if (vector.hasDictionary() && rowId != 0) { + // This batch already has dictionary encoded values but this new page is not. The batch + // does not support a mix of dictionary and not so we will decode the dictionary. + readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds()); + } + vector.setDictionary(null); + readBatch(rowId, num, vector); + } + + valuesRead += num; + rowId += num; + readNumber -= num; + } + } + + private void readPageV1(DataPageV1 page) throws IOException { + this.pageValueCount = page.getValueCount(); + ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); + + // Initialize the decoders. + if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) { + throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding()); + } + int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); + this.runLenDecoder = new RunLengthDecoder(bitWidth); + try { + BytesInput bytes = page.getBytes(); + ByteBufferInputStream in = bytes.toInputStream(); + rlReader.initFromPage(pageValueCount, in); + this.runLenDecoder.initFromStream(pageValueCount, in); + prepareNewPage(page.getValueEncoding(), in); + } catch (IOException e) { + throw new IOException("could not read page " + page + " in col " + descriptor, e); + } + } + + private void readPageV2(DataPageV2 page) throws IOException { + this.pageValueCount = page.getValueCount(); + + int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); + // do not read the length from the stream. v2 pages handle dividing the page bytes. 
+ this.runLenDecoder = new RunLengthDecoder(bitWidth, false); + this.runLenDecoder.initFromStream( + this.pageValueCount, page.getDefinitionLevels().toInputStream()); + try { + prepareNewPage(page.getDataEncoding(), page.getData().toInputStream()); + } catch (IOException e) { + throw new IOException("could not read page " + page + " in col " + descriptor, e); + } + } + + private void prepareNewPage( + Encoding dataEncoding, + ByteBufferInputStream in) throws IOException { + this.endOfPageValueCount = valuesRead + pageValueCount; + if (dataEncoding.usesDictionary()) { + if (dictionary == null) { + throw new IOException("Could not read page in col " + + descriptor + + " as the dictionary was missing for encoding " + + dataEncoding); + } + @SuppressWarnings("deprecation") + Encoding plainDict = Encoding.PLAIN_DICTIONARY; // var to allow warning suppression + if (dataEncoding != plainDict && dataEncoding != Encoding.RLE_DICTIONARY) { + throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); + } + this.dataInputStream = null; + this.dictionaryIdsDecoder = new RunLengthDecoder(); + try { + this.dictionaryIdsDecoder.initFromStream(pageValueCount, in); + } catch (IOException e) { + throw new IOException("could not read dictionary in col " + descriptor, e); + } + this.isCurrentPageDictionaryEncoded = true; + } else { + if (dataEncoding != Encoding.PLAIN) { + throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); + } + this.dictionaryIdsDecoder = null; + LOG.debug("init from page at offset {} for length {}", in.position(), in.available()); + this.dataInputStream = in.remainingStream(); + this.isCurrentPageDictionaryEncoded = false; + } + + afterReadPage(); + } + + final ByteBuffer readDataBuffer(int length) { + try { + return dataInputStream.slice(length).order(ByteOrder.LITTLE_ENDIAN); + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes", e); + } + } + + /** + * After read a page, we may need some initialization. + */ + protected void afterReadPage() { + } + + /** + * Support lazy dictionary ids decode. See more in {@link ParquetDictionary}. + * If return false, we will decode all the data first. + */ + protected boolean supportLazyDecode() { + return true; + } + + /** + * Read batch from {@link #runLenDecoder} and {@link #dataInputStream}. + */ + protected abstract void readBatch(int rowId, int num, V column); + + /** + * Decode dictionary ids to data. + * From {@link #runLenDecoder} and {@link #dictionaryIdsDecoder}. + */ + protected abstract void readBatchFromDictionaryIds( + int rowId, + int num, + V column, + WritableIntVector dictionaryIds); +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java new file mode 100644 index 0000000000000..d94c1e1da4bb6 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.vector.heap.HeapBooleanVector; +import org.apache.flink.table.data.vector.heap.HeapByteVector; +import org.apache.flink.table.data.vector.heap.HeapBytesVector; +import org.apache.flink.table.data.vector.heap.HeapDoubleVector; +import org.apache.flink.table.data.vector.heap.HeapFloatVector; +import org.apache.flink.table.data.vector.heap.HeapIntVector; +import org.apache.flink.table.data.vector.heap.HeapLongVector; +import org.apache.flink.table.data.vector.heap.HeapShortVector; +import org.apache.flink.table.data.vector.heap.HeapTimestampVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Array {@link ColumnReader}. + */ +public class ArrayColumnReader extends BaseVectorizedColumnReader { + + // The value read in last time + private Object lastValue; + + // flag to indicate if there is no data in parquet data page + private boolean eof = false; + + // flag to indicate if it's the first time to read parquet data page with this instance + boolean isFirstRow = true; + + public ArrayColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader, + boolean isUtcTimestamp, + Type type, + LogicalType logicalType) + throws IOException { + super(descriptor, pageReader, isUtcTimestamp, type, logicalType); + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + HeapArrayVector lcv = (HeapArrayVector) vector; + // before readBatch, initial the size of offsets & lengths as the default value, + // the actual size will be assigned in setChildrenInfo() after reading complete. + lcv.offsets = new long[VectorizedColumnBatch.DEFAULT_SIZE]; + lcv.lengths = new long[VectorizedColumnBatch.DEFAULT_SIZE]; + // Because the length of ListColumnVector.child can't be known now, + // the valueList will save all data for ListColumnVector temporary. 
+ List valueList = new ArrayList<>(); + + LogicalType category = ((ArrayType) logicalType).getElementType(); + + // read the first row in parquet data page, this will be only happened once for this + // instance + if (isFirstRow) { + if (!fetchNextValue(category)) { + return; + } + isFirstRow = false; + } + + int index = collectDataFromParquetPage(readNumber, lcv, valueList, category); + + // Convert valueList to array for the ListColumnVector.child + fillColumnVector(category, lcv, valueList, index); + } + + /** + * Reads a single value from parquet page, puts it into lastValue. Returns a boolean indicating + * if there is more values to read (true). + * + * @param category + * @return boolean + * @throws IOException + */ + private boolean fetchNextValue(LogicalType category) throws IOException { + int left = readPageIfNeed(); + if (left > 0) { + // get the values of repetition and definitionLevel + readRepetitionAndDefinitionLevels(); + // read the data if it isn't null + if (definitionLevel == maxDefLevel) { + if (isCurrentPageDictionaryEncoded) { + lastValue = dataColumn.readValueDictionaryId(); + } else { + lastValue = readPrimitiveTypedRow(category); + } + } else { + lastValue = null; + } + return true; + } else { + eof = true; + return false; + } + } + + private int readPageIfNeed() throws IOException { + // Compute the number of values we want to read in this page. + int leftInPage = (int) (endOfPageValueCount - valuesRead); + if (leftInPage == 0) { + // no data left in current page, load data from new page + readPage(); + leftInPage = (int) (endOfPageValueCount - valuesRead); + } + return leftInPage; + } + + // Need to be in consistent with that VectorizedPrimitiveColumnReader#readBatchHelper + // TODO Reduce the duplicated code + private Object readPrimitiveTypedRow(LogicalType category) { + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return dataColumn.readString(); + case BOOLEAN: + return dataColumn.readBoolean(); + case TIME_WITHOUT_TIME_ZONE: + case DATE: + case INTEGER: + return dataColumn.readInteger(); + case TINYINT: + return dataColumn.readTinyInt(); + case SMALLINT: + return dataColumn.readSmallInt(); + case BIGINT: + return dataColumn.readLong(); + case FLOAT: + return dataColumn.readFloat(); + case DOUBLE: + return dataColumn.readDouble(); + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return dataColumn.readInteger(); + case INT64: + return dataColumn.readLong(); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return dataColumn.readString(); + default: + throw new AssertionError(); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return dataColumn.readTimestamp(); + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } + + private Object dictionaryDecodeValue(LogicalType category, Integer dictionaryValue) { + if (dictionaryValue == null) { + return null; + } + + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return dictionary.readString(dictionaryValue); + case DATE: + case TIME_WITHOUT_TIME_ZONE: + case INTEGER: + return dictionary.readInteger(dictionaryValue); + case BOOLEAN: + return dictionary.readBoolean(dictionaryValue) ? 
1 : 0; + case DOUBLE: + return dictionary.readDouble(dictionaryValue); + case FLOAT: + return dictionary.readFloat(dictionaryValue); + case TINYINT: + return dictionary.readTinyInt(dictionaryValue); + case SMALLINT: + return dictionary.readSmallInt(dictionaryValue); + case BIGINT: + return dictionary.readLong(dictionaryValue); + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return dictionary.readInteger(dictionaryValue); + case INT64: + return dictionary.readLong(dictionaryValue); + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return dictionary.readString(dictionaryValue); + default: + throw new AssertionError(); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return dictionary.readTimestamp(dictionaryValue); + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } + + /** + * Collects data from a parquet page and returns the final row index where it stopped. The + * returned index can be equal to or less than total. + * + * @param total maximum number of rows to collect + * @param lcv column vector to do initial setup in data collection time + * @param valueList collection of values that will be fed into the vector later + * @param category + * @return int + * @throws IOException + */ + private int collectDataFromParquetPage( + int total, HeapArrayVector lcv, List valueList, LogicalType category) + throws IOException { + int index = 0; + /* + * Here is a nested loop for collecting all values from a parquet page. + * A column of array type can be considered as a list of lists, so the two loops are as below: + * 1. The outer loop iterates on rows (index is a row index, so points to a row in the batch), e.g.: + * [0, 2, 3] <- index: 0 + * [NULL, 3, 4] <- index: 1 + * + * 2. The inner loop iterates on values within a row (sets all data from parquet data page + * for an element in ListColumnVector), so fetchNextValue returns values one-by-one: + * 0, 2, 3, NULL, 3, 4 + * + * As described below, the repetition level (repetitionLevel != 0) + * can be used to decide when we'll start to read values for the next list. + */ + while (!eof && index < total) { + // add element to ListColumnVector one by one + lcv.offsets[index] = valueList.size(); + /* + * Let's collect all values for a single list. + * Repetition level = 0 means that a new list started there in the parquet page, + * in that case, let's exit from the loop, and start to collect value for a new list. + */ + do { + /* + * Definition level = 0 when a NULL value was returned instead of a list + * (this is not the same as a NULL value in of a list). + */ + if (definitionLevel == 0) { + lcv.setNullAt(index); + } + valueList.add( + isCurrentPageDictionaryEncoded + ? dictionaryDecodeValue(category, (Integer) lastValue) + : lastValue); + } while (fetchNextValue(category) && (repetitionLevel != 0)); + + lcv.lengths[index] = valueList.size() - lcv.offsets[index]; + index++; + } + return index; + } + + /** + * The lengths & offsets will be initialized as default size (1024), it should be set to the + * actual size according to the element number. 
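The collectDataFromParquetPage javadoc above describes grouping a flat value stream into lists by repetition level. A standalone sketch of that grouping rule (repetition level 0 starts a new list; the data below is invented for illustration):

import java.util.ArrayList;
import java.util.List;

public class RepetitionLevelSketch {
  public static void main(String[] args) {
    Integer[] values = {0, 2, 3, null, 3, 4};      // flat element stream, null = definition level below max
    int[] repetitionLevels = {0, 1, 1, 0, 1, 1};   // 0 marks the start of a new list
    List<List<Integer>> rows = new ArrayList<>();
    for (int i = 0; i < values.length; i++) {
      if (repetitionLevels[i] == 0) {
        rows.add(new ArrayList<>());
      }
      rows.get(rows.size() - 1).add(values[i]);
    }
    System.out.println(rows); // [[0, 2, 3], [null, 3, 4]]
  }
}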
+ */ + private void setChildrenInfo(HeapArrayVector lcv, int itemNum, int elementNum) { + lcv.setSize(itemNum); + long[] lcvLength = new long[elementNum]; + long[] lcvOffset = new long[elementNum]; + System.arraycopy(lcv.lengths, 0, lcvLength, 0, elementNum); + System.arraycopy(lcv.offsets, 0, lcvOffset, 0, elementNum); + lcv.lengths = lcvLength; + lcv.offsets = lcvOffset; + } + + private void fillColumnVector( + LogicalType category, HeapArrayVector lcv, List valueList, int elementNum) { + int total = valueList.size(); + setChildrenInfo(lcv, total, elementNum); + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + lcv.child = new HeapBytesVector(total); + ((HeapBytesVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + byte[] src = ((List) valueList).get(i); + if (src == null) { + ((HeapBytesVector) lcv.child).setNullAt(i); + } else { + ((HeapBytesVector) lcv.child).appendBytes(i, src, 0, src.length); + } + } + break; + case BOOLEAN: + lcv.child = new HeapBooleanVector(total); + ((HeapBooleanVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapBooleanVector) lcv.child).setNullAt(i); + } else { + ((HeapBooleanVector) lcv.child).vector[i] = + ((List) valueList).get(i); + } + } + break; + case TINYINT: + lcv.child = new HeapByteVector(total); + ((HeapByteVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapByteVector) lcv.child).setNullAt(i); + } else { + ((HeapByteVector) lcv.child).vector[i] = + (byte) ((List) valueList).get(i).intValue(); + } + } + break; + case SMALLINT: + lcv.child = new HeapShortVector(total); + ((HeapShortVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapShortVector) lcv.child).setNullAt(i); + } else { + ((HeapShortVector) lcv.child).vector[i] = + (short) ((List) valueList).get(i).intValue(); + } + } + break; + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + lcv.child = new HeapIntVector(total); + ((HeapIntVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapIntVector) lcv.child).setNullAt(i); + } else { + ((HeapIntVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case FLOAT: + lcv.child = new HeapFloatVector(total); + ((HeapFloatVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapFloatVector) lcv.child).setNullAt(i); + } else { + ((HeapFloatVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case BIGINT: + lcv.child = new HeapLongVector(total); + ((HeapLongVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapLongVector) lcv.child).setNullAt(i); + } else { + ((HeapLongVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case DOUBLE: + lcv.child = new HeapDoubleVector(total); + ((HeapDoubleVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapDoubleVector) lcv.child).setNullAt(i); + } else { + ((HeapDoubleVector) lcv.child).vector[i] = + ((List) valueList).get(i); + } + } + break; + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + lcv.child = new HeapTimestampVector(total); + ((HeapTimestampVector) lcv.child).reset(); + for (int i = 0; i < 
valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapTimestampVector) lcv.child).setNullAt(i); + } else { + ((HeapTimestampVector) lcv.child) + .setTimestamp(i, ((List) valueList).get(i)); + } + } + break; + case DECIMAL: + PrimitiveType.PrimitiveTypeName primitiveTypeName = + descriptor.getPrimitiveType().getPrimitiveTypeName(); + switch (primitiveTypeName) { + case INT32: + lcv.child = new ParquetDecimalVector(new HeapIntVector(total)); + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector) + .vector[i] = + ((List) valueList).get(i); + } + } + break; + case INT64: + lcv.child = new ParquetDecimalVector(new HeapLongVector(total)); + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector) + .vector[i] = + ((List) valueList).get(i); + } + } + break; + default: + lcv.child = new ParquetDecimalVector(new HeapBytesVector(total)); + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + byte[] src = ((List) valueList).get(i); + if (valueList.get(i) == null) { + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector) + .appendBytes(i, src, 0, src.length); + } + } + break; + } + break; + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java new file mode 100644 index 0000000000000..073c704c4b24f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import static org.apache.parquet.column.ValuesType.DEFINITION_LEVEL; +import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; +import static org.apache.parquet.column.ValuesType.VALUES; + +/** + * Abstract {@link ColumnReader}. part of the code is referred from Apache Hive and Apache Parquet. + */ +public abstract class BaseVectorizedColumnReader implements ColumnReader { + + private static final Logger LOG = LoggerFactory.getLogger(BaseVectorizedColumnReader.class); + + protected boolean isUtcTimestamp; + + /** + * Total number of values read. + */ + protected long valuesRead; + + /** + * value that indicates the end of the current page. That is, if valuesRead == + * endOfPageValueCount, we are at the end of the page. + */ + protected long endOfPageValueCount; + + /** + * The dictionary, if this column has dictionary encoding. + */ + protected final ParquetDataColumnReader dictionary; + + /** + * If true, the current page is dictionary encoded. + */ + protected boolean isCurrentPageDictionaryEncoded; + + /** + * Maximum definition level for this column. + */ + protected final int maxDefLevel; + + protected int definitionLevel; + protected int repetitionLevel; + + /** + * Repetition/Definition/Value readers. + */ + protected IntIterator repetitionLevelColumn; + + protected IntIterator definitionLevelColumn; + protected ParquetDataColumnReader dataColumn; + + /** + * Total values in the current page. 
+ */ + protected int pageValueCount; + + protected final PageReader pageReader; + protected final ColumnDescriptor descriptor; + protected final Type type; + protected final LogicalType logicalType; + + public BaseVectorizedColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader, + boolean isUtcTimestamp, + Type parquetType, + LogicalType logicalType) + throws IOException { + this.descriptor = descriptor; + this.type = parquetType; + this.pageReader = pageReader; + this.maxDefLevel = descriptor.getMaxDefinitionLevel(); + this.isUtcTimestamp = isUtcTimestamp; + this.logicalType = logicalType; + + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + if (dictionaryPage != null) { + try { + this.dictionary = + ParquetDataColumnReaderFactory.getDataColumnReaderByTypeOnDictionary( + parquetType.asPrimitiveType(), + dictionaryPage + .getEncoding() + .initDictionary(descriptor, dictionaryPage), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = true; + } catch (IOException e) { + throw new IOException("could not decode the dictionary for " + descriptor, e); + } + } else { + this.dictionary = null; + this.isCurrentPageDictionaryEncoded = false; + } + } + + protected void readRepetitionAndDefinitionLevels() { + repetitionLevel = repetitionLevelColumn.nextInt(); + definitionLevel = definitionLevelColumn.nextInt(); + valuesRead++; + } + + protected void readPage() throws IOException { + DataPage page = pageReader.readPage(); + + if (page == null) { + return; + } + + page.accept( + new DataPage.Visitor() { + @Override + public Void visit(DataPageV1 dataPageV1) { + readPageV1(dataPageV1); + return null; + } + + @Override + public Void visit(DataPageV2 dataPageV2) { + readPageV2(dataPageV2); + return null; + } + }); + } + + private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) + throws IOException { + this.pageValueCount = valueCount; + this.endOfPageValueCount = valuesRead + pageValueCount; + if (dataEncoding.usesDictionary()) { + this.dataColumn = null; + if (dictionary == null) { + throw new IOException( + "could not read page in col " + + descriptor + + " as the dictionary was missing for encoding " + + dataEncoding); + } + dataColumn = + ParquetDataColumnReaderFactory.getDataColumnReaderByType( + type.asPrimitiveType(), + dataEncoding.getDictionaryBasedValuesReader( + descriptor, VALUES, dictionary.getDictionary()), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = true; + } else { + dataColumn = + ParquetDataColumnReaderFactory.getDataColumnReaderByType( + type.asPrimitiveType(), + dataEncoding.getValuesReader(descriptor, VALUES), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = false; + } + + try { + dataColumn.initFromPage(pageValueCount, in); + } catch (IOException e) { + throw new IOException("could not read page in col " + descriptor, e); + } + } + + private void readPageV1(DataPageV1 page) { + ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); + ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL); + this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); + this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); + try { + BytesInput bytes = page.getBytes(); + LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records"); + ByteBufferInputStream in = bytes.toInputStream(); + LOG.debug("reading repetition levels at " + in.position()); + rlReader.initFromPage(pageValueCount, in); 
+ LOG.debug("reading definition levels at " + in.position()); + dlReader.initFromPage(pageValueCount, in); + LOG.debug("reading data at " + in.position()); + initDataReader(page.getValueEncoding(), in, page.getValueCount()); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read page " + page + " in col " + descriptor, e); + } + } + + private void readPageV2(DataPageV2 page) { + this.pageValueCount = page.getValueCount(); + this.repetitionLevelColumn = + newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels()); + this.definitionLevelColumn = + newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels()); + try { + LOG.debug( + "page data size " + + page.getData().size() + + " bytes and " + + pageValueCount + + " records"); + initDataReader( + page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount()); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read page " + page + " in col " + descriptor, e); + } + } + + private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) { + try { + if (maxLevel == 0) { + return new NullIntIterator(); + } + return new RLEIntIterator( + new RunLengthBitPackingHybridDecoder( + BytesUtils.getWidthFromMaxInt(maxLevel), + new ByteArrayInputStream(bytes.toByteArray()))); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read levels in page for col " + descriptor, e); + } + } + + /** + * Utility classes to abstract over different way to read ints with different encodings. + */ + abstract static class IntIterator { + abstract int nextInt(); + } + + /** + * read ints from {@link ValuesReader}. + */ + protected static final class ValuesReaderIntIterator extends IntIterator { + ValuesReader delegate; + + public ValuesReaderIntIterator(ValuesReader delegate) { + this.delegate = delegate; + } + + @Override + int nextInt() { + return delegate.readInteger(); + } + } + + /** + * read ints from {@link RunLengthBitPackingHybridDecoder}. + */ + protected static final class RLEIntIterator extends IntIterator { + RunLengthBitPackingHybridDecoder delegate; + + public RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) { + this.delegate = delegate; + } + + @Override + int nextInt() { + try { + return delegate.readInt(); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + } + + /** + * return zero. + */ + protected static final class NullIntIterator extends IntIterator { + @Override + int nextInt() { + return 0; + } + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java new file mode 100644 index 0000000000000..61461a728c3b8 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.vector.writable.WritableBytesVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Fixed length bytes {@code ColumnReader}, just for decimal. + * + *

    Note: Reference Flink release 1.13.2 + * {@code org.apache.flink.formats.parquet.vector.reader.FixedLenBytesColumnReader} + * to always write as legacy decimal format. + */ +public class FixedLenBytesColumnReader + extends AbstractColumnReader { + + public FixedLenBytesColumnReader( + ColumnDescriptor descriptor, PageReader pageReader, int precision) throws IOException { + super(descriptor, pageReader); + checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); + } + + @Override + protected void readBatch(int rowId, int num, V column) { + int bytesLen = descriptor.getPrimitiveType().getTypeLength(); + WritableBytesVector bytesVector = (WritableBytesVector) column; + for (int i = 0; i < num; i++) { + if (runLenDecoder.readInteger() == maxDefLevel) { + byte[] bytes = readDataBinary(bytesLen).getBytes(); + bytesVector.appendBytes(rowId + i, bytes, 0, bytes.length); + } else { + bytesVector.setNullAt(rowId + i); + } + } + } + + @Override + protected void readBatchFromDictionaryIds( + int rowId, int num, V column, WritableIntVector dictionaryIds) { + WritableBytesVector bytesVector = (WritableBytesVector) column; + for (int i = rowId; i < rowId + num; ++i) { + if (!bytesVector.isNullAt(i)) { + byte[] v = dictionary.decodeToBinary(dictionaryIds.getInt(i)).getBytes(); + bytesVector.appendBytes(i, v, 0, v.length); + } + } + } + + private Binary readDataBinary(int len) { + ByteBuffer buffer = readDataBuffer(len); + if (buffer.hasArray()) { + return Binary.fromConstantByteArray( + buffer.array(), buffer.arrayOffset() + buffer.position(), len); + } else { + byte[] bytes = new byte[len]; + buffer.get(bytes); + return Binary.fromConstantByteArray(bytes); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java new file mode 100644 index 0000000000000..555853bda6bd8 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
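readDataBinary in the reader above has to handle both array-backed and direct ByteBuffers. A self-contained sketch of that copy logic, using a hypothetical helper name:

import java.nio.ByteBuffer;
import java.util.Arrays;

public class FixedLenReadSketch {
  // Copies `len` bytes out of the buffer whether or not it exposes a backing array.
  static byte[] readFixedLen(ByteBuffer buffer, int len) {
    if (buffer.hasArray()) {
      int start = buffer.arrayOffset() + buffer.position();
      return Arrays.copyOfRange(buffer.array(), start, start + len);
    }
    byte[] bytes = new byte[len];
    buffer.get(bytes);
    return bytes;
  }

  public static void main(String[] args) {
    ByteBuffer heap = ByteBuffer.wrap(new byte[] {1, 2, 3, 4});
    System.out.println(Arrays.toString(readFixedLen(heap, 4))); // [1, 2, 3, 4]
  }
}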
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.flink.table.data.vector.writable.WritableTimestampVector; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.time.Instant; +import java.time.temporal.ChronoUnit; + +/** + * Timestamp {@link org.apache.flink.formats.parquet.vector.reader.ColumnReader} that supports INT64 8 bytes, + * TIMESTAMP_MILLIS is the deprecated ConvertedType counterpart of a TIMESTAMP logical type + * that is UTC normalized and has MILLIS precision. + * + *

    See https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp + * TIMESTAMP_MILLIS and TIMESTAMP_MICROS are the deprecated ConvertedType. + */ +public class Int64TimestampColumnReader extends AbstractColumnReader { + + private final boolean utcTimestamp; + + private final ChronoUnit chronoUnit; + + public Int64TimestampColumnReader( + boolean utcTimestamp, + ColumnDescriptor descriptor, + PageReader pageReader, + int precision) throws IOException { + super(descriptor, pageReader); + this.utcTimestamp = utcTimestamp; + if (precision <= 3) { + this.chronoUnit = ChronoUnit.MILLIS; + } else if (precision <= 6) { + this.chronoUnit = ChronoUnit.MICROS; + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type with precision: " + + precision + + ", it only supports precision less than 6."); + } + checkTypeName(PrimitiveType.PrimitiveTypeName.INT64); + } + + @Override + protected boolean supportLazyDecode() { + return false; + } + + @Override + protected void readBatch(int rowId, int num, WritableTimestampVector column) { + for (int i = 0; i < num; i++) { + if (runLenDecoder.readInteger() == maxDefLevel) { + ByteBuffer buffer = readDataBuffer(8); + column.setTimestamp(rowId + i, int64ToTimestamp(utcTimestamp, buffer.getLong(), chronoUnit)); + } else { + column.setNullAt(rowId + i); + } + } + } + + @Override + protected void readBatchFromDictionaryIds( + int rowId, + int num, + WritableTimestampVector column, + WritableIntVector dictionaryIds) { + for (int i = rowId; i < rowId + num; ++i) { + if (!column.isNullAt(i)) { + column.setTimestamp(i, decodeInt64ToTimestamp( + utcTimestamp, dictionary, dictionaryIds.getInt(i), chronoUnit)); + } + } + } + + public static TimestampData decodeInt64ToTimestamp( + boolean utcTimestamp, + org.apache.parquet.column.Dictionary dictionary, + int id, + ChronoUnit unit) { + long value = dictionary.decodeToLong(id); + return int64ToTimestamp(utcTimestamp, value, unit); + } + + private static TimestampData int64ToTimestamp( + boolean utcTimestamp, + long interval, + ChronoUnit unit) { + final Instant instant = Instant.EPOCH.plus(interval, unit); + if (utcTimestamp) { + return TimestampData.fromInstant(instant); + } else { + // this applies the local timezone + return TimestampData.fromTimestamp(Timestamp.from(instant)); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java new file mode 100644 index 0000000000000..015a867c4f22d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
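Int64TimestampColumnReader above derives the time unit from the declared precision (MILLIS for precision up to 3, MICROS up to 6) and adds the raw INT64 value to the epoch. A small sketch of that conversion using only java.time; the input values are arbitrary examples:

import java.time.Instant;
import java.time.temporal.ChronoUnit;

public class Int64TimestampSketch {
  static Instant toInstant(long value, int precision) {
    // precision > 6 is rejected by the reader above before it gets this far
    ChronoUnit unit = precision <= 3 ? ChronoUnit.MILLIS : ChronoUnit.MICROS;
    return Instant.EPOCH.plus(value, unit);
  }

  public static void main(String[] args) {
    System.out.println(toInstant(1_700_000_000_000L, 3));     // 2023-11-14T22:13:20Z
    System.out.println(toInstant(1_700_000_000_000_000L, 6)); // same instant, micros
  }
}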
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; + +import java.io.IOException; + +/** + * Map {@link ColumnReader}. + */ +public class MapColumnReader implements ColumnReader { + + private final LogicalType logicalType; + private final ArrayColumnReader keyReader; + private final ArrayColumnReader valueReader; + + public MapColumnReader( + ArrayColumnReader keyReader, ArrayColumnReader valueReader, LogicalType logicalType) { + this.keyReader = keyReader; + this.valueReader = valueReader; + this.logicalType = logicalType; + } + + public void readBatch(int total, ColumnVector column) throws IOException { + HeapMapColumnVector mapColumnVector = (HeapMapColumnVector) column; + MapType mapType = (MapType) logicalType; + // initialize 2 ListColumnVector for keys and values + HeapArrayVector keyArrayColumnVector = new HeapArrayVector(total); + HeapArrayVector valueArrayColumnVector = new HeapArrayVector(total); + // read the keys and values + keyReader.readToVector(total, keyArrayColumnVector); + valueReader.readToVector(total, valueArrayColumnVector); + + // set the related attributes according to the keys and values + mapColumnVector.setKeys(keyArrayColumnVector.child); + mapColumnVector.setValues(valueArrayColumnVector.child); + mapColumnVector.setOffsets(keyArrayColumnVector.offsets); + mapColumnVector.setLengths(keyArrayColumnVector.lengths); + mapColumnVector.setSize(keyArrayColumnVector.getSize()); + for (int i = 0; i < keyArrayColumnVector.getLen(); i++) { + if (keyArrayColumnVector.isNullAt(i)) { + mapColumnVector.setNullAt(i); + } + } + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + readBatch(readNumber, vector); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java new file mode 100644 index 0000000000000..92f5d1e191771 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
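MapColumnReader above reads keys and values through two ArrayColumnReaders and reuses the key reader's offsets and lengths for both sides. A dependency-free sketch of assembling per-row maps from such parallel flat arrays (data and names are invented):

import java.util.LinkedHashMap;
import java.util.Map;

public class MapFromParallelArraysSketch {
  public static void main(String[] args) {
    String[] keys = {"a", "b", "c"};   // flat keys for all rows
    int[] values = {1, 2, 3};          // flat values for all rows
    long[] offsets = {0, 2};           // per-row start, shared by keys and values
    long[] lengths = {2, 1};           // per-row entry count
    for (int row = 0; row < offsets.length; row++) {
      Map<String, Integer> m = new LinkedHashMap<>();
      for (int j = 0; j < lengths[row]; j++) {
        m.put(keys[(int) offsets[row] + j], values[(int) offsets[row] + j]);
      }
      System.out.println("row " + row + ": " + m); // row 0: {a=1, b=2} / row 1: {c=3}
    }
  }
}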
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.ColumnarRowData; +import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.stream.IntStream; + +import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createColumnReader; +import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createWritableColumnVector; +import static org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.range; +import static org.apache.parquet.hadoop.ParquetFileReader.readFooter; +import static org.apache.parquet.hadoop.ParquetInputFormat.getFilter; + +/** + * This reader is used to read a {@link VectorizedColumnBatch} from input split. + * + *
<p>
    Note: Reference Flink release 1.11.2 + * {@code org.apache.flink.formats.parquet.vector.ParquetColumnarRowSplitReader} + * because it is package scope. + */ +public class ParquetColumnarRowSplitReader implements Closeable { + + private final boolean utcTimestamp; + + private final MessageType fileSchema; + + private final LogicalType[] requestedTypes; + + private final MessageType requestedSchema; + + /** + * The total number of rows this RecordReader will eventually read. The sum of the rows of all + * the row groups. + */ + private final long totalRowCount; + + private final WritableColumnVector[] writableVectors; + + private final VectorizedColumnBatch columnarBatch; + + private final ColumnarRowData row; + + private final int batchSize; + + private ParquetFileReader reader; + + /** + * For each request column, the reader to read this column. This is NULL if this column is + * missing from the file, in which case we populate the attribute with NULL. + */ + private ColumnReader[] columnReaders; + + /** + * The number of rows that have been returned. + */ + private long rowsReturned; + + /** + * The number of rows that have been reading, including the current in flight row group. + */ + private long totalCountLoadedSoFar; + + // the index of the next row to return + private int nextRow; + + // the number of rows in the current batch + private int rowsInBatch; + + public ParquetColumnarRowSplitReader( + boolean utcTimestamp, + boolean caseSensitive, + Configuration conf, + LogicalType[] selectedTypes, + String[] selectedFieldNames, + ColumnBatchGenerator generator, + int batchSize, + Path path, + long splitStart, + long splitLength) throws IOException { + this.utcTimestamp = utcTimestamp; + this.batchSize = batchSize; + // then we need to apply the predicate push down filter + ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength)); + MessageType fileSchema = footer.getFileMetaData().getSchema(); + FilterCompat.Filter filter = getFilter(conf); + List blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); + + this.fileSchema = footer.getFileMetaData().getSchema(); + + Type[] types = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive); + int[] requestedIndices = IntStream.range(0, types.length).filter(i -> types[i] != null).toArray(); + Type[] readTypes = Arrays.stream(requestedIndices).mapToObj(i -> types[i]).toArray(Type[]::new); + + this.requestedTypes = Arrays.stream(requestedIndices).mapToObj(i -> selectedTypes[i]).toArray(LogicalType[]::new); + this.requestedSchema = Types.buildMessage().addFields(readTypes).named("flink-parquet"); + this.reader = new ParquetFileReader( + conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns()); + + long totalRowCount = 0; + for (BlockMetaData block : blocks) { + totalRowCount += block.getRowCount(); + } + this.totalRowCount = totalRowCount; + this.nextRow = 0; + this.rowsInBatch = 0; + this.rowsReturned = 0; + + checkSchema(); + + this.writableVectors = createWritableVectors(); + ColumnVector[] columnVectors = patchedVector(selectedFieldNames.length, createReadableVectors(), requestedIndices); + this.columnarBatch = generator.generate(columnVectors); + this.row = new ColumnarRowData(columnarBatch); + } + + /** + * Patches the given vectors with nulls. + * The vector position that is not requested (or read from file) is patched as null. 
+ * + * @param fields The total selected fields number + * @param vectors The readable vectors + * @param indices The requested indices from the selected fields + */ + private static ColumnVector[] patchedVector(int fields, ColumnVector[] vectors, int[] indices) { + ColumnVector[] patched = new ColumnVector[fields]; + for (int i = 0; i < indices.length; i++) { + patched[indices[i]] = vectors[i]; + } + return patched; + } + + /** + * Clips `parquetSchema` according to `fieldNames`. + */ + private static Type[] clipParquetSchema( + GroupType parquetSchema, String[] fieldNames, boolean caseSensitive) { + Type[] types = new Type[fieldNames.length]; + if (caseSensitive) { + for (int i = 0; i < fieldNames.length; ++i) { + String fieldName = fieldNames[i]; + types[i] = parquetSchema.containsField(fieldName) ? parquetSchema.getType(fieldName) : null; + } + } else { + Map caseInsensitiveFieldMap = new HashMap<>(); + for (Type type : parquetSchema.getFields()) { + caseInsensitiveFieldMap.compute(type.getName().toLowerCase(Locale.ROOT), + (key, previousType) -> { + if (previousType != null) { + throw new FlinkRuntimeException( + "Parquet with case insensitive mode should have no duplicate key: " + key); + } + return type; + }); + } + for (int i = 0; i < fieldNames.length; ++i) { + Type type = caseInsensitiveFieldMap.get(fieldNames[i].toLowerCase(Locale.ROOT)); + // TODO clip for array,map,row types. + types[i] = type; + } + } + + return types; + } + + private WritableColumnVector[] createWritableVectors() { + WritableColumnVector[] columns = new WritableColumnVector[requestedTypes.length]; + List types = requestedSchema.getFields(); + List descriptors = requestedSchema.getColumns(); + for (int i = 0; i < requestedTypes.length; i++) { + columns[i] = createWritableColumnVector( + batchSize, + requestedTypes[i], + types.get(i), + descriptors); + } + return columns; + } + + /** + * Create readable vectors from writable vectors. + * Especially for decimal, see {@link org.apache.flink.formats.parquet.vector.ParquetDecimalVector}. + */ + private ColumnVector[] createReadableVectors() { + ColumnVector[] vectors = new ColumnVector[writableVectors.length]; + for (int i = 0; i < writableVectors.length; i++) { + vectors[i] = requestedTypes[i].getTypeRoot() == LogicalTypeRoot.DECIMAL + ? new ParquetDecimalVector(writableVectors[i]) + : writableVectors[i]; + } + return vectors; + } + + private void checkSchema() throws IOException, UnsupportedOperationException { + /* + * Check that the requested schema is supported. + */ + for (int i = 0; i < requestedSchema.getFieldCount(); ++i) { + String[] colPath = requestedSchema.getPaths().get(i); + if (fileSchema.containsPath(colPath)) { + ColumnDescriptor fd = fileSchema.getColumnDescription(colPath); + if (!fd.equals(requestedSchema.getColumns().get(i))) { + throw new UnsupportedOperationException("Schema evolution not supported."); + } + } else { + if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) { + // Column is missing in data but the required data is non-nullable. This file is invalid. + throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath)); + } + } + } + } + + /** + * Method used to check if the end of the input is reached. + * + * @return True if the end is reached, otherwise false. + * @throws IOException Thrown, if an I/O error occurred. 
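// Worked example (not part of this patch) of clipParquetSchema + patchedVector above,
// with hypothetical field names; assume the file only contains "id" and "name":
//   selectedFieldNames = {"id", "name", "extra"}
//   clipped types      = {idType, nameType, null}     // "extra" is missing from the file
//   requestedIndices   = {0, 1}                       // only existing columns get readers
//   patched vectors    = {idVector, nameVector, null} // the missing column is read as null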
+ */ + public boolean reachedEnd() throws IOException { + return !ensureBatch(); + } + + public ColumnarRowData nextRecord() { + // return the next row + row.setRowId(this.nextRow++); + return row; + } + + /** + * Checks if there is at least one row left in the batch to return. If no more row are + * available, it reads another batch of rows. + * + * @return Returns true if there is one more row to return, false otherwise. + * @throws IOException throw if an exception happens while reading a batch. + */ + private boolean ensureBatch() throws IOException { + if (nextRow >= rowsInBatch) { + // No more rows available in the Rows array. + nextRow = 0; + // Try to read the next batch if rows from the file. + return nextBatch(); + } + // there is at least one Row left in the Rows array. + return true; + } + + /** + * Advances to the next batch of rows. Returns false if there are no more. + */ + private boolean nextBatch() throws IOException { + for (WritableColumnVector v : writableVectors) { + v.reset(); + } + columnarBatch.setNumRows(0); + if (rowsReturned >= totalRowCount) { + return false; + } + if (rowsReturned == totalCountLoadedSoFar) { + readNextRowGroup(); + } + + int num = (int) Math.min(batchSize, totalCountLoadedSoFar - rowsReturned); + for (int i = 0; i < columnReaders.length; ++i) { + //noinspection unchecked + columnReaders[i].readToVector(num, writableVectors[i]); + } + rowsReturned += num; + columnarBatch.setNumRows(num); + rowsInBatch = num; + return true; + } + + private void readNextRowGroup() throws IOException { + PageReadStore pages = reader.readNextRowGroup(); + if (pages == null) { + throw new IOException("expecting more rows but reached last block. Read " + + rowsReturned + " out of " + totalRowCount); + } + List types = requestedSchema.getFields(); + List columns = requestedSchema.getColumns(); + columnReaders = new ColumnReader[types.size()]; + for (int i = 0; i < types.size(); ++i) { + columnReaders[i] = createColumnReader( + utcTimestamp, + requestedTypes[i], + types.get(i), + columns, + pages); + } + totalCountLoadedSoFar += pages.getRowCount(); + } + + /** + * Seek to a particular row number. + */ + public void seekToRow(long rowCount) throws IOException { + if (totalCountLoadedSoFar != 0) { + throw new UnsupportedOperationException("Only support seek at first."); + } + + List blockMetaData = reader.getRowGroups(); + + for (BlockMetaData metaData : blockMetaData) { + if (metaData.getRowCount() > rowCount) { + break; + } else { + reader.skipNextRowGroup(); + rowsReturned += metaData.getRowCount(); + totalCountLoadedSoFar += metaData.getRowCount(); + rowsInBatch = (int) metaData.getRowCount(); + nextRow = (int) metaData.getRowCount(); + rowCount -= metaData.getRowCount(); + } + } + for (int i = 0; i < rowCount; i++) { + boolean end = reachedEnd(); + if (end) { + throw new RuntimeException("Seek to many rows."); + } + nextRecord(); + } + } + + @Override + public void close() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + } + + /** + * Interface to gen {@link VectorizedColumnBatch}. 
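// Hedged usage sketch (not part of this patch) of how a caller drives the reader above;
// only methods defined in this class are used, the wrapper name is hypothetical.
class SplitReaderUsageSketch {
  static long countRows(ParquetColumnarRowSplitReader reader) throws java.io.IOException {
    long count = 0;
    try {
      while (!reader.reachedEnd()) {
        reader.nextRecord(); // returns the reused ColumnarRowData positioned on the next row
        count++;
      }
    } finally {
      reader.close();
    }
    return count;
  }
}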
+ */ + public interface ColumnBatchGenerator { + VectorizedColumnBatch generate(ColumnVector[] readVectors); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java new file mode 100644 index 0000000000000..e96cf22d29ef1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.Dictionary; + +import java.io.IOException; + +/** + * The interface to wrap the underlying Parquet dictionary and non dictionary encoded page reader. + */ +public interface ParquetDataColumnReader { + + /** + * Initialize the reader by page data. 
+ * + * @param valueCount value count + * @param in page data + * @throws IOException + */ + void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException; + + /** + * @return the next Dictionary ID from the page + */ + int readValueDictionaryId(); + + /** + * @return the next Long from the page + */ + long readLong(); + + /** + * @return the next Integer from the page + */ + int readInteger(); + + /** + * @return the next SmallInt from the page + */ + int readSmallInt(); + + /** + * @return the next TinyInt from the page + */ + int readTinyInt(); + + /** + * @return the next Float from the page + */ + float readFloat(); + + /** + * @return the next Boolean from the page + */ + boolean readBoolean(); + + /** + * @return the next String from the page + */ + byte[] readString(); + + /** + * @return the next Varchar from the page + */ + byte[] readVarchar(); + + /** + * @return the next Char from the page + */ + byte[] readChar(); + + /** + * @return the next Bytes from the page + */ + byte[] readBytes(); + + /** + * @return the next Decimal from the page + */ + byte[] readDecimal(); + + /** + * @return the next Double from the page + */ + double readDouble(); + + /** + * @return the next TimestampData from the page + */ + TimestampData readTimestamp(); + + /** + * @return is data valid + */ + boolean isValid(); + + /** + * @return the underlying dictionary if current reader is dictionary encoded + */ + Dictionary getDictionary(); + + /** + * @param id in dictionary + * @return the Bytes from the dictionary by id + */ + byte[] readBytes(int id); + + /** + * @param id in dictionary + * @return the Float from the dictionary by id + */ + float readFloat(int id); + + /** + * @param id in dictionary + * @return the Double from the dictionary by id + */ + double readDouble(int id); + + /** + * @param id in dictionary + * @return the Integer from the dictionary by id + */ + int readInteger(int id); + + /** + * @param id in dictionary + * @return the Long from the dictionary by id + */ + long readLong(int id); + + /** + * @param id in dictionary + * @return the Small Int from the dictionary by id + */ + int readSmallInt(int id); + + /** + * @param id in dictionary + * @return the tiny int from the dictionary by id + */ + int readTinyInt(int id); + + /** + * @param id in dictionary + * @return the Boolean from the dictionary by id + */ + boolean readBoolean(int id); + + /** + * @param id in dictionary + * @return the Decimal from the dictionary by id + */ + byte[] readDecimal(int id); + + /** + * @param id in dictionary + * @return the TimestampData from the dictionary by id + */ + TimestampData readTimestamp(int id); + + /** + * @param id in dictionary + * @return the String from the dictionary by id + */ + byte[] readString(int id); + + /** + * @param id in dictionary + * @return the Varchar from the dictionary by id + */ + byte[] readVarchar(int id); + + /** + * @param id in dictionary + * @return the Char from the dictionary by id + */ + byte[] readChar(int id); +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java new file mode 100644 index 0000000000000..861d5cb00bbe7 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java @@ -0,0 +1,304 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.sql.Timestamp; + +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.JULIAN_EPOCH_OFFSET_DAYS; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.MILLIS_IN_DAY; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_MILLISECOND; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_SECOND; + +/** + * Parquet file has self-describing schema which may differ from the user required schema (e.g. + * schema evolution). This factory is used to retrieve user required typed data via corresponding + * reader which reads the underlying data. + */ +public final class ParquetDataColumnReaderFactory { + + private ParquetDataColumnReaderFactory() { + } + + /** + * default reader for {@link ParquetDataColumnReader}. + */ + public static class DefaultParquetDataColumnReader implements ParquetDataColumnReader { + protected ValuesReader valuesReader; + protected Dictionary dict; + + // After the data is read in the parquet type, isValid will be set to true if the data can + // be returned in the type defined in HMS. Otherwise isValid is set to false. 
+ boolean isValid = true; + + public DefaultParquetDataColumnReader(ValuesReader valuesReader) { + this.valuesReader = valuesReader; + } + + public DefaultParquetDataColumnReader(Dictionary dict) { + this.dict = dict; + } + + @Override + public void initFromPage(int i, ByteBufferInputStream in) throws IOException { + valuesReader.initFromPage(i, in); + } + + @Override + public boolean readBoolean() { + return valuesReader.readBoolean(); + } + + @Override + public boolean readBoolean(int id) { + return dict.decodeToBoolean(id); + } + + @Override + public byte[] readString(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readString() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar() { + // we need to enforce the size here even the types are the same + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readChar() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readChar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readBytes() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readBytes(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readDecimal() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readDecimal(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public float readFloat() { + return valuesReader.readFloat(); + } + + @Override + public float readFloat(int id) { + return dict.decodeToFloat(id); + } + + @Override + public double readDouble() { + return valuesReader.readDouble(); + } + + @Override + public double readDouble(int id) { + return dict.decodeToDouble(id); + } + + @Override + public TimestampData readTimestamp() { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public TimestampData readTimestamp(int id) { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public int readInteger() { + return valuesReader.readInteger(); + } + + @Override + public int readInteger(int id) { + return dict.decodeToInt(id); + } + + @Override + public boolean isValid() { + return isValid; + } + + @Override + public long readLong(int id) { + return dict.decodeToLong(id); + } + + @Override + public long readLong() { + return valuesReader.readLong(); + } + + @Override + public int readSmallInt() { + return valuesReader.readInteger(); + } + + @Override + public int readSmallInt(int id) { + return dict.decodeToInt(id); + } + + @Override + public int readTinyInt() { + return valuesReader.readInteger(); + } + + @Override + public int readTinyInt(int id) { + return dict.decodeToInt(id); + } + + @Override + public int readValueDictionaryId() { + return valuesReader.readValueDictionaryId(); + } + + public void skip() { + valuesReader.skip(); + } + + @Override + public Dictionary getDictionary() { + return dict; + } + } + + /** + * The reader who reads from the underlying Timestamp value value. 
+ */ + public static class TypesFromInt96PageReader extends DefaultParquetDataColumnReader { + private final boolean isUtcTimestamp; + + public TypesFromInt96PageReader(ValuesReader realReader, boolean isUtcTimestamp) { + super(realReader); + this.isUtcTimestamp = isUtcTimestamp; + } + + public TypesFromInt96PageReader(Dictionary dict, boolean isUtcTimestamp) { + super(dict); + this.isUtcTimestamp = isUtcTimestamp; + } + + private TimestampData convert(Binary binary) { + ByteBuffer buf = binary.toByteBuffer(); + buf.order(ByteOrder.LITTLE_ENDIAN); + long timeOfDayNanos = buf.getLong(); + int julianDay = buf.getInt(); + return int96ToTimestamp(isUtcTimestamp, timeOfDayNanos, julianDay); + } + + @Override + public TimestampData readTimestamp(int id) { + return convert(dict.decodeToBinary(id)); + } + + @Override + public TimestampData readTimestamp() { + return convert(valuesReader.readBytes()); + } + } + + private static ParquetDataColumnReader getDataColumnReaderByTypeHelper( + boolean isDictionary, + PrimitiveType parquetType, + Dictionary dictionary, + ValuesReader valuesReader, + boolean isUtcTimestamp) { + if (parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) { + return isDictionary + ? new TypesFromInt96PageReader(dictionary, isUtcTimestamp) + : new TypesFromInt96PageReader(valuesReader, isUtcTimestamp); + } else { + return isDictionary + ? new DefaultParquetDataColumnReader(dictionary) + : new DefaultParquetDataColumnReader(valuesReader); + } + } + + public static ParquetDataColumnReader getDataColumnReaderByTypeOnDictionary( + PrimitiveType parquetType, Dictionary realReader, boolean isUtcTimestamp) { + return getDataColumnReaderByTypeHelper(true, parquetType, realReader, null, isUtcTimestamp); + } + + public static ParquetDataColumnReader getDataColumnReaderByType( + PrimitiveType parquetType, ValuesReader realReader, boolean isUtcTimestamp) { + return getDataColumnReaderByTypeHelper( + false, parquetType, null, realReader, isUtcTimestamp); + } + + private static TimestampData int96ToTimestamp( + boolean utcTimestamp, long nanosOfDay, int julianDay) { + long millisecond = julianDayToMillis(julianDay) + (nanosOfDay / NANOS_PER_MILLISECOND); + + if (utcTimestamp) { + int nanoOfMillisecond = (int) (nanosOfDay % NANOS_PER_MILLISECOND); + return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + } else { + Timestamp timestamp = new Timestamp(millisecond); + timestamp.setNanos((int) (nanosOfDay % NANOS_PER_SECOND)); + return TimestampData.fromTimestamp(timestamp); + } + } + + private static long julianDayToMillis(int julianDay) { + return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY; + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java new file mode 100644 index 0000000000000..524c00f402d47 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
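// Worked sketch (not part of this patch) of the INT96 conversion in int96ToTimestamp /
// julianDayToMillis above. Assumption: the constants mirror Flink's TimestampColumnReader,
// i.e. the Julian day number of 1970-01-01 is 2_440_588 and a day is 86_400_000 ms.
import java.time.Instant;

class Int96Sketch {
  static final long JULIAN_EPOCH_OFFSET_DAYS = 2_440_588L; // assumed value
  static final long MILLIS_IN_DAY = 86_400_000L;
  static final long NANOS_PER_MILLISECOND = 1_000_000L;

  static Instant toInstant(int julianDay, long nanosOfDay) {
    long millis = (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY + nanosOfDay / NANOS_PER_MILLISECOND;
    // e.g. toInstant(2_440_589, 0L) -> 1970-01-02T00:00:00Z
    return Instant.ofEpochMilli(millis);
  }
}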
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.vector.writable.WritableColumnVector; + +import java.io.IOException; +import java.util.List; + +/** + * Row {@link ColumnReader}. + */ +public class RowColumnReader implements ColumnReader { + + private final List fieldReaders; + + public RowColumnReader(List fieldReaders) { + this.fieldReaders = fieldReaders; + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + HeapRowColumnVector rowColumnVector = (HeapRowColumnVector) vector; + WritableColumnVector[] vectors = rowColumnVector.vectors; + // row vector null array + boolean[] isNulls = new boolean[readNumber]; + for (int i = 0; i < vectors.length; i++) { + fieldReaders.get(i).readToVector(readNumber, vectors[i]); + + for (int j = 0; j < readNumber; j++) { + if (i == 0) { + isNulls[j] = vectors[i].isNullAt(j); + } else { + isNulls[j] = isNulls[j] && vectors[i].isNullAt(j); + } + if (i == vectors.length - 1 && isNulls[j]) { + // rowColumnVector[j] is null only when all fields[j] of rowColumnVector[j] is + // null + rowColumnVector.setNullAt(j); + } + } + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java new file mode 100644 index 0000000000000..3266f835e4d1c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
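// Worked example (not part of this patch) of the row-level null rule in RowColumnReader
// above, with illustrative values: a row position is null only when every field is null there.
//   field0 isNull = {true, true,  false}
//   field1 isNull = {true, false, false}
//   row    isNull = {true, false, false}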
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.vector.writable.WritableIntVector; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.values.bitpacking.BytePacker; +import org.apache.parquet.column.values.bitpacking.Packer; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.io.ParquetDecodingException; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Run length decoder for data and dictionary ids. + * See https://github.com/apache/parquet-format/blob/master/Encodings.md + * See {@link RunLengthBitPackingHybridDecoder}. + * + *
<p>
    Note: Reference Flink release 1.11.2 + * {@code org.apache.flink.formats.parquet.vector.reader.RunLengthDecoder} + * because it is package scope. + */ +final class RunLengthDecoder { + + /** + * If true, the bit width is fixed. This decoder is used in different places and this also + * controls if we need to read the bitwidth from the beginning of the data stream. + */ + private final boolean fixedWidth; + private final boolean readLength; + + // Encoded data. + private ByteBufferInputStream in; + + // bit/byte width of decoded data and utility to batch unpack them. + private int bitWidth; + private int bytesWidth; + private BytePacker packer; + + // Current decoding mode and values + MODE mode; + int currentCount; + int currentValue; + + // Buffer of decoded values if the values are PACKED. + int[] currentBuffer = new int[16]; + int currentBufferIdx = 0; + + RunLengthDecoder() { + this.fixedWidth = false; + this.readLength = false; + } + + RunLengthDecoder(int bitWidth) { + this.fixedWidth = true; + this.readLength = bitWidth != 0; + initWidthAndPacker(bitWidth); + } + + RunLengthDecoder(int bitWidth, boolean readLength) { + this.fixedWidth = true; + this.readLength = readLength; + initWidthAndPacker(bitWidth); + } + + /** + * Init from input stream. + */ + void initFromStream(int valueCount, ByteBufferInputStream in) throws IOException { + this.in = in; + if (fixedWidth) { + // initialize for repetition and definition levels + if (readLength) { + int length = readIntLittleEndian(); + this.in = in.sliceStream(length); + } + } else { + // initialize for values + if (in.available() > 0) { + initWidthAndPacker(in.read()); + } + } + if (bitWidth == 0) { + // 0 bit width, treat this as an RLE run of valueCount number of 0's. + this.mode = MODE.RLE; + this.currentCount = valueCount; + this.currentValue = 0; + } else { + this.currentCount = 0; + } + } + + /** + * Initializes the internal state for decoding ints of `bitWidth`. + */ + private void initWidthAndPacker(int bitWidth) { + Preconditions.checkArgument(bitWidth >= 0 && bitWidth <= 32, "bitWidth must be >= 0 and <= 32"); + this.bitWidth = bitWidth; + this.bytesWidth = BytesUtils.paddedByteCountFromBits(bitWidth); + this.packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth); + } + + int readInteger() { + if (this.currentCount == 0) { + this.readNextGroup(); + } + + this.currentCount--; + switch (mode) { + case RLE: + return this.currentValue; + case PACKED: + return this.currentBuffer[currentBufferIdx++]; + default: + throw new AssertionError(); + } + } + + /** + * Decoding for dictionary ids. The IDs are populated into `values` and the nullability is + * populated into `nulls`. + */ + void readDictionaryIds( + int total, + WritableIntVector values, + WritableColumnVector nulls, + int rowId, + int level, + RunLengthDecoder data) { + int left = total; + while (left > 0) { + if (this.currentCount == 0) { + this.readNextGroup(); + } + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + if (currentValue == level) { + data.readDictionaryIdData(n, values, rowId); + } else { + nulls.setNulls(rowId, n); + } + break; + case PACKED: + for (int i = 0; i < n; ++i) { + if (currentBuffer[currentBufferIdx++] == level) { + values.setInt(rowId + i, data.readInteger()); + } else { + nulls.setNullAt(rowId + i); + } + } + break; + default: + throw new AssertionError(); + } + rowId += n; + left -= n; + currentCount -= n; + } + } + + /** + * It is used to decode dictionary IDs. 
+ */ + private void readDictionaryIdData(int total, WritableIntVector c, int rowId) { + int left = total; + while (left > 0) { + if (this.currentCount == 0) { + this.readNextGroup(); + } + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + c.setInts(rowId, n, currentValue); + break; + case PACKED: + c.setInts(rowId, n, currentBuffer, currentBufferIdx); + currentBufferIdx += n; + break; + default: + throw new AssertionError(); + } + rowId += n; + left -= n; + currentCount -= n; + } + } + + /** + * Reads the next varint encoded int. + */ + private int readUnsignedVarInt() throws IOException { + int value = 0; + int shift = 0; + int b; + do { + b = in.read(); + value |= (b & 0x7F) << shift; + shift += 7; + } while ((b & 0x80) != 0); + return value; + } + + /** + * Reads the next 4 byte little endian int. + */ + private int readIntLittleEndian() throws IOException { + int ch4 = in.read(); + int ch3 = in.read(); + int ch2 = in.read(); + int ch1 = in.read(); + return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + ch4); + } + + /** + * Reads the next byteWidth little endian int. + */ + private int readIntLittleEndianPaddedOnBitWidth() throws IOException { + switch (bytesWidth) { + case 0: + return 0; + case 1: + return in.read(); + case 2: { + int ch2 = in.read(); + int ch1 = in.read(); + return (ch1 << 8) + ch2; + } + case 3: { + int ch3 = in.read(); + int ch2 = in.read(); + int ch1 = in.read(); + return (ch1 << 16) + (ch2 << 8) + ch3; + } + case 4: { + return readIntLittleEndian(); + } + default: + throw new RuntimeException("Unreachable"); + } + } + + /** + * Reads the next group. + */ + void readNextGroup() { + try { + int header = readUnsignedVarInt(); + this.mode = (header & 1) == 0 ? MODE.RLE : MODE.PACKED; + switch (mode) { + case RLE: + this.currentCount = header >>> 1; + this.currentValue = readIntLittleEndianPaddedOnBitWidth(); + return; + case PACKED: + int numGroups = header >>> 1; + this.currentCount = numGroups * 8; + + if (this.currentBuffer.length < this.currentCount) { + this.currentBuffer = new int[this.currentCount]; + } + currentBufferIdx = 0; + int valueIndex = 0; + while (valueIndex < this.currentCount) { + // values are bit packed 8 at a time, so reading bitWidth will always work + ByteBuffer buffer = in.slice(bitWidth); + this.packer.unpack8Values(buffer, buffer.position(), this.currentBuffer, valueIndex); + valueIndex += 8; + } + return; + default: + throw new ParquetDecodingException("not a valid mode " + this.mode); + } + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read from input stream", e); + } + } + + enum MODE { + RLE, + PACKED + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java new file mode 100644 index 0000000000000..c0d83e6096e3c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
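// Small sketch (not part of this patch) of the hybrid run header handled by readNextGroup()
// above: the low bit selects the mode, the remaining bits carry the run length (RLE) or the
// number of 8-value bit-packed groups (PACKED). Header values are illustrative.
class RunHeaderSketch {
  static String describe(int header) {
    return (header & 1) == 0
        ? "RLE run of " + (header >>> 1) + " repeated values"
        : "bit-packed run of " + ((header >>> 1) * 8) + " values";
  }
  // describe(20) -> "RLE run of 10 repeated values"
  // describe(7)  -> "bit-packed run of 24 values"
}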
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.runtime.watermarkstatus.WatermarkStatus; + +/** + * Adapter clazz for {@link Output}. + */ +public interface OutputAdapter extends Output { + @Override + default void emitWatermarkStatus(WatermarkStatus watermarkStatus) { + // no operation + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java new file mode 100644 index 0000000000000..1f76ad692f33f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.runtime.state.StateInitializationContext; + +import java.util.OptionalLong; + +/** + * Adapter clazz for {@link StateInitializationContext}. + */ +public interface StateInitializationContextAdapter extends StateInitializationContext { + @Override + default OptionalLong getRestoredCheckpointId() { + return OptionalLong.empty(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java new file mode 100644 index 0000000000000..4461c28943d3a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.api.common.accumulators.Accumulator; +import org.apache.flink.metrics.groups.OperatorMetricGroup; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.execution.Environment; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; + +import java.util.Map; + +/** + * Adapter clazz for {@link StreamingRuntimeContext}. + */ +public class StreamingRuntimeContextAdapter extends StreamingRuntimeContext { + + public StreamingRuntimeContextAdapter(AbstractStreamOperator operator, Environment env, + Map> accumulators) { + super(operator, env, accumulators); + } + + @Override + public OperatorMetricGroup getMetricGroup() { + return UnregisteredMetricsGroup.createOperatorMetricGroup(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java new file mode 100644 index 0000000000000..e65437609a21e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.14.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; + +/** + * TableEnv for test goals. + */ +public class TestTableEnvs { + + public static TableEnvironment getBatchTableEnv() { + Configuration conf = new Configuration(); + // for batch upsert use cases: current suggestion is to disable these 2 options, + // from 1.14, flink runtime execution mode has switched from streaming + // to batch for batch execution mode(before that, both streaming and batch use streaming execution mode), + // current batch execution mode has these limitations: + // + // 1. the keyed stream default to always sort the inputs by key; + // 2. 
the batch state-backend requires the inputs sort by state key + // + // For our hudi batch pipeline upsert case, we rely on the consuming sequence for index records and data records, + // the index records must be loaded first before data records for BucketAssignFunction to keep upsert semantics correct, + // so we suggest disabling these 2 options to use streaming state-backend for batch execution mode + // to keep the strategy before 1.14. + conf.setBoolean("execution.sorted-inputs.enabled", false); + conf.setBoolean("execution.batch-state-backend.enabled", false); + StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(conf); + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + return StreamTableEnvironment.create(execEnv, settings); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/pom.xml b/hudi-flink-datasource/hudi-flink1.15.x/pom.xml new file mode 100644 index 0000000000000..62bb08106422b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/pom.xml @@ -0,0 +1,195 @@ + + + + + hudi-flink-datasource + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-flink1.15.x + 0.12.2-dt-SNAPSHOT + jar + + + ${project.parent.parent.basedir} + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + org.apache.logging.log4j + log4j-slf4j-impl + + + org.slf4j + slf4j-api + + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + provided + + + org.apache.flink + flink-table-api-java + ${flink1.15.version} + provided + + + org.apache.flink + flink-table-api-java-bridge + ${flink1.15.version} + provided + + + org.apache.flink + flink-shaded-guava + 30.1.1-jre-14.0 + provided + + + org.apache.flink + flink-core + ${flink1.15.version} + provided + + + org.apache.flink + flink-streaming-java + ${flink1.15.version} + provided + + + org.apache.flink + flink-table-runtime + ${flink1.15.version} + provided + + + org.apache.flink + flink-parquet + ${flink1.15.version} + provided + + + org.apache.flink + flink-json + ${flink1.15.version} + provided + + + org.apache.flink + flink-runtime + ${flink1.15.version} + test + test-jar + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + org.junit.platform + junit-platform-runner + test + + + org.junit.platform + junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java new file mode 100644 index 0000000000000..d4c6bc3a8f4da --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorAdapter.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; + +/** + * Adapter clazz for {@code AbstractStreamOperator}. + */ +public abstract class AbstractStreamOperatorAdapter extends AbstractStreamOperator { +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java new file mode 100644 index 0000000000000..6dcfe71ccfd9d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/AbstractStreamOperatorFactoryAdapter.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; + +/** + * Adapter clazz for {@link AbstractStreamOperatorFactory}. + */ +public abstract class AbstractStreamOperatorFactoryAdapter + extends AbstractStreamOperatorFactory implements YieldingOperatorFactory { + + public MailboxExecutorAdapter getMailboxExecutorAdapter() { + return new MailboxExecutorAdapter(getMailboxExecutor()); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java new file mode 100644 index 0000000000000..a6b5439ea1ffd --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/DataStreamScanProviderAdapter.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.data.RowData; + +/** + * Adapter clazz for {@code DataStreamScanProvider}. + */ +public interface DataStreamScanProviderAdapter extends DataStreamScanProvider { + default DataStream produceDataStream(ProviderContext providerContext, StreamExecutionEnvironment streamExecutionEnvironment) { + return produceDataStream(streamExecutionEnvironment); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java new file mode 100644 index 0000000000000..349f60f30acfe --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/DataStreamSinkProviderAdapter.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.sink.DataStreamSinkProvider; +import org.apache.flink.table.data.RowData; + +/** + * Adapter clazz for {@code DataStreamSinkProvider}. 
+ */ +public interface DataStreamSinkProviderAdapter extends DataStreamSinkProvider { + DataStreamSink consumeDataStream(DataStream dataStream); + + @Override + default DataStreamSink consumeDataStream(ProviderContext providerContext, DataStream dataStream) { + return consumeDataStream(dataStream); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java new file mode 100644 index 0000000000000..0c836f3db391b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/MailboxExecutorAdapter.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.api.common.operators.MailboxExecutor; +import org.apache.flink.util.function.ThrowingRunnable; + +/** + * Adapter clazz for {@link MailboxExecutor}. + */ +public class MailboxExecutorAdapter { + private final MailboxExecutor executor; + + public MailboxExecutorAdapter(MailboxExecutor executor) { + this.executor = executor; + } + + public void execute(ThrowingRunnable command, String description) { + this.executor.execute(command, description); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java new file mode 100644 index 0000000000000..865c0c81d4d9d --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/RateLimiterAdapter.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.shaded.guava30.com.google.common.util.concurrent.RateLimiter; + +/** + * Bridge class for shaded guava clazz {@code RateLimiter}. 
+ */ +public class RateLimiterAdapter { + private final RateLimiter rateLimiter; + + private RateLimiterAdapter(double permitsPerSecond) { + this.rateLimiter = RateLimiter.create(permitsPerSecond); + } + + public static RateLimiterAdapter create(double permitsPerSecond) { + return new RateLimiterAdapter(permitsPerSecond); + } + + public void acquire() { + this.rateLimiter.acquire(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/Utils.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/Utils.java new file mode 100644 index 0000000000000..ca2357ce49301 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/adapter/Utils.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.TimeCharacteristic; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.api.operators.StreamSourceContexts; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.factories.FactoryUtil; + +import java.util.Collections; + +/** + * Adapter utils. 
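As an illustration of the RateLimiterAdapter bridge above, a small hedged sketch that throttles record emission; the rate, bufferedRows and writeRecord(...) are made-up placeholders.

// Sketch: cap writes at roughly 1000 records per second (example rate).
RateLimiterAdapter limiter = RateLimiterAdapter.create(1000d);
for (RowData row : bufferedRows) {   // bufferedRows: hypothetical Iterable<RowData>
  limiter.acquire();                 // blocks until the next permit is available
  writeRecord(row);                  // hypothetical downstream write
}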
+ */ +public class Utils { + public static SourceFunction.SourceContext getSourceContext( + TimeCharacteristic timeCharacteristic, + ProcessingTimeService processingTimeService, + StreamTask streamTask, + Output> output, + long watermarkInterval) { + return StreamSourceContexts.getSourceContext( + timeCharacteristic, + processingTimeService, + new Object(), // no actual locking needed + output, + watermarkInterval, + -1, + true); + } + + public static FactoryUtil.DefaultDynamicTableContext getTableContext( + ObjectIdentifier tablePath, + ResolvedCatalogTable catalogTable, + ReadableConfig conf) { + return new FactoryUtil.DefaultDynamicTableContext(tablePath, catalogTable, + Collections.emptyMap(), conf, Thread.currentThread().getContextClassLoader(), false); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java new file mode 100644 index 0000000000000..5eeb42514a2cc --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java @@ -0,0 +1,527 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
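A minimal sketch of the Utils.getTableContext(...) shim above, which hides the Flink 1.15 constructor of FactoryUtil.DefaultDynamicTableContext (the extra class loader and isTemporary arguments). The catalog path, the ResolvedCatalogTable and the options map are assumed to be available from the surrounding catalog code.

// Sketch only: resolvedTable and tableOptions are assumed inputs.
FactoryUtil.DefaultDynamicTableContext context = Utils.getTableContext(
    ObjectIdentifier.of("hudi_catalog", "default_db", "example_table"),
    resolvedTable,                                                    // a ResolvedCatalogTable
    org.apache.flink.configuration.Configuration.fromMap(tableOptions)); // Map<String, String> of table options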
+ */ + +package org.apache.hudi.table.format.cow; + +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector; +import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector; +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; +import org.apache.hudi.table.format.cow.vector.reader.ArrayColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.FixedLenBytesColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.Int64TimestampColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.MapColumnReader; +import org.apache.hudi.table.format.cow.vector.reader.ParquetColumnarRowSplitReader; +import org.apache.hudi.table.format.cow.vector.reader.RowColumnReader; + +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.vector.reader.BooleanColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ByteColumnReader; +import org.apache.flink.formats.parquet.vector.reader.BytesColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.formats.parquet.vector.reader.DoubleColumnReader; +import org.apache.flink.formats.parquet.vector.reader.FloatColumnReader; +import org.apache.flink.formats.parquet.vector.reader.IntColumnReader; +import org.apache.flink.formats.parquet.vector.reader.LongColumnReader; +import org.apache.flink.formats.parquet.vector.reader.ShortColumnReader; +import org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.columnar.vector.ColumnVector; +import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.columnar.vector.heap.HeapBooleanVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapByteVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapDoubleVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapFloatVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapIntVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapLongVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapShortVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapTimestampVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.util.Preconditions; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.ParquetRuntimeException; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.GroupType; +import 
org.apache.parquet.schema.InvalidSchemaException; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; +import java.sql.Date; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.flink.table.utils.DateTimeUtils.toInternal; +import static org.apache.parquet.Preconditions.checkArgument; + +/** + * Util for generating {@link ParquetColumnarRowSplitReader}. + * + *
    NOTE: reference from Flink release 1.11.2 {@code ParquetSplitReaderUtil}, modify to support INT64 + * based TIMESTAMP_MILLIS as ConvertedType, should remove when Flink supports that. + */ +public class ParquetSplitReaderUtil { + + /** + * Util for generating partitioned {@link ParquetColumnarRowSplitReader}. + */ + public static ParquetColumnarRowSplitReader genPartColumnarRowReader( + boolean utcTimestamp, + boolean caseSensitive, + Configuration conf, + String[] fullFieldNames, + DataType[] fullFieldTypes, + Map partitionSpec, + int[] selectedFields, + int batchSize, + Path path, + long splitStart, + long splitLength) throws IOException { + List selNonPartNames = Arrays.stream(selectedFields) + .mapToObj(i -> fullFieldNames[i]) + .filter(n -> !partitionSpec.containsKey(n)) + .collect(Collectors.toList()); + + int[] selParquetFields = Arrays.stream(selectedFields) + .filter(i -> !partitionSpec.containsKey(fullFieldNames[i])) + .toArray(); + + ParquetColumnarRowSplitReader.ColumnBatchGenerator gen = readVectors -> { + // create and initialize the row batch + ColumnVector[] vectors = new ColumnVector[selectedFields.length]; + for (int i = 0; i < vectors.length; i++) { + String name = fullFieldNames[selectedFields[i]]; + LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType(); + vectors[i] = createVector(readVectors, selNonPartNames, name, type, partitionSpec, batchSize); + } + return new VectorizedColumnBatch(vectors); + }; + + return new ParquetColumnarRowSplitReader( + utcTimestamp, + caseSensitive, + conf, + Arrays.stream(selParquetFields) + .mapToObj(i -> fullFieldTypes[i].getLogicalType()) + .toArray(LogicalType[]::new), + selNonPartNames.toArray(new String[0]), + gen, + batchSize, + new org.apache.hadoop.fs.Path(path.toUri()), + splitStart, + splitLength); + } + + private static ColumnVector createVector( + ColumnVector[] readVectors, + List selNonPartNames, + String name, + LogicalType type, + Map partitionSpec, + int batchSize) { + if (partitionSpec.containsKey(name)) { + return createVectorFromConstant(type, partitionSpec.get(name), batchSize); + } + ColumnVector readVector = readVectors[selNonPartNames.indexOf(name)]; + if (readVector == null) { + // when the read vector is null, use a constant null vector instead + readVector = createVectorFromConstant(type, null, batchSize); + } + return readVector; + } + + private static ColumnVector createVectorFromConstant( + LogicalType type, + Object value, + int batchSize) { + switch (type.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + HeapBytesVector bsv = new HeapBytesVector(batchSize); + if (value == null) { + bsv.fillWithNulls(); + } else { + bsv.fill(value instanceof byte[] + ? 
(byte[]) value + : value.toString().getBytes(StandardCharsets.UTF_8)); + } + return bsv; + case BOOLEAN: + HeapBooleanVector bv = new HeapBooleanVector(batchSize); + if (value == null) { + bv.fillWithNulls(); + } else { + bv.fill((boolean) value); + } + return bv; + case TINYINT: + HeapByteVector byteVector = new HeapByteVector(batchSize); + if (value == null) { + byteVector.fillWithNulls(); + } else { + byteVector.fill(((Number) value).byteValue()); + } + return byteVector; + case SMALLINT: + HeapShortVector sv = new HeapShortVector(batchSize); + if (value == null) { + sv.fillWithNulls(); + } else { + sv.fill(((Number) value).shortValue()); + } + return sv; + case INTEGER: + HeapIntVector iv = new HeapIntVector(batchSize); + if (value == null) { + iv.fillWithNulls(); + } else { + iv.fill(((Number) value).intValue()); + } + return iv; + case BIGINT: + HeapLongVector lv = new HeapLongVector(batchSize); + if (value == null) { + lv.fillWithNulls(); + } else { + lv.fill(((Number) value).longValue()); + } + return lv; + case DECIMAL: + DecimalType decimalType = (DecimalType) type; + int precision = decimalType.getPrecision(); + int scale = decimalType.getScale(); + DecimalData decimal = value == null + ? null + : Preconditions.checkNotNull(DecimalData.fromBigDecimal((BigDecimal) value, precision, scale)); + ColumnVector internalVector = createVectorFromConstant( + new VarBinaryType(), + decimal == null ? null : decimal.toUnscaledBytes(), + batchSize); + return new ParquetDecimalVector(internalVector); + case FLOAT: + HeapFloatVector fv = new HeapFloatVector(batchSize); + if (value == null) { + fv.fillWithNulls(); + } else { + fv.fill(((Number) value).floatValue()); + } + return fv; + case DOUBLE: + HeapDoubleVector dv = new HeapDoubleVector(batchSize); + if (value == null) { + dv.fillWithNulls(); + } else { + dv.fill(((Number) value).doubleValue()); + } + return dv; + case DATE: + if (value instanceof LocalDate) { + value = Date.valueOf((LocalDate) value); + } + return createVectorFromConstant( + new IntType(), + value == null ? 
null : toInternal((Date) value), + batchSize); + case TIMESTAMP_WITHOUT_TIME_ZONE: + HeapTimestampVector tv = new HeapTimestampVector(batchSize); + if (value == null) { + tv.fillWithNulls(); + } else { + tv.fill(TimestampData.fromLocalDateTime((LocalDateTime) value)); + } + return tv; + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static List filterDescriptors(int depth, Type type, List columns) throws ParquetRuntimeException { + List filtered = new ArrayList<>(); + for (ColumnDescriptor descriptor : columns) { + if (depth >= descriptor.getPath().length) { + throw new InvalidSchemaException("Expect depth " + depth + " for schema: " + descriptor); + } + if (type.getName().equals(descriptor.getPath()[depth])) { + filtered.add(descriptor); + } + } + ValidationUtils.checkState(filtered.size() > 0, "Corrupted Parquet schema"); + return filtered; + } + + public static ColumnReader createColumnReader( + boolean utcTimestamp, + LogicalType fieldType, + Type physicalType, + List descriptors, + PageReadStore pages) throws IOException { + return createColumnReader(utcTimestamp, fieldType, physicalType, descriptors, + pages, 0); + } + + private static ColumnReader createColumnReader( + boolean utcTimestamp, + LogicalType fieldType, + Type physicalType, + List columns, + PageReadStore pages, + int depth) throws IOException { + List descriptors = filterDescriptors(depth, physicalType, columns); + ColumnDescriptor descriptor = descriptors.get(0); + PageReader pageReader = pages.getPageReader(descriptor); + switch (fieldType.getTypeRoot()) { + case BOOLEAN: + return new BooleanColumnReader(descriptor, pageReader); + case TINYINT: + return new ByteColumnReader(descriptor, pageReader); + case DOUBLE: + return new DoubleColumnReader(descriptor, pageReader); + case FLOAT: + return new FloatColumnReader(descriptor, pageReader); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + return new IntColumnReader(descriptor, pageReader); + case BIGINT: + return new LongColumnReader(descriptor, pageReader); + case SMALLINT: + return new ShortColumnReader(descriptor, pageReader); + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return new BytesColumnReader(descriptor, pageReader); + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT64: + int precision = fieldType instanceof TimestampType + ? 
((TimestampType) fieldType).getPrecision() + : ((LocalZonedTimestampType) fieldType).getPrecision(); + return new Int64TimestampColumnReader(utcTimestamp, descriptor, pageReader, precision); + case INT96: + return new TimestampColumnReader(utcTimestamp, descriptor, pageReader); + default: + throw new AssertionError(); + } + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return new IntColumnReader(descriptor, pageReader); + case INT64: + return new LongColumnReader(descriptor, pageReader); + case BINARY: + return new BytesColumnReader(descriptor, pageReader); + case FIXED_LEN_BYTE_ARRAY: + return new FixedLenBytesColumnReader( + descriptor, pageReader, ((DecimalType) fieldType).getPrecision()); + default: + throw new AssertionError(); + } + case ARRAY: + return new ArrayColumnReader( + descriptor, + pageReader, + utcTimestamp, + descriptor.getPrimitiveType(), + fieldType); + case MAP: + MapType mapType = (MapType) fieldType; + ArrayColumnReader keyReader = + new ArrayColumnReader( + descriptor, + pageReader, + utcTimestamp, + descriptor.getPrimitiveType(), + new ArrayType(mapType.getKeyType())); + ArrayColumnReader valueReader = + new ArrayColumnReader( + descriptors.get(1), + pages.getPageReader(descriptors.get(1)), + utcTimestamp, + descriptors.get(1).getPrimitiveType(), + new ArrayType(mapType.getValueType())); + return new MapColumnReader(keyReader, valueReader, fieldType); + case ROW: + RowType rowType = (RowType) fieldType; + GroupType groupType = physicalType.asGroupType(); + List fieldReaders = new ArrayList<>(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + fieldReaders.add( + createColumnReader( + utcTimestamp, + rowType.getTypeAt(i), + groupType.getType(i), + descriptors, + pages, + depth + 1)); + } + return new RowColumnReader(fieldReaders); + default: + throw new UnsupportedOperationException(fieldType + " is not supported now."); + } + } + + public static WritableColumnVector createWritableColumnVector( + int batchSize, + LogicalType fieldType, + Type physicalType, + List descriptors) { + return createWritableColumnVector(batchSize, fieldType, physicalType, descriptors, 0); + } + + private static WritableColumnVector createWritableColumnVector( + int batchSize, + LogicalType fieldType, + Type physicalType, + List columns, + int depth) { + List descriptors = filterDescriptors(depth, physicalType, columns); + PrimitiveType primitiveType = descriptors.get(0).getPrimitiveType(); + PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName(); + switch (fieldType.getTypeRoot()) { + case BOOLEAN: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.BOOLEAN, + "Unexpected type: %s", typeName); + return new HeapBooleanVector(batchSize); + case TINYINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapByteVector(batchSize); + case DOUBLE: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.DOUBLE, + "Unexpected type: %s", typeName); + return new HeapDoubleVector(batchSize); + case FLOAT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.FLOAT, + "Unexpected type: %s", typeName); + return new HeapFloatVector(batchSize); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapIntVector(batchSize); + case BIGINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT64, 
+ "Unexpected type: %s", typeName); + return new HeapLongVector(batchSize); + case SMALLINT: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.INT32, + "Unexpected type: %s", typeName); + return new HeapShortVector(batchSize); + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + checkArgument( + typeName == PrimitiveType.PrimitiveTypeName.BINARY, + "Unexpected type: %s", typeName); + return new HeapBytesVector(batchSize); + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + checkArgument(primitiveType.getOriginalType() != OriginalType.TIME_MICROS, + "TIME_MICROS original type is not "); + return new HeapTimestampVector(batchSize); + case DECIMAL: + checkArgument( + (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY + || typeName == PrimitiveType.PrimitiveTypeName.BINARY) + && primitiveType.getOriginalType() == OriginalType.DECIMAL, + "Unexpected type: %s", typeName); + return new HeapBytesVector(batchSize); + case ARRAY: + ArrayType arrayType = (ArrayType) fieldType; + return new HeapArrayVector( + batchSize, + createWritableColumnVector( + batchSize, + arrayType.getElementType(), + physicalType, + descriptors, + depth)); + case MAP: + MapType mapType = (MapType) fieldType; + GroupType repeatedType = physicalType.asGroupType().getType(0).asGroupType(); + // the map column has three level paths. + return new HeapMapColumnVector( + batchSize, + createWritableColumnVector( + batchSize, + mapType.getKeyType(), + repeatedType.getType(0), + descriptors, + depth + 2), + createWritableColumnVector( + batchSize, + mapType.getValueType(), + repeatedType.getType(1), + descriptors, + depth + 2)); + case ROW: + RowType rowType = (RowType) fieldType; + GroupType groupType = physicalType.asGroupType(); + WritableColumnVector[] columnVectors = + new WritableColumnVector[rowType.getFieldCount()]; + for (int i = 0; i < columnVectors.length; i++) { + columnVectors[i] = + createWritableColumnVector( + batchSize, + rowType.getTypeAt(i), + groupType.getType(i), + descriptors, + depth + 1); + } + return new HeapRowColumnVector(batchSize, columnVectors); + default: + throw new UnsupportedOperationException(fieldType + " is not supported now."); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java new file mode 100644 index 0000000000000..7db66d23d6fc8 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
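A hedged sketch of opening a reader with genPartColumnarRowReader(...) as defined above; the schema, partition value, Hadoop configuration, path and split bounds are placeholders, and the Map<String, Object> element type of partitionSpec as well as the reachedEnd()/nextRecord() read loop are assumed from the copied Flink reader.

// Sketch only: all concrete values below are hypothetical.
Map<String, Object> partitionSpec = Collections.singletonMap("dt", "2022-01-01");
ParquetColumnarRowSplitReader reader = ParquetSplitReaderUtil.genPartColumnarRowReader(
    true,                                   // utcTimestamp
    true,                                   // caseSensitive
    hadoopConf,                             // org.apache.hadoop.conf.Configuration
    new String[] {"uuid", "ts", "dt"},
    new DataType[] {DataTypes.STRING(), DataTypes.BIGINT(), DataTypes.STRING()},
    partitionSpec,
    new int[] {0, 1, 2},
    2048,                                   // batch size
    new Path("file:///tmp/2022-01-01/part-0001.parquet"),
    0L,                                     // splitStart
    splitLength);
while (!reader.reachedEnd()) {
  RowData row = reader.nextRecord();
  // consume row ...
}
reader.close();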
+ */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.columnar.ColumnarArrayData; +import org.apache.flink.table.data.columnar.vector.ArrayColumnVector; +import org.apache.flink.table.data.columnar.vector.ColumnVector; +import org.apache.flink.table.data.columnar.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap array column vector. + */ +public class HeapArrayVector extends AbstractHeapVector + implements WritableColumnVector, ArrayColumnVector { + + public long[] offsets; + public long[] lengths; + public ColumnVector child; + private int size; + + public HeapArrayVector(int len) { + super(len); + offsets = new long[len]; + lengths = new long[len]; + } + + public HeapArrayVector(int len, ColumnVector vector) { + super(len); + offsets = new long[len]; + lengths = new long[len]; + this.child = vector; + } + + public int getSize() { + return size; + } + + public void setSize(int size) { + this.size = size; + } + + public int getLen() { + return this.isNull.length; + } + + @Override + public ArrayData getArray(int i) { + long offset = offsets[i]; + long length = lengths[i]; + return new ColumnarArrayData(child, (int) offset, (int) length); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java new file mode 100644 index 0000000000000..a379737169502 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.columnar.ColumnarMapData; +import org.apache.flink.table.data.columnar.vector.ColumnVector; +import org.apache.flink.table.data.columnar.vector.MapColumnVector; +import org.apache.flink.table.data.columnar.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap map column vector. 
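To make the offsets/lengths layout of the HeapArrayVector above concrete, a small hedged example that stores the two rows [1, 2, 3] and [4, 5] over one shared child vector.

// Sketch: two array rows backed by a single HeapIntVector child.
HeapIntVector child = new HeapIntVector(5);
child.vector[0] = 1; child.vector[1] = 2; child.vector[2] = 3;
child.vector[3] = 4; child.vector[4] = 5;

HeapArrayVector arrays = new HeapArrayVector(2, child);
arrays.offsets[0] = 0; arrays.lengths[0] = 3;   // row 0 -> child[0..3)
arrays.offsets[1] = 3; arrays.lengths[1] = 2;   // row 1 -> child[3..5)
arrays.setSize(2);

ArrayData firstRow = arrays.getArray(0);
int firstElement = firstRow.getInt(0);          // 1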
+ */ +public class HeapMapColumnVector extends AbstractHeapVector + implements WritableColumnVector, MapColumnVector { + + private long[] offsets; + private long[] lengths; + private int size; + private ColumnVector keys; + private ColumnVector values; + + public HeapMapColumnVector(int len, ColumnVector keys, ColumnVector values) { + super(len); + size = 0; + offsets = new long[len]; + lengths = new long[len]; + this.keys = keys; + this.values = values; + } + + public void setOffsets(long[] offsets) { + this.offsets = offsets; + } + + public void setLengths(long[] lengths) { + this.lengths = lengths; + } + + public void setKeys(ColumnVector keys) { + this.keys = keys; + } + + public void setValues(ColumnVector values) { + this.values = values; + } + + public int getSize() { + return size; + } + + public void setSize(int size) { + this.size = size; + } + + @Override + public MapData getMap(int i) { + long offset = offsets[i]; + long length = lengths[i]; + return new ColumnarMapData(keys, values, (int) offset, (int) length); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java new file mode 100644 index 0000000000000..ae194e4e6ab05 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.columnar.ColumnarRowData; +import org.apache.flink.table.data.columnar.vector.RowColumnVector; +import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.columnar.vector.heap.AbstractHeapVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; + +/** + * This class represents a nullable heap row column vector. + */ +public class HeapRowColumnVector extends AbstractHeapVector + implements WritableColumnVector, RowColumnVector { + + public WritableColumnVector[] vectors; + + public HeapRowColumnVector(int len, WritableColumnVector... 
vectors) { + super(len); + this.vectors = vectors; + } + + @Override + public ColumnarRowData getRow(int i) { + ColumnarRowData columnarRowData = new ColumnarRowData(new VectorizedColumnBatch(vectors)); + columnarRowData.setRowId(i); + return columnarRowData; + } + + @Override + public void reset() { + super.reset(); + for (WritableColumnVector vector : vectors) { + vector.reset(); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java new file mode 100644 index 0000000000000..98b5e61050898 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/ParquetDecimalVector.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector; + +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.columnar.vector.BytesColumnVector; +import org.apache.flink.table.data.columnar.vector.ColumnVector; +import org.apache.flink.table.data.columnar.vector.DecimalColumnVector; + +/** + * Parquet write decimal as int32 and int64 and binary, this class wrap the real vector to + * provide {@link DecimalColumnVector} interface. + * + *
    Reference Flink release 1.11.2 {@link org.apache.flink.formats.parquet.vector.ParquetDecimalVector} + * because it is not public. + */ +public class ParquetDecimalVector implements DecimalColumnVector { + + public final ColumnVector vector; + + public ParquetDecimalVector(ColumnVector vector) { + this.vector = vector; + } + + @Override + public DecimalData getDecimal(int i, int precision, int scale) { + return DecimalData.fromUnscaledBytes( + ((BytesColumnVector) vector).getBytes(i).getBytes(), + precision, + scale); + } + + @Override + public boolean isNullAt(int i) { + return vector.isNullAt(i); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java new file mode 100644 index 0000000000000..a8b733de636a5 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/AbstractColumnReader.java @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.formats.parquet.vector.ParquetDictionary; +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableIntVector; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.PrimitiveType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; + +/** + * Abstract {@link ColumnReader}. + * See {@link org.apache.parquet.column.impl.ColumnReaderImpl}, + * part of the code is referred from Apache Spark and Apache Parquet. + * + *
    Note: Reference Flink release 1.11.2 {@link org.apache.flink.formats.parquet.vector.reader.AbstractColumnReader} + * because some of the package scope methods. + */ +public abstract class AbstractColumnReader + implements ColumnReader { + + private static final Logger LOG = LoggerFactory.getLogger(org.apache.flink.formats.parquet.vector.reader.AbstractColumnReader.class); + + private final PageReader pageReader; + + /** + * The dictionary, if this column has dictionary encoding. + */ + protected final Dictionary dictionary; + + /** + * Maximum definition level for this column. + */ + protected final int maxDefLevel; + + protected final ColumnDescriptor descriptor; + + /** + * Total number of values read. + */ + private long valuesRead; + + /** + * value that indicates the end of the current page. That is, if valuesRead == + * endOfPageValueCount, we are at the end of the page. + */ + private long endOfPageValueCount; + + /** + * If true, the current page is dictionary encoded. + */ + private boolean isCurrentPageDictionaryEncoded; + + /** + * Total values in the current page. + */ + private int pageValueCount; + + /* + * Input streams: + * 1.Run length encoder to encode every data, so we have run length stream to get + * run length information. + * 2.Data maybe is real data, maybe is dictionary ids which need be decode to real + * data from Dictionary. + * + * Run length stream ------> Data stream + * | + * ------> Dictionary ids stream + */ + + /** + * Run length decoder for data and dictionary. + */ + protected RunLengthDecoder runLenDecoder; + + /** + * Data input stream. + */ + ByteBufferInputStream dataInputStream; + + /** + * Dictionary decoder to wrap dictionary ids input stream. + */ + private RunLengthDecoder dictionaryIdsDecoder; + + public AbstractColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader) throws IOException { + this.descriptor = descriptor; + this.pageReader = pageReader; + this.maxDefLevel = descriptor.getMaxDefinitionLevel(); + + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + if (dictionaryPage != null) { + try { + this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage); + this.isCurrentPageDictionaryEncoded = true; + } catch (IOException e) { + throw new IOException("could not decode the dictionary for " + descriptor, e); + } + } else { + this.dictionary = null; + this.isCurrentPageDictionaryEncoded = false; + } + /* + * Total number of values in this column (in this row group). + */ + long totalValueCount = pageReader.getTotalValueCount(); + if (totalValueCount == 0) { + throw new IOException("totalValueCount == 0"); + } + } + + protected void checkTypeName(PrimitiveType.PrimitiveTypeName expectedName) { + PrimitiveType.PrimitiveTypeName actualName = descriptor.getPrimitiveType().getPrimitiveTypeName(); + Preconditions.checkArgument( + actualName == expectedName, + "Expected type name: %s, actual type name: %s", + expectedName, + actualName); + } + + /** + * Reads `total` values from this columnReader into column. + */ + @Override + public final void readToVector(int readNumber, V vector) throws IOException { + int rowId = 0; + WritableIntVector dictionaryIds = null; + if (dictionary != null) { + dictionaryIds = vector.reserveDictionaryIds(readNumber); + } + while (readNumber > 0) { + // Compute the number of values we want to read in this page. 
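// If the current page is exhausted, the next data page is loaded and decoded below.
// Dictionary-encoded pages keep only dictionary ids in the vector (lazy decode, when
// supportLazyDecode() permits it) for as long as every page of the batch is dictionary
// encoded; once a plain page shows up, previously stored ids are materialized and all
// further values are decoded eagerly.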
+ int leftInPage = (int) (endOfPageValueCount - valuesRead); + if (leftInPage == 0) { + DataPage page = pageReader.readPage(); + if (page instanceof DataPageV1) { + readPageV1((DataPageV1) page); + } else if (page instanceof DataPageV2) { + readPageV2((DataPageV2) page); + } else { + throw new RuntimeException("Unsupported page type: " + page.getClass()); + } + leftInPage = (int) (endOfPageValueCount - valuesRead); + } + int num = Math.min(readNumber, leftInPage); + if (isCurrentPageDictionaryEncoded) { + // Read and decode dictionary ids. + runLenDecoder.readDictionaryIds( + num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder); + + if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) { + // Column vector supports lazy decoding of dictionary values so just set the dictionary. + // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some + // non-dictionary encoded values have already been added). + vector.setDictionary(new ParquetDictionary(dictionary)); + } else { + readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds); + } + } else { + if (vector.hasDictionary() && rowId != 0) { + // This batch already has dictionary encoded values but this new page is not. The batch + // does not support a mix of dictionary and not so we will decode the dictionary. + readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds()); + } + vector.setDictionary(null); + readBatch(rowId, num, vector); + } + + valuesRead += num; + rowId += num; + readNumber -= num; + } + } + + private void readPageV1(DataPageV1 page) throws IOException { + this.pageValueCount = page.getValueCount(); + ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); + + // Initialize the decoders. + if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) { + throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding()); + } + int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); + this.runLenDecoder = new RunLengthDecoder(bitWidth); + try { + BytesInput bytes = page.getBytes(); + ByteBufferInputStream in = bytes.toInputStream(); + rlReader.initFromPage(pageValueCount, in); + this.runLenDecoder.initFromStream(pageValueCount, in); + prepareNewPage(page.getValueEncoding(), in); + } catch (IOException e) { + throw new IOException("could not read page " + page + " in col " + descriptor, e); + } + } + + private void readPageV2(DataPageV2 page) throws IOException { + this.pageValueCount = page.getValueCount(); + + int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); + // do not read the length from the stream. v2 pages handle dividing the page bytes. 
+ this.runLenDecoder = new RunLengthDecoder(bitWidth, false); + this.runLenDecoder.initFromStream( + this.pageValueCount, page.getDefinitionLevels().toInputStream()); + try { + prepareNewPage(page.getDataEncoding(), page.getData().toInputStream()); + } catch (IOException e) { + throw new IOException("could not read page " + page + " in col " + descriptor, e); + } + } + + private void prepareNewPage( + Encoding dataEncoding, + ByteBufferInputStream in) throws IOException { + this.endOfPageValueCount = valuesRead + pageValueCount; + if (dataEncoding.usesDictionary()) { + if (dictionary == null) { + throw new IOException("Could not read page in col " + + descriptor + + " as the dictionary was missing for encoding " + + dataEncoding); + } + @SuppressWarnings("deprecation") + Encoding plainDict = Encoding.PLAIN_DICTIONARY; // var to allow warning suppression + if (dataEncoding != plainDict && dataEncoding != Encoding.RLE_DICTIONARY) { + throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); + } + this.dataInputStream = null; + this.dictionaryIdsDecoder = new RunLengthDecoder(); + try { + this.dictionaryIdsDecoder.initFromStream(pageValueCount, in); + } catch (IOException e) { + throw new IOException("could not read dictionary in col " + descriptor, e); + } + this.isCurrentPageDictionaryEncoded = true; + } else { + if (dataEncoding != Encoding.PLAIN) { + throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); + } + this.dictionaryIdsDecoder = null; + LOG.debug("init from page at offset {} for length {}", in.position(), in.available()); + this.dataInputStream = in.remainingStream(); + this.isCurrentPageDictionaryEncoded = false; + } + + afterReadPage(); + } + + final ByteBuffer readDataBuffer(int length) { + try { + return dataInputStream.slice(length).order(ByteOrder.LITTLE_ENDIAN); + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes", e); + } + } + + /** + * After read a page, we may need some initialization. + */ + protected void afterReadPage() { + } + + /** + * Support lazy dictionary ids decode. See more in {@link ParquetDictionary}. + * If return false, we will decode all the data first. + */ + protected boolean supportLazyDecode() { + return true; + } + + /** + * Read batch from {@link #runLenDecoder} and {@link #dataInputStream}. + */ + protected abstract void readBatch(int rowId, int num, V column); + + /** + * Decode dictionary ids to data. + * From {@link #runLenDecoder} and {@link #dictionaryIdsDecoder}. + */ + protected abstract void readBatchFromDictionaryIds( + int rowId, + int num, + V column, + WritableIntVector dictionaryIds); +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java new file mode 100644 index 0000000000000..6a8a01b74946a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ArrayColumnReader.java @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.columnar.vector.heap.HeapBooleanVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapByteVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapDoubleVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapFloatVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapIntVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapLongVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapShortVector; +import org.apache.flink.table.data.columnar.vector.heap.HeapTimestampVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Array {@link ColumnReader}. + */ +public class ArrayColumnReader extends BaseVectorizedColumnReader { + + // The value read in last time + private Object lastValue; + + // flag to indicate if there is no data in parquet data page + private boolean eof = false; + + // flag to indicate if it's the first time to read parquet data page with this instance + boolean isFirstRow = true; + + public ArrayColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader, + boolean isUtcTimestamp, + Type type, + LogicalType logicalType) + throws IOException { + super(descriptor, pageReader, isUtcTimestamp, type, logicalType); + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + HeapArrayVector lcv = (HeapArrayVector) vector; + // before readBatch, initial the size of offsets & lengths as the default value, + // the actual size will be assigned in setChildrenInfo() after reading complete. + lcv.offsets = new long[VectorizedColumnBatch.DEFAULT_SIZE]; + lcv.lengths = new long[VectorizedColumnBatch.DEFAULT_SIZE]; + // Because the length of ListColumnVector.child can't be known now, + // the valueList will save all data for ListColumnVector temporary. 
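// For example, with the two incoming rows [0, 2, 3] and [NULL, 3, 4] the valueList ends up
// as {0, 2, 3, null, 3, 4}, while the array vector gets offsets {0, 3} and lengths {3, 3}.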
+ List valueList = new ArrayList<>(); + + LogicalType category = ((ArrayType) logicalType).getElementType(); + + // read the first row in parquet data page, this will be only happened once for this + // instance + if (isFirstRow) { + if (!fetchNextValue(category)) { + return; + } + isFirstRow = false; + } + + int index = collectDataFromParquetPage(readNumber, lcv, valueList, category); + + // Convert valueList to array for the ListColumnVector.child + fillColumnVector(category, lcv, valueList, index); + } + + /** + * Reads a single value from parquet page, puts it into lastValue. Returns a boolean indicating + * if there is more values to read (true). + * + * @param category + * @return boolean + * @throws IOException + */ + private boolean fetchNextValue(LogicalType category) throws IOException { + int left = readPageIfNeed(); + if (left > 0) { + // get the values of repetition and definitionLevel + readRepetitionAndDefinitionLevels(); + // read the data if it isn't null + if (definitionLevel == maxDefLevel) { + if (isCurrentPageDictionaryEncoded) { + lastValue = dataColumn.readValueDictionaryId(); + } else { + lastValue = readPrimitiveTypedRow(category); + } + } else { + lastValue = null; + } + return true; + } else { + eof = true; + return false; + } + } + + private int readPageIfNeed() throws IOException { + // Compute the number of values we want to read in this page. + int leftInPage = (int) (endOfPageValueCount - valuesRead); + if (leftInPage == 0) { + // no data left in current page, load data from new page + readPage(); + leftInPage = (int) (endOfPageValueCount - valuesRead); + } + return leftInPage; + } + + // Need to be in consistent with that VectorizedPrimitiveColumnReader#readBatchHelper + // TODO Reduce the duplicated code + private Object readPrimitiveTypedRow(LogicalType category) { + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return dataColumn.readString(); + case BOOLEAN: + return dataColumn.readBoolean(); + case TIME_WITHOUT_TIME_ZONE: + case DATE: + case INTEGER: + return dataColumn.readInteger(); + case TINYINT: + return dataColumn.readTinyInt(); + case SMALLINT: + return dataColumn.readSmallInt(); + case BIGINT: + return dataColumn.readLong(); + case FLOAT: + return dataColumn.readFloat(); + case DOUBLE: + return dataColumn.readDouble(); + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return dataColumn.readInteger(); + case INT64: + return dataColumn.readLong(); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return dataColumn.readString(); + default: + throw new AssertionError(); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return dataColumn.readTimestamp(); + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } + + private Object dictionaryDecodeValue(LogicalType category, Integer dictionaryValue) { + if (dictionaryValue == null) { + return null; + } + + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + return dictionary.readString(dictionaryValue); + case DATE: + case TIME_WITHOUT_TIME_ZONE: + case INTEGER: + return dictionary.readInteger(dictionaryValue); + case BOOLEAN: + return dictionary.readBoolean(dictionaryValue) ? 
1 : 0; + case DOUBLE: + return dictionary.readDouble(dictionaryValue); + case FLOAT: + return dictionary.readFloat(dictionaryValue); + case TINYINT: + return dictionary.readTinyInt(dictionaryValue); + case SMALLINT: + return dictionary.readSmallInt(dictionaryValue); + case BIGINT: + return dictionary.readLong(dictionaryValue); + case DECIMAL: + switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + return dictionary.readInteger(dictionaryValue); + case INT64: + return dictionary.readLong(dictionaryValue); + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return dictionary.readString(dictionaryValue); + default: + throw new AssertionError(); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return dictionary.readTimestamp(dictionaryValue); + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } + + /** + * Collects data from a parquet page and returns the final row index where it stopped. The + * returned index can be equal to or less than total. + * + * @param total maximum number of rows to collect + * @param lcv column vector to do initial setup in data collection time + * @param valueList collection of values that will be fed into the vector later + * @param category + * @return int + * @throws IOException + */ + private int collectDataFromParquetPage( + int total, HeapArrayVector lcv, List valueList, LogicalType category) + throws IOException { + int index = 0; + /* + * Here is a nested loop for collecting all values from a parquet page. + * A column of array type can be considered as a list of lists, so the two loops are as below: + * 1. The outer loop iterates on rows (index is a row index, so points to a row in the batch), e.g.: + * [0, 2, 3] <- index: 0 + * [NULL, 3, 4] <- index: 1 + * + * 2. The inner loop iterates on values within a row (sets all data from parquet data page + * for an element in ListColumnVector), so fetchNextValue returns values one-by-one: + * 0, 2, 3, NULL, 3, 4 + * + * As described below, the repetition level (repetitionLevel != 0) + * can be used to decide when we'll start to read values for the next list. + */ + while (!eof && index < total) { + // add element to ListColumnVector one by one + lcv.offsets[index] = valueList.size(); + /* + * Let's collect all values for a single list. + * Repetition level = 0 means that a new list started there in the parquet page, + * in that case, let's exit from the loop, and start to collect value for a new list. + */ + do { + /* + * Definition level = 0 when a NULL value was returned instead of a list + * (this is not the same as a NULL value in of a list). + */ + if (definitionLevel == 0) { + lcv.setNullAt(index); + } + valueList.add( + isCurrentPageDictionaryEncoded + ? dictionaryDecodeValue(category, (Integer) lastValue) + : lastValue); + } while (fetchNextValue(category) && (repetitionLevel != 0)); + + lcv.lengths[index] = valueList.size() - lcv.offsets[index]; + index++; + } + return index; + } + + /** + * The lengths & offsets will be initialized as default size (1024), it should be set to the + * actual size according to the element number. 
+ */ + private void setChildrenInfo(HeapArrayVector lcv, int itemNum, int elementNum) { + lcv.setSize(itemNum); + long[] lcvLength = new long[elementNum]; + long[] lcvOffset = new long[elementNum]; + System.arraycopy(lcv.lengths, 0, lcvLength, 0, elementNum); + System.arraycopy(lcv.offsets, 0, lcvOffset, 0, elementNum); + lcv.lengths = lcvLength; + lcv.offsets = lcvOffset; + } + + private void fillColumnVector( + LogicalType category, HeapArrayVector lcv, List valueList, int elementNum) { + int total = valueList.size(); + setChildrenInfo(lcv, total, elementNum); + switch (category.getTypeRoot()) { + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + lcv.child = new HeapBytesVector(total); + ((HeapBytesVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + byte[] src = ((List) valueList).get(i); + if (src == null) { + ((HeapBytesVector) lcv.child).setNullAt(i); + } else { + ((HeapBytesVector) lcv.child).appendBytes(i, src, 0, src.length); + } + } + break; + case BOOLEAN: + lcv.child = new HeapBooleanVector(total); + ((HeapBooleanVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapBooleanVector) lcv.child).setNullAt(i); + } else { + ((HeapBooleanVector) lcv.child).vector[i] = + ((List) valueList).get(i); + } + } + break; + case TINYINT: + lcv.child = new HeapByteVector(total); + ((HeapByteVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapByteVector) lcv.child).setNullAt(i); + } else { + ((HeapByteVector) lcv.child).vector[i] = + (byte) ((List) valueList).get(i).intValue(); + } + } + break; + case SMALLINT: + lcv.child = new HeapShortVector(total); + ((HeapShortVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapShortVector) lcv.child).setNullAt(i); + } else { + ((HeapShortVector) lcv.child).vector[i] = + (short) ((List) valueList).get(i).intValue(); + } + } + break; + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + lcv.child = new HeapIntVector(total); + ((HeapIntVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapIntVector) lcv.child).setNullAt(i); + } else { + ((HeapIntVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case FLOAT: + lcv.child = new HeapFloatVector(total); + ((HeapFloatVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapFloatVector) lcv.child).setNullAt(i); + } else { + ((HeapFloatVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case BIGINT: + lcv.child = new HeapLongVector(total); + ((HeapLongVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapLongVector) lcv.child).setNullAt(i); + } else { + ((HeapLongVector) lcv.child).vector[i] = ((List) valueList).get(i); + } + } + break; + case DOUBLE: + lcv.child = new HeapDoubleVector(total); + ((HeapDoubleVector) lcv.child).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapDoubleVector) lcv.child).setNullAt(i); + } else { + ((HeapDoubleVector) lcv.child).vector[i] = + ((List) valueList).get(i); + } + } + break; + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + lcv.child = new HeapTimestampVector(total); + ((HeapTimestampVector) lcv.child).reset(); + for (int i = 0; i < 
valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapTimestampVector) lcv.child).setNullAt(i); + } else { + ((HeapTimestampVector) lcv.child) + .setTimestamp(i, ((List) valueList).get(i)); + } + } + break; + case DECIMAL: + PrimitiveType.PrimitiveTypeName primitiveTypeName = + descriptor.getPrimitiveType().getPrimitiveTypeName(); + switch (primitiveTypeName) { + case INT32: + lcv.child = new ParquetDecimalVector(new HeapIntVector(total)); + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector) + .vector[i] = + ((List) valueList).get(i); + } + } + break; + case INT64: + lcv.child = new ParquetDecimalVector(new HeapLongVector(total)); + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + if (valueList.get(i) == null) { + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector) + .vector[i] = + ((List) valueList).get(i); + } + } + break; + default: + lcv.child = new ParquetDecimalVector(new HeapBytesVector(total)); + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector).reset(); + for (int i = 0; i < valueList.size(); i++) { + byte[] src = ((List) valueList).get(i); + if (valueList.get(i) == null) { + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector) + .setNullAt(i); + } else { + ((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector) + .appendBytes(i, src, 0, src.length); + } + } + break; + } + break; + default: + throw new RuntimeException("Unsupported type in the list: " + type); + } + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java new file mode 100644 index 0000000000000..fea6dc47af504 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/BaseVectorizedColumnReader.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import static org.apache.parquet.column.ValuesType.DEFINITION_LEVEL; +import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; +import static org.apache.parquet.column.ValuesType.VALUES; + +/** + * Abstract {@link ColumnReader}. part of the code is referred from Apache Hive and Apache Parquet. + */ +public abstract class BaseVectorizedColumnReader implements ColumnReader { + + private static final Logger LOG = LoggerFactory.getLogger(BaseVectorizedColumnReader.class); + + protected boolean isUtcTimestamp; + + /** + * Total number of values read. + */ + protected long valuesRead; + + /** + * value that indicates the end of the current page. That is, if valuesRead == + * endOfPageValueCount, we are at the end of the page. + */ + protected long endOfPageValueCount; + + /** + * The dictionary, if this column has dictionary encoding. + */ + protected final ParquetDataColumnReader dictionary; + + /** + * If true, the current page is dictionary encoded. + */ + protected boolean isCurrentPageDictionaryEncoded; + + /** + * Maximum definition level for this column. + */ + protected final int maxDefLevel; + + protected int definitionLevel; + protected int repetitionLevel; + + /** + * Repetition/Definition/Value readers. + */ + protected IntIterator repetitionLevelColumn; + + protected IntIterator definitionLevelColumn; + protected ParquetDataColumnReader dataColumn; + + /** + * Total values in the current page. 
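+ * Updated each time a new data page is read.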
+ */ + protected int pageValueCount; + + protected final PageReader pageReader; + protected final ColumnDescriptor descriptor; + protected final Type type; + protected final LogicalType logicalType; + + public BaseVectorizedColumnReader( + ColumnDescriptor descriptor, + PageReader pageReader, + boolean isUtcTimestamp, + Type parquetType, + LogicalType logicalType) + throws IOException { + this.descriptor = descriptor; + this.type = parquetType; + this.pageReader = pageReader; + this.maxDefLevel = descriptor.getMaxDefinitionLevel(); + this.isUtcTimestamp = isUtcTimestamp; + this.logicalType = logicalType; + + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + if (dictionaryPage != null) { + try { + this.dictionary = + ParquetDataColumnReaderFactory.getDataColumnReaderByTypeOnDictionary( + parquetType.asPrimitiveType(), + dictionaryPage + .getEncoding() + .initDictionary(descriptor, dictionaryPage), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = true; + } catch (IOException e) { + throw new IOException("could not decode the dictionary for " + descriptor, e); + } + } else { + this.dictionary = null; + this.isCurrentPageDictionaryEncoded = false; + } + } + + protected void readRepetitionAndDefinitionLevels() { + repetitionLevel = repetitionLevelColumn.nextInt(); + definitionLevel = definitionLevelColumn.nextInt(); + valuesRead++; + } + + protected void readPage() throws IOException { + DataPage page = pageReader.readPage(); + + if (page == null) { + return; + } + + page.accept( + new DataPage.Visitor() { + @Override + public Void visit(DataPageV1 dataPageV1) { + readPageV1(dataPageV1); + return null; + } + + @Override + public Void visit(DataPageV2 dataPageV2) { + readPageV2(dataPageV2); + return null; + } + }); + } + + private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) + throws IOException { + this.pageValueCount = valueCount; + this.endOfPageValueCount = valuesRead + pageValueCount; + if (dataEncoding.usesDictionary()) { + this.dataColumn = null; + if (dictionary == null) { + throw new IOException( + "could not read page in col " + + descriptor + + " as the dictionary was missing for encoding " + + dataEncoding); + } + dataColumn = + ParquetDataColumnReaderFactory.getDataColumnReaderByType( + type.asPrimitiveType(), + dataEncoding.getDictionaryBasedValuesReader( + descriptor, VALUES, dictionary.getDictionary()), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = true; + } else { + dataColumn = + ParquetDataColumnReaderFactory.getDataColumnReaderByType( + type.asPrimitiveType(), + dataEncoding.getValuesReader(descriptor, VALUES), + isUtcTimestamp); + this.isCurrentPageDictionaryEncoded = false; + } + + try { + dataColumn.initFromPage(pageValueCount, in); + } catch (IOException e) { + throw new IOException("could not read page in col " + descriptor, e); + } + } + + private void readPageV1(DataPageV1 page) { + ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); + ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL); + this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); + this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); + try { + BytesInput bytes = page.getBytes(); + LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records"); + ByteBufferInputStream in = bytes.toInputStream(); + LOG.debug("reading repetition levels at " + in.position()); + rlReader.initFromPage(pageValueCount, in); 
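+ // A DataPageV1 stores repetition levels, definition levels and values back-to-back in a single buffer, so each initFromPage call leaves the shared stream positioned at the start of the next section.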
+ LOG.debug("reading definition levels at " + in.position()); + dlReader.initFromPage(pageValueCount, in); + LOG.debug("reading data at " + in.position()); + initDataReader(page.getValueEncoding(), in, page.getValueCount()); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read page " + page + " in col " + descriptor, e); + } + } + + private void readPageV2(DataPageV2 page) { + this.pageValueCount = page.getValueCount(); + this.repetitionLevelColumn = + newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels()); + this.definitionLevelColumn = + newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels()); + try { + LOG.debug( + "page data size " + + page.getData().size() + + " bytes and " + + pageValueCount + + " records"); + initDataReader( + page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount()); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read page " + page + " in col " + descriptor, e); + } + } + + private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) { + try { + if (maxLevel == 0) { + return new NullIntIterator(); + } + return new RLEIntIterator( + new RunLengthBitPackingHybridDecoder( + BytesUtils.getWidthFromMaxInt(maxLevel), + new ByteArrayInputStream(bytes.toByteArray()))); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read levels in page for col " + descriptor, e); + } + } + + /** + * Utility classes to abstract over different way to read ints with different encodings. + */ + abstract static class IntIterator { + abstract int nextInt(); + } + + /** + * read ints from {@link ValuesReader}. + */ + protected static final class ValuesReaderIntIterator extends IntIterator { + ValuesReader delegate; + + public ValuesReaderIntIterator(ValuesReader delegate) { + this.delegate = delegate; + } + + @Override + int nextInt() { + return delegate.readInteger(); + } + } + + /** + * read ints from {@link RunLengthBitPackingHybridDecoder}. + */ + protected static final class RLEIntIterator extends IntIterator { + RunLengthBitPackingHybridDecoder delegate; + + public RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) { + this.delegate = delegate; + } + + @Override + int nextInt() { + try { + return delegate.readInt(); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + } + + /** + * return zero. + */ + protected static final class NullIntIterator extends IntIterator { + @Override + int nextInt() { + return 0; + } + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java new file mode 100644 index 0000000000000..1e9aecd24997c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/FixedLenBytesColumnReader.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.columnar.vector.writable.WritableBytesVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableIntVector; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Fixed length bytes {@code ColumnReader}, just for decimal. + * + *
<p>
    Note: Reference Flink release 1.13.2 + * {@code org.apache.flink.formats.parquet.vector.reader.FixedLenBytesColumnReader} + * to always write as legacy decimal format. + */ +public class FixedLenBytesColumnReader + extends AbstractColumnReader { + + public FixedLenBytesColumnReader( + ColumnDescriptor descriptor, PageReader pageReader, int precision) throws IOException { + super(descriptor, pageReader); + checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); + } + + @Override + protected void readBatch(int rowId, int num, V column) { + int bytesLen = descriptor.getPrimitiveType().getTypeLength(); + WritableBytesVector bytesVector = (WritableBytesVector) column; + for (int i = 0; i < num; i++) { + if (runLenDecoder.readInteger() == maxDefLevel) { + byte[] bytes = readDataBinary(bytesLen).getBytes(); + bytesVector.appendBytes(rowId + i, bytes, 0, bytes.length); + } else { + bytesVector.setNullAt(rowId + i); + } + } + } + + @Override + protected void readBatchFromDictionaryIds( + int rowId, int num, V column, WritableIntVector dictionaryIds) { + WritableBytesVector bytesVector = (WritableBytesVector) column; + for (int i = rowId; i < rowId + num; ++i) { + if (!bytesVector.isNullAt(i)) { + byte[] v = dictionary.decodeToBinary(dictionaryIds.getInt(i)).getBytes(); + bytesVector.appendBytes(i, v, 0, v.length); + } + } + } + + private Binary readDataBinary(int len) { + ByteBuffer buffer = readDataBuffer(len); + if (buffer.hasArray()) { + return Binary.fromConstantByteArray( + buffer.array(), buffer.arrayOffset() + buffer.position(), len); + } else { + byte[] bytes = new byte[len]; + buffer.get(bytes); + return Binary.fromConstantByteArray(bytes); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java new file mode 100644 index 0000000000000..417b1155bbd7b --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/Int64TimestampColumnReader.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.columnar.vector.writable.WritableIntVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableTimestampVector; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.time.Instant; +import java.time.temporal.ChronoUnit; + +/** + * Timestamp {@link org.apache.flink.formats.parquet.vector.reader.ColumnReader} that supports INT64 8 bytes, + * TIMESTAMP_MILLIS is the deprecated ConvertedType counterpart of a TIMESTAMP logical type + * that is UTC normalized and has MILLIS precision. + * + *
<p>
    See https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp + * TIMESTAMP_MILLIS and TIMESTAMP_MICROS are the deprecated ConvertedType. + */ +public class Int64TimestampColumnReader extends AbstractColumnReader { + + private final boolean utcTimestamp; + + private final ChronoUnit chronoUnit; + + public Int64TimestampColumnReader( + boolean utcTimestamp, + ColumnDescriptor descriptor, + PageReader pageReader, + int precision) throws IOException { + super(descriptor, pageReader); + this.utcTimestamp = utcTimestamp; + if (precision <= 3) { + this.chronoUnit = ChronoUnit.MILLIS; + } else if (precision <= 6) { + this.chronoUnit = ChronoUnit.MICROS; + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type with precision: " + + precision + + ", it only supports precision less than 6."); + } + checkTypeName(PrimitiveType.PrimitiveTypeName.INT64); + } + + @Override + protected boolean supportLazyDecode() { + return false; + } + + @Override + protected void readBatch(int rowId, int num, WritableTimestampVector column) { + for (int i = 0; i < num; i++) { + if (runLenDecoder.readInteger() == maxDefLevel) { + ByteBuffer buffer = readDataBuffer(8); + column.setTimestamp(rowId + i, int64ToTimestamp(utcTimestamp, buffer.getLong(), chronoUnit)); + } else { + column.setNullAt(rowId + i); + } + } + } + + @Override + protected void readBatchFromDictionaryIds( + int rowId, + int num, + WritableTimestampVector column, + WritableIntVector dictionaryIds) { + for (int i = rowId; i < rowId + num; ++i) { + if (!column.isNullAt(i)) { + column.setTimestamp(i, decodeInt64ToTimestamp( + utcTimestamp, dictionary, dictionaryIds.getInt(i), chronoUnit)); + } + } + } + + public static TimestampData decodeInt64ToTimestamp( + boolean utcTimestamp, + org.apache.parquet.column.Dictionary dictionary, + int id, + ChronoUnit unit) { + long value = dictionary.decodeToLong(id); + return int64ToTimestamp(utcTimestamp, value, unit); + } + + private static TimestampData int64ToTimestamp( + boolean utcTimestamp, + long interval, + ChronoUnit unit) { + final Instant instant = Instant.EPOCH.plus(interval, unit); + if (utcTimestamp) { + return TimestampData.fromInstant(instant); + } else { + // this applies the local timezone + return TimestampData.fromTimestamp(Timestamp.from(instant)); + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java new file mode 100644 index 0000000000000..a6762d2e175c1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/MapColumnReader.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapArrayVector; +import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.columnar.vector.ColumnVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; + +import java.io.IOException; + +/** + * Map {@link ColumnReader}. + */ +public class MapColumnReader implements ColumnReader { + + private final LogicalType logicalType; + private final ArrayColumnReader keyReader; + private final ArrayColumnReader valueReader; + + public MapColumnReader( + ArrayColumnReader keyReader, ArrayColumnReader valueReader, LogicalType logicalType) { + this.keyReader = keyReader; + this.valueReader = valueReader; + this.logicalType = logicalType; + } + + public void readBatch(int total, ColumnVector column) throws IOException { + HeapMapColumnVector mapColumnVector = (HeapMapColumnVector) column; + MapType mapType = (MapType) logicalType; + // initialize 2 ListColumnVector for keys and values + HeapArrayVector keyArrayColumnVector = new HeapArrayVector(total); + HeapArrayVector valueArrayColumnVector = new HeapArrayVector(total); + // read the keys and values + keyReader.readToVector(total, keyArrayColumnVector); + valueReader.readToVector(total, valueArrayColumnVector); + + // set the related attributes according to the keys and values + mapColumnVector.setKeys(keyArrayColumnVector.child); + mapColumnVector.setValues(valueArrayColumnVector.child); + mapColumnVector.setOffsets(keyArrayColumnVector.offsets); + mapColumnVector.setLengths(keyArrayColumnVector.lengths); + mapColumnVector.setSize(keyArrayColumnVector.getSize()); + for (int i = 0; i < keyArrayColumnVector.getLen(); i++) { + if (keyArrayColumnVector.isNullAt(i)) { + mapColumnVector.setNullAt(i); + } + } + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + readBatch(readNumber, vector); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java new file mode 100644 index 0000000000000..cd1bb59c34d40 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetColumnarRowSplitReader.java @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.ParquetDecimalVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.columnar.ColumnarRowData; +import org.apache.flink.table.data.columnar.vector.ColumnVector; +import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.stream.IntStream; + +import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createColumnReader; +import static org.apache.hudi.table.format.cow.ParquetSplitReaderUtil.createWritableColumnVector; +import static org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.range; +import static org.apache.parquet.hadoop.ParquetFileReader.readFooter; +import static org.apache.parquet.hadoop.ParquetInputFormat.getFilter; + +/** + * This reader is used to read a {@link VectorizedColumnBatch} from input split. + * + *
<p>
    Note: Reference Flink release 1.11.2 + * {@code org.apache.flink.formats.parquet.vector.ParquetColumnarRowSplitReader} + * because it is package scope. + */ +public class ParquetColumnarRowSplitReader implements Closeable { + + private final boolean utcTimestamp; + + private final MessageType fileSchema; + + private final LogicalType[] requestedTypes; + + private final MessageType requestedSchema; + + /** + * The total number of rows this RecordReader will eventually read. The sum of the rows of all + * the row groups. + */ + private final long totalRowCount; + + private final WritableColumnVector[] writableVectors; + + private final VectorizedColumnBatch columnarBatch; + + private final ColumnarRowData row; + + private final int batchSize; + + private ParquetFileReader reader; + + /** + * For each request column, the reader to read this column. This is NULL if this column is + * missing from the file, in which case we populate the attribute with NULL. + */ + private ColumnReader[] columnReaders; + + /** + * The number of rows that have been returned. + */ + private long rowsReturned; + + /** + * The number of rows that have been reading, including the current in flight row group. + */ + private long totalCountLoadedSoFar; + + // the index of the next row to return + private int nextRow; + + // the number of rows in the current batch + private int rowsInBatch; + + public ParquetColumnarRowSplitReader( + boolean utcTimestamp, + boolean caseSensitive, + Configuration conf, + LogicalType[] selectedTypes, + String[] selectedFieldNames, + ColumnBatchGenerator generator, + int batchSize, + Path path, + long splitStart, + long splitLength) throws IOException { + this.utcTimestamp = utcTimestamp; + this.batchSize = batchSize; + // then we need to apply the predicate push down filter + ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength)); + MessageType fileSchema = footer.getFileMetaData().getSchema(); + FilterCompat.Filter filter = getFilter(conf); + List blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); + + this.fileSchema = footer.getFileMetaData().getSchema(); + + Type[] types = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive); + int[] requestedIndices = IntStream.range(0, types.length).filter(i -> types[i] != null).toArray(); + Type[] readTypes = Arrays.stream(requestedIndices).mapToObj(i -> types[i]).toArray(Type[]::new); + + this.requestedTypes = Arrays.stream(requestedIndices).mapToObj(i -> selectedTypes[i]).toArray(LogicalType[]::new); + this.requestedSchema = Types.buildMessage().addFields(readTypes).named("flink-parquet"); + this.reader = new ParquetFileReader( + conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns()); + + long totalRowCount = 0; + for (BlockMetaData block : blocks) { + totalRowCount += block.getRowCount(); + } + this.totalRowCount = totalRowCount; + this.nextRow = 0; + this.rowsInBatch = 0; + this.rowsReturned = 0; + + checkSchema(); + + this.writableVectors = createWritableVectors(); + ColumnVector[] columnVectors = patchedVector(selectedFieldNames.length, createReadableVectors(), requestedIndices); + this.columnarBatch = generator.generate(columnVectors); + this.row = new ColumnarRowData(columnarBatch); + } + + /** + * Patches the given vectors with nulls. + * The vector position that is not requested (or read from file) is patched as null. 
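+ * For example, if fields [a, b, c] are selected but only a and c exist in the file, the patched result is [aVector, null, cVector].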
+ * + * @param fields The total selected fields number + * @param vectors The readable vectors + * @param indices The requested indices from the selected fields + */ + private static ColumnVector[] patchedVector(int fields, ColumnVector[] vectors, int[] indices) { + ColumnVector[] patched = new ColumnVector[fields]; + for (int i = 0; i < indices.length; i++) { + patched[indices[i]] = vectors[i]; + } + return patched; + } + + /** + * Clips `parquetSchema` according to `fieldNames`. + */ + private static Type[] clipParquetSchema( + GroupType parquetSchema, String[] fieldNames, boolean caseSensitive) { + Type[] types = new Type[fieldNames.length]; + if (caseSensitive) { + for (int i = 0; i < fieldNames.length; ++i) { + String fieldName = fieldNames[i]; + types[i] = parquetSchema.containsField(fieldName) ? parquetSchema.getType(fieldName) : null; + } + } else { + Map caseInsensitiveFieldMap = new HashMap<>(); + for (Type type : parquetSchema.getFields()) { + caseInsensitiveFieldMap.compute(type.getName().toLowerCase(Locale.ROOT), + (key, previousType) -> { + if (previousType != null) { + throw new FlinkRuntimeException( + "Parquet with case insensitive mode should have no duplicate key: " + key); + } + return type; + }); + } + for (int i = 0; i < fieldNames.length; ++i) { + Type type = caseInsensitiveFieldMap.get(fieldNames[i].toLowerCase(Locale.ROOT)); + // TODO clip for array,map,row types. + types[i] = type; + } + } + + return types; + } + + private WritableColumnVector[] createWritableVectors() { + WritableColumnVector[] columns = new WritableColumnVector[requestedTypes.length]; + List types = requestedSchema.getFields(); + List descriptors = requestedSchema.getColumns(); + for (int i = 0; i < requestedTypes.length; i++) { + columns[i] = createWritableColumnVector( + batchSize, + requestedTypes[i], + types.get(i), + descriptors); + } + return columns; + } + + /** + * Create readable vectors from writable vectors. + * Especially for decimal, see {@link org.apache.flink.formats.parquet.vector.ParquetDecimalVector}. + */ + private ColumnVector[] createReadableVectors() { + ColumnVector[] vectors = new ColumnVector[writableVectors.length]; + for (int i = 0; i < writableVectors.length; i++) { + vectors[i] = requestedTypes[i].getTypeRoot() == LogicalTypeRoot.DECIMAL + ? new ParquetDecimalVector(writableVectors[i]) + : writableVectors[i]; + } + return vectors; + } + + private void checkSchema() throws IOException, UnsupportedOperationException { + /* + * Check that the requested schema is supported. + */ + for (int i = 0; i < requestedSchema.getFieldCount(); ++i) { + String[] colPath = requestedSchema.getPaths().get(i); + if (fileSchema.containsPath(colPath)) { + ColumnDescriptor fd = fileSchema.getColumnDescription(colPath); + if (!fd.equals(requestedSchema.getColumns().get(i))) { + throw new UnsupportedOperationException("Schema evolution not supported."); + } + } else { + if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) { + // Column is missing in data but the required data is non-nullable. This file is invalid. + throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath)); + } + } + } + } + + /** + * Method used to check if the end of the input is reached. + * + * @return True if the end is reached, otherwise false. + * @throws IOException Thrown, if an I/O error occurred. 
+ */ + public boolean reachedEnd() throws IOException { + return !ensureBatch(); + } + + public ColumnarRowData nextRecord() { + // return the next row + row.setRowId(this.nextRow++); + return row; + } + + /** + * Checks if there is at least one row left in the batch to return. If no more row are + * available, it reads another batch of rows. + * + * @return Returns true if there is one more row to return, false otherwise. + * @throws IOException throw if an exception happens while reading a batch. + */ + private boolean ensureBatch() throws IOException { + if (nextRow >= rowsInBatch) { + // No more rows available in the Rows array. + nextRow = 0; + // Try to read the next batch if rows from the file. + return nextBatch(); + } + // there is at least one Row left in the Rows array. + return true; + } + + /** + * Advances to the next batch of rows. Returns false if there are no more. + */ + private boolean nextBatch() throws IOException { + for (WritableColumnVector v : writableVectors) { + v.reset(); + } + columnarBatch.setNumRows(0); + if (rowsReturned >= totalRowCount) { + return false; + } + if (rowsReturned == totalCountLoadedSoFar) { + readNextRowGroup(); + } + + int num = (int) Math.min(batchSize, totalCountLoadedSoFar - rowsReturned); + for (int i = 0; i < columnReaders.length; ++i) { + //noinspection unchecked + columnReaders[i].readToVector(num, writableVectors[i]); + } + rowsReturned += num; + columnarBatch.setNumRows(num); + rowsInBatch = num; + return true; + } + + private void readNextRowGroup() throws IOException { + PageReadStore pages = reader.readNextRowGroup(); + if (pages == null) { + throw new IOException("expecting more rows but reached last block. Read " + + rowsReturned + " out of " + totalRowCount); + } + List types = requestedSchema.getFields(); + List columns = requestedSchema.getColumns(); + columnReaders = new ColumnReader[types.size()]; + for (int i = 0; i < types.size(); ++i) { + columnReaders[i] = createColumnReader( + utcTimestamp, + requestedTypes[i], + types.get(i), + columns, + pages); + } + totalCountLoadedSoFar += pages.getRowCount(); + } + + /** + * Seek to a particular row number. + */ + public void seekToRow(long rowCount) throws IOException { + if (totalCountLoadedSoFar != 0) { + throw new UnsupportedOperationException("Only support seek at first."); + } + + List blockMetaData = reader.getRowGroups(); + + for (BlockMetaData metaData : blockMetaData) { + if (metaData.getRowCount() > rowCount) { + break; + } else { + reader.skipNextRowGroup(); + rowsReturned += metaData.getRowCount(); + totalCountLoadedSoFar += metaData.getRowCount(); + rowsInBatch = (int) metaData.getRowCount(); + nextRow = (int) metaData.getRowCount(); + rowCount -= metaData.getRowCount(); + } + } + for (int i = 0; i < rowCount; i++) { + boolean end = reachedEnd(); + if (end) { + throw new RuntimeException("Seek to many rows."); + } + nextRecord(); + } + } + + @Override + public void close() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + } + + /** + * Interface to gen {@link VectorizedColumnBatch}. 
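+ * Implementations decide how the readable vectors are assembled into a batch, e.g. so that callers can append vectors for partition values that are not stored in the parquet file.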
+ */ + public interface ColumnBatchGenerator { + VectorizedColumnBatch generate(ColumnVector[] readVectors); + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java new file mode 100644 index 0000000000000..e96cf22d29ef1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReader.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.Dictionary; + +import java.io.IOException; + +/** + * The interface to wrap the underlying Parquet dictionary and non dictionary encoded page reader. + */ +public interface ParquetDataColumnReader { + + /** + * Initialize the reader by page data. 
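+ * Called once per data page, before any of the read methods below are used.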
+ * + * @param valueCount value count + * @param in page data + * @throws IOException + */ + void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException; + + /** + * @return the next Dictionary ID from the page + */ + int readValueDictionaryId(); + + /** + * @return the next Long from the page + */ + long readLong(); + + /** + * @return the next Integer from the page + */ + int readInteger(); + + /** + * @return the next SmallInt from the page + */ + int readSmallInt(); + + /** + * @return the next TinyInt from the page + */ + int readTinyInt(); + + /** + * @return the next Float from the page + */ + float readFloat(); + + /** + * @return the next Boolean from the page + */ + boolean readBoolean(); + + /** + * @return the next String from the page + */ + byte[] readString(); + + /** + * @return the next Varchar from the page + */ + byte[] readVarchar(); + + /** + * @return the next Char from the page + */ + byte[] readChar(); + + /** + * @return the next Bytes from the page + */ + byte[] readBytes(); + + /** + * @return the next Decimal from the page + */ + byte[] readDecimal(); + + /** + * @return the next Double from the page + */ + double readDouble(); + + /** + * @return the next TimestampData from the page + */ + TimestampData readTimestamp(); + + /** + * @return is data valid + */ + boolean isValid(); + + /** + * @return the underlying dictionary if current reader is dictionary encoded + */ + Dictionary getDictionary(); + + /** + * @param id in dictionary + * @return the Bytes from the dictionary by id + */ + byte[] readBytes(int id); + + /** + * @param id in dictionary + * @return the Float from the dictionary by id + */ + float readFloat(int id); + + /** + * @param id in dictionary + * @return the Double from the dictionary by id + */ + double readDouble(int id); + + /** + * @param id in dictionary + * @return the Integer from the dictionary by id + */ + int readInteger(int id); + + /** + * @param id in dictionary + * @return the Long from the dictionary by id + */ + long readLong(int id); + + /** + * @param id in dictionary + * @return the Small Int from the dictionary by id + */ + int readSmallInt(int id); + + /** + * @param id in dictionary + * @return the tiny int from the dictionary by id + */ + int readTinyInt(int id); + + /** + * @param id in dictionary + * @return the Boolean from the dictionary by id + */ + boolean readBoolean(int id); + + /** + * @param id in dictionary + * @return the Decimal from the dictionary by id + */ + byte[] readDecimal(int id); + + /** + * @param id in dictionary + * @return the TimestampData from the dictionary by id + */ + TimestampData readTimestamp(int id); + + /** + * @param id in dictionary + * @return the String from the dictionary by id + */ + byte[] readString(int id); + + /** + * @param id in dictionary + * @return the Varchar from the dictionary by id + */ + byte[] readVarchar(int id); + + /** + * @param id in dictionary + * @return the Char from the dictionary by id + */ + byte[] readChar(int id); +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java new file mode 100644 index 0000000000000..861d5cb00bbe7 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/ParquetDataColumnReaderFactory.java @@ -0,0 +1,304 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.TimestampData; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.sql.Timestamp; + +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.JULIAN_EPOCH_OFFSET_DAYS; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.MILLIS_IN_DAY; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_MILLISECOND; +import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_SECOND; + +/** + * Parquet file has self-describing schema which may differ from the user required schema (e.g. + * schema evolution). This factory is used to retrieve user required typed data via corresponding + * reader which reads the underlying data. + */ +public final class ParquetDataColumnReaderFactory { + + private ParquetDataColumnReaderFactory() { + } + + /** + * default reader for {@link ParquetDataColumnReader}. + */ + public static class DefaultParquetDataColumnReader implements ParquetDataColumnReader { + protected ValuesReader valuesReader; + protected Dictionary dict; + + // After the data is read in the parquet type, isValid will be set to true if the data can + // be returned in the type defined in HMS. Otherwise isValid is set to false. 
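+ // (HMS refers to the Hive Metastore; this reader hierarchy was adapted from Hive's Parquet support, hence the terminology.)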
+ boolean isValid = true; + + public DefaultParquetDataColumnReader(ValuesReader valuesReader) { + this.valuesReader = valuesReader; + } + + public DefaultParquetDataColumnReader(Dictionary dict) { + this.dict = dict; + } + + @Override + public void initFromPage(int i, ByteBufferInputStream in) throws IOException { + valuesReader.initFromPage(i, in); + } + + @Override + public boolean readBoolean() { + return valuesReader.readBoolean(); + } + + @Override + public boolean readBoolean(int id) { + return dict.decodeToBoolean(id); + } + + @Override + public byte[] readString(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readString() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar() { + // we need to enforce the size here even the types are the same + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readVarchar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readChar() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readChar(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readBytes() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readBytes(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public byte[] readDecimal() { + return valuesReader.readBytes().getBytesUnsafe(); + } + + @Override + public byte[] readDecimal(int id) { + return dict.decodeToBinary(id).getBytesUnsafe(); + } + + @Override + public float readFloat() { + return valuesReader.readFloat(); + } + + @Override + public float readFloat(int id) { + return dict.decodeToFloat(id); + } + + @Override + public double readDouble() { + return valuesReader.readDouble(); + } + + @Override + public double readDouble(int id) { + return dict.decodeToDouble(id); + } + + @Override + public TimestampData readTimestamp() { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public TimestampData readTimestamp(int id) { + throw new RuntimeException("Unsupported operation"); + } + + @Override + public int readInteger() { + return valuesReader.readInteger(); + } + + @Override + public int readInteger(int id) { + return dict.decodeToInt(id); + } + + @Override + public boolean isValid() { + return isValid; + } + + @Override + public long readLong(int id) { + return dict.decodeToLong(id); + } + + @Override + public long readLong() { + return valuesReader.readLong(); + } + + @Override + public int readSmallInt() { + return valuesReader.readInteger(); + } + + @Override + public int readSmallInt(int id) { + return dict.decodeToInt(id); + } + + @Override + public int readTinyInt() { + return valuesReader.readInteger(); + } + + @Override + public int readTinyInt(int id) { + return dict.decodeToInt(id); + } + + @Override + public int readValueDictionaryId() { + return valuesReader.readValueDictionaryId(); + } + + public void skip() { + valuesReader.skip(); + } + + @Override + public Dictionary getDictionary() { + return dict; + } + } + + /** + * The reader who reads from the underlying Timestamp value value. 
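+ * An INT96 value is 12 bytes in little-endian order: 8 bytes of nanos-of-day followed by a 4-byte Julian day, which is how {@code convert(Binary)} decodes it.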
+ */ + public static class TypesFromInt96PageReader extends DefaultParquetDataColumnReader { + private final boolean isUtcTimestamp; + + public TypesFromInt96PageReader(ValuesReader realReader, boolean isUtcTimestamp) { + super(realReader); + this.isUtcTimestamp = isUtcTimestamp; + } + + public TypesFromInt96PageReader(Dictionary dict, boolean isUtcTimestamp) { + super(dict); + this.isUtcTimestamp = isUtcTimestamp; + } + + private TimestampData convert(Binary binary) { + ByteBuffer buf = binary.toByteBuffer(); + buf.order(ByteOrder.LITTLE_ENDIAN); + long timeOfDayNanos = buf.getLong(); + int julianDay = buf.getInt(); + return int96ToTimestamp(isUtcTimestamp, timeOfDayNanos, julianDay); + } + + @Override + public TimestampData readTimestamp(int id) { + return convert(dict.decodeToBinary(id)); + } + + @Override + public TimestampData readTimestamp() { + return convert(valuesReader.readBytes()); + } + } + + private static ParquetDataColumnReader getDataColumnReaderByTypeHelper( + boolean isDictionary, + PrimitiveType parquetType, + Dictionary dictionary, + ValuesReader valuesReader, + boolean isUtcTimestamp) { + if (parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) { + return isDictionary + ? new TypesFromInt96PageReader(dictionary, isUtcTimestamp) + : new TypesFromInt96PageReader(valuesReader, isUtcTimestamp); + } else { + return isDictionary + ? new DefaultParquetDataColumnReader(dictionary) + : new DefaultParquetDataColumnReader(valuesReader); + } + } + + public static ParquetDataColumnReader getDataColumnReaderByTypeOnDictionary( + PrimitiveType parquetType, Dictionary realReader, boolean isUtcTimestamp) { + return getDataColumnReaderByTypeHelper(true, parquetType, realReader, null, isUtcTimestamp); + } + + public static ParquetDataColumnReader getDataColumnReaderByType( + PrimitiveType parquetType, ValuesReader realReader, boolean isUtcTimestamp) { + return getDataColumnReaderByTypeHelper( + false, parquetType, null, realReader, isUtcTimestamp); + } + + private static TimestampData int96ToTimestamp( + boolean utcTimestamp, long nanosOfDay, int julianDay) { + long millisecond = julianDayToMillis(julianDay) + (nanosOfDay / NANOS_PER_MILLISECOND); + + if (utcTimestamp) { + int nanoOfMillisecond = (int) (nanosOfDay % NANOS_PER_MILLISECOND); + return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + } else { + Timestamp timestamp = new Timestamp(millisecond); + timestamp.setNanos((int) (nanosOfDay % NANOS_PER_SECOND)); + return TimestampData.fromTimestamp(timestamp); + } + } + + private static long julianDayToMillis(int julianDay) { + return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY; + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java new file mode 100644 index 0000000000000..79b50487f13c1 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RowColumnReader.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector; + +import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; + +import java.io.IOException; +import java.util.List; + +/** + * Row {@link ColumnReader}. + */ +public class RowColumnReader implements ColumnReader { + + private final List fieldReaders; + + public RowColumnReader(List fieldReaders) { + this.fieldReaders = fieldReaders; + } + + @Override + public void readToVector(int readNumber, WritableColumnVector vector) throws IOException { + HeapRowColumnVector rowColumnVector = (HeapRowColumnVector) vector; + WritableColumnVector[] vectors = rowColumnVector.vectors; + // row vector null array + boolean[] isNulls = new boolean[readNumber]; + for (int i = 0; i < vectors.length; i++) { + fieldReaders.get(i).readToVector(readNumber, vectors[i]); + + for (int j = 0; j < readNumber; j++) { + if (i == 0) { + isNulls[j] = vectors[i].isNullAt(j); + } else { + isNulls[j] = isNulls[j] && vectors[i].isNullAt(j); + } + if (i == vectors.length - 1 && isNulls[j]) { + // rowColumnVector[j] is null only when all fields[j] of rowColumnVector[j] is + // null + rowColumnVector.setNullAt(j); + } + } + } + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java new file mode 100644 index 0000000000000..4371ec30ae4c6 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/main/java/org/apache/hudi/table/format/cow/vector/reader/RunLengthDecoder.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.format.cow.vector.reader; + +import org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector; +import org.apache.flink.table.data.columnar.vector.writable.WritableIntVector; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.values.bitpacking.BytePacker; +import org.apache.parquet.column.values.bitpacking.Packer; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.io.ParquetDecodingException; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Run length decoder for data and dictionary ids. + * See https://github.com/apache/parquet-format/blob/master/Encodings.md + * See {@link RunLengthBitPackingHybridDecoder}. + * + *
<p>
    Note: Reference Flink release 1.11.2 + * {@code org.apache.flink.formats.parquet.vector.reader.RunLengthDecoder} + * because it is package scope. + */ +final class RunLengthDecoder { + + /** + * If true, the bit width is fixed. This decoder is used in different places and this also + * controls if we need to read the bitwidth from the beginning of the data stream. + */ + private final boolean fixedWidth; + private final boolean readLength; + + // Encoded data. + private ByteBufferInputStream in; + + // bit/byte width of decoded data and utility to batch unpack them. + private int bitWidth; + private int bytesWidth; + private BytePacker packer; + + // Current decoding mode and values + MODE mode; + int currentCount; + int currentValue; + + // Buffer of decoded values if the values are PACKED. + int[] currentBuffer = new int[16]; + int currentBufferIdx = 0; + + RunLengthDecoder() { + this.fixedWidth = false; + this.readLength = false; + } + + RunLengthDecoder(int bitWidth) { + this.fixedWidth = true; + this.readLength = bitWidth != 0; + initWidthAndPacker(bitWidth); + } + + RunLengthDecoder(int bitWidth, boolean readLength) { + this.fixedWidth = true; + this.readLength = readLength; + initWidthAndPacker(bitWidth); + } + + /** + * Init from input stream. + */ + void initFromStream(int valueCount, ByteBufferInputStream in) throws IOException { + this.in = in; + if (fixedWidth) { + // initialize for repetition and definition levels + if (readLength) { + int length = readIntLittleEndian(); + this.in = in.sliceStream(length); + } + } else { + // initialize for values + if (in.available() > 0) { + initWidthAndPacker(in.read()); + } + } + if (bitWidth == 0) { + // 0 bit width, treat this as an RLE run of valueCount number of 0's. + this.mode = MODE.RLE; + this.currentCount = valueCount; + this.currentValue = 0; + } else { + this.currentCount = 0; + } + } + + /** + * Initializes the internal state for decoding ints of `bitWidth`. + */ + private void initWidthAndPacker(int bitWidth) { + Preconditions.checkArgument(bitWidth >= 0 && bitWidth <= 32, "bitWidth must be >= 0 and <= 32"); + this.bitWidth = bitWidth; + this.bytesWidth = BytesUtils.paddedByteCountFromBits(bitWidth); + this.packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth); + } + + int readInteger() { + if (this.currentCount == 0) { + this.readNextGroup(); + } + + this.currentCount--; + switch (mode) { + case RLE: + return this.currentValue; + case PACKED: + return this.currentBuffer[currentBufferIdx++]; + default: + throw new AssertionError(); + } + } + + /** + * Decoding for dictionary ids. The IDs are populated into `values` and the nullability is + * populated into `nulls`. + */ + void readDictionaryIds( + int total, + WritableIntVector values, + WritableColumnVector nulls, + int rowId, + int level, + RunLengthDecoder data) { + int left = total; + while (left > 0) { + if (this.currentCount == 0) { + this.readNextGroup(); + } + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + if (currentValue == level) { + data.readDictionaryIdData(n, values, rowId); + } else { + nulls.setNulls(rowId, n); + } + break; + case PACKED: + for (int i = 0; i < n; ++i) { + if (currentBuffer[currentBufferIdx++] == level) { + values.setInt(rowId + i, data.readInteger()); + } else { + nulls.setNullAt(rowId + i); + } + } + break; + default: + throw new AssertionError(); + } + rowId += n; + left -= n; + currentCount -= n; + } + } + + /** + * It is used to decode dictionary IDs. 
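+ * Ids arrive in runs produced by {@code readNextGroup()}: an even hybrid-encoding header such as 0x06 (binary 110) denotes an RLE run of 3 repeated ids, while an odd header denotes bit-packed groups of 8 ids.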
+ */ + private void readDictionaryIdData(int total, WritableIntVector c, int rowId) { + int left = total; + while (left > 0) { + if (this.currentCount == 0) { + this.readNextGroup(); + } + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + c.setInts(rowId, n, currentValue); + break; + case PACKED: + c.setInts(rowId, n, currentBuffer, currentBufferIdx); + currentBufferIdx += n; + break; + default: + throw new AssertionError(); + } + rowId += n; + left -= n; + currentCount -= n; + } + } + + /** + * Reads the next varint encoded int. + */ + private int readUnsignedVarInt() throws IOException { + int value = 0; + int shift = 0; + int b; + do { + b = in.read(); + value |= (b & 0x7F) << shift; + shift += 7; + } while ((b & 0x80) != 0); + return value; + } + + /** + * Reads the next 4 byte little endian int. + */ + private int readIntLittleEndian() throws IOException { + int ch4 = in.read(); + int ch3 = in.read(); + int ch2 = in.read(); + int ch1 = in.read(); + return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + ch4); + } + + /** + * Reads the next byteWidth little endian int. + */ + private int readIntLittleEndianPaddedOnBitWidth() throws IOException { + switch (bytesWidth) { + case 0: + return 0; + case 1: + return in.read(); + case 2: { + int ch2 = in.read(); + int ch1 = in.read(); + return (ch1 << 8) + ch2; + } + case 3: { + int ch3 = in.read(); + int ch2 = in.read(); + int ch1 = in.read(); + return (ch1 << 16) + (ch2 << 8) + ch3; + } + case 4: { + return readIntLittleEndian(); + } + default: + throw new RuntimeException("Unreachable"); + } + } + + /** + * Reads the next group. + */ + void readNextGroup() { + try { + int header = readUnsignedVarInt(); + this.mode = (header & 1) == 0 ? MODE.RLE : MODE.PACKED; + switch (mode) { + case RLE: + this.currentCount = header >>> 1; + this.currentValue = readIntLittleEndianPaddedOnBitWidth(); + return; + case PACKED: + int numGroups = header >>> 1; + this.currentCount = numGroups * 8; + + if (this.currentBuffer.length < this.currentCount) { + this.currentBuffer = new int[this.currentCount]; + } + currentBufferIdx = 0; + int valueIndex = 0; + while (valueIndex < this.currentCount) { + // values are bit packed 8 at a time, so reading bitWidth will always work + ByteBuffer buffer = in.slice(bitWidth); + this.packer.unpack8Values(buffer, buffer.position(), this.currentBuffer, valueIndex); + valueIndex += 8; + } + return; + default: + throw new ParquetDecodingException("not a valid mode " + this.mode); + } + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read from input stream", e); + } + } + + enum MODE { + RLE, + PACKED + } +} + diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java new file mode 100644 index 0000000000000..c0d83e6096e3c --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/OutputAdapter.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.runtime.watermarkstatus.WatermarkStatus; + +/** + * Adapter clazz for {@link Output}. + */ +public interface OutputAdapter extends Output { + @Override + default void emitWatermarkStatus(WatermarkStatus watermarkStatus) { + // no operation + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java new file mode 100644 index 0000000000000..c903ec2ed4080 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/StateInitializationContextAdapter.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.runtime.state.StateInitializationContext; + +import java.util.OptionalLong; + +/** + * Adapter clazz for {@link StateInitializationContext}. + */ +public interface StateInitializationContextAdapter extends StateInitializationContext { + default OptionalLong getRestoredCheckpointId() { + return OptionalLong.empty(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java new file mode 100644 index 0000000000000..4461c28943d3a --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/StreamingRuntimeContextAdapter.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.api.common.accumulators.Accumulator; +import org.apache.flink.metrics.groups.OperatorMetricGroup; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.execution.Environment; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; + +import java.util.Map; + +/** + * Adapter clazz for {@link StreamingRuntimeContext}. + */ +public class StreamingRuntimeContextAdapter extends StreamingRuntimeContext { + + public StreamingRuntimeContextAdapter(AbstractStreamOperator operator, Environment env, + Map> accumulators) { + super(operator, env, accumulators); + } + + @Override + public OperatorMetricGroup getMetricGroup() { + return UnregisteredMetricsGroup.createOperatorMetricGroup(); + } +} diff --git a/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java new file mode 100644 index 0000000000000..e65437609a21e --- /dev/null +++ b/hudi-flink-datasource/hudi-flink1.15.x/src/test/java/org/apache/hudi/adapter/TestTableEnvs.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.adapter; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; + +/** + * TableEnv for test goals. + */ +public class TestTableEnvs { + + public static TableEnvironment getBatchTableEnv() { + Configuration conf = new Configuration(); + // for batch upsert use cases: current suggestion is to disable these 2 options, + // from 1.14, flink runtime execution mode has switched from streaming + // to batch for batch execution mode(before that, both streaming and batch use streaming execution mode), + // current batch execution mode has these limitations: + // + // 1. the keyed stream default to always sort the inputs by key; + // 2. 
the batch state-backend requires the inputs sort by state key + // + // For our hudi batch pipeline upsert case, we rely on the consuming sequence for index records and data records, + // the index records must be loaded first before data records for BucketAssignFunction to keep upsert semantics correct, + // so we suggest disabling these 2 options to use streaming state-backend for batch execution mode + // to keep the strategy before 1.14. + conf.setBoolean("execution.sorted-inputs.enabled", false); + conf.setBoolean("execution.batch-state-backend.enabled", false); + StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(conf); + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + return StreamTableEnvironment.create(execEnv, settings); + } +} diff --git a/hudi-flink-datasource/pom.xml b/hudi-flink-datasource/pom.xml new file mode 100644 index 0000000000000..6c36965a076b0 --- /dev/null +++ b/hudi-flink-datasource/pom.xml @@ -0,0 +1,42 @@ + + + + + hudi + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-flink-datasource + 0.12.2-dt-SNAPSHOT + pom + + + ${project.parent.basedir} + + + + hudi-flink1.13.x + hudi-flink1.14.x + hudi-flink1.15.x + hudi-flink + + + diff --git a/hudi-flink/pom.xml b/hudi-flink/pom.xml deleted file mode 100644 index 4b8cfd78e1009..0000000000000 --- a/hudi-flink/pom.xml +++ /dev/null @@ -1,177 +0,0 @@ - - - - - - - hudi - org.apache.hudi - 0.6.1-SNAPSHOT - - 4.0.0 - - hudi-flink_${scala.binary.version} - jar - - - ${project.parent.basedir} - - - - - - org.jacoco - jacoco-maven-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - 1.8 - 1.8 - - - - org.apache.maven.plugins - maven-jar-plugin - 3.1.2 - - - - test-jar - - - - - - org.apache.rat - apache-rat-plugin - - - - - - src/main/resources - - - src/test/resources - - - - - - - - org.apache.hudi - hudi-common - ${project.version} - - - org.apache.hudi - hudi-client-common - ${project.version} - - - org.apache.hudi - hudi-flink-client - ${project.version} - - - - - org.apache.flink - flink-streaming-java_${scala.binary.version} - compile - - - org.apache.flink - flink-clients_${scala.binary.version} - compile - - - com.esotericsoftware.kryo - kryo - - - com.esotericsoftware.minlog - minlog - - - - - org.apache.flink - flink-connector-kafka_${scala.binary.version} - compile - - - org.apache.kafka - kafka-clients - ${kafka.version} - - - - - org.apache.hadoop - hadoop-common - compile - - - org.apache.hadoop - hadoop-hdfs - compile - - - org.apache.hadoop - hadoop-auth - compile - - - - - org.apache.avro - avro - compile - - - - - org.apache.parquet - parquet-avro - compile - - - - - org.apache.hadoop - hadoop-mapreduce-client-core - compile - - - - com.beust - jcommander - compile - - - com.twitter - bijection-avro_${scala.binary.version} - 0.9.7 - - - diff --git a/hudi-flink/src/main/java/org/apache/hudi/HoodieFlinkStreamer.java b/hudi-flink/src/main/java/org/apache/hudi/HoodieFlinkStreamer.java deleted file mode 100644 index 0c9991da3db4a..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/HoodieFlinkStreamer.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
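For illustration, a minimal usage sketch of the getBatchTableEnv() helper added in TestTableEnvs above, assuming a test that only needs a batch TableEnvironment with the two batch-mode options already disabled; the BatchTableEnvUsageSketch class name, the src table, and the datagen options are hypothetical.

import org.apache.flink.table.api.TableEnvironment;
import org.apache.hudi.adapter.TestTableEnvs;

public class BatchTableEnvUsageSketch {
  public static void main(String[] args) {
    // Batch-mode environment with execution.sorted-inputs.enabled and
    // execution.batch-state-backend.enabled turned off, as configured in TestTableEnvs.
    TableEnvironment tEnv = TestTableEnvs.getBatchTableEnv();
    tEnv.executeSql(
        "CREATE TABLE src (id INT, name STRING) WITH ("
            + "'connector' = 'datagen', 'number-of-rows' = '5')");
    tEnv.executeSql("SELECT * FROM src").print();
  }
}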
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.operator.InstantGenerateOperator; -import org.apache.hudi.operator.KeyedWriteProcessFunction; -import org.apache.hudi.operator.KeyedWriteProcessOperator; -import org.apache.hudi.sink.CommitSink; -import org.apache.hudi.source.JsonStringToHoodieRecordMapFunction; -import org.apache.hudi.util.StreamerUtil; - -import com.beust.jcommander.IStringConverter; -import com.beust.jcommander.JCommander; -import com.beust.jcommander.Parameter; -import com.beust.jcommander.ParameterException; -import org.apache.flink.api.common.serialization.SimpleStringSchema; -import org.apache.flink.api.common.typeinfo.TypeHint; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.filesystem.FsStateBackend; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; - -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; -import java.util.Properties; - -/** - * An Utility which can incrementally consume data from Kafka and apply it to the target table. - * currently, it only support COW table and insert, upsert operation. - */ -public class HoodieFlinkStreamer { - public static void main(String[] args) throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - - final Config cfg = new Config(); - JCommander cmd = new JCommander(cfg, null, args); - if (cfg.help || args.length == 0) { - cmd.usage(); - System.exit(1); - } - env.enableCheckpointing(cfg.checkpointInterval); - env.getConfig().setGlobalJobParameters(cfg); - // We use checkpoint to trigger write operation, including instant generating and committing, - // There can only be one checkpoint at one time. 
- env.getCheckpointConfig().setMaxConcurrentCheckpoints(1); - env.disableOperatorChaining(); - - if (cfg.flinkCheckPointPath != null) { - env.setStateBackend(new FsStateBackend(cfg.flinkCheckPointPath)); - } - - Properties kafkaProps = StreamerUtil.getKafkaProps(cfg); - - // Read from kafka source - DataStream inputRecords = - env.addSource(new FlinkKafkaConsumer<>(cfg.kafkaTopic, new SimpleStringSchema(), kafkaProps)) - .filter(Objects::nonNull) - .map(new JsonStringToHoodieRecordMapFunction(cfg)) - .name("kafka_to_hudi_record") - .uid("kafka_to_hudi_record_uid"); - - // InstantGenerateOperator helps to emit globally unique instantTime, it must be executed in one parallelism - inputRecords.transform(InstantGenerateOperator.NAME, TypeInformation.of(HoodieRecord.class), new InstantGenerateOperator()) - .name("instant_generator") - .uid("instant_generator_id") - .setParallelism(1) - - // Keyby partition path, to avoid multiple subtasks writing to a partition at the same time - .keyBy(HoodieRecord::getPartitionPath) - - // write operator, where the write operation really happens - .transform(KeyedWriteProcessOperator.NAME, TypeInformation.of(new TypeHint, Integer>>() { - }), new KeyedWriteProcessOperator(new KeyedWriteProcessFunction())) - .name("write_process") - .uid("write_process_uid") - .setParallelism(env.getParallelism()) - - // Commit can only be executed once, so make it one parallelism - .addSink(new CommitSink()) - .name("commit_sink") - .uid("commit_sink_uid") - .setParallelism(1); - - env.execute(cfg.targetTableName); - } - - public static class Config extends Configuration { - @Parameter(names = {"--kafka-topic"}, description = "kafka topic", required = true) - public String kafkaTopic; - - @Parameter(names = {"--kafka-group-id"}, description = "kafka consumer group id", required = true) - public String kafkaGroupId; - - @Parameter(names = {"--kafka-bootstrap-servers"}, description = "kafka bootstrap.servers", required = true) - public String kafkaBootstrapServers; - - @Parameter(names = {"--flink-checkpoint-path"}, description = "flink checkpoint path") - public String flinkCheckPointPath; - - @Parameter(names = {"--flink-block-retry-times"}, description = "Times to retry when latest instant has not completed") - public String blockRetryTime = "10"; - - @Parameter(names = {"--flink-block-retry-interval"}, description = "Seconds between two tries when latest instant has not completed") - public String blockRetryInterval = "1"; - - @Parameter(names = {"--target-base-path"}, - description = "base path for the target hoodie table. " - + "(Will be created if did not exist first time around. If exists, expected to be a hoodie table)", - required = true) - public String targetBasePath; - - @Parameter(names = {"--target-table"}, description = "name of the target table in Hive", required = true) - public String targetTableName; - - @Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ", required = true) - public String tableType; - - @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " - + "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are " - + "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. 
For sources, refer" - + "to individual classes, for supported properties.") - public String propsFilePath = - "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties"; - - @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " - + "(using the CLI parameter \"--props\") can also be passed command line using this parameter.") - public List configs = new ArrayList<>(); - - @Parameter(names = {"--source-ordering-field"}, description = "Field within source record to decide how" - + " to break ties between records with same key in input data. Default: 'ts' holding unix timestamp of record") - public String sourceOrderingField = "ts"; - - @Parameter(names = {"--payload-class"}, description = "subclass of HoodieRecordPayload, that works off " - + "a GenericRecord. Implement your own, if you want to do something other than overwriting existing value") - public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); - - @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input " - + "is purely new data/inserts to gain speed)", converter = OperationConverter.class) - public WriteOperationType operation = WriteOperationType.UPSERT; - - @Parameter(names = {"--filter-dupes"}, - description = "Should duplicate records from source be dropped/filtered out before insert/bulk-insert") - public Boolean filterDupes = false; - - @Parameter(names = {"--commit-on-errors"}, description = "Commit even when some records failed to be written") - public Boolean commitOnErrors = false; - - /** - * Flink checkpoint interval. - */ - @Parameter(names = {"--checkpoint-interval"}, description = "Flink checkpoint interval.") - public Long checkpointInterval = 1000 * 5L; - - @Parameter(names = {"--help", "-h"}, help = true) - public Boolean help = false; - } - - private static class OperationConverter implements IStringConverter { - - @Override - public WriteOperationType convert(String value) throws ParameterException { - return WriteOperationType.valueOf(value); - } - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/exception/HoodieFlinkStreamerException.java b/hudi-flink/src/main/java/org/apache/hudi/exception/HoodieFlinkStreamerException.java deleted file mode 100644 index 0aadce83dedf1..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/exception/HoodieFlinkStreamerException.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.exception; - -public class HoodieFlinkStreamerException extends HoodieException { - - public HoodieFlinkStreamerException(String msg, Throwable e) { - super(msg, e); - } - - public HoodieFlinkStreamerException(String msg) { - super(msg); - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/operator/InstantGenerateOperator.java b/hudi-flink/src/main/java/org/apache/hudi/operator/InstantGenerateOperator.java deleted file mode 100644 index 165eeb087b207..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/operator/InstantGenerateOperator.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.operator; - -import org.apache.hudi.HoodieFlinkStreamer; -import org.apache.hudi.client.FlinkTaskContextSupplier; -import org.apache.hudi.client.HoodieFlinkWriteClient; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.client.common.TaskContextSupplier; -import org.apache.hudi.common.config.SerializableConfiguration; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.util.StreamerUtil; - -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.concurrent.TimeUnit; - -/** - * Operator helps to generate globally unique instant, it must be executed in one parallelism. Before generate a new - * instant , {@link InstantGenerateOperator} will always check whether the last instant has completed. if it is - * completed, a new instant will be generated immediately, otherwise, wait and check the state of last instant until - * time out and throw an exception. 
- */ -public class InstantGenerateOperator extends AbstractStreamOperator implements OneInputStreamOperator { - - private static final Logger LOG = LoggerFactory.getLogger(InstantGenerateOperator.class); - public static final String NAME = "InstantGenerateOperator"; - - private HoodieFlinkStreamer.Config cfg; - private HoodieFlinkWriteClient writeClient; - private SerializableConfiguration serializableHadoopConf; - private transient FileSystem fs; - private String latestInstant = ""; - private List latestInstantList = new ArrayList<>(1); - private transient ListState latestInstantState; - private List bufferedRecords = new LinkedList(); - private transient ListState recordsState; - private Integer retryTimes; - private Integer retryInterval; - - @Override - public void processElement(StreamRecord streamRecord) throws Exception { - if (streamRecord.getValue() != null) { - bufferedRecords.add(streamRecord); - output.collect(streamRecord); - } - } - - @Override - public void open() throws Exception { - super.open(); - // get configs from runtimeContext - cfg = (HoodieFlinkStreamer.Config) getRuntimeContext().getExecutionConfig().getGlobalJobParameters(); - - // retry times - retryTimes = Integer.valueOf(cfg.blockRetryTime); - - // retry interval - retryInterval = Integer.valueOf(cfg.blockRetryInterval); - - // hadoopConf - serializableHadoopConf = new SerializableConfiguration(StreamerUtil.getHadoopConf()); - - // Hadoop FileSystem - fs = FSUtils.getFs(cfg.targetBasePath, serializableHadoopConf.get()); - - TaskContextSupplier taskContextSupplier = new FlinkTaskContextSupplier(null); - - // writeClient - writeClient = new HoodieFlinkWriteClient(new HoodieFlinkEngineContext(taskContextSupplier), StreamerUtil.getHoodieClientConfig(cfg), true); - - // init table, create it if not exists. 
- initTable(); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - super.prepareSnapshotPreBarrier(checkpointId); - // check whether the last instant is completed, if not, wait 10s and then throws an exception - if (!StringUtils.isNullOrEmpty(latestInstant)) { - doCheck(); - // last instant completed, set it empty - latestInstant = ""; - } - - // no data no new instant - if (!bufferedRecords.isEmpty()) { - latestInstant = startNewInstant(checkpointId); - } - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - // instantState - ListStateDescriptor latestInstantStateDescriptor = new ListStateDescriptor("latestInstant", String.class); - latestInstantState = context.getOperatorStateStore().getListState(latestInstantStateDescriptor); - - // recordState - ListStateDescriptor recordsStateDescriptor = new ListStateDescriptor("recordsState", StreamRecord.class); - recordsState = context.getOperatorStateStore().getListState(recordsStateDescriptor); - - if (context.isRestored()) { - Iterator latestInstantIterator = latestInstantState.get().iterator(); - latestInstantIterator.forEachRemaining(x -> latestInstant = x); - LOG.info("InstantGenerateOperator initializeState get latestInstant [{}]", latestInstant); - - Iterator recordIterator = recordsState.get().iterator(); - bufferedRecords.clear(); - recordIterator.forEachRemaining(x -> bufferedRecords.add(x)); - } - } - - @Override - public void snapshotState(StateSnapshotContext functionSnapshotContext) throws Exception { - if (latestInstantList.isEmpty()) { - latestInstantList.add(latestInstant); - } else { - latestInstantList.set(0, latestInstant); - } - latestInstantState.update(latestInstantList); - LOG.info("Update latest instant [{}]", latestInstant); - - recordsState.update(bufferedRecords); - LOG.info("Update records state size = [{}]", bufferedRecords.size()); - bufferedRecords.clear(); - } - - /** - * Create a new instant. - * - * @param checkpointId - */ - private String startNewInstant(long checkpointId) { - String newTime = writeClient.startCommit(); - LOG.info("create instant [{}], at checkpoint [{}]", newTime, checkpointId); - return newTime; - } - - /** - * Check the status of last instant. - */ - private void doCheck() throws InterruptedException { - // query the requested and inflight commit/deltacommit instants - String commitType = cfg.tableType.equals(HoodieTableType.COPY_ON_WRITE.name()) ? HoodieTimeline.COMMIT_ACTION : HoodieTimeline.DELTA_COMMIT_ACTION; - LOG.info("Query latest instant [{}]", latestInstant); - List rollbackPendingCommits = writeClient.getInflightsAndRequestedInstants(commitType); - int tryTimes = 0; - while (tryTimes < retryTimes) { - tryTimes++; - StringBuffer sb = new StringBuffer(); - if (rollbackPendingCommits.contains(latestInstant)) { - rollbackPendingCommits.forEach(x -> sb.append(x).append(",")); - LOG.warn("Latest transaction [{}] is not completed! unCompleted transaction:[{}],try times [{}]", latestInstant, sb.toString(), tryTimes); - TimeUnit.SECONDS.sleep(retryInterval); - rollbackPendingCommits = writeClient.getInflightsAndRequestedInstants(commitType); - } else { - LOG.warn("Latest transaction [{}] is completed! Completed transaction, try times [{}]", latestInstant, tryTimes); - return; - } - } - throw new InterruptedException("Last instant costs more than ten second, stop task now"); - } - - - /** - * Create table if not exists. 
- */ - private void initTable() throws IOException { - if (!fs.exists(new Path(cfg.targetBasePath))) { - HoodieTableMetaClient.initTableType(new Configuration(serializableHadoopConf.get()), cfg.targetBasePath, - HoodieTableType.valueOf(cfg.tableType), cfg.targetTableName, "archived", cfg.payloadClassName, 1); - LOG.info("Table initialized"); - } else { - LOG.info("Table already [{}/{}] exists, do nothing here", cfg.targetBasePath, cfg.targetTableName); - } - } - - @Override - public void close() throws Exception { - if (writeClient != null) { - writeClient.close(); - } - if (fs != null) { - fs.close(); - } - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/operator/KeyedWriteProcessFunction.java b/hudi-flink/src/main/java/org/apache/hudi/operator/KeyedWriteProcessFunction.java deleted file mode 100644 index d3ebddfc30fda..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/operator/KeyedWriteProcessFunction.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.operator; - -import org.apache.hudi.HoodieFlinkStreamer; -import org.apache.hudi.client.FlinkTaskContextSupplier; -import org.apache.hudi.client.HoodieFlinkWriteClient; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.config.SerializableConfiguration; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.exception.HoodieFlinkStreamerException; -import org.apache.hudi.util.StreamerUtil; - -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.FunctionInitializationContext; -import org.apache.flink.runtime.state.FunctionSnapshotContext; -import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; -import org.apache.flink.streaming.api.functions.KeyedProcessFunction; -import org.apache.flink.util.Collector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.LinkedList; -import java.util.List; - -/** - * A {@link KeyedProcessFunction} where the write operations really happens. - */ -public class KeyedWriteProcessFunction extends KeyedProcessFunction, Integer>> implements CheckpointedFunction { - - private static final Logger LOG = LoggerFactory.getLogger(KeyedWriteProcessFunction.class); - /** - * Records buffer, will be processed in snapshotState function. - */ - private List bufferedRecords = new LinkedList<>(); - - /** - * Flink collector help s to send data downstream. - */ - private Collector, Integer>> output; - - /** - * Id of current subtask. 
- */ - private int indexOfThisSubtask; - - /** - * Instant time this batch belongs to. - */ - private String latestInstant; - - /** - * Flag indicate whether this subtask has records in. - */ - private boolean hasRecordsIn; - - /** - * Job conf. - */ - private HoodieFlinkStreamer.Config cfg; - - /** - * Write Client. - */ - private transient HoodieFlinkWriteClient writeClient; - - @Override - public void open(Configuration parameters) throws Exception { - super.open(parameters); - - indexOfThisSubtask = getRuntimeContext().getIndexOfThisSubtask(); - - cfg = (HoodieFlinkStreamer.Config) getRuntimeContext().getExecutionConfig().getGlobalJobParameters(); - - HoodieFlinkEngineContext context = - new HoodieFlinkEngineContext(new SerializableConfiguration(new org.apache.hadoop.conf.Configuration()), new FlinkTaskContextSupplier(getRuntimeContext())); - - writeClient = new HoodieFlinkWriteClient<>(context, StreamerUtil.getHoodieClientConfig(cfg)); - } - - @Override - public void snapshotState(FunctionSnapshotContext context) { - - // get latest requested instant - String commitType = cfg.tableType.equals(HoodieTableType.COPY_ON_WRITE.name()) ? HoodieTimeline.COMMIT_ACTION : HoodieTimeline.DELTA_COMMIT_ACTION; - List latestInstants = writeClient.getInflightsAndRequestedInstants(commitType); - latestInstant = latestInstants.isEmpty() ? null : latestInstants.get(0); - - if (bufferedRecords.size() > 0) { - hasRecordsIn = true; - if (output != null && latestInstant != null) { - String instantTimestamp = latestInstant; - LOG.info("Write records, subtask id = [{}] checkpoint_id = [{}}] instant = [{}], record size = [{}]", indexOfThisSubtask, context.getCheckpointId(), instantTimestamp, bufferedRecords.size()); - - List writeStatus; - switch (cfg.operation) { - case INSERT: - writeStatus = writeClient.insert(bufferedRecords, instantTimestamp); - break; - case UPSERT: - writeStatus = writeClient.upsert(bufferedRecords, instantTimestamp); - break; - default: - throw new HoodieFlinkStreamerException("Unknown operation : " + cfg.operation); - } - output.collect(new Tuple3<>(instantTimestamp, writeStatus, indexOfThisSubtask)); - bufferedRecords.clear(); - } - } else { - LOG.info("No data in subtask [{}]", indexOfThisSubtask); - hasRecordsIn = false; - } - } - - @Override - public void initializeState(FunctionInitializationContext functionInitializationContext) { - // no operation - } - - @Override - public void processElement(HoodieRecord hoodieRecord, Context context, Collector, Integer>> collector) { - if (output == null) { - output = collector; - } - - // buffer the records - bufferedRecords.add(hoodieRecord); - } - - public boolean hasRecordsIn() { - return hasRecordsIn; - } - - public String getLatestInstant() { - return latestInstant; - } - - @Override - public void close() { - if (writeClient != null) { - writeClient.close(); - } - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/operator/KeyedWriteProcessOperator.java b/hudi-flink/src/main/java/org/apache/hudi/operator/KeyedWriteProcessOperator.java deleted file mode 100644 index 1d8e3c5abff4a..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/operator/KeyedWriteProcessOperator.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.operator; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.model.HoodieRecord; - -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.functions.KeyedProcessFunction; -import org.apache.flink.streaming.api.operators.KeyedProcessOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.List; - -/** - * Operator helps to mock empty write results and deliver downstream when no data flow in some subtask. - */ -public class KeyedWriteProcessOperator extends KeyedProcessOperator, Integer>> { - - public static final String NAME = "WriteProcessOperator"; - private static final Logger LOG = LoggerFactory.getLogger(KeyedWriteProcessOperator.class); - private KeyedWriteProcessFunction writeProcessFunction; - - public KeyedWriteProcessOperator(KeyedProcessFunction, Integer>> function) { - super(function); - this.writeProcessFunction = (KeyedWriteProcessFunction) function; - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - // This super.snapshotState(context) triggers `writeProcessFunction.snapshotState()` method. which means the logic - // below will be executed after `writeProcessFunction.snapshotState()` method. - - // If there is no data flows in `writeProcessFunction`, it will never send anything downstream. so, in order to make - // sure each subtask will send a write status downstream, we implement this operator`s snapshotState() to mock empty - // write status and send it downstream when there is no data flows in some subtasks. - super.snapshotState(context); - - // make up an empty result and send downstream - if (!writeProcessFunction.hasRecordsIn() && writeProcessFunction.getLatestInstant() != null) { - String instantTime = writeProcessFunction.getLatestInstant(); - LOG.info("Mock empty writeStatus, subtaskId = [{}], instant = [{}]", getRuntimeContext().getIndexOfThisSubtask(), instantTime); - output.collect(new StreamRecord<>(new Tuple3(instantTime, new ArrayList(), getRuntimeContext().getIndexOfThisSubtask()))); - } - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaProvider.java b/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaProvider.java deleted file mode 100644 index 74b4067822a80..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/schema/SchemaProvider.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.schema; - -import org.apache.hudi.common.config.TypedProperties; - -import org.apache.avro.Schema; - -import java.io.Serializable; - -/** - * Class to provide schema for reading data and also writing into a Hoodie table. - */ -public abstract class SchemaProvider implements Serializable { - - protected TypedProperties config; - - protected SchemaProvider(TypedProperties props) { - this.config = props; - } - - public abstract Schema getSourceSchema(); - - public Schema getTargetSchema() { - // by default, use source schema as target for hoodie table as well - return getSourceSchema(); - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/CommitSink.java b/hudi-flink/src/main/java/org/apache/hudi/sink/CommitSink.java deleted file mode 100644 index 4ca793076fa3b..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/CommitSink.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.sink; - -import org.apache.hudi.client.FlinkTaskContextSupplier; -import org.apache.hudi.client.HoodieFlinkWriteClient; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.common.HoodieFlinkEngineContext; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieFlinkStreamerException; -import org.apache.hudi.HoodieFlinkStreamer; -import org.apache.hudi.util.StreamerUtil; - -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -/** - * Function helps to execute commit operation. this operation should be executed only once. - */ -public class CommitSink extends RichSinkFunction, Integer>> { - - private static final Logger LOG = LoggerFactory.getLogger(CommitSink.class); - /** - * Job conf. 
- */ - private HoodieFlinkStreamer.Config cfg; - - /** - * Write client. - */ - private transient HoodieFlinkWriteClient writeClient; - - /** - * Write result buffer. - */ - private Map>> bufferedWriteStatus = new HashMap<>(); - - /** - * Parallelism of this job. - */ - private Integer writeParallelSize = 0; - - @Override - public void open(Configuration parameters) throws Exception { - super.open(parameters); - // Get configs from runtimeContext - cfg = (HoodieFlinkStreamer.Config) getRuntimeContext().getExecutionConfig().getGlobalJobParameters(); - - writeParallelSize = getRuntimeContext().getExecutionConfig().getParallelism(); - - // writeClient - writeClient = new HoodieFlinkWriteClient<>(new HoodieFlinkEngineContext(new FlinkTaskContextSupplier(null)), StreamerUtil.getHoodieClientConfig(cfg)); - } - - @Override - public void invoke(Tuple3, Integer> writeStatues, Context context) { - LOG.info("Receive records, instantTime = [{}], subtaskId = [{}], WriteStatus size = [{}]", writeStatues.f0, writeStatues.f2, writeStatues.f1.size()); - try { - if (bufferedWriteStatus.containsKey(writeStatues.f0)) { - bufferedWriteStatus.get(writeStatues.f0).add(writeStatues.f1); - } else { - List> oneBatchData = new ArrayList<>(writeParallelSize); - oneBatchData.add(writeStatues.f1); - bufferedWriteStatus.put(writeStatues.f0, oneBatchData); - } - // check and commit - checkAndCommit(writeStatues.f0); - } catch (Exception e) { - throw new HoodieFlinkStreamerException("Invoke sink error", e); - } - } - - /** - * Check and commit if all subtask completed. - * - * @throws Exception - */ - private void checkAndCommit(String instantTime) throws Exception { - if (bufferedWriteStatus.get(instantTime).size() == writeParallelSize) { - LOG.info("Instant [{}] process complete, start commit!", instantTime); - doCommit(instantTime); - bufferedWriteStatus.clear(); - LOG.info("Instant [{}] commit completed!", instantTime); - } else { - LOG.info("Instant [{}], can not commit yet, subtask completed : [{}/{}]", instantTime, bufferedWriteStatus.get(instantTime).size(), writeParallelSize); - } - } - - private void doCommit(String instantTime) { - // get the records to commit - List writeResults = bufferedWriteStatus.get(instantTime).stream().flatMap(Collection::stream).collect(Collectors.toList()); - - // commit and rollback - long totalErrorRecords = writeResults.stream().map(WriteStatus::getTotalErrorRecords).reduce(Long::sum).orElse(0L); - long totalRecords = writeResults.stream().map(WriteStatus::getTotalRecords).reduce(Long::sum).orElse(0L); - boolean hasErrors = totalErrorRecords > 0; - - if (!hasErrors || cfg.commitOnErrors) { - HashMap checkpointCommitMetadata = new HashMap<>(); - if (hasErrors) { - LOG.warn("Some records failed to be merged but forcing commit since commitOnErrors set. Errors/Total=" - + totalErrorRecords + "/" + totalRecords); - } - - boolean success = writeClient.commit(instantTime, writeResults, Option.of(checkpointCommitMetadata)); - if (success) { - LOG.warn("Commit " + instantTime + " successful!"); - } else { - LOG.warn("Commit " + instantTime + " failed!"); - throw new HoodieException("Commit " + instantTime + " failed!"); - } - } else { - LOG.error("Streamer sync found errors when writing. 
Errors/Total=" + totalErrorRecords + "/" + totalRecords); - LOG.error("Printing out the top 100 errors"); - writeResults.stream().filter(WriteStatus::hasErrors).limit(100).forEach(ws -> { - LOG.error("Global error :", ws.getGlobalError()); - if (ws.getErrors().size() > 0) { - ws.getErrors().forEach((key, value) -> LOG.trace("Error for key:" + key + " is " + value)); - } - }); - // Rolling back instant - writeClient.rollback(instantTime); - throw new HoodieException("Commit " + instantTime + " failed and rolled-back !"); - } - } -} \ No newline at end of file diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/JsonStringToHoodieRecordMapFunction.java b/hudi-flink/src/main/java/org/apache/hudi/source/JsonStringToHoodieRecordMapFunction.java deleted file mode 100644 index a01a67dad70dd..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/source/JsonStringToHoodieRecordMapFunction.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.source; - -import org.apache.hudi.HoodieFlinkStreamer; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.exception.HoodieFlinkStreamerException; -import org.apache.hudi.keygen.KeyGenerator; -import org.apache.hudi.keygen.SimpleAvroKeyGenerator; -import org.apache.hudi.schema.FilebasedSchemaProvider; -import org.apache.hudi.util.AvroConvertor; -import org.apache.hudi.util.StreamerUtil; - -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.api.common.functions.MapFunction; - -import java.io.IOException; - -/** - * Function helps to transfer json string to {@link HoodieRecord}. 
- */ -public class JsonStringToHoodieRecordMapFunction implements MapFunction { - - private final HoodieFlinkStreamer.Config cfg; - private KeyGenerator keyGenerator; - private AvroConvertor avroConvertor; - - public JsonStringToHoodieRecordMapFunction(HoodieFlinkStreamer.Config cfg) { - this.cfg = cfg; - init(); - } - - @Override - public HoodieRecord map(String value) throws Exception { - GenericRecord gr = avroConvertor.fromJson(value); - HoodieRecordPayload payload = StreamerUtil.createPayload(cfg.payloadClassName, gr, - (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false)); - - return new HoodieRecord<>(keyGenerator.getKey(gr), payload); - } - - private void init() { - TypedProperties props = StreamerUtil.getProps(cfg); - avroConvertor = new AvroConvertor(new FilebasedSchemaProvider(props).getSourceSchema()); - try { - keyGenerator = StreamerUtil.createKeyGenerator(props); - } catch (IOException e) { - throw new HoodieFlinkStreamerException(String.format("KeyGenerator %s initialization failed", - props.getString("hoodie.datasource.write.keygenerator.class", SimpleAvroKeyGenerator.class.getName())), e); - } - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/AvroConvertor.java b/hudi-flink/src/main/java/org/apache/hudi/util/AvroConvertor.java deleted file mode 100644 index 40ce9f19ac363..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/util/AvroConvertor.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.util; - -import org.apache.hudi.avro.MercifulJsonConverter; - -import com.twitter.bijection.Injection; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; - -import java.io.Serializable; - -/** - * Convert a variety of datum into Avro GenericRecords. Has a bunch of lazy fields to circumvent issues around - * serializing these objects from driver to executors - */ -public class AvroConvertor implements Serializable { - - private static final long serialVersionUID = 1L; - /** - * To be lazily inited on executors. - */ - private transient Schema schema; - - private final String schemaStr; - - /** - * To be lazily inited on executors. - */ - private transient MercifulJsonConverter jsonConverter; - - - /** - * To be lazily inited on executors. 
- */ - private transient Injection recordInjection; - - public AvroConvertor(String schemaStr) { - this.schemaStr = schemaStr; - } - - public AvroConvertor(Schema schema) { - this.schemaStr = schema.toString(); - this.schema = schema; - } - - private void initSchema() { - if (schema == null) { - Schema.Parser parser = new Schema.Parser(); - schema = parser.parse(schemaStr); - } - } - - private void initJsonConvertor() { - if (jsonConverter == null) { - jsonConverter = new MercifulJsonConverter(); - } - } - - public GenericRecord fromJson(String json) { - initSchema(); - initJsonConvertor(); - return jsonConverter.convert(json, schema); - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java deleted file mode 100644 index f9dacae1e9449..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.util; - -import org.apache.hudi.HoodieFlinkStreamer; -import org.apache.hudi.common.config.DFSPropertiesConfiguration; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.client.common.EngineType; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.keygen.KeyGenerator; -import org.apache.hudi.keygen.SimpleAvroKeyGenerator; -import org.apache.hudi.schema.FilebasedSchemaProvider; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.StringReader; -import java.util.List; -import java.util.Properties; - -public class StreamerUtil { - - private static Logger LOG = LoggerFactory.getLogger(StreamerUtil.class); - - public static Properties getKafkaProps(HoodieFlinkStreamer.Config cfg) { - Properties result = new Properties(); - result.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, cfg.kafkaBootstrapServers); - result.put(ConsumerConfig.GROUP_ID_CONFIG, cfg.kafkaGroupId); - return result; - } - - public static TypedProperties getProps(HoodieFlinkStreamer.Config cfg) { - return readConfig( - FSUtils.getFs(cfg.propsFilePath, getHadoopConf()), - new 
Path(cfg.propsFilePath), cfg.configs).getConfig(); - } - - - /** - * Read conig from files. - */ - public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List overriddenProps) { - DFSPropertiesConfiguration conf; - try { - conf = new DFSPropertiesConfiguration(cfgPath.getFileSystem(fs.getConf()), cfgPath); - } catch (Exception e) { - conf = new DFSPropertiesConfiguration(); - LOG.warn("Unexpected error read props file at :" + cfgPath, e); - } - - try { - if (!overriddenProps.isEmpty()) { - LOG.info("Adding overridden properties to file properties."); - conf.addProperties(new BufferedReader(new StringReader(String.join("\n", overriddenProps)))); - } - } catch (IOException ioe) { - throw new HoodieIOException("Unexpected error adding config overrides", ioe); - } - - return conf; - } - - public static Configuration getHadoopConf() { - return new Configuration(); - } - - public static void checkRequiredProperties(TypedProperties props, List checkPropNames) { - checkPropNames.forEach(prop -> { - if (!props.containsKey(prop)) { - throw new HoodieNotSupportedException("Required property " + prop + " is missing"); - } - }); - } - - /** - * Create a key generator class via reflection, passing in any configs needed. - *

    - * If the class name of key generator is configured through the properties file, i.e., {@code props}, use the corresponding key generator class; otherwise, use the default key generator class - * specified in {@code DataSourceWriteOptions}. - */ - public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { - String keyGeneratorClass = props.getString("hoodie.datasource.write.keygenerator.class", - SimpleAvroKeyGenerator.class.getName()); - try { - return (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props); - } catch (Throwable e) { - throw new IOException("Could not load key generator class " + keyGeneratorClass, e); - } - } - - /** - * Create a payload class via reflection, passing in an ordering/precombine value. - */ - public static HoodieRecordPayload createPayload(String payloadClass, GenericRecord record, Comparable orderingVal) - throws IOException { - try { - return (HoodieRecordPayload) ReflectionUtils.loadClass(payloadClass, - new Class[] {GenericRecord.class, Comparable.class}, record, orderingVal); - } catch (Throwable e) { - throw new IOException("Could not create payload for class: " + payloadClass, e); - } - } - - public static HoodieWriteConfig getHoodieClientConfig(HoodieFlinkStreamer.Config cfg) { - FileSystem fs = FSUtils.getFs(cfg.targetBasePath, getHadoopConf()); - HoodieWriteConfig.Builder builder = - HoodieWriteConfig.newBuilder().withEngineType(EngineType.FLINK).withPath(cfg.targetBasePath).combineInput(cfg.filterDupes, true) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build()) - .forTable(cfg.targetTableName) - .withAutoCommit(false) - .withProps(readConfig(fs, new Path(cfg.propsFilePath), cfg.configs) - .getConfig()); - - builder = builder.withSchema(new FilebasedSchemaProvider(getProps(cfg)).getTargetSchema().toString()); - HoodieWriteConfig config = builder.build(); - return config; - } - -} diff --git a/hudi-gcp/pom.xml b/hudi-gcp/pom.xml new file mode 100644 index 0000000000000..14971d1e9891c --- /dev/null +++ b/hudi-gcp/pom.xml @@ -0,0 +1,160 @@ + + + + + hudi + org.apache.hudi + 0.12.2-dt-SNAPSHOT + ../pom.xml + + + 4.0.0 + + hudi-gcp + jar + + + + + + com.google.cloud + libraries-bom + 25.1.0 + pom + import + + + + + + + + org.apache.logging.log4j + log4j-1.2-api + + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-sync-common + ${project.version} + + + + com.google.cloud + google-cloud-bigquery + + + + org.apache.parquet + parquet-avro + + + + + org.apache.hadoop + hadoop-common + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.junit.jupiter + junit-jupiter-api + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.junit.vintage + junit-vintage-engine + test + + + + org.junit.jupiter + junit-jupiter-params + test + + + + org.mockito + mockito-junit-jupiter + test + + + + org.junit.platform + junit-platform-runner + test + + + + org.junit.platform + junit-platform-suite-api + test + + + + + + + src/main/resources + + + + + org.apache.rat + apache-rat-plugin + + + org.apache.maven.plugins + maven-jar-plugin + ${maven-jar-plugin.version} + + + + test-jar + + + + + + org.jacoco + jacoco-maven-plugin + + + + diff --git a/hudi-sync/hudi-dla-sync/src/assembly/src.xml b/hudi-gcp/src/assembly/src.xml similarity index 100% rename from hudi-sync/hudi-dla-sync/src/assembly/src.xml rename to hudi-gcp/src/assembly/src.xml diff --git 
a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java new file mode 100644 index 0000000000000..b46cd9a9f81f7 --- /dev/null +++ b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.gcp.bigquery; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.sync.common.HoodieSyncConfig; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.ParametersDelegate; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +/** + * Configs needed to sync data into BigQuery. + */ +public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable { + + public static final ConfigProperty BIGQUERY_SYNC_PROJECT_ID = ConfigProperty + .key("hoodie.gcp.bigquery.sync.project_id") + .noDefaultValue() + .withDocumentation("Name of the target project in BigQuery"); + + public static final ConfigProperty BIGQUERY_SYNC_DATASET_NAME = ConfigProperty + .key("hoodie.gcp.bigquery.sync.dataset_name") + .noDefaultValue() + .withDocumentation("Name of the target dataset in BigQuery"); + + public static final ConfigProperty BIGQUERY_SYNC_DATASET_LOCATION = ConfigProperty + .key("hoodie.gcp.bigquery.sync.dataset_location") + .noDefaultValue() + .withDocumentation("Location of the target dataset in BigQuery"); + + public static final ConfigProperty BIGQUERY_SYNC_TABLE_NAME = ConfigProperty + .key("hoodie.gcp.bigquery.sync.table_name") + .noDefaultValue() + .withDocumentation("Name of the target table in BigQuery"); + + public static final ConfigProperty BIGQUERY_SYNC_SOURCE_URI = ConfigProperty + .key("hoodie.gcp.bigquery.sync.source_uri") + .noDefaultValue() + .withDocumentation("Name of the source uri gcs path of the table"); + + public static final ConfigProperty BIGQUERY_SYNC_SOURCE_URI_PREFIX = ConfigProperty + .key("hoodie.gcp.bigquery.sync.source_uri_prefix") + .noDefaultValue() + .withDocumentation("Name of the source uri gcs path prefix of the table"); + + public static final ConfigProperty BIGQUERY_SYNC_SYNC_BASE_PATH = ConfigProperty + .key("hoodie.gcp.bigquery.sync.base_path") + .noDefaultValue() + .withDocumentation("Base path of the hoodie table to sync"); + + public static final ConfigProperty BIGQUERY_SYNC_PARTITION_FIELDS = ConfigProperty + .key("hoodie.gcp.bigquery.sync.partition_fields") + .noDefaultValue() + .withDocumentation("Comma-delimited partition fields. 
Default to non-partitioned."); + + public static final ConfigProperty BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty + .key("hoodie.gcp.bigquery.sync.use_file_listing_from_metadata") + .defaultValue(false) + .withDocumentation("Fetch file listing from Hudi's metadata"); + + public static final ConfigProperty BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = ConfigProperty + .key("hoodie.gcp.bigquery.sync.assume_date_partitioning") + .defaultValue(false) + .withDocumentation("Assume standard yyyy/mm/dd partitioning, this" + + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter"); + + public BigQuerySyncConfig(Properties props) { + super(props); + } + + public static class BigQuerySyncConfigParams { + + @ParametersDelegate() + public final HoodieSyncConfigParams hoodieSyncConfigParams = new HoodieSyncConfigParams(); + + @Parameter(names = {"--project-id"}, description = "Name of the target project in BigQuery", required = true) + public String projectId; + @Parameter(names = {"--dataset-name"}, description = "Name of the target dataset in BigQuery", required = true) + public String datasetName; + @Parameter(names = {"--dataset-location"}, description = "Location of the target dataset in BigQuery", required = true) + public String datasetLocation; + @Parameter(names = {"--table-name"}, description = "Name of the target table in BigQuery", required = true) + public String tableName; + @Parameter(names = {"--source-uri"}, description = "Name of the source uri gcs path of the table", required = true) + public String sourceUri; + @Parameter(names = {"--source-uri-prefix"}, description = "Name of the source uri gcs path prefix of the table", required = true) + public String sourceUriPrefix; + @Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true) + public String basePath; + @Parameter(names = {"--partitioned-by"}, description = "Comma-delimited partition fields. Default to non-partitioned.") + public List partitionFields = new ArrayList<>(); + @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata") + public boolean useFileListingFromMetadata = false; + @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this" + + " exists to support backward compatibility. 
If you use hoodie 0.3.x, do not set this parameter") + public boolean assumeDatePartitioning = false; + + public boolean isHelp() { + return hoodieSyncConfigParams.isHelp(); + } + + public Properties toProps() { + final Properties props = hoodieSyncConfigParams.toProps(); + props.setProperty(BIGQUERY_SYNC_PROJECT_ID.key(), projectId); + props.setProperty(BIGQUERY_SYNC_DATASET_NAME.key(), datasetName); + props.setProperty(BIGQUERY_SYNC_DATASET_LOCATION.key(), datasetLocation); + props.setProperty(BIGQUERY_SYNC_TABLE_NAME.key(), tableName); + props.setProperty(BIGQUERY_SYNC_SOURCE_URI.key(), sourceUri); + props.setProperty(BIGQUERY_SYNC_SOURCE_URI_PREFIX.key(), sourceUriPrefix); + props.setProperty(BIGQUERY_SYNC_SYNC_BASE_PATH.key(), basePath); + props.setProperty(BIGQUERY_SYNC_PARTITION_FIELDS.key(), String.join(",", partitionFields)); + props.setProperty(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), String.valueOf(useFileListingFromMetadata)); + props.setProperty(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING.key(), String.valueOf(assumeDatePartitioning)); + return props; + } + } +} diff --git a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncTool.java b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncTool.java new file mode 100644 index 0000000000000..4716d6e494997 --- /dev/null +++ b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncTool.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.gcp.bigquery; + +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.sync.common.HoodieSyncTool; +import org.apache.hudi.sync.common.util.ManifestFileWriter; + +import com.beust.jcommander.JCommander; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Properties; + +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PARTITION_FIELDS; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI_PREFIX; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SYNC_BASE_PATH; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA; + +/** + * Tool to sync a hoodie table with a big query table. 
Either use it as an api + * BigQuerySyncTool.syncHoodieTable(BigQuerySyncConfig) or as a command line java -cp hoodie-hive.jar BigQuerySyncTool [args] + *

    + * This utility will get the schema from the latest commit and will sync big query table schema. + * + * @Experimental + */ +public class BigQuerySyncTool extends HoodieSyncTool { + + private static final Logger LOG = LogManager.getLogger(BigQuerySyncTool.class); + + public final BigQuerySyncConfig config; + public final String tableName; + public final String manifestTableName; + public final String versionsTableName; + public final String snapshotViewName; + + public BigQuerySyncTool(Properties props) { + super(props); + this.config = new BigQuerySyncConfig(props); + this.tableName = config.getString(BIGQUERY_SYNC_TABLE_NAME); + this.manifestTableName = tableName + "_manifest"; + this.versionsTableName = tableName + "_versions"; + this.snapshotViewName = tableName; + } + + @Override + public void syncHoodieTable() { + try (HoodieBigQuerySyncClient bqSyncClient = new HoodieBigQuerySyncClient(config)) { + switch (bqSyncClient.getTableType()) { + case COPY_ON_WRITE: + syncCoWTable(bqSyncClient); + break; + case MERGE_ON_READ: + default: + throw new UnsupportedOperationException(bqSyncClient.getTableType() + " table type is not supported yet."); + } + } catch (Exception e) { + throw new HoodieBigQuerySyncException("Failed to sync BigQuery for table:" + tableName, e); + } + } + + private void syncCoWTable(HoodieBigQuerySyncClient bqSyncClient) { + ValidationUtils.checkState(bqSyncClient.getTableType() == HoodieTableType.COPY_ON_WRITE); + LOG.info("Sync hoodie table " + snapshotViewName + " at base path " + bqSyncClient.getBasePath()); + + if (!bqSyncClient.datasetExists()) { + throw new HoodieBigQuerySyncException("Dataset not found: " + config.getString(BIGQUERY_SYNC_DATASET_NAME)); + } + + ManifestFileWriter manifestFileWriter = ManifestFileWriter.builder() + .setConf(config.getHadoopConf()) + .setBasePath(config.getString(BIGQUERY_SYNC_SYNC_BASE_PATH)) + .setUseFileListingFromMetadata(config.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA)) + .setAssumeDatePartitioning(config.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING)) + .build(); + manifestFileWriter.writeManifestFile(); + + if (!bqSyncClient.tableExists(manifestTableName)) { + bqSyncClient.createManifestTable(manifestTableName, manifestFileWriter.getManifestSourceUri()); + LOG.info("Manifest table creation complete for " + manifestTableName); + } + if (!bqSyncClient.tableExists(versionsTableName)) { + bqSyncClient.createVersionsTable( + versionsTableName, + config.getString(BIGQUERY_SYNC_SOURCE_URI), + config.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX), + config.getSplitStrings(BIGQUERY_SYNC_PARTITION_FIELDS)); + LOG.info("Versions table creation complete for " + versionsTableName); + } + if (!bqSyncClient.tableExists(snapshotViewName)) { + bqSyncClient.createSnapshotView(snapshotViewName, versionsTableName, manifestTableName); + LOG.info("Snapshot view creation complete for " + snapshotViewName); + } + + // TODO: Implement automatic schema evolution when you add a new column. 
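+    // NOTE (descriptive comment, grounded in the guards above): the manifest table, versions table
+    // and snapshot view are only created when they do not already exist; existing BigQuery
+    // definitions are left untouched, so newly added columns are not propagated until the schema
+    // evolution support referenced in the TODO above is implemented.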
+ LOG.info("Sync table complete for " + snapshotViewName); + } + + public static void main(String[] args) { + final BigQuerySyncConfig.BigQuerySyncConfigParams params = new BigQuerySyncConfig.BigQuerySyncConfigParams(); + JCommander cmd = JCommander.newBuilder().addObject(params).build(); + cmd.parse(args); + if (params.isHelp()) { + cmd.usage(); + System.exit(0); + } + new BigQuerySyncTool(params.toProps()).syncHoodieTable(); + } +} diff --git a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncClient.java b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncClient.java new file mode 100644 index 0000000000000..0dca74f6d1760 --- /dev/null +++ b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncClient.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.gcp.bigquery; + +import org.apache.hudi.sync.common.HoodieSyncClient; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryException; +import com.google.cloud.bigquery.BigQueryOptions; +import com.google.cloud.bigquery.CsvOptions; +import com.google.cloud.bigquery.Dataset; +import com.google.cloud.bigquery.DatasetId; +import com.google.cloud.bigquery.ExternalTableDefinition; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.FormatOptions; +import com.google.cloud.bigquery.HivePartitioningOptions; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.StandardSQLTypeName; +import com.google.cloud.bigquery.Table; +import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TableInfo; +import com.google.cloud.bigquery.ViewDefinition; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID; + +public class HoodieBigQuerySyncClient extends HoodieSyncClient { + + private static final Logger LOG = LogManager.getLogger(HoodieBigQuerySyncClient.class); + + protected final BigQuerySyncConfig config; + private final String projectId; + private final String datasetName; + private transient BigQuery bigquery; + + public HoodieBigQuerySyncClient(final BigQuerySyncConfig config) { + super(config); + this.config = config; + this.projectId = config.getString(BIGQUERY_SYNC_PROJECT_ID); + this.datasetName = config.getString(BIGQUERY_SYNC_DATASET_NAME); + this.createBigQueryConnection(); + } + + private void createBigQueryConnection() { + if 
(bigquery == null) { + try { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. + bigquery = BigQueryOptions.newBuilder().setLocation(config.getString(BIGQUERY_SYNC_DATASET_LOCATION)).build().getService(); + LOG.info("Successfully established BigQuery connection."); + } catch (BigQueryException e) { + throw new HoodieBigQuerySyncException("Cannot create bigQuery connection ", e); + } + } + } + + public void createManifestTable(String tableName, String sourceUri) { + try { + TableId tableId = TableId.of(projectId, datasetName, tableName); + CsvOptions csvOptions = CsvOptions.newBuilder() + .setFieldDelimiter(",") + .setAllowJaggedRows(false) + .setAllowQuotedNewLines(false) + .setSkipLeadingRows(0) + .build(); + Schema schema = Schema.of( + Field.of("filename", StandardSQLTypeName.STRING)); + + ExternalTableDefinition customTable = + ExternalTableDefinition.newBuilder(sourceUri, schema, csvOptions) + .setAutodetect(false) + .setIgnoreUnknownValues(false) + .setMaxBadRecords(0) + .build(); + bigquery.create(TableInfo.of(tableId, customTable)); + LOG.info("Manifest External table created."); + } catch (BigQueryException e) { + throw new HoodieBigQuerySyncException("Manifest External table was not created ", e); + } + } + + public void createVersionsTable(String tableName, String sourceUri, String sourceUriPrefix, List partitionFields) { + try { + ExternalTableDefinition customTable; + TableId tableId = TableId.of(projectId, datasetName, tableName); + + if (partitionFields.isEmpty()) { + customTable = + ExternalTableDefinition.newBuilder(sourceUri, FormatOptions.parquet()) + .setAutodetect(true) + .setIgnoreUnknownValues(true) + .setMaxBadRecords(0) + .build(); + } else { + // Configuring partitioning options for partitioned table. + HivePartitioningOptions hivePartitioningOptions = + HivePartitioningOptions.newBuilder() + .setMode("AUTO") + .setRequirePartitionFilter(false) + .setSourceUriPrefix(sourceUriPrefix) + .build(); + customTable = + ExternalTableDefinition.newBuilder(sourceUri, FormatOptions.parquet()) + .setAutodetect(true) + .setHivePartitioningOptions(hivePartitioningOptions) + .setIgnoreUnknownValues(true) + .setMaxBadRecords(0) + .build(); + } + + bigquery.create(TableInfo.of(tableId, customTable)); + LOG.info("External table created using hivepartitioningoptions"); + } catch (BigQueryException e) { + throw new HoodieBigQuerySyncException("External table was not created ", e); + } + } + + public void createSnapshotView(String viewName, String versionsTableName, String manifestTableName) { + try { + TableId tableId = TableId.of(projectId, datasetName, viewName); + String query = + String.format( + "SELECT * FROM `%s.%s.%s` WHERE _hoodie_file_name IN " + + "(SELECT filename FROM `%s.%s.%s`)", + projectId, + datasetName, + versionsTableName, + projectId, + datasetName, + manifestTableName); + + ViewDefinition viewDefinition = + ViewDefinition.newBuilder(query).setUseLegacySql(false).build(); + + bigquery.create(TableInfo.of(tableId, viewDefinition)); + LOG.info("View created successfully"); + } catch (BigQueryException e) { + throw new HoodieBigQuerySyncException("View was not created ", e); + } + } + + @Override + public Map getMetastoreSchema(String tableName) { + // TODO: Implement automatic schema evolution when you add a new column. 
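+    // Returning an empty schema here means the sync tool currently never detects drift between
+    // the Hudi table schema and the BigQuery table definition (see TODO above).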
+ return Collections.emptyMap(); + } + + public boolean datasetExists() { + Dataset dataset = bigquery.getDataset(DatasetId.of(projectId, datasetName)); + return dataset != null; + } + + @Override + public boolean tableExists(String tableName) { + TableId tableId = TableId.of(projectId, datasetName, tableName); + Table table = bigquery.getTable(tableId, BigQuery.TableOption.fields()); + return table != null && table.exists(); + } + + @Override + public void close() { + bigquery = null; + } +} diff --git a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncException.java b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncException.java new file mode 100644 index 0000000000000..4d30b2faa1d9b --- /dev/null +++ b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncException.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.gcp.bigquery; + +public class HoodieBigQuerySyncException extends RuntimeException { + + public HoodieBigQuerySyncException() { + super(); + } + + public HoodieBigQuerySyncException(String message) { + super(message); + } + + public HoodieBigQuerySyncException(String message, Throwable t) { + super(message, t); + } + + public HoodieBigQuerySyncException(Throwable t) { + super(t); + } + + protected static String format(String message, Object... args) { + return String.format(String.valueOf(message), args); + } +} diff --git a/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java new file mode 100644 index 0000000000000..82a8527738477 --- /dev/null +++ b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.gcp.bigquery; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Properties; + +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PARTITION_FIELDS; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PROJECT_ID; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI_PREFIX; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SYNC_BASE_PATH; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME; +import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestBigQuerySyncConfig { + + BigQuerySyncConfig syncConfig; + + @BeforeEach + void setUp() { + Properties props = new Properties(); + props.setProperty(BIGQUERY_SYNC_PROJECT_ID.key(), "fooproject"); + props.setProperty(BIGQUERY_SYNC_DATASET_NAME.key(), "foodataset"); + props.setProperty(BIGQUERY_SYNC_DATASET_LOCATION.key(), "US"); + props.setProperty(BIGQUERY_SYNC_TABLE_NAME.key(), "footable"); + props.setProperty(BIGQUERY_SYNC_SOURCE_URI.key(), "gs://test-bucket/dwh/table_name/dt=*"); + props.setProperty(BIGQUERY_SYNC_SOURCE_URI_PREFIX.key(), "gs://test-bucket/dwh/table_name/"); + props.setProperty(BIGQUERY_SYNC_SYNC_BASE_PATH.key(), "gs://test-bucket/dwh/table_name"); + props.setProperty(BIGQUERY_SYNC_PARTITION_FIELDS.key(), "a,b"); + props.setProperty(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), "true"); + props.setProperty(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING.key(), "true"); + syncConfig = new BigQuerySyncConfig(props); + } + + @Test + public void testGetConfigs() { + assertEquals("fooproject", syncConfig.getString(BIGQUERY_SYNC_PROJECT_ID)); + assertEquals("foodataset", syncConfig.getString(BIGQUERY_SYNC_DATASET_NAME)); + assertEquals("US", syncConfig.getString(BIGQUERY_SYNC_DATASET_LOCATION)); + assertEquals("footable", syncConfig.getString(BIGQUERY_SYNC_TABLE_NAME)); + assertEquals("gs://test-bucket/dwh/table_name/dt=*", syncConfig.getString(BIGQUERY_SYNC_SOURCE_URI)); + assertEquals("gs://test-bucket/dwh/table_name/", syncConfig.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX)); + assertEquals("gs://test-bucket/dwh/table_name", syncConfig.getString(BIGQUERY_SYNC_SYNC_BASE_PATH)); + assertEquals(Arrays.asList("a", "b"), syncConfig.getSplitStrings(BIGQUERY_SYNC_PARTITION_FIELDS)); + assertEquals(true, syncConfig.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA)); + assertEquals(true, syncConfig.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING)); + } + +} diff --git a/hudi-hadoop-mr/pom.xml b/hudi-hadoop-mr/pom.xml index ce2c03fd948e6..9e735a4e528b8 100644 --- a/hudi-hadoop-mr/pom.xml +++ b/hudi-hadoop-mr/pom.xml @@ -19,7 +19,7 @@ hudi org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 @@ -30,6 +30,12 @@ + + + org.apache.logging.log4j + log4j-1.2-api + + org.apache.hudi @@ -75,6 +81,12 @@ + + org.apache.hudi + hudi-tests-common + ${project.version} + test + org.apache.hudi 
hudi-common diff --git a/hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/hive/HoodieCombineHiveInputFormat.java b/hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/hive/HoodieCombineHiveInputFormat.java deleted file mode 100644 index 02afd19fe86f4..0000000000000 --- a/hudi-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/hive/HoodieCombineHiveInputFormat.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.uber.hoodie.hadoop.hive; - -import com.uber.hoodie.hadoop.HoodieInputFormat; -import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; - -public class HoodieCombineHiveInputFormat - extends org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat { - - @Override - protected String getParquetInputFormatClassName() { - return HoodieInputFormat.class.getName(); - } - - @Override - protected String getParquetRealtimeInputFormatClassName() { - return HoodieRealtimeInputFormat.class.getName(); - } - - @Override - protected org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim createInputFormatShim() { - return new HoodieCombineFileInputFormatShim<>(); - } - - public static class HoodieCombineFileInputFormatShim - extends org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim { - - @Override - protected HoodieParquetInputFormat createParquetInputFormat() { - return new HoodieInputFormat(); - } - - @Override - protected HoodieParquetRealtimeInputFormat createParquetRealtimeInputFormat() { - return new HoodieRealtimeInputFormat(); - } - } -} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java index 437304fb043d0..6db1751771904 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java @@ -32,9 +32,11 @@ public class BootstrapBaseFileSplit extends FileSplit { private FileSplit bootstrapFileSplit; - public BootstrapBaseFileSplit() { - super(); - } + /** + * NOTE: This ctor is necessary for Hive to be able to serialize and + * then instantiate it when deserializing back + */ + public BootstrapBaseFileSplit() {} public BootstrapBaseFileSplit(FileSplit baseSplit, FileSplit bootstrapFileSplit) throws IOException { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieTableFileIndex.java 
b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieTableFileIndex.java new file mode 100644 index 0000000000000..000fce5e8fbff --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieTableFileIndex.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.BaseHoodieTableFileIndex; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieTableQueryType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; + +/** + * Implementation of {@link BaseHoodieTableFileIndex} for Hive-based query engines + */ +public class HiveHoodieTableFileIndex extends BaseHoodieTableFileIndex { + + public static final Logger LOG = LoggerFactory.getLogger(HiveHoodieTableFileIndex.class); + + public HiveHoodieTableFileIndex(HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, + TypedProperties configProperties, + HoodieTableQueryType queryType, + List queryPaths, + Option specifiedQueryInstant, + boolean shouldIncludePendingCommits + ) { + super(engineContext, + metaClient, + configProperties, + queryType, + queryPaths, + specifiedQueryInstant, + shouldIncludePendingCommits, + true, + new NoopCache()); + } + + @Override + public Object[] parsePartitionColumnValues(String[] partitionColumns, String partitionPath) { + // NOTE: Parsing partition path into partition column values isn't required on Hive, + // since Hive does partition pruning in a different way (based on the input-path being + // fetched by the query engine) + return new Object[0]; + } + + static class NoopCache implements FileStatusCache { + @Override + public Option get(Path path) { + return Option.empty(); + } + + @Override + public void put(Path path, FileStatus[] leafFiles) { + // no-op + } + + @Override + public void invalidate() { + // no-op + } + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java index b7141a8ee762f..9ca99c41888b1 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieColumnProjectionUtils.java @@ -19,7 +19,6 @@ package org.apache.hudi.hadoop; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; -import org.apache.hudi.common.util.CollectionUtils; import 
org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; @@ -32,14 +31,12 @@ import java.util.ArrayList; import java.util.Arrays; -import java.util.HashSet; import java.util.List; -import java.util.Set; import java.util.stream.Collectors; import java.util.stream.IntStream; /** - * Utility funcitons copied from Hive ColumnProjectionUtils.java. + * Utility functions copied from Hive ColumnProjectionUtils.java. * Needed to copy as we see NoSuchMethod errors when directly using these APIs with/without Spark. * Some of these methods are not available across hive versions. */ @@ -53,127 +50,9 @@ public class HoodieColumnProjectionUtils { * c:struct_of (a:string,b:string). * the column a's path is c.a and b's path is c.b */ - public static final String READ_NESTED_COLUMN_PATH_CONF_STR = - "hive.io.file.readNestedColumn.paths"; - public static final String READ_ALL_COLUMNS = "hive.io.file.read.all.columns"; public static final String READ_COLUMN_NAMES_CONF_STR = "hive.io.file.readcolumn.names"; private static final String READ_COLUMN_IDS_CONF_STR_DEFAULT = ""; private static final String READ_COLUMN_NAMES_CONF_STR_DEFAULT = ""; - private static final String READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT = ""; - private static final boolean READ_ALL_COLUMNS_DEFAULT = true; - - private static final String COMMA = ","; - - /** Special Column Names added during Parquet Projection. **/ - public static final String PARQUET_BLOCK_OFFSET_COL_NAME = "BLOCK__OFFSET__INSIDE__FILE"; - public static final String PARQUET_INPUT_FILE_NAME = "INPUT__FILE__NAME"; - public static final String PARQUET_ROW_ID = "ROW__ID"; - - public static final List PARQUET_SPECIAL_COLUMN_NAMES = CollectionUtils - .createImmutableList(PARQUET_BLOCK_OFFSET_COL_NAME, PARQUET_INPUT_FILE_NAME, - PARQUET_ROW_ID); - - /** - * Sets the READ_ALL_COLUMNS flag and removes any previously - * set column ids. - */ - public static void setReadAllColumns(Configuration conf) { - conf.setBoolean(READ_ALL_COLUMNS, true); - setReadColumnIDConf(conf, READ_COLUMN_IDS_CONF_STR_DEFAULT); - setReadColumnNamesConf(conf, READ_COLUMN_NAMES_CONF_STR_DEFAULT); - } - - /** - * Returns the READ_ALL_COLUMNS columns flag. - */ - public static boolean isReadAllColumns(Configuration conf) { - return conf.getBoolean(READ_ALL_COLUMNS, READ_ALL_COLUMNS_DEFAULT); - } - - /** - * Sets the READ_ALL_COLUMNS flag to false and overwrites column ids - * with the provided list. - */ - public static void setReadColumns(Configuration conf, List ids, List names) { - setReadColumnIDConf(conf, READ_COLUMN_IDS_CONF_STR_DEFAULT); - setReadColumnNamesConf(conf, READ_COLUMN_NAMES_CONF_STR_DEFAULT); - appendReadColumns(conf, ids); - appendReadColumnNames(conf, names); - } - - /** - * Appends read columns' ids (start from zero). Once a column - * is included in the list, a underlying record reader of a columnar file format - * (e.g. RCFile and ORC) can know what columns are needed. - */ - public static void appendReadColumns(Configuration conf, List ids) { - String id = toReadColumnIDString(ids); - String old = conf.get(READ_COLUMN_IDS_CONF_STR, null); - String newConfStr = id; - if (old != null && !old.isEmpty()) { - newConfStr = newConfStr + StringUtils.COMMA_STR + old; - } - setReadColumnIDConf(conf, newConfStr); - // Set READ_ALL_COLUMNS to false - conf.setBoolean(READ_ALL_COLUMNS, false); - } - - /** - * Appends read nested column's paths. 
Once a read nested column path - * is included in the list, a underlying record reader of a columnar file format - * (e.g. Parquet and ORC) can know what columns are needed. - */ - public static void appendNestedColumnPaths( - Configuration conf, - List paths) { - if (paths == null || paths.isEmpty()) { - return; - } - String pathsStr = StringUtils.join(StringUtils.COMMA_STR, - paths.toArray(new String[paths.size()])); - String old = conf.get(READ_NESTED_COLUMN_PATH_CONF_STR, null); - String newConfStr = pathsStr; - if (old != null && !old.isEmpty()) { - newConfStr = newConfStr + StringUtils.COMMA_STR + old; - } - setReadNestedColumnPathConf(conf, newConfStr); - } - - - /** - * This method appends read column information to configuration to use for PPD. It is - * currently called with information from TSOP. Names come from TSOP input RowSchema, and - * IDs are the indexes inside the schema (which PPD assumes correspond to indexes inside the - * files to PPD in; something that would be invalid in many cases of schema evolution). - * @param conf Config to set values to. - * @param ids Column ids. - * @param names Column names. - */ - public static void appendReadColumns( - Configuration conf, List ids, List names, List groupPaths) { - if (ids.size() != names.size()) { - LOG.warn("Read column counts do not match: " - + ids.size() + " ids, " + names.size() + " names"); - } - appendReadColumns(conf, ids); - appendReadColumnNames(conf, names); - appendNestedColumnPaths(conf, groupPaths); - } - - public static void appendReadColumns( - StringBuilder readColumnsBuffer, StringBuilder readColumnNamesBuffer, List ids, - List names) { - String preppedIdStr = ids.stream().map(x -> String.valueOf(x)).collect(Collectors.joining(COMMA)); - String preppedNamesStr = names.stream().collect(Collectors.joining(COMMA)); - if (readColumnsBuffer.length() > 0) { - readColumnsBuffer.append(COMMA); - } - readColumnsBuffer.append(preppedIdStr); - if (readColumnNamesBuffer.length() > 0) { - readColumnNamesBuffer.append(COMMA); - } - readColumnNamesBuffer.append(preppedNamesStr); - } /** * Returns an array of column ids(start from zero) which is set in the given @@ -197,12 +76,6 @@ public static List getReadColumnIDs(Configuration conf) { return result; } - public static Set getNestedColumnPaths(Configuration conf) { - String skips = - conf.get(READ_NESTED_COLUMN_PATH_CONF_STR, READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT); - return new HashSet<>(Arrays.asList(StringUtils.split(skips))); - } - public static String[] getReadColumnNames(Configuration conf) { String colNames = conf.get(READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES_CONF_STR_DEFAULT); if (colNames != null && !colNames.isEmpty()) { @@ -236,65 +109,4 @@ public static List> getIOColumnNameAndTypes(Configuration co .collect(Collectors.toList()); } - public static void setIOColumnNameAndTypes(Configuration conf, List> colNamesAndTypes) { - String colNames = colNamesAndTypes.stream().map(e -> e.getKey()).collect(Collectors.joining(",")); - String colTypes = colNamesAndTypes.stream().map(e -> e.getValue()).collect(Collectors.joining(",")); - conf.set(IOConstants.COLUMNS, colNames); - conf.set(IOConstants.COLUMNS_TYPES, colTypes); - } - - private static void setReadColumnIDConf(Configuration conf, String id) { - if (id.trim().isEmpty()) { - conf.set(READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS_CONF_STR_DEFAULT); - } else { - conf.set(READ_COLUMN_IDS_CONF_STR, id); - } - } - - private static void setReadColumnNamesConf(Configuration conf, String id) { - if 
(id.trim().isEmpty()) { - conf.set(READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES_CONF_STR_DEFAULT); - } else { - conf.set(READ_COLUMN_NAMES_CONF_STR, id); - } - } - - private static void setReadNestedColumnPathConf( - Configuration conf, - String nestedColumnPaths) { - nestedColumnPaths = nestedColumnPaths.toLowerCase(); - if (nestedColumnPaths.trim().isEmpty()) { - conf.set(READ_NESTED_COLUMN_PATH_CONF_STR, READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT); - } else { - conf.set(READ_NESTED_COLUMN_PATH_CONF_STR, nestedColumnPaths); - } - } - - private static void appendReadColumnNames(Configuration conf, List cols) { - String old = conf.get(READ_COLUMN_NAMES_CONF_STR, ""); - StringBuilder result = new StringBuilder(old); - boolean first = old.isEmpty(); - for (String col: cols) { - if (first) { - first = false; - } else { - result.append(','); - } - result.append(col); - } - conf.set(READ_COLUMN_NAMES_CONF_STR, result.toString()); - } - - private static String toReadColumnIDString(List ids) { - String id = ""; - for (int i = 0; i < ids.size(); i++) { - if (i == 0) { - id = id + ids.get(i); - } else { - id = id + StringUtils.COMMA_STR + ids.get(i); - } - } - return id; - } - } \ No newline at end of file diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieCopyOnWriteTableInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieCopyOnWriteTableInputFormat.java new file mode 100644 index 0000000000000..ce441bf2e2892 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieCopyOnWriteTableInputFormat.java @@ -0,0 +1,369 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieTableQueryType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.Job; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import javax.annotation.Nonnull; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; +import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE; +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.buildMetadataConfig; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getFileStatus; + +/** + * Base implementation of the Hive's {@link FileInputFormat} allowing for reading of Hudi's + * Copy-on-Write (COW) tables in various configurations: + * + *

      + *
+ * <ul>
+ *   <li>Snapshot mode: reading table's state as of particular timestamp (or instant, in Hudi's terms)</li>
+ *   <li>Incremental mode: reading only the changes to the table between two timestamps (or instants, in Hudi's terms)</li>
+ *   <li>External mode: reading non-Hudi partitions</li>
+ * </ul>
    + * + * NOTE: This class is invariant of the underlying file-format of the files being read + */ +public class HoodieCopyOnWriteTableInputFormat extends HoodieTableInputFormat { + + private static final Logger LOG = LogManager.getLogger(HoodieCopyOnWriteTableInputFormat.class); + + @Override + protected boolean isSplitable(FileSystem fs, Path filename) { + return !(filename instanceof PathWithBootstrapFileStatus); + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, + String[] hosts) { + FileSplit split = new FileSplit(file, start, length, hosts); + + if (file instanceof PathWithBootstrapFileStatus) { + return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); + } + return split; + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, + String[] hosts, String[] inMemoryHosts) { + FileSplit split = new FileSplit(file, start, length, hosts, inMemoryHosts); + if (file instanceof PathWithBootstrapFileStatus) { + return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); + } + return split; + } + + @Override + public FileStatus[] listStatus(JobConf job) throws IOException { + // Segregate inputPaths[] to incremental, snapshot and non hoodie paths + List incrementalTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(job)); + InputPathHandler inputPathHandler = new InputPathHandler(conf, getInputPaths(job), incrementalTables); + List returns = new ArrayList<>(); + + Map tableMetaClientMap = inputPathHandler.getTableMetaClientMap(); + // process incremental pulls first + for (String table : incrementalTables) { + HoodieTableMetaClient metaClient = tableMetaClientMap.get(table); + if (metaClient == null) { + /* This can happen when the INCREMENTAL mode is set for a table but there were no InputPaths + * in the jobConf + */ + continue; + } + List inputPaths = inputPathHandler.getGroupedIncrementalPaths().get(metaClient); + List result = listStatusForIncrementalMode(job, metaClient, inputPaths, table); + if (result != null) { + returns.addAll(result); + } + } + + // process non hoodie Paths next. + List nonHoodiePaths = inputPathHandler.getNonHoodieInputPaths(); + if (nonHoodiePaths.size() > 0) { + setInputPaths(job, nonHoodiePaths.toArray(new Path[nonHoodiePaths.size()])); + FileStatus[] fileStatuses = doListStatus(job); + returns.addAll(Arrays.asList(fileStatuses)); + } + + // process snapshot queries next. + List snapshotPaths = inputPathHandler.getSnapshotPaths(); + if (snapshotPaths.size() > 0) { + returns.addAll(listStatusForSnapshotMode(job, tableMetaClientMap, snapshotPaths)); + } + return returns.toArray(new FileStatus[0]); + } + + @Override + public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { + throw new UnsupportedEncodingException("not implemented"); + } + + /** + * Abstracts and exposes {@link FileInputFormat#listStatus(JobConf)} operation to subclasses that + * lists files (returning an array of {@link FileStatus}) corresponding to the input paths specified + * as part of provided {@link JobConf} + */ + protected final FileStatus[] doListStatus(JobConf job) throws IOException { + return super.listStatus(job); + } + + /** + * Achieves listStatus functionality for an incrementally queried table. Instead of listing all + * partitions and then filtering based on the commits of interest, this logic first extracts the + * partitions touched by the desired commits and then lists only those partitions. 
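+   * Returns {@code null} when the incremental query cannot be resolved (no filtered timeline, no
+   * commits of interest, or no partitions affected by those commits), in which case no file
+   * statuses are contributed for that table.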
+ */ + protected List listStatusForIncrementalMode(JobConf job, + HoodieTableMetaClient tableMetaClient, + List inputPaths, + String incrementalTable) throws IOException { + Job jobContext = Job.getInstance(job); + Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); + if (!timeline.isPresent()) { + return null; + } + Option> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, incrementalTable, timeline.get()); + if (!commitsToCheck.isPresent()) { + return null; + } + Option incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths); + // Mutate the JobConf to set the input paths to only partitions touched by incremental pull. + if (!incrementalInputPaths.isPresent()) { + return null; + } + setInputPaths(job, incrementalInputPaths.get()); + FileStatus[] fileStatuses = doListStatus(job); + return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get()); + } + + protected FileStatus createFileStatusUnchecked(FileSlice fileSlice, Option instantOpt, String basePath, Option virtualKeyInfoOpt) { + Option baseFileOpt = fileSlice.getBaseFile(); + + if (baseFileOpt.isPresent()) { + return getFileStatusUnchecked(baseFileOpt.get()); + } else { + throw new IllegalStateException("Invalid state: base-file has to be present"); + } + } + + private BootstrapBaseFileSplit makeExternalFileSplit(PathWithBootstrapFileStatus file, FileSplit split) { + try { + LOG.info("Making external data split for " + file); + FileStatus externalFileStatus = file.getBootstrapFileStatus(); + FileSplit externalFileSplit = makeSplit(externalFileStatus.getPath(), 0, externalFileStatus.getLen(), + new String[0], new String[0]); + return new BootstrapBaseFileSplit(split, externalFileSplit); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + } + + @Nonnull + private List listStatusForSnapshotMode(JobConf job, + Map tableMetaClientMap, + List snapshotPaths) throws IOException { + HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(job); + List targetFiles = new ArrayList<>(); + + TypedProperties props = new TypedProperties(new Properties()); + + Map> groupedPaths = + HoodieInputFormatUtils.groupSnapshotPathsByMetaClient(tableMetaClientMap.values(), snapshotPaths); + Map fsViewCache = new HashMap<>(); + + for (Map.Entry> entry : groupedPaths.entrySet()) { + HoodieTableMetaClient tableMetaClient = entry.getKey(); + List partitionPaths = entry.getValue(); + + // Hive job might specify a max commit instant up to which table's state + // should be examined. We simply pass it as query's instant to the file-index + Option queryCommitInstant = + HoodieHiveUtils.getMaxCommit(job, tableMetaClient.getTableConfig().getTableName()); + + boolean shouldIncludePendingCommits = + HoodieHiveUtils.shouldIncludePendingCommits(job, tableMetaClient.getTableConfig().getTableName()); + + // NOTE: Fetching virtual key info is a costly operation as it needs to load the commit metadata. + // This is only needed for MOR realtime splits. Hence, for COW tables, this can be avoided. + Option virtualKeyInfoOpt = tableMetaClient.getTableType().equals(COPY_ON_WRITE) ? 
Option.empty() : getHoodieVirtualKeyInfo(tableMetaClient); + String basePath = tableMetaClient.getBasePathV2().toString(); + + if (conf.getBoolean(ENABLE.key(), DEFAULT_METADATA_ENABLE_FOR_READERS) && HoodieTableMetadataUtil.isFilesPartitionAvailable(tableMetaClient)) { + HiveHoodieTableFileIndex fileIndex = + new HiveHoodieTableFileIndex( + engineContext, + tableMetaClient, + props, + HoodieTableQueryType.SNAPSHOT, + partitionPaths, + queryCommitInstant, + shouldIncludePendingCommits); + + Map> partitionedFileSlices = fileIndex.listFileSlices(); + + targetFiles.addAll( + partitionedFileSlices.values() + .stream() + .flatMap(Collection::stream) + .filter(fileSlice -> checkIfValidFileSlice(fileSlice)) + .map(fileSlice -> createFileStatusUnchecked(fileSlice, fileIndex.getLatestCompletedInstant(), basePath, virtualKeyInfoOpt)) + .collect(Collectors.toList()) + ); + } else { + HoodieTimeline timeline = getActiveTimeline(tableMetaClient, shouldIncludePendingCommits); + Option queryInstant = queryCommitInstant.or(() -> timeline.lastInstant().map(HoodieInstant::getTimestamp)); + validateInstant(timeline, queryInstant); + + try { + HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(tableMetaClient, hoodieTableMetaClient -> + FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, hoodieTableMetaClient, buildMetadataConfig(job), timeline)); + + List filteredFileSlices = new ArrayList<>(); + + for (Path p : entry.getValue()) { + String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), p); + + List fileSlices = queryInstant.map( + instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relativePartitionPath, instant)) + .orElse(fsView.getLatestFileSlices(relativePartitionPath)) + .collect(Collectors.toList()); + + filteredFileSlices.addAll(fileSlices); + } + + targetFiles.addAll( + filteredFileSlices.stream() + .filter(fileSlice -> checkIfValidFileSlice(fileSlice)) + .map(fileSlice -> createFileStatusUnchecked(fileSlice, timeline.filterCompletedInstants().lastInstant(), basePath, virtualKeyInfoOpt)) + .collect(Collectors.toList())); + } finally { + fsViewCache.forEach(((metaClient, fsView) -> fsView.close())); + } + } + } + + return targetFiles; + } + + private static HoodieTimeline getActiveTimeline(HoodieTableMetaClient metaClient, boolean shouldIncludePendingCommits) { + HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline(); + if (shouldIncludePendingCommits) { + return timeline; + } else { + return timeline.filterCompletedAndCompactionInstants(); + } + } + + private static void validateInstant(HoodieTimeline activeTimeline, Option queryInstant) { + if (queryInstant.isPresent() && !activeTimeline.containsInstant(queryInstant.get())) { + throw new HoodieIOException(String.format("Query instant (%s) not found in the timeline", queryInstant.get())); + } + } + + protected boolean checkIfValidFileSlice(FileSlice fileSlice) { + Option baseFileOpt = fileSlice.getBaseFile(); + Option latestLogFileOpt = fileSlice.getLatestLogFile(); + + if (baseFileOpt.isPresent()) { + return true; + } else if (latestLogFileOpt.isPresent()) { + // It happens when reading optimized query to mor. 
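The snapshot listing in listStatusForSnapshotMode above reduces to: for each partition, pick the latest file slice at or before the query instant, and only surface it if it has a base file; a log-only slice is skipped, which is what checkIfValidFileSlice encodes for this base-file-oriented listing. A toy sketch of that selection with made-up types (Slice, latestUsableBaseFile), not Hudi's FileSlice/FileSystemView API:

```java
import java.util.Comparator;
import java.util.List;
import java.util.Optional;

class SnapshotListingSketch {
  // Toy model: a file slice has a commit instant, maybe a base file, and zero or more log files.
  record Slice(String instant, Optional<String> baseFile, List<String> logFiles) {}

  /** Latest slice at or before the query instant that actually has a base file. */
  static Optional<String> latestUsableBaseFile(List<Slice> slices, String queryInstant) {
    return slices.stream()
        .filter(s -> s.instant().compareTo(queryInstant) <= 0) // "before or on" the query instant
        .max(Comparator.comparing(Slice::instant))
        .flatMap(Slice::baseFile);                             // log-only slice -> empty result
  }

  public static void main(String[] args) {
    List<Slice> slices = List.of(
        new Slice("001", Optional.of("f1_001.parquet"), List.of()),
        new Slice("002", Optional.empty(), List.of(".f1_002.log"))); // log-only slice
    System.out.println(latestUsableBaseFile(slices, "002")); // Optional.empty -> skipped by the listing
    System.out.println(latestUsableBaseFile(slices, "001")); // Optional[f1_001.parquet]
  }
}
```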
+ return false; + } else { + throw new IllegalStateException("Invalid state: base-file has to be present for " + fileSlice.getFileId()); + } + } + + @Nonnull + protected static FileStatus getFileStatusUnchecked(HoodieBaseFile baseFile) { + try { + return getFileStatus(baseFile); + } catch (IOException ioe) { + throw new HoodieIOException("Failed to get file-status", ioe); + } + } + + protected static Option getHoodieVirtualKeyInfo(HoodieTableMetaClient metaClient) { + HoodieTableConfig tableConfig = metaClient.getTableConfig(); + if (tableConfig.populateMetaFields()) { + return Option.empty(); + } + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + try { + Schema schema = tableSchemaResolver.getTableAvroSchema(); + boolean isNonPartitionedKeyGen = StringUtils.isNullOrEmpty(tableConfig.getPartitionFieldProp()); + return Option.of( + new HoodieVirtualKeyInfo( + tableConfig.getRecordKeyFieldProp(), + isNonPartitionedKeyGen ? Option.empty() : Option.of(tableConfig.getPartitionFieldProp()), + schema.getField(tableConfig.getRecordKeyFieldProp()).pos(), + isNonPartitionedKeyGen ? Option.empty() : Option.of(schema.getField(tableConfig.getPartitionFieldProp()).pos()))); + } catch (Exception exception) { + throw new HoodieException("Fetching table schema failed with exception ", exception); + } + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java index 1747888b15ad7..6eb1663a0d12c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java @@ -18,128 +18,29 @@ package org.apache.hudi.hadoop; -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; /** * HoodieInputFormat for HUDI datasets which store data in HFile base file format. 
*/ @UseFileSplitsFromInputFormat -public class HoodieHFileInputFormat extends FileInputFormat implements Configurable { - - private static final Logger LOG = LogManager.getLogger(HoodieHFileInputFormat.class); - - protected Configuration conf; +public class HoodieHFileInputFormat extends HoodieCopyOnWriteTableInputFormat { protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { return HoodieInputFormatUtils.filterInstantsTimeline(timeline); } - @Override - public FileStatus[] listStatus(JobConf job) throws IOException { - // Segregate inputPaths[] to incremental, snapshot and non hoodie paths - List incrementalTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(job)); - InputPathHandler inputPathHandler = new InputPathHandler(conf, getInputPaths(job), incrementalTables); - List returns = new ArrayList<>(); - - Map tableMetaClientMap = inputPathHandler.getTableMetaClientMap(); - // process incremental pulls first - for (String table : incrementalTables) { - HoodieTableMetaClient metaClient = tableMetaClientMap.get(table); - if (metaClient == null) { - /* This can happen when the INCREMENTAL mode is set for a table but there were no InputPaths - * in the jobConf - */ - continue; - } - List inputPaths = inputPathHandler.getGroupedIncrementalPaths().get(metaClient); - List result = listStatusForIncrementalMode(job, metaClient, inputPaths); - if (result != null) { - returns.addAll(result); - } - } - - // process non hoodie Paths next. - List nonHoodiePaths = inputPathHandler.getNonHoodieInputPaths(); - if (nonHoodiePaths.size() > 0) { - setInputPaths(job, nonHoodiePaths.toArray(new Path[nonHoodiePaths.size()])); - FileStatus[] fileStatuses = super.listStatus(job); - returns.addAll(Arrays.asList(fileStatuses)); - } - - // process snapshot queries next. - List snapshotPaths = inputPathHandler.getSnapshotPaths(); - if (snapshotPaths.size() > 0) { - setInputPaths(job, snapshotPaths.toArray(new Path[snapshotPaths.size()])); - FileStatus[] fileStatuses = super.listStatus(job); - Map> groupedFileStatus = - HoodieInputFormatUtils.groupFileStatusForSnapshotPaths(fileStatuses, HoodieFileFormat.HFILE.getFileExtension(), - tableMetaClientMap.values()); - LOG.info("Found a total of " + groupedFileStatus.size() + " groups"); - for (Map.Entry> entry : groupedFileStatus.entrySet()) { - List result = HoodieInputFormatUtils.filterFileStatusForSnapshotMode(job, entry.getKey(), entry.getValue()); - if (result != null) { - returns.addAll(result); - } - } - } - return returns.toArray(new FileStatus[returns.size()]); - } - - /** - * Achieves listStatus functionality for an incrementally queried table. Instead of listing all - * partitions and then filtering based on the commits of interest, this logic first extracts the - * partitions touched by the desired commits and then lists only those partitions. 
- */ - private List listStatusForIncrementalMode( - JobConf job, HoodieTableMetaClient tableMetaClient, List inputPaths) throws IOException { - String tableName = tableMetaClient.getTableConfig().getTableName(); - Job jobContext = Job.getInstance(job); - Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); - if (!timeline.isPresent()) { - return null; - } - Option> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, tableName, timeline.get()); - if (!commitsToCheck.isPresent()) { - return null; - } - Option incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths); - // Mutate the JobConf to set the input paths to only partitions touched by incremental pull. - if (!incrementalInputPaths.isPresent()) { - return null; - } - setInputPaths(job, incrementalInputPaths.get()); - FileStatus[] fileStatuses = super.listStatus(job); - return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get()); - } - @Override public RecordReader getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException { @@ -151,13 +52,4 @@ protected boolean isSplitable(FileSystem fs, Path filename) { // This file isn't splittable. return false; } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public Configuration getConf() { - return conf; - } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 8b89949e803b6..7b79f61e49bcf 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -18,24 +18,6 @@ package org.apache.hudi.hadoop; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.utils.HoodieHiveUtils; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; - -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.io.ArrayWritable; @@ -45,122 +27,34 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapreduce.Job; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; 
import java.util.List; -import java.util.Map; import java.util.stream.Collectors; import java.util.stream.IntStream; /** * HoodieInputFormat which understands the Hoodie File Structure and filters files based on the Hoodie Mode. If paths * that does not correspond to a hoodie table then they are passed in as is (as what FileInputFormat.listStatus() - * would do). The JobConf could have paths from multipe Hoodie/Non-Hoodie tables + * would do). The JobConf could have paths from multiple Hoodie/Non-Hoodie tables */ @UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat -public class HoodieParquetInputFormat extends MapredParquetInputFormat implements Configurable { +public class HoodieParquetInputFormat extends HoodieParquetInputFormatBase { private static final Logger LOG = LogManager.getLogger(HoodieParquetInputFormat.class); - protected Configuration conf; - - protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { - return HoodieInputFormatUtils.filterInstantsTimeline(timeline); - } - - @Override - public FileStatus[] listStatus(JobConf job) throws IOException { - // Segregate inputPaths[] to incremental, snapshot and non hoodie paths - List incrementalTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(job)); - InputPathHandler inputPathHandler = new InputPathHandler(conf, getInputPaths(job), incrementalTables); - List returns = new ArrayList<>(); - - Map tableMetaClientMap = inputPathHandler.getTableMetaClientMap(); - // process incremental pulls first - for (String table : incrementalTables) { - HoodieTableMetaClient metaClient = tableMetaClientMap.get(table); - if (metaClient == null) { - /* This can happen when the INCREMENTAL mode is set for a table but there were no InputPaths - * in the jobConf - */ - continue; - } - List inputPaths = inputPathHandler.getGroupedIncrementalPaths().get(metaClient); - List result = listStatusForIncrementalMode(job, metaClient, inputPaths); - if (result != null) { - returns.addAll(result); - } - } - - // process non hoodie Paths next. - List nonHoodiePaths = inputPathHandler.getNonHoodieInputPaths(); - if (nonHoodiePaths.size() > 0) { - setInputPaths(job, nonHoodiePaths.toArray(new Path[nonHoodiePaths.size()])); - FileStatus[] fileStatuses = super.listStatus(job); - returns.addAll(Arrays.asList(fileStatuses)); - } - - // process snapshot queries next. - List snapshotPaths = inputPathHandler.getSnapshotPaths(); - if (snapshotPaths.size() > 0) { - setInputPaths(job, snapshotPaths.toArray(new Path[snapshotPaths.size()])); - FileStatus[] fileStatuses = super.listStatus(job); - Map> groupedFileStatus = - HoodieInputFormatUtils.groupFileStatusForSnapshotPaths(fileStatuses, - HoodieFileFormat.PARQUET.getFileExtension(), tableMetaClientMap.values()); - LOG.info("Found a total of " + groupedFileStatus.size() + " groups"); - for (Map.Entry> entry : groupedFileStatus.entrySet()) { - List result = HoodieInputFormatUtils.filterFileStatusForSnapshotMode(job, entry.getKey(), entry.getValue()); - if (result != null) { - returns.addAll(result); - } - } - } - return returns.toArray(new FileStatus[returns.size()]); - } - - - - /** - * Achieves listStatus functionality for an incrementally queried table. Instead of listing all - * partitions and then filtering based on the commits of interest, this logic first extracts the - * partitions touched by the desired commits and then lists only those partitions. 
- */ - private List listStatusForIncrementalMode( - JobConf job, HoodieTableMetaClient tableMetaClient, List inputPaths) throws IOException { - String tableName = tableMetaClient.getTableConfig().getTableName(); - Job jobContext = Job.getInstance(job); - Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); - if (!timeline.isPresent()) { - return null; - } - Option> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, tableName, timeline.get()); - if (!commitsToCheck.isPresent()) { - return null; - } - Option incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths); - // Mutate the JobConf to set the input paths to only partitions touched by incremental pull. - if (!incrementalInputPaths.isPresent()) { - return null; - } - setInputPaths(job, incrementalInputPaths.get()); - FileStatus[] fileStatuses = super.listStatus(job); - return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get()); + public HoodieParquetInputFormat() { + super(new HoodieCopyOnWriteTableInputFormat()); } - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public Configuration getConf() { - return conf; + protected HoodieParquetInputFormat(HoodieCopyOnWriteTableInputFormat delegate) { + super(delegate); } @Override @@ -179,90 +73,68 @@ public RecordReader getRecordReader(final InputSpli // clearOutExistingPredicate(job); // } if (split instanceof BootstrapBaseFileSplit) { - BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split; - String[] rawColNames = HoodieColumnProjectionUtils.getReadColumnNames(job); - List rawColIds = HoodieColumnProjectionUtils.getReadColumnIDs(job); - List> projectedColsWithIndex = - IntStream.range(0, rawColIds.size()).mapToObj(idx -> Pair.of(rawColIds.get(idx), rawColNames[idx])) - .collect(Collectors.toList()); - - List> hoodieColsProjected = projectedColsWithIndex.stream() - .filter(idxWithName -> HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue())) - .collect(Collectors.toList()); - List> externalColsProjected = projectedColsWithIndex.stream() - .filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue()) - && !HoodieHiveUtils.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) - .collect(Collectors.toList()); - - // This always matches hive table description - List> colNameWithTypes = HoodieColumnProjectionUtils.getIOColumnNameAndTypes(job); - List> colNamesWithTypesForExternal = colNameWithTypes.stream() - .filter(p -> !HoodieRecord.HOODIE_META_COLUMNS.contains(p.getKey())).collect(Collectors.toList()); - LOG.info("colNameWithTypes =" + colNameWithTypes + ", Num Entries =" + colNameWithTypes.size()); - if (hoodieColsProjected.isEmpty()) { - return super.getRecordReader(eSplit.getBootstrapFileSplit(), job, reporter); - } else if (externalColsProjected.isEmpty()) { - return super.getRecordReader(split, job, reporter); - } else { - FileSplit rightSplit = eSplit.getBootstrapFileSplit(); - // Hive PPD works at row-group level and only enabled when hive.optimize.index.filter=true; - // The above config is disabled by default. But when enabled, would cause misalignment between - // skeleton and bootstrap file. We will disable them specifically when query needs bootstrap and skeleton - // file to be stitched. 
- // This disables row-group filtering - JobConf jobConfCopy = new JobConf(job); - jobConfCopy.unset(TableScanDesc.FILTER_EXPR_CONF_STR); - jobConfCopy.unset(ConvertAstToSearchArg.SARG_PUSHDOWN); - - LOG.info("Generating column stitching reader for " + eSplit.getPath() + " and " + rightSplit.getPath()); - return new BootstrapColumnStichingRecordReader(super.getRecordReader(eSplit, jobConfCopy, reporter), - HoodieRecord.HOODIE_META_COLUMNS.size(), - super.getRecordReader(rightSplit, jobConfCopy, reporter), - colNamesWithTypesForExternal.size(), - true); - } + return createBootstrappingRecordReader(split, job, reporter); } + if (LOG.isDebugEnabled()) { LOG.debug("EMPLOYING DEFAULT RECORD READER - " + split); } - return super.getRecordReader(split, job, reporter); - } - - @Override - protected boolean isSplitable(FileSystem fs, Path filename) { - return !(filename instanceof PathWithBootstrapFileStatus); - } - - @Override - protected FileSplit makeSplit(Path file, long start, long length, - String[] hosts) { - FileSplit split = new FileSplit(file, start, length, hosts); - if (file instanceof PathWithBootstrapFileStatus) { - return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); - } - return split; + return getRecordReaderInternal(split, job, reporter); } - @Override - protected FileSplit makeSplit(Path file, long start, long length, - String[] hosts, String[] inMemoryHosts) { - FileSplit split = new FileSplit(file, start, length, hosts, inMemoryHosts); - if (file instanceof PathWithBootstrapFileStatus) { - return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); - } - return split; + private RecordReader getRecordReaderInternal(InputSplit split, + JobConf job, + Reporter reporter) throws IOException { + return super.getRecordReader(split, job, reporter); } - private BootstrapBaseFileSplit makeExternalFileSplit(PathWithBootstrapFileStatus file, FileSplit split) { - try { - LOG.info("Making external data split for " + file); - FileStatus externalFileStatus = file.getBootstrapFileStatus(); - FileSplit externalFileSplit = makeSplit(externalFileStatus.getPath(), 0, externalFileStatus.getLen(), - new String[0], new String[0]); - return new BootstrapBaseFileSplit(split, externalFileSplit); - } catch (IOException e) { - throw new HoodieIOException(e.getMessage(), e); + private RecordReader createBootstrappingRecordReader(InputSplit split, + JobConf job, + Reporter reporter) throws IOException { + BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split; + String[] rawColNames = HoodieColumnProjectionUtils.getReadColumnNames(job); + List rawColIds = HoodieColumnProjectionUtils.getReadColumnIDs(job); + List> projectedColsWithIndex = + IntStream.range(0, rawColIds.size()).mapToObj(idx -> Pair.of(rawColIds.get(idx), rawColNames[idx])) + .collect(Collectors.toList()); + + List> hoodieColsProjected = projectedColsWithIndex.stream() + .filter(idxWithName -> HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue())) + .collect(Collectors.toList()); + List> externalColsProjected = projectedColsWithIndex.stream() + .filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue()) + && !HoodieHiveUtils.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) + .collect(Collectors.toList()); + + // This always matches hive table description + List> colNameWithTypes = HoodieColumnProjectionUtils.getIOColumnNameAndTypes(job); + List> colNamesWithTypesForExternal = colNameWithTypes.stream() + .filter(p -> 
!HoodieRecord.HOODIE_META_COLUMNS.contains(p.getKey())).collect(Collectors.toList()); + + LOG.info("colNameWithTypes =" + colNameWithTypes + ", Num Entries =" + colNameWithTypes.size()); + + if (hoodieColsProjected.isEmpty()) { + return getRecordReaderInternal(eSplit.getBootstrapFileSplit(), job, reporter); + } else if (externalColsProjected.isEmpty()) { + return getRecordReaderInternal(split, job, reporter); + } else { + FileSplit rightSplit = eSplit.getBootstrapFileSplit(); + // Hive PPD works at row-group level and only enabled when hive.optimize.index.filter=true; + // The above config is disabled by default. But when enabled, would cause misalignment between + // skeleton and bootstrap file. We will disable them specifically when query needs bootstrap and skeleton + // file to be stitched. + // This disables row-group filtering + JobConf jobConfCopy = new JobConf(job); + jobConfCopy.unset(TableScanDesc.FILTER_EXPR_CONF_STR); + jobConfCopy.unset(ConvertAstToSearchArg.SARG_PUSHDOWN); + + LOG.info("Generating column stitching reader for " + eSplit.getPath() + " and " + rightSplit.getPath()); + return new BootstrapColumnStichingRecordReader(getRecordReaderInternal(eSplit, jobConfCopy, reporter), + HoodieRecord.HOODIE_META_COLUMNS.size(), + getRecordReaderInternal(rightSplit, jobConfCopy, reporter), + colNamesWithTypesForExternal.size(), + true); } } } \ No newline at end of file diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormatBase.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormatBase.java new file mode 100644 index 0000000000000..ed88acacb4d2f --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormatBase.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hudi.hadoop.realtime.HoodieMergeOnReadTableInputFormat; + +import java.io.IOException; + +/** + * !!! PLEASE READ CAREFULLY !!! + * + * NOTE: Hive bears optimizations which are based upon validating whether {@link FileInputFormat} + * implementation inherits from {@link MapredParquetInputFormat}. 
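The new HoodieParquetInputFormatBase keeps MapredParquetInputFormat as the declared parent so Hive's Parquet-specific optimizations still apply, while all of the actual listing and split logic is delegated to a HoodieTableInputFormat instance. A minimal sketch of that "is-a Parquet format, has-a Hudi listing" shape, using simplified stand-in types rather than the real Hadoop/Hive classes:

```java
class ParquetDelegationSketch {

  interface Listing {                                    // plays the role of HoodieTableInputFormat
    String[] listStatus(String job);
  }

  static class CopyOnWriteListing implements Listing {   // ~ HoodieCopyOnWriteTableInputFormat
    public String[] listStatus(String job) {
      return new String[] {"2022-01-01/f1_001.parquet"};
    }
  }

  // ~ HoodieParquetInputFormatBase: inherits the Parquet format in the real code, and forwards
  // every FileInputFormat hook to the Hudi-aware delegate supplied by the subclass.
  abstract static class ParquetFormatBase {
    private final Listing delegate;
    protected ParquetFormatBase(Listing delegate) { this.delegate = delegate; }
    public final String[] listStatus(String job) { return delegate.listStatus(job); }
  }

  static class ParquetFormat extends ParquetFormatBase { // ~ HoodieParquetInputFormat
    ParquetFormat() { super(new CopyOnWriteListing()); }
  }

  public static void main(String[] args) {
    System.out.println(String.join(", ", new ParquetFormat().listStatus("job")));
  }
}
```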
+ * + * To make sure that Hudi implementations are leveraging these optimizations to the fullest, this class + * serves as a base-class for every {@link FileInputFormat} implementations working with Parquet file-format. + * + * However, this class serves as a simple delegate to the actual implementation hierarchy: it expects + * either {@link HoodieCopyOnWriteTableInputFormat} or {@link HoodieMergeOnReadTableInputFormat} to be supplied + * to which it delegates all of its necessary methods. + */ +public abstract class HoodieParquetInputFormatBase extends MapredParquetInputFormat implements Configurable { + + private final HoodieTableInputFormat inputFormatDelegate; + + protected HoodieParquetInputFormatBase(HoodieCopyOnWriteTableInputFormat inputFormatDelegate) { + this.inputFormatDelegate = inputFormatDelegate; + } + + @Override + public final void setConf(Configuration conf) { + inputFormatDelegate.setConf(conf); + } + + @Override + public final Configuration getConf() { + return inputFormatDelegate.getConf(); + } + + @Override + public final InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { + return inputFormatDelegate.getSplits(job, numSplits); + } + + @Override + protected final boolean isSplitable(FileSystem fs, Path filename) { + return inputFormatDelegate.isSplitable(fs, filename); + } + + @Override + protected final FileSplit makeSplit(Path file, long start, long length, + String[] hosts) { + return inputFormatDelegate.makeSplit(file, start, length, hosts); + } + + @Override + protected final FileSplit makeSplit(Path file, long start, long length, + String[] hosts, String[] inMemoryHosts) { + return inputFormatDelegate.makeSplit(file, start, length, hosts, inMemoryHosts); + } + + @Override + public final FileStatus[] listStatus(JobConf job) throws IOException { + return inputFormatDelegate.listStatus(job); + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java index 1e616f896bb30..de1fd0055dc27 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieROTablePathFilter.java @@ -18,18 +18,20 @@ package org.apache.hudi.hadoop; -import java.util.Map; -import java.util.Set; -import org.apache.hadoop.conf.Configurable; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewManager; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -41,8 +43,13 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; +import static 
org.apache.hudi.common.config.HoodieCommonConfig.TIMESTAMP_AS_OF; + /** * Given a path is a part of - Hoodie table = accepts ONLY the latest version of each path - Non-Hoodie table = then * always accept @@ -67,7 +74,7 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial /** * Paths that are known to be non-hoodie tables. */ - private Set nonHoodiePathCache; + Set nonHoodiePathCache; /** * Table Meta Client Cache. @@ -79,6 +86,9 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial */ private SerializableConfiguration conf; + private transient HoodieLocalEngineContext engineContext; + + private transient FileSystem fs; public HoodieROTablePathFilter() { @@ -86,7 +96,7 @@ public HoodieROTablePathFilter() { } public HoodieROTablePathFilter(Configuration conf) { - this.hoodiePathCache = new HashMap<>(); + this.hoodiePathCache = new ConcurrentHashMap<>(); this.nonHoodiePathCache = new HashSet<>(); this.conf = new SerializableConfiguration(conf); this.metaClientCache = new HashMap<>(); @@ -108,6 +118,10 @@ private Path safeGetParentsParent(Path path) { @Override public boolean accept(Path path) { + if (engineContext == null) { + this.engineContext = new HoodieLocalEngineContext(this.conf.get()); + } + if (LOG.isDebugEnabled()) { LOG.debug("Checking acceptance for path " + path); } @@ -156,16 +170,34 @@ public boolean accept(Path path) { } if (baseDir != null) { + // Check whether baseDir in nonHoodiePathCache + if (nonHoodiePathCache.contains(baseDir.toString())) { + if (LOG.isDebugEnabled()) { + LOG.debug("Accepting non-hoodie path from cache: " + path); + } + return true; + } + HoodieTableFileSystemView fsView = null; try { HoodieTableMetaClient metaClient = metaClientCache.get(baseDir.toString()); if (null == metaClient) { - metaClient = new HoodieTableMetaClient(fs.getConf(), baseDir.toString(), true); + metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir.toString()).setLoadActiveTimelineOnLoad(true).build(); metaClientCache.put(baseDir.toString(), metaClient); } - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(), fs.listStatus(folder)); - List latestFiles = fsView.getLatestBaseFiles().collect(Collectors.toList()); + if (getConf().get(TIMESTAMP_AS_OF.key()) != null) { + // Build FileSystemViewManager with specified time, it's necessary to set this config when you may + // access old version files. For example, in spark side, using "hoodie.datasource.read.paths" + // which contains old version files, if not specify this value, these files will be filtered. 
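The TIMESTAMP_AS_OF branch above boils down to: restrict the timeline to completed instants at or before the requested timestamp, then resolve the latest base file per file group against that restricted view, so older file versions supplied via read paths remain visible. A toy sketch of that filtering, with made-up types (Version, latestAsOf) instead of Hudi's timeline and file-system-view classes:

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class AsOfFilterSketch {
  // Toy model: each base-file version of a file group is tagged with the instant that wrote it.
  record Version(String fileGroupId, String instant, String fileName) {}

  /** Latest visible version per file group as of the given instant (inclusive). */
  static Map<String, String> latestAsOf(List<Version> versions, String asOfInstant) {
    Map<String, String> latest = new HashMap<>();
    Map<String, String> latestInstant = new HashMap<>();
    for (Version v : versions) {
      if (v.instant().compareTo(asOfInstant) > 0) {
        continue; // written after the as-of timestamp -> not visible
      }
      String seen = latestInstant.get(v.fileGroupId());
      if (seen == null || v.instant().compareTo(seen) > 0) {
        latestInstant.put(v.fileGroupId(), v.instant());
        latest.put(v.fileGroupId(), v.fileName());
      }
    }
    return latest;
  }

  public static void main(String[] args) {
    List<Version> versions = List.of(
        new Version("fg1", "001", "fg1_001.parquet"),
        new Version("fg1", "003", "fg1_003.parquet"));
    System.out.println(latestAsOf(versions, "002")); // {fg1=fg1_001.parquet}: the older file stays visible
  }
}
```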
+ fsView = FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, + metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf()), + metaClient.getActiveTimeline().filterCompletedInstants().findInstantsBeforeOrEquals(getConf().get(TIMESTAMP_AS_OF.key()))); + } else { + fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, + metaClient, HoodieInputFormatUtils.buildMetadataConfig(getConf())); + } + String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder); + List latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList()); // populate the cache if (!hoodiePathCache.containsKey(folder.toString())) { hoodiePathCache.put(folder.toString(), new HashSet<>()); @@ -185,10 +217,15 @@ public boolean accept(Path path) { } catch (TableNotFoundException e) { // Non-hoodie path, accept it. if (LOG.isDebugEnabled()) { - LOG.debug(String.format("(1) Caching non-hoodie path under %s \n", folder.toString())); + LOG.debug(String.format("(1) Caching non-hoodie path under %s with basePath %s \n", folder.toString(), baseDir.toString())); } nonHoodiePathCache.add(folder.toString()); + nonHoodiePathCache.add(baseDir.toString()); return true; + } finally { + if (fsView != null) { + fsView.close(); + } } } else { // files is at < 3 level depth in FS tree, can't be hoodie dataset diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieTableInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieTableInputFormat.java new file mode 100644 index 0000000000000..d18cb7895ad00 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieTableInputFormat.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; + +import java.io.IOException; + +/** + * Abstract base class of the Hive's {@link FileInputFormat} implementations allowing for reading of Hudi's + * Copy-on-Write (COW) and Merge-on-Read (MOR) tables + */ +public abstract class HoodieTableInputFormat extends FileInputFormat + implements Configurable { + + protected Configuration conf; + + @Override + public final Configuration getConf() { + return conf; + } + + @Override + public final void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + protected boolean isSplitable(FileSystem fs, Path filename) { + return super.isSplitable(fs, filename); + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) { + return super.makeSplit(file, start, length, hosts); + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) { + return super.makeSplit(file, start, length, hosts, inMemoryHosts); + } + + @Override + protected FileStatus[] listStatus(JobConf job) throws IOException { + return super.listStatus(job); + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java index 0a5055a056fa2..24d190700fea3 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java @@ -19,11 +19,13 @@ package org.apache.hudi.hadoop; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.InvalidTableException; import org.apache.hudi.exception.TableNotFoundException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -33,23 +35,32 @@ import java.util.List; import java.util.Map; -import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getTableMetaClientForBasePath; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getTableMetaClientForBasePathUnchecked; /** * InputPathHandler takes in a set of input paths and incremental tables list. Then, classifies the * input paths to incremental, snapshot paths and non-hoodie paths. This is then accessed later to * mutate the JobConf before processing incremental mode queries and snapshot queries. + * + * Note: We are adding jobConf of a mapreduce or spark job. The properties in the jobConf are two + * type: session properties and table properties from metastore. While session property is common + * for all the tables in a query the table properties are unique per table so there is no need to + * check if it belongs to the table for which the path handler is now instantiated. The jobConf has + * all table properties such as name, last modification time and so on which are unique to a table. 
+ * This class is written in such a way that it can handle multiple tables and properties unique to + * a table but for table level property such check is not required. */ public class InputPathHandler { public static final Logger LOG = LogManager.getLogger(InputPathHandler.class); private final Configuration conf; - // tablename to metadata mapping for all Hoodie tables(both incremental & snapshot) + // tableName to metadata mapping for all Hoodie tables(both incremental & snapshot) private final Map tableMetaClientMap; private final Map> groupedIncrementalPaths; private final List snapshotPaths; private final List nonHoodieInputPaths; + private boolean isIncrementalUseDatabase; public InputPathHandler(Configuration conf, Path[] inputPaths, List incrementalTables) throws IOException { this.conf = conf; @@ -57,14 +68,14 @@ public InputPathHandler(Configuration conf, Path[] inputPaths, List incr snapshotPaths = new ArrayList<>(); nonHoodieInputPaths = new ArrayList<>(); groupedIncrementalPaths = new HashMap<>(); + this.isIncrementalUseDatabase = HoodieHiveUtils.isIncrementalUseDatabase(conf); parseInputPaths(inputPaths, incrementalTables); } /** * Takes in the original InputPaths and classifies each of them into incremental, snapshot and * non-hoodie InputPaths. The logic is as follows: - * - * 1. Check if an inputPath starts with the same basepath as any of the metadata basepaths we know + * 1. Check if an inputPath starts with the same basePath as any of the metadata basePaths we know * 1a. If yes, this belongs to a Hoodie table that we already know about. Simply classify this * as incremental or snapshot - We can get the table name of this inputPath from the * metadata. Then based on the list of incrementalTables, we can classify this inputPath. @@ -88,19 +99,17 @@ private void parseInputPaths(Path[] inputPaths, List incrementalTables) // We already know the base path for this inputPath. basePathKnown = true; // Check if this is for a snapshot query - String tableName = metaClient.getTableConfig().getTableName(); - tagAsIncrementalOrSnapshot(inputPath, tableName, metaClient, incrementalTables); + tagAsIncrementalOrSnapshot(inputPath, metaClient, incrementalTables); break; } } if (!basePathKnown) { - // This path is for a table that we dont know about yet. + // This path is for a table that we don't know about yet. 
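The classification performed by parseInputPaths above can be summarized as: if an input path falls under a known or newly discovered Hudi base path, it is tagged incremental or snapshot depending on whether its (optionally database-qualified) table name is in the incremental-tables list; otherwise it is a non-Hudi path. A compact sketch over toy inputs; the base-path lookup here is a plain prefix match standing in for the real metaclient resolution:

```java
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

class InputPathClassificationSketch {
  // basePath -> table name stands in for the metaclient lookup done by the real handler.
  static Map<String, List<String>> classify(List<String> inputPaths,
                                            Map<String, String> basePathToTable,
                                            Set<String> incrementalTables) {
    Map<String, List<String>> buckets = new LinkedHashMap<>();
    buckets.put("incremental", new ArrayList<>());
    buckets.put("snapshot", new ArrayList<>());
    buckets.put("nonHoodie", new ArrayList<>());
    for (String path : inputPaths) {
      String table = basePathToTable.entrySet().stream()
          .filter(e -> path.startsWith(e.getKey()))
          .map(Map.Entry::getValue)
          .findFirst().orElse(null);
      if (table == null) {
        buckets.get("nonHoodie").add(path);
      } else if (incrementalTables.contains(table)) {
        buckets.get("incremental").add(path);
      } else {
        buckets.get("snapshot").add(path);
      }
    }
    return buckets;
  }

  public static void main(String[] args) {
    System.out.println(classify(
        List.of("/warehouse/hudi_tbl/2022/01/01", "/warehouse/plain_tbl/part=1"),
        Map.of("/warehouse/hudi_tbl", "db.hudi_tbl"),
        Set.of("db.hudi_tbl")));
  }
}
```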
HoodieTableMetaClient metaClient; try { - metaClient = getTableMetaClientForBasePath(inputPath.getFileSystem(conf), inputPath); - String tableName = metaClient.getTableConfig().getTableName(); - tableMetaClientMap.put(tableName, metaClient); - tagAsIncrementalOrSnapshot(inputPath, tableName, metaClient, incrementalTables); + metaClient = getTableMetaClientForBasePathUnchecked(conf, inputPath); + tableMetaClientMap.put(getIncrementalTable(metaClient), metaClient); + tagAsIncrementalOrSnapshot(inputPath, metaClient, incrementalTables); } catch (TableNotFoundException | InvalidTableException e) { // This is a non Hoodie inputPath LOG.info("Handling a non-hoodie path " + inputPath); @@ -110,9 +119,8 @@ private void parseInputPaths(Path[] inputPaths, List incrementalTables) } } - private void tagAsIncrementalOrSnapshot(Path inputPath, String tableName, - HoodieTableMetaClient metaClient, List incrementalTables) { - if (!incrementalTables.contains(tableName)) { + private void tagAsIncrementalOrSnapshot(Path inputPath, HoodieTableMetaClient metaClient, List incrementalTables) { + if (!incrementalTables.contains(getIncrementalTable(metaClient))) { snapshotPaths.add(inputPath); } else { // Group incremental Paths belonging to same table. @@ -138,4 +146,11 @@ public List getSnapshotPaths() { public List getNonHoodieInputPaths() { return nonHoodieInputPaths; } + + private String getIncrementalTable(HoodieTableMetaClient metaClient) { + String databaseName = metaClient.getTableConfig().getDatabaseName(); + String tableName = metaClient.getTableConfig().getTableName(); + return isIncrementalUseDatabase && !StringUtils.isNullOrEmpty(databaseName) + ? databaseName + "." + tableName : tableName; + } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java index 745657183d7bd..5dcd66cd826d0 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputSplitUtils.java @@ -22,14 +22,6 @@ import java.io.DataOutput; import java.io.IOException; import java.nio.charset.StandardCharsets; -import org.apache.avro.Schema; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; -import org.apache.hudi.io.storage.HoodieFileReader; -import org.apache.hudi.io.storage.HoodieFileReaderFactory; public class InputSplitUtils { @@ -45,23 +37,11 @@ public static String readString(DataInput in) throws IOException { return new String(bytes, StandardCharsets.UTF_8); } - /** - * Return correct base-file schema based on split. 
- * - * @param split File Split - * @param conf Configuration - * @return - */ - public static Schema getBaseFileSchema(FileSplit split, Configuration conf) { - try { - if (split instanceof BootstrapBaseFileSplit) { - HoodieFileReader storageReader = HoodieFileReaderFactory.getFileReader(conf, - ((BootstrapBaseFileSplit)(split)).getBootstrapFileSplit().getPath()); - return HoodieAvroUtils.addMetadataFields(storageReader.getSchema()); - } - return HoodieRealtimeRecordReaderUtils.readSchema(conf, split.getPath()); - } catch (IOException e) { - throw new HoodieIOException("Failed to read footer for parquet " + split.getPath(), e); - } + public static void writeBoolean(Boolean valueToWrite, DataOutput out) throws IOException { + out.writeBoolean(valueToWrite); + } + + public static boolean readBoolean(DataInput in) throws IOException { + return in.readBoolean(); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java new file mode 100644 index 0000000000000..641aa2759ff20 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.realtime.HoodieRealtimePath; +import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo; + +import java.io.IOException; +import java.util.List; + +/** + * With the base input format implementations in Hadoop/Hive, + * we need to encode additional information in Path to track base files and logs files for realtime read. + * Hence, this class tracks a log/base file status + * in Path. 
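Because stock Hadoop/Hive input formats only hand a Path to the record reader, RealtimeFileStatus piggybacks the extra realtime metadata (delta log files, max commit time, bootstrap file) on the Path object returned from getPath(). A stripped-down illustration of that "carrier Path" pattern with plain Java stand-ins rather than the Hadoop FileStatus/Path types:

```java
import java.util.List;

class CarrierPathSketch {
  // Stand-in for org.apache.hadoop.fs.Path: just a string-valued location.
  static class SimplePath {
    final String location;
    SimplePath(String location) { this.location = location; }
    public String toString() { return location; }
  }

  // Stand-in for HoodieRealtimePath: a Path that additionally carries realtime metadata.
  static class RealtimePath extends SimplePath {
    final List<String> deltaLogFiles;
    final String maxCommitTime;
    RealtimePath(String location, List<String> deltaLogFiles, String maxCommitTime) {
      super(location);
      this.deltaLogFiles = deltaLogFiles;
      this.maxCommitTime = maxCommitTime;
    }
  }

  public static void main(String[] args) {
    // Code that only knows about SimplePath still works; realtime-aware code can down-cast and
    // read the extra fields, which is how the record reader recovers them in the real flow.
    SimplePath p = new RealtimePath("/tbl/2022/01/01/f1_001.parquet",
        List.of(".f1_002.log", ".f1_003.log"), "003");
    System.out.println(p + " logs=" + ((RealtimePath) p).deltaLogFiles);
  }
}
```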
+ */ +public class RealtimeFileStatus extends FileStatus { + /** + * Base path of the table this path belongs to + */ + private final String basePath; + /** + * List of delta log-files holding updated records for this base-file + */ + private final List deltaLogFiles; + /** + * Marks whether this path produced as part of Incremental Query + */ + private final boolean belongsToIncrementalQuery; + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ + private String maxCommitTime = ""; + /** + * File status for the Bootstrap file (only relevant if this table is a bootstrapped table + */ + private FileStatus bootStrapFileStatus; + /** + * Virtual key configuration of the table this split belongs to + */ + private final Option virtualKeyInfo; + + public RealtimeFileStatus(FileStatus fileStatus, + String basePath, + List deltaLogFiles, + boolean belongsToIncrementalQuery, + Option virtualKeyInfo) throws IOException { + super(fileStatus); + this.basePath = basePath; + this.deltaLogFiles = deltaLogFiles; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfo; + } + + @Override + public Path getPath() { + Path path = super.getPath(); + + HoodieRealtimePath realtimePath = new HoodieRealtimePath(path.getParent(), path.getName(), basePath, + deltaLogFiles, maxCommitTime, belongsToIncrementalQuery, virtualKeyInfo); + + if (bootStrapFileStatus != null) { + realtimePath.setPathWithBootstrapFileStatus((PathWithBootstrapFileStatus)bootStrapFileStatus.getPath()); + } + + return realtimePath; + } + + public List getDeltaLogFiles() { + return deltaLogFiles; + } + + public String getMaxCommitTime() { + return maxCommitTime; + } + + public void setMaxCommitTime(String maxCommitTime) { + this.maxCommitTime = maxCommitTime; + } + + public void setBootStrapFileStatus(FileStatus bootStrapFileStatus) { + this.bootStrapFileStatus = bootStrapFileStatus; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java index 356ae96da3c51..8736883cea72c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java @@ -18,15 +18,8 @@ package org.apache.hudi.hadoop.hive; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.realtime.HoodieCombineRealtimeRecordReader; -import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hive.common.StringInternUtils; @@ -63,6 +56,13 @@ import org.apache.hadoop.mapred.lib.CombineFileInputFormat; import org.apache.hadoop.mapred.lib.CombineFileSplit; import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.HoodieParquetInputFormatBase; +import 
org.apache.hudi.hadoop.realtime.HoodieCombineRealtimeRecordReader; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -170,7 +170,7 @@ private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map partitions = new ArrayList<>(part.getPartSpec().keySet()); if (!partitions.isEmpty()) { - String partitionStr = String.join(",", partitions); + String partitionStr = String.join("/", partitions); LOG.info("Setting Partitions in jobConf - Partition Keys for Path : " + path + " is :" + partitionStr); job.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, partitionStr); } else { @@ -183,8 +183,8 @@ private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map listStatus(JobContext job) throws IOException { LOG.info("Listing status in HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim"); List result; if (hoodieFilter) { - HoodieParquetInputFormat input; + HoodieParquetInputFormatBase input; if (isRealTime) { LOG.info("Using HoodieRealtimeInputFormat"); input = createParquetRealtimeInputFormat(); @@ -917,7 +917,7 @@ public CombineFileSplit[] getSplits(JobConf job, int numSplits) throws IOExcepti job.set("hudi.hive.realtime", "true"); InputSplit[] splits; if (hoodieFilter) { - HoodieParquetInputFormat input = createParquetRealtimeInputFormat(); + HoodieParquetRealtimeInputFormat input = createParquetRealtimeInputFormat(); input.setConf(job); splits = input.getSplits(job, numSplits); } else { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java index 050b91add5772..ea71de07ba00b 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java @@ -19,26 +19,26 @@ package org.apache.hudi.hadoop.realtime; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodiePayloadProps; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.log.LogReaderUtils; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.InputSplitUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.stream.Collectors; /** @@ -50,10 +50,12 @@ public abstract class AbstractRealtimeRecordReader { protected final RealtimeSplit split; protected final JobConf jobConf; protected final boolean usesCustomPayload; + protected Properties payloadProps = new Properties(); // Schema handles private Schema readerSchema; private Schema writerSchema; private 
Schema hiveSchema; + private HoodieTableMetaClient metaClient; public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) { this.split = split; @@ -62,34 +64,32 @@ public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) { LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); LOG.info("partitioningColumns ==> " + job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "")); try { - this.usesCustomPayload = usesCustomPayload(); + metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(split.getBasePath()).build(); + if (metaClient.getTableConfig().getPreCombineField() != null) { + this.payloadProps.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, metaClient.getTableConfig().getPreCombineField()); + } + this.usesCustomPayload = usesCustomPayload(metaClient); LOG.info("usesCustomPayload ==> " + this.usesCustomPayload); init(); - } catch (IOException e) { - throw new HoodieIOException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); + } catch (Exception e) { + throw new HoodieException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); } } - private boolean usesCustomPayload() { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jobConf, split.getBasePath()); + private boolean usesCustomPayload(HoodieTableMetaClient metaClient) { return !(metaClient.getTableConfig().getPayloadClass().contains(HoodieAvroPayload.class.getName()) - || metaClient.getTableConfig().getPayloadClass().contains("org.apache.hudi.OverwriteWithLatestAvroPayload")); + || metaClient.getTableConfig().getPayloadClass().contains(OverwriteWithLatestAvroPayload.class.getName())); } /** - * Goes through the log files in reverse order and finds the schema from the last available data block. If not, falls + * Gets schema from HoodieTableMetaClient. If not, falls * back to the schema from the latest parquet file. Finally, sets the partition column and projection fields into the * job conf. 
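The init() path described above also reconciles the Avro writer schema with Hive's partition columns (hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS): any partition field missing from the writer schema is appended so result rows still carry a (possibly null) value for it. A small sketch of that reconciliation over plain field-name lists rather than real Avro schemas; the "/" separator mirrors the fix this patch makes in HoodieCombineHiveInputFormat:

```java
import java.util.ArrayList;
import java.util.List;

class PartitionFieldReconcileSketch {
  /** Append partition columns that the writer schema does not already contain. */
  static List<String> addMissingPartitionFields(List<String> writerFields, String partitionColumnsProp) {
    List<String> result = new ArrayList<>(writerFields);
    if (partitionColumnsProp == null || partitionColumnsProp.isEmpty()) {
      return result;
    }
    // Partition column names arrive as a "/"-separated list in the job conf.
    for (String field : partitionColumnsProp.split("/")) {
      if (!result.contains(field)) {
        result.add(field); // will surface as a null-valued column in the merged rows
      }
    }
    return result;
  }

  public static void main(String[] args) {
    System.out.println(addMissingPartitionFields(
        List.of("_hoodie_commit_time", "id", "value"), "datestr"));
    // -> [_hoodie_commit_time, id, value, datestr]
  }
}
```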
*/ - private void init() throws IOException { - Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogPaths(), jobConf); - if (schemaFromLogFile == null) { - writerSchema = InputSplitUtils.getBaseFileSchema((FileSplit)split, jobConf); - LOG.info("Writer Schema From Parquet => " + writerSchema.getFields()); - } else { - writerSchema = schemaFromLogFile; - LOG.info("Writer Schema From Log => " + writerSchema.toString(true)); - } + private void init() throws Exception { + LOG.info("Getting writer schema from table avro schema "); + writerSchema = new TableSchemaResolver(metaClient).getTableAvroSchema(); + // Add partitioning fields to writer schema for resulting row to contain null values for these fields String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); List partitioningFields = @@ -147,4 +147,12 @@ public Schema getWriterSchema() { public Schema getHiveSchema() { return hiveSchema; } + + public RealtimeSplit getSplit() { + return split; + } + + public JobConf getJobConf() { + return jobConf; + } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java index 7a0bd37bb4279..7fecd57927050 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieCombineRealtimeRecordReader.java @@ -20,8 +20,8 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit; -import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; +import org.apache.hadoop.hive.ql.io.IOContextMap; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.InputSplit; @@ -65,13 +65,14 @@ public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split @Override public boolean next(NullWritable key, ArrayWritable value) throws IOException { if (this.currentRecordReader.next(key, value)) { - LOG.info("Reading from record reader"); - LOG.info(HoodieRealtimeRecordReaderUtils.arrayWritableToString(value)); return true; } else if (recordReaders.size() > 0) { this.currentRecordReader.close(); this.currentRecordReader = recordReaders.remove(0); - return this.currentRecordReader.next(key, value); + AbstractRealtimeRecordReader reader = (AbstractRealtimeRecordReader)currentRecordReader.getReader(); + // when switch reader, ioctx should be updated + IOContextMap.get(reader.getJobConf()).setInputPath(reader.getSplit().getPath()); + return next(key, value); } else { return false; } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieEmptyRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieEmptyRecordReader.java new file mode 100644 index 0000000000000..d995e44cd16c4 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieEmptyRecordReader.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; + +import java.io.IOException; + +/** + * Dummy record for log only realtime split. + */ +public class HoodieEmptyRecordReader extends AbstractRealtimeRecordReader + implements RecordReader { + + public HoodieEmptyRecordReader(RealtimeSplit split, JobConf job) { + super(split, job); + } + + @Override + public boolean next(NullWritable nullWritable, ArrayWritable arrayWritable) throws IOException { + return false; + } + + @Override + public NullWritable createKey() { + return null; + } + + @Override + public ArrayWritable createValue() { + return new ArrayWritable(Writable.class, new Writable[getHiveSchema().getFields().size()]); + } + + @Override + public long getPos() throws IOException { + return 0; + } + + @Override + public void close() throws IOException { + + } + + @Override + public float getProgress() throws IOException { + return 0; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java index e75cff6416954..799d90bce5df4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java @@ -18,20 +18,16 @@ package org.apache.hudi.hadoop.realtime; -import java.io.IOException; -import java.util.Arrays; -import java.util.stream.Stream; - -import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; -import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.hadoop.HoodieHFileInputFormat; import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; @@ -41,32 +37,19 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.IOException; + /** * HoodieRealtimeInputFormat for HUDI datasets which store data in HFile base file format. 
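For a file-slice that consists only of log files there is no base file to hand to Hive, which is what the new HoodieEmptyRecordReader is for: it yields no rows and merely supplies a correctly sized ArrayWritable, so the merge is driven entirely by the delta logs. A sketch of how the dispatch could look, mirroring the parquet realtime input format changes later in this diff; the package placement and class name are assumptions:

```java
package org.apache.hudi.hadoop.realtime;   // placed here only so the realtime reader types resolve

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hudi.common.fs.FSUtils;

import java.io.IOException;

class LogOnlySplitDispatchSketch {

  // If the split's path is itself a log file there is no base file to read, so the merge
  // reader is composed with the empty base reader; otherwise the real base reader is used.
  static RecordReader<NullWritable, ArrayWritable> open(RealtimeSplit split,
                                                        JobConf jobConf,
                                                        RecordReader<NullWritable, ArrayWritable> baseReader) throws IOException {
    if (FSUtils.isLogFile(split.getPath())) {
      return new HoodieRealtimeRecordReader(split, jobConf, new HoodieEmptyRecordReader(split, jobConf));
    }
    return new HoodieRealtimeRecordReader(split, jobConf, baseReader);
  }
}
```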
*/ @UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat -public class HoodieHFileRealtimeInputFormat extends HoodieHFileInputFormat { +public class HoodieHFileRealtimeInputFormat extends HoodieMergeOnReadTableInputFormat { private static final Logger LOG = LogManager.getLogger(HoodieHFileRealtimeInputFormat.class); - @Override - public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - Stream fileSplits = Arrays.stream(super.getSplits(job, numSplits)).map(is -> (FileSplit) is); - return HoodieRealtimeInputFormatUtils.getRealtimeSplits(job, fileSplits); - } - - @Override - public FileStatus[] listStatus(JobConf job) throws IOException { - // Call the HoodieInputFormat::listStatus to obtain all latest hfiles, based on commit timeline. - return super.listStatus(job); - } - - @Override - protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { - // no specific filtering for Realtime format - return timeline; - } + // NOTE: We're only using {@code HoodieHFileInputFormat} to compose {@code RecordReader} + private final HoodieHFileInputFormat hFileInputFormat = new HoodieHFileInputFormat(); @Override public RecordReader getRecordReader(final InputSplit split, final JobConf jobConf, @@ -89,14 +72,14 @@ public RecordReader getRecordReader(final InputSpli // For e:g _hoodie_record_key would be missing and merge step would throw exceptions. // TO fix this, hoodie columns are appended late at the time record-reader gets built instead of construction // time. - HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); - HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf); + HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf, Option.empty()); this.conf = jobConf; this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true"); } } } + HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); @@ -105,6 +88,12 @@ public RecordReader getRecordReader(final InputSpli "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split); return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, jobConf, - super.getRecordReader(split, jobConf, reporter)); + hFileInputFormat.getRecordReader(split, jobConf, reporter)); + } + + @Override + protected boolean isSplitable(FileSystem fs, Path filename) { + // This file isn't splittable. + return false; } } \ No newline at end of file diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java new file mode 100644 index 0000000000000..448a5811bcc11 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SplitLocationInfo; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.BootstrapBaseFileSplit; +import org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile; +import org.apache.hudi.hadoop.HoodieCopyOnWriteTableInputFormat; +import org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile; +import org.apache.hudi.hadoop.RealtimeFileStatus; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; + +/** + * Base implementation of the Hive's {@link FileInputFormat} allowing for reading of Hudi's + * Merge-on-Read (COW) tables in various configurations: + * + *
+ * <ul>
+ *   <li>Snapshot mode: reading table's state as of particular timestamp (or instant, in Hudi's terms)</li>
+ *   <li>Incremental mode: reading table's state as of particular timestamp (or instant, in Hudi's terms)</li>
+ *   <li>External mode: reading non-Hudi partitions</li>
+ * </ul>
+ *
    + * NOTE: This class is invariant of the underlying file-format of the files being read + */ +public class HoodieMergeOnReadTableInputFormat extends HoodieCopyOnWriteTableInputFormat implements Configurable { + + @Override + public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { + List fileSplits = Arrays.stream(super.getSplits(job, numSplits)) + .map(is -> (FileSplit) is) + .collect(Collectors.toList()); + + return (containsIncrementalQuerySplits(fileSplits) ? filterIncrementalQueryFileSplits(fileSplits) : fileSplits) + .toArray(new FileSplit[0]); + } + + @Override + protected FileStatus createFileStatusUnchecked(FileSlice fileSlice, + Option latestCompletedInstantOpt, + String tableBasePath, + Option virtualKeyInfoOpt) { + Option baseFileOpt = fileSlice.getBaseFile(); + Option latestLogFileOpt = fileSlice.getLatestLogFile(); + Stream logFiles = fileSlice.getLogFiles(); + + // Check if we're reading a MOR table + if (baseFileOpt.isPresent()) { + return createRealtimeFileStatusUnchecked(baseFileOpt.get(), logFiles, tableBasePath, latestCompletedInstantOpt, virtualKeyInfoOpt); + } else if (latestLogFileOpt.isPresent()) { + return createRealtimeFileStatusUnchecked(latestLogFileOpt.get(), logFiles, tableBasePath, latestCompletedInstantOpt, virtualKeyInfoOpt); + } else { + throw new IllegalStateException("Invalid state: either base-file or log-file has to be present"); + } + } + + @Override + protected boolean checkIfValidFileSlice(FileSlice fileSlice) { + Option baseFileOpt = fileSlice.getBaseFile(); + Option latestLogFileOpt = fileSlice.getLatestLogFile(); + + if (baseFileOpt.isPresent() || latestLogFileOpt.isPresent()) { + return true; + } else { + throw new IllegalStateException("Invalid state: either base-file or log-file has to be present for " + fileSlice.getFileId()); + } + } + + /** + * Keep the logic of mor_incr_view as same as spark datasource. + * Step1: Get list of commits to be fetched based on start commit and max commits(for snapshot max commits is -1). + * Step2: Get list of affected files status for these affected file status. + * Step3: Construct HoodieTableFileSystemView based on those affected file status. + * a. Filter affected partitions based on inputPaths. + * b. Get list of fileGroups based on affected partitions by fsView.getAllFileGroups. + * Step4: Set input paths based on filtered affected partition paths. changes that amony original input paths passed to + * this method. some partitions did not have commits as part of the trimmed down list of commits and hence we need this step. + * Step5: Find candidate fileStatus, since when we get baseFileStatus from HoodieTableFileSystemView, + * the BaseFileStatus will missing file size information. + * We should use candidate fileStatus to update the size information for BaseFileStatus. + * Step6: For every file group from step3(b) + * Get 1st available base file from all file slices. then we use candidate file status to update the baseFileStatus, + * and construct RealTimeFileStatus and add it to result along with log files. + * If file group just has log files, construct RealTimeFileStatus and add it to result. 
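A condensed illustration of step 5: the statuses returned by doListStatus(job) are indexed by path string so that step 6 can swap the size-less base-file status coming out of the file-system view for a fully populated one before a RealtimeFileStatus is built (class and method names are illustrative):

```java
import org.apache.hadoop.fs.FileStatus;

import java.util.HashMap;
import java.util.Map;

class CandidateStatusIndexSketch {

  // Index every listed status by its path string so the incremental listing can look up
  // the fully populated status (including file length) for each base file it encounters.
  static Map<String, FileStatus> indexByPath(FileStatus[] listedStatuses) {
    Map<String, FileStatus> candidates = new HashMap<>(listedStatuses.length);
    for (FileStatus status : listedStatuses) {
      candidates.put(status.getPath().toString(), status);
    }
    return candidates;
  }
}
```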
+ * TODO: unify the incremental view code between hive/spark-sql and spark datasource + */ + @Override + protected List listStatusForIncrementalMode(JobConf job, + HoodieTableMetaClient tableMetaClient, + List inputPaths, + String incrementalTableName) throws IOException { + List result = new ArrayList<>(); + Job jobContext = Job.getInstance(job); + + // step1 + Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); + if (!timeline.isPresent()) { + return result; + } + HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, incrementalTableName, timeline.get()); + Option> commitsToCheck = Option.of(commitsTimelineToReturn.getInstants().collect(Collectors.toList())); + if (!commitsToCheck.isPresent()) { + return result; + } + // step2 + commitsToCheck.get().sort(HoodieInstant::compareTo); + List metadataList = commitsToCheck + .get().stream().map(instant -> { + try { + return HoodieInputFormatUtils.getCommitMetadata(instant, commitsTimelineToReturn); + } catch (IOException e) { + throw new HoodieException(String.format("cannot get metadata for instant: %s", instant)); + } + }).collect(Collectors.toList()); + + // build fileGroup from fsView + List affectedFileStatus = Arrays.asList(HoodieInputFormatUtils + .listAffectedFilesForCommits(job, new Path(tableMetaClient.getBasePath()), metadataList)); + // step3 + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetaClient, commitsTimelineToReturn, affectedFileStatus.toArray(new FileStatus[0])); + // build fileGroup from fsView + Path basePath = new Path(tableMetaClient.getBasePath()); + // filter affectedPartition by inputPaths + List affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream() + .filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k))).collect(Collectors.toList()); + if (affectedPartition.isEmpty()) { + return result; + } + List fileGroups = affectedPartition.stream() + .flatMap(partitionPath -> fsView.getAllFileGroups(partitionPath)).collect(Collectors.toList()); + // step4 + setInputPaths(job, affectedPartition.stream() + .map(p -> p.isEmpty() ? basePath.toString() : new Path(basePath, p).toString()).collect(Collectors.joining(","))); + + // step5 + // find all file status in partitionPaths. + FileStatus[] fileStatuses = doListStatus(job); + Map candidateFileStatus = new HashMap<>(); + for (int i = 0; i < fileStatuses.length; i++) { + String key = fileStatuses[i].getPath().toString(); + candidateFileStatus.put(key, fileStatuses[i]); + } + + Option virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient); + String maxCommitTime = fsView.getLastInstant().get().getTimestamp(); + // step6 + result.addAll(collectAllIncrementalFiles(fileGroups, maxCommitTime, basePath.toString(), candidateFileStatus, virtualKeyInfoOpt)); + return result; + } + + @Override + protected boolean isSplitable(FileSystem fs, Path filename) { + if (filename instanceof HoodieRealtimePath) { + return ((HoodieRealtimePath) filename).isSplitable(); + } + + return super.isSplitable(fs, filename); + } + + // make split for path. + // When query the incremental view, the read files may be bootstrap files, we wrap those bootstrap files into + // PathWithLogFilePath, so those bootstrap files should be processed int this function. 
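For reference, a hypothetical construction of such a realtime path for a base file with one pending log file; the constructor signature follows the HoodieRealtimePath class added later in this diff, and every literal path and file name below is invented:

```java
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.realtime.HoodieRealtimePath;

import java.util.Collections;
import java.util.List;

class RealtimePathConstructionSketch {

  static HoodieRealtimePath exampleRealtimePath() {
    // All paths, file names and sizes below are invented for illustration only.
    List<HoodieLogFile> deltaLogFiles = Collections.singletonList(
        new HoodieLogFile(new Path("/warehouse/hudi_trips/2021/03/01/.abc123_20210301120000.log.1_0-1-0"), 1024L));

    return new HoodieRealtimePath(
        new Path("/warehouse/hudi_trips/2021/03/01"),      // partition directory
        "abc123_0-1-0_20210301120000.parquet",             // base file name
        "/warehouse/hudi_trips",                           // table base path
        deltaLogFiles,                                     // log files holding the updates
        "20210301130000",                                  // max commit time visible to the query
        false,                                             // not produced by an incremental query
        Option.empty());                                   // meta fields enabled, so no virtual-key info
  }
}
```

Whether Hive may split such a file is then decided by isSplitable(): only a plain base file with no delta logs and no attached bootstrap file remains splittable.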
+ @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) { + if (file instanceof HoodieRealtimePath) { + return doMakeSplitForRealtimePath((HoodieRealtimePath) file, start, length, hosts, null); + } + return super.makeSplit(file, start, length, hosts); + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) { + if (file instanceof HoodieRealtimePath) { + return doMakeSplitForRealtimePath((HoodieRealtimePath) file, start, length, hosts, inMemoryHosts); + } + return super.makeSplit(file, start, length, hosts, inMemoryHosts); + } + + private static List collectAllIncrementalFiles(List fileGroups, + String maxCommitTime, + String basePath, + Map candidateFileStatus, + Option virtualKeyInfoOpt) { + + List result = new ArrayList<>(); + fileGroups.stream().forEach(f -> { + try { + List baseFiles = f.getAllFileSlices().filter(slice -> slice.getBaseFile().isPresent()).collect(Collectors.toList()); + if (!baseFiles.isEmpty()) { + FileStatus baseFileStatus = HoodieInputFormatUtils.getFileStatus(baseFiles.get(0).getBaseFile().get()); + String baseFilePath = baseFileStatus.getPath().toUri().toString(); + if (!candidateFileStatus.containsKey(baseFilePath)) { + throw new HoodieException("Error obtaining fileStatus for file: " + baseFilePath); + } + List deltaLogFiles = f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList()); + // We cannot use baseFileStatus.getPath() here, since baseFileStatus.getPath() missing file size information. + // So we use candidateFileStatus.get(baseFileStatus.getPath()) to get a correct path. + RealtimeFileStatus fileStatus = new RealtimeFileStatus(candidateFileStatus.get(baseFilePath), + basePath, deltaLogFiles, true, virtualKeyInfoOpt); + fileStatus.setMaxCommitTime(maxCommitTime); + if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) { + fileStatus.setBootStrapFileStatus(baseFileStatus); + } + result.add(fileStatus); + } + // add file group which has only logs. + if (f.getLatestFileSlice().isPresent() && baseFiles.isEmpty()) { + List logFileStatus = f.getLatestFileSlice().get().getLogFiles().map(logFile -> logFile.getFileStatus()).collect(Collectors.toList()); + if (logFileStatus.size() > 0) { + List deltaLogFiles = logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList()); + RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0), basePath, + deltaLogFiles, true, virtualKeyInfoOpt); + fileStatus.setMaxCommitTime(maxCommitTime); + result.add(fileStatus); + } + } + } catch (IOException e) { + throw new HoodieException("Error obtaining data file/log file grouping ", e); + } + }); + return result; + } + + private FileSplit doMakeSplitForRealtimePath(HoodieRealtimePath path, long start, long length, String[] hosts, String[] inMemoryHosts) { + if (path.includeBootstrapFilePath()) { + FileSplit bf = + inMemoryHosts == null + ? 
super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts) + : super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts, inMemoryHosts); + return createRealtimeBoostrapBaseFileSplit( + (BootstrapBaseFileSplit) bf, + path.getBasePath(), + path.getDeltaLogFiles(), + path.getMaxCommitTime(), + path.getBelongsToIncrementalQuery(), + path.getVirtualKeyInfo() + ); + } + + return createRealtimeFileSplit(path, start, length, hosts); + } + + private static boolean containsIncrementalQuerySplits(List fileSplits) { + return fileSplits.stream().anyMatch(HoodieRealtimeInputFormatUtils::doesBelongToIncrementalQuery); + } + + private static List filterIncrementalQueryFileSplits(List fileSplits) { + return fileSplits.stream().filter(HoodieRealtimeInputFormatUtils::doesBelongToIncrementalQuery) + .collect(Collectors.toList()); + } + + private static HoodieRealtimeFileSplit createRealtimeFileSplit(HoodieRealtimePath path, long start, long length, String[] hosts) { + try { + return new HoodieRealtimeFileSplit(new FileSplit(path, start, length, hosts), path); + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to create instance of %s", HoodieRealtimeFileSplit.class.getName()), e); + } + } + + private static HoodieRealtimeBootstrapBaseFileSplit createRealtimeBoostrapBaseFileSplit(BootstrapBaseFileSplit split, + String basePath, + List logFiles, + String maxInstantTime, + boolean belongsToIncrementalQuery, + Option virtualKeyInfoOpt) { + try { + String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) + .filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0]; + String[] inMemoryHosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) + .filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0]; + FileSplit baseSplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(), + hosts, inMemoryHosts); + return new HoodieRealtimeBootstrapBaseFileSplit(baseSplit, basePath, logFiles, maxInstantTime, split.getBootstrapFileSplit(), + belongsToIncrementalQuery, virtualKeyInfoOpt); + } catch (IOException e) { + throw new HoodieIOException("Error creating hoodie real time split ", e); + } + } + + /** + * Creates {@link RealtimeFileStatus} for the file-slice where base file is present + */ + private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieBaseFile baseFile, + Stream logFiles, + String basePath, + Option latestCompletedInstantOpt, + Option virtualKeyInfoOpt) { + FileStatus baseFileStatus = getFileStatusUnchecked(baseFile); + List sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); + + try { + RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(baseFileStatus, basePath, sortedLogFiles, + false, virtualKeyInfoOpt); + + if (latestCompletedInstantOpt.isPresent()) { + HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get(); + checkState(latestCompletedInstant.isCompleted()); + + rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp()); + } + + if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) { + rtFileStatus.setBootStrapFileStatus(baseFileStatus); + } + + return rtFileStatus; + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e); + } + } + + /** + * Creates {@link RealtimeFileStatus} for the 
file-slice where base file is NOT present + */ + private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieLogFile latestLogFile, + Stream logFiles, + String basePath, + Option latestCompletedInstantOpt, + Option virtualKeyInfoOpt) { + List sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); + try { + RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(latestLogFile.getFileStatus(), basePath, + sortedLogFiles, false, virtualKeyInfoOpt); + + if (latestCompletedInstantOpt.isPresent()) { + HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get(); + checkState(latestCompletedInstant.isCompleted()); + + rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp()); + } + + return rtFileStatus; + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e); + } + } +} + diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index d8f0a01a911ed..e8c806ed2cf67 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -18,65 +18,60 @@ package org.apache.hudi.hadoop.realtime; -import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; -import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; - -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; import org.apache.hudi.hadoop.UseRecordReaderFromInputFormat; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; -import java.util.Arrays; -import java.util.stream.Stream; /** * Input Format, that provides a real-time view of data in a Hoodie table. */ @UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat -public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat implements Configurable { +public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat { private static final Logger LOG = LogManager.getLogger(HoodieParquetRealtimeInputFormat.class); + public HoodieParquetRealtimeInputFormat() { + super(new HoodieMergeOnReadTableInputFormat()); + } + // To make Hive on Spark queries work with RT tables. 
Our theory is that due to // {@link org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher} // not handling empty list correctly, the ParquetRecordReaderWrapper ends up adding the same column ids multiple // times which ultimately breaks the query. - @Override - public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - - Stream fileSplits = Arrays.stream(super.getSplits(job, numSplits)).map(is -> (FileSplit) is); - - return HoodieRealtimeInputFormatUtils.getRealtimeSplits(job, fileSplits); - } + public RecordReader getRecordReader(final InputSplit split, final JobConf jobConf, + final Reporter reporter) throws IOException { + // sanity check + ValidationUtils.checkArgument(split instanceof RealtimeSplit, + "HoodieRealtimeRecordReader can only work on RealtimeSplit and not with " + split); + RealtimeSplit realtimeSplit = (RealtimeSplit) split; + addProjectionToJobConf(realtimeSplit, jobConf); + LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); - @Override - public FileStatus[] listStatus(JobConf job) throws IOException { - // Call the HoodieInputFormat::listStatus to obtain all latest parquet files, based on commit - // timeline. - return super.listStatus(job); - } + // for log only split, set the parquet reader as empty. + if (FSUtils.isLogFile(realtimeSplit.getPath())) { + return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, new HoodieEmptyRecordReader(realtimeSplit, jobConf)); + } - @Override - protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { - // no specific filtering for Realtime format - return timeline; + return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, + super.getRecordReader(split, jobConf, reporter)); } void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf jobConf) { @@ -85,12 +80,12 @@ void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf job // risk of experiencing race conditions. Hence, we synchronize on the JobConf object here. There is negligible // latency incurred here due to the synchronization since get record reader is called once per spilt before the // actual heavy lifting of reading the parquet files happen. - if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) { + if (HoodieRealtimeInputFormatUtils.canAddProjectionToJobConf(realtimeSplit, jobConf)) { synchronized (jobConf) { LOG.info( "Before adding Hoodie columns, Projections :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); - if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) { + if (HoodieRealtimeInputFormatUtils.canAddProjectionToJobConf(realtimeSplit, jobConf)) { // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table; // In this case, the projection fields gets removed. Looking at HiveInputFormat implementation, in some cases // hoodie additional projection columns are reset after calling setConf and only natural projections @@ -98,33 +93,15 @@ void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf job // For e:g _hoodie_record_key would be missing and merge step would throw exceptions. // TO fix this, hoodie columns are appended late at the time record-reader gets built instead of construction // time. 
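Stripped of logging and of the input format's setConf bookkeeping, the guarded projection setup in the hunk below reduces to a double-checked pattern keyed on the shared JobConf; shown here as a standalone sketch with an illustrative class name:

```java
import org.apache.hadoop.mapred.JobConf;
import org.apache.hudi.hadoop.realtime.RealtimeSplit;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils;

class ProjectionSetupSketch {

  // Check, lock on the shared JobConf, re-check, then append the Hudi meta columns only for
  // splits that actually have delta logs to merge; finally de-duplicate the projected column ids.
  static void addProjectionIfNeeded(RealtimeSplit split, JobConf jobConf) {
    if (HoodieRealtimeInputFormatUtils.canAddProjectionToJobConf(split, jobConf)) {
      synchronized (jobConf) {
        if (HoodieRealtimeInputFormatUtils.canAddProjectionToJobConf(split, jobConf)) {
          if (!split.getDeltaLogPaths().isEmpty()) {
            HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf, split.getVirtualKeyInfo());
          }
          jobConf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true");
        }
      }
    }
    HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf);
  }
}
```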
- HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); if (!realtimeSplit.getDeltaLogPaths().isEmpty()) { - HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf); + HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf, realtimeSplit.getVirtualKeyInfo()); } - this.conf = jobConf; - this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true"); + jobConf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true"); + setConf(jobConf); } } } - } - - @Override - public RecordReader getRecordReader(final InputSplit split, final JobConf jobConf, - final Reporter reporter) throws IOException { - // sanity check - ValidationUtils.checkArgument(split instanceof RealtimeSplit, - "HoodieRealtimeRecordReader can only work on RealtimeSplit and not with " + split); - RealtimeSplit realtimeSplit = (RealtimeSplit) split; - addProjectionToJobConf(realtimeSplit, jobConf); - LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) - + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); - return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, - super.getRecordReader(split, jobConf, reporter)); - } - @Override - public Configuration getConf() { - return conf; + HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeBootstrapBaseFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeBootstrapBaseFileSplit.java new file mode 100644 index 0000000000000..c7022c98ad3cd --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeBootstrapBaseFileSplit.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.BootstrapBaseFileSplit; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Realtime {@link FileSplit} with external base file + * + * NOTE: If you're adding fields here you need to make sure that you appropriately de-/serialize them + * in {@link #readFromInput(DataInput)} and {@link #writeToOutput(DataOutput)} + */ +public class HoodieRealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit { + /** + * Marks whether this path produced as part of Incremental Query + */ + private boolean belongsToIncrementalQuery = false; + /** + * List of delta log-files holding updated records for this base-file + */ + private List deltaLogFiles = new ArrayList<>(); + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ + private String maxCommitTime; + /** + * Base path of the table this path belongs to + */ + private String basePath; + /** + * Virtual key configuration of the table this split belongs to + */ + private Option virtualKeyInfo = Option.empty(); + + /** + * NOTE: This ctor is necessary for Hive to be able to serialize and + * then instantiate it when deserializing back + */ + public HoodieRealtimeBootstrapBaseFileSplit() {} + + public HoodieRealtimeBootstrapBaseFileSplit(FileSplit baseSplit, + String basePath, + List deltaLogFiles, + String maxInstantTime, + FileSplit externalFileSplit, + boolean belongsToIncrementalQuery, + Option virtualKeyInfoOpt) throws IOException { + super(baseSplit, externalFileSplit); + this.maxCommitTime = maxInstantTime; + this.deltaLogFiles = deltaLogFiles; + this.basePath = basePath; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfoOpt; + } + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + writeToOutput(out); + } + + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + readFromInput(in); + } + + @Override + public List getDeltaLogFiles() { + return deltaLogFiles; + } + + @Override + public void setDeltaLogFiles(List deltaLogFiles) { + this.deltaLogFiles = deltaLogFiles; + } + + @Override + public String getMaxCommitTime() { + return maxCommitTime; + } + + @Override + public String getBasePath() { + return basePath; + } + + @Override + public Option getVirtualKeyInfo() { + return virtualKeyInfo; + } + + @Override + public boolean getBelongsToIncrementalQuery() { + return belongsToIncrementalQuery; + } + + @Override + public void setBelongsToIncrementalQuery(boolean belongsToIncrementalPath) { + this.belongsToIncrementalQuery = belongsToIncrementalPath; + } + + @Override + public void setMaxCommitTime(String maxInstantTime) { + this.maxCommitTime = maxInstantTime; + } + + @Override + public void setBasePath(String basePath) { + this.basePath = basePath; + } + + @Override + public void setVirtualKeyInfo(Option virtualKeyInfo) { + this.virtualKeyInfo = virtualKeyInfo; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java index fe481f0a202dc..a424f021c2d20 
100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java @@ -19,57 +19,124 @@ package org.apache.hudi.hadoop.realtime; import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.ArrayList; import java.util.List; /** - * Filesplit that wraps the base split and a list of log files to merge deltas from. + * {@link FileSplit} implementation that holds + *

+ * <ol>
+ *   <li>Split corresponding to the base file</li>
+ *   <li>List of {@link HoodieLogFile} that holds the delta to be merged (upon reading)</li>
+ * </ol>
    + * + * This split is correspondent to a single file-slice in the Hudi terminology. + * + * NOTE: If you're adding fields here you need to make sure that you appropriately de-/serialize them + * in {@link #readFromInput(DataInput)} and {@link #writeToOutput(DataOutput)} */ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit { - - private List deltaLogPaths; - - private String maxCommitTime; - + /** + * List of delta log-files holding updated records for this base-file + */ + private List deltaLogFiles = new ArrayList<>(); + /** + * Base path of the table this path belongs to + */ private String basePath; - - public HoodieRealtimeFileSplit() { - super(); + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ + private String maxCommitTime; + /** + * Marks whether this path produced as part of Incremental Query + */ + private boolean belongsToIncrementalQuery = false; + /** + * Virtual key configuration of the table this split belongs to + */ + private Option virtualKeyInfo = Option.empty(); + + public HoodieRealtimeFileSplit() {} + + public HoodieRealtimeFileSplit(FileSplit baseSplit, + HoodieRealtimePath path) + throws IOException { + this(baseSplit, + path.getBasePath(), + path.getDeltaLogFiles(), + path.getMaxCommitTime(), + path.getBelongsToIncrementalQuery(), + path.getVirtualKeyInfo()); } - public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List deltaLogPaths, String maxCommitTime) + /** + * @VisibleInTesting + */ + public HoodieRealtimeFileSplit(FileSplit baseSplit, + String basePath, + List deltaLogFiles, + String maxCommitTime, + boolean belongsToIncrementalQuery, + Option virtualKeyInfo) throws IOException { super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations()); - this.deltaLogPaths = deltaLogPaths; - this.maxCommitTime = maxCommitTime; + this.deltaLogFiles = deltaLogFiles; this.basePath = basePath; + this.maxCommitTime = maxCommitTime; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfo; + } + + public List getDeltaLogFiles() { + return deltaLogFiles; } - public List getDeltaLogPaths() { - return deltaLogPaths; + @Override + public void setDeltaLogFiles(List deltaLogFiles) { + this.deltaLogFiles = deltaLogFiles; } public String getMaxCommitTime() { return maxCommitTime; } + public void setMaxCommitTime(String maxCommitTime) { + this.maxCommitTime = maxCommitTime; + } + public String getBasePath() { return basePath; } - public void setDeltaLogPaths(List deltaLogPaths) { - this.deltaLogPaths = deltaLogPaths; + public void setBasePath(String basePath) { + this.basePath = basePath; } - public void setMaxCommitTime(String maxCommitTime) { - this.maxCommitTime = maxCommitTime; + @Override + public void setVirtualKeyInfo(Option virtualKeyInfo) { + this.virtualKeyInfo = virtualKeyInfo; } - public void setBasePath(String basePath) { - this.basePath = basePath; + @Override + public Option getVirtualKeyInfo() { + return virtualKeyInfo; + } + + @Override + public boolean getBelongsToIncrementalQuery() { + return belongsToIncrementalQuery; + } + + @Override + public void setBelongsToIncrementalQuery(boolean belongsToIncrementalPath) { + this.belongsToIncrementalQuery = belongsToIncrementalPath; } @Override @@ -86,7 +153,7 @@ public void readFields(DataInput in) throws IOException { @Override public String toString() { - return "HoodieRealtimeFileSplit{DataPath=" + 
getPath() + ", deltaLogPaths=" + deltaLogPaths + return "HoodieRealtimeFileSplit{DataPath=" + getPath() + ", deltaLogPaths=" + getDeltaLogPaths() + ", maxCommitTime='" + maxCommitTime + '\'' + ", basePath='" + basePath + '\'' + '}'; } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java new file mode 100644 index 0000000000000..37b59a9627504 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.PathWithBootstrapFileStatus; + +import java.util.List; + +/** + * {@link Path} implementation encoding additional information necessary to appropriately read + * base files of the MOR tables, such as list of delta log files (holding updated records) associated + * w/ the base file, etc. 
+ */ +public class HoodieRealtimePath extends Path { + /** + * Marks whether this path produced as part of Incremental Query + */ + private final boolean belongsToIncrementalQuery; + /** + * List of delta log-files holding updated records for this base-file + */ + private final List deltaLogFiles; + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ + private final String maxCommitTime; + /** + * Base path of the table this path belongs to + */ + private final String basePath; + /** + * Virtual key configuration of the table this split belongs to + */ + private final Option virtualKeyInfo; + /** + * File status for the Bootstrap file (only relevant if this table is a bootstrapped table + */ + private PathWithBootstrapFileStatus pathWithBootstrapFileStatus; + + public HoodieRealtimePath(Path parent, + String child, + String basePath, + List deltaLogFiles, + String maxCommitTime, + boolean belongsToIncrementalQuery, + Option virtualKeyInfo) { + super(parent, child); + this.basePath = basePath; + this.deltaLogFiles = deltaLogFiles; + this.maxCommitTime = maxCommitTime; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfo; + } + + public List getDeltaLogFiles() { + return deltaLogFiles; + } + + public String getMaxCommitTime() { + return maxCommitTime; + } + + public String getBasePath() { + return basePath; + } + + public boolean getBelongsToIncrementalQuery() { + return belongsToIncrementalQuery; + } + + public boolean isSplitable() { + return !toString().contains(".log") && deltaLogFiles.isEmpty() && !includeBootstrapFilePath(); + } + + public PathWithBootstrapFileStatus getPathWithBootstrapFileStatus() { + return pathWithBootstrapFileStatus; + } + + public void setPathWithBootstrapFileStatus(PathWithBootstrapFileStatus pathWithBootstrapFileStatus) { + this.pathWithBootstrapFileStatus = pathWithBootstrapFileStatus; + } + + public boolean includeBootstrapFilePath() { + return pathWithBootstrapFileStatus != null; + } + + public Option getVirtualKeyInfo() { + return virtualKeyInfo; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java index 1e3a25ac78a11..1cd18cf97bb2a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeRecordReader.java @@ -103,4 +103,8 @@ public void close() throws IOException { public float getProgress() throws IOException { return this.reader.getProgress(); } + + public RecordReader getReader() { + return this.reader; + } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieVirtualKeyInfo.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieVirtualKeyInfo.java new file mode 100644 index 0000000000000..41031e613d112 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieVirtualKeyInfo.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hudi.common.util.Option; + +import java.io.Serializable; + +/** + * Class to hold virtual key info when meta fields are disabled. + */ +public class HoodieVirtualKeyInfo implements Serializable { + + private final String recordKeyField; + private final Option partitionPathField; + private final int recordKeyFieldIndex; + private final Option partitionPathFieldIndex; + + public HoodieVirtualKeyInfo(String recordKeyField, Option partitionPathField, int recordKeyFieldIndex, Option partitionPathFieldIndex) { + this.recordKeyField = recordKeyField; + this.partitionPathField = partitionPathField; + this.recordKeyFieldIndex = recordKeyFieldIndex; + this.partitionPathFieldIndex = partitionPathFieldIndex; + } + + public String getRecordKeyField() { + return recordKeyField; + } + + public Option getPartitionPathField() { + return partitionPathField; + } + + public int getRecordKeyFieldIndex() { + return recordKeyFieldIndex; + } + + public Option getPartitionPathFieldIndex() { + return partitionPathFieldIndex; + } + + @Override + public String toString() { + return "HoodieVirtualKeyInfo{" + + "recordKeyField='" + recordKeyField + '\'' + + ", partitionPathField='" + (partitionPathField.isPresent() ? partitionPathField.get() : "null") + '\'' + + ", recordKeyFieldIndex=" + recordKeyFieldIndex + + ", partitionPathFieldIndex=" + (partitionPathFieldIndex.isPresent() ? partitionPathFieldIndex.get() : "-1") + + '}'; + } +} \ No newline at end of file diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeBootstrapBaseFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeBootstrapBaseFileSplit.java deleted file mode 100644 index fd3b5b81074aa..0000000000000 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeBootstrapBaseFileSplit.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.hadoop.realtime; - -import org.apache.hudi.hadoop.BootstrapBaseFileSplit; - -import org.apache.hadoop.mapred.FileSplit; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -/** - * Realtime File Split with external base file. - */ -public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit { - - private List deltaLogPaths; - - private String maxInstantTime; - - private String basePath; - - public RealtimeBootstrapBaseFileSplit() { - super(); - } - - public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List deltaLogPaths, - String maxInstantTime, FileSplit externalFileSplit) throws IOException { - super(baseSplit, externalFileSplit); - this.maxInstantTime = maxInstantTime; - this.deltaLogPaths = deltaLogPaths; - this.basePath = basePath; - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - writeToOutput(out); - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - readFromInput(in); - } - - @Override - public List getDeltaLogPaths() { - return deltaLogPaths; - } - - @Override - public String getMaxCommitTime() { - return maxInstantTime; - } - - @Override - public String getBasePath() { - return basePath; - } - - @Override - public void setDeltaLogPaths(List deltaLogPaths) { - this.deltaLogPaths = deltaLogPaths; - } - - @Override - public void setMaxCommitTime(String maxInstantTime) { - this.maxInstantTime = maxInstantTime; - } - - @Override - public void setBasePath(String basePath) { - this.basePath = basePath; - } -} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java index a139997cad974..b917f004bcd06 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java @@ -18,8 +18,16 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; @@ -27,18 +35,14 @@ import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; import java.util.Map; +import java.util.Set; class RealtimeCompactedRecordReader extends 
AbstractRealtimeRecordReader implements RecordReader { @@ -48,11 +52,21 @@ class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader protected final RecordReader parquetReader; private final Map> deltaRecordMap; + private final Set deltaRecordKeys; + private final HoodieMergedLogRecordScanner mergedLogRecordScanner; + private final int recordKeyIndex; + private Iterator deltaItr; + public RealtimeCompactedRecordReader(RealtimeSplit split, JobConf job, RecordReader realReader) throws IOException { super(split, job); this.parquetReader = realReader; - this.deltaRecordMap = getMergedLogRecordScanner().getRecords(); + this.mergedLogRecordScanner = getMergedLogRecordScanner(); + this.deltaRecordMap = mergedLogRecordScanner.getRecords(); + this.deltaRecordKeys = new HashSet<>(this.deltaRecordMap.keySet()); + this.recordKeyIndex = split.getVirtualKeyInfo() + .map(HoodieVirtualKeyInfo::getRecordKeyFieldIndex) + .orElse(HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS); } /** @@ -74,70 +88,89 @@ private HoodieMergedLogRecordScanner getMergedLogRecordScanner() throws IOExcept .withReverseReader(false) .withBufferSize(jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) .withSpillableMapBasePath(jobConf.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) + .withDiskMapType(jobConf.getEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())) + .withBitCaskDiskMapCompressionEnabled(jobConf.getBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), + HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())) .build(); } + private Option buildGenericRecordwithCustomPayload(HoodieRecord record) throws IOException { + if (usesCustomPayload) { + return ((HoodieAvroRecord) record).getData().getInsertValue(getWriterSchema(), payloadProps); + } else { + return ((HoodieAvroRecord) record).getData().getInsertValue(getReaderSchema(), payloadProps); + } + } + @Override public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOException { // Call the underlying parquetReader.next - which may replace the passed in ArrayWritable // with a new block of values - boolean result = this.parquetReader.next(aVoid, arrayWritable); - if (!result) { - // if the result is false, then there are no more records - return false; - } - if (!deltaRecordMap.isEmpty()) { - // TODO(VC): Right now, we assume all records in log, have a matching base record. (which - // would be true until we have a way to index logs too) - // return from delta records map if we have some match. - String key = arrayWritable.get()[HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS].toString(); - if (deltaRecordMap.containsKey(key)) { - // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. 
This is required since the - // deltaRecord may not be a full record and needs values of columns from the parquet - Option rec; - if (usesCustomPayload) { - rec = deltaRecordMap.get(key).getData().getInsertValue(getWriterSchema()); - } else { - rec = deltaRecordMap.get(key).getData().getInsertValue(getReaderSchema()); - } - if (!rec.isPresent()) { + while (this.parquetReader.next(aVoid, arrayWritable)) { + if (!deltaRecordMap.isEmpty()) { + String key = arrayWritable.get()[recordKeyIndex].toString(); + if (deltaRecordMap.containsKey(key)) { + // mark the key as handled + this.deltaRecordKeys.remove(key); + // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. This is required since the + // deltaRecord may not be a full record and needs values of columns from the parquet + Option rec = buildGenericRecordwithCustomPayload(deltaRecordMap.get(key)); // If the record is not present, this is a delete record using an empty payload so skip this base record // and move to the next record - return next(aVoid, arrayWritable); - } - GenericRecord recordToReturn = rec.get(); - if (usesCustomPayload) { - // If using a custom payload, return only the projection fields. The readerSchema is a schema derived from - // the writerSchema with only the projection fields - recordToReturn = HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(rec.get(), getReaderSchema()); - } - // we assume, a later safe record in the log, is newer than what we have in the map & - // replace it. Since we want to return an arrayWritable which is the same length as the elements in the latest - // schema, we use writerSchema to create the arrayWritable from the latest generic record - ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(recordToReturn, getHiveSchema()); - Writable[] replaceValue = aWritable.get(); - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("key %s, base values: %s, log values: %s", key, HoodieRealtimeRecordReaderUtils.arrayWritableToString(arrayWritable), - HoodieRealtimeRecordReaderUtils.arrayWritableToString(aWritable))); - } - Writable[] originalValue = arrayWritable.get(); - try { - // Sometime originalValue.length > replaceValue.length. 
- // This can happen when hive query is looking for pseudo parquet columns like BLOCK_OFFSET_INSIDE_FILE - System.arraycopy(replaceValue, 0, originalValue, 0, - Math.min(originalValue.length, replaceValue.length)); - arrayWritable.set(originalValue); - } catch (RuntimeException re) { - LOG.error("Got exception when doing array copy", re); - LOG.error("Base record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(arrayWritable)); - LOG.error("Log record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(aWritable)); - String errMsg = "Base-record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(arrayWritable) - + " ,Log-record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(aWritable) + " ,Error :" + re.getMessage(); - throw new RuntimeException(errMsg, re); + if (!rec.isPresent()) { + continue; + } + setUpWritable(rec, arrayWritable, key); + return true; } } + return true; + } + if (this.deltaItr == null) { + this.deltaItr = this.deltaRecordKeys.iterator(); + } + while (this.deltaItr.hasNext()) { + final String key = this.deltaItr.next(); + Option rec = buildGenericRecordwithCustomPayload(deltaRecordMap.get(key)); + if (rec.isPresent()) { + setUpWritable(rec, arrayWritable, key); + return true; + } + } + return false; + } + + private void setUpWritable(Option rec, ArrayWritable arrayWritable, String key) { + GenericRecord recordToReturn = rec.get(); + if (usesCustomPayload) { + // If using a custom payload, return only the projection fields. The readerSchema is a schema derived from + // the writerSchema with only the projection fields + recordToReturn = HoodieAvroUtils.rewriteRecord(rec.get(), getReaderSchema()); + } + // we assume, a later safe record in the log, is newer than what we have in the map & + // replace it. Since we want to return an arrayWritable which is the same length as the elements in the latest + // schema, we use writerSchema to create the arrayWritable from the latest generic record + ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(recordToReturn, getHiveSchema()); + Writable[] replaceValue = aWritable.get(); + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("key %s, base values: %s, log values: %s", key, HoodieRealtimeRecordReaderUtils.arrayWritableToString(arrayWritable), + HoodieRealtimeRecordReaderUtils.arrayWritableToString(aWritable))); + } + Writable[] originalValue = arrayWritable.get(); + try { + // Sometime originalValue.length > replaceValue.length. 
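[Editor's illustration] The reworked next() above follows a classic merge-on-read pattern: stream the base (parquet) records, shadow any record whose key appears in the delta map, skip deletes (absent payloads), and finally emit the delta keys that never matched a base record. The sketch below shows that control flow with plain JDK collections only; the class, field, and key names are illustrative and are not the Hudi API.

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

// Minimal sketch of the merge pattern, assuming a Map stands in for a record and
// Optional.empty() models a delete payload.
class MergedReadSketch {
  private final Iterator<Map<String, Object>> baseRecords;         // stands in for the parquet reader
  private final Map<String, Optional<Map<String, Object>>> delta;  // log records keyed by record key
  private final Set<String> unseenDeltaKeys;                       // delta keys with no base record yet
  private Iterator<String> deltaOnlyItr;

  MergedReadSketch(Iterator<Map<String, Object>> baseRecords,
                   Map<String, Optional<Map<String, Object>>> delta) {
    this.baseRecords = baseRecords;
    this.delta = delta;
    this.unseenDeltaKeys = new HashSet<>(delta.keySet());
  }

  /** Returns the next merged record, or null when both base and delta are exhausted. */
  Map<String, Object> next() {
    while (baseRecords.hasNext()) {
      Map<String, Object> base = baseRecords.next();
      String key = (String) base.get("_row_key");
      Optional<Map<String, Object>> update = delta.get(key);
      if (update == null) {
        return base;                  // no log update for this key
      }
      unseenDeltaKeys.remove(key);    // mark the key as handled
      if (!update.isPresent()) {
        continue;                     // delete: skip the base record entirely
      }
      return update.get();            // log record shadows the base record
    }
    // base file exhausted: emit log-only records (e.g. inserts that only exist in the log)
    if (deltaOnlyItr == null) {
      deltaOnlyItr = unseenDeltaKeys.iterator();
    }
    while (deltaOnlyItr.hasNext()) {
      Optional<Map<String, Object>> rec = delta.get(deltaOnlyItr.next());
      if (rec.isPresent()) {
        return rec.get();
      }
    }
    return null;
  }

  public static void main(String[] args) {
    Map<String, Object> base = new HashMap<>();
    base.put("_row_key", "k1");
    Map<String, Object> update = Collections.singletonMap("_row_key", "k1-updated");
    Map<String, Optional<Map<String, Object>>> delta = new HashMap<>();
    delta.put("k1", Optional.of(update));
    delta.put("k2", Optional.empty()); // delete with no matching base record: silently dropped
    MergedReadSketch reader = new MergedReadSketch(Collections.singletonList(base).iterator(), delta);
    for (Map<String, Object> rec = reader.next(); rec != null; rec = reader.next()) {
      System.out.println(rec); // {_row_key=k1-updated}
    }
  }
}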
+ // This can happen when hive query is looking for pseudo parquet columns like BLOCK_OFFSET_INSIDE_FILE + System.arraycopy(replaceValue, 0, originalValue, 0, + Math.min(originalValue.length, replaceValue.length)); + arrayWritable.set(originalValue); + } catch (RuntimeException re) { + LOG.error("Got exception when doing array copy", re); + LOG.error("Base record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(arrayWritable)); + LOG.error("Log record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(aWritable)); + String errMsg = "Base-record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(arrayWritable) + + " ,Log-record :" + HoodieRealtimeRecordReaderUtils.arrayWritableToString(aWritable) + " ,Error :" + re.getMessage(); + throw new RuntimeException(errMsg, re); } - return true; } @Override @@ -158,6 +191,9 @@ public long getPos() throws IOException { @Override public void close() throws IOException { parquetReader.close(); + // need clean the tmp file which created by logScanner + // Otherwise, for resident process such as presto, the /tmp directory will overflow + mergedLogRecordScanner.close(); } @Override diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java index 0fab734342e49..4b0b2d6ea79e2 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java @@ -18,16 +18,18 @@ package org.apache.hudi.hadoop.realtime; -import org.apache.hudi.hadoop.InputSplitUtils; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.InputSplitWithLocationInfo; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.InputSplitUtils; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; /** * Realtime Input Split Interface. @@ -38,7 +40,13 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo { * Return Log File Paths. * @return */ - List getDeltaLogPaths(); + default List getDeltaLogPaths() { + return getDeltaLogFiles().stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList()); + } + + List getDeltaLogFiles(); + + void setDeltaLogFiles(List deltaLogFiles); /** * Return Max Instant Time. @@ -53,10 +61,15 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo { String getBasePath(); /** - * Update Log File Paths. - * @param deltaLogPaths + * Returns Virtual key info if meta fields are disabled. + * @return + */ + Option getVirtualKeyInfo(); + + /** + * Returns the flag whether this split belongs to an Incremental Query */ - void setDeltaLogPaths(List deltaLogPaths); + boolean getBelongsToIncrementalQuery(); /** * Update Maximum valid instant time. 
@@ -70,38 +83,76 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo { */ void setBasePath(String basePath); + /** + * Sets the flag whether this split belongs to an Incremental Query + */ + void setBelongsToIncrementalQuery(boolean belongsToIncrementalQuery); + + void setVirtualKeyInfo(Option virtualKeyInfo); + default void writeToOutput(DataOutput out) throws IOException { InputSplitUtils.writeString(getBasePath(), out); InputSplitUtils.writeString(getMaxCommitTime(), out); - out.writeInt(getDeltaLogPaths().size()); - for (String logFilePath : getDeltaLogPaths()) { - InputSplitUtils.writeString(logFilePath, out); + InputSplitUtils.writeBoolean(getBelongsToIncrementalQuery(), out); + + out.writeInt(getDeltaLogFiles().size()); + for (HoodieLogFile logFile : getDeltaLogFiles()) { + InputSplitUtils.writeString(logFile.getPath().toString(), out); + out.writeLong(logFile.getFileSize()); + } + + Option virtualKeyInfoOpt = getVirtualKeyInfo(); + if (!virtualKeyInfoOpt.isPresent()) { + InputSplitUtils.writeBoolean(false, out); + } else { + InputSplitUtils.writeBoolean(true, out); + InputSplitUtils.writeString(virtualKeyInfoOpt.get().getRecordKeyField(), out); + InputSplitUtils.writeString(String.valueOf(virtualKeyInfoOpt.get().getRecordKeyFieldIndex()), out); + InputSplitUtils.writeBoolean(virtualKeyInfoOpt.get().getPartitionPathField().isPresent(), out); + if (virtualKeyInfoOpt.get().getPartitionPathField().isPresent()) { + InputSplitUtils.writeString(virtualKeyInfoOpt.get().getPartitionPathField().get(), out); + InputSplitUtils.writeString(String.valueOf(virtualKeyInfoOpt.get().getPartitionPathFieldIndex()), out); + } } } default void readFromInput(DataInput in) throws IOException { setBasePath(InputSplitUtils.readString(in)); setMaxCommitTime(InputSplitUtils.readString(in)); + setBelongsToIncrementalQuery(InputSplitUtils.readBoolean(in)); + int totalLogFiles = in.readInt(); - List deltaLogPaths = new ArrayList<>(totalLogFiles); + List deltaLogPaths = new ArrayList<>(totalLogFiles); for (int i = 0; i < totalLogFiles; i++) { - deltaLogPaths.add(InputSplitUtils.readString(in)); + String logFilePath = InputSplitUtils.readString(in); + long logFileSize = in.readLong(); + deltaLogPaths.add(new HoodieLogFile(new Path(logFilePath), logFileSize)); + } + setDeltaLogFiles(deltaLogPaths); + + boolean hoodieVirtualKeyPresent = InputSplitUtils.readBoolean(in); + if (hoodieVirtualKeyPresent) { + String recordKeyField = InputSplitUtils.readString(in); + int recordFieldIndex = Integer.parseInt(InputSplitUtils.readString(in)); + boolean isPartitionPathFieldPresent = InputSplitUtils.readBoolean(in); + Option partitionPathField = isPartitionPathFieldPresent ? Option.of(InputSplitUtils.readString(in)) : Option.empty(); + Option partitionPathIndex = isPartitionPathFieldPresent ? Option.of(Integer.parseInt(InputSplitUtils.readString(in))) : Option.empty(); + setVirtualKeyInfo(Option.of(new HoodieVirtualKeyInfo(recordKeyField, partitionPathField, recordFieldIndex, partitionPathIndex))); } - setDeltaLogPaths(deltaLogPaths); } /** * The file containing this split's data. */ - public Path getPath(); + Path getPath(); /** * The position of the first byte in the file to process. */ - public long getStart(); + long getStart(); /** * The number of bytes in the file to process. 
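[Editor's illustration] writeToOutput/readFromInput above hand-roll the split's wire format: a count-prefixed list of (path, size) pairs followed by boolean presence flags for the optional virtual-key fields, and the read side must consume fields in exactly the same order. A self-contained sketch of that presence-flag pattern using only java.io, with writeUTF standing in for InputSplitUtils.writeString (all values here are made up):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Optional;

public class OptionalFieldIo {

  // write a boolean flag first, then the value only if present
  static void writeOptionalString(DataOutput out, Optional<String> value) throws IOException {
    out.writeBoolean(value.isPresent());
    if (value.isPresent()) {
      out.writeUTF(value.get());
    }
  }

  static Optional<String> readOptionalString(DataInput in) throws IOException {
    return in.readBoolean() ? Optional.of(in.readUTF()) : Optional.empty();
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    // count-prefixed list of (path, size) pairs, like the delta log files in the split
    String[] logPaths = {"/tbl/2016/05/01/.f1.log.1", "/tbl/2016/05/01/.f1.log.2"};
    out.writeInt(logPaths.length);
    for (String p : logPaths) {
      out.writeUTF(p);
      out.writeLong(1024L);
    }
    writeOptionalString(out, Optional.of("uuid")); // e.g. a record key field
    writeOptionalString(out, Optional.empty());    // e.g. a missing partition path field

    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
    int n = in.readInt();
    for (int i = 0; i < n; i++) {
      System.out.println(in.readUTF() + " size=" + in.readLong());
    }
    System.out.println(readOptionalString(in) + " " + readOptionalString(in));
  }
}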
*/ - public long getLength(); + long getLength(); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java index d209a5a388fe6..84c808865072a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java @@ -18,18 +18,10 @@ package org.apache.hudi.hadoop.realtime; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.Functions; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; @@ -40,6 +32,18 @@ import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.function.Function; + class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader implements RecordReader { @@ -63,7 +67,7 @@ class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader * clients to consume. 
* * @param split File split - * @param jobConf Job Configuration + * @param job Job Configuration * @param realReader Parquet Reader */ public RealtimeUnmergedRecordReader(RealtimeSplit split, JobConf job, @@ -74,7 +78,7 @@ public RealtimeUnmergedRecordReader(RealtimeSplit split, JobConf job, this.parquetRecordsIterator = new RecordReaderValueIterator<>(this.parquetReader); this.executor = new BoundedInMemoryExecutor<>( HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes(jobConf), getParallelProducers(), - Option.empty(), x -> x, new DefaultSizeEstimator<>()); + Option.empty(), Function.identity(), new DefaultSizeEstimator<>(), Functions.noop()); // Consumer of this record reader this.iterator = this.executor.getQueue().iterator(); this.logRecordScanner = HoodieUnMergedLogRecordScanner.newBuilder() @@ -88,7 +92,7 @@ public RealtimeUnmergedRecordReader(RealtimeSplit split, JobConf job, .withBufferSize(this.jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) .withLogRecordScannerCallback(record -> { // convert Hoodie log record to Hadoop AvroWritable and buffer - GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get(); + GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema(), payloadProps).get(); ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(rec, getHiveSchema()); this.executor.getQueue().insertRecord(aWritable); }) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java index 8dbe080753f9d..cbfd197f43897 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java @@ -18,9 +18,12 @@ package org.apache.hudi.hadoop.utils; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -37,9 +40,12 @@ public class HoodieHiveUtils { public static final Logger LOG = LogManager.getLogger(HoodieHiveUtils.class); + public static final String HOODIE_INCREMENTAL_USE_DATABASE = "hoodie.incremental.use.database"; public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode"; public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp"; public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits"; + public static final String HOODIE_CONSUME_PENDING_COMMITS = "hoodie.%s.consume.pending.commits"; + public static final String HOODIE_CONSUME_COMMIT = "hoodie.%s.consume.commit"; public static final Set VIRTUAL_COLUMN_NAMES = CollectionUtils.createImmutableSet( "INPUT__FILE__NAME", "BLOCK__OFFSET__INSIDE__FILE", "ROW__OFFSET__INSIDE__BLOCK", "RAW__DATA__SIZE", "ROW__ID", "GROUPING__ID"); @@ -65,8 +71,16 @@ public class HoodieHiveUtils { public static final String DEFAULT_SCAN_MODE = SNAPSHOT_SCAN_MODE; public static final int DEFAULT_MAX_COMMITS = 1; public static final int MAX_COMMIT_ALL = -1; - public static final int DEFAULT_LEVELS_TO_BASEPATH = 3; public static final Pattern HOODIE_CONSUME_MODE_PATTERN_STRING = 
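[Editor's illustration] The unmerged reader above is shaped like a bounded producer/consumer: producers convert log and parquet records and buffer them, while the RecordReader drains the queue. The real code uses Hudi's BoundedInMemoryExecutor; the sketch below shows the same shape with a plain ArrayBlockingQueue, and the sentinel value and names are illustrative assumptions.

import java.util.Arrays;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class BoundedQueueSketch {
  private static final String EOF = "__EOF__"; // sentinel marking end of input

  public static void main(String[] args) throws Exception {
    Iterator<String> records = Arrays.asList("r1", "r2", "r3").iterator();
    BlockingQueue<String> queue = new ArrayBlockingQueue<>(2); // bounded => back-pressure on the producer

    ExecutorService producerPool = Executors.newSingleThreadExecutor();
    producerPool.submit(() -> {
      try {
        while (records.hasNext()) {
          queue.put(records.next().toUpperCase()); // "convert" then buffer
        }
        queue.put(EOF);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
    });

    // consumer side: drain until the sentinel, like next()/getCurrentValue on the reader
    for (String rec = queue.take(); !EOF.equals(rec); rec = queue.take()) {
      System.out.println(rec);
    }
    producerPool.shutdown();
  }
}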
Pattern.compile("hoodie\\.(.*)\\.consume\\.mode"); + public static final String GLOBALLY_CONSISTENT_READ_TIMESTAMP = "last_replication_timestamp"; + + public static boolean shouldIncludePendingCommits(JobConf job, String tableName) { + return job.getBoolean(String.format(HOODIE_CONSUME_PENDING_COMMITS, tableName), false); + } + + public static Option getMaxCommit(JobConf job, String tableName) { + return Option.ofNullable(job.get(String.format(HOODIE_CONSUME_COMMIT, tableName))); + } public static boolean stopAtCompaction(JobContext job, String tableName) { String compactionPropName = String.format(HOODIE_STOP_AT_COMPACTION_PATTERN, tableName); @@ -91,6 +105,13 @@ public static String readStartCommitTime(JobContext job, String tableName) { return job.getConfiguration().get(startCommitTimestampName); } + /** + * Gets the n'th parent for the Path. Assumes the path has at-least n components + * + * @param path + * @param n + * @return + */ public static Path getNthParent(Path path, int n) { Path parent = path; for (int i = 0; i < n; i++) { @@ -99,6 +120,12 @@ public static Path getNthParent(Path path, int n) { return parent; } + /** + * Returns a list of tableNames for which hoodie..consume.mode is set to incremental else returns empty List + * + * @param job + * @return + */ public static List getIncrementalTableNames(JobContext job) { Map tablesModeMap = job.getConfiguration() .getValByRegex(HOODIE_CONSUME_MODE_PATTERN_STRING.pattern()); @@ -115,4 +142,8 @@ public static List getIncrementalTableNames(JobContext job) { } return result; } + + public static boolean isIncrementalUseDatabase(Configuration conf) { + return conf.getBoolean(HOODIE_INCREMENTAL_USE_DATABASE, false); + } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java index b36885106cce0..fa1b60480651d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java @@ -18,6 +18,20 @@ package org.apache.hudi.hadoop.utils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -32,22 +46,13 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile; import org.apache.hudi.hadoop.HoodieHFileInputFormat; import org.apache.hudi.hadoop.HoodieParquetInputFormat; import org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile; import 
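[Editor's illustration] The HoodieHiveUtils changes above all build on the same convention: per-table query options are stored under keys of the form "hoodie.<table>.consume.*", and incremental tables are discovered by matching the consume-mode keys with a regex. A minimal sketch of that lookup, with a plain Map standing in for the Hadoop JobConf and the scan-mode value assumed to be "INCREMENTAL":

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ConsumeModeSketch {
  private static final String CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode";
  private static final Pattern CONSUME_MODE_REGEX = Pattern.compile("hoodie\\.(.*)\\.consume\\.mode");

  static List<String> getIncrementalTableNames(Map<String, String> conf) {
    List<String> result = new ArrayList<>();
    for (Map.Entry<String, String> e : conf.entrySet()) {
      Matcher m = CONSUME_MODE_REGEX.matcher(e.getKey());
      if (m.matches() && "INCREMENTAL".equalsIgnoreCase(e.getValue())) {
        result.add(m.group(1)); // the "<db.table>" part of the key
      }
    }
    return result;
  }

  public static void main(String[] args) {
    Map<String, String> conf = new HashMap<>();
    conf.put(String.format(CONSUME_MODE_PATTERN, "db1.raw_trips"), "INCREMENTAL");
    conf.put(String.format(CONSUME_MODE_PATTERN, "db2.model_trips"), "SNAPSHOT");
    System.out.println(getIncrementalTableNames(conf)); // [db1.raw_trips]
  }
}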
org.apache.hudi.hadoop.realtime.HoodieHFileRealtimeInputFormat; import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; -import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapreduce.Job; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -62,6 +67,10 @@ import java.util.function.Function; import java.util.stream.Collectors; +import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; +import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE; +import static org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME; + public class HoodieInputFormatUtils { // These positions have to be deterministic across all tables @@ -113,6 +122,8 @@ public static String getInputFormatClassName(HoodieFileFormat baseFileFormat, bo } else { return HoodieHFileInputFormat.class.getName(); } + case ORC: + return OrcInputFormat.class.getName(); default: throw new HoodieIOException("Hoodie InputFormat not implemented for base file format " + baseFileFormat); } @@ -124,6 +135,8 @@ public static String getOutputFormatClassName(HoodieFileFormat baseFileFormat) { return MapredParquetOutputFormat.class.getName(); case HFILE: return MapredParquetOutputFormat.class.getName(); + case ORC: + return OrcOutputFormat.class.getName(); default: throw new HoodieIOException("No OutputFormat for base file format " + baseFileFormat); } @@ -135,6 +148,8 @@ public static String getSerDeClassName(HoodieFileFormat baseFileFormat) { return ParquetHiveSerDe.class.getName(); case HFILE: return ParquetHiveSerDe.class.getName(); + case ORC: + return OrcSerde.class.getName(); default: throw new HoodieIOException("No SerDe for base file format " + baseFileFormat); } @@ -148,6 +163,10 @@ public static FileInputFormat getInputFormat(String path, boolean realtime, Conf if (extension.equals(HoodieFileFormat.HFILE.getFileExtension())) { return getInputFormat(HoodieFileFormat.HFILE, realtime, conf); } + // now we support read log file, try to find log file + if (FSUtils.isLogFile(new Path(path)) && realtime) { + return getInputFormat(HoodieFileFormat.PARQUET, realtime, conf); + } throw new HoodieIOException("Hoodie InputFormat not implemented for base file of type " + extension); } @@ -168,7 +187,7 @@ public static FileInputFormat getInputFormat(String path, boolean realtime, Conf * @return */ public static HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { - HoodieDefaultTimeline commitsAndCompactionTimeline = timeline.getCommitsAndCompactionTimeline(); + HoodieDefaultTimeline commitsAndCompactionTimeline = timeline.getWriteTimeline(); Option pendingCompactionInstant = commitsAndCompactionTimeline .filterPendingCompactionTimeline().firstInstant(); if (pendingCompactionInstant.isPresent()) { @@ -229,14 +248,14 @@ public static Option getAffectedPartitions(List commitsTo * those partitions. 
*/ for (Path path : inputPaths) { - if (path.toString().contains(s)) { + if (path.toString().endsWith(s)) { return true; } } return false; }) .collect(Collectors.joining(",")); - return Option.of(incrementalInputPaths); + return StringUtils.isNullOrEmpty(incrementalInputPaths) ? Option.empty() : Option.of(incrementalInputPaths); } /** @@ -245,7 +264,7 @@ public static Option getAffectedPartitions(List commitsTo * @param tableMetaClient * @return */ - public static Option getFilteredCommitsTimeline(Job job, HoodieTableMetaClient tableMetaClient) { + public static Option getFilteredCommitsTimeline(JobContext job, HoodieTableMetaClient tableMetaClient) { String tableName = tableMetaClient.getTableConfig().getTableName(); HoodieDefaultTimeline baseTimeline; if (HoodieHiveUtils.stopAtCompaction(job, tableName)) { @@ -264,33 +283,38 @@ public static Option getFilteredCommitsTimeline(Job job, HoodieT * @return */ public static Option> getCommitsForIncrementalQuery(Job job, String tableName, HoodieTimeline timeline) { + return Option.of(getHoodieTimelineForIncrementalQuery(job, tableName, timeline) + .getInstants().collect(Collectors.toList())); + } + + /** + * Get HoodieTimeline for incremental query from Hive map reduce configuration. + * + * @param job + * @param tableName + * @param timeline + * @return + */ + public static HoodieTimeline getHoodieTimelineForIncrementalQuery(JobContext job, String tableName, HoodieTimeline timeline) { String lastIncrementalTs = HoodieHiveUtils.readStartCommitTime(job, tableName); // Total number of commits to return in this batch. Set this to -1 to get all the commits. Integer maxCommits = HoodieHiveUtils.readMaxCommits(job, tableName); LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs); - return Option.of(timeline.findInstantsAfter(lastIncrementalTs, maxCommits) - .getInstants().collect(Collectors.toList())); + return timeline.findInstantsAfter(lastIncrementalTs, maxCommits); } /** - * Extract HoodieTableMetaClient by base path. - * @param conf - * @param partitions - * @return + * Extract HoodieTableMetaClient by partition path. + * @param conf The hadoop conf + * @param partitions The partitions + * @return partition path to table meta client mapping */ - public static Map getTableMetaClientByBasePath(Configuration conf, Set partitions) { - Map metaClientMap = new HashMap<>(); + public static Map getTableMetaClientByPartitionPath(Configuration conf, Set partitions) { + Map metaClientMap = new HashMap<>(); return partitions.stream().collect(Collectors.toMap(Function.identity(), p -> { - // Get meta client if this path is the base path. - Option matchingBasePath = Option.fromJavaOptional( - metaClientMap.keySet().stream().filter(basePath -> p.toString().startsWith(basePath)).findFirst()); - if (matchingBasePath.isPresent()) { - return metaClientMap.get(matchingBasePath.get()); - } - try { - HoodieTableMetaClient metaClient = getTableMetaClientForBasePath(p.getFileSystem(conf), p); - metaClientMap.put(metaClient.getBasePath(), metaClient); + HoodieTableMetaClient metaClient = getTableMetaClientForBasePathUnchecked(conf, p); + metaClientMap.put(p, metaClient); return metaClient; } catch (IOException e) { throw new HoodieIOException("Error creating hoodie meta client against : " + p, e); @@ -299,22 +323,29 @@ public static Map getTableMetaClientByBasePath(Conf } /** - * Extract HoodieTableMetaClient from a partition path(not base path). 
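[Editor's illustration] The getAffectedPartitions change above tightens the matching in two ways: a partition is kept only when an input path ends with it (rather than merely containing it as a substring), and an empty result becomes Option.empty() instead of an empty string. A small sketch of that filtering with JDK types only (paths and partition values are made up):

import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

public class AffectedPartitionsSketch {

  static Optional<String> getAffectedPartitions(List<String> writePartitions, List<String> inputPaths) {
    String joined = writePartitions.stream()
        // suffix match avoids accidentally matching "2016/05/01" inside an unrelated path segment
        .filter(partition -> inputPaths.stream().anyMatch(p -> p.endsWith(partition)))
        .collect(Collectors.joining(","));
    return joined.isEmpty() ? Optional.empty() : Optional.of(joined);
  }

  public static void main(String[] args) {
    List<String> written = Arrays.asList("2016/05/01", "2016/05/02");
    List<String> inputs = Arrays.asList("hdfs://nn/tbl/2016/05/01");
    // "2016/05/02" is dropped because no input path ends with it
    System.out.println(getAffectedPartitions(written, inputs)); // Optional[2016/05/01]
  }
}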
- * @param fs - * @param dataPath - * @return - * @throws IOException + * Extract HoodieTableMetaClient from a partition path (not base path) */ - public static HoodieTableMetaClient getTableMetaClientForBasePath(FileSystem fs, Path dataPath) throws IOException { - int levels = HoodieHiveUtils.DEFAULT_LEVELS_TO_BASEPATH; - if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) { - HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath); + public static HoodieTableMetaClient getTableMetaClientForBasePathUnchecked(Configuration conf, Path partitionPath) throws IOException { + Path baseDir = partitionPath; + FileSystem fs = partitionPath.getFileSystem(conf); + if (HoodiePartitionMetadata.hasPartitionMetadata(fs, partitionPath)) { + HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, partitionPath); metadata.readFromFS(); - levels = metadata.getPartitionDepth(); + int levels = metadata.getPartitionDepth(); + baseDir = HoodieHiveUtils.getNthParent(partitionPath, levels); + } else { + for (int i = 0; i < partitionPath.depth(); i++) { + if (fs.exists(new Path(baseDir, METAFOLDER_NAME))) { + break; + } else if (i == partitionPath.depth() - 1) { + throw new TableNotFoundException(partitionPath.toString()); + } else { + baseDir = baseDir.getParent(); + } + } } - Path baseDir = HoodieHiveUtils.getNthParent(dataPath, levels); LOG.info("Reading hoodie metadata from path " + baseDir.toString()); - return new HoodieTableMetaClient(fs.getConf(), baseDir.toString()); + return HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir.toString()).build(); } public static FileStatus getFileStatus(HoodieBaseFile baseFile) throws IOException { @@ -391,34 +422,24 @@ public static Map> groupFileStatusForSna return grouped; } - /** - * Filters data files for a snapshot queried table. 
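[Editor's illustration] getTableMetaClientForBasePathUnchecked above no longer assumes a fixed partition depth: when there is no partition metadata it walks up the parent directories until it finds the table's metafolder, and fails with TableNotFoundException otherwise. The sketch below shows that walk with java.nio.file instead of the Hadoop FileSystem API; ".hoodie" is the metafolder name and the error type here is a stand-in.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class BasePathLookupSketch {

  static Path findTableBasePath(Path partitionPath) throws IOException {
    // walk up: partition dir -> parent -> ... -> filesystem root
    for (Path dir = partitionPath; dir != null; dir = dir.getParent()) {
      if (Files.isDirectory(dir.resolve(".hoodie"))) {
        return dir;
      }
    }
    throw new IOException("Not a Hudi table (no .hoodie folder above): " + partitionPath);
  }

  public static void main(String[] args) throws IOException {
    Path base = Files.createTempDirectory("hudi_tbl");
    Files.createDirectories(base.resolve(".hoodie"));
    Path partition = Files.createDirectories(base.resolve("2016/05/01"));
    System.out.println(findTableBasePath(partition)); // prints the temp table base path
  }
}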
- * @param job - * @param metadata - * @param fileStatuses - * @return - */ - public static List filterFileStatusForSnapshotMode( - JobConf job, HoodieTableMetaClient metadata, List fileStatuses) throws IOException { - FileStatus[] statuses = fileStatuses.toArray(new FileStatus[0]); - if (LOG.isDebugEnabled()) { - LOG.debug("Hoodie Metadata initialized with completed commit Ts as :" + metadata); + public static Map> groupSnapshotPathsByMetaClient( + Collection metaClientList, + List snapshotPaths + ) { + Map> grouped = new HashMap<>(); + metaClientList.forEach(metaClient -> grouped.put(metaClient, new ArrayList<>())); + for (Path path : snapshotPaths) { + // Find meta client associated with the input path + metaClientList.stream().filter(metaClient -> path.toString().contains(metaClient.getBasePath())) + .forEach(metaClient -> grouped.get(metaClient).add(path)); } - // Get all commits, delta commits, compactions, as all of them produce a base parquet file today - HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - TableFileSystemView.BaseFileOnlyView roView = new HoodieTableFileSystemView(metadata, timeline, statuses); - // filter files on the latest commit found - List filteredFiles = roView.getLatestBaseFiles().collect(Collectors.toList()); - LOG.info("Total paths to process after hoodie filter " + filteredFiles.size()); - List returns = new ArrayList<>(); - for (HoodieBaseFile filteredFile : filteredFiles) { - if (LOG.isDebugEnabled()) { - LOG.debug("Processing latest hoodie file - " + filteredFile.getPath()); - } - filteredFile = refreshFileStatus(job, filteredFile); - returns.add(getFileStatus(filteredFile)); - } - return returns; + return grouped; + } + + public static HoodieMetadataConfig buildMetadataConfig(Configuration conf) { + return HoodieMetadataConfig.newBuilder() + .enable(conf.getBoolean(ENABLE.key(), DEFAULT_METADATA_ENABLE_FOR_READERS)) + .build(); } /** @@ -429,7 +450,7 @@ public static List filterFileStatusForSnapshotMode( * @param dataFile * @return */ - private static HoodieBaseFile refreshFileStatus(Configuration conf, HoodieBaseFile dataFile) { + public static HoodieBaseFile refreshFileStatus(Configuration conf, HoodieBaseFile dataFile) { Path dataPath = dataFile.getFileStatus().getPath(); try { if (dataFile.getFileSize() == 0) { @@ -443,4 +464,51 @@ private static HoodieBaseFile refreshFileStatus(Configuration conf, HoodieBaseFi } } + /** + * Iterate through a list of commit metadata in natural order, and extract the file status of + * all affected files from the commits metadata grouping by file full path. If the files has + * been touched multiple times in the given commits, the return value will keep the one + * from the latest commit. + * + * @param basePath The table base path + * @param metadataList The metadata list to read the data from + * + * @return the affected file status array + */ + public static FileStatus[] listAffectedFilesForCommits(Configuration hadoopConf, Path basePath, List metadataList) { + // TODO: Use HoodieMetaTable to extract affected file directly. + HashMap fullPathToFileStatus = new HashMap<>(); + // Iterate through the given commits. + for (HoodieCommitMetadata metadata: metadataList) { + fullPathToFileStatus.putAll(metadata.getFullPathToFileStatus(hadoopConf, basePath.toString())); + } + return fullPathToFileStatus.values().toArray(new FileStatus[0]); + } + + /** + * Returns all the incremental write partition paths as a set with the given commits metadata. 
+ * + * @param metadataList The commits metadata + * @return the partition path set + */ + public static Set getWritePartitionPaths(List metadataList) { + return metadataList.stream() + .map(HoodieCommitMetadata::getWritePartitionPaths) + .flatMap(Collection::stream) + .collect(Collectors.toSet()); + } + + /** + * Returns the commit metadata of the given instant. + * + * @param instant The hoodie instant + * @param timeline The timeline + * @return the commit metadata + */ + public static HoodieCommitMetadata getCommitMetadata( + HoodieInstant instant, + HoodieTimeline timeline) throws IOException { + byte[] data = timeline.getInstantDetails(instant).get(); + return HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class); + } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java index c8a0d7f82e96d..4b351d1205636 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java @@ -18,156 +18,37 @@ package org.apache.hudi.hadoop.utils; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.HoodieTableFileSystemView; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.BootstrapBaseFileSplit; -import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; -import org.apache.hudi.hadoop.realtime.RealtimeBootstrapBaseFileSplit; - import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.SplitLocationInfo; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; +import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo; +import org.apache.hudi.hadoop.realtime.RealtimeSplit; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; +import static org.apache.hudi.common.util.TypeUtils.unsafeCast; public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils { private static final Logger LOG = LogManager.getLogger(HoodieRealtimeInputFormatUtils.class); - public static InputSplit[] getRealtimeSplits(Configuration conf, Stream fileSplits) throws IOException { - Map> partitionsToParquetSplits = - fileSplits.collect(Collectors.groupingBy(split -> 
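[Editor's illustration] listAffectedFilesForCommits and getWritePartitionPaths above fold a list of commit metadata into a single view: file statuses are merged into one map keyed by full path so the latest commit wins, and partition paths are unioned. A minimal sketch of that folding where nested JDK maps stand in for HoodieCommitMetadata:

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class CommitMetadataFoldSketch {

  // commits are expected oldest-first, so putAll lets later commits overwrite earlier entries
  static Map<String, String> listAffectedFiles(List<Map<String, String>> perCommitFileStatuses) {
    Map<String, String> fullPathToStatus = new HashMap<>();
    for (Map<String, String> commit : perCommitFileStatuses) {
      fullPathToStatus.putAll(commit);
    }
    return fullPathToStatus;
  }

  static Set<String> getWritePartitionPaths(List<Set<String>> perCommitPartitions) {
    Set<String> partitions = new HashSet<>();
    perCommitPartitions.forEach(partitions::addAll);
    return partitions;
  }

  public static void main(String[] args) {
    Map<String, String> c100 = new HashMap<>();
    c100.put("/tbl/2016/05/01/f1_100.parquet", "written@100");
    Map<String, String> c200 = new HashMap<>();
    c200.put("/tbl/2016/05/01/f1_100.parquet", "rewritten@200");
    System.out.println(listAffectedFiles(Arrays.asList(c100, c200))); // keeps the @200 status
    System.out.println(getWritePartitionPaths(Arrays.asList(
        new HashSet<>(Arrays.asList("2016/05/01")), new HashSet<>(Arrays.asList("2016/05/02")))));
  }
}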
split.getPath().getParent())); - // TODO(vc): Should we handle also non-hoodie splits here? - Map partitionsToMetaClient = getTableMetaClientByBasePath(conf, partitionsToParquetSplits.keySet()); - - // for all unique split parents, obtain all delta files based on delta commit timeline, - // grouped on file id - List rtSplits = new ArrayList<>(); - partitionsToParquetSplits.keySet().forEach(partitionPath -> { - // for each partition path obtain the data & log file groupings, then map back to inputsplits - HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath); - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline()); - String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath); - - try { - // Both commit and delta-commits are included - pick the latest completed one - Option latestCompletedInstant = - metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); - - Stream latestFileSlices = latestCompletedInstant - .map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())) - .orElse(Stream.empty()); - - // subgroup splits again by file id & match with log files. - Map> groupedInputSplits = partitionsToParquetSplits.get(partitionPath).stream() - .collect(Collectors.groupingBy(split -> FSUtils.getFileId(split.getPath().getName()))); - // Get the maxCommit from the last delta or compaction or commit - when bootstrapped from COW table - String maxCommitTime = metaClient.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, - HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)) - .filterCompletedInstants().lastInstant().get().getTimestamp(); - latestFileSlices.forEach(fileSlice -> { - List dataFileSplits = groupedInputSplits.get(fileSlice.getFileId()); - dataFileSplits.forEach(split -> { - try { - List logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) - .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()); - if (split instanceof BootstrapBaseFileSplit) { - BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split; - String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) - .filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0]; - String[] inMemoryHosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) - .filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0]; - FileSplit baseSplit = new FileSplit(eSplit.getPath(), eSplit.getStart(), eSplit.getLength(), - hosts, inMemoryHosts); - rtSplits.add(new RealtimeBootstrapBaseFileSplit(baseSplit,metaClient.getBasePath(), - logFilePaths, maxCommitTime, eSplit.getBootstrapFileSplit())); - } else { - rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFilePaths, maxCommitTime)); - } - } catch (IOException e) { - throw new HoodieIOException("Error creating hoodie real time split ", e); - } - }); - }); - } catch (Exception e) { - throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e); - } - }); - LOG.info("Returning a total splits of " + rtSplits.size()); - return rtSplits.toArray(new InputSplit[0]); - } - - // Return parquet file with a list of log files in the same file group. 
- public static Map> groupLogsByBaseFile(Configuration conf, List fileStatuses) { - Map> partitionsToParquetSplits = - fileStatuses.stream().collect(Collectors.groupingBy(file -> file.getFileStatus().getPath().getParent())); - // TODO(vc): Should we handle also non-hoodie splits here? - Map partitionsToMetaClient = getTableMetaClientByBasePath(conf, partitionsToParquetSplits.keySet()); - - // for all unique split parents, obtain all delta files based on delta commit timeline, - // grouped on file id - Map> resultMap = new HashMap<>(); - partitionsToParquetSplits.keySet().forEach(partitionPath -> { - // for each partition path obtain the data & log file groupings, then map back to inputsplits - HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath); - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline()); - String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath); - - try { - // Both commit and delta-commits are included - pick the latest completed one - Option latestCompletedInstant = - metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); - - Stream latestFileSlices = latestCompletedInstant - .map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())) - .orElse(Stream.empty()); + public static boolean doesBelongToIncrementalQuery(FileSplit s) { + if (s instanceof HoodieRealtimeFileSplit) { + HoodieRealtimeFileSplit bs = unsafeCast(s); + return bs.getBelongsToIncrementalQuery(); + } else if (s instanceof HoodieRealtimeBootstrapBaseFileSplit) { + HoodieRealtimeBootstrapBaseFileSplit bs = unsafeCast(s); + return bs.getBelongsToIncrementalQuery(); + } - // subgroup splits again by file id & match with log files. - Map> groupedInputSplits = partitionsToParquetSplits.get(partitionPath).stream() - .collect(Collectors.groupingBy(file -> FSUtils.getFileId(file.getFileStatus().getPath().getName()))); - latestFileSlices.forEach(fileSlice -> { - List dataFileSplits = groupedInputSplits.get(fileSlice.getFileId()); - dataFileSplits.forEach(split -> { - try { - List logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) - .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()); - resultMap.put(split, logFilePaths); - } catch (Exception e) { - throw new HoodieException("Error creating hoodie real time split ", e); - } - }); - }); - } catch (Exception e) { - throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e); - } - }); - return resultMap; + return false; } - /** * Add a field to the existing fields projected. 
*/ @@ -197,11 +78,37 @@ private static Configuration addProjectionField(Configuration conf, String field return conf; } - public static void addRequiredProjectionFields(Configuration configuration) { + public static void addRequiredProjectionFields(Configuration configuration, Option hoodieVirtualKeyInfo) { // Need this to do merge records in HoodieRealtimeRecordReader - addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS); - addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieInputFormatUtils.HOODIE_COMMIT_TIME_COL_POS); - addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD, HoodieInputFormatUtils.HOODIE_PARTITION_PATH_COL_POS); + if (!hoodieVirtualKeyInfo.isPresent()) { + addProjectionField(configuration, HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS); + addProjectionField(configuration, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieInputFormatUtils.HOODIE_COMMIT_TIME_COL_POS); + addProjectionField(configuration, HoodieRecord.PARTITION_PATH_METADATA_FIELD, HoodieInputFormatUtils.HOODIE_PARTITION_PATH_COL_POS); + } else { + HoodieVirtualKeyInfo hoodieVirtualKey = hoodieVirtualKeyInfo.get(); + addProjectionField(configuration, hoodieVirtualKey.getRecordKeyField(), hoodieVirtualKey.getRecordKeyFieldIndex()); + if (hoodieVirtualKey.getPartitionPathField().isPresent()) { + addProjectionField(configuration, hoodieVirtualKey.getPartitionPathField().get(), hoodieVirtualKey.getPartitionPathFieldIndex().get()); + } + } + } + + public static boolean requiredProjectionFieldsExistInConf(Configuration configuration, Option hoodieVirtualKeyInfo) { + String readColNames = configuration.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""); + if (!hoodieVirtualKeyInfo.isPresent()) { + return readColNames.contains(HoodieRecord.RECORD_KEY_METADATA_FIELD) + && readColNames.contains(HoodieRecord.COMMIT_TIME_METADATA_FIELD) + && readColNames.contains(HoodieRecord.PARTITION_PATH_METADATA_FIELD); + } else { + return readColNames.contains(hoodieVirtualKeyInfo.get().getRecordKeyField()) + && (hoodieVirtualKeyInfo.get().getPartitionPathField().isPresent() ? 
readColNames.contains(hoodieVirtualKeyInfo.get().getPartitionPathField().get()) + : true); + } + } + + public static boolean canAddProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf jobConf) { + return jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null + || (!realtimeSplit.getDeltaLogPaths().isEmpty() && !HoodieRealtimeInputFormatUtils.requiredProjectionFieldsExistInConf(jobConf, realtimeSplit.getVirtualKeyInfo())); } /** diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java index b10e778f38c17..430c318bcb390 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java @@ -18,20 +18,18 @@ package org.apache.hudi.hadoop.utils; -import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; -import org.apache.hudi.io.storage.HoodieFileReader; -import org.apache.hudi.io.storage.HoodieFileReaderFactory; + +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.JsonProperties; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.generic.GenericArray; import org.apache.avro.generic.GenericFixed; import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; @@ -48,7 +46,6 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; @@ -59,21 +56,12 @@ import java.util.TreeMap; import java.util.stream.Collectors; +import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; + public class HoodieRealtimeRecordReaderUtils { private static final Logger LOG = LogManager.getLogger(HoodieRealtimeRecordReaderUtils.class); - /** - * Reads the schema from the base file. - */ - public static Schema readSchema(Configuration conf, Path filePath) { - try { - HoodieFileReader storageReader = HoodieFileReaderFactory.getFileReader(conf, filePath); - return storageReader.getSchema(); - } catch (IOException e) { - throw new HoodieIOException("Failed to read schema from " + filePath, e); - } - } - /** * get the max compaction memory in bytes from JobConf. 
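[Editor's illustration] The projection helpers above work on Hive's two comma-separated conf values (projected column names and ids): a column is appended only if it is missing, and required fields can be checked before mutating the JobConf. The sketch below uses a plain Map and assumed key strings in place of the JobConf and ColumnProjectionUtils constants.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class ProjectionConfSketch {
  static final String READ_COLUMN_NAMES = "hive.io.file.readcolumn.names"; // assumed key, for illustration
  static final String READ_COLUMN_IDS = "hive.io.file.readcolumn.ids";     // assumed key, for illustration

  static void addProjectionField(Map<String, String> conf, String field, int index) {
    String names = conf.getOrDefault(READ_COLUMN_NAMES, "");
    if (!Arrays.asList(names.split(",")).contains(field)) {
      conf.put(READ_COLUMN_NAMES, names.isEmpty() ? field : names + "," + field);
      String ids = conf.getOrDefault(READ_COLUMN_IDS, "");
      conf.put(READ_COLUMN_IDS, ids.isEmpty() ? String.valueOf(index) : ids + "," + index);
    }
  }

  static boolean requiredFieldsProjected(Map<String, String> conf, String... required) {
    String names = conf.getOrDefault(READ_COLUMN_NAMES, "");
    return Arrays.stream(required).allMatch(Arrays.asList(names.split(","))::contains);
  }

  public static void main(String[] args) {
    Map<String, String> conf = new HashMap<>();
    conf.put(READ_COLUMN_NAMES, "rider,fare");
    conf.put(READ_COLUMN_IDS, "5,6");
    addProjectionField(conf, "_hoodie_record_key", 2);
    addProjectionField(conf, "_hoodie_record_key", 2); // second call is a no-op
    System.out.println(conf.get(READ_COLUMN_NAMES) + " | " + conf.get(READ_COLUMN_IDS));
    System.out.println(requiredFieldsProjected(conf, "_hoodie_record_key", "_hoodie_commit_time")); // false
  }
}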
*/ @@ -166,6 +154,9 @@ public static Writable avroToArrayWritable(Object value, Schema schema) { case BYTES: return new BytesWritable(((ByteBuffer)value).array()); case INT: + if (schema.getLogicalType() != null && schema.getLogicalType().getName().equals("date")) { + return new DateWritable((Integer) value); + } return new IntWritable((Integer) value); case LONG: return new LongWritable((Long) value); @@ -182,7 +173,14 @@ public static Writable avroToArrayWritable(Object value, Schema schema) { Writable[] recordValues = new Writable[schema.getFields().size()]; int recordValueIndex = 0; for (Schema.Field field : schema.getFields()) { - recordValues[recordValueIndex++] = avroToArrayWritable(record.get(field.name()), field.schema()); + // TODO Revisit Avro exception handling in future + Object fieldValue = null; + try { + fieldValue = record.get(field.name()); + } catch (AvroRuntimeException e) { + LOG.debug("Field:" + field.name() + "not found in Schema:" + schema.toString()); + } + recordValues[recordValueIndex++] = avroToArrayWritable(fieldValue, field.schema()); } return new ArrayWritable(Writable.class, recordValues); case ENUM: @@ -283,6 +281,14 @@ public static Schema addPartitionFields(Schema schema, List partitioning List fieldsToAdd = partitioningFields.stream().map(String::toLowerCase) .filter(x -> !firstLevelFieldNames.contains(x)).collect(Collectors.toList()); - return HoodieAvroUtils.appendNullSchemaFields(schema, fieldsToAdd); + return appendNullSchemaFields(schema, fieldsToAdd); + } + + private static Schema appendNullSchemaFields(Schema schema, List newFieldNames) { + List newFields = new ArrayList<>(); + for (String newField : newFieldNames) { + newFields.add(new Schema.Field(newField, createNullableSchema(Schema.Type.STRING), "", JsonProperties.NULL_VALUE)); + } + return appendFieldsToSchema(schema, newFields); } } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestGloballyConsistentTimeStampFilteringInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestGloballyConsistentTimeStampFilteringInputFormat.java new file mode 100644 index 0000000000000..50e4a6ed72b32 --- /dev/null +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestGloballyConsistentTimeStampFilteringInputFormat.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
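[Editor's illustration] The avroToArrayWritable change above special-cases the "date" logical type: Avro stores a DATE as an INT holding days since the epoch, so returning a plain IntWritable would surface as a number in Hive. The real code wraps it in a DateWritable; the sketch below checks the logical type the same way and converts to java.time.LocalDate to stay dependency-light (only Avro is needed).

import java.time.LocalDate;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

public class AvroDateSketch {

  static Object toHiveValue(Object value, Schema schema) {
    if (schema.getType() == Schema.Type.INT
        && schema.getLogicalType() != null
        && "date".equals(schema.getLogicalType().getName())) {
      return LocalDate.ofEpochDay((Integer) value); // the int is days since 1970-01-01
    }
    return value;
  }

  public static void main(String[] args) {
    Schema dateSchema = LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT));
    System.out.println(toHiveValue(18628, dateSchema)); // 2021-01-01
  }
}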
+ */ + +package org.apache.hudi.hadoop; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; + +import org.apache.hadoop.fs.FileStatus; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestGloballyConsistentTimeStampFilteringInputFormat + extends TestHoodieParquetInputFormat { + + @BeforeEach + public void setUp() { + super.setUp(); + } + + @Test + public void testInputFormatLoad() throws IOException { + super.testInputFormatLoad(); + + // set filtering timestamp to 0 now the timeline wont have any commits. + InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "0"); + + Assertions.assertThrows(HoodieIOException.class, () -> inputFormat.getSplits(jobConf, 10)); + Assertions.assertThrows(HoodieIOException.class, () -> inputFormat.listStatus(jobConf)); + } + + @Test + public void testInputFormatUpdates() throws IOException { + super.testInputFormatUpdates(); + + // set the globally replicated timestamp to 199 so only 100 is read and update is ignored. + InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "100"); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + + ensureFilesInCommit("5 files have been updated to commit 200. but should get filtered out ", + files,"200", 0); + ensureFilesInCommit("We should see 10 files from commit 100 ", files, "100", 10); + } + + @Override + public void testIncrementalSimple() throws IOException { + // setting filtering timestamp to zero should not in any way alter the result of the test which + // pulls in zero files due to incremental ts being the actual commit time + jobConf.set(HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP, "0"); + super.testIncrementalSimple(); + } + + @Override + public void testIncrementalWithMultipleCommits() throws IOException { + super.testIncrementalWithMultipleCommits(); + + // set globally replicated timestamp to 400 so commits from 500, 600 does not show up + InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "400"); + InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtils.MAX_COMMIT_ALL); + + FileStatus[] files = inputFormat.listStatus(jobConf); + + assertEquals( + 5, files.length,"Pulling ALL commits from 100, should get us the 3 files from 400 commit, 1 file from 300 " + + "commit and 1 file from 200 commit"); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", + files, "400", 3); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", + files, "300", 1); + ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", + files, "200", 1); + + List commits = Arrays.asList("100", "200", "300", "400", "500", "600"); + for (int idx = 0; idx < commits.size(); ++idx) { + for (int jdx = 0; jdx < commits.size(); ++jdx) { + InputFormatTestUtil.setupIncremental(jobConf, commits.get(idx), HoodieHiveUtils.MAX_COMMIT_ALL); + InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, commits.get(jdx)); + + files = inputFormat.listStatus(jobConf); + + if (jdx <= idx) { + 
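[Editor's illustration] The nested loops in the test above exercise a simple visibility rule: with an incremental start time and a globally consistent (max) read timestamp, a commit is visible only if startTs < commitTs <= maxTs. Since Hudi commit times are fixed-width strings, lexicographic comparison is sufficient, as in this small sketch:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class CommitWindowSketch {

  static List<String> visibleCommits(List<String> commits, String startTs, String maxTs) {
    return commits.stream()
        .filter(ts -> ts.compareTo(startTs) > 0 && ts.compareTo(maxTs) <= 0) // start exclusive, max inclusive
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<String> commits = Arrays.asList("100", "200", "300", "400", "500", "600");
    System.out.println(visibleCommits(commits, "100", "400")); // [200, 300, 400]
    System.out.println(visibleCommits(commits, "400", "200")); // [] - everything filtered out
  }
}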
assertEquals(0, files.length,"all commits should be filtered"); + } else { + // only commits upto the timestamp is allowed + for (FileStatus file : files) { + String commitTs = FSUtils.getCommitTime(file.getPath().getName()); + assertTrue(commits.indexOf(commitTs) <= jdx); + assertTrue(commits.indexOf(commitTs) > idx); + } + } + } + } + } +} diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java new file mode 100644 index 0000000000000..902778ed1c383 --- /dev/null +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; + +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class TestHoodieCopyOnWriteTableInputFormat { + + @TempDir + java.nio.file.Path tempDir; + private FileSystem fs; + + @BeforeEach + void setUp() throws IOException { + fs = FileSystem.get(tempDir.toUri(), new Configuration()); + } + + @AfterEach + void tearDown() throws IOException { + fs.close(); + } + + @Test + void pathNotSplitableForBootstrapScenario() throws IOException { + URI source = Files.createTempFile(tempDir, "source", ".parquet").toUri(); + URI target = Files.createTempFile(tempDir, "target", ".parquet").toUri(); + PathWithBootstrapFileStatus path = new PathWithBootstrapFileStatus(new Path(target), fs.getFileStatus(new Path(source))); + HoodieCopyOnWriteTableInputFormat cowInputFormat = new HoodieCopyOnWriteTableInputFormat(); + assertFalse(cowInputFormat.isSplitable(fs, path), "Path for bootstrap should not be splitable."); + } +} diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieHFileInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieHFileInputFormat.java index 2c340270236e3..92bf6f3ca718c 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieHFileInputFormat.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieHFileInputFormat.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieTableType; import 
org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -161,6 +162,21 @@ public void testInputFormatLoad() throws IOException { assertEquals(10, inputSplits.length); } + @Test + public void testInputFormatLoadWithEmptyTable() throws IOException { + // initial hoodie table + String bathPathStr = "/tmp/test_empty_table"; + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), bathPathStr, HoodieTableType.COPY_ON_WRITE, + baseFileFormat); + // Add the paths + FileInputFormat.setInputPaths(jobConf, bathPathStr); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length); + InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 0); + assertEquals(0, inputSplits.length); + } + @Test public void testInputFormatUpdates() throws IOException { // initial commit @@ -232,9 +248,90 @@ public void testIncrementalSimple() throws IOException { InputFormatTestUtil.setupIncremental(jobConf, "100", 1); + HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), + HoodieTableType.COPY_ON_WRITE, baseFileFormat); + assertEquals(null, metaClient.getTableConfig().getDatabaseName(), + "When hoodie.database.name is not set, it should default to null"); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length, + "We should exclude commit 100 when returning incremental pull with start commit time as 100"); + + InputFormatTestUtil.setupIncremental(jobConf, "100", 1, true); + + files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length, + "We should exclude commit 100 when returning incremental pull with start commit time as 100"); + + metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat, HoodieTestUtils.HOODIE_DATABASE); + assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(), + String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE)); + + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.incremental.use.database is true and hoodie.database.name is not null or empty" + + " and the incremental database name is not set, then the incremental query will not take effect"); + } + + @Test + public void testIncrementalWithDatabaseName() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100"); + createCommitFile(basePath, "100", "2016/05/01"); + + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + + InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, true); + + HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), + HoodieTableType.COPY_ON_WRITE, baseFileFormat); + assertEquals(null, metaClient.getTableConfig().getDatabaseName(), + "When hoodie.database.name is not set, it should default to null"); + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.database.name is null, then the incremental query will not take effect"); + + metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat, ""); + assertEquals("", 
metaClient.getTableConfig().getDatabaseName(), + "The hoodie.database.name should be empty"); + + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.database.name is empty, then the incremental query will not take effect"); + + metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat, HoodieTestUtils.HOODIE_DATABASE); + assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(), + String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE)); + + files = inputFormat.listStatus(jobConf); assertEquals(0, files.length, "We should exclude commit 100 when returning incremental pull with start commit time as 100"); + + InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, false); + + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.incremental.use.database is false and the incremental database name is set," + + "then the incremental query will not take effect"); + + // The configuration with and without database name exists together + InputFormatTestUtil.setupIncremental(jobConf, "1", 1, true); + + files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length, + "When hoodie.incremental.use.database is true, " + + "We should exclude commit 100 because the returning incremental pull with start commit time is 100"); + + InputFormatTestUtil.setupIncremental(jobConf, "1", 1, false); + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.incremental.use.database is false, " + + "We should include commit 100 because the returning incremental pull with start commit time is 1"); } private void createCommitFile(java.nio.file.Path basePath, String commitNumber, String partitionPath) @@ -316,7 +413,7 @@ public void testIncrementalWithMultipleCommits() throws IOException { ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200", 1); } - // TODO enable this after enabling predicate pushdown + // TODO enable this after enabling predicate push down public void testPredicatePushDown() throws IOException { // initial commit Schema schema = getSchemaFromResource(TestHoodieHFileInputFormat.class, "/sample1.avsc"); @@ -337,7 +434,7 @@ public void testPredicatePushDown() throws IOException { // check whether we have 2 records at this point ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2, 2, 2); - // Make sure we have the 10 records if we roll back the stattime + // Make sure we have the 10 records if we roll back the start time InputFormatTestUtil.setupIncremental(jobConf, "0", 2); ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more", commit1, 8, 10); @@ -347,19 +444,19 @@ public void testPredicatePushDown() throws IOException { @Test public void testGetIncrementalTableNames() throws IOException { - String[] expectedincrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"}; + String[] expectedIncrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"}; JobConf conf = new JobConf(); - String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[0]); + String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[0]); conf.set(incrementalMode1, 
HoodieHiveUtils.INCREMENTAL_SCAN_MODE); - String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[1]); + String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[1]); conf.set(incrementalMode2,HoodieHiveUtils.INCREMENTAL_SCAN_MODE); String incrementalMode3 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.model_trips"); conf.set(incrementalMode3, HoodieHiveUtils.INCREMENTAL_SCAN_MODE.toLowerCase()); - String defaultmode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips"); - conf.set(defaultmode, HoodieHiveUtils.DEFAULT_SCAN_MODE); - List actualincrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf)); - for (String expectedincrTable : expectedincrTables) { - assertTrue(actualincrTables.contains(expectedincrTable)); + String defaultMode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips"); + conf.set(defaultMode, HoodieHiveUtils.DEFAULT_SCAN_MODE); + List actualIncrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf)); + for (String expectedIncrTable : expectedIncrTables) { + assertTrue(actualIncrTables.contains(expectedIncrTable)); } } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java index 27838897d5ec3..ccc57d0f6185c 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java @@ -18,29 +18,38 @@ package org.apache.hudi.hadoop; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapreduce.Job; + +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.testutils.FileCreateUtils; import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; - -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputSplit; -import 
org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapreduce.Job; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -50,18 +59,22 @@ import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; public class TestHoodieParquetInputFormat { - private HoodieParquetInputFormat inputFormat; - private JobConf jobConf; + protected HoodieParquetInputFormat inputFormat; + protected JobConf jobConf; private final HoodieFileFormat baseFileFormat = HoodieFileFormat.PARQUET; private final String baseFileExtension = baseFileFormat.getFileExtension(); @@ -109,20 +122,18 @@ public void testPendingCompactionWithActiveCommits() throws IOException { timeline.setInstants(instants); // Verify getCommitsTimelineBeforePendingCompaction does not return instants after first compaction instant - HoodieTimeline filteredTimeline = inputFormat.filterInstantsTimeline(timeline); + HoodieTimeline filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline); assertTrue(filteredTimeline.containsInstant(t1)); assertTrue(filteredTimeline.containsInstant(t2)); assertFalse(filteredTimeline.containsInstant(t3)); assertFalse(filteredTimeline.containsInstant(t4)); assertFalse(filteredTimeline.containsInstant(t5)); assertFalse(filteredTimeline.containsInstant(t6)); - - // remove compaction instant and setup timeline again instants.remove(t3); timeline = new HoodieActiveTimeline(metaClient); timeline.setInstants(instants); - filteredTimeline = inputFormat.filterInstantsTimeline(timeline); + filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline); // verify all remaining instants are returned. assertTrue(filteredTimeline.containsInstant(t1)); @@ -136,7 +147,7 @@ public void testPendingCompactionWithActiveCommits() throws IOException { instants.remove(t5); timeline = new HoodieActiveTimeline(metaClient); timeline.setInstants(instants); - filteredTimeline = inputFormat.filterInstantsTimeline(timeline); + filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline); // verify all remaining instants are returned. 
assertTrue(filteredTimeline.containsInstant(t1)); @@ -163,6 +174,41 @@ public void testInputFormatLoad() throws IOException { assertEquals(10, files.length); } + @Test + public void testInputFormatLoadForNonPartitionedAndVirtualKeyedTable() throws IOException { + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); + File partitionDir = InputFormatTestUtil.prepareCustomizedTable(basePath, baseFileFormat, 10, "100", true, false, + true, schema); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(basePath.toString(), "100", Option.of(commitMetadata)); + + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + + InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10); + assertEquals(10, inputSplits.length); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + } + + @Test + public void testInputFormatLoadWithEmptyTable() throws IOException { + // initialize an empty hoodie table + String basePathStr = "/tmp/test_empty_table"; + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePathStr, HoodieTableType.COPY_ON_WRITE, + baseFileFormat); + // Add the paths + FileInputFormat.setInputPaths(jobConf, basePathStr); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length); + InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 0); + assertEquals(0, inputSplits.length); + } + @Test public void testInputFormatUpdates() throws IOException { // initial commit @@ -190,6 +236,55 @@ public void testInputFormatUpdates() throws IOException { + "files from 200 commit", files, "100", 5); } + @Test + public void testSnapshotWithInvalidCommitShouldThrowException() throws IOException { + File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100"); + InputFormatTestUtil.commit(basePath, "100"); + + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + InputFormatTestUtil.setupSnapshotIncludePendingCommits(jobConf, "1"); + Exception exception = assertThrows(HoodieIOException.class, () -> inputFormat.listStatus(jobConf)); + assertEquals("Query instant (1) not found in the timeline", exception.getMessage()); + + InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "1"); + exception = assertThrows(HoodieIOException.class, () -> inputFormat.listStatus(jobConf)); + assertEquals("Query instant (1) not found in the timeline", exception.getMessage()); + } + + @Test + public void testPointInTimeQueryWithUpdates() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100"); + InputFormatTestUtil.commit(basePath, "100"); + + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + + // update files + InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 5, "200", true); + // Before the commit + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + ensureFilesInCommit("Commit 200 has not been committed. 
We should not see files from this commit", files, "200", 0); + InputFormatTestUtil.commit(basePath, "200"); + + InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "100"); + + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + ensureFilesInCommit("We shouldn't have any file pertaining to commit 200", files, "200", 0); + ensureFilesInCommit("All files should be from commit 100", files, "100", 10); + + InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "200"); + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length); + ensureFilesInCommit("5 files for commit 200", files, "200", 5); + ensureFilesInCommit("5 files for commit 100", files, "100", 5); + } + @Test public void testInputFormatWithCompaction() throws IOException { // initial commit @@ -234,9 +329,149 @@ public void testIncrementalSimple() throws IOException { InputFormatTestUtil.setupIncremental(jobConf, "100", 1); + HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), + HoodieTableType.COPY_ON_WRITE, baseFileFormat); + assertEquals(null, metaClient.getTableConfig().getDatabaseName(), + "When hoodie.database.name is not set, it should default to null"); + FileStatus[] files = inputFormat.listStatus(jobConf); assertEquals(0, files.length, "We should exclude commit 100 when returning incremental pull with start commit time as 100"); + + InputFormatTestUtil.setupIncremental(jobConf, "100", 1, true); + + files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length, + "We should exclude commit 100 when returning incremental pull with start commit time as 100"); + + metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat, HoodieTestUtils.HOODIE_DATABASE); + assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(), + String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE)); + + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.incremental.use.database is true and hoodie.database.name is not null or empty" + + " and the incremental database name is not set, then the incremental query will not take effect"); + } + + @Test + public void testIncrementalWithDatabaseName() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100"); + createCommitFile(basePath, "100", "2016/05/01"); + + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + + InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, true); + + HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), + HoodieTableType.COPY_ON_WRITE, baseFileFormat); + assertEquals(null, metaClient.getTableConfig().getDatabaseName(), + "When hoodie.database.name is not set, it should default to null"); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.database.name is null, then the incremental query will not take effect"); + + metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat, ""); + assertEquals("", metaClient.getTableConfig().getDatabaseName(), + "The hoodie.database.name should be empty"); + + files = 
inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.database.name is empty, then the incremental query will not take effect"); + + metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat, HoodieTestUtils.HOODIE_DATABASE); + assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(), + String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE)); + + files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length, + "We should exclude commit 100 when returning incremental pull with start commit time as 100"); + + InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, false); + + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.incremental.use.database is false and the incremental database name is set, " + + "then the incremental query will not take effect"); + + // The configurations with and without the database name exist together + InputFormatTestUtil.setupIncremental(jobConf, "1", 1, true); + + files = inputFormat.listStatus(jobConf); + assertEquals(0, files.length, + "When hoodie.incremental.use.database is true, " + + "we should exclude commit 100 because the effective incremental pull start commit time is 100"); + + InputFormatTestUtil.setupIncremental(jobConf, "1", 1, false); + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "When hoodie.incremental.use.database is false, " + + "we should include commit 100 because the effective incremental pull start commit time is 1"); + } + + @Test + public void testMultiPartitionTableIncremental() throws IOException { + // initial commit + java.nio.file.Path tablePath = Paths.get(basePath.toString(), "raw_trips"); + + // create hudi table and insert data to it + // create only one file + File partitionDir1 = InputFormatTestUtil + .prepareMultiPartitionTable(basePath, baseFileFormat, 1, "100", "1"); + createCommitFile(basePath, "100", "2016/05/1"); + + // insert new data to partition "2016/05/11" + // create only one file + File partitionDir2 = InputFormatTestUtil + .prepareMultiPartitionTable(basePath, baseFileFormat, 1, "100", "11"); + createCommitFile(basePath, "101", "2016/05/11"); + + + // now partitionDir2.getPath() contains partitionDir1.getPath(), so the HUDI-1817 issue can occur + assertTrue(partitionDir2.getPath().contains(partitionDir1.getPath())); + + // set partitionDir2 as the input path of the current inputFormat + FileInputFormat.setInputPaths(jobConf, partitionDir2.getPath()); + + // set incremental startCommit=0 and numberOfCommitsToPull=3 to pull all the data from the hudi table + InputFormatTestUtil.setupIncremental(jobConf, "0", 3); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(1, files.length, + "We should get one file from partition: " + partitionDir2.getPath()); + } + + @Test + public void testIncrementalEmptyPartitions() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100"); + createCommitFile(basePath, "100", "2016/05/01"); + + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + + InputFormatTestUtil.setupIncremental(jobConf, "000", 1); + + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, + "We should include only 1 commit 100 when returning incremental pull with 
start commit time as 100"); + ensureFilesInCommit("Pulling 1 commits from 000, should get us the 10 files from 100 commit", files, "100", 10); + + // Add new commit only to a new partition + partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "200"); + createCommitFile(basePath, "200", "2017/05/01"); + + InputFormatTestUtil.setupIncremental(jobConf, "100", 1); + files = inputFormat.listStatus(jobConf); + + assertEquals(0, files.length, + "We should exclude commit 200 when returning incremental pull with start commit time as 100 as filePaths does not include new partition"); } private void createCommitFile(java.nio.file.Path basePath, String commitNumber, String partitionPath) @@ -318,7 +553,7 @@ public void testIncrementalWithMultipleCommits() throws IOException { ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200", 1); } - @Disabled("enable this after enabling predicate pushdown") + @Disabled("enable this after enabling predicate push down") @Test public void testPredicatePushDown() throws IOException { // initial commit @@ -340,7 +575,7 @@ public void testPredicatePushDown() throws IOException { // check whether we have 2 records at this point ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2, 2, 2); - // Make sure we have the 10 records if we roll back the stattime + // Make sure we have the 10 records if we roll back the start time InputFormatTestUtil.setupIncremental(jobConf, "0", 2); ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more", commit1, 8, 10); @@ -350,19 +585,19 @@ public void testPredicatePushDown() throws IOException { @Test public void testGetIncrementalTableNames() throws IOException { - String[] expectedincrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"}; + String[] expectedIncrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"}; JobConf conf = new JobConf(); - String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[0]); + String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[0]); conf.set(incrementalMode1, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); - String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[1]); - conf.set(incrementalMode2,HoodieHiveUtils.INCREMENTAL_SCAN_MODE); + String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[1]); + conf.set(incrementalMode2, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); String incrementalMode3 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.model_trips"); conf.set(incrementalMode3, HoodieHiveUtils.INCREMENTAL_SCAN_MODE.toLowerCase()); - String defaultmode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips"); - conf.set(defaultmode, HoodieHiveUtils.DEFAULT_SCAN_MODE); - List actualincrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf)); - for (String expectedincrTable : expectedincrTables) { - assertTrue(actualincrTables.contains(expectedincrTable)); + String defaultMode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips"); + conf.set(defaultMode, HoodieHiveUtils.DEFAULT_SCAN_MODE); + List actualIncrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf)); + for (String expectedIncrTable : expectedIncrTables) { + 
assertTrue(actualIncrTables.contains(expectedIncrTable)); } } @@ -403,6 +638,49 @@ public void testIncrementalWithPendingCompaction() throws IOException { } + @Test + public void testSnapshotPreCommitValidate() throws IOException { + // initial commit + File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100"); + createCommitFile(basePath, "100", "2016/05/01"); + + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + FileStatus[] files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, "Snapshot read must return all files in partition"); + + // add more files + InputFormatTestUtil.simulateInserts(partitionDir, baseFileExtension, "fileId2-", 5, "200"); + FileCreateUtils.createInflightCommit(basePath.toString(), "200"); + + // Verify that validate mode reads uncommitted files + InputFormatTestUtil.setupSnapshotIncludePendingCommits(jobConf, "200"); + files = inputFormat.listStatus(jobConf); + assertEquals(15, files.length, "Must return uncommitted files"); + ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files written at 200", files, "200", 5); + ensureFilesInCommit("Pulling 1 commit from 100, should get us the 10 files committed at 100", files, "100", 10); + + try { + // Verify that validate mode throws an error with an invalid commit time + InputFormatTestUtil.setupSnapshotIncludePendingCommits(jobConf, "300"); + inputFormat.listStatus(jobConf); + fail("Expected list status to fail when validate is called with unknown timestamp"); + } catch (HoodieIOException e) { + // expected because validate is called with invalid instantTime + } + + // Create a new JobConf object because the old one has hoodie.%.consume.commit set + jobConf = new JobConf(); + inputFormat.setConf(jobConf); + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + + // verify that snapshot mode skips uncommitted files + InputFormatTestUtil.setupSnapshotScanMode(jobConf); + files = inputFormat.listStatus(jobConf); + assertEquals(10, files.length, "Snapshot scan mode must not return uncommitted files"); + ensureFilesInCommit("Pulling 1 commit from 100, should get us the 10 files committed at 100", files, "100", 10); + } + private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit, int totalExpected) throws IOException { int actualCount = 0; diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java index ba88df3e4c4e8..260afd5ced03a 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieROTablePathFilter.java @@ -68,9 +68,11 @@ public void testHoodiePaths() throws Exception { assertFalse(pathFilter.accept(testTable.getInflightCommitFilePath("003"))); assertFalse(pathFilter.accept(testTable.getRequestedCompactionFilePath("004"))); assertFalse(pathFilter.accept(new Path("file:///" + basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"))); + assertFalse(pathFilter.accept(new Path("file:///" + basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/hoodie.properties"))); assertFalse(pathFilter.accept(new Path("file:///" + basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME))); assertEquals(1, pathFilter.metaClientCache.size()); + assertEquals(0, pathFilter.nonHoodiePathCache.size(), "NonHoodiePathCache size should be 0"); } @Test @@ -82,6 
+84,7 @@ public void testNonHoodiePaths() throws IOException { java.nio.file.Path path2 = Paths.get(basePath, "nonhoodiefolder/somefile"); Files.createFile(path2); assertTrue(pathFilter.accept(new Path(path2.toUri()))); + assertEquals(2, pathFilter.nonHoodiePathCache.size(), "NonHoodiePathCache size should be 2"); } @Test @@ -93,5 +96,6 @@ public void testPartitionPathsAsNonHoodiePaths() throws Exception { Path partitionPath2 = testTable.getPartitionPath(p2).getParent(); assertTrue(pathFilter.accept(partitionPath1), "Directories should be accepted"); assertTrue(pathFilter.accept(partitionPath2), "Directories should be accepted"); + assertEquals(2, pathFilter.nonHoodiePathCache.size(), "NonHoodiePathCache size should be 2"); } } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java index 3a8b19744f228..0be736b2a043d 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestInputPathHandler.java @@ -23,12 +23,14 @@ import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.mapred.JobConf; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -54,6 +56,12 @@ public class TestInputPathHandler { // non Hoodie table public static final String TRIPS_STATS_TEST_NAME = "trips_stats"; + // empty snapshot table + public static final String EMPTY_SNAPSHOT_TEST_NAME = "empty_snapshot"; + + // empty incremental table + public static final String EMPTY_INCREMENTAL_TEST_NAME = "empty_incremental"; + @TempDir static java.nio.file.Path parentPath; @@ -65,6 +73,8 @@ public class TestInputPathHandler { private static String basePathTable2 = null; private static String basePathTable3 = null; private static String basePathTable4 = null; // non hoodie Path + private static String basePathTable5 = null; + private static String basePathTable6 = null; private static List incrementalTables; private static List incrementalPaths; private static List snapshotPaths; @@ -93,7 +103,7 @@ public static void setUpDFS() throws IOException { public static void cleanUp() throws Exception { if (hdfsTestService != null) { hdfsTestService.stop(); - dfsCluster.shutdown(); + dfsCluster.shutdown(true, true); dfsCluster = null; dfs = null; hdfsTestService = null; @@ -108,6 +118,9 @@ static void initTables() throws IOException { basePathTable2 = parentPath.resolve(MODEL_TRIPS_TEST_NAME).toAbsolutePath().toString(); basePathTable3 = parentPath.resolve(ETL_TRIPS_TEST_NAME).toAbsolutePath().toString(); basePathTable4 = parentPath.resolve(TRIPS_STATS_TEST_NAME).toAbsolutePath().toString(); + String tempPath = "/tmp/"; + basePathTable5 = tempPath + EMPTY_SNAPSHOT_TEST_NAME; + basePathTable6 = tempPath + EMPTY_INCREMENTAL_TEST_NAME; dfs.mkdirs(new Path(basePathTable1)); initTableType(dfs.getConf(), basePathTable1, RAW_TRIPS_TEST_NAME, HoodieTableType.MERGE_ON_READ); @@ -124,6 +137,12 @@ static void initTables() throws IOException { dfs.mkdirs(new 
Path(basePathTable4)); nonHoodiePaths.addAll(generatePartitions(dfs, basePathTable4)); + initTableType(dfs.getConf(), basePathTable5, EMPTY_SNAPSHOT_TEST_NAME, HoodieTableType.COPY_ON_WRITE); + snapshotPaths.add(new Path(basePathTable5)); + + initTableType(dfs.getConf(), basePathTable6, EMPTY_INCREMENTAL_TEST_NAME, HoodieTableType.MERGE_ON_READ); + incrementalPaths.add(new Path(basePathTable6)); + inputPaths.addAll(incrementalPaths); inputPaths.addAll(snapshotPaths); inputPaths.addAll(nonHoodiePaths); @@ -131,14 +150,15 @@ static void initTables() throws IOException { incrementalTables = new ArrayList<>(); incrementalTables.add(RAW_TRIPS_TEST_NAME); incrementalTables.add(MODEL_TRIPS_TEST_NAME); + incrementalTables.add(EMPTY_INCREMENTAL_TEST_NAME); } static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableName, HoodieTableType tableType) throws IOException { Properties properties = new Properties(); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName); - properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name()); - properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, HoodieAvroPayload.class.getName()); + properties.setProperty(HoodieTableConfig.NAME.key(), tableName); + properties.setProperty(HoodieTableConfig.TYPE.key(), tableType.name()); + properties.setProperty(HoodieTableConfig.PAYLOAD_CLASS_NAME.key(), HoodieAvroPayload.class.getName()); return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties); } @@ -169,6 +189,21 @@ public void testInputPathHandler() throws IOException { assertTrue(actualComparesToExpected(actualPaths, nonHoodiePaths)); } + @Test + public void testInputPathHandlerWithGloballyReplicatedTimeStamp() throws IOException { + JobConf jobConf = new JobConf(); + jobConf.set(HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP, "1"); + inputPathHandler = new InputPathHandler(dfs.getConf(), inputPaths.toArray( + new Path[inputPaths.size()]), incrementalTables); + List actualPaths = inputPathHandler.getGroupedIncrementalPaths().values().stream() + .flatMap(List::stream).collect(Collectors.toList()); + assertTrue(actualComparesToExpected(actualPaths, incrementalPaths)); + actualPaths = inputPathHandler.getSnapshotPaths(); + assertTrue(actualComparesToExpected(actualPaths, snapshotPaths)); + actualPaths = inputPathHandler.getNonHoodieInputPaths(); + assertTrue(actualComparesToExpected(actualPaths, nonHoodiePaths)); + } + private boolean actualComparesToExpected(List actualPaths, List expectedPaths) { if (actualPaths.size() != expectedPaths.size()) { return false; diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/functional/TestHoodieCombineHiveInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/functional/TestHoodieCombineHiveInputFormat.java deleted file mode 100644 index 5150d6abce7ae..0000000000000 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/functional/TestHoodieCombineHiveInputFormat.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.hadoop.functional; - -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.log.HoodieLogFormat; -import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.testutils.HoodieTestUtils; -import org.apache.hudi.common.testutils.SchemaTestUtil; -import org.apache.hudi.common.testutils.minicluster.MiniClusterUtil; -import org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat; -import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; - -import org.apache.avro.Schema; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.mr.ExecMapper; -import org.apache.hadoop.hive.ql.plan.MapredWork; -import org.apache.hadoop.hive.ql.plan.PartitionDesc; -import org.apache.hadoop.hive.ql.plan.TableDesc; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.IOException; -import java.util.LinkedHashMap; - -import static org.apache.hadoop.hive.ql.exec.Utilities.HAS_MAP_WORK; -import static org.apache.hadoop.hive.ql.exec.Utilities.MAPRED_MAPPER_CLASS; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class TestHoodieCombineHiveInputFormat extends HoodieCommonTestHarness { - - private JobConf jobConf; - private FileSystem fs; - private Configuration hadoopConf; - - @BeforeAll - public static void setUpClass() throws IOException, InterruptedException { - // Append is not supported in LocalFileSystem. HDFS needs to be setup. 
- MiniClusterUtil.setUp(); - } - - @AfterAll - public static void tearDownClass() { - MiniClusterUtil.shutdown(); - } - - @BeforeEach - public void setUp() throws IOException, InterruptedException { - this.fs = MiniClusterUtil.fileSystem; - jobConf = new JobConf(); - hadoopConf = HoodieTestUtils.getDefaultHadoopConf(); - assertTrue(fs.mkdirs(new Path(tempDir.toAbsolutePath().toString()))); - HoodieTestUtils.init(MiniClusterUtil.configuration, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); - } - - @Test - @Disabled - public void testHoodieRealtimeCombineHoodieInputFormat() throws Exception { - - Configuration conf = new Configuration(); - // initial commit - Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); - HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); - String commitTime = "100"; - final int numRecords = 1000; - // Create 3 parquet files with 1000 records each - File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime); - InputFormatTestUtil.commit(tempDir, commitTime); - - // insert 1000 update records to log file 0 - String newCommitTime = "101"; - HoodieLogFormat.Writer writer = - InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", commitTime, newCommitTime, - numRecords, numRecords, 0); - writer.close(); - // insert 1000 update records to log file 1 - writer = - InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", commitTime, newCommitTime, - numRecords, numRecords, 0); - writer.close(); - // insert 1000 update records to log file 2 - writer = - InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, - numRecords, numRecords, 0); - writer.close(); - - TableDesc tblDesc = Utilities.defaultTd; - // Set the input format - tblDesc.setInputFileFormatClass(HoodieCombineHiveInputFormat.class); - PartitionDesc partDesc = new PartitionDesc(tblDesc, null); - LinkedHashMap pt = new LinkedHashMap<>(); - pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc); - MapredWork mrwork = new MapredWork(); - mrwork.getMapWork().setPathToPartitionInfo(pt); - Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString()); - Utilities.setMapRedWork(conf, mrwork, mapWorkPath); - jobConf = new JobConf(conf); - // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); - jobConf.set(HAS_MAP_WORK, "true"); - // The following config tells Hive to choose ExecMapper to read the MAP_WORK - jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName()); - // setting the split size to be 3 to create one split for 3 file groups - jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "3"); - - HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat(); - String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double"; - InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes); - InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1); - // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups - assertEquals(1, splits.length); - RecordReader recordReader = - combineHiveInputFormat.getRecordReader(splits[0], jobConf, null); - NullWritable nullWritable = recordReader.createKey(); - ArrayWritable arrayWritable = recordReader.createValue(); - int counter = 0; - while 
(recordReader.next(nullWritable, arrayWritable)) { - // read over all the splits - counter++; - } - // should read out 3 splits, each for file0, file1, file2 containing 1000 records each - assertEquals(3000, counter); - } - -} diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/hive/TestHoodieCombineHiveInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/hive/TestHoodieCombineHiveInputFormat.java new file mode 100644 index 0000000000000..9b26a7915dd44 --- /dev/null +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/hive/TestHoodieCombineHiveInputFormat.java @@ -0,0 +1,395 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop.hive; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.testutils.minicluster.MiniClusterUtil; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.exec.mr.ExecMapper; +import org.apache.hadoop.hive.ql.io.IOContextMap; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; 
+import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.hadoop.hive.ql.exec.Utilities.HAS_MAP_WORK; +import static org.apache.hadoop.hive.ql.exec.Utilities.MAPRED_MAPPER_CLASS; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieCombineHiveInputFormat extends HoodieCommonTestHarness { + + private JobConf jobConf; + private FileSystem fs; + private Configuration hadoopConf; + + @BeforeAll + public static void setUpClass() throws IOException, InterruptedException { + // Append is not supported in LocalFileSystem. HDFS needs to be setup. + MiniClusterUtil.setUp(); + } + + @AfterAll + public static void tearDownClass() { + MiniClusterUtil.shutdown(); + } + + @BeforeEach + public void setUp() throws IOException, InterruptedException { + this.fs = MiniClusterUtil.fileSystem; + jobConf = new JobConf(); + hadoopConf = HoodieTestUtils.getDefaultHadoopConf(); + assertTrue(fs.mkdirs(new Path(tempDir.toAbsolutePath().toString()))); + HoodieTestUtils.init(MiniClusterUtil.configuration, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); + } + + @Test + public void multiPartitionReadersRealtimeCombineHoodieInputFormat() throws Exception { + // test for HUDI-1718 + Configuration conf = new Configuration(); + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); + String commitTime = "100"; + final int numRecords = 1000; + // Create 3 partitions, each partition holds one parquet file and 1000 records + List partitionDirs = InputFormatTestUtil + .prepareMultiPartitionedParquetTable(tempDir, schema, 3, numRecords, commitTime, HoodieTableType.MERGE_ON_READ); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata)); + + TableDesc tblDesc = Utilities.defaultTd; + // Set the input format + tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class); + LinkedHashMap pt = new LinkedHashMap<>(); + LinkedHashMap> talias = new LinkedHashMap<>(); + + PartitionDesc partDesc = new PartitionDesc(tblDesc, null); + + pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc); + + ArrayList arrayList = new ArrayList<>(); + arrayList.add(tempDir.toAbsolutePath().toString()); + talias.put(new Path(tempDir.toAbsolutePath().toString()), arrayList); + + MapredWork mrwork = new MapredWork(); + mrwork.getMapWork().setPathToPartitionInfo(pt); + mrwork.getMapWork().setPathToAliases(talias); + + Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString()); + Utilities.setMapRedWork(conf, mrwork, mapWorkPath); + jobConf = new JobConf(conf); + // Add three partition path to InputPaths + Path[] partitionDirArray = new Path[partitionDirs.size()]; + partitionDirs.stream().map(p -> new Path(p.getPath())).collect(Collectors.toList()).toArray(partitionDirArray); + FileInputFormat.setInputPaths(jobConf, partitionDirArray); + jobConf.set(HAS_MAP_WORK, "true"); + // The following config tells Hive to choose ExecMapper to read the MAP_WORK + jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName()); + // setting the split size to be 3 to create one split for 3 file groups + 
jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000"); + + HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat(); + String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double"; + InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes); + + InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1); + // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups + assertEquals(1, splits.length); + + RecordReader recordReader = + combineHiveInputFormat.getRecordReader(splits[0], jobConf, null); + NullWritable nullWritable = recordReader.createKey(); + ArrayWritable arrayWritable = recordReader.createValue(); + int counter = 0; + + HoodieCombineRealtimeHiveSplit hiveSplit = (HoodieCombineRealtimeHiveSplit)splits[0]; + HoodieCombineRealtimeFileSplit fileSplit = (HoodieCombineRealtimeFileSplit)hiveSplit.getInputSplitShim(); + List realtimeFileSplits = fileSplit.getRealtimeFileSplits(); + + while (recordReader.next(nullWritable, arrayWritable)) { + // since each file holds 1000 records, when counter % 1000 == 0, + // HoodieCombineRealtimeRecordReader will switch reader internal + // Hive use ioctx to extract partition info, when switch reader, ioctx should be updated. + if (counter < 1000) { + assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(0).getPath().toString()); + } else if (counter < 2000) { + assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(1).getPath().toString()); + } else { + assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(2).getPath().toString()); + } + counter++; + } + // should read out 3 splits, each for file0, file1, file2 containing 1000 records each + assertEquals(3000, counter); + recordReader.close(); + } + + @Test + public void multiLevelPartitionReadersRealtimeCombineHoodieInputFormat() throws Exception { + // test for HUDI-1718 + Configuration conf = new Configuration(); + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); + String commitTime = "100"; + final int numRecords = 1000; + // Create 3 parquet files with 1000 records each + File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata)); + + TableDesc tblDesc = Utilities.defaultTd; + // Set the input format + tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class); + LinkedHashMap pt = new LinkedHashMap<>(); + LinkedHashMap> talias = new LinkedHashMap<>(); + LinkedHashMap partSpec = new LinkedHashMap<>(); + // add three level partitions info + partSpec.put("year", "2016"); + partSpec.put("month", "05"); + partSpec.put("day", "01"); + PartitionDesc partDesc = new PartitionDesc(tblDesc, partSpec); + + pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc); + + ArrayList arrayList = new ArrayList<>(); + arrayList.add(tempDir.toAbsolutePath().toString()); + talias.put(new Path(tempDir.toAbsolutePath().toString()), 
arrayList); + + MapredWork mrwork = new MapredWork(); + mrwork.getMapWork().setPathToPartitionInfo(pt); + mrwork.getMapWork().setPathToAliases(talias); + + Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString()); + Utilities.setMapRedWork(conf, mrwork, mapWorkPath); + jobConf = new JobConf(conf); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + jobConf.set(HAS_MAP_WORK, "true"); + // The following config tells Hive to choose ExecMapper to read the MAP_WORK + jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName()); + // setting the split size to be 3 to create one split for 3 file groups + jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000"); + + HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat(); + String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double"; + InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes); + // unset META_TABLE_PARTITION_COLUMNS to trigger HUDI-1718 + jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""); + InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1); + // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups + assertEquals(1, splits.length); + + // if HUDI-1718 is not fixed, the follow code will throw exception + RecordReader recordReader = + combineHiveInputFormat.getRecordReader(splits[0], jobConf, null); + NullWritable nullWritable = recordReader.createKey(); + ArrayWritable arrayWritable = recordReader.createValue(); + int counter = 0; + while (recordReader.next(nullWritable, arrayWritable)) { + // read over all the splits + counter++; + } + // should read out 3 splits, each for file0, file1, file2 containing 1000 records each + assertEquals(3000, counter); + recordReader.close(); + } + + @Test + public void testMultiReaderRealtimeCombineHoodieInputFormat() throws Exception { + // test for hudi-1722 + Configuration conf = new Configuration(); + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); + String commitTime = "100"; + final int numRecords = 1000; + // Create 3 parquet files with 1000 records each + File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata)); + + String newCommitTime = "101"; + // to trigger the bug of HUDI-1772, only update fileid2 + // insert 1000 update records to log file 2 + // now fileid0, fileid1 has no log files, fileid2 has log file + HoodieLogFormat.Writer writer = + InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, + numRecords, numRecords, 0); + writer.close(); + + TableDesc tblDesc = Utilities.defaultTd; + // Set the input format + tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class); + PartitionDesc partDesc = new PartitionDesc(tblDesc, null); + LinkedHashMap pt = new LinkedHashMap<>(); + LinkedHashMap> tableAlias = new LinkedHashMap<>(); + ArrayList alias = new ArrayList<>(); + 
alias.add(tempDir.toAbsolutePath().toString()); + tableAlias.put(new Path(tempDir.toAbsolutePath().toString()), alias); + pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc); + + MapredWork mrwork = new MapredWork(); + mrwork.getMapWork().setPathToPartitionInfo(pt); + mrwork.getMapWork().setPathToAliases(tableAlias); + Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString()); + Utilities.setMapRedWork(conf, mrwork, mapWorkPath); + jobConf = new JobConf(conf); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + jobConf.set(HAS_MAP_WORK, "true"); + // The following config tells Hive to choose ExecMapper to read the MAP_WORK + jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName()); + // set SPLIT_MAXSIZE larger to create one split for 3 files groups + jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000"); + + HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat(); + String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double"; + InputFormatTestUtil.setProjectFieldsForInputFormat(jobConf, schema, tripsHiveColumnTypes); + InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1); + // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups + assertEquals(1, splits.length); + RecordReader recordReader = + combineHiveInputFormat.getRecordReader(splits[0], jobConf, null); + NullWritable nullWritable = recordReader.createKey(); + ArrayWritable arrayWritable = recordReader.createValue(); + int counter = 0; + while (recordReader.next(nullWritable, arrayWritable)) { + // read over all the splits + counter++; + } + // should read out 3 splits, each for file0, file1, file2 containing 1000 records each + assertEquals(3000, counter); + recordReader.close(); + } + + @Test + @Disabled + public void testHoodieRealtimeCombineHoodieInputFormat() throws Exception { + + Configuration conf = new Configuration(); + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); + String commitTime = "100"; + final int numRecords = 1000; + // Create 3 parquet files with 1000 records each + File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata)); + + // insert 1000 update records to log file 0 + String newCommitTime = "101"; + HoodieLogFormat.Writer writer = + InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", commitTime, newCommitTime, + numRecords, numRecords, 0); + writer.close(); + // insert 1000 update records to log file 1 + writer = + InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", commitTime, newCommitTime, + numRecords, numRecords, 0); + writer.close(); + // insert 1000 update records to log file 2 + writer = + InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, + numRecords, numRecords, 0); + writer.close(); + + TableDesc tblDesc = Utilities.defaultTd; + // Set the input format + 
tblDesc.setInputFileFormatClass(HoodieCombineHiveInputFormat.class); + PartitionDesc partDesc = new PartitionDesc(tblDesc, null); + LinkedHashMap pt = new LinkedHashMap<>(); + pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc); + MapredWork mrwork = new MapredWork(); + mrwork.getMapWork().setPathToPartitionInfo(pt); + Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString()); + Utilities.setMapRedWork(conf, mrwork, mapWorkPath); + jobConf = new JobConf(conf); + // Add the paths + FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + jobConf.set(HAS_MAP_WORK, "true"); + // The following config tells Hive to choose ExecMapper to read the MAP_WORK + jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName()); + // setting the split size to be 3 to create one split for 3 file groups + jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "3"); + + HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat(); + String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double"; + InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes); + InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1); + // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups + assertEquals(1, splits.length); + RecordReader recordReader = + combineHiveInputFormat.getRecordReader(splits[0], jobConf, null); + NullWritable nullWritable = recordReader.createKey(); + ArrayWritable arrayWritable = recordReader.createValue(); + int counter = 0; + while (recordReader.next(nullWritable, arrayWritable)) { + // read over all the splits + counter++; + } + // should read out 3 splits, each for file0, file1, file2 containing 1000 records each + assertEquals(3000, counter); + } + +} diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java new file mode 100644 index 0000000000000..6a5404762a9c7 --- /dev/null +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.PathWithBootstrapFileStatus; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieMergeOnReadTableInputFormat { + + @TempDir + java.nio.file.Path tempDir; + private FileSystem fs; + + @BeforeEach + void setUp() throws IOException { + fs = FileSystem.get(tempDir.toUri(), new Configuration()); + } + + @AfterEach + void tearDown() throws IOException { + fs.close(); + } + + @Test + void pathNotSplitableForBootstrapScenario() throws IOException { + URI source = Files.createTempFile(tempDir, "source", ".parquet").toUri(); + URI target = Files.createTempFile(tempDir, "target", ".parquet").toUri(); + HoodieRealtimePath rtPath = new HoodieRealtimePath(new Path("foo"), "bar", target.toString(), Collections.emptyList(), "000", false, Option.empty()); + assertTrue(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath)); + + PathWithBootstrapFileStatus path = new PathWithBootstrapFileStatus(new Path(target), fs.getFileStatus(new Path(source))); + rtPath.setPathWithBootstrapFileStatus(path); + assertFalse(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath), "Path for bootstrap should not be splitable."); + } + + @Test + void pathNotSplitableIfContainsDeltaFiles() throws IOException { + URI basePath = Files.createTempFile(tempDir, "target", ".parquet").toUri(); + HoodieRealtimePath rtPath = new HoodieRealtimePath(new Path("foo"), "bar", basePath.toString(), Collections.emptyList(), "000", false, Option.empty()); + assertTrue(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath), "Path only contains the base file should be splittable"); + + URI logPath = Files.createTempFile(tempDir, ".test", ".log.4_1-149-180").toUri(); + HoodieLogFile logFile = new HoodieLogFile(fs.getFileStatus(new Path(logPath))); + rtPath = new HoodieRealtimePath(new Path("foo"), "bar", basePath.toString(), Collections.singletonList(logFile), "000", false, Option.empty()); + assertFalse(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath), "Path contains log files should not be splittable."); + } +} diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java index 4e0adb0e3224d..d2c4f1be61d9f 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java @@ -18,6 +18,9 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; + import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileSplit; @@ -31,7 +34,11 @@ import org.mockito.stubbing.Answer; import java.io.DataInput; +import java.io.DataInputStream; import 
java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Collections; @@ -54,6 +61,7 @@ public class TestHoodieRealtimeFileSplit { private HoodieRealtimeFileSplit split; private String basePath; + private List deltaLogFiles; private List deltaLogPaths; private String fileSplitName; private FileSplit baseFileSplit; @@ -62,12 +70,13 @@ public class TestHoodieRealtimeFileSplit { @BeforeEach public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception { basePath = tempDir.toAbsolutePath().toString(); + deltaLogFiles = Collections.singletonList(new HoodieLogFile(new Path(basePath + "/1.log"), 0L)); deltaLogPaths = Collections.singletonList(basePath + "/1.log"); fileSplitName = basePath + "/test.file"; baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {}); maxCommitTime = "10001"; - split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogPaths, maxCommitTime); + split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogFiles, maxCommitTime, false, Option.empty()); } @Test @@ -97,6 +106,7 @@ public void testWrite() throws IOException { inorder.verify(out, times(1)).writeInt(eq(deltaLogPaths.size())); inorder.verify(out, times(1)).writeInt(eq(deltaLogPaths.get(0).length())); inorder.verify(out, times(1)).write(aryEq(deltaLogPaths.get(0).getBytes(StandardCharsets.UTF_8))); + inorder.verify(out, times(1)).writeBoolean(false); // verify there are no more interactions happened on the mocked object inorder.verifyNoMoreInteractions(); } @@ -151,4 +161,18 @@ public Void answer(InvocationOnMock invocation) throws Throwable { assertEquals(deltaLogPaths, read.getDeltaLogPaths()); assertEquals(split.toString(), read.toString()); } + + @Test + public void testSerDe(@TempDir java.nio.file.Path tempDir) throws IOException { + final HoodieRealtimeFileSplit original = split; + java.nio.file.Path tempFilePath = tempDir.resolve("tmp.txt"); + try (DataOutputStream out = new DataOutputStream(new FileOutputStream(tempFilePath.toFile()))) { + original.write(out); + } + HoodieRealtimeFileSplit deserialized = new HoodieRealtimeFileSplit(); + try (DataInputStream in = new DataInputStream(new FileInputStream(tempFilePath.toFile()))) { + deserialized.readFields(in); + } + assertEquals(original.toString(), deserialized.toString()); + } } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java index 5a735ce1b5e97..f334bbf3bc977 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java @@ -19,23 +19,39 @@ package org.apache.hudi.hadoop.realtime; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; +import 
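// --- Illustrative sketch (editorial note, not part of the patch above) ---
// The new testSerDe round-trips a HoodieRealtimeFileSplit through write()/readFields() via a temp
// file. The same contract can be exercised purely in memory for any Hadoop Writable; a minimal
// variant, assuming only the standard Writable API. The helper name WritableRoundTrip is hypothetical.
import org.apache.hadoop.io.Writable;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

final class WritableRoundTrip {
  private WritableRoundTrip() {
  }

  /** Serializes 'source', deserializes the bytes into 'target' (a fresh instance), and returns it. */
  static <T extends Writable> T roundTrip(T source, T target) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (DataOutputStream out = new DataOutputStream(bytes)) {
      source.write(out);
    }
    try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      target.readFields(in);
    }
    return target;
  }
}
// e.g. assertEquals(original.toString(),
//     WritableRoundTrip.roundTrip(original, new HoodieRealtimeFileSplit()).toString());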
org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.testutils.FileCreateUtils; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; +import org.apache.hudi.hadoop.RealtimeFileStatus; import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; +import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; +import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; +import org.apache.avro.generic.GenericRecord; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; @@ -51,39 +67,55 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.net.URI; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.apache.hudi.hadoop.realtime.HoodieRealtimeRecordReader.REALTIME_SKIP_MERGE_PROP; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.params.provider.Arguments.arguments; public class TestHoodieRealtimeRecordReader { private static final String PARTITION_COLUMN = "datestr"; - private JobConf jobConf; + private JobConf baseJobConf; private FileSystem fs; private Configuration hadoopConf; @BeforeEach public void setUp() { - jobConf = new JobConf(); - jobConf.set(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, String.valueOf(1024 * 1024)); hadoopConf = HoodieTestUtils.getDefaultHadoopConf(); - fs = FSUtils.getFs(basePath.toString(), hadoopConf); + hadoopConf.set("fs.defaultFS", "file:///"); + hadoopConf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + baseJobConf = new JobConf(hadoopConf); + baseJobConf.set(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, 
String.valueOf(1024 * 1024)); + fs = FSUtils.getFs(basePath.toUri().toString(), baseJobConf); } @TempDir @@ -96,16 +128,6 @@ private Writer writeLogFile(File partitionDir, Schema schema, String fileId, Str 0); } - @Test - public void testReader() throws Exception { - testReader(true); - } - - @Test - public void testNonPartitionedReader() throws Exception { - testReader(false); - } - private void setHiveColumnNameProps(List fields, JobConf jobConf, boolean isPartitioned) { String names = fields.stream().map(Field::name).collect(Collectors.joining(",")); String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); @@ -121,16 +143,49 @@ private void setHiveColumnNameProps(List fields, JobConf jobConf, jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveOrderedColumnNames); } - private void testReader(boolean partitioned) throws Exception { + @ParameterizedTest + @MethodSource("testArguments") + public void testReader(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean partitioned) throws Exception { + testReaderInternal(diskMapType, isCompressionEnabled, partitioned); + } + + @Test + public void testHFileInlineReader() throws Exception { + testReaderInternal(ExternalSpillableMap.DiskMapType.BITCASK, false, false, + HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK); + } + + @Test + public void testParquetInlineReader() throws Exception { + testReaderInternal(ExternalSpillableMap.DiskMapType.BITCASK, false, false, + HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK); + } + + private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean partitioned) throws Exception { + testReaderInternal(diskMapType, isCompressionEnabled, partitioned, HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK); + } + + private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled, + boolean partitioned, HoodieLogBlock.HoodieLogBlockType logBlockType) throws Exception { // initial commit Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); String baseInstant = "100"; - File partitionDir = partitioned ? InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, 100, baseInstant) - : InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant); - InputFormatTestUtil.commit(basePath, baseInstant); + File partitionDir = partitioned ? 
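// --- Illustrative sketch (editorial note, not part of the patch above) ---
// testReader is now a @ParameterizedTest fed by @MethodSource("testArguments"), which lists every
// (disk map type, compression enabled, partitioned) combination by hand further below. Assuming the
// full Cartesian product is intended, the matrix could equally be generated; the class and method
// names here are hypothetical.
import org.apache.hudi.common.util.collection.ExternalSpillableMap;

import org.junit.jupiter.params.provider.Arguments;

import java.util.Arrays;
import java.util.stream.Stream;

final class ReaderTestMatrix {
  private ReaderTestMatrix() {
  }

  // All combinations of DiskMapType x compression-enabled x partitioned.
  static Stream<Arguments> diskMapTypeMatrix() {
    return Arrays.stream(ExternalSpillableMap.DiskMapType.values())
        .flatMap(diskMapType -> Stream.of(false, true)
            .flatMap(isCompressionEnabled -> Stream.of(false, true)
                .map(partitioned -> Arguments.arguments(diskMapType, isCompressionEnabled, partitioned))));
  }
}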
InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, 100, baseInstant, + HoodieTableType.MERGE_ON_READ) + : InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant, + HoodieTableType.MERGE_ON_READ); + + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION); + FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant, commitMetadata); // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); List> logVersionsWithAction = new ArrayList<>(); logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1)); @@ -158,44 +213,57 @@ private void testReader(boolean partitioned) throws Exception { } else { writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", baseInstant, - instantTime, 100, 0, logVersion); + instantTime, 120, 0, logVersion, logBlockType); } long size = writer.getCurrentSize(); writer.close(); assertTrue(size > 0, "block - size should be > 0"); + FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata); // create a split with baseFile (parquet file written earlier) and new log file(s) fileSlice.addLogFile(writer.getLogFile()); HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, jobConf), - basePath.toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) - .map(h -> h.getPath().toString()).collect(Collectors.toList()), - instantTime); + new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf), + basePath.toUri().toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) + .collect(Collectors.toList()), + instantTime, + false, + Option.empty()); // create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); - JobConf jobConf = new JobConf(); + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null); + JobConf jobConf = new JobConf(baseJobConf); List fields = schema.getFields(); setHiveColumnNameProps(fields, jobConf, partitioned); + jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType); + jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled); + // validate record reader compaction + long logTmpFileStartTime = System.currentTimeMillis(); HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader); // use reader to read base Parquet File and log file, merge in flight and return latest commit // here all 100 records should be updated, see above + // another 20 new insert records should also output with new commit time. 
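// --- Illustrative sketch (editorial note, not part of the patch above) ---
// The CommitUtils.buildMetadata(...) + FileCreateUtils.createDeltaCommit(...) pair recurs for nearly
// every instant these tests create. A small helper built only from calls already used in this patch
// would keep that in one place; the class name TestCommitHelper is hypothetical.
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.FileCreateUtils;
import org.apache.hudi.common.util.CommitUtils;
import org.apache.hudi.common.util.Option;

import org.apache.avro.Schema;

import java.util.Collections;

final class TestCommitHelper {
  private TestCommitHelper() {
  }

  /** Writes a delta commit with empty write stats, mirroring the test setup above. */
  static void createEmptyDeltaCommit(String basePath, String instantTime, Schema schema) throws Exception {
    HoodieCommitMetadata metadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(),
        Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath, instantTime, metadata);
  }
}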
NullWritable key = recordReader.createKey(); ArrayWritable value = recordReader.createValue(); + int recordCnt = 0; while (recordReader.next(key, value)) { Writable[] values = value.get(); // check if the record written is with latest commit, here "101" assertEquals(latestInstant, values[0].toString()); key = recordReader.createKey(); value = recordReader.createValue(); + recordCnt++; } recordReader.getPos(); assertEquals(1.0, recordReader.getProgress(), 0.05); + assertEquals(120, recordCnt); recordReader.close(); + // the temp file produced by logScanner should be deleted + assertTrue(!getLogTempFile(logTmpFileStartTime, System.currentTimeMillis(), diskMapType.toString()).exists()); } catch (Exception ioe) { throw new HoodieException(ioe.getMessage(), ioe); } @@ -205,6 +273,13 @@ private void testReader(boolean partitioned) throws Exception { } + private File getLogTempFile(long startTime, long endTime, String diskType) { + return Arrays.stream(new File("/tmp").listFiles()) + .filter(f -> f.isDirectory() && f.getName().startsWith("hudi-" + diskType) && f.lastModified() > startTime && f.lastModified() < endTime) + .findFirst() + .orElse(new File("")); + } + @Test public void testUnMergedReader() throws Exception { // initial commit @@ -214,10 +289,13 @@ public void testUnMergedReader() throws Exception { final int numRecords = 1000; final int firstBatchLastRecordKey = numRecords - 1; final int secondBatchLastRecordKey = 2 * numRecords - 1; - File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numRecords, instantTime); - InputFormatTestUtil.commit(basePath, instantTime); + File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numRecords, instantTime, + HoodieTableType.MERGE_ON_READ); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION); + FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata); // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); // insert new records to log file String newCommitTime = "101"; @@ -227,17 +305,17 @@ public void testUnMergedReader() throws Exception { long size = writer.getCurrentSize(); writer.close(); assertTrue(size > 0, "block - size should be > 0"); + FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime, commitMetadata); // create a split with baseFile (parquet file written earlier) and new log file(s) - String logFilePath = writer.getLogFile().getPath().toString(); HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, jobConf), - basePath.toString(), Collections.singletonList(logFilePath), newCommitTime); + new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf), + basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, false, Option.empty()); // create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); - JobConf jobConf = new JobConf(); + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null); + 
JobConf jobConf = new JobConf(baseJobConf); List fields = schema.getFields(); setHiveColumnNameProps(fields, jobConf, true); // Enable merge skipping. @@ -283,18 +361,24 @@ public void testUnMergedReader() throws Exception { recordReader.close(); } - @Test - public void testReaderWithNestedAndComplexSchema() throws Exception { + @ParameterizedTest + @MethodSource("testArguments") + public void testReaderWithNestedAndComplexSchema(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled) throws Exception { // initial commit Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema()); HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); String instantTime = "100"; int numberOfRecords = 100; int numberOfLogRecords = numberOfRecords / 2; - File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numberOfRecords, instantTime); - InputFormatTestUtil.commit(basePath, instantTime); + File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numberOfRecords, + instantTime, HoodieTableType.MERGE_ON_READ); + + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(basePath.toString(), instantTime, Option.of(commitMetadata)); // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); // update files or generate new log file String newCommitTime = "101"; @@ -303,21 +387,25 @@ public void testReaderWithNestedAndComplexSchema() throws Exception { long size = writer.getCurrentSize(); writer.close(); assertTrue(size > 0, "block - size should be > 0"); - InputFormatTestUtil.deltaCommit(basePath, newCommitTime); + commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION); + FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime, commitMetadata); // create a split with baseFile (parquet file written earlier) and new log file(s) - String logFilePath = writer.getLogFile().getPath().toString(); HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, jobConf), - basePath.toString(), Collections.singletonList(logFilePath), newCommitTime); + new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf), + basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, false, Option.empty()); // create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); - JobConf jobConf = new JobConf(); + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null); + JobConf jobConf = new JobConf(baseJobConf); List fields = schema.getFields(); setHiveColumnNameProps(fields, jobConf, true); + jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType); + jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled); + // validate record reader 
compaction HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader); @@ -408,23 +496,29 @@ public void testReaderWithNestedAndComplexSchema() throws Exception { assertEquals("stringArray" + i + recordCommitTimeSuffix, arrayValues[i].toString(), "test value for field: stringArray"); } + reader.close(); } } - @Test - public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception { + @ParameterizedTest + @MethodSource("testArguments") + public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMap.DiskMapType diskMapType, + boolean isCompressionEnabled) throws Exception { // initial commit - List logFilePaths = new ArrayList<>(); + List logFiles = new ArrayList<>(); Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); String instantTime = "100"; int numberOfRecords = 100; int numberOfLogRecords = numberOfRecords / 2; File partitionDir = - InputFormatTestUtil.prepareSimpleParquetTable(basePath, schema, 1, numberOfRecords, instantTime); - InputFormatTestUtil.commit(basePath, instantTime); + InputFormatTestUtil.prepareSimpleParquetTable(basePath, schema, 1, numberOfRecords, + instantTime, HoodieTableType.MERGE_ON_READ); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(basePath.toString(), instantTime, Option.of(commitMetadata)); // Add the paths - FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); List firstSchemaFields = schema.getFields(); // update files and generate new log file but don't commit @@ -434,7 +528,7 @@ public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime, numberOfLogRecords, 0, 1); long size = writer.getCurrentSize(); - logFilePaths.add(writer.getLogFile().getPath().toString()); + logFiles.add(writer.getLogFile()); writer.close(); assertTrue(size > 0, "block - size should be > 0"); @@ -442,19 +536,22 @@ public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception newCommitTime = "102"; writer = InputFormatTestUtil.writeRollbackBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime, "101", 1); - logFilePaths.add(writer.getLogFile().getPath().toString()); + logFiles.add(writer.getLogFile()); writer.close(); - InputFormatTestUtil.deltaCommit(basePath, newCommitTime); + + commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION); + FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata); // create a split with baseFile (parquet file written earlier) and new log file(s) HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( - new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, jobConf), - basePath.toString(), logFilePaths, newCommitTime); + new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf), + basePath.toUri().toString(), logFiles, newCommitTime, false, Option.empty()); // create a RecordReader to be 
used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( - new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null); - JobConf jobConf = new JobConf(); + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null); + JobConf jobConf = new JobConf(baseJobConf); List fields = schema.getFields(); assertFalse(firstSchemaFields.containsAll(fields)); @@ -462,6 +559,9 @@ public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception // Try to read all the fields passed by the new schema setHiveColumnNameProps(fields, jobConf, true); + jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType); + jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled); + HoodieRealtimeRecordReader recordReader; try { // validate record reader compaction @@ -481,5 +581,352 @@ public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception while (recordReader.next(key, value)) { // keep reading } + reader.close(); + } + + @Test + public void testSchemaEvolution() throws Exception { + ExternalSpillableMap.DiskMapType diskMapType = ExternalSpillableMap.DiskMapType.BITCASK; + boolean isCompressionEnabled = true; + // initial commit + List logFiles = new ArrayList<>(); + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); + HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); + String instantTime = "100"; + int numberOfRecords = 100; + int numberOfLogRecords = numberOfRecords / 2; + File partitionDir = + InputFormatTestUtil.prepareSimpleParquetTable(basePath, schema, 1, numberOfRecords, + instantTime, HoodieTableType.MERGE_ON_READ); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(basePath.toString(), instantTime, Option.of(commitMetadata)); + // Add the paths + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); + List firstSchemaFields = schema.getFields(); + + // 2nd commit w/ evolved schema + Schema evolvedSchema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedCompatibleSchema()); + List secondSchemaFields = evolvedSchema.getFields(); + String newCommitTime = "101"; + File partitionDir1 = + InputFormatTestUtil.prepareSimpleParquetTable(basePath, evolvedSchema, 1, numberOfRecords, + instantTime, HoodieTableType.MERGE_ON_READ,"2017","05","01"); + HoodieCommitMetadata commitMetadata1 = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + evolvedSchema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createCommit(basePath.toString(), newCommitTime, Option.of(commitMetadata1)); + // Add the paths + FileInputFormat.setInputPaths(baseJobConf, partitionDir1.getPath()); + + // create a split with baseFile from 1st commit. 
+ HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( + new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf), + basePath.toUri().toString(), logFiles, newCommitTime, false, Option.empty()); + + // create a RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = new MapredParquetInputFormat().getRecordReader( + new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null); + JobConf jobConf = new JobConf(baseJobConf); + + // Try to read all the fields passed by the new schema + setHiveColumnNameProps(secondSchemaFields, jobConf, true); + // This time read only the fields which are part of parquet + HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader); + // use reader to read base Parquet File and log file + NullWritable key = recordReader.createKey(); + ArrayWritable value = recordReader.createValue(); + while (recordReader.next(key, value)) { + // keep reading + } + reader.close(); + } + + private static Stream testArguments() { + // Arg1: ExternalSpillableMap Type, Arg2: isDiskMapCompressionEnabled, Arg3: partitioned + return Stream.of( + arguments(ExternalSpillableMap.DiskMapType.BITCASK, false, false), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false, false), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, true, false), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true, false), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, false, true), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false, true), + arguments(ExternalSpillableMap.DiskMapType.BITCASK, true, true), + arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true, true) + ); + } + + @Test + public void testIncrementalWithOnlylog() throws Exception { + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); + String instantTime = "100"; + final int numRecords = 1000; + File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numRecords, instantTime, + HoodieTableType.MERGE_ON_READ); + createDeltaCommitFile(basePath, instantTime,"2016/05/01", "2016/05/01/fileid0_1-0-1_100.parquet", "fileid0", schema.toString()); + // Add the paths + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); + + // insert new records to log file + try { + String newCommitTime = "102"; + HoodieLogFormat.Writer writer = + InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime, newCommitTime, + numRecords, numRecords, 0); + writer.close(); + createDeltaCommitFile(basePath, newCommitTime,"2016/05/01", "2016/05/01/.fileid0_100.log.1_1-0-1", "fileid0", schema.toString()); + + InputFormatTestUtil.setupIncremental(baseJobConf, "101", 1); + + HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat(); + inputFormat.setConf(baseJobConf); + InputSplit[] splits = inputFormat.getSplits(baseJobConf, 1); + assertEquals(1, splits.length); + JobConf newJobConf = new JobConf(baseJobConf); + List fields = schema.getFields(); + setHiveColumnNameProps(fields, newJobConf, false); + RecordReader reader = inputFormat.getRecordReader(splits[0], newJobConf, Reporter.NULL); + // use reader to read log file. 
+ NullWritable key = reader.createKey(); + ArrayWritable value = reader.createValue(); + while (reader.next(key, value)) { + Writable[] values = value.get(); + // since we set incremental start commit as 101 and commit_number as 1. + // the data belong to commit 102 should be read out. + assertEquals(newCommitTime, values[0].toString()); + key = reader.createKey(); + value = reader.createValue(); + } + reader.close(); + } catch (IOException e) { + throw new HoodieException(e.getMessage(), e); + } + } + + @Test + public void testIncrementalWithReplace() throws Exception { + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); + String baseInstant = "100"; + File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, 100, baseInstant, + HoodieTableType.MERGE_ON_READ); + createDeltaCommitFile(basePath, baseInstant,"2016/05/01", "2016/05/01/fileid0_1-0-1_100.parquet", "fileid0", schema.toString()); + // Add the paths + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); + + InputFormatTestUtil.simulateInserts(partitionDir, ".parquet", "fileid1", 1, "200"); + Map> partitionToReplaceFileIds = new HashMap<>(); + List replacedFileId = new ArrayList<>(); + replacedFileId.add("fileid0"); + partitionToReplaceFileIds.put("2016/05/01", replacedFileId); + createReplaceCommitFile(basePath, + "200","2016/05/01", "2016/05/01/fileid10_1-0-1_200.parquet", "fileid10", partitionToReplaceFileIds); + + InputFormatTestUtil.setupIncremental(baseJobConf, "0", 1); + + HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat(); + inputFormat.setConf(baseJobConf); + InputSplit[] splits = inputFormat.getSplits(baseJobConf, 1); + assertTrue(splits.length == 1); + JobConf newJobConf = new JobConf(baseJobConf); + List fields = schema.getFields(); + setHiveColumnNameProps(fields, newJobConf, false); + newJobConf.set("columns.types", "string,string,string,string,string,string,string,string,bigint,string,string"); + RecordReader reader = inputFormat.getRecordReader(splits[0], newJobConf, Reporter.NULL); + + // use reader to read log file. + NullWritable key = reader.createKey(); + ArrayWritable value = reader.createValue(); + while (reader.next(key, value)) { + Writable[] values = value.get(); + // since we set incremental start commit as 0 and commit_number as 1. + // the data belong to commit 100 should be read out. 
+ assertEquals("100", values[0].toString()); + key = reader.createKey(); + value = reader.createValue(); + } + reader.close(); + } + + private void createReplaceCommitFile( + java.nio.file.Path basePath, + String commitNumber, + String partitionPath, + String filePath, + String fileId, + Map> partitionToReplaceFileIds) throws IOException { + List writeStats = new ArrayList<>(); + HoodieWriteStat writeStat = createHoodieWriteStat(basePath, commitNumber, partitionPath, filePath, fileId); + writeStats.add(writeStat); + HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata(); + replaceMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds); + writeStats.forEach(stat -> replaceMetadata.addWriteStat(partitionPath, stat)); + File file = basePath.resolve(".hoodie").resolve(commitNumber + ".replacecommit").toFile(); + file.createNewFile(); + FileOutputStream fileOutputStream = new FileOutputStream(file); + fileOutputStream.write(replaceMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)); + fileOutputStream.flush(); + fileOutputStream.close(); + } + + private HoodieWriteStat createHoodieWriteStat(java.nio.file.Path basePath, String commitNumber, String partitionPath, String filePath, String fileId) { + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId(fileId); + writeStat.setNumDeletes(0); + writeStat.setNumUpdateWrites(100); + writeStat.setNumWrites(100); + writeStat.setPath(filePath); + writeStat.setFileSizeInBytes(new File(new Path(basePath.toString(), filePath).toString()).length()); + writeStat.setPartitionPath(partitionPath); + writeStat.setTotalLogFilesCompacted(100L); + HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats(); + runtimeStats.setTotalScanTime(100); + runtimeStats.setTotalCreateTime(100); + runtimeStats.setTotalUpsertTime(100); + writeStat.setRuntimeStats(runtimeStats); + return writeStat; + } + + private void createDeltaCommitFile( + java.nio.file.Path basePath, + String commitNumber, + String partitionPath, + String filePath, + String fileId, + String schemaStr) throws IOException { + List writeStats = new ArrayList<>(); + HoodieWriteStat writeStat = createHoodieWriteStat(basePath, commitNumber, partitionPath, filePath, fileId); + writeStats.add(writeStat); + + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + writeStats.forEach(stat -> commitMetadata.addWriteStat(partitionPath, stat)); + if (schemaStr != null) { + commitMetadata.getExtraMetadata().put(HoodieCommitMetadata.SCHEMA_KEY, schemaStr); + } + File file = basePath.resolve(".hoodie").resolve(commitNumber + ".deltacommit").toFile(); + file.createNewFile(); + FileOutputStream fileOutputStream = new FileOutputStream(file); + fileOutputStream.write(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)); + fileOutputStream.flush(); + fileOutputStream.close(); + } + + @Test + public void testLogOnlyReader() throws Exception { + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + URI baseUri = basePath.toUri(); + HoodieTestUtils.init(hadoopConf, baseUri.toString(), HoodieTableType.MERGE_ON_READ); + String baseInstant = "100"; + File partitionDir = InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant, + HoodieTableType.MERGE_ON_READ); + FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant); + // Add the paths + FileInputFormat.setInputPaths(baseJobConf, partitionDir.toURI().toString()); + + FileSlice 
fileSlice = new FileSlice("default", baseInstant, "fileid1"); + try { + // update files or generate new log file + int logVersion = 1; + int baseInstantTs = Integer.parseInt(baseInstant); + String instantTime = String.valueOf(baseInstantTs + logVersion); + HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", baseInstant, + instantTime, 100, 0, logVersion); + long size = writer.getCurrentSize(); + writer.close(); + assertTrue(size > 0, "block - size should be > 0"); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, + schema.toString(), HoodieTimeline.COMMIT_ACTION); + FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata); + // create a split with new log file(s) + fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size)); + RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus( + new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()), + baseUri.toString(), + fileSlice.getLogFiles().collect(Collectors.toList()), + false, + Option.empty()); + realtimeFileStatus.setMaxCommitTime(instantTime); + HoodieRealtimePath realtimePath = (HoodieRealtimePath) realtimeFileStatus.getPath(); + HoodieRealtimeFileSplit split = + new HoodieRealtimeFileSplit(new FileSplit(realtimePath, 0, 0, new String[] {""}), realtimePath); + + JobConf newJobConf = new JobConf(baseJobConf); + List fields = schema.getFields(); + setHiveColumnNameProps(fields, newJobConf, false); + // create a dummy RecordReader to be used by HoodieRealtimeRecordReader + RecordReader reader = new HoodieRealtimeRecordReader(split, newJobConf, new HoodieEmptyRecordReader(split, newJobConf)); + // use reader to read log file. 
+ NullWritable key = reader.createKey(); + ArrayWritable value = reader.createValue(); + while (reader.next(key, value)) { + Writable[] values = value.get(); + assertEquals(instantTime, values[0].toString()); + key = reader.createKey(); + value = reader.createValue(); + } + reader.close(); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + } + + @Test + public void testIncrementalWithCompaction() throws Exception { + // initial commit + Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); + HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ); + String baseInstant = "100"; + File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, 100, baseInstant, + HoodieTableType.MERGE_ON_READ); + createDeltaCommitFile(basePath, baseInstant,"2016/05/01", "2016/05/01/fileid0_1-0-1_100.parquet", "fileid0", schema.toString()); + // Add the paths + FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath()); + + createCompactionFile(basePath, "125"); + + // add inserts after compaction timestamp + InputFormatTestUtil.simulateInserts(partitionDir, ".parquet", "fileId2", 5, "200"); + InputFormatTestUtil.commit(basePath, "200"); + + InputFormatTestUtil.setupIncremental(baseJobConf, "100", 10); + + // verify that incremental reads do NOT show inserts after compaction timestamp + HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat(); + inputFormat.setConf(baseJobConf); + InputSplit[] splits = inputFormat.getSplits(baseJobConf, 1); + assertTrue(splits.length == 0); + } + + @Test + public void testAvroToArrayWritable() throws IOException { + Schema schema = SchemaTestUtil.getEvolvedSchema(); + GenericRecord record = SchemaTestUtil.generateAvroRecordFromJson(schema, 1, "100", "100", false); + ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(record, schema); + assertEquals(schema.getFields().size(), aWritable.get().length); + + // In some queries, generic records that Hudi gets are just part of the full records. + // Here test the case that some fields are missing in the record. 
+ Schema schemaWithMetaFields = HoodieAvroUtils.addMetadataFields(schema); + ArrayWritable aWritable2 = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(record, schemaWithMetaFields); + assertEquals(schemaWithMetaFields.getFields().size(), aWritable2.get().length); + } + + private File createCompactionFile(java.nio.file.Path basePath, String commitTime) + throws IOException { + File file = basePath.resolve(".hoodie") + .resolve(HoodieTimeline.makeRequestedCompactionFileName(commitTime)).toFile(); + assertTrue(file.createNewFile()); + FileOutputStream os = new FileOutputStream(file); + try { + HoodieCompactionPlan compactionPlan = HoodieCompactionPlan.newBuilder().setVersion(2).build(); + // Write empty commit metadata + os.write(TimelineMetadataUtils.serializeCompactionPlan(compactionPlan).get()); + return file; + } finally { + os.close(); + } } } diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java index fe0be469d28fe..a4471845c3e42 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java @@ -22,14 +22,19 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; import org.apache.hudi.common.table.log.block.HoodieCommandBlock; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.util.Option; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.avro.Schema; @@ -37,11 +42,15 @@ import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.mapred.JobConf; import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; import java.io.File; import java.io.IOException; @@ -62,13 +71,50 @@ public class InputFormatTestUtil { private static String TEST_WRITE_TOKEN = "1-0-1"; public static File prepareTable(java.nio.file.Path basePath, HoodieFileFormat baseFileFormat, int numberOfFiles, - String commitNumber) + String commitNumber) throws IOException { + return prepareCustomizedTable(basePath, baseFileFormat, numberOfFiles, commitNumber, false, true, false, null); + } + + public static File prepareCustomizedTable(java.nio.file.Path basePath, HoodieFileFormat baseFileFormat, int numberOfFiles, + String commitNumber, 
boolean useNonPartitionedKeyGen, boolean populateMetaFields, boolean injectData, Schema schema) + throws IOException { + if (useNonPartitionedKeyGen) { + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat, true, "org.apache.hudi.keygen.NonpartitionedKeyGenerator", populateMetaFields); + } else { + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, + baseFileFormat); + } + + java.nio.file.Path partitionPath = useNonPartitionedKeyGen + ? basePath + : basePath.resolve(Paths.get("2016", "05", "01")); + + setupPartition(basePath, partitionPath); + + if (injectData) { + try { + createSimpleData(schema, partitionPath, numberOfFiles, 100, commitNumber); + return partitionPath.toFile(); + } catch (Exception e) { + throw new IOException("Excpetion thrown while writing data ", e); + } + } else { + return simulateInserts(partitionPath.toFile(), baseFileFormat.getFileExtension(), "fileId1", numberOfFiles, + commitNumber); + } + } + + public static File prepareMultiPartitionTable(java.nio.file.Path basePath, HoodieFileFormat baseFileFormat, int numberOfFiles, + String commitNumber, String finalLevelPartitionName) throws IOException { HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE, baseFileFormat); - java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "05", "01")); - Files.createDirectories(partitionPath); - return simulateInserts(partitionPath.toFile(), baseFileFormat.getFileExtension(), "fileId1", numberOfFiles, + + java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "05", finalLevelPartitionName)); + setupPartition(basePath, partitionPath); + + return simulateInserts(partitionPath.toFile(), baseFileFormat.getFileExtension(), "fileId1" + finalLevelPartitionName, numberOfFiles, commitNumber); } @@ -77,7 +123,7 @@ public static File simulateInserts(File partitionPath, String baseFileExtension, throws IOException { for (int i = 0; i < numberOfFiles; i++) { Files.createFile(partitionPath.toPath() - .resolve(FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, fileId + i, baseFileExtension))); + .resolve(FSUtils.makeBaseFileName(commitNumber, TEST_WRITE_TOKEN, fileId + i, baseFileExtension))); } return partitionPath; } @@ -94,7 +140,7 @@ public static void simulateUpdates(File directory, String baseFileExtension, fin List toUpdateList = dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size())); for (File file : toUpdateList) { String fileId = FSUtils.getFileId(file.getName()); - Files.createFile(directory.toPath().resolve(FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId, + Files.createFile(directory.toPath().resolve(FSUtils.makeBaseFileName(newCommit, TEST_WRITE_TOKEN, fileId, baseFileExtension))); } } @@ -110,6 +156,10 @@ public static void deltaCommit(java.nio.file.Path basePath, String commitNumber) } public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) { + setupIncremental(jobConf, startCommit, numberOfCommitsToPull, false); + } + + public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, boolean isIncrementalUseDatabase) { String modePropertyName = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); @@ -121,36 +171,128 @@ public static 
void setupIncremental(JobConf jobConf, String startCommit, int num String maxCommitPulls = String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); + + jobConf.setBoolean(HoodieHiveUtils.HOODIE_INCREMENTAL_USE_DATABASE, isIncrementalUseDatabase); + } + + public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, String databaseName, boolean isIncrementalUseDatabase) { + String modePropertyName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, databaseName + "." + HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); + + String startCommitTimestampName = + String.format(HoodieHiveUtils.HOODIE_START_COMMIT_PATTERN, databaseName + "." + HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(startCommitTimestampName, startCommit); + + String maxCommitPulls = + String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, databaseName + "." + HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); + + jobConf.setBoolean(HoodieHiveUtils.HOODIE_INCREMENTAL_USE_DATABASE, isIncrementalUseDatabase); + } + + public static void setupSnapshotIncludePendingCommits(JobConf jobConf, String instantTime) { + setupSnapshotScanMode(jobConf, true); + String validateTimestampName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_COMMIT, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(validateTimestampName, instantTime); + } + + public static void setupSnapshotMaxCommitTimeQueryMode(JobConf jobConf, String maxInstantTime) { + setUpScanMode(jobConf); + String validateTimestampName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_COMMIT, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(validateTimestampName, maxInstantTime); + } + + public static void setupSnapshotScanMode(JobConf jobConf) { + setupSnapshotScanMode(jobConf, false); + } + + private static void setupSnapshotScanMode(JobConf jobConf, boolean includePending) { + setUpScanMode(jobConf); + String includePendingCommitsName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_PENDING_COMMITS, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setBoolean(includePendingCommitsName, includePending); + } + + private static void setUpScanMode(JobConf jobConf) { + String modePropertyName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(modePropertyName, HoodieHiveUtils.SNAPSHOT_SCAN_MODE); } public static File prepareParquetTable(java.nio.file.Path basePath, Schema schema, int numberOfFiles, int numberOfRecords, String commitNumber) throws IOException { - HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString()); + return prepareParquetTable(basePath, schema, numberOfFiles, numberOfRecords, commitNumber, HoodieTableType.COPY_ON_WRITE); + } + + public static File prepareParquetTable(java.nio.file.Path basePath, Schema schema, int numberOfFiles, + int numberOfRecords, String commitNumber, HoodieTableType tableType) throws IOException { + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), tableType, HoodieFileFormat.PARQUET); + java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "05", "01")); + setupPartition(basePath, partitionPath); + createData(schema, partitionPath, numberOfFiles, numberOfRecords, commitNumber); + return partitionPath.toFile(); } public static File 
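// --- Illustrative sketch (editorial note, not part of the patch above) ---
// Usage of the new database-qualified setupIncremental overload added in this patch: it registers the
// consume-mode, start-commit and max-commits properties under "<database>.<table>" and flips
// HOODIE_INCREMENTAL_USE_DATABASE on. The database name "testdb" and the example class are hypothetical.
import org.apache.hadoop.mapred.JobConf;

import org.apache.hudi.hadoop.testutils.InputFormatTestUtil;

final class IncrementalByDatabaseExample {
  private IncrementalByDatabaseExample() {
  }

  static JobConf incrementalJobConf() {
    JobConf jobConf = new JobConf();
    // start commit "100", pull at most 1 commit, properties qualified with database "testdb"
    InputFormatTestUtil.setupIncremental(jobConf, "100", 1, "testdb", true);
    return jobConf;
  }
}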
prepareSimpleParquetTable(java.nio.file.Path basePath, Schema schema, int numberOfFiles, int numberOfRecords, String commitNumber) throws Exception { - HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString()); - java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "05", "01")); + return prepareSimpleParquetTable(basePath, schema, numberOfFiles, numberOfRecords, commitNumber, HoodieTableType.COPY_ON_WRITE); + } + + public static File prepareSimpleParquetTable(java.nio.file.Path basePath, Schema schema, int numberOfFiles, + int numberOfRecords, String commitNumber, HoodieTableType tableType) throws Exception { + return prepareSimpleParquetTable(basePath, schema, numberOfFiles, numberOfRecords, commitNumber, tableType, "2016","05","01"); + } + + public static File prepareSimpleParquetTable(java.nio.file.Path basePath, Schema schema, int numberOfFiles, + int numberOfRecords, String commitNumber, HoodieTableType tableType, String year, String month, String date) throws Exception { + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), tableType, HoodieFileFormat.PARQUET); + + java.nio.file.Path partitionPath = basePath.resolve(Paths.get(year, month, date)); + setupPartition(basePath, partitionPath); + createSimpleData(schema, partitionPath, numberOfFiles, numberOfRecords, commitNumber); + return partitionPath.toFile(); } public static File prepareNonPartitionedParquetTable(java.nio.file.Path basePath, Schema schema, int numberOfFiles, int numberOfRecords, String commitNumber) throws IOException { - HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString()); + return prepareNonPartitionedParquetTable(basePath, schema, numberOfFiles, numberOfRecords, commitNumber, HoodieTableType.COPY_ON_WRITE); + } + + public static File prepareNonPartitionedParquetTable(java.nio.file.Path basePath, Schema schema, int numberOfFiles, + int numberOfRecords, String commitNumber, HoodieTableType tableType) throws IOException { + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), tableType, HoodieFileFormat.PARQUET); createData(schema, basePath, numberOfFiles, numberOfRecords, commitNumber); return basePath.toFile(); } + public static List prepareMultiPartitionedParquetTable(java.nio.file.Path basePath, Schema schema, + int numberPartitions, int numberOfRecordsPerPartition, String commitNumber, HoodieTableType tableType) throws IOException { + List result = new ArrayList<>(); + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), tableType, HoodieFileFormat.PARQUET); + for (int i = 0; i < numberPartitions; i++) { + java.nio.file.Path partitionPath = basePath.resolve(Paths.get(2016 + i + "", "05", "01")); + setupPartition(basePath, partitionPath); + + createData(schema, partitionPath, 1, numberOfRecordsPerPartition, commitNumber); + + result.add(partitionPath.toFile()); + } + return result; + } + private static void createData(Schema schema, java.nio.file.Path partitionPath, int numberOfFiles, int numberOfRecords, String commitNumber) throws IOException { AvroParquetWriter parquetWriter; for (int i = 0; i < numberOfFiles; i++) { - String fileId = FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i); + String fileId = FSUtils.makeBaseFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i, HoodieFileFormat.PARQUET.getFileExtension()); parquetWriter = new AvroParquetWriter(new Path(partitionPath.resolve(fileId).toString()), schema); try { for 
(GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber, fileId)) { @@ -166,7 +308,7 @@ private static void createSimpleData(Schema schema, java.nio.file.Path partition String commitNumber) throws Exception { AvroParquetWriter parquetWriter; for (int i = 0; i < numberOfFiles; i++) { - String fileId = FSUtils.makeDataFileName(commitNumber, "1", "fileid" + i); + String fileId = FSUtils.makeBaseFileName(commitNumber, "1", "fileid" + i, HoodieFileFormat.PARQUET.getFileExtension()); parquetWriter = new AvroParquetWriter(new Path(partitionPath.resolve(fileId).toString()), schema); try { List records = SchemaTestUtil.generateTestRecords(0, numberOfRecords); @@ -197,7 +339,8 @@ public static void simulateParquetUpdates(File directory, Schema schema, String int totalNumberOfRecords, int numberOfRecordsToUpdate, String newCommit) throws IOException { File fileToUpdate = Objects.requireNonNull(directory.listFiles((dir, name) -> name.endsWith("parquet")))[0]; String fileId = FSUtils.getFileId(fileToUpdate.getName()); - File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId)); + File dataFile = new File(directory, + FSUtils.makeBaseFileName(newCommit, TEST_WRITE_TOKEN, fileId, HoodieFileFormat.PARQUET.getFileExtension())); try (AvroParquetWriter parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), schema)) { for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords, originalCommit, fileId)) { if (numberOfRecordsToUpdate > 0) { @@ -216,7 +359,7 @@ public static HoodieLogFormat.Writer writeRollback(File partitionDir, FileSystem String newCommit, String rolledBackInstant, int logVersion) throws InterruptedException, IOException { HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionDir.getPath())).withFileId(fileId) - .overBaseCommit(baseCommit).withFs(fs).withLogVersion(logVersion).withLogWriteToken("1-0-1") + .overBaseCommit(baseCommit).withFs(fs).withLogVersion(logVersion).withRolloverLogWriteToken("1-0-1") .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); // generate metadata Map header = new HashMap<>(); @@ -225,17 +368,24 @@ public static HoodieLogFormat.Writer writeRollback(File partitionDir, FileSystem header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); // if update belongs to an existing log file - writer = writer.appendBlock(new HoodieCommandBlock(header)); + writer.appendBlock(new HoodieCommandBlock(header)); return writer; } public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, FileSystem fs, Schema schema, String fileId, - String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion) + String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion) throws IOException, InterruptedException { + return writeDataBlockToLogFile(partitionDir, fs, schema, fileId, baseCommit, newCommit, numberOfRecords, offset, logVersion, HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK); + } + + public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, FileSystem fs, Schema schema, String + fileId, + String baseCommit, String newCommit, int numberOfRecords, int offset, int logVersion, + HoodieLogBlock.HoodieLogBlockType logBlockType) throws InterruptedException, IOException { HoodieLogFormat.Writer writer = 
HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionDir.getPath())) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId).withLogVersion(logVersion) - .withLogWriteToken("1-0-1").overBaseCommit(baseCommit).withFs(fs).build(); + .withRolloverLogWriteToken("1-0-1").overBaseCommit(baseCommit).withFs(fs).build(); List records = new ArrayList<>(); for (int i = offset; i < offset + numberOfRecords; i++) { records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0")); @@ -244,8 +394,16 @@ public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, newCommit); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchema.toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); - writer = writer.appendBlock(dataBlock); + HoodieDataBlock dataBlock = null; + if (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK) { + dataBlock = new HoodieHFileDataBlock( + records, header, Compression.Algorithm.GZ, writer.getLogFile().getPath()); + } else if (logBlockType == HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK) { + dataBlock = new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP); + } else { + dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); + } + writer.appendBlock(dataBlock); return writer; } @@ -264,10 +422,37 @@ public static HoodieLogFormat.Writer writeRollbackBlockToLogFile(File partitionD header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock rollbackBlock = new HoodieCommandBlock(header); - writer = writer.appendBlock(rollbackBlock); + writer.appendBlock(rollbackBlock); return writer; } + public static void setProjectFieldsForInputFormat(JobConf jobConf, + Schema schema, String hiveColumnTypes) { + List fields = schema.getFields(); + String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); + String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(",")); + Configuration conf = HoodieTestUtils.getDefaultHadoopConf(); + + String hiveColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase("datestr")) + .map(Schema.Field::name).collect(Collectors.joining(",")); + hiveColumnNames = hiveColumnNames + ",datestr"; + String modifiedHiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(hiveColumnTypes); + modifiedHiveColumnTypes = modifiedHiveColumnTypes + ",string"; + jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames); + jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, modifiedHiveColumnTypes); + // skip choose hoodie meta_columns, only choose one origin column to trigger HUID-1722 + jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names.split(",")[5]); + jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions.split(",")[5]); + jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr"); + conf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames); + // skip choose hoodie meta_columns, only choose one origin column to trigger HUID-1722 + conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names.split(",")[5]); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, 
postions.split(",")[5]); + conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr"); + conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, modifiedHiveColumnTypes); + jobConf.addResource(conf); + } + public static void setPropsForInputFormat(JobConf jobConf, Schema schema, String hiveColumnTypes) { List fields = schema.getFields(); @@ -293,10 +478,27 @@ public static void setPropsForInputFormat(JobConf jobConf, jobConf.addResource(conf); } + private static void setupPartition(java.nio.file.Path basePath, java.nio.file.Path partitionPath) throws IOException { + Files.createDirectories(partitionPath); + + // Create partition metadata to properly setup table's partition + RawLocalFileSystem lfs = new RawLocalFileSystem(); + lfs.setConf(HoodieTestUtils.getDefaultHadoopConf()); + + HoodiePartitionMetadata partitionMetadata = + new HoodiePartitionMetadata( + new LocalFileSystem(lfs), + "0", + new Path(basePath.toAbsolutePath().toString()), + new Path(partitionPath.toAbsolutePath().toString()), + Option.of(HoodieFileFormat.PARQUET)); + + partitionMetadata.trySave((int) (Math.random() * 1000)); + } + public static void setInputPath(JobConf jobConf, String inputPath) { jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath); jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath); jobConf.set("map.input.dir", inputPath); } - } diff --git a/hudi-hadoop-mr/src/test/resources/log4j-surefire-quiet.properties b/hudi-hadoop-mr/src/test/resources/log4j-surefire-quiet.properties deleted file mode 100644 index b21b5d4070c41..0000000000000 --- a/hudi-hadoop-mr/src/test/resources/log4j-surefire-quiet.properties +++ /dev/null @@ -1,29 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache.hudi=DEBUG - -# CONSOLE is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# CONSOLE uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-hadoop-mr/src/test/resources/log4j-surefire.properties b/hudi-hadoop-mr/src/test/resources/log4j-surefire.properties deleted file mode 100644 index c03e808cca1f8..0000000000000 --- a/hudi-hadoop-mr/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,30 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache=INFO -log4j.logger.org.apache.hudi=DEBUG - -# A1 is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL \ No newline at end of file diff --git a/hudi-integ-test/README.md b/hudi-integ-test/README.md index a6cdd08478e1a..687ad9a2a90d2 100644 --- a/hudi-integ-test/README.md +++ b/hudi-integ-test/README.md @@ -82,8 +82,8 @@ spark-submit 2.YAML file -Choose to write up the entire DAG of operations in YAML, take a look at `complex-dag-cow.yaml` or -`complex-dag-mor.yaml`. +Choose to write up the entire DAG of operations in YAML, take a look at `simple-deltastreamer.yaml` or +`simple-deltastreamer.yaml`. Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows: ``` @@ -126,7 +126,7 @@ NOTE : The properties-file should have all the necessary information required to information on what properties need to be set, take a look at the test suite section under demo steps. ``` shell$ ./prepare_integration_suite.sh --spark-command -spark-submit --packages com.databricks:spark-avro_2.11:4.0.0 --master prepare_integration_suite.sh --deploy-mode +spark-submit --master prepare_integration_suite.sh --deploy-mode --properties-file --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob target/hudi-integ-test-0.6 .0-SNAPSHOT.jar --source-class --source-ordering-field --input-base-path --target-base-path --target-table --props --storage-type --payload-class --workload-yaml-path --input-file-size -- ``` @@ -142,7 +142,9 @@ Start the Hudi Docker demo: docker/setup_demo.sh ``` -NOTE: We need to make a couple of environment changes for Hive 2.x support. This will be fixed once Hudi moves to Spark 3.x +NOTE: We need to make a couple of environment changes for Hive 2.x support. This will be fixed once Hudi moves to Spark 3.x. +Execute below if you are using Hudi query node in your dag. If not, below section is not required. +Also, for longer running tests, go to next section. 
``` docker exec -it adhoc-2 bash @@ -175,7 +177,7 @@ cd /opt Copy the integration tests jar into the docker container ``` -docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.6.1-SNAPSHOT.jar adhoc-2:/opt +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt ``` ``` @@ -196,7 +198,6 @@ Launch a Copy-on-Write job: ========================= ## Run the following command to start the test suite spark-submit \ ---packages org.apache.spark:spark-avro_2.11:2.4.0 \ --conf spark.task.cpus=1 \ --conf spark.executor.cores=1 \ --conf spark.task.maxFailures=100 \ @@ -212,21 +213,29 @@ spark-submit \ --conf spark.network.timeout=600s \ --conf spark.yarn.max.executor.failures=10 \ --conf spark.sql.catalogImplementation=hive \ +--conf spark.driver.extraClassPath=/var/demo/jars/* \ +--conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.6.1-SNAPSHOT.jar \ ---source-ordering-field timestamp \ +/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \ +--source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ --target-table table1 \ --props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \ ---schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ ---workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \ +--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type COPY_ON_WRITE \ ---compact-scheduling-minshare 1 +--compact-scheduling-minshare 1 \ +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +--clean-input \ +--clean-output ``` Or a Merge-on-Read job: @@ -235,7 +244,6 @@ Or a Merge-on-Read job: ========================= ## Run the following command to start the test suite spark-submit \ ---packages org.apache.spark:spark-avro_2.11:2.4.0 \ --conf spark.task.cpus=1 \ --conf spark.executor.cores=1 \ --conf spark.task.maxFailures=100 \ @@ -251,19 +259,418 @@ spark-submit \ --conf spark.network.timeout=600s \ --conf spark.yarn.max.executor.failures=10 \ --conf spark.sql.catalogImplementation=hive \ +--conf spark.driver.extraClassPath=/var/demo/jars/* \ +--conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.6.1-SNAPSHOT.jar \ ---source-ordering-field timestamp \ +/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \ +--source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ --input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ --target-table table1 \ --props file:/var/hoodie/ws/docker/demo/config/test-suite/test.properties \ ---schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ +--schemaprovider-class 
org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ ---workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \ +--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type MERGE_ON_READ \ ---compact-scheduling-minshare 1 +--compact-scheduling-minshare 1 \ +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +--clean-input \ +--clean-output ``` + +## Visualize and inspect the hoodie metrics and performance (local) +Graphite server is already setup (and up) in ```docker/setup_demo.sh```. + +Open browser and access metrics at +``` +http://localhost:80 +``` +Dashboard +``` +http://localhost/dashboard + +``` + +## Running long running test suite in Local Docker environment + +For long running test suite, validation has to be done differently. Idea is to run same dag in a repeated manner for +N iterations. Hence "ValidateDatasetNode" is introduced which will read entire input data and compare it with hudi +contents both via spark datasource and hive table via spark sql engine. Hive validation is configurable. + +If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not +go well w/ hive2* jars. So, after running docker setup, follow the below steps. +``` +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt/ +docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/ +``` +Also copy your dag of interest to adhoc-2:/opt/ +``` +docker cp docker/demo/config/test-suite/simple-deltastreamer.yaml adhoc-2:/opt/ +``` + +For repeated runs, two additional configs need to be set. "dag_rounds" and "dag_intermittent_delay_mins". +This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: simple-deltastreamer.yaml +already has all these configs set. So no changes required just to try it out. + +Also, ValidateDatasetNode can be configured in two ways. Either with "delete_input_data" set to true or without +setting the config. When "delete_input_data" is set for ValidateDatasetNode, once validation is complete, entire input +data will be deleted. So, suggestion is to use this ValidateDatasetNode as the last node in the dag with "delete_input_data". + +Example dag: +``` + Insert + Upsert + ValidateDatasetNode with delete_input_data = true +``` + +If above dag is run with "dag_rounds" = 10 and "dag_intermittent_delay_mins" = 10, then this dag will run for 10 times +with 10 mins delay between every run. At the end of every run, records written as part of this round will be validated. +At the end of each validation, all contents of input are deleted. +To illustrate each round +``` +Round1: + insert => inputPath/batch1 + upsert -> inputPath/batch2 + Validate with delete_input_data = true + Validates contents from batch1 and batch2 are in hudi and ensures Row equality + Since "delete_input_data" is set, deletes contents from batch1 and batch2. 
+Round2: + insert => inputPath/batch3 + upsert -> inputPath/batch4 + Validate with delete_input_data = true + Validates contents from batch3 and batch4 are in hudi and ensures Row equality + Since "delete_input_data" is set, deletes contents from batch3 and batch4. +Round3: + insert => inputPath/batch5 + upsert -> inputPath/batch6 + Validate with delete_input_data = true + Validates contents from batch5 and batch6 are in hudi and ensures Row equality + Since "delete_input_data" is set, deletes contents from batch5 and batch6. +. +. +``` +If you wish to do a cumulative validation, do not set delete_input_data in ValidateDatasetNode. But remember that this +may not scale beyond certain point since input data as well as hudi content's keeps occupying the disk and grows for +every cycle. + +Lets see an example where you don't set "delete_input_data" as part of Validation. +``` + Insert + Upsert + ValidateDatasetNode +``` +Here is the illustration of each round +``` +Round1: + insert => inputPath/batch1 + upsert -> inputPath/batch2 + Validate: validates contents from batch1 and batch2 are in hudi and ensures Row equality +Round2: + insert => inputPath/batch3 + upsert -> inputPath/batch4 + Validate: validates contents from batch1 to batch4 are in hudi and ensures Row equality +Round3: + insert => inputPath/batch5 + upsert -> inputPath/batch6 + Validate: validates contents from batch1 and batch6 are in hudi and ensures Row equality +. +. +``` + +You could also have validations in the middle of your dag and not set the "delete_input_data". But set it only in the +last node in the dag. +``` +Round1: + insert => inputPath/batch1 + upsert -> inputPath/batch2 + Validate: validates contents from batch1 and batch2 are in hudi and ensures Row equality + insert => inputPath/batch3 + upsert -> inputPath/batch4 + Validate with delete_input_data = true + Validates contents from batch1 to batch4 are in hudi and ensures Row equality + since "delete_input_data" is set to true, this node deletes contents from batch1 and batch4. +Round2: + insert => inputPath/batch5 + upsert -> inputPath/batch6 + Validate: validates contents from batch5 and batch6 are in hudi and ensures Row equality + insert => inputPath/batch7 + upsert -> inputPath/batch8 + Validate: validates contents from batch5 to batch8 are in hudi and ensures Row equality + since "delete_input_data" is set to true, this node deletes contents from batch5 to batch8. +Round3: + insert => inputPath/batch9 + upsert -> inputPath/batch10 + Validate: validates contents from batch9 and batch10 are in hudi and ensures Row equality + insert => inputPath/batch11 + upsert -> inputPath/batch12 + Validate with delete_input_data = true + Validates contents from batch9 to batch12 are in hudi and ensures Row equality + Set "delete_input_data" to true. so this node deletes contents from batch9 to batch12. +. +. +``` +Above dag was just an example for illustration purposes. But you can make it complex as per your needs. +``` + Insert + Upsert + Delete + Validate w/o deleting + Insert + Rollback + Validate w/o deleting + Upsert + Validate w/ deletion +``` + +Once you have copied the jar, test.properties and your dag to adhoc-2:/opt/, you can run the following command to execute +the test suite job. 
+``` +docker exec -it adhoc-2 /bin/bash +``` +Sample COW command +``` +spark-submit \ +--conf spark.task.cpus=1 \ +--conf spark.executor.cores=1 \ +--conf spark.task.maxFailures=100 \ +--conf spark.memory.fraction=0.4 \ +--conf spark.rdd.compress=true \ +--conf spark.kryoserializer.buffer.max=2000m \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.memory.storageFraction=0.1 \ +--conf spark.shuffle.service.enabled=true \ +--conf spark.sql.hive.convertMetastoreParquet=false \ +--conf spark.driver.maxResultSize=12g \ +--conf spark.executor.heartbeatInterval=120s \ +--conf spark.network.timeout=600s \ +--conf spark.yarn.max.executor.failures=10 \ +--conf spark.sql.catalogImplementation=hive \ +--conf spark.driver.extraClassPath=/var/demo/jars/* \ +--conf spark.executor.extraClassPath=/var/demo/jars/* \ +--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ +/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \ +--source-ordering-field test_suite_source_ordering_field \ +--use-deltastreamer \ +--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ +--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \ +--target-table table1 \ +--props test.properties \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ +--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ +--input-file-size 125829120 \ +--workload-yaml-path file:/opt/simple-deltastreamer.yaml \ +--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ +--table-type COPY_ON_WRITE \ +--compact-scheduling-minshare 1 \ +--clean-input \ +--clean-output +``` + +If you wish to enable metrics add below properties as well +``` +--hoodie-conf hoodie.metrics.on=true \ +--hoodie-conf hoodie.metrics.reporter.type=GRAPHITE \ +--hoodie-conf hoodie.metrics.graphite.host=graphite \ +--hoodie-conf hoodie.metrics.graphite.port=2003 \ +``` + +Few ready to use dags are available under docker/demo/config/test-suite/ that could give you an idea for long running +dags. +``` +simple-deltastreamer.yaml: simple 1 round dag for COW table. +simple-deltastreamer.yaml: simple 1 round dag for MOR table. +cow-clustering-example.yaml : dag with 3 rounds, in which inline clustering will trigger during 2nd iteration. +cow-long-running-example.yaml : long running dag with 50 iterations. only 1 partition is used. +cow-long-running-multi-partitions.yaml: long running dag wit 50 iterations with multiple partitions. +``` + +To run test suite jobs for MOR table, pretty much any of these dags can be used as is. Only change is with the +spark-shell commnad, you need to fix the table type. +``` +--table-type MERGE_ON_READ +``` +But if you had to switch from one table type to other, ensure you clean up all test paths explicitly before switching to +a different table type. +``` +hdfs dfs -rm -r /user/hive/warehouse/hudi-integ-test-suite/output/ +hdfs dfs -rm -r /user/hive/warehouse/hudi-integ-test-suite/input/ +``` + +As of now, "ValidateDatasetNode" uses spark data source and hive tables for comparison. Hence COW and real time view in +MOR can be tested. + +To run test suite jobs for validating all versions of schema, a DAG with insert, upsert nodes can be supplied with every version of schema to be evaluated, with "--saferSchemaEvolution" flag indicating the job is for schema validations. 
First run of the job will populate the dataset with data files with every version of schema and perform an upsert operation for verifying schema evolution. + +Second and subsequent runs will verify that the data can be inserted with latest version of schema and perform an upsert operation to evolve all older version of schema (created by older run) to the latest version of schema. + +Sample DAG: +``` +rollback with num_rollbacks = 2 +insert with schema_version = +.... +upsert with fraction_upsert_per_file = 0.5 +``` + +Spark submit with the flag: +``` +--saferSchemaEvolution +``` + +### Multi-writer tests +Integ test framework also supports multi-writer tests. + +#### Multi-writer tests with deltastreamer and a spark data source writer. + +Sample spark-submit command to test one delta streamer and a spark data source writer. +```shell +./bin/spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.0 \ +--conf spark.task.cpus=3 --conf spark.executor.cores=3 \ +--conf spark.task.maxFailures=100 --conf spark.memory.fraction=0.4 \ +--conf spark.rdd.compress=true --conf spark.kryoserializer.buffer.max=2000m \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.memory.storageFraction=0.1 --conf spark.shuffle.service.enabled=true \ +--conf spark.sql.hive.convertMetastoreParquet=false --conf spark.driver.maxResultSize=12g \ +--conf spark.executor.heartbeatInterval=120s --conf spark.network.timeout=600s \ +--conf spark.yarn.max.executor.failures=10 \ +--conf spark.sql.catalogImplementation=hive \ +--class org.apache.hudi.integ.testsuite.HoodieMultiWriterTestSuiteJob \ +/packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.12.0-SNAPSHOT.jar \ +--source-ordering-field test_suite_source_ordering_field \ +--use-deltastreamer \ +--target-base-path /tmp/hudi/output \ +--input-base-paths "/tmp/hudi/input1,/tmp/hudi/input2" \ +--target-table table1 \ +--props-paths "file:/docker/demo/config/test-suite/multi-writer-local-1.properties,file:/hudi/docker/demo/config/test-suite/multi-writer-local-2.properties" \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ +--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ +--input-file-size 125829120 \ +--workload-yaml-paths "file:/docker/demo/config/test-suite/multi-writer-1-ds.yaml,file:/docker/demo/config/test-suite/multi-writer-2-sds.yaml" \ +--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ +--table-type COPY_ON_WRITE \ +--compact-scheduling-minshare 1 \ +--input-base-path "dummyValue" \ +--workload-yaml-path "dummyValue" \ +--props "dummyValue" \ +--use-hudi-data-to-generate-updates +``` + +#### Multi-writer tests with 4 concurrent spark data source writer. 
+ +```shell +./bin/spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.0 \ +--conf spark.task.cpus=3 --conf spark.executor.cores=3 \ +--conf spark.task.maxFailures=100 --conf spark.memory.fraction=0.4 \ +--conf spark.rdd.compress=true --conf spark.kryoserializer.buffer.max=2000m \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.memory.storageFraction=0.1 --conf spark.shuffle.service.enabled=true \ +--conf spark.sql.hive.convertMetastoreParquet=false --conf spark.driver.maxResultSize=12g \ +--conf spark.executor.heartbeatInterval=120s --conf spark.network.timeout=600s \ +--conf spark.yarn.max.executor.failures=10 --conf spark.sql.catalogImplementation=hive \ +--class org.apache.hudi.integ.testsuite.HoodieMultiWriterTestSuiteJob \ +/hudi-integ-test-bundle-0.12.0-SNAPSHOT.jar \ +--source-ordering-field test_suite_source_ordering_field \ +--use-deltastreamer \ +--target-base-path /tmp/hudi/output \ +--input-base-paths "/tmp/hudi/input1,/tmp/hudi/input2,/tmp/hudi/input3,/tmp/hudi/input4" \ +--target-table table1 \ +--props-paths "file:/multi-writer-local-1.properties,file:/multi-writer-local-2.properties,file:/multi-writer-local-3.properties,file:/multi-writer-local-4.properties" \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ +--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ +--input-file-size 125829120 \ +--workload-yaml-paths "file:/multi-writer-1-sds.yaml,file:/multi-writer-2-sds.yaml,file:/multi-writer-3-sds.yaml,file:/multi-writer-4-sds.yaml" \ +--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ +--table-type COPY_ON_WRITE \ +--compact-scheduling-minshare 1 \ +--input-base-path "dummyValue" \ +--workload-yaml-path "dummyValue" \ +--props "dummyValue" \ +--use-hudi-data-to-generate-updates +``` + +### Testing async table services +We can test async table services with deltastreamer using the command below. 3 additional arguments are required to test async +table services compared to the previous command.
+ +```shell +--continuous \ +--test-continuous-mode \ +--min-sync-interval-seconds 20 +``` + +Here is the full command: +```shell +./bin/spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.4 \ + --conf spark.task.cpus=1 --conf spark.executor.cores=1 \ +--conf spark.task.maxFailures=100 \ +--conf spark.memory.fraction=0.4 \ +--conf spark.rdd.compress=true \ +--conf spark.kryoserializer.buffer.max=2000m \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.memory.storageFraction=0.1 \ +--conf spark.shuffle.service.enabled=true \ +--conf spark.sql.hive.convertMetastoreParquet=false \ +--conf spark.driver.maxResultSize=12g \ +--conf spark.executor.heartbeatInterval=120s \ +--conf spark.network.timeout=600s \ +--conf spark.yarn.max.executor.failures=10 \ +--conf spark.sql.catalogImplementation=hive \ +--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob /hudi-integ-test-bundle-0.12.0-SNAPSHOT.jar \ +--source-ordering-field test_suite_source_ordering_field \ +--use-deltastreamer \ +--target-base-path /tmp/hudi/output \ +--input-base-path /tmp/hudi/input \ +--target-table table1 \ +-props file:/tmp/test.properties \ +--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ +--source-class org.apache.hudi.utilities.sources.AvroDFSSource \ +--input-file-size 125829120 \ +--workload-yaml-path file:/tmp/simple-deltastreamer.yaml \ +--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ +--table-type COPY_ON_WRITE \ +--compact-scheduling-minshare 1 \ +--clean-input \ +--clean-output \ +--continuous \ +--test-continuous-mode \ +--min-sync-interval-seconds 20 +``` + +We can use any yaml and properties file w/ above spark-submit command to test deltastreamer w/ async table services. + +## Automated tests for N no of yamls in Local Docker environment + +Hudi provides a script to assist you in testing N no of yamls automatically. Checkout the script under +hudi_root/docker folder. +generate_test_suite.sh + +Example command : // execute the command from within docker folder. +./generate_test_suite.sh --execute_test_suite false --include_medium_test_suite_yaml true --include_long_test_suite_yaml true + +By default, generate_test_suite will run sanity test. In addition it supports 3 more yamls. +medium_test_suite, long_test_suite and clustering_test_suite. Users can add the required yamls via command line as per thier +necessity. + +Also, "--execute_test_suite" false will generate all required files and yamls in a local staging directory if users want to inspect them. +To go ahead and execute the same, you can give "--execute_test_suite true". +staging dir: docker/demo/config/test-suite/staging + +Also, there are other additional configs which users can override depending on their needs. +Some of the options are + +--table_type COPY_ON_WRITE/MERGE_ON_READ // refers to table type. +--medium_num_iterations 20 // refers to total iterations medium test suite should run. +--long_num_iterations 100 // refers to total iterations long test suite should run. +--intermittent_delay_mins 1 // refers to delay between successive runs within a single test suite job. +--cluster_num_itr 30 // refers to total iterations for clustering test suite. +--cluster_delay_mins 2 // refers to delay between successive runs for clustering test suite job. +--cluster_exec_itr_count 15 // refers to the iteration at which clustering needs to be triggered. 
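To tie together the knobs discussed in the long running test suite section above (`dag_rounds`, `dag_intermittent_delay_mins`, and `delete_input_data` on the last validation node), here is a minimal sketch of a repeated-run dag. This is illustrative only: `ValidateDatasetNode` and the three configs above come from this README, but the other node type names and per-node config keys shown here are assumptions modeled on the bundled yamls; treat the files under docker/demo/config/test-suite/ (e.g. simple-deltastreamer.yaml) as the authoritative reference for the exact schema.

```
# Illustrative sketch of a repeated-run dag; cross-check key names against
# the yamls shipped under docker/demo/config/test-suite/ before using.
dag_name: illustrative-repeated-validate-dag
dag_rounds: 10                      # repeat the whole dag 10 times
dag_intermittent_delay_mins: 10     # wait 10 mins between rounds
dag_content:
  first_insert:
    config:
      num_records_insert: 1000      # assumed key name
    type: InsertNode
    deps: none
  first_upsert:
    config:
      num_records_upsert: 100       # assumed key name
    type: UpsertNode
    deps: first_insert
  last_validate:
    config:
      delete_input_data: true       # last node validates, then deletes the input batches
    type: ValidateDatasetNode
    deps: first_upsert
```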
+ + + diff --git a/hudi-integ-test/pom.xml b/hudi-integ-test/pom.xml index 90ca94dbb4aeb..b4387071f450b 100644 --- a/hudi-integ-test/pom.xml +++ b/hudi-integ-test/pom.xml @@ -21,7 +21,7 @@ hudi org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT ../pom.xml hudi-integ-test @@ -81,14 +81,42 @@ org.eclipse.jetty * + + org.apache.curator + * + + + org.apache.spark + spark-avro_${scala.binary.version} + ${spark.version} + test + + org.apache.hudi hudi-spark_${scala.binary.version} ${project.version} + + + org.apache.curator + * + + + + + org.apache.hudi + hudi-spark-common_${scala.binary.version} + ${project.version} + + + org.apache.curator + * + + org.apache.hudi @@ -112,13 +140,33 @@ com.databricks spark-avro_${scala.binary.version} + + org.apache.curator + * + + + org.apache.curator + curator-framework + ${zk-curator.version} + + + org.apache.curator + curator-client + ${zk-curator.version} + + + org.apache.curator + curator-recipes + ${zk-curator.version} + + - log4j - log4j + org.apache.logging.log4j + log4j-1.2-api @@ -130,13 +178,18 @@ + + org.apache.hudi + hudi-tests-common + ${project.version} + test + org.apache.hudi hudi-common ${project.version} tests test-jar - test org.apache.hudi @@ -201,6 +254,19 @@ ${project.version} test-jar + + org.apache.hudi + hudi-spark-common_${scala.binary.version} + ${project.version} + test-jar + + + + org.apache.curator + curator-test + ${zk-curator.version} + test + @@ -231,6 +297,7 @@ org.apache.hadoop hadoop-common tests + test org.mortbay.jetty @@ -251,6 +318,7 @@ org.apache.hadoop hadoop-hdfs tests + test javax.servlet @@ -306,42 +374,69 @@ test + + com.facebook.presto + presto-jdbc + + + + io.trino + trino-jdbc + + org.awaitility awaitility - 3.1.2 test + org.junit.jupiter junit-jupiter-api test - org.junit.jupiter junit-jupiter-engine test - org.junit.vintage junit-vintage-engine test - org.junit.jupiter junit-jupiter-params test - org.mockito mockito-junit-jupiter test + + org.junit.platform + junit-platform-runner + test + + + org.junit.platform + junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + + + org.scalatest + scalatest_${scala.binary.version} + ${scalatest.version} + test + @@ -349,12 +444,53 @@ ${project.basedir}/compose_env ${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark244.yml ${skipITs} - true ${project.parent.basedir} + + + + net.alchim31.maven + scala-maven-plugin + ${scala-maven-plugin.version} + + + -nobootcp + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + + + + net.alchim31.maven + scala-maven-plugin + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + org.codehaus.mojo exec-maven-plugin @@ -377,6 +513,14 @@ + + org.scalatest + scalatest-maven-plugin + + + org.scalastyle + scalastyle-maven-plugin + org.jacoco jacoco-maven-plugin diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieContinousTestSuiteWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieContinousTestSuiteWriter.java new file mode 100644 index 0000000000000..76f9d7424ac90 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieContinousTestSuiteWriter.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; +import org.apache.hudi.utilities.schema.SchemaProvider; + +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Map; +import java.util.Properties; + +/** + * Test suite Writer that assists in testing async table operations with Deltastreamer continuous mode. + * + * Sample command + * ./bin/spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.4 \ + * --conf spark.task.cpus=1 --conf spark.executor.cores=1 \ + * --conf spark.task.maxFailures=100 \ + * --conf spark.memory.fraction=0.4 \ + * --conf spark.rdd.compress=true \ + * --conf spark.kryoserializer.buffer.max=2000m \ + * --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + * --conf spark.memory.storageFraction=0.1 \ + * --conf spark.shuffle.service.enabled=true \ + * --conf spark.sql.hive.convertMetastoreParquet=false \ + * --conf spark.driver.maxResultSize=12g \ + * --conf spark.executor.heartbeatInterval=120s \ + * --conf spark.network.timeout=600s \ + * --conf spark.yarn.max.executor.failures=10 \ + * --conf spark.sql.catalogImplementation=hive \ + * --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob /hudi-integ-test-bundle-0.12.0-SNAPSHOT.jar \ + * --source-ordering-field test_suite_source_ordering_field \ + * --use-deltastreamer \ + * --target-base-path /tmp/hudi/output \ + * --input-base-path /tmp/hudi/input \ + * --target-table table1 \ + * -props file:/tmp/test.properties \ + * --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ + * --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ + * --input-file-size 125829120 \ + * --workload-yaml-path file:/tmp/simple-deltastreamer.yaml \ + * --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ + * --table-type COPY_ON_WRITE \ + * --compact-scheduling-minshare 1 \ + * --clean-input \ + * --clean-output \ + * --continuous \ + * --test-continuous-mode \ + * --min-sync-interval-seconds 20 + */ +public class HoodieContinousTestSuiteWriter extends HoodieTestSuiteWriter { + + private static Logger log = LoggerFactory.getLogger(HoodieContinousTestSuiteWriter.class); + + public HoodieContinousTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteJob.HoodieTestSuiteConfig cfg, String schema) throws Exception { + super(jsc, props, cfg, schema); + } + + @Override + public void shutdownResources() { + log.info("Shutting down deltastreamer gracefully "); 
+ this.deltaStreamerWrapper.shutdownGracefully(); + } + + @Override + public RDD getNextBatch() throws Exception { + return null; + } + + @Override + public Pair>> fetchSource() throws Exception { + return null; + } + + @Override + public Option startCommit() { + return null; + } + + public JavaRDD upsert(Option instantTime) throws Exception { + return null; + } + + @Override + public JavaRDD insert(Option instantTime) throws Exception { + return null; + } + + @Override + public JavaRDD insertOverwrite(Option instantTime) throws Exception { + return null; + } + + @Override + public JavaRDD insertOverwriteTable(Option instantTime) throws Exception { + return null; + } + + @Override + public JavaRDD bulkInsert(Option instantTime) throws Exception { + return null; + } + + @Override + public JavaRDD compact(Option instantTime) throws Exception { + return null; + } + + @Override + public void inlineClustering() { + } + + @Override + public Option scheduleCompaction(Option> previousCommitExtraMetadata) throws + Exception { + return Option.empty(); + } + + @Override + public void commit(JavaRDD records, JavaRDD generatedDataStats, + Option instantTime) { + } + + @Override + public void commitCompaction(JavaRDD records, JavaRDD generatedDataStats, + Option instantTime) throws IOException { + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieDeltaStreamerWrapper.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieDeltaStreamerWrapper.java index 0387731d6b469..24d281da73172 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieDeltaStreamerWrapper.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieDeltaStreamerWrapper.java @@ -25,13 +25,13 @@ import org.apache.hudi.utilities.deltastreamer.DeltaSync; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; import org.apache.hudi.utilities.schema.SchemaProvider; + import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; /** - * Extends the {@link HoodieDeltaStreamer} to expose certain operations helpful in running the Test Suite. - * This is done to achieve 2 things 1) Leverage some components of {@link HoodieDeltaStreamer} 2) - * Piggyback on the suite to test {@link HoodieDeltaStreamer} + * Extends the {@link HoodieDeltaStreamer} to expose certain operations helpful in running the Test Suite. 
This is done to achieve 2 things 1) Leverage some components of {@link HoodieDeltaStreamer} + * 2) Piggyback on the suite to test {@link HoodieDeltaStreamer} */ public class HoodieDeltaStreamerWrapper extends HoodieDeltaStreamer { @@ -53,6 +53,16 @@ public JavaRDD bulkInsert() throws return upsert(WriteOperationType.BULK_INSERT); } + public JavaRDD insertOverwrite() throws + Exception { + return upsert(WriteOperationType.INSERT_OVERWRITE); + } + + public JavaRDD insertOverwriteTable() throws + Exception { + return upsert(WriteOperationType.INSERT_OVERWRITE_TABLE); + } + public void scheduleCompact() throws Exception { // Since we don't support scheduleCompact() operation in delta-streamer, assume upsert without any data that will // trigger scheduling compaction diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieInlineTestSuiteWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieInlineTestSuiteWriter.java new file mode 100644 index 0000000000000..91a7cf358c011 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieInlineTestSuiteWriter.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import org.apache.hudi.avro.model.HoodieCompactionPlan; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.compact.CompactHelpers; +import org.apache.hudi.utilities.schema.SchemaProvider; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +/** + * A writer abstraction for the Hudi test suite. 
This class wraps different implementations of writers used to perform write operations into the target hudi dataset. Current supported writers are + * {@link HoodieDeltaStreamerWrapper} and {@link SparkRDDWriteClient}. + */ +public class HoodieInlineTestSuiteWriter extends HoodieTestSuiteWriter { + + private static Logger log = LoggerFactory.getLogger(HoodieInlineTestSuiteWriter.class); + + private static final String GENERATED_DATA_PATH = "generated.data.path"; + + public HoodieInlineTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema) throws Exception { + super(jsc, props, cfg, schema); + } + + public void shutdownResources() { + // no-op for non continuous mode test suite writer. + } + + public RDD getNextBatch() throws Exception { + Pair>> nextBatch = fetchSource(); + lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); + JavaRDD inputRDD = nextBatch.getRight().getRight(); + return inputRDD.map(r -> (GenericRecord) ((HoodieAvroRecord) r).getData() + .getInsertValue(new Schema.Parser().parse(schema)).get()).rdd(); + } + + public Pair>> fetchSource() throws Exception { + return this.deltaStreamerWrapper.fetchSource(); + } + + public Option startCommit() { + if (cfg.useDeltaStreamer) { + return Option.of(HoodieActiveTimeline.createNewInstantTime()); + } else { + return Option.of(writeClient.startCommit()); + } + } + + public JavaRDD upsert(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return deltaStreamerWrapper.upsert(WriteOperationType.UPSERT); + } else { + Pair>> nextBatch = fetchSource(); + lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); + return writeClient.upsert(nextBatch.getRight().getRight(), instantTime.get()); + } + } + + public JavaRDD insert(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return deltaStreamerWrapper.insert(); + } else { + Pair>> nextBatch = fetchSource(); + lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); + return writeClient.insert(nextBatch.getRight().getRight(), instantTime.get()); + } + } + + public JavaRDD insertOverwrite(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return deltaStreamerWrapper.insertOverwrite(); + } else { + Pair>> nextBatch = fetchSource(); + lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); + return writeClient.insertOverwrite(nextBatch.getRight().getRight(), instantTime.get()).getWriteStatuses(); + } + } + + public JavaRDD insertOverwriteTable(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return deltaStreamerWrapper.insertOverwriteTable(); + } else { + Pair>> nextBatch = fetchSource(); + lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); + return writeClient.insertOverwriteTable(nextBatch.getRight().getRight(), instantTime.get()).getWriteStatuses(); + } + } + + public JavaRDD bulkInsert(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return deltaStreamerWrapper.bulkInsert(); + } else { + Pair>> nextBatch = fetchSource(); + lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); + return writeClient.bulkInsert(nextBatch.getRight().getRight(), instantTime.get()); + } + } + + public JavaRDD compact(Option instantTime) throws Exception { + if (cfg.useDeltaStreamer) { + return deltaStreamerWrapper.compact(); + } else { + if (!instantTime.isPresent()) { + Option> compactionPlanPair = Option + .fromJavaOptional(hoodieReadClient.getPendingCompactions() + .stream().findFirst()); + if (compactionPlanPair.isPresent()) { + 
instantTime = Option.of(compactionPlanPair.get().getLeft()); + } + } + if (instantTime.isPresent()) { + HoodieWriteMetadata> compactionMetadata = writeClient.compact(instantTime.get()); + return compactionMetadata.getWriteStatuses(); + } else { + return null; + } + } + } + + public void inlineClustering() { + if (!cfg.useDeltaStreamer) { + Option clusteringInstantOpt = writeClient.scheduleClustering(Option.empty()); + clusteringInstantOpt.ifPresent(clusteringInstant -> { + // inline cluster should auto commit as the user is never given control + log.warn("Clustering instant :: " + clusteringInstant); + writeClient.cluster(clusteringInstant, true); + }); + } else { + // TODO: fix clustering to be done async https://issues.apache.org/jira/browse/HUDI-1590 + throw new IllegalArgumentException("Clustering cannot be triggered with deltastreamer"); + } + } + + public Option scheduleCompaction(Option> previousCommitExtraMetadata) throws + Exception { + if (cfg.useDeltaStreamer) { + deltaStreamerWrapper.scheduleCompact(); + return Option.empty(); + } else { + return writeClient.scheduleCompaction(previousCommitExtraMetadata); + } + } + + public void commit(JavaRDD records, JavaRDD generatedDataStats, + Option instantTime) { + if (!cfg.useDeltaStreamer) { + Map extraMetadata = new HashMap<>(); + /** Store the checkpoint in the commit metadata just like + * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)} **/ + extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get()); + if (generatedDataStats != null && generatedDataStats.count() > 1) { + // Just stores the path where this batch of data is generated to + extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0)); + } + writeClient.commit(instantTime.get(), records, Option.of(extraMetadata)); + } + } + + public void commitCompaction(JavaRDD records, JavaRDD generatedDataStats, + Option instantTime) throws IOException { + if (!cfg.useDeltaStreamer) { + Map extraMetadata = new HashMap<>(); + /** Store the checkpoint in the commit metadata just like + * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)} **/ + extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get()); + if (generatedDataStats != null && generatedDataStats.count() > 1) { + // Just stores the path where this batch of data is generated to + extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0)); + } + HoodieSparkTable table = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext()); + HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(table, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema()); + writeClient.commitCompaction(instantTime.get(), metadata, Option.of(extraMetadata)); + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieMultiWriterTestSuiteJob.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieMultiWriterTestSuiteJob.java new file mode 100644 index 0000000000000..dea16aef5fa4b --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieMultiWriterTestSuiteJob.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.utilities.UtilHelpers; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Multi-writer test suite job to assist in testing multi-writer scenarios. This test spins up one thread per writer, as per the configuration. + * Three params are of interest to this job in addition to the regular HoodieTestSuiteJob params: + * --input-base-paths "base_path/input1,base_path/input2" + * --props-paths "file:props_path/multi-writer-1.properties,file:/props_path/multi-writer-2.properties" + * --workload-yaml-paths "file:some_path/multi-writer-1-ds.yaml,file:/some_path/multi-writer-2-sds.yaml" + * + * Each of these should have the same number of comma-separated entries. + * Each writer will generate data in the corresponding input-base-path, + * and each writer will take in its own properties path and the respective yaml file as well. + * + * Common tests: + * Writer 1 DeltaStreamer ingesting data into partitions 0 to 10, Writer 2 Spark datasource ingesting data into partitions 100 to 110. + * Multiple spark datasource writers, each writing to an exclusive set of partitions. 
+ * + * Example command + * spark-submit + * --packages org.apache.spark:spark-avro_2.11:2.4.0 + * --conf spark.task.cpus=3 + * --conf spark.executor.cores=3 + * --conf spark.task.maxFailures=100 + * --conf spark.memory.fraction=0.4 + * --conf spark.rdd.compress=true + * --conf spark.kryoserializer.buffer.max=2000m + * --conf spark.serializer=org.apache.spark.serializer.KryoSerializer + * --conf spark.memory.storageFraction=0.1 + * --conf spark.shuffle.service.enabled=true + * --conf spark.sql.hive.convertMetastoreParquet=false + * --conf spark.driver.maxResultSize=12g + * --conf spark.executor.heartbeatInterval=120s + * --conf spark.network.timeout=600s + * --conf spark.yarn.max.executor.failures=10 + * --conf spark.sql.catalogImplementation=hive + * --conf spark.driver.extraClassPath=/var/demo/jars/* + * --conf spark.executor.extraClassPath=/var/demo/jars/* + * --class org.apache.hudi.integ.testsuite.HoodieMultiWriterTestSuiteJob /opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar + * --source-ordering-field test_suite_source_ordering_field + * --use-deltastreamer + * --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output + * --input-base-paths "/user/hive/warehouse/hudi-integ-test-suite/input1,/user/hive/warehouse/hudi-integ-test-suite/input2" + * --target-table hudi_table + * --props-paths "multi-writer-1.properties,multi-writer-2.properties" + * --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider + * --source-class org.apache.hudi.utilities.sources.AvroDFSSource --input-file-size 125829120 + * --workload-yaml-paths "file:/opt/multi-writer-1-ds.yaml,file:/opt/multi-writer-2-sds.yaml" + * --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator + * --table-type COPY_ON_WRITE --compact-scheduling-minshare 1 + * --input-base-path "dummyValue" + * --workload-yaml-path "dummyValue" + * --props "dummyValue" + * --use-hudi-data-to-generate-updates + * + * Example command that works w/ docker. 
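To make the pairing of the three comma-separated lists concrete, here is a minimal, self-contained sketch (hypothetical local paths, not taken from this patch) of the split-and-validate step the job performs before spinning up one writer thread per entry:

import java.util.ArrayList;
import java.util.List;

public class MultiWriterArgsSketch {
  public static void main(String[] args) {
    // hypothetical values for --input-base-paths, --props-paths and --workload-yaml-paths
    String inputBasePaths = "/tmp/hudi-integ-test/input1,/tmp/hudi-integ-test/input2";
    String propsPaths = "file:/tmp/multi-writer-1.properties,file:/tmp/multi-writer-2.properties";
    String yamlPaths = "file:/tmp/multi-writer-1-ds.yaml,file:/tmp/multi-writer-2-sds.yaml";

    String[] inputs = inputBasePaths.split(",");
    String[] props = propsPaths.split(",");
    String[] yamls = yamlPaths.split(",");
    // all three lists must line up one-to-one, one entry per writer
    if (inputs.length != props.length || props.length != yamls.length) {
      throw new IllegalArgumentException("Input paths, property files and yaml files must have the same count");
    }
    List<String> writers = new ArrayList<>();
    for (int i = 0; i < inputs.length; i++) {
      writers.add("writer-" + i + ": input=" + inputs[i] + ", props=" + props[i] + ", yaml=" + yamls[i]);
    }
    writers.forEach(System.out::println);
  }
}

Each resulting triple corresponds to one HoodieTestSuiteJob run submitted to the job's thread pool.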
+ * + */ +public class HoodieMultiWriterTestSuiteJob { + + private static final Logger LOG = LogManager.getLogger(HoodieMultiWriterTestSuiteJob.class); + + public static void main(String[] args) throws Exception { + final HoodieMultiWriterTestSuiteConfig cfg = new HoodieMultiWriterTestSuiteConfig(); + JCommander cmd = new JCommander(cfg, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + + JavaSparkContext jssc = UtilHelpers.buildSparkContext("multi-writer-test-run-" + cfg.outputTypeName + + "-" + cfg.inputFormatName, cfg.sparkMaster); + + String[] inputPaths = cfg.inputBasePaths.split(","); + String[] yamls = cfg.workloadYamlPaths.split(","); + String[] propsFiles = cfg.propsFilePaths.split(","); + + if (inputPaths.length != yamls.length || yamls.length != propsFiles.length) { + throw new HoodieException("Input paths, property file and yaml file counts does not match "); + } + + ExecutorService executor = Executors.newFixedThreadPool(inputPaths.length); + Random random = new Random(); + + List testSuiteConfigList = new ArrayList<>(); + int jobIndex = 0; + for (String inputPath : inputPaths) { + HoodieMultiWriterTestSuiteConfig testSuiteConfig = new HoodieMultiWriterTestSuiteConfig(); + deepCopyConfigs(cfg, testSuiteConfig); + testSuiteConfig.inputBasePath = inputPath; + testSuiteConfig.workloadYamlPath = yamls[jobIndex]; + testSuiteConfig.propsFilePath = propsFiles[jobIndex]; + testSuiteConfigList.add(testSuiteConfig); + jobIndex++; + } + + AtomicBoolean jobFailed = new AtomicBoolean(false); + AtomicInteger counter = new AtomicInteger(0); + List waitTimes = new ArrayList<>(); + for (int i = 0; i < jobIndex; i++) { + if (i == 0) { + waitTimes.add(0L); + } else { + // every job after 1st, will start after 1 min + some delta. + waitTimes.add(60000L + random.nextInt(10000)); + } + } + List> completableFutureList = new ArrayList<>(); + testSuiteConfigList.forEach(hoodieTestSuiteConfig -> { + try { + // start each job at 20 seconds interval so that metaClient instantiation does not overstep + Thread.sleep(waitTimes.get(counter.get())); + LOG.info("Starting job " + hoodieTestSuiteConfig.toString()); + } catch (InterruptedException e) { + e.printStackTrace(); + } + completableFutureList.add(CompletableFuture.supplyAsync(() -> { + boolean toReturn = true; + try { + new HoodieTestSuiteJob(hoodieTestSuiteConfig, jssc, false).runTestSuite(); + LOG.info("Job completed successfully"); + } catch (Exception e) { + if (!jobFailed.getAndSet(true)) { + LOG.error("Exception thrown " + e.getMessage() + ", cause : " + e.getCause()); + throw new RuntimeException("HoodieTestSuiteJob Failed " + e.getCause() + ", and msg " + e.getMessage(), e); + } else { + LOG.info("Already a job failed. 
so, not throwing any exception "); + } + } + return toReturn; + }, executor)); + counter.getAndIncrement(); + }); + + LOG.info("Going to await until all jobs complete"); + try { + CompletableFuture completableFuture = allOfTerminateOnFailure(completableFutureList); + completableFuture.get(); + } finally { + executor.shutdownNow(); + if (jssc != null) { + LOG.info("Completed and shutting down spark context "); + LOG.info("Shutting down spark session and JavaSparkContext"); + SparkSession.builder().config(jssc.getConf()).enableHiveSupport().getOrCreate().stop(); + jssc.close(); + } + } + } + + public static CompletableFuture allOfTerminateOnFailure(List> futures) { + CompletableFuture failure = new CompletableFuture(); + AtomicBoolean jobFailed = new AtomicBoolean(false); + for (CompletableFuture f : futures) { + f.exceptionally(ex -> { + if (!jobFailed.getAndSet(true)) { + System.out.println("One of the job failed. Cancelling all other futures. " + ex.getCause() + ", " + ex.getMessage()); + futures.forEach(future -> future.cancel(true)); + } + return null; + }); + } + return CompletableFuture.anyOf(failure, CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))); + } + + static void deepCopyConfigs(HoodieMultiWriterTestSuiteConfig globalConfig, HoodieMultiWriterTestSuiteConfig tableConfig) { + tableConfig.enableHiveSync = globalConfig.enableHiveSync; + tableConfig.enableMetaSync = globalConfig.enableMetaSync; + tableConfig.schemaProviderClassName = globalConfig.schemaProviderClassName; + tableConfig.sourceOrderingField = globalConfig.sourceOrderingField; + tableConfig.sourceClassName = globalConfig.sourceClassName; + tableConfig.tableType = globalConfig.tableType; + tableConfig.targetTableName = globalConfig.targetTableName; + tableConfig.operation = globalConfig.operation; + tableConfig.sourceLimit = globalConfig.sourceLimit; + tableConfig.checkpoint = globalConfig.checkpoint; + tableConfig.continuousMode = globalConfig.continuousMode; + tableConfig.filterDupes = globalConfig.filterDupes; + tableConfig.payloadClassName = globalConfig.payloadClassName; + tableConfig.forceDisableCompaction = globalConfig.forceDisableCompaction; + tableConfig.maxPendingCompactions = globalConfig.maxPendingCompactions; + tableConfig.maxPendingClustering = globalConfig.maxPendingClustering; + tableConfig.minSyncIntervalSeconds = globalConfig.minSyncIntervalSeconds; + tableConfig.transformerClassNames = globalConfig.transformerClassNames; + tableConfig.commitOnErrors = globalConfig.commitOnErrors; + tableConfig.compactSchedulingMinShare = globalConfig.compactSchedulingMinShare; + tableConfig.compactSchedulingWeight = globalConfig.compactSchedulingWeight; + tableConfig.deltaSyncSchedulingMinShare = globalConfig.deltaSyncSchedulingMinShare; + tableConfig.deltaSyncSchedulingWeight = globalConfig.deltaSyncSchedulingWeight; + tableConfig.sparkMaster = globalConfig.sparkMaster; + tableConfig.workloadDagGenerator = globalConfig.workloadDagGenerator; + tableConfig.outputTypeName = globalConfig.outputTypeName; + tableConfig.inputFormatName = globalConfig.inputFormatName; + tableConfig.inputParallelism = globalConfig.inputParallelism; + tableConfig.useDeltaStreamer = globalConfig.useDeltaStreamer; + tableConfig.cleanInput = globalConfig.cleanInput; + tableConfig.cleanOutput = globalConfig.cleanOutput; + tableConfig.targetBasePath = globalConfig.targetBasePath; + } + + public static class HoodieMultiWriterTestSuiteConfig extends HoodieTestSuiteJob.HoodieTestSuiteConfig { + + @Parameter(names = 
{"--input-base-paths"}, description = "base paths for input data" + + "(Will be created if did not exist first time around. If exists, more data will be added to that path)", + required = true) + public String inputBasePaths; + + @Parameter(names = { + "--workload-yaml-paths"}, description = "Workflow Dag yaml path to generate the workload") + public String workloadYamlPaths; + + @Parameter(names = { + "--props-paths"}, description = "Workflow Dag yaml path to generate the workload") + public String propsFilePaths; + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java index 7b3324e4b569e..b911f116d3794 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteJob.java @@ -18,40 +18,56 @@ package org.apache.hudi.integ.testsuite; -import com.beust.jcommander.JCommander; -import com.beust.jcommander.Parameter; -import java.io.IOException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hudi.DataSourceUtils; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.DagUtils; import org.apache.hudi.integ.testsuite.dag.WorkflowDag; import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator; +import org.apache.hudi.integ.testsuite.dag.WriterContext; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode; import org.apache.hudi.integ.testsuite.dag.scheduler.DagScheduler; +import org.apache.hudi.integ.testsuite.dag.scheduler.SaferSchemaDagScheduler; +import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider; +import org.apache.hudi.integ.testsuite.helpers.ZookeeperServiceProvider; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; -import org.apache.hudi.integ.testsuite.dag.WriterContext; import org.apache.hudi.keygen.BuiltinKeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; -import org.apache.hudi.utilities.schema.SchemaProvider; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; + /** - * This is the entry point for running a Hudi Test Suite. Although this class has similarities with - * {@link HoodieDeltaStreamer} this class does not extend it since do not want to create a dependency on the changes in - * DeltaStreamer. + * This is the entry point for running a Hudi Test Suite. Although this class has similarities with {@link HoodieDeltaStreamer} this class does not extend it since do not want to create a dependency + * on the changes in DeltaStreamer. */ public class HoodieTestSuiteJob { @@ -79,22 +95,35 @@ public class HoodieTestSuiteJob { */ private transient HiveConf hiveConf; + private boolean stopJsc = true; private BuiltinKeyGenerator keyGenerator; + private transient HoodieTableMetaClient metaClient; public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throws IOException { + this(cfg, jsc, true); + } + + public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc, boolean stopJsc) throws IOException { + log.warn("Running spark job w/ app id " + jsc.sc().applicationId()); this.cfg = cfg; this.jsc = jsc; + this.stopJsc = stopJsc; cfg.propsFilePath = FSUtils.addSchemeIfLocalPath(cfg.propsFilePath).toString(); - this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); + this.sparkSession = SparkSession.builder().config(jsc.getConf()).enableHiveSupport().getOrCreate(); this.fs = FSUtils.getFs(cfg.inputBasePath, jsc.hadoopConfiguration()); - this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); + this.props = UtilHelpers.readConfig(fs.getConf(), new Path(cfg.propsFilePath), cfg.configs).getProps(); log.info("Creating workload generator with configs : {}", props.toString()); this.hiveConf = getDefaultHiveConf(jsc.hadoopConfiguration()); - this.keyGenerator = (BuiltinKeyGenerator) DataSourceUtils.createKeyGenerator(props); + this.keyGenerator = (BuiltinKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(props); if (!fs.exists(new Path(cfg.targetBasePath))) { - HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), cfg.targetBasePath, - HoodieTableType.valueOf(cfg.tableType), cfg.targetTableName, "archived"); + metaClient = HoodieTableMetaClient.withPropertyBuilder() + .setTableType(cfg.tableType) + .setTableName(cfg.targetTableName) + .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) + .initTable(jsc.hadoopConfiguration(), cfg.targetBasePath); + } else { + metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(cfg.targetBasePath).build(); } if (cfg.cleanInput) { @@ -112,6 +141,28 @@ public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throw } } + int getSchemaVersionFromCommit(int nthCommit) throws Exception { + int version = 0; + try { + HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitsTimeline(); + // Pickup the schema version from nth commit from last (most recent insert/upsert will be rolled back). 
+ HoodieInstant prevInstant = timeline.nthFromLastInstant(nthCommit).get(); + HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(prevInstant).get(), + HoodieCommitMetadata.class); + Map extraMetadata = commit.getExtraMetadata(); + String avroSchemaStr = extraMetadata.get(HoodieCommitMetadata.SCHEMA_KEY); + Schema avroSchema = new Schema.Parser().parse(avroSchemaStr); + version = Integer.parseInt(avroSchema.getObjectProp("schemaVersion").toString()); + // DAG will generate & ingest data for 2 versions (n-th version being validated, n-1). + log.info(String.format("Last used schemaVersion from latest commit file was %d. Optimizing the DAG.", version)); + } catch (Exception e) { + // failed to open the commit to read schema version. + // continue executing the DAG without any changes. + log.info("Last used schemaVersion could not be validated from commit file. Skipping SaferSchema Optimization."); + } + return version; + } + private static HiveConf getDefaultHiveConf(Configuration cfg) { HiveConf hiveConf = new HiveConf(); hiveConf.addResource(cfg); @@ -128,34 +179,77 @@ public static void main(String[] args) throws Exception { JavaSparkContext jssc = UtilHelpers.buildSparkContext("workload-generator-" + cfg.outputTypeName + "-" + cfg.inputFormatName, cfg.sparkMaster); - new HoodieTestSuiteJob(cfg, jssc).runTestSuite(); + new HoodieTestSuiteJob(cfg, jssc, true).runTestSuite(); } public WorkflowDag createWorkflowDag() throws IOException { WorkflowDag workflowDag = this.cfg.workloadYamlPath == null ? ((WorkflowDagGenerator) ReflectionUtils - .loadClass((this.cfg).workloadDagGenerator)).build() - : DagUtils.convertYamlPathToDag( - FSUtils.getFs(this.cfg.workloadYamlPath, jsc.hadoopConfiguration(), true), - this.cfg.workloadYamlPath); + .loadClass((this.cfg).workloadDagGenerator)).build() + : DagUtils.convertYamlPathToDag( + FSUtils.getFs(this.cfg.workloadYamlPath, jsc.hadoopConfiguration(), true), + this.cfg.workloadYamlPath); return workflowDag; } public void runTestSuite() { + WriterContext writerContext = null; try { WorkflowDag workflowDag = createWorkflowDag(); log.info("Workflow Dag => " + DagUtils.convertDagToYaml(workflowDag)); long startTime = System.currentTimeMillis(); - WriterContext writerContext = new WriterContext(jsc, props, cfg, keyGenerator, sparkSession); + writerContext = new WriterContext(jsc, props, cfg, keyGenerator, sparkSession); writerContext.initContext(jsc); - DagScheduler dagScheduler = new DagScheduler(workflowDag, writerContext); - dagScheduler.schedule(); + startOtherServicesIfNeeded(writerContext); + if (this.cfg.saferSchemaEvolution) { + int numRollbacks = 2; // rollback most recent upsert/insert, by default. 
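As a side note on how the schema version round-trips: below is a minimal sketch (hypothetical schema string, not the test suite's actual schema) of reading a custom top-level "schemaVersion" property back from an Avro schema, which is what getSchemaVersionFromCommit does with the writer schema stored in the commit metadata.

import org.apache.avro.Schema;

public class SchemaVersionSketch {
  public static void main(String[] args) {
    String avroSchemaStr = "{\"type\":\"record\",\"name\":\"triprec\",\"schemaVersion\":2,"
        + "\"fields\":[{\"name\":\"_row_key\",\"type\":\"string\"}]}";
    Schema avroSchema = new Schema.Parser().parse(avroSchemaStr);
    // unknown top-level attributes are kept as object props and can be read back after parsing
    int version = Integer.parseInt(avroSchema.getObjectProp("schemaVersion").toString());
    System.out.println("schemaVersion = " + version); // prints: schemaVersion = 2
  }
}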
+ // if root is RollbackNode, get num_rollbacks + List root = workflowDag.getNodeList(); + if (!root.isEmpty() && root.get(0) instanceof RollbackNode) { + numRollbacks = root.get(0).getConfig().getNumRollbacks(); + } + + int version = getSchemaVersionFromCommit(numRollbacks - 1); + SaferSchemaDagScheduler dagScheduler = new SaferSchemaDagScheduler(workflowDag, writerContext, jsc, version); + dagScheduler.schedule(); + } else { + DagScheduler dagScheduler = new DagScheduler(workflowDag, writerContext, jsc); + dagScheduler.schedule(); + } log.info("Finished scheduling all tasks, Time taken {}", System.currentTimeMillis() - startTime); } catch (Exception e) { log.error("Failed to run Test Suite ", e); throw new HoodieException("Failed to run Test Suite ", e); } finally { + if (writerContext != null) { + writerContext.shutdownResources(); + } + if (stopJsc) { + stopQuietly(); + } + } + } + + protected void stopQuietly() { + try { sparkSession.stop(); jsc.stop(); + } catch (Exception e) { + log.error("Unable to stop spark session", e); + } + } + + private void startOtherServicesIfNeeded(WriterContext writerContext) throws Exception { + if (cfg.startHiveMetastore) { + HiveServiceProvider hiveServiceProvider = new HiveServiceProvider( + Config.newBuilder().withHiveLocal(true).build()); + hiveServiceProvider.startLocalHiveServiceIfNeeded(writerContext.getHoodieTestSuiteWriter().getConfiguration()); + hiveServiceProvider.syncToLocalHiveIfNeeded(writerContext.getHoodieTestSuiteWriter()); + } + + if (cfg.startZookeeper) { + ZookeeperServiceProvider zookeeperServiceProvider = new ZookeeperServiceProvider(Config.newBuilder().withHiveLocal(true).build(), + writerContext.getHoodieTestSuiteWriter().getConfiguration()); + zookeeperServiceProvider.startLocalZookeeperIfNeeded(); } } @@ -208,5 +302,53 @@ public static class HoodieTestSuiteConfig extends HoodieDeltaStreamer.Config { @Parameter(names = {"--clean-output"}, description = "Clean the output folders and delete all files within it " + "before starting the Job") public Boolean cleanOutput = false; + + @Parameter(names = {"--saferSchemaEvolution"}, description = "Optimize the DAG for safer schema evolution." + + "(If not provided, assumed to be false.)", + required = false) + public Boolean saferSchemaEvolution = false; + + @Parameter(names = {"--start-zookeeper"}, description = "Start Zookeeper instance to use for optimistic lock ") + public Boolean startZookeeper = false; + + @Parameter(names = {"--start-hive-metastore"}, description = "Start Hive Metastore to use for optimistic lock ") + public Boolean startHiveMetastore = false; + + @Parameter(names = {"--use-hudi-data-to-generate-updates"}, description = "Use data from hudi to generate updates for new batches ") + public Boolean useHudiToGenerateUpdates = false; + + @Parameter(names = {"--test-continuous-mode"}, description = "Tests continuous mode in deltastreamer.") + public Boolean testContinousMode = false; + + @Parameter(names = {"--enable-presto-validation"}, description = "Enables presto validation") + public Boolean enablePrestoValidation = false; + + @Parameter(names = {"--presto-jdbc-url"}, description = "Presto JDBC URL in the format jdbc:presto://:// " + + "e.g. 
URL to connect to Presto running on localhost port 8080 with the catalog `hive` and the schema `sales`: " + + "jdbc:presto://localhost:8080/hive/sales") + public String prestoJdbcUrl = EMPTY_STRING; + + @Parameter(names = {"--presto-jdbc-username"}, description = "Username to use for authentication") + public String prestoUsername = "test"; + + @Parameter(names = {"--presto-jdbc-password"}, description = "Password corresponding to the username to use for authentication") + public String prestoPassword; + + @Parameter(names = {"--trino-jdbc-url"}, description = "Trino JDBC URL in the format jdbc:trino://:// " + + "e.g. URL to connect to Trino running on localhost port 8080 with the catalog `hive` and the schema `sales`: " + + "jdbc:trino://localhost:8080/hive/sales") + public String trinoJdbcUrl = EMPTY_STRING; + + @Parameter(names = {"--trino-jdbc-username"}, description = "Username to use for authentication") + public String trinoUsername = "test"; + + @Parameter(names = {"--trino-jdbc-password"}, description = "Password corresponding to the username to use for authentication") + public String trinoPassword; + + @Parameter(names = {"--index-type"}, description = "Index type to use for writes") + public String indexType = "SIMPLE"; + + @Parameter(names = {"--enable-metadata-on-read"}, description = "Enable's metadata for queries") + public Boolean enableMetadataOnRead = HoodieMetadataConfig.ENABLE.defaultValue(); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java index bf6fca70ecea4..75d3fd94101f3 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java @@ -18,73 +18,66 @@ package org.apache.hudi.integ.testsuite; -import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodiePayloadConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode; import org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode; import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; import org.apache.hudi.utilities.schema.SchemaProvider; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.Serializable; import java.util.Arrays; -import 
java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Properties; import java.util.Set; -/** - * A writer abstraction for the Hudi test suite. This class wraps different implementations of writers used to perform - * write operations into the target hudi dataset. Current supported writers are {@link HoodieDeltaStreamerWrapper} - * and {@link SparkRDDWriteClient}. - */ -public class HoodieTestSuiteWriter { - - private HoodieDeltaStreamerWrapper deltaStreamerWrapper; - private SparkRDDWriteClient writeClient; - protected HoodieTestSuiteConfig cfg; - private Option lastCheckpoint; - private HoodieReadClient hoodieReadClient; - private Properties props; - private String schema; - private transient Configuration configuration; - private transient JavaSparkContext sparkContext; - private static Set VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE = new HashSet<>( +public abstract class HoodieTestSuiteWriter implements Serializable { + + private static Logger log = LoggerFactory.getLogger(HoodieTestSuiteWriter.class); + + protected HoodieDeltaStreamerWrapper deltaStreamerWrapper; + protected HoodieWriteConfig writeConfig; + protected SparkRDDWriteClient writeClient; + protected HoodieTestSuiteJob.HoodieTestSuiteConfig cfg; + protected Option lastCheckpoint; + protected HoodieReadClient hoodieReadClient; + protected Properties props; + protected String schema; + protected transient Configuration configuration; + protected transient JavaSparkContext sparkContext; + protected static Set VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE = new HashSet<>( Arrays.asList(RollbackNode.class.getName(), CleanNode.class.getName(), ScheduleCompactNode.class.getName())); - private static final String GENERATED_DATA_PATH = "generated.data.path"; - - public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema) throws - Exception { - this(jsc, props, cfg, schema, true); - } - public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema, - boolean rollbackInflight) throws Exception { + public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteJob.HoodieTestSuiteConfig cfg, String schema) throws Exception { // We ensure that only 1 instance of HoodieWriteClient is instantiated for a HoodieTestSuiteWriter // This does not instantiate a HoodieWriteClient until a // {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} is invoked. 
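The comment above promises a single, lazily created HoodieWriteClient. As a generic illustration of that pattern only (not this class's actual code, which guards the creation with a synchronized block in getWriteClient), a lazy holder could look like this:

import java.util.function.Supplier;

public class LazyClientHolder<T> {
  private final Supplier<T> factory;
  private volatile T client;

  public LazyClientHolder(Supplier<T> factory) {
    this.factory = factory;
  }

  public T get() {
    if (client == null) {          // fast path once the client exists
      synchronized (this) {
        if (client == null) {      // re-check under the lock
          client = factory.get();  // created at most once
        }
      }
    }
    return client;
  }
}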
HoodieSparkEngineContext context = new HoodieSparkEngineContext(jsc); this.deltaStreamerWrapper = new HoodieDeltaStreamerWrapper(cfg, jsc); this.hoodieReadClient = new HoodieReadClient(context, cfg.targetBasePath); + this.writeConfig = getHoodieClientConfig(cfg, props, schema); if (!cfg.useDeltaStreamer) { - this.writeClient = new SparkRDDWriteClient(context, getHoodieClientConfig(cfg, props, schema), rollbackInflight); + this.writeClient = new SparkRDDWriteClient(context, writeConfig); } this.cfg = cfg; this.configuration = jsc.hadoopConfiguration(); @@ -93,13 +86,21 @@ public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestS this.schema = schema; } - private HoodieWriteConfig getHoodieClientConfig(HoodieTestSuiteConfig cfg, Properties props, String schema) { + public HoodieWriteConfig getWriteConfig() { + return this.writeConfig; + } + + private HoodieWriteConfig getHoodieClientConfig(HoodieTestSuiteJob.HoodieTestSuiteConfig cfg, Properties props, String schema) { + HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath) .withAutoCommit(false) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build()) + .withPayloadConfig(HoodiePayloadConfig.newBuilder() + .withPayloadOrderingField(cfg.sourceOrderingField) + .withPayloadClass(cfg.payloadClassName) + .build()) .forTable(cfg.targetTableName) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.valueOf(cfg.indexType)).build()) .withProps(props); builder = builder.withSchema(schema); return builder.build(); @@ -112,92 +113,35 @@ private boolean allowWriteClientAccess(DagNode dagNode) { return false; } - public Pair>> fetchSource() throws Exception { - return this.deltaStreamerWrapper.fetchSource(); - } + public abstract void shutdownResources(); - public Option startCommit() { - if (cfg.useDeltaStreamer) { - return Option.of(HoodieActiveTimeline.createNewInstantTime()); - } else { - return Option.of(writeClient.startCommit()); - } - } + public abstract RDD getNextBatch() throws Exception; - public JavaRDD upsert(Option instantTime) throws Exception { - if (cfg.useDeltaStreamer) { - return deltaStreamerWrapper.upsert(WriteOperationType.UPSERT); - } else { - Pair>> nextBatch = fetchSource(); - lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); - return writeClient.upsert(nextBatch.getRight().getRight(), instantTime.get()); - } - } + public abstract Pair>> fetchSource() throws Exception; - public JavaRDD insert(Option instantTime) throws Exception { - if (cfg.useDeltaStreamer) { - return deltaStreamerWrapper.insert(); - } else { - Pair>> nextBatch = fetchSource(); - lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); - return writeClient.insert(nextBatch.getRight().getRight(), instantTime.get()); - } - } + public abstract Option startCommit(); - public JavaRDD bulkInsert(Option instantTime) throws Exception { - if (cfg.useDeltaStreamer) { - return deltaStreamerWrapper.bulkInsert(); - } else { - Pair>> nextBatch = fetchSource(); - lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); - return writeClient.bulkInsert(nextBatch.getRight().getRight(), instantTime.get()); - } - } + public abstract JavaRDD upsert(Option instantTime) throws Exception; - public JavaRDD compact(Option instantTime) throws Exception { - if (cfg.useDeltaStreamer) { - return 
deltaStreamerWrapper.compact(); - } else { - if (!instantTime.isPresent()) { - Option> compactionPlanPair = Option - .fromJavaOptional(hoodieReadClient.getPendingCompactions() - .stream().findFirst()); - if (compactionPlanPair.isPresent()) { - instantTime = Option.of(compactionPlanPair.get().getLeft()); - } - } - if (instantTime.isPresent()) { - return (JavaRDD) writeClient.compact(instantTime.get()); - } else { - return null; - } - } - } + public abstract JavaRDD insert(Option instantTime) throws Exception; - public Option scheduleCompaction(Option> previousCommitExtraMetadata) throws - Exception { - if (!cfg.useDeltaStreamer) { - deltaStreamerWrapper.scheduleCompact(); - return Option.empty(); - } else { - return writeClient.scheduleCompaction(previousCommitExtraMetadata); - } - } + public abstract JavaRDD insertOverwrite(Option instantTime) throws Exception; - public void commit(JavaRDD records, JavaRDD generatedDataStats, - Option instantTime) { - if (!cfg.useDeltaStreamer) { - Map extraMetadata = new HashMap<>(); - /** Store the checkpoint in the commit metadata just like - * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)} **/ - extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get()); - if (generatedDataStats != null) { - // Just stores the path where this batch of data is generated to - extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0)); - } - writeClient.commit(instantTime.get(), records, Option.of(extraMetadata)); - } - } + public abstract JavaRDD insertOverwriteTable(Option instantTime) throws Exception; + + public abstract JavaRDD bulkInsert(Option instantTime) throws Exception; + + public abstract JavaRDD compact(Option instantTime) throws Exception; + + public abstract void inlineClustering() throws Exception; + + public abstract Option scheduleCompaction(Option> previousCommitExtraMetadata) throws Exception; + + public abstract void commit(JavaRDD records, JavaRDD generatedDataStats, + Option instantTime); + + public abstract void commitCompaction(JavaRDD records, JavaRDD generatedDataStats, + Option instantTime) throws Exception; public SparkRDDWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessException { if (cfg.useDeltaStreamer & !allowWriteClientAccess(dagNode)) { @@ -205,7 +149,7 @@ public SparkRDDWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessE } synchronized (this) { if (writeClient == null) { - this.writeClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(this.sparkContext), getHoodieClientConfig(cfg, props, schema), false); + this.writeClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(this.sparkContext), getHoodieClientConfig(cfg, props, schema)); } } return writeClient; @@ -215,7 +159,7 @@ public HoodieDeltaStreamerWrapper getDeltaStreamerWrapper() { return deltaStreamerWrapper; } - public HoodieTestSuiteConfig getCfg() { + public HoodieTestSuiteJob.HoodieTestSuiteConfig getCfg() { return cfg; } @@ -230,4 +174,13 @@ public JavaSparkContext getSparkContext() { public Option getLastCheckpoint() { return lastCheckpoint; } + + public Properties getProps() { + return props; + } + + public String getSchema() { + return schema; + } } + diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngestTool.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngestTool.java new file mode 100644 index 0000000000000..d8a4bbe7dac61 --- /dev/null 
+++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngestTool.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.utilities.HoodieRepairTool; +import org.apache.hudi.utilities.IdentitySplitter; +import org.apache.hudi.utilities.UtilHelpers; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +/** + * Sample command + * + * ./bin/spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.4 --driver-memory 4g --executor-memory 4g \ + * --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.catalogImplementation=hive \ + * --class org.apache.hudi.integ.testsuite.SparkDSContinuousIngestTool \ + * ${HUDI_ROOT_DIR}/packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \ + * --source-path file:${SOURCE_DIR}/spark_ds_continuous --checkpoint-file-path /tmp/hudi/checkpoint \ + * --base-path file:///tmp/hudi/tbl_path/ --props /tmp/hudi_props.out + * + * Contents of hudi.properties + * + * hoodie.insert.shuffle.parallelism=4 + * hoodie.upsert.shuffle.parallelism=4 + * hoodie.bulkinsert.shuffle.parallelism=4 + * hoodie.delete.shuffle.parallelism=4 + * hoodie.datasource.write.recordkey.field=VendorID + * hoodie.datasource.write.partitionpath.field=date_col + * hoodie.datasource.write.operation=upsert + * hoodie.datasource.write.precombine.field=tpep_pickup_datetime + * hoodie.metadata.enable=false + * hoodie.table.name=hudi_tbl + */ + +public class SparkDataSourceContinuousIngestTool { + + private static final Logger LOG = LogManager.getLogger(SparkDataSourceContinuousIngestTool.class); + + private final Config cfg; + // Properties with source, hoodie client, key generator etc. 
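Besides the spark-submit invocation shown in the javadoc above, the tool can also be driven programmatically. A hypothetical sketch (placeholder paths; it assumes the classes added in this patch are on the classpath) that mirrors what main() does after parsing the flags:

import org.apache.hudi.utilities.UtilHelpers;
import org.apache.spark.api.java.JavaSparkContext;

public class ContinuousIngestLauncherSketch {
  public static void main(String[] args) {
    SparkDataSourceContinuousIngestTool.Config cfg = new SparkDataSourceContinuousIngestTool.Config();
    cfg.sourcePath = "file:///tmp/spark_ds_continuous";     // --source-path
    cfg.checkpointFilePath = "file:///tmp/hudi/checkpoint"; // --checkpoint-file-path
    cfg.basePath = "file:///tmp/hudi/tbl_path";             // --base-path
    cfg.propsFilePath = "file:///tmp/hudi_props.out";       // --props
    cfg.sparkMaster = "local[2]";

    JavaSparkContext jsc = UtilHelpers.buildSparkContext(
        "spark-datasource-continuous-ingestion-tool", cfg.sparkMaster, cfg.sparkMemory);
    try {
      new SparkDataSourceContinuousIngestTool(jsc, cfg).run();
    } finally {
      jsc.stop();
    }
  }
}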
+ private TypedProperties props; + private HoodieSparkEngineContext context; + private SparkSession sparkSession; + + public SparkDataSourceContinuousIngestTool(JavaSparkContext jsc, Config cfg) { + if (cfg.propsFilePath != null) { + cfg.propsFilePath = FSUtils.addSchemeIfLocalPath(cfg.propsFilePath).toString(); + } + this.context = new HoodieSparkEngineContext(jsc); + this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); + this.cfg = cfg; + this.props = cfg.propsFilePath == null + ? UtilHelpers.buildProperties(cfg.configs) + : readConfigFromFileSystem(jsc, cfg); + } + + public static void main(String[] args) { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, null, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + final JavaSparkContext jsc = UtilHelpers.buildSparkContext("spark-datasource-continuous-ingestion-tool", cfg.sparkMaster, cfg.sparkMemory); + try { + new SparkDataSourceContinuousIngestTool(jsc, cfg).run(); + } catch (Throwable throwable) { + LOG.error("Fail to run Continuous Ingestion for spark datasource " + cfg.basePath, throwable); + } finally { + jsc.stop(); + } + } + + public void run() { + try { + SparkDataSourceContinuousIngest sparkDataSourceContinuousIngest = + new SparkDataSourceContinuousIngest(sparkSession, context.getHadoopConf().get(), new Path(cfg.sourcePath), cfg.sparkFormat, + new Path(cfg.checkpointFilePath), new Path(cfg.basePath), getPropsAsMap(props), + cfg.minSyncIntervalSeconds); + sparkDataSourceContinuousIngest.startIngestion(); + } finally { + sparkSession.stop(); + context.getJavaSparkContext().stop(); + } + } + + private Map getPropsAsMap(TypedProperties typedProperties) { + Map props = new HashMap<>(); + typedProperties.entrySet().forEach(entry -> props.put(entry.getKey().toString(), entry.getValue().toString())); + return props; + } + + /** + * Reads config from the file system. + * + * @param jsc {@link JavaSparkContext} instance. + * @param cfg {@link HoodieRepairTool.Config} instance. + * @return the {@link TypedProperties} instance. 
+ */ + private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) { + return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs) + .getProps(true); + } + + public static class Config implements Serializable { + @Parameter(names = {"--source-path", "-sp"}, description = "Source path for the parquet data to consume", required = true) + public String sourcePath = null; + @Parameter(names = {"--source-format", "-sf"}, description = "source data format", required = false) + public String sparkFormat = "parquet"; + @Parameter(names = {"--checkpoint-file-path", "-cpf"}, description = "Checkpoint file path to store/fetch checkpointing info", required = true) + public String checkpointFilePath = null; + @Parameter(names = {"--base-path", "-bp"}, description = "Base path for the hudi table", required = true) + public String basePath = null; + @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) + public String sparkMaster = null; + @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false) + public String sparkMemory = "1g"; + @Parameter(names = {"--min-sync-interval-seconds"}, + description = "the min sync interval of each sync in continuous mode") + public Integer minSyncIntervalSeconds = 0; + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " + + "hoodie client for table repair") + public String propsFilePath = null; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. 
This can be repeated", + splitter = IdentitySplitter.class) + public List configs = new ArrayList<>(); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DFSDeltaConfig.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DFSDeltaConfig.java index 0ac36687f485c..771127c623eaa 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DFSDeltaConfig.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DFSDeltaConfig.java @@ -40,11 +40,12 @@ public class DFSDeltaConfig extends DeltaConfig { private int inputParallelism; // Whether to delete older input data once it has been ingested private boolean deleteOldInputData; + private boolean useHudiToGenerateUpdates; public DFSDeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType, SerializableConfiguration configuration, String deltaBasePath, String targetBasePath, String schemaStr, Long maxFileSize, - int inputParallelism, boolean deleteOldInputData) { + int inputParallelism, boolean deleteOldInputData, boolean useHudiToGenerateUpdates) { super(deltaOutputMode, deltaInputType, configuration); this.deltaBasePath = deltaBasePath; this.schemaStr = schemaStr; @@ -52,6 +53,7 @@ public DFSDeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInput this.datasetOutputPath = targetBasePath; this.inputParallelism = inputParallelism; this.deleteOldInputData = deleteOldInputData; + this.useHudiToGenerateUpdates = useHudiToGenerateUpdates; } public String getDeltaBasePath() { @@ -85,4 +87,8 @@ public int getInputParallelism() { public boolean shouldDeleteOldInputData() { return deleteOldInputData; } + + public boolean shouldUseHudiToGenerateUpdates() { + return useHudiToGenerateUpdates; + } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java index 81f406be3cc32..03182d2784b25 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java @@ -19,6 +19,7 @@ package org.apache.hudi.integ.testsuite.configuration; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; @@ -69,9 +70,14 @@ public static class Config { public static final String TYPE = "type"; public static final String NODE_NAME = "name"; public static final String DEPENDENCIES = "deps"; + public static final String NO_DEPENDENCY_VALUE = "none"; public static final String CHILDREN = "children"; public static final String HIVE_QUERIES = "hive_queries"; public static final String HIVE_PROPERTIES = "hive_props"; + public static final String PRESTO_QUERIES = "presto_queries"; + public static final String PRESTO_PROPERTIES = "presto_props"; + public static final String TRINO_QUERIES = "trino_queries"; + public static final String TRINO_PROPERTIES = "trino_props"; private static String NUM_RECORDS_INSERT = "num_records_insert"; private static String NUM_RECORDS_UPSERT = "num_records_upsert"; private static String NUM_RECORDS_DELETE = "num_records_delete"; @@ -87,6 +93,48 @@ public static class Config { private static String 
HIVE_LOCAL = "hive_local"; private static String REINIT_CONTEXT = "reinitialize_context"; private static String START_PARTITION = "start_partition"; + private static String DELETE_INPUT_DATA = "delete_input_data"; + private static String VALIDATE_HIVE = "validate_hive"; + private static String VALIDATE_ONCE_EVERY_ITR = "validate_once_every_itr"; + private static String EXECUTE_ITR_COUNT = "execute_itr_count"; + private static String VALIDATE_ARCHIVAL = "validate_archival"; + private static String VALIDATE_CLEAN = "validate_clean"; + private static String SCHEMA_VERSION = "schema_version"; + private static String NUM_ROLLBACKS = "num_rollbacks"; + private static String ENABLE_ROW_WRITING = "enable_row_writing"; + private static String ENABLE_METADATA_VALIDATE = "enable_metadata_validate"; + private static String VALIDATE_FULL_DATA = "validate_full_data"; + private static String DELETE_INPUT_DATA_EXCEPT_LATEST = "delete_input_data_except_latest"; + private static String PARTITIONS_TO_DELETE = "partitions_to_delete"; + private static String INPUT_PARTITIONS_TO_SKIP_VALIDATE = "input_partitions_to_skip_validate"; + private static String MAX_WAIT_TIME_FOR_DELTASTREAMER_TO_CATCH_UP_MS = "max_wait_time_for_deltastreamer_catch_up_ms"; + + // Spark SQL Create Table + private static String TABLE_TYPE = "table_type"; + private static String IS_EXTERNAL = "is_external"; + private static String USE_CTAS = "use_ctas"; + private static String PRIMARY_KEY = "primary_key"; + private static String PRE_COMBINE_FIELD = "pre_combine_field"; + private static String PARTITION_FIELD = "partition_field"; + // Spark SQL Merge + private static String MERGE_CONDITION = "merge_condition"; + private static String DEFAULT_MERGE_CONDITION = "target._row_key = source._row_key"; + private static String MERGE_MATCHED_ACTION = "matched_action"; + private static String DEFAULT_MERGE_MATCHED_ACTION = "update set *"; + private static String MERGE_NOT_MATCHED_ACTION = "not_matched_action"; + private static String DEFAULT_MERGE_NOT_MATCHED_ACTION = "insert *"; + // Spark SQL Update + // column to update. The logic is fixed, i.e., to do "fare = fare * 1.6". to be fixed. + private static String UPDATE_COLUMN = "update_column"; + private static String DEFAULT_UPDATE_COLUMN = "fare"; + private static String WHERE_CONDITION_COLUMN = "condition_column"; + // the where condition expression is like "begin_lon between 0.1 and 0.2" + // the value range is determined by the ratio of records to update or delete + // only support numeric type column for now + private static String DEFAULT_WHERE_CONDITION_COLUMN = "begin_lon"; + // the ratio range is between 0.01 and 1.0. 
The ratio is approximate to the actual ratio achieved + private static String RATIO_RECORDS_CHANGE = "ratio_records_change"; + private static double DEFAULT_RATIO_RECORDS_CHANGE = 0.5; private Map configsMap; @@ -114,6 +162,10 @@ public int getRecordSize() { return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString()); } + public boolean isEnableMetadataValidate() { + return Boolean.valueOf(configsMap.getOrDefault(ENABLE_METADATA_VALIDATE, false).toString()); + } + public int getNumInsertPartitions() { return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString()); } @@ -126,6 +178,14 @@ public int getNumUpsertPartitions() { return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_UPSERT, 0).toString()); } + public int getSchemaVersion() { + return Integer.valueOf(configsMap.getOrDefault(SCHEMA_VERSION, Integer.MAX_VALUE).toString()); + } + + public int getNumRollbacks() { + return Integer.valueOf(configsMap.getOrDefault(NUM_ROLLBACKS, 1).toString()); + } + public int getStartPartition() { return Integer.valueOf(configsMap.getOrDefault(START_PARTITION, 0).toString()); } @@ -135,7 +195,7 @@ public int getNumDeletePartitions() { } public int getNumUpsertFiles() { - return Integer.valueOf(configsMap.getOrDefault(NUM_FILES_UPSERT, 0).toString()); + return Integer.valueOf(configsMap.getOrDefault(NUM_FILES_UPSERT, 1).toString()); } public double getFractionUpsertPerFile() { @@ -150,10 +210,110 @@ public boolean isDisableIngest() { return Boolean.valueOf(configsMap.getOrDefault(DISABLE_INGEST, false).toString()); } + public String getPartitionsToDelete() { + return configsMap.getOrDefault(PARTITIONS_TO_DELETE, "").toString(); + } + public boolean getReinitContext() { return Boolean.valueOf(configsMap.getOrDefault(REINIT_CONTEXT, false).toString()); } + public boolean isDeleteInputData() { + return Boolean.valueOf(configsMap.getOrDefault(DELETE_INPUT_DATA, false).toString()); + } + + public boolean isDeleteInputDataExceptLatest() { + return Boolean.valueOf(configsMap.getOrDefault(DELETE_INPUT_DATA_EXCEPT_LATEST, false).toString()); + } + + public boolean isValidateHive() { + return Boolean.valueOf(configsMap.getOrDefault(VALIDATE_HIVE, false).toString()); + } + + public int validateOnceEveryIteration() { + return Integer.valueOf(configsMap.getOrDefault(VALIDATE_ONCE_EVERY_ITR, 1).toString()); + } + + public String inputPartitonsToSkipWithValidate() { + return configsMap.getOrDefault(INPUT_PARTITIONS_TO_SKIP_VALIDATE, "").toString(); + } + + public boolean isValidateFullData() { + return Boolean.valueOf(configsMap.getOrDefault(VALIDATE_FULL_DATA, false).toString()); + } + + public int getIterationCountToExecute() { + return Integer.valueOf(configsMap.getOrDefault(EXECUTE_ITR_COUNT, -1).toString()); + } + + public boolean validateArchival() { + return Boolean.valueOf(configsMap.getOrDefault(VALIDATE_ARCHIVAL, false).toString()); + } + + public boolean validateClean() { + return Boolean.valueOf(configsMap.getOrDefault(VALIDATE_CLEAN, false).toString()); + } + + public boolean enableRowWriting() { + return Boolean.valueOf(configsMap.getOrDefault(ENABLE_ROW_WRITING, false).toString()); + } + + public long maxWaitTimeForDeltastreamerToCatchupMs() { + return Long.valueOf(configsMap.getOrDefault(MAX_WAIT_TIME_FOR_DELTASTREAMER_TO_CATCH_UP_MS, 5 * 60 * 1000).toString()); + } + + public Option getTableType() { + return !configsMap.containsKey(TABLE_TYPE) ? 
Option.empty() + : Option.of(configsMap.get(TABLE_TYPE).toString()); + } + + public boolean shouldUseCtas() { + return Boolean.valueOf(configsMap.getOrDefault(USE_CTAS, false).toString()); + } + + public boolean isTableExternal() { + return Boolean.valueOf(configsMap.getOrDefault(IS_EXTERNAL, false).toString()); + } + + public Option getPrimaryKey() { + return !configsMap.containsKey(PRIMARY_KEY) ? Option.empty() + : Option.of(configsMap.get(PRIMARY_KEY).toString()); + } + + public Option getPreCombineField() { + return !configsMap.containsKey(PRE_COMBINE_FIELD) ? Option.empty() + : Option.of(configsMap.get(PRE_COMBINE_FIELD).toString()); + } + + public Option getPartitionField() { + return !configsMap.containsKey(PARTITION_FIELD) ? Option.empty() + : Option.of(configsMap.get(PARTITION_FIELD).toString()); + } + + public String getMergeCondition() { + return configsMap.getOrDefault(MERGE_CONDITION, DEFAULT_MERGE_CONDITION).toString(); + } + + public String getMatchedAction() { + return configsMap.getOrDefault(MERGE_MATCHED_ACTION, DEFAULT_MERGE_MATCHED_ACTION).toString(); + } + + public String getNotMatchedAction() { + return configsMap.getOrDefault(MERGE_NOT_MATCHED_ACTION, DEFAULT_MERGE_NOT_MATCHED_ACTION).toString(); + } + + public String getUpdateColumn() { + return configsMap.getOrDefault(UPDATE_COLUMN, DEFAULT_UPDATE_COLUMN).toString(); + } + + public String getWhereConditionColumn() { + return configsMap.getOrDefault(WHERE_CONDITION_COLUMN, DEFAULT_WHERE_CONDITION_COLUMN).toString(); + } + + public double getRatioRecordsChange() { + return Double.valueOf(configsMap.getOrDefault(RATIO_RECORDS_CHANGE, DEFAULT_RATIO_RECORDS_CHANGE).toString()); + } + public Map getOtherConfigs() { if (configsMap == null) { return new HashMap<>(); @@ -163,7 +323,7 @@ public Map getOtherConfigs() { public List> getHiveQueries() { try { - return (List>) this.configsMap.getOrDefault("hive_queries", new ArrayList<>()); + return (List>) this.configsMap.getOrDefault(HIVE_QUERIES, new ArrayList<>()); } catch (Exception e) { throw new RuntimeException("unable to get hive queries from configs"); } @@ -177,6 +337,30 @@ public List getHiveProperties() { return (List) this.configsMap.getOrDefault(HIVE_PROPERTIES, new ArrayList<>()); } + public List getPrestoProperties() { + return (List) this.configsMap.getOrDefault(PRESTO_PROPERTIES, new ArrayList<>()); + } + + public List getTrinoProperties() { + return (List) this.configsMap.getOrDefault(TRINO_PROPERTIES, new ArrayList<>()); + } + + public List> getPrestoQueries() { + try { + return (List>) this.configsMap.getOrDefault(PRESTO_QUERIES, new ArrayList<>()); + } catch (Exception e) { + throw new RuntimeException("unable to get presto queries from configs"); + } + } + + public List> getTrinoQueries() { + try { + return (List>) this.configsMap.getOrDefault(TRINO_QUERIES, new ArrayList<>()); + } catch (Exception e) { + throw new RuntimeException("unable to get trino queries from configs"); + } + } + @Override public String toString() { try { @@ -223,6 +407,16 @@ public Builder withNumDeletePartitions(int numDeletePartitions) { return this; } + public Builder withSchemaVersion(int version) { + this.configsMap.put(SCHEMA_VERSION, version); + return this; + } + + public Builder withNumRollbacks(int numRollbacks) { + this.configsMap.put(NUM_ROLLBACKS, numRollbacks); + return this; + } + public Builder withNumUpsertFiles(int numUpsertFiles) { this.configsMap.put(NUM_FILES_UPSERT, numUpsertFiles); return this; @@ -283,6 +477,26 @@ public Builder withHiveProperties(List 
hiveProperties) { return this; } + public Builder withPrestoProperties(List prestoProperties) { + this.configsMap.put(PRESTO_PROPERTIES, prestoProperties); + return this; + } + + public Builder withTrinoProperties(List trinoProperties) { + this.configsMap.put(TRINO_PROPERTIES, trinoProperties); + return this; + } + + public Builder withPrestoQueryAndResults(List> prestoQueries) { + this.configsMap.put(PRESTO_QUERIES, prestoQueries); + return this; + } + + public Builder withTrinoQueryAndResults(List> trinoQueries) { + this.configsMap.put(TRINO_QUERIES, trinoQueries); + return this; + } + public Builder withConfigsMap(Map configsMap) { this.configsMap = configsMap; return this; diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/DeleteConverter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/DeleteConverter.java index f6dc08b5a05ad..d5c4860bfd0fb 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/DeleteConverter.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/DeleteConverter.java @@ -24,8 +24,6 @@ import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; -import java.util.List; - public class DeleteConverter implements Converter { private final String schemaStr; diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/UpdateConverter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/UpdateConverter.java index 24520a3626cf3..1e8acf580d962 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/UpdateConverter.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/converter/UpdateConverter.java @@ -18,12 +18,14 @@ package org.apache.hudi.integ.testsuite.converter; -import java.util.List; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.integ.testsuite.generator.LazyRecordGeneratorIterator; import org.apache.hudi.integ.testsuite.generator.UpdateGeneratorIterator; + +import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; +import java.util.List; + /** * This converter creates an update {@link GenericRecord} from an existing {@link GenericRecord}. 
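To illustrate what an "update" record amounts to, here is a small, self-contained sketch (hypothetical two-field schema, not the converter's actual implementation) that copies an existing GenericRecord and overwrites a non-key field while keeping the record key:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class UpdateRecordSketch {
  public static void main(String[] args) {
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"triprec\",\"fields\":["
            + "{\"name\":\"_row_key\",\"type\":\"string\"},"
            + "{\"name\":\"fare\",\"type\":\"double\"}]}");
    GenericRecord original = new GenericData.Record(schema);
    original.put("_row_key", "key-1");
    original.put("fare", 10.0);

    GenericRecord update = new GenericData.Record(schema);
    schema.getFields().forEach(f -> update.put(f.name(), original.get(f.name()))); // copy all fields
    update.put("fare", 16.0); // change a payload field, keep the record key intact
    System.out.println(update);
  }
}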
*/ @@ -36,7 +38,7 @@ public class UpdateConverter implements Converter private final List recordKeyFields; private final int minPayloadSize; - public UpdateConverter(String schemaStr, int minPayloadSize, List partitionPathFields, + public UpdateConverter(String schemaStr, int minPayloadSize, List partitionPathFields, List recordKeyFields) { this.schemaStr = schemaStr; this.partitionPathFields = partitionPathFields; diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java index d5358238d51a1..b5b2aaa51aa21 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java @@ -18,12 +18,28 @@ package org.apache.hudi.integ.testsuite.dag; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; + +import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.JsonSerializer; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator.Feature; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; @@ -35,19 +51,30 @@ import java.util.Map; import java.util.Map.Entry; import java.util.stream.Collectors; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; -import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; + +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.CONFIG_NAME; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.HIVE_PROPERTIES; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.HIVE_QUERIES; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.NO_DEPENDENCY_VALUE; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.PRESTO_PROPERTIES; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.PRESTO_QUERIES; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.TRINO_PROPERTIES; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.TRINO_QUERIES; /** * Utility class to SerDe workflow dag. 
*/ public class DagUtils { + public static final String DAG_NAME = "dag_name"; + public static final String DAG_ROUNDS = "dag_rounds"; + public static final String DAG_INTERMITTENT_DELAY_MINS = "dag_intermittent_delay_mins"; + public static final String DAG_CONTENT = "dag_content"; + + public static int DEFAULT_DAG_ROUNDS = 1; + public static int DEFAULT_INTERMITTENT_DELAY_MINS = 10; + public static String DEFAULT_DAG_NAME = "TestDagName"; + static final ObjectMapper MAPPER = new ObjectMapper(); /** @@ -62,15 +89,39 @@ public static WorkflowDag convertYamlPathToDag(FileSystem fs, String path) throw * Converts a YAML representation to {@link WorkflowDag}. */ public static WorkflowDag convertYamlToDag(String yaml) throws IOException { + int dagRounds = DEFAULT_DAG_ROUNDS; + int intermittentDelayMins = DEFAULT_INTERMITTENT_DELAY_MINS; + String dagName = DEFAULT_DAG_NAME; Map allNodes = new HashMap<>(); final ObjectMapper yamlReader = new ObjectMapper(new YAMLFactory()); final JsonNode jsonNode = yamlReader.readTree(yaml); Iterator> itr = jsonNode.fields(); while (itr.hasNext()) { Entry dagNode = itr.next(); - allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getKey(), dagNode.getValue())); + String key = dagNode.getKey(); + switch (key) { + case DAG_NAME: + dagName = dagNode.getValue().asText(); + break; + case DAG_ROUNDS: + dagRounds = dagNode.getValue().asInt(); + break; + case DAG_INTERMITTENT_DELAY_MINS: + intermittentDelayMins = dagNode.getValue().asInt(); + break; + case DAG_CONTENT: + JsonNode dagContent = dagNode.getValue(); + Iterator> contentItr = dagContent.fields(); + while (contentItr.hasNext()) { + Entry dagContentNode = contentItr.next(); + allNodes.put(dagContentNode.getKey(), convertJsonToDagNode(allNodes, dagContentNode.getKey(), dagContentNode.getValue())); + } + break; + default: + break; + } } - return new WorkflowDag(findRootNodes(allNodes)); + return new WorkflowDag(dagName, dagRounds, intermittentDelayMins, findRootNodes(allNodes)); } /** @@ -80,7 +131,12 @@ public static String convertDagToYaml(WorkflowDag dag) throws IOException { final ObjectMapper yamlWriter = new ObjectMapper(new YAMLFactory().disable(Feature.WRITE_DOC_START_MARKER) .enable(Feature.MINIMIZE_QUOTES).enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES)); JsonNode yamlNode = MAPPER.createObjectNode(); - convertDagToYaml(yamlNode, dag.getNodeList()); + ((ObjectNode) yamlNode).put(DAG_NAME, dag.getDagName()); + ((ObjectNode) yamlNode).put(DAG_ROUNDS, dag.getRounds()); + ((ObjectNode) yamlNode).put(DAG_INTERMITTENT_DELAY_MINS, dag.getIntermittentDelayMins()); + JsonNode dagContentNode = MAPPER.createObjectNode(); + convertDagToYaml(dagContentNode, dag.getNodeList()); + ((ObjectNode) yamlNode).put(DAG_CONTENT, dagContentNode); return yamlWriter.writerWithDefaultPrettyPrinter().writeValueAsString(yamlNode); } @@ -138,15 +194,27 @@ private static JsonNode convertDagNodeToJsonNode(DagNode node) throws IOExceptio private static Map convertJsonNodeToMap(JsonNode node) { Map configsMap = new HashMap<>(); - Iterator> itr = node.get(DeltaConfig.Config.CONFIG_NAME).fields(); + Iterator> itr = node.get(CONFIG_NAME).fields(); while (itr.hasNext()) { Entry entry = itr.next(); switch (entry.getKey()) { - case DeltaConfig.Config.HIVE_QUERIES: - configsMap.put(DeltaConfig.Config.HIVE_QUERIES, getHiveQueries(entry)); + case HIVE_QUERIES: + configsMap.put(HIVE_QUERIES, getQueries(entry)); + break; + case HIVE_PROPERTIES: + configsMap.put(HIVE_PROPERTIES, getQuerySessionProperties(entry)); break; - 
case DeltaConfig.Config.HIVE_PROPERTIES: - configsMap.put(DeltaConfig.Config.HIVE_PROPERTIES, getProperties(entry)); + case PRESTO_QUERIES: + configsMap.put(PRESTO_QUERIES, getQueries(entry)); + break; + case PRESTO_PROPERTIES: + configsMap.put(PRESTO_PROPERTIES, getQuerySessionProperties(entry)); + break; + case TRINO_QUERIES: + configsMap.put(TRINO_QUERIES, getQueries(entry)); + break; + case TRINO_PROPERTIES: + configsMap.put(TRINO_PROPERTIES, getQuerySessionProperties(entry)); break; default: configsMap.put(entry.getKey(), getValue(entry.getValue())); @@ -156,20 +224,26 @@ private static Map convertJsonNodeToMap(JsonNode node) { return configsMap; } - private static List> getHiveQueries(Entry entry) { + private static List> getQueries(Entry entry) { List> queries = new ArrayList<>(); - Iterator> queriesItr = entry.getValue().fields(); - while (queriesItr.hasNext()) { - queries.add(Pair.of(queriesItr.next().getValue().textValue(), queriesItr.next().getValue().asInt())); + try { + List flattened = new ArrayList<>(); + flattened.add(entry.getValue()); + queries = (List>) getQueryMapper().readValue(flattened.toString(), List.class); + } catch (Exception e) { + e.printStackTrace(); } return queries; } - private static List getProperties(Entry entry) { + private static List getQuerySessionProperties(Entry entry) { List properties = new ArrayList<>(); - Iterator> queriesItr = entry.getValue().fields(); - while (queriesItr.hasNext()) { - properties.add(queriesItr.next().getValue().textValue()); + try { + List flattened = new ArrayList<>(); + flattened.add(entry.getValue()); + properties = (List) getQueryEnginePropertyMapper().readValue(flattened.toString(), List.class); + } catch (Exception e) { + e.printStackTrace(); } return properties; } @@ -194,9 +268,46 @@ private static Object getValue(JsonNode node) { private static JsonNode createJsonNode(DagNode node, String type) throws IOException { JsonNode configNode = MAPPER.readTree(node.getConfig().toString()); JsonNode jsonNode = MAPPER.createObjectNode(); - ((ObjectNode) jsonNode).put(DeltaConfig.Config.CONFIG_NAME, configNode); + Iterator> itr = configNode.fields(); + while (itr.hasNext()) { + Entry entry = itr.next(); + switch (entry.getKey()) { + case HIVE_QUERIES: + ((ObjectNode) configNode).put(HIVE_QUERIES, + MAPPER.readTree(getQueryMapper().writeValueAsString(node.getConfig().getHiveQueries()))); + break; + case HIVE_PROPERTIES: + ((ObjectNode) configNode).put(HIVE_PROPERTIES, + MAPPER.readTree(getQueryEnginePropertyMapper().writeValueAsString(node.getConfig().getHiveProperties()))); + break; + case PRESTO_QUERIES: + ((ObjectNode) configNode).put(PRESTO_QUERIES, + MAPPER.readTree(getQueryMapper().writeValueAsString(node.getConfig().getPrestoQueries()))); + break; + case PRESTO_PROPERTIES: + ((ObjectNode) configNode).put(PRESTO_PROPERTIES, + MAPPER.readTree(getQueryEnginePropertyMapper().writeValueAsString(node.getConfig().getPrestoProperties()))); + break; + case TRINO_QUERIES: + ((ObjectNode) configNode).put(TRINO_QUERIES, + MAPPER.readTree(getQueryMapper().writeValueAsString(node.getConfig().getTrinoQueries()))); + break; + case TRINO_PROPERTIES: + ((ObjectNode) configNode).put(TRINO_PROPERTIES, + MAPPER.readTree(getQueryEnginePropertyMapper().writeValueAsString(node.getConfig().getTrinoProperties()))); + break; + default: + break; + } + } + ((ObjectNode) jsonNode).put(CONFIG_NAME, configNode); ((ObjectNode) jsonNode).put(DeltaConfig.Config.TYPE, type); - ((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, 
getDependencyNames(node)); + String dependencyNames = getDependencyNames(node); + if (StringUtils.isNullOrEmpty(dependencyNames) || "\"\"".equals(dependencyNames)) { + // Set "none" if there is no dependency + dependencyNames = NO_DEPENDENCY_VALUE; + } + ((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, dependencyNames); return jsonNode; } @@ -216,4 +327,103 @@ public static String toString(InputStream inputStream) throws IOException { return result.toString("utf-8"); } + private static ObjectMapper getQueryMapper() { + SimpleModule module = new SimpleModule(); + ObjectMapper queryMapper = new ObjectMapper(); + module.addSerializer(List.class, new QuerySerializer()); + module.addDeserializer(List.class, new QueryDeserializer()); + queryMapper.registerModule(module); + return queryMapper; + } + + private static final class QuerySerializer extends JsonSerializer { + Integer index = 0; + + @Override + public void serialize(List pairs, JsonGenerator gen, SerializerProvider serializers) throws IOException { + gen.writeStartObject(); + for (Pair pair : (List) pairs) { + gen.writeStringField("query" + index, pair.getLeft().toString()); + gen.writeNumberField("result" + index, Integer.parseInt(pair.getRight().toString())); + index++; + } + gen.writeEndObject(); + } + } + + private static final class QueryDeserializer extends JsonDeserializer { + @Override + public List deserialize(JsonParser parser, DeserializationContext context) throws IOException { + List> pairs = new ArrayList<>(); + String query = ""; + Integer result = 0; + // [{query0:, result0:,query1:, result1:}] + while (!parser.isClosed()) { + JsonToken jsonToken = parser.nextToken(); + if (jsonToken.equals(JsonToken.END_ARRAY)) { + break; + } + if (JsonToken.FIELD_NAME.equals(jsonToken)) { + String fieldName = parser.getCurrentName(); + parser.nextToken(); + + if (fieldName.contains("query")) { + query = parser.getValueAsString(); + } else if (fieldName.contains("result")) { + result = parser.getValueAsInt(); + pairs.add(Pair.of(query, result)); + } + } + } + return pairs; + } + } + + private static ObjectMapper getQueryEnginePropertyMapper() { + SimpleModule module = new SimpleModule(); + ObjectMapper propMapper = new ObjectMapper(); + module.addSerializer(List.class, new QueryEnginePropertySerializer()); + module.addDeserializer(List.class, new QueryEnginePropertyDeserializer()); + propMapper.registerModule(module); + return propMapper; + } + + private static final class QueryEnginePropertySerializer extends JsonSerializer { + Integer index = 0; + + @Override + public void serialize(List props, JsonGenerator gen, SerializerProvider serializers) throws IOException { + gen.writeStartObject(); + for (String prop : (List) props) { + gen.writeStringField("prop" + index, prop); + index++; + } + gen.writeEndObject(); + } + } + + private static final class QueryEnginePropertyDeserializer extends JsonDeserializer { + @Override + public List deserialize(JsonParser parser, DeserializationContext context) throws IOException { + List props = new ArrayList<>(); + String prop = ""; + // [{prop0:,...}] + while (!parser.isClosed()) { + JsonToken jsonToken = parser.nextToken(); + if (jsonToken.equals(JsonToken.END_ARRAY)) { + break; + } + if (JsonToken.FIELD_NAME.equals(jsonToken)) { + String fieldName = parser.getCurrentName(); + parser.nextToken(); + + if (parser.getCurrentName().contains("prop")) { + prop = parser.getValueAsString(); + props.add(prop); + } + } + } + return props; + } + } } diff --git 
a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/ExecutionContext.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/ExecutionContext.java index 17148f538ccc8..e4cf84a1fb0a3 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/ExecutionContext.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/ExecutionContext.java @@ -26,7 +26,7 @@ /** * This wraps the context needed for an execution of - * a {@link DagNode#execute(ExecutionContext)}. + * a {@link DagNode#execute(ExecutionContext, int)}. */ public class ExecutionContext implements Serializable { diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/SimpleWorkflowDagGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/SimpleWorkflowDagGenerator.java index ad6e9cb0cb1b9..1fe2294423121 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/SimpleWorkflowDagGenerator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/SimpleWorkflowDagGenerator.java @@ -18,8 +18,6 @@ package org.apache.hudi.integ.testsuite.dag; -import java.util.ArrayList; -import java.util.List; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; @@ -27,9 +25,11 @@ import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode; +import java.util.ArrayList; +import java.util.List; + /** - * An example of how to generate a workflow dag programmatically. This is also used as the default workflow dag if - * none is provided. + * An example of how to generate a workflow dag programmatically. This is also used as the default workflow dag if none is provided. */ public class SimpleWorkflowDagGenerator implements WorkflowDagGenerator { diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDag.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDag.java index e9171fc4774d0..f622bb7a7e448 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDag.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WorkflowDag.java @@ -18,20 +18,47 @@ package org.apache.hudi.integ.testsuite.dag; -import java.util.List; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import java.util.List; + +import static org.apache.hudi.integ.testsuite.dag.DagUtils.DEFAULT_DAG_NAME; +import static org.apache.hudi.integ.testsuite.dag.DagUtils.DEFAULT_DAG_ROUNDS; +import static org.apache.hudi.integ.testsuite.dag.DagUtils.DEFAULT_INTERMITTENT_DELAY_MINS; + /** * Workflow dag that encapsulates all execute nodes. 
*/ public class WorkflowDag { + private String dagName; + private int rounds; + private int intermittentDelayMins; private List> nodeList; public WorkflowDag(List> nodeList) { + this(DEFAULT_DAG_NAME, DEFAULT_DAG_ROUNDS, DEFAULT_INTERMITTENT_DELAY_MINS, nodeList); + } + + public WorkflowDag(String dagName, int rounds, int intermittentDelayMins, List> nodeList) { + this.dagName = dagName; + this.rounds = rounds; + this.intermittentDelayMins = intermittentDelayMins; this.nodeList = nodeList; } + public String getDagName() { + return dagName; + } + + public int getRounds() { + return rounds; + } + + public int getIntermittentDelayMins() { + return intermittentDelayMins; + } + public List> getNodeList() { return nodeList; } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WriterContext.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WriterContext.java index e457f0a8daca7..83b5751c8646b 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WriterContext.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/WriterContext.java @@ -21,25 +21,29 @@ import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.integ.testsuite.HoodieContinousTestSuiteWriter; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; +import org.apache.hudi.integ.testsuite.HoodieInlineTestSuiteWriter; import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; +import org.apache.hudi.integ.testsuite.generator.DeltaGenerator; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; import org.apache.hudi.keygen.BuiltinKeyGenerator; -import org.apache.hudi.integ.testsuite.generator.DeltaGenerator; -import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.schema.SchemaProvider; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; /** - * WriterContext wraps the delta writer/data generator related configuration needed - * to init/reinit. + * WriterContext wraps the delta writer/data generator related configuration needed to init/reinit. 
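With the name, round count, and inter-round delay now carried on WorkflowDag itself and understood by DagUtils, a dag can be built and round-tripped through YAML as sketched below (raw types and the empty node config are for brevity; Config.newBuilder()/build() are assumed from the existing Builder):

    import java.util.Arrays;

    import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
    import org.apache.hudi.integ.testsuite.dag.DagUtils;
    import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
    import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
    import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;

    public class DagRoundTripSketch {
      public static void main(String[] args) throws Exception {
        DagNode insert = new InsertNode(DeltaConfig.Config.newBuilder().build()); // empty config for brevity
        WorkflowDag dag = new WorkflowDag("cow-long-running-dag", 5, 30, Arrays.asList(insert));

        // Serializes with the new top-level keys: dag_name, dag_rounds,
        // dag_intermittent_delay_mins and dag_content (the node graph itself).
        String yaml = DagUtils.convertDagToYaml(dag);

        WorkflowDag parsed = DagUtils.convertYamlToDag(yaml);
        System.out.println(parsed.getDagName() + " runs " + parsed.getRounds()
            + " rounds, " + parsed.getIntermittentDelayMins() + " mins apart");
      }
    }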
*/ public class WriterContext { @@ -53,8 +57,10 @@ public class WriterContext { private BuiltinKeyGenerator keyGenerator; private transient SparkSession sparkSession; private transient JavaSparkContext jsc; + private ExecutorService executorService; + public WriterContext(JavaSparkContext jsc, TypedProperties props, HoodieTestSuiteConfig cfg, - BuiltinKeyGenerator keyGenerator, SparkSession sparkSession) { + BuiltinKeyGenerator keyGenerator, SparkSession sparkSession) { this.cfg = cfg; this.props = props; this.keyGenerator = keyGenerator; @@ -66,14 +72,19 @@ public void initContext(JavaSparkContext jsc) throws HoodieException { try { this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jsc); String schemaStr = schemaProvider.getSourceSchema().toString(); - this.hoodieTestSuiteWriter = new HoodieTestSuiteWriter(jsc, props, cfg, schemaStr); + this.hoodieTestSuiteWriter = (cfg.testContinousMode && cfg.useDeltaStreamer) ? new HoodieContinousTestSuiteWriter(jsc, props, cfg, schemaStr) + : new HoodieInlineTestSuiteWriter(jsc, props, cfg, schemaStr); int inputParallelism = cfg.inputParallelism > 0 ? cfg.inputParallelism : jsc.defaultParallelism(); this.deltaGenerator = new DeltaGenerator( new DFSDeltaConfig(DeltaOutputMode.valueOf(cfg.outputTypeName), DeltaInputType.valueOf(cfg.inputFormatName), new SerializableConfiguration(jsc.hadoopConfiguration()), cfg.inputBasePath, cfg.targetBasePath, - schemaStr, cfg.limitFileSize, inputParallelism, cfg.deleteOldInput), + schemaStr, cfg.limitFileSize, inputParallelism, cfg.deleteOldInput, cfg.useHudiToGenerateUpdates), jsc, sparkSession, schemaStr, keyGenerator); log.info(String.format("Initialized writerContext with: %s", schemaStr)); + if (cfg.testContinousMode) { + executorService = Executors.newFixedThreadPool(1); + executorService.execute(new TestSuiteWriterRunnable(hoodieTestSuiteWriter)); + } } catch (Exception e) { throw new HoodieException("Failed to reinitialize writerContext", e); } @@ -108,4 +119,39 @@ public TypedProperties getProps() { public String toString() { return this.hoodieTestSuiteWriter.toString() + "\n" + this.deltaGenerator.toString() + "\n"; } + + public SparkSession getSparkSession() { + return sparkSession; + } + + public void shutdownResources() { + this.hoodieTestSuiteWriter.shutdownResources(); + if (executorService != null) { + executorService.shutdownNow(); + } + } + + /** + * TestSuiteWriterRunnable to spin up a thread to execute deltastreamer with async table services. 
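In continuous mode the deltastreamer sync has to keep running for the whole test while the dag drives ingestion and validation on the main thread; the wiring above reduces to the following skeleton (class and method names here are illustrative, only the single-thread executor pattern is taken from the patch):

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    public class BackgroundSyncSkeleton {

      private ExecutorService executorService;

      // Start the long-running sync on its own single-thread executor while the dag keeps
      // executing on the caller's thread (the real runnable sleeps briefly and then calls
      // hoodieTestSuiteWriter.getDeltaStreamerWrapper().sync()).
      public void start(Runnable continuousSync) {
        executorService = Executors.newFixedThreadPool(1);
        executorService.execute(continuousSync);
      }

      // Mirrors WriterContext#shutdownResources above: interrupt the sync thread when the test ends.
      public void stop() {
        if (executorService != null) {
          executorService.shutdownNow();
        }
      }
    }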
+ */ + class TestSuiteWriterRunnable implements Runnable { + private HoodieTestSuiteWriter hoodieTestSuiteWriter; + + TestSuiteWriterRunnable(HoodieTestSuiteWriter hoodieTestSuiteWriter) { + this.hoodieTestSuiteWriter = hoodieTestSuiteWriter; + } + + @Override + public void run() { + try { + Thread.sleep(20000); + log.info("Starting continuous sync with deltastreamer "); + hoodieTestSuiteWriter.getDeltaStreamerWrapper().sync(); + log.info("Completed continuous sync with deltastreamer "); + } catch (Exception e) { + log.error("Deltastreamer failed in continuous mode " + e.getMessage()); + throw new HoodieException("Shutting down deltastreamer in continuous mode failed ", e); + } + } + } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseQueryNode.java new file mode 100644 index 0000000000000..3870dce9dd9cd --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseQueryNode.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.common.util.collection.Pair; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.List; + +public abstract class BaseQueryNode extends DagNode { + + public void setSessionProperties(List properties, Statement stmt) throws SQLException { + for (String prop : properties) { + executeStatement(prop, stmt); + } + } + + public void executeAndValidateQueries(List> queriesWithResult, Statement stmt) throws SQLException { + for (Pair queryAndResult : queriesWithResult) { + log.info("Running {}", queryAndResult.getLeft()); + ResultSet res = stmt.executeQuery(queryAndResult.getLeft()); + if (!res.next()) { + log.info("res.next() was False - typically this means the query returned no rows."); + assert 0 == queryAndResult.getRight(); + } else { + Integer result = res.getInt(1); + if (!queryAndResult.getRight().equals(result)) { + throw new AssertionError( + "QUERY: " + queryAndResult.getLeft() + + " | EXPECTED RESULT = " + queryAndResult.getRight() + + " | ACTUAL RESULT = " + result + ); + } + } + log.info("Successfully validated query!"); + } + } + + private void executeStatement(String query, Statement stmt) throws SQLException { + log.info("Executing statement {}", stmt.toString()); + stmt.execute(query); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java new file mode 100644 index 0000000000000..5f659c879850d --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
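BaseQueryNode above pulls the JDBC session setup and the (query, expected count) assertions out of HiveQueryNode so that Presto and Trino query nodes can share them. A hypothetical JdbcQueryNode sketch showing the intended call order follows; the class itself, its jdbcUrl parameter, and the reuse of the Hive config getters are illustrative only:

    // Hypothetical subclass, assumed to live in the same package as BaseQueryNode.
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.Statement;

    import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
    import org.apache.hudi.integ.testsuite.dag.ExecutionContext;

    public class JdbcQueryNode extends BaseQueryNode {

      private final String jdbcUrl; // assumption: handed in by the caller rather than read from config

      public JdbcQueryNode(DeltaConfig.Config config, String jdbcUrl) {
        this.config = config;
        this.jdbcUrl = jdbcUrl;
      }

      @Override
      public void execute(ExecutionContext context, int curItrCount) throws Exception {
        try (Connection con = DriverManager.getConnection(jdbcUrl);
             Statement stmt = con.createStatement()) {
          // Session settings first, then every (query, expected count) pair is executed and asserted.
          setSessionProperties(this.config.getHiveProperties(), stmt);
          executeAndValidateQueries(this.config.getHiveQueries(), stmt);
        }
      }
    }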
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; +import org.apache.hudi.integ.testsuite.schema.SchemaUtils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.ReduceFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$; +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder; +import org.apache.spark.sql.catalyst.encoders.RowEncoder; +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +import scala.Tuple2; +import scala.collection.JavaConversions; +import scala.collection.JavaConverters; + +import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY; + +/** + * This nodes validates contents from input path are in tact with Hudi. By default no configs are required for this node. But there is an + * optional config "delete_input_data" that you can set for this node. If set, once validation completes, contents from inputPath are deleted. This will come in handy for long running test suites. + * README has more details under docker set up for usages of this node. + */ +public abstract class BaseValidateDatasetNode extends DagNode { + + public BaseValidateDatasetNode(DeltaConfig.Config config) { + this.config = config; + } + + /** + * @return {@link Logger} instance to use. + */ + public abstract Logger getLogger(); + + /** + * @param session {@link SparkSession} instance to use. + * @param context {@link ExecutionContext} instance to use. + * @param inputSchema input schema in {@link StructType} + * @return data in {@link Dataset} to validate. 
+ */ + public abstract Dataset getDatasetToValidate(SparkSession session, ExecutionContext context, + StructType inputSchema); + + @Override + public void execute(ExecutionContext context, int curItrCount) throws Exception { + int validateOnceEveryItr = config.validateOnceEveryIteration(); + int itrCountToExecute = config.getIterationCountToExecute(); + if ((itrCountToExecute != -1 && itrCountToExecute == curItrCount) + || (itrCountToExecute == -1 && ((curItrCount % validateOnceEveryItr) == 0))) { + FileSystem fs = new Path(context.getHoodieTestSuiteWriter().getCfg().inputBasePath) + .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); + if (context.getHoodieTestSuiteWriter().getCfg().testContinousMode) { + awaitUntilDeltaStreamerCaughtUp(context, context.getHoodieTestSuiteWriter().getCfg().targetBasePath, fs, + context.getHoodieTestSuiteWriter().getCfg().inputBasePath); + } + SparkSession session = SparkSession.builder().sparkContext(context.getJsc().sc()).getOrCreate(); + // todo: Fix partitioning schemes. For now, assumes data based partitioning. + String inputPath = context.getHoodieTestSuiteWriter().getCfg().inputBasePath + "/*/*"; + log.info("Validation using data from input path " + inputPath); + // listing batches to be validated + String inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; + if (log.isDebugEnabled()) { + FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); + log.info("fileStatuses length: " + fileStatuses.length); + for (FileStatus fileStatus : fileStatuses) { + log.debug("Listing all Micro batches to be validated :: " + fileStatus.getPath().toString()); + } + } + + Dataset inputSnapshotDf = getInputDf(context, session, inputPath); + + // read from hudi and remove meta columns. + Dataset trimmedHudiDf = getDatasetToValidate(session, context, inputSnapshotDf.schema()); + if (config.isValidateFullData()) { + log.debug("Validating full dataset"); + Dataset exceptInputDf = inputSnapshotDf.except(trimmedHudiDf); + Dataset exceptHudiDf = trimmedHudiDf.except(inputSnapshotDf); + long exceptInputCount = exceptInputDf.count(); + long exceptHudiCount = exceptHudiDf.count(); + log.debug("Except input df count " + exceptInputDf + ", except hudi count " + exceptHudiCount); + if (exceptInputCount != 0 || exceptHudiCount != 0) { + log.error("Data set validation failed. Total count in hudi " + trimmedHudiDf.count() + ", input df count " + inputSnapshotDf.count() + + ". InputDf except hudi df = " + exceptInputCount + ", Hudi df except Input df " + exceptHudiCount); + throw new AssertionError("Hudi contents does not match contents input data. "); + } + } else { + Dataset intersectionDf = inputSnapshotDf.intersect(trimmedHudiDf); + long inputCount = inputSnapshotDf.count(); + long outputCount = trimmedHudiDf.count(); + log.debug("Input count: " + inputCount + "; output count: " + outputCount); + // the intersected df should be same as inputDf. if not, there is some mismatch. + if (outputCount == 0 || inputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) { + log.error("Data set validation failed. Total count in hudi " + outputCount + ", input df count " + inputCount); + throw new AssertionError("Hudi contents does not match contents input data. 
"); + } + + if (config.isValidateHive()) { + String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key()); + String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key()); + log.warn("Validating hive table with db : " + database + " and table : " + tableName); + session.sql("REFRESH TABLE " + database + "." + tableName); + Dataset cowDf = session.sql("SELECT _row_key, rider, driver, begin_lat, begin_lon, end_lat, end_lon, fare, _hoodie_is_deleted, " + + "test_suite_source_ordering_field FROM " + database + "." + tableName); + Dataset reorderedInputDf = inputSnapshotDf.select("_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", + "_hoodie_is_deleted", "test_suite_source_ordering_field"); + + Dataset intersectedHiveDf = reorderedInputDf.intersect(cowDf); + outputCount = trimmedHudiDf.count(); + log.warn("Input count: " + inputCount + "; output count: " + outputCount); + // the intersected df should be same as inputDf. if not, there is some mismatch. + if (outputCount == 0 || reorderedInputDf.except(intersectedHiveDf).count() != 0) { + log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount); + throw new AssertionError("Hudi hive table contents does not match contents input data. "); + } + } + + // if delete input data is enabled, erase input data. + if (config.isDeleteInputData()) { + // clean up input data for current group of writes. + inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; + FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); + for (FileStatus fileStatus : fileStatuses) { + log.debug("Micro batch to be deleted " + fileStatus.getPath().toString()); + fs.delete(fileStatus.getPath(), true); + } + } + } + } + } + + private void awaitUntilDeltaStreamerCaughtUp(ExecutionContext context, String hudiTablePath, FileSystem fs, String inputPath) throws IOException, InterruptedException { + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(new Configuration(fs.getConf())).setBasePath(hudiTablePath).build(); + HoodieTimeline commitTimeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + Option latestCheckpoint = getLatestCheckpoint(commitTimeline); + FileStatus[] subDirs = fs.listStatus(new Path(inputPath)); + List subDirList = Arrays.asList(subDirs); + subDirList.sort(Comparator.comparingLong(entry -> Long.parseLong(entry.getPath().getName()))); + String latestSubDir = subDirList.get(subDirList.size() - 1).getPath().getName(); + log.info("Latest sub directory in input path " + latestSubDir + ", latest checkpoint from deltastreamer " + + (latestCheckpoint.isPresent() ? latestCheckpoint.get() : "none")); + long maxWaitTime = config.maxWaitTimeForDeltastreamerToCatchupMs(); + long waitedSoFar = 0; + while (!(latestCheckpoint.isPresent() && latestCheckpoint.get().equals(latestSubDir))) { + log.warn("Sleeping for 20 secs awaiting for deltastreamer to catch up with ingested data"); + Thread.sleep(20000); + meta.reloadActiveTimeline(); + commitTimeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + latestCheckpoint = getLatestCheckpoint(commitTimeline); + waitedSoFar += 20000; + if (waitedSoFar >= maxWaitTime) { + throw new AssertionError("DeltaStreamer has not caught up after 5 mins of wait time. Last known checkpoint " + + (latestCheckpoint.isPresent() ? 
latestCheckpoint.get() : "none") + ", expected checkpoint to have caugth up " + latestSubDir); + } + log.info("Latest sub directory in input path " + latestSubDir + ", latest checkpoint from deltastreamer " + + (latestCheckpoint.isPresent() ? latestCheckpoint.get() : "none")); + } + } + + private Option getLatestCheckpoint(HoodieTimeline timeline) { + return (Option) timeline.getReverseOrderedInstants().map(instant -> { + try { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata + .fromBytes(timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); + if (!StringUtils.isNullOrEmpty(commitMetadata.getMetadata(CHECKPOINT_KEY))) { + return Option.of(commitMetadata.getMetadata(CHECKPOINT_KEY)); + } else { + return Option.empty(); + } + } catch (IOException e) { + throw new HoodieIOException("Failed to parse HoodieCommitMetadata for " + instant.toString(), e); + } + }).filter(Option::isPresent).findFirst().orElse(Option.empty()); + } + + private Dataset getInputDf(ExecutionContext context, SparkSession session, String inputPath) { + String recordKeyField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.RECORDKEY_FIELD().key()); + String partitionPathField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.PARTITIONPATH_FIELD().key()); + // todo: fix hard coded fields from configs. + // read input and resolve insert, updates, etc. + Dataset inputDf = session.read().format("avro").load(inputPath); + Dataset trimmedDf = inputDf; + if (!config.inputPartitonsToSkipWithValidate().isEmpty()) { + trimmedDf = inputDf.filter("instr(" + partitionPathField + ", \'" + config.inputPartitonsToSkipWithValidate() + "\') != 1"); + } + + ExpressionEncoder encoder = getEncoder(inputDf.schema()); + return trimmedDf.groupByKey( + (MapFunction) value -> + (partitionPathField.isEmpty() ? 
value.getAs(recordKeyField) : (value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField))), Encoders.STRING()) + .reduceGroups((ReduceFunction) (v1, v2) -> { + int ts1 = v1.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); + int ts2 = v2.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); + if (ts1 > ts2) { + return v1; + } else { + return v2; + } + }) + .map((MapFunction, Row>) value -> value._2, encoder) + .filter("_hoodie_is_deleted != true"); + } + + private ExpressionEncoder getEncoder(StructType schema) { + List attributes = JavaConversions.asJavaCollection(schema.toAttributes()).stream() + .map(Attribute::toAttribute).collect(Collectors.toList()); + return RowEncoder.apply(schema) + .resolveAndBind(JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(), + SimpleAnalyzer$.MODULE$); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CleanNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CleanNode.java index 83a8d5e103b93..0f449a832c735 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CleanNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CleanNode.java @@ -32,7 +32,7 @@ public CleanNode(Config config) { } @Override - public void execute(ExecutionContext executionContext) throws Exception { + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { log.info("Executing clean node {}", this.getName()); executionContext.getHoodieTestSuiteWriter().getWriteClient(this).clean(); } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ClusteringNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ClusteringNode.java new file mode 100644 index 0000000000000..9ee5ca270de20 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ClusteringNode.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +/** + * Triggers inline clustering. Works only with writeClient. Also, add this as last node and end with validation if possible. As of now, after clustering, further inserts/upserts may not work since we + * call deltaStreamer. 
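The getInputDf step above rebuilds the expected table state from the raw avro input batches: group by key, keep the record with the highest source ordering value, and drop deletes. Stripped of the Hudi key/partition specifics, the core Spark pattern looks like the sketch below (class, method, and parameter names are illustrative; the production code additionally resolves and binds the row encoder):

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.api.java.function.ReduceFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.catalyst.encoders.RowEncoder;

    import scala.Tuple2;

    public class InputSnapshotSketch {

      // Collapse all ingested batches to the latest version of each key (highest ordering value)
      // and drop records flagged as deleted, like getInputDf does above.
      static Dataset<Row> latestSnapshot(Dataset<Row> input, String keyField, String orderingField) {
        return input
            .groupByKey((MapFunction<Row, String>) row -> row.getAs(keyField).toString(), Encoders.STRING())
            .reduceGroups((ReduceFunction<Row>) (a, b) ->
                a.<Integer>getAs(orderingField) >= b.<Integer>getAs(orderingField) ? a : b)
            .map((MapFunction<Tuple2<String, Row>, Row>) pair -> pair._2, RowEncoder.apply(input.schema()))
            .filter("_hoodie_is_deleted != true");
      }
    }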
+ */ +public class ClusteringNode extends DagNode> { + + public ClusteringNode(Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { + if (config.getIterationCountToExecute() == curItrCount) { + try { + log.warn("Executing ClusteringNode node {}", this.getName()); + executionContext.getHoodieTestSuiteWriter().inlineClustering(); + } catch (Exception e) { + log.warn("Exception thrown in ClusteringNode Node :: " + e.getCause() + ", msg :: " + e.getMessage()); + throw e; + } + } + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java index 4c3ad61559530..dd7d880f6aef6 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/CompactNode.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + import org.apache.spark.api.java.JavaRDD; /** @@ -40,18 +41,20 @@ public CompactNode(Config config) { * if it has one. * * @param executionContext Execution context to run this compaction - * @throws Exception will be thrown if any error occurred. + * @param curItrCount cur interation count. + * @throws Exception will be thrown if any error occurred. */ @Override - public void execute(ExecutionContext executionContext) throws Exception { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(), - executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath); + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(executionContext.getHoodieTestSuiteWriter().getConfiguration()).setBasePath(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath) + .build(); Option lastInstant = metaClient.getActiveTimeline() - .getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().lastInstant(); + .getWriteTimeline().filterPendingCompactionTimeline().lastInstant(); if (lastInstant.isPresent()) { log.info("Compacting instant {}", lastInstant.get()); this.result = executionContext.getHoodieTestSuiteWriter().compact(Option.of(lastInstant.get().getTimestamp())); + executionContext.getHoodieTestSuiteWriter().commitCompaction(result, executionContext.getJsc().emptyRDD(), Option.of(lastInstant.get().getTimestamp())); } } - } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DagNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DagNode.java index df54b4c811989..b5b8f807038bd 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DagNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DagNode.java @@ -41,6 +41,17 @@ public abstract class DagNode implements Comparable> { protected Config config; private boolean isCompleted; + public DagNode clone() { + List> tempChildNodes = new ArrayList<>(); + for (DagNode dagNode: childNodes) { + tempChildNodes.add(dagNode.clone()); + } + this.childNodes = tempChildNodes; + this.result = null; + this.isCompleted = false; + return this; + } + public 
DagNode addChildNode(DagNode childNode) { childNode.getParentNodes().add(this); getChildNodes().add(childNode); @@ -80,9 +91,10 @@ public void setParentNodes(List> parentNodes) { * Execute the {@link DagNode}. * * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. * @throws Exception Thrown if the execution failed. */ - public abstract void execute(ExecutionContext context) throws Exception; + public abstract void execute(ExecutionContext context, int curItrCount) throws Exception; public boolean isCompleted() { return isCompleted; diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DelayNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DelayNode.java new file mode 100644 index 0000000000000..369501c250edf --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DelayNode.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Delay Node to add delays between each group of test runs. + */ +public class DelayNode extends DagNode { + + private static Logger log = LoggerFactory.getLogger(ValidateDatasetNode.class); + private int delayMins; + + public DelayNode(int delayMins) { + this.delayMins = delayMins; + } + + @Override + public void execute(ExecutionContext context, int curItrCount) throws Exception { + log.warn("Waiting for " + delayMins + " mins before going for next test run"); + Thread.sleep(delayMins * 60 * 1000); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteInputDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteInputDatasetNode.java new file mode 100644 index 0000000000000..2836f240ead3c --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteInputDatasetNode.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
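The DagNode changes above add clone() and thread the current iteration count through execute(), so nodes can adapt their behaviour per round; writing a node against the new contract is a single-method exercise, sketched with a hypothetical PrintProgressNode:

    // Hypothetical node, assumed to sit alongside the other nodes in this package.
    import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
    import org.apache.hudi.integ.testsuite.dag.ExecutionContext;

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class PrintProgressNode extends DagNode<Boolean> {

      private static final Logger LOG = LoggerFactory.getLogger(PrintProgressNode.class);

      public PrintProgressNode(Config config) {
        this.config = config;
      }

      @Override
      public void execute(ExecutionContext context, int curItrCount) throws Exception {
        // Every node now receives the current dag round, so per-iteration behaviour
        // (run once, run every Nth round, skip early rounds, ...) needs no external state.
        LOG.info("Finished iteration {} for table {}", curItrCount,
            context.getHoodieTestSuiteWriter().getCfg().targetBasePath);
      }
    }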
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * Deletes all input except latest batch. Mostly used in insert_overwrite operations. + */ +public class DeleteInputDatasetNode extends DagNode { + + public DeleteInputDatasetNode(DeltaConfig.Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext context, int curItrCount) throws Exception { + + String latestBatch = String.valueOf(context.getWriterContext().getDeltaGenerator().getBatchId()); + + if (config.isDeleteInputDataExceptLatest()) { + String inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; + FileSystem fs = new Path(inputPathStr) + .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); + FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); + for (FileStatus fileStatus : fileStatuses) { + if (!fileStatus.getPath().getName().equals(latestBatch)) { + log.debug("Micro batch to be deleted " + fileStatus.getPath().toString()); + fs.delete(fileStatus.getPath(), true); + } + } + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteNode.java index b538b01d17496..8eaea65413709 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteNode.java @@ -38,7 +38,7 @@ public DeleteNode(Config config) { @Override protected void generate(DeltaGenerator deltaGenerator) throws Exception { if (!config.isDisableGenerate()) { - deltaGenerator.writeRecords(deltaGenerator.generateDeletes(config)).count(); + deltaGenerator.writeRecords(deltaGenerator.generateDeletes(config)).getValue().count(); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java index f36b7d4511d68..0b9149f928f95 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveQueryNode.java @@ -18,22 +18,26 @@ package org.apache.hudi.integ.testsuite.dag.nodes; -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import org.apache.hudi.DataSourceUtils; -import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.exception.HoodieValidationException; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import 
org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider; +import org.apache.hudi.sync.common.HoodieSyncConfig; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.Statement; + +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER; /** * A hive query node in the DAG of operations for a workflow. used to perform a hive query with given config. */ -public class HiveQueryNode extends DagNode { +public class HiveQueryNode extends BaseQueryNode { private HiveServiceProvider hiveServiceProvider; @@ -43,48 +47,28 @@ public HiveQueryNode(DeltaConfig.Config config) { } @Override - public void execute(ExecutionContext executionContext) throws Exception { + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { log.info("Executing hive query node {}", this.getName()); this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration()); - HiveSyncConfig hiveSyncConfig = DataSourceUtils - .buildHiveSyncConfig(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() - .getDeltaSyncService().getDeltaSync().getProps(), - executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() - .getDeltaSyncService().getDeltaSync().getCfg().targetBasePath, - executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() - .getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat); + TypedProperties properties = new TypedProperties(); + properties.putAll(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() + .getDeltaSyncService().getDeltaSync().getProps()); + properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() + .getDeltaSyncService().getDeltaSync().getCfg().targetBasePath); + properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper() + .getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat); + HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(properties); this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter()); - Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser, - hiveSyncConfig.hivePass); - Statement stmt = con.createStatement(); - stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); - for (String hiveProperty : this.config.getHiveProperties()) { - executeStatement(hiveProperty, stmt); - } - for (Pair queryAndResult : this.config.getHiveQueries()) { - log.info("Running {}", queryAndResult.getLeft()); - ResultSet res = stmt.executeQuery(queryAndResult.getLeft()); - if (!res.next()) { - log.info("res.next() was False - typically this means the query returned no rows."); - assert 0 == queryAndResult.getRight(); - } else { - Integer result = res.getInt(1); - if (!queryAndResult.getRight().equals(result)) { - throw new AssertionError( - "QUERY: " + queryAndResult.getLeft() - + " | EXPECTED RESULT = " + queryAndResult.getRight() - + " | ACTUAL RESULT = " + result - ); - } - } - log.info("Successfully validated query!"); + try (Connection con = DriverManager.getConnection(hiveSyncConfig.getString(HIVE_URL), + hiveSyncConfig.getString(HIVE_USER), hiveSyncConfig.getString(HIVE_PASS))) { + Statement stmt = con.createStatement(); + 
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); + setSessionProperties(this.config.getHiveProperties(), stmt); + executeAndValidateQueries(this.config.getHiveQueries(), stmt); + stmt.close(); + this.hiveServiceProvider.stopLocalHiveServiceIfNeeded(); + } catch (Exception e) { + throw new HoodieValidationException("Hive query validation failed due to " + e.getMessage(), e); } - this.hiveServiceProvider.stopLocalHiveServiceIfNeeded(); } - - private void executeStatement(String query, Statement stmt) throws SQLException { - log.info("Executing statement {}", stmt.toString()); - stmt.execute(query); - } - } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveSyncNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveSyncNode.java index a2b4ee5eea359..7415880a83616 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveSyncNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/HiveSyncNode.java @@ -18,31 +18,29 @@ package org.apache.hudi.integ.testsuite.dag.nodes; -import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; +import org.apache.hudi.sync.common.util.SyncUtilHelpers; + +import org.apache.hadoop.fs.Path; /** * Represents a hive sync node in the DAG of operations for a workflow. Helps to sync hoodie data to hive table. */ public class HiveSyncNode extends DagNode { - private HiveServiceProvider hiveServiceProvider; - public HiveSyncNode(Config config) { this.config = config; - this.hiveServiceProvider = new HiveServiceProvider(config); } @Override - public void execute(ExecutionContext executionContext) throws Exception { + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { log.info("Executing hive sync node"); - this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration()); - this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter()); - this.hiveServiceProvider.stopLocalHiveServiceIfNeeded(); - } - - public HiveServiceProvider getHiveServiceProvider() { - return hiveServiceProvider; + SyncUtilHelpers.runHoodieMetaSync(HiveSyncTool.class.getName(), new TypedProperties(executionContext.getHoodieTestSuiteWriter().getProps()), + executionContext.getHoodieTestSuiteWriter().getConfiguration(), + new Path(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath).getFileSystem(executionContext.getHoodieTestSuiteWriter().getConfiguration()), + executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath, executionContext.getHoodieTestSuiteWriter().getCfg().baseFileFormat); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java index 1571349f834ba..33cce79e0d1bf 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertNode.java @@ -39,7 +39,7 @@ public InsertNode(Config config) { } @Override - public void execute(ExecutionContext executionContext) throws Exception { + public 
void execute(ExecutionContext executionContext, int curItrCount) throws Exception { // if the insert node has schema override set, reinitialize the table with new schema. if (this.config.getReinitContext()) { log.info(String.format("Reinitializing table with %s", this.config.getOtherConfigs().toString())); @@ -59,7 +59,8 @@ public void execute(ExecutionContext executionContext) throws Exception { protected void generate(DeltaGenerator deltaGenerator) throws Exception { if (!config.isDisableGenerate()) { log.info("Generating input data for node {}", this.getName()); - this.deltaWriteStatsRDD = deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)); + this.deltaWriteStatsRDD = deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)).getValue(); + this.deltaWriteStatsRDD.cache(); this.deltaWriteStatsRDD.count(); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertOverwriteNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertOverwriteNode.java new file mode 100644 index 0000000000000..bcd01ff77d1ad --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertOverwriteNode.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.spark.api.java.JavaRDD; + +public class InsertOverwriteNode extends InsertNode { + + public InsertOverwriteNode(Config config) { + super(config); + } + + @Override + protected JavaRDD ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, + Option commitTime) + throws Exception { + log.info("Execute insert overwrite node {}", this.getName()); + return hoodieTestSuiteWriter.insertOverwrite(commitTime); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertOverwriteTableNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertOverwriteTableNode.java new file mode 100644 index 0000000000000..508b5b4e4f1aa --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/InsertOverwriteTableNode.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.spark.api.java.JavaRDD; + +public class InsertOverwriteTableNode extends InsertNode { + + public InsertOverwriteTableNode(Config config) { + super(config); + } + + @Override + protected JavaRDD ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, + Option commitTime) + throws Exception { + log.info("Execute insert overwrite table node {}", this.getName()); + return hoodieTestSuiteWriter.insertOverwriteTable(commitTime); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/PrestoQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/PrestoQueryNode.java new file mode 100644 index 0000000000000..45f087717cd8a --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/PrestoQueryNode.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.Statement; + +public class PrestoQueryNode extends BaseQueryNode { + + public PrestoQueryNode(DeltaConfig.Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext context, int curItrCount) throws Exception { + if (!context.getHoodieTestSuiteWriter().getCfg().enablePrestoValidation) { + return; + } + int validateOnceEveryItr = config.validateOnceEveryIteration(); + int itrCountToExecute = config.getIterationCountToExecute(); + if ((itrCountToExecute != -1 && itrCountToExecute == curItrCount) + || (itrCountToExecute == -1 && ((curItrCount % validateOnceEveryItr) == 0))) { + log.info("Executing presto query node {}", this.getName()); + String url = context.getHoodieTestSuiteWriter().getCfg().prestoJdbcUrl; + if (StringUtils.isNullOrEmpty(url)) { + throw new IllegalArgumentException("Presto JDBC connection url not provided. Please set --presto-jdbc-url."); + } + String user = context.getHoodieTestSuiteWriter().getCfg().prestoUsername; + String pass = context.getHoodieTestSuiteWriter().getCfg().prestoPassword; + try { + Class.forName("com.facebook.presto.jdbc.PrestoDriver"); + } catch (ClassNotFoundException e) { + throw new HoodieValidationException("Presto query validation failed due to " + e.getMessage(), e); + } + try (Connection connection = DriverManager.getConnection(url, user, pass)) { + Statement stmt = connection.createStatement(); + setSessionProperties(this.config.getPrestoProperties(), stmt); + executeAndValidateQueries(this.config.getPrestoQueries(), stmt); + stmt.close(); + } catch (Exception e) { + throw new HoodieValidationException("Presto query validation failed due to " + e.getMessage(), e); + } + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/RollbackNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/RollbackNode.java index 12588ac036cbd..1e9be1b0a3b0e 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/RollbackNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/RollbackNode.java @@ -23,7 +23,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector; @@ -42,25 +41,31 @@ public RollbackNode(Config config) { * Method helps to rollback the last commit instant in the timeline, if it has one. * * @param executionContext Execution context to perform this rollback + * @param curItrCount current iteration count. 
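   * (For the query validation nodes above, e.g. PrestoQueryNode, curItrCount gates whether validation
   * runs in a given iteration, roughly as:
   *   (itrCountToExecute != -1 && itrCountToExecute == curItrCount)
   *     || (itrCountToExecute == -1 && curItrCount % validateOnceEveryItr == 0)
   * so with validateOnceEveryIteration() == 3 and getIterationCountToExecute() == -1 it validates on
   * iterations 3, 6, 9, ..., and with getIterationCountToExecute() == 5 only on iteration 5.)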
* @throws Exception will be thrown if any error occurred */ @Override - public void execute(ExecutionContext executionContext) throws Exception { - log.info("Executing rollback node {}", this.getName()); + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { + int numRollbacks = config.getNumRollbacks(); + log.info(String.format("Executing rollback node %s with %d rollbacks", this.getName(), numRollbacks)); // Can only be done with an instantiation of a new WriteClient hence cannot be done during DeltaStreamer // testing for now - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(), - executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath); - Option lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant(); - if (lastInstant.isPresent()) { - log.info("Rolling back last instant {}", lastInstant.get()); - log.info("Cleaning up generated data for the instant being rolled back {}", lastInstant.get()); - ValidationUtils.checkArgument(executionContext.getWriterContext().getProps().getOrDefault(DFSPathSelector.Config.SOURCE_INPUT_SELECTOR, - DFSPathSelector.class.getName()).toString().equalsIgnoreCase(DFSTestSuitePathSelector.class.getName()), "Test Suite only supports DFSTestSuitePathSelector"); - executionContext.getHoodieTestSuiteWriter().getWriteClient(this).rollback(lastInstant.get().getTimestamp()); - metaClient.getFs().delete(new Path(executionContext.getWriterContext().getCfg().inputBasePath, - executionContext.getWriterContext().getHoodieTestSuiteWriter().getLastCheckpoint().orElse("")), true); - this.result = lastInstant; + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(executionContext.getHoodieTestSuiteWriter().getConfiguration()).setBasePath(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath) + .build(); + for (int i = 0; i < numRollbacks; i++) { + metaClient.reloadActiveTimeline(); + Option lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant(); + if (lastInstant.isPresent()) { + log.info("Rolling back last instant {}", lastInstant.get()); + log.info("Cleaning up generated data for the instant being rolled back {}", lastInstant.get()); + ValidationUtils.checkArgument(executionContext.getWriterContext().getProps().getOrDefault(DFSPathSelector.Config.SOURCE_INPUT_SELECTOR, + DFSPathSelector.class.getName()).toString().equalsIgnoreCase(DFSTestSuitePathSelector.class.getName()), "Test Suite only supports DFSTestSuitePathSelector"); + executionContext.getHoodieTestSuiteWriter().getWriteClient(this).rollback(lastInstant.get().getTimestamp()); + metaClient.getFs().delete(new Path(executionContext.getWriterContext().getCfg().inputBasePath, + executionContext.getWriterContext().getHoodieTestSuiteWriter().getLastCheckpoint().orElse("")), true); + this.result = lastInstant; + } } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java index 0aa67f417467f..0297bc70384f0 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ScheduleCompactNode.java @@ -35,25 +35,25 @@ public ScheduleCompactNode(Config config) { } @Override - public void execute(ExecutionContext executionContext) throws Exception { + 
public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { log.info("Executing schedule compact node {}", this.getName()); // Can only be done with an instantiation of a new WriteClient hence cannot be done during DeltaStreamer // testing for now // Find the last commit and extra the extra metadata to be passed to the schedule compaction. This is // done to ensure the CHECKPOINT is correctly passed from commit to commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(), - executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath); + HoodieTableMetaClient metaClient = + HoodieTableMetaClient.builder().setConf(executionContext.getHoodieTestSuiteWriter().getConfiguration()).setBasePath(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath) + .build(); Option lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant(); if (lastInstant.isPresent()) { HoodieCommitMetadata metadata = org.apache.hudi.common.model.HoodieCommitMetadata.fromBytes(metaClient .getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class); Option scheduledInstant = executionContext.getHoodieTestSuiteWriter().scheduleCompaction(Option.of(metadata - .getExtraMetadata())); + .getExtraMetadata())); if (scheduledInstant.isPresent()) { log.info("Scheduling compaction instant {}", scheduledInstant.get()); } this.result = scheduledInstant; } } - } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/SparkSQLQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/SparkSQLQueryNode.java index e06d6defe8c77..8efd96c11df24 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/SparkSQLQueryNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/SparkSQLQueryNode.java @@ -42,10 +42,11 @@ public SparkSQLQueryNode(Config config) { * Method helps to execute a sparkSql query from a hive table. * * @param executionContext Execution context to perform this query. + * @param curItrCount current iteration count. * @throws Exception will be thrown if ant error occurred */ @Override - public void execute(ExecutionContext executionContext) throws Exception { + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { log.info("Executing spark sql query node"); this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration()); this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter()); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/TrinoQueryNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/TrinoQueryNode.java new file mode 100644 index 0000000000000..1a53e29fa0987 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/TrinoQueryNode.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.Statement; + +public class TrinoQueryNode extends BaseQueryNode { + + public TrinoQueryNode(DeltaConfig.Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext context, int curItrCount) throws Exception { + log.info("Executing trino query node {}", this.getName()); + String url = context.getHoodieTestSuiteWriter().getCfg().trinoJdbcUrl; + if (StringUtils.isNullOrEmpty(url)) { + throw new IllegalArgumentException("Trino JDBC connection url not provided. Please set --trino-jdbc-url."); + } + String user = context.getHoodieTestSuiteWriter().getCfg().trinoUsername; + String pass = context.getHoodieTestSuiteWriter().getCfg().trinoPassword; + try { + Class.forName("io.trino.jdbc.TrinoDriver"); + } catch (ClassNotFoundException e) { + throw new HoodieValidationException("Trino query validation failed due to " + e.getMessage(), e); + } + try (Connection connection = DriverManager.getConnection(url, user, pass)) { + Statement stmt = connection.createStatement(); + setSessionProperties(this.config.getTrinoProperties(), stmt); + executeAndValidateQueries(this.config.getTrinoQueries(), stmt); + stmt.close(); + } catch (Exception e) { + throw new HoodieValidationException("Trino query validation failed due to " + e.getMessage(), e); + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/UpsertNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/UpsertNode.java index 1377a4d6b80a2..427ee74b63882 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/UpsertNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/UpsertNode.java @@ -38,7 +38,7 @@ public UpsertNode(Config config) { protected void generate(DeltaGenerator deltaGenerator) throws Exception { if (!config.isDisableGenerate()) { log.info("Generating input data {}", this.getName()); - deltaGenerator.writeRecords(deltaGenerator.generateUpdates(config)).count(); + deltaGenerator.writeRecords(deltaGenerator.generateUpdates(config)).getValue().count(); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateAsyncOperations.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateAsyncOperations.java new file mode 100644 index 0000000000000..714f3bf6cac81 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateAsyncOperations.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CleanerUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Node to validate data set sanity like total file versions retained, has cleaning happened, has archival happened, etc. + */ +public class ValidateAsyncOperations extends DagNode> { + + private static Logger log = LoggerFactory.getLogger(ValidateAsyncOperations.class); + + public ValidateAsyncOperations(Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext executionContext, int curItrCount) throws Exception { + if (config.getIterationCountToExecute() == curItrCount) { + try { + log.warn("Executing ValidateHoodieAsyncOperations node {} with target base path {} ", this.getName(), + executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath); + String basePath = executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath; + + int maxCommitsRetained = executionContext.getHoodieTestSuiteWriter().getWriteConfig().getCleanerCommitsRetained() + 1; + FileSystem fs = FSUtils.getFs(basePath, executionContext.getHoodieTestSuiteWriter().getConfiguration()); + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath) + .setConf(executionContext.getJsc().hadoopConfiguration()).build(); + Option latestCleanInstant = metaClient.getActiveTimeline().getCleanerTimeline().filterCompletedInstants().lastInstant(); + if (latestCleanInstant.isPresent()) { + log.warn("Latest clean commit " + latestCleanInstant.get()); + HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, latestCleanInstant.get()); + String earliestCommitToRetain = cleanMetadata.getEarliestCommitToRetain(); + log.warn("Earliest commit to retain : " + earliestCommitToRetain); + long unCleanedInstants = metaClient.getActiveTimeline().filterCompletedInstants().filter(instant -> + HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, earliestCommitToRetain)).getInstants().count(); + ValidationUtils.checkArgument(unCleanedInstants >= (maxCommitsRetained + 1), "Total uncleaned instants " + 
unCleanedInstants + + " mismatched with max commits retained " + (maxCommitsRetained + 1)); + } + + if (config.validateArchival() || config.validateClean()) { + final Pattern ARCHIVE_FILE_PATTERN = + Pattern.compile("\\.commits_\\.archive\\..*"); + final Pattern CLEAN_FILE_PATTERN = + Pattern.compile(".*\\.clean\\..*"); + + String metadataPath = executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/.hoodie"; + FileStatus[] metaFileStatuses = fs.listStatus(new Path(metadataPath)); + boolean cleanFound = false; + for (FileStatus fileStatus : metaFileStatuses) { + Matcher cleanFileMatcher = CLEAN_FILE_PATTERN.matcher(fileStatus.getPath().getName()); + if (cleanFileMatcher.matches()) { + cleanFound = true; + break; + } + } + + String archivalPath = executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/.hoodie/archived"; + metaFileStatuses = fs.listStatus(new Path(archivalPath)); + boolean archFound = false; + for (FileStatus fileStatus : metaFileStatuses) { + Matcher archFileMatcher = ARCHIVE_FILE_PATTERN.matcher(fileStatus.getPath().getName()); + if (archFileMatcher.matches()) { + archFound = true; + } + } + + if (config.validateArchival() && !archFound) { + throw new AssertionError("Archival NotFound in " + metadataPath); + } + + if (config.validateClean() && !cleanFound) { + throw new AssertionError("Clean commits NotFound in " + metadataPath); + } + } + } catch (Exception e) { + log.warn("Exception thrown in ValidateHoodieAsyncOperations Node :: " + e.getCause() + ", msg :: " + e.getMessage()); + throw e; + } + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java new file mode 100644 index 0000000000000..bd50616d142a4 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This validation node uses spark datasource for comparison purposes. 
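 * A minimal sketch of the comparison this enables, assuming the base class obtains the generated
 * input as a Dataset<Row> (inputDf) and calls getDatasetToValidate(...) below for the Hudi side
 * (hudiDf); the exact mechanics live in BaseValidateDatasetNode, which is outside this hunk:
 *
 *   long mismatches = inputDf.except(hudiDf).count() + hudiDf.except(inputDf).count();
 *   if (mismatches != 0) {
 *     throw new AssertionError("Input and Hudi datasets differ by " + mismatches + " rows");
 *   }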
+ */ +public class ValidateDatasetNode extends BaseValidateDatasetNode { + + private static final Logger LOG = LoggerFactory.getLogger(ValidateDatasetNode.class); + + public ValidateDatasetNode(DeltaConfig.Config config) { + super(config); + } + + @Override + public Logger getLogger() { + return LOG; + } + + @Override + public Dataset getDatasetToValidate(SparkSession session, ExecutionContext context, + StructType inputSchema) { + String partitionPathField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.PARTITIONPATH_FIELD().key()); + String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + (partitionPathField.isEmpty() ? "/" : "/*/*/*"); + Dataset hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(context.getHoodieTestSuiteWriter().getCfg().enableMetadataOnRead)) + .format("hudi").load(hudiPath); + return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) + .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateNode.java index 37244c0a833c0..e4c4adb1f7d83 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateNode.java @@ -40,9 +40,10 @@ public ValidateNode(Config config, Function, R> function) { * was set to true or default, but the parent nodes have not completed yet. * * @param executionContext Context to execute this node + * @param curItrCount current iteration count. 
*/ @Override - public void execute(ExecutionContext executionContext) { + public void execute(ExecutionContext executionContext, int curItrCount) { if (this.getParentNodes().size() > 0 && (Boolean) this.config.getOtherConfigs().getOrDefault("WAIT_FOR_PARENTS", true)) { for (DagNode node : (List) this.getParentNodes()) { diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/scheduler/DagScheduler.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/scheduler/DagScheduler.java index 5c70ea164e873..847f0a43c511c 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/scheduler/DagScheduler.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/scheduler/DagScheduler.java @@ -23,8 +23,9 @@ import org.apache.hudi.integ.testsuite.dag.WorkflowDag; import org.apache.hudi.integ.testsuite.dag.WriterContext; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; -import org.apache.hudi.metrics.Metrics; +import org.apache.hudi.integ.testsuite.dag.nodes.DelayNode; +import org.apache.spark.api.java.JavaSparkContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -50,9 +51,9 @@ public class DagScheduler { private WorkflowDag workflowDag; private ExecutionContext executionContext; - public DagScheduler(WorkflowDag workflowDag, WriterContext writerContext) { + public DagScheduler(WorkflowDag workflowDag, WriterContext writerContext, JavaSparkContext jsc) { this.workflowDag = workflowDag; - this.executionContext = new ExecutionContext(null, writerContext); + this.executionContext = new ExecutionContext(jsc, writerContext); } /** @@ -63,7 +64,7 @@ public DagScheduler(WorkflowDag workflowDag, WriterContext writerContext) { public void schedule() throws Exception { ExecutorService service = Executors.newFixedThreadPool(2); try { - execute(service, workflowDag.getNodeList()); + execute(service, workflowDag); service.shutdown(); } finally { if (!service.isShutdown()) { @@ -77,33 +78,45 @@ public void schedule() throws Exception { * Method to start executing the nodes in workflow DAGs. 
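   * For example, with workflowDag.getRounds() == 3 and workflowDag.getIntermittentDelayMins() == 1,
   * the node list is cloned and executed three times, and a DelayNode(1) is run between consecutive
   * rounds (see the loop below).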
* * @param service ExecutorService - * @param nodes Nodes to be executed + * @param workflowDag instance of workflow dag that needs to be executed * @throws Exception will be thrown if ant error occurred */ - private void execute(ExecutorService service, List nodes) throws Exception { + private void execute(ExecutorService service, WorkflowDag workflowDag) throws Exception { // Nodes at the same level are executed in parallel - Queue queue = new PriorityQueue<>(nodes); log.info("Running workloads"); + List nodes = workflowDag.getNodeList(); + int curRound = 1; do { - List futures = new ArrayList<>(); - Set childNodes = new HashSet<>(); - while (queue.size() > 0) { - DagNode nodeToExecute = queue.poll(); - log.info("Node to execute in dag scheduler " + nodeToExecute.getConfig().toString()); - futures.add(service.submit(() -> executeNode(nodeToExecute))); - if (nodeToExecute.getChildNodes().size() > 0) { - childNodes.addAll(nodeToExecute.getChildNodes()); - } + log.warn("==================================================================="); + log.warn("Running workloads for round num " + curRound); + log.warn("==================================================================="); + Queue queue = new PriorityQueue<>(); + for (DagNode dagNode : nodes) { + queue.add(dagNode.clone()); } - queue.addAll(childNodes); - childNodes.clear(); - for (Future future : futures) { - future.get(1, TimeUnit.HOURS); + do { + List futures = new ArrayList<>(); + Set childNodes = new HashSet<>(); + while (queue.size() > 0) { + DagNode nodeToExecute = queue.poll(); + log.warn("Executing node \"" + nodeToExecute.getConfig().getOtherConfigs().get(CONFIG_NAME) + "\" :: " + nodeToExecute.getConfig()); + int finalCurRound = curRound; + futures.add(service.submit(() -> executeNode(nodeToExecute, finalCurRound))); + if (nodeToExecute.getChildNodes().size() > 0) { + childNodes.addAll(nodeToExecute.getChildNodes()); + } + } + queue.addAll(childNodes); + childNodes.clear(); + for (Future future : futures) { + future.get(1, TimeUnit.HOURS); + } + } while (queue.size() > 0); + log.info("Finished workloads for round num " + curRound); + if (curRound < workflowDag.getRounds()) { + new DelayNode(workflowDag.getIntermittentDelayMins()).execute(executionContext, curRound); } - - // After each level, report and flush the metrics - Metrics.flush(); - } while (queue.size() > 0); + } while (curRound++ < workflowDag.getRounds()); log.info("Finished workloads"); } @@ -112,15 +125,14 @@ private void execute(ExecutorService service, List nodes) throws Except * * @param node The node to be executed */ - private void executeNode(DagNode node) { + protected void executeNode(DagNode node, int curRound) { if (node.isCompleted()) { throw new RuntimeException("DagNode already completed! 
Cannot re-execute"); } try { int repeatCount = node.getConfig().getRepeatCount(); while (repeatCount > 0) { - log.warn("executing node: \"" + node.getConfig().getOtherConfigs().get(CONFIG_NAME) + "\" of type: " + node.getClass() + " :: " + node.getConfig().toString()); - node.execute(executionContext); + node.execute(executionContext, curRound); log.info("Finished executing {}", node.getName()); repeatCount--; } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/scheduler/SaferSchemaDagScheduler.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/scheduler/SaferSchemaDagScheduler.java new file mode 100644 index 0000000000000..70ebe6c541414 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/scheduler/SaferSchemaDagScheduler.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.scheduler; + +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.integ.testsuite.dag.WorkflowDag; +import org.apache.hudi.integ.testsuite.dag.WriterContext; +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; + +public class SaferSchemaDagScheduler extends DagScheduler { + private static Logger LOG = LogManager.getLogger(SaferSchemaDagScheduler.class); + int processedVersion; + + public SaferSchemaDagScheduler(WorkflowDag workflowDag, WriterContext writerContext, JavaSparkContext jsc) { + super(workflowDag, writerContext, jsc); + } + + public SaferSchemaDagScheduler(WorkflowDag workflowDag, WriterContext writerContext, JavaSparkContext jsc, int version) { + super(workflowDag, writerContext, jsc); + processedVersion = version; + } + + @Override + protected void executeNode(DagNode node, int curRound) throws HoodieException { + if (node.getConfig().getSchemaVersion() < processedVersion) { + LOG.info(String.format("----------------- Processed SaferSchema version %d is available. " + + "Skipping redundant Insert Operation. 
(Processed = %d) -----------------", + node.getConfig().getSchemaVersion(), processedVersion)); + return; + } + super.executeNode(node, curRound); + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeleteGeneratorIterator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeleteGeneratorIterator.java index b95bd0e9df143..22729fea5ba32 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeleteGeneratorIterator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeleteGeneratorIterator.java @@ -21,9 +21,7 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; -import java.util.ArrayList; import java.util.Iterator; -import java.util.List; /** * Lazy delete record generator. diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java index 53af8eb74068e..e643d9f9d0f97 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java @@ -18,25 +18,9 @@ package org.apache.hudi.integ.testsuite.generator; -import java.io.IOException; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.StreamSupport; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.converter.Converter; @@ -45,12 +29,16 @@ import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader; import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader; import org.apache.hudi.integ.testsuite.reader.DeltaInputReader; +import org.apache.hudi.integ.testsuite.schema.SchemaUtils; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter; import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory; import org.apache.hudi.keygen.BuiltinKeyGenerator; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; @@ -58,6 +46,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import 
java.util.stream.StreamSupport; import scala.Tuple2; @@ -82,11 +83,11 @@ public DeltaGenerator(DFSDeltaConfig deltaOutputConfig, JavaSparkContext jsc, Sp this.jsc = jsc; this.sparkSession = sparkSession; this.schemaStr = schemaStr; - this.recordRowKeyFieldNames = keyGenerator.getRecordKeyFields(); + this.recordRowKeyFieldNames = keyGenerator.getRecordKeyFieldNames(); this.partitionPathFieldNames = keyGenerator.getPartitionPathFields(); } - public JavaRDD writeRecords(JavaRDD records) { + public Pair> writeRecords(JavaRDD records) { if (deltaOutputConfig.shouldDeleteOldInputData() && batchId > 1) { Path oldInputDir = new Path(deltaOutputConfig.getDeltaBasePath(), Integer.toString(batchId - 1)); try { @@ -108,24 +109,32 @@ public JavaRDD writeRecords(JavaRDD records) { } }).flatMap(List::iterator); batchId++; - return ws; + return Pair.of(batchId, ws); + } + + public int getBatchId() { + return batchId; } public JavaRDD generateInserts(Config operation) { int numPartitions = operation.getNumInsertPartitions(); - long recordsPerPartition = operation.getNumRecordsInsert() / numPartitions; + long recordsPerPartition = operation.getNumRecordsInsert(); int minPayloadSize = operation.getRecordSize(); int startPartition = operation.getStartPartition(); // Each spark partition below will generate records for a single partition given by the integer index. - List partitionIndexes = IntStream.rangeClosed(0 + startPartition, numPartitions + startPartition) + List partitionIndexes = IntStream.rangeClosed(0 + startPartition, numPartitions + startPartition - 1) .boxed().collect(Collectors.toList()); JavaRDD inputBatch = jsc.parallelize(partitionIndexes, numPartitions) .mapPartitionsWithIndex((index, p) -> { return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition, - minPayloadSize, schemaStr, partitionPathFieldNames, (Integer)index)); - }, true); + minPayloadSize, schemaStr, partitionPathFieldNames, numPartitions, startPartition)); + }, true) + .map(record -> { + record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId); + return record; + }); if (deltaOutputConfig.getInputParallelism() < numPartitions) { inputBatch = inputBatch.coalesce(deltaOutputConfig.getInputParallelism()); @@ -149,15 +158,22 @@ public JavaRDD generateUpdates(Config config) throws IOException adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert()); adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert()); } else { - deltaInputReader = - new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(), - schemaStr); - if (config.getFractionUpsertPerFile() > 0) { - adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), - config.getFractionUpsertPerFile()); + if (((DFSDeltaConfig) deltaOutputConfig).shouldUseHudiToGenerateUpdates()) { + deltaInputReader = + new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), + schemaStr); + if (config.getFractionUpsertPerFile() > 0) { + adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), + config.getFractionUpsertPerFile()); + } else { + adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), config + .getNumRecordsUpsert()); + } } else { - adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), config - .getNumRecordsUpsert()); + deltaInputReader = new 
DFSAvroDeltaInputReader(sparkSession, schemaStr, + ((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty()); + adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert()); + adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert()); } } @@ -167,10 +183,13 @@ public JavaRDD generateUpdates(Config config) throws IOException log.info("Repartitioning records into " + numPartition + " partitions for updates"); adjustedRDD = adjustedRDD.repartition(numPartition); log.info("Repartitioning records done for updates"); - UpdateConverter converter = new UpdateConverter(schemaStr, config.getRecordSize(), partitionPathFieldNames, recordRowKeyFieldNames); - JavaRDD updates = converter.convert(adjustedRDD); + JavaRDD convertedRecords = converter.convert(adjustedRDD); + JavaRDD updates = convertedRecords.map(record -> { + record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId); + return record; + }); updates.persist(StorageLevel.DISK_ONLY()); if (inserts == null) { inserts = updates; @@ -197,22 +216,34 @@ public JavaRDD generateDeletes(Config config) throws IOException adjustedRDD = deltaInputReader.read(config.getNumRecordsDelete()); adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsDelete()); } else { - deltaInputReader = - new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(), - schemaStr); - if (config.getFractionUpsertPerFile() > 0) { - adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(), - config.getFractionUpsertPerFile()); + if (((DFSDeltaConfig) deltaOutputConfig).shouldUseHudiToGenerateUpdates()) { + deltaInputReader = + new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(), + schemaStr); + if (config.getFractionUpsertPerFile() > 0) { + adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(), + config.getFractionUpsertPerFile()); + } else { + adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(), config + .getNumRecordsDelete()); + } } else { - adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(), config - .getNumRecordsDelete()); + deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr, + ((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty()); + adjustedRDD = deltaInputReader.read(config.getNumRecordsDelete()); + adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsDelete()); } } + log.info("Repartitioning records for delete"); // persist this since we will make multiple passes over this adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism()); Converter converter = new DeleteConverter(schemaStr, config.getRecordSize()); - JavaRDD deletes = converter.convert(adjustedRDD); + JavaRDD convertedRecords = converter.convert(adjustedRDD); + JavaRDD deletes = convertedRecords.map(record -> { + record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId); + return record; + }); deletes.persist(StorageLevel.DISK_ONLY()); return deletes; } else { @@ -220,7 +251,6 @@ public JavaRDD generateDeletes(Config config) throws IOException } } - public Map getPartitionToCountMap(JavaRDD records) { // Requires us to keep the partitioner the same return records.mapPartitionsWithIndex((index, itr) -> { diff --git 
a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/FlexibleSchemaRecordGenerationIterator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/FlexibleSchemaRecordGenerationIterator.java index 256dfa49ebf6c..4af926246ce57 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/FlexibleSchemaRecordGenerationIterator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/FlexibleSchemaRecordGenerationIterator.java @@ -41,17 +41,23 @@ public class FlexibleSchemaRecordGenerationIterator implements Iterator partitionPathFieldNames; + private String firstPartitionPathField; public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, String schema) { - this(maxEntriesToProduce, GenericRecordFullPayloadGenerator.DEFAULT_PAYLOAD_SIZE, schema, null, 0); + this(maxEntriesToProduce, GenericRecordFullPayloadGenerator.DEFAULT_PAYLOAD_SIZE, schema, null, + GenericRecordFullPayloadGenerator.DEFAULT_NUM_DATE_PARTITIONS, + GenericRecordFullPayloadGenerator.DEFAULT_START_PARTITION); } public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, int minPayloadSize, String schemaStr, - List partitionPathFieldNames, int partitionIndex) { + List partitionPathFieldNames, int numPartitions, int startPartition) { this.counter = maxEntriesToProduce; this.partitionPathFieldNames = new HashSet<>(partitionPathFieldNames); + if (partitionPathFieldNames != null && partitionPathFieldNames.size() > 0) { + this.firstPartitionPathField = partitionPathFieldNames.get(0); + } Schema schema = new Schema.Parser().parse(schemaStr); - this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize, partitionIndex); + this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize, numPartitions, startPartition); } @Override @@ -62,12 +68,18 @@ public boolean hasNext() { @Override public GenericRecord next() { this.counter--; + boolean partitionPathsNonEmpty = partitionPathFieldNames != null && partitionPathFieldNames.size() > 0; if (lastRecord == null) { - GenericRecord record = this.generator.getNewPayload(partitionPathFieldNames); + GenericRecord record = partitionPathsNonEmpty + ? this.generator.getNewPayloadWithTimestamp(this.firstPartitionPathField) + : this.generator.getNewPayload(partitionPathFieldNames); lastRecord = record; return record; } else { - return this.generator.randomize(lastRecord, partitionPathFieldNames); + return partitionPathsNonEmpty + ? 
this.generator.getUpdatePayloadWithTimestamp(lastRecord, + this.partitionPathFieldNames, firstPartitionPathField) + : this.generator.getUpdatePayload(lastRecord, this.partitionPathFieldNames); } } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadGenerator.java index 7d5ca081444f0..9089f751b955a 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadGenerator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/GenericRecordFullPayloadGenerator.java @@ -46,10 +46,12 @@ */ public class GenericRecordFullPayloadGenerator implements Serializable { - private static Logger LOG = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class); - + private static final Logger LOG = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class); public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB + public static final int DEFAULT_NUM_DATE_PARTITIONS = 50; public static final String DEFAULT_HOODIE_IS_DELETED_COL = "_hoodie_is_deleted"; + public static final int DEFAULT_START_PARTITION = 0; + protected final Random random = new Random(); // The source schema used to generate a payload private final transient Schema baseSchema; @@ -58,10 +60,14 @@ public class GenericRecordFullPayloadGenerator implements Serializable { // The index of partition for which records are being generated private int partitionIndex = 0; // The size of a full record where every field of a generic record created contains 1 random value - private final int estimatedFullPayloadSize; + private int estimatedFullPayloadSize; // Number of extra entries to add in a complex/collection field to achieve the desired record size Map extraEntriesMap = new HashMap<>(); + // Start partition - default 0 + private int startPartition = DEFAULT_START_PARTITION; + // The number of unique dates to create + private int numDatePartitions = DEFAULT_NUM_DATE_PARTITIONS; // LogicalTypes in Avro 1.8.2 private static final String DECIMAL = "decimal"; private static final String UUID_NAME = "uuid"; @@ -75,6 +81,13 @@ public GenericRecordFullPayloadGenerator(Schema schema) { this(schema, DEFAULT_PAYLOAD_SIZE); } + public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize, + int numDatePartitions, int startPartition) { + this(schema, minPayloadSize); + this.numDatePartitions = numDatePartitions; + this.startPartition = startPartition; + } + public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) { Pair sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema) .typeEstimateAndNumComplexFields(); @@ -83,19 +96,13 @@ public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) { if (estimatedFullPayloadSize < minPayloadSize) { int numberOfComplexFields = sizeInfo.getRight(); if (numberOfComplexFields < 1) { - LOG.warn("The schema does not have any collections/complex fields. " - + "Cannot achieve minPayloadSize => " + minPayloadSize); + LOG.warn("The schema does not have any collections/complex fields. 
Cannot achieve minPayloadSize : {}", + minPayloadSize); } - determineExtraEntriesRequired(numberOfComplexFields, minPayloadSize - estimatedFullPayloadSize); } } - public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize, int partitionIndex) { - this(schema, minPayloadSize); - this.partitionIndex = partitionIndex; - } - protected static boolean isPrimitive(Schema localSchema) { if (localSchema.getType() != Type.ARRAY && localSchema.getType() != Type.MAP @@ -131,15 +138,28 @@ public GenericRecord getNewPayload(Set partitionPathFieldNames) { return create(baseSchema, partitionPathFieldNames); } + public GenericRecord getNewPayloadWithTimestamp(String tsFieldName) { + return updateTimestamp(create(baseSchema, null), tsFieldName); + } + + public GenericRecord getUpdatePayloadWithTimestamp(GenericRecord record, Set blacklistFields, + String tsFieldName) { + return updateTimestamp(randomize(record, blacklistFields), tsFieldName); + } + protected GenericRecord create(Schema schema, Set partitionPathFieldNames) { GenericRecord result = new GenericData.Record(schema); for (Schema.Field f : schema.getFields()) { - if (isPartialLongField(f, partitionPathFieldNames)) { - // This is a long field used as partition field. Set it to seconds since epoch. - long value = TimeUnit.SECONDS.convert(partitionIndex, TimeUnit.DAYS); - result.put(f.name(), (long) value); + if (f.name().equals(DEFAULT_HOODIE_IS_DELETED_COL)) { + result.put(f.name(), false); } else { - result.put(f.name(), typeConvert(f)); + if (isPartialLongField(f, partitionPathFieldNames)) { + // This is a long field used as partition field. Set it to seconds since epoch. + long value = TimeUnit.SECONDS.convert(partitionIndex, TimeUnit.DAYS); + result.put(f.name(), (long) value); + } else { + result.put(f.name(), typeConvert(f)); + } } } return result; @@ -310,6 +330,19 @@ public boolean validate(GenericRecord record) { return genericData.validate(baseSchema, record); } + /** + * Generates a sequential timestamp (daily increment), and updates the timestamp field of the record. + * Note: When generating records, number of records to be generated must be more than numDatePartitions * parallelism, + * to guarantee that at least numDatePartitions are created. + * + * @VisibleForTesting + */ + public GenericRecord updateTimestamp(GenericRecord record, String fieldName) { + long delta = TimeUnit.SECONDS.convert((partitionIndex++ % numDatePartitions) + startPartition, TimeUnit.DAYS); + record.put(fieldName, delta); + return record; + } + /** * Check whether a schema is option. return true if it match the follows: 1. Its type is Type.UNION 2. Has two types 3. Has a NULL type. 
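   * For example, a field declared as ["null", "string"] is an option schema under this check, while
   * ["int", "string"] is not, since it has no NULL branch.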
*/ diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/UpdateGeneratorIterator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/UpdateGeneratorIterator.java index 51b1fd9ed48dc..89cda658e12cc 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/UpdateGeneratorIterator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/UpdateGeneratorIterator.java @@ -18,16 +18,16 @@ package org.apache.hudi.integ.testsuite.generator; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + /** * A lazy update payload generator to generate {@link GenericRecord}s lazily. */ diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/DFSTestSuitePathSelector.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/DFSTestSuitePathSelector.java index b7d71f583777a..e35033f4fd764 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/DFSTestSuitePathSelector.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/DFSTestSuitePathSelector.java @@ -18,20 +18,6 @@ package org.apache.hudi.integ.testsuite.helpers; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FsStatus; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ImmutablePair; @@ -40,13 +26,25 @@ import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob; import org.apache.hudi.utilities.sources.helpers.DFSPathSelector; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + /** * A custom dfs path selector used only for the hudi test suite. To be used only if workload is not run inline. 
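 * Batch directories are compared by numeric batch id rather than by lexicographic path comparison
 * (see the hunk below), so a directory named "10" is treated as newer than "9", and only directories
 * with lastBatchId < id <= nextBatchId are read in a given pass.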
*/ public class DFSTestSuitePathSelector extends DFSPathSelector { + private static volatile Logger log = LoggerFactory.getLogger(HoodieTestSuiteJob.class); public DFSTestSuitePathSelector(TypedProperties props, Configuration hadoopConf) { @@ -67,6 +65,7 @@ public Pair, String> getNextFilePathsAndMaxModificationTime( lastBatchId = 0; nextBatchId = 1; } + // obtain all eligible files for the batch List eligibleFiles = new ArrayList<>(); FileStatus[] fileStatuses = fs.globStatus( @@ -74,10 +73,10 @@ public Pair, String> getNextFilePathsAndMaxModificationTime( // Say input data is as follow input/1, input/2, input/5 since 3,4 was rolled back and 5 is new generated data // checkpoint from the latest commit metadata will be 2 since 3,4 has been rolled back. We need to set the // next batch id correctly as 5 instead of 3 - Optional correctBatchIdDueToRollback = Arrays.stream(fileStatuses) + Option correctBatchIdDueToRollback = Option.fromJavaOptional(Arrays.stream(fileStatuses) .map(f -> f.getPath().toString().split("/")[f.getPath().toString().split("/").length - 1]) .filter(bid1 -> Integer.parseInt(bid1) > lastBatchId) - .min((bid1, bid2) -> Integer.min(Integer.parseInt(bid1), Integer.parseInt(bid2))); + .min((bid1, bid2) -> Integer.min(Integer.parseInt(bid1), Integer.parseInt(bid2)))); if (correctBatchIdDueToRollback.isPresent() && Integer.parseInt(correctBatchIdDueToRollback.get()) > nextBatchId) { nextBatchId = Integer.parseInt(correctBatchIdDueToRollback.get()); } @@ -87,7 +86,8 @@ public Pair, String> getNextFilePathsAndMaxModificationTime( if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream() .anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) { continue; - } else if (fileStatus.getPath().getName().compareTo(lastBatchId.toString()) > 0) { + } else if (Integer.parseInt(fileStatus.getPath().getName()) > lastBatchId && Integer.parseInt(fileStatus.getPath() + .getName()) <= nextBatchId) { RemoteIterator files = fs.listFiles(fileStatus.getPath(), true); while (files.hasNext()) { eligibleFiles.add(files.next()); @@ -95,7 +95,6 @@ public Pair, String> getNextFilePathsAndMaxModificationTime( } } - log.info("Reading " + eligibleFiles.size() + " files. "); // no data to readAvro if (eligibleFiles.size() == 0) { return new ImmutablePair<>(Option.empty(), diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java index 85a292c2a2701..9dbfcf3860dde 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java @@ -18,13 +18,16 @@ package org.apache.hudi.integ.testsuite.helpers; -import java.io.IOException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hive.service.server.HiveServer2; +import org.apache.hudi.hive.HiveSyncTool; import org.apache.hudi.hive.testutils.HiveTestService; import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; +import org.apache.hadoop.conf.Configuration; +import org.apache.hive.service.server.HiveServer2; + +import java.io.IOException; + /** * Hive Service provider. 
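 * <p>
 * When the workload is configured with a local Hive, an embedded {@link HiveServer2} is started and
 * the Hudi table is synced to it through {@link HiveSyncTool}; otherwise the sync tool is built from
 * the writer's properties and Hadoop configuration instead.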
*/ @@ -46,12 +49,15 @@ public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IO } public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) { + HiveSyncTool hiveSyncTool; if (this.config.isHiveLocal()) { - writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync() - .syncHive(getLocalHiveServer().getHiveConf()); + hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(), + getLocalHiveServer().getHiveConf()); } else { - writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive(); + hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(), + writer.getConfiguration()); } + hiveSyncTool.syncHoodieTable(); } public void stopLocalHiveServiceIfNeeded() throws IOException { diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/ZookeeperServiceProvider.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/ZookeeperServiceProvider.java new file mode 100644 index 0000000000000..afd8a07590587 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/ZookeeperServiceProvider.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.helpers; + +import org.apache.curator.framework.CuratorFramework; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService; +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; + +public class ZookeeperServiceProvider { + + Configuration configuration; + private CuratorFramework client; + private Config config; + + public ZookeeperServiceProvider(Config config, Configuration configuration) { + this.configuration = configuration; + this.config = config; + } + + public void startLocalZookeeperIfNeeded() throws Exception { + if (config.isHiveLocal()) { + ZookeeperTestService zookeeperTestService = new ZookeeperTestService(configuration); + zookeeperTestService.start(); + } + } + + public void stopLocalZookeeperIfNeeded() throws Exception { + if (config.isHiveLocal()) { + ZookeeperTestService zookeeperTestService = new ZookeeperTestService(configuration); + zookeeperTestService.stop(); + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSAvroDeltaInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSAvroDeltaInputReader.java index f1bb02a6998e8..3fa4d375a6fc3 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSAvroDeltaInputReader.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSAvroDeltaInputReader.java @@ -22,11 +22,12 @@ import java.util.Arrays; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.PathFilter; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.SparkSession; + import org.apache.hudi.common.util.Option; import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.sql.SparkSession; /** * A reader of {@link DeltaOutputMode#DFS} and {@link DeltaInputType#AVRO}. 
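 * <p>
 * Avro files under the configured base path are read back through a {@link SparkSession} and
 * returned as a {@link JavaRDD} of {@code GenericRecord}s.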
@@ -46,7 +47,8 @@ public class DFSAvroDeltaInputReader extends DFSDeltaInputReader { } }; - public DFSAvroDeltaInputReader(SparkSession sparkSession, String schemaStr, String basePath, + public DFSAvroDeltaInputReader( + SparkSession sparkSession, String schemaStr, String basePath, Option structName, Option nameSpace) { this.sparkSession = sparkSession; diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSDeltaInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSDeltaInputReader.java index da9268161d4d8..ad6ef10463009 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSDeltaInputReader.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSDeltaInputReader.java @@ -80,7 +80,6 @@ protected Pair getFileStatusIndexRange(List fileSt } if (totalSizeOfFilesPresent <= totalSizeToRead) { endOffset++; - continue; } else { return Pair.of(startOffset, endOffset); } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java index 2bd507ca5a3cf..2648740f54e0f 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/DFSHoodieDatasetInputReader.java @@ -18,29 +18,13 @@ package org.apache.hudi.integ.testsuite.reader; -import static java.util.Map.Entry.comparingByValue; -import static java.util.stream.Collectors.toMap; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -48,21 +32,42 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ParquetReaderIterator; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieMemoryConfig; -import org.apache.parquet.avro.AvroParquetReader; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import 
org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + import scala.Tuple2; +import static java.util.Map.Entry.comparingByValue; +import static java.util.stream.Collectors.toMap; + /** - * This class helps to generate updates from an already existing hoodie dataset. It supports generating updates in - * across partitions, files and records. + * This class helps to generate updates from an already existing hoodie dataset. It supports generating updates in across partitions, files and records. */ public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader { @@ -75,14 +80,15 @@ public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader { public DFSHoodieDatasetInputReader(JavaSparkContext jsc, String basePath, String schemaStr) { this.jsc = jsc; this.schemaStr = schemaStr; - this.metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); + this.metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build(); } protected List getPartitions(Option partitionsLimit) throws IOException { // Using FSUtils.getFS here instead of metaClient.getFS() since we dont want to count these listStatus // calls in metrics as they are not part of normal HUDI operation. - FileSystem fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf()); - List partitionPaths = FSUtils.getAllPartitionPaths(fs, metaClient.getBasePath(), false); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + List partitionPaths = FSUtils.getAllPartitionPaths(engineContext, metaClient.getBasePath(), + HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, false); // Sort partition so we can pick last N partitions by default Collections.sort(partitionPaths); if (!partitionPaths.isEmpty()) { @@ -108,8 +114,14 @@ private JavaPairRDD> getPartitionToFileSlice(HoodieT @Override protected long analyzeSingleFile(String filePath) { - return SparkBasedReader.readParquet(new SparkSession(jsc.sc()), Arrays.asList(filePath), - Option.empty(), Option.empty()).count(); + if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return SparkBasedReader.readParquet(new SparkSession(jsc.sc()), Arrays.asList(filePath), + Option.empty(), Option.empty()).count(); + } else if (filePath.endsWith(HoodieFileFormat.ORC.getFileExtension())) { + return SparkBasedReader.readOrc(new SparkSession(jsc.sc()), Arrays.asList(filePath), + Option.empty(), Option.empty()).count(); + } + throw new UnsupportedOperationException("Format for " + filePath + " is not supported yet."); } private JavaRDD fetchAnyRecordsFromDataset(Option numRecordsToUpdate) throws IOException { @@ -144,19 +156,25 @@ private JavaRDD fetchRecordsFromDataset(Option numPartit // TODO : read record count from metadata // Read the records in a single file - long recordsInSingleFile = iteratorSize(readParquetOrLogFiles(getSingleSliceFromRDD(partitionToFileSlice))); + long recordsInSingleFile = iteratorSize(readColumnarOrLogFiles(getSingleSliceFromRDD(partitionToFileSlice))); int numFilesToUpdate; long 
numRecordsToUpdatePerFile; - if (!numFiles.isPresent() || numFiles.get() == 0) { + if (!numFiles.isPresent() || numFiles.get() <= 0) { // If num files are not passed, find the number of files to update based on total records to update and records // per file - numFilesToUpdate = (int)Math.ceil((double)numRecordsToUpdate.get() / recordsInSingleFile); - // recordsInSingleFile is not average so we still need to account for bias is records distribution - // in the files. Limit to the maximum number of files available. - int totalExistingFilesCount = partitionToFileIdCountMap.values().stream().reduce((a, b) -> a + b).get(); - numFilesToUpdate = Math.min(numFilesToUpdate, totalExistingFilesCount); - log.info("Files to update {}", numFilesToUpdate); - numRecordsToUpdatePerFile = recordsInSingleFile; + numFilesToUpdate = (int) Math.floor((double) numRecordsToUpdate.get() / recordsInSingleFile); + if (numFilesToUpdate > 0) { + // recordsInSingleFile is not average so we still need to account for bias is records distribution + // in the files. Limit to the maximum number of files available. + int totalExistingFilesCount = partitionToFileIdCountMap.values().stream().reduce((a, b) -> a + b).get(); + numFilesToUpdate = Math.min(numFilesToUpdate, totalExistingFilesCount); + log.info("Files to update {}, records to update per file {}", numFilesToUpdate, recordsInSingleFile); + numRecordsToUpdatePerFile = recordsInSingleFile; + } else { + numFilesToUpdate = 1; + numRecordsToUpdatePerFile = numRecordsToUpdate.get(); + log.info("Total records passed in < records in single file. Hence setting numFilesToUpdate to 1 and numRecordsToUpdate to {} ", numRecordsToUpdatePerFile); + } } else { // If num files is passed, find the number of records per file based on either percentage or total records to // update and num files passed @@ -170,6 +188,7 @@ private JavaRDD fetchRecordsFromDataset(Option numPartit partitionPaths.size(), numFilesToUpdate, partitionToFileIdCountMap); JavaRDD updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap, partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile)); + if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get() != numRecordsToUpdatePerFile * numFiles.get()) { long remainingRecordsToAdd = (numRecordsToUpdate.get() - (numRecordsToUpdatePerFile * numFiles.get())); @@ -183,7 +202,7 @@ private JavaRDD fetchRecordsFromDataset(Option numPartit private JavaRDD projectSchema(JavaRDD updates) { // The records read from the hoodie dataset have the hoodie record fields, rewrite the record to eliminate them return updates - .map(r -> HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(r, new Schema.Parser().parse(schemaStr))); + .map(r -> HoodieAvroUtils.rewriteRecord(r, new Schema.Parser().parse(schemaStr))); } private JavaRDD generateUpdates(Map adjustedPartitionToFileIdCountMap, @@ -193,9 +212,9 @@ private JavaRDD generateUpdates(Map adjustedPart return iteratorLimit(p._2, maxFilesToRead); }).flatMap(p -> p).repartition(numFiles).map(fileSlice -> { if (numRecordsToReadPerFile > 0) { - return iteratorLimit(readParquetOrLogFiles(fileSlice), numRecordsToReadPerFile); + return iteratorLimit(readColumnarOrLogFiles(fileSlice), numRecordsToReadPerFile); } else { - return readParquetOrLogFiles(fileSlice); + return readColumnarOrLogFiles(fileSlice); } }).flatMap(p -> p).map(i -> (GenericRecord) i); } @@ -214,7 +233,7 @@ private Map getFilesToReadPerPartition(JavaPairRDD adjustedPartitionToFileIdCountMap = 
new HashMap<>(); partitionToFileIdCountSortedMap.entrySet().stream().forEach(e -> { if (e.getValue() <= numFilesPerPartition) { @@ -241,12 +260,13 @@ private FileSlice getSingleSliceFromRDD(JavaPairRDD> }).take(1).get(0); } - private Iterator readParquetOrLogFiles(FileSlice fileSlice) throws IOException { + private Iterator readColumnarOrLogFiles(FileSlice fileSlice) throws IOException { if (fileSlice.getBaseFile().isPresent()) { - Iterator itr = - new ParquetReaderIterator(AvroParquetReader.builder(new - Path(fileSlice.getBaseFile().get().getPath())).withConf(metaClient.getHadoopConf()).build()); - return itr; + // Read the base files using the latest writer schema. + Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); + HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), + new Path(fileSlice.getBaseFile().get().getPath())); + return reader.getRecordIterator(schema); } else { // If there is no data file, fall back to reading log files HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() @@ -261,8 +281,10 @@ private Iterator readParquetOrLogFiles(FileSlice fileSlice) throw HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES) .withReadBlocksLazily(true) .withReverseReader(false) - .withBufferSize(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE) - .withSpillableMapBasePath(HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH) + .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue()) + .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) + .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) .build(); // readAvro log files Iterable> iterable = () -> scanner.iterator(); @@ -279,9 +301,7 @@ private Iterator readParquetOrLogFiles(FileSlice fileSlice) throw } /** - * Returns the number of elements remaining in {@code iterator}. The iterator - * will be left exhausted: its {@code hasNext()} method will return - * {@code false}. + * Returns the number of elements remaining in {@code iterator}. The iterator will be left exhausted: its {@code hasNext()} method will return {@code false}. */ private static int iteratorSize(Iterator iterator) { int count = 0; @@ -293,11 +313,8 @@ private static int iteratorSize(Iterator iterator) { } /** - * Creates an iterator returning the first {@code limitSize} elements of the - * given iterator. If the original iterator does not contain that many - * elements, the returned iterator will have the same behavior as the original - * iterator. The returned iterator supports {@code remove()} if the original - * iterator does. + * Creates an iterator returning the first {@code limitSize} elements of the given iterator. If the original iterator does not contain that many elements, the returned iterator will have the same + * behavior as the original iterator. The returned iterator supports {@code remove()} if the original iterator does. 
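 * For example, {@code iteratorLimit(records.iterator(), 5)} returns an iterator over at most the
 * first five elements of {@code records}.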
* * @param iterator the iterator to limit * @param limitSize the maximum number of elements in the returned iterator diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/SparkBasedReader.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/SparkBasedReader.java index fc23a47b35331..32969f2cf2c11 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/SparkBasedReader.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/reader/SparkBasedReader.java @@ -18,15 +18,18 @@ package org.apache.hudi.integ.testsuite.reader; -import java.util.List; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.HoodieSparkUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.utilities.schema.RowBasedSchemaProvider; + +import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; + +import java.util.List; + import scala.collection.JavaConverters; @@ -51,7 +54,7 @@ public static JavaRDD readAvro(SparkSession sparkSession, String return HoodieSparkUtils .createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME), - nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE)) + nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE), false, Option.empty()) .toJavaRDD(); } @@ -63,8 +66,21 @@ public static JavaRDD readParquet(SparkSession sparkSession, List return HoodieSparkUtils .createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME), - RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE) + RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE, false, Option.empty()) .toJavaRDD(); } + public static JavaRDD readOrc(SparkSession sparkSession, List + listOfPaths, Option structName, Option nameSpace) { + + Dataset dataSet = sparkSession.read() + .orc((JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq())); + + return HoodieSparkUtils.createRdd(dataSet.toDF(), + structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME), + RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE, + false, Option.empty() + ).toJavaRDD(); + } + } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/schema/SchemaUtils.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/schema/SchemaUtils.java new file mode 100644 index 0000000000000..2de945286d910 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/schema/SchemaUtils.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.schema; + +public class SchemaUtils { + + public static final String SOURCE_ORDERING_FIELD = "test_suite_source_ordering_field"; + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/schema/TestSuiteFileBasedSchemaProvider.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/schema/TestSuiteFileBasedSchemaProvider.java new file mode 100644 index 0000000000000..e67c5afae80dc --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/schema/TestSuiteFileBasedSchemaProvider.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.schema; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.integ.testsuite.dag.WriterContext; +import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; + +import org.apache.avro.Schema; +import org.apache.avro.Schema.Field; +import org.apache.avro.Schema.Type; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Appends source ordering field to both source and target schemas. This is required to assist in validation to differentiate records written in different batches. 
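 * <p>
 * The appended column is {@code SchemaUtils.SOURCE_ORDERING_FIELD}
 * ({@code test_suite_source_ordering_field}), typed as {@code int} with a default of {@code 0}.
 * A minimal usage sketch, given an existing {@code JavaSparkContext jssc} (the schema file property
 * keys below are assumptions about the base {@code FilebasedSchemaProvider} configuration):
 * <pre>{@code
 * TypedProperties props = new TypedProperties();
 * props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", "file:///tmp/source.avsc");
 * props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", "file:///tmp/target.avsc");
 * TestSuiteFileBasedSchemaProvider provider = new TestSuiteFileBasedSchemaProvider(props, jssc);
 * // both schemas now carry the extra ordering column
 * assert provider.getSourceSchema().getField(SchemaUtils.SOURCE_ORDERING_FIELD) != null;
 * assert provider.getTargetSchema().getField(SchemaUtils.SOURCE_ORDERING_FIELD) != null;
 * }</pre>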
+ */ +public class TestSuiteFileBasedSchemaProvider extends FilebasedSchemaProvider { + + protected static Logger log = LogManager.getLogger(WriterContext.class); + + public TestSuiteFileBasedSchemaProvider(TypedProperties props, JavaSparkContext jssc) { + super(props, jssc); + this.sourceSchema = addSourceOrderingFieldToSchema(sourceSchema); + this.targetSchema = addSourceOrderingFieldToSchema(targetSchema); + } + + private Schema addSourceOrderingFieldToSchema(Schema schema) { + List fields = new ArrayList<>(); + for (Schema.Field field : schema.getFields()) { + Schema.Field newField = new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()); + for (Map.Entry prop : field.getObjectProps().entrySet()) { + newField.addProp(prop.getKey(), prop.getValue()); + } + fields.add(newField); + } + Schema.Field sourceOrderingField = + new Schema.Field(SchemaUtils.SOURCE_ORDERING_FIELD, Schema.create(Type.INT), "", 0); + fields.add(sourceOrderingField); + Schema mergedSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); + mergedSchema.setFields(fields); + return mergedSchema; + } + +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSDeltaWriterAdapter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSDeltaWriterAdapter.java index 65e4ee13cf55f..19016a1e4e454 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSDeltaWriterAdapter.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DFSDeltaWriterAdapter.java @@ -30,22 +30,28 @@ */ public class DFSDeltaWriterAdapter implements DeltaWriterAdapter { - private DeltaInputWriter deltaInputGenerator; + private DeltaInputWriter deltaInputWriter; private List metrics = new ArrayList<>(); + private int preCombineFieldVal = 0; - public DFSDeltaWriterAdapter(DeltaInputWriter deltaInputGenerator) { - this.deltaInputGenerator = deltaInputGenerator; + public DFSDeltaWriterAdapter(DeltaInputWriter deltaInputWriter, int preCombineFieldVal) { + this.deltaInputWriter = deltaInputWriter; + this.preCombineFieldVal = preCombineFieldVal; + } + + public DFSDeltaWriterAdapter(DeltaInputWriter deltaInputWriter) { + this.deltaInputWriter = deltaInputWriter; } @Override public List write(Iterator input) throws IOException { while (input.hasNext()) { GenericRecord next = input.next(); - if (this.deltaInputGenerator.canWrite()) { - this.deltaInputGenerator.writeData(next); - } else if (input.hasNext()) { + if (this.deltaInputWriter.canWrite()) { + this.deltaInputWriter.writeData(next); + } else { rollOver(); - this.deltaInputGenerator.writeData(next); + this.deltaInputWriter.writeData(next); } } close(); @@ -54,11 +60,11 @@ public List write(Iterator input) throws IOExcep public void rollOver() throws IOException { close(); - this.deltaInputGenerator = this.deltaInputGenerator.getNewWriter(); + this.deltaInputWriter = this.deltaInputWriter.getNewWriter(); } private void close() throws IOException { - this.deltaInputGenerator.close(); - this.metrics.add(this.deltaInputGenerator.getDeltaWriteStats()); + this.deltaInputWriter.close(); + this.metrics.add(this.deltaInputWriter.getDeltaWriteStats()); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterFactory.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterFactory.java index b4d9b9f8956d8..a00e8e15d401a 100644 --- 
a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterFactory.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/writer/DeltaWriterFactory.java @@ -18,16 +18,17 @@ package org.apache.hudi.integ.testsuite.writer; -import java.io.IOException; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.avro.generic.GenericRecord; + +import java.io.IOException; + /** - * A factory to help instantiate different {@link DeltaWriterAdapter}s depending on the {@link DeltaOutputMode} and - * {@link DeltaInputType}. + * A factory to help instantiate different {@link DeltaWriterAdapter}s depending on the {@link DeltaOutputMode} and {@link DeltaInputType}. */ public class DeltaWriterFactory { @@ -44,9 +45,9 @@ public static DeltaWriterAdapter getDeltaWriterAdapter(DeltaConfig config, Integ DeltaInputWriter fileDeltaInputGenerator = new AvroFileDeltaInputWriter( dfsDeltaConfig.getConfiguration(), StringUtils - .join(new String[]{dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()}, + .join(new String[] {dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()}, "/"), dfsDeltaConfig.getSchemaStr(), dfsDeltaConfig.getMaxFileSize()); - return new DFSDeltaWriterAdapter(fileDeltaInputGenerator); + return new DFSDeltaWriterAdapter(fileDeltaInputGenerator, batchId); default: throw new IllegalArgumentException("Invalid delta input format " + config.getDeltaInputType()); } diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala new file mode 100644 index 0000000000000..6352106326930 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.integ.testsuite + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter} +import org.apache.log4j.LogManager +import org.apache.spark.sql.{SaveMode, SparkSession} + +import java.io.Serializable + +class SparkDataSourceContinuousIngest(val spark: SparkSession, val conf: Configuration, val sourcePath: Path, + val sourceFormat: String, val checkpointFile: Path, hudiBasePath: Path, hudiOptions: java.util.Map[String, String], + minSyncIntervalSeconds: Long) extends Serializable { + + private val log = LogManager.getLogger(getClass) + + def startIngestion(): Unit = { + val fs = sourcePath.getFileSystem(conf) + var checkPointFs = checkpointFile.getFileSystem(conf) + var orderedBatch : Array[FileStatus] = null + if (checkPointFs.exists(checkpointFile)) { + log.info("Checkpoint file exists. ") + val checkpoint = spark.sparkContext.textFile(checkpointFile.toString).collect()(0) + log.warn("Checkpoint to resume from " + checkpoint) + + orderedBatch = fetchListOfFilesToConsume(fs, sourcePath, new PathFilter { + override def accept(path: Path): Boolean = { + path.getName.toLong > checkpoint.toLong + } + }) + if (log.isDebugEnabled) { + log.debug("List of batches to consume in order ") + orderedBatch.foreach(entry => log.warn(" " + entry.getPath.getName)) + } + + } else { + log.warn("No checkpoint file exists. Starting from scratch ") + orderedBatch = fetchListOfFilesToConsume(fs, sourcePath, new PathFilter { + override def accept(path: Path): Boolean = { + true + } + }) + if (log.isDebugEnabled) { + log.debug("List of batches to consume in order ") + orderedBatch.foreach(entry => log.warn(" " + entry.getPath.getName)) + } + } + + if (orderedBatch.isEmpty) { + log.info("All batches have been consumed. Exiting.") + } else { + orderedBatch.foreach(entry => { + log.info("Consuming from batch " + entry) + val pathToConsume = new Path(sourcePath.toString + "/" + entry.getPath.getName) + val df = spark.read.format(sourceFormat).load(pathToConsume.toString) + + df.write.format("hudi").options(hudiOptions).mode(SaveMode.Append).save(hudiBasePath.toString) + writeToFile(checkpointFile, entry.getPath.getName, checkPointFs) + log.info("Completed batch " + entry + ". Moving to next batch. Sleeping for " + minSyncIntervalSeconds + " secs before next batch") + Thread.sleep(minSyncIntervalSeconds * 1000) + }) + } + } + + def fetchListOfFilesToConsume(fs: FileSystem, basePath: Path, pathFilter: PathFilter): Array[FileStatus] = { + val nextBatches = fs.listStatus(basePath, pathFilter) + nextBatches.sortBy(fileStatus => fileStatus.getPath.getName.toLong) + } + + def writeToFile(checkpointFilePath: Path, str: String, fs: FileSystem): Unit = { + if (!fs.exists(checkpointFilePath)) { + fs.create(checkpointFilePath) + } + val fsOutStream = fs.create(checkpointFilePath, true) + fsOutStream.writeBytes(str) + fsOutStream.flush() + fsOutStream.close() + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkBulkInsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkBulkInsertNode.scala new file mode 100644 index 0000000000000..b426f87071127 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkBulkInsertNode.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.DataSourceWriteOptions +import org.apache.spark.rdd.RDD + +/** + * Spark datasource based bulk insert node + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkBulkInsertNode(dagNodeConfig: Config) extends SparkInsertNode(dagNodeConfig) { + + override def getOperation(): String = { + DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeleteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeleteNode.scala new file mode 100644 index 0000000000000..ecf94b94ec975 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeleteNode.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.avro.Schema +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkUtils} +import org.apache.log4j.LogManager +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SaveMode + +import scala.collection.JavaConverters._ + +/** + * Spark datasource based upsert node + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkDeleteNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { + + private val log = LogManager.getLogger(getClass) + config = dagNodeConfig + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. 
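 * <p>
 * Delete records are first materialized to the input path by the delta generator, then read back
 * as Avro and written through the Hudi datasource with the {@code DELETE} operation.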
+ */ + override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + // Deletes can't be fetched using getNextBatch() bcoz, getInsert(schema) from payload will return empty for delete + // records + + val batchIdRecords = context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateDeletes(config)) + batchIdRecords.getValue().count() + + val pathToRead = context.getWriterContext.getCfg.inputBasePath + "/" + batchIdRecords.getKey() + val avroDf = context.getWriterContext.getSparkSession.read.format("avro").load(pathToRead) + val genRecsRDD = HoodieSparkUtils.createRdd(avroDf, "testStructName", "testNamespace", false, + org.apache.hudi.common.util.Option.of(new Schema.Parser().parse(context.getWriterContext.getHoodieTestSuiteWriter.getSchema))) + + val inputDF = AvroConversionUtils.createDataFrame(genRecsRDD, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + context.getWriterContext.getSparkSession) + + inputDF.write.format("hudi") + .options(DataSourceWriteOptions.translateSqlOptions(context.getWriterContext.getProps.asScala.toMap)) + .option(DataSourceWriteOptions.TABLE_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .option(DataSourceWriteOptions.TABLE_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.tableType) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL) + .option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .mode(SaveMode.Append) + .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeletePartitionNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeletePartitionNode.scala new file mode 100644 index 0000000000000..9354deea28bb0 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkDeletePartitionNode.scala @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes + + +import org.apache.avro.Schema +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.common.util.collection.Pair +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.schema.SchemaUtils +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats +import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkUtils} +import org.apache.log4j.LogManager +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SaveMode + +import scala.collection.JavaConverters._ + +/** + * Spark datasource based insert node + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkDeletePartitionNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { + + private val log = LogManager.getLogger(getClass) + config = dagNodeConfig + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. + */ + override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + println("Generating input data for node {}", this.getName) + + context.getWriterContext.getSparkSession.emptyDataFrame.write.format("hudi") + .options(DataSourceWriteOptions.translateSqlOptions(context.getWriterContext.getProps.asScala.toMap)) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), SchemaUtils.SOURCE_ORDERING_FIELD) + .option(DataSourceWriteOptions.TABLE_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .option(DataSourceWriteOptions.TABLE_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.tableType) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.DELETE_PARTITION_OPERATION_OPT_VAL) + .option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .option(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key, config.getPartitionsToDelete) + .mode(SaveMode.Append) + .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala new file mode 100644 index 0000000000000..6c1d39e2f6c37 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.avro.Schema +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.common.util.collection.Pair +import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats +import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkUtils} +import org.apache.log4j.LogManager +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SaveMode + +import scala.collection.JavaConverters._ + +/** + * Spark datasource based insert node + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { + + private val log = LogManager.getLogger(getClass) + config = dagNodeConfig + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. + */ + override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + println("Generating input data for node {}", this.getName) + + val batchIdRecords = writeRecords(context) + batchIdRecords.getValue().count() + + val pathToRead = context.getWriterContext.getCfg.inputBasePath + "/" + batchIdRecords.getKey() + val avroDf = context.getWriterContext.getSparkSession.read.format("avro").load(pathToRead) + val genRecsRDD = HoodieSparkUtils.createRdd(avroDf, "testStructName", "testNamespace", false, + org.apache.hudi.common.util.Option.of(new Schema.Parser().parse(context.getWriterContext.getHoodieTestSuiteWriter.getSchema))) + + val inputDF = AvroConversionUtils.createDataFrame(genRecsRDD, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + context.getWriterContext.getSparkSession) + + inputDF.write.format("hudi") + .options(DataSourceWriteOptions.translateSqlOptions(context.getWriterContext.getProps.asScala.toMap)) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "test_suite_source_ordering_field") + .option(DataSourceWriteOptions.TABLE_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .option(DataSourceWriteOptions.TABLE_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.tableType) + .option(HoodieIndexConfig.INDEX_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.indexType) + .option(DataSourceWriteOptions.OPERATION.key, getOperation()) + .option(HoodieIndexConfig.INDEX_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.indexType) + .option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .mode(SaveMode.Append) + .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) + } + + def getOperation(): String = { + DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL + } + + def writeRecords(context: ExecutionContext): Pair[Integer, JavaRDD[DeltaWriteStats]] = { + context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateInserts(config)) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteNode.scala new file mode 100644 index 0000000000000..6dd2eac522974 --- /dev/null +++ 
b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteNode.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config + +class SparkInsertOverwriteNode(dagNodeConfig: Config) extends SparkInsertNode(dagNodeConfig) { + + override def getOperation(): String = { + DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL + } + +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteTableNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteTableNode.scala new file mode 100644 index 0000000000000..a6b80b3a90cc1 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteTableNode.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config + +class SparkInsertOverwriteTableNode(dagNodeConfig: Config) extends SparkInsertNode(dagNodeConfig) { + + override def getOperation(): String = { + DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala new file mode 100644 index 0000000000000..76e7576b176af --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.common.util.collection.Pair +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats +import org.apache.log4j.LogManager +import org.apache.spark.api.java.JavaRDD + +/** + * Spark datasource based upsert node + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkUpsertNode(dagNodeConfig: Config) extends SparkInsertNode(dagNodeConfig) { + + private val log = LogManager.getLogger(getClass) + + override def getOperation(): String = { + DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL + } + + override def writeRecords(context: ExecutionContext): Pair[Integer, JavaRDD[DeltaWriteStats]] = { + context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateUpdates(config)) + } + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. 
+ */ + /*override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + println("Generating input data for node {}", this.getName) + + val batchIdRecords = writeRecords(context) + batchIdRecords.getValue().count() + + val pathToRead = context.getWriterContext.getCfg.inputBasePath + "/" + batchIdRecords.getKey() + val avroDf = context.getWriterContext.getSparkSession.read.format("avro").load(pathToRead) + val genRecsRDD = HoodieSparkUtils.createRdd(avroDf, "testStructName", "testNamespace", false, + org.apache.hudi.common.util.Option.of(new Schema.Parser().parse(context.getWriterContext.getHoodieTestSuiteWriter.getSchema))) + + val inputDF = AvroConversionUtils.createDataFrame(genRecsRDD, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + context.getWriterContext.getSparkSession) + + inputDF.write.format("hudi") + .options(DataSourceWriteOptions.translateSqlOptions(context.getWriterContext.getProps.asScala.toMap)) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "test_suite_source_ordering_field") + .option(DataSourceWriteOptions.TABLE_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .option(DataSourceWriteOptions.TABLE_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.tableType) + .option(DataSourceWriteOptions.OPERATION.key, getOperation()) + .option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) + .mode(SaveMode.Append) + .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) + }*/ +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/BaseSparkSqlNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/BaseSparkSqlNode.scala new file mode 100644 index 0000000000000..83e5598d49378 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/BaseSparkSqlNode.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.AvroConversionUtils +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD +import org.slf4j.{Logger, LoggerFactory} + +/** + * Abstract class for DAG node of running Spark SQL. + * + * @param dagNodeConfig DAG node configurations. 
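 * <p>
 * Subclasses only provide the SQL statement through {@code queryToRun}; this base class generates
 * an input batch (unless generation is disabled), registers it as the temporary view
 * {@code _spark_sql_temp_table}, and executes the query returned by {@code queryToRun} against the
 * Spark session.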
+ */ +abstract class BaseSparkSqlNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { + + val LOG: Logger = LoggerFactory.getLogger(this.getClass) + val TEMP_TABLE_NAME = "_spark_sql_temp_table" + config = dagNodeConfig + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + def queryToRun(config: Config, context: ExecutionContext): String + + /** + * Prepares the data for the Spark write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + if (!config.isDisableGenerate) { + context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateInserts(config)).getValue().count() + } + context.getWriterContext.getHoodieTestSuiteWriter.getNextBatch + } + + /** + * @return Name of the temp table containing the input data. + */ + def getTempTableName(): String = { + TEMP_TABLE_NAME + } + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. + */ + override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + LOG.info("Run query in Spark SQL ...") + val nextBatch = prepareData(context) + val sparkSession = context.getWriterContext.getSparkSession + val inputDF = AvroConversionUtils.createDataFrame(nextBatch, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + sparkSession) + inputDF.createOrReplaceTempView(TEMP_TABLE_NAME) + + val query = queryToRun(config, context) + SparkSqlUtils.logQuery(LOG, query) + sparkSession.sql(query) + LOG.info("Finish run query in Spark SQL.") + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlCreateTableNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlCreateTableNode.scala new file mode 100644 index 0000000000000..dabe54d822ba6 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlCreateTableNode.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
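A minimal sketch of how a new SQL node plugs into BaseSparkSqlNode (not part of the patch, and SparkSqlCountNode is hypothetical): the base class generates or fetches the batch, registers it as the temp view, and then runs whatever query the subclass returns, so a new node only has to supply the SQL.

package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config
import org.apache.hudi.integ.testsuite.dag.ExecutionContext

class SparkSqlCountNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) {

  config = dagNodeConfig

  // By the time this is called, execute() in the base class has already created the
  // temp view over the prepared input batch.
  override def queryToRun(config: Config, context: ExecutionContext): String = {
    "select count(*) from " + getTempTableName()
  }
}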
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hadoop.fs.Path +import org.apache.hudi.AvroConversionUtils +import org.apache.hudi.client.WriteStatus +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD +import org.slf4j.{Logger, LoggerFactory} + +/** + * DAG node of create table using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlCreateTableNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { + + val LOG: Logger = LoggerFactory.getLogger(classOf[SparkSqlCreateTableNode]) + val TEMP_TABLE_NAME: String = "_spark_sql_temp_table" + + config = dagNodeConfig + + /** + * Execute the {@link DagNode}. + * + * @param context The context needed for an execution of a node. + * @param curItrCount iteration count for executing the node. + * @throws Exception Thrown if the execution failed. + */ + override def execute(context: ExecutionContext, curItrCount: Int): Unit = { + LOG.info("Creating table in Spark SQL ...") + val sparkSession = context.getWriterContext.getSparkSession + val targetTableName = context.getWriterContext.getCfg.targetTableName + val targetBasePath = context.getWriterContext.getCfg.targetBasePath + "_sql" + + if (config.shouldUseCtas) { + // Prepares data for CTAS query + if (!config.isDisableGenerate) { + context.getDeltaGenerator.writeRecords(context.getDeltaGenerator.generateInserts(config)).getValue().count() + } + val nextBatch = context.getWriterContext.getHoodieTestSuiteWriter.getNextBatch + val sparkSession = context.getWriterContext.getSparkSession + val inputDF = AvroConversionUtils.createDataFrame(nextBatch, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + sparkSession) + inputDF.createOrReplaceTempView(TEMP_TABLE_NAME) + } + + // Cleans up the table + sparkSession.sql("drop table if exists " + targetTableName) + if (config.isTableExternal) { + LOG.info("Clean up " + targetBasePath) + val fs = FSUtils.getFs(targetBasePath, context.getJsc.hadoopConfiguration()) + val targetPath = new Path(targetBasePath) + if (fs.exists(targetPath)) { + fs.delete(targetPath, true) + } + } + + // Executes the create table query + val createTableQuery = SparkSqlUtils.constructCreateTableQuery( + config, targetTableName, targetBasePath, + context.getWriterContext.getHoodieTestSuiteWriter.getSchema, TEMP_TABLE_NAME) + SparkSqlUtils.logQuery(LOG, createTableQuery) + sparkSession.sql(createTableQuery) + val targetTableCount = sparkSession.sql("select * from " + targetTableName) + LOG.info("Target table count: " + targetTableCount.count()) + LOG.info("Finish create table in Spark SQL.") + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlDeleteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlDeleteNode.scala new file mode 100644 index 0000000000000..645f2030b32e1 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlDeleteNode.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
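For reference, a hedged sketch (not part of the patch) of the SQL that SparkSqlCreateTableNode ends up running via SparkSqlUtils.constructCreateTableQuery for a non-CTAS external table; table name, base path, columns, and option values are made-up placeholders.

// Rough shape of the generated DDL when shouldUseCtas = false, isTableExternal = true,
// and a partition field is configured (the partition column is moved to the end).
val exampleCreateTableSql: String =
  """create table target_table (
    |timestamp bigint,
    |_row_key string,
    |fare double,
    |rider string
    |) using hudi
    |location '/user/hive/warehouse/target_table_sql'
    |options (
    |type = 'mor',
    |primaryKey = '_row_key',
    |preCombineField = 'timestamp'
    |)
    |partitioned by (rider)""".stripMargin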
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD + +/** + * DAG node of delete using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlDeleteNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Prepares the data for the write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + override def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + val sparkSession = context.getWriterContext.getSparkSession + val recordsToDelete = SparkSqlUtils.generateDeleteRecords( + config, sparkSession, context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + context.getWriterContext.getCfg.targetTableName, sparkSession.sparkContext.defaultParallelism) + LOG.info("Number of records to delete: " + recordsToDelete.count()) + // The update records corresponding to the SQL are only used for data validation + context.getDeltaGenerator().writeRecords(recordsToDelete).getValue().count() + recordsToDelete + } + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + SparkSqlUtils.constructDeleteQuery(config, context.getWriterContext.getSparkSession, + context.getWriterContext.getCfg.targetTableName) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertNode.scala new file mode 100644 index 0000000000000..6fc79f4ab33dc --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertNode.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils + +/** + * DAG node of insert using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlInsertNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. + */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + val targetTableName = context.getWriterContext.getCfg.targetTableName + SparkSqlUtils.constructInsertQuery( + "into", targetTableName, + SparkSqlUtils.getTableSchema(context.getWriterContext.getSparkSession, targetTableName), + getTempTableName()) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertOverwriteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertOverwriteNode.scala new file mode 100644 index 0000000000000..248b70d545e5f --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlInsertOverwriteNode.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils + +/** + * DAG node of insert overwrite using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlInsertOverwriteNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. 
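As a quick reference (not part of the patch): the insert-into and insert-overwrite nodes both delegate to SparkSqlUtils.constructInsertQuery and differ only in the keyword, selecting all non-meta columns from the temp view registered by the base class. Table and column names below are placeholders; _spark_sql_temp_table is the temp view name used by these nodes.

// "into" for SparkSqlInsertNode, "overwrite" for SparkSqlInsertOverwriteNode.
val exampleInsertSql: String =
  "insert into target_table " +
    "select timestamp, _row_key, fare, rider from _spark_sql_temp_table"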
+ * @param context The context needed for an execution of a node. + * @return the query String. + */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + val targetTableName = context.getWriterContext.getCfg.targetTableName + SparkSqlUtils.constructInsertQuery( + "overwrite", targetTableName, + SparkSqlUtils.getTableSchema(context.getWriterContext.getSparkSession, targetTableName), + getTempTableName()) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlMergeNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlMergeNode.scala new file mode 100644 index 0000000000000..52ba6be21ae23 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlMergeNode.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD + +/** + * DAG node of merge using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlMergeNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Prepares the data for the Spark write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + override def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + if (!config.isDisableGenerate) { + context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateUpdates(config)).getValue().count() + } + context.getWriterContext.getHoodieTestSuiteWriter.getNextBatch + } + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. 
+ */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + val targetTableName = context.getWriterContext.getCfg.targetTableName + SparkSqlUtils.constructMergeQuery( + config, targetTableName, + SparkSqlUtils.getTableSchema(context.getWriterContext.getSparkSession, targetTableName), + getTempTableName()) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlUpdateNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlUpdateNode.scala new file mode 100644 index 0000000000000..7405d3ff48cab --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlUpdateNode.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.DagNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.rdd.RDD + +/** + * DAG node of update using Spark SQL. + * + * @param dagNodeConfig DAG node configurations. + */ +class SparkSqlUpdateNode(dagNodeConfig: Config) extends BaseSparkSqlNode(dagNodeConfig) { + + config = dagNodeConfig + + /** + * Prepares the data for the Spark write operation. + * + * @param context The context needed for an execution of a node. + * @return Records in {@link RDD}. + */ + override def prepareData(context: ExecutionContext): RDD[GenericRecord] = { + val sparkSession = context.getWriterContext.getSparkSession + val recordsToUpdate = SparkSqlUtils.generateUpdateRecords( + config, sparkSession, context.getWriterContext.getHoodieTestSuiteWriter.getSchema, + context.getWriterContext.getCfg.targetTableName, sparkSession.sparkContext.defaultParallelism) + LOG.info("Number of records to update: " + recordsToUpdate.count()) + // The update records corresponding to the SQL are only used for data validation + context.getDeltaGenerator().writeRecords(recordsToUpdate).getValue().count() + recordsToUpdate + } + + /** + * Returns the Spark SQL query to execute for this {@link DagNode}. + * + * @param config DAG node configurations. + * @param context The context needed for an execution of a node. + * @return the query String. 
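Illustrative sketch, not part of the patch: SparkSqlMergeNode relies on SparkSqlUtils.constructMergeQuery (added later in this patch), which wraps a select over the temp view and stitches in the node's mergeCondition, matchedAction, and notMatchedAction configs. The concrete values below are placeholders.

val exampleMergeSql: String =
  """merge into target_table as target using (
    |select timestamp, _row_key, fare, rider from _spark_sql_temp_table
    |) source
    |on target._row_key = source._row_key
    |when matched then update set *
    |when not matched then insert *""".stripMargin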
+ */ + override def queryToRun(config: Config, context: ExecutionContext): String = { + SparkSqlUtils.constructUpdateQuery(config, context.getWriterContext.getSparkSession, + context.getWriterContext.getCfg.targetTableName) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlValidateDatasetNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlValidateDatasetNode.scala new file mode 100644 index 0000000000000..01804baa9f148 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/spark/sql/SparkSqlValidateDatasetNode.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes.spark.sql + +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.dag.ExecutionContext +import org.apache.hudi.integ.testsuite.dag.nodes.BaseValidateDatasetNode +import org.apache.hudi.integ.testsuite.utils.SparkSqlUtils +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +/** + * This validation node uses Spark SQL to get data for comparison purposes. + */ +class SparkSqlValidateDatasetNode(dagNodeConfig: Config) extends BaseValidateDatasetNode(dagNodeConfig) { + + val LOG: Logger = LoggerFactory.getLogger(classOf[SparkSqlValidateDatasetNode]) + + config = dagNodeConfig + + /** + * @return {@link Logger} instance to use. + */ + override def getLogger: Logger = LOG + + /** + * @param session {@link SparkSession} instance to use. + * @param context {@link ExecutionContext} instance to use. + * @param inputSchema input schema in {@link StructType} + * @return data in {@link Dataset< Row >} to validate. + */ + override def getDatasetToValidate(session: SparkSession, context: ExecutionContext, + inputSchema: StructType): Dataset[Row] = { + val tableName = context.getWriterContext.getCfg.targetTableName + LOG.info("Validate data in table " + tableName) + val sortedInputFieldNames = inputSchema.fieldNames.sorted + val tableSchema = session.table(tableName).schema + val sortedTableFieldNames = tableSchema.fieldNames + .filter(field => !HoodieRecord.HOODIE_META_COLUMNS.contains(field)).sorted + if (!(sortedInputFieldNames sameElements sortedTableFieldNames)) { + LOG.error("Input schema: ") + inputSchema.printTreeString() + LOG.error("Table schema: ") + tableSchema.printTreeString() + throw new AssertionError("Data set validation failed. 
The schema does not match.") + } + session.sql(SparkSqlUtils.constructSelectQuery(inputSchema, tableName)) + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/utils/SparkSqlUtils.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/utils/SparkSqlUtils.scala new file mode 100644 index 0000000000000..ca7ca6f26a4e2 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/utils/SparkSqlUtils.scala @@ -0,0 +1,527 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.utils + +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord + +import org.apache.hudi.{AvroConversionUtils, HoodieSparkUtils} +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.util.Option +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config +import org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator +import org.apache.hudi.utilities.schema.RowBasedSchemaProvider + +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.SparkSession +import org.apache.spark.storage.StorageLevel + +import org.slf4j.Logger + +import scala.math.BigDecimal.RoundingMode.RoundingMode + +/** + * Utils for test nodes in Spark SQL + */ +object SparkSqlUtils { + + /** + * @param sparkSession spark session to use + * @param tableName table name + * @return table schema excluding meta columns in `StructType` + */ + def getTableSchema(sparkSession: SparkSession, tableName: String): StructType = { + new StructType(sparkSession.table(tableName).schema.fields + .filter(field => !HoodieRecord.HOODIE_META_COLUMNS.contains(field.name))) + } + + /** + * Converts Avro schema in String to the SQL schema expression, with partition fields at the end + * + * For example, given the Avro schema below: + * """ + * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"}, + * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"}, + * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"}, + * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"}, + * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]} + * """ + * and the partition columns Set("rider"), + * the SQL schema expression is: + * """ + * timestamp bigint, + * _row_key string, + * driver string, + * begin_lat double, + * begin_lon double, + * end_lat double, + * end_lon double, + * fare double, + * _hoodie_is_deleted boolean, + * rider string + * """ + * + * @param avroSchemaString Avro schema String + * @param partitionColumns partition 
columns + * @return corresponding SQL schema expression + */ + def convertAvroToSqlSchemaExpression(avroSchemaString: String, partitionColumns: Set[String]): String = { + val fields: Array[(String, String)] = getFieldNamesAndTypes(avroSchemaString) + val reorderedFields = fields.filter(field => !partitionColumns.contains(field._1)) ++ + fields.filter(field => partitionColumns.contains(field._1)) + reorderedFields.map(e => e._1 + " " + e._2).mkString(",\n") + } + + /** + * Converts Avro schema in String to an array of field names. + * + * For example, given the Avro schema below: + * """ + * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"}, + * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"}, + * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"}, + * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"}, + * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]} + * """ + * the output is + * ["timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", + * "fare", "_hoodie_is_deleted"] + * + * @param avroSchemaString Avro schema String + * @return an array of field names. + */ + def convertAvroToFieldNames(avroSchemaString: String): Array[String] = { + getFieldNamesAndTypes(avroSchemaString).map(e => e._1) + } + + /** + * Gets an array of field names and types from Avro schema String. + * + * For example, given the Avro schema below: + * """ + * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"}, + * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"}, + * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"}, + * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"}, + * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]} + * """ + * the output is + * [("timestamp", "bigint"), + * ("_row_key", "string"), + * ("rider", "string", + * ("driver", "string"), + * ("begin_lat", "double"), + * ("begin_lon", "double"), + * ("end_lat", "double"), + * ("end_lon", "double"), + * ("fare", "double"), + * ("_hoodie_is_deleted", "boolean")] + * + * @param avroSchemaString Avro schema String + * @return an array of field names and types + */ + def getFieldNamesAndTypes(avroSchemaString: String): Array[(String, String)] = { + val schema = new Schema.Parser().parse(avroSchemaString) + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + structType.fields.map(field => (field.name, field.dataType.simpleString)) + } + + /** + * Logs the Spark SQL query to run. + * + * @param log {@link Logger} instance to use. + * @param query query String. + */ + def logQuery(log: Logger, query: String): Unit = { + log.warn("----- Running the following Spark SQL query -----") + log.warn(query) + log.warn("-" * 50) + } + + /** + * Constructs the select query. 
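A self-contained sketch of the conversion idea above (not part of the patch): the real helper goes through Spark's Avro-to-StructType converter, whereas this deliberately reduced version maps only the primitive types that appear in the doc examples, using the plain Avro API.

import org.apache.avro.Schema
import scala.collection.JavaConverters._

object AvroToSqlSketch {
  // Reduced type mapping; the production code derives SQL types from Spark's converter instead.
  private val sqlType = Map(
    Schema.Type.LONG -> "bigint",
    Schema.Type.STRING -> "string",
    Schema.Type.DOUBLE -> "double",
    Schema.Type.BOOLEAN -> "boolean")

  def toSchemaExpression(avroSchemaJson: String): String = {
    val schema = new Schema.Parser().parse(avroSchemaJson)
    schema.getFields.asScala
      .map(f => f.name + " " + sqlType(f.schema.getType))
      .mkString(",\n")
  }
}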
+ * + * For example, given the Avro schema below: + * """ + * {"type":"record","name":"triprec","fields":[{"name":"timestamp","type":"long"}, + * {"name":"_row_key","type":"string"},{"name":"rider","type":"string"},{"name":"driver","type":"string"}, + * {"name":"begin_lat","type":"double"},{"name":"begin_lon","type":"double"},{"name":"end_lat","type":"double"}, + * {"name":"end_lon","type":"double"},{"name":"fare","type":"double"}, + * {"name":"_hoodie_is_deleted","type":"boolean","default":false}]} + * """ + * and the partition columns Set("rider"), + * the output is + * """ + * select timestamp, _row_key, driver, begin_lat, begin_lon, end_lat, end_lon, fare, + * _hoodie_is_deleted, rider from _temp_table + * """ + * + * @param inputSchema input Avro schema String. + * @param partitionColumns partition columns + * @param tableName table name. + * @return select query String. + */ + def constructSelectQuery(inputSchema: String, partitionColumns: Set[String], tableName: String): String = { + val fieldNames: Array[String] = SparkSqlUtils.convertAvroToFieldNames(inputSchema) + val reorderedFieldNames = fieldNames.filter(name => !partitionColumns.contains(name)) ++ + fieldNames.filter(name => partitionColumns.contains(name)) + constructSelectQuery(reorderedFieldNames, tableName) + } + + /** + * Constructs the select query with {@link StructType} columns in the select. + * + * @param structType {@link StructType} instance. + * @param tableName table name. + * @return select query String. + */ + def constructSelectQuery(structType: StructType, tableName: String): String = { + constructSelectQuery(structType, Set.empty[String], tableName) + } + + /** + * Constructs the select query with {@link StructType} columns in the select and the partition + * columns at the end. + * + * @param structType {@link StructType} instance. + * @param partitionColumns partition columns in a {@link Set} + * @param tableName table name. + * @return select query String. + */ + def constructSelectQuery(structType: StructType, partitionColumns: Set[String], tableName: String): String = { + val fieldNames: Array[String] = structType.fields.map(field => field.name) + val reorderedFieldNames = fieldNames.filter(name => !partitionColumns.contains(name)) ++ + fieldNames.filter(name => partitionColumns.contains(name)) + constructSelectQuery(reorderedFieldNames, tableName) + } + + /** + * Constructs the select query with a {@link Array} of String. + * + * @param fieldNames field names in String. + * @param tableName table name. + * @return select query String. + */ + def constructSelectQuery(fieldNames: Array[String], tableName: String): String = { + val selectQueryBuilder = new StringBuilder("select "); + selectQueryBuilder.append(fieldNames.mkString(", ")) + selectQueryBuilder.append(" from ") + selectQueryBuilder.append(tableName) + selectQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL create table query based on the configs. + * + * @param config DAG node configurations. + * @param targetTableName target table name. + * @param targetBasePath target bash path for external table. + * @param inputSchema input Avro schema String. + * @param inputTableName name of the table containing input data. + * @return create table query. 
+ */ + def constructCreateTableQuery(config: Config, targetTableName: String, targetBasePath: String, + inputSchema: String, inputTableName: String): String = { + // Constructs create table statement + val createTableQueryBuilder = new StringBuilder("create table ") + createTableQueryBuilder.append(targetTableName) + val partitionColumns: Set[String] = + if (config.getPartitionField.isPresent) Set(config.getPartitionField.get) else Set.empty + if (!config.shouldUseCtas) { + // Adds the schema statement if not using CTAS + createTableQueryBuilder.append(" (") + createTableQueryBuilder.append(SparkSqlUtils.convertAvroToSqlSchemaExpression(inputSchema, partitionColumns)) + createTableQueryBuilder.append("\n)") + } + createTableQueryBuilder.append(" using hudi") + val tableTypeOption = config.getTableType + val primaryKeyOption = config.getPrimaryKey + val preCombineFieldOption = config.getPreCombineField + + // Adds location for external table + if (config.isTableExternal) { + createTableQueryBuilder.append("\nlocation '" + targetBasePath + "'") + } + + // Adds options if set + var options = Array[String]() + if (tableTypeOption.isPresent) { + options :+= ("type = '" + tableTypeOption.get() + "'") + } + if (primaryKeyOption.isPresent) { + options :+= ("primaryKey = '" + primaryKeyOption.get() + "'") + } + if (preCombineFieldOption.isPresent) { + options :+= ("preCombineField = '" + preCombineFieldOption.get() + "'") + } + if (options.length > 0) { + createTableQueryBuilder.append(options.mkString("\noptions ( \n", ",\n", "\n)")) + } + + // Adds partition fields if set + val partitionFieldOption = config.getPartitionField + if (partitionFieldOption.isPresent) { + createTableQueryBuilder.append("\npartitioned by (" + partitionFieldOption.get() + ")") + } + + if (config.shouldUseCtas()) { + // Adds as select query + createTableQueryBuilder.append("\nas\n"); + createTableQueryBuilder.append(constructSelectQuery(inputSchema, partitionColumns, inputTableName)) + } + createTableQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL insert query based on the configs. + * + * @param insertType the insert type, in one of two types: "into" or "overwrite". + * @param targetTableName target table name. + * @param schema table schema to use + * @param inputTableName name of the table containing input data. + * @return insert query. + */ + def constructInsertQuery(insertType: String, targetTableName: String, schema: StructType, + inputTableName: String): String = { + // Constructs insert statement + val insertQueryBuilder = new StringBuilder("insert ") + insertQueryBuilder.append(insertType) + insertQueryBuilder.append(" ") + insertQueryBuilder.append(targetTableName) + insertQueryBuilder.append(" ") + insertQueryBuilder.append(constructSelectQuery(schema, inputTableName)) + insertQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL merge query based on the configs. + * + * @param config DAG node configurations. + * @param targetTableName target table name. + * @param schema table schema to use + * @param inputTableName name of the table containing input data. + * @return merge query. 
+ */ + def constructMergeQuery(config: Config, targetTableName: String, schema: StructType, + inputTableName: String): String = { + val mergeQueryBuilder = new StringBuilder("merge into ") + mergeQueryBuilder.append(targetTableName) + mergeQueryBuilder.append(" as target using (\n") + mergeQueryBuilder.append(constructSelectQuery(schema, inputTableName)) + mergeQueryBuilder.append("\n) source\non ") + mergeQueryBuilder.append(config.getMergeCondition) + mergeQueryBuilder.append("\nwhen matched then ") + mergeQueryBuilder.append(config.getMatchedAction) + mergeQueryBuilder.append("\nwhen not matched then ") + mergeQueryBuilder.append(config.getNotMatchedAction) + mergeQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL update query based on the configs. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param targetTableName target table name. + * @return update query. + */ + def constructUpdateQuery(config: Config, sparkSession: SparkSession, + targetTableName: String): String = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val updateQueryBuilder = new StringBuilder("update ") + updateQueryBuilder.append(targetTableName) + updateQueryBuilder.append(" set ") + updateQueryBuilder.append(config.getUpdateColumn) + updateQueryBuilder.append(" = ") + updateQueryBuilder.append(config.getUpdateColumn) + updateQueryBuilder.append(" * 1.6 ") + updateQueryBuilder.append(" where ") + updateQueryBuilder.append(config.getWhereConditionColumn) + updateQueryBuilder.append(" between ") + updateQueryBuilder.append(bounds._1) + updateQueryBuilder.append(" and ") + updateQueryBuilder.append(bounds._2) + updateQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL delete query based on the configs. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param targetTableName target table name. + * @return delete query. + */ + def constructDeleteQuery(config: Config, sparkSession: SparkSession, + targetTableName: String): String = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val deleteQueryBuilder = new StringBuilder("delete from ") + deleteQueryBuilder.append(targetTableName) + deleteQueryBuilder.append(" where ") + deleteQueryBuilder.append(config.getWhereConditionColumn) + deleteQueryBuilder.append(" between ") + deleteQueryBuilder.append(bounds._1) + deleteQueryBuilder.append(" and ") + deleteQueryBuilder.append(bounds._2) + deleteQueryBuilder.toString() + } + + /** + * Generates the pair of percentile levels based on the ratio in the config. + * + * For example, given ratio as 0.4, the output is (0.3, 0.7). + * + * @param config DAG node configurations. + * @return the lower bound and upper bound percentiles. + */ + def generatePercentiles(config: Config): (Double, Double) = { + val ratio: Double = config.getRatioRecordsChange + (Math.max(0.5 - (ratio / 2.0), 0.0), Math.min(0.5 + (ratio / 2.0), 1.0)) + } + + /** + * @param number input double number + * @param mode rounding mode + * @return rounded double + */ + def roundDouble(number: Double, mode: RoundingMode): Double = { + BigDecimal(number).setScale(4, mode).toDouble + } + + /** + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param targetTableName target table name. + * @return lower and upper bound values based on the percentiles. 
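A worked example of the percentile logic above (not part of the patch); the column name and bound values are placeholders from the demo trip schema.

// With Config.getRatioRecordsChange = 0.4, generatePercentiles returns (0.3, 0.7).
val ratio = 0.4
val percentiles = (math.max(0.5 - ratio / 2.0, 0.0), math.min(0.5 + ratio / 2.0, 1.0))

// getLowerUpperBoundsFromPercentiles then evaluates something like
//   select percentile(fare, 0.3), percentile(fare, 0.7) from target_table
// and the rounded bounds feed the update and delete queries, e.g.
//   update target_table set fare = fare * 1.6  where fare between 12.3456 and 78.9012
//   delete from target_table where fare between 12.3456 and 78.9012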
+ */ + def getLowerUpperBoundsFromPercentiles(config: Config, sparkSession: SparkSession, + targetTableName: String): (Double, Double) = { + val percentiles = generatePercentiles(config) + val result = sparkSession.sql(constructPercentileQuery(config, targetTableName, percentiles)).collect()(0) + (roundDouble(result.get(0).asInstanceOf[Double], BigDecimal.RoundingMode.HALF_DOWN), + roundDouble(result.get(1).asInstanceOf[Double], BigDecimal.RoundingMode.HALF_UP)) + } + + /** + * Constructs the query to get percentiles for the where condition. + * + * @param config DAG node configurations. + * @param targetTableName target table name. + * @param percentiles lower and upper percentiles. + * @return percentile query in String. + */ + def constructPercentileQuery(config: Config, targetTableName: String, + percentiles: (Double, Double)): String = { + val percentileQueryBuilder = new StringBuilder("select percentile(") + percentileQueryBuilder.append(config.getWhereConditionColumn) + percentileQueryBuilder.append(", ") + percentileQueryBuilder.append(percentiles._1) + percentileQueryBuilder.append("), percentile(") + percentileQueryBuilder.append(config.getWhereConditionColumn) + percentileQueryBuilder.append(", ") + percentileQueryBuilder.append(percentiles._2) + percentileQueryBuilder.append(") from ") + percentileQueryBuilder.append(targetTableName) + percentileQueryBuilder.toString() + } + + /** + * Constructs the Spark SQL query to get update or delete records. + * + * @param config DAG node configurations. + * @param targetTableName target table name. + * @param avroSchemaString input Avro schema String. + * @param lowerBound lower bound value for the where condition. + * @param upperBound upper bound value for the where condition. + * @return delete query. + */ + def constructChangedRecordQuery(config: Config, targetTableName: String, avroSchemaString: String, + lowerBound: Double, upperBound: Double): String = { + val recordQueryBuilder = new StringBuilder(constructSelectQuery(avroSchemaString, Set.empty[String], targetTableName)) + recordQueryBuilder.append(" where ") + recordQueryBuilder.append(config.getWhereConditionColumn) + recordQueryBuilder.append(" between ") + recordQueryBuilder.append(lowerBound) + recordQueryBuilder.append(" and ") + recordQueryBuilder.append(upperBound) + recordQueryBuilder.toString() + } + + /** + * Generates the exact same records to update based on the SQL derived from the + * configs for data validation. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param avroSchemaString input Avro schema String. + * @param targetTableName target table name. + * @param parallelism parallelism for RDD + * @return records in {@link JavaRdd[ GenericRecord ]}. 
+ */ + def generateUpdateRecords(config: Config, sparkSession: SparkSession, avroSchemaString: String, + targetTableName: String, parallelism: Int): JavaRDD[GenericRecord] = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val rows = sparkSession.sql( + constructChangedRecordQuery(config, targetTableName, avroSchemaString, bounds._1, bounds._2)) + + val rdd = HoodieSparkUtils + .createRdd(rows, RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME, + RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE, reconcileToLatestSchema = false, Option.empty()) + .map(record => { + record.put(config.getUpdateColumn, record.get(config.getUpdateColumn).toString.toDouble * 1.6) + record + }) + .toJavaRDD() + val repartitionedRdd = rdd.repartition(parallelism) + repartitionedRdd.persist(StorageLevel.DISK_ONLY) + repartitionedRdd + } + + /** + * Generates the exact same records to delete based on the SQL derived from the + * configs for data validation. + * + * @param config DAG node configurations. + * @param sparkSession Spark session. + * @param avroSchemaString input Avro schema String. + * @param targetTableName target table name. + * @param parallelism parallelism for RDD + * @return records in {@link JavaRdd[ GenericRecord ]}. + */ + def generateDeleteRecords(config: Config, sparkSession: SparkSession, avroSchemaString: String, + targetTableName: String, parallelism: Int): JavaRDD[GenericRecord] = { + val bounds = getLowerUpperBoundsFromPercentiles(config, sparkSession, targetTableName) + val rows = sparkSession.sql( + constructChangedRecordQuery(config, targetTableName, avroSchemaString, bounds._1, bounds._2)) + + val rdd = HoodieSparkUtils + .createRdd(rows, RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME, + RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE, reconcileToLatestSchema = false, Option.empty()) + .map(record => { + record.put(GenericRecordFullPayloadGenerator.DEFAULT_HOODIE_IS_DELETED_COL, true) + record + }) + .toJavaRDD() + val repartitionedRdd = rdd.repartition(parallelism) + repartitionedRdd.persist(StorageLevel.DISK_ONLY) + repartitionedRdd + } +} diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/HoodieTestHiveBase.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/HoodieTestHiveBase.java index 95e4c0117cea9..f6c7e991378d3 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/HoodieTestHiveBase.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/HoodieTestHiveBase.java @@ -72,12 +72,12 @@ public void generateDataByHoodieJavaApp(String hiveTableName, String tableType, } // Run Hoodie Java App - String cmd = String.format("%s --hive-sync --table-path %s --hive-url %s --table-type %s --hive-table %s" + - " --commit-type %s --table-name %s", HOODIE_GENERATE_APP, hdfsUrl, HIVE_SERVER_JDBC_URL, + String cmd = String.format("%s --hive-sync --table-path %s --hive-url %s --table-type %s --hive-table %s" + + " --commit-type %s --table-name %s", HOODIE_GENERATE_APP, hdfsUrl, HIVE_SERVER_JDBC_URL, tableType, hiveTableName, commitType, hoodieTableName); if (partitionType == PartitionType.MULTI_KEYS_PARTITIONED) { cmd = cmd + " --use-multi-partition-keys"; - } else if (partitionType == PartitionType.NON_PARTITIONED){ + } else if (partitionType == PartitionType.NON_PARTITIONED) { cmd = cmd + " --non-partitioned"; } executeCommandStringInDocker(ADHOC_1_CONTAINER, cmd, true); diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java 
b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java index 80ed1d4bf40f1..dcb63678027c8 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java @@ -18,9 +18,9 @@ package org.apache.hudi.integ; -import java.util.concurrent.TimeoutException; import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; import com.github.dockerjava.api.DockerClient; import com.github.dockerjava.api.command.DockerCmdExecFactory; @@ -40,8 +40,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; import static java.util.concurrent.TimeUnit.SECONDS; @@ -60,6 +62,7 @@ public abstract class ITTestBase { protected static final String ADHOC_2_CONTAINER = "/adhoc-2"; protected static final String HIVESERVER = "/hiveserver"; protected static final String PRESTO_COORDINATOR = "/presto-coordinator-1"; + protected static final String TRINO_COORDINATOR = "/trino-coordinator-1"; protected static final String HOODIE_WS_ROOT = "/var/hoodie/ws"; protected static final String HOODIE_JAVA_APP = HOODIE_WS_ROOT + "/hudi-spark-datasource/hudi-spark/run_hoodie_app.sh"; protected static final String HOODIE_GENERATE_APP = HOODIE_WS_ROOT + "/hudi-spark-datasource/hudi-spark/run_hoodie_generate_app.sh"; @@ -74,6 +77,7 @@ public abstract class ITTestBase { HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar"; protected static final String HIVE_SERVER_JDBC_URL = "jdbc:hive2://hiveserver:10000"; protected static final String PRESTO_COORDINATOR_URL = "presto-coordinator-1:8090"; + protected static final String TRINO_COORDINATOR_URL = "trino-coordinator-1:8091"; protected static final String HADOOP_CONF_DIR = "/etc/hadoop"; // Skip these lines when capturing output from hive @@ -82,10 +86,7 @@ public abstract class ITTestBase { protected DockerClient dockerClient; protected Map runningContainers; - static String[] getHiveConsoleCommand(String rawCommand) { - String jarCommand = "add jar " + HUDI_HADOOP_BUNDLE + ";"; - String fullCommand = jarCommand + rawCommand; - + static String[] getHiveConsoleCommand(String hiveExpr) { List cmd = new ArrayList<>(); cmd.add("hive"); cmd.add("--hiveconf"); @@ -93,7 +94,7 @@ static String[] getHiveConsoleCommand(String rawCommand) { cmd.add("--hiveconf"); cmd.add("hive.stats.autogather=false"); cmd.add("-e"); - cmd.add("\"" + fullCommand + "\""); + cmd.add("\"" + hiveExpr + "\""); return cmd.toArray(new String[0]); } @@ -114,7 +115,7 @@ static String getSparkShellCommand(String commandFile) { .append(" --master local[2] --driver-class-path ").append(HADOOP_CONF_DIR) .append( " --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 1G --num-executors 1 ") - .append(" --packages org.apache.spark:spark-avro_2.11:2.4.4 ").append(" -i ").append(commandFile).toString(); + .append(" -i ").append(commandFile).toString(); } static String getPrestoConsoleCommand(String commandFile) { @@ -123,6 +124,12 @@ static String getPrestoConsoleCommand(String commandFile) { .append(" -f " + commandFile).toString(); } + static String getTrinoConsoleCommand(String commandFile) { + return new StringBuilder().append("trino --server " + TRINO_COORDINATOR_URL) + 
.append(" --catalog hive --schema default") + .append(" -f " + commandFile).toString(); + } + @BeforeEach public void init() { String dockerHost = (OVERRIDDEN_DOCKER_HOST != null) ? OVERRIDDEN_DOCKER_HOST : DEFAULT_DOCKER_HOST; @@ -130,10 +137,14 @@ public void init() { DockerClientConfig config = DefaultDockerClientConfig.createDefaultConfigBuilder().withDockerHost(dockerHost).build(); // using jaxrs/jersey implementation here (netty impl is also available) - DockerCmdExecFactory dockerCmdExecFactory = new JerseyDockerCmdExecFactory().withConnectTimeout(1000) - .withMaxTotalConnections(100).withMaxPerRouteConnections(10); + DockerCmdExecFactory dockerCmdExecFactory = new JerseyDockerCmdExecFactory().withConnectTimeout(10000) + .withMaxTotalConnections(100).withMaxPerRouteConnections(50); dockerClient = DockerClientBuilder.getInstance(config).withDockerCmdExecFactory(dockerCmdExecFactory).build(); - await().atMost(60, SECONDS).until(this::servicesUp); + LOG.info("Start waiting for all the containers and services to be ready"); + long currTs = System.currentTimeMillis(); + await().atMost(300, SECONDS).until(this::servicesUp); + LOG.info(String.format("Waiting for all the containers and services finishes in %d ms", + System.currentTimeMillis() - currTs)); } private boolean servicesUp() { @@ -144,8 +155,29 @@ private boolean servicesUp() { return false; } } - runningContainers = containerList.stream().map(c -> Pair.of(c.getNames()[0], c)) - .collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); + + if (runningContainers == null) { + runningContainers = containerList.stream().map(c -> Pair.of(c.getNames()[0], c)) + .collect(Collectors.toMap(Pair::getLeft, Pair::getRight)); + } + + return checkHealth(ADHOC_1_CONTAINER, "namenode", 8020); + } + + private boolean checkHealth(String fromContainerName, String hostname, int port) { + try { + String command = String.format("nc -z -v %s %d", hostname, port); + TestExecStartResultCallback resultCallback = + executeCommandStringInDocker(fromContainerName, command, false, true); + String stderrString = resultCallback.getStderr().toString().trim(); + if (!stderrString.contains("open")) { + Thread.sleep(1000); + return false; + } + } catch (Exception e) { + throw new HoodieException(String.format("Exception thrown while checking health from %s for %s:%d", + fromContainerName, hostname, port), e); + } return true; } @@ -153,10 +185,34 @@ private String singleSpace(String str) { return str.replaceAll("[\\s]+", " "); } - private TestExecStartResultCallback executeCommandInDocker(String containerName, String[] command, - boolean expectedToSucceed) throws Exception { - Container sparkWorkerContainer = runningContainers.get(containerName); - ExecCreateCmd cmd = dockerClient.execCreateCmd(sparkWorkerContainer.getId()).withCmd(command).withAttachStdout(true) + private TestExecStartResultCallback executeCommandInDocker( + String containerName, String[] command, boolean expectedToSucceed) throws Exception { + return executeCommandInDocker(containerName, command, true, expectedToSucceed, Collections.emptyMap()); + } + + private TestExecStartResultCallback executeCommandInDocker(String containerName, + String[] command, + boolean checkIfSucceed, + boolean expectedToSucceed) throws Exception { + return executeCommandInDocker(containerName, command, checkIfSucceed, expectedToSucceed, Collections.emptyMap()); + } + + private TestExecStartResultCallback executeCommandInDocker(String containerName, + String[] command, + boolean checkIfSucceed, + boolean 
expectedToSucceed, + Map env) throws Exception { + Container targetContainer = runningContainers.get(containerName); + + List dockerEnv = env.entrySet() + .stream() + .map(e -> String.format("%s=%s", e.getKey(), e.getValue())) + .collect(Collectors.toList()); + + ExecCreateCmd cmd = dockerClient.execCreateCmd(targetContainer.getId()) + .withEnv(dockerEnv) + .withCmd(command) + .withAttachStdout(true) .withAttachStderr(true); ExecCreateCmdResponse createCmdResponse = cmd.exec(); @@ -166,28 +222,31 @@ private TestExecStartResultCallback executeCommandInDocker(String containerName, // try to capture stdout and stderr of the stuck process. boolean completed = - dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false).exec(callback) - .awaitCompletion(540, SECONDS); + dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false).exec(callback) + .awaitCompletion(540, SECONDS); if (!completed) { callback.getStderr().flush(); callback.getStdout().flush(); - LOG.error("\n\n ###### Timed Out Command : " + Arrays.asList(command)); + LOG.error("\n\n ###### Timed Out Command : " + Arrays.asList(command)); LOG.error("\n\n ###### Stderr of timed-out command #######\n" + callback.getStderr().toString()); LOG.error("\n\n ###### stdout of timed-out command #######\n" + callback.getStdout().toString()); - throw new TimeoutException("Command " + command + " has been running for more than 9 minutes. " - + "Killing and failing !!"); + throw new TimeoutException("Command " + command + " has been running for more than 9 minutes. " + + "Killing and failing !!"); } int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode(); LOG.info("Exit code for command : " + exitCode); + if (exitCode != 0) { LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString()); } LOG.error("\n\n ###### Stderr #######\n" + callback.getStderr().toString()); - if (expectedToSucceed) { - assertEquals(0, exitCode, "Command (" + Arrays.toString(command) + ") expected to succeed. Exit (" + exitCode + ")"); - } else { - assertNotEquals(0, exitCode, "Command (" + Arrays.toString(command) + ") expected to fail. Exit (" + exitCode + ")"); + if (checkIfSucceed) { + if (expectedToSucceed) { + assertEquals(0, exitCode, "Command (" + Arrays.toString(command) + ") expected to succeed. Exit (" + exitCode + ")"); + } else { + assertNotEquals(0, exitCode, "Command (" + Arrays.toString(command) + ") expected to fail. 
Exit (" + exitCode + ")"); + } } cmd.close(); return callback; @@ -199,14 +258,20 @@ void executeCommandStringsInDocker(String containerName, List commands) } } - protected TestExecStartResultCallback executeCommandStringInDocker(String containerName, String cmd, boolean expectedToSucceed) + protected TestExecStartResultCallback executeCommandStringInDocker( + String containerName, String cmd, boolean expectedToSucceed) throws Exception { + return executeCommandStringInDocker(containerName, cmd, true, expectedToSucceed); + } + + protected TestExecStartResultCallback executeCommandStringInDocker( + String containerName, String cmd, boolean checkIfSucceed, boolean expectedToSucceed) throws Exception { LOG.info("\n\n#################################################################################################"); LOG.info("Container : " + containerName + ", Running command :" + cmd); LOG.info("\n#################################################################################################"); String[] cmdSplits = singleSpace(cmd).split(" "); - return executeCommandInDocker(containerName, cmdSplits, expectedToSucceed); + return executeCommandInDocker(containerName, cmdSplits, checkIfSucceed, expectedToSucceed); } protected Pair executeHiveCommand(String hiveCommand) throws Exception { @@ -216,7 +281,9 @@ protected Pair executeHiveCommand(String hiveCommand) throws Exc LOG.info("\n#################################################################################################"); String[] hiveCmd = getHiveConsoleCommand(hiveCommand); - TestExecStartResultCallback callback = executeCommandInDocker(HIVESERVER, hiveCmd, true); + Map env = Collections.singletonMap("AUX_CLASSPATH", "file://" + HUDI_HADOOP_BUNDLE); + TestExecStartResultCallback callback = + executeCommandInDocker(HIVESERVER, hiveCmd, true, true, env); return Pair.of(callback.getStdout().toString().trim(), callback.getStderr().toString().trim()); } @@ -251,6 +318,20 @@ void executePrestoCopyCommand(String fromFile, String remotePath) { .exec(); } + Pair executeTrinoCommandFile(String commandFile) throws Exception { + String trinoCmd = getTrinoConsoleCommand(commandFile); + TestExecStartResultCallback callback = executeCommandStringInDocker(ADHOC_1_CONTAINER, trinoCmd, true); + return Pair.of(callback.getStdout().toString().trim(), callback.getStderr().toString().trim()); + } + + void executeTrinoCopyCommand(String fromFile, String remotePath) { + Container adhocContainer = runningContainers.get(ADHOC_1_CONTAINER); + dockerClient.copyArchiveToContainerCmd(adhocContainer.getId()) + .withHostResource(fromFile) + .withRemotePath(remotePath) + .exec(); + } + private void saveUpLogs() { try { // save up the Hive log files for introspection diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java index 0fa9b0ffa6206..f75fe175fbb75 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java @@ -21,9 +21,9 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.collection.Pair; - import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -32,24 +32,34 @@ /** * Goes through steps described in https://hudi.apache.org/docker_demo.html - * + 
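Illustrative sketch, not part of the patch: the new servicesUp/checkHealth logic above treats a service as ready once `nc -z -v host port`, run inside the adhoc container, reports the port as open on stderr. A local, out-of-Docker sketch of the same probe follows; the host and port are placeholders, and it assumes a netcat that prints its verbose status (containing "open", as the test checks) to stderr.

import scala.sys.process._

object PortProbeSketch {
  // Returns true when netcat can connect to host:port.
  def portOpen(host: String, port: Int): Boolean = {
    val stderr = new StringBuilder
    Seq("nc", "-z", "-v", host, port.toString)
      .!(ProcessLogger(_ => (), line => { stderr.append(line); () }))
    stderr.toString.contains("open")
  }
}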
*
    * To run this as a standalone test in the IDE or command line. First bring up the demo setup using * `docker/setup_demo.sh` and then run the test class as you would do normally. */ public class ITTestHoodieDemo extends ITTestBase { + private static final String TRINO_TABLE_CHECK_FILENAME = "trino-table-check.commands"; + private static final String TRINO_BATCH1_FILENAME = "trino-batch1.commands"; + private static final String TRINO_BATCH2_FILENAME = "trino-batch2-after-compaction.commands"; + private static final String HDFS_DATA_DIR = "/usr/hive/data/input"; private static final String HDFS_BATCH_PATH1 = HDFS_DATA_DIR + "/batch_1.json"; private static final String HDFS_BATCH_PATH2 = HDFS_DATA_DIR + "/batch_2.json"; private static final String HDFS_PRESTO_INPUT_TABLE_CHECK_PATH = HDFS_DATA_DIR + "/presto-table-check.commands"; private static final String HDFS_PRESTO_INPUT_BATCH1_PATH = HDFS_DATA_DIR + "/presto-batch1.commands"; private static final String HDFS_PRESTO_INPUT_BATCH2_PATH = HDFS_DATA_DIR + "/presto-batch2-after-compaction.commands"; + private static final String HDFS_TRINO_INPUT_TABLE_CHECK_PATH = HDFS_DATA_DIR + "/" + TRINO_TABLE_CHECK_FILENAME; + private static final String HDFS_TRINO_INPUT_BATCH1_PATH = HDFS_DATA_DIR + "/" + TRINO_BATCH1_FILENAME; + private static final String HDFS_TRINO_INPUT_BATCH2_PATH = HDFS_DATA_DIR + "/" + TRINO_BATCH2_FILENAME; private static final String INPUT_BATCH_PATH1 = HOODIE_WS_ROOT + "/docker/demo/data/batch_1.json"; private static final String PRESTO_INPUT_TABLE_CHECK_RELATIVE_PATH = "/docker/demo/presto-table-check.commands"; private static final String PRESTO_INPUT_BATCH1_RELATIVE_PATH = "/docker/demo/presto-batch1.commands"; private static final String INPUT_BATCH_PATH2 = HOODIE_WS_ROOT + "/docker/demo/data/batch_2.json"; private static final String PRESTO_INPUT_BATCH2_RELATIVE_PATH = "/docker/demo/presto-batch2-after-compaction.commands"; + private static final String TRINO_INPUT_TABLE_CHECK_RELATIVE_PATH = "/docker/demo/" + TRINO_TABLE_CHECK_FILENAME; + private static final String TRINO_INPUT_BATCH1_RELATIVE_PATH = "/docker/demo/" + TRINO_BATCH1_FILENAME; + private static final String TRINO_INPUT_BATCH2_RELATIVE_PATH = "/docker/demo/" + TRINO_BATCH2_FILENAME; private static final String COW_BASE_PATH = "/user/hive/warehouse/stock_ticks_cow"; private static final String MOR_BASE_PATH = "/user/hive/warehouse/stock_ticks_mor"; @@ -82,13 +92,24 @@ public class ITTestHoodieDemo extends ITTestBase { private HoodieFileFormat baseFileFormat; private static String HIVE_SYNC_CMD_FMT = - " --enable-hive-sync --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 " + " --enable-hive-sync --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ " + + " --hoodie-conf hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor " + " --hoodie-conf hoodie.datasource.hive_sync.username=hive " + " --hoodie-conf hoodie.datasource.hive_sync.password=hive " + " --hoodie-conf hoodie.datasource.hive_sync.partition_fields=%s " + " --hoodie-conf hoodie.datasource.hive_sync.database=default " + " --hoodie-conf hoodie.datasource.hive_sync.table=%s"; + @AfterEach + public void clean() throws Exception { + String hdfsCmd = "hdfs dfs -rm -R "; + List tablePaths = CollectionUtils.createImmutableList( + COW_BASE_PATH, MOR_BASE_PATH, COW_BOOTSTRAPPED_BASE_PATH, MOR_BOOTSTRAPPED_BASE_PATH); + for (String tablePath : tablePaths) { + 
executeCommandStringInDocker(ADHOC_1_CONTAINER, hdfsCmd + tablePath, true); + } + } + @Test public void testParquetDemo() throws Exception { baseFileFormat = HoodieFileFormat.PARQUET; @@ -99,12 +120,14 @@ public void testParquetDemo() throws Exception { ingestFirstBatchAndHiveSync(); testHiveAfterFirstBatch(); testPrestoAfterFirstBatch(); + testTrinoAfterFirstBatch(); testSparkSQLAfterFirstBatch(); // batch 2 ingestSecondBatchAndHiveSync(); testHiveAfterSecondBatch(); testPrestoAfterSecondBatch(); + testTrinoAfterSecondBatch(); testSparkSQLAfterSecondBatch(); testIncrementalHiveQueryBeforeCompaction(); testIncrementalSparkSQLQuery(); @@ -114,6 +137,7 @@ public void testParquetDemo() throws Exception { testHiveAfterSecondBatchAfterCompaction(); testPrestoAfterSecondBatchAfterCompaction(); + testTrinoAfterSecondBatchAfterCompaction(); testIncrementalHiveQueryAfterCompaction(); } @@ -122,7 +146,7 @@ public void testParquetDemo() throws Exception { public void testHFileDemo() throws Exception { baseFileFormat = HoodieFileFormat.HFILE; - // TODO: Preseto and SparkSQL support for HFile format + // TODO: Presto, Trino and SparkSQL support for HFile format setupDemo(); @@ -130,12 +154,14 @@ public void testHFileDemo() throws Exception { ingestFirstBatchAndHiveSync(); testHiveAfterFirstBatch(); //testPrestoAfterFirstBatch(); + //testTrinoAfterFirstBatch(); //testSparkSQLAfterFirstBatch(); // batch 2 ingestSecondBatchAndHiveSync(); testHiveAfterSecondBatch(); //testPrestoAfterSecondBatch(); + //testTrinoAfterSecondBatch(); //testSparkSQLAfterSecondBatch(); testIncrementalHiveQueryBeforeCompaction(); //testIncrementalSparkSQLQuery(); @@ -144,6 +170,7 @@ public void testHFileDemo() throws Exception { scheduleAndRunCompaction(); testHiveAfterSecondBatchAfterCompaction(); //testPrestoAfterSecondBatchAfterCompaction(); + //testTrinoAfterSecondBatchAfterCompaction(); //testIncrementalHiveQueryAfterCompaction(); } @@ -151,7 +178,8 @@ private void setupDemo() throws Exception { List cmds = CollectionUtils.createImmutableList("hdfs dfsadmin -safemode wait", "hdfs dfs -mkdir -p " + HDFS_DATA_DIR, "hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH1 + " " + HDFS_BATCH_PATH1, - "/bin/bash " + DEMO_CONTAINER_SCRIPT); + "/bin/bash " + DEMO_CONTAINER_SCRIPT, + "mkdir -p " + HDFS_DATA_DIR); executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); @@ -163,12 +191,15 @@ private void setupDemo() throws Exception { executePrestoCopyCommand(System.getProperty("user.dir") + "/.." + PRESTO_INPUT_TABLE_CHECK_RELATIVE_PATH, HDFS_DATA_DIR); executePrestoCopyCommand(System.getProperty("user.dir") + "/.." + PRESTO_INPUT_BATCH1_RELATIVE_PATH, HDFS_DATA_DIR); executePrestoCopyCommand(System.getProperty("user.dir") + "/.." + PRESTO_INPUT_BATCH2_RELATIVE_PATH, HDFS_DATA_DIR); + + executeTrinoCopyCommand(System.getProperty("user.dir") + "/.." + TRINO_INPUT_TABLE_CHECK_RELATIVE_PATH, HDFS_DATA_DIR); + executeTrinoCopyCommand(System.getProperty("user.dir") + "/.." + TRINO_INPUT_BATCH1_RELATIVE_PATH, HDFS_DATA_DIR); + executeTrinoCopyCommand(System.getProperty("user.dir") + "/.." 
+ TRINO_INPUT_BATCH2_RELATIVE_PATH, HDFS_DATA_DIR); } private void ingestFirstBatchAndHiveSync() throws Exception { List cmds = CollectionUtils.createImmutableList( "spark-submit" - + " --conf \'spark.executor.extraJavaOptions=-Dlog4jspark.root.logger=WARN,console\'" + " --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + " --table-type COPY_ON_WRITE " + " --base-file-format " + baseFileFormat.toString() @@ -184,9 +215,9 @@ private void ingestFirstBatchAndHiveSync() throws Exception { + " --user hive" + " --pass hive" + " --jdbc-url jdbc:hive2://hiveserver:10000" + + " --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor" + " --partitioned-by dt", ("spark-submit" - + " --conf \'spark.executor.extraJavaOptions=-Dlog4jspark.root.logger=WARN,console\'" + " --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + " --table-type MERGE_ON_READ " + " --base-file-format " + baseFileFormat.toString() @@ -200,33 +231,33 @@ private void ingestFirstBatchAndHiveSync() throws Exception { executeSparkSQLCommand(SPARKSQL_BS_PREP_COMMANDS, true); List bootstrapCmds = CollectionUtils.createImmutableList( "spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE - + " --table-type COPY_ON_WRITE " - + " --run-bootstrap " - + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " - + " --target-base-path " + COW_BOOTSTRAPPED_BASE_PATH + " --target-table " + COW_BOOTSTRAPPED_TABLE_NAME - + " --props /var/demo/config/dfs-source.properties" - + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " - + " --initial-checkpoint-provider" - + " org.apache.hudi.utilities.checkpointing.InitialCheckpointFromAnotherHoodieTimelineProvider" - + " --hoodie-conf hoodie.bootstrap.base.path=" + BOOTSTRAPPED_SRC_PATH - + " --hoodie-conf hoodie.deltastreamer.checkpoint.provider.path=" + COW_BASE_PATH - + " --hoodie-conf hoodie.bootstrap.parallelism=2 " - + " --hoodie-conf hoodie.bootstrap.keygen.class=" + SimpleKeyGenerator.class.getName() - + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_BOOTSTRAPPED_TABLE_NAME), + + " --table-type COPY_ON_WRITE " + + " --run-bootstrap " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + COW_BOOTSTRAPPED_BASE_PATH + " --target-table " + COW_BOOTSTRAPPED_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties" + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + " --initial-checkpoint-provider" + + " org.apache.hudi.utilities.checkpointing.InitialCheckpointFromAnotherHoodieTimelineProvider" + + " --hoodie-conf hoodie.bootstrap.base.path=" + BOOTSTRAPPED_SRC_PATH + + " --hoodie-conf hoodie.deltastreamer.checkpoint.provider.path=" + COW_BASE_PATH + + " --hoodie-conf hoodie.bootstrap.parallelism=2 " + + " --hoodie-conf hoodie.bootstrap.keygen.class=" + SimpleKeyGenerator.class.getName() + + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_BOOTSTRAPPED_TABLE_NAME), "spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE - + " --table-type MERGE_ON_READ " - + " --run-bootstrap " - + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " - + " --target-base-path " + MOR_BOOTSTRAPPED_BASE_PATH + " --target-table " + MOR_BOOTSTRAPPED_TABLE_NAME - + " --props 
/var/demo/config/dfs-source.properties" - + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " - + " --initial-checkpoint-provider" - + " org.apache.hudi.utilities.checkpointing.InitialCheckpointFromAnotherHoodieTimelineProvider" - + " --hoodie-conf hoodie.bootstrap.base.path=" + BOOTSTRAPPED_SRC_PATH - + " --hoodie-conf hoodie.deltastreamer.checkpoint.provider.path=" + COW_BASE_PATH - + " --hoodie-conf hoodie.bootstrap.parallelism=2 " - + " --hoodie-conf hoodie.bootstrap.keygen.class=" + SimpleKeyGenerator.class.getName() - + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_BOOTSTRAPPED_TABLE_NAME)); + + " --table-type MERGE_ON_READ " + + " --run-bootstrap " + + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " + + " --target-base-path " + MOR_BOOTSTRAPPED_BASE_PATH + " --target-table " + MOR_BOOTSTRAPPED_TABLE_NAME + + " --props /var/demo/config/dfs-source.properties" + + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + + " --initial-checkpoint-provider" + + " org.apache.hudi.utilities.checkpointing.InitialCheckpointFromAnotherHoodieTimelineProvider" + + " --hoodie-conf hoodie.bootstrap.base.path=" + BOOTSTRAPPED_SRC_PATH + + " --hoodie-conf hoodie.deltastreamer.checkpoint.provider.path=" + COW_BASE_PATH + + " --hoodie-conf hoodie.bootstrap.parallelism=2 " + + " --hoodie-conf hoodie.bootstrap.keygen.class=" + SimpleKeyGenerator.class.getName() + + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_BOOTSTRAPPED_TABLE_NAME)); executeCommandStringsInDocker(ADHOC_1_CONTAINER, bootstrapCmds); } @@ -241,6 +272,10 @@ private void testHiveAfterFirstBatch() throws Exception { assertStdOutContains(stdOutErrPair, "| partition |\n+----------------+\n| dt=2018-08-31 |\n+----------------+\n", 3); + // There should have 5 data source tables except stock_ticks_mor_bs_rt. + // After [HUDI-2071] has solved, we can inc the number 5 to 6. 
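Both DeltaStreamer invocations above attach their Hive-sync options through the HIVE_SYNC_CMD_FMT template, which this change extends with a trailing slash on the JDBC URL and an explicit SlashEncodedDayPartitionValueExtractor. As a quick illustration of how that template expands for one table (the format string is copied from the diff; the partition field and table name are examples only):

    public class HiveSyncCmdSketch {
      // Copied from ITTestHoodieDemo in the diff above.
      private static final String HIVE_SYNC_CMD_FMT =
          " --enable-hive-sync --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ "
              + " --hoodie-conf hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor "
              + " --hoodie-conf hoodie.datasource.hive_sync.username=hive "
              + " --hoodie-conf hoodie.datasource.hive_sync.password=hive "
              + " --hoodie-conf hoodie.datasource.hive_sync.partition_fields=%s "
              + " --hoodie-conf hoodie.datasource.hive_sync.database=default "
              + " --hoodie-conf hoodie.datasource.hive_sync.table=%s";

      public static void main(String[] args) {
        // Same String.format shape the test uses when composing its spark-submit commands.
        System.out.println(String.format(HIVE_SYNC_CMD_FMT, "dt", "stock_ticks_cow"));
      }
    }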
+ assertStdOutContains(stdOutErrPair, "'spark.sql.sources.provider'='hudi'", 5); + stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n" + "| GOOG | 2018-08-31 10:29:00 |\n", 6); @@ -272,7 +307,6 @@ private void ingestSecondBatchAndHiveSync() throws Exception { List cmds = CollectionUtils.createImmutableList( ("hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH2 + " " + HDFS_BATCH_PATH2), ("spark-submit" - + " --conf \'spark.executor.extraJavaOptions=-Dlog4jspark.root.logger=WARN,console\'" + " --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + " --table-type COPY_ON_WRITE " + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " @@ -281,7 +315,6 @@ private void ingestSecondBatchAndHiveSync() throws Exception { + " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider " + String.format(HIVE_SYNC_CMD_FMT, "dt", COW_TABLE_NAME)), ("spark-submit" - + " --conf \'spark.executor.extraJavaOptions=-Dlog4jspark.root.logger=WARN,console\'" + " --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE + " --table-type MERGE_ON_READ " + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts " @@ -320,6 +353,20 @@ private void testPrestoAfterFirstBatch() throws Exception { "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\"", 2); } + private void testTrinoAfterFirstBatch() throws Exception { + Pair stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_TABLE_CHECK_PATH); + assertStdOutContains(stdOutErrPair, "stock_ticks_cow", 2); + assertStdOutContains(stdOutErrPair, "stock_ticks_mor", 4); + + stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_BATCH1_PATH); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\"", 4); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\"", 2); + } + private void testHiveAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n" @@ -346,7 +393,21 @@ private void testPrestoAfterSecondBatch() throws Exception { assertStdOutContains(stdOutErrPair, "\"GOOG\",\"2018-08-31 10:59:00\"", 2); assertStdOutContains(stdOutErrPair, - "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"",2); + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\""); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\",\"9021\",\"1227.1993\",\"1227.215\""); + } + + private void testTrinoAfterSecondBatch() throws Exception { + Pair stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_BATCH1_PATH); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"", 2); assertStdOutContains(stdOutErrPair, "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\""); assertStdOutContains(stdOutErrPair, @@ -375,6 
+436,16 @@ private void testPrestoAfterSecondBatchAfterCompaction() throws Exception { "\"GOOG\",\"2018-08-31 10:59:00\",\"9021\",\"1227.1993\",\"1227.215\""); } + private void testTrinoAfterSecondBatchAfterCompaction() throws Exception { + Pair stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_BATCH2_PATH); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\""); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\",\"9021\",\"1227.1993\",\"1227.215\""); + } + private void testSparkSQLAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH2_COMMANDS, true); assertStdOutContains(stdOutErrPair, @@ -437,7 +508,7 @@ private void testIncrementalSparkSQLQuery() throws Exception { } private void scheduleAndRunCompaction() throws Exception { - executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " --cmdfile " + COMPACTION_COMMANDS, true); - executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " --cmdfile " + COMPACTION_BOOTSTRAP_COMMANDS, true); + executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " script --file " + COMPACTION_COMMANDS, true); + executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " script --file " + COMPACTION_BOOTSTRAP_COMMANDS, true); } } diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java index e432f9dc423f5..a050d7eb88ba8 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java @@ -185,12 +185,12 @@ public void testRunHoodieJavaApp(String command, String hiveTableName, String ta // Ensure row count is 80 (without duplicates) (100 - 20 deleted) stdOutErr = executeHiveCommand("select count(1) from " + snapshotTableName); - assertEquals(80, Integer.parseInt(stdOutErr.getLeft().trim()), + assertEquals(80, Integer.parseInt(lastLine(stdOutErr.getLeft()).trim()), "Expecting 80 rows to be present in the snapshot table"); if (roTableName.isPresent()) { stdOutErr = executeHiveCommand("select count(1) from " + roTableName.get()); - assertEquals(80, Integer.parseInt(stdOutErr.getLeft().trim()), + assertEquals(80, Integer.parseInt(lastLine(stdOutErr.getLeft()).trim()), "Expecting 80 rows to be present in the snapshot table"); } @@ -204,10 +204,16 @@ public void testRunHoodieJavaApp(String command, String hiveTableName, String ta } else { stdOutErr = executeHiveCommand("select count(1) from " + snapshotTableName); } - assertEquals(280, Integer.parseInt(stdOutErr.getLeft().trim()), + + assertEquals(280, Integer.parseInt(lastLine(stdOutErr.getLeft()).trim()), "Expecting 280 rows to be present in the new table"); } + private static String lastLine(String output) { + String[] lines = output.split("\n"); + return lines[lines.length - 1]; + } + public void testRunHoodieJavaApp(String hiveTableName, String tableType, PartitionType partitionType) throws Exception { testRunHoodieJavaApp(HOODIE_JAVA_APP, hiveTableName, tableType, partitionType); diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/command/ITTestHoodieSyncCommand.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/command/ITTestHoodieSyncCommand.java index a6a4c3ec4201e..0b415f37cdb8d 100644 --- 
a/hudi-integ-test/src/test/java/org/apache/hudi/integ/command/ITTestHoodieSyncCommand.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/command/ITTestHoodieSyncCommand.java @@ -49,18 +49,18 @@ public void testValidateSync() throws Exception { hiveTableName, HoodieTableType.COPY_ON_WRITE.name(), PartitionType.SINGLE_KEY_PARTITIONED, "append", hiveTableName); TestExecStartResultCallback result = - executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " --cmdfile " + SYNC_VALIDATE_COMMANDS, true); + executeCommandStringInDocker(ADHOC_1_CONTAINER, HUDI_CLI_TOOL + " script --file " + SYNC_VALIDATE_COMMANDS, true); String expected = String.format("Count difference now is (count(%s) - count(%s) == %d. Catch up count is %d", hiveTableName, hiveTableName2, 100, 200); - assertTrue(result.getStderr().toString().contains(expected)); + assertTrue(result.getStdout().toString().contains(expected)); dropHiveTables(hiveTableName, HoodieTableType.COPY_ON_WRITE.name()); dropHiveTables(hiveTableName2, HoodieTableType.COPY_ON_WRITE.name()); } private void syncHoodieTable(String hiveTableName, String op) throws Exception { - StringBuilder cmdBuilder = new StringBuilder("spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.4 ") + StringBuilder cmdBuilder = new StringBuilder("spark-submit") .append(" --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer ").append(HUDI_UTILITIES_BUNDLE) .append(" --table-type COPY_ON_WRITE ") .append(" --base-file-format ").append(HoodieFileFormat.PARQUET.toString()) diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java index ff92bd037d558..2b69a319a53e4 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java @@ -18,16 +18,6 @@ package org.apache.hudi.integ.testsuite; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.when; - -import java.io.IOException; -import java.util.Iterator; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; @@ -44,14 +34,27 @@ import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.mockito.Mockito; +import java.io.IOException; +import java.util.Iterator; + +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.when; + /** * Unit test against 
DeltaWriterAdapter, by testing writing DFS files. */ @@ -62,7 +65,7 @@ public class TestDFSHoodieTestSuiteWriterAdapter extends UtilitiesTestBase { @BeforeAll public static void initClass() throws Exception { - UtilitiesTestBase.initClass(); + UtilitiesTestBase.initTestServices(false, false); } @AfterAll @@ -102,6 +105,8 @@ public void testDFSOneFileWrite() throws IOException { } @Test + @Disabled + // TODO(HUDI-3668): Fix this test public void testDFSTwoFilesWriteWithRollover() throws IOException { DeltaInputWriter mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class); @@ -122,10 +127,12 @@ public void testDFSTwoFilesWriteWithRollover() throws IOException { } @Test + @Disabled + // TODO(HUDI-3668): Fix this test public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException { DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO, new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath, - schemaProvider.getSourceSchema().toString(), 10240L, jsc.defaultParallelism(), false); + schemaProvider.getSourceSchema().toString(), 10240L, jsc.defaultParallelism(), false, false); DeltaWriterAdapter dfsDeltaWriterAdapter = DeltaWriterFactory .getDeltaWriterAdapter(dfsSinkConfig, 1); FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000, diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestFileDeltaInputWriter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestFileDeltaInputWriter.java index 8e175c5bffcf0..f3cda10a620b5 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestFileDeltaInputWriter.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestFileDeltaInputWriter.java @@ -58,7 +58,7 @@ public class TestFileDeltaInputWriter extends UtilitiesTestBase { @BeforeAll public static void initClass() throws Exception { - UtilitiesTestBase.initClass(); + UtilitiesTestBase.initTestServices(false, false); } @AfterAll diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/configuration/TestWorkflowBuilder.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/configuration/TestWorkflowBuilder.java index 1e5ca6886c8e6..82350999ea42c 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/configuration/TestWorkflowBuilder.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/configuration/TestWorkflowBuilder.java @@ -18,18 +18,19 @@ package org.apache.hudi.integ.testsuite.configuration; -import static junit.framework.Assert.assertTrue; -import static junit.framework.TestCase.assertEquals; - -import java.util.ArrayList; -import java.util.List; - +import org.apache.hudi.integ.testsuite.dag.WorkflowDag; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode; -import org.apache.hudi.integ.testsuite.dag.WorkflowDag; + import org.junit.jupiter.api.Test; +import java.util.ArrayList; +import java.util.List; + +import static junit.framework.Assert.assertTrue; +import static junit.framework.TestCase.assertEquals; + /** * Unit test for the build process of {@link DagNode} and {@link WorkflowDag}. 
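A few hunks earlier, ITTestHoodieSanity gains a lastLine(...) helper so the row-count assertions parse only the final line of the captured Hive output instead of the whole stdout blob. A self-contained sketch of that parsing; the sample output string is made up for illustration.

    public class LastLineSketch {
      // Same helper as added to ITTestHoodieSanity above.
      private static String lastLine(String output) {
        String[] lines = output.split("\n");
        return lines[lines.length - 1];
      }

      public static void main(String[] args) {
        // Hypothetical captured output: informational lines first, the count on the last line.
        String stdout = "Connecting to jdbc:hive2://hiveserver:10000\nINFO  : OK\n80";
        int rowCount = Integer.parseInt(lastLine(stdout).trim());
        System.out.println(rowCount); // 80
      }
    }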
*/ diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java index 8ed98b4fb69fc..86e117197669a 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java @@ -28,7 +28,6 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.util.Arrays; import java.util.List; import java.util.Map; @@ -77,8 +76,7 @@ public void testGenerateDeleteRecordsFromInputRecords() throws Exception { .collectAsMap(); List deleteRecords = outputRDD.collect(); deleteRecords.stream().forEach(updateRecord -> { - GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString()); - assertTrue((boolean)inputRecord.get(DEFAULT_HOODIE_IS_DELETED_COL)); + assertTrue((boolean) updateRecord.get(DEFAULT_HOODIE_IS_DELETED_COL)); }); } } diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestUpdateConverter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestUpdateConverter.java index c48d1b13f4f2d..8d1189c5227b2 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestUpdateConverter.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestUpdateConverter.java @@ -20,7 +20,7 @@ import static junit.framework.TestCase.assertTrue; -import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; @@ -65,7 +65,7 @@ public void testGenerateUpdateRecordsFromInputRecords() throws Exception { // 2. 
DFS converter reads existing records and generates random updates for the same row keys UpdateConverter updateConverter = new UpdateConverter(schemaStr, minPayloadSize, - Arrays.asList("timestamp"), Arrays.asList("_row_key")); + Collections.singletonList("timestamp"), Collections.singletonList("_row_key")); List insertRowKeys = inputRDD.map(r -> r.get("_row_key").toString()).collect(); assertTrue(inputRDD.count() == 10); JavaRDD outputRDD = updateConverter.convert(inputRDD); @@ -75,7 +75,7 @@ public void testGenerateUpdateRecordsFromInputRecords() throws Exception { Map inputRecords = inputRDD.mapToPair(r -> new Tuple2<>(r.get("_row_key").toString(), r)) .collectAsMap(); List updateRecords = outputRDD.collect(); - updateRecords.stream().forEach(updateRecord -> { + updateRecords.forEach(updateRecord -> { GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString()); assertTrue(areRecordsDifferent(inputRecord, updateRecord)); }); @@ -87,11 +87,11 @@ public void testGenerateUpdateRecordsFromInputRecords() throws Exception { */ private boolean areRecordsDifferent(GenericRecord in, GenericRecord up) { for (Field field : in.getSchema().getFields()) { - if (field.name() == "_row_key") { + if (field.name().equals("_row_key")) { continue; } else { // Just convert all types to string for now since all are primitive - if (in.get(field.name()).toString() != up.get(field.name()).toString()) { + if (!in.get(field.name()).toString().equals(up.get(field.name()).toString())) { return true; } } diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGeneratorMOR.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGeneratorMOR.java index 07dd467a2c7b9..4082d29621f24 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGeneratorMOR.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/HiveSyncDagGeneratorMOR.java @@ -47,7 +47,9 @@ public WorkflowDag build() { root.addChildNode(child1); DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays - .asList(Pair.of("select " + "count(*) from testdb1.table1_rt group " + "by rider having count(*) < 1", 0))) + .asList(Pair.of("select " + "count(*) from testdb1.hive_trips group " + "by rider having count(*) < 1", 0), + Pair.of("select " + "count(*) from testdb1.hive_trips ", 100))) + .withHiveProperties(Arrays.asList("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat")) .build()); child1.addChildNode(child2); diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java index d94174471bd64..d5f2af2094723 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java @@ -18,17 +18,20 @@ package org.apache.hudi.integ.testsuite.dag; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.util.ArrayList; -import java.util.List; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import 
java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * A utility class for DAG test. */ @@ -43,10 +46,49 @@ public void testConvertDagToYaml() throws Exception { System.out.println(yaml); } + @Test + @Disabled + // TODO(HUDI-3668): Fix this test + public void testConvertDagToYamlHiveQuery() throws Exception { + WorkflowDag dag = new HiveSyncDagGenerator().build(); + DagNode insert1 = (DagNode) dag.getNodeList().get(0); + DagNode hiveSync1 = (DagNode)insert1.getChildNodes().get(0); + DagNode hiveQuery1 = (DagNode)hiveSync1.getChildNodes().get(0); + + String yaml = DagUtils.convertDagToYaml(dag); + + WorkflowDag dag2 = DagUtils.convertYamlToDag(yaml); + DagNode insert2 = (DagNode) dag2.getNodeList().get(0); + DagNode hiveSync2 = (DagNode)insert2.getChildNodes().get(0); + DagNode hiveQuery2 = (DagNode)hiveSync2.getChildNodes().get(0); + assertEquals(hiveQuery1.getConfig().getHiveQueries().get(0), + hiveQuery2.getConfig().getHiveQueries().get(0)); + assertEquals(hiveQuery1.getConfig().getHiveProperties().get(0), + hiveQuery2.getConfig().getHiveProperties().get(0)); + } + + @Test + public void testConvertDagToYamlAndBack() throws Exception { + final ComplexDagGenerator dag = new ComplexDagGenerator(); + final WorkflowDag originalWorkflowDag = dag.build(); + final String yaml = DagUtils.convertDagToYaml(dag.build()); + final WorkflowDag regeneratedWorkflowDag = DagUtils.convertYamlToDag(yaml); + + final List originalWorkflowDagNodes = originalWorkflowDag.getNodeList(); + final List regeneratedWorkflowDagNodes = regeneratedWorkflowDag.getNodeList(); + + assertEquals(originalWorkflowDagNodes.size(), regeneratedWorkflowDagNodes.size()); + assertEquals(originalWorkflowDagNodes.get(0).getChildNodes().size(), + regeneratedWorkflowDagNodes.get(0).getChildNodes().size()); + } + @Test public void testConvertYamlToDag() throws Exception { WorkflowDag dag = DagUtils.convertYamlToDag(UtilitiesTestBase.Helpers .readFileFromAbsolutePath((System.getProperty("user.dir") + "/.." + COW_DAG_DOCKER_DEMO_RELATIVE_PATH))); + assertEquals(dag.getDagName(), "unit-test-cow-dag"); + assertEquals(dag.getRounds(), 1); + assertEquals(dag.getIntermittentDelayMins(), 10); assertEquals(dag.getNodeList().size(), 1); Assertions.assertEquals(((DagNode) dag.getNodeList().get(0)).getParentNodes().size(), 0); assertEquals(((DagNode) dag.getNodeList().get(0)).getChildNodes().size(), 1); diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java index c31a7d67395a9..1959620aeb355 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java @@ -18,13 +18,14 @@ package org.apache.hudi.integ.testsuite.generator; -import static junit.framework.TestCase.assertEquals; - -import org.apache.avro.Schema; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.Schema; import org.junit.jupiter.api.Test; +import static junit.framework.TestCase.assertEquals; + /** * Unit test for {@link GenericRecordFullPayloadSizeEstimator}. 
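The TestUpdateConverter hunk above replaces reference comparisons (== and !=) on field names and stringified field values with equals(). A tiny standalone reminder of why that matters for Java Strings:

    public class StringEqualitySketch {
      public static void main(String[] args) {
        String a = new String("_row_key");
        String b = "_row_key";
        System.out.println(a == b);      // false: == compares object identity
        System.out.println(a.equals(b)); // true: equals() compares content, as the fixed checks now do
      }
    }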
*/ @@ -41,8 +42,8 @@ public void testSimpleSchemaSize() throws Exception { GenericRecordFullPayloadSizeEstimator estimator = new GenericRecordFullPayloadSizeEstimator(schema); Pair estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields(); - assertEquals(estimateAndNumComplexFields.getRight().intValue(), 0); - assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 156); + assertEquals(0, estimateAndNumComplexFields.getRight().intValue()); + assertEquals(157, estimateAndNumComplexFields.getLeft().intValue()); } @Test diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadGenerator.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadGenerator.java index 94515959d01cd..cc3711dc8eb3b 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadGenerator.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadGenerator.java @@ -27,6 +27,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.stream.IntStream; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; @@ -56,8 +57,8 @@ public void testSimplePayload() throws Exception { @Test public void testComplexPayload() throws IOException { Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers - .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + - COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema); GenericRecord record = payloadGenerator.getNewPayload(); // The generated payload should validate with the provided schema @@ -67,8 +68,8 @@ public void testComplexPayload() throws IOException { @Test public void testComplexPartialPayload() throws IOException { Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers - .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + - COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); GenericRecordPartialPayloadGenerator payloadGenerator = new GenericRecordPartialPayloadGenerator(schema); IntStream.range(0, 10).forEach(a -> { GenericRecord record = payloadGenerator.getNewPayload(); @@ -123,8 +124,8 @@ public void testSimplePayloadWithLargeMinSize() throws Exception { @Test public void testComplexPayloadWithLargeMinSize() throws Exception { Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers - .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + - COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); int minPayloadSize = 10000; GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator( schema, minPayloadSize); @@ -133,4 +134,36 @@ public void testComplexPayloadWithLargeMinSize() throws Exception { assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize + 0.1 * minPayloadSize); } + @Test + public void testUpdatePayloadGeneratorWithTimestamp() throws IOException { + Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers + .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH)); + GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema); + List insertRowKeys = new ArrayList<>(); + List updateRowKeys = new ArrayList<>(); + List insertTimeStamps = new ArrayList<>(); + List updateTimeStamps = new ArrayList<>(); + List records = new ArrayList<>(); + Long startSeconds = 0L; + Long endSeconds = TimeUnit.SECONDS.convert(10, TimeUnit.DAYS); + // Generate 10 new records + IntStream.range(0, 10).forEach(a -> { + GenericRecord record = payloadGenerator.getNewPayloadWithTimestamp("timestamp"); + records.add(record); + insertRowKeys.add(record.get("_row_key").toString()); + insertTimeStamps.add((Long) record.get("timestamp")); + }); + Set blacklistFields = new HashSet<>(Arrays.asList("_row_key")); + records.stream().forEach(a -> { + // Generate 10 updated records + GenericRecord record = payloadGenerator.getUpdatePayloadWithTimestamp(a, blacklistFields, "timestamp"); + updateRowKeys.add(record.get("_row_key").toString()); + updateTimeStamps.add((Long) record.get("timestamp")); + }); + // The row keys from insert payloads should match all the row keys from the update payloads + assertTrue(insertRowKeys.containsAll(updateRowKeys)); + // The timestamp field for the insert payloads should not all match with the update payloads + assertFalse(insertTimeStamps.containsAll(updateTimeStamps)); + assertTrue(insertTimeStamps.stream().allMatch(t -> t >= startSeconds && t <= endSeconds)); + } } diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java index b14ef1808d00a..ddf5b07247c0d 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java @@ -18,12 +18,6 @@ package org.apache.hudi.integ.testsuite.job; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.util.UUID; -import java.util.stream.Stream; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieTableType; @@ -37,24 +31,41 @@ import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator; import org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; +import org.apache.hudi.integ.testsuite.schema.SchemaUtils; +import org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; import org.apache.hudi.keygen.TimestampBasedKeyGenerator; -import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import 
org.apache.hudi.utilities.sources.AvroDFSSource; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.log4j.Level; import org.apache.log4j.Logger; -import org.junit.jupiter.api.Test; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import java.util.UUID; +import java.util.stream.Stream; + +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL; +import static org.apache.hudi.hive.testutils.HiveTestService.HS2_JDBC_URL; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME; +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * Unit test against {@link HoodieTestSuiteJob}. */ +@Disabled +// TODO(HUDI-3668): Fix this test public class TestHoodieTestSuiteJob extends UtilitiesTestBase { private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with useDeltaStreamer={0}, tableType={1}"; @@ -69,6 +80,11 @@ public class TestHoodieTestSuiteJob extends UtilitiesTestBase { private static final String MOR_DAG_FILE_NAME = "unit-test-mor-dag.yaml"; private static final String MOR_DAG_SOURCE_PATH = "/hudi-integ-test/src/test/resources/" + MOR_DAG_FILE_NAME; + private static final String COW_DAG_FILE_NAME_SPARK_DATASOURCE_NODES = "unit-test-cow-dag-spark-datasource.yaml"; + private static final String COW_DAG_SPARK_DATASOURCE_NODES_RELATIVE_PATH = "/hudi-integ-test/src/test/resources/unit-test-cow-dag-spark-datasource.yaml"; + + private static final String SPARK_SQL_DAG_FILE_NAME = "unit-test-spark-sql-dag.yaml"; + private static final String SPARK_SQL_DAG_SOURCE_PATH = "/hudi-integ-test/src/test/resources/" + SPARK_SQL_DAG_FILE_NAME; public static Stream configParams() { Object[][] data = @@ -78,7 +94,7 @@ public static Stream configParams() { @BeforeAll public static void initClass() throws Exception { - UtilitiesTestBase.initClass(); + UtilitiesTestBase.initTestServices(true, true); // prepare the configs. UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." + BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/base.properties"); @@ -92,39 +108,17 @@ public static void initClass() throws Exception { UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ MOR_DAG_SOURCE_PATH, dfs, dfsBasePath + "/" + MOR_DAG_FILE_NAME); - TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); - props.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp"); - props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "UNIX_TIMESTAMP"); - props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd"); - props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); - props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc"); - props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/input"); - props.setProperty("hoodie.datasource.hive_sync.assume_date_partitioning", "true"); - props.setProperty("hoodie.datasource.hive_sync.skip_ro_suffix", "true"); - props.setProperty("hoodie.datasource.write.keytranslator.class", "org.apache.hudi" - + ".DayBasedPartitionPathKeyTranslator"); - props.setProperty("hoodie.compact.inline.max.delta.commits", "3"); - props.setProperty("hoodie.parquet.max.file.size", "1024000"); - props.setProperty("hoodie.compact.inline.max.delta.commits", "0"); - props.setProperty("hoodie.index.type", HoodieIndex.IndexType.GLOBAL_SIMPLE.name()); - props.setProperty("hoodie.global.simple.index.parallelism", "2"); - // Reduce shuffle parallelism, spark hangs when numPartitions >> numRecords to process - props.setProperty("hoodie.insert.shuffle.parallelism", "10"); - props.setProperty("hoodie.upsert.shuffle.parallelism", "10"); - props.setProperty("hoodie.bulkinsert.shuffle.parallelism", "10"); - props.setProperty("hoodie.compact.inline.max.delta.commits", "0"); - // Make path selection test suite specific - props.setProperty("hoodie.deltastreamer.source.input.selector", DFSTestSuitePathSelector.class.getName()); - // Hive Configs - props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/"); - props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1"); - props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "table1"); - props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr"); - props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), TimestampBasedKeyGenerator.class.getName()); + TypedProperties props = getProperties(); UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source" + ".properties"); + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." + + COW_DAG_SPARK_DATASOURCE_NODES_RELATIVE_PATH, dfs, dfsBasePath + "/" + COW_DAG_FILE_NAME_SPARK_DATASOURCE_NODES); + UtilitiesTestBase.Helpers.savePropsToDFS(getProperties(), dfs, dfsBasePath + "/test-source" + + ".properties"); + UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.." 
+ + SPARK_SQL_DAG_SOURCE_PATH, dfs, dfsBasePath + "/" + SPARK_SQL_DAG_FILE_NAME); + // Properties used for the delta-streamer which incrementally pulls from upstream DFS Avro source and // writes to downstream hudi table TypedProperties downstreamProps = new TypedProperties(); @@ -161,6 +155,50 @@ private void cleanDFSDirs() throws Exception { dfs.delete(new Path(dfsBasePath + "/result"), true); } + private static TypedProperties getProperties() { + TypedProperties props = new TypedProperties(); + props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + props.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp"); + props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "UNIX_TIMESTAMP"); + props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd"); + props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc"); + props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/input"); + props.setProperty("hoodie.datasource.hive_sync.assume_date_partitioning", "true"); + props.setProperty("hoodie.datasource.hive_sync.skip_ro_suffix", "true"); + props.setProperty("hoodie.datasource.write.keytranslator.class", "org.apache.hudi" + + ".DayBasedPartitionPathKeyTranslator"); + props.setProperty("hoodie.compact.inline.max.delta.commits", "3"); + props.setProperty("hoodie.parquet.max.file.size", "1024000"); + props.setProperty("hoodie.compact.inline.max.delta.commits", "0"); + props.setProperty("hoodie.index.type", HoodieIndex.IndexType.GLOBAL_SIMPLE.name()); + props.setProperty("hoodie.global.simple.index.parallelism", "2"); + // Reduce shuffle parallelism, spark hangs when numPartitions >> numRecords to process + props.setProperty("hoodie.insert.shuffle.parallelism", "10"); + props.setProperty("hoodie.upsert.shuffle.parallelism", "10"); + props.setProperty("hoodie.bulkinsert.shuffle.parallelism", "10"); + props.setProperty("hoodie.compact.inline.max.delta.commits", "0"); + // Make path selection test suite specific + props.setProperty("hoodie.deltastreamer.source.input.selector", DFSTestSuitePathSelector.class.getName()); + // Hive Configs + props.setProperty(HIVE_URL.key(), HS2_JDBC_URL); + props.setProperty(META_SYNC_DATABASE_NAME.key(), "testdb1"); + props.setProperty(META_SYNC_TABLE_NAME.key(), "table1"); + props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr"); + props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), TimestampBasedKeyGenerator.class.getName()); + + props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider"); + props.setProperty("hoodie.write.lock.hivemetastore.database", "testdb1"); + props.setProperty("hoodie.write.lock.hivemetastore.table", "table1"); + props.setProperty("hoodie.write.lock.zookeeper.url", "127.0.0.1"); + props.setProperty("hoodie.write.lock.zookeeper.port", "2828"); + props.setProperty("hoodie.write.lock.wait_time_ms", "1200000"); + props.setProperty("hoodie.write.lock.num_retries", "10"); + props.setProperty("hoodie.write.lock.zookeeper.lock_key", "test_table"); + props.setProperty("hoodie.write.lock.zookeeper.zk_base_path", "/test"); + return props; + } + // Tests in this class add to the test build time significantly. 
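getProperties() above now also wires a ZooKeeper-based lock provider into the test-suite properties, which the new testSparkDataSourceNodesDagWithLock further below relies on when it switches on optimistic concurrency control. A minimal sketch of just that slice of configuration, using only keys and values that appear in this diff (they mirror the test defaults and are not production recommendations):

    import org.apache.hudi.common.config.TypedProperties;

    public class LockPropsSketch {
      public static void main(String[] args) {
        TypedProperties props = new TypedProperties();
        props.setProperty("hoodie.write.lock.provider",
            "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider");
        props.setProperty("hoodie.write.lock.zookeeper.url", "127.0.0.1");
        props.setProperty("hoodie.write.lock.zookeeper.port", "2828");
        props.setProperty("hoodie.write.lock.zookeeper.lock_key", "test_table");
        props.setProperty("hoodie.write.lock.zookeeper.zk_base_path", "/test");
        props.setProperty("hoodie.write.lock.wait_time_ms", "1200000");
        props.setProperty("hoodie.write.lock.num_retries", "10");
        System.out.println(props);
      }
    }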
Since this is a Integration Test (end to end), we // would like to run this as a nightly build which is a TODO. // TODO : Clean up input / result paths after each test @@ -173,7 +211,7 @@ public void testDagWithInsertUpsertAndValidate(boolean useDeltaStreamer, String cfg.workloadDagGenerator = ComplexDagGenerator.class.getName(); HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); hoodieTestSuiteJob.runTestSuite(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(new Configuration()).setBasePath(cfg.targetBasePath).build(); assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 2); } @@ -192,7 +230,7 @@ public void testHiveSync() throws Exception { } HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); hoodieTestSuiteJob.runTestSuite(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(new Configuration()).setBasePath(cfg.targetBasePath).build(); assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 1); } @@ -207,7 +245,7 @@ public void testCOWFullDagFromYaml() throws Exception { cfg.workloadYamlPath = dfsBasePath + "/" + COW_DAG_FILE_NAME; HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); hoodieTestSuiteJob.runTestSuite(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(new Configuration()).setBasePath(cfg.targetBasePath).build(); //assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 5); } @@ -222,26 +260,60 @@ public void testMORFullDagFromYaml() throws Exception { cfg.workloadYamlPath = dfsBasePath + "/" + MOR_DAG_FILE_NAME; HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); hoodieTestSuiteJob.runTestSuite(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(new Configuration()).setBasePath(cfg.targetBasePath).build(); //assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 7); } + @Test + public void testSparkDataSourceNodesDagWithLock() throws Exception { + boolean useDeltaStreamer = false; + this.cleanDFSDirs(); + + TypedProperties props = getProperties(); + props.setProperty("hoodie.write.concurrency.mode", "optimistic_concurrency_control"); + props.setProperty("hoodie.failed.writes.cleaner.policy", "LAZY"); + UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source" + + ".properties"); + String inputBasePath = dfsBasePath + "/input"; + String outputBasePath = dfsBasePath + "/result"; + HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType + .COPY_ON_WRITE.name()); + cfg.workloadYamlPath = dfsBasePath + "/" + COW_DAG_FILE_NAME_SPARK_DATASOURCE_NODES; + HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); + hoodieTestSuiteJob.runTestSuite(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(new Configuration()).setBasePath(cfg.targetBasePath).build(); + 
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 3); + } + + @Test + public void testSparkSqlDag() throws Exception { + boolean useDeltaStreamer = false; + this.cleanDFSDirs(); + String inputBasePath = dfsBasePath + "/input"; + String outputBasePath = dfsBasePath + "/result"; + HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType + .COPY_ON_WRITE.name()); + cfg.workloadYamlPath = dfsBasePath + "/" + SPARK_SQL_DAG_FILE_NAME; + HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc); + hoodieTestSuiteJob.runTestSuite(); + } + protected HoodieTestSuiteConfig makeConfig(String inputBasePath, String outputBasePath, boolean useDeltaStream, - String tableType) { + String tableType) { HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig(); cfg.targetBasePath = outputBasePath; cfg.inputBasePath = inputBasePath; cfg.targetTableName = "table1"; cfg.tableType = tableType; cfg.sourceClassName = AvroDFSSource.class.getName(); - cfg.sourceOrderingField = "timestamp"; + cfg.sourceOrderingField = SchemaUtils.SOURCE_ORDERING_FIELD; cfg.propsFilePath = dfsBasePath + "/test-source.properties"; cfg.outputTypeName = DeltaOutputMode.DFS.name(); cfg.inputFormatName = DeltaInputType.AVRO.name(); cfg.limitFileSize = 1024 * 1024L; cfg.sourceLimit = 20000000; cfg.workloadDagGenerator = WorkflowDagGenerator.class.getName(); - cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + cfg.schemaProviderClassName = TestSuiteFileBasedSchemaProvider.class.getName(); cfg.useDeltaStreamer = useDeltaStream; return cfg; } diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSAvroDeltaInputReader.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSAvroDeltaInputReader.java index fa8f4ac41d53f..9f9439f376880 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSAvroDeltaInputReader.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSAvroDeltaInputReader.java @@ -43,7 +43,7 @@ public class TestDFSAvroDeltaInputReader extends UtilitiesTestBase { @BeforeAll public static void initClass() throws Exception { - UtilitiesTestBase.initClass(); + UtilitiesTestBase.initTestServices(false, false); } @AfterAll diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java index 1caf8f80f51ac..80f6e2548ce3b 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java @@ -18,13 +18,6 @@ package org.apache.hudi.integ.testsuite.reader; -import static junit.framework.TestCase.assertEquals; -import static junit.framework.TestCase.assertTrue; - -import java.util.HashSet; -import java.util.List; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; @@ -34,13 +27,23 @@ import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.Schema; +import 
org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import java.util.HashSet; +import java.util.List; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; + /** * Unit test for {@link DFSHoodieDatasetInputReader}. */ @@ -48,7 +51,7 @@ public class TestDFSHoodieDatasetInputReader extends UtilitiesTestBase { @BeforeAll public static void initClass() throws Exception { - UtilitiesTestBase.initClass(); + UtilitiesTestBase.initTestServices(false, false); } @AfterAll @@ -68,6 +71,8 @@ public void teardown() throws Exception { } @Test + @Disabled + // TODO(HUDI-3668): Fix this test public void testSimpleHoodieDatasetReader() throws Exception { HoodieWriteConfig config = makeHoodieClientConfig(); diff --git a/hudi-integ-test/src/test/resources/log4j-surefire-quiet.properties b/hudi-integ-test/src/test/resources/log4j-surefire-quiet.properties deleted file mode 100644 index 61fbf78d1dffb..0000000000000 --- a/hudi-integ-test/src/test/resources/log4j-surefire-quiet.properties +++ /dev/null @@ -1,30 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=ERROR, CONSOLE -log4j.logger.org.apache.hudi=ERROR -log4j.category.org.apache.spark=ERROR - -# CONSOLE is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# CONSOLE uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=ERROR -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-integ-test/src/test/resources/log4j-surefire.properties b/hudi-integ-test/src/test/resources/log4j-surefire.properties deleted file mode 100644 index c03e808cca1f8..0000000000000 --- a/hudi-integ-test/src/test/resources/log4j-surefire.properties +++ /dev/null @@ -1,30 +0,0 @@ -### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -### -log4j.rootLogger=WARN, CONSOLE -log4j.logger.org.apache=INFO -log4j.logger.org.apache.hudi=DEBUG - -# A1 is set to be a ConsoleAppender. -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter -log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true -log4j.appender.CONSOLE.filter.a.LevelMin=WARN -log4j.appender.CONSOLE.filter.a.LevelMax=FATAL \ No newline at end of file diff --git a/hudi-integ-test/src/test/resources/unit-test-cow-dag-spark-datasource.yaml b/hudi-integ-test/src/test/resources/unit-test-cow-dag-spark-datasource.yaml new file mode 100644 index 0000000000000..16023f69ed5e8 --- /dev/null +++ b/hudi-integ-test/src/test/resources/unit-test-cow-dag-spark-datasource.yaml @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: unit-test-cow-dag-spark-datasource +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 2 + num_records_insert: 100 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: SparkUpsertNode + deps: first_insert + second_upsert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: SparkUpsertNode + deps: first_upsert \ No newline at end of file diff --git a/hudi-integ-test/src/test/resources/unit-test-cow-dag.yaml b/hudi-integ-test/src/test/resources/unit-test-cow-dag.yaml index 96a6c825a98d0..8228c53e54786 100644 --- a/hudi-integ-test/src/test/resources/unit-test-cow-dag.yaml +++ b/hudi-integ-test/src/test/resources/unit-test-cow-dag.yaml @@ -13,58 +13,82 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-first_insert: - config: - record_size: 70000 - num_partitions_insert: 1 - repeat_count: 2 - num_records_insert: 100 - type: InsertNode - deps: none -second_insert: - config: - record_size: 70000 - num_partitions_insert: 1 - repeat_count: 1 - num_records_insert: 100 - type: InsertNode - deps: first_insert -first_rollback: - config: - deps: second_insert - type: RollbackNode -third_insert: - config: - record_size: 70000 - num_partitions_insert: 1 - repeat_count: 1 - num_records_insert: 100 - type: InsertNode - deps: first_rollback -first_upsert: - config: - record_size: 70000 - num_partitions_upsert: 1 - repeat_count: 1 - num_records_upsert: 100 - type: UpsertNode - deps: third_insert -first_hive_sync: - config: - queue_name: "adhoc" - engine: "mr" - type: HiveSyncNode - deps: first_upsert -first_hive_query: - config: - hive_props: - prop2: "set spark.yarn.queue=" - prop3: "set hive.strict.checks.large.query=false" - prop4: "set hive.stats.autogather=false" - hive_queries: - query1: "select count(*) from testdb1.table1" - result1: 300 - query2: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" - result2: 0 - type: HiveQueryNode - deps: first_hive_sync \ No newline at end of file +dag_name: unit-test-cow-dag +dag_rounds: 1 +dag_intermittent_delay_mins: 10 +dag_content: + first_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 2 + num_records_insert: 100 + type: InsertNode + deps: none + second_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: InsertNode + deps: first_insert + first_rollback: + config: + deps: second_insert + type: RollbackNode + third_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: InsertNode + deps: first_rollback + first_upsert: + config: + record_size: 70000 + num_partitions_upsert: 1 + repeat_count: 1 + num_records_upsert: 100 + type: UpsertNode + deps: third_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_upsert + first_hive_query: + config: + hive_props: + prop2: "set spark.yarn.queue=" + prop3: "set hive.strict.checks.large.query=false" + prop4: "set hive.stats.autogather=false" + hive_queries: + query1: "select count(*) from testdb1.table1" + result1: 300 + query2: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" + result2: 0 + type: HiveQueryNode + deps: first_hive_sync + first_presto_query: + config: + presto_props: + prop1: "SET SESSION hive.parquet_use_column_names = true" + presto_queries: + query1: "select count(*) from testdb1.table1" + result1: 300 + query2: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" + result2: 0 + type: PrestoQueryNode + deps: first_hive_query + first_trino_query: + config: + trino_queries: + query1: "select count(*) from testdb1.table1" + result1: 300 + query2: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" + result2: 0 + type: TrinoQueryNode + deps: first_presto_query \ No newline at end of file diff --git a/hudi-integ-test/src/test/resources/unit-test-mor-dag.yaml b/hudi-integ-test/src/test/resources/unit-test-mor-dag.yaml index 96a6c825a98d0..2ba42455d4874 100644 --- a/hudi-integ-test/src/test/resources/unit-test-mor-dag.yaml +++ b/hudi-integ-test/src/test/resources/unit-test-mor-dag.yaml @@ -13,58 +13,62 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -first_insert: - config: - record_size: 70000 - num_partitions_insert: 1 - repeat_count: 2 - num_records_insert: 100 - type: InsertNode - deps: none -second_insert: - config: - record_size: 70000 - num_partitions_insert: 1 - repeat_count: 1 - num_records_insert: 100 - type: InsertNode - deps: first_insert -first_rollback: - config: - deps: second_insert - type: RollbackNode -third_insert: - config: - record_size: 70000 - num_partitions_insert: 1 - repeat_count: 1 - num_records_insert: 100 - type: InsertNode - deps: first_rollback -first_upsert: - config: - record_size: 70000 - num_partitions_upsert: 1 - repeat_count: 1 - num_records_upsert: 100 - type: UpsertNode - deps: third_insert -first_hive_sync: - config: - queue_name: "adhoc" - engine: "mr" - type: HiveSyncNode - deps: first_upsert -first_hive_query: - config: - hive_props: - prop2: "set spark.yarn.queue=" - prop3: "set hive.strict.checks.large.query=false" - prop4: "set hive.stats.autogather=false" - hive_queries: - query1: "select count(*) from testdb1.table1" - result1: 300 - query2: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" - result2: 0 - type: HiveQueryNode - deps: first_hive_sync \ No newline at end of file +dag_name: unit-test-mor-dag +dag_rounds: 1 +dag_intermittent_delay_mins: 10 +dag_content: + first_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 2 + num_records_insert: 100 + type: InsertNode + deps: none + second_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: InsertNode + deps: first_insert + first_rollback: + config: + deps: second_insert + type: RollbackNode + third_insert: + config: + record_size: 70000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 100 + type: InsertNode + deps: first_rollback + first_upsert: + config: + record_size: 70000 + num_partitions_upsert: 1 + repeat_count: 1 + num_records_upsert: 100 + type: UpsertNode + deps: third_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_upsert + first_hive_query: + config: + hive_props: + prop2: "set spark.yarn.queue=" + prop3: "set hive.strict.checks.large.query=false" + prop4: "set hive.stats.autogather=false" + hive_queries: + query1: "select count(*) from testdb1.table1" + result1: 300 + query2: "select count(*) from testdb1.table1 group by `_row_key` having count(*) > 1" + result2: 0 + type: HiveQueryNode + deps: first_hive_sync \ No newline at end of file diff --git a/hudi-integ-test/src/test/resources/unit-test-spark-sql-dag.yaml b/hudi-integ-test/src/test/resources/unit-test-spark-sql-dag.yaml new file mode 100644 index 0000000000000..0b4ff072a97f7 --- /dev/null +++ b/hudi-integ-test/src/test/resources/unit-test-spark-sql-dag.yaml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: unit-test-spark-sql-dag.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + create_table: + config: + table_type: cow + primary_key: _row_key + pre_combine_field: test_suite_source_ordering_field + partition_field: rider + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlCreateTableNode + deps: none + insert_records: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: spark.sql.SparkSqlInsertNode + deps: create_table + #merge_records: + # config: + # merge_condition: target._row_key = source._row_key + # matched_action: update set * + # not_matched_action: insert * + # record_size: 1000 + # num_partitions_insert: 10 + # repeat_count: 1 + # num_records_upsert: 100 + # num_records_insert: 1000 + # type: spark.sql.SparkSqlMergeNode + # deps: insert_records + #delete_records: + # config: + # condition_column: begin_lat + # record_size: 1000 + # repeat_count: 1 + # ratio_records_change: 0.2 + # type: spark.sql.SparkSqlDeleteNode + # deps: insert_records + #validate: + # config: + # delete_input_data: true + # type: spark.sql.SparkSqlValidateDatasetNode + # deps: delete_records diff --git a/hudi-kafka-connect/README.md b/hudi-kafka-connect/README.md new file mode 100644 index 0000000000000..449236ea5c663 --- /dev/null +++ b/hudi-kafka-connect/README.md @@ -0,0 +1,477 @@ + + +# Quick Start (demo) guide for Kafka Connect Sink for Hudi + +This repo contains a sample project that can be used to start off your own sink connector for Kafka Connect. +This work is tracked by [HUDI-2324](https://issues.apache.org/jira/browse/HUDI-2324). + +## Building the Hudi Sink Connector + +The first thing you need to do to start using this connector is to build it. In order to do that, you need to install the following dependencies: + +- [Java 1.8+](https://openjdk.java.net/) +- [Apache Maven](https://maven.apache.org/) +- Install [kcat](https://github.com/edenhill/kcat) +- Install jq, e.g., `brew install jq` on macOS + + +## Trying the connector + +After installing these dependencies, follow the steps below based on your requirements. + +### 1 - Starting the environment + +For runtime dependencies, we encourage using the Confluent HDFS connector jars. We have tested our setup with +version `10.1.0`. Either use confluent-hub to install the connector or download it +from [here](https://tinyurl.com/yb472f79). You can install the confluent-hub command-line tool by downloading Confluent +Platform from [here](https://tinyurl.com/s2jjby53). + +Copy the entire folder to the classpath that will be used by the Hudi Kafka Connector.
+ +```bash +# Points CONFLUENT_DIR to Confluent Platform installation +export CONFLUENT_DIR=/path/to/confluent_install_dir +mkdir -p /usr/local/share/kafka/plugins +$CONFLUENT_DIR/bin/confluent-hub install confluentinc/kafka-connect-hdfs:10.1.0 +cp -r $CONFLUENT_DIR/share/confluent-hub-components/confluentinc-kafka-connect-hdfs/* /usr/local/share/kafka/plugins/ +``` + +Now, build the packaged jar that contains all the Hudi classes, including the Hudi Kafka Connector, and copy it to the +plugin path that contains all the other jars (`/usr/local/share/kafka/plugins/lib`). + +```bash +cd $HUDI_DIR +mvn package -DskipTests -pl packaging/hudi-kafka-connect-bundle -am +mkdir -p /usr/local/share/kafka/plugins/lib +cp $HUDI_DIR/packaging/hudi-kafka-connect-bundle/target/hudi-kafka-connect-bundle-0.13.0-SNAPSHOT.jar /usr/local/share/kafka/plugins/lib +``` + +If the Hudi Sink Connector writes to a target Hudi table on [Amazon S3](https://aws.amazon.com/s3/), you need two +additional jars, `hadoop-aws-2.10.1.jar` and `aws-java-sdk-bundle-1.11.271.jar`, in the `plugins/lib` folder. You may +download them using the following commands. Note that, when you specify the target table path on S3, you need to use +the `s3a://` prefix. + +```bash +cd /usr/local/share/kafka/plugins/lib +wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.271/aws-java-sdk-bundle-1.11.271.jar +wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.10.1/hadoop-aws-2.10.1.jar +``` + +Set up a Kafka broker locally. Download the latest Apache Kafka from [here](https://kafka.apache.org/downloads). Once +downloaded and built, run the Zookeeper server and Kafka server using the command line tools. The servers should be +ready in one to two minutes after executing the commands. + +```bash +export KAFKA_HOME=/path/to/kafka_install_dir +cd $KAFKA_HOME +# Run the following commands in separate terminals to keep them running +./bin/zookeeper-server-start.sh ./config/zookeeper.properties +./bin/kafka-server-start.sh ./config/server.properties +``` + +Wait until the Kafka cluster is up and running. + +### 2 - Set up the schema registry + +Hudi leverages a schema registry to obtain the latest schema when writing records. While it supports most popular schema +registries, we use the Confluent schema registry. Download the +latest [Confluent Platform](https://docs.confluent.io/platform/current/installation/index.html) and run the schema +registry service. + +NOTE: You must change the port from `8081` (default) to `8082` to avoid a conflict, i.e., +using `listeners=http://0.0.0.0:8082` in the properties file `etc/schema-registry/schema-registry.properties`. + +```bash +cd $CONFLUENT_DIR +./bin/kafka-configs --bootstrap-server localhost:9092 --entity-type topics --entity-name _schemas --alter --add-config cleanup.policy=compact +# Make sure you have changed the listener port as above +./bin/schema-registry-start etc/schema-registry/schema-registry.properties +``` + +### 3 - Create the Hudi Control Topic for Coordination of the transactions + +The control topic should only have `1` partition, since it is used to coordinate the Hudi write transactions across the +multiple Connect tasks. + +```bash +cd $KAFKA_HOME +# The following command is expected to throw an error if the control topic does not exist.
+# "Error while executing topic command : Topic 'hudi-control-topic' does not exist as expected" +./bin/kafka-topics.sh --delete --topic hudi-control-topic --bootstrap-server localhost:9092 +./bin/kafka-topics.sh --create --topic hudi-control-topic --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092 +``` + +### 4 - Create the Hudi Topic for the Sink and insert data into the topic + +Open a terminal to execute the following command: + +```bash +cd $HUDI_DIR/hudi-kafka-connect/demo/ +bash setupKafka.sh -n +``` + +To generate data for long-running tests, you can add `-b` option to specify the number of batches of data +to generate, with each batch containing a number of messages and idle time between batches, as follows: + +```bash +bash setupKafka.sh -n -b +``` + +### 5 - Run the Sink connector worker (multiple workers can be run) + +The Kafka connect is a distributed platform, with the ability to run one or more workers (each running multiple tasks) +that parallely process the records from the Kafka partitions for the same topic. We provide a properties file with +default properties to start a Hudi connector. + +Note that if multiple workers need to be run, the webserver needs to be reconfigured for subsequent workers to ensure +successful running of the workers. + +```bash +cd $KAFKA_HOME +./bin/connect-distributed.sh $HUDI_DIR/hudi-kafka-connect/demo/connect-distributed.properties +``` + +### 6 - To add the Hudi Sink to the Connector (delete it if you want to re-configure) + +Once the Connector has started, it will not run the Sink, until the Hudi sink is added using the web api. The following +curl APIs can be used to delete and add a new Hudi Sink. Again, a default configuration is provided for the Hudi Sink, +that can be changed based on the desired properties. + +```bash +# The following command is expected to throw an error if the Hudi Sink Connector has not been added yet. +# {"error_code":404,"message":"Connector hudi-sink not found"} +curl -X DELETE http://localhost:8083/connectors/hudi-sink +curl -X POST -H "Content-Type:application/json" -d @$HUDI_DIR/hudi-kafka-connect/demo/config-sink.json http://localhost:8083/connectors +``` + +Now, you should see that the connector is created and tasks are running. + +```bash +> curl -X GET -H "Content-Type:application/json" http://localhost:8083/connectors +["hudi-sink"] + +> curl -X GET -H "Content-Type:application/json" http://localhost:8083/connectors/hudi-sink/status | jq + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed +100 329 100 329 0 0 21096 0 --:--:-- --:--:-- --:--:-- 36555 +{ + "name": "hudi-sink", + "connector": { + "state": "RUNNING", + "worker_id": "127.0.0.1:8083" + }, + "tasks": [ + { + "id": 0, + "state": "RUNNING", + "worker_id": "127.0.0.1:8083" + }, + { + "id": 1, + "state": "RUNNING", + "worker_id": "127.0.0.1:8083" + }, + { + "id": 2, + "state": "RUNNING", + "worker_id": "127.0.0.1:8083" + }, + { + "id": 3, + "state": "RUNNING", + "worker_id": "127.0.0.1:8083" + } + ], + "type": "sink" +} +``` + +And, you should see your Hudi table created, which you can query using Spark/Flink. Note: HUDI-2325 tracks Hive sync, +which will unlock pretty much every other query engine. + +```bash +> ls -a /tmp/hoodie/hudi-test-topic +. .hoodie partition_1 partition_3 +.. 
partition_0 partition_2 partition_4 + +> ls -lt /tmp/hoodie/hudi-test-topic/.hoodie +total 72 +-rw-r--r-- 1 user wheel 346 Sep 14 10:32 hoodie.properties +-rw-r--r-- 1 user wheel 0 Sep 13 23:18 20210913231805.inflight +-rw-r--r-- 1 user wheel 0 Sep 13 23:18 20210913231805.commit.requested +-rw-r--r-- 1 user wheel 9438 Sep 13 21:45 20210913214351.commit +-rw-r--r-- 1 user wheel 0 Sep 13 21:43 20210913214351.inflight +-rw-r--r-- 1 user wheel 0 Sep 13 21:43 20210913214351.commit.requested +-rw-r--r-- 1 user wheel 18145 Sep 13 21:43 20210913214114.commit +-rw-r--r-- 1 user wheel 0 Sep 13 21:41 20210913214114.inflight +-rw-r--r-- 1 user wheel 0 Sep 13 21:41 20210913214114.commit.requested +drwxr-xr-x 2 user wheel 64 Sep 13 21:41 archived + +> ls -l /tmp/hoodie/hudi-test-topic/partition_0 +total 5168 +-rw-r--r-- 1 user wheel 439332 Sep 13 21:43 2E0E6DB44ACC8479059574A2C71C7A7E-0_0-0-0_20210913214114.parquet +-rw-r--r-- 1 user wheel 440179 Sep 13 21:42 3B56FAAAE2BDD04E480C1CBACD463D3E-0_0-0-0_20210913214114.parquet +-rw-r--r-- 1 user wheel 437097 Sep 13 21:45 3B56FAAAE2BDD04E480C1CBACD463D3E-0_0-0-0_20210913214351.parquet +-rw-r--r-- 1 user wheel 440219 Sep 13 21:42 D5AEE453699D5D9623D704C1CF399C8C-0_0-0-0_20210913214114.parquet +-rw-r--r-- 1 user wheel 437035 Sep 13 21:45 D5AEE453699D5D9623D704C1CF399C8C-0_0-0-0_20210913214351.parquet +-rw-r--r-- 1 user wheel 440214 Sep 13 21:43 E200FA75DCD1CED60BE86BCE6BF5D23A-0_0-0-0_20210913214114.parquet +``` + +### 7 - Run async compaction and clustering if scheduled + +When using Merge-On-Read (MOR) as the table type, async compaction and clustering can be scheduled while the Sink is +running. Inline compaction and clustering are disabled by default for performance reasons. By default, async +compaction scheduling is enabled, and you can disable it by setting `hoodie.kafka.compaction.async.enable` to `false`. +Async clustering scheduling is disabled by default, and you can enable it by setting `hoodie.clustering.async.enabled` +to `true`. + +The Sink only schedules compaction and clustering when necessary and, for performance reasons, does not execute them itself. You need +to execute the scheduled compaction and clustering using separate Spark jobs or the Hudi CLI.
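+ +The scheduling behavior described above is controlled through the sink configuration (for example, `config-sink.json` in the demo folder). The fragment below is only an illustrative sketch that combines the MOR table type with the two properties named in this section; it is not a complete sink configuration, and the values shown are just examples. + +```json +{ + "hoodie.table.type": "MERGE_ON_READ", + "hoodie.kafka.compaction.async.enable": "true", + "hoodie.clustering.async.enabled": "true" +} +```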
+ +After the compaction is scheduled, you can see the requested compaction instant (`20211111111410.compaction.requested`) +below: + +``` +> ls -l /tmp/hoodie/hudi-test-topic/.hoodie +total 280 +-rw-r--r-- 1 user wheel 21172 Nov 11 11:09 20211111110807.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:08 20211111110807.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:08 20211111110807.deltacommit.requested +-rw-r--r-- 1 user wheel 22458 Nov 11 11:11 20211111110940.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:09 20211111110940.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:09 20211111110940.deltacommit.requested +-rw-r--r-- 1 user wheel 21445 Nov 11 11:13 20211111111110.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:11 20211111111110.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:11 20211111111110.deltacommit.requested +-rw-r--r-- 1 user wheel 24943 Nov 11 11:14 20211111111303.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:13 20211111111303.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:13 20211111111303.deltacommit.requested +-rw-r--r-- 1 user wheel 9885 Nov 11 11:14 20211111111410.compaction.requested +-rw-r--r-- 1 user wheel 21192 Nov 11 11:15 20211111111411.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:14 20211111111411.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:14 20211111111411.deltacommit.requested +-rw-r--r-- 1 user wheel 0 Nov 11 11:15 20211111111530.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:15 20211111111530.deltacommit.requested +drwxr-xr-x 2 user wheel 64 Nov 11 11:08 archived +-rw-r--r-- 1 user wheel 387 Nov 11 11:08 hoodie.properties +``` + +Then you can run async compaction job with `HoodieCompactor` and `spark-submit` by: + +``` +spark-submit \ + --class org.apache.hudi.utilities.HoodieCompactor \ + hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.13.0-SNAPSHOT.jar \ + --base-path /tmp/hoodie/hudi-test-topic \ + --table-name hudi-test-topic \ + --schema-file /Users/user/repo/hudi/docker/demo/config/schema.avsc \ + --instant-time 20211111111410 \ + --parallelism 2 \ + --spark-memory 1g +``` + +Note that you don't have to provide the instant time through `--instant-time`. In that case, the earliest scheduled +compaction is going to be executed. 
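+ +For example, the sketch below is simply the invocation above with `--instant-time` dropped, relying on that default behavior to pick up the earliest scheduled compaction; the paths and bundle version are the same placeholders used earlier in this guide. + +``` +spark-submit \ + --class org.apache.hudi.utilities.HoodieCompactor \ + hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.13.0-SNAPSHOT.jar \ + --base-path /tmp/hoodie/hudi-test-topic \ + --table-name hudi-test-topic \ + --schema-file /Users/user/repo/hudi/docker/demo/config/schema.avsc \ + --parallelism 2 \ + --spark-memory 1g +```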
+ +Alternatively, you can use Hudi CLI to execute compaction: + +``` +hudi-> connect --path /tmp/hoodie/hudi-test-topic +hudi:hudi-test-topic-> compactions show all +╔═════════════════════════╤═══════════╤═══════════════════════════════╗ +║ Compaction Instant Time │ State │ Total FileIds to be Compacted ║ +╠═════════════════════════╪═══════════╪═══════════════════════════════╣ +║ 20211111111410 │ REQUESTED │ 12 ║ +╚═════════════════════════╧═══════════╧═══════════════════════════════╝ + +compaction validate --instant 20211111111410 +compaction run --compactionInstant 20211111111410 --parallelism 2 --schemaFilePath /Users/user/repo/hudi/docker/demo/config/schema.avsc +``` + +Similarly, you can see the requested clustering instant (`20211111111813.replacecommit.requested`) after it is scheduled +by the Sink: + +``` +> ls -l /tmp/hoodie/hudi-test-topic/.hoodie +total 736 +-rw-r--r-- 1 user wheel 24943 Nov 11 11:14 20211111111303.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:13 20211111111303.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:13 20211111111303.deltacommit.requested +-rw-r--r-- 1 user wheel 18681 Nov 11 11:17 20211111111410.commit +-rw-r--r-- 1 user wheel 0 Nov 11 11:17 20211111111410.compaction.inflight +-rw-r--r-- 1 user wheel 9885 Nov 11 11:14 20211111111410.compaction.requested +-rw-r--r-- 1 user wheel 21192 Nov 11 11:15 20211111111411.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:14 20211111111411.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:14 20211111111411.deltacommit.requested +-rw-r--r-- 1 user wheel 22460 Nov 11 11:17 20211111111530.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:15 20211111111530.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:15 20211111111530.deltacommit.requested +-rw-r--r-- 1 user wheel 21357 Nov 11 11:18 20211111111711.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:17 20211111111711.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:17 20211111111711.deltacommit.requested +-rw-r--r-- 1 user wheel 6516 Nov 11 11:18 20211111111813.replacecommit.requested +-rw-r--r-- 1 user wheel 26070 Nov 11 11:20 20211111111815.deltacommit +-rw-r--r-- 1 user wheel 0 Nov 11 11:18 20211111111815.deltacommit.inflight +-rw-r--r-- 1 user wheel 0 Nov 11 11:18 20211111111815.deltacommit.requested +``` + +Then you can run async clustering job with `HoodieClusteringJob` and `spark-submit` by: + +``` +spark-submit \ + --class org.apache.hudi.utilities.HoodieClusteringJob \ + hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.13.0-SNAPSHOT.jar \ + --props clusteringjob.properties \ + --mode execute \ + --base-path /tmp/hoodie/hudi-test-topic \ + --table-name sample_table \ + --instant-time 20211111111813 \ + --spark-memory 1g +``` + +Sample `clusteringjob.properties`: + +``` +hoodie.datasource.write.recordkey.field=volume +hoodie.datasource.write.partitionpath.field=date +hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/hudi-test-topic/versions/latest + +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy +hoodie.clustering.plan.strategy.sort.columns=volume + +hoodie.write.concurrency.mode=single_writer +``` + +Note that you don't have to provide the instant time through `--instant-time`. In that case, the earliest scheduled +clustering is going to be executed. 
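+ +As an optional sanity check after running the compaction or clustering job, you can re-list the timeline and look for the completed instant files (a `.commit` for compaction and a `.replacecommit` for clustering), mirroring the listings shown earlier; the path below is the demo table path used throughout this guide. + +``` +ls -l /tmp/hoodie/hudi-test-topic/.hoodie | grep -E '\.commit$|\.replacecommit$' +```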
+ +### 8 - Querying via Hive + +In this section, we explain how one can test syncing of the Hudi table with the Hive server/Hive Metastore, +which enables querying via Hive, Presto, etc. + +To ease the deployment of HDFS, Hive Server, Hive Metastore, etc. for testing Hive sync, we use the Docker +containers from the Hudi docker demo. Refer to [this link for the setup](https://hudi.apache.org/docs/docker_demo). +Additionally, this Docker setup deploys Kafka and Zookeeper too, so you do not need to run them explicitly in this setup. + +Essentially, follow the steps listed here: + +/etc/hosts : The demo references many services running in containers by hostname. Add the following settings to /etc/hosts +```bash +127.0.0.1 adhoc-1 +127.0.0.1 adhoc-2 +127.0.0.1 namenode +127.0.0.1 datanode1 +127.0.0.1 hiveserver +127.0.0.1 hivemetastore +127.0.0.1 kafkabroker +127.0.0.1 sparkmaster +127.0.0.1 zookeeper +``` + +Bring up the Docker containers +```bash +cd $HUDI_DIR/docker +./setup_demo.sh +``` + +First, (re)install a different connector that is configured to write the Hudi table to HDFS instead of the local filesystem. + +```bash +# The following command is expected to throw an error if the Hudi Sink Connector has not been added yet. +# {"error_code":404,"message":"Connector hudi-sink not found"} +curl -X DELETE http://localhost:8083/connectors/hudi-sink +curl -X POST -H "Content-Type:application/json" -d @$HUDI_DIR/hudi-kafka-connect/demo/config-sink-hive.json http://localhost:8083/connectors +``` + +After running the connector, you can query the Hive server using the following steps: + +```bash +docker exec -it adhoc-2 /bin/bash +beeline -u jdbc:hive2://hiveserver:10000 \ + --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \ + --hiveconf hive.stats.autogather=false + +# List Tables +0: jdbc:hive2://hiveserver:10000> show tables; ++---------------------+--+ +| tab_name | ++---------------------+--+ +| huditesttopic_ro | +| huditesttopic_rt | ++---------------------+--+ +3 rows selected (1.199 seconds) +0: jdbc:hive2://hiveserver:10000> + +# Look at partitions that were added +0: jdbc:hive2://hiveserver:10000> show partitions huditesttopic_ro; ++-------------------+--+ +| partition | ++-------------------+--+ +| date=partition_0 | +| date=partition_1 | +| date=partition_2 | +| date=partition_3 | +| date=partition_4 | ++-------------------+--+ +1 row selected (0.24 seconds) + +0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from huditesttopic_ro; ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| _hoodie_commit_time | symbol | ts | volume | open | close | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 | +| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 | ++----------------------+---------+----------------------+---------+------------+-----------+--+ +``` + +`Current Limitation:` The Hudi Kafka Connect sink uses `Merge-On-Read` by default, and inserts/appends the Kafka records +directly to the log file(s). Asynchronously, the compaction service can be executed to merge the log files into base files (Parquet format). +Generally, we support both Read-Optimized queries that read only Parquet base files and Snapshot queries that read and merge +records across base and log files.
However, currently there is a limitation where we are not able to read records from +only log files. Hence, the queries for Hudi Kafka Connect will only work after compaction merges the records into base files. Alternatively, +users have the option to reconfigure the table type to `COPY_ON_WRITE` in config-sink.json. + +### 9 - Troubleshoot + +#### javax.security.auth.login.LoginException +If, during the execution of the Hudi Sink connector, you see this error: + +```Caused by: javax.security.auth.login.LoginException: java.lang.NullPointerException: invalid null input: name``` +, it is very likely that your Kafka Connect service was started by an unnamed user. To see if this is your case, +ssh into your Kafka Connect container/server and run: +`whoami` + +If you receive a message like `whoami: cannot find name for user ID ...`, you'll need to change the service user starting Kafka Connect. +If you are using Docker images, modify your Dockerfile. + +To change the service user of your Docker image, add this to your Dockerfile: +```dockerfile +USER +``` + +To create a new service user for your Docker image, add this to your Dockerfile: +```dockerfile +RUN useradd kafka-conn-service -r +USER kafka-conn-service +``` + + + diff --git a/hudi-kafka-connect/demo/config-sink-hive.json b/hudi-kafka-connect/demo/config-sink-hive.json new file mode 100644 index 0000000000000..214fd1891906f --- /dev/null +++ b/hudi-kafka-connect/demo/config-sink-hive.json @@ -0,0 +1,30 @@ +{ + "name": "hudi-sink", + "config": { + "bootstrap.servers": "kafkabroker:9092", + "connector.class": "org.apache.hudi.connect.HoodieSinkConnector", + "tasks.max": "4", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter.schemas.enable": "false", + "topics": "hudi-test-topic", + "hoodie.table.name": "hudi-test-topic", + "hoodie.table.type": "MERGE_ON_READ", + "hoodie.base.path": "hdfs://namenode:8020/user/hive/warehouse/hudi-test-topic", + "hoodie.datasource.write.recordkey.field": "volume", + "hoodie.datasource.write.partitionpath.field": "date", + "hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider", + "hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8082/subjects/hudi-test-topic/versions/latest", + "hoodie.kafka.commit.interval.secs": 60, + "hoodie.meta.sync.enable": "true", + "hoodie.meta.sync.classes": "org.apache.hudi.hive.HiveSyncTool", + "hoodie.datasource.hive_sync.table": "huditesttopic", + "hoodie.datasource.hive_sync.partition_fields": "date", + "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor", + "hoodie.datasource.hive_sync.use_jdbc": "false", + "hoodie.datasource.hive_sync.mode": "hms", + "dfs.client.use.datanode.hostname": "true", + "hive.metastore.uris": "thrift://hivemetastore:9083", + "hive.metastore.client.socket.timeout": "1500s" + } +} diff --git a/hudi-kafka-connect/demo/config-sink.json b/hudi-kafka-connect/demo/config-sink.json new file mode 100644 index 0000000000000..105c4d491e779 --- /dev/null +++ b/hudi-kafka-connect/demo/config-sink.json @@ -0,0 +1,20 @@ +{ + "name": "hudi-sink", + "config": { + "bootstrap.servers": "kafkabroker:9092", + "connector.class": "org.apache.hudi.connect.HoodieSinkConnector", + "tasks.max": "4", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.storage.StringConverter", +
"value.converter.schemas.enable": "false", + "topics": "hudi-test-topic", + "hoodie.table.name": "hudi-test-topic", + "hoodie.table.type": "MERGE_ON_READ", + "hoodie.base.path": "file:///tmp/hoodie/hudi-test-topic", + "hoodie.datasource.write.recordkey.field": "volume", + "hoodie.datasource.write.partitionpath.field": "date", + "hoodie.schemaprovider.class": "org.apache.hudi.schema.SchemaRegistryProvider", + "hoodie.deltastreamer.schemaprovider.registry.url": "http://localhost:8082/subjects/hudi-test-topic/versions/latest", + "hoodie.kafka.commit.interval.secs": 60 + } +} diff --git a/hudi-kafka-connect/demo/connect-distributed.properties b/hudi-kafka-connect/demo/connect-distributed.properties new file mode 100644 index 0000000000000..1c28bc60d52b3 --- /dev/null +++ b/hudi-kafka-connect/demo/connect-distributed.properties @@ -0,0 +1,33 @@ +## +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +## + +bootstrap.servers=kafkabroker:9092 +group.id=hudi-connect-cluster +key.converter=org.apache.kafka.connect.json.JsonConverter +value.converter=org.apache.kafka.connect.json.JsonConverter +key.converter.schemas.enable=true +value.converter.schemas.enable=true +offset.storage.topic=connect-offsets +offset.storage.replication.factor=1 +config.storage.topic=connect-configs +config.storage.replication.factor=1 +status.storage.topic=connect-status +status.storage.replication.factor=1 + +offset.flush.interval.ms=60000 +listeners=HTTP://:8083 +plugin.path=/usr/local/share/kafka/plugins diff --git a/hudi-kafka-connect/demo/setupKafka.sh b/hudi-kafka-connect/demo/setupKafka.sh new file mode 100755 index 0000000000000..e4e8d2e382ed1 --- /dev/null +++ b/hudi-kafka-connect/demo/setupKafka.sh @@ -0,0 +1,178 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +######################### +# The command line help # +######################### +usage() { + echo "Usage: $0" + echo " -n |--num-kafka-records, (required) number of kafka records to generate in a batch" + echo " -b |--num-batch, (optional) number of batches of records to generate (default is 1)" + echo " -t |--reuse-topic, (optional) reuses the Kafka topic (default deletes and recreate the topic)" + echo " -f |--raw-file, (optional) raw file for the kafka records" + echo " -k |--kafka-topic, (optional) Topic name for Kafka" + echo " -m |--num-kafka-partitions, (optional) number of kafka partitions" + echo " -r |--record-key, (optional) field to use as record key" + echo " -o |--record-key-offset, (optional) record key offset to start with (default is 0)" + echo " -l |--num-hudi-partitions, (optional) number of hudi partitions" + echo " -p |--partition-key, (optional) field to use as partition" + echo " -s |--schema-file, (optional) path of the file containing the schema of the records" + exit 1 +} + +case "$1" in +--help) + usage + exit 0 + ;; +esac + +if [ $# -lt 1 ]; then + echo "Illegal number of parameters" + usage + exit 0 +fi + +if [ ! $HUDI_DIR ]; then + export HUDI_DIR=$(dirname "$(dirname $PWD)") +fi + +## defaults +rawDataFile=${HUDI_DIR}/docker/demo/data/batch_1.json +kafkaBrokerHostname=localhost +kafkaTopicName=hudi-test-topic +numKafkaPartitions=4 +recordKey=volume +numHudiPartitions=5 +partitionField=date +schemaFile=${HUDI_DIR}/docker/demo/config/schema.avsc +numBatch=1 +recordValue=0 +recreateTopic="Y" + +while getopts ":n:b:tf:k:m:r:o:l:p:s:-:" opt; do + case $opt in + n) + numRecords="$OPTARG" + printf "Argument num-kafka-records is %s\n" "$numRecords" + ;; + b) + numBatch="$OPTARG" + printf "Argument num-batch is %s\n" "$numBatch" + ;; + t) + recreateTopic="N" + printf "Argument recreate-topic is N (reuse Kafka topic) \n" + ;; + f) + rawDataFile="$OPTARG" + printf "Argument raw-file is %s\n" "$rawDataFile" + ;; + k) + kafkaTopicName="$OPTARG" + printf "Argument kafka-topic is %s\n" "$kafkaTopicName" + ;; + m) + numKafkaPartitions="$OPTARG" + printf "Argument num-kafka-partitions is %s\n" "$numKafkaPartitions" + ;; + r) + recordKey="$OPTARG" + printf "Argument record-key is %s\n" "$recordKey" + ;; + o) + recordValue="$OPTARG" + printf "Argument record-key-offset is %s\n" "$recordValue" + ;; + l) + numHudiPartitions="$OPTARG" + printf "Argument num-hudi-partitions is %s\n" "$numHudiPartitions" + ;; + p) + partitionField="$OPTARG" + printf "Argument partition-key is %s\n" "$partitionField" + ;; + s) + schemaFile="$OPTARG" + printf "Argument schema-file is %s\n" "$schemaFile" + ;; + -) + echo "Invalid option -$OPTARG" >&2 + ;; + esac +done + +if [ $recreateTopic = "Y" ]; then + # First delete the existing topic + echo "Delete Kafka topic $kafkaTopicName ..." + ${KAFKA_HOME}/bin/kafka-topics.sh --delete --topic ${kafkaTopicName} --bootstrap-server ${kafkaBrokerHostname}:9092 + + # Create the topic with 4 partitions + echo "Create Kafka topic $kafkaTopicName ..." 
+ ${KAFKA_HOME}/bin/kafka-topics.sh --create --topic ${kafkaTopicName} --partitions $numKafkaPartitions --replication-factor 1 --bootstrap-server ${kafkaBrokerHostname}:9092 +fi + +# Setup the schema registry +export SCHEMA=$(sed 's|/\*|\n&|g;s|*/|&\n|g' ${schemaFile} | sed '/\/\*/,/*\//d' | jq tostring) +curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" --data "{\"schema\": $SCHEMA}" http://localhost:8082/subjects/${kafkaTopicName}/versions +curl -X GET http://localhost:8082/subjects/${kafkaTopicName}/versions/latest + +# Generate kafka messages from raw records +# Each records with unique keys and generate equal messages across each hudi partition +partitions={} +for ((i = 0; i < ${numHudiPartitions}; i++)); do + partitions[$i]="partition_"$i +done + +events_file=/tmp/kcat-input.events +rm -f ${events_file} + +totalNumRecords=$((numRecords + recordValue)) + +for ((i = 1;i<=numBatch;i++)); do + rm -f ${events_file} + date + echo "Start batch $i ..." + batchRecordSeq=0 + for (( ; ; )); do + while IFS= read line; do + for partitionValue in "${partitions[@]}"; do + echo $line | jq --arg recordKey $recordKey --arg recordValue $recordValue --arg partitionField $partitionField --arg partitionValue $partitionValue -c '.[$recordKey] = $recordValue | .[$partitionField] = $partitionValue' >>${events_file} + ((recordValue = recordValue + 1)) + ((batchRecordSeq = batchRecordSeq + 1)) + + if [ $batchRecordSeq -eq $numRecords ]; then + break + fi + done + + if [ $batchRecordSeq -eq $numRecords ]; then + break + fi + done <"$rawDataFile" + + if [ $batchRecordSeq -eq $numRecords ]; then + date + echo " Record key until $recordValue" + sleep 20 + break + fi + done + + echo "publish to Kafka ..." + grep -v '^$' ${events_file} | kcat -P -b ${kafkaBrokerHostname}:9092 -t ${kafkaTopicName} +done diff --git a/hudi-kafka-connect/pom.xml b/hudi-kafka-connect/pom.xml new file mode 100644 index 0000000000000..8073095710159 --- /dev/null +++ b/hudi-kafka-connect/pom.xml @@ -0,0 +1,275 @@ + + + + + hudi + org.apache.hudi + 0.12.2-dt-SNAPSHOT + + 4.0.0 + + hudi-kafka-connect + Kafka Connect Sink Connector for Hudi + 0.12.2-dt-SNAPSHOT + jar + + + ${project.parent.basedir} + 2.5.0 + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-jar-plugin + 3.1.2 + + + + test-jar + + + + + + org.apache.rat + apache-rat-plugin + + + com.github.os72 + protoc-jar-maven-plugin + 3.11.4 + + + generate-sources + + run + + + ${protoc.version} + + src/main/resources + + + + + + + + + + src/main/resources + + + src/test/resources + + + + + + + + org.apache.kafka + connect-api + ${connect.api.version} + provided + + + org.apache.kafka + connect-json + ${connect.api.version} + provided + + + + + org.apache.hudi + hudi-java-client + ${project.version} + + + org.apache.hudi + hudi-utilities_${scala.binary.version} + ${project.version} + + + org.apache.hudi + hudi-client-common + ${project.version} + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + + org.apache.hudi + hudi-flink + ${project.version} + + + org.apache.flink + flink-core + ${flink.version} + + + com.esotericsoftware.kryo + * + + + + + org.apache.flink + ${flink.hadoop.compatibility.artifactId} + ${flink.version} + + + + + com.google.protobuf + protobuf-java + ${proto.version} + + + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.binary.version} + + + com.fasterxml.jackson.dataformat + 
jackson-dataformat-csv + ${fasterxml.version} + + + + + org.apache.avro + avro + ${avro.version} + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + + + org.apache.hive + hive-common + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + + + org.junit.jupiter + junit-jupiter-api + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.junit.vintage + junit-vintage-engine + test + + + + org.junit.jupiter + junit-jupiter-params + test + + + + org.mockito + mockito-junit-jupiter + test + + + + org.junit.platform + junit-platform-runner + test + + + + org.junit.platform + junit-platform-suite-api + test + + + + org.junit.platform + junit-platform-commons + test + + + diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkConnector.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkConnector.java new file mode 100644 index 0000000000000..2d8cc47aaa6f6 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkConnector.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.sink.SinkConnector; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * HudiSinkConnector is a Kafka Connect Connector implementation + * that ingest data from Kafka to Hudi. + */ +public class HoodieSinkConnector extends SinkConnector { + + public static final String VERSION = "0.1.0"; + private static final Logger LOG = LogManager.getLogger(HoodieSinkConnector.class); + private Map configProps; + + /** + * No-arg constructor. It is instantiated by Connect framework. 
+ */ + public HoodieSinkConnector() { + } + + @Override + public String version() { + return VERSION; + } + + @Override + public void start(Map props) { + configProps = new HashMap<>(props); + } + + @Override + public Class taskClass() { + return HoodieSinkTask.class; + } + + @Override + public List> taskConfigs(int maxTasks) { + Map taskProps = new HashMap<>(configProps); + List> taskConfigs = new ArrayList<>(maxTasks); + for (int i = 0; i < maxTasks; ++i) { + taskConfigs.add(taskProps); + } + return taskConfigs; + } + + @Override + public void stop() { + LOG.info(String.format("Shutting down Hudi Sink connector %s", configProps.get("name"))); + } + + @Override + public ConfigDef config() { + // we use Hudi configs instead + return new ConfigDef(); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java new file mode 100644 index 0000000000000..c14a86656a6da --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect; + +import org.apache.hudi.connect.kafka.KafkaConnectControlAgent; +import org.apache.hudi.connect.transaction.ConnectTransactionCoordinator; +import org.apache.hudi.connect.transaction.ConnectTransactionParticipant; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.RetriableException; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTask; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +/** + * Implementation of the {@link SinkTask} interface provided by + * Kafka Connect. Implements methods to receive the Kafka records + * from the assigned partitions and commit the Kafka offsets. + * Also, handles re-assignments of partitions. 
+ */ +public class HoodieSinkTask extends SinkTask { + + public static final String TASK_ID_CONFIG_NAME = "task.id"; + private static final Logger LOG = LogManager.getLogger(HoodieSinkTask.class); + + private final Map transactionCoordinators; + private final Map transactionParticipants; + private KafkaConnectControlAgent controlKafkaClient; + private KafkaConnectConfigs connectConfigs; + + private String taskId; + private String connectorName; + + public HoodieSinkTask() { + transactionCoordinators = new HashMap<>(); + transactionParticipants = new HashMap<>(); + } + + @Override + public String version() { + return HoodieSinkConnector.VERSION; + } + + @Override + public void start(Map props) { + connectorName = props.get("name"); + taskId = props.get(TASK_ID_CONFIG_NAME); + LOG.info(String.format("Starting Hudi Sink Task for %s connector %s with id %s with assignments %s", + props, connectorName, taskId, context.assignment())); + try { + connectConfigs = KafkaConnectConfigs.newBuilder().withProperties(props).build(); + controlKafkaClient = KafkaConnectControlAgent.createKafkaControlManager( + connectConfigs.getBootstrapServers(), + connectConfigs.getControlTopicName()); + } catch (ConfigException e) { + throw new ConnectException("Couldn't start HdfsSinkConnector due to configuration error.", e); + } catch (ConnectException e) { + LOG.error("Couldn't start HudiSinkConnector:", e); + LOG.info("Shutting down HudiSinkConnector."); + cleanup(); + // Always throw the original exception that prevent us from starting + throw e; + } + } + + @Override + public void put(Collection records) { + for (SinkRecord record : records) { + String topic = record.topic(); + int partition = record.kafkaPartition(); + TopicPartition tp = new TopicPartition(topic, partition); + + TransactionParticipant transactionParticipant = transactionParticipants.get(tp); + if (transactionParticipant != null) { + transactionParticipant.buffer(record); + } + } + + for (TopicPartition partition : context.assignment()) { + if (transactionParticipants.get(partition) == null) { + throw new RetriableException("TransactionParticipant should be created for each assigned partition, " + + "but has not been created for the topic/partition: " + partition.topic() + ":" + partition.partition()); + } + try { + transactionParticipants.get(partition).processRecords(); + } catch (HoodieIOException exception) { + throw new RetriableException("Intermittent write errors for Hudi " + + " for the topic/partition: " + partition.topic() + ":" + partition.partition() + + " , ensuring kafka connect will retry ", exception); + } + } + } + + @Override + public void stop() { + cleanup(); + } + + @Override + public void flush(Map currentOffsets) { + // No-op. The connector is managing the offsets. + } + + @Override + public Map preCommit(Map currentOffsets) { + // Although the connector manages offsets via commit files in Hudi, we still want to have Connect + // commit the consumer offsets for records this task has consumed from its topic partitions and + // committed to Hudi. 
+ Map result = new HashMap<>(); + for (TopicPartition partition : context.assignment()) { + TransactionParticipant worker = transactionParticipants.get(partition); + if (worker != null && worker.getLastKafkaCommittedOffset() >= 0) { + result.put(partition, new OffsetAndMetadata(worker.getLastKafkaCommittedOffset())); + } + } + return result; + } + + @Override + public void open(Collection partitions) { + LOG.info("New partitions added " + partitions.toString()); + bootstrap(partitions); + } + + @Override + public void close(Collection partitions) { + LOG.info("Existing partitions deleted " + partitions.toString()); + // Close any writers we have. We may get assigned the same partitions and end up duplicating + // some effort since we'll have to reprocess those messages. It may be possible to hold on to + // the TopicPartitionWriter and continue to use the temp file, but this can get significantly + // more complex due to potential failures and network partitions. For example, we may get + // this close, then miss a few generations of group membership, during which + // data may have continued to be processed and we'd have to restart from the recovery stage, + // make sure we apply the WAL, and only reuse the temp file if the starting offset is still + // valid. For now, we prefer the simpler solution that may result in a bit of wasted effort. + for (TopicPartition partition : partitions) { + if (partition.partition() == ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) { + if (transactionCoordinators.containsKey(partition)) { + transactionCoordinators.get(partition).stop(); + transactionCoordinators.remove(partition); + } + } + TransactionParticipant worker = transactionParticipants.remove(partition); + if (worker != null) { + try { + LOG.debug("Closing data writer due to task start failure."); + worker.stop(); + } catch (Throwable t) { + LOG.debug(String.format("Error closing and stopping data writer: %s", t.getMessage()), t); + } + } + } + } + + private void bootstrap(Collection partitions) { + LOG.info(String.format("Bootstrap task for connector %s with id %s with assignments %s part %s", + connectorName, taskId, context.assignment(), partitions)); + for (TopicPartition partition : partitions) { + try { + // If the partition is 0, instantiate the Leader + if (partition.partition() == ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) { + ConnectTransactionCoordinator coordinator = new ConnectTransactionCoordinator( + connectConfigs, + partition, + controlKafkaClient); + coordinator.start(); + transactionCoordinators.put(partition, coordinator); + } + ConnectTransactionParticipant worker = new ConnectTransactionParticipant(connectConfigs, partition, controlKafkaClient, context); + transactionParticipants.put(partition, worker); + worker.start(); + } catch (HoodieException exception) { + LOG.error(String.format("Fatal error initializing task %s for partition %s", taskId, partition.partition()), exception); + } + } + } + + private void cleanup() { + for (TopicPartition partition : context.assignment()) { + TransactionParticipant worker = transactionParticipants.get(partition); + if (worker != null) { + try { + LOG.debug("Closing data writer due to task start failure."); + worker.stop(); + } catch (Throwable t) { + LOG.debug("Error closing and stopping data writer", t); + } + } + } + transactionParticipants.clear(); + transactionCoordinators.forEach((topic, transactionCoordinator) -> transactionCoordinator.stop()); + transactionCoordinators.clear(); + } +} diff --git 
a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java new file mode 100644 index 0000000000000..436366709d5be --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.FileIdPrefixProvider; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +public class KafkaConnectFileIdPrefixProvider extends FileIdPrefixProvider { + + public static final String KAFKA_CONNECT_PARTITION_ID = "hudi.kafka.connect.partition"; + private static final Logger LOG = LogManager.getLogger(KafkaConnectFileIdPrefixProvider.class); + + private final String kafkaPartition; + + public KafkaConnectFileIdPrefixProvider(TypedProperties props) { + super(props); + if (!props.containsKey(KAFKA_CONNECT_PARTITION_ID)) { + LOG.error("Fatal error due to Kafka Connect Partition Id is not set"); + throw new HoodieException("Kafka Connect Partition Key " + KAFKA_CONNECT_PARTITION_ID + " not provided"); + } + this.kafkaPartition = props.getProperty(KAFKA_CONNECT_PARTITION_ID); + } + + @Override + public String createFilePrefix(String partitionPath) { + // We use a combination of kafka partition and partition path as the file id, and then hash it + // to generate a fixed sized hash. + String rawFileIdPrefix = kafkaPartition + partitionPath; + String hashedPrefix = KafkaConnectUtils.hashDigest(rawFileIdPrefix); + LOG.info("CreateFileId for Kafka Partition " + kafkaPartition + " : " + partitionPath + " = " + rawFileIdPrefix + + " === " + hashedPrefix); + return hashedPrefix; + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java new file mode 100644 index 0000000000000..776beafbd6bea --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.kafka; + +import org.apache.hudi.connect.ControlMessage; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; + +import org.apache.kafka.clients.consumer.CommitFailedException; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.UUID; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +/** + * Class that manages the Kafka consumer and producer for + * the Kafka Control Topic that ensures coordination across the + * {@link TransactionCoordinator} and {@link TransactionParticipant}s. + * Use a single instance per worker (single-threaded), + * and register multiple tasks that can receive the control messages. 
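For orientation, here is a minimal usage sketch of the control agent introduced in this file; it is not part of the patch. The broker address, topic names, commit time, and the sketch class name are placeholders, and in the real flow the message is built by the ConnectTransactionCoordinator shown later in this patch rather than by hand. The sketch only shows the lazy singleton factory and the publish path.

    import org.apache.hudi.connect.ControlMessage;
    import org.apache.hudi.connect.kafka.KafkaConnectControlAgent;
    import org.apache.hudi.connect.writers.KafkaConnectConfigs;

    public final class ControlTopicSketch {
      public static void main(String[] args) {
        // One lazily created agent is shared by every Hudi sink task in this Connect worker JVM.
        KafkaConnectControlAgent agent = KafkaConnectControlAgent.createKafkaControlManager(
            "localhost:9092",       // placeholder bootstrap servers
            "hudi-control-topic");  // placeholder control topic name

        // Coordinators and participants normally register themselves to receive messages for
        // their topic; here we only publish a START_COMMIT-style message to show the send path.
        ControlMessage message = ControlMessage.newBuilder()
            .setProtocolVersion(KafkaConnectConfigs.CURRENT_PROTOCOL_VERSION)
            .setType(ControlMessage.EventType.START_COMMIT)
            .setTopicName("hudi-test-topic")                       // placeholder data topic
            .setSenderType(ControlMessage.EntityType.COORDINATOR)
            .setSenderPartition(0)
            .setReceiverType(ControlMessage.EntityType.PARTICIPANT)
            .setCommitTime("20211101000000")                       // placeholder instant time
            .build();
        agent.publishMessage(message);

        // Shuts down the shared agent; in a worker it stays up for the lifetime of the tasks.
        agent.stop();
      }
    }

Keeping a single consumer/producer pair per worker avoids opening one control-topic consumer per task and lets every coordinator and participant in the JVM share it.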
+ */ +public class KafkaConnectControlAgent implements KafkaControlAgent { + + private static final Logger LOG = LogManager.getLogger(KafkaConnectControlAgent.class); + private static final Object LOCK = new Object(); + private static final long KAFKA_POLL_TIMEOUT_MS = 100; + private static final int EXEC_SHUTDOWN_TIMEOUT_MS = 5000; + + private static KafkaConnectControlAgent agent; + private final String bootstrapServers; + private final String controlTopicName; + private final ExecutorService executorService; + private final Map topicCoordinators; + // List of TransactionParticipants per Kafka Topic + private final Map> partitionWorkers; + private final KafkaControlProducer producer; + private KafkaConsumer consumer; + + public KafkaConnectControlAgent(String bootstrapServers, + String controlTopicName) { + this.bootstrapServers = bootstrapServers; + this.controlTopicName = controlTopicName; + this.executorService = Executors.newSingleThreadExecutor(); + this.topicCoordinators = new HashMap<>(); + this.partitionWorkers = new HashMap<>(); + this.producer = new KafkaControlProducer(bootstrapServers, controlTopicName); + start(); + } + + public static KafkaConnectControlAgent createKafkaControlManager(String bootstrapServers, + String controlTopicName) { + if (agent == null) { + synchronized (LOCK) { + if (agent == null) { + agent = new KafkaConnectControlAgent(bootstrapServers, controlTopicName); + } + } + } + return agent; + } + + @Override + public void registerTransactionParticipant(TransactionParticipant worker) { + if (!partitionWorkers.containsKey(worker.getPartition().topic())) { + partitionWorkers.put(worker.getPartition().topic(), new ConcurrentLinkedQueue<>()); + } + partitionWorkers.get(worker.getPartition().topic()).add(worker); + } + + @Override + public void deregisterTransactionParticipant(TransactionParticipant worker) { + if (partitionWorkers.containsKey(worker.getPartition().topic())) { + partitionWorkers.get(worker.getPartition().topic()).remove(worker); + } + } + + @Override + public void registerTransactionCoordinator(TransactionCoordinator coordinator) { + if (!topicCoordinators.containsKey(coordinator.getPartition().topic())) { + topicCoordinators.put(coordinator.getPartition().topic(), coordinator); + } + } + + public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) { + topicCoordinators.remove(coordinator.getPartition().topic()); + } + + @Override + public void publishMessage(ControlMessage message) { + producer.publishMessage(message); + } + + private void start() { + Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + // Todo fetch the worker id or name instead of a uuid. 
+ props.put(ConsumerConfig.GROUP_ID_CONFIG, "hudi-control-group" + UUID.randomUUID().toString()); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); + + // Since we are using Kafka Control Topic as a RPC like interface, + // we want consumers to only process messages that are sent after they come online + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); + + consumer = new KafkaConsumer<>(props, new StringDeserializer(), new ByteArrayDeserializer()); + + consumer.subscribe(Collections.singletonList(controlTopicName)); + + executorService.submit(() -> { + while (true) { + ConsumerRecords records; + records = consumer.poll(Duration.ofMillis(KAFKA_POLL_TIMEOUT_MS)); + for (ConsumerRecord record : records) { + try { + LOG.debug(String.format("Kafka consumerGroupId = %s topic = %s, partition = %s, offset = %s, customer = %s, country = %s", + "", record.topic(), record.partition(), record.offset(), record.key(), record.value())); + ControlMessage message = ControlMessage.parseFrom(record.value()); + String senderTopic = message.getTopicName(); + + if (message.getReceiverType().equals(ControlMessage.EntityType.PARTICIPANT)) { + if (partitionWorkers.containsKey(senderTopic)) { + for (TransactionParticipant partitionWorker : partitionWorkers.get(senderTopic)) { + partitionWorker.processControlEvent(message); + } + } else { + LOG.warn(String.format("Failed to send message for unregistered participants for topic %s", senderTopic)); + } + } else if (message.getReceiverType().equals(ControlMessage.EntityType.COORDINATOR)) { + if (topicCoordinators.containsKey(senderTopic)) { + topicCoordinators.get(senderTopic).processControlEvent(message); + } + } else { + LOG.warn(String.format("Sender type of Control Message unknown %s", message.getSenderType().name())); + } + } catch (Exception e) { + LOG.error(String.format("Fatal error while consuming a kafka record for topic = %s partition = %s", record.topic(), record.partition()), e); + } + } + try { + consumer.commitSync(); + } catch (CommitFailedException exception) { + LOG.error("Fatal error while committing kafka control topic"); + } + } + }); + } + + public void stop() { + producer.stop(); + consumer.close(); + if (executorService != null) { + boolean terminated = false; + try { + LOG.info("Shutting down executor service."); + executorService.shutdown(); + LOG.info("Awaiting termination."); + terminated = executorService.awaitTermination(EXEC_SHUTDOWN_TIMEOUT_MS, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // ignored + } + + if (!terminated) { + LOG.warn( + "Unclean Kafka Control Manager executor service shutdown "); + executorService.shutdownNow(); + } + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java new file mode 100644 index 0000000000000..85b843557b1b7 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.kafka; + +import org.apache.hudi.connect.ControlMessage; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; + +/** + * Manages the Kafka consumer and producer for + * the Kafka Control Topic that ensures coordination across the + * {@link TransactionCoordinator} and {@link TransactionParticipant}s. + */ +public interface KafkaControlAgent { + + void registerTransactionParticipant(TransactionParticipant worker); + + void deregisterTransactionParticipant(TransactionParticipant worker); + + void registerTransactionCoordinator(TransactionCoordinator coordinator); + + void deregisterTransactionCoordinator(TransactionCoordinator coordinator); + + void publishMessage(ControlMessage message); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java new file mode 100644 index 0000000000000..530e57059d5e0 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.kafka; + +import org.apache.hudi.connect.ControlMessage; + +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.Producer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.ByteArraySerializer; +import org.apache.kafka.common.serialization.StringSerializer; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Properties; + +/** + * Kafka producer to send events to the + * Control Topic that coordinates transactions + * across Participants. 
+ */
+public class KafkaControlProducer {
+
+  private static final Logger LOG = LogManager.getLogger(KafkaControlProducer.class);
+
+  private final String bootstrapServers;
+  private final String controlTopicName;
+  private Producer<String, byte[]> producer;
+
+  public KafkaControlProducer(String bootstrapServers, String controlTopicName) {
+    this.bootstrapServers = bootstrapServers;
+    this.controlTopicName = controlTopicName;
+    start();
+  }
+
+  private void start() {
+    Properties props = new Properties();
+    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
+    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
+    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
+
+    producer = new KafkaProducer<>(
+        props,
+        new StringSerializer(),
+        new ByteArraySerializer()
+    );
+  }
+
+  public void stop() {
+    producer.close();
+  }
+
+  public void publishMessage(ControlMessage message) {
+    ProducerRecord<String, byte[]> record
+        = new ProducerRecord<>(controlTopicName, message.getType().name(), message.toByteArray());
+    producer.send(record);
+  }
+}
diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java
new file mode 100644
index 0000000000000..1157b2165ac83
--- /dev/null
+++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.connect.ControlMessage; +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.connect.writers.ConnectTransactionServices; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.connect.writers.KafkaConnectTransactionServices; +import org.apache.hudi.exception.HoodieException; + +import org.apache.kafka.common.TopicPartition; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; + +/** + * Implementation of the Coordinator that + * coordinates the Hudi write transactions + * across all the Kafka partitions for a single Kafka Topic. + */ +public class ConnectTransactionCoordinator implements TransactionCoordinator, Runnable { + + public static final int COORDINATOR_KAFKA_PARTITION = 0; + + private static final Logger LOG = LogManager.getLogger(ConnectTransactionCoordinator.class); + private static final String BOOTSTRAP_SERVERS_CFG = "bootstrap.servers"; + private static final String KAFKA_OFFSET_KEY = "kafka.commit.offsets"; + private static final String KAFKA_OFFSET_DELIMITER = ","; + private static final String KAFKA_OFFSET_KV_DELIMITER = "="; + private static final Long START_COMMIT_INIT_DELAY_MS = 100L; + private static final Long RESTART_COMMIT_DELAY_MS = 500L; + private static final int COORDINATOR_EVENT_LOOP_TIMEOUT_MS = 1000; + + private final KafkaConnectConfigs configs; + private final TopicPartition partition; + private final KafkaControlAgent kafkaControlClient; + private final ConnectTransactionServices transactionServices; + private final KafkaPartitionProvider partitionProvider; + private final Map> partitionsWriteStatusReceived; + private final Map currentConsumedKafkaOffsets; + private final AtomicBoolean hasStarted = new AtomicBoolean(false); + private final BlockingQueue events; + private final ExecutorService executorService; + private final ScheduledExecutorService scheduler; + + private String currentCommitTime; + private Map globalCommittedKafkaOffsets; + private State currentState; + private int numPartitions; + + public ConnectTransactionCoordinator(KafkaConnectConfigs configs, + TopicPartition partition, + KafkaControlAgent kafkaControlClient) throws HoodieException { + this(configs, + partition, + kafkaControlClient, + new KafkaConnectTransactionServices(configs), + KafkaConnectUtils::getLatestNumPartitions); + } + + public ConnectTransactionCoordinator(KafkaConnectConfigs configs, + TopicPartition partition, + KafkaControlAgent kafkaControlClient, + ConnectTransactionServices transactionServices, + KafkaPartitionProvider partitionProvider) { + this.configs = configs; + this.partition = partition; + this.kafkaControlClient = kafkaControlClient; + this.transactionServices = transactionServices; + this.partitionProvider = partitionProvider; + this.events = 
new LinkedBlockingQueue<>(); + scheduler = Executors.newSingleThreadScheduledExecutor(); + executorService = Executors.newSingleThreadExecutor(); + + + this.currentCommitTime = StringUtils.EMPTY_STRING; + this.partitionsWriteStatusReceived = new HashMap<>(); + this.globalCommittedKafkaOffsets = new HashMap<>(); + this.currentConsumedKafkaOffsets = new HashMap<>(); + this.currentState = State.INIT; + } + + @Override + public void start() { + if (hasStarted.compareAndSet(false, true)) { + executorService.submit(this); + } + kafkaControlClient.registerTransactionCoordinator(this); + LOG.info(String.format("Start Transaction Coordinator for topic %s partition %s", + partition.topic(), partition.partition())); + + initializeGlobalCommittedKafkaOffsets(); + // Submit the first start commit + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + START_COMMIT_INIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + + @Override + public void stop() { + kafkaControlClient.deregisterTransactionCoordinator(this); + scheduler.shutdownNow(); + hasStarted.set(false); + if (executorService != null) { + boolean terminated = false; + try { + LOG.info("Shutting down executor service."); + executorService.shutdown(); + LOG.info("Awaiting termination."); + terminated = executorService.awaitTermination(100, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // ignored + } + + if (!terminated) { + LOG.warn( + "Unclean Kafka Control Manager executor service shutdown "); + executorService.shutdownNow(); + } + } + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processControlEvent(ControlMessage message) { + CoordinatorEvent.CoordinatorEventType type; + if (message.getType().equals(ControlMessage.EventType.WRITE_STATUS)) { + type = CoordinatorEvent.CoordinatorEventType.WRITE_STATUS; + } else { + LOG.warn(String.format("The Coordinator should not be receiving messages of type %s", + message.getType().name())); + return; + } + + CoordinatorEvent event = new CoordinatorEvent(type, + message.getTopicName(), + message.getCommitTime()); + event.setMessage(message); + submitEvent(event); + } + + @Override + public void run() { + while (true) { + try { + CoordinatorEvent event = events.poll(COORDINATOR_EVENT_LOOP_TIMEOUT_MS, TimeUnit.MILLISECONDS); + if (event != null) { + processCoordinatorEvent(event); + } + } catch (InterruptedException exception) { + LOG.warn("Error received while polling the event loop in Partition Coordinator", exception); + } + } + } + + private void submitEvent(CoordinatorEvent event) { + this.submitEvent(event, 0, TimeUnit.SECONDS); + } + + private void submitEvent(CoordinatorEvent event, long delay, TimeUnit unit) { + scheduler.schedule(() -> { + events.add(event); + }, delay, unit); + } + + private void processCoordinatorEvent(CoordinatorEvent event) { + try { + // Ignore NULL and STALE events, unless its one to start a new COMMIT + if (event == null + || (!event.getEventType().equals(CoordinatorEvent.CoordinatorEventType.START_COMMIT) + && (!event.getCommitTime().equals(currentCommitTime)))) { + return; + } + + switch (event.getEventType()) { + case START_COMMIT: + startNewCommit(); + break; + case END_COMMIT: + endExistingCommit(); + break; + case WRITE_STATUS: + // Ignore stale write_status messages sent after + if (event.getMessage() != null + && currentState.equals(State.ENDED_COMMIT)) { + onReceiveWriteStatus(event.getMessage()); + } else { + 
LOG.warn("Could not process WRITE_STATUS due to missing message"); + } + break; + case ACK_COMMIT: + submitAckCommit(); + break; + case WRITE_STATUS_TIMEOUT: + handleWriteStatusTimeout(); + break; + default: + throw new IllegalStateException("Partition Coordinator has received an illegal event type " + event.getEventType().name()); + } + } catch (Exception exception) { + LOG.warn("Error received while polling the event loop in Partition Coordinator", exception); + } + } + + private void startNewCommit() { + numPartitions = partitionProvider.getLatestNumPartitions(configs.getString(BOOTSTRAP_SERVERS_CFG), partition.topic()); + partitionsWriteStatusReceived.clear(); + try { + currentCommitTime = transactionServices.startCommit(); + kafkaControlClient.publishMessage(buildControlMessage(ControlMessage.EventType.START_COMMIT)); + currentState = State.STARTED_COMMIT; + // schedule a timeout for ending the current commit + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.END_COMMIT, + partition.topic(), + currentCommitTime), + configs.getCommitIntervalSecs(), TimeUnit.SECONDS); + } catch (Exception exception) { + LOG.error(String.format("Failed to start a new commit %s, will retry", currentCommitTime), exception); + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + RESTART_COMMIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + } + + private void endExistingCommit() { + try { + kafkaControlClient.publishMessage(buildControlMessage(ControlMessage.EventType.END_COMMIT)); + } catch (Exception exception) { + LOG.warn(String.format("Could not send END_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception); + } + currentConsumedKafkaOffsets.clear(); + currentState = State.ENDED_COMMIT; + + // schedule a timeout for receiving all write statuses + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.WRITE_STATUS_TIMEOUT, + partition.topic(), + currentCommitTime), + configs.getCoordinatorWriteTimeoutSecs(), TimeUnit.SECONDS); + } + + private void onReceiveWriteStatus(ControlMessage message) { + ControlMessage.ParticipantInfo participantInfo = message.getParticipantInfo(); + int partitionId = message.getSenderPartition(); + partitionsWriteStatusReceived.put(partitionId, KafkaConnectUtils.getWriteStatuses(participantInfo)); + currentConsumedKafkaOffsets.put(partitionId, participantInfo.getKafkaOffset()); + if (partitionsWriteStatusReceived.size() >= numPartitions + && currentState.equals(State.ENDED_COMMIT)) { + // Commit the kafka offsets to the commit file + try { + List allWriteStatuses = new ArrayList<>(); + partitionsWriteStatusReceived.forEach((key, value) -> allWriteStatuses.addAll(value)); + + long totalErrorRecords = (long) allWriteStatuses.stream().mapToDouble(WriteStatus::getTotalErrorRecords).sum(); + long totalRecords = (long) allWriteStatuses.stream().mapToDouble(WriteStatus::getTotalRecords).sum(); + boolean hasErrors = totalErrorRecords > 0; + + if (!hasErrors || configs.allowCommitOnErrors()) { + boolean success = transactionServices.endCommit(currentCommitTime, + allWriteStatuses, + transformKafkaOffsets(currentConsumedKafkaOffsets)); + + if (success) { + LOG.info("Commit " + currentCommitTime + " successful!"); + currentState = State.WRITE_STATUS_RCVD; + globalCommittedKafkaOffsets.putAll(currentConsumedKafkaOffsets); + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.ACK_COMMIT, + message.getTopicName(), + 
currentCommitTime)); + return; + } else { + LOG.error("Commit " + currentCommitTime + " failed!"); + } + } else if (hasErrors) { + LOG.error("Coordinator found errors when writing. Errors/Total=" + totalErrorRecords + "/" + totalRecords); + LOG.error("Printing out the top 100 errors"); + allWriteStatuses.stream().filter(WriteStatus::hasErrors).limit(100).forEach(ws -> { + LOG.error("Global error :", ws.getGlobalError()); + if (ws.getErrors().size() > 0) { + ws.getErrors().forEach((key, value) -> LOG.trace("Error for key:" + key + " is " + value)); + } + }); + } + + // Submit the next start commit, that will rollback the current commit. + currentState = State.FAILED_COMMIT; + LOG.warn("Current commit " + currentCommitTime + " failed, so starting a new commit after recovery delay"); + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + RESTART_COMMIT_DELAY_MS, TimeUnit.MILLISECONDS); + } catch (Exception exception) { + LOG.error("Fatal error while committing file", exception); + } + } + } + + private void handleWriteStatusTimeout() { + // If we are still stuck in ENDED_STATE + if (currentState.equals(State.ENDED_COMMIT)) { + currentState = State.WRITE_STATUS_TIMEDOUT; + LOG.warn("Current commit " + currentCommitTime + " failed after a write status timeout, so starting a new commit after recovery delay"); + // Submit the next start commit + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + RESTART_COMMIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + } + + private void submitAckCommit() { + try { + kafkaControlClient.publishMessage(buildControlMessage(ControlMessage.EventType.ACK_COMMIT)); + } catch (Exception exception) { + LOG.warn(String.format("Could not send ACK_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception); + } + currentState = State.ACKED_COMMIT; + + // Submit the next start commit + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + START_COMMIT_INIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + + private void initializeGlobalCommittedKafkaOffsets() { + try { + Map commitMetadata = transactionServices.fetchLatestExtraCommitMetadata(); + String latestKafkaOffsets = commitMetadata.get(KAFKA_OFFSET_KEY); + if (!StringUtils.isNullOrEmpty(latestKafkaOffsets)) { + LOG.info("Retrieved Raw Kafka offsets from Hudi Commit File " + latestKafkaOffsets); + globalCommittedKafkaOffsets = Arrays.stream(latestKafkaOffsets.split(KAFKA_OFFSET_DELIMITER)) + .map(entry -> entry.split(KAFKA_OFFSET_KV_DELIMITER)) + .collect(Collectors.toMap(entry -> Integer.parseInt(entry[0]), entry -> Long.parseLong(entry[1]))); + LOG.info("Initialized the kafka offset commits " + globalCommittedKafkaOffsets); + } + } catch (Exception exception) { + throw new HoodieException("Could not deserialize the kafka commit offsets", exception); + } + } + + private Map transformKafkaOffsets(Map kafkaOffsets) { + try { + String kafkaOffsetValue = kafkaOffsets.keySet().stream() + .map(key -> key + KAFKA_OFFSET_KV_DELIMITER + kafkaOffsets.get(key)) + .collect(Collectors.joining(KAFKA_OFFSET_DELIMITER)); + return Collections.singletonMap(KAFKA_OFFSET_KEY, kafkaOffsetValue); + } catch (Exception exception) { + throw new HoodieException("Could not serialize the kafka commit offsets", exception); + } + } + + private enum State { + INIT, + 
STARTED_COMMIT, + ENDED_COMMIT, + FAILED_COMMIT, + WRITE_STATUS_RCVD, + WRITE_STATUS_TIMEDOUT, + ACKED_COMMIT, + } + + /** + * Provides the current partitions of a Kafka Topic dynamically. + */ + public interface KafkaPartitionProvider { + int getLatestNumPartitions(String bootstrapServers, String topicName); + } + + private ControlMessage buildControlMessage(ControlMessage.EventType eventType) { + return ControlMessage.newBuilder() + .setProtocolVersion(KafkaConnectConfigs.CURRENT_PROTOCOL_VERSION) + .setType(eventType) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.COORDINATOR) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.PARTICIPANT) + .setCommitTime(currentCommitTime) + .setCoordinatorInfo( + ControlMessage.CoordinatorInfo.newBuilder() + .putAllGlobalKafkaCommitOffsets(globalCommittedKafkaOffsets) + .build() + ).build(); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java new file mode 100644 index 0000000000000..1d32d03c371bd --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.ControlMessage; +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.connect.writers.ConnectWriterProvider; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.connect.writers.KafkaConnectWriterProvider; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTaskContext; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; + +/** + * Implementation of the {@link TransactionParticipant} that coordinates the Hudi write transactions + * based on events from the {@link TransactionCoordinator} and manages the Hudi Writes for a specific Kafka Partition. 
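Before reading the full implementation, a toy stand-in can make the TransactionParticipant contract (the interface added later in this patch) easier to follow. The stub below is illustrative only, its class name is made up, and it skips the Hudi writer and control-protocol handling that the real ConnectTransactionParticipant performs.

    import org.apache.hudi.connect.ControlMessage;
    import org.apache.hudi.connect.transaction.TransactionParticipant;
    import org.apache.kafka.common.TopicPartition;
    import org.apache.kafka.connect.sink.SinkRecord;

    import java.util.ArrayList;
    import java.util.List;

    /** Toy participant: buffers records and only tracks the highest offset it has seen. */
    public class LoggingTransactionParticipant implements TransactionParticipant {

      private final TopicPartition partition;
      private final List<SinkRecord> buffered = new ArrayList<>();
      private long lastCommittedOffset = -1L;

      public LoggingTransactionParticipant(TopicPartition partition) {
        this.partition = partition;
      }

      @Override
      public void start() {
        // The real participant registers with the control agent and pauses the partition here.
      }

      @Override
      public void stop() {
        buffered.clear();
      }

      @Override
      public void buffer(SinkRecord record) {
        buffered.add(record);
      }

      @Override
      public void processRecords() {
        // The real participant drains the buffer through a Hudi writer here.
        for (SinkRecord record : buffered) {
          lastCommittedOffset = Math.max(lastCommittedOffset, record.kafkaOffset());
        }
        buffered.clear();
      }

      @Override
      public TopicPartition getPartition() {
        return partition;
      }

      @Override
      public void processControlEvent(ControlMessage message) {
        // The real participant reacts to START_COMMIT / END_COMMIT / ACK_COMMIT here.
      }

      @Override
      public long getLastKafkaCommittedOffset() {
        return lastCommittedOffset;
      }
    }

The real implementation below buffers records the same way, but only writes them once a START_COMMIT arrives and reports its write statuses back to the coordinator when the END_COMMIT for that transaction is received.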
+ */ +public class ConnectTransactionParticipant implements TransactionParticipant { + + private static final Logger LOG = LogManager.getLogger(ConnectTransactionParticipant.class); + + private final LinkedList buffer; + private final BlockingQueue controlEvents; + private final TopicPartition partition; + private final SinkTaskContext context; + private final KafkaControlAgent kafkaControlAgent; + private final ConnectWriterProvider writerProvider; + + private TransactionInfo ongoingTransactionInfo; + private long committedKafkaOffset; + + public ConnectTransactionParticipant(KafkaConnectConfigs configs, + TopicPartition partition, + KafkaControlAgent kafkaControlAgent, + SinkTaskContext context) throws HoodieException { + this(partition, kafkaControlAgent, context, new KafkaConnectWriterProvider(configs, partition)); + } + + public ConnectTransactionParticipant(TopicPartition partition, + KafkaControlAgent kafkaControlAgent, + SinkTaskContext context, + ConnectWriterProvider writerProvider) throws HoodieException { + this.buffer = new LinkedList<>(); + this.controlEvents = new LinkedBlockingQueue<>(); + this.partition = partition; + this.context = context; + this.writerProvider = writerProvider; + this.kafkaControlAgent = kafkaControlAgent; + this.ongoingTransactionInfo = null; + this.committedKafkaOffset = 0; + } + + @Override + public void start() { + LOG.info("Start Hudi Transaction Participant for partition " + partition.partition()); + this.kafkaControlAgent.registerTransactionParticipant(this); + context.pause(partition); + } + + @Override + public void stop() { + this.kafkaControlAgent.deregisterTransactionParticipant(this); + cleanupOngoingTransaction(); + } + + @Override + public void buffer(SinkRecord record) { + buffer.add(record); + } + + @Override + public void processControlEvent(ControlMessage message) { + controlEvents.add(message); + } + + @Override + public long getLastKafkaCommittedOffset() { + return committedKafkaOffset; + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processRecords() { + while (!controlEvents.isEmpty()) { + ControlMessage message = controlEvents.poll(); + switch (message.getType()) { + case START_COMMIT: + handleStartCommit(message); + break; + case END_COMMIT: + handleEndCommit(message); + break; + case ACK_COMMIT: + handleAckCommit(message); + break; + case WRITE_STATUS: + // ignore write status since its only processed by leader + break; + default: + throw new IllegalStateException("HudiTransactionParticipant received incorrect state " + message.getType().name()); + } + } + + writeRecords(); + } + + private void handleStartCommit(ControlMessage message) { + // If there is an existing/ongoing transaction locally + // but it failed globally since we received another START_COMMIT instead of an END_COMMIT or ACK_COMMIT, + // so close it and start new transaction + cleanupOngoingTransaction(); + // Resync the last committed Kafka offset from the leader + syncKafkaOffsetWithLeader(message); + context.resume(partition); + String currentCommitTime = message.getCommitTime(); + LOG.info("Started a new transaction after receiving START_COMMIT for commit " + currentCommitTime); + try { + ongoingTransactionInfo = new TransactionInfo<>(currentCommitTime, writerProvider.getWriter(currentCommitTime)); + ongoingTransactionInfo.setExpectedKafkaOffset(committedKafkaOffset); + } catch (Exception exception) { + LOG.warn("Error received while starting a new transaction", exception); + } + } + + private 
void handleEndCommit(ControlMessage message) { + if (ongoingTransactionInfo == null) { + LOG.warn(String.format("END_COMMIT %s is received while we were NOT in active transaction", message.getCommitTime())); + return; + } else if (!ongoingTransactionInfo.getCommitTime().equals(message.getCommitTime())) { + LOG.error(String.format("Fatal error received END_COMMIT with commit time %s while local transaction commit time %s", + message.getCommitTime(), ongoingTransactionInfo.getCommitTime())); + // Recovery: A new END_COMMIT from leader caused interruption to an existing transaction, + // explicitly reset Kafka commit offset to ensure no data loss + cleanupOngoingTransaction(); + syncKafkaOffsetWithLeader(message); + return; + } + + context.pause(partition); + ongoingTransactionInfo.commitInitiated(); + // send Writer Status Message and wait for ACK_COMMIT in async fashion + try { + //sendWriterStatus + List writeStatuses = ongoingTransactionInfo.getWriter().close(); + + ControlMessage writeStatusEvent = ControlMessage.newBuilder() + .setProtocolVersion(KafkaConnectConfigs.CURRENT_PROTOCOL_VERSION) + .setType(ControlMessage.EventType.WRITE_STATUS) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.PARTICIPANT) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.COORDINATOR) + .setReceiverPartition(ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) + .setCommitTime(ongoingTransactionInfo.getCommitTime()) + .setParticipantInfo( + ControlMessage.ParticipantInfo.newBuilder() + .setWriteStatus(KafkaConnectUtils.buildWriteStatuses(writeStatuses)) + .setKafkaOffset(ongoingTransactionInfo.getExpectedKafkaOffset()) + .build() + ).build(); + + kafkaControlAgent.publishMessage(writeStatusEvent); + } catch (Exception exception) { + LOG.error(String.format("Error writing records and ending commit %s for partition %s", message.getCommitTime(), partition.partition()), exception); + throw new HoodieIOException(String.format("Error writing records and ending commit %s for partition %s", message.getCommitTime(), partition.partition()), + new IOException(exception)); + } + } + + private void handleAckCommit(ControlMessage message) { + // Update committedKafkaOffset that tracks the last committed kafka offset locally. 
+ if (ongoingTransactionInfo != null && committedKafkaOffset < ongoingTransactionInfo.getExpectedKafkaOffset()) { + committedKafkaOffset = ongoingTransactionInfo.getExpectedKafkaOffset(); + } + syncKafkaOffsetWithLeader(message); + cleanupOngoingTransaction(); + } + + private void writeRecords() { + if (ongoingTransactionInfo != null && !ongoingTransactionInfo.isCommitInitiated()) { + while (!buffer.isEmpty()) { + try { + SinkRecord record = buffer.peek(); + if (record != null + && record.kafkaOffset() == ongoingTransactionInfo.getExpectedKafkaOffset()) { + ongoingTransactionInfo.getWriter().writeRecord(record); + ongoingTransactionInfo.setExpectedKafkaOffset(record.kafkaOffset() + 1); + } else if (record != null && record.kafkaOffset() > ongoingTransactionInfo.getExpectedKafkaOffset()) { + LOG.warn(String.format("Received a kafka record with offset %s above the next expected kafka offset %s for partition %s, " + + "hence resetting the kafka offset to %s", + record.kafkaOffset(), + ongoingTransactionInfo.getExpectedKafkaOffset(), + partition, + ongoingTransactionInfo.getExpectedKafkaOffset())); + context.offset(partition, ongoingTransactionInfo.getExpectedKafkaOffset()); + } else if (record != null && record.kafkaOffset() < ongoingTransactionInfo.getExpectedKafkaOffset()) { + LOG.warn(String.format("Received a kafka record with offset %s below the next expected kafka offset %s for partition %s, " + + "no action will be taken but this record will be ignored since its already written", + record.kafkaOffset(), + ongoingTransactionInfo.getExpectedKafkaOffset(), + partition)); + } + buffer.poll(); + } catch (Exception exception) { + LOG.warn(String.format("Error received while writing records for transaction %s in partition %s", + ongoingTransactionInfo.getCommitTime(), partition.partition()), + exception); + } + } + } + } + + private void cleanupOngoingTransaction() { + if (ongoingTransactionInfo != null) { + try { + ongoingTransactionInfo.getWriter().close(); + ongoingTransactionInfo = null; + } catch (HoodieIOException exception) { + LOG.warn("Error received while trying to cleanup existing transaction", exception); + } + } + } + + private void syncKafkaOffsetWithLeader(ControlMessage message) { + if (message.getCoordinatorInfo().getGlobalKafkaCommitOffsetsMap().containsKey(partition.partition())) { + Long coordinatorCommittedKafkaOffset = message.getCoordinatorInfo().getGlobalKafkaCommitOffsetsMap().get(partition.partition()); + // Recover kafka committed offsets, treating the commit offset from the coordinator + // as the source of truth + if (coordinatorCommittedKafkaOffset != null && coordinatorCommittedKafkaOffset >= 0) { + // Debug only messages + if (coordinatorCommittedKafkaOffset != committedKafkaOffset) { + LOG.warn(String.format("The coordinator offset for kafka partition %s is %d while the locally committed offset is %d, " + + "hence resetting the local committed offset to the coordinator provided one to ensure consistency", + partition, + coordinatorCommittedKafkaOffset, + committedKafkaOffset)); + } + committedKafkaOffset = coordinatorCommittedKafkaOffset; + return; + } + } else { + LOG.warn(String.format("The coordinator offset for kafka partition %s is not present while the locally committed offset is %d, " + + "hence resetting the local committed offset to 0 to avoid data loss", + partition, + committedKafkaOffset)); + } + // If the coordinator does not have a committed offset for this partition, reset to zero offset. 
+ committedKafkaOffset = 0; + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java new file mode 100644 index 0000000000000..f9f467a83bec7 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.connect.ControlMessage; + +/** + * The events within the Coordinator that trigger + * the state changes in the state machine of + * the Coordinator. + */ +public class CoordinatorEvent { + + private final CoordinatorEventType eventType; + private final String topicName; + private final String commitTime; + private ControlMessage message; + + public CoordinatorEvent(CoordinatorEventType eventType, + String topicName, + String commitTime) { + this.eventType = eventType; + this.topicName = topicName; + this.commitTime = commitTime; + } + + public CoordinatorEventType getEventType() { + return eventType; + } + + public String getTopicName() { + return topicName; + } + + public String getCommitTime() { + return commitTime; + } + + public ControlMessage getMessage() { + return message; + } + + public void setMessage(ControlMessage message) { + this.message = message; + } + + /** + * The type of Coordinator Event. + */ + public enum CoordinatorEventType { + START_COMMIT, + END_COMMIT, + WRITE_STATUS, + ACK_COMMIT, + WRITE_STATUS_TIMEOUT + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java new file mode 100644 index 0000000000000..d6759d84c8858 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.connect.ControlMessage; + +import org.apache.kafka.common.TopicPartition; + +/** + * The Base Coordinator that + * coordinates the write transactions + * across all the Kafka partitions, that + * are managed by the {@link TransactionParticipant}. + */ +public interface TransactionCoordinator { + + void start(); + + void stop(); + + /* Kafka Topic that this Coordinator belongs to */ + TopicPartition getPartition(); + + /* Called when a control event is received from the Kafka control topic */ + void processControlEvent(ControlMessage message); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionInfo.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionInfo.java new file mode 100644 index 0000000000000..7c1852e5fa5c6 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionInfo.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.connect.writers.ConnectWriter; + +/** + * Stores all the state for the current Transaction within a + * {@link TransactionParticipant}. + * @param The type of status returned by the underlying writer. 
+ */
+public class TransactionInfo<T> {
+
+  private final String commitTime;
+  private final ConnectWriter<T> writer;
+  private long expectedKafkaOffset;
+  private boolean commitInitiated;
+
+  public TransactionInfo(String commitTime, ConnectWriter<T> writer) {
+    this.commitTime = commitTime;
+    this.writer = writer;
+    this.expectedKafkaOffset = 0;
+    this.commitInitiated = false;
+  }
+
+  public String getCommitTime() {
+    return commitTime;
+  }
+
+  public ConnectWriter<T> getWriter() {
+    return writer;
+  }
+
+  public long getExpectedKafkaOffset() {
+    return expectedKafkaOffset;
+  }
+
+  public boolean isCommitInitiated() {
+    return commitInitiated;
+  }
+
+  public void setExpectedKafkaOffset(long expectedKafkaOffset) {
+    this.expectedKafkaOffset = expectedKafkaOffset;
+  }
+
+  public void commitInitiated() {
+    this.commitInitiated = true;
+  }
+}
diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java
new file mode 100644
index 0000000000000..d27b14ef47644
--- /dev/null
+++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.connect.transaction;
+
+import org.apache.hudi.connect.ControlMessage;
+
+import org.apache.kafka.common.TopicPartition;
+import org.apache.kafka.connect.sink.SinkRecord;
+
+/**
+ * Interface for the Participant that
+ * manages Writes for a
+ * single Kafka partition, based on
+ * coordination signals from the {@link TransactionCoordinator}.
+ */
+public interface TransactionParticipant {
+
+  void start();
+
+  void stop();
+
+  void buffer(SinkRecord record);
+
+  void processRecords();
+
+  TopicPartition getPartition();
+
+  void processControlEvent(ControlMessage message);
+
+  long getLastKafkaCommittedOffset();
+}
diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java
new file mode 100644
index 0000000000000..6b08bae2af94b
--- /dev/null
+++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.utils; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SerializationUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.connect.ControlMessage; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.CustomAvroKeyGenerator; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import com.google.protobuf.ByteString; +import org.apache.hadoop.conf.Configuration; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.DescribeTopicsResult; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.common.KafkaFuture; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitOption; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.stream.Collectors; + +/** + * Helper methods for Kafka. + */ +public class KafkaConnectUtils { + + private static final Logger LOG = LogManager.getLogger(KafkaConnectUtils.class); + private static final String HOODIE_CONF_PREFIX = "hoodie."; + public static final String HADOOP_CONF_DIR = "HADOOP_CONF_DIR"; + public static final String HADOOP_HOME = "HADOOP_HOME"; + private static final List DEFAULT_HADOOP_CONF_FILES; + + static { + DEFAULT_HADOOP_CONF_FILES = new ArrayList<>(); + try { + String hadoopConfigPath = System.getenv(HADOOP_CONF_DIR); + String hadoopHomePath = System.getenv(HADOOP_HOME); + DEFAULT_HADOOP_CONF_FILES.addAll(getHadoopConfigFiles(hadoopConfigPath, hadoopHomePath)); + if (!DEFAULT_HADOOP_CONF_FILES.isEmpty()) { + LOG.info(String.format("Found Hadoop default config files %s", DEFAULT_HADOOP_CONF_FILES)); + } + } catch (IOException e) { + LOG.error("An error occurred while getting the default Hadoop configuration. 
" + + "Please use hadoop.conf.dir or hadoop.home to configure Hadoop environment variables", e); + } + } + + /** + * Get hadoop config files by HADOOP_CONF_DIR or HADOOP_HOME + */ + public static List getHadoopConfigFiles(String hadoopConfigPath, String hadoopHomePath) + throws IOException { + List hadoopConfigFiles = new ArrayList<>(); + if (!StringUtils.isNullOrEmpty(hadoopConfigPath)) { + hadoopConfigFiles.addAll(walkTreeForXml(Paths.get(hadoopConfigPath))); + } + if (hadoopConfigFiles.isEmpty() && !StringUtils.isNullOrEmpty(hadoopHomePath)) { + hadoopConfigFiles.addAll(walkTreeForXml(Paths.get(hadoopHomePath, "etc", "hadoop"))); + } + return hadoopConfigFiles; + } + + /** + * Files walk to find xml + */ + private static List walkTreeForXml(Path basePath) throws IOException { + if (Files.notExists(basePath)) { + return new ArrayList<>(); + } + return Files.walk(basePath, FileVisitOption.FOLLOW_LINKS) + .filter(path -> path.toFile().isFile()) + .filter(path -> path.toString().endsWith(".xml")) + .collect(Collectors.toList()); + } + + public static int getLatestNumPartitions(String bootstrapServers, String topicName) { + Properties props = new Properties(); + props.put("bootstrap.servers", bootstrapServers); + try { + AdminClient client = AdminClient.create(props); + DescribeTopicsResult result = client.describeTopics(Arrays.asList(topicName)); + Map> values = result.values(); + KafkaFuture topicDescription = values.get(topicName); + int numPartitions = topicDescription.get().partitions().size(); + LOG.info(String.format("Latest number of partitions for topic %s is %s", topicName, numPartitions)); + return numPartitions; + } catch (Exception exception) { + throw new HoodieException("Fatal error fetching the latest partition of kafka topic name" + topicName, exception); + } + } + + /** + * Returns the default Hadoop Configuration. + * + * @return + */ + public static Configuration getDefaultHadoopConf(KafkaConnectConfigs connectConfigs) { + Configuration hadoopConf = new Configuration(); + + // add hadoop config files + if (!StringUtils.isNullOrEmpty(connectConfigs.getHadoopConfDir()) + || !StringUtils.isNullOrEmpty(connectConfigs.getHadoopConfHome())) { + try { + List configFiles = getHadoopConfigFiles(connectConfigs.getHadoopConfDir(), + connectConfigs.getHadoopConfHome()); + configFiles.forEach(f -> + hadoopConf.addResource(new org.apache.hadoop.fs.Path(f.toAbsolutePath().toUri()))); + } catch (Exception e) { + throw new HoodieException("Failed to read hadoop configuration!", e); + } + } else { + DEFAULT_HADOOP_CONF_FILES.forEach(f -> + hadoopConf.addResource(new org.apache.hadoop.fs.Path(f.toAbsolutePath().toUri()))); + } + + connectConfigs.getProps().keySet().stream().filter(prop -> { + // In order to prevent printing unnecessary warn logs, here filter out the hoodie + // configuration items before passing to hadoop/hive configs + return !prop.toString().startsWith(HOODIE_CONF_PREFIX); + }).forEach(prop -> { + hadoopConf.set(prop.toString(), connectConfigs.getProps().get(prop.toString()).toString()); + }); + return hadoopConf; + } + + /** + * Extract the record fields. + * + * @param keyGenerator key generator Instance of the keygenerator. + * @return Returns the record key columns separated by comma. 
+ */ + public static String getRecordKeyColumns(KeyGenerator keyGenerator) { + return String.join(",", keyGenerator.getRecordKeyFieldNames()); + } + + /** + * Extract partition columns directly if an instance of class {@link BaseKeyGenerator}, + * else extract partition columns from the properties. + * + * @param keyGenerator key generator Instance of the keygenerator. + * @param typedProperties properties from the config. + * @return partition columns Returns the partition columns separated by comma. + */ + public static String getPartitionColumns(KeyGenerator keyGenerator, TypedProperties typedProperties) { + if (keyGenerator instanceof CustomAvroKeyGenerator) { + return ((BaseKeyGenerator) keyGenerator).getPartitionPathFields().stream().map( + pathField -> Arrays.stream(pathField.split(CustomAvroKeyGenerator.SPLIT_REGEX)) + .findFirst().orElse("Illegal partition path field format: '$pathField' for ${c.getClass.getSimpleName}")) + .collect(Collectors.joining(",")); + } + + if (keyGenerator instanceof BaseKeyGenerator) { + return String.join(",", ((BaseKeyGenerator) keyGenerator).getPartitionPathFields()); + } + + return typedProperties.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()); + } + + /** + * Get the Metadata from the latest commit file. + * + * @param metaClient The {@link HoodieTableMetaClient} to get access to the meta data. + * @return An Optional {@link HoodieCommitMetadata} containing the meta data from the latest commit file. + */ + public static Option getCommitMetadataForLatestInstant(HoodieTableMetaClient metaClient) { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants() + .filter(instant -> (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE && instant.getAction().equals(HoodieActiveTimeline.COMMIT_ACTION)) + || (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ && instant.getAction().equals(HoodieActiveTimeline.DELTA_COMMIT_ACTION)) + ); + Option latestInstant = timeline.lastInstant(); + if (latestInstant.isPresent()) { + try { + byte[] data = timeline.getInstantDetails(latestInstant.get()).get(); + return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class)); + } catch (Exception e) { + throw new HoodieException("Failed to read schema from commit metadata", e); + } + } else { + return Option.empty(); + } + } + + public static String hashDigest(String stringToHash) { + MessageDigest md; + try { + md = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + LOG.error("Fatal error selecting hash algorithm", e); + throw new HoodieException(e); + } + byte[] digest = Objects.requireNonNull(md).digest(stringToHash.getBytes(StandardCharsets.UTF_8)); + return StringUtils.toHexString(digest).toUpperCase(); + } + + /** + * Build Protobuf message containing the Hudi {@link WriteStatus}. + * + * @param writeStatuses The list of Hudi {@link WriteStatus}. + * @return the protobuf message {@link org.apache.hudi.connect.ControlMessage.ConnectWriteStatus} + * that wraps the Hudi {@link WriteStatus}. + * @throws IOException thrown if the conversion failed. + */ + public static ControlMessage.ConnectWriteStatus buildWriteStatuses(List writeStatuses) throws IOException { + return ControlMessage.ConnectWriteStatus.newBuilder() + .setSerializedWriteStatus( + ByteString.copyFrom( + SerializationUtils.serialize(writeStatuses))) + .build(); + } + + /** + * Unwrap the Hudi {@link WriteStatus} from the received Protobuf message. 
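+   * <p>This is the counterpart of {@link #buildWriteStatuses(List)}: the participant serializes
+   * its write statuses via {@code SerializationUtils} into the Protobuf {@code ConnectWriteStatus}
+   * wrapper, and the coordinator unwraps and deserializes them here.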
+ * + * @param participantInfo The {@link ControlMessage.ParticipantInfo} that contains the + * underlying {@link WriteStatus} sent by the participants. + * @return the list of {@link WriteStatus} returned by Hudi on a write transaction. + */ + public static List getWriteStatuses(ControlMessage.ParticipantInfo participantInfo) { + ControlMessage.ConnectWriteStatus connectWriteStatus = participantInfo.getWriteStatus(); + return SerializationUtils.deserialize(connectWriteStatus.getSerializedWriteStatus().toByteArray()); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java new file mode 100644 index 0000000000000..649150d16c828 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.schema.SchemaProvider; +import org.apache.hudi.utilities.sources.helpers.AvroConvertor; + +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.List; + +/** + * Base Hudi Writer that manages reading the raw Kafka records and + * converting them to {@link HoodieRecord}s that can be written to Hudi by + * the derived implementations of this class. 
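+ * <p>The conversion path depends on the {@code value.converter} configured for the connector:
+ * Avro payloads from {@code AvroConverter} are used as-is, string payloads from
+ * {@code StringConverter} are treated as JSON and converted through {@code AvroConvertor},
+ * while the JSON converter is currently rejected (see {@code writeRecord} below).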
+ */ +public abstract class AbstractConnectWriter implements ConnectWriter { + + public static final String KAFKA_AVRO_CONVERTER = "io.confluent.connect.avro.AvroConverter"; + public static final String KAFKA_JSON_CONVERTER = "org.apache.kafka.connect.json.JsonConverter"; + public static final String KAFKA_STRING_CONVERTER = "org.apache.kafka.connect.storage.StringConverter"; + private static final Logger LOG = LogManager.getLogger(AbstractConnectWriter.class); + protected final String instantTime; + + private final KeyGenerator keyGenerator; + private final SchemaProvider schemaProvider; + protected final KafkaConnectConfigs connectConfigs; + + public AbstractConnectWriter(KafkaConnectConfigs connectConfigs, + KeyGenerator keyGenerator, + SchemaProvider schemaProvider, String instantTime) { + this.connectConfigs = connectConfigs; + this.keyGenerator = keyGenerator; + this.schemaProvider = schemaProvider; + this.instantTime = instantTime; + } + + @Override + public void writeRecord(SinkRecord record) throws IOException { + AvroConvertor convertor = new AvroConvertor(schemaProvider.getSourceSchema()); + Option avroRecord; + switch (connectConfigs.getKafkaValueConverter()) { + case KAFKA_AVRO_CONVERTER: + avroRecord = Option.of((GenericRecord) record.value()); + break; + case KAFKA_STRING_CONVERTER: + avroRecord = Option.of(convertor.fromJson((String) record.value())); + break; + case KAFKA_JSON_CONVERTER: + throw new UnsupportedEncodingException("Currently JSON objects are not supported"); + default: + throw new IOException("Unsupported Kafka Format type (" + connectConfigs.getKafkaValueConverter() + ")"); + } + + // Tag records with a file ID based on kafka partition and hudi partition. + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord)); + String fileId = KafkaConnectUtils.hashDigest(String.format("%s-%s", record.kafkaPartition(), hoodieRecord.getPartitionPath())); + hoodieRecord.unseal(); + hoodieRecord.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId)); + hoodieRecord.setNewLocation(new HoodieRecordLocation(instantTime, fileId)); + hoodieRecord.seal(); + writeHudiRecord(hoodieRecord); + } + + @Override + public List close() { + return flushRecords(); + } + + protected abstract void writeHudiRecord(HoodieRecord record); + + protected abstract List flushRecords(); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java new file mode 100644 index 0000000000000..111c74f3a4883 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.HoodieRecordSizeEstimator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.IOUtils; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +/** + * Specific implementation of a Hudi Writer that buffers all incoming records, + * and writes them to Hudi files on the end of a transaction using Bulk Insert. + */ +public class BufferedConnectWriter extends AbstractConnectWriter { + + private static final Logger LOG = LogManager.getLogger(BufferedConnectWriter.class); + + private final HoodieEngineContext context; + private final HoodieJavaWriteClient writeClient; + private final HoodieWriteConfig config; + private ExternalSpillableMap> bufferedRecords; + + public BufferedConnectWriter(HoodieEngineContext context, + HoodieJavaWriteClient writeClient, + String instantTime, + KafkaConnectConfigs connectConfigs, + HoodieWriteConfig config, + KeyGenerator keyGenerator, + SchemaProvider schemaProvider) { + super(connectConfigs, keyGenerator, schemaProvider, instantTime); + this.context = context; + this.writeClient = writeClient; + this.config = config; + init(); + } + + private void init() { + try { + // Load and batch all incoming records in a map + long memoryForMerge = IOUtils.getMaxMemoryPerPartitionMerge(context.getTaskContextSupplier(), config); + LOG.info("MaxMemoryPerPartitionMerge => " + memoryForMerge); + this.bufferedRecords = new ExternalSpillableMap<>(memoryForMerge, + config.getSpillableMapBasePath(), + new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(new Schema.Parser().parse(config.getSchema())), + config.getCommonConfig().getSpillableDiskMapType(), + config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()); + } catch (IOException io) { + throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io); + } + } + + @Override + public void writeHudiRecord(HoodieRecord record) { + bufferedRecords.put(record.getRecordKey(), record); + } + + @Override + public List flushRecords() { + try { + LOG.info("Number of entries in MemoryBasedMap => " + + bufferedRecords.getInMemoryMapNumEntries() + + ", Total size in bytes of MemoryBasedMap => " + + bufferedRecords.getCurrentInMemoryMapSize() + ", Number of entries in BitCaskDiskMap => " + + bufferedRecords.getDiskBasedMapNumEntries() + ", Size of file spilled to disk => " + + bufferedRecords.getSizeOfFileOnDiskInBytes()); + List writeStatuses = new ArrayList<>(); + + boolean isMorTable = Option.ofNullable(connectConfigs.getString(HoodieTableConfig.TYPE)) + .map(t -> t.equals(HoodieTableType.MERGE_ON_READ.name())) 
+ .orElse(false); + + // Write out all records if non-empty + if (!bufferedRecords.isEmpty()) { + if (isMorTable) { + writeStatuses = writeClient.upsertPreppedRecords( + new LinkedList<>(bufferedRecords.values()), + instantTime); + } else { + writeStatuses = writeClient.bulkInsertPreppedRecords( + new LinkedList<>(bufferedRecords.values()), + instantTime, Option.empty()); + } + } + bufferedRecords.close(); + LOG.info("Flushed hudi records and got writeStatuses: " + writeStatuses); + return writeStatuses; + } catch (Exception e) { + throw new HoodieIOException("Write records failed", new IOException(e)); + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectTransactionServices.java new file mode 100644 index 0000000000000..2ce44ff802bc9 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectTransactionServices.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.transaction.TransactionCoordinator; + +import java.util.List; +import java.util.Map; + +/** + * Transaction service APIs used by + * {@link TransactionCoordinator}. + */ +public interface ConnectTransactionServices { + + String startCommit(); + + boolean endCommit(String commitTime, List writeStatuses, Map extraMetadata); + + Map fetchLatestExtraCommitMetadata(); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java new file mode 100644 index 0000000000000..7249d4758ce38 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect.writers; + +import org.apache.kafka.connect.sink.SinkRecord; + +import java.io.IOException; +import java.util.List; + +public interface ConnectWriter { + + void writeRecord(SinkRecord record) throws IOException; + + List close(); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriterProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriterProvider.java new file mode 100644 index 0000000000000..87deedc8c834e --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriterProvider.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +public interface ConnectWriterProvider { + + ConnectWriter getWriter(String commitTime); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java new file mode 100644 index 0000000000000..e4543c692db86 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.hive.HiveSyncTool; +import org.apache.hudi.schema.FilebasedSchemaProvider; + +import javax.annotation.concurrent.Immutable; + +import java.util.Map; +import java.util.Properties; + +/** + * Class storing configs for the HoodieWriteClient. 
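+ * <p>In a running connector these configs are populated from the task property map, but they can
+ * also be assembled programmatically, e.g. in tests. A minimal, illustrative sketch:
+ * <pre>{@code
+ *   KafkaConnectConfigs configs = KafkaConnectConfigs.newBuilder()
+ *       .withBootstrapServers("localhost:9092")
+ *       .withControlTopicName("hudi-control-topic")
+ *       .withCommitIntervalSecs(60L)
+ *       .build();
+ * }</pre>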
+ */ +@Immutable +@ConfigClassProperty(name = "Kafka Sink Connect Configurations", + groupName = ConfigGroups.Names.KAFKA_CONNECT, + description = "Configurations for Kafka Connect Sink Connector for Hudi.") +public class KafkaConnectConfigs extends HoodieConfig { + + public static final int CURRENT_PROTOCOL_VERSION = 0; + public static final String KAFKA_VALUE_CONVERTER = "value.converter"; + + public static final ConfigProperty KAFKA_BOOTSTRAP_SERVERS = ConfigProperty + .key("bootstrap.servers") + .defaultValue("localhost:9092") + .withDocumentation("The bootstrap servers for the Kafka Cluster."); + + public static final ConfigProperty CONTROL_TOPIC_NAME = ConfigProperty + .key("hoodie.kafka.control.topic") + .defaultValue("hudi-control-topic") + .withDocumentation("Kafka topic name used by the Hudi Sink Connector for " + + "sending and receiving control messages. Not used for data records."); + + public static final ConfigProperty SCHEMA_PROVIDER_CLASS = ConfigProperty + .key("hoodie.schemaprovider.class") + .defaultValue(FilebasedSchemaProvider.class.getName()) + .withDocumentation("subclass of org.apache.hudi.schema.SchemaProvider " + + "to attach schemas to input & target table data, built in options: " + + "org.apache.hudi.schema.FilebasedSchemaProvider."); + + public static final ConfigProperty COMMIT_INTERVAL_SECS = ConfigProperty + .key("hoodie.kafka.commit.interval.secs") + .defaultValue("60") + .withDocumentation("The interval at which Hudi will commit the records written " + + "to the files, making them consumable on the read-side."); + + public static final ConfigProperty COORDINATOR_WRITE_TIMEOUT_SECS = ConfigProperty + .key("hoodie.kafka.coordinator.write.timeout.secs") + .defaultValue("300") + .withDocumentation("The timeout after sending an END_COMMIT until when " + + "the coordinator will wait for the write statuses from all the partitions" + + "to ignore the current commit and start a new commit."); + + public static final ConfigProperty ASYNC_COMPACT_ENABLE = ConfigProperty + .key("hoodie.kafka.compaction.async.enable") + .defaultValue("true") + .withDocumentation("Controls whether async compaction should be turned on for MOR table writing."); + + public static final ConfigProperty META_SYNC_ENABLE = ConfigProperty + .key("hoodie.meta.sync.enable") + .defaultValue("false") + .withDocumentation("Enable Meta Sync such as Hive"); + + public static final ConfigProperty META_SYNC_CLASSES = ConfigProperty + .key("hoodie.meta.sync.classes") + .defaultValue(HiveSyncTool.class.getName()) + .withDocumentation("Meta sync client tool, using comma to separate multi tools"); + + public static final ConfigProperty ALLOW_COMMIT_ON_ERRORS = ConfigProperty + .key("hoodie.kafka.allow.commit.on.errors") + .defaultValue(true) + .withDocumentation("Commit even when some records failed to be written"); + + // Reference https://docs.confluent.io/kafka-connect-hdfs/current/configuration_options.html#hdfs + public static final ConfigProperty HADOOP_CONF_DIR = ConfigProperty + .key("hadoop.conf.dir") + .noDefaultValue() + .withDocumentation("The Hadoop configuration directory."); + + public static final ConfigProperty HADOOP_HOME = ConfigProperty + .key("hadoop.home") + .noDefaultValue() + .withDocumentation("The Hadoop home directory."); + + protected KafkaConnectConfigs() { + super(); + } + + protected KafkaConnectConfigs(Properties props) { + super(props); + } + + public static KafkaConnectConfigs.Builder newBuilder() { + return new KafkaConnectConfigs.Builder(); + } + + public String 
getBootstrapServers() { + return getString(KAFKA_BOOTSTRAP_SERVERS); + } + + public String getControlTopicName() { + return getString(CONTROL_TOPIC_NAME); + } + + public String getSchemaProviderClass() { + return getString(SCHEMA_PROVIDER_CLASS); + } + + public Long getCommitIntervalSecs() { + return getLong(COMMIT_INTERVAL_SECS); + } + + public Long getCoordinatorWriteTimeoutSecs() { + return getLong(COORDINATOR_WRITE_TIMEOUT_SECS); + } + + public String getKafkaValueConverter() { + return getString(KAFKA_VALUE_CONVERTER); + } + + public Boolean isAsyncCompactEnabled() { + return getBoolean(ASYNC_COMPACT_ENABLE); + } + + public Boolean isMetaSyncEnabled() { + return getBoolean(META_SYNC_ENABLE); + } + + public String getMetaSyncClasses() { + return getString(META_SYNC_CLASSES); + } + + public Boolean allowCommitOnErrors() { + return getBoolean(ALLOW_COMMIT_ON_ERRORS); + } + + public String getHadoopConfDir() { + return getString(HADOOP_CONF_DIR); + } + + public String getHadoopConfHome() { + return getString(HADOOP_HOME); + } + + public static final String HIVE_USE_PRE_APACHE_INPUT_FORMAT = "hoodie.datasource.hive_sync.use_pre_apache_input_format"; + public static final String HIVE_DATABASE = "hoodie.datasource.hive_sync.database"; + public static final String HIVE_TABLE = "hoodie.datasource.hive_sync.table"; + public static final String HIVE_USER = "hoodie.datasource.hive_sync.username"; + public static final String HIVE_PASS = "hoodie.datasource.hive_sync.password"; + public static final String HIVE_URL = "hoodie.datasource.hive_sync.jdbcurl"; + public static final String HIVE_PARTITION_FIELDS = "hoodie.datasource.hive_sync.partition_fields"; + public static final String HIVE_PARTITION_EXTRACTOR_CLASS = "hoodie.datasource.hive_sync.partition_extractor_class"; + public static final String HIVE_USE_JDBC = "hoodie.datasource.hive_sync.use_jdbc"; + public static final String HIVE_SYNC_MODE = "hoodie.datasource.hive_sync.mode"; + public static final String HIVE_AUTO_CREATE_DATABASE = "hoodie.datasource.hive_sync.auto_create_database"; + public static final String HIVE_IGNORE_EXCEPTIONS = "hoodie.datasource.hive_sync.ignore_exceptions"; + public static final String HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE = "hoodie.datasource.hive_sync.skip_ro_suffix"; + public static final String HIVE_SUPPORT_TIMESTAMP_TYPE = "hoodie.datasource.hive_sync.support_timestamp"; + public static final String HIVE_METASTORE_URIS = "hive.metastore.uris"; + + public static class Builder { + + protected final KafkaConnectConfigs connectConfigs = new KafkaConnectConfigs(); + + public Builder withBootstrapServers(String bootstrapServers) { + connectConfigs.setValue(KAFKA_BOOTSTRAP_SERVERS, bootstrapServers); + return this; + } + + public Builder withControlTopicName(String controlTopicName) { + connectConfigs.setValue(CONTROL_TOPIC_NAME, controlTopicName); + return this; + } + + public Builder withCommitIntervalSecs(Long commitIntervalSecs) { + connectConfigs.setValue(COMMIT_INTERVAL_SECS, String.valueOf(commitIntervalSecs)); + return this; + } + + public Builder withCoordinatorWriteTimeoutSecs(Long coordinatorWriteTimeoutSecs) { + connectConfigs.setValue(COORDINATOR_WRITE_TIMEOUT_SECS, String.valueOf(coordinatorWriteTimeoutSecs)); + return this; + } + + public Builder withAllowCommitOnErrors(Boolean allowCommitOnErrors) { + connectConfigs.setValue(ALLOW_COMMIT_ON_ERRORS, String.valueOf(allowCommitOnErrors)); + return this; + } + + // Kafka connect task are passed with props with type Map<> + public Builder 
withProperties(Map properties) { + connectConfigs.getProps().putAll(properties); + return this; + } + + public Builder withProperties(Properties properties) { + connectConfigs.getProps().putAll(properties); + return this; + } + + public Builder withHadoopConfDir(String hadoopConfDir) { + connectConfigs.setValue(HADOOP_CONF_DIR, String.valueOf(hadoopConfDir)); + return this; + } + + public Builder withHadoopHome(String hadoopHome) { + connectConfigs.setValue(HADOOP_HOME, String.valueOf(hadoopHome)); + return this; + } + + protected void setDefaults() { + // Check for mandatory properties + connectConfigs.setDefaults(KafkaConnectConfigs.class.getName()); + } + + public KafkaConnectConfigs build() { + setDefaults(); + // Build HudiConnectConfigs at the end + return new KafkaConnectConfigs(connectConfigs.getProps()); + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java new file mode 100644 index 0000000000000..f71d8480c3ef6 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; +import org.apache.hudi.sync.common.HoodieSyncConfig; +import org.apache.hudi.sync.common.util.SyncUtilHelpers; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Implementation of Transaction service APIs used by + * {@link TransactionCoordinator} + * using {@link HoodieJavaWriteClient}. + */ +public class KafkaConnectTransactionServices implements ConnectTransactionServices { + + private static final Logger LOG = LogManager.getLogger(KafkaConnectTransactionServices.class); + + private final KafkaConnectConfigs connectConfigs; + private final Option tableMetaClient; + private final Configuration hadoopConf; + private final HoodieWriteConfig writeConfig; + private final String tableBasePath; + private final String tableName; + private final HoodieEngineContext context; + + private final HoodieJavaWriteClient javaClient; + + public KafkaConnectTransactionServices(KafkaConnectConfigs connectConfigs) throws HoodieException { + this.connectConfigs = connectConfigs; + // This is the writeConfig for the Transaction Coordinator + this.writeConfig = HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withProperties(connectConfigs.getProps()) + .build(); + + tableBasePath = writeConfig.getBasePath(); + tableName = writeConfig.getTableName(); + hadoopConf = KafkaConnectUtils.getDefaultHadoopConf(connectConfigs); + context = new HoodieJavaEngineContext(hadoopConf); + + try { + KeyGenerator keyGenerator = HoodieAvroKeyGeneratorFactory.createAvroKeyGeneratorByType( + new TypedProperties(connectConfigs.getProps())); + String recordKeyFields = KafkaConnectUtils.getRecordKeyColumns(keyGenerator); + String partitionColumns = KafkaConnectUtils.getPartitionColumns(keyGenerator, + new TypedProperties(connectConfigs.getProps())); + + LOG.info(String.format("Setting record key %s and partition fields %s for table %s", + recordKeyFields, partitionColumns, tableBasePath + tableName)); + + tableMetaClient = Option.of(HoodieTableMetaClient.withPropertyBuilder() + .setTableType(HoodieTableType.COPY_ON_WRITE.name()) + .setTableName(tableName) + .setPayloadClassName(HoodieAvroPayload.class.getName()) + .setRecordKeyFields(recordKeyFields) + .setPartitionFields(partitionColumns) + 
.setKeyGeneratorClassProp(writeConfig.getKeyGeneratorClass()) + .fromProperties(connectConfigs.getProps()) + .initTable(hadoopConf, tableBasePath)); + + javaClient = new HoodieJavaWriteClient<>(context, writeConfig); + } catch (Exception exception) { + throw new HoodieException("Fatal error instantiating Hudi Transaction Services ", exception); + } + } + + @Override + public String startCommit() { + String newCommitTime = javaClient.startCommit(); + javaClient.transitionInflight(newCommitTime); + LOG.info("Starting Hudi commit " + newCommitTime); + return newCommitTime; + } + + @Override + public boolean endCommit(String commitTime, List writeStatuses, Map extraMetadata) { + boolean success = javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata)); + if (success) { + LOG.info("Ending Hudi commit " + commitTime); + + // Schedule clustering and compaction as needed. + if (writeConfig.isAsyncClusteringEnabled()) { + javaClient.scheduleClustering(Option.empty()).ifPresent( + instantTs -> LOG.info("Scheduled clustering at instant time:" + instantTs)); + } + if (isAsyncCompactionEnabled()) { + javaClient.scheduleCompaction(Option.empty()).ifPresent( + instantTs -> LOG.info("Scheduled compaction at instant time:" + instantTs)); + } + syncMeta(); + } + return success; + } + + @Override + public Map fetchLatestExtraCommitMetadata() { + if (tableMetaClient.isPresent()) { + Option metadata = KafkaConnectUtils.getCommitMetadataForLatestInstant(tableMetaClient.get()); + if (metadata.isPresent()) { + return metadata.get().getExtraMetadata(); + } else { + LOG.info("Hoodie Extra Metadata from latest commit is absent"); + return Collections.emptyMap(); + } + } + throw new HoodieException("Fatal error retrieving Hoodie Extra Metadata since Table Meta Client is absent"); + } + + private boolean isAsyncCompactionEnabled() { + return tableMetaClient.isPresent() + && HoodieTableType.MERGE_ON_READ.equals(tableMetaClient.get().getTableType()) + && connectConfigs.isAsyncCompactEnabled(); + } + + private void syncMeta() { + if (connectConfigs.isMetaSyncEnabled()) { + Set syncClientToolClasses = new HashSet<>( + Arrays.asList(connectConfigs.getMetaSyncClasses().split(","))); + FileSystem fs = FSUtils.getFs(tableBasePath, new Configuration()); + for (String impl : syncClientToolClasses) { + // TODO kafka connect config needs to support setting base file format + String baseFileFormat = connectConfigs.getStringOrDefault(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT); + SyncUtilHelpers.runHoodieMetaSync(impl.trim(), connectConfigs.getProps(), hadoopConf, fs, tableBasePath, baseFileFormat); + } + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java new file mode 100644 index 0000000000000..1a33560dc858a --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieArchivalConfig; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.connect.KafkaConnectFileIdPrefixProvider; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.hadoop.conf.Configuration; +import org.apache.kafka.common.TopicPartition; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collections; + +/** + * Provides the Hudi Writer for the {@link org.apache.hudi.connect.transaction.TransactionParticipant} + * to write the incoming records to Hudi. + */ +public class KafkaConnectWriterProvider implements ConnectWriterProvider { + + private static final Logger LOG = LogManager.getLogger(KafkaConnectWriterProvider.class); + + private final KafkaConnectConfigs connectConfigs; + private final HoodieEngineContext context; + private final HoodieWriteConfig writeConfig; + private final HoodieJavaWriteClient hudiJavaClient; + private final KeyGenerator keyGenerator; + private final SchemaProvider schemaProvider; + + public KafkaConnectWriterProvider( + KafkaConnectConfigs connectConfigs, + TopicPartition partition) throws HoodieException { + this.connectConfigs = connectConfigs; + Configuration hadoopConf = KafkaConnectUtils.getDefaultHadoopConf(connectConfigs); + + try { + this.schemaProvider = StringUtils.isNullOrEmpty(connectConfigs.getSchemaProviderClass()) ? 
null + : (SchemaProvider) ReflectionUtils.loadClass(connectConfigs.getSchemaProviderClass(), + new TypedProperties(connectConfigs.getProps())); + + this.keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator( + new TypedProperties(connectConfigs.getProps())); + + // This is the writeConfig for the writers for the individual Transaction Coordinators + writeConfig = HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withProperties(connectConfigs.getProps()) + .withFileIdPrefixProviderClassName(KafkaConnectFileIdPrefixProvider.class.getName()) + .withProps(Collections.singletonMap( + KafkaConnectFileIdPrefixProvider.KAFKA_CONNECT_PARTITION_ID, + String.valueOf(partition))) + .withSchema(schemaProvider.getSourceSchema().toString()) + .withAutoCommit(false) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()) + // participants should not trigger table services, and leave it to the coordinator + .withArchivalConfig(HoodieArchivalConfig.newBuilder().withAutoArchive(false).build()) + .withCleanConfig(HoodieCleanConfig.newBuilder().withAutoClean(false).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build()) + .withClusteringConfig(HoodieClusteringConfig.newBuilder().withInlineClustering(false).build()) + .build(); + + context = new HoodieJavaEngineContext(hadoopConf); + + hudiJavaClient = new HoodieJavaWriteClient<>(context, writeConfig); + } catch (Throwable e) { + throw new HoodieException("Fatal error instantiating Hudi Write Provider ", e); + } + } + + public AbstractConnectWriter getWriter(String commitTime) { + return new BufferedConnectWriter( + context, + hudiJavaClient, + commitTime, + connectConfigs, + writeConfig, + keyGenerator, + schemaProvider); + } +} diff --git a/hudi-kafka-connect/src/main/resources/ControlMessage.proto b/hudi-kafka-connect/src/main/resources/ControlMessage.proto new file mode 100644 index 0000000000000..5059897c3fe80 --- /dev/null +++ b/hudi-kafka-connect/src/main/resources/ControlMessage.proto @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = "proto3"; + +option java_multiple_files = true; +option java_package = "org.apache.hudi.connect"; +option java_outer_classname = "ConnectControl"; + +package connect; + +message ControlMessage { + uint32 protocolVersion = 1; + EventType type = 2; + string topic_name = 3; + EntityType sender_type = 4; + uint32 sender_partition = 5; + EntityType receiver_type = 6; + uint32 receiver_partition = 7; + string commitTime = 8; + oneof payload { + CoordinatorInfo coordinator_info = 9; + ParticipantInfo participant_info = 10; + } + + message CoordinatorInfo { + map globalKafkaCommitOffsets = 1; + } + + message ParticipantInfo { + ConnectWriteStatus writeStatus = 1; + uint64 kafkaOffset = 2; + } + + message ConnectWriteStatus { + bytes serializedWriteStatus = 1; + } + + enum EventType { + START_COMMIT = 0; + END_COMMIT = 1; + ACK_COMMIT = 2; + WRITE_STATUS = 3; + } + + enum EntityType { + COORDINATOR = 0; + PARTICIPANT = 1; + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java new file mode 100644 index 0000000000000..d939351a58f65 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.transaction.ConnectTransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.helper.MockConnectTransactionServices; +import org.apache.hudi.helper.MockKafkaControlAgent; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.sink.SinkRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; + +public class TestConnectTransactionCoordinator { + + private static final String TOPIC_NAME = "kafka-connect-test-topic"; + private static final int TOTAL_KAFKA_PARTITIONS = 4; + private static final int MAX_COMMIT_ROUNDS = 5; + private static final int TEST_TIMEOUT_SECS = 60; + + private KafkaConnectConfigs configs; + private MockParticipant participant; + private MockKafkaControlAgent kafkaControlAgent; + private MockConnectTransactionServices transactionServices; + private CountDownLatch latch; + + @BeforeEach + public void setUp() throws Exception { + transactionServices = new MockConnectTransactionServices(); + latch = new CountDownLatch(1); + } + + @ParameterizedTest + @EnumSource(value = MockParticipant.TestScenarios.class) + public void testSingleCommitScenario(MockParticipant.TestScenarios scenario) throws InterruptedException { + kafkaControlAgent = new MockKafkaControlAgent(); + participant = new MockParticipant(kafkaControlAgent, latch, scenario, MAX_COMMIT_ROUNDS); + participant.start(); + + KafkaConnectConfigs.Builder configBuilder = KafkaConnectConfigs.newBuilder() + .withCommitIntervalSecs(1L) + .withCoordinatorWriteTimeoutSecs(1L); + + if (scenario.equals(MockParticipant.TestScenarios.SUBSET_WRITE_STATUS_FAILED)) { + configBuilder.withAllowCommitOnErrors(false); + } + configs = configBuilder.build(); + + // Test the coordinator using the mock participant + TransactionCoordinator coordinator = new ConnectTransactionCoordinator( + configs, + new TopicPartition(TOPIC_NAME, 0), + kafkaControlAgent, + transactionServices, + (bootstrapServers, topicName) -> TOTAL_KAFKA_PARTITIONS); + coordinator.start(); + + latch.await(TEST_TIMEOUT_SECS, TimeUnit.SECONDS); + + if (latch.getCount() > 0) { + throw new HoodieException("Test timedout resulting in failure"); + } + coordinator.stop(); + participant.stop(); + } + + /** + * A mock Transaction Participant, that exercises all the test scenarios + * for the coordinator as mentioned in {@link TestScenarios}. 
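+   * <p>It plays the participant side of the protocol in-process: on END_COMMIT it fabricates
+   * WRITE_STATUS responses for some or all partitions (depending on the scenario), which lets the
+   * test drive the coordinator through successful, partially failed and failed commit rounds.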
+ */ + private static class MockParticipant implements TransactionParticipant { + + private final MockKafkaControlAgent kafkaControlAgent; + private final TopicPartition partition; + private final CountDownLatch latch; + private final TestScenarios testScenario; + private final int maxNumberCommitRounds; + private final Map kafkaOffsetsCommitted; + + private ControlMessage.EventType expectedMsgType; + private int numberCommitRounds; + + public MockParticipant(MockKafkaControlAgent kafkaControlAgent, + CountDownLatch latch, + TestScenarios testScenario, + int maxNumberCommitRounds) { + this.kafkaControlAgent = kafkaControlAgent; + this.latch = latch; + this.testScenario = testScenario; + this.maxNumberCommitRounds = maxNumberCommitRounds; + this.partition = new TopicPartition(TOPIC_NAME, (TOTAL_KAFKA_PARTITIONS - 1)); + this.kafkaOffsetsCommitted = new HashMap<>(); + expectedMsgType = ControlMessage.EventType.START_COMMIT; + numberCommitRounds = 0; + } + + @Override + public void start() { + kafkaControlAgent.registerTransactionParticipant(this); + } + + @Override + public void stop() { + kafkaControlAgent.deregisterTransactionParticipant(this); + } + + @Override + public void buffer(SinkRecord record) { + } + + @Override + public void processRecords() { + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processControlEvent(ControlMessage message) { + assertEquals(message.getSenderType(), ControlMessage.EntityType.COORDINATOR); + assertEquals(message.getTopicName(), partition.topic()); + testScenarios(message); + } + + @Override + public long getLastKafkaCommittedOffset() { + return 0; + } + + private void testScenarios(ControlMessage message) { + assertEquals(expectedMsgType, message.getType()); + switch (message.getType()) { + case START_COMMIT: + expectedMsgType = ControlMessage.EventType.END_COMMIT; + break; + case END_COMMIT: + assertEquals(kafkaOffsetsCommitted, message.getCoordinatorInfo().getGlobalKafkaCommitOffsets()); + int numPartitionsThatReportWriteStatus; + Map kafkaOffsets = new HashMap<>(); + List controlEvents = new ArrayList<>(); + switch (testScenario) { + case ALL_CONNECT_TASKS_SUCCESS: + composeControlEvent( + message.getCommitTime(), false, false, kafkaOffsets, controlEvents); + numPartitionsThatReportWriteStatus = TOTAL_KAFKA_PARTITIONS; + // This commit round should succeed, and the kafka offsets getting committed + kafkaOffsetsCommitted.putAll(kafkaOffsets); + expectedMsgType = ControlMessage.EventType.ACK_COMMIT; + break; + case ALL_CONNECT_TASKS_WITH_EMPTY_WRITE_STATUS: + composeControlEvent( + message.getCommitTime(), false, true, kafkaOffsets, controlEvents); + numPartitionsThatReportWriteStatus = TOTAL_KAFKA_PARTITIONS; + // This commit round should succeed, and the kafka offsets getting committed + kafkaOffsetsCommitted.putAll(kafkaOffsets); + expectedMsgType = ControlMessage.EventType.ACK_COMMIT; + break; + case SUBSET_WRITE_STATUS_FAILED_BUT_IGNORED: + composeControlEvent( + message.getCommitTime(), true, false, kafkaOffsets, controlEvents); + numPartitionsThatReportWriteStatus = TOTAL_KAFKA_PARTITIONS; + // Despite error records, this commit round should succeed, and the kafka offsets getting committed + kafkaOffsetsCommitted.putAll(kafkaOffsets); + expectedMsgType = ControlMessage.EventType.ACK_COMMIT; + break; + case SUBSET_WRITE_STATUS_FAILED: + composeControlEvent( + message.getCommitTime(), true, false, kafkaOffsets, controlEvents); + numPartitionsThatReportWriteStatus = 
TOTAL_KAFKA_PARTITIONS; + // This commit round should fail, and a new commit round should start without kafka offsets getting committed + expectedMsgType = ControlMessage.EventType.START_COMMIT; + break; + case SUBSET_CONNECT_TASKS_FAILED: + composeControlEvent( + message.getCommitTime(), false, false, kafkaOffsets, controlEvents); + numPartitionsThatReportWriteStatus = TOTAL_KAFKA_PARTITIONS / 2; + // This commit round should fail, and a new commit round should start without kafka offsets getting committed + expectedMsgType = ControlMessage.EventType.START_COMMIT; + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + + // Send events based on test scenario + for (int i = 0; i < numPartitionsThatReportWriteStatus; i++) { + kafkaControlAgent.publishMessage(controlEvents.get(i)); + } + break; + case ACK_COMMIT: + if (numberCommitRounds >= maxNumberCommitRounds) { + latch.countDown(); + } + expectedMsgType = ControlMessage.EventType.START_COMMIT; + break; + default: + throw new HoodieException("Illegal control message type " + message.getType()); + } + + if (message.getType().equals(ControlMessage.EventType.START_COMMIT)) { + if (numberCommitRounds >= maxNumberCommitRounds) { + latch.countDown(); + } + numberCommitRounds++; + expectedMsgType = ControlMessage.EventType.END_COMMIT; + } + } + + public enum TestScenarios { + SUBSET_CONNECT_TASKS_FAILED, + SUBSET_WRITE_STATUS_FAILED, + SUBSET_WRITE_STATUS_FAILED_BUT_IGNORED, + ALL_CONNECT_TASKS_SUCCESS, + ALL_CONNECT_TASKS_WITH_EMPTY_WRITE_STATUS + } + + private static void composeControlEvent( + String commitTime, boolean shouldIncludeFailedRecords, boolean useEmptyWriteStatus, + Map kafkaOffsets, List controlEvents) { + // Prepare the WriteStatuses for all partitions + for (int i = 1; i <= TOTAL_KAFKA_PARTITIONS; i++) { + try { + long kafkaOffset = (long) (Math.random() * 10000); + kafkaOffsets.put(i, kafkaOffset); + ControlMessage event = composeWriteStatusResponse( + commitTime, + new TopicPartition(TOPIC_NAME, i), + kafkaOffset, + shouldIncludeFailedRecords, + useEmptyWriteStatus); + controlEvents.add(event); + } catch (Exception exception) { + throw new HoodieException("Fatal error sending control event to Coordinator"); + } + } + } + + private static ControlMessage composeWriteStatusResponse(String commitTime, + TopicPartition partition, + long kafkaOffset, + boolean includeFailedRecords, + boolean useEmptyWriteStatus) throws Exception { + List writeStatusList = useEmptyWriteStatus ? Collections.emptyList() + : Collections.singletonList( + includeFailedRecords + ? 
getSubsetFailedRecordsWriteStatus() + : getAllSuccessfulRecordsWriteStatus()); + + return ControlMessage.newBuilder() + .setType(ControlMessage.EventType.WRITE_STATUS) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.PARTICIPANT) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.COORDINATOR) + .setReceiverPartition(ConnectTransactionCoordinator.COORDINATOR_KAFKA_PARTITION) + .setCommitTime(commitTime) + .setParticipantInfo( + ControlMessage.ParticipantInfo.newBuilder() + .setWriteStatus(KafkaConnectUtils.buildWriteStatuses(writeStatusList)) + .setKafkaOffset(kafkaOffset) + .build() + ).build(); + } + } + + private static WriteStatus getAllSuccessfulRecordsWriteStatus() { + // send WS + WriteStatus status = new WriteStatus(false, 0.0); + for (int i = 0; i < 1000; i++) { + status.markSuccess(mock(HoodieRecord.class), Option.empty()); + } + return status; + } + + private static WriteStatus getSubsetFailedRecordsWriteStatus() { + // send WS + WriteStatus status = new WriteStatus(false, 0.0); + for (int i = 0; i < 1000; i++) { + if (i % 10 == 0) { + status.markFailure(mock(HoodieRecord.class), new Throwable("Error writing record on disk"), Option.empty()); + } else { + status.markSuccess(mock(HoodieRecord.class), Option.empty()); + } + } + status.setGlobalError(new Throwable("More than one records failed to be written to storage")); + return status; + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java new file mode 100644 index 0000000000000..36da6ac32e01e --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.transaction.ConnectTransactionParticipant; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.helper.MockKafkaControlAgent; +import org.apache.hudi.helper.TestHudiWriterProvider; +import org.apache.hudi.helper.MockKafkaConnect; + +import org.apache.kafka.common.TopicPartition; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestConnectTransactionParticipant { + + private static final String TOPIC_NAME = "kafka-connect-test-topic"; + private static final int NUM_RECORDS_BATCH = 5; + private static final int PARTITION_NUMBER = 4; + + private ConnectTransactionParticipant participant; + private MockCoordinator mockCoordinator; + private TopicPartition partition; + private KafkaConnectConfigs configs; + private KafkaControlAgent kafkaControlAgent; + private TestHudiWriterProvider testHudiWriterProvider; + private MockKafkaConnect mockKafkaConnect; + + @BeforeEach + public void setUp() throws Exception { + partition = new TopicPartition(TOPIC_NAME, PARTITION_NUMBER); + kafkaControlAgent = new MockKafkaControlAgent(); + mockKafkaConnect = new MockKafkaConnect(partition); + mockCoordinator = new MockCoordinator(kafkaControlAgent); + mockCoordinator.start(); + configs = KafkaConnectConfigs.newBuilder() + .build(); + initializeParticipant(); + } + + @ParameterizedTest + @EnumSource(value = CoordinatorFailureTestScenarios.class) + public void testAllCoordinatorFailureScenarios(CoordinatorFailureTestScenarios testScenario) { + try { + assertTrue(mockKafkaConnect.isPaused()); + switch (testScenario) { + case REGULAR_SCENARIO: + break; + case COORDINATOR_FAILED_AFTER_START_COMMIT: + triggerAndProcessStartCommit(); + // Coordinator Failed + initializeCoordinator(); + break; + case COORDINATOR_FAILED_AFTER_END_COMMIT: + triggerAndProcessStartCommit(); + triggerAndProcessEndCommit(); + // Coordinator Failed + initializeCoordinator(); + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + + // Despite failures in the previous commit, a fresh 2-phase commit should PASS. + testTwoPhaseCommit(0); + } catch (Exception exception) { + throw new HoodieException("Unexpected test failure ", exception); + } + participant.stop(); + } + + @ParameterizedTest + @EnumSource(value = ParticipantFailureTestScenarios.class) + public void testAllParticipantFailureScenarios(ParticipantFailureTestScenarios testScenario) { + try { + int currentKafkaOffset = 0; + switch (testScenario) { + case FAILURE_BEFORE_START_COMMIT: + // Participant failing after START_COMMIT will not write any data in this commit cycle. + initializeParticipant(); + break; + case FAILURE_AFTER_START_COMMIT: + triggerAndProcessStartCommit(); + // Participant failing after START_COMMIT will not write any data in this commit cycle. 
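+          // Re-initializing the participant here simulates a Connect task restart mid-transaction:
+          // the fresh instance has no buffered records, so the END_COMMIT/ACK_COMMIT rounds below
+          // complete without advancing the committed Kafka offset for this cycle.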
+ initializeParticipant(); + triggerAndProcessEndCommit(); + triggerAndProcessAckCommit(); + break; + case FAILURE_AFTER_END_COMMIT: + // Regular Case or Coordinator Recovery Case + triggerAndProcessStartCommit(); + triggerAndProcessEndCommit(); + initializeParticipant(); + triggerAndProcessAckCommit(); + + // Participant failing after and END_COMMIT should not cause issues with the present commit, + // since the data would have been written by previous participant before failing + // and hence moved the kafka offset. + currentKafkaOffset = NUM_RECORDS_BATCH; + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + + // Despite failures in the previous commit, a fresh 2-phase commit should PASS. + testTwoPhaseCommit(currentKafkaOffset); + } catch (Exception exception) { + throw new HoodieException("Unexpected test failure ", exception); + } + } + + private void initializeParticipant() { + testHudiWriterProvider = new TestHudiWriterProvider(); + participant = new ConnectTransactionParticipant( + partition, + kafkaControlAgent, + mockKafkaConnect, + testHudiWriterProvider); + mockKafkaConnect.setParticipant(participant); + participant.start(); + } + + private void initializeCoordinator() { + mockCoordinator = new MockCoordinator(kafkaControlAgent); + mockCoordinator.start(); + } + + // Test and validate result of a single 2 Phase commit from START_COMMIT to ACK_COMMIT. + // Validates that NUM_RECORDS_BATCH number of kafka records are written, + // and the kafka offset only increments by NUM_RECORDS_BATCH. + private void testTwoPhaseCommit(long currentKafkaOffset) { + triggerAndProcessStartCommit(); + triggerAndProcessEndCommit(); + triggerAndProcessAckCommit(); + + // Validate records written, current kafka offset and kafka offsets committed across + // coordinator and participant are in sync despite failure scenarios. 
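+    // A passing round therefore means exactly one batch of NUM_RECORDS_BATCH records reached the
+    // writer and the committed Kafka offset advanced by exactly that amount from its starting value.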
+ assertEquals(NUM_RECORDS_BATCH, testHudiWriterProvider.getLatestNumberWrites()); + assertEquals((currentKafkaOffset + NUM_RECORDS_BATCH), mockKafkaConnect.getCurrentKafkaOffset()); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), mockCoordinator.getCommittedKafkaOffset()); + } + + private void triggerAndProcessStartCommit() { + mockCoordinator.sendEventFromCoordinator(ControlMessage.EventType.START_COMMIT); + mockKafkaConnect.publishBatchRecordsToParticipant(NUM_RECORDS_BATCH); + assertTrue(mockKafkaConnect.isResumed()); + } + + private void triggerAndProcessEndCommit() { + mockCoordinator.sendEventFromCoordinator(ControlMessage.EventType.END_COMMIT); + mockKafkaConnect.publishBatchRecordsToParticipant(0); + assertTrue(mockKafkaConnect.isPaused()); + } + + private void triggerAndProcessAckCommit() { + mockCoordinator.sendEventFromCoordinator(ControlMessage.EventType.ACK_COMMIT); + mockKafkaConnect.publishBatchRecordsToParticipant(0); + assertTrue(mockKafkaConnect.isPaused()); + } + + private static class MockCoordinator implements TransactionCoordinator { + + private static int currentCommitTime; + + static { + currentCommitTime = 101; + } + + private final KafkaControlAgent kafkaControlAgent; + private final TopicPartition partition; + + private Option lastReceivedWriteStatusEvent; + private long committedKafkaOffset; + + public MockCoordinator(KafkaControlAgent kafkaControlAgent) { + this.kafkaControlAgent = kafkaControlAgent; + partition = new TopicPartition(TOPIC_NAME, 0); + lastReceivedWriteStatusEvent = Option.empty(); + committedKafkaOffset = 0L; + } + + public void sendEventFromCoordinator(ControlMessage.EventType type) { + try { + if (type.equals(ControlMessage.EventType.START_COMMIT)) { + ++currentCommitTime; + } + kafkaControlAgent.publishMessage( + ControlMessage.newBuilder() + .setType(type) + .setTopicName(partition.topic()) + .setSenderType(ControlMessage.EntityType.COORDINATOR) + .setSenderPartition(partition.partition()) + .setReceiverType(ControlMessage.EntityType.PARTICIPANT) + .setCommitTime(String.valueOf(currentCommitTime)) + .setCoordinatorInfo( + ControlMessage.CoordinatorInfo.newBuilder() + .putAllGlobalKafkaCommitOffsets(Collections.singletonMap(PARTITION_NUMBER, committedKafkaOffset)) + .build() + ).build()); + } catch (Exception exception) { + throw new HoodieException("Fatal error sending control event to Participant"); + } + } + + public Option getLastReceivedWriteStatusEvent() { + return lastReceivedWriteStatusEvent; + } + + public long getCommittedKafkaOffset() { + return committedKafkaOffset; + } + + @Override + public void start() { + kafkaControlAgent.registerTransactionCoordinator(this); + } + + @Override + public void stop() { + kafkaControlAgent.deregisterTransactionCoordinator(this); + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processControlEvent(ControlMessage message) { + if (message.getType().equals(ControlMessage.EventType.WRITE_STATUS)) { + lastReceivedWriteStatusEvent = Option.of(message); + assertTrue(message.getParticipantInfo().getKafkaOffset() >= committedKafkaOffset); + committedKafkaOffset = message.getParticipantInfo().getKafkaOffset(); + } + } + } + + private enum CoordinatorFailureTestScenarios { + REGULAR_SCENARIO, + COORDINATOR_FAILED_AFTER_START_COMMIT, + COORDINATOR_FAILED_AFTER_END_COMMIT, + } + + private enum ParticipantFailureTestScenarios { + FAILURE_BEFORE_START_COMMIT, + 
FAILURE_AFTER_START_COMMIT, + FAILURE_AFTER_END_COMMIT, + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestHdfsConfiguration.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestHdfsConfiguration.java new file mode 100644 index 0000000000000..dca8f577bde8b --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestHdfsConfiguration.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect; + +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.List; + +public class TestHdfsConfiguration { + + private boolean checkFiles(List paths) { + paths.removeIf(p -> { + String fileName = p.toFile().getName(); + return fileName.equals("core-site.xml") || fileName.equals("hdfs-site.xml"); + }); + return paths.isEmpty(); + } + + @Test + public void testHadoopConfigEnvs() throws Exception { + List paths = KafkaConnectUtils.getHadoopConfigFiles( + "src/test/resources/hadoop_conf", ""); + assertEquals(paths.size(), 2); + assertTrue(checkFiles(paths)); + } + + @Test + public void testHadoopHomeEnvs() throws Exception { + List paths = KafkaConnectUtils.getHadoopConfigFiles( + "","src/test/resources/hadoop_home"); + assertEquals(paths.size(), 2); + assertTrue(checkFiles(paths)); + } + + @Test + public void testKafkaConfig() throws Exception { + KafkaConnectConfigs connectConfigs = KafkaConnectConfigs.newBuilder() + .withHadoopHome("src/test/resources/hadoop_home") + .build(); + List paths = KafkaConnectUtils.getHadoopConfigFiles( + connectConfigs.getHadoopConfDir(), + connectConfigs.getHadoopConfHome() + ); + assertEquals(paths.size(), 2); + assertTrue(checkFiles(paths)); + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockConnectTransactionServices.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockConnectTransactionServices.java new file mode 100644 index 0000000000000..b3314ade3d5b4 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockConnectTransactionServices.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.helper; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.writers.ConnectTransactionServices; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Helper class for {@link ConnectTransactionServices} to generate + * a unique commit time for testing purposes. + */ +public class MockConnectTransactionServices implements ConnectTransactionServices { + + private int commitTime; + + public MockConnectTransactionServices() { + commitTime = 100; + } + + @Override + public String startCommit() { + commitTime++; + return String.valueOf(commitTime); + } + + @Override + public boolean endCommit(String commitTime, List writeStatuses, Map extraMetadata) { + assertEquals(String.valueOf(this.commitTime), commitTime); + return true; + } + + @Override + public Map fetchLatestExtraCommitMetadata() { + return new HashMap<>(); + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaConnect.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaConnect.java new file mode 100644 index 0000000000000..113b93ef87123 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaConnect.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.helper; + +import org.apache.hudi.connect.transaction.TransactionParticipant; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTaskContext; + +import java.util.Arrays; +import java.util.Map; +import java.util.Set; + +/** + * Helper class that emulates the Kafka Connect f/w and additionally + * implements {@link SinkTaskContext} for testing purposes. + * + * Everytime the consumer (Participant) calls resume, a fixed + * batch of kafka records from the current offset are pushed. If + * the consumer resets the offsets, then a fresh batch of records + * are sent from the new offset. 
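+ *
+ * Calls to {@code pause}/{@code resume} flip an internal flag for the single test partition,
+ * and any {@code offset(...)} rewind triggers a replay of the batch from the new offset inside
+ * {@code publishBatchRecordsToParticipant}.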
+ */ +public class MockKafkaConnect implements SinkTaskContext { + + private final TopicPartition testPartition; + + private TransactionParticipant participant; + private long currentKafkaOffset; + private boolean isPaused; + private boolean isResetOffset; + + public MockKafkaConnect(TopicPartition testPartition) { + this.testPartition = testPartition; + isPaused = false; + currentKafkaOffset = 0L; + isResetOffset = false; + } + + public void setParticipant(TransactionParticipant participant) { + this.participant = participant; + } + + public boolean isPaused() { + return isPaused; + } + + public boolean isResumed() { + return !isPaused; + } + + public long getCurrentKafkaOffset() { + return currentKafkaOffset; + } + + @Override + public void pause(TopicPartition... partitions) { + if (Arrays.stream(partitions).allMatch(testPartition::equals)) { + isPaused = true; + } + } + + @Override + public void resume(TopicPartition... partitions) { + if (Arrays.stream(partitions).allMatch(testPartition::equals)) { + isPaused = false; + } + } + + @Override + public void offset(Map offsets) { + for (TopicPartition tp : offsets.keySet()) { + if (tp.equals(testPartition)) { + resetOffset(offsets.get(tp)); + } + } + } + + @Override + public void offset(TopicPartition tp, long offset) { + if (tp.equals(testPartition)) { + resetOffset(offset); + } + } + + @Override + public Map configs() { + return null; + } + + @Override + public void timeout(long timeoutMs) { + + } + + @Override + public Set assignment() { + return null; + } + + @Override + public void requestCommit() { + } + + public int publishBatchRecordsToParticipant(int numRecords) { + // Send NUM_RECORDS_BATCH to participant + // If client resets offset, send another batch starting + // from the new reset offset value + do { + isResetOffset = false; + for (int i = 1; i <= numRecords; i++) { + participant.buffer(getNextKafkaRecord()); + } + participant.processRecords(); + } while (isResetOffset); + return numRecords; + } + + private SinkRecord getNextKafkaRecord() { + return new SinkRecord(testPartition.topic(), + testPartition.partition(), + Schema.OPTIONAL_BYTES_SCHEMA, + ("key-" + currentKafkaOffset).getBytes(), + Schema.OPTIONAL_BYTES_SCHEMA, + "value".getBytes(), currentKafkaOffset++); + } + + private void resetOffset(long newOffset) { + currentKafkaOffset = newOffset; + isResetOffset = true; + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java new file mode 100644 index 0000000000000..eed79c4861250 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.helper; + +import org.apache.hudi.connect.ControlMessage; +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; +import org.apache.hudi.exception.HoodieException; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * A mock Kafka Control Agent that supports the testing + * of a {@link TransactionCoordinator} with multiple + * instances of {@link TransactionParticipant}. + */ +public class MockKafkaControlAgent implements KafkaControlAgent { + + private final Map coordinators; + private final Map> participants; + + public MockKafkaControlAgent() { + coordinators = new HashMap<>(); + participants = new HashMap<>(); + } + + @Override + public void registerTransactionCoordinator(TransactionCoordinator coordinator) { + coordinators.put(coordinator.getPartition().topic(), coordinator); + } + + @Override + public void registerTransactionParticipant(TransactionParticipant participant) { + if (!participants.containsKey(participant.getPartition().topic())) { + participants.put(participant.getPartition().topic(), new ArrayList<>()); + } + participants.get(participant.getPartition().topic()).add(participant); + } + + @Override + public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) { + coordinators.remove(coordinator.getPartition().topic()); + } + + @Override + public void deregisterTransactionParticipant(TransactionParticipant worker) { + if (participants.containsKey(worker.getPartition().topic())) { + participants.get(worker.getPartition().topic()).remove(worker); + } + } + + @Override + public void publishMessage(ControlMessage message) { + try { + String topic = message.getTopicName(); + if (message.getSenderType().equals(ControlMessage.EntityType.COORDINATOR)) { + for (TransactionParticipant participant : participants.get(topic)) { + participant.processControlEvent(message); + } + } else { + coordinators.get(topic).processControlEvent(message); + } + } catch (Exception exception) { + throw new HoodieException("Fatal error trying to relay Kafka Control Messages for Testing."); + } + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestHudiWriterProvider.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestHudiWriterProvider.java new file mode 100644 index 0000000000000..45c9b03725f5e --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestHudiWriterProvider.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.helper; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.writers.ConnectWriter; +import org.apache.hudi.connect.writers.ConnectWriterProvider; + +import org.apache.kafka.connect.sink.SinkRecord; + +import java.util.List; + +/** + * Helper class the provides a Hudi writer and + * maintains stats that are used for test validation. + */ +public class TestHudiWriterProvider implements ConnectWriterProvider { + + private TestHudiWriter currentWriter; + + public TestHudiWriterProvider() { + } + + public int getLatestNumberWrites() { + return (currentWriter != null) ? currentWriter.numberRecords : 0; + } + + public boolean isClosed() { + return currentWriter == null || currentWriter.isClosed; + } + + @Override + public ConnectWriter getWriter(String commitTime) { + currentWriter = new TestHudiWriter(); + return currentWriter; + } + + private static class TestHudiWriter implements ConnectWriter { + + private int numberRecords; + private boolean isClosed; + + public TestHudiWriter() { + this.numberRecords = 0; + this.isClosed = false; + } + + public int getNumberRecords() { + return numberRecords; + } + + public boolean isClosed() { + return isClosed; + } + + @Override + public void writeRecord(SinkRecord record) { + numberRecords++; + } + + @Override + public List close() { + isClosed = false; + return null; + } + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java new file mode 100644 index 0000000000000..7a286e565ea34 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.writers; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.writers.AbstractConnectWriter; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.io.DecoderFactory; +import org.apache.kafka.connect.sink.SinkRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestAbstractConnectWriter { + + private static final String TOPIC_NAME = "kafka-connect-test-topic"; + private static final int PARTITION_NUMBER = 4; + private static final int NUM_RECORDS = 10; + private static final int RECORD_KEY_INDEX = 0; + + private KafkaConnectConfigs configs; + private TestKeyGenerator keyGenerator; + private SchemaProvider schemaProvider; + private long currentKafkaOffset; + + @BeforeEach + public void setUp() throws Exception { + keyGenerator = new TestKeyGenerator(new TypedProperties()); + schemaProvider = new TestSchemaProvider(); + } + + @ParameterizedTest + @EnumSource(value = TestInputFormats.class) + public void testAbstractWriterForAllFormats(TestInputFormats inputFormats) throws Exception { + Schema schema = schemaProvider.getSourceSchema(); + List inputRecords; + List expectedRecords; + + String formatConverter; + switch (inputFormats) { + case JSON_STRING: + formatConverter = AbstractConnectWriter.KAFKA_STRING_CONVERTER; + GenericDatumReader reader = new GenericDatumReader<>(schema, schema); + inputRecords = SchemaTestUtil.generateTestJsonRecords(0, NUM_RECORDS); + expectedRecords = ((List) inputRecords).stream().map(s -> { + try { + return HoodieAvroUtils.rewriteRecord((GenericRecord) reader.read(null, DecoderFactory.get().jsonDecoder(schema, s)), + schema); + } catch (IOException exception) { + throw new HoodieException("Error converting JSON records to AVRO"); + } + }).map(p -> convertToHoodieRecords(p, p.get(RECORD_KEY_INDEX).toString(), "000/00/00")).collect(Collectors.toList()); + break; + case AVRO: + formatConverter = AbstractConnectWriter.KAFKA_AVRO_CONVERTER; + inputRecords = SchemaTestUtil.generateTestRecords(0, NUM_RECORDS); + expectedRecords = inputRecords.stream().map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, schema)) + .map(p -> convertToHoodieRecords(p, p.get(RECORD_KEY_INDEX).toString(), "000/00/00")).collect(Collectors.toList()); + break; + default: + throw new HoodieException("Unknown test scenario " + inputFormats); + } + + configs = KafkaConnectConfigs.newBuilder() + 
.withProperties( + Collections.singletonMap(KafkaConnectConfigs.KAFKA_VALUE_CONVERTER, formatConverter)) + .build(); + AbstractHudiConnectWriterTestWrapper writer = new AbstractHudiConnectWriterTestWrapper( + configs, + keyGenerator, + schemaProvider); + + for (int i = 0; i < NUM_RECORDS; i++) { + writer.writeRecord(getNextKafkaRecord(inputRecords.get(i))); + } + + validateRecords(writer.getWrittenRecords(), expectedRecords); + } + + private static void validateRecords(List actualRecords, List expectedRecords) { + assertEquals(actualRecords.size(), expectedRecords.size()); + + actualRecords.sort(Comparator.comparing(HoodieRecord::getRecordKey)); + expectedRecords.sort(Comparator.comparing(HoodieRecord::getRecordKey)); + + // iterate through the elements and compare them one by one using + // the provided comparator. + Iterator it1 = actualRecords.iterator(); + Iterator it2 = expectedRecords.iterator(); + while (it1.hasNext()) { + HoodieRecord t1 = it1.next(); + HoodieRecord t2 = it2.next(); + assertEquals(t1.getRecordKey(), t2.getRecordKey()); + } + } + + private SinkRecord getNextKafkaRecord(Object record) { + return new SinkRecord(TOPIC_NAME, PARTITION_NUMBER, + org.apache.kafka.connect.data.Schema.OPTIONAL_BYTES_SCHEMA, + ("key-" + currentKafkaOffset).getBytes(), + org.apache.kafka.connect.data.Schema.OPTIONAL_BYTES_SCHEMA, + record, currentKafkaOffset++); + } + + private static class AbstractHudiConnectWriterTestWrapper extends AbstractConnectWriter { + + private List writtenRecords; + + public AbstractHudiConnectWriterTestWrapper(KafkaConnectConfigs connectConfigs, KeyGenerator keyGenerator, SchemaProvider schemaProvider) { + super(connectConfigs, keyGenerator, schemaProvider, "000"); + writtenRecords = new ArrayList<>(); + } + + public List getWrittenRecords() { + return writtenRecords; + } + + @Override + protected void writeHudiRecord(HoodieRecord record) { + writtenRecords.add(record); + } + + @Override + protected List flushRecords() { + return null; + } + } + + private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) { + return new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), + new HoodieAvroPayload(Option.of((GenericRecord) iRecord))); + } + + private enum TestInputFormats { + AVRO, + JSON_STRING + } + + static class TestKeyGenerator extends KeyGenerator { + + protected TestKeyGenerator(TypedProperties config) { + super(config); + } + + @Override + public HoodieKey getKey(GenericRecord record) { + return new HoodieKey(record.get(RECORD_KEY_INDEX).toString(), "000/00/00"); + } + } + + static class TestSchemaProvider extends SchemaProvider { + + @Override + public Schema getSourceSchema() { + try { + return SchemaTestUtil.getSimpleSchema(); + } catch (IOException exception) { + throw new HoodieException("Fatal error parsing schema", exception); + } + } + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java new file mode 100644 index 0000000000000..458c79a31062c --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.connect.writers.BufferedConnectWriter; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import java.util.Comparator; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; + +public class TestBufferedConnectWriter { + + private static final int NUM_RECORDS = 10; + private static final String COMMIT_TIME = "101"; + + private HoodieJavaWriteClient mockHoodieJavaWriteClient; + private HoodieJavaEngineContext javaEngineContext; + private KafkaConnectConfigs configs; + private HoodieWriteConfig writeConfig; + private SchemaProvider schemaProvider; + + @BeforeEach + public void setUp() throws Exception { + mockHoodieJavaWriteClient = mock(HoodieJavaWriteClient.class); + Configuration hadoopConf = new Configuration(); + javaEngineContext = new HoodieJavaEngineContext(hadoopConf); + configs = KafkaConnectConfigs.newBuilder().build(); + schemaProvider = new TestAbstractConnectWriter.TestSchemaProvider(); + writeConfig = HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withPath("/tmp") + .withSchema(schemaProvider.getSourceSchema().toString()) + .build(); + } + + @Test + public void testSimpleWriteAndFlush() throws Exception { + String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0]; + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] {partitionPath}); + List records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS); + + BufferedConnectWriter writer = new BufferedConnectWriter( + javaEngineContext, + mockHoodieJavaWriteClient, + COMMIT_TIME, + configs, + writeConfig, + null, + schemaProvider); + + for (int i = 0; i < NUM_RECORDS; i++) { + writer.writeHudiRecord(records.get(i)); + } + Mockito.verify(mockHoodieJavaWriteClient, times(0)) + .bulkInsertPreppedRecords(anyList(), eq(COMMIT_TIME), eq(Option.empty())); + + writer.flushRecords(); + final ArgumentCaptor> actualRecords = ArgumentCaptor.forClass(List.class); + Mockito.verify(mockHoodieJavaWriteClient, times(1)) + .bulkInsertPreppedRecords(actualRecords.capture(), eq(COMMIT_TIME), eq(Option.empty())); + + 
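+    // The flushed records are not guaranteed to come back in insertion order, so sort both the
+    // captured list and the expected list by record key before comparing them.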
actualRecords.getValue().sort(Comparator.comparing(HoodieRecord::getRecordKey)); + records.sort(Comparator.comparing(HoodieRecord::getRecordKey)); + + assertEquals(records, actualRecords.getValue()); + } +} diff --git a/hudi-kafka-connect/src/test/resources/hadoop_conf/core-site.xml b/hudi-kafka-connect/src/test/resources/hadoop_conf/core-site.xml new file mode 100644 index 0000000000000..26efcea4628b7 --- /dev/null +++ b/hudi-kafka-connect/src/test/resources/hadoop_conf/core-site.xml @@ -0,0 +1,33 @@ + + + + + + + + + fs.defaultFS + hdfs://test-hudi-path:9000 + The name of the default file system. A URI whose + scheme and authority determine the FileSystem implementation. The + uri's scheme determines the config property (fs.SCHEME.impl) naming + the FileSystem implementation class. The uri's authority is used to + determine the host, port, etc. for a filesystem. + + + diff --git a/hudi-kafka-connect/src/test/resources/hadoop_conf/hdfs-site.xml b/hudi-kafka-connect/src/test/resources/hadoop_conf/hdfs-site.xml new file mode 100644 index 0000000000000..0e5daec2e4c4e --- /dev/null +++ b/hudi-kafka-connect/src/test/resources/hadoop_conf/hdfs-site.xml @@ -0,0 +1,30 @@ + + + + + + + + dfs.namenode.http-address + http://test-hudi-path:50070 + + The address and the base port where the dfs namenode web ui will listen on. + + + + diff --git a/hudi-kafka-connect/src/test/resources/hadoop_home/etc/hadoop/core-site.xml b/hudi-kafka-connect/src/test/resources/hadoop_home/etc/hadoop/core-site.xml new file mode 100644 index 0000000000000..26efcea4628b7 --- /dev/null +++ b/hudi-kafka-connect/src/test/resources/hadoop_home/etc/hadoop/core-site.xml @@ -0,0 +1,33 @@ + + + + + + + + + fs.defaultFS + hdfs://test-hudi-path:9000 + The name of the default file system. A URI whose + scheme and authority determine the FileSystem implementation. The + uri's scheme determines the config property (fs.SCHEME.impl) naming + the FileSystem implementation class. The uri's authority is used to + determine the host, port, etc. for a filesystem. + + + diff --git a/hudi-kafka-connect/src/test/resources/hadoop_home/etc/hadoop/hdfs-site.xml b/hudi-kafka-connect/src/test/resources/hadoop_home/etc/hadoop/hdfs-site.xml new file mode 100644 index 0000000000000..0e5daec2e4c4e --- /dev/null +++ b/hudi-kafka-connect/src/test/resources/hadoop_home/etc/hadoop/hdfs-site.xml @@ -0,0 +1,30 @@ + + + + + + + + dfs.namenode.http-address + http://test-hudi-path:50070 + + The address and the base port where the dfs namenode web ui will listen on. + + + + diff --git a/hudi-spark-datasource/README.md b/hudi-spark-datasource/README.md new file mode 100644 index 0000000000000..dd1796991c873 --- /dev/null +++ b/hudi-spark-datasource/README.md @@ -0,0 +1,63 @@ + + +# Description of the relationship between each module + +This repo contains the code that integrate Hudi with Spark. The repo is split into the following modules + +`hudi-spark` +`hudi-spark2` +`hudi-spark3.1.x` +`hudi-spark3.2.x` +`hudi-spark3.3.x` +`hudi-spark2-common` +`hudi-spark3-common` +`hudi-spark-common` + +* hudi-spark is the module that contains the code that both spark2 & spark3 version would share, also contains the antlr4 +file that supports spark sql on spark 2.x version. +* hudi-spark2 is the module that contains the code that compatible with spark 2.x versions. +* hudi-spark3.1.x is the module that contains the code that compatible with spark3.1.x and spark3.0.x version. +* hudi-spark3.2.x is the module that contains the code that compatible with spark 3.2.x versions. 
+* hudi-spark3.3.x is the module that contains the code that compatible with spark 3.3.x+ versions. +* hudi-spark2-common is the module that contains the code that would be reused between spark2.x versions, right now the module +has no class since hudi only supports spark 2.4.4 version, and it acts as the placeholder when packaging hudi-spark-bundle module. +* hudi-spark3-common is the module that contains the code that would be reused between spark3.x versions. +* hudi-spark-common is the module that contains the code that would be reused between spark2.x and spark3.x versions. + +## Description of Time Travel +* `HoodieSpark3_2ExtendedSqlAstBuilder` have comments in the spark3.2's code fork from `org.apache.spark.sql.catalyst.parser.AstBuilder`, and additional `withTimeTravel` method. +* `SqlBase.g4` have comments in the code forked from spark3.2's parser, and add SparkSQL Syntax `TIMESTAMP AS OF` and `VERSION AS OF`. + +### Time Travel Support Spark Version: + +| version | support | +| ------ | ------- | +| 2.4.x | No | +| 3.0.x | No | +| 3.1.2 | No | +| 3.2.0 | Yes | + +### To improve: +Spark3.3 support time travel syntax link [SPARK-37219](https://issues.apache.org/jira/browse/SPARK-37219). +Once Spark 3.3 released. The files in the following list will be removed: +* hudi-spark3.3.x's `HoodieSpark3_3ExtendedSqlAstBuilder.scala`, `HoodieSpark3_3ExtendedSqlParser.scala`, `TimeTravelRelation.scala`, `SqlBase.g4`, `HoodieSqlBase.g4` +Tracking Jira: [HUDI-4468](https://issues.apache.org/jira/browse/HUDI-4468) + +Some other improvements undergoing: +* Port borrowed classes from Spark 3.3 [HUDI-4467](https://issues.apache.org/jira/browse/HUDI-4467) + diff --git a/hudi-spark-datasource/hudi-spark-common/pom.xml b/hudi-spark-datasource/hudi-spark-common/pom.xml index af6403dc59c8b..f8766ebdf1020 100644 --- a/hudi-spark-datasource/hudi-spark-common/pom.xml +++ b/hudi-spark-datasource/hudi-spark-common/pom.xml @@ -17,14 +17,14 @@ hudi-spark-datasource org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 - hudi-spark-common - ${parent.version} + hudi-spark-common_${scala.binary.version} + 0.12.2-dt-SNAPSHOT - hudi-spark-common + hudi-spark-common_${scala.binary.version} jar @@ -46,7 +46,9 @@ -nobootcp + -target:jvm-1.8 + false @@ -147,6 +149,17 @@ ${scala.version} + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + + + + + org.apache.logging.log4j + log4j-1.2-api + + org.apache.hudi @@ -169,10 +182,120 @@ ${project.version} + + + org.apache.spark + spark-core_${scala.binary.version} + + + javax.servlet + * + + + org.apache.spark spark-sql_${scala.binary.version} + + + org.apache.spark + spark-hive_${scala.binary.version} + + + + org.apache.spark + spark-sql_${scala.binary.version} + tests + test + + + org.apache.spark + spark-core_${scala.binary.version} + tests + test + + + org.apache.spark + spark-catalyst_${scala.binary.version} + tests + test + + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + org.apache.hudi + hudi-client-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-spark-client + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + + org.junit.jupiter + junit-jupiter-api + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.junit.vintage + junit-vintage-engine + test + + + + org.junit.jupiter + junit-jupiter-params + test + + + + org.mockito + mockito-junit-jupiter + test + + + + 
org.junit.platform + junit-platform-runner + test + + + + org.junit.platform + junit-platform-suite-api + test + + diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java index 1cb63c98a2db4..ee807f49dae89 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -21,42 +21,39 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.HoodieWriteResult; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.TablePathUtils; import org.apache.hudi.config.HoodieCompactionConfig; -import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodiePayloadConfig; +import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.exception.TableNotFoundException; -import org.apache.hudi.hive.HiveSyncConfig; -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; -import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.keygen.KeyGenerator; -import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser; import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.util.DataTypeUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.StructType; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -84,48 +81,40 @@ public static String getTablePath(FileSystem fs, Path[] userProvidedPaths) throw } /** - * Create a key generator class via reflection, passing in any configs needed. - *

    - * If the class name of key generator is configured through the properties file, i.e., {@code props}, use the corresponding key generator class; otherwise, use the default key generator class - * specified in {@code DataSourceWriteOptions}. - */ - public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { - String keyGeneratorClass = props.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_KEYGENERATOR_CLASS_OPT_VAL()); - try { - return (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props); - } catch (Throwable e) { - throw new IOException("Could not load key generator class " + keyGeneratorClass, e); - } - } - - /** - * Create a date time parser class for TimestampBasedKeyGenerator, passing in any configs needed. + * Create a UserDefinedBulkInsertPartitioner class via reflection, + *
    + * if the class name of UserDefinedBulkInsertPartitioner is configured through the HoodieWriteConfig. + * + * @see HoodieWriteConfig#getUserDefinedBulkInsertPartitionerClass() */ - public static AbstractHoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException { + private static Option createUserDefinedBulkInsertPartitioner(HoodieWriteConfig config) + throws HoodieException { + String bulkInsertPartitionerClass = config.getUserDefinedBulkInsertPartitionerClass(); try { - return (AbstractHoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props); + return StringUtils.isNullOrEmpty(bulkInsertPartitionerClass) + ? Option.empty() : + Option.of((BulkInsertPartitioner) ReflectionUtils.loadClass(bulkInsertPartitionerClass, config)); } catch (Throwable e) { - throw new IOException("Could not load date time parser class " + parserClass, e); + throw new HoodieException("Could not create UserDefinedBulkInsertPartitioner class " + bulkInsertPartitionerClass, e); } } /** - * Create a UserDefinedBulkInsertPartitioner class via reflection, + * Create a UserDefinedBulkInsertPartitionerRows class via reflection, *
    * if the class name of UserDefinedBulkInsertPartitioner is configured through the HoodieWriteConfig. * * @see HoodieWriteConfig#getUserDefinedBulkInsertPartitionerClass() */ - private static Option createUserDefinedBulkInsertPartitioner(HoodieWriteConfig config) + public static Option>> createUserDefinedBulkInsertPartitionerWithRows(HoodieWriteConfig config) throws HoodieException { String bulkInsertPartitionerClass = config.getUserDefinedBulkInsertPartitionerClass(); try { return StringUtils.isNullOrEmpty(bulkInsertPartitionerClass) ? Option.empty() : - Option.of((BulkInsertPartitioner) ReflectionUtils.loadClass(bulkInsertPartitionerClass)); + Option.of((BulkInsertPartitioner) ReflectionUtils.loadClass(bulkInsertPartitionerClass, config)); } catch (Throwable e) { - throw new HoodieException("Could not create UserDefinedBulkInsertPartitioner class " + bulkInsertPartitionerClass, e); + throw new HoodieException("Could not create UserDefinedBulkInsertPartitionerRows class " + bulkInsertPartitionerClass, e); } } @@ -142,6 +131,18 @@ public static HoodieRecordPayload createPayload(String payloadClass, GenericReco } } + public static Map getExtraMetadata(Map properties) { + Map extraMetadataMap = new HashMap<>(); + if (properties.containsKey(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX().key())) { + properties.entrySet().forEach(entry -> { + if (entry.getKey().startsWith(properties.get(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX().key()))) { + extraMetadataMap.put(entry.getKey(), entry.getValue()); + } + }); + } + return extraMetadataMap; + } + /** * Create a payload class via reflection, do not ordering/precombine value. */ @@ -165,11 +166,11 @@ public static void checkRequiredProperties(TypedProperties props, List c public static HoodieWriteConfig createHoodieConfig(String schemaStr, String basePath, String tblName, Map parameters) { - boolean asyncCompact = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY())); - boolean inlineCompact = !asyncCompact && parameters.get(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY()) + boolean asyncCompact = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key())); + boolean inlineCompact = !asyncCompact && parameters.get(DataSourceWriteOptions.TABLE_TYPE().key()) .equals(DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL()); // insert/bulk-insert combining to be true, if filtering for duplicates - boolean combineInserts = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.INSERT_DROP_DUPS_OPT_KEY())); + boolean combineInserts = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.INSERT_DROP_DUPS().key())); HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder() .withPath(basePath).withAutoCommit(false).combineInput(combineInserts, true); if (schemaStr != null) { @@ -177,25 +178,19 @@ public static HoodieWriteConfig createHoodieConfig(String schemaStr, String base } return builder.forTable(tblName) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY())) .withInlineCompaction(inlineCompact).build()) + .withPayloadConfig(HoodiePayloadConfig.newBuilder() + .withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key())) + .withPayloadOrderingField(parameters.get(DataSourceWriteOptions.PRECOMBINE_FIELD().key())) + .build()) // override above with Hoodie 
configs specified as options. .withProps(parameters).build(); } public static SparkRDDWriteClient createHoodieClient(JavaSparkContext jssc, String schemaStr, String basePath, String tblName, Map parameters) { - return new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), createHoodieConfig(schemaStr, basePath, tblName, parameters), true); - } - - public static String getCommitActionType(WriteOperationType operation, HoodieTableType tableType) { - if (operation == WriteOperationType.INSERT_OVERWRITE || operation == WriteOperationType.INSERT_OVERWRITE_TABLE) { - return HoodieTimeline.REPLACE_COMMIT_ACTION; - } else { - return CommitUtils.getCommitActionType(tableType); - } + return new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), createHoodieConfig(schemaStr, basePath, tblName, parameters)); } public static HoodieWriteResult doWriteOperation(SparkRDDWriteClient client, JavaRDD hoodieRecords, @@ -223,16 +218,21 @@ public static HoodieWriteResult doDeleteOperation(SparkRDDWriteClient client, Ja return new HoodieWriteResult(client.delete(hoodieKeys, instantTime)); } + public static HoodieWriteResult doDeletePartitionsOperation(SparkRDDWriteClient client, List partitionsToDelete, + String instantTime) { + return client.deletePartitions(partitionsToDelete, instantTime); + } + public static HoodieRecord createHoodieRecord(GenericRecord gr, Comparable orderingVal, HoodieKey hKey, String payloadClass) throws IOException { HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr, orderingVal); - return new HoodieRecord<>(hKey, payload); + return new HoodieAvroRecord<>(hKey, payload); } public static HoodieRecord createHoodieRecord(GenericRecord gr, HoodieKey hKey, String payloadClass) throws IOException { HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr); - return new HoodieRecord<>(hKey, payload); + return new HoodieAvroRecord<>(hKey, payload); } /** @@ -246,7 +246,7 @@ public static HoodieRecord createHoodieRecord(GenericRecord gr, HoodieKey hKey, public static JavaRDD dropDuplicates(JavaSparkContext jssc, JavaRDD incomingHoodieRecords, HoodieWriteConfig writeConfig) { try { - HoodieReadClient client = new HoodieReadClient<>(new HoodieSparkEngineContext(jssc), writeConfig); + SparkRDDReadClient client = new SparkRDDReadClient<>(new HoodieSparkEngineContext(jssc), writeConfig); return client.tagLocation(incomingHoodieRecords) .filter(r -> !((HoodieRecord) r).isCurrentLocationKnown()); } catch (TableNotFoundException e) { @@ -264,36 +264,38 @@ public static JavaRDD dropDuplicates(JavaSparkContext jssc, JavaRD return dropDuplicates(jssc, incomingHoodieRecords, writeConfig); } - public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath, String baseFileFormat) { - checkRequiredProperties(props, Collections.singletonList(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY())); - HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); - hiveSyncConfig.basePath = basePath; - hiveSyncConfig.usePreApacheInputFormat = - props.getBoolean(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY(), - Boolean.parseBoolean(DataSourceWriteOptions.DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL())); - hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_HIVE_DATABASE_OPT_VAL()); - hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()); - hiveSyncConfig.baseFileFormat = baseFileFormat; - hiveSyncConfig.hiveUser = - 
props.getString(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_USER_OPT_VAL()); - hiveSyncConfig.hivePass = - props.getString(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_PASS_OPT_VAL()); - hiveSyncConfig.jdbcUrl = - props.getString(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_URL_OPT_VAL()); - hiveSyncConfig.partitionFields = - props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), ",", new ArrayList<>()); - hiveSyncConfig.partitionValueExtractorClass = - props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), - SlashEncodedDayPartitionValueExtractor.class.getName()); - hiveSyncConfig.useJdbc = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_USE_JDBC_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_HIVE_USE_JDBC_OPT_VAL())); - hiveSyncConfig.autoCreateDatabase = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE_OPT_KEY(), - DataSourceWriteOptions.DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY())); - hiveSyncConfig.skipROSuffix = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX(), - DataSourceWriteOptions.DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL())); - hiveSyncConfig.supportTimestamp = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SUPPORT_TIMESTAMP(), - DataSourceWriteOptions.DEFAULT_HIVE_SUPPORT_TIMESTAMP())); - return hiveSyncConfig; + /** + * Checks whether default value (false) of "hoodie.parquet.writelegacyformat.enabled" should be + * overridden in case: + * + *

+ * <ul>
+ *   <li>Property has not been explicitly set by the writer</li>
+ *   <li>Data schema contains {@code DecimalType} that would be affected by it</li>
+ * </ul>
    + * + * If both of the aforementioned conditions are true, will override the default value of the config + * (by essentially setting the value) to make sure that the produced Parquet data files could be + * read by {@code AvroParquetReader} + * + * @param properties properties specified by the writer + * @param schema schema of the dataset being written + */ + public static void tryOverrideParquetWriteLegacyFormatProperty(Map properties, StructType schema) { + if (DataTypeUtils.hasSmallPrecisionDecimalType(schema) + && properties.get(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key()) == null) { + // ParquetWriteSupport writes DecimalType to parquet as INT32/INT64 when the scale of decimalType + // is less than {@code Decimal.MAX_LONG_DIGITS}, but {@code AvroParquetReader} which is used by + // {@code HoodieParquetReader} does not support DecimalType encoded as INT32/INT64 as. + // + // To work this problem around we're checking whether + // - Schema contains any decimals that could be encoded as INT32/INT64 + // - {@code HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED} has not been explicitly + // set by the writer + // + // If both of these conditions are true, then we override the default value of {@code + // HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED} and set it to "true" + LOG.warn("Small Decimal Type found in the persisted schema, reverting default value of 'hoodie.parquet.writelegacyformat.enabled' to true"); + properties.put(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key(), "true"); + } } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncClusteringService.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncClusteringService.java new file mode 100644 index 0000000000000..077b102a4a5c6 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncClusteringService.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseClusterer; +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.client.HoodieSparkClusteringClient; +import org.apache.hudi.common.engine.HoodieEngineContext; + +/** + * Async clustering service for Spark structured streaming. + * Here, async clustering is run in daemon mode to prevent blocking shutting down the Spark application. 
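+ * The service reuses the streaming write client, wrapping it in a {@code HoodieSparkClusteringClient}
+ * so that pending clustering plans execute on background daemon threads without delaying shutdown.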
+ */ +public class SparkStreamingAsyncClusteringService extends AsyncClusteringService { + + private static final long serialVersionUID = 1L; + + public SparkStreamingAsyncClusteringService(HoodieEngineContext context, BaseHoodieWriteClient writeClient) { + super(context, writeClient, true); + } + + @Override + protected BaseClusterer createClusteringClient(BaseHoodieWriteClient client) { + return new HoodieSparkClusteringClient(client); + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java similarity index 79% rename from hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java rename to hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java index d1a415b175200..2ff7b46c02018 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java @@ -18,10 +18,10 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractCompactor; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseCompactor; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.HoodieSparkCompactor; -import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieEngineContext; /** * Async Compaction Service used by Structured Streaming. Here, async compaction is run in daemon mode to prevent @@ -31,12 +31,12 @@ public class SparkStreamingAsyncCompactService extends AsyncCompactService { private static final long serialVersionUID = 1L; - public SparkStreamingAsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client) { + public SparkStreamingAsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client) { super(context, client, true); } @Override - protected AbstractCompactor createCompactor(AbstractHoodieWriteClient client) { - return new HoodieSparkCompactor(client); + protected BaseCompactor createCompactor(BaseHoodieWriteClient client) { + return new HoodieSparkCompactor(client, this.context); } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/BaseDefaultSource.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/BaseDefaultSource.java new file mode 100644 index 0000000000000..e75c9a213f36d --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/BaseDefaultSource.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
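The streaming compaction and clustering services above get attached when a structured-streaming sink writes a MERGE_ON_READ table with async table services switched on. A rough sketch of such a sink; the table name, paths and column layout are hypothetical, and the async-compaction/async-clustering option keys are assumed from DataSourceWriteOptions and HoodieClusteringConfig rather than taken from this diff:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

object StreamingHudiSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("hudi-streaming-sink").getOrCreate()

    // Hypothetical source: any streaming DataFrame with id/ts/part columns would do.
    val input = spark.readStream.format("rate").load()
      .selectExpr("value AS id", "timestamp AS ts", "CAST(value % 10 AS STRING) AS part")

    val query = input.writeStream
      .format("hudi")
      .option("hoodie.table.name", "stream_table")                        // hypothetical
      .option("hoodie.datasource.write.table.type", "MERGE_ON_READ")
      .option("hoodie.datasource.write.recordkey.field", "id")
      .option("hoodie.datasource.write.partitionpath.field", "part")
      .option("hoodie.datasource.write.precombine.field", "ts")
      // Assumed keys: the async services then run in the daemon-mode services shown above,
      // so stopping the query is not blocked by an in-flight compaction/clustering round.
      .option("hoodie.datasource.compaction.async.enable", "true")
      .option("hoodie.clustering.async.enabled", "true")
      .outputMode(OutputMode.Append)
      .option("checkpointLocation", "/tmp/hudi/checkpoints/stream_table") // hypothetical
      .start("/tmp/hudi/stream_table")                                    // hypothetical base path

    query.awaitTermination()
  }
}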
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal; + +import org.apache.hadoop.conf.Configuration; +import org.apache.spark.sql.SparkSession; + +/** + * Base class for DefaultSource used by Spark datasource v2. + */ +public class BaseDefaultSource { + + protected SparkSession sparkSession = null; + protected Configuration configuration = null; + + protected SparkSession getSparkSession() { + if (sparkSession == null) { + sparkSession = SparkSession.builder().getOrCreate(); + } + return sparkSession; + } + + protected Configuration getConfiguration() { + if (configuration == null) { + this.configuration = getSparkSession().sparkContext().hadoopConfiguration(); + } + return configuration; + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/BaseWriterCommitMessage.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/BaseWriterCommitMessage.java new file mode 100644 index 0000000000000..88a7921236a58 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/BaseWriterCommitMessage.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal; + +import org.apache.hudi.client.HoodieInternalWriteStatus; + +import java.util.Arrays; +import java.util.List; + +/** + * Base class for HoodieWriterCommitMessage used by Spark datasource v2. + */ +public class BaseWriterCommitMessage { + + private List writeStatuses; + + public BaseWriterCommitMessage(List writeStatuses) { + this.writeStatuses = writeStatuses; + } + + public List getWriteStatuses() { + return writeStatuses; + } + + @Override + public String toString() { + return "HoodieWriterCommitMessage{" + "writeStatuses=" + Arrays.toString(writeStatuses.toArray()) + '}'; + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/DataSourceInternalWriterHelper.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/DataSourceInternalWriterHelper.java new file mode 100644 index 0000000000000..3a349473b2201 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/DataSourceInternalWriterHelper.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstant.State; +import org.apache.hudi.common.util.CommitUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.hadoop.conf.Configuration; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.StructType; + +import java.util.List; +import java.util.Map; + +/** + * Helper class for HoodieDataSourceInternalWriter used by Spark datasource v2. + */ +public class DataSourceInternalWriterHelper { + + private static final Logger LOG = LogManager.getLogger(DataSourceInternalWriterHelper.class); + public static final String INSTANT_TIME_OPT_KEY = "hoodie.instant.time"; + + private final String instantTime; + private final HoodieTableMetaClient metaClient; + private final SparkRDDWriteClient writeClient; + private final HoodieTable hoodieTable; + private final WriteOperationType operationType; + private Map extraMetadata; + + public DataSourceInternalWriterHelper(String instantTime, HoodieWriteConfig writeConfig, StructType structType, + SparkSession sparkSession, Configuration configuration, Map extraMetadata) { + this.instantTime = instantTime; + this.operationType = WriteOperationType.BULK_INSERT; + this.extraMetadata = extraMetadata; + this.writeClient = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(new JavaSparkContext(sparkSession.sparkContext())), writeConfig); + writeClient.setOperationType(operationType); + writeClient.startCommitWithTime(instantTime); + + this.metaClient = HoodieTableMetaClient.builder().setConf(configuration).setBasePath(writeConfig.getBasePath()).build(); + this.metaClient.validateTableProperties(writeConfig.getProps()); + this.hoodieTable = HoodieSparkTable.create(writeConfig, new HoodieSparkEngineContext(new JavaSparkContext(sparkSession.sparkContext())), metaClient); + writeClient.preWrite(instantTime, WriteOperationType.BULK_INSERT, metaClient); + } + + public boolean useCommitCoordinator() { + return true; + } + + public void onDataWriterCommit(String message) { + LOG.info("Received commit of a data writer = " + message); + } + + public void commit(List writeStatList) { + try { + writeClient.commitStats(instantTime, writeStatList, Option.of(extraMetadata), + CommitUtils.getCommitActionType(operationType, metaClient.getTableType())); + } catch (Exception ioe) { + throw new HoodieException(ioe.getMessage(), ioe); + } finally { + writeClient.close(); + } + } + + 
public void abort() { + LOG.error("Commit " + instantTime + " aborted "); + writeClient.rollback(instantTime); + writeClient.close(); + } + + public void createInflightCommit() { + metaClient.getActiveTimeline().transitionRequestedToInflight( + new HoodieInstant(State.REQUESTED, + CommitUtils.getCommitActionType(operationType, metaClient.getTableType()), + instantTime), Option.empty()); + } + + public HoodieTable getHoodieTable() { + return hoodieTable; + } + + public WriteOperationType getWriteOperationType() { + return operationType; + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/sql/IExpressionEvaluator.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/sql/IExpressionEvaluator.java new file mode 100644 index 0000000000000..210f003510fce --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/sql/IExpressionEvaluator.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sql; + +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; + +/*** + * A interface for CodeGen to execute expressions on the record. + */ +public interface IExpressionEvaluator { + + /** + * Evaluate the result of the expressions based on the record. + */ + GenericRecord eval(IndexedRecord record); + + /** + * Get the code of the expressions. This is used for debug. + */ + String getCode(); +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/sql/InsertMode.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/sql/InsertMode.java new file mode 100644 index 0000000000000..c68bd60ba6344 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/sql/InsertMode.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sql; + +import java.util.Locale; + +/** + * Insert mode for insert into pk-table. 
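DataSourceInternalWriterHelper above drives the datasource-v2 bulk-insert flow (startCommitWithTime, preWrite, then commitStats on success or rollback on abort). From user code that path is reached through a plain bulk_insert write; a sketch, with a hypothetical table and base path, and the row-writer option key assumed from DataSourceWriteOptions:

import org.apache.spark.sql.{SaveMode, SparkSession}

object BulkInsertRowWriterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("bulk-insert-row-writer").getOrCreate()
    import spark.implicits._

    // Hypothetical input frame.
    val df = (1 to 1000).map(i => (i, s"name-$i", i % 10)).toDF("id", "name", "part")

    df.write.format("hudi")
      .option("hoodie.table.name", "bulk_table")                      // hypothetical
      .option("hoodie.datasource.write.recordkey.field", "id")
      .option("hoodie.datasource.write.partitionpath.field", "part")
      .option("hoodie.datasource.write.precombine.field", "id")
      .option("hoodie.datasource.write.operation", "bulk_insert")
      // Assumed key: the row-writer path is the one backed by DataSourceInternalWriterHelper,
      // which starts the commit, collects write stats and finalizes them via commitStats.
      .option("hoodie.datasource.write.row.writer.enable", "true")
      .mode(SaveMode.Overwrite)
      .save("/tmp/hudi/bulk_table")                                   // hypothetical base path
  }
}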
+ */ +public enum InsertMode { + /** + * In upsert mode for insert into, duplicate record on primary key + * will be updated.This is the default insert mode for pk-table. + */ + UPSERT("upsert"), + /** + * In strict mode for insert into, we do the pk uniqueness guarantee + * for COW pk-table. + * For MOR pk-table, it has the same behavior with "upsert" mode. + */ + STRICT("strict"), + /** + * In non-strict mode for insert into, we use insert operation + * to write data which allow writing the duplicate record. + */ + NON_STRICT("non-strict"); + + private String value; + + InsertMode(String value) { + this.value = value; + } + + public String value() { + return value; + } + + public static InsertMode of(String value) { + switch (value.toLowerCase(Locale.ROOT)) { + case "upsert": + return UPSERT; + case "strict": + return STRICT; + case "non-strict": + return NON_STRICT; + default: + throw new AssertionError("UnSupport Insert Mode: " + value); + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister new file mode 100644 index 0000000000000..556b0feef1cdb --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -0,0 +1,20 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +org.apache.hudi.DefaultSource +org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala new file mode 100644 index 0000000000000..42e71e5e33241 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.ENABLE_HOODIE_FILE_INDEX +import org.apache.hudi.HoodieBaseRelation.projectReader +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.hadoop.HoodieROTablePathFilter +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.sources.{BaseRelation, Filter} +import org.apache.spark.sql.types.StructType + +/** + * [[BaseRelation]] implementation only reading Base files of Hudi tables, essentially supporting following querying + * modes: + *
<ul>
+ *   <li>For COW tables: Snapshot</li>
+ *   <li>For MOR tables: Read-optimized</li>
+ * </ul>
    + * + * NOTE: The reason this Relation is used in liue of Spark's default [[HadoopFsRelation]] is primarily due to the + * fact that it injects real partition's path as the value of the partition field, which Hudi ultimately persists + * as part of the record payload. In some cases, however, partition path might not necessarily be equal to the + * verbatim value of the partition path field (when custom [[KeyGenerator]] is used) therefore leading to incorrect + * partition field values being written + */ +class BaseFileOnlyRelation(sqlContext: SQLContext, + metaClient: HoodieTableMetaClient, + optParams: Map[String, String], + userSchema: Option[StructType], + globPaths: Seq[Path]) + extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) with SparkAdapterSupport { + + case class HoodieBaseFileSplit(filePartition: FilePartition) extends HoodieFileSplit + + override type FileSplit = HoodieBaseFileSplit + + // TODO(HUDI-3204) this is to override behavior (exclusively) for COW tables to always extract + // partition values from partition path + // For more details please check HUDI-4161 + // NOTE: This override has to mirror semantic of whenever this Relation is converted into [[HadoopFsRelation]], + // which is currently done for all cases, except when Schema Evolution is enabled + override protected val shouldExtractPartitionValuesFromPartitionPath: Boolean = + internalSchemaOpt.isEmpty + + override lazy val mandatoryFields: Seq[String] = Seq.empty + + override def imbueConfigs(sqlContext: SQLContext): Unit = { + super.imbueConfigs(sqlContext) + sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "true") + } + + protected override def composeRDD(fileSplits: Seq[HoodieBaseFileSplit], + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + requestedColumns: Array[String], + filters: Array[Filter]): RDD[InternalRow] = { + val (partitionSchema, dataSchema, requiredDataSchema) = + tryPrunePartitionColumns(tableSchema, requiredSchema) + + val baseFileReader = createBaseFileReader( + spark = sparkSession, + partitionSchema = partitionSchema, + dataSchema = dataSchema, + requiredDataSchema = requiredDataSchema, + filters = filters, + options = optParams, + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = embedInternalSchema(new Configuration(conf), requiredSchema.internalSchema) + ) + + // NOTE: In some case schema of the reader's output (reader's schema) might not match the schema expected by the caller. + // This could occur for ex, when requested schema contains partition columns which might not be persisted w/in the + // data file, but instead would be parsed from the partition path. In that case output of the file-reader will have + // different ordering of the fields than the original required schema (for more details please check out + // [[ParquetFileFormat]] impl). 
In that case we have to project the rows from the file-reader's schema + // back into the one expected by the caller + val projectedReader = projectReader(baseFileReader, requiredSchema.structTypeSchema) + + // SPARK-37273 FileScanRDD constructor changed in SPARK 3.3 + sparkAdapter.createHoodieFileScanRDD(sparkSession, projectedReader.apply, fileSplits.map(_.filePartition), requiredSchema.structTypeSchema) + .asInstanceOf[HoodieUnsafeRDD] + } + + protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[HoodieBaseFileSplit] = { + val partitions = listLatestBaseFiles(globPaths, partitionFilters, dataFilters) + val fileSplits = partitions.values.toSeq + .flatMap { files => + files.flatMap { file => + // TODO fix, currently assuming parquet as underlying format + HoodieDataSourceHelper.splitFiles( + sparkSession = sparkSession, + file = file, + partitionValues = getPartitionColumnsAsInternalRow(file) + ) + } + } + // NOTE: It's important to order the splits in the reverse order of their + // size so that we can subsequently bucket them in an efficient manner + .sortBy(_.length)(implicitly[Ordering[Long]].reverse) + + val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes + + sparkAdapter.getFilePartitions(sparkSession, fileSplits, maxSplitBytes) + .map(HoodieBaseFileSplit.apply) + } + + /** + * NOTE: We have to fallback to [[HadoopFsRelation]] to make sure that all of the Spark optimizations could be + * equally applied to Hudi tables, since some of those are predicated on the usage of [[HadoopFsRelation]], + * and won't be applicable in case of us using our own custom relations (one of such optimizations is [[SchemaPruning]] + * rule; you can find more details in HUDI-3896) + */ + def toHadoopFsRelation: HadoopFsRelation = { + val enableFileIndex = HoodieSparkConfUtils.getConfigValue(optParams, sparkSession.sessionState.conf, + ENABLE_HOODIE_FILE_INDEX.key, ENABLE_HOODIE_FILE_INDEX.defaultValue.toString).toBoolean + if (enableFileIndex && globPaths.isEmpty) { + // NOTE: There are currently 2 ways partition values could be fetched: + // - Source columns (producing the values used for physical partitioning) will be read + // from the data file + // - Values parsed from the actual partition path would be appended to the final dataset + // + // In the former case, we don't need to provide the partition-schema to the relation, + // therefore we simply stub it w/ empty schema and use full table-schema as the one being + // read from the data file. + // + // In the latter, we have to specify proper partition schema as well as "data"-schema, essentially + // being a table-schema with all partition columns stripped out + val (partitionSchema, dataSchema) = if (shouldExtractPartitionValuesFromPartitionPath) { + (fileIndex.partitionSchema, fileIndex.dataSchema) + } else { + (StructType(Nil), tableStructSchema) + } + + HadoopFsRelation( + location = fileIndex, + partitionSchema = partitionSchema, + dataSchema = dataSchema, + bucketSpec = None, + fileFormat = fileFormat, + optParams)(sparkSession) + } else { + val readPathsStr = optParams.get(DataSourceReadOptions.READ_PATHS.key) + val extraReadPaths = readPathsStr.map(p => p.split(",").toSeq).getOrElse(Seq()) + // NOTE: Spark is able to infer partitioning values from partition path only when Hive-style partitioning + // scheme is used. 
Therefore, we fallback to reading the table as non-partitioned (specifying + // partitionColumns = Seq.empty) whenever Hive-style partitioning is not involved + val partitionColumns: Seq[String] = if (tableConfig.getHiveStylePartitioningEnable.toBoolean) { + this.partitionColumns + } else { + Seq.empty + } + + DataSource.apply( + sparkSession = sparkSession, + paths = extraReadPaths, + // Here we should specify the schema to the latest commit schema since + // the table schema evolution. + userSpecifiedSchema = userSchema.orElse(Some(tableStructSchema)), + className = fileFormatClassName, + options = optParams ++ Map( + // Since we're reading the table as just collection of files we have to make sure + // we only read the latest version of every Hudi's file-group, which might be compacted, clustered, etc. + // while keeping previous versions of the files around as well. + // + // We rely on [[HoodieROTablePathFilter]], to do proper filtering to assure that + "mapreduce.input.pathFilter.class" -> classOf[HoodieROTablePathFilter].getName, + + // We have to override [[EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH]] setting, since + // the relation might have this setting overridden + DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key -> shouldExtractPartitionValuesFromPartitionPath.toString, + + // NOTE: We have to specify table's base-path explicitly, since we're requesting Spark to read it as a + // list of globbed paths which complicates partitioning discovery for Spark. + // Please check [[PartitioningAwareFileIndex#basePaths]] comment for more details. + PartitioningAwareFileIndex.BASE_PATH_PARAM -> metaClient.getBasePathV2.toString + ), + partitionColumns = partitionColumns + ) + .resolveRelation() + .asInstanceOf[HadoopFsRelation] + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala new file mode 100644 index 0000000000000..5cf7a5ec035ab --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala @@ -0,0 +1,481 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.avro.Conversions.DecimalConversion +import org.apache.avro.generic.GenericData +import org.apache.hudi.ColumnStatsIndexSupport._ +import org.apache.hudi.HoodieCatalystUtils.{withPersistedData, withPersistedDataset} +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.avro.model._ +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.data.HoodieData +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig +import org.apache.hudi.common.util.BinaryUtil.toBytes +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.common.util.collection +import org.apache.hudi.common.util.hash.ColumnIndexID +import org.apache.hudi.data.HoodieJavaRDD +import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata, HoodieTableMetadataUtil, MetadataPartitionType} +import org.apache.hudi.util.JFunction +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.HoodieUnsafeUtils.{createDataFrameFromInternalRows, createDataFrameFromRDD, createDataFrameFromRows} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.storage.StorageLevel + +import java.nio.ByteBuffer +import scala.collection.JavaConverters._ +import scala.collection.immutable.TreeSet +import scala.collection.mutable.ListBuffer +import scala.collection.parallel.mutable.ParHashMap + +class ColumnStatsIndexSupport(spark: SparkSession, + tableSchema: StructType, + @transient metadataConfig: HoodieMetadataConfig, + @transient metaClient: HoodieTableMetaClient, + allowCaching: Boolean = false) { + + @transient private lazy val engineCtx = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) + @transient private lazy val metadataTable: HoodieTableMetadata = + HoodieTableMetadata.create(engineCtx, metadataConfig, metaClient.getBasePathV2.toString, FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue) + + @transient private lazy val cachedColumnStatsIndexViews: ParHashMap[Seq[String], DataFrame] = ParHashMap() + + // NOTE: Since [[metadataConfig]] is transient this has to be eagerly persisted, before this will be passed + // on to the executor + private val inMemoryProjectionThreshold = metadataConfig.getColumnStatsIndexInMemoryProjectionThreshold + + private lazy val indexedColumns: Set[String] = { + val customIndexedColumns = metadataConfig.getColumnsEnabledForColumnStatsIndex + // Column Stats Index could index either + // - The whole table + // - Only configured columns + if (customIndexedColumns.isEmpty) { + tableSchema.fieldNames.toSet + } else { + customIndexedColumns.asScala.toSet + } + } + + /** + * Returns true in cases when Column Stats Index is built and available as standalone partition + * w/in the Metadata Table + */ + def isIndexAvailable: Boolean = { + checkState(metadataConfig.enabled, "Metadata Table support has to be enabled") + metaClient.getTableConfig.getMetadataPartitions.contains(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS) + } + + /** + * Determines whether it would be more optimal to read Column Stats Index a) in-memory of the 
invoking process, + * or b) executing it on-cluster via Spark [[Dataset]] and [[RDD]] APIs + */ + def shouldReadInMemory(fileIndex: HoodieFileIndex, queryReferencedColumns: Seq[String]): Boolean = { + Option(metadataConfig.getColumnStatsIndexProcessingModeOverride) match { + case Some(mode) => + mode == HoodieMetadataConfig.COLUMN_STATS_INDEX_PROCESSING_MODE_IN_MEMORY + case None => + fileIndex.getFileSlicesCount * queryReferencedColumns.length < inMemoryProjectionThreshold + } + } + + /** + * Loads view of the Column Stats Index in a transposed format where single row coalesces every columns' + * statistics for a single file, returning it as [[DataFrame]] + * + * Please check out scala-doc of the [[transpose]] method explaining this view in more details + */ + def loadTransposed[T](targetColumns: Seq[String], shouldReadInMemory: Boolean)(block: DataFrame => T): T = { + cachedColumnStatsIndexViews.get(targetColumns) match { + case Some(cachedDF) => + block(cachedDF) + + case None => + val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = + loadColumnStatsIndexRecords(targetColumns, shouldReadInMemory) + + withPersistedData(colStatsRecords, StorageLevel.MEMORY_ONLY) { + val (transposedRows, indexSchema) = transpose(colStatsRecords, targetColumns) + val df = if (shouldReadInMemory) { + // NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows + // of the transposed table in memory, facilitating execution of the subsequently chained operations + // on it locally (on the driver; all such operations are actually going to be performed by Spark's + // Optimizer) + createDataFrameFromRows(spark, transposedRows.collectAsList().asScala, indexSchema) + } else { + val rdd = HoodieJavaRDD.getJavaRDD(transposedRows) + spark.createDataFrame(rdd, indexSchema) + } + + if (allowCaching) { + cachedColumnStatsIndexViews.put(targetColumns, df) + // NOTE: Instead of collecting the rows from the index and hold them in memory, we instead rely + // on Spark as (potentially distributed) cache managing data lifecycle, while we simply keep + // the referenced to persisted [[DataFrame]] instance + df.persist(StorageLevel.MEMORY_ONLY) + + block(df) + } else { + withPersistedDataset(df) { + block(df) + } + } + } + } + } + + /** + * Loads a view of the Column Stats Index in a raw format, returning it as [[DataFrame]] + * + * Please check out scala-doc of the [[transpose]] method explaining this view in more details + */ + def load(targetColumns: Seq[String] = Seq.empty, shouldReadInMemory: Boolean = false): DataFrame = { + // NOTE: If specific columns have been provided, we can considerably trim down amount of data fetched + // by only fetching Column Stats Index records pertaining to the requested columns. + // Otherwise we fallback to read whole Column Stats Index + if (targetColumns.nonEmpty) { + loadColumnStatsIndexForColumnsInternal(targetColumns, shouldReadInMemory) + } else { + loadFullColumnStatsIndexInternal() + } + } + + def invalidateCaches(): Unit = { + cachedColumnStatsIndexViews.foreach { case (_, df) => df.unpersist() } + cachedColumnStatsIndexViews.clear() + } + + /** + * Transposes and converts the raw table format of the Column Stats Index representation, + * where each row/record corresponds to individual (column, file) pair, into the table format + * where each row corresponds to single file with statistic for individual columns collated + * w/in such row: + * + * Metadata Table Column Stats Index format: + * + *
<pre>
+   *  +---------------------------+------------+------------+------------+-------------+
+   *  |        fileName           | columnName |  minValue  |  maxValue  |  num_nulls  |
+   *  +---------------------------+------------+------------+------------+-------------+
+   *  | one_base_file.parquet     |          A |          1 |         10 |           0 |
+   *  | another_base_file.parquet |          A |        -10 |          0 |           5 |
+   *  +---------------------------+------------+------------+------------+-------------+
+   * </pre>
+   *
+   * Returned table format
+   *
+   * <pre>
+   *  +---------------------------+------------+------------+-------------+
+   *  |          file             | A_minValue | A_maxValue | A_nullCount |
+   *  +---------------------------+------------+------------+-------------+
+   *  | one_base_file.parquet     |          1 |         10 |           0 |
+   *  | another_base_file.parquet |        -10 |          0 |           5 |
+   *  +---------------------------+------------+------------+-------------+
+   * </pre>
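Once loadTransposed exposes this per-file layout, data skipping reduces to range checks over the *_minValue / *_maxValue columns. A small sketch over a hypothetical DataFrame shaped like the second table above, keeping only files whose range for column A can contain the literal 5 (missing stats are kept conservatively):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, lit}

// Candidate-file pruning for the predicate "A = 5" against the transposed view sketched above.
def candidateFilesForAEquals5(transposedDF: DataFrame): DataFrame =
  transposedDF.where(
    col("A_minValue").isNull ||                                 // column not indexed for this file: keep it
      (col("A_minValue") <= lit(5) && col("A_maxValue") >= lit(5)))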
    + * + * NOTE: Column Stats Index might potentially contain statistics for many columns (if not all), while + * query at hand might only be referencing a handful of those. As such, we collect all the + * column references from the filtering expressions, and only transpose records corresponding to the + * columns referenced in those + * + * @param colStatsRecords [[HoodieData[HoodieMetadataColumnStats]]] bearing raw Column Stats Index records + * @param queryColumns target columns to be included into the final table + * @return reshaped table according to the format outlined above + */ + private def transpose(colStatsRecords: HoodieData[HoodieMetadataColumnStats], queryColumns: Seq[String]): (HoodieData[Row], StructType) = { + val tableSchemaFieldMap = tableSchema.fields.map(f => (f.name, f)).toMap + // NOTE: We're sorting the columns to make sure final index schema matches layout + // of the transposed table + val sortedTargetColumnsSet = TreeSet(queryColumns:_*) + val sortedTargetColumns = sortedTargetColumnsSet.toSeq + + // NOTE: This is a trick to avoid pulling all of [[ColumnStatsIndexSupport]] object into the lambdas' + // closures below + val indexedColumns = this.indexedColumns + + // Here we perform complex transformation which requires us to modify the layout of the rows + // of the dataset, and therefore we rely on low-level RDD API to avoid incurring encoding/decoding + // penalty of the [[Dataset]], since it's required to adhere to its schema at all times, while + // RDDs are not; + val transposedRows: HoodieData[Row] = colStatsRecords + // NOTE: Explicit conversion is required for Scala 2.11 + .filter(JFunction.toJavaSerializableFunction(r => sortedTargetColumnsSet.contains(r.getColumnName))) + .mapToPair(JFunction.toJavaSerializablePairFunction(r => { + if (r.getMinValue == null && r.getMaxValue == null) { + // Corresponding row could be null in either of the 2 cases + // - Column contains only null values (in that case both min/max have to be nulls) + // - This is a stubbed Column Stats record (used as a tombstone) + collection.Pair.of(r.getFileName, r) + } else { + val minValueWrapper = r.getMinValue + val maxValueWrapper = r.getMaxValue + + checkState(minValueWrapper != null && maxValueWrapper != null, "Invalid Column Stats record: either both min/max have to be null, or both have to be non-null") + + val colName = r.getColumnName + val colType = tableSchemaFieldMap(colName).dataType + + val minValue = deserialize(tryUnpackValueWrapper(minValueWrapper), colType) + val maxValue = deserialize(tryUnpackValueWrapper(maxValueWrapper), colType) + + // Update min-/max-value structs w/ unwrapped values in-place + r.setMinValue(minValue) + r.setMaxValue(maxValue) + + collection.Pair.of(r.getFileName, r) + } + })) + .groupByKey() + .map(JFunction.toJavaSerializableFunction(p => { + val columnRecordsSeq: Seq[HoodieMetadataColumnStats] = p.getValue.asScala.toSeq + val fileName: String = p.getKey + val valueCount: Long = columnRecordsSeq.head.getValueCount + + // To properly align individual rows (corresponding to a file) w/in the transposed projection, we need + // to align existing column-stats for individual file with the list of expected ones for the + // whole transposed projection (a superset of all files) + val columnRecordsMap = columnRecordsSeq.map(r => (r.getColumnName, r)).toMap + val alignedColStatRecordsSeq = sortedTargetColumns.map(columnRecordsMap.get) + + val coalescedRowValuesSeq = + alignedColStatRecordsSeq.foldLeft(ListBuffer[Any](fileName, valueCount)) { + case 
(acc, opt) => + opt match { + case Some(colStatRecord) => + acc ++= Seq(colStatRecord.getMinValue, colStatRecord.getMaxValue, colStatRecord.getNullCount) + case None => + // NOTE: This could occur in either of the following cases: + // 1. Column is not indexed in Column Stats Index: in this case we won't be returning + // any statistics for such column (ie all stats will be null) + // 2. Particular file does not have this particular column (which is indexed by Column Stats Index): + // in this case we're assuming missing column to essentially contain exclusively + // null values, we set min/max values as null and null-count to be equal to value-count (this + // behavior is consistent with reading non-existent columns from Parquet) + // + // This is a way to determine current column's index without explicit iteration (we're adding 3 stats / column) + val idx = acc.length / 3 + val colName = sortedTargetColumns(idx) + val indexed = indexedColumns.contains(colName) + + val nullCount = if (indexed) valueCount else null + + acc ++= Seq(null, null, nullCount) + } + } + + Row(coalescedRowValuesSeq:_*) + })) + + // NOTE: It's crucial to maintain appropriate ordering of the columns + // matching table layout: hence, we cherry-pick individual columns + // instead of simply filtering in the ones we're interested in the schema + val indexSchema = composeIndexSchema(sortedTargetColumns, tableSchema) + (transposedRows, indexSchema) + } + + private def loadColumnStatsIndexForColumnsInternal(targetColumns: Seq[String], shouldReadInMemory: Boolean): DataFrame = { + val colStatsDF = { + val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = loadColumnStatsIndexRecords(targetColumns, shouldReadInMemory) + // NOTE: Explicit conversion is required for Scala 2.11 + val catalystRows: HoodieData[InternalRow] = colStatsRecords.mapPartitions(JFunction.toJavaSerializableFunction(it => { + val converter = AvroConversionUtils.createAvroToInternalRowConverter(HoodieMetadataColumnStats.SCHEMA$, columnStatsRecordStructType) + it.asScala.map(r => converter(r).orNull).asJava + }), false) + + if (shouldReadInMemory) { + // NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows + // of the transposed table in memory, facilitating execution of the subsequently chained operations + // on it locally (on the driver; all such operations are actually going to be performed by Spark's + // Optimizer) + createDataFrameFromInternalRows(spark, catalystRows.collectAsList().asScala, columnStatsRecordStructType) + } else { + createDataFrameFromRDD(spark, HoodieJavaRDD.getJavaRDD(catalystRows), columnStatsRecordStructType) + } + } + + colStatsDF.select(targetColumnStatsIndexColumns.map(col): _*) + } + + private def loadColumnStatsIndexRecords(targetColumns: Seq[String], shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = { + // Read Metadata Table's Column Stats Index records into [[HoodieData]] container by + // - Fetching the records from CSI by key-prefixes (encoded column names) + // - Extracting [[HoodieMetadataColumnStats]] records + // - Filtering out nulls + checkState(targetColumns.nonEmpty) + + // TODO encoding should be done internally w/in HoodieBackedTableMetadata + val encodedTargetColumnNames = targetColumns.map(colName => new ColumnIndexID(colName).asBase64EncodedString()) + + val metadataRecords: HoodieData[HoodieRecord[HoodieMetadataPayload]] = + metadataTable.getRecordsByKeyPrefixes(encodedTargetColumnNames.asJava, 
HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS, shouldReadInMemory) + + val columnStatsRecords: HoodieData[HoodieMetadataColumnStats] = + // NOTE: Explicit conversion is required for Scala 2.11 + metadataRecords.map(JFunction.toJavaSerializableFunction(record => { + toScalaOption(record.getData.getInsertValue(null, null)) + .map(metadataRecord => metadataRecord.asInstanceOf[HoodieMetadataRecord].getColumnStatsMetadata) + .orNull + })) + .filter(JFunction.toJavaSerializableFunction(columnStatsRecord => columnStatsRecord != null)) + + columnStatsRecords + } + + private def loadFullColumnStatsIndexInternal(): DataFrame = { + val metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePathV2.toString) + // Read Metadata Table's Column Stats Index into Spark's [[DataFrame]] + val colStatsDF = spark.read.format("org.apache.hudi") + .options(metadataConfig.getProps.asScala) + .load(s"$metadataTablePath/${MetadataPartitionType.COLUMN_STATS.getPartitionPath}") + + val requiredIndexColumns = + targetColumnStatsIndexColumns.map(colName => + col(s"${HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS}.${colName}")) + + colStatsDF.where(col(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).isNotNull) + .select(requiredIndexColumns: _*) + } +} + +object ColumnStatsIndexSupport { + + private val expectedAvroSchemaValues = Set("BooleanWrapper", "IntWrapper", "LongWrapper", "FloatWrapper", "DoubleWrapper", + "BytesWrapper", "StringWrapper", "DateWrapper", "DecimalWrapper", "TimeMicrosWrapper", "TimestampMicrosWrapper") + + /** + * Target Column Stats Index columns which internally are mapped onto fields of the correspoding + * Column Stats record payload ([[HoodieMetadataColumnStats]]) persisted w/in Metadata Table + */ + private val targetColumnStatsIndexColumns = Seq( + HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, + HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, + HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, + HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, + HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, + HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME + ) + + private val columnStatsRecordStructType: StructType = AvroConversionUtils.convertAvroSchemaToStructType(HoodieMetadataColumnStats.SCHEMA$) + + /** + * @VisibleForTesting + */ + def composeIndexSchema(targetColumnNames: Seq[String], tableSchema: StructType): StructType = { + val fileNameField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, StringType, nullable = true, Metadata.empty) + val valueCountField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, LongType, nullable = true, Metadata.empty) + + val targetFields = targetColumnNames.map(colName => tableSchema.fields.find(f => f.name == colName).get) + + StructType( + targetFields.foldLeft(Seq(fileNameField, valueCountField)) { + case (acc, field) => + acc ++ Seq( + composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, field.dataType), + composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, field.dataType), + composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, LongType)) + } + ) + } + + @inline def getMinColumnNameFor(colName: String): String = + formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE) + + @inline def getMaxColumnNameFor(colName: String): String = + formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE) + + @inline def 
getNullCountColumnNameFor(colName: String): String = + formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT) + + @inline def getValueCountColumnNameFor: String = + HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT + + @inline private def formatColName(col: String, statName: String) = { // TODO add escaping for + String.format("%s_%s", col, statName) + } + + @inline private def composeColumnStatStructType(col: String, statName: String, dataType: DataType) = + StructField(formatColName(col, statName), dataType, nullable = true, Metadata.empty) + + private def tryUnpackValueWrapper(valueWrapper: AnyRef): Any = { + valueWrapper match { + case w: BooleanWrapper => w.getValue + case w: IntWrapper => w.getValue + case w: LongWrapper => w.getValue + case w: FloatWrapper => w.getValue + case w: DoubleWrapper => w.getValue + case w: BytesWrapper => w.getValue + case w: StringWrapper => w.getValue + case w: DateWrapper => w.getValue + case w: DecimalWrapper => w.getValue + case w: TimeMicrosWrapper => w.getValue + case w: TimestampMicrosWrapper => w.getValue + + case r: GenericData.Record if expectedAvroSchemaValues.contains(r.getSchema.getName) => + r.get("value") + + case _ => throw new UnsupportedOperationException(s"Not recognized value wrapper type (${valueWrapper.getClass.getSimpleName})") + } + } + + val decConv = new DecimalConversion() + + private def deserialize(value: Any, dataType: DataType): Any = { + dataType match { + // NOTE: Since we can't rely on Avro's "date", and "timestamp-micros" logical-types, we're + // manually encoding corresponding values as int and long w/in the Column Stats Index and + // here we have to decode those back into corresponding logical representation. + case TimestampType => DateTimeUtils.toJavaTimestamp(value.asInstanceOf[Long]) + case DateType => DateTimeUtils.toJavaDate(value.asInstanceOf[Int]) + // Standard types + case StringType => value + case BooleanType => value + // Numeric types + case FloatType => value + case DoubleType => value + case LongType => value + case IntegerType => value + // NOTE: All integral types of size less than Int are encoded as Ints in MT + case ShortType => value.asInstanceOf[Int].toShort + case ByteType => value.asInstanceOf[Int].toByte + + // TODO fix + case _: DecimalType => + value match { + case buffer: ByteBuffer => + val logicalType = DecimalWrapper.SCHEMA$.getField("value").schema().getLogicalType + decConv.fromBytes(buffer, null, logicalType) + case _ => value + } + case BinaryType => + value match { + case b: ByteBuffer => toBytes(b) + case other => other + } + + case _ => + throw new UnsupportedOperationException(s"Data type for the statistic value is not recognized $dataType") + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 1b6e49b456a04..9ce50863228a8 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -17,52 +17,147 @@ package org.apache.hudi -import org.apache.hudi.common.model.HoodieTableType -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload -import org.apache.hudi.common.model.WriteOperationType -import org.apache.hudi.hive.HiveSyncTool -import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor -import 
org.apache.hudi.keygen.SimpleKeyGenerator +import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, QUERY_TYPE_SNAPSHOT_OPT_VAL} +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.common.config.{ConfigProperty, DFSPropertiesConfiguration, HoodieCommonConfig, HoodieConfig, TypedProperties} +import org.apache.hudi.common.fs.ConsistencyGuardConfig +import org.apache.hudi.common.model.{HoodieTableType, WriteOperationType} +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.common.util.{Option, StringUtils} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} +import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, HiveSyncTool} import org.apache.hudi.keygen.constant.KeyGeneratorOptions +import org.apache.hudi.keygen.{ComplexKeyGenerator, CustomKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} +import org.apache.hudi.sync.common.HoodieSyncConfig +import org.apache.hudi.sync.common.util.ConfigUtils +import org.apache.hudi.util.JFunction import org.apache.log4j.LogManager +import org.apache.spark.sql.execution.datasources.{DataSourceUtils => SparkDataSourceUtils} + +import java.util.function.{Function => JavaFunction} +import scala.collection.JavaConverters._ +import scala.language.implicitConversions /** - * List of options that can be passed to the Hoodie datasource, - * in addition to the hoodie client configs - */ + * List of options that can be passed to the Hoodie datasource, + * in addition to the hoodie client configs + */ /** * Options supported for reading hoodie tables. */ object DataSourceReadOptions { + import DataSourceOptionsHelper._ - private val log = LogManager.getLogger(DataSourceReadOptions.getClass) - - /** - * Whether data needs to be read, in - * - * 1) Snapshot mode (obtain latest view, based on row & columnar data) - * 2) incremental mode (new data since an instantTime) - * 3) Read Optimized mode (obtain latest view, based on columnar data) - * - * Default: snapshot - */ - val QUERY_TYPE_OPT_KEY = "hoodie.datasource.query.type" val QUERY_TYPE_SNAPSHOT_OPT_VAL = "snapshot" val QUERY_TYPE_READ_OPTIMIZED_OPT_VAL = "read_optimized" val QUERY_TYPE_INCREMENTAL_OPT_VAL = "incremental" - val DEFAULT_QUERY_TYPE_OPT_VAL: String = QUERY_TYPE_SNAPSHOT_OPT_VAL + val QUERY_TYPE: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.query.type") + .defaultValue(QUERY_TYPE_SNAPSHOT_OPT_VAL) + .withAlternatives("hoodie.datasource.view.type") + .withValidValues(QUERY_TYPE_SNAPSHOT_OPT_VAL, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, QUERY_TYPE_INCREMENTAL_OPT_VAL) + .withDocumentation("Whether data needs to be read, in incremental mode (new data since an instantTime) " + + "(or) Read Optimized mode (obtain latest view, based on base files) (or) Snapshot mode " + + "(obtain latest view, by merging base and (if any) log files)") - /** - * For Snapshot query on merge on read table. Use this key to define the payload class. 
- */ - val REALTIME_MERGE_OPT_KEY = "hoodie.datasource.merge.type" val REALTIME_SKIP_MERGE_OPT_VAL = "skip_merge" val REALTIME_PAYLOAD_COMBINE_OPT_VAL = "payload_combine" + val REALTIME_MERGE: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.merge.type") + .defaultValue(REALTIME_PAYLOAD_COMBINE_OPT_VAL) + .withValidValues(REALTIME_SKIP_MERGE_OPT_VAL, REALTIME_PAYLOAD_COMBINE_OPT_VAL) + .withDocumentation("For Snapshot query on merge on read table, control whether we invoke the record " + + s"payload implementation to merge (${REALTIME_PAYLOAD_COMBINE_OPT_VAL}) or skip merging altogether" + + s"${REALTIME_SKIP_MERGE_OPT_VAL}") + + val READ_PATHS: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.paths") + .noDefaultValue() + .withDocumentation("Comma separated list of file paths to read within a Hudi table.") + + val READ_PRE_COMBINE_FIELD = HoodieWriteConfig.PRECOMBINE_FIELD_NAME + + val ENABLE_HOODIE_FILE_INDEX: ConfigProperty[Boolean] = ConfigProperty + .key("hoodie.file.index.enable") + .defaultValue(true) + .deprecatedAfter("0.11.0") + .withDocumentation("Enables use of the spark file index implementation for Hudi, " + + "that speeds up listing of large tables.") + + val BEGIN_INSTANTTIME: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.begin.instanttime") + .noDefaultValue() + .withDocumentation("Instant time to start incrementally pulling data from. The instanttime here need not necessarily " + + "correspond to an instant on the timeline. New data written with an instant_time > BEGIN_INSTANTTIME are fetched out. " + + "For e.g: ‘20170901080000’ will get all new data written after Sep 1, 2017 08:00AM.") + + val END_INSTANTTIME: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.end.instanttime") + .noDefaultValue() + .withDocumentation("Instant time to limit incrementally fetched data to. " + + "New data written with an instant_time <= END_INSTANTTIME are fetched out.") + + val INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.schema.use.end.instanttime") + .defaultValue("false") + .withDocumentation("Uses end instant schema when incrementally fetched data to. Default: users latest instant schema.") + + val PUSH_DOWN_INCR_FILTERS: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.incr.filters") + .defaultValue("") + .withDocumentation("For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies " + + "opaque map functions, filters appearing late in the sequence of transformations cannot be automatically " + + "pushed down. This option allows setting filters directly on Hoodie Source.") + + val INCR_PATH_GLOB: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.incr.path.glob") + .defaultValue("") + .withDocumentation("For the use-cases like users only want to incremental pull from certain partitions " + + "instead of the full table. 
This option allows using glob pattern to directly filter on path.") + + val TIME_TRAVEL_AS_OF_INSTANT: ConfigProperty[String] = HoodieCommonConfig.TIMESTAMP_AS_OF + + val ENABLE_DATA_SKIPPING: ConfigProperty[Boolean] = ConfigProperty + .key("hoodie.enable.data.skipping") + .defaultValue(false) + .sinceVersion("0.10.0") + .withDocumentation("Enables data-skipping allowing queries to leverage indexes to reduce the search space by " + + "skipping over files") + + val EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH: ConfigProperty[Boolean] = + ConfigProperty.key("hoodie.datasource.read.extract.partition.values.from.path") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("When set to true, values for partition columns (partition values) will be extracted" + + " from physical partition path (default Spark behavior). When set to false partition values will be" + + " read from the data file (in Hudi partition columns are persisted by default)." + + " This config is a fallback allowing to preserve existing behavior, and should not be used otherwise.") + + val INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.incr.fallback.fulltablescan.enable") + .defaultValue("false") + .withDocumentation("When doing an incremental query whether we should fall back to full table scans if file does not exist.") + + val SCHEMA_EVOLUTION_ENABLED: ConfigProperty[Boolean] = HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE + + /** @deprecated Use {@link QUERY_TYPE} and its methods instead */ + @Deprecated + val QUERY_TYPE_OPT_KEY = QUERY_TYPE.key() + /** @deprecated Use {@link QUERY_TYPE} and its methods instead */ + @Deprecated + val DEFAULT_QUERY_TYPE_OPT_VAL: String = QUERY_TYPE_SNAPSHOT_OPT_VAL + /** @deprecated Use {@link REALTIME_MERGE} and its methods instead */ + @Deprecated + val REALTIME_MERGE_OPT_KEY = REALTIME_MERGE.key() + /** @deprecated Use {@link REALTIME_MERGE} and its methods instead */ + @Deprecated val DEFAULT_REALTIME_MERGE_OPT_VAL = REALTIME_PAYLOAD_COMBINE_OPT_VAL - - val READ_PATHS_OPT_KEY = "hoodie.datasource.read.paths" - + /** @deprecated Use {@link READ_PATHS} and its methods instead */ + @Deprecated + val READ_PATHS_OPT_KEY = READ_PATHS.key() + /** @deprecated Use {@link QUERY_TYPE} and its methods instead */ @Deprecated val VIEW_TYPE_OPT_KEY = "hoodie.datasource.view.type" @Deprecated @@ -73,69 +168,30 @@ object DataSourceReadOptions { val VIEW_TYPE_REALTIME_OPT_VAL = "realtime" @Deprecated val DEFAULT_VIEW_TYPE_OPT_VAL = VIEW_TYPE_READ_OPTIMIZED_OPT_VAL - - /** - * This eases migration from old configs to new configs. - */ - def translateViewTypesToQueryTypes(optParams: Map[String, String]) : Map[String, String] = { - val translation = Map(VIEW_TYPE_READ_OPTIMIZED_OPT_VAL -> QUERY_TYPE_SNAPSHOT_OPT_VAL, - VIEW_TYPE_INCREMENTAL_OPT_VAL -> QUERY_TYPE_INCREMENTAL_OPT_VAL, - VIEW_TYPE_REALTIME_OPT_VAL -> QUERY_TYPE_SNAPSHOT_OPT_VAL) - if (!optParams.contains(QUERY_TYPE_OPT_KEY)) { - if (optParams.contains(VIEW_TYPE_OPT_KEY)) { - log.warn(VIEW_TYPE_OPT_KEY + " is deprecated and will be removed in a later release. Please use " + QUERY_TYPE_OPT_KEY) - optParams ++ Map(QUERY_TYPE_OPT_KEY -> translation(optParams(VIEW_TYPE_OPT_KEY))) - } else { - optParams ++ Map(QUERY_TYPE_OPT_KEY -> DEFAULT_QUERY_TYPE_OPT_VAL) - } - } else { - optParams - } - } - - /** - * Instant time to start incrementally pulling data from. The instanttime here need not - * necessarily correspond to an instant on the timeline. 
New data written with an - * `instant_time > BEGIN_INSTANTTIME` are fetched out. For e.g: '20170901080000' will get - * all new data written after Sep 1, 2017 08:00AM. - * - * Default: None (Mandatory in incremental mode) - */ - val BEGIN_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.begin.instanttime" - - - /** - * Instant time to limit incrementally fetched data to. New data written with an - * `instant_time <= END_INSTANTTIME` are fetched out. - * - * Default: latest instant (i.e fetches all new data since begin instant time) - * - */ - val END_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.end.instanttime" - - /** - * If use the end instant schema when incrementally fetched data to. - * - * Default: false (use latest instant schema) - * - */ - val INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.schema.use.end.instanttime" - val DEFAULT_INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_VAL = "false" - - /** - * For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies opaque map functions, - * filters appearing late in the sequence of transformations cannot be automatically pushed down. - * This option allows setting filters directly on Hoodie Source - */ - val PUSH_DOWN_INCR_FILTERS_OPT_KEY = "hoodie.datasource.read.incr.filters" - val DEFAULT_PUSH_DOWN_FILTERS_OPT_VAL = "" - - /** - * For the use-cases like users only want to incremental pull from certain partitions instead of the full table. - * This option allows using glob pattern to directly filter on path. - */ - val INCR_PATH_GLOB_OPT_KEY = "hoodie.datasource.read.incr.path.glob" - val DEFAULT_INCR_PATH_GLOB_OPT_VAL = "" + /** @deprecated Use {@link BEGIN_INSTANTTIME} and its methods instead */ + @Deprecated + val BEGIN_INSTANTTIME_OPT_KEY = BEGIN_INSTANTTIME.key() + /** @deprecated Use {@link END_INSTANTTIME} and its methods instead */ + @Deprecated + val END_INSTANTTIME_OPT_KEY = END_INSTANTTIME.key() + /** @deprecated Use {@link INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME} and its methods instead */ + @Deprecated + val INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY = INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME.key() + /** @deprecated Use {@link INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME} and its methods instead */ + @Deprecated + val DEFAULT_INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_VAL = INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME.defaultValue() + /** @deprecated Use {@link PUSH_DOWN_INCR_FILTERS} and its methods instead */ + @Deprecated + val PUSH_DOWN_INCR_FILTERS_OPT_KEY = PUSH_DOWN_INCR_FILTERS.key() + /** @deprecated Use {@link PUSH_DOWN_INCR_FILTERS} and its methods instead */ + @Deprecated + val DEFAULT_PUSH_DOWN_FILTERS_OPT_VAL = PUSH_DOWN_INCR_FILTERS.defaultValue() + /** @deprecated Use {@link INCR_PATH_GLOB} and its methods instead */ + @Deprecated + val INCR_PATH_GLOB_OPT_KEY = INCR_PATH_GLOB.key() + /** @deprecated Use {@link INCR_PATH_GLOB} and its methods instead */ + @Deprecated + val DEFAULT_INCR_PATH_GLOB_OPT_VAL = INCR_PATH_GLOB.defaultValue() } /** @@ -143,192 +199,625 @@ object DataSourceReadOptions { */ object DataSourceWriteOptions { - private val log = LogManager.getLogger(DataSourceWriteOptions.getClass) + import DataSourceOptionsHelper._ - /** - * The write operation, that this write should do - * - * Default: upsert() - */ - val OPERATION_OPT_KEY = "hoodie.datasource.write.operation" val BULK_INSERT_OPERATION_OPT_VAL = WriteOperationType.BULK_INSERT.value val INSERT_OPERATION_OPT_VAL = WriteOperationType.INSERT.value val 
UPSERT_OPERATION_OPT_VAL = WriteOperationType.UPSERT.value val DELETE_OPERATION_OPT_VAL = WriteOperationType.DELETE.value + val DELETE_PARTITION_OPERATION_OPT_VAL = WriteOperationType.DELETE_PARTITION.value val BOOTSTRAP_OPERATION_OPT_VAL = WriteOperationType.BOOTSTRAP.value val INSERT_OVERWRITE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE.value val INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE_TABLE.value - val DEFAULT_OPERATION_OPT_VAL = UPSERT_OPERATION_OPT_VAL + val OPERATION: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.operation") + .defaultValue(UPSERT_OPERATION_OPT_VAL) + .withValidValues( + WriteOperationType.INSERT.value, + WriteOperationType.INSERT_PREPPED.value, + WriteOperationType.UPSERT.value, + WriteOperationType.UPSERT_PREPPED.value, + WriteOperationType.BULK_INSERT.value, + WriteOperationType.BULK_INSERT_PREPPED.value, + WriteOperationType.DELETE.value, + WriteOperationType.BOOTSTRAP.value, + WriteOperationType.INSERT_OVERWRITE.value, + WriteOperationType.CLUSTER.value, + WriteOperationType.DELETE_PARTITION.value, + WriteOperationType.INSERT_OVERWRITE_TABLE.value, + WriteOperationType.COMPACT.value, + WriteOperationType.INSERT.value, + WriteOperationType.ALTER_SCHEMA.value + ) + .withDocumentation("Whether to do upsert, insert or bulkinsert for the write operation. " + + "Use bulkinsert to load new data into a table, and there on use upsert/insert. " + + "bulk insert uses a disk based write path to scale to load large inputs without need to cache it.") + - /** - * The table type for the underlying data, for this write. - * Note that this can't change across writes. - * - * Default: COPY_ON_WRITE - */ - val TABLE_TYPE_OPT_KEY = "hoodie.datasource.write.table.type" val COW_TABLE_TYPE_OPT_VAL = HoodieTableType.COPY_ON_WRITE.name val MOR_TABLE_TYPE_OPT_VAL = HoodieTableType.MERGE_ON_READ.name - val DEFAULT_TABLE_TYPE_OPT_VAL = COW_TABLE_TYPE_OPT_VAL - - @Deprecated - val STORAGE_TYPE_OPT_KEY = "hoodie.datasource.write.storage.type" - @Deprecated - val COW_STORAGE_TYPE_OPT_VAL = HoodieTableType.COPY_ON_WRITE.name - @Deprecated - val MOR_STORAGE_TYPE_OPT_VAL = HoodieTableType.MERGE_ON_READ.name - @Deprecated - val DEFAULT_STORAGE_TYPE_OPT_VAL = COW_STORAGE_TYPE_OPT_VAL + val TABLE_TYPE: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.table.type") + .defaultValue(COW_TABLE_TYPE_OPT_VAL) + .withValidValues(COW_TABLE_TYPE_OPT_VAL, MOR_TABLE_TYPE_OPT_VAL) + .withAlternatives("hoodie.datasource.write.storage.type") + .withDocumentation("The table type for the underlying data, for this write. 
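As a rough illustration of the operation and table-type options above, the sketch below bulk-inserts an initial load into a hypothetical merge-on-read table and then upserts deltas; the table name, paths, and DataFrames are placeholders, and the frames are assumed to carry the default 'uuid'/'ts' key and precombine columns.

import org.apache.spark.sql.{DataFrame, SaveMode}

object HudiWriteOperationSketch {
  // Initial load of a hypothetical MOR table using the disk-based bulk_insert path.
  def initialLoad(inputDf: DataFrame, basePath: String): Unit = {
    inputDf.write.format("hudi")
      .option("hoodie.table.name", "trips")
      .option("hoodie.datasource.write.table.type", "MERGE_ON_READ")
      .option("hoodie.datasource.write.operation", "bulk_insert")
      .mode(SaveMode.Overwrite)
      .save(basePath)
  }

  // Subsequent delta writes; upsert is the default operation, shown explicitly for clarity.
  def applyDeltas(deltaDf: DataFrame, basePath: String): Unit = {
    deltaDf.write.format("hudi")
      .option("hoodie.table.name", "trips")
      .option("hoodie.datasource.write.operation", "upsert")
      .mode(SaveMode.Append)
      .save(basePath)
  }
}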
This can’t change between writes.") - def translateStorageTypeToTableType(optParams: Map[String, String]) : Map[String, String] = { - if (optParams.contains(STORAGE_TYPE_OPT_KEY) && !optParams.contains(TABLE_TYPE_OPT_KEY)) { - log.warn(STORAGE_TYPE_OPT_KEY + " is deprecated and will be removed in a later release; Please use " + TABLE_TYPE_OPT_KEY) - optParams ++ Map(TABLE_TYPE_OPT_KEY -> optParams(STORAGE_TYPE_OPT_KEY)) - } else { - optParams + /** + * Translate spark parameters to hudi parameters + * + * @param optParams Parameters to be translated + * @return Parameters after translation + */ + def translateSqlOptions(optParams: Map[String, String]): Map[String, String] = { + var translatedOptParams = optParams + // translate the api partitionBy of spark DataFrameWriter to PARTITIONPATH_FIELD + if (optParams.contains(SparkDataSourceUtils.PARTITIONING_COLUMNS_KEY)) { + val partitionColumns = optParams.get(SparkDataSourceUtils.PARTITIONING_COLUMNS_KEY) + .map(SparkDataSourceUtils.decodePartitioningColumns) + .getOrElse(Nil) + val keyGeneratorClass = optParams.getOrElse(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key(), + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.defaultValue) + + val partitionPathField = + keyGeneratorClass match { + // Only CustomKeyGenerator needs special treatment, because it needs to be specified in a way + // such as "field1:PartitionKeyType1,field2:PartitionKeyType2". + // partitionBy can specify the partition like this: partitionBy("p1", "p2:SIMPLE", "p3:TIMESTAMP") + case c if c == classOf[CustomKeyGenerator].getName => + partitionColumns.map(e => { + if (e.contains(":")) { + e + } else { + s"$e:SIMPLE" + } + }).mkString(",") + case _ => + partitionColumns.mkString(",") + } + translatedOptParams = optParams ++ Map(PARTITIONPATH_FIELD.key -> partitionPathField) } + translatedOptParams } + val TABLE_NAME: ConfigProperty[String] = ConfigProperty + .key(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY) + .noDefaultValue() + .withDocumentation("Table name for the datasource write. Also used to register the table into meta stores.") /** - * Hive table name, to register the table into. - * - * Default: None (mandatory) - */ - val TABLE_NAME_OPT_KEY = "hoodie.datasource.write.table.name" + * Field used in preCombining before actual write. When two records have the same + * key value, we will pick the one with the largest value for the precombine field, + * determined by Object.compareTo(..) + */ + val PRECOMBINE_FIELD = HoodieWriteConfig.PRECOMBINE_FIELD_NAME /** - * Field used in preCombining before actual write. When two records have the same - * key value, we will pick the one with the largest value for the precombine field, - * determined by Object.compareTo(..) - */ - val PRECOMBINE_FIELD_OPT_KEY = "hoodie.datasource.write.precombine.field" - val DEFAULT_PRECOMBINE_FIELD_OPT_VAL = "ts" - + * Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. + * This will render any value set for `PRECOMBINE_FIELD_OPT_VAL` in-effective + */ + val PAYLOAD_CLASS_NAME = HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME /** - * Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. - * This will render any value set for `PRECOMBINE_FIELD_OPT_VAL` in-effective - */ - val PAYLOAD_CLASS_OPT_KEY = "hoodie.datasource.write.payload.class" - val DEFAULT_PAYLOAD_OPT_VAL = classOf[OverwriteWithLatestAvroPayload].getName + * Record key field. 
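The translateSqlOptions logic above can be illustrated with a hedged write sketch: columns passed to Spark's partitionBy are folded into hoodie.datasource.write.partitionpath.field, and bare names get a ':SIMPLE' suffix when CustomKeyGenerator is configured. All names and paths here are hypothetical, and a real TIMESTAMP partition field would additionally need the timestamp key-generator settings.

import org.apache.spark.sql.{DataFrame, SaveMode}

object HudiPartitionByTranslationSketch {
  def write(df: DataFrame, basePath: String): Unit = {
    df.write.format("hudi")
      .option("hoodie.table.name", "trips") // hypothetical table name
      .option("hoodie.datasource.write.keygenerator.class",
        "org.apache.hudi.keygen.CustomKeyGenerator")
      // After translation this is equivalent to setting
      // "hoodie.datasource.write.partitionpath.field" = "region:SIMPLE,event_ts:TIMESTAMP".
      // The extra timestamp key-generator properties required for a TIMESTAMP field
      // are omitted from this sketch.
      .partitionBy("region", "event_ts:TIMESTAMP")
      .mode(SaveMode.Append)
      .save(basePath)
  }
}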
Value to be used as the `recordKey` component of `HoodieKey`. Actual value + * will be obtained by invoking .toString() on the field value. Nested fields can be specified using + * the dot notation eg: `a.b.c` + * + */ + val RECORDKEY_FIELD = KeyGeneratorOptions.RECORDKEY_FIELD_NAME /** - * Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value - * will be obtained by invoking .toString() on the field value. Nested fields can be specified using - * the dot notation eg: `a.b.c` - * - */ - val RECORDKEY_FIELD_OPT_KEY = KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY - val DEFAULT_RECORDKEY_FIELD_OPT_VAL = "uuid" + * Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual + * value obtained by invoking .toString() + */ + val PARTITIONPATH_FIELD = KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME /** - * Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual - * value obtained by invoking .toString() - */ - val PARTITIONPATH_FIELD_OPT_KEY = KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY - val DEFAULT_PARTITIONPATH_FIELD_OPT_VAL = "partitionpath" + * Flag to indicate whether to use Hive style partitioning. + * If set true, the names of partition folders follow = format. + * By default false (the names of partition folders are only partition values) + */ + val HIVE_STYLE_PARTITIONING = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE /** - * Flag to indicate whether to use Hive style partitioning. - * If set true, the names of partition folders follow = format. - * By default false (the names of partition folders are only partition values) - */ - val HIVE_STYLE_PARTITIONING_OPT_KEY = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_OPT_KEY - val DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL = KeyGeneratorOptions.DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL - val URL_ENCODE_PARTITIONING_OPT_KEY = KeyGeneratorOptions.URL_ENCODE_PARTITIONING_OPT_KEY - val DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL = KeyGeneratorOptions.DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL - /** - * Key generator class, that implements will extract the key out of incoming record - * + * Key generator class, that implements will extract the key out of incoming record. */ - val KEYGENERATOR_CLASS_OPT_KEY = "hoodie.datasource.write.keygenerator.class" - val DEFAULT_KEYGENERATOR_CLASS_OPT_VAL = classOf[SimpleKeyGenerator].getName + val keyGeneratorInferFunc = JFunction.toJavaFunction((config: HoodieConfig) => { + Option.of(DataSourceOptionsHelper.inferKeyGenClazz(config.getProps)) + }) + + val KEYGENERATOR_CLASS_NAME: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.keygenerator.class") + .defaultValue(classOf[SimpleKeyGenerator].getName) + .withInferFunction(keyGeneratorInferFunc) + .withDocumentation("Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator`") + + val KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED: ConfigProperty[String] = KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED + + val ENABLE_ROW_WRITER: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.row.writer.enable") + .defaultValue("true") + .withDocumentation("When set to true, will perform write operations directly using the spark native " + + "`Row` representation, avoiding any additional conversion costs.") /** - * When set to true, will perform write operations directly using the spark native `Row` representation. 
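A minimal sketch of the key-related options above, assuming hypothetical field names and paths; with one record key, two partition fields, and no explicit key generator class, the infer function above would pick ComplexKeyGenerator.

import org.apache.spark.sql.{DataFrame, SaveMode}

object HudiKeyFieldOptionsSketch {
  def write(df: DataFrame, basePath: String): Unit = {
    df.write.format("hudi")
      .option("hoodie.table.name", "trips") // hypothetical table name
      .option("hoodie.datasource.write.recordkey.field", "uuid")
      .option("hoodie.datasource.write.precombine.field", "ts")
      .option("hoodie.datasource.write.partitionpath.field", "region,city")
      // Partition folders become region=<value>/city=<value> rather than bare values.
      .option("hoodie.datasource.write.hive_style_partitioning", "true")
      // With one record key and two partition fields, and no explicit key generator,
      // ComplexKeyGenerator would be inferred.
      .mode(SaveMode.Append)
      .save(basePath)
  }
}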
- * By default, false (will be enabled as default in a future release) + * Enable the bulk insert for sql insert statement. */ - val ENABLE_ROW_WRITER_OPT_KEY = "hoodie.datasource.write.row.writer.enable" - val DEFAULT_ENABLE_ROW_WRITER_OPT_VAL = "false" + val SQL_ENABLE_BULK_INSERT: ConfigProperty[String] = ConfigProperty + .key("hoodie.sql.bulk.insert.enable") + .defaultValue("false") + .withDocumentation("When set to true, the sql insert statement will use bulk insert.") + + val SQL_INSERT_MODE: ConfigProperty[String] = ConfigProperty + .key("hoodie.sql.insert.mode") + .defaultValue("upsert") + .withDocumentation("Insert mode when insert data to pk-table. The optional modes are: upsert, strict and non-strict." + + "For upsert mode, insert statement do the upsert operation for the pk-table which will update the duplicate record." + + "For strict mode, insert statement will keep the primary key uniqueness constraint which do not allow duplicate record." + + "While for non-strict mode, hudi just do the insert operation for the pk-table.") + + val COMMIT_METADATA_KEYPREFIX: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.commitmeta.key.prefix") + .defaultValue("_") + .withDocumentation("Option keys beginning with this prefix, are automatically added to the commit/deltacommit metadata. " + + "This is useful to store checkpointing information, in a consistent way with the hudi timeline") + + val INSERT_DROP_DUPS: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.insert.drop.duplicates") + .defaultValue("false") + .withDocumentation("If set to true, filters out all duplicate records from incoming dataframe, during insert operations.") + + val PARTITIONS_TO_DELETE: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.partitions.to.delete") + .noDefaultValue() + .withDocumentation("Comma separated list of partitions to delete") + + val STREAMING_RETRY_CNT: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.streaming.retry.count") + .defaultValue("3") + .withDocumentation("Config to indicate how many times streaming job should retry for a failed micro batch.") + + val STREAMING_RETRY_INTERVAL_MS: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.streaming.retry.interval.ms") + .defaultValue("2000") + .withDocumentation(" Config to indicate how long (by millisecond) before a retry should issued for failed microbatch") /** - * Option keys beginning with this prefix, are automatically added to the commit/deltacommit metadata. - * This is useful to store checkpointing information, in a consistent way with the hoodie timeline - */ - val COMMIT_METADATA_KEYPREFIX_OPT_KEY = "hoodie.datasource.write.commitmeta.key.prefix" - val DEFAULT_COMMIT_METADATA_KEYPREFIX_OPT_VAL = "_" + * By default false. If users prefer streaming progress over data integrity, can set this to true. + */ + val STREAMING_IGNORE_FAILED_BATCH: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.write.streaming.ignore.failed.batch") + .defaultValue("false") + .withDocumentation("Config to indicate whether to ignore any non exception error (e.g. writestatus error)" + + " within a streaming microbatch. Turning this on, could hide the write status errors while the spark checkpoint moves ahead." + + "So, would recommend users to use this with caution.") - /** - * Flag to indicate whether to drop duplicates upon insert. - * By default insert will accept duplicates, to gain extra performance. 
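The streaming retry options above might be wired into a Structured Streaming sink roughly as follows; the source DataFrame, checkpoint location, table name, and paths are placeholders.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}

object HudiStreamingSinkSketch {
  def startQuery(streamingDf: DataFrame, basePath: String): StreamingQuery = {
    streamingDf.writeStream.format("hudi")
      .option("hoodie.table.name", "trips_stream") // hypothetical
      .option("hoodie.datasource.write.recordkey.field", "uuid")
      .option("hoodie.datasource.write.precombine.field", "ts")
      // Retry a failed micro-batch up to 5 times, waiting 5s between attempts.
      .option("hoodie.datasource.write.streaming.retry.count", "5")
      .option("hoodie.datasource.write.streaming.retry.interval.ms", "5000")
      // Keep the default of failing the query rather than silently skipping a batch.
      .option("hoodie.datasource.write.streaming.ignore.failed.batch", "false")
      .option("checkpointLocation", "/tmp/checkpoints/trips_stream")
      .outputMode(OutputMode.Append())
      .trigger(Trigger.ProcessingTime("30 seconds"))
      .start(basePath)
  }
}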
- */ - val INSERT_DROP_DUPS_OPT_KEY = "hoodie.datasource.write.insert.drop.duplicates" - val DEFAULT_INSERT_DROP_DUPS_OPT_VAL = "false" + val META_SYNC_CLIENT_TOOL_CLASS_NAME: ConfigProperty[String] = ConfigProperty + .key("hoodie.meta.sync.client.tool.class") + .defaultValue(classOf[HiveSyncTool].getName) + .withDocumentation("Sync tool class name used to sync to metastore. Defaults to Hive.") - /** - * Flag to indicate how many times streaming job should retry for a failed microbatch - * By default 3 - */ - val STREAMING_RETRY_CNT_OPT_KEY = "hoodie.datasource.write.streaming.retry.count" - val DEFAULT_STREAMING_RETRY_CNT_OPT_VAL = "3" + val RECONCILE_SCHEMA: ConfigProperty[Boolean] = HoodieCommonConfig.RECONCILE_SCHEMA + // HIVE SYNC SPECIFIC CONFIGS + // NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes + // unexpected issues with config getting reset /** - * Flag to indicate how long (by millisecond) before a retry should issued for failed microbatch - * By default 2000 and it will be doubled by every retry - */ - val STREAMING_RETRY_INTERVAL_MS_OPT_KEY = "hoodie.datasource.write.streaming.retry.interval.ms" - val DEFAULT_STREAMING_RETRY_INTERVAL_MS_OPT_VAL = "2000" + * @deprecated Hive Specific Configs are moved to {@link HiveSyncConfig} + */ + @Deprecated + val HIVE_SYNC_ENABLED: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_ENABLED + @Deprecated + val META_SYNC_ENABLED: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ENABLED + @Deprecated + val HIVE_DATABASE: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_DATABASE_NAME + @Deprecated + val HIVE_TABLE: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_TABLE_NAME + @Deprecated + val HIVE_BASE_FILE_FORMAT: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT + @Deprecated + val HIVE_USER: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USER + @Deprecated + val HIVE_PASS: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_PASS + @Deprecated + val HIVE_URL: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_URL + @Deprecated + val METASTORE_URIS: ConfigProperty[String] = HiveSyncConfigHolder.METASTORE_URIS + @Deprecated + val HIVE_PARTITION_FIELDS: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS + @Deprecated + val HIVE_PARTITION_EXTRACTOR_CLASS: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS + @Deprecated + val HIVE_ASSUME_DATE_PARTITION: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION + @Deprecated + val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT + + /** @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 */ + @Deprecated + val HIVE_USE_JDBC: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USE_JDBC + @Deprecated + val HIVE_AUTO_CREATE_DATABASE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE + @Deprecated + val HIVE_IGNORE_EXCEPTIONS: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS + @Deprecated + val HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE + @Deprecated + val HIVE_SUPPORT_TIMESTAMP_TYPE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE /** - * Flag to indicate whether to ignore any non exception error (e.g. 
writestatus error) - * within a streaming microbatch - * By default true (in favor of streaming progressing over data integrity) - */ - val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = "hoodie.datasource.write.streaming.ignore.failed.batch" - val DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL = "true" - val META_SYNC_CLIENT_TOOL_CLASS = "hoodie.meta.sync.client.tool.class" - val DEFAULT_META_SYNC_CLIENT_TOOL_CLASS = classOf[HiveSyncTool].getName + * Flag to indicate whether to use conditional syncing in HiveSync. + * If set true, the Hive sync procedure will only run if partition or schema changes are detected. + * By default true. + */ + @Deprecated + val HIVE_CONDITIONAL_SYNC: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC + @Deprecated + val HIVE_TABLE_PROPERTIES: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES + @Deprecated + val HIVE_TABLE_SERDE_PROPERTIES: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES + @Deprecated + val HIVE_SYNC_AS_DATA_SOURCE_TABLE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE - // HIVE SYNC SPECIFIC CONFIGS - //NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes - // unexpected issues with config getting reset - val HIVE_SYNC_ENABLED_OPT_KEY = "hoodie.datasource.hive_sync.enable" - val META_SYNC_ENABLED_OPT_KEY = "hoodie.datasource.meta.sync.enable" - val HIVE_DATABASE_OPT_KEY = "hoodie.datasource.hive_sync.database" - val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table" - val HIVE_BASE_FILE_FORMAT_OPT_KEY = "hoodie.datasource.hive_sync.base_file_format" - val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username" - val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password" - val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcurl" - val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields" - val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class" - val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning" - val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = "hoodie.datasource.hive_sync.use_pre_apache_input_format" - val HIVE_USE_JDBC_OPT_KEY = "hoodie.datasource.hive_sync.use_jdbc" - val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = "hoodie.datasource.hive_sync.auto_create_database" - val HIVE_SKIP_RO_SUFFIX = "hoodie.datasource.hive_sync.skip_ro_suffix" - val HIVE_SUPPORT_TIMESTAMP = "hoodie.datasource.hive_sync.support_timestamp" - - // DEFAULT FOR HIVE SPECIFIC CONFIGS - val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = "false" - val DEFAULT_META_SYNC_ENABLED_OPT_VAL = "false" - val DEFAULT_HIVE_DATABASE_OPT_VAL = "default" - val DEFAULT_HIVE_TABLE_OPT_VAL = "unknown" - val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = "PARQUET" - val DEFAULT_HIVE_USER_OPT_VAL = "hive" - val DEFAULT_HIVE_PASS_OPT_VAL = "hive" - val DEFAULT_HIVE_URL_OPT_VAL = "jdbc:hive2://localhost:10000" - val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = "" - val DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL = classOf[SlashEncodedDayPartitionValueExtractor].getCanonicalName - val DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL = "false" - val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false" - val DEFAULT_HIVE_USE_JDBC_OPT_VAL = "true" - val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = "true" - val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = "false" - val DEFAULT_HIVE_SUPPORT_TIMESTAMP = "false" + // Create table as managed table + @Deprecated + val 
HIVE_CREATE_MANAGED_TABLE: ConfigProperty[java.lang.Boolean] = HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE + @Deprecated + val HIVE_BATCH_SYNC_PARTITION_NUM: ConfigProperty[java.lang.Integer] = HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM + @Deprecated + val HIVE_SYNC_MODE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_MODE + @Deprecated + val HIVE_SYNC_BUCKET_SYNC: ConfigProperty[java.lang.Boolean] = HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC + @Deprecated + val HIVE_SYNC_COMMENT: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_COMMENT; // Async Compaction - Enabled by default for MOR - val ASYNC_COMPACT_ENABLE_OPT_KEY = "hoodie.datasource.compaction.async.enable" - val DEFAULT_ASYNC_COMPACT_ENABLE_OPT_VAL = "true" + val ASYNC_COMPACT_ENABLE: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.compaction.async.enable") + .defaultValue("true") + .withDocumentation("Controls whether async compaction should be turned on for MOR table writing.") + + val INLINE_CLUSTERING_ENABLE = HoodieClusteringConfig.INLINE_CLUSTERING + + val ASYNC_CLUSTERING_ENABLE = HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE + + val KAFKA_AVRO_VALUE_DESERIALIZER_CLASS: ConfigProperty[String] = ConfigProperty + .key("hoodie.deltastreamer.source.kafka.value.deserializer.class") + .defaultValue("io.confluent.kafka.serializers.KafkaAvroDeserializer") + .sinceVersion("0.9.0") + .withDocumentation("This class is used by kafka client to deserialize the records") + + val DROP_PARTITION_COLUMNS: ConfigProperty[Boolean] = HoodieTableConfig.DROP_PARTITION_COLUMNS + + /** @deprecated Use {@link HIVE_ASSUME_DATE_PARTITION} and its methods instead */ + @Deprecated + val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key() + /** @deprecated Use {@link HIVE_USE_PRE_APACHE_INPUT_FORMAT} and its methods instead */ + @Deprecated + val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key() + /** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */ + @Deprecated + val HIVE_USE_JDBC_OPT_KEY = HiveSyncConfigHolder.HIVE_USE_JDBC.key() + /** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */ + @Deprecated + val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE.key() + /** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */ + @Deprecated + val HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS.key() + /** @deprecated Use {@link STREAMING_IGNORE_FAILED_BATCH} and its methods instead */ + @Deprecated + val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = STREAMING_IGNORE_FAILED_BATCH.key() + /** @deprecated Use {@link STREAMING_IGNORE_FAILED_BATCH} and its methods instead */ + @Deprecated + val DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL = STREAMING_IGNORE_FAILED_BATCH.defaultValue() + /** @deprecated Use {@link META_SYNC_CLIENT_TOOL_CLASS_NAME} and its methods instead */ + @Deprecated + val META_SYNC_CLIENT_TOOL_CLASS = META_SYNC_CLIENT_TOOL_CLASS_NAME.key() + /** @deprecated Use {@link META_SYNC_CLIENT_TOOL_CLASS_NAME} and its methods instead */ + @Deprecated + val DEFAULT_META_SYNC_CLIENT_TOOL_CLASS = META_SYNC_CLIENT_TOOL_CLASS_NAME.defaultValue() + /** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */ + @Deprecated + val HIVE_SYNC_ENABLED_OPT_KEY = HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key() + /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */ + @Deprecated + val 
META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_ENABLED.key() + /** @deprecated Use {@link HIVE_DATABASE} and its methods instead */ + @Deprecated + val HIVE_DATABASE_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key() + /** @deprecated Use {@link HIVE_TABLE} and its methods instead */ + @Deprecated + val HIVE_TABLE_OPT_KEY = HoodieSyncConfig.META_SYNC_TABLE_NAME.key() + /** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */ + @Deprecated + val HIVE_BASE_FILE_FORMAT_OPT_KEY = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key() + /** @deprecated Use {@link HIVE_USER} and its methods instead */ + @Deprecated + val HIVE_USER_OPT_KEY = HiveSyncConfigHolder.HIVE_USER.key() + /** @deprecated Use {@link HIVE_PASS} and its methods instead */ + @Deprecated + val HIVE_PASS_OPT_KEY = HiveSyncConfigHolder.HIVE_PASS.key() + /** @deprecated Use {@link HIVE_URL} and its methods instead */ + @Deprecated + val HIVE_URL_OPT_KEY = HiveSyncConfigHolder.HIVE_URL.key() + /** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */ + @Deprecated + val HIVE_PARTITION_FIELDS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key() + /** @deprecated Use {@link HIVE_PARTITION_EXTRACTOR_CLASS} and its methods instead */ + @Deprecated + val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key() + + /** @deprecated Use {@link KEYGENERATOR_CLASS_NAME} and its methods instead */ + @Deprecated + val DEFAULT_KEYGENERATOR_CLASS_OPT_VAL = KEYGENERATOR_CLASS_NAME.defaultValue() + /** @deprecated Use {@link KEYGENERATOR_CLASS_NAME} and its methods instead */ + @Deprecated + val KEYGENERATOR_CLASS_OPT_KEY = HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key() + /** @deprecated Use {@link ENABLE_ROW_WRITER} and its methods instead */ + @Deprecated + val ENABLE_ROW_WRITER_OPT_KEY = ENABLE_ROW_WRITER.key() + /** @deprecated Use {@link ENABLE_ROW_WRITER} and its methods instead */ + @Deprecated + val DEFAULT_ENABLE_ROW_WRITER_OPT_VAL = ENABLE_ROW_WRITER.defaultValue() + /** @deprecated Use {@link HIVE_STYLE_PARTITIONING} and its methods instead */ + @Deprecated + val HIVE_STYLE_PARTITIONING_OPT_KEY = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key() + /** @deprecated Use {@link HIVE_STYLE_PARTITIONING} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL = HIVE_STYLE_PARTITIONING.defaultValue() + + val URL_ENCODE_PARTITIONING = KeyGeneratorOptions.URL_ENCODE_PARTITIONING + /** @deprecated Use {@link URL_ENCODE_PARTITIONING} and its methods instead */ + @Deprecated + val URL_ENCODE_PARTITIONING_OPT_KEY = KeyGeneratorOptions.URL_ENCODE_PARTITIONING.key() + /** @deprecated Use {@link URL_ENCODE_PARTITIONING} and its methods instead */ + @Deprecated + val DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL = URL_ENCODE_PARTITIONING.defaultValue() + /** @deprecated Use {@link COMMIT_METADATA_KEYPREFIX} and its methods instead */ + @Deprecated + val COMMIT_METADATA_KEYPREFIX_OPT_KEY = COMMIT_METADATA_KEYPREFIX.key() + /** @deprecated Use {@link COMMIT_METADATA_KEYPREFIX} and its methods instead */ + @Deprecated + val DEFAULT_COMMIT_METADATA_KEYPREFIX_OPT_VAL = COMMIT_METADATA_KEYPREFIX.defaultValue() + /** @deprecated Use {@link INSERT_DROP_DUPS} and its methods instead */ + @Deprecated + val INSERT_DROP_DUPS_OPT_KEY = INSERT_DROP_DUPS.key() + /** @deprecated Use {@link INSERT_DROP_DUPS} and its methods instead */ + @Deprecated + val DEFAULT_INSERT_DROP_DUPS_OPT_VAL = INSERT_DROP_DUPS.defaultValue() + /** @deprecated 
Use {@link STREAMING_RETRY_CNT} and its methods instead */ + @Deprecated + val STREAMING_RETRY_CNT_OPT_KEY = STREAMING_RETRY_CNT.key() + /** @deprecated Use {@link STREAMING_RETRY_CNT} and its methods instead */ + @Deprecated + val DEFAULT_STREAMING_RETRY_CNT_OPT_VAL = STREAMING_RETRY_CNT.defaultValue() + /** @deprecated Use {@link STREAMING_RETRY_INTERVAL_MS} and its methods instead */ + @Deprecated + val STREAMING_RETRY_INTERVAL_MS_OPT_KEY = STREAMING_RETRY_INTERVAL_MS.key() + /** @deprecated Use {@link STREAMING_RETRY_INTERVAL_MS} and its methods instead */ + @Deprecated + val DEFAULT_STREAMING_RETRY_INTERVAL_MS_OPT_VAL = STREAMING_RETRY_INTERVAL_MS.defaultValue() + + /** @deprecated Use {@link RECORDKEY_FIELD} and its methods instead */ + @Deprecated + val RECORDKEY_FIELD_OPT_KEY = KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key() + /** @deprecated Use {@link RECORDKEY_FIELD} and its methods instead */ + @Deprecated + val DEFAULT_RECORDKEY_FIELD_OPT_VAL = RECORDKEY_FIELD.defaultValue() + /** @deprecated Use {@link PARTITIONPATH_FIELD} and its methods instead */ + @Deprecated + val PARTITIONPATH_FIELD_OPT_KEY = KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key() + /** @deprecated Use {@link PARTITIONPATH_FIELD} and its methods instead */ + @Deprecated + val DEFAULT_PARTITIONPATH_FIELD_OPT_VAL = null + + /** @deprecated Use {@link TABLE_NAME} and its methods instead */ + @Deprecated + val TABLE_NAME_OPT_KEY = TABLE_NAME.key() + /** @deprecated Use {@link PRECOMBINE_FIELD} and its methods instead */ + @Deprecated + val PRECOMBINE_FIELD_OPT_KEY = HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key() + /** @deprecated Use {@link PRECOMBINE_FIELD} and its methods instead */ + @Deprecated + val DEFAULT_PRECOMBINE_FIELD_OPT_VAL = PRECOMBINE_FIELD.defaultValue() + + /** @deprecated Use {@link HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME} and its methods instead */ + @Deprecated + val PAYLOAD_CLASS_OPT_KEY = HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME.key() + /** @deprecated Use {@link HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME} and its methods instead */ + @Deprecated + val DEFAULT_PAYLOAD_OPT_VAL = PAYLOAD_CLASS_NAME.defaultValue() + + /** @deprecated Use {@link TABLE_TYPE} and its methods instead */ + @Deprecated + val TABLE_TYPE_OPT_KEY = TABLE_TYPE.key() + /** @deprecated Use {@link TABLE_TYPE} and its methods instead */ + @Deprecated + val DEFAULT_TABLE_TYPE_OPT_VAL = TABLE_TYPE.defaultValue() + + /** @deprecated Use {@link TABLE_TYPE} and its methods instead */ + @Deprecated + val STORAGE_TYPE_OPT_KEY = "hoodie.datasource.write.storage.type" + @Deprecated + val COW_STORAGE_TYPE_OPT_VAL = HoodieTableType.COPY_ON_WRITE.name + @Deprecated + val MOR_STORAGE_TYPE_OPT_VAL = HoodieTableType.MERGE_ON_READ.name + /** @deprecated Use {@link TABLE_TYPE} and its methods instead */ + @Deprecated + val DEFAULT_STORAGE_TYPE_OPT_VAL = COW_STORAGE_TYPE_OPT_VAL + + /** @deprecated Use {@link OPERATION} and its methods instead */ + @Deprecated + val OPERATION_OPT_KEY = OPERATION.key() + /** @deprecated Use {@link OPERATION} and its methods instead */ + @Deprecated + val DEFAULT_OPERATION_OPT_VAL = OPERATION.defaultValue() + + /** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = HiveSyncConfigHolder.HIVE_SYNC_ENABLED.defaultValue() + /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */ + @Deprecated + val DEFAULT_META_SYNC_ENABLED_OPT_VAL = HoodieSyncConfig.META_SYNC_ENABLED.defaultValue() + /** @deprecated Use {@link 
HIVE_DATABASE} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_DATABASE_OPT_VAL = HoodieSyncConfig.META_SYNC_DATABASE_NAME.defaultValue() + /** @deprecated Use {@link HIVE_TABLE} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_TABLE_OPT_VAL = HoodieSyncConfig.META_SYNC_TABLE_NAME.defaultValue() + /** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue() + /** @deprecated Use {@link HIVE_USER} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_USER_OPT_VAL = HiveSyncConfigHolder.HIVE_USER.defaultValue() + /** @deprecated Use {@link HIVE_PASS} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_PASS_OPT_VAL = HiveSyncConfigHolder.HIVE_PASS.defaultValue() + /** @deprecated Use {@link HIVE_URL} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_URL_OPT_VAL = HiveSyncConfigHolder.HIVE_URL.defaultValue() + /** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.defaultValue() + /** @deprecated Use {@link HIVE_PARTITION_EXTRACTOR_CLASS} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.defaultValue() + /** @deprecated Use {@link HIVE_ASSUME_DATE_PARTITION} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.defaultValue() + @Deprecated + val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false" + /** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_USE_JDBC_OPT_VAL = HiveSyncConfigHolder.HIVE_USE_JDBC.defaultValue() + /** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE.defaultValue() + /** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS.defaultValue() + /** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */ + @Deprecated + val HIVE_SKIP_RO_SUFFIX = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key() + /** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue() + /** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */ + @Deprecated + val HIVE_SUPPORT_TIMESTAMP = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key() + /** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */ + @Deprecated + val DEFAULT_HIVE_SUPPORT_TIMESTAMP = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.defaultValue() + /** @deprecated Use {@link ASYNC_COMPACT_ENABLE} and its methods instead */ + @Deprecated + val ASYNC_COMPACT_ENABLE_OPT_KEY = ASYNC_COMPACT_ENABLE.key() + /** @deprecated Use {@link ASYNC_COMPACT_ENABLE} and its methods instead */ + @Deprecated + val DEFAULT_ASYNC_COMPACT_ENABLE_OPT_VAL = ASYNC_COMPACT_ENABLE.defaultValue() + /** @deprecated Use {@link KAFKA_AVRO_VALUE_DESERIALIZER_CLASS} and its methods instead */ + 
@Deprecated + val KAFKA_AVRO_VALUE_DESERIALIZER = KAFKA_AVRO_VALUE_DESERIALIZER_CLASS.key() + @Deprecated + val SCHEMA_PROVIDER_CLASS_PROP = "hoodie.deltastreamer.schemaprovider.class" +} + +object DataSourceOptionsHelper { + + private val log = LogManager.getLogger(DataSourceOptionsHelper.getClass) + + // put all the configs with alternatives here + val allConfigsWithAlternatives = List( + DataSourceReadOptions.QUERY_TYPE, + DataSourceWriteOptions.TABLE_TYPE, + HoodieTableConfig.BASE_FILE_FORMAT, + HoodieTableConfig.LOG_FILE_FORMAT + ) + + // put all the deprecated configs here + val allDeprecatedConfigs: Set[String] = Set( + ConsistencyGuardConfig.ENABLE.key + ) + + // maps the deprecated config name to its latest name + val allAlternatives: Map[String, String] = { + val alterMap = scala.collection.mutable.Map[String, String]() + allConfigsWithAlternatives.foreach(cfg => cfg.getAlternatives.asScala.foreach(alternative => alterMap(alternative) = cfg.key)) + alterMap.toMap + } + + val viewTypeValueMap: Map[String, String] = Map( + DataSourceReadOptions.VIEW_TYPE_READ_OPTIMIZED_OPT_VAL -> DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, + DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL -> DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL, + DataSourceReadOptions.VIEW_TYPE_REALTIME_OPT_VAL -> DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) + + def translateConfigurations(optParams: Map[String, String]): Map[String, String] = { + val translatedOpt = scala.collection.mutable.Map[String, String]() ++= optParams + optParams.keySet.foreach(opt => { + if (allAlternatives.contains(opt) && !optParams.contains(allAlternatives(opt))) { + log.warn(opt + " is deprecated and will be removed in a later release; Please use " + allAlternatives(opt)) + if (opt == DataSourceReadOptions.VIEW_TYPE_OPT_KEY) { + // special handle for VIEW_TYPE, also need to translate its values + translatedOpt ++= Map(allAlternatives(opt) -> viewTypeValueMap(optParams(opt))) + } else { + translatedOpt ++= Map(allAlternatives(opt) -> optParams(opt)) + } + } + if (allDeprecatedConfigs.contains(opt)) { + log.warn(opt + " is deprecated and should never be used anymore") + } + }) + translatedOpt.toMap + } + + def parametersWithReadDefaults(parameters: Map[String, String]): Map[String, String] = { + // First check if the ConfigUtils.IS_QUERY_AS_RO_TABLE has set by HiveSyncTool, + // or else use query type from QUERY_TYPE. 
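The alternative-key translation above can be exercised directly; this hedged sketch, using hypothetical legacy options, shows deprecated keys being rewritten to their current names and the legacy view-type value being mapped onto a query type.

import org.apache.hudi.{DataSourceOptionsHelper, DataSourceReadOptions, DataSourceWriteOptions}

object TranslateConfigurationsSketch {
  def main(args: Array[String]): Unit = {
    val legacyOptions = Map(
      // Legacy alias of "hoodie.datasource.write.table.type".
      "hoodie.datasource.write.storage.type" -> DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL,
      // Legacy alias of the query-type option; its value is translated as well.
      DataSourceReadOptions.VIEW_TYPE_OPT_KEY -> DataSourceReadOptions.VIEW_TYPE_REALTIME_OPT_VAL
    )

    val translated = DataSourceOptionsHelper.translateConfigurations(legacyOptions)
    // Expected to now also contain the current keys, e.g.
    // "hoodie.datasource.write.table.type" -> "MERGE_ON_READ" and
    // "hoodie.datasource.query.type" -> "snapshot".
    translated.foreach { case (k, v) => println(s"$k -> $v") }
  }
}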
+ val paramsWithGlobalProps = DFSPropertiesConfiguration.getGlobalProps.asScala.toMap ++ parameters + val queryType = paramsWithGlobalProps.get(ConfigUtils.IS_QUERY_AS_RO_TABLE) + .map(is => if (is.toBoolean) QUERY_TYPE_READ_OPTIMIZED_OPT_VAL else QUERY_TYPE_SNAPSHOT_OPT_VAL) + .getOrElse(paramsWithGlobalProps.getOrElse(QUERY_TYPE.key, QUERY_TYPE.defaultValue())) + + Map( + QUERY_TYPE.key -> queryType + ) ++ translateConfigurations(paramsWithGlobalProps) + } + + def inferKeyGenClazz(props: TypedProperties): String = { + val partitionFields = props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), null) + val recordsKeyFields = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD.key(), DataSourceWriteOptions.RECORDKEY_FIELD.defaultValue()) + inferKeyGenClazz(recordsKeyFields, partitionFields) + } + + def inferKeyGenClazz(recordsKeyFields: String, partitionFields: String): String = { + if (!StringUtils.isNullOrEmpty(partitionFields)) { + val numPartFields = partitionFields.split(",").length + val numRecordKeyFields = recordsKeyFields.split(",").length + if (numPartFields == 1 && numRecordKeyFields == 1) { + classOf[SimpleKeyGenerator].getName + } else { + classOf[ComplexKeyGenerator].getName + } + } else { + classOf[NonpartitionedKeyGenerator].getName + } + } + + implicit def convert[T, U](prop: ConfigProperty[T])(implicit converter: T => U): ConfigProperty[U] = { + checkState(prop.hasDefaultValue) + var newProp: ConfigProperty[U] = ConfigProperty.key(prop.key()) + .defaultValue(converter(prop.defaultValue())) + .withDocumentation(prop.doc()) + .withAlternatives(prop.getAlternatives.asScala: _*) + + newProp = toScalaOption(prop.getSinceVersion) match { + case Some(version) => newProp.sinceVersion(version) + case None => newProp + } + newProp = toScalaOption(prop.getDeprecatedVersion) match { + case Some(version) => newProp.deprecatedAfter(version) + case None => newProp + } + + newProp + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala new file mode 100644 index 0000000000000..86931ceda5fd1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions._ +import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION} +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_READ} +import org.apache.hudi.common.table.timeline.HoodieInstant +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.util.PathUtils +import org.apache.log4j.LogManager +import org.apache.spark.sql.execution.streaming.{Sink, Source} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isUsingHiveCatalog +import org.apache.spark.sql.hudi.streaming.HoodieStreamSource +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession} + +import scala.collection.JavaConverters._ + +/** + * Hoodie Spark Datasource, for reading and writing hoodie tables + * + */ +class DefaultSource extends RelationProvider + with SchemaRelationProvider + with CreatableRelationProvider + with DataSourceRegister + with StreamSinkProvider + with StreamSourceProvider + with SparkAdapterSupport + with Serializable { + + SparkSession.getActiveSession.foreach { spark => + val sparkVersion = spark.version + if (sparkVersion.startsWith("0.") || sparkVersion.startsWith("1.") || sparkVersion.startsWith("2.")) { + // Enable "passPartitionByAsOptions" to support "write.partitionBy(...)" + spark.conf.set("spark.sql.legacy.sources.write.passPartitionByAsOptions", "true") + } + // Revisit EMRFS incompatibilities, for now disable + spark.sparkContext.hadoopConfiguration.set("fs.s3.metadata.cache.expiration.seconds", "0") + } + + private val log = LogManager.getLogger(classOf[DefaultSource]) + + override def createRelation(sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + createRelation(sqlContext, parameters, null) + } + + override def createRelation(sqlContext: SQLContext, + optParams: Map[String, String], + schema: StructType): BaseRelation = { + val path = optParams.get("path") + val readPathsStr = optParams.get(DataSourceReadOptions.READ_PATHS.key) + + if (path.isEmpty && readPathsStr.isEmpty) { + throw new HoodieException(s"'path' or '$READ_PATHS' or both must be specified.") + } + + val readPaths = readPathsStr.map(p => p.split(",").toSeq).getOrElse(Seq()) + val allPaths = path.map(p => Seq(p)).getOrElse(Seq()) ++ readPaths + + val fs = FSUtils.getFs(allPaths.head, sqlContext.sparkContext.hadoopConfiguration) + + val globPaths = if (path.exists(_.contains("*")) || readPaths.nonEmpty) { + PathUtils.checkAndGlobPathIfNecessary(allPaths, fs) + } else { + Seq.empty + } + + // Add default options for unspecified read options keys. 
+ val parameters = (if (globPaths.nonEmpty) { + Map( + "glob.paths" -> globPaths.mkString(",") + ) + } else { + Map() + }) ++ DataSourceOptionsHelper.parametersWithReadDefaults(optParams) + + // Get the table base path + val tablePath = if (globPaths.nonEmpty) { + DataSourceUtils.getTablePath(fs, globPaths.toArray) + } else { + DataSourceUtils.getTablePath(fs, Array(new Path(path.get))) + } + log.info("Obtained hudi table path: " + tablePath) + + val metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf).setBasePath(tablePath).build() + + DefaultSource.createRelation(sqlContext, metaClient, schema, globPaths, parameters) + } + + def getValidCommits(metaClient: HoodieTableMetaClient): String = { + metaClient + .getCommitsAndCompactionTimeline.filterCompletedInstants.getInstants.toArray().map(_.asInstanceOf[HoodieInstant].getFileName).mkString(",") + } + + /** + * This DataSource API is used for writing the DataFrame at the destination. For now, we are returning a dummy + * relation here because Spark does not really make use of the relation returned, and just returns an empty + * dataset at [[org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run()]]. This saves us the cost + * of creating and returning a parquet relation here. + * + * TODO: Revisit to return a concrete relation here when we support CREATE TABLE AS for Hudi with DataSource API. + * That is the only case where Spark seems to actually need a relation to be returned here + * [[org.apache.spark.sql.execution.datasources.DataSource.writeAndRead()]] + * + * @param sqlContext Spark SQL Context + * @param mode Mode for saving the DataFrame at the destination + * @param optParams Parameters passed as part of the DataFrame write operation + * @param df Spark DataFrame to be written + * @return Spark Relation + */ + override def createRelation(sqlContext: SQLContext, + mode: SaveMode, + optParams: Map[String, String], + df: DataFrame): BaseRelation = { + val dfWithoutMetaCols = df.drop(HoodieRecord.HOODIE_META_COLUMNS.asScala:_*) + + if (optParams.get(OPERATION.key).contains(BOOTSTRAP_OPERATION_OPT_VAL)) { + HoodieSparkSqlWriter.bootstrap(sqlContext, mode, optParams, dfWithoutMetaCols) + HoodieSparkSqlWriter.cleanup() + } else { + val (success, _, _, _, _, _) = HoodieSparkSqlWriter.write(sqlContext, mode, optParams, dfWithoutMetaCols) + HoodieSparkSqlWriter.cleanup() + if (!success) { + throw new HoodieException("Write to Hudi failed") + } + } + + new HoodieEmptyRelation(sqlContext, dfWithoutMetaCols.schema) + } + + override def createSink(sqlContext: SQLContext, + optParams: Map[String, String], + partitionColumns: Seq[String], + outputMode: OutputMode): Sink = { + new HoodieStreamingSink( + sqlContext, + optParams, + partitionColumns, + outputMode) + } + + override def shortName(): String = "hudi_v1" + + override def sourceSchema(sqlContext: SQLContext, + schema: Option[StructType], + providerName: String, + parameters: Map[String, String]): (String, StructType) = { + val path = parameters.get("path") + if (path.isEmpty || path.get == null) { + throw new HoodieException(s"'path' must be specified.") + } + val metaClient = HoodieTableMetaClient.builder().setConf( + sqlContext.sparkSession.sessionState.newHadoopConf()).setBasePath(path.get).build() + val schemaResolver = new TableSchemaResolver(metaClient) + val sqlSchema = + try { + val avroSchema = schemaResolver.getTableAvroSchema + AvroConversionUtils.convertAvroSchemaToStructType(avroSchema) + } catch { + case _: Exception => + 
require(schema.isDefined, "Fail to resolve source schema") + schema.get + } + (shortName(), sqlSchema) + } + + override def createSource(sqlContext: SQLContext, + metadataPath: String, + schema: Option[StructType], + providerName: String, + parameters: Map[String, String]): Source = { + new HoodieStreamSource(sqlContext, metadataPath, schema, parameters) + } +} + +object DefaultSource { + + private val log = LogManager.getLogger(classOf[DefaultSource]) + + def createRelation(sqlContext: SQLContext, + metaClient: HoodieTableMetaClient, + schema: StructType, + globPaths: Seq[Path], + parameters: Map[String, String]): BaseRelation = { + val tableType = metaClient.getTableType + val isBootstrappedTable = metaClient.getTableConfig.getBootstrapBasePath.isPresent + val queryType = parameters(QUERY_TYPE.key) + + log.info(s"Is bootstrapped table => $isBootstrappedTable, tableType is: $tableType, queryType is: $queryType") + + // NOTE: In cases when Hive Metastore is used as catalog and the table is partitioned, schema in the HMS might contain + // Hive-specific partitioning columns created specifically for HMS to handle partitioning appropriately. In that + // case we opt in to not be providing catalog's schema, and instead force Hudi relations to fetch the schema + // from the table itself + val userSchema = if (isUsingHiveCatalog(sqlContext.sparkSession)) { + None + } else { + Option(schema) + } + + if (metaClient.getCommitsTimeline.filterCompletedInstants.countInstants() == 0) { + new EmptyRelation(sqlContext, metaClient) + } else { + (tableType, queryType, isBootstrappedTable) match { + case (COPY_ON_WRITE, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) | + (COPY_ON_WRITE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) | + (MERGE_ON_READ, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) => + resolveBaseFileOnlyRelation(sqlContext, globPaths, userSchema, metaClient, parameters) + + case (COPY_ON_WRITE, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) => + new IncrementalRelation(sqlContext, parameters, userSchema, metaClient) + + case (MERGE_ON_READ, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) => + new MergeOnReadSnapshotRelation(sqlContext, parameters, userSchema, globPaths, metaClient) + + case (MERGE_ON_READ, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) => + new MergeOnReadIncrementalRelation(sqlContext, parameters, userSchema, metaClient) + + case (_, _, true) => + new HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters) + + case (_, _, _) => + throw new HoodieException(s"Invalid query type : $queryType for tableType: $tableType," + + s"isBootstrappedTable: $isBootstrappedTable ") + } + } + } + + private def resolveBaseFileOnlyRelation(sqlContext: SQLContext, + globPaths: Seq[Path], + userSchema: Option[StructType], + metaClient: HoodieTableMetaClient, + optParams: Map[String, String]): BaseRelation = { + val baseRelation = new BaseFileOnlyRelation(sqlContext, metaClient, optParams, userSchema, globPaths) + + // NOTE: We fallback to [[HadoopFsRelation]] in all of the cases except ones requiring usage of + // [[BaseFileOnlyRelation]] to function correctly. This is necessary to maintain performance parity w/ + // vanilla Spark, since some of the Spark optimizations are predicated on the using of [[HadoopFsRelation]]. 
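Tied to the relation dispatch above, this hedged sketch reads one hypothetical merge-on-read table twice: a read_optimized query (base files only, served by the base-file-only relation) and a snapshot query (base plus log files, served by MergeOnReadSnapshotRelation).

import org.apache.spark.sql.SparkSession

object HudiQueryTypeDispatchSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("hudi-query-type-dispatch-sketch")
      .master("local[2]")
      .getOrCreate()
    val basePath = "/tmp/hudi_trips_mor" // hypothetical MOR table

    // Read-optimized: base files only, resolved to the base-file-only relation.
    val readOptimized = spark.read.format("hudi")
      .option("hoodie.datasource.query.type", "read_optimized")
      .load(basePath)

    // Snapshot: base plus log files, resolved to MergeOnReadSnapshotRelation.
    val snapshot = spark.read.format("hudi")
      .option("hoodie.datasource.query.type", "snapshot")
      .load(basePath)

    println(s"read_optimized=${readOptimized.count()}, snapshot=${snapshot.count()}")
    spark.stop()
  }
}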
+ // + // You can check out HUDI-3896 for more details + if (baseRelation.hasSchemaOnRead) { + baseRelation + } else { + baseRelation.toHadoopFsRelation + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/EmptyRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/EmptyRelation.scala new file mode 100644 index 0000000000000..3645eb8d9b9e8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/EmptyRelation.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi + +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.sources.{BaseRelation, TableScan} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Row, SQLContext} + +import scala.util.control.NonFatal + +/** + * BaseRelation representing empty RDD. + * @param sqlContext instance of SqlContext. + */ +class EmptyRelation(val sqlContext: SQLContext, metaClient: HoodieTableMetaClient) extends BaseRelation with TableScan { + + override def schema: StructType = { + // do the best to find the table schema. + val schemaResolver = new TableSchemaResolver(metaClient) + try { + val avroSchema = schemaResolver.getTableAvroSchema + AvroConversionUtils.convertAvroSchemaToStructType(avroSchema) + } catch { + case NonFatal(e) => + StructType(Nil) + } + } + + override def buildScan(): RDD[Row] = { + sqlContext.sparkContext.emptyRDD[Row] + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala new file mode 100644 index 0000000000000..242d7eb86796f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala @@ -0,0 +1,748 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.hbase.io.hfile.CacheConfig +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation._ +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.client.utils.SparkInternalSchemaConverter +import org.apache.hudi.common.config.{HoodieMetadataConfig, SerializableConfiguration} +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath +import org.apache.hudi.common.model.{FileSlice, HoodieFileFormat, HoodieRecord} +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.common.util.StringUtils.isNullOrEmpty +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter +import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper} +import org.apache.hudi.internal.schema.{HoodieSchemaException, InternalSchema} +import org.apache.hudi.io.storage.HoodieHFileReader +import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.HoodieCatalystExpressionUtils.convertToCatalystExpression +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression, UnsafeProjection} +import org.apache.spark.sql.execution.FileRelation +import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat +import org.apache.spark.sql.execution.datasources.parquet.{HoodieParquetFileFormat, ParquetFileFormat} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils +import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.{HoodieCatalystExpressionUtils, Row, SQLContext, SparkSession} +import org.apache.spark.unsafe.types.UTF8String + +import java.net.URI +import java.util.Locale +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +trait HoodieFileSplit {} + +case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: Option[InternalSchema] = None) + +case class HoodieTableState(tablePath: String, + latestCommitTimestamp: String, + recordKeyField: String, + preCombineFieldOpt: Option[String], + usesVirtualKeys: Boolean, + recordPayloadClassName: String, + metadataConfig: HoodieMetadataConfig) + +/** + * Hoodie BaseRelation which extends [[PrunedFilteredScan]]. 
+ */ +abstract class HoodieBaseRelation(val sqlContext: SQLContext, + val metaClient: HoodieTableMetaClient, + val optParams: Map[String, String], + schemaSpec: Option[StructType]) + extends BaseRelation + with FileRelation + with PrunedFilteredScan + with Logging + with SparkAdapterSupport { + + type FileSplit <: HoodieFileSplit + + imbueConfigs(sqlContext) + + protected val sparkSession: SparkSession = sqlContext.sparkSession + + protected lazy val conf: Configuration = new Configuration(sqlContext.sparkContext.hadoopConfiguration) + protected lazy val jobConf = new JobConf(conf) + + protected lazy val tableConfig: HoodieTableConfig = metaClient.getTableConfig + + protected lazy val basePath: String = metaClient.getBasePath + + // NOTE: Record key-field is assumed singular here due to the either of + // - In case Hudi's meta fields are enabled: record key will be pre-materialized (stored) as part + // of the record's payload (as part of the Hudi's metadata) + // - In case Hudi's meta fields are disabled (virtual keys): in that case record has to bear _single field_ + // identified as its (unique) primary key w/in its payload (this is a limitation of [[SimpleKeyGenerator]], + // which is the only [[KeyGenerator]] permitted for virtual-keys payloads) + protected lazy val recordKeyField: String = + if (tableConfig.populateMetaFields()) { + HoodieRecord.RECORD_KEY_METADATA_FIELD + } else { + val keyFields = tableConfig.getRecordKeyFields.get() + checkState(keyFields.length == 1) + keyFields.head + } + + protected lazy val preCombineFieldOpt: Option[String] = + Option(tableConfig.getPreCombineField) + .orElse(optParams.get(DataSourceWriteOptions.PRECOMBINE_FIELD.key)) match { + // NOTE: This is required to compensate for cases when empty string is used to stub + // property value to avoid it being set with the default value + // TODO(HUDI-3456) cleanup + case Some(f) if !StringUtils.isNullOrEmpty(f) => Some(f) + case _ => None + } + + protected lazy val specifiedQueryTimestamp: Option[String] = + optParams.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key) + .map(HoodieSqlCommonUtils.formatQueryInstant) + + /** + * NOTE: Initialization of teh following members is coupled on purpose to minimize amount of I/O + * required to fetch table's Avro and Internal schemas + */ + protected lazy val (tableAvroSchema: Schema, internalSchemaOpt: Option[InternalSchema]) = { + val schemaResolver = new TableSchemaResolver(metaClient) + val internalSchemaOpt = if (!isSchemaEvolutionEnabledOnRead(optParams, sparkSession)) { + None + } else { + Try { + specifiedQueryTimestamp.map(schemaResolver.getTableInternalSchemaFromCommitMetadata) + .getOrElse(schemaResolver.getTableInternalSchemaFromCommitMetadata) + } match { + case Success(internalSchemaOpt) => toScalaOption(internalSchemaOpt) + case Failure(e) => + logWarning("Failed to fetch internal-schema from the table", e) + None + } + } + + val avroSchema = internalSchemaOpt.map { is => + AvroInternalSchemaConverter.convert(is, "schema") + } orElse { + specifiedQueryTimestamp.map(schemaResolver.getTableAvroSchema) + } orElse { + schemaSpec.map(convertToAvroSchema) + } getOrElse { + Try(schemaResolver.getTableAvroSchema) match { + case Success(schema) => schema + case Failure(e) => + logError("Failed to fetch schema from the table", e) + throw new HoodieSchemaException("Failed to fetch schema from the table") + } + } + + (avroSchema, internalSchemaOpt) + } + + protected lazy val tableStructSchema: StructType = 
+    AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
+
+  protected val partitionColumns: Array[String] = tableConfig.getPartitionFields.orElse(Array.empty)
+
+  /**
+   * Data schema optimized (externally) by Spark's Optimizer.
+   *
+   * Please check the scala-doc for [[updatePrunedDataSchema]] for more details
+   */
+  protected var optimizerPrunedDataSchema: Option[StructType] = None
+
+  /**
+   * Controls whether partition values (ie values of partition columns) should be
+   *
+   * <ol>
+   *   <li>Extracted from partition path and appended to individual rows read from the data file (we
+   *   delegate this to Spark's [[ParquetFileFormat]])</li>
+   *   <li>Read from the data-file as is (by default Hudi persists all columns including partition ones)</li>
+   * </ol>
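 *
 *   A usage sketch of case (1) above (not taken from this patch; assuming `spark` and `basePath` are in
 *   scope, and that the table was written with partition columns dropped from the data files):
 *   {{{
 *     import org.apache.hudi.DataSourceReadOptions
 *
 *     // Ask the relation to re-materialize partition values from the partition path
 *     val df = spark.read.format("hudi")
 *       .option(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, "true")
 *       .load(basePath)
 *   }}}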
    + * + * This flag is only be relevant in conjunction with the usage of [["hoodie.datasource.write.drop.partition.columns"]] + * config, when Hudi will NOT be persisting partition columns in the data file, and therefore values for + * such partition columns (ie "partition values") will have to be parsed from the partition path, and appended + * to every row only in the fetched dataset. + * + * NOTE: Partition values extracted from partition path might be deviating from the values of the original + * partition columns: for ex, if originally as partition column was used column [[ts]] bearing epoch + * timestamp, which was used by [[TimestampBasedKeyGenerator]] to generate partition path of the format + * [["yyyy/mm/dd"]], appended partition value would bear the format verbatim as it was used in the + * partition path, meaning that string value of "2022/01/01" will be appended, and not its original + * representation + */ + protected val shouldExtractPartitionValuesFromPartitionPath: Boolean = { + // Controls whether partition columns (which are the source for the partition path values) should + // be omitted from persistence in the data files. On the read path it affects whether partition values (values + // of partition columns) will be read from the data file or extracted from partition path + val shouldOmitPartitionColumns = metaClient.getTableConfig.shouldDropPartitionColumns && partitionColumns.nonEmpty + val shouldExtractPartitionValueFromPath = + optParams.getOrElse(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, + DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.defaultValue.toString).toBoolean + shouldOmitPartitionColumns || shouldExtractPartitionValueFromPath + } + + /** + * NOTE: This fields are accessed by [[NestedSchemaPruning]] component which is only enabled for + * Spark >= 3.1 + */ + lazy val (fileFormat: FileFormat, fileFormatClassName: String) = + metaClient.getTableConfig.getBaseFileFormat match { + case HoodieFileFormat.ORC => (new OrcFileFormat, "orc") + case HoodieFileFormat.PARQUET => + // We're delegating to Spark to append partition values to every row only in cases + // when these corresponding partition-values are not persisted w/in the data file itself + val parquetFileFormat = sparkAdapter.createHoodieParquetFileFormat(shouldExtractPartitionValuesFromPartitionPath).get + (parquetFileFormat, HoodieParquetFileFormat.FILE_FORMAT_ID) + } + + /** + * NOTE: PLEASE READ THIS CAREFULLY + * + * Even though [[HoodieFileIndex]] initializes eagerly listing all of the files w/in the given Hudi table, + * this variable itself is _lazy_ (and have to stay that way) which guarantees that it's not initialized, until + * it's actually accessed + */ + protected lazy val fileIndex: HoodieFileIndex = + HoodieFileIndex(sparkSession, metaClient, Some(tableStructSchema), optParams, + FileStatusCache.getOrCreate(sparkSession)) + + /** + * Columns that relation has to read from the storage to properly execute on its semantic: for ex, + * for Merge-on-Read tables key fields as well and pre-combine field comprise mandatory set of columns, + * meaning that regardless of whether this columns are being requested by the query they will be fetched + * regardless so that relation is able to combine records properly (if necessary) + * + * @VisibleInTests + */ + val mandatoryFields: Seq[String] + + protected def timeline: HoodieTimeline = + // NOTE: We're including compaction here since it's not considering a "commit" operation + 
metaClient.getCommitsAndCompactionTimeline.filterCompletedInstants + + protected def latestInstant: Option[HoodieInstant] = + toScalaOption(timeline.lastInstant()) + + protected def queryTimestamp: Option[String] = + specifiedQueryTimestamp.orElse(latestInstant.map(_.getTimestamp)) + + /** + * Returns true in case table supports Schema on Read (Schema Evolution) + */ + def hasSchemaOnRead: Boolean = internalSchemaOpt.isDefined + + /** + * Data schema is determined as the actual schema of the Table's Data Files (for ex, parquet/orc/etc); + * + * In cases when partition values are not persisted w/in the data files, data-schema is defined as + *
+   * table's schema - partition columns
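 *
 * For illustration (a sketch mirroring [[prunePartitionColumns]]; column names are made up):
 * {{{
 *   import org.apache.spark.sql.types._
 *
 *   val tableStructSchema = StructType(Seq(
 *     StructField("uuid", StringType), StructField("ts", LongType), StructField("dt", StringType)))
 *   val partitionColumns = Array("dt")
 *   // data schema == table schema minus the (non-persisted) partition columns
 *   val dataSchema = StructType(tableStructSchema.filterNot(f => partitionColumns.contains(f.name)))
 *   // => StructType(StructField(uuid,StringType,true), StructField(ts,LongType,true))
 * }}}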
    + * + * Check scala-doc for [[shouldExtractPartitionValuesFromPartitionPath]] for more details + */ + def dataSchema: StructType = if (shouldExtractPartitionValuesFromPartitionPath) { + prunePartitionColumns(tableStructSchema) + } else { + tableStructSchema + } + + /** + * Determines whether relation's schema could be pruned by Spark's Optimizer + */ + def canPruneRelationSchema: Boolean = + (fileFormat.isInstanceOf[ParquetFileFormat] || fileFormat.isInstanceOf[OrcFileFormat]) && + // NOTE: Some relations might be disabling sophisticated schema pruning techniques (for ex, nested schema pruning) + // TODO(HUDI-XXX) internal schema doesn't support nested schema pruning currently + !hasSchemaOnRead + + override def schema: StructType = { + // NOTE: Optimizer could prune the schema (applying for ex, [[NestedSchemaPruning]] rule) setting new updated + // schema in-place (via [[setPrunedDataSchema]] method), therefore we have to make sure that we pick + // pruned data schema (if present) over the standard table's one + optimizerPrunedDataSchema.getOrElse(tableStructSchema) + } + + /** + * This method controls whether relation will be producing + *
+   * <ul>
+   *   <li>[[Row]], when this returns true</li>
+   *   <li>[[InternalRow]], when this returns false</li>
+   * </ul>
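 *
 *   A sketch of the resulting hand-off (mirroring what [[buildScan]] below actually does):
 *   {{{
 *     val rdd: RDD[InternalRow] = composeRDD(fileSplits, tableSchema, requiredSchema, targetColumns, filters)
 *     // With needConversion = false Spark consumes the internal rows directly;
 *     // the cast below merely relies on type erasure
 *     rdd.asInstanceOf[RDD[Row]]
 *   }}}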
    + * + * Returning [[InternalRow]] directly enables us to save on needless ser/de loop from [[InternalRow]] (being + * produced by file-reader) to [[Row]] and back + */ + override final def needConversion: Boolean = false + + override def inputFiles: Array[String] = fileIndex.allFiles.map(_.getPath.toUri.toString).toArray + + /** + * NOTE: DO NOT OVERRIDE THIS METHOD + */ + override final def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { + // NOTE: PLEAS READ CAREFULLY BEFORE MAKING CHANGES + // + // In case list of requested columns doesn't contain the Primary Key one, we + // have to add it explicitly so that + // - Merging could be performed correctly + // - In case 0 columns are to be fetched (for ex, when doing {@code count()} on Spark's [[Dataset]], + // Spark still fetches all the rows to execute the query correctly + // + // *Appending* additional columns to the ones requested by the caller is not a problem, as those + // will be "projected out" by the caller's projection; + // + // (!!!) IT'S CRITICAL TO AVOID REORDERING OF THE REQUESTED COLUMNS AS THIS WILL BREAK THE UPSTREAM + // PROJECTION + val targetColumns: Array[String] = appendMandatoryColumns(requiredColumns) + // NOTE: We explicitly fallback to default table's Avro schema to make sure we avoid unnecessary Catalyst > Avro + // schema conversion, which is lossy in nature (for ex, it doesn't preserve original Avro type-names) and + // could have an effect on subsequent de-/serializing records in some exotic scenarios (when Avro unions + // w/ more than 2 types are involved) + val sourceSchema = optimizerPrunedDataSchema.map(convertToAvroSchema).getOrElse(tableAvroSchema) + val (requiredAvroSchema, requiredStructSchema, requiredInternalSchema) = + projectSchema(Either.cond(internalSchemaOpt.isDefined, internalSchemaOpt.get, sourceSchema), targetColumns) + + val filterExpressions = convertToExpressions(filters) + val (partitionFilters, dataFilters) = filterExpressions.partition(isPartitionPredicate) + + val fileSplits = collectFileSplits(partitionFilters, dataFilters) + + val tableAvroSchemaStr = tableAvroSchema.toString + + val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchemaStr, internalSchemaOpt) + val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString, Some(requiredInternalSchema)) + + if (fileSplits.isEmpty) { + sparkSession.sparkContext.emptyRDD + } else { + val rdd = composeRDD(fileSplits, tableSchema, requiredSchema, targetColumns, filters) + + // Here we rely on a type erasure, to workaround inherited API restriction and pass [[RDD[InternalRow]]] back as [[RDD[Row]]] + // Please check [[needConversion]] scala-doc for more details + rdd.asInstanceOf[RDD[Row]] + } + } + + /** + * Composes RDD provided file splits to read from, table and partition schemas, data filters to be applied + * + * @param fileSplits file splits to be handled by the RDD + * @param tableSchema target table's schema + * @param requiredSchema projected schema required by the reader + * @param requestedColumns columns requested by the query + * @param filters data filters to be applied + * @return instance of RDD (holding [[InternalRow]]s) + */ + protected def composeRDD(fileSplits: Seq[FileSplit], + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + requestedColumns: Array[String], + filters: Array[Filter]): RDD[InternalRow] + + /** + * Provided with partition and date filters collects target file splits to read records from, while + * 
performing pruning if necessary + * + * @param partitionFilters partition filters to be applied + * @param dataFilters data filters to be applied + * @return list of [[FileSplit]] to fetch records from + */ + protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[FileSplit] + + /** + * Get all PartitionDirectories based on globPaths if specified, otherwise use the table path. + * Will perform pruning if necessary + */ + private def listPartitionDirectories(globPaths: Seq[Path], partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + if (globPaths.isEmpty) { + fileIndex.listFiles(partitionFilters, dataFilters) + } else { + val inMemoryFileIndex = HoodieInMemoryFileIndex.create(sparkSession, globPaths) + inMemoryFileIndex.listFiles(partitionFilters, dataFilters) + } + } + + /** + * Get all latest base files with partition paths, if globPaths is empty, will listing files + * under the table path. + */ + protected def listLatestBaseFiles(globPaths: Seq[Path], partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Map[Path, Seq[FileStatus]] = { + val partitionDirs = listPartitionDirectories(globPaths, partitionFilters, dataFilters) + val fsView = new HoodieTableFileSystemView(metaClient, timeline, partitionDirs.flatMap(_.files).toArray) + + val latestBaseFiles = fsView.getLatestBaseFiles.iterator().asScala.toList.map(_.getFileStatus) + + latestBaseFiles.groupBy(getPartitionPath) + } + + /** + * Get all fileSlices(contains base files and log files if exist) from globPaths if not empty, + * otherwise will use the table path to do the listing. + */ + protected def listLatestFileSlices(globPaths: Seq[Path], partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[FileSlice] = { + latestInstant.map { _ => + val partitionDirs = listPartitionDirectories(globPaths, partitionFilters, dataFilters) + val fsView = new HoodieTableFileSystemView(metaClient, timeline, partitionDirs.flatMap(_.files).toArray) + + val queryTimestamp = this.queryTimestamp.get + fsView.getPartitionPaths.asScala.flatMap { partitionPath => + val relativePath = getRelativePartitionPath(new Path(basePath), partitionPath) + fsView.getLatestMergedFileSlicesBeforeOrOn(relativePath, queryTimestamp).iterator().asScala.toSeq + } + }.getOrElse(Seq()) + } + + protected def convertToExpressions(filters: Array[Filter]): Array[Expression] = { + val catalystExpressions = filters.map(expr => convertToCatalystExpression(expr, tableStructSchema)) + + val failedExprs = catalystExpressions.zipWithIndex.filter { case (opt, _) => opt.isEmpty } + if (failedExprs.nonEmpty) { + val failedFilters = failedExprs.map(p => filters(p._2)) + logWarning(s"Failed to convert Filters into Catalyst expressions (${failedFilters.map(_.toString)})") + } + + catalystExpressions.filter(_.isDefined).map(_.get).toArray + } + + /** + * Checks whether given expression only references partition columns + * (and involves no sub-query) + */ + protected def isPartitionPredicate(condition: Expression): Boolean = { + // Validates that the provided names both resolve to the same entity + val resolvedNameEquals = sparkSession.sessionState.analyzer.resolver + + condition.references.forall { r => partitionColumns.exists(resolvedNameEquals(r.name, _)) } && + !SubqueryExpression.hasSubquery(condition) + } + + protected final def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = { + // For a nested field in mandatory columns, we should first get the 
root-level field, and then + // check for any missing column, as the requestedColumns should only contain root-level fields + // We should only append root-level field as well + val missing = mandatoryFields.map(col => HoodieAvroUtils.getRootLevelFieldName(col)) + .filter(rootField => !requestedColumns.contains(rootField)) + requestedColumns ++ missing + } + + protected def getTableState: HoodieTableState = { + // Subset of the state of table's configuration as of at the time of the query + HoodieTableState( + tablePath = basePath, + latestCommitTimestamp = queryTimestamp.get, + recordKeyField = recordKeyField, + preCombineFieldOpt = preCombineFieldOpt, + usesVirtualKeys = !tableConfig.populateMetaFields(), + recordPayloadClassName = tableConfig.getPayloadClass, + metadataConfig = fileIndex.metadataConfig + ) + } + + def imbueConfigs(sqlContext: SQLContext): Unit = { + sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.filterPushdown", "true") + sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.recordLevelFilter.enabled", "true") + // TODO(HUDI-3639) vectorized reader has to be disabled to make sure MORIncrementalRelation is working properly + sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false") + } + + /** + * For enable hoodie.datasource.write.drop.partition.columns, need to create an InternalRow on partition values + * and pass this reader on parquet file. So that, we can query the partition columns. + */ + protected def getPartitionColumnsAsInternalRow(file: FileStatus): InternalRow = { + try { + val tableConfig = metaClient.getTableConfig + if (shouldExtractPartitionValuesFromPartitionPath) { + val relativePath = new URI(metaClient.getBasePath).relativize(new URI(file.getPath.getParent.toString)).toString + val hiveStylePartitioningEnabled = tableConfig.getHiveStylePartitioningEnable.toBoolean + if (hiveStylePartitioningEnabled) { + val partitionSpec = PartitioningUtils.parsePathFragment(relativePath) + InternalRow.fromSeq(partitionColumns.map(partitionSpec(_)).map(UTF8String.fromString)) + } else { + if (partitionColumns.length == 1) { + InternalRow.fromSeq(Seq(UTF8String.fromString(relativePath))) + } else { + val parts = relativePath.split("/") + assert(parts.size == partitionColumns.length) + InternalRow.fromSeq(parts.map(UTF8String.fromString)) + } + } + } else { + InternalRow.empty + } + } catch { + case NonFatal(e) => + logWarning(s"Failed to get the right partition InternalRow for file: ${file.toString}", e) + InternalRow.empty + } + } + + protected def getColName(f: StructField): String = { + if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { + f.name + } else { + f.name.toLowerCase(Locale.ROOT) + } + } + + /** + * Hook for Spark's Optimizer to update expected relation schema after pruning + * + * NOTE: Only limited number of optimizations in respect to schema pruning could be performed + * internally w/in the relation itself w/o consideration for how the relation output is used. 
+ * Therefore more advanced optimizations (like [[NestedSchemaPruning]]) have to be carried out + * by Spark's Optimizer holistically evaluating Spark's [[LogicalPlan]] + */ + def updatePrunedDataSchema(prunedSchema: StructType): this.type = { + optimizerPrunedDataSchema = Some(prunedSchema) + this + } + + /** + * Returns file-reader routine accepting [[PartitionedFile]] and returning an [[Iterator]] + * over [[InternalRow]] + */ + protected def createBaseFileReader(spark: SparkSession, + partitionSchema: StructType, + dataSchema: HoodieTableSchema, + requiredDataSchema: HoodieTableSchema, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): BaseFileReader = { + val tableBaseFileFormat = tableConfig.getBaseFileFormat + + // NOTE: PLEASE READ CAREFULLY + // Lambda returned from this method is going to be invoked on the executor, and therefore + // we have to eagerly initialize all of the readers even though only one specific to the type + // of the file being read will be used. This is required to avoid serialization of the whole + // relation (containing file-index for ex) and passing it to the executor + val (read: (PartitionedFile => Iterator[InternalRow]), schema: StructType) = + tableBaseFileFormat match { + case HoodieFileFormat.PARQUET => + val parquetReader = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = spark, + dataSchema = dataSchema.structTypeSchema, + partitionSchema = partitionSchema, + requiredSchema = requiredDataSchema.structTypeSchema, + filters = filters, + options = options, + hadoopConf = hadoopConf, + // We're delegating to Spark to append partition values to every row only in cases + // when these corresponding partition-values are not persisted w/in the data file itself + appendPartitionValues = shouldExtractPartitionValuesFromPartitionPath + ) + // Since partition values by default are omitted, and not persisted w/in data-files by Spark, + // data-file readers (such as [[ParquetFileFormat]]) have to inject partition values while reading + // the data. 
As such, actual full schema produced by such reader is composed of + // a) Data-file schema (projected or not) + // b) Appended partition column values + val readerSchema = StructType(requiredDataSchema.structTypeSchema.fields ++ partitionSchema.fields) + + (parquetReader, readerSchema) + + case HoodieFileFormat.HFILE => + val hfileReader = createHFileReader( + spark = spark, + dataSchema = dataSchema, + requiredDataSchema = requiredDataSchema, + filters = filters, + options = options, + hadoopConf = hadoopConf + ) + + (hfileReader, requiredDataSchema.structTypeSchema) + + case _ => throw new UnsupportedOperationException(s"Base file format is not currently supported ($tableBaseFileFormat)") + } + + BaseFileReader( + read = partitionedFile => { + val extension = FSUtils.getFileExtension(partitionedFile.filePath) + if (tableBaseFileFormat.getFileExtension.equals(extension)) { + read(partitionedFile) + } else { + throw new UnsupportedOperationException(s"Invalid base-file format ($extension), expected ($tableBaseFileFormat)") + } + }, + schema = schema + ) + } + + protected def embedInternalSchema(conf: Configuration, internalSchemaOpt: Option[InternalSchema]): Configuration = { + val internalSchema = internalSchemaOpt.getOrElse(InternalSchema.getEmptyInternalSchema) + val querySchemaString = SerDeHelper.toJson(internalSchema) + if (!isNullOrEmpty(querySchemaString)) { + val validCommits = timeline.getInstants.iterator.asScala.map(_.getFileName).mkString(",") + + conf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(internalSchema)) + conf.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH, metaClient.getBasePath) + conf.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST, validCommits) + } + conf + } + + protected def tryPrunePartitionColumns(tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema): (StructType, HoodieTableSchema, HoodieTableSchema) = { + // Since schema requested by the caller might contain partition columns, we might need to + // prune it, removing all partition columns from it in case these columns are not persisted + // in the data files + // + // NOTE: This partition schema is only relevant to file reader to be able to embed + // values of partition columns (hereafter referred to as partition values) encoded into + // the partition path, and omitted from the data file, back into fetched rows; + // Note that, by default, partition columns are not omitted therefore specifying + // partition schema for reader is not required + if (shouldExtractPartitionValuesFromPartitionPath) { + val partitionSchema = StructType(partitionColumns.map(StructField(_, StringType))) + val prunedDataStructSchema = prunePartitionColumns(tableSchema.structTypeSchema) + val prunedRequiredSchema = prunePartitionColumns(requiredSchema.structTypeSchema) + + (partitionSchema, + HoodieTableSchema(prunedDataStructSchema, convertToAvroSchema(prunedDataStructSchema).toString), + HoodieTableSchema(prunedRequiredSchema, convertToAvroSchema(prunedRequiredSchema).toString)) + } else { + (StructType(Nil), tableSchema, requiredSchema) + } + } + + private def prunePartitionColumns(dataStructSchema: StructType): StructType = + StructType(dataStructSchema.filterNot(f => partitionColumns.contains(f.name))) +} + +object HoodieBaseRelation extends SparkAdapterSupport { + + case class BaseFileReader(read: PartitionedFile => Iterator[InternalRow], val schema: StructType) { + def apply(file: PartitionedFile): Iterator[InternalRow] = read.apply(file) + } + + def 
generateUnsafeProjection(from: StructType, to: StructType): UnsafeProjection = + HoodieCatalystExpressionUtils.generateUnsafeProjection(from, to) + + def convertToAvroSchema(structSchema: StructType): Schema = + sparkAdapter.getAvroSchemaConverters.toAvroType(structSchema, nullable = false, "Record") + + def getPartitionPath(fileStatus: FileStatus): Path = + fileStatus.getPath.getParent + + /** + * Projects provided file reader's output from its original schema, into a [[requiredSchema]] + * + * NOTE: [[requiredSchema]] has to be a proper subset of the file reader's schema + * + * @param reader file reader to be projected + * @param requiredSchema target schema for the output of the provided file reader + */ + def projectReader(reader: BaseFileReader, requiredSchema: StructType): BaseFileReader = { + checkState(reader.schema.fields.toSet.intersect(requiredSchema.fields.toSet).size == requiredSchema.size) + + if (reader.schema == requiredSchema) { + reader + } else { + val read = reader.apply(_) + val projectedRead: PartitionedFile => Iterator[InternalRow] = (file: PartitionedFile) => { + // NOTE: Projection is not a serializable object, hence it creation should only happen w/in + // the executor process + val unsafeProjection = generateUnsafeProjection(reader.schema, requiredSchema) + read(file).map(unsafeProjection) + } + + BaseFileReader(projectedRead, requiredSchema) + } + } + + /** + * Projects provided schema by picking only required (projected) top-level columns from it + * + * @param tableSchema schema to project (either of [[InternalSchema]] or Avro's [[Schema]]) + * @param requiredColumns required top-level columns to be projected + */ + def projectSchema(tableSchema: Either[Schema, InternalSchema], requiredColumns: Array[String]): (Schema, StructType, InternalSchema) = { + tableSchema match { + case Right(internalSchema) => + checkState(!internalSchema.isEmptySchema) + val prunedInternalSchema = InternalSchemaUtils.pruneInternalSchema(internalSchema, requiredColumns.toList.asJava) + val requiredAvroSchema = AvroInternalSchemaConverter.convert(prunedInternalSchema, "schema") + val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema) + + (requiredAvroSchema, requiredStructSchema, prunedInternalSchema) + + case Left(avroSchema) => + val fieldMap = avroSchema.getFields.asScala.map(f => f.name() -> f).toMap + val requiredFields = requiredColumns.map { col => + val f = fieldMap(col) + // We have to create a new [[Schema.Field]] since Avro schemas can't share field + // instances (and will throw "org.apache.avro.AvroRuntimeException: Field already used") + new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal(), f.order()) + }.toList + val requiredAvroSchema = Schema.createRecord(avroSchema.getName, avroSchema.getDoc, + avroSchema.getNamespace, avroSchema.isError, requiredFields.asJava) + val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema) + + (requiredAvroSchema, requiredStructSchema, InternalSchema.getEmptyInternalSchema) + } + } + + private def createHFileReader(spark: SparkSession, + dataSchema: HoodieTableSchema, + requiredDataSchema: HoodieTableSchema, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + val hadoopConfBroadcast = + spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + + partitionedFile => { + val hadoopConf = hadoopConfBroadcast.value.get() + val reader = new 
HoodieHFileReader[GenericRecord](hadoopConf, new Path(partitionedFile.filePath), + new CacheConfig(hadoopConf)) + + val requiredRowSchema = requiredDataSchema.structTypeSchema + // NOTE: Schema has to be parsed at this point, since Avro's [[Schema]] aren't serializable + // to be passed from driver to executor + val requiredAvroSchema = new Schema.Parser().parse(requiredDataSchema.avroSchemaStr) + val avroToRowConverter = AvroConversionUtils.createAvroToInternalRowConverter(requiredAvroSchema, requiredRowSchema) + + reader.getRecordIterator(requiredAvroSchema).asScala + .map(record => { + avroToRowConverter.apply(record).get + }) + } + } + + def isSchemaEvolutionEnabledOnRead(optParams: Map[String, String], sparkSession: SparkSession): Boolean = { + // NOTE: Schema evolution could be configured both t/h optional parameters vehicle as well as + // t/h Spark Session configuration (for ex, for Spark SQL) + optParams.getOrElse(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key, + DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean || + sparkSession.conf.get(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key, + DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala similarity index 82% rename from hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala rename to hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala index a522db6afc6f1..ea997c86acb39 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala @@ -24,12 +24,13 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch + +import org.apache.hudi.HoodieDataSourceHelper._ class HoodieBootstrapRDD(@transient spark: SparkSession, - dataReadFunction: PartitionedFile => Iterator[Any], - skeletonReadFunction: PartitionedFile => Iterator[Any], - regularReadFunction: PartitionedFile => Iterator[Any], + dataReadFunction: PartitionedFile => Iterator[InternalRow], + skeletonReadFunction: PartitionedFile => Iterator[InternalRow], + regularReadFunction: PartitionedFile => Iterator[InternalRow], dataSchema: StructType, skeletonSchema: StructType, requiredColumns: Array[String], @@ -56,18 +57,18 @@ class HoodieBootstrapRDD(@transient spark: SparkSession, // It is a bootstrap split. Check both skeleton and data files. 
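      // (Descriptive note, not part of the original hunk) The branches below choose the read path for a
      // bootstrap split: skeleton-only when no data columns are requested, data-only when no meta columns
      // are requested, otherwise both files are read and their rows are stitched together column-wise;
      // a regular (non-bootstrap) file falls through to regularReadFunction in the outer else.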
if (dataSchema.isEmpty) { // No data column to fetch, hence fetch only from skeleton file - partitionedFileIterator = read(bootstrapPartition.split.skeletonFile.get, skeletonReadFunction) + partitionedFileIterator = skeletonReadFunction(bootstrapPartition.split.skeletonFile.get) } else if (skeletonSchema.isEmpty) { // No metadata column to fetch, hence fetch only from data file - partitionedFileIterator = read(bootstrapPartition.split.dataFile, dataReadFunction) + partitionedFileIterator = dataReadFunction(bootstrapPartition.split.dataFile) } else { // Fetch from both data and skeleton file, and merge - val dataFileIterator = read(bootstrapPartition.split.dataFile, dataReadFunction) - val skeletonFileIterator = read(bootstrapPartition.split.skeletonFile.get, skeletonReadFunction) + val dataFileIterator = dataReadFunction(bootstrapPartition.split.dataFile) + val skeletonFileIterator = skeletonReadFunction(bootstrapPartition.split.skeletonFile.get) partitionedFileIterator = merge(skeletonFileIterator, dataFileIterator) } } else { - partitionedFileIterator = read(bootstrapPartition.split.dataFile, regularReadFunction) + partitionedFileIterator = regularReadFunction(bootstrapPartition.split.dataFile) } partitionedFileIterator } @@ -101,19 +102,6 @@ class HoodieBootstrapRDD(@transient spark: SparkSession, mergedRow } - def read(partitionedFile: PartitionedFile, readFileFunction: PartitionedFile => Iterator[Any]) - : Iterator[InternalRow] = { - val fileIterator = readFileFunction(partitionedFile) - - import scala.collection.JavaConverters._ - - val rows = fileIterator.flatMap(_ match { - case r: InternalRow => Seq(r) - case b: ColumnarBatch => b.rowIterator().asScala - }) - rows - } - override protected def getPartitions: Array[Partition] = { tableState.files.zipWithIndex.map(file => { if (file._1.skeletonFile.isDefined) { diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala new file mode 100644 index 0000000000000..0dd54237ef582 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.model.HoodieBaseFile +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.exception.HoodieException +import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile} +import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Row, SQLContext} + +import scala.collection.JavaConverters._ + +/** + * This is Spark relation that can be used for querying metadata/fully bootstrapped query hoodie tables, as well as + * non-bootstrapped tables. It implements PrunedFilteredScan interface in order to support column pruning and filter + * push-down. For metadata bootstrapped files, if we query columns from both metadata and actual data then it will + * perform a merge of both to return the result. + * + * Caveat: Filter push-down does not work when querying both metadata and actual data columns over metadata + * bootstrapped files, because then the metadata file and data file can return different number of rows causing errors + * merging. + * + * @param _sqlContext Spark SQL Context + * @param userSchema User specified schema in the datasource query + * @param globPaths The global paths to query. If it not none, read from the globPaths, + * else read data from tablePath using HoodiFileIndex. + * @param metaClient Hoodie table meta client + * @param optParams DataSource options passed by the user + */ +class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext, + val userSchema: Option[StructType], + val globPaths: Seq[Path], + val metaClient: HoodieTableMetaClient, + val optParams: Map[String, String]) extends BaseRelation + with PrunedFilteredScan with Logging { + + val skeletonSchema: StructType = HoodieSparkUtils.getMetaSchema + var dataSchema: StructType = _ + var fullSchema: StructType = _ + + val fileIndex: HoodieBootstrapFileIndex = buildFileIndex() + + override def sqlContext: SQLContext = _sqlContext + + override val needConversion: Boolean = false + + override def schema: StructType = inferFullSchema() + + override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { + logInfo("Starting scan..") + + // Compute splits + val bootstrapSplits = fileIndex.files.map(hoodieBaseFile => { + var skeletonFile: Option[PartitionedFile] = Option.empty + var dataFile: PartitionedFile = null + + if (hoodieBaseFile.getBootstrapBaseFile.isPresent) { + skeletonFile = Option(PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen)) + dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getBootstrapBaseFile.get().getPath, 0, + hoodieBaseFile.getBootstrapBaseFile.get().getFileLen) + } else { + dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen) + } + HoodieBootstrapSplit(dataFile, skeletonFile) + }) + val tableState = HoodieBootstrapTableState(bootstrapSplits) + + // Get required schemas for column pruning + var requiredDataSchema = StructType(Seq()) + var requiredSkeletonSchema = StructType(Seq()) + // requiredColsSchema is the schema of requiredColumns, note that 
requiredColumns is in a random order + // so requiredColsSchema is not always equal to (requiredSkeletonSchema.fields ++ requiredDataSchema.fields) + var requiredColsSchema = StructType(Seq()) + requiredColumns.foreach(col => { + var field = dataSchema.find(_.name == col) + if (field.isDefined) { + requiredDataSchema = requiredDataSchema.add(field.get) + } else { + field = skeletonSchema.find(_.name == col) + requiredSkeletonSchema = requiredSkeletonSchema.add(field.get) + } + requiredColsSchema = requiredColsSchema.add(field.get) + }) + + // Prepare readers for reading data file and skeleton files + val dataReadFunction = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = _sqlContext.sparkSession, + dataSchema = dataSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredDataSchema, + filters = if (requiredSkeletonSchema.isEmpty) filters else Seq() , + options = optParams, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) + + val skeletonReadFunction = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = _sqlContext.sparkSession, + dataSchema = skeletonSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredSkeletonSchema, + filters = if (requiredDataSchema.isEmpty) filters else Seq(), + options = optParams, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) + + val regularReadFunction = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = _sqlContext.sparkSession, + dataSchema = fullSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredColsSchema, + filters = filters, + options = optParams, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) + + val rdd = new HoodieBootstrapRDD(_sqlContext.sparkSession, dataReadFunction, skeletonReadFunction, + regularReadFunction, requiredDataSchema, requiredSkeletonSchema, requiredColumns, tableState) + rdd.asInstanceOf[RDD[Row]] + } + + def inferFullSchema(): StructType = { + if (fullSchema == null) { + logInfo("Inferring schema..") + val schemaResolver = new TableSchemaResolver(metaClient) + val tableSchema = schemaResolver.getTableAvroSchema(false) + dataSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) + fullSchema = StructType(skeletonSchema.fields ++ dataSchema.fields) + } + fullSchema + } + + def buildFileIndex(): HoodieBootstrapFileIndex = { + logInfo("Building file index..") + val fileStatuses = if (globPaths.nonEmpty) { + // Load files from the global paths if it has defined to be compatible with the original mode + val inMemoryFileIndex = HoodieInMemoryFileIndex.create(_sqlContext.sparkSession, globPaths) + inMemoryFileIndex.allFiles() + } else { // Load files by the HoodieFileIndex. 
+ HoodieFileIndex(sqlContext.sparkSession, metaClient, Some(schema), optParams, + FileStatusCache.getOrCreate(sqlContext.sparkSession)).allFiles + } + if (fileStatuses.isEmpty) { + throw new HoodieException("No files found for reading in user provided path.") + } + + val fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline.getCommitsTimeline + .filterCompletedInstants, fileStatuses.toArray) + val latestFiles: List[HoodieBaseFile] = fsView.getLatestBaseFiles.iterator().asScala.toList + + if (log.isDebugEnabled) { + latestFiles.foreach(file => { + logDebug("Printing indexed files:") + if (file.getBootstrapBaseFile.isPresent) { + logDebug("Skeleton File: " + file.getPath + ", Data File: " + file.getBootstrapBaseFile.get().getPath) + } else { + logDebug("Regular Hoodie File: " + file.getPath) + } + }) + } + + HoodieBootstrapFileIndex(latestFiles) + } +} + +case class HoodieBootstrapFileIndex(files: List[HoodieBaseFile]) + +case class HoodieBootstrapTableState(files: List[HoodieBootstrapSplit]) + +case class HoodieBootstrapSplit(dataFile: PartitionedFile, skeletonFile: Option[PartitionedFile]) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCLIUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCLIUtils.scala new file mode 100644 index 0000000000000..0d3edd592d192 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCLIUtils.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hudi.avro.model.HoodieClusteringGroup +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.spark.SparkException +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.withSparkConf + +import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter} + +object HoodieCLIUtils { + + def createHoodieClientFromPath(sparkSession: SparkSession, + basePath: String, + conf: Map[String, String]): SparkRDDWriteClient[_] = { + val metaClient = HoodieTableMetaClient.builder().setBasePath(basePath) + .setConf(sparkSession.sessionState.newHadoopConf()).build() + val schemaUtil = new TableSchemaResolver(metaClient) + val schemaStr = schemaUtil.getTableAvroSchemaWithoutMetadataFields.toString + val finalParameters = HoodieWriterUtils.parametersWithWriteDefaults( + withSparkConf(sparkSession, Map.empty)( + conf + (DataSourceWriteOptions.TABLE_TYPE.key() -> metaClient.getTableType.name())) + ) + + val jsc = new JavaSparkContext(sparkSession.sparkContext) + DataSourceUtils.createHoodieClient(jsc, schemaStr, basePath, + metaClient.getTableConfig.getTableName, finalParameters.asJava) + } + + def extractPartitions(clusteringGroups: Seq[HoodieClusteringGroup]): String = { + var partitionPaths: Seq[String] = Seq.empty + clusteringGroups.foreach(g => + g.getSlices.asScala.foreach(slice => + partitionPaths = partitionPaths :+ slice.getPartitionPath + ) + ) + + partitionPaths.sorted.mkString(",") + } + + def getHoodieCatalogTable(sparkSession: SparkSession, table: String): HoodieCatalogTable = { + val seq: Seq[String] = table.split('.') + seq match { + case Seq(tableName) => + HoodieCatalogTable(sparkSession, TableIdentifier(tableName)) + case Seq(database, tableName) => + HoodieCatalogTable(sparkSession, TableIdentifier(tableName, Some(database))) + case _ => + throw new SparkException(s"Unsupported identifier $table") + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCatalystUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCatalystUtils.scala new file mode 100644 index 0000000000000..0f41dc1fff3f1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCatalystUtils.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hudi.common.data.HoodieData +import org.apache.spark.sql.Dataset +import org.apache.spark.storage.StorageLevel +import org.apache.spark.storage.StorageLevel._ + +object HoodieCatalystUtils extends SparkAdapterSupport { + + /** + * Executes provided function while keeping provided [[Dataset]] instance persisted for the + * duration of the execution + * + * @param df target [[Dataset]] to be persisted + * @param level desired [[StorageLevel]] of the persistence + * @param f target function to be executed while [[Dataset]] is kept persisted + * @tparam T return value of the target function + * @return execution outcome of the [[f]] function + */ + def withPersistedDataset[T](df: Dataset[_], level: StorageLevel = MEMORY_AND_DISK)(f: => T): T = { + df.persist(level) + try { + f + } finally { + df.unpersist() + } + } + + /** + * Executes provided function while keeping provided [[HoodieData]] instance persisted for the + * duration of the execution + * + * @param data target [[Dataset]] to be persisted + * @param level desired [[StorageLevel]] of the persistence + * @param f target function to be executed while [[Dataset]] is kept persisted + * @tparam T return value of the target function + * @return execution outcome of the [[f]] function + */ + def withPersistedData[T](data: HoodieData[_], level: StorageLevel = MEMORY_AND_DISK)(f: => T): T = { + data.persist(sparkAdapter.convertStorageLevelToString(level)) + try { + f + } finally { + data.unpersist() + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala new file mode 100644 index 0000000000000..8bd295c7f3db4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.FileStatus +import org.apache.hudi.client.utils.SparkInternalSchemaConverter +import org.apache.hudi.common.util.StringUtils.isNullOrEmpty +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.hudi.internal.schema.utils.SerDeHelper +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{PredicateHelper, SpecificInternalRow, UnsafeProjection} +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +import scala.collection.JavaConverters._ + +object HoodieDataSourceHelper extends PredicateHelper with SparkAdapterSupport { + + + /** + * Wrapper for `buildReaderWithPartitionValues` of [[ParquetFileFormat]] handling [[ColumnarBatch]], + * when Parquet's Vectorized Reader is used + * + * TODO move to HoodieBaseRelation, make private + */ + private[hudi] def buildHoodieParquetReader(sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration, + appendPartitionValues: Boolean = false): PartitionedFile => Iterator[InternalRow] = { + val parquetFileFormat: ParquetFileFormat = sparkAdapter.createHoodieParquetFileFormat(appendPartitionValues).get + val readParquetFile: PartitionedFile => Iterator[Any] = parquetFileFormat.buildReaderWithPartitionValues( + sparkSession = sparkSession, + dataSchema = dataSchema, + partitionSchema = partitionSchema, + requiredSchema = requiredSchema, + filters = filters, + options = options, + hadoopConf = hadoopConf + ) + + file: PartitionedFile => { + val iter = readParquetFile(file) + iter.flatMap { + case r: InternalRow => Seq(r) + case b: ColumnarBatch => b.rowIterator().asScala + } + } + } + + def splitFiles( + sparkSession: SparkSession, + file: FileStatus, + partitionValues: InternalRow): Seq[PartitionedFile] = { + val filePath = file.getPath + val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes + (0L until file.getLen by maxSplitBytes).map { offset => + val remaining = file.getLen - offset + val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining + PartitionedFile(partitionValues, filePath.toUri.toString, offset, size) + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieEmptyRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieEmptyRelation.scala similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieEmptyRelation.scala rename to hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieEmptyRelation.scala diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala new file mode 100644 index 0000000000000..5515212a3e96a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.HoodieFileIndex.{DataSkippingFailureMode, collectReferencedColumns, getConfigProperties} +import org.apache.hudi.HoodieSparkConfUtils.getConfigValue +import org.apache.hudi.common.config.{HoodieMetadataConfig, TypedProperties} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.keygen.constant.KeyGeneratorOptions +import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} +import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadataUtil} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{And, Expression, Literal} +import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory} +import org.apache.spark.sql.hudi.DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, SparkSession} +import org.apache.spark.unsafe.types.UTF8String + +import java.text.SimpleDateFormat +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +/** + * A file index which support partition prune for hoodie snapshot and read-optimized query. + * + * Main steps to get the file list for query: + * 1、Load all files and partition values from the table path. + * 2、Do the partition prune by the partition filter condition. + * + * There are 3 cases for this: + * 1、If the partition columns size is equal to the actually partition path level, we + * read it as partitioned table.(e.g partition column is "dt", the partition path is "2021-03-10") + * + * 2、If the partition columns size is not equal to the partition path level, but the partition + * column size is "1" (e.g. partition column is "dt", but the partition path is "2021/03/10" + * who's directory level is 3).We can still read it as a partitioned table. We will mapping the + * partition path (e.g. 2021/03/10) to the only partition column (e.g. "dt"). + * + * 3、Else the the partition columns size is not equal to the partition directory level and the + * size is great than "1" (e.g. partition column is "dt,hh", the partition path is "2021/03/10/12") + * , we read it as a Non-Partitioned table because we cannot know how to mapping the partition + * path with the partition columns in this case. 
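 *
 * A usage sketch for case 1 (not taken from this patch; `spark`, `basePath` and the column name "dt"
 * follow the examples above):
 * {{{
 *   import org.apache.spark.sql.functions.col
 *
 *   // A filter on the partition column participates in the partition pruning performed by this index,
 *   // so only files under the matching partition path(s) are listed.
 *   spark.read.format("hudi").load(basePath)
 *     .where(col("dt") === "2021-03-10")
 *     .count()
 * }}}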
+ * + * TODO rename to HoodieSparkSqlFileIndex + */ +case class HoodieFileIndex(spark: SparkSession, + metaClient: HoodieTableMetaClient, + schemaSpec: Option[StructType], + options: Map[String, String], + @transient fileStatusCache: FileStatusCache = NoopCache) + extends SparkHoodieTableFileIndex( + spark = spark, + metaClient = metaClient, + schemaSpec = schemaSpec, + configProperties = getConfigProperties(spark, options), + queryPaths = HoodieFileIndex.getQueryPaths(options), + specifiedQueryInstant = options.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key).map(HoodieSqlCommonUtils.formatQueryInstant), + fileStatusCache = fileStatusCache + ) + with FileIndex { + + @transient private lazy val columnStatsIndex = new ColumnStatsIndexSupport(spark, schema, metadataConfig, metaClient) + + override def rootPaths: Seq[Path] = queryPaths.asScala + + /** + * Returns the FileStatus for all the base files (excluding log files). This should be used only for + * cases where Spark directly fetches the list of files via HoodieFileIndex or for read optimized query logic + * implemented internally within Hudi like HoodieBootstrapRelation. This helps avoid the use of path filter + * to filter out log files within Spark. + * + * @return List of FileStatus for base files + */ + def allFiles: Seq[FileStatus] = { + cachedAllInputFileSlices.values.asScala.flatMap(_.asScala) + .map(fs => fs.getBaseFile.orElse(null)) + .filter(_ != null) + .map(_.getFileStatus) + .toSeq + } + + /** + * Invoked by Spark to fetch list of latest base files per partition. + * + * @param partitionFilters partition column filters + * @param dataFilters data columns filters + * @return list of PartitionDirectory containing partition to base files mapping + */ + override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + // Look up candidate files names in the col-stats index, if all of the following conditions are true + // - Data-skipping is enabled + // - Col-Stats Index is present + // - List of predicates (filters) is present + val candidateFilesNamesOpt: Option[Set[String]] = + lookupCandidateFilesInMetadataTable(dataFilters) match { + case Success(opt) => opt + case Failure(e) => + logError("Failed to lookup candidate files in File Index", e) + + spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match { + case DataSkippingFailureMode.Fallback.value => Option.empty + case DataSkippingFailureMode.Strict.value => throw new HoodieException(e); + } + } + + logDebug(s"Overlapping candidate files from Column Stats Index: ${candidateFilesNamesOpt.getOrElse(Set.empty)}") + + if (queryAsNonePartitionedTable) { + // Read as Non-Partitioned table + // Filter in candidate files based on the col-stats index lookup + val candidateFiles = allFiles.filter(fileStatus => + // NOTE: This predicate is true when {@code Option} is empty + candidateFilesNamesOpt.forall(_.contains(fileStatus.getPath.getName)) + ) + + logInfo(s"Total files : ${allFiles.size}; " + + s"candidate files after data skipping: ${candidateFiles.size}; " + + s"skipping percent ${if (allFiles.nonEmpty) (allFiles.size - candidateFiles.size) / allFiles.size.toDouble else 0}") + + Seq(PartitionDirectory(InternalRow.empty, candidateFiles)) + } else { + // Prune the partition path by the partition filters + val prunedPartitions = prunePartition(cachedAllInputFileSlices.keySet.asScala.toSeq, partitionFilters) + var totalFileSize = 0 + var candidateFileSize = 0 + + val 
result = prunedPartitions.map { partition => + val baseFileStatuses: Seq[FileStatus] = + cachedAllInputFileSlices.get(partition).asScala + .map(fs => fs.getBaseFile.orElse(null)) + .filter(_ != null) + .map(_.getFileStatus) + + // Filter in candidate files based on the col-stats index lookup + val candidateFiles = baseFileStatuses.filter(fs => + // NOTE: This predicate is true when {@code Option} is empty + candidateFilesNamesOpt.forall(_.contains(fs.getPath.getName))) + + totalFileSize += baseFileStatuses.size + candidateFileSize += candidateFiles.size + PartitionDirectory(InternalRow.fromSeq(partition.values), candidateFiles) + } + + logInfo(s"Total base files: $totalFileSize; " + + s"candidate files after data skipping : $candidateFileSize; " + + s"skipping percent ${if (allFiles.nonEmpty && totalFileSize > 0) (totalFileSize - candidateFileSize) / totalFileSize.toDouble else 0}") + + result + } + } + + private def lookupFileNamesMissingFromIndex(allIndexedFileNames: Set[String]) = { + val allBaseFileNames = allFiles.map(f => f.getPath.getName).toSet + allBaseFileNames -- allIndexedFileNames + } + + /** + * Computes pruned list of candidate base-files' names based on provided list of {@link dataFilters} + * conditions, by leveraging Metadata Table's Column Statistics index (hereon referred as ColStats for brevity) + * bearing "min", "max", "num_nulls" statistics for all columns. + * + * NOTE: This method has to return complete set of candidate files, since only provided candidates will + * ultimately be scanned as part of query execution. Hence, this method has to maintain the + * invariant of conservatively including every base-file's name, that is NOT referenced in its index. + * + * @param queryFilters list of original data filters passed down from querying engine + * @return list of pruned (data-skipped) candidate base-files' names + */ + private def lookupCandidateFilesInMetadataTable(queryFilters: Seq[Expression]): Try[Option[Set[String]]] = Try { + // NOTE: Data Skipping is only effective when it references columns that are indexed w/in + // the Column Stats Index (CSI). Following cases could not be effectively handled by Data Skipping: + // - Expressions on top-level column's fields (ie, for ex filters like "struct.field > 0", since + // CSI only contains stats for top-level columns, in this case for "struct") + // - Any expression not directly referencing top-level column (for ex, sub-queries, since there's + // nothing CSI in particular could be applied for) + lazy val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema) + + if (!isMetadataTableEnabled || !isDataSkippingEnabled || !columnStatsIndex.isIndexAvailable) { + validateConfig() + Option.empty + } else if (queryFilters.isEmpty || queryReferencedColumns.isEmpty) { + Option.empty + } else { + // NOTE: Since executing on-cluster via Spark API has its own non-trivial amount of overhead, + // it's most often preferential to fetch Column Stats Index w/in the same process (usually driver), + // w/o resorting to on-cluster execution. 
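// For illustration, the conservative candidate-set construction described in the method doc above
// boils down to the following set arithmetic (mirroring the variable names used further down):
//   val notIndexedFileNames       = allBaseFileNames -- allIndexedFileNames  // files the index knows nothing about
//   val candidateFileNames        = prunedCandidateFileNames ++ notIndexedFileNames  // never skip an un-indexed base file
// The Column Stats Index itself still has to be fetched and transposed before this pruning can happen.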
+ // For that we use a simple-heuristic to determine whether we should read and process CSI in-memory or + // on-cluster: total number of rows of the expected projected portion of the index has to be below the + // threshold (of 100k records) + val shouldReadInMemory = columnStatsIndex.shouldReadInMemory(this, queryReferencedColumns) + + columnStatsIndex.loadTransposed(queryReferencedColumns, shouldReadInMemory) { transposedColStatsDF => + val indexSchema = transposedColStatsDF.schema + val indexFilter = + queryFilters.map(translateIntoColumnStatsIndexFilterExpr(_, indexSchema)) + .reduce(And) + + val allIndexedFileNames = + transposedColStatsDF.select(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME) + .collect() + .map(_.getString(0)) + .toSet + + val prunedCandidateFileNames = + transposedColStatsDF.where(new Column(indexFilter)) + .select(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME) + .collect() + .map(_.getString(0)) + .toSet + + // NOTE: Col-Stats Index isn't guaranteed to have complete set of statistics for every + // base-file: since it's bound to clustering, which could occur asynchronously + // at arbitrary point in time, and is not likely to be touching all of the base files. + // + // To close that gap, we manually compute the difference b/w all indexed (by col-stats-index) + // files and all outstanding base-files, and make sure that all base files not + // represented w/in the index are included in the output of this method + val notIndexedFileNames = lookupFileNamesMissingFromIndex(allIndexedFileNames) + + Some(prunedCandidateFileNames ++ notIndexedFileNames) + } + } + } + + override def refresh(): Unit = { + super.refresh() + columnStatsIndex.invalidateCaches() + } + + override def inputFiles: Array[String] = + allFiles.map(_.getPath.toString).toArray + + override def sizeInBytes: Long = cachedFileSize + + private def isDataSkippingEnabled: Boolean = getConfigValue(options, spark.sessionState.conf, + DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false").toBoolean + + private def isMetadataTableEnabled: Boolean = metadataConfig.enabled() + + private def isColumnStatsIndexEnabled: Boolean = metadataConfig.isColumnStatsIndexEnabled + + private def validateConfig(): Unit = { + if (isDataSkippingEnabled && (!isMetadataTableEnabled || !isColumnStatsIndexEnabled)) { + logWarning("Data skipping requires both Metadata Table and Column Stats Index to be enabled as well! 
" + + s"(isMetadataTableEnabled = $isMetadataTableEnabled, isColumnStatsIndexEnabled = $isColumnStatsIndexEnabled") + } + } + +} + +object HoodieFileIndex extends Logging { + + object DataSkippingFailureMode extends Enumeration { + val configName = "hoodie.fileIndex.dataSkippingFailureMode" + + type DataSkippingFailureMode = Value + + case class Val(value: String) extends super.Val { + override def toString(): String = value + } + + import scala.language.implicitConversions + implicit def valueToVal(x: Value): DataSkippingFailureMode = x.asInstanceOf[Val] + + val Fallback: Val = Val("fallback") + val Strict: Val = Val("strict") + } + + private def collectReferencedColumns(spark: SparkSession, queryFilters: Seq[Expression], schema: StructType): Seq[String] = { + val resolver = spark.sessionState.analyzer.resolver + val refs = queryFilters.flatMap(_.references) + schema.fieldNames.filter { colName => refs.exists(r => resolver.apply(colName, r.name)) } + } + + def getConfigProperties(spark: SparkSession, options: Map[String, String]) = { + val sqlConf: SQLConf = spark.sessionState.conf + val properties = new TypedProperties() + properties.putAll(options.filter(p => p._2 != null).asJava) + + // To support metadata listing via Spark SQL we allow users to pass the config via SQL Conf in spark session. Users + // would be able to run SET hoodie.metadata.enable=true in the spark sql session to enable metadata listing. + val isMetadataTableEnabled = getConfigValue(options, sqlConf, HoodieMetadataConfig.ENABLE.key, null) + if (isMetadataTableEnabled != null) { + properties.setProperty(HoodieMetadataConfig.ENABLE.key(), String.valueOf(isMetadataTableEnabled)) + } + + properties + } + + def convertFilterForTimestampKeyGenerator(metaClient: HoodieTableMetaClient, + partitionFilters: Seq[Expression]): Seq[Expression] = { + + val tableConfig = metaClient.getTableConfig + val keyGenerator = tableConfig.getKeyGeneratorClassName + + if (keyGenerator != null && (keyGenerator.equals(classOf[TimestampBasedKeyGenerator].getCanonicalName) || + keyGenerator.equals(classOf[TimestampBasedAvroKeyGenerator].getCanonicalName))) { + val inputFormat = tableConfig.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP) + val outputFormat = tableConfig.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP) + if (StringUtils.isNullOrEmpty(inputFormat) || StringUtils.isNullOrEmpty(outputFormat) || + inputFormat.equals(outputFormat)) { + partitionFilters + } else { + try { + val inDateFormat = new SimpleDateFormat(inputFormat) + val outDateFormat = new SimpleDateFormat(outputFormat) + partitionFilters.toArray.map { + _.transformDown { + case Literal(value, dataType) if dataType.isInstanceOf[StringType] => + val converted = outDateFormat.format(inDateFormat.parse(value.toString)) + Literal(UTF8String.fromString(converted), StringType) + } + } + } catch { + case NonFatal(e) => + logWarning("Fail to convert filters for TimestampBaseAvroKeyGenerator", e) + partitionFilters + } + } + } else { + partitionFilters + } + } + + private def getQueryPaths(options: Map[String, String]): Seq[Path] = { + options.get("path") match { + case Some(p) => Seq(new Path(p)) + case None => + options.getOrElse("glob.paths", + throw new IllegalArgumentException("'path' or 'glob paths' option required")) + .split(",") + .map(new Path(_)) + .toSeq + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala new file mode 100644 index 0000000000000..512c97806f31d --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala @@ -0,0 +1,478 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder, IndexedRecord} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation.{BaseFileReader, generateUnsafeProjection, projectReader} +import org.apache.hudi.HoodieConversionUtils.{toJavaOption, toScalaOption} +import org.apache.hudi.HoodieMergeOnReadRDD.SafeAvroProjection.collectFieldOrdinals +import org.apache.hudi.HoodieMergeOnReadRDD._ +import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig} +import org.apache.hudi.common.engine.HoodieLocalEngineContext +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath +import org.apache.hudi.common.model.{HoodieLogFile, HoodieRecord, HoodieRecordPayload, OverwriteWithLatestAvroPayload} +import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.config.HoodiePayloadConfig +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.hadoop.config.HoodieRealtimeConfig +import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.hudi.metadata.HoodieTableMetadata.getDataTableBasePathFromMetadataTable +import org.apache.hudi.metadata.{HoodieBackedTableMetadata, HoodieTableMetadata} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.avro.HoodieAvroDeserializer +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.StructType +import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext} + +import java.io.Closeable +import java.util.Properties +import scala.annotation.tailrec +import scala.collection.JavaConverters._ +import scala.util.Try + +case class HoodieMergeOnReadPartition(index: Int, split: HoodieMergeOnReadFileSplit) extends Partition + +/** + * Class holding base-file readers for 3 different use-cases: + * + *
      + *
+ *   1. Full-schema reader: is used when the whole row has to be read to perform merging correctly.
+ *      This could occur when no optimizations could be applied and we have to fall back to reading the whole row
+ *      from the base file and the corresponding delta-log file to merge them correctly.
+ *
+ *   2. Required-schema reader: is used when it's fine to only read the row's projected columns.
+ *      This could occur when the row could be merged with the corresponding delta-log record while only having
+ *      the projected columns.
+ *
+ *   3. Required-schema reader (skip-merging): is used when no merging will be performed (skip-merged).
+ *      This could occur when the file-group has no delta-log files.
+ *
    + */ +private[hudi] case class HoodieMergeOnReadBaseFileReaders(fullSchemaReader: BaseFileReader, + requiredSchemaReader: BaseFileReader, + requiredSchemaReaderSkipMerging: BaseFileReader) + +/** + * RDD enabling Hudi's Merge-on-Read (MOR) semantic + * + * @param sc spark's context + * @param config hadoop configuration + * @param fileReaders suite of base file readers + * @param tableSchema table's full schema + * @param requiredSchema expected (potentially) projected schema + * @param tableState table's state + * @param mergeType type of merge performed + * @param fileSplits target file-splits this RDD will be iterating over + */ +class HoodieMergeOnReadRDD(@transient sc: SparkContext, + @transient config: Configuration, + fileReaders: HoodieMergeOnReadBaseFileReaders, + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + tableState: HoodieTableState, + mergeType: String, + @transient fileSplits: Seq[HoodieMergeOnReadFileSplit]) + extends RDD[InternalRow](sc, Nil) with HoodieUnsafeRDD { + + protected val maxCompactionMemoryInBytes: Long = getMaxCompactionMemoryInBytes(new JobConf(config)) + + private val confBroadcast = sc.broadcast(new SerializableWritable(config)) + private val payloadProps = tableState.preCombineFieldOpt + .map(preCombineField => + HoodiePayloadConfig.newBuilder + .withPayloadOrderingField(preCombineField) + .build + .getProps + ) + .getOrElse(new Properties()) + + private val whitelistedPayloadClasses: Set[String] = Seq( + classOf[OverwriteWithLatestAvroPayload] + ).map(_.getName).toSet + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + val mergeOnReadPartition = split.asInstanceOf[HoodieMergeOnReadPartition] + val iter = mergeOnReadPartition.split match { + case dataFileOnlySplit if dataFileOnlySplit.logFiles.isEmpty => + val projectedReader = projectReader(fileReaders.requiredSchemaReaderSkipMerging, requiredSchema.structTypeSchema) + projectedReader(dataFileOnlySplit.dataFile.get) + + case logFileOnlySplit if logFileOnlySplit.dataFile.isEmpty => + new LogFileIterator(logFileOnlySplit, getConfig) + + case split if mergeType.equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) => + val reader = fileReaders.requiredSchemaReaderSkipMerging + new SkipMergeIterator(split, reader, getConfig) + + case split if mergeType.equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) => + val reader = pickBaseFileReader + new RecordMergingFileIterator(split, reader, getConfig) + + case _ => throw new HoodieException(s"Unable to select an Iterator to read the Hoodie MOR File Split for " + + s"file path: ${mergeOnReadPartition.split.dataFile.get.filePath}" + + s"log paths: ${mergeOnReadPartition.split.logFiles.toString}" + + s"hoodie table path: ${tableState.tablePath}" + + s"spark partition Index: ${mergeOnReadPartition.index}" + + s"merge type: ${mergeType}") + } + + if (iter.isInstanceOf[Closeable]) { + // register a callback to close logScanner which will be executed on task completion. + // when tasks finished, this method will be called, and release resources. + Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => iter.asInstanceOf[Closeable].close())) + } + + iter + } + + private def pickBaseFileReader: BaseFileReader = { + // NOTE: This is an optimization making sure that even for MOR tables we fetch absolute minimum + // of the stored data possible, while still properly executing corresponding relation's semantic + // and meet the query's requirements. 
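// For illustration, a snapshot query that opts into the skip-merge code path dispatched in compute()
// above could be issued as in the sketch below. DataSourceReadOptions.REALTIME_MERGE is assumed here
// to be the merge-type read option; the table path is hypothetical and an active SparkSession `spark`
// is assumed.
//   spark.read.format("hudi")
//     .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
//     .option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
//     .load("/tmp/hudi/mor_table")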
+ // + // Here we assume that iff queried table + // a) It does use one of the standard (and whitelisted) Record Payload classes + // then we can avoid reading and parsing the records w/ _full_ schema, and instead only + // rely on projected one, nevertheless being able to perform merging correctly + if (whitelistedPayloadClasses.contains(tableState.recordPayloadClassName)) { + fileReaders.requiredSchemaReader + } else { + fileReaders.fullSchemaReader + } + } + + override protected def getPartitions: Array[Partition] = + fileSplits.zipWithIndex.map(file => HoodieMergeOnReadPartition(file._2, file._1)).toArray + + private def getConfig: Configuration = { + val conf = confBroadcast.value.value + HoodieMergeOnReadRDD.CONFIG_INSTANTIATION_LOCK.synchronized { + new Configuration(conf) + } + } + + /** + * Provided w/ instance of [[HoodieMergeOnReadFileSplit]], iterates over all of the records stored in + * Delta Log files (represented as [[InternalRow]]s) + */ + private class LogFileIterator(split: HoodieMergeOnReadFileSplit, + config: Configuration) + extends Iterator[InternalRow] with Closeable with AvroDeserializerSupport { + + protected override val requiredAvroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) + protected override val requiredStructTypeSchema: StructType = requiredSchema.structTypeSchema + + protected val logFileReaderAvroSchema: Schema = new Schema.Parser().parse(tableSchema.avroSchemaStr) + + protected var recordToLoad: InternalRow = _ + + private val requiredSchemaSafeAvroProjection = SafeAvroProjection.create(logFileReaderAvroSchema, requiredAvroSchema) + + private var logScanner = { + val internalSchema = tableSchema.internalSchema.getOrElse(InternalSchema.getEmptyInternalSchema) + HoodieMergeOnReadRDD.scanLog(split.logFiles, getPartitionPath(split), logFileReaderAvroSchema, tableState, + maxCompactionMemoryInBytes, config, internalSchema) + } + + private val logRecords = logScanner.getRecords.asScala + + // NOTE: This have to stay lazy to make sure it's initialized only at the point where it's + // going to be used, since we modify `logRecords` before that and therefore can't do it any earlier + protected lazy val logRecordsIterator: Iterator[Option[GenericRecord]] = + logRecords.iterator.map { + case (_, record) => + toScalaOption(record.getData.getInsertValue(logFileReaderAvroSchema, payloadProps)) + .map(_.asInstanceOf[GenericRecord]) + } + + protected def removeLogRecord(key: String): Option[HoodieRecord[_ <: HoodieRecordPayload[_]]] = + logRecords.remove(key) + + override def hasNext: Boolean = hasNextInternal + + // NOTE: It's crucial for this method to be annotated w/ [[@tailrec]] to make sure + // that recursion is unfolded into a loop to avoid stack overflows while + // handling records + @tailrec private def hasNextInternal: Boolean = { + logRecordsIterator.hasNext && { + val avroRecordOpt = logRecordsIterator.next() + if (avroRecordOpt.isEmpty) { + // Record has been deleted, skipping + this.hasNextInternal + } else { + val projectedAvroRecord = requiredSchemaSafeAvroProjection(avroRecordOpt.get) + recordToLoad = deserialize(projectedAvroRecord) + true + } + } + } + + override final def next(): InternalRow = recordToLoad + + override def close(): Unit = + if (logScanner != null) { + try { + logScanner.close() + } finally { + logScanner = null + } + } + } + + /** + * Provided w/ instance of [[HoodieMergeOnReadFileSplit]], provides an iterator over all of the records stored in + * Base file as well as all of the Delta Log files simply 
returning concatenation of these streams, while not + * performing any combination/merging of the records w/ the same primary keys (ie producing duplicates potentially) + */ + private class SkipMergeIterator(split: HoodieMergeOnReadFileSplit, + baseFileReader: BaseFileReader, + config: Configuration) + extends LogFileIterator(split, config) { + + private val requiredSchemaUnsafeProjection = generateUnsafeProjection(baseFileReader.schema, requiredStructTypeSchema) + + private val baseFileIterator = baseFileReader(split.dataFile.get) + + override def hasNext: Boolean = { + if (baseFileIterator.hasNext) { + // No merge is required, simply load current row and project into required schema + recordToLoad = requiredSchemaUnsafeProjection(baseFileIterator.next()) + true + } else { + super[LogFileIterator].hasNext + } + } + } + + /** + * Provided w/ instance of [[HoodieMergeOnReadFileSplit]], provides an iterator over all of the records stored in + * a) Base file and all of the b) Delta Log files combining records with the same primary key from both of these + * streams + */ + private class RecordMergingFileIterator(split: HoodieMergeOnReadFileSplit, + baseFileReader: BaseFileReader, + config: Configuration) + extends LogFileIterator(split, config) { + + // NOTE: Record-merging iterator supports 2 modes of operation merging records bearing either + // - Full table's schema + // - Projected schema + // As such, no particular schema could be assumed, and therefore we rely on the caller + // to correspondingly set the scheme of the expected output of base-file reader + private val baseFileReaderAvroSchema = sparkAdapter.getAvroSchemaConverters.toAvroType(baseFileReader.schema, nullable = false, "record") + + private val serializer = sparkAdapter.createAvroSerializer(baseFileReader.schema, baseFileReaderAvroSchema, nullable = false) + + private val reusableRecordBuilder: GenericRecordBuilder = new GenericRecordBuilder(requiredAvroSchema) + + private val recordKeyOrdinal = baseFileReader.schema.fieldIndex(tableState.recordKeyField) + + private val requiredSchemaUnsafeProjection = generateUnsafeProjection(baseFileReader.schema, requiredStructTypeSchema) + + private val baseFileIterator = baseFileReader(split.dataFile.get) + + override def hasNext: Boolean = hasNextInternal + + // NOTE: It's crucial for this method to be annotated w/ [[@tailrec]] to make sure + // that recursion is unfolded into a loop to avoid stack overflows while + // handling records + @tailrec private def hasNextInternal: Boolean = { + if (baseFileIterator.hasNext) { + val curRow = baseFileIterator.next() + val curKey = curRow.getString(recordKeyOrdinal) + val updatedRecordOpt = removeLogRecord(curKey) + if (updatedRecordOpt.isEmpty) { + // No merge is required, simply load current row and project into required schema + recordToLoad = requiredSchemaUnsafeProjection(curRow) + true + } else { + val mergedAvroRecordOpt = merge(serialize(curRow), updatedRecordOpt.get) + if (mergedAvroRecordOpt.isEmpty) { + // Record has been deleted, skipping + this.hasNextInternal + } else { + val projectedAvroRecord = projectAvroUnsafe(mergedAvroRecordOpt.get.asInstanceOf[GenericRecord], + requiredAvroSchema, reusableRecordBuilder) + recordToLoad = deserialize(projectedAvroRecord) + true + } + } + } else { + super[LogFileIterator].hasNext + } + } + + private def serialize(curRowRecord: InternalRow): GenericRecord = + serializer.serialize(curRowRecord).asInstanceOf[GenericRecord] + + private def merge(curAvroRecord: GenericRecord, newRecord: 
HoodieRecord[_ <: HoodieRecordPayload[_]]): Option[IndexedRecord] = { + // NOTE: We have to pass in Avro Schema used to read from Delta Log file since we invoke combining API + // on the record from the Delta Log + toScalaOption(newRecord.getData.combineAndGetUpdateValue(curAvroRecord, logFileReaderAvroSchema, payloadProps)) + } + } +} + +private object HoodieMergeOnReadRDD { + + val CONFIG_INSTANTIATION_LOCK = new Object() + + def scanLog(logFiles: List[HoodieLogFile], + partitionPath: Path, + logSchema: Schema, + tableState: HoodieTableState, + maxCompactionMemoryInBytes: Long, + hadoopConf: Configuration, internalSchema: InternalSchema = InternalSchema.getEmptyInternalSchema): HoodieMergedLogRecordScanner = { + val tablePath = tableState.tablePath + val fs = FSUtils.getFs(tablePath, hadoopConf) + + if (HoodieTableMetadata.isMetadataTable(tablePath)) { + val metadataConfig = HoodieMetadataConfig.newBuilder() + .fromProperties(tableState.metadataConfig.getProps).enable(true).build() + val dataTableBasePath = getDataTableBasePathFromMetadataTable(tablePath) + val metadataTable = new HoodieBackedTableMetadata( + new HoodieLocalEngineContext(hadoopConf), metadataConfig, + dataTableBasePath, + hadoopConf.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) + + // We have to force full-scan for the MT log record reader, to make sure + // we can iterate over all of the partitions, since by default some of the partitions (Column Stats, + // Bloom Filter) are in "point-lookup" mode + val forceFullScan = true + + // NOTE: In case of Metadata Table partition path equates to partition name (since there's just one level + // of indirection among MT partitions) + val relativePartitionPath = getRelativePartitionPath(new Path(tablePath), partitionPath) + metadataTable.getLogRecordScanner(logFiles.asJava, relativePartitionPath, toJavaOption(Some(forceFullScan))) + .getLeft + } else { + val logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(tablePath) + .withLogFilePaths(logFiles.map(logFile => logFile.getPath.toString).asJava) + .withReaderSchema(logSchema) + .withLatestInstantTime(tableState.latestCommitTimestamp) + .withReadBlocksLazily( + Try(hadoopConf.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, + HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED).toBoolean) + .getOrElse(false)) + .withReverseReader(false) + .withInternalSchema(internalSchema) + .withBufferSize( + hadoopConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, + HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) + .withMaxMemorySizeInBytes(maxCompactionMemoryInBytes) + .withSpillableMapBasePath( + hadoopConf.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, + HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) + .withDiskMapType( + hadoopConf.getEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key, + HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue)) + .withBitCaskDiskMapCompressionEnabled( + hadoopConf.getBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), + HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())) + if (logFiles.nonEmpty) { + logRecordScannerBuilder.withPartition( + getRelativePartitionPath(new Path(tableState.tablePath), logFiles.head.getPath.getParent)) + } + + logRecordScannerBuilder.build() + } + } + + private def projectAvroUnsafe(record: GenericRecord, projectedSchema: Schema, reusableRecordBuilder: 
GenericRecordBuilder): GenericRecord = { + val fields = projectedSchema.getFields.asScala + fields.foreach(field => reusableRecordBuilder.set(field, record.get(field.name()))) + reusableRecordBuilder.build() + } + + private def getPartitionPath(split: HoodieMergeOnReadFileSplit): Path = { + // Determine partition path as an immediate parent folder of either + // - The base file + // - Some log file + split.dataFile.map(baseFile => new Path(baseFile.filePath)) + .getOrElse(split.logFiles.head.getPath) + .getParent + } + + // TODO extract to HoodieAvroSchemaUtils + abstract class AvroProjection extends (GenericRecord => GenericRecord) + + class SafeAvroProjection(sourceSchema: Schema, + projectedSchema: Schema, + reusableRecordBuilder: GenericRecordBuilder = null) extends AvroProjection { + + private val ordinals: List[Int] = collectFieldOrdinals(projectedSchema, sourceSchema) + private val recordBuilder: GenericRecordBuilder = + if (reusableRecordBuilder != null) { + reusableRecordBuilder + } else { + new GenericRecordBuilder(projectedSchema) + } + + override def apply(record: GenericRecord): GenericRecord = { + val fields = projectedSchema.getFields.asScala + checkState(fields.length == ordinals.length) + fields.zip(ordinals).foreach { + case (field, pos) => recordBuilder.set(field, record.get(pos)) + } + recordBuilder.build() + } + } + + object SafeAvroProjection { + def create(sourceSchema: Schema, projectedSchema: Schema, reusableRecordBuilder: GenericRecordBuilder = null): SafeAvroProjection = + new SafeAvroProjection( + sourceSchema = sourceSchema, + projectedSchema = projectedSchema, + reusableRecordBuilder = reusableRecordBuilder) + + /** + * Maps [[projected]] [[Schema]] onto [[source]] one, collecting corresponding field ordinals w/in it, which + * will be subsequently used by either [[projectRowUnsafe]] or [[projectAvroUnsafe()]] method + * + * @param projected target projected schema (which is a proper subset of [[source]] [[Schema]]) + * @param source source schema of the record being projected + * @return list of ordinals of corresponding fields of [[projected]] schema w/in [[source]] one + */ + private def collectFieldOrdinals(projected: Schema, source: Schema): List[Int] = { + projected.getFields.asScala.map(f => source.getField(f.name()).pos()).toList + } + } + + trait AvroDeserializerSupport extends SparkAdapterSupport { + protected val requiredAvroSchema: Schema + protected val requiredStructTypeSchema: StructType + + private lazy val deserializer: HoodieAvroDeserializer = + sparkAdapter.createAvroDeserializer(requiredAvroSchema, requiredStructTypeSchema) + + protected def deserialize(avroRecord: GenericRecord): InternalRow = { + checkState(avroRecord.getSchema.getFields.size() == requiredStructTypeSchema.fields.length) + deserializer.deserialize(avroRecord).get.asInstanceOf[InternalRow] + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkConfUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkConfUtils.scala new file mode 100644 index 0000000000000..5a6f03aed1bde --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkConfUtils.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi + +import org.apache.spark.sql.internal.SQLConf + +/** + * Util methods for Hudi Spark and SQL configurations + */ +object HoodieSparkConfUtils { + /** + * Gets boolean config value from config properties and SQL conf. + * + * @param options Config properties. + * @param sqlConf SQL conf. + * @param configKey Config key to fetch. + * @param defaultValue Default value to return if not configured. + * @return The config value. + */ + def getConfigValue(options: Map[String, String], + sqlConf: SQLConf, + configKey: String, + defaultValue: String): String = { + options.getOrElse(configKey, sqlConf.getConfString(configKey, defaultValue)) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala new file mode 100644 index 0000000000000..c9f424825031f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -0,0 +1,817 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hudi.AvroConversionUtils.{convertStructTypeToAvroSchema, getAvroRecordNameAndNamespace} +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.HoodieConversionUtils.{toProperties, toScalaOption} +import org.apache.hudi.HoodieWriterUtils._ +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.client.{HoodieWriteResult, SparkRDDWriteClient} +import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieConfig, HoodieMetadataConfig, TypedProperties} +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model._ +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.{CommitUtils, StringUtils} +import org.apache.hudi.config.HoodieBootstrapConfig.{BASE_PATH, INDEX_CLASS_NAME, KEYGEN_CLASS_NAME} +import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig} +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRowsFactory, NonSortPartitionerWithRows} +import org.apache.hudi.hive.{HiveSyncConfigHolder, HiveSyncTool} +import org.apache.hudi.index.SparkHoodieIndexFactory +import org.apache.hudi.internal.DataSourceInternalWriterHelper +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter +import org.apache.hudi.internal.schema.utils.{AvroSchemaEvolutionUtils, SerDeHelper} +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory +import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} +import org.apache.hudi.metrics.Metrics +import org.apache.hudi.sync.common.HoodieSyncConfig +import org.apache.hudi.sync.common.util.SyncUtilHelpers +import org.apache.hudi.table.BulkInsertPartitioner +import org.apache.hudi.util.SparkKeyGenUtils +import org.apache.log4j.LogManager +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.types.StructType +import org.apache.spark.{SPARK_VERSION, SparkContext} + +import scala.collection.JavaConversions._ +import scala.collection.mutable + +object HoodieSparkSqlWriter { + + private val log = LogManager.getLogger(getClass) + private var tableExists: Boolean = false + private var asyncCompactionTriggerFnDefined: Boolean = false + private var asyncClusteringTriggerFnDefined: Boolean = false + + def write(sqlContext: SQLContext, + mode: SaveMode, + optParams: Map[String, String], + df: DataFrame, + hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty, + hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty, + asyncCompactionTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty, + asyncClusteringTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty) + : (Boolean, common.util.Option[String], common.util.Option[String], common.util.Option[String], + SparkRDDWriteClient[HoodieRecordPayload[Nothing]], HoodieTableConfig) = { + + 
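// For illustration, a typical datasource write that ends up in this method might be issued as in the
// sketch below (table name, field names and path are hypothetical; option keys are referenced via the
// constants used throughout this file, and spark.serializer must be KryoSerializer as enforced below):
//   df.write.format("hudi")
//     .option(HoodieWriteConfig.TBL_NAME.key, "trips")
//     .option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "uuid")
//     .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key, "ts")
//     .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
//     .mode(SaveMode.Append)
//     .save("/tmp/hudi/trips")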
assert(optParams.get("path").exists(!StringUtils.isNullOrEmpty(_)), "'path' must be set") + val path = optParams("path") + val basePath = new Path(path) + val sparkContext = sqlContext.sparkContext + val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) + tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) + var tableConfig = getHoodieTableConfig(sparkContext, path, hoodieTableConfigOpt) + validateTableConfig(sqlContext.sparkSession, optParams, tableConfig, mode == SaveMode.Overwrite) + + val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig, mode) + val originKeyGeneratorClassName = HoodieWriterUtils.getOriginKeyGenerator(parameters) + val timestampKeyGeneratorConfigs = extractConfigsRelatedToTimestampBasedKeyGenerator( + originKeyGeneratorClassName, parameters) + //validate datasource and tableconfig keygen are the same + validateKeyGeneratorConfig(originKeyGeneratorClassName, tableConfig); + val databaseName = hoodieConfig.getStringOrDefault(HoodieTableConfig.DATABASE_NAME, "") + val tblName = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME, + s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.").trim + assert(!StringUtils.isNullOrEmpty(hoodieConfig.getString(HoodieWriteConfig.TBL_NAME)), + s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.") + + asyncCompactionTriggerFnDefined = asyncCompactionTriggerFn.isDefined + asyncClusteringTriggerFnDefined = asyncClusteringTriggerFn.isDefined + sparkContext.getConf.getOption("spark.serializer") match { + case Some(ser) if ser.equals("org.apache.spark.serializer.KryoSerializer") => + case _ => throw new HoodieException("hoodie only support org.apache.spark.serializer.KryoSerializer as spark.serializer") + } + val tableType = HoodieTableType.valueOf(hoodieConfig.getString(TABLE_TYPE)) + var operation = WriteOperationType.fromValue(hoodieConfig.getString(OPERATION)) + // It does not make sense to allow upsert() operation if INSERT_DROP_DUPS is true + // Auto-correct the operation to "insert" if OPERATION is set to "upsert" wrongly + // or not set (in which case it will be set as "upsert" by parametersWithWriteDefaults()) . + if (hoodieConfig.getBoolean(INSERT_DROP_DUPS) && + operation == WriteOperationType.UPSERT) { + + log.warn(s"$UPSERT_OPERATION_OPT_VAL is not applicable " + + s"when $INSERT_DROP_DUPS is set to be true, " + + s"overriding the $OPERATION to be $INSERT_OPERATION_OPT_VAL") + + operation = WriteOperationType.INSERT + } + + val jsc = new JavaSparkContext(sparkContext) + if (asyncCompactionTriggerFn.isDefined) { + if (jsc.getConf.getOption(SparkConfigs.SPARK_SCHEDULER_ALLOCATION_FILE_KEY).isDefined) { + jsc.setLocalProperty("spark.scheduler.pool", SparkConfigs.SPARK_DATASOURCE_WRITER_POOL_NAME) + } + } + val instantTime = HoodieActiveTimeline.createNewInstantTime() + val keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(hoodieConfig.getProps)) + + if (mode == SaveMode.Ignore && tableExists) { + log.warn(s"hoodie table at $basePath already exists. 
Ignoring & not performing actual writes.") + (false, common.util.Option.empty(), common.util.Option.empty(), common.util.Option.empty(), hoodieWriteClient.orNull, tableConfig) + } else { + // Handle various save modes + handleSaveModes(sqlContext.sparkSession, mode, basePath, tableConfig, tblName, operation, fs) + val partitionColumns = SparkKeyGenUtils.getPartitionColumns(keyGenerator, toProperties(parameters)) + val tableMetaClient = if (tableExists) { + HoodieTableMetaClient.builder + .setConf(sparkContext.hadoopConfiguration) + .setBasePath(path) + .build() + } else { + val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.BASE_FILE_FORMAT) + val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.ARCHIVELOG_FOLDER) + val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD) + val populateMetaFields = hoodieConfig.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS) + val useBaseFormatMetaFile = hoodieConfig.getBooleanOrDefault(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT); + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(tableType) + .setDatabaseName(databaseName) + .setTableName(tblName) + .setRecordKeyFields(recordKeyFields) + .setBaseFileFormat(baseFileFormat) + .setArchiveLogFolder(archiveLogFolder) + .setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_NAME)) + // we can't fetch preCombine field from hoodieConfig object, since it falls back to "ts" as default value, + // but we are interested in what user has set, hence fetching from optParams. + .setPreCombineField(optParams.getOrElse(PRECOMBINE_FIELD.key(), null)) + .setPartitionFields(partitionColumns) + .setPopulateMetaFields(populateMetaFields) + .setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD)) + .setKeyGeneratorClassProp(originKeyGeneratorClassName) + .set(timestampKeyGeneratorConfigs) + .setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) + .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) + .setPartitionMetafileUseBaseFormat(useBaseFormatMetaFile) + .setShouldDropPartitionColumns(hoodieConfig.getBooleanOrDefault(HoodieTableConfig.DROP_PARTITION_COLUMNS)) + .setCommitTimezone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getStringOrDefault(HoodieTableConfig.TIMELINE_TIMEZONE))) + .initTable(sparkContext.hadoopConfiguration, path) + } + tableConfig = tableMetaClient.getTableConfig + + val commitActionType = CommitUtils.getCommitActionType(operation, tableConfig.getTableType) + val dropPartitionColumns = hoodieConfig.getBoolean(DataSourceWriteOptions.DROP_PARTITION_COLUMNS) + + // short-circuit if bulk_insert via row is enabled. 
+ // scalastyle:off + if (hoodieConfig.getBoolean(ENABLE_ROW_WRITER) && + operation == WriteOperationType.BULK_INSERT) { + val (success, commitTime: common.util.Option[String]) = bulkInsertAsRow(sqlContext, parameters, df, tblName, + basePath, path, instantTime, partitionColumns, tableConfig.isTablePartitioned) + return (success, commitTime, common.util.Option.empty(), common.util.Option.empty(), hoodieWriteClient.orNull, tableConfig) + } + // scalastyle:on + + val reconcileSchema = parameters(DataSourceWriteOptions.RECONCILE_SCHEMA.key()).toBoolean + val (writeResult, writeClient: SparkRDDWriteClient[HoodieRecordPayload[Nothing]]) = + operation match { + case WriteOperationType.DELETE => { + val genericRecords = registerKryoClassesAndGetGenericRecords(tblName, sparkContext, df, reconcileSchema) + // Convert to RDD[HoodieKey] + val hoodieKeysToDelete = genericRecords.map(gr => keyGenerator.getKey(gr)).toJavaRDD() + + if (!tableExists) { + throw new HoodieException(s"hoodie table at $basePath does not exist") + } + + // Create a HoodieWriteClient & issue the delete. + val internalSchemaOpt = getLatestTableInternalSchema(hoodieConfig, tableMetaClient) + val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, + null, path, tblName, + mapAsJavaMap(addSchemaEvolutionParameters(parameters, internalSchemaOpt) - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key))) + .asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] + + if (isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration())) { + asyncCompactionTriggerFn.get.apply(client) + } + if (isAsyncClusteringEnabled(client, parameters)) { + asyncClusteringTriggerFn.get.apply(client) + } + + // Issue deletes + client.startCommitWithTime(instantTime, commitActionType) + val writeStatuses = DataSourceUtils.doDeleteOperation(client, hoodieKeysToDelete, instantTime) + (writeStatuses, client) + } + case WriteOperationType.DELETE_PARTITION => { + if (!tableExists) { + throw new HoodieException(s"hoodie table at $basePath does not exist") + } + + // Get list of partitions to delete + val partitionsToDelete = if (parameters.containsKey(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key())) { + val partitionColsToDelete = parameters(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key()).split(",") + java.util.Arrays.asList(partitionColsToDelete: _*) + } else { + val genericRecords = registerKryoClassesAndGetGenericRecords(tblName, sparkContext, df, reconcileSchema) + genericRecords.map(gr => keyGenerator.getKey(gr).getPartitionPath).toJavaRDD().distinct().collect() + } + // Create a HoodieWriteClient & issue the delete. 
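// For illustration, a delete-partitions write that reaches this branch might be issued as in the
// sketch below (partition values and path are hypothetical; keys referenced via their constants):
//   df.write.format("hudi")
//     .option(DataSourceWriteOptions.OPERATION.key, WriteOperationType.DELETE_PARTITION.value)
//     .option(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key, "2021/03/10,2021/03/11")
//     .mode(SaveMode.Append)
//     .save("/tmp/hudi/trips")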
+ val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, + null, path, tblName, + mapAsJavaMap(parameters - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key))) + .asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] + // Issue delete partitions + client.startCommitWithTime(instantTime, commitActionType) + val writeStatuses = DataSourceUtils.doDeletePartitionsOperation(client, partitionsToDelete, instantTime) + (writeStatuses, client) + } + case _ => { // any other operation + // register classes & schemas + val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName) + sparkContext.getConf.registerKryoClasses( + Array(classOf[org.apache.avro.generic.GenericData], + classOf[org.apache.avro.Schema])) + + // TODO(HUDI-4472) revisit and simplify schema handling + val sourceSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace) + val latestTableSchema = getLatestTableSchema(sqlContext.sparkSession, tableMetaClient).getOrElse(sourceSchema) + + val schemaEvolutionEnabled = parameters.getOrDefault(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key(), "false").toBoolean + var internalSchemaOpt = getLatestTableInternalSchema(hoodieConfig, tableMetaClient) + + val writerSchema: Schema = + if (reconcileSchema) { + // In case we need to reconcile the schema and schema evolution is enabled, + // we will force-apply schema evolution to the writer's schema + if (schemaEvolutionEnabled && internalSchemaOpt.isEmpty) { + internalSchemaOpt = Some(AvroInternalSchemaConverter.convert(sourceSchema)) + } + + if (internalSchemaOpt.isDefined) { + // Apply schema evolution, by auto-merging write schema and read schema + val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(sourceSchema, internalSchemaOpt.get) + AvroInternalSchemaConverter.convert(mergedInternalSchema, latestTableSchema.getName) + } else if (TableSchemaResolver.isSchemaCompatible(sourceSchema, latestTableSchema)) { + // In case schema reconciliation is enabled and source and latest table schemas + // are compatible (as defined by [[TableSchemaResolver#isSchemaCompatible]], then we will + // pick latest table's schema as the writer's schema + latestTableSchema + } else { + // Otherwise fallback to original source's schema + sourceSchema + } + } else { + // In case reconciliation is disabled, we still have to do nullability attributes + // (minor) reconciliation, making sure schema of the incoming batch is in-line with + // the data already committed in the table + AvroSchemaEvolutionUtils.canonicalizeColumnNullability(sourceSchema, latestTableSchema) + } + + validateSchemaForHoodieIsDeleted(writerSchema) + sparkContext.getConf.registerAvroSchemas(writerSchema) + log.info(s"Registered avro schema : ${writerSchema.toString(true)}") + + // Convert to RDD[HoodieRecord] + val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, structName, nameSpace, reconcileSchema, + org.apache.hudi.common.util.Option.of(writerSchema)) + val shouldCombine = parameters(INSERT_DROP_DUPS.key()).toBoolean || + operation.equals(WriteOperationType.UPSERT) || + parameters.getOrElse(HoodieWriteConfig.COMBINE_BEFORE_INSERT.key(), + HoodieWriteConfig.COMBINE_BEFORE_INSERT.defaultValue()).toBoolean + val hoodieAllIncomingRecords = genericRecords.map(gr => { + val processedRecord = getProcessedRecord(partitionColumns, gr, dropPartitionColumns) + val hoodieRecord = if (shouldCombine) { + val orderingVal = HoodieAvroUtils.getNestedFieldVal(gr, 
hoodieConfig.getString(PRECOMBINE_FIELD), false, parameters.getOrElse( + DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()).toBoolean) + .asInstanceOf[Comparable[_]] + DataSourceUtils.createHoodieRecord(processedRecord, + orderingVal, + keyGenerator.getKey(gr), + hoodieConfig.getString(PAYLOAD_CLASS_NAME)) + } else { + DataSourceUtils.createHoodieRecord(processedRecord, keyGenerator.getKey(gr), hoodieConfig.getString(PAYLOAD_CLASS_NAME)) + } + hoodieRecord + }).toJavaRDD() + + val writerDataSchema = if (dropPartitionColumns) generateSchemaWithoutPartitionColumns(partitionColumns, writerSchema) else writerSchema + // Create a HoodieWriteClient & issue the write. + + val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writerDataSchema.toString, path, + tblName, mapAsJavaMap(addSchemaEvolutionParameters(parameters, internalSchemaOpt) - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key) + )).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] + + if (isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration())) { + asyncCompactionTriggerFn.get.apply(client) + } + + if (isAsyncClusteringEnabled(client, parameters)) { + asyncClusteringTriggerFn.get.apply(client) + } + + val hoodieRecords = + if (hoodieConfig.getBoolean(INSERT_DROP_DUPS)) { + DataSourceUtils.dropDuplicates(jsc, hoodieAllIncomingRecords, mapAsJavaMap(parameters)) + } else { + hoodieAllIncomingRecords + } + client.startCommitWithTime(instantTime, commitActionType) + val writeResult = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation) + (writeResult, client) + } + } + + // Check for errors and commit the write. 
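// For illustration, the deduplication branch above (DataSourceUtils.dropDuplicates) is taken when the
// writer sets (key referenced via its constant):
//   .option(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "true")
// in which case an "upsert" operation is also downgraded to "insert" earlier in this method.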
+ val (writeSuccessful, compactionInstant, clusteringInstant) = + commitAndPerformPostOperations(sqlContext.sparkSession, df.schema, + writeResult, parameters, writeClient, tableConfig, jsc, + TableInstantInfo(basePath, instantTime, commitActionType, operation)) + + (writeSuccessful, common.util.Option.ofNullable(instantTime), compactionInstant, clusteringInstant, writeClient, tableConfig) + } + } + + def generateSchemaWithoutPartitionColumns(partitionParam: String, schema: Schema): Schema = { + val fieldsToRemove = new java.util.HashSet[String]() + partitionParam.split(",").map(partitionField => partitionField.trim) + .filter(s => s.nonEmpty).map(field => fieldsToRemove.add(field)) + HoodieAvroUtils.removeFields(schema, fieldsToRemove) + } + + def getProcessedRecord(partitionParam: String, record: GenericRecord, + dropPartitionColumns: Boolean): GenericRecord = { + var processedRecord = record + if (dropPartitionColumns) { + val writeSchema = generateSchemaWithoutPartitionColumns(partitionParam, record.getSchema) + processedRecord = HoodieAvroUtils.rewriteRecord(record, writeSchema) + } + processedRecord + } + + def addSchemaEvolutionParameters(parameters: Map[String, String], internalSchemaOpt: Option[InternalSchema]): Map[String, String] = { + val schemaEvolutionEnable = if (internalSchemaOpt.isDefined) "true" else "false" + parameters ++ Map(HoodieWriteConfig.INTERNAL_SCHEMA_STRING.key() -> SerDeHelper.toJson(internalSchemaOpt.getOrElse(null)), + HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key() -> schemaEvolutionEnable) + } + + /** + * get latest internalSchema from table + * + * @param config instance of {@link HoodieConfig} + * @param tableMetaClient instance of HoodieTableMetaClient + * @return Pair of(boolean, table schema), where first entry will be true only if schema conversion is required. 
+ */ + def getLatestTableInternalSchema(config: HoodieConfig, + tableMetaClient: HoodieTableMetaClient): Option[InternalSchema] = { + if (!config.getBooleanOrDefault(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED)) { + Option.empty[InternalSchema] + } else { + try { + val tableSchemaResolver = new TableSchemaResolver(tableMetaClient) + val internalSchemaOpt = tableSchemaResolver.getTableInternalSchemaFromCommitMetadata + if (internalSchemaOpt.isPresent) Some(internalSchemaOpt.get()) else None + } catch { + case _: Exception => None + } + } + } + + private def getLatestTableSchema(spark: SparkSession, + tableMetaClient: HoodieTableMetaClient): Option[Schema] = { + val tableSchemaResolver = new TableSchemaResolver(tableMetaClient) + toScalaOption(tableSchemaResolver.getTableAvroSchemaFromLatestCommit(false)) + } + + def registerKryoClassesAndGetGenericRecords(tblName: String, sparkContext: SparkContext, df: Dataset[Row], + reconcileSchema: Boolean): RDD[GenericRecord] = { + val structName = s"${tblName}_record" + val nameSpace = s"hoodie.${tblName}" + sparkContext.getConf.registerKryoClasses( + Array(classOf[org.apache.avro.generic.GenericData], + classOf[org.apache.avro.Schema])) + HoodieSparkUtils.createRdd(df, structName, nameSpace, reconcileSchema) + } + + def bootstrap(sqlContext: SQLContext, + mode: SaveMode, + optParams: Map[String, String], + df: DataFrame, + hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty, + hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty): Boolean = { + + assert(optParams.get("path").exists(!StringUtils.isNullOrEmpty(_)), "'path' must be set") + val path = optParams("path") + val basePath = new Path(path) + val sparkContext = sqlContext.sparkContext + val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) + tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) + val tableConfig = getHoodieTableConfig(sparkContext, path, hoodieTableConfigOpt) + validateTableConfig(sqlContext.sparkSession, optParams, tableConfig, mode == SaveMode.Overwrite) + + val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig, mode) + val tableName = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME, s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.") + val tableType = hoodieConfig.getStringOrDefault(TABLE_TYPE) + val bootstrapBasePath = hoodieConfig.getStringOrThrow(BASE_PATH, + s"'${BASE_PATH.key}' is required for '${BOOTSTRAP_OPERATION_OPT_VAL}'" + + " operation'") + val bootstrapIndexClass = hoodieConfig.getStringOrDefault(INDEX_CLASS_NAME) + + var schema: String = null + if (df.schema.nonEmpty) { + val (structName, namespace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tableName) + schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, namespace).toString + } else { + schema = HoodieAvroUtils.getNullSchema.toString + } + + if (mode == SaveMode.Ignore && tableExists) { + log.warn(s"hoodie table at $basePath already exists. 
Ignoring & not performing actual writes.") + if (!hoodieWriteClient.isEmpty) { + hoodieWriteClient.get.close() + } + false + } else { + // Handle various save modes + handleSaveModes(sqlContext.sparkSession, mode, basePath, tableConfig, tableName, WriteOperationType.BOOTSTRAP, fs) + + if (!tableExists) { + val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.ARCHIVELOG_FOLDER) + val partitionColumns = HoodieWriterUtils.getPartitionColumns(parameters) + val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD) + val keyGenProp = + if (StringUtils.nonEmpty(hoodieConfig.getString(KEYGEN_CLASS_NAME))) hoodieConfig.getString(KEYGEN_CLASS_NAME) + else hoodieConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME) + val timestampKeyGeneratorConfigs = extractConfigsRelatedToTimestampBasedKeyGenerator(keyGenProp, parameters) + val populateMetaFields = java.lang.Boolean.parseBoolean(parameters.getOrElse( + HoodieTableConfig.POPULATE_META_FIELDS.key(), + String.valueOf(HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()) + )) + val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.BASE_FILE_FORMAT) + val useBaseFormatMetaFile = java.lang.Boolean.parseBoolean(parameters.getOrElse( + HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(), + String.valueOf(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.defaultValue()) + )) + + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(HoodieTableType.valueOf(tableType)) + .setTableName(tableName) + .setRecordKeyFields(recordKeyFields) + .setArchiveLogFolder(archiveLogFolder) + .setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_NAME)) + .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD, null)) + .setBootstrapIndexClass(bootstrapIndexClass) + .setBaseFileFormat(baseFileFormat) + .setBootstrapBasePath(bootstrapBasePath) + .setPartitionFields(partitionColumns) + .setPopulateMetaFields(populateMetaFields) + .setKeyGeneratorClassProp(keyGenProp) + .set(timestampKeyGeneratorConfigs) + .setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) + .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) + .setCommitTimezone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getStringOrDefault(HoodieTableConfig.TIMELINE_TIMEZONE))) + .setPartitionMetafileUseBaseFormat(useBaseFormatMetaFile) + .initTable(sparkContext.hadoopConfiguration, path) + } + + val jsc = new JavaSparkContext(sqlContext.sparkContext) + val writeClient = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, + schema, path, tableName, mapAsJavaMap(parameters))) + try { + writeClient.bootstrap(org.apache.hudi.common.util.Option.empty()) + } finally { + writeClient.close() + } + val metaSyncSuccess = metaSync(sqlContext.sparkSession, hoodieConfig, basePath, df.schema) + metaSyncSuccess + } + } + + def validateSchemaForHoodieIsDeleted(schema: Schema): Unit = { + if (schema.getField(HoodieRecord.HOODIE_IS_DELETED) != null && + AvroConversionUtils.resolveAvroTypeNullability(schema.getField(HoodieRecord.HOODIE_IS_DELETED).schema())._2.getType != Schema.Type.BOOLEAN) { + throw new HoodieException(HoodieRecord.HOODIE_IS_DELETED + " has to be BOOLEAN type. 
Passed in dataframe's schema has type " + + schema.getField(HoodieRecord.HOODIE_IS_DELETED).schema().getType) + } + } + + def bulkInsertAsRow(sqlContext: SQLContext, + parameters: Map[String, String], + df: DataFrame, + tblName: String, + basePath: Path, + path: String, + instantTime: String, + partitionColumns: String, + isTablePartitioned: Boolean): (Boolean, common.util.Option[String]) = { + val sparkContext = sqlContext.sparkContext + val populateMetaFields = java.lang.Boolean.parseBoolean(parameters.getOrElse(HoodieTableConfig.POPULATE_META_FIELDS.key(), + String.valueOf(HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))) + val dropPartitionColumns = parameters.get(DataSourceWriteOptions.DROP_PARTITION_COLUMNS.key()).map(_.toBoolean) + .getOrElse(DataSourceWriteOptions.DROP_PARTITION_COLUMNS.defaultValue()) + // register classes & schemas + val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName) + sparkContext.getConf.registerKryoClasses( + Array(classOf[org.apache.avro.generic.GenericData], + classOf[org.apache.avro.Schema])) + var schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace) + if (dropPartitionColumns) { + schema = generateSchemaWithoutPartitionColumns(partitionColumns, schema) + } + validateSchemaForHoodieIsDeleted(schema) + sparkContext.getConf.registerAvroSchemas(schema) + log.info(s"Registered avro schema : ${schema.toString(true)}") + if (parameters(INSERT_DROP_DUPS.key).toBoolean) { + throw new HoodieException("Dropping duplicates with bulk_insert in row writer path is not supported yet") + } + val params: mutable.Map[String, String] = collection.mutable.Map(parameters.toSeq: _*) + params(HoodieWriteConfig.AVRO_SCHEMA_STRING.key) = schema.toString + val writeConfig = DataSourceUtils.createHoodieConfig(schema.toString, path, tblName, mapAsJavaMap(params)) + val bulkInsertPartitionerRows: BulkInsertPartitioner[Dataset[Row]] = if (populateMetaFields) { + val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig) + if (userDefinedBulkInsertPartitionerOpt.isPresent) { + userDefinedBulkInsertPartitionerOpt.get + } else { + BulkInsertInternalPartitionerWithRowsFactory.get( + writeConfig.getBulkInsertSortMode, isTablePartitioned) + } + } else { + // Sort modes are not yet supported when meta fields are disabled + new NonSortPartitionerWithRows() + } + val arePartitionRecordsSorted = bulkInsertPartitionerRows.arePartitionRecordsSorted() + params(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED) = arePartitionRecordsSorted.toString + val isGlobalIndex = if (populateMetaFields) { + SparkHoodieIndexFactory.isGlobalIndex(writeConfig) + } else { + false + } + + val hoodieDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(df, writeConfig, bulkInsertPartitionerRows, dropPartitionColumns) + + if (HoodieSparkUtils.isSpark2) { + hoodieDF.write.format("org.apache.hudi.internal") + .option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime) + .options(params) + .mode(SaveMode.Append) + .save() + } else if (HoodieSparkUtils.isSpark3) { + hoodieDF.write.format("org.apache.hudi.spark3.internal") + .option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime) + .option(HoodieInternalConfig.BULKINSERT_INPUT_DATA_SCHEMA_DDL.key, hoodieDF.schema.toDDL) + .options(params) + .mode(SaveMode.Append) + .save() + } else { + throw new HoodieException("Bulk insert using row writer is not supported with current Spark version." 
+ + " To use row writer please switch to spark 2 or spark 3") + } + val syncHiveSuccess = metaSync(sqlContext.sparkSession, writeConfig, basePath, df.schema) + (syncHiveSuccess, common.util.Option.ofNullable(instantTime)) + } + + def cleanup() : Unit = { + Metrics.shutdown() + } + + private def handleSaveModes(spark: SparkSession, mode: SaveMode, tablePath: Path, tableConfig: HoodieTableConfig, tableName: String, + operation: WriteOperationType, fs: FileSystem): Unit = { + if (mode == SaveMode.Append && tableExists) { + val existingTableName = tableConfig.getTableName + val resolver = spark.sessionState.conf.resolver + if (!resolver(existingTableName, tableName)) { + throw new HoodieException(s"hoodie table with name $existingTableName already exists at $tablePath," + + s" can not append data to the table with another name $tableName.") + } + } + + if (operation != WriteOperationType.DELETE) { + if (mode == SaveMode.ErrorIfExists && tableExists) { + throw new HoodieException(s"hoodie table at $tablePath already exists.") + } else if (mode == SaveMode.Overwrite && tableExists && operation != WriteOperationType.INSERT_OVERWRITE_TABLE) { + // When user set operation as INSERT_OVERWRITE_TABLE, + // overwrite will use INSERT_OVERWRITE_TABLE operator in doWriteOperation + log.warn(s"hoodie table at $tablePath already exists. Deleting existing data & overwriting with new data.") + fs.delete(tablePath, true) + tableExists = false + } + } else { + // Delete Operation only supports Append mode + if (mode != SaveMode.Append) { + throw new HoodieException(s"Append is the only save mode applicable for ${operation.toString} operation") + } + } + } + + private def metaSync(spark: SparkSession, hoodieConfig: HoodieConfig, basePath: Path, + schema: StructType): Boolean = { + val hiveSyncEnabled = hoodieConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_ENABLED).toBoolean + var metaSyncEnabled = hoodieConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_ENABLED).toBoolean + var syncClientToolClassSet = scala.collection.mutable.Set[String]() + hoodieConfig.getString(META_SYNC_CLIENT_TOOL_CLASS_NAME).split(",").foreach(syncClass => syncClientToolClassSet += syncClass) + + // for backward compatibility + if (hiveSyncEnabled) { + metaSyncEnabled = true + syncClientToolClassSet += classOf[HiveSyncTool].getName + } + + if (metaSyncEnabled) { + val fs = basePath.getFileSystem(spark.sessionState.newHadoopConf()) + val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT); + val properties = new TypedProperties() + properties.putAll(hoodieConfig.getProps) + properties.put(HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key, spark.sessionState.conf.getConf(StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD).toString) + properties.put(HoodieSyncConfig.META_SYNC_SPARK_VERSION.key, SPARK_VERSION) + properties.put(HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA.key, hoodieConfig.getBoolean(HoodieMetadataConfig.ENABLE)) + + syncClientToolClassSet.foreach(impl => { + SyncUtilHelpers.runHoodieMetaSync(impl.trim, properties, fs.getConf, fs, basePath.toString, baseFileFormat) + }) + } + + // Since Hive tables are now synced as Spark data source tables which are cached after Spark SQL queries + // we must invalidate this table in the cache so writes are reflected in later queries + if (metaSyncEnabled) { + getHiveTableNames(hoodieConfig).foreach(name => { + val qualifiedTableName = String.join(".", hoodieConfig.getStringOrDefault(HIVE_DATABASE), name) + if 
(spark.catalog.tableExists(qualifiedTableName)) { + spark.catalog.refreshTable(qualifiedTableName) + } + }) + } + true + } + + private def getHiveTableNames(hoodieConfig: HoodieConfig): List[String] = { + val tableName = hoodieConfig.getStringOrDefault(HIVE_TABLE) + val tableType = hoodieConfig.getStringOrDefault(TABLE_TYPE) + + if (tableType.equals(COW_TABLE_TYPE_OPT_VAL)) { + List(tableName) + } else { + val roSuffix = if (hoodieConfig.getBooleanOrDefault(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE)) { + "" + } else { + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE + } + List(tableName + roSuffix, + tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE) + } + } + + /** + * Group all table/action specific information into a case class. + */ + case class TableInstantInfo(basePath: Path, instantTime: String, commitActionType: String, operation: WriteOperationType) + + private def commitAndPerformPostOperations(spark: SparkSession, + schema: StructType, + writeResult: HoodieWriteResult, + parameters: Map[String, String], + client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]], + tableConfig: HoodieTableConfig, + jsc: JavaSparkContext, + tableInstantInfo: TableInstantInfo + ): (Boolean, common.util.Option[java.lang.String], common.util.Option[java.lang.String]) = { + if (writeResult.getWriteStatuses.rdd.filter(ws => ws.hasErrors).count() == 0) { + log.info("Proceeding to commit the write.") + val metaMap = parameters.filter(kv => + kv._1.startsWith(parameters(COMMIT_METADATA_KEYPREFIX.key))) + val commitSuccess = + client.commit(tableInstantInfo.instantTime, writeResult.getWriteStatuses, + common.util.Option.of(new java.util.HashMap[String, String](mapAsJavaMap(metaMap))), + tableInstantInfo.commitActionType, + writeResult.getPartitionToReplaceFileIds) + + if (commitSuccess) { + log.info("Commit " + tableInstantInfo.instantTime + " successful!") + } + else { + log.info("Commit " + tableInstantInfo.instantTime + " failed!") + } + + val asyncCompactionEnabled = isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration()) + val compactionInstant: common.util.Option[java.lang.String] = + if (asyncCompactionEnabled) { + client.scheduleCompaction(common.util.Option.of(new java.util.HashMap[String, String](mapAsJavaMap(metaMap)))) + } else { + common.util.Option.empty() + } + + log.info(s"Compaction Scheduled is $compactionInstant") + + val asyncClusteringEnabled = isAsyncClusteringEnabled(client, parameters) + val clusteringInstant: common.util.Option[java.lang.String] = + if (asyncClusteringEnabled) { + client.scheduleClustering(common.util.Option.of(new java.util.HashMap[String, String](mapAsJavaMap(metaMap)))) + } else { + common.util.Option.empty() + } + + log.info(s"Clustering Scheduled is $clusteringInstant") + + val metaSyncSuccess = metaSync(spark, HoodieWriterUtils.convertMapToHoodieConfig(parameters), + tableInstantInfo.basePath, schema) + + log.info(s"Is Async Compaction Enabled ? 
$asyncCompactionEnabled") + if (!asyncCompactionEnabled && !asyncClusteringEnabled) { + client.close() + } + (commitSuccess && metaSyncSuccess, compactionInstant, clusteringInstant) + } else { + log.error(s"${tableInstantInfo.operation} failed with errors") + if (log.isTraceEnabled) { + log.trace("Printing out the top 100 errors") + writeResult.getWriteStatuses.rdd.filter(ws => ws.hasErrors) + .take(100) + .foreach(ws => { + log.trace("Global error :", ws.getGlobalError) + if (ws.getErrors.size() > 0) { + ws.getErrors.foreach(kt => + log.trace(s"Error for key: ${kt._1}", kt._2)) + } + }) + } + (false, common.util.Option.empty(), common.util.Option.empty()) + } + } + + private def isAsyncCompactionEnabled(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]], + tableConfig: HoodieTableConfig, + parameters: Map[String, String], configuration: Configuration): Boolean = { + log.info(s"Config.inlineCompactionEnabled ? ${client.getConfig.inlineCompactionEnabled}") + if (asyncCompactionTriggerFnDefined && !client.getConfig.inlineCompactionEnabled + && parameters.get(ASYNC_COMPACT_ENABLE.key).exists(r => r.toBoolean)) { + tableConfig.getTableType == HoodieTableType.MERGE_ON_READ + } else { + false + } + } + + private def isAsyncClusteringEnabled(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]], + parameters: Map[String, String]): Boolean = { + log.info(s"Config.asyncClusteringEnabled ? ${client.getConfig.isAsyncClusteringEnabled}") + asyncClusteringTriggerFnDefined && client.getConfig.isAsyncClusteringEnabled + } + + private def getHoodieTableConfig(sparkContext: SparkContext, + tablePath: String, + hoodieTableConfigOpt: Option[HoodieTableConfig]): HoodieTableConfig = { + if (tableExists) { + hoodieTableConfigOpt.getOrElse( + HoodieTableMetaClient.builder().setConf(sparkContext.hadoopConfiguration).setBasePath(tablePath) + .build().getTableConfig) + } else { + null + } + } + + private def mergeParamsAndGetHoodieConfig(optParams: Map[String, String], + tableConfig: HoodieTableConfig, mode: SaveMode): (Map[String, String], HoodieConfig) = { + val translatedOptions = DataSourceWriteOptions.translateSqlOptions(optParams) + val mergedParams = mutable.Map.empty ++ HoodieWriterUtils.parametersWithWriteDefaults(translatedOptions) + if (!mergedParams.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) + && mergedParams.contains(KEYGENERATOR_CLASS_NAME.key)) { + mergedParams(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = mergedParams(KEYGENERATOR_CLASS_NAME.key) + } + if (null != tableConfig && mode != SaveMode.Overwrite) { + tableConfig.getProps.foreach { case (key, value) => + mergedParams(key) = value + } + } + + // use preCombineField to fill in PAYLOAD_ORDERING_FIELD_PROP_KEY + if (mergedParams.contains(PRECOMBINE_FIELD.key())) { + mergedParams.put(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, mergedParams(PRECOMBINE_FIELD.key())) + } + val params = mergedParams.toMap + (params, HoodieWriterUtils.convertMapToHoodieConfig(params)) + } + + private def extractConfigsRelatedToTimestampBasedKeyGenerator(keyGenerator: String, + params: Map[String, String]): Map[String, String] = { + if (classOf[TimestampBasedKeyGenerator].getCanonicalName.equals(keyGenerator) || + classOf[TimestampBasedAvroKeyGenerator].getCanonicalName.equals(keyGenerator)) { + params.filterKeys(HoodieTableConfig.PERSISTED_CONFIG_LIST.contains) + } else { + Map.empty + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala new file mode 100644 index 0000000000000..23b79a5ed4181 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala @@ -0,0 +1,339 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hudi + +import com.fasterxml.jackson.annotation.JsonInclude.Include +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.async.{AsyncClusteringService, AsyncCompactService, SparkStreamingAsyncClusteringService, SparkStreamingAsyncCompactService} +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.model.HoodieRecordPayload +import org.apache.hudi.common.table.marker.MarkerType +import org.apache.hudi.common.table.timeline.HoodieInstant.State +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.hudi.common.util.ValidationUtils.checkArgument +import org.apache.hudi.common.util.{ClusteringUtils, CommitUtils, CompactionUtils, StringUtils} +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.exception.{HoodieCorruptedDataException, HoodieException, TableNotFoundException} +import org.apache.log4j.LogManager +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution} +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} + +import java.lang +import java.util.function.Function +import scala.collection.JavaConversions._ +import scala.util.{Failure, Success, Try} + +class HoodieStreamingSink(sqlContext: SQLContext, + options: Map[String, String], + partitionColumns: Seq[String], + outputMode: OutputMode) + extends Sink + with Serializable { + @volatile private var latestCommittedBatchId = -1L + + private val log = LogManager.getLogger(classOf[HoodieStreamingSink]) + + private val tablePath = options.get("path") + if (tablePath.isEmpty || tablePath.get == null) { + throw new HoodieException(s"'path' must be specified.") + } + private var metaClient: Option[HoodieTableMetaClient] = { + try { + Some(HoodieTableMetaClient.builder() + .setConf(sqlContext.sparkContext.hadoopConfiguration) + .setBasePath(tablePath.get) + .build()) + } catch { + case _: TableNotFoundException => + log.warn("Ignore TableNotFoundException as it is first microbatch.") + Option.empty + } + } + private val retryCnt = 
options.getOrDefault(STREAMING_RETRY_CNT.key, + STREAMING_RETRY_CNT.defaultValue).toInt + private val retryIntervalMs = options.getOrDefault(STREAMING_RETRY_INTERVAL_MS.key, + STREAMING_RETRY_INTERVAL_MS.defaultValue).toLong + private val ignoreFailedBatch = options.getOrDefault(STREAMING_IGNORE_FAILED_BATCH.key, + STREAMING_IGNORE_FAILED_BATCH.defaultValue).toBoolean + // This constant serves as the checkpoint key for streaming sink so that each microbatch is processed exactly-once. + private val SINK_CHECKPOINT_KEY = "_hudi_streaming_sink_checkpoint" + + private var isAsyncCompactorServiceShutdownAbnormally = false + private var isAsyncClusteringServiceShutdownAbnormally = false + + private val mode = + if (outputMode == OutputMode.Append()) { + SaveMode.Append + } else { + SaveMode.Overwrite + } + + private var asyncCompactorService: AsyncCompactService = _ + private var asyncClusteringService: AsyncClusteringService = _ + private var writeClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty + private var hoodieTableConfig: Option[HoodieTableConfig] = Option.empty + + override def addBatch(batchId: Long, data: DataFrame): Unit = this.synchronized { + if (isAsyncCompactorServiceShutdownAbnormally) { + throw new IllegalStateException("Async Compactor shutdown unexpectedly") + } + if (isAsyncClusteringServiceShutdownAbnormally) { + log.error("Async clustering service shutdown unexpectedly") + throw new IllegalStateException("Async clustering service shutdown unexpectedly") + } + + val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY) + checkArgument(queryId != null, "queryId is null") + if (metaClient.isDefined && canSkipBatch(batchId, options.getOrDefault(OPERATION.key, UPSERT_OPERATION_OPT_VAL))) { + log.warn(s"Skipping already completed batch $batchId in query $queryId") + return + } + + // Override to use direct markers. In Structured streaming, timeline server is closed after + // first micro-batch and subsequent micro-batches do not have timeline server running. + // Thus, we can't use timeline-server-based markers. + var updatedOptions = options.updated(HoodieWriteConfig.MARKERS_TYPE.key(), MarkerType.DIRECT.name()) + // we need auto adjustment enabled for streaming sink since async table services are feasible within the same JVM. + updatedOptions = updatedOptions.updated(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key, "true") + // Add batchId as checkpoint to the extra metadata. To enable same checkpoint metadata structure for multi-writers, + // SINK_CHECKPOINT_KEY holds a map of batchId to writer context (composed of applicationId and queryId), e.g. + // "_hudi_streaming_sink_checkpoint" : "{\"$batchId\":\"${sqlContext.sparkContext.applicationId}-$queryId\"}" + // NOTE: In case of multi-writers, this map should be mutable and sorted by key to facilitate merging of batchIds. + // HUDI-4432 tracks the implementation of checkpoint management for multi-writer. 
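+ // For illustration: if batch 42 commits with checkpoint {"42": "<applicationId>-<queryId>"} and the query
+ // is later restarted, Spark will replay batch 42; canSkipBatch() (invoked at the top of addBatch and
+ // defined at the bottom of this class) reads that checkpoint back from the latest commit metadata,
+ // sets latestCommittedBatchId to 42 and skips the replayed batch, which is what gives the sink its
+ // effectively exactly-once behaviour.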
+ val checkpointMap = Map(batchId.toString -> s"${sqlContext.sparkContext.applicationId}-$queryId") + updatedOptions = updatedOptions.updated(SINK_CHECKPOINT_KEY, HoodieSinkCheckpoint.toJson(checkpointMap)) + + retry(retryCnt, retryIntervalMs)( + Try( + HoodieSparkSqlWriter.write( + sqlContext, mode, updatedOptions, data, hoodieTableConfig, writeClient, Some(triggerAsyncCompactor), Some(triggerAsyncClustering)) + ) + match { + case Success((true, commitOps, compactionInstantOps, clusteringInstant, client, tableConfig)) => + log.info(s"Micro batch id=$batchId succeeded" + + (commitOps.isPresent match { + case true => s" for commit=${commitOps.get()}" + case _ => s" with no new commits" + })) + log.info(s"Current value of latestCommittedBatchId: $latestCommittedBatchId. Setting latestCommittedBatchId to batchId $batchId.") + latestCommittedBatchId = batchId + writeClient = Some(client) + hoodieTableConfig = Some(tableConfig) + if (client != null) { + metaClient = Some(HoodieTableMetaClient.builder() + .setConf(sqlContext.sparkContext.hadoopConfiguration) + .setBasePath(client.getConfig.getBasePath) + .build()) + } + if (compactionInstantOps.isPresent) { + asyncCompactorService.enqueuePendingAsyncServiceInstant( + new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionInstantOps.get())) + } + if (clusteringInstant.isPresent) { + asyncClusteringService.enqueuePendingAsyncServiceInstant(new HoodieInstant( + State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringInstant.get() + )) + } + Success((true, commitOps, compactionInstantOps)) + case Failure(e) => + // clean up persist rdds in the write process + data.sparkSession.sparkContext.getPersistentRDDs + .foreach { + case (_, rdd) => + try { + rdd.unpersist() + } catch { + case t: Exception => log.warn("Got excepting trying to unpersist rdd", t) + } + } + log.error(s"Micro batch id=$batchId threw following exception: ", e) + if (ignoreFailedBatch) { + log.warn(s"Ignore the exception and move on streaming as per " + + s"${STREAMING_IGNORE_FAILED_BATCH.key} configuration") + Success((true, None, None)) + } else { + if (retryCnt > 1) log.info(s"Retrying the failed micro batch id=$batchId ...") + Failure(e) + } + case Success((false, commitOps, _, _, _, _)) => + log.error(s"Micro batch id=$batchId ended up with errors" + + (commitOps.isPresent match { + case true => s" for commit=${commitOps.get()}" + case _ => s"" + })) + if (ignoreFailedBatch) { + log.info(s"Ignore the errors and move on streaming as per " + + s"${STREAMING_IGNORE_FAILED_BATCH.key} configuration") + Success((true, None, None)) + } else { + if (retryCnt > 1) log.warn(s"Retrying the failed micro batch id=$batchId ...") + Failure(new HoodieCorruptedDataException(s"Micro batch id=$batchId ended up with errors")) + } + } + ) + match { + case Failure(e) => + if (!ignoreFailedBatch) { + log.error(s"Micro batch id=$batchId threw following expections," + + s"aborting streaming app to avoid data loss: ", e) + // spark sometimes hangs upon exceptions and keep on hold of the executors + // this is to force exit upon errors / exceptions and release all executors + // will require redeployment / supervise mode to restart the streaming + reset(true) + System.exit(1) + } + case Success(_) => + log.info(s"Micro batch id=$batchId succeeded") + } + } + + override def toString: String = s"HoodieStreamingSink[${options("path")}]" + + @annotation.tailrec + private def retry[T](n: Int, waitInMillis: Long)(fn: => Try[T]): Try[T] = { + fn match { + case x: Success[T] => + 
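+ // A successful attempt is returned as-is and short-circuits the retry loop; a failure backs off with an
+ // exponentially growing wait (waitInMillis doubles on every attempt) until the retry budget is exhausted,
+ // after which reset(false) releases the write client and async services before the failure is surfaced.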
x + case _ if n > 1 => + Thread.sleep(waitInMillis) + retry(n - 1, waitInMillis * 2)(fn) + case f => + reset(false) + f + } + } + + protected def triggerAsyncCompactor(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]]): Unit = { + if (null == asyncCompactorService) { + log.info("Triggering Async compaction !!") + asyncCompactorService = new SparkStreamingAsyncCompactService(new HoodieSparkEngineContext(new JavaSparkContext(sqlContext.sparkContext)), + client) + asyncCompactorService.start(new Function[java.lang.Boolean, java.lang.Boolean] { + override def apply(errored: lang.Boolean): lang.Boolean = { + log.info(s"Async Compactor shutdown. Errored ? $errored") + isAsyncCompactorServiceShutdownAbnormally = errored + reset(false) + log.info("Done resetting write client.") + true + } + }) + + // Add Shutdown Hook + Runtime.getRuntime.addShutdownHook(new Thread(new Runnable { + override def run(): Unit = reset(true) + })) + + // First time, scan .hoodie folder and get all pending compactions + val metaClient = HoodieTableMetaClient.builder().setConf(sqlContext.sparkContext.hadoopConfiguration) + .setBasePath(client.getConfig.getBasePath).build() + val pendingInstants: java.util.List[HoodieInstant] = + CompactionUtils.getPendingCompactionInstantTimes(metaClient) + pendingInstants.foreach((h: HoodieInstant) => asyncCompactorService.enqueuePendingAsyncServiceInstant(h)) + } + } + + protected def triggerAsyncClustering(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]]): Unit = { + if (null == asyncClusteringService) { + log.info("Triggering async clustering!") + asyncClusteringService = new SparkStreamingAsyncClusteringService(new HoodieSparkEngineContext(new JavaSparkContext(sqlContext.sparkContext)), + client) + asyncClusteringService.start(new Function[java.lang.Boolean, java.lang.Boolean] { + override def apply(errored: lang.Boolean): lang.Boolean = { + log.info(s"Async clustering service shutdown. Errored ? 
$errored") + isAsyncClusteringServiceShutdownAbnormally = errored + reset(false) + true + } + }) + + // Add Shutdown Hook + Runtime.getRuntime.addShutdownHook(new Thread(new Runnable { + override def run(): Unit = reset(true) + })) + + // First time, scan .hoodie folder and get all pending clustering instants + val metaClient = HoodieTableMetaClient.builder().setConf(sqlContext.sparkContext.hadoopConfiguration) + .setBasePath(client.getConfig.getBasePath).build() + val pendingInstants: java.util.List[HoodieInstant] = ClusteringUtils.getPendingClusteringInstantTimes(metaClient) + pendingInstants.foreach((h: HoodieInstant) => asyncClusteringService.enqueuePendingAsyncServiceInstant(h)) + } + } + + private def reset(force: Boolean): Unit = this.synchronized { + if (asyncCompactorService != null) { + asyncCompactorService.shutdown(force) + asyncCompactorService = null + } + + if (asyncClusteringService != null) { + asyncClusteringService.shutdown(force) + asyncClusteringService = null + } + + if (writeClient.isDefined) { + writeClient.get.close() + writeClient = Option.empty + } + } + + private def canSkipBatch(incomingBatchId: Long, operationType: String): Boolean = { + if (!DELETE_OPERATION_OPT_VAL.equals(operationType)) { + // get the latest checkpoint from the commit metadata to check if the microbatch has already been prcessed or not + val commitMetadata = CommitUtils.getLatestCommitMetadataWithValidCheckpointInfo( + metaClient.get.getActiveTimeline.getCommitsTimeline, SINK_CHECKPOINT_KEY) + if (commitMetadata.isPresent) { + val lastCheckpoint = commitMetadata.get.getMetadata(SINK_CHECKPOINT_KEY) + if (!StringUtils.isNullOrEmpty(lastCheckpoint)) { + latestCommittedBatchId = HoodieSinkCheckpoint.fromJson(lastCheckpoint).keys.head.toLong + } + } + latestCommittedBatchId >= incomingBatchId + } else { + // In case of DELETE_OPERATION_OPT_VAL the incoming batch id is sentinel value (-1) + false + } + } +} + +/** + * SINK_CHECKPOINT_KEY holds a map of batchId to writer context (composed of applicationId and queryId). + * This is a util object to serialize/deserialize map to/from json. + */ +object HoodieSinkCheckpoint { + + lazy val mapper: ObjectMapper = { + val _mapper = new ObjectMapper + _mapper.setSerializationInclusion(Include.NON_ABSENT) + _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + _mapper.registerModule(DefaultScalaModule) + _mapper + } + + def toJson(checkpoint: Map[String, String]): String = { + mapper.writeValueAsString(checkpoint) + } + + def fromJson(json: String): Map[String, String] = { + mapper.readValue(json, classOf[Map[String, String]]) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala new file mode 100644 index 0000000000000..335fe68bd2099 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.hudi.DataSourceOptionsHelper.allAlternatives +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE +import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieCommonConfig, HoodieConfig} +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.hive.HiveSyncConfigHolder +import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator} +import org.apache.hudi.sync.common.HoodieSyncConfig +import org.apache.hudi.util.SparkKeyGenUtils +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.hudi.command.SqlKeyGenerator + +import java.util.Properties +import scala.collection.JavaConversions.mapAsJavaMap +import scala.collection.JavaConverters._ + +/** + * WriterUtils to assist in write path in Datasource and tests. + */ +object HoodieWriterUtils { + + def javaParametersWithWriteDefaults(parameters: java.util.Map[String, String]): java.util.Map[String, String] = { + mapAsJavaMap(parametersWithWriteDefaults(parameters.asScala.toMap)) + } + + /** + * Add default options for unspecified write options keys. + * + * @param parameters + * @return + */ + def parametersWithWriteDefaults(parameters: Map[String, String]): Map[String, String] = { + val globalProps = DFSPropertiesConfiguration.getGlobalProps.asScala + val props = new Properties() + props.putAll(parameters) + val hoodieConfig: HoodieConfig = new HoodieConfig(props) + hoodieConfig.setDefaultValue(OPERATION) + hoodieConfig.setDefaultValue(TABLE_TYPE) + hoodieConfig.setDefaultValue(PRECOMBINE_FIELD) + hoodieConfig.setDefaultValue(PAYLOAD_CLASS_NAME) + hoodieConfig.setDefaultValue(RECORDKEY_FIELD) + hoodieConfig.setDefaultValue(KEYGENERATOR_CLASS_NAME) + hoodieConfig.setDefaultValue(ENABLE) + hoodieConfig.setDefaultValue(COMMIT_METADATA_KEYPREFIX) + hoodieConfig.setDefaultValue(INSERT_DROP_DUPS) + hoodieConfig.setDefaultValue(STREAMING_RETRY_CNT) + hoodieConfig.setDefaultValue(STREAMING_RETRY_INTERVAL_MS) + hoodieConfig.setDefaultValue(STREAMING_IGNORE_FAILED_BATCH) + hoodieConfig.setDefaultValue(META_SYNC_CLIENT_TOOL_CLASS_NAME) + hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SYNC_ENABLED) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_ENABLED) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT) + hoodieConfig.setDefaultValue(HiveSyncConfigHolder.METASTORE_URIS) + hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_USER) + hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_PASS) + hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_URL) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS) + hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS) + hoodieConfig.setDefaultValue(HIVE_STYLE_PARTITIONING) + hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_USE_JDBC) 
+ hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE) + hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE) + hoodieConfig.setDefaultValue(ASYNC_COMPACT_ENABLE) + hoodieConfig.setDefaultValue(INLINE_CLUSTERING_ENABLE) + hoodieConfig.setDefaultValue(ASYNC_CLUSTERING_ENABLE) + hoodieConfig.setDefaultValue(ENABLE_ROW_WRITER) + hoodieConfig.setDefaultValue(RECONCILE_SCHEMA) + hoodieConfig.setDefaultValue(DROP_PARTITION_COLUMNS) + hoodieConfig.setDefaultValue(KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED) + Map() ++ hoodieConfig.getProps.asScala ++ globalProps ++ DataSourceOptionsHelper.translateConfigurations(parameters) + } + + /** + * Get the partition columns to stored to hoodie.properties. + * @param parameters + * @return + */ + def getPartitionColumns(parameters: Map[String, String]): String = { + val props = new Properties() + props.putAll(parameters.asJava) + SparkKeyGenUtils.getPartitionColumns(props) + } + + def convertMapToHoodieConfig(parameters: Map[String, String]): HoodieConfig = { + val properties = new Properties() + properties.putAll(mapAsJavaMap(parameters)) + new HoodieConfig(properties) + } + + def getOriginKeyGenerator(parameters: Map[String, String]): String = { + val kg = parameters.getOrElse(KEYGENERATOR_CLASS_NAME.key(), null) + if (classOf[SqlKeyGenerator].getCanonicalName == kg) { + parameters.getOrElse(SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME, null) + } else { + kg + } + } + + def validateTableConfig(spark: SparkSession, params: Map[String, String], + tableConfig: HoodieConfig): Unit = { + validateTableConfig(spark, params, tableConfig, false) + } + + /** + * Detects conflicts between new parameters and existing table configurations + */ + def validateTableConfig(spark: SparkSession, params: Map[String, String], + tableConfig: HoodieConfig, isOverWriteMode: Boolean): Unit = { + // If Overwrite is set as save mode, we don't need to do table config validation. 
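+ // For example (illustrative values): if hoodie.properties already records "uuid" as the record key and a
+ // later write passes "id" instead, the mismatch is collected into diffConfigs below and surfaced as a
+ // HoodieException listing the config key, the incoming value and the existing table value.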
+ if (!isOverWriteMode) { + val resolver = spark.sessionState.conf.resolver + val diffConfigs = StringBuilder.newBuilder + params.foreach { case (key, value) => + val existingValue = getStringFromTableConfigWithAlternatives(tableConfig, key) + if (null != existingValue && !resolver(existingValue, value)) { + diffConfigs.append(s"$key:\t$value\t${tableConfig.getString(key)}\n") + } + } + + if (null != tableConfig) { + val datasourceRecordKey = params.getOrElse(RECORDKEY_FIELD.key(), null) + val tableConfigRecordKey = tableConfig.getString(HoodieTableConfig.RECORDKEY_FIELDS) + if (null != datasourceRecordKey && null != tableConfigRecordKey + && datasourceRecordKey != tableConfigRecordKey) { + diffConfigs.append(s"RecordKey:\t$datasourceRecordKey\t$tableConfigRecordKey\n") + } + + val datasourcePreCombineKey = params.getOrElse(PRECOMBINE_FIELD.key(), null) + val tableConfigPreCombineKey = tableConfig.getString(HoodieTableConfig.PRECOMBINE_FIELD) + if (null != datasourcePreCombineKey && null != tableConfigPreCombineKey + && datasourcePreCombineKey != tableConfigPreCombineKey) { + diffConfigs.append(s"PreCombineKey:\t$datasourcePreCombineKey\t$tableConfigPreCombineKey\n") + } + + val datasourceKeyGen = getOriginKeyGenerator(params) + val tableConfigKeyGen = tableConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME) + if (null != datasourceKeyGen && null != tableConfigKeyGen + && datasourceKeyGen != tableConfigKeyGen) { + diffConfigs.append(s"KeyGenerator:\t$datasourceKeyGen\t$tableConfigKeyGen\n") + } + } + + if (diffConfigs.nonEmpty) { + diffConfigs.insert(0, "\nConfig conflict(key\tcurrent value\texisting value):\n") + throw new HoodieException(diffConfigs.toString.trim) + } + } + + // Check schema evolution for bootstrap table. + // now we do not support bootstrap table. 
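+ // i.e. a write whose operation is BOOTSTRAP_OPERATION_OPT_VAL combined with
+ // HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE set to true is rejected up front with a HoodieException.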
+ if (params.get(OPERATION.key).contains(BOOTSTRAP_OPERATION_OPT_VAL) + && params.getOrElse(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), "false").toBoolean) { + throw new HoodieException(String + .format("now schema evolution cannot support bootstrap table, pls set %s to false", HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key())) + } + } + + /** + * Detects conflicts between datasourceKeyGen and existing table configuration keyGen + */ + def validateKeyGeneratorConfig(datasourceKeyGen: String, tableConfig: HoodieConfig): Unit = { + val diffConfigs = StringBuilder.newBuilder + + if (null != tableConfig) { + val tableConfigKeyGen = tableConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME) + if (null != tableConfigKeyGen && null != datasourceKeyGen) { + val nonPartitionedTableConfig = tableConfigKeyGen.equals(classOf[NonpartitionedKeyGenerator].getCanonicalName) + val simpleKeyDataSourceConfig = datasourceKeyGen.equals(classOf[SimpleKeyGenerator].getCanonicalName) + if (nonPartitionedTableConfig && simpleKeyDataSourceConfig) { + diffConfigs.append(s"KeyGenerator:\t$datasourceKeyGen\t$tableConfigKeyGen\n") + } + } + } + + if (diffConfigs.nonEmpty) { + diffConfigs.insert(0, "\nConfig conflict(key\tcurrent value\texisting value):\n") + throw new HoodieException(diffConfigs.toString.trim) + } + } + + private def getStringFromTableConfigWithAlternatives(tableConfig: HoodieConfig, key: String): String = { + if (null == tableConfig) { + null + } else { + if (allAlternatives.contains(key)) { + tableConfig.getString(allAlternatives(key)) + } else { + tableConfig.getString(key) + } + } + } + + val sparkDatasourceConfigsToTableConfigsMap = Map( + TABLE_NAME -> HoodieTableConfig.NAME, + TABLE_TYPE -> HoodieTableConfig.TYPE, + PRECOMBINE_FIELD -> HoodieTableConfig.PRECOMBINE_FIELD, + PARTITIONPATH_FIELD -> HoodieTableConfig.PARTITION_FIELDS, + RECORDKEY_FIELD -> HoodieTableConfig.RECORDKEY_FIELDS, + PAYLOAD_CLASS_NAME -> HoodieTableConfig.PAYLOAD_CLASS_NAME + ) + def mappingSparkDatasourceConfigsToTableConfigs(options: Map[String, String]): Map[String, String] = { + val includingTableConfigs = scala.collection.mutable.Map() ++ options + sparkDatasourceConfigsToTableConfigsMap.foreach(kv => { + if (options.containsKey(kv._1.key)) { + includingTableConfigs(kv._2.key) = options(kv._1.key) + includingTableConfigs.remove(kv._1.key) + } + }) + includingTableConfigs.toMap + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala new file mode 100644 index 0000000000000..44b54eb7fa91a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieFileFormat, HoodieRecord, HoodieReplaceCommitMetadata} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} + +import java.util.stream.Collectors +import org.apache.hadoop.fs.{GlobPattern, Path} +import org.apache.hudi.HoodieBaseRelation.isSchemaEvolutionEnabledOnRead +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.client.utils.SparkInternalSchemaConverter +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.common.util.{HoodieTimer, InternalSchemaCache} +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.hudi.internal.schema.utils.SerDeHelper +import org.apache.hudi.table.HoodieSparkTable +import org.apache.log4j.LogManager +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat +import org.apache.spark.sql.sources.{BaseRelation, TableScan} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + +import scala.collection.JavaConversions._ +import scala.collection.mutable + +/** + * Relation, that implements the Hoodie incremental view. + * + * Implemented for Copy_on_write storage. + * TODO: rebase w/ HoodieBaseRelation HUDI-5362 + * + */ +class IncrementalRelation(val sqlContext: SQLContext, + val optParams: Map[String, String], + val userSchema: Option[StructType], + val metaClient: HoodieTableMetaClient) extends BaseRelation with TableScan { + + private val log = LogManager.getLogger(classOf[IncrementalRelation]) + + val skeletonSchema: StructType = HoodieSparkUtils.getMetaSchema + private val basePath = metaClient.getBasePathV2 + // TODO : Figure out a valid HoodieWriteConfig + private val hoodieTable = HoodieSparkTable.create(HoodieWriteConfig.newBuilder().withPath(basePath.toString).build(), + new HoodieSparkEngineContext(new JavaSparkContext(sqlContext.sparkContext)), + metaClient) + private val commitTimeline = hoodieTable.getMetaClient.getCommitTimeline.filterCompletedInstants() + if (commitTimeline.empty()) { + throw new HoodieException("No instants to incrementally pull") + } + if (!optParams.contains(DataSourceReadOptions.BEGIN_INSTANTTIME.key)) { + throw new HoodieException(s"Specify the begin instant time to pull from using " + + s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME.key}") + } + + if (!metaClient.getTableConfig.populateMetaFields()) { + throw new HoodieException("Incremental queries are not supported when meta fields are disabled") + } + + val useEndInstantSchema = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME.key, + DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME.defaultValue).toBoolean + + private val lastInstant = commitTimeline.lastInstant().get() + + private val commitsTimelineToReturn = commitTimeline.findInstantsInRange( + optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key), + optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME.key(), lastInstant.getTimestamp)) + private val commitsToReturn = 
commitsTimelineToReturn.getInstants.iterator().toList + + // use schema from a file produced in the end/latest instant + + val (usedSchema, internalSchema) = { + log.info("Inferring schema..") + val schemaResolver = new TableSchemaResolver(metaClient) + val iSchema : InternalSchema = if (!isSchemaEvolutionEnabledOnRead(optParams, sqlContext.sparkSession)) { + InternalSchema.getEmptyInternalSchema + } else if (useEndInstantSchema && !commitsToReturn.isEmpty) { + InternalSchemaCache.searchSchemaAndCache(commitsToReturn.last.getTimestamp.toLong, metaClient, hoodieTable.getConfig.getInternalSchemaCacheEnable) + } else { + schemaResolver.getTableInternalSchemaFromCommitMetadata.orElse(null) + } + + val tableSchema = if (useEndInstantSchema && iSchema.isEmptySchema) { + if (commitsToReturn.isEmpty) schemaResolver.getTableAvroSchemaWithoutMetadataFields() else + schemaResolver.getTableAvroSchema(commitsToReturn.last, false) + } else { + schemaResolver.getTableAvroSchemaWithoutMetadataFields() + } + if (tableSchema.getType == Schema.Type.NULL) { + // if there is only one commit in the table and is an empty commit without schema, return empty RDD here + (StructType(Nil), InternalSchema.getEmptyInternalSchema) + } else { + val dataSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) + if (iSchema != null && !iSchema.isEmptySchema) { + // if internalSchema is ready, dataSchema will contains skeletonSchema + (dataSchema, iSchema) + } else { + (StructType(skeletonSchema.fields ++ dataSchema.fields), InternalSchema.getEmptyInternalSchema) + } + } + } + + private val filters = optParams.getOrElse(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS.key, + DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS.defaultValue).split(",").filter(!_.isEmpty) + + override def schema: StructType = usedSchema + + override def buildScan(): RDD[Row] = { + if (usedSchema == StructType(Nil)) { + // if first commit in a table is an empty commit without schema, return empty RDD here + sqlContext.sparkContext.emptyRDD[Row] + } else { + val regularFileIdToFullPath = mutable.HashMap[String, String]() + var metaBootstrapFileIdToFullPath = mutable.HashMap[String, String]() + + // create Replaced file group + val replacedTimeline = commitsTimelineToReturn.getCompletedReplaceTimeline + val replacedFile = replacedTimeline.getInstants.collect(Collectors.toList[HoodieInstant]).flatMap { instant => + val replaceMetadata = HoodieReplaceCommitMetadata. 
+ fromBytes(metaClient.getActiveTimeline.getInstantDetails(instant).get, classOf[HoodieReplaceCommitMetadata]) + replaceMetadata.getPartitionToReplaceFileIds.entrySet().flatMap { entry => + entry.getValue.map { e => + val fullPath = FSUtils.getPartitionPath(basePath, entry.getKey).toString + (e, fullPath) + } + } + }.toMap + + for (commit <- commitsToReturn) { + val metadata: HoodieCommitMetadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit) + .get, classOf[HoodieCommitMetadata]) + + if (HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS == commit.getTimestamp) { + metaBootstrapFileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap.filterNot { case (k, v) => + replacedFile.contains(k) && v.startsWith(replacedFile(k)) + } + } else { + regularFileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap.filterNot { case (k, v) => + replacedFile.contains(k) && v.startsWith(replacedFile(k)) + } + } + } + + if (metaBootstrapFileIdToFullPath.nonEmpty) { + // filer out meta bootstrap files that have had more commits since metadata bootstrap + metaBootstrapFileIdToFullPath = metaBootstrapFileIdToFullPath + .filterNot(fileIdFullPath => regularFileIdToFullPath.contains(fileIdFullPath._1)) + } + + val pathGlobPattern = optParams.getOrElse( + DataSourceReadOptions.INCR_PATH_GLOB.key, + DataSourceReadOptions.INCR_PATH_GLOB.defaultValue) + val (filteredRegularFullPaths, filteredMetaBootstrapFullPaths) = { + if (!pathGlobPattern.equals(DataSourceReadOptions.INCR_PATH_GLOB.defaultValue)) { + val globMatcher = new GlobPattern("*" + pathGlobPattern) + (regularFileIdToFullPath.filter(p => globMatcher.matches(p._2)).values, + metaBootstrapFileIdToFullPath.filter(p => globMatcher.matches(p._2)).values) + } else { + (regularFileIdToFullPath.values, metaBootstrapFileIdToFullPath.values) + } + } + // pass internalSchema to hadoopConf, so it can be used in executors. + val validCommits = metaClient + .getCommitsAndCompactionTimeline.filterCompletedInstants.getInstants.toArray().map(_.asInstanceOf[HoodieInstant].getFileName).mkString(",") + sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(internalSchema)) + sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH, metaClient.getBasePath) + sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST, validCommits) + val formatClassName = metaClient.getTableConfig.getBaseFileFormat match { + case HoodieFileFormat.PARQUET => HoodieParquetFileFormat.FILE_FORMAT_ID + case HoodieFileFormat.ORC => "orc" + } + + // Fallback to full table scan if any of the following conditions matches: + // 1. the start commit is archived + // 2. the end commit is archived + // 3. 
there are files in metadata be deleted + val fallbackToFullTableScan = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.key, + DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.defaultValue).toBoolean + + val sOpts = optParams.filter(p => !p._1.equalsIgnoreCase("path")) + + val startInstantTime = optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key) + val startInstantArchived = commitTimeline.isBeforeTimelineStarts(startInstantTime) + val endInstantTime = optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME.key(), lastInstant.getTimestamp) + val endInstantArchived = commitTimeline.isBeforeTimelineStarts(endInstantTime) + + val scanDf = if (fallbackToFullTableScan && (startInstantArchived || endInstantArchived)) { + log.info(s"Falling back to full table scan as startInstantArchived: $startInstantArchived, endInstantArchived: $endInstantArchived") + fullTableScanDataFrame(startInstantTime, endInstantTime) + } else { + if (filteredRegularFullPaths.isEmpty && filteredMetaBootstrapFullPaths.isEmpty) { + sqlContext.createDataFrame(sqlContext.sparkContext.emptyRDD[Row], usedSchema) + } else { + log.info("Additional Filters to be applied to incremental source are :" + filters.mkString("Array(", ", ", ")")) + + var df: DataFrame = sqlContext.createDataFrame(sqlContext.sparkContext.emptyRDD[Row], usedSchema) + + var doFullTableScan = false + + if (fallbackToFullTableScan) { + val fs = basePath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration); + val timer = new HoodieTimer().startTimer(); + + val allFilesToCheck = filteredMetaBootstrapFullPaths ++ filteredRegularFullPaths + val firstNotFoundPath = allFilesToCheck.find(path => !fs.exists(new Path(path))) + val timeTaken = timer.endTimer() + log.info("Checking if paths exists took " + timeTaken + "ms") + + if (firstNotFoundPath.isDefined) { + doFullTableScan = true + log.info("Falling back to full table scan as some files cannot be found.") + } + } + + if (doFullTableScan) { + fullTableScanDataFrame(startInstantTime, endInstantTime) + } else { + if (metaBootstrapFileIdToFullPath.nonEmpty) { + df = sqlContext.sparkSession.read + .format("hudi_v1") + .schema(usedSchema) + .option(DataSourceReadOptions.READ_PATHS.key, filteredMetaBootstrapFullPaths.mkString(",")) + // Setting time to the END_INSTANT_TIME, to avoid pathFilter filter out files incorrectly. + .option(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key(), endInstantTime) + .load() + } + + if (regularFileIdToFullPath.nonEmpty) { + df = df.union(sqlContext.read.options(sOpts) + .schema(usedSchema).format(formatClassName) + // Setting time to the END_INSTANT_TIME, to avoid pathFilter filter out files incorrectly. 
+ .option(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key(), endInstantTime) + .load(filteredRegularFullPaths.toList: _*) + .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + commitsToReturn.head.getTimestamp)) + .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + commitsToReturn.last.getTimestamp))) + } + df + } + } + } + + filters.foldLeft(scanDf)((e, f) => e.filter(f)).rdd + } + } + + private def fullTableScanDataFrame(startInstantTime: String, endInstantTime: String): DataFrame = { + val hudiDF = sqlContext.read + .format("hudi_v1") + .schema(usedSchema) + .load(basePath.toString) + .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, //Notice the > in place of >= because we are working with optParam instead of first commit > optParam + startInstantTime)) + .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + endInstantTime)) + + // schema enforcement does not happen in above spark.read with hudi. hence selecting explicitly w/ right column order + val fieldNames = usedSchema.fieldNames + hudiDF.select(fieldNames.head, fieldNames.tail: _*) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala new file mode 100644 index 0000000000000..446c806b1804d --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hadoop.fs.{FileStatus, GlobPattern, Path} +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.common.model.{FileSlice, HoodieRecord} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.{getCommitMetadata, getWritePartitionPaths, listAffectedFilesForCommits} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types.StructType + +import scala.collection.JavaConverters._ +import scala.collection.immutable + +/** + * @Experimental + */ +class MergeOnReadIncrementalRelation(sqlContext: SQLContext, + optParams: Map[String, String], + userSchema: Option[StructType], + metaClient: HoodieTableMetaClient) + extends MergeOnReadSnapshotRelation(sqlContext, optParams, userSchema, Seq(), metaClient) with HoodieIncrementalRelationTrait { + + override type FileSplit = HoodieMergeOnReadFileSplit + + override def imbueConfigs(sqlContext: SQLContext): Unit = { + super.imbueConfigs(sqlContext) + sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false") + } + + override protected def timeline: HoodieTimeline = { + if (fullTableScan) { + super.timeline + } else { + super.timeline.findInstantsInRange(startTimestamp, endTimestamp) + } + } + + protected override def composeRDD(fileSplits: Seq[HoodieMergeOnReadFileSplit], + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + requestedColumns: Array[String], + filters: Array[Filter]): RDD[InternalRow] = { + // The only required filters are ones that make sure we're only fetching records that + // fall into incremental span of the timeline being queried + val requiredFilters = incrementalSpanRecordFilters + val optionalFilters = filters + val readers = createBaseFileReaders(tableSchema, requiredSchema, requestedColumns, requiredFilters, optionalFilters) + + val hoodieTableState = getTableState + // TODO(HUDI-3639) implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately + // filtered, since file-reader might not be capable to perform filtering + new HoodieMergeOnReadRDD( + sqlContext.sparkContext, + config = jobConf, + fileReaders = readers, + tableSchema = tableSchema, + requiredSchema = requiredSchema, + tableState = hoodieTableState, + mergeType = mergeType, + fileSplits = fileSplits) + } + + override protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = { + if (includedCommits.isEmpty) { + List() + } else { + val fileSlices = if (fullTableScan) { + listLatestFileSlices(Seq(), partitionFilters, dataFilters) + } else { + val latestCommit = includedCommits.last.getTimestamp + + val fsView = new HoodieTableFileSystemView(metaClient, timeline, affectedFilesInCommits) + + val modifiedPartitions = getWritePartitionPaths(commitsMetadata) + + modifiedPartitions.asScala.flatMap { relativePartitionPath => + fsView.getLatestMergedFileSlicesBeforeOrOn(relativePartitionPath, 
latestCommit).iterator().asScala + }.toSeq + } + + buildSplits(filterFileSlices(fileSlices, globPattern)) + } + } + + private def filterFileSlices(fileSlices: Seq[FileSlice], pathGlobPattern: String): Seq[FileSlice] = { + val filteredFileSlices = if (!StringUtils.isNullOrEmpty(pathGlobPattern)) { + val globMatcher = new GlobPattern("*" + pathGlobPattern) + fileSlices.filter(fileSlice => { + val path = toScalaOption(fileSlice.getBaseFile).map(_.getPath) + .orElse(toScalaOption(fileSlice.getLatestLogFile).map(_.getPath.toString)) + .get + globMatcher.matches(path) + }) + } else { + fileSlices + } + filteredFileSlices + } +} + +trait HoodieIncrementalRelationTrait extends HoodieBaseRelation { + + // Validate this Incremental implementation is properly configured + validate() + + protected def startTimestamp: String = optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key) + protected def endTimestamp: String = optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME.key, super.timeline.lastInstant().get.getTimestamp) + + protected def startInstantArchived: Boolean = super.timeline.isBeforeTimelineStarts(startTimestamp) + protected def endInstantArchived: Boolean = super.timeline.isBeforeTimelineStarts(endTimestamp) + + // Fallback to full table scan if any of the following conditions matches: + // 1. the start commit is archived + // 2. the end commit is archived + // 3. there are files in metadata be deleted + protected lazy val fullTableScan: Boolean = { + val fallbackToFullTableScan = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.key, + DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.defaultValue).toBoolean + + fallbackToFullTableScan && (startInstantArchived || endInstantArchived || affectedFilesInCommits.exists(fileStatus => !metaClient.getFs.exists(fileStatus.getPath))) + } + + protected lazy val includedCommits: immutable.Seq[HoodieInstant] = { + if (!startInstantArchived || !endInstantArchived) { + // If endTimestamp commit is not archived, will filter instants + // before endTimestamp. 
+ super.timeline.findInstantsInRange(startTimestamp, endTimestamp).getInstants.iterator().asScala.toList + } else { + super.timeline.getInstants.iterator().asScala.toList + } + } + + protected lazy val commitsMetadata = includedCommits.map(getCommitMetadata(_, super.timeline)).asJava + + protected lazy val affectedFilesInCommits: Array[FileStatus] = { + listAffectedFilesForCommits(conf, new Path(metaClient.getBasePath), commitsMetadata) + } + + // Record filters making sure that only records w/in the requested bounds are being fetched as part of the + // scan collected by this relation + protected lazy val incrementalSpanRecordFilters: Seq[Filter] = { + val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD) + + val largerThanFilter = GreaterThan(HoodieRecord.COMMIT_TIME_METADATA_FIELD, startTimestamp) + + val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, + if (endInstantArchived) endTimestamp else includedCommits.last.getTimestamp) + + Seq(isNotNullFilter, largerThanFilter, lessThanFilter) + } + + override lazy val mandatoryFields: Seq[String] = { + // NOTE: This columns are required for Incremental flow to be able to handle the rows properly, even in + // cases when no columns are requested to be fetched (for ex, when using {@code count()} API) + Seq(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD) ++ + preCombineFieldOpt.map(Seq(_)).getOrElse(Seq()) + } + + protected def validate(): Unit = { + if (super.timeline.empty()) { + throw new HoodieException("No instants to incrementally pull") + } + + if (!this.optParams.contains(DataSourceReadOptions.BEGIN_INSTANTTIME.key)) { + throw new HoodieException(s"Specify the begin instant time to pull from using " + + s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME.key}") + } + + if (!this.tableConfig.populateMetaFields()) { + throw new HoodieException("Incremental queries are not supported when meta fields are disabled") + } + } + + protected def globPattern: String = + optParams.getOrElse(DataSourceReadOptions.INCR_PATH_GLOB.key, DataSourceReadOptions.INCR_PATH_GLOB.defaultValue) + +} + diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala new file mode 100644 index 0000000000000..1d0d533e5bb81 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
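// A minimal usage sketch of the incremental query surface defined above. The table path and
// instant times are placeholders; the option keys are the DataSourceReadOptions constants
// referenced by HoodieIncrementalRelationTrait, and the commented bounds mirror
// incrementalSpanRecordFilters (exclusive begin, inclusive end).
import org.apache.hudi.DataSourceReadOptions
import org.apache.spark.sql.{DataFrame, SparkSession}

object IncrementalReadSketch {
  def readSpan(spark: SparkSession, basePath: String, begin: String, end: String): DataFrame =
    spark.read.format("hudi")
      .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, begin) // records with _hoodie_commit_time > begin
      .option(DataSourceReadOptions.END_INSTANTTIME.key, end)     // records with _hoodie_commit_time <= end
      // Optionally allow falling back to a full table scan when the requested span touches archived
      // commits or files that have since been removed, as handled by `fullTableScan` above.
      .option(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.key, "true")
      .load(basePath)
}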
+ */ + +package org.apache.hudi + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.HoodieBaseRelation.{BaseFileReader, convertToAvroSchema} +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.MergeOnReadSnapshotRelation.getFilePath +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath +import org.apache.hudi.common.model.{FileSlice, HoodieLogFile} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType + +import scala.collection.JavaConverters._ + +case class HoodieMergeOnReadFileSplit(dataFile: Option[PartitionedFile], + logFiles: List[HoodieLogFile]) extends HoodieFileSplit + +class MergeOnReadSnapshotRelation(sqlContext: SQLContext, + optParams: Map[String, String], + userSchema: Option[StructType], + globPaths: Seq[Path], + metaClient: HoodieTableMetaClient) + extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) { + + override type FileSplit = HoodieMergeOnReadFileSplit + + /** + * NOTE: These are the fields that are required to properly fulfil Merge-on-Read (MOR) + * semantic: + * + *
+ *       1. Primary key is required to make sure we're able to correlate records from the base
+ *          file with the updated records from the delta-log file
+ *       2. Pre-combine key is required to properly perform the combining (or merging) of the
+ *          existing and updated records
    + * + * However, in cases when merging is NOT performed (for ex, if file-group only contains base + * files but no delta-log files, or if the query-type is equal to [["skip_merge"]]) neither + * of primary-key or pre-combine-key are required to be fetched from storage (unless requested + * by the query), therefore saving on throughput + */ + protected lazy val mandatoryFieldsForMerging: Seq[String] = + Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq()) + + override lazy val mandatoryFields: Seq[String] = mandatoryFieldsForMerging + + protected val mergeType: String = optParams.getOrElse(DataSourceReadOptions.REALTIME_MERGE.key, + DataSourceReadOptions.REALTIME_MERGE.defaultValue) + + override def imbueConfigs(sqlContext: SQLContext): Unit = { + super.imbueConfigs(sqlContext) + sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "true") + } + + protected override def composeRDD(fileSplits: Seq[HoodieMergeOnReadFileSplit], + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + requestedColumns: Array[String], + filters: Array[Filter]): RDD[InternalRow] = { + val requiredFilters = Seq.empty + val optionalFilters = filters + val readers = createBaseFileReaders(tableSchema, requiredSchema, requestedColumns, requiredFilters, optionalFilters) + + val tableState = getTableState + new HoodieMergeOnReadRDD( + sqlContext.sparkContext, + config = jobConf, + fileReaders = readers, + tableSchema = tableSchema, + requiredSchema = requiredSchema, + tableState = tableState, + mergeType = mergeType, + fileSplits = fileSplits) + } + + protected def createBaseFileReaders(tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + requestedColumns: Array[String], + requiredFilters: Seq[Filter], + optionalFilters: Seq[Filter] = Seq.empty): HoodieMergeOnReadBaseFileReaders = { + val (partitionSchema, dataSchema, requiredDataSchema) = + tryPrunePartitionColumns(tableSchema, requiredSchema) + + val fullSchemaReader = createBaseFileReader( + spark = sqlContext.sparkSession, + partitionSchema = partitionSchema, + dataSchema = dataSchema, + requiredDataSchema = dataSchema, + // This file-reader is used to read base file records, subsequently merging them with the records + // stored in delta-log files. As such, we have to read _all_ records from the base file, while avoiding + // applying any filtering _before_ we complete combining them w/ delta-log records (to make sure that + // we combine them correctly); + // As such only required filters could be pushed-down to such reader + filters = requiredFilters, + options = optParams, + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = embedInternalSchema(new Configuration(conf), internalSchemaOpt) + ) + + val requiredSchemaReader = createBaseFileReader( + spark = sqlContext.sparkSession, + partitionSchema = partitionSchema, + dataSchema = dataSchema, + requiredDataSchema = requiredDataSchema, + // This file-reader is used to read base file records, subsequently merging them with the records + // stored in delta-log files. 
As such, we have to read _all_ records from the base file, while avoiding + // applying any filtering _before_ we complete combining them w/ delta-log records (to make sure that + // we combine them correctly); + // As such only required filters could be pushed-down to such reader + filters = requiredFilters, + options = optParams, + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = embedInternalSchema(new Configuration(conf), requiredDataSchema.internalSchema) + ) + + // Check whether fields required for merging were also requested to be fetched + // by the query: + // - In case they were, there's no optimization we could apply here (we will have + // to fetch such fields) + // - In case they were not, we will provide 2 separate file-readers + // a) One which would be applied to file-groups w/ delta-logs (merging) + // b) One which would be applied to file-groups w/ no delta-logs or + // in case query-mode is skipping merging + val mandatoryColumns = mandatoryFieldsForMerging.map(HoodieAvroUtils.getRootLevelFieldName) + if (mandatoryColumns.forall(requestedColumns.contains)) { + HoodieMergeOnReadBaseFileReaders( + fullSchemaReader = fullSchemaReader, + requiredSchemaReader = requiredSchemaReader, + requiredSchemaReaderSkipMerging = requiredSchemaReader + ) + } else { + val prunedRequiredSchema = { + val superfluousColumnNames = mandatoryColumns.filterNot(requestedColumns.contains) + val prunedStructSchema = + StructType(requiredDataSchema.structTypeSchema.fields + .filterNot(f => superfluousColumnNames.contains(f.name))) + + HoodieTableSchema(prunedStructSchema, convertToAvroSchema(prunedStructSchema).toString) + } + + val requiredSchemaReaderSkipMerging = createBaseFileReader( + spark = sqlContext.sparkSession, + partitionSchema = partitionSchema, + dataSchema = dataSchema, + requiredDataSchema = prunedRequiredSchema, + // This file-reader is only used in cases when no merging is performed, therefore it's safe to push + // down these filters to the base file readers + filters = requiredFilters ++ optionalFilters, + options = optParams, + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = embedInternalSchema(new Configuration(conf), requiredDataSchema.internalSchema) + ) + + HoodieMergeOnReadBaseFileReaders( + fullSchemaReader = fullSchemaReader, + requiredSchemaReader = requiredSchemaReader, + requiredSchemaReaderSkipMerging = requiredSchemaReaderSkipMerging + ) + } + } + + protected override def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = { + val convertedPartitionFilters = + HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilters) + + if (globPaths.isEmpty) { + val fileSlices = fileIndex.listFileSlices(convertedPartitionFilters) + buildSplits(fileSlices.values.flatten.toSeq) + } else { + val fileSlices = listLatestFileSlices(globPaths, partitionFilters, dataFilters) + buildSplits(fileSlices) + } + } + + protected def buildSplits(fileSlices: Seq[FileSlice]): List[HoodieMergeOnReadFileSplit] = { + fileSlices.map { fileSlice => + val baseFile = toScalaOption(fileSlice.getBaseFile) + val logFiles = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala.toList + + val partitionedBaseFile = baseFile.map { file => + val filePath = getFilePath(file.getFileStatus.getPath) + 
PartitionedFile(getPartitionColumnsAsInternalRow(file.getFileStatus), filePath, 0, file.getFileLen) + } + + HoodieMergeOnReadFileSplit(partitionedBaseFile, logFiles) + }.toList + } +} + +object MergeOnReadSnapshotRelation { + + def getFilePath(path: Path): String = { + // Here we use the Path#toUri to encode the path string, as there is a decode in + // ParquetFileFormat#buildReaderWithPartitionValues in the spark project when read the table + // .So we should encode the file path here. Otherwise, there is a FileNotException throw + // out. + // For example, If the "pt" is the partition path field, and "pt" = "2021/02/02", If + // we enable the URL_ENCODE_PARTITIONING and write data to hudi table.The data path + // in the table will just like "/basePath/2021%2F02%2F02/xxxx.parquet". When we read + // data from the table, if there are no encode for the file path, + // ParquetFileFormat#buildReaderWithPartitionValues will decode it to + // "/basePath/2021/02/02/xxxx.parquet" witch will result to a FileNotException. + // See FileSourceScanExec#createBucketedReadRDD in spark project which do the same thing + // when create PartitionedFile. + path.toUri.toString + } + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkConfigs.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkConfigs.scala new file mode 100644 index 0000000000000..73e1f86948e88 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkConfigs.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi + +object SparkConfigs { + + // spark data source write pool name. Incase of streaming sink, users might be interested to set custom scheduling configs + // for regular writes and async compaction. In such cases, this pool name will be used for spark datasource writes. + val SPARK_DATASOURCE_WRITER_POOL_NAME = "sparkdatasourcewrite" + + /* + When async compaction is enabled (deltastreamer or streaming sink), users might be interested to set custom + scheduling configs for regular writes and async table services like compaction and clustering. This is the property + used to set custom scheduler config file with spark. In Deltastreamer, the file is generated within hudi and set if + necessary. Where as in case of streaming sink, users have to set this property when they invoke spark shell. + Sample format of the file contents. 
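An illustrative sample, assuming Spark's standard fair-scheduler allocation format; the
"sparkdatasourcewrite" pool name comes from SPARK_DATASOURCE_WRITER_POOL_NAME above, while the
other pool names are placeholders:

  <allocations>
    <pool name="sparkdatasourcewrite">
      <schedulingMode>FAIR</schedulingMode>
      <weight>4</weight>
      <minShare>2</minShare>
    </pool>
    <pool name="asynccompaction">
      <schedulingMode>FAIR</schedulingMode>
      <weight>3</weight>
      <minShare>1</minShare>
    </pool>
    <pool name="asyncclustering">
      <schedulingMode>FAIR</schedulingMode>
      <weight>2</weight>
      <minShare>1</minShare>
    </pool>
  </allocations>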
+ + + + FAIR + 4 + 2 + + + FAIR + 3 + 1 + + + FAIR + 2 + 1 + + + */ + val SPARK_SCHEDULER_ALLOCATION_FILE_KEY = "spark.scheduler.allocation.file" + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala new file mode 100644 index 0000000000000..a9a38f5f82bdd --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala @@ -0,0 +1,348 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.BaseHoodieTableFileIndex.PartitionPath +import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_INCREMENTAL_OPT_VAL, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, QUERY_TYPE_SNAPSHOT_OPT_VAL} +import org.apache.hudi.HoodieConversionUtils.toJavaOption +import org.apache.hudi.SparkHoodieTableFileIndex.{deduceQueryType, generateFieldMap, shouldValidatePartitionColumns} +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.bootstrap.index.BootstrapIndex +import org.apache.hudi.common.config.TypedProperties +import org.apache.hudi.common.model.{FileSlice, HoodieTableQueryType} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.hadoop.CachingPath +import org.apache.hudi.hadoop.CachingPath.createPathUnsafe +import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BoundReference, Expression, InterpretedPredicate} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.execution.datasources.{FileStatusCache, NoopCache} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.unsafe.types.UTF8String + +import scala.collection.JavaConverters._ +import scala.language.implicitConversions + +/** + * Implementation of the [[BaseHoodieTableFileIndex]] for Spark + * + * @param spark spark session + * @param metaClient Hudi table's meta-client + * @param schemaSpec optional table's schema + * @param configProperties unifying configuration (in the form of generic properties) + * @param specifiedQueryInstant instant as of which table is being queried + * @param fileStatusCache transient cache of fetched [[FileStatus]]es + */ +class 
SparkHoodieTableFileIndex(spark: SparkSession, + metaClient: HoodieTableMetaClient, + schemaSpec: Option[StructType], + configProperties: TypedProperties, + queryPaths: Seq[Path], + specifiedQueryInstant: Option[String] = None, + @transient fileStatusCache: FileStatusCache = NoopCache) + extends BaseHoodieTableFileIndex( + new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)), + metaClient, + configProperties, + deduceQueryType(configProperties), + queryPaths.asJava, + toJavaOption(specifiedQueryInstant), + false, + false, + SparkHoodieTableFileIndex.adapt(fileStatusCache) + ) + with SparkAdapterSupport + with Logging { + + /** + * Get the schema of the table. + */ + lazy val schema: StructType = schemaSpec.getOrElse({ + val schemaUtil = new TableSchemaResolver(metaClient) + AvroConversionUtils.convertAvroSchemaToStructType(schemaUtil.getTableAvroSchema) + }) + + private lazy val sparkParsePartitionUtil = sparkAdapter.getSparkParsePartitionUtil + + /** + * Get the partition schema from the hoodie.properties. + */ + private lazy val _partitionSchemaFromProperties: StructType = { + val tableConfig = metaClient.getTableConfig + val partitionColumns = tableConfig.getPartitionFields + val nameFieldMap = generateFieldMap(schema) + + if (partitionColumns.isPresent) { + // Note that key generator class name could be null + val keyGeneratorClassName = tableConfig.getKeyGeneratorClassName + if (classOf[TimestampBasedKeyGenerator].getName.equalsIgnoreCase(keyGeneratorClassName) + || classOf[TimestampBasedAvroKeyGenerator].getName.equalsIgnoreCase(keyGeneratorClassName)) { + val partitionFields = partitionColumns.get().map(column => StructField(column, StringType)) + StructType(partitionFields) + } else { + val partitionFields = partitionColumns.get().filter(column => nameFieldMap.contains(column)) + .map(column => nameFieldMap.apply(column)) + + if (partitionFields.size != partitionColumns.get().size) { + val isBootstrapTable = BootstrapIndex.getBootstrapIndex(metaClient).useIndex() + if (isBootstrapTable) { + // For bootstrapped tables its possible the schema does not contain partition field when source table + // is hive style partitioned. In this case we would like to treat the table as non-partitioned + // as opposed to failing + new StructType() + } else { + throw new IllegalArgumentException(s"Cannot find columns: " + + s"'${partitionColumns.get().filter(col => !nameFieldMap.contains(col)).mkString(",")}' " + + s"in the schema[${schema.fields.mkString(",")}]") + } + } else { + new StructType(partitionFields) + } + } + } else { + // If the partition columns have not stored in hoodie.properties(the table that was + // created earlier), we trait it as a non-partitioned table. + logWarning("No partition columns available from hoodie.properties." + + " Partition pruning will not work") + new StructType() + } + } + + /** + * Get the data schema of the table. + * + * @return + */ + def dataSchema: StructType = { + val partitionColumns = partitionSchema.fields.map(_.name).toSet + StructType(schema.fields.filterNot(f => partitionColumns.contains(f.name))) + } + + /** + * @VisibleForTesting + */ + def partitionSchema: StructType = { + if (queryAsNonePartitionedTable) { + // If we read it as Non-Partitioned table, we should not + // return the partition schema. 
+ new StructType() + } else { + _partitionSchemaFromProperties + } + } + + /** + * Fetch list of latest base files w/ corresponding log files, after performing + * partition pruning + * + * TODO unify w/ HoodieFileIndex#listFiles + * + * @param partitionFilters partition column filters + * @return mapping from string partition paths to its base/log files + */ + def listFileSlices(partitionFilters: Seq[Expression]): Map[String, Seq[FileSlice]] = { + // Prune the partition path by the partition filters + val prunedPartitions = prunePartition(cachedAllInputFileSlices.keySet().asScala.toSeq, partitionFilters) + prunedPartitions.map(partition => { + (partition.path, cachedAllInputFileSlices.get(partition).asScala) + }).toMap + } + + /** + * Get all the cached partition paths pruned by the filter. + * + * @param predicates The filter condition + * @return The pruned partition paths + */ + def getPartitionPaths(predicates: Seq[Expression]): Seq[PartitionPath] = { + prunePartition(cachedAllInputFileSlices.keySet().asScala.toSeq, predicates) + } + + /** + * Prune the partition by the filter.This implementation is fork from + * org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex#prunePartitions. + * + * @param partitionPaths All the partition paths. + * @param predicates The filter condition. + * @return The pruned partition paths. + */ + protected def prunePartition(partitionPaths: Seq[PartitionPath], predicates: Seq[Expression]): Seq[PartitionPath] = { + val partitionColumnNames = partitionSchema.fields.map(_.name).toSet + val partitionPruningPredicates = predicates.filter { + _.references.map(_.name).toSet.subsetOf(partitionColumnNames) + } + if (partitionPruningPredicates.nonEmpty) { + val predicate = partitionPruningPredicates.reduce(expressions.And) + + val boundPredicate = InterpretedPredicate(predicate.transform { + case a: AttributeReference => + val index = partitionSchema.indexWhere(a.name == _.name) + BoundReference(index, partitionSchema(index).dataType, nullable = true) + }) + + val prunedPartitionPaths = partitionPaths.filter { + partitionPath => boundPredicate.eval(InternalRow.fromSeq(partitionPath.values)) + } + + logInfo(s"Total partition size is: ${partitionPaths.size}," + + s" after partition prune size is: ${prunedPartitionPaths.size}") + prunedPartitionPaths + } else { + partitionPaths + } + } + + protected def parsePartitionColumnValues(partitionColumns: Array[String], partitionPath: String): Array[Object] = { + if (partitionColumns.length == 0) { + // This is a non-partitioned table + Array.empty + } else { + val partitionFragments = partitionPath.split("/") + + if (partitionFragments.length != partitionColumns.length && + partitionColumns.length == 1) { + // If the partition column size is not equal to the partition fragment size + // and the partition column size is 1, we map the whole partition path + // to the partition column which can benefit from the partition prune. + val prefix = s"${partitionColumns.head}=" + val partitionValue = if (partitionPath.startsWith(prefix)) { + // support hive style partition path + partitionPath.substring(prefix.length) + } else { + partitionPath + } + Array(UTF8String.fromString(partitionValue)) + } else if (partitionFragments.length != partitionColumns.length && + partitionColumns.length > 1) { + // If the partition column size is not equal to the partition fragments size + // and the partition column size > 1, we do not know how to map the partition + // fragments to the partition columns. 
So we trait it as a Non-Partitioned Table + // for the query which do not benefit from the partition prune. + logWarning(s"Cannot do the partition prune for table $basePath." + + s"The partitionFragments size (${partitionFragments.mkString(",")})" + + s" is not equal to the partition columns size(${partitionColumns.mkString(",")})") + Array.empty + } else { + // If partitionSeqs.length == partitionSchema.fields.length + // Append partition name to the partition value if the + // HIVE_STYLE_PARTITIONING is disable. + // e.g. convert "/xx/xx/2021/02" to "/xx/xx/year=2021/month=02" + val partitionWithName = + partitionFragments.zip(partitionColumns).map { + case (partition, columnName) => + if (partition.indexOf("=") == -1) { + s"${columnName}=$partition" + } else { + partition + } + }.mkString("/") + + val pathWithPartitionName = new CachingPath(basePath, createPathUnsafe(partitionWithName)) + val partitionValues = parsePartitionPath(pathWithPartitionName, partitionSchema) + + partitionValues.map(_.asInstanceOf[Object]).toArray + } + } + } + + private def parsePartitionPath(partitionPath: Path, partitionSchema: StructType): Seq[Any] = { + val timeZoneId = configProperties.getString(DateTimeUtils.TIMEZONE_OPTION, SQLConf.get.sessionLocalTimeZone) + val partitionDataTypes = partitionSchema.map(f => f.name -> f.dataType).toMap + + sparkParsePartitionUtil.parsePartition( + partitionPath, + typeInference = false, + Set(basePath), + partitionDataTypes, + DateTimeUtils.getTimeZone(timeZoneId), + validatePartitionValues = shouldValidatePartitionColumns(spark) + ) + .toSeq(partitionSchema) + } +} + + +object SparkHoodieTableFileIndex { + + /** + * This method unravels [[StructType]] into a [[Map]] of pairs of dot-path notation with corresponding + * [[StructField]] object for every field of the provided [[StructType]], recursively. + * + * For example, following struct + *
+   *   StructType(
+   *     StructField("a",
+   *       StructType(
+   *          StructField("b", StringType),
+   *          StructField("c", IntType)
+   *       )
+   *     )
+   *   )
+   *
+   * will be converted into the following mapping:
+   *
+   *   "a.b" -> StructField("b", StringType),
+   *   "a.c" -> StructField("c", IntType),
    + */ + private def generateFieldMap(structType: StructType) : Map[String, StructField] = { + def traverse(structField: Either[StructField, StructType]) : Map[String, StructField] = { + structField match { + case Right(struct) => struct.fields.flatMap(f => traverse(Left(f))).toMap + case Left(field) => field.dataType match { + case struct: StructType => traverse(Right(struct)).map { + case (key, structField) => (s"${field.name}.$key", structField) + } + case _ => Map(field.name -> field) + } + } + } + + traverse(Right(structType)) + } + + private def deduceQueryType(configProperties: TypedProperties): HoodieTableQueryType = { + configProperties.asScala.getOrElse(QUERY_TYPE.key, QUERY_TYPE.defaultValue) match { + case QUERY_TYPE_SNAPSHOT_OPT_VAL => HoodieTableQueryType.SNAPSHOT + case QUERY_TYPE_INCREMENTAL_OPT_VAL => HoodieTableQueryType.INCREMENTAL + case QUERY_TYPE_READ_OPTIMIZED_OPT_VAL => HoodieTableQueryType.READ_OPTIMIZED + case _ @ qt => throw new IllegalArgumentException(s"query-type ($qt) not supported") + } + } + + private def adapt(cache: FileStatusCache): BaseHoodieTableFileIndex.FileStatusCache = { + new BaseHoodieTableFileIndex.FileStatusCache { + override def get(path: Path): org.apache.hudi.common.util.Option[Array[FileStatus]] = toJavaOption(cache.getLeafFiles(path)) + override def put(path: Path, leafFiles: Array[FileStatus]): Unit = cache.putLeafFiles(path, leafFiles) + override def invalidate(): Unit = cache.invalidateAll() + } + } + + private def shouldValidatePartitionColumns(spark: SparkSession): Boolean = { + // NOTE: We can't use helper, method nor the config-entry to stay compatible w/ Spark 2.4 + spark.sessionState.conf.getConfString("spark.sql.sources.validatePartitionColumns", "true").toBoolean + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/package.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/package.scala similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/package.scala rename to hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/package.scala diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/HoodieHadoopFSUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/HoodieHadoopFSUtils.scala new file mode 100644 index 0000000000000..353d94a7c105f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/HoodieHadoopFSUtils.scala @@ -0,0 +1,370 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.viewfs.ViewFileSystem +import org.apache.hadoop.fs._ +import org.apache.hadoop.hdfs.DistributedFileSystem +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.util.SerializableConfiguration + +import java.io.FileNotFoundException +import scala.collection.mutable + +/** + * NOTE: This method class is replica of HadoopFSUtils from Spark 3.2.1, with the following adjustments + * + * - Filtering out of the listed files is adjusted to include files starting w/ "." (to include Hoodie Delta Log + * files) + */ +object HoodieHadoopFSUtils extends Logging { + /** + * Lists a collection of paths recursively. Picks the listing strategy adaptively depending + * on the number of paths to list. + * + * This may only be called on the driver. + * + * @param sc Spark context used to run parallel listing. + * @param paths Input paths to list + * @param hadoopConf Hadoop configuration + * @param filter Path filter used to exclude leaf files from result + * @param ignoreMissingFiles Ignore missing files that happen during recursive listing + * (e.g., due to race conditions) + * @param ignoreLocality Whether to fetch data locality info when listing leaf files. If false, + * this will return `FileStatus` without `BlockLocation` info. + * @param parallelismThreshold The threshold to enable parallelism. If the number of input paths + * is smaller than this value, this will fallback to use + * sequential listing. + * @param parallelismMax The maximum parallelism for listing. If the number of input paths is + * larger than this value, parallelism will be throttled to this value + * to avoid generating too many tasks. + * @return for each input path, the set of discovered files for the path + */ + def parallelListLeafFiles(sc: SparkContext, + paths: Seq[Path], + hadoopConf: Configuration, + filter: PathFilter, + ignoreMissingFiles: Boolean, + ignoreLocality: Boolean, + parallelismThreshold: Int, + parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = { + parallelListLeafFilesInternal(sc, paths, hadoopConf, filter, isRootLevel = true, + ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax) + } + + // scalastyle:off parameter.number + private def parallelListLeafFilesInternal(sc: SparkContext, + paths: Seq[Path], + hadoopConf: Configuration, + filter: PathFilter, + isRootLevel: Boolean, + ignoreMissingFiles: Boolean, + ignoreLocality: Boolean, + parallelismThreshold: Int, + parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = { + + // Short-circuits parallel listing when serial listing is likely to be faster. + if (paths.size <= parallelismThreshold) { + // scalastyle:off return + return paths.map { path => + val leafFiles = listLeafFiles( + path, + hadoopConf, + filter, + Some(sc), + ignoreMissingFiles = ignoreMissingFiles, + ignoreLocality = ignoreLocality, + isRootPath = isRootLevel, + parallelismThreshold = parallelismThreshold, + parallelismMax = parallelismMax) + (path, leafFiles) + } + // scalastyle:on return + } + + logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." 
+ + s" The first several paths are: ${paths.take(10).mkString(", ")}.") + HiveCatalogMetrics.incrementParallelListingJobCount(1) + + val serializableConfiguration = new SerializableConfiguration(hadoopConf) + val serializedPaths = paths.map(_.toString) + + // Set the number of parallelism to prevent following file listing from generating many tasks + // in case of large #defaultParallelism. + val numParallelism = Math.min(paths.size, parallelismMax) + + val previousJobDescription = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) + val statusMap = try { + val description = paths.size match { + case 0 => + "Listing leaf files and directories 0 paths" + case 1 => + s"Listing leaf files and directories for 1 path:
    ${paths(0)}" + case s => + s"Listing leaf files and directories for $s paths:
    ${paths(0)}, ..." + } + sc.setJobDescription(description) + sc + .parallelize(serializedPaths, numParallelism) + .mapPartitions { pathStrings => + val hadoopConf = serializableConfiguration.value + pathStrings.map(new Path(_)).toSeq.map { path => + val leafFiles = listLeafFiles( + path = path, + hadoopConf = hadoopConf, + filter = filter, + contextOpt = None, // Can't execute parallel scans on workers + ignoreMissingFiles = ignoreMissingFiles, + ignoreLocality = ignoreLocality, + isRootPath = isRootLevel, + parallelismThreshold = Int.MaxValue, + parallelismMax = 0) + (path, leafFiles) + }.iterator + }.map { case (path, statuses) => + val serializableStatuses = statuses.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) + } + + case _ => + Array.empty[SerializableBlockLocation] + } + + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + } + (path.toString, serializableStatuses) + }.collect() + } finally { + sc.setJobDescription(previousJobDescription) + } + + // turn SerializableFileStatus back to Status + statusMap.map { case (path, serializableStatuses) => + val statuses = serializableStatuses.map { f => + val blockLocations = f.blockLocations.map { loc => + new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) + } + new LocatedFileStatus( + new FileStatus( + f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, + new Path(f.path)), + blockLocations) + } + (new Path(path), statuses) + } + } + // scalastyle:on parameter.number + + // scalastyle:off parameter.number + /** + * Lists a single filesystem path recursively. If a `SparkContext` object is specified, this + * function may launch Spark jobs to parallelize listing based on `parallelismThreshold`. + * + * If sessionOpt is None, this may be called on executors. + * + * @return all children of path that match the specified filter. + */ + private def listLeafFiles(path: Path, + hadoopConf: Configuration, + filter: PathFilter, + contextOpt: Option[SparkContext], + ignoreMissingFiles: Boolean, + ignoreLocality: Boolean, + isRootPath: Boolean, + parallelismThreshold: Int, + parallelismMax: Int): Seq[FileStatus] = { + + logTrace(s"Listing $path") + val fs = path.getFileSystem(hadoopConf) + + // Note that statuses only include FileStatus for the files and dirs directly under path, + // and does not include anything else recursively. + val statuses: Array[FileStatus] = try { + fs match { + // DistributedFileSystem overrides listLocatedStatus to make 1 single call to namenode + // to retrieve the file status with the file block location. The reason to still fallback + // to listStatus is because the default implementation would potentially throw a + // FileNotFoundException which is better handled by doing the lookups manually below. + case (_: DistributedFileSystem | _: ViewFileSystem) if !ignoreLocality => + val remoteIter = fs.listLocatedStatus(path) + new Iterator[LocatedFileStatus]() { + def next(): LocatedFileStatus = remoteIter.next + + def hasNext(): Boolean = remoteIter.hasNext + }.toArray + case _ => fs.listStatus(path) + } + } catch { + // If we are listing a root path for SQL (e.g. 
a top level directory of a table), we need to + // ignore FileNotFoundExceptions during this root level of the listing because + // + // (a) certain code paths might construct an InMemoryFileIndex with root paths that + // might not exist (i.e. not all callers are guaranteed to have checked + // path existence prior to constructing InMemoryFileIndex) and, + // (b) we need to ignore deleted root paths during REFRESH TABLE, otherwise we break + // existing behavior and break the ability drop SessionCatalog tables when tables' + // root directories have been deleted (which breaks a number of Spark's own tests). + // + // If we are NOT listing a root path then a FileNotFoundException here means that the + // directory was present in a previous level of file listing but is absent in this + // listing, likely indicating a race condition (e.g. concurrent table overwrite or S3 + // list inconsistency). + // + // The trade-off in supporting existing behaviors / use-cases is that we won't be + // able to detect race conditions involving root paths being deleted during + // InMemoryFileIndex construction. However, it's still a net improvement to detect and + // fail-fast on the non-root cases. For more info see the SPARK-27676 review discussion. + case _: FileNotFoundException if isRootPath || ignoreMissingFiles => + logWarning(s"The directory $path was not found. Was it deleted very recently?") + Array.empty[FileStatus] + } + + val filteredStatuses = + statuses.filterNot(status => shouldFilterOutPathName(status.getPath.getName)) + + val allLeafStatuses = { + val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory) + val nestedFiles: Seq[FileStatus] = contextOpt match { + case Some(context) if dirs.size > parallelismThreshold => + parallelListLeafFilesInternal( + context, + dirs.map(_.getPath), + hadoopConf = hadoopConf, + filter = filter, + isRootLevel = false, + ignoreMissingFiles = ignoreMissingFiles, + ignoreLocality = ignoreLocality, + parallelismThreshold = parallelismThreshold, + parallelismMax = parallelismMax + ).flatMap(_._2) + case _ => + dirs.flatMap { dir => + listLeafFiles( + path = dir.getPath, + hadoopConf = hadoopConf, + filter = filter, + contextOpt = contextOpt, + ignoreMissingFiles = ignoreMissingFiles, + ignoreLocality = ignoreLocality, + isRootPath = false, + parallelismThreshold = parallelismThreshold, + parallelismMax = parallelismMax) + } + } + val allFiles = topLevelFiles ++ nestedFiles + if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles + } + + val missingFiles = mutable.ArrayBuffer.empty[String] + val resolvedLeafStatuses = allLeafStatuses.flatMap { + case f: LocatedFileStatus => + Some(f) + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `parallelListLeafFiles` when the number of + // paths exceeds threshold. + case f if !ignoreLocality => + // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), + // which is very slow on some file system (RawLocalFileSystem, which is launch a + // subprocess and parse the stdout). 
+ try { + val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc => + // Store BlockLocation objects to consume less memory + if (loc.getClass == classOf[BlockLocation]) { + loc + } else { + new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength) + } + } + val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, + f.getModificationTime, 0, null, null, null, null, f.getPath, locations) + if (f.isSymlink) { + lfs.setSymlink(f.getSymlink) + } + Some(lfs) + } catch { + case _: FileNotFoundException if ignoreMissingFiles => + missingFiles += f.getPath.toString + None + } + + case f => Some(f) + } + + if (missingFiles.nonEmpty) { + logWarning( + s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}") + } + + resolvedLeafStatuses + } + // scalastyle:on parameter.number + + /** A serializable variant of HDFS's BlockLocation. This is required by Hadoop 2.7. */ + private case class SerializableBlockLocation(names: Array[String], + hosts: Array[String], + offset: Long, + length: Long) + + /** A serializable variant of HDFS's FileStatus. This is required by Hadoop 2.7. */ + private case class SerializableFileStatus(path: String, + length: Long, + isDir: Boolean, + blockReplication: Short, + blockSize: Long, + modificationTime: Long, + accessTime: Long, + blockLocations: Array[SerializableBlockLocation]) + + /** Checks if we should filter out this path name. */ + def shouldFilterOutPathName(pathName: String): Boolean = { + // We filter follow paths: + // 1. everything that starts with _ and ., except _common_metadata and _metadata + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files. + // 2. everything that ends with `._COPYING_`, because this is a intermediate state of file. we + // should skip this file in case of double reading. + val exclude = (pathName.startsWith("_") && !pathName.contains("=")) || pathName.endsWith("._COPYING_") + val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata") + exclude && !include + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/execution/datasources/HoodieInMemoryFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/execution/datasources/HoodieInMemoryFileIndex.scala new file mode 100644 index 0000000000000..3e541bb09a0c3 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/execution/datasources/HoodieInMemoryFileIndex.scala @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
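// A small self-contained check (file names below are made up) of the filtering rule in
// shouldFilterOutPathName above: unlike stock Spark, names starting with "." are kept,
// which is what leaves Hoodie delta log files visible to the listing.
object PathNameFilterSketch {
  def main(args: Array[String]): Unit = {
    Seq(
      "_SUCCESS",                     // filtered out: "_"-prefixed and not a Parquet metadata file
      "_metadata",                    // kept: Parquet metadata file
      "part-0000.parquet._COPYING_",  // filtered out: intermediate copy state
      ".f1d2c3_20220101.log.1_0-1-2", // kept: "."-prefixed (hypothetical) Hoodie log file name
      "year=2021"                     // kept: partition directory
    ).foreach { name =>
      println(s"$name -> filteredOut=${org.apache.spark.HoodieHadoopFSUtils.shouldFilterOutPathName(name)}")
    }
  }
}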
+ */ + +package org.apache.spark.execution.datasources + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path, PathFilter} +import org.apache.hadoop.mapred.{FileInputFormat, JobConf} +import org.apache.hudi.SparkAdapterSupport +import org.apache.spark.HoodieHadoopFSUtils +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BoundReference, Expression, InterpretedPredicate} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.types.StructType + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +class HoodieInMemoryFileIndex(sparkSession: SparkSession, + rootPathsSpecified: Seq[Path], + parameters: Map[String, String], + userSpecifiedSchema: Option[StructType], + fileStatusCache: FileStatusCache = NoopCache) + extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters, userSpecifiedSchema, fileStatusCache) + with SparkAdapterSupport { + + /** + * Returns all valid files grouped into partitions when the data is partitioned. If the data is unpartitioned, + * this will return a single partition with no partition values + * + * NOTE: This method replicates the one it overrides, however it uses custom method + * that accepts files starting with "." + */ + override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) { + PartitionDirectory(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil + } else { + prunePartitions(partitionFilters, partitionSpec()).map { + case PartitionPath(values, path) => + val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match { + case Some(existingDir) => + // Directory has children files in it, return them + existingDir.filter(f => isDataPath(f.getPath)) + + case None => + // Directory does not exist, or has no children files + Nil + } + PartitionDirectory(values, files) + } + } + logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t")) + selectedPartitions + } + + private def isDataPath(path: Path): Boolean = { + val name = path.getName + !(name.startsWith("_") && !name.contains("=")) + } + + private def prunePartitions( + predicates: Seq[Expression], + partitionSpec: PartitionSpec): Seq[PartitionPath] = { + val PartitionSpec(partitionColumns, partitions) = partitionSpec + val partitionColumnNames = partitionColumns.map(_.name).toSet + val partitionPruningPredicates = predicates.filter { + _.references.map(_.name).toSet.subsetOf(partitionColumnNames) + } + + if (partitionPruningPredicates.nonEmpty) { + val predicate = partitionPruningPredicates.reduce(expressions.And) + + val boundPredicate = sparkAdapter.createInterpretedPredicate(predicate.transform { + case a: AttributeReference => + val index = partitionColumns.indexWhere(a.name == _.name) + BoundReference(index, partitionColumns(index).dataType, nullable = true) + }) + + val selected = partitions.filter { + case PartitionPath(values, _) => boundPredicate.eval(values) + } + logInfo { + val total = partitions.length + val selectedSize = selected.length + val percentPruned = (1 - selectedSize.toDouble / total.toDouble) * 100 + s"Selected $selectedSize partitions out of $total, " + + s"pruned ${if (total == 0) "0" else 
s"$percentPruned%"} partitions." + } + + selected + } else { + partitions + } + } + + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery threshold. + * + * This is publicly visible for testing. + * + * NOTE: This method replicates the one it overrides, however it uses custom method to run parallel + * listing that accepts files starting with "." + */ + override def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + val startTime = System.nanoTime() + val output = mutable.LinkedHashSet[FileStatus]() + val pathsToFetch = mutable.ArrayBuffer[Path]() + for (path <- paths) { + fileStatusCache.getLeafFiles(path) match { + case Some(files) => + HiveCatalogMetrics.incrementFileCacheHits(files.length) + output ++= files + case None => + pathsToFetch += path + } + () // for some reasons scalac 2.12 needs this; return type doesn't matter + } + val filter = FileInputFormat.getInputPathFilter(new JobConf(hadoopConf, this.getClass)) + val discovered = bulkListLeafFiles(sparkSession, pathsToFetch, filter, hadoopConf) + + discovered.foreach { case (path, leafFiles) => + HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size) + fileStatusCache.putLeafFiles(path, leafFiles.toArray) + output ++= leafFiles + } + + logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to list leaf files" + + s" for ${paths.length} paths.") + + output + } + + protected def bulkListLeafFiles(sparkSession: SparkSession, paths: ArrayBuffer[Path], filter: PathFilter, hadoopConf: Configuration): Seq[(Path, Seq[FileStatus])] = { + HoodieHadoopFSUtils.parallelListLeafFiles( + sc = sparkSession.sparkContext, + paths = paths, + hadoopConf = hadoopConf, + filter = new PathFilterWrapper(filter), + ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles, + // NOTE: We're disabling fetching Block Info to speed up file listing + ignoreLocality = true, + parallelismThreshold = sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold, + parallelismMax = sparkSession.sessionState.conf.parallelPartitionDiscoveryParallelism) + } +} + +object HoodieInMemoryFileIndex { + def create(sparkSession: SparkSession, globbedPaths: Seq[Path]): HoodieInMemoryFileIndex = { + val fileStatusCache = FileStatusCache.getOrCreate(sparkSession) + new HoodieInMemoryFileIndex(sparkSession, globbedPaths, Map(), Option.empty, fileStatusCache) + } +} + +private class PathFilterWrapper(val filter: PathFilter) extends PathFilter with Serializable { + override def accept(path: Path): Boolean = { + (filter == null || filter.accept(path)) && !HoodieHadoopFSUtils.shouldFilterOutPathName(path.getName) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/HoodieSparkTypeUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/HoodieSparkTypeUtils.scala new file mode 100644 index 0000000000000..3b0fcf0f322f1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/HoodieSparkTypeUtils.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.types.{DataType, DecimalType, NumericType, StringType} + +// TODO unify w/ DataTypeUtils +object HoodieSparkTypeUtils { + + /** + * Returns whether this DecimalType is wider than `other`. If yes, it means `other` + * can be casted into `this` safely without losing any precision or range. + */ + def isWiderThan(one: DecimalType, another: DecimalType) = + one.isWiderThan(another) + + /** + * Checks whether casting expression of [[from]] [[DataType]] to [[to]] [[DataType]] will + * preserve ordering of the elements + */ + def isCastPreservingOrdering(from: DataType, to: DataType): Boolean = + (from, to) match { + // NOTE: In the casting rules defined by Spark, only casting from String to Numeric + // (and vice versa) are the only casts that might break the ordering of the elements after casting + case (StringType, _: NumericType) => false + case (_: NumericType, StringType) => false + + case _ => true + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieSparkAvroSchemaConverters.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieSparkAvroSchemaConverters.scala new file mode 100644 index 0000000000000..65306ac44686b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieSparkAvroSchemaConverters.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
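// A tiny illustration of the ordering-preservation rule above: non-String pairs keep their
// ordering, while String <-> numeric casts may not ("10" < "9" lexicographically, but
// 10 > 9 numerically), so those are reported as not order-preserving.
import org.apache.spark.sql.HoodieSparkTypeUtils
import org.apache.spark.sql.types.{IntegerType, LongType, StringType}

object CastOrderingSketch {
  def main(args: Array[String]): Unit = {
    println(HoodieSparkTypeUtils.isCastPreservingOrdering(IntegerType, LongType))   // true
    println(HoodieSparkTypeUtils.isCastPreservingOrdering(StringType, IntegerType)) // false
    println(HoodieSparkTypeUtils.isCastPreservingOrdering(IntegerType, StringType)) // false
  }
}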
+ */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Schema +import org.apache.spark.sql.avro.SchemaConverters.SchemaType +import org.apache.spark.sql.types.DataType + +/** + * This interface is simply a facade abstracting away Spark's [[SchemaConverters]] implementation, allowing + * the rest of the code-base to not depend on it directly + */ +object HoodieSparkAvroSchemaConverters extends HoodieAvroSchemaConverters { + + override def toSqlType(avroSchema: Schema): (DataType, Boolean) = + SchemaConverters.toSqlType(avroSchema) match { + case SchemaType(dataType, nullable) => (dataType, nullable) + } + + override def toAvroType(catalystType: DataType, nullable: Boolean, recordName: String, nameSpace: String): Schema = + SchemaConverters.toAvroType(catalystType, nullable, recordName, nameSpace) + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala new file mode 100644 index 0000000000000..c178d1b84919e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import org.apache.avro.LogicalTypes.{Date, Decimal, TimestampMicros, TimestampMillis} +import org.apache.avro.Schema.Type._ +import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.types.Decimal.minBytesForPrecision +import org.apache.spark.sql.types._ + +import scala.collection.JavaConverters._ + +/** + * This object contains method that are used to convert sparkSQL schemas to avro schemas and vice + * versa. + * + * NOTE: This code is borrowed from Spark 3.2.1 + * This code is borrowed, so that we can better control compatibility w/in Spark minor + * branches (3.2.x, 3.1.x, etc) + * + * PLEASE REFRAIN MAKING ANY CHANGES TO THIS CODE UNLESS ABSOLUTELY NECESSARY + */ +@DeveloperApi +private[sql] object SchemaConverters { + private lazy val nullSchema = Schema.create(Schema.Type.NULL) + + /** + * Internal wrapper for SQL data type and nullability. + * + * @since 2.4.0 + */ + case class SchemaType(dataType: DataType, nullable: Boolean) + + /** + * Converts an Avro schema to a corresponding Spark SQL schema. 
+ * + * @since 2.4.0 + */ + def toSqlType(avroSchema: Schema): SchemaType = { + toSqlTypeHelper(avroSchema, Set.empty) + } + + private val unionFieldMemberPrefix = "member" + + private def toSqlTypeHelper(avroSchema: Schema, existingRecordNames: Set[String]): SchemaType = { + avroSchema.getType match { + case INT => avroSchema.getLogicalType match { + case _: Date => SchemaType(DateType, nullable = false) + case _ => SchemaType(IntegerType, nullable = false) + } + case STRING => SchemaType(StringType, nullable = false) + case BOOLEAN => SchemaType(BooleanType, nullable = false) + case BYTES | FIXED => avroSchema.getLogicalType match { + // For FIXED type, if the precision requires more bytes than fixed size, the logical + // type will be null, which is handled by Avro library. + case d: Decimal => SchemaType(DecimalType(d.getPrecision, d.getScale), nullable = false) + case _ => SchemaType(BinaryType, nullable = false) + } + + case DOUBLE => SchemaType(DoubleType, nullable = false) + case FLOAT => SchemaType(FloatType, nullable = false) + case LONG => avroSchema.getLogicalType match { + case _: TimestampMillis | _: TimestampMicros => SchemaType(TimestampType, nullable = false) + case _ => SchemaType(LongType, nullable = false) + } + + case ENUM => SchemaType(StringType, nullable = false) + + case NULL => SchemaType(NullType, nullable = true) + + case RECORD => + if (existingRecordNames.contains(avroSchema.getFullName)) { + throw new IncompatibleSchemaException( + s""" + |Found recursive reference in Avro schema, which can not be processed by Spark: + |${avroSchema.toString(true)} + """.stripMargin) + } + val newRecordNames = existingRecordNames + avroSchema.getFullName + val fields = avroSchema.getFields.asScala.map { f => + val schemaType = toSqlTypeHelper(f.schema(), newRecordNames) + StructField(f.name, schemaType.dataType, schemaType.nullable) + } + + SchemaType(StructType(fields.toSeq), nullable = false) + + case ARRAY => + val schemaType = toSqlTypeHelper(avroSchema.getElementType, existingRecordNames) + SchemaType( + ArrayType(schemaType.dataType, containsNull = schemaType.nullable), + nullable = false) + + case MAP => + val schemaType = toSqlTypeHelper(avroSchema.getValueType, existingRecordNames) + SchemaType( + MapType(StringType, schemaType.dataType, valueContainsNull = schemaType.nullable), + nullable = false) + + case UNION => + if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) { + // In case of a union with null, eliminate it and make a recursive call + val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL) + if (remainingUnionTypes.size == 1) { + toSqlTypeHelper(remainingUnionTypes.head, existingRecordNames).copy(nullable = true) + } else { + toSqlTypeHelper(Schema.createUnion(remainingUnionTypes.asJava), existingRecordNames) + .copy(nullable = true) + } + } else avroSchema.getTypes.asScala.map(_.getType).toSeq match { + case Seq(t1) => + toSqlTypeHelper(avroSchema.getTypes.get(0), existingRecordNames) + case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) => + SchemaType(LongType, nullable = false) + case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) => + SchemaType(DoubleType, nullable = false) + case _ => + // Convert complex unions to struct types where field names are member0, member1, etc. + // This is consistent with the behavior when converting between Avro and Parquet. 
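To make the complex-union rule just described concrete, the sketch below (assuming the borrowed `SchemaConverters` object from this diff is on the classpath) converts a two-branch union that has no null member and no numeric-widening shortcut:

```scala
import org.apache.avro.Schema
import org.apache.spark.sql.avro.SchemaConverters

// A union of INT and STRING: it cannot be collapsed into a single nullable type,
// so it becomes a struct with one nullable field per branch (member0, member1, ...).
val union = Schema.createUnion(
  Schema.create(Schema.Type.INT),
  Schema.create(Schema.Type.STRING))

// Expected: SchemaType(StructType(member0 INT, member1 STRING), nullable = false);
// each member field is nullable because only one branch is populated at a time.
println(SchemaConverters.toSqlType(union))
```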
+ val fields = avroSchema.getTypes.asScala.zipWithIndex.map { + case (s, i) => + val schemaType = toSqlTypeHelper(s, existingRecordNames) + // All fields are nullable because only one of them is set at a time + StructField(s"$unionFieldMemberPrefix$i", schemaType.dataType, nullable = true) + } + + SchemaType(StructType(fields.toSeq), nullable = false) + } + + case other => throw new IncompatibleSchemaException(s"Unsupported type $other") + } + } + + /** + * Converts a Spark SQL schema to a corresponding Avro schema. + * + * @since 2.4.0 + */ + def toAvroType(catalystType: DataType, + nullable: Boolean = false, + recordName: String = "topLevelRecord", + nameSpace: String = ""): Schema = { + val builder = SchemaBuilder.builder() + + val schema = catalystType match { + case BooleanType => builder.booleanType() + case ByteType | ShortType | IntegerType => builder.intType() + case LongType => builder.longType() + case DateType => + LogicalTypes.date().addToSchema(builder.intType()) + case TimestampType => + LogicalTypes.timestampMicros().addToSchema(builder.longType()) + + case FloatType => builder.floatType() + case DoubleType => builder.doubleType() + case StringType => builder.stringType() + case NullType => builder.nullType() + case d: DecimalType => + val avroType = LogicalTypes.decimal(d.precision, d.scale) + val fixedSize = minBytesForPrecision(d.precision) + // Need to avoid naming conflict for the fixed fields + val name = nameSpace match { + case "" => s"$recordName.fixed" + case _ => s"$nameSpace.$recordName.fixed" + } + avroType.addToSchema(SchemaBuilder.fixed(name).size(fixedSize)) + + case BinaryType => builder.bytesType() + case ArrayType(et, containsNull) => + builder.array() + .items(toAvroType(et, containsNull, recordName, nameSpace)) + case MapType(StringType, vt, valueContainsNull) => + builder.map() + .values(toAvroType(vt, valueContainsNull, recordName, nameSpace)) + case st: StructType => + val childNameSpace = if (nameSpace != "") s"$nameSpace.$recordName" else recordName + if (canBeUnion(st)) { + val nonNullUnionFieldTypes = st.map(f => toAvroType(f.dataType, nullable = false, f.name, childNameSpace)) + val unionFieldTypes = if (nullable) { + nullSchema +: nonNullUnionFieldTypes + } else { + nonNullUnionFieldTypes + } + Schema.createUnion(unionFieldTypes:_*) + } else { + val fieldsAssembler = builder.record(recordName).namespace(nameSpace).fields() + st.foreach { f => + val fieldAvroType = + toAvroType(f.dataType, f.nullable, f.name, childNameSpace) + fieldsAssembler.name(f.name).`type`(fieldAvroType).noDefault() + } + fieldsAssembler.endRecord() + } + + // This should never happen. 
+ case other => throw new IncompatibleSchemaException(s"Unexpected type $other.") + } + + if (nullable && catalystType != NullType && schema.getType != Schema.Type.UNION) { + Schema.createUnion(schema, nullSchema) + } else { + schema + } + } + + private def canBeUnion(st: StructType): Boolean = { + // We use a heuristic to determine whether a [[StructType]] could potentially have been produced + // by converting Avro union to Catalyst's [[StructType]]: + // - It has to have at least 1 field + // - All fields have to be of the following format "memberN" (where N is sequentially increasing integer) + // - All fields are nullable + st.fields.length > 0 && + st.forall { f => + f.name.matches(s"$unionFieldMemberPrefix\\d+") && f.nullable + } + } +} + +private[avro] class IncompatibleSchemaException(msg: String, ex: Throwable = null) extends Exception(msg, ex) + +private[avro] class UnsupportedAvroTypeException(msg: String) extends Exception(msg) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala new file mode 100644 index 0000000000000..c31cd3b20565e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala @@ -0,0 +1,369 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +import org.apache.hudi.DataSourceWriteOptions.OPERATION +import org.apache.hudi.HoodieWriterUtils._ +import org.apache.hudi.common.config.DFSPropertiesConfiguration +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.hudi.common.util.{StringUtils, ValidationUtils} +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory +import org.apache.hudi.{AvroConversionUtils, DataSourceOptionsHelper} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.avro.SchemaConverters +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.hudi.HoodieOptionConfig +import org.apache.spark.sql.hudi.HoodieOptionConfig.SQL_KEY_TABLE_PRIMARY_KEY +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.{AnalysisException, SparkSession} + +import java.util.{Locale, Properties} +import scala.collection.JavaConverters._ +import scala.collection.mutable + +/** + * Table definition for SQL functionalities. Depending on the way of data generation, + * meta of Hudi table can be from Spark catalog or meta directory on filesystem. 
+ * [[HoodieCatalogTable]] takes both meta sources into consideration when handling + * EXTERNAL and MANAGED tables. + * + * NOTE: all the meta should be retrieved from meta directory on filesystem first. + */ +class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) extends Logging { + + assert(table.provider.map(_.toLowerCase(Locale.ROOT)).orNull == "hudi", "It's not a Hudi table") + + private val hadoopConf = spark.sessionState.newHadoopConf + + /** + * database.table in catalog + */ + val catalogTableName: String = table.qualifiedName + + /** + * properties defined in catalog. + */ + val catalogProperties: Map[String, String] = table.storage.properties ++ table.properties + + /** + * hoodie table's location. + * if create managed hoodie table, use `catalog.defaultTablePath`. + */ + val tableLocation: String = getTableLocation(table, spark) + + /** + * A flag to whether the hoodie table exists. + */ + val hoodieTableExists: Boolean = tableExistsInPath(tableLocation, hadoopConf) + + /** + * Meta Client. + */ + lazy val metaClient: HoodieTableMetaClient = HoodieTableMetaClient.builder() + .setBasePath(tableLocation) + .setConf(hadoopConf) + .build() + + /** + * Hoodie Table Config + */ + lazy val tableConfig: HoodieTableConfig = metaClient.getTableConfig + + /** + * the name of table + */ + lazy val tableName: String = tableConfig.getTableName + + /** + * the name of dabase + */ + lazy val databaseName: String = tableConfig.getDatabaseName + + /** + * The name of type of table + */ + lazy val tableType: HoodieTableType = tableConfig.getTableType + + /** + * The type of table + */ + lazy val tableTypeName: String = tableType.name() + + /** + * Record Field List(Primary Key List) + */ + lazy val primaryKeys: Array[String] = tableConfig.getRecordKeyFields.orElse(Array.empty) + + /** + * PreCombine Field + */ + lazy val preCombineKey: Option[String] = Option(tableConfig.getPreCombineField) + + /** + * Partition Fields + */ + lazy val partitionFields: Array[String] = tableConfig.getPartitionFields.orElse(Array.empty) + + /** + * BaseFileFormat + */ + lazy val baseFileFormat: String = metaClient.getTableConfig.getBaseFileFormat.name() + + /** + * Table schema + */ + lazy val tableSchema: StructType = table.schema + + /** + * The schema without hoodie meta fields + */ + lazy val tableSchemaWithoutMetaFields: StructType = removeMetaFields(tableSchema) + + /** + * The schema of data fields + */ + lazy val dataSchema: StructType = { + StructType(tableSchema.filterNot(f => partitionFields.contains(f.name))) + } + + /** + * The schema of data fields not including hoodie meta fields + */ + lazy val dataSchemaWithoutMetaFields: StructType = removeMetaFields(dataSchema) + + /** + * The schema of partition fields + */ + lazy val partitionSchema: StructType = StructType(tableSchema.filter(f => partitionFields.contains(f.name))) + + /** + * All the partition paths + */ + def getPartitionPaths: Seq[String] = getAllPartitionPaths(spark, table) + + /** + * Check if table is a partitioned table + */ + def isPartitionedTable: Boolean = table.partitionColumnNames.nonEmpty + + /** + * Initializes table meta on filesystem when applying CREATE TABLE clause. + */ + def initHoodieTable(): Unit = { + logInfo(s"Init hoodie.properties for ${table.identifier.unquotedString}") + val (finalSchema, tableConfigs) = parseSchemaAndConfigs() + + table = table.copy(schema = finalSchema) + + // Save all the table config to the hoodie.properties. 
+ val properties = new Properties() + properties.putAll(tableConfigs.asJava) + + val catalogDatabaseName = formatName(spark, + table.identifier.database.getOrElse(spark.sessionState.catalog.getCurrentDatabase)) + if (hoodieTableExists) { + assert(StringUtils.isNullOrEmpty(databaseName) || databaseName == catalogDatabaseName, + "The database names from this hoodie path and this catalog table is not same.") + // just persist hoodie.table.create.schema + HoodieTableMetaClient.withPropertyBuilder() + .fromProperties(properties) + .setDatabaseName(catalogDatabaseName) + .setTableCreateSchema(SchemaConverters.toAvroType(finalSchema).toString()) + .initTable(hadoopConf, tableLocation) + } else { + val (recordName, namespace) = AvroConversionUtils.getAvroRecordNameAndNamespace(table.identifier.table) + val schema = SchemaConverters.toAvroType(finalSchema, false, recordName, namespace) + val partitionColumns = if (table.partitionColumnNames.isEmpty) { + null + } else { + table.partitionColumnNames.mkString(",") + } + + HoodieTableMetaClient.withPropertyBuilder() + .fromProperties(properties) + .setDatabaseName(catalogDatabaseName) + .setTableName(table.identifier.table) + .setTableCreateSchema(schema.toString()) + .setPartitionFields(partitionColumns) + .initTable(hadoopConf, tableLocation) + } + } + + /** + * Derives the SQL schema and configurations for a Hudi table: + * 1. Columns in the schema fall under two categories -- the data columns described in + * CREATE TABLE clause and meta columns enumerated in [[HoodieRecord#HOODIE_META_COLUMNS]]; + * 2. Configurations derived come from config file, PROPERTIES and OPTIONS in CREATE TABLE clause. + */ + private def parseSchemaAndConfigs(): (StructType, Map[String, String]) = { + val globalProps = DFSPropertiesConfiguration.getGlobalProps.asScala.toMap + val globalTableConfigs = mappingSparkDatasourceConfigsToTableConfigs(globalProps) + val globalSqlOptions = HoodieOptionConfig.mappingTableConfigToSqlOption(globalTableConfigs) + + val sqlOptions = HoodieOptionConfig.withDefaultSqlOptions(globalSqlOptions ++ catalogProperties) + + // get final schema and parameters + val (finalSchema, tableConfigs) = (table.tableType, hoodieTableExists) match { + case (CatalogTableType.EXTERNAL, true) => + val existingTableConfig = tableConfig.getProps.asScala.toMap + val currentTableConfig = globalTableConfigs ++ existingTableConfig + val catalogTableProps = HoodieOptionConfig.mappingSqlOptionToTableConfig(catalogProperties) + validateTableConfig(spark, catalogTableProps, convertMapToHoodieConfig(existingTableConfig)) + + val options = extraTableConfig(hoodieTableExists, currentTableConfig) ++ + HoodieOptionConfig.mappingSqlOptionToTableConfig(sqlOptions) ++ currentTableConfig + + val schemaFromMetaOpt = loadTableSchemaByMetaClient() + val schema = if (schemaFromMetaOpt.nonEmpty) { + schemaFromMetaOpt.get + } else if (table.schema.nonEmpty) { + addMetaFields(table.schema) + } else { + throw new AnalysisException( + s"Missing schema fields when applying CREATE TABLE clause for ${catalogTableName}") + } + (schema, options) + + case (_, false) => + ValidationUtils.checkArgument(table.schema.nonEmpty, + s"Missing schema for Create Table: $catalogTableName") + val schema = table.schema + val options = extraTableConfig(tableExists = false, globalTableConfigs) ++ + HoodieOptionConfig.mappingSqlOptionToTableConfig(sqlOptions) + (addMetaFields(schema), options) + + case (CatalogTableType.MANAGED, true) => + throw new AnalysisException(s"Can not create the managed 
table('$catalogTableName')" + + s". The associated location('$tableLocation') already exists.") + } + HoodieOptionConfig.validateTable(spark, finalSchema, + HoodieOptionConfig.mappingTableConfigToSqlOption(tableConfigs)) + + val resolver = spark.sessionState.conf.resolver + val dataSchema = finalSchema.filterNot { f => + table.partitionColumnNames.exists(resolver(_, f.name)) + } + verifyDataSchema(table.identifier, table.tableType, dataSchema) + + (finalSchema, tableConfigs) + } + + private def extraTableConfig(tableExists: Boolean, + originTableConfig: Map[String, String] = Map.empty): Map[String, String] = { + val extraConfig = mutable.Map.empty[String, String] + if (tableExists) { + val allPartitionPaths = getPartitionPaths + if (originTableConfig.contains(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key)) { + extraConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) = + originTableConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) + } else { + extraConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) = + String.valueOf(isHiveStyledPartitioning(allPartitionPaths, table)) + } + if (originTableConfig.contains(HoodieTableConfig.URL_ENCODE_PARTITIONING.key)) { + extraConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) = + originTableConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) + } else { + extraConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) = + String.valueOf(isUrlEncodeEnabled(allPartitionPaths, table)) + } + } else { + extraConfig(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) = "true" + extraConfig(HoodieTableConfig.URL_ENCODE_PARTITIONING.key) = HoodieTableConfig.URL_ENCODE_PARTITIONING.defaultValue() + } + + if (originTableConfig.contains(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key)) { + extraConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = + HoodieSparkKeyGeneratorFactory.convertToSparkKeyGenerator( + originTableConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key)) + } else { + val primaryKeys = table.properties.get(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName).getOrElse(SQL_KEY_TABLE_PRIMARY_KEY.defaultValue.get) + val partitions = table.partitionColumnNames.mkString(",") + extraConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) = + DataSourceOptionsHelper.inferKeyGenClazz(primaryKeys, partitions) + } + extraConfig.toMap + } + + private def loadTableSchemaByMetaClient(): Option[StructType] = { + val resolver = spark.sessionState.conf.resolver + getTableSqlSchema(metaClient, includeMetadataFields = true).map(originSchema => { + // Load table schema from meta on filesystem, and fill in 'comment' + // information from Spark catalog. 
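For orientation, here is a hedged sketch of how the `HoodieCatalogTable` wrapper above is typically obtained and queried; the table name `hudi_trips` is hypothetical, and the call assumes the table is registered in the Spark catalog with an initialized `hoodie.properties` on storage:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable

def describeHudiTable(spark: SparkSession): Unit = {
  // Resolves the Spark catalog entry first, then reads table meta from
  // hoodie.properties on the filesystem (the authoritative source).
  val hoodieTable = HoodieCatalogTable(spark, TableIdentifier("hudi_trips"))

  println(s"table type    = ${hoodieTable.tableTypeName}")
  println(s"primary keys  = ${hoodieTable.primaryKeys.mkString(",")}")
  println(s"preCombine    = ${hoodieTable.preCombineKey.getOrElse("<none>")}")
  println(s"partitioned   = ${hoodieTable.isPartitionedTable}")
}
```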
+ val fields = originSchema.fields.map { f => + val nullableField: StructField = f.copy(nullable = true) + val catalogField = findColumnByName(table.schema, nullableField.name, resolver) + if (catalogField.isDefined) { + catalogField.get.getComment().map(nullableField.withComment).getOrElse(nullableField) + } else { + nullableField + } + } + StructType(fields) + }) + } + + // This code is forked from org.apache.spark.sql.hive.HiveExternalCatalog#verifyDataSchema + private def verifyDataSchema(tableIdentifier: TableIdentifier, tableType: CatalogTableType, + dataSchema: Seq[StructField]): Unit = { + if (tableType != CatalogTableType.VIEW) { + val invalidChars = Seq(",", ":", ";") + def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f => + f.dataType match { + case st: StructType => verifyNestedColumnNames(st) + case _ if invalidChars.exists(f.name.contains) => + val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ") + val errMsg = "Cannot create a table having a nested column whose name contains " + + s"invalid characters ($invalidCharsString) in Hive metastore. Table: $tableIdentifier; " + + s"Column: ${f.name}" + throw new AnalysisException(errMsg) + case _ => + } + } + + dataSchema.foreach { f => + f.dataType match { + // Checks top-level column names + case _ if f.name.contains(",") => + throw new AnalysisException("Cannot create a table having a column whose name " + + s"contains commas in Hive metastore. Table: $tableIdentifier; Column: ${f.name}") + // Checks nested column names + case st: StructType => + verifyNestedColumnNames(st) + case _ => + } + } + } + } +} + +object HoodieCatalogTable { + // The properties should not be used when create table + val needFilterProps: List[String] = List(HoodieTableConfig.DATABASE_NAME.key, HoodieTableConfig.NAME.key, OPERATION.key) + + def apply(sparkSession: SparkSession, tableIdentifier: TableIdentifier): HoodieCatalogTable = { + val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(tableIdentifier) + HoodieCatalogTable(sparkSession, catalogTable) + } + + def apply(sparkSession: SparkSession, catalogTable: CatalogTable): HoodieCatalogTable = { + new HoodieCatalogTable(sparkSession, catalogTable) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/trees/HoodieLeafLike.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/trees/HoodieLeafLike.scala new file mode 100644 index 0000000000000..bde1ba29e8b63 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/trees/HoodieLeafLike.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.trees + +/** + * Similar to `LeafLike` in Spark3.2. + */ +trait HoodieLeafLike[T <: TreeNode[T]] { self: TreeNode[T] => + + override final def children: Seq[T] = Nil + + override final def mapChildren(f: T => T): T = this.asInstanceOf[T] + + final def withNewChildrenInternal(newChildren: IndexedSeq[T]): T = this.asInstanceOf[T] +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormat.scala new file mode 100644 index 0000000000000..a52e9335fe374 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormat.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hudi.{DataSourceReadOptions, SparkAdapterSupport} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat.FILE_FORMAT_ID +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType + + +class HoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { + + override def shortName(): String = FILE_FORMAT_ID + + override def toString: String = "Hoodie-Parquet" + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + val shouldExtractPartitionValuesFromPartitionPath = + options.getOrElse(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, + DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.defaultValue.toString).toBoolean + + sparkAdapter + .createHoodieParquetFileFormat(shouldExtractPartitionValuesFromPartitionPath).get + .buildReaderWithPartitionValues(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, options, hadoopConf) + } +} + +object HoodieParquetFileFormat { + val FILE_FORMAT_ID = "hoodie-parquet" +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/HiveClientUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/HiveClientUtils.scala new file mode 100644 index 
0000000000000..f4b1d37cd266f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/HiveClientUtils.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import org.apache.hadoop.conf.Configuration +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.hive.client.HiveClient + +object HiveClientUtils { + + /** + * A Hive client used to interact with the metastore. + */ + @volatile private var client: HiveClient = null + + def newClientForMetadata(conf: SparkConf, hadoopConf: Configuration): HiveClient = { + HiveUtils.newClientForMetadata(conf, hadoopConf) + } + + def getSingletonClientForMetadata(sparkSession: SparkSession): HiveClient = synchronized { + if (client == null) { + client = HiveUtils.newClientForMetadata(sparkSession.sparkContext.conf, + sparkSession.sessionState.newHadoopConf()) + } + client + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala new file mode 100644 index 0000000000000..4a3cf38895f0a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.ColumnStatsIndexSupport.{getMaxColumnNameFor, getMinColumnNameFor, getNullCountColumnNameFor, getValueCountColumnNameFor} +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, EqualNullSafe, EqualTo, Expression, ExtractValue, GetStructField, GreaterThan, GreaterThanOrEqual, In, InSet, IsNotNull, IsNull, LessThan, LessThanOrEqual, Literal, Not, Or, StartsWith, SubqueryExpression} +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.hudi.ColumnStatsExpressionUtils._ +import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.{AnalysisException, HoodieCatalystExpressionUtils} +import org.apache.spark.unsafe.types.UTF8String + +object DataSkippingUtils extends Logging { + + /** + * Translates provided {@link filterExpr} into corresponding filter-expression for column-stats index index table + * to filter out candidate files that would hold records matching the original filter + * + * @param dataTableFilterExpr source table's query's filter expression + * @param indexSchema index table schema + * @return filter for column-stats index's table + */ + def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, indexSchema: StructType): Expression = { + try { + createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, indexSchema) + } catch { + case e: AnalysisException => + logDebug(s"Failed to translated provided data table filter expr into column stats one ($dataTableFilterExpr)", e) + throw e + } + } + + private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr: Expression, indexSchema: StructType): Expression = { + // Try to transform original Source Table's filter expression into + // Column-Stats Index filter expression + tryComposeIndexFilterExpr(dataTableFilterExpr, indexSchema) match { + case Some(e) => e + // NOTE: In case we can't transform source filter expression, we fallback + // to {@code TrueLiteral}, to essentially avoid pruning any indexed files from scanning + case None => TrueLiteral + } + } + + private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression, indexSchema: StructType): Option[Expression] = { + // + // For translation of the Filter Expression for the Data Table into Filter Expression for Column Stats Index, we're + // assuming that + // - The column A is queried in the Data Table (hereafter referred to as "colA") + // - Filter Expression is a relational expression (ie "=", "<", "<=", ...) 
of the following form + // + // ```transform_expr(colA) = value_expr``` + // + // Where + // - "transform_expr" is an expression of the _transformation_ which preserve ordering of the "colA" + // - "value_expr" is an "value"-expression (ie one NOT referring to other attributes/columns or containing sub-queries) + // + // We translate original Filter Expr into the one querying Column Stats Index like following: let's consider + // equality Filter Expr referred to above: + // + // ```transform_expr(colA) = value_expr``` + // + // This expression will be translated into following Filter Expression for the Column Stats Index: + // + // ```(transform_expr(colA_minValue) <= value_expr) AND (value_expr <= transform_expr(colA_maxValue))``` + // + // Which will enable us to match files with the range of values in column A containing the target ```value_expr``` + // + // NOTE: That we can apply ```transform_expr``` transformation precisely b/c it preserves the ordering of the + // values of the source column, ie following holds true: + // + // colA_minValue = min(colA) => transform_expr(colA_minValue) = min(transform_expr(colA)) + // colA_maxValue = max(colA) => transform_expr(colA_maxValue) = max(transform_expr(colA)) + // + sourceFilterExpr match { + // If Expression is not resolved, we can't perform the analysis accurately, bailing + case expr if !expr.resolved => None + + // Filter "expr(colA) = B" and "B = expr(colA)" + // Translates to "(expr(colA_minValue) <= B) AND (B <= expr(colA_maxValue))" condition for index lookup + case EqualTo(sourceExpr @ AllowedTransformationExpression(attrRef), valueExpr: Expression) if isValueExpression(valueExpr) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + // NOTE: Since we're supporting (almost) arbitrary expressions of the form `f(colA) = B`, we have to + // appropriately translate such original expression targeted at Data Table, to corresponding + // expression targeted at Column Stats Index Table. 
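As a worked example of the rewrite described above (a hedged sketch: it assumes the Hudi Spark adapter is resolvable at runtime, and uses `LongType` for the count statistics purely for illustration), a simple `colA = 5` predicate is translated into a min/max range check against the column stats index:

```scala
import org.apache.hudi.ColumnStatsIndexSupport.{getMaxColumnNameFor, getMinColumnNameFor, getNullCountColumnNameFor, getValueCountColumnNameFor}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal}
import org.apache.spark.sql.hudi.DataSkippingUtils
import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}

// A resolved reference to the queried data-table column.
val colA = AttributeReference("colA", IntegerType)()

// Column stats index schema carrying min/max/null-count statistics for colA.
val indexSchema = StructType(Seq(
  StructField(getMinColumnNameFor("colA"), IntegerType),
  StructField(getMaxColumnNameFor("colA"), IntegerType),
  StructField(getNullCountColumnNameFor("colA"), LongType),
  StructField(getValueCountColumnNameFor, LongType)))

// "colA = 5" becomes "(colA_minValue <= 5) AND (colA_maxValue >= 5)",
// matching every file whose value range could contain 5.
val indexFilter = DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr(
  EqualTo(colA, Literal(5)), indexSchema)
println(indexFilter.sql)
```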
For that, we take original expression holding + // [[AttributeReference]] referring to the Data Table, and swap it w/ expression referring to + // corresponding column in the Column Stats Index + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + genColumnValuesEqualToExpression(colName, valueExpr, targetExprBuilder) + } + + case EqualTo(valueExpr: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(valueExpr) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + genColumnValuesEqualToExpression(colName, valueExpr, targetExprBuilder) + } + + // Filter "expr(colA) != B" and "B != expr(colA)" + // Translates to "NOT(expr(colA_minValue) = B AND expr(colA_maxValue) = B)" + // NOTE: This is NOT an inversion of `colA = b`, instead this filter ONLY excludes files for which `colA = B` + // holds true + case Not(EqualTo(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression)) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + Not(genColumnOnlyValuesEqualToExpression(colName, value, targetExprBuilder)) + } + + case Not(EqualTo(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef))) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + Not(genColumnOnlyValuesEqualToExpression(colName, value, targetExprBuilder)) + } + + // Filter "colA = null" + // Translates to "colA_nullCount = null" for index lookup + case EqualNullSafe(attrRef: AttributeReference, litNull @ Literal(null, _)) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map(colName => EqualTo(genColNumNullsExpr(colName), litNull)) + + // Filter "expr(colA) < B" and "B > expr(colA)" + // Translates to "expr(colA_minValue) < B" for index lookup + case LessThan(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + LessThan(targetExprBuilder.apply(genColMinValueExpr(colName)), value) + } + + case GreaterThan(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + LessThan(targetExprBuilder.apply(genColMinValueExpr(colName)), value) + } + + // Filter "B < expr(colA)" and "expr(colA) > B" + // Translates to "B < colA_maxValue" for index lookup + case LessThan(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + GreaterThan(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) + } + + case GreaterThan(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, 
indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + GreaterThan(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) + } + + // Filter "expr(colA) <= B" and "B >= expr(colA)" + // Translates to "colA_minValue <= B" for index lookup + case LessThanOrEqual(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + LessThanOrEqual(targetExprBuilder.apply(genColMinValueExpr(colName)), value) + } + + case GreaterThanOrEqual(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + LessThanOrEqual(targetExprBuilder.apply(genColMinValueExpr(colName)), value) + } + + // Filter "B <= expr(colA)" and "expr(colA) >= B" + // Translates to "B <= colA_maxValue" for index lookup + case LessThanOrEqual(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + GreaterThanOrEqual(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) + } + + case GreaterThanOrEqual(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + GreaterThanOrEqual(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) + } + + // Filter "colA is null" + // Translates to "colA_nullCount > 0" for index lookup + case IsNull(attribute: AttributeReference) => + getTargetIndexedColumnName(attribute, indexSchema) + .map(colName => GreaterThan(genColNumNullsExpr(colName), Literal(0))) + + // Filter "colA is not null" + // Translates to "colA_nullCount < colA_valueCount" for index lookup + case IsNotNull(attribute: AttributeReference) => + getTargetIndexedColumnName(attribute, indexSchema) + .map(colName => LessThan(genColNumNullsExpr(colName), genColValueCountExpr)) + + // Filter "expr(colA) in (B1, B2, ...)" + // Translates to "(colA_minValue <= B1 AND colA_maxValue >= B1) OR (colA_minValue <= B2 AND colA_maxValue >= B2) ... " + // for index lookup + // NOTE: This is equivalent to "colA = B1 OR colA = B2 OR ..." 
+ case In(sourceExpr @ AllowedTransformationExpression(attrRef), list: Seq[Expression]) if list.forall(isValueExpression) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + list.map(lit => genColumnValuesEqualToExpression(colName, lit, targetExprBuilder)).reduce(Or) + } + + // Filter "expr(colA) in (B1, B2, ...)" + // NOTE: [[InSet]] is an optimized version of the [[In]] expression, where every sub-expression w/in the + // set is a static literal + case InSet(sourceExpr @ AllowedTransformationExpression(attrRef), hset: Set[Any]) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + hset.map { value => + // NOTE: [[Literal]] has a gap where it could hold [[UTF8String]], but [[Literal#apply]] doesn't + // accept [[UTF8String]]. As such we have to handle it separately + val lit = value match { + case str: UTF8String => Literal(str.toString) + case _ => Literal(value) + } + genColumnValuesEqualToExpression(colName, lit, targetExprBuilder) + }.reduce(Or) + } + + // Filter "expr(colA) not in (B1, B2, ...)" + // Translates to "NOT((colA_minValue = B1 AND colA_maxValue = B1) OR (colA_minValue = B2 AND colA_maxValue = B2))" for index lookup + // NOTE: This is NOT an inversion of `in (B1, B2, ...)` expr, this is equivalent to "colA != B1 AND colA != B2 AND ..." + case Not(In(sourceExpr @ AllowedTransformationExpression(attrRef), list: Seq[Expression])) if list.forall(_.foldable) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + Not(list.map(lit => genColumnOnlyValuesEqualToExpression(colName, lit, targetExprBuilder)).reduce(Or)) + } + + // Filter "colA like 'xxx%'" + // Translates to "colA_minValue <= xxx AND xxx <= colA_maxValue" for index lookup + // + // NOTE: Since a) this operator matches strings by prefix and b) given that this column is going to be ordered + // lexicographically, we essentially need to check that provided literal falls w/in min/max bounds of the + // given column + case StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), v @ Literal(_: UTF8String, _)) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + genColumnValuesEqualToExpression(colName, v, targetExprBuilder) + } + + // Filter "expr(colA) not like 'xxx%'" + // Translates to "NOT(expr(colA_minValue) like 'xxx%' AND expr(colA_maxValue) like 'xxx%')" for index lookup + // NOTE: This is NOT an inversion of "colA like xxx" + case Not(StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), value @ Literal(_: UTF8String, _))) => + getTargetIndexedColumnName(attrRef, indexSchema) + .map { colName => + val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) + val minValueExpr = targetExprBuilder.apply(genColMinValueExpr(colName)) + val maxValueExpr = targetExprBuilder.apply(genColMaxValueExpr(colName)) + Not(And(StartsWith(minValueExpr, value), StartsWith(maxValueExpr, value))) + } + + case or: Or => + val resLeft = createColumnStatsIndexFilterExprInternal(or.left, indexSchema) + val resRight = createColumnStatsIndexFilterExprInternal(or.right, indexSchema) + + Option(Or(resLeft, 
resRight)) + + case and: And => + val resLeft = createColumnStatsIndexFilterExprInternal(and.left, indexSchema) + val resRight = createColumnStatsIndexFilterExprInternal(and.right, indexSchema) + + Option(And(resLeft, resRight)) + + // + // Pushing Logical NOT inside the AND/OR expressions + // NOTE: This is required to make sure we're properly handling negations in + // cases like {@code NOT(colA = 0)}, {@code NOT(colA in (a, b, ...)} + // + + case Not(And(left: Expression, right: Expression)) => + Option(createColumnStatsIndexFilterExprInternal(Or(Not(left), Not(right)), indexSchema)) + + case Not(Or(left: Expression, right: Expression)) => + Option(createColumnStatsIndexFilterExprInternal(And(Not(left), Not(right)), indexSchema)) + + case _: Expression => None + } + } + + private def checkColIsIndexed(colName: String, indexSchema: StructType): Boolean = { + Set.apply( + getMinColumnNameFor(colName), + getMaxColumnNameFor(colName), + getNullCountColumnNameFor(colName) + ) + .forall(stat => indexSchema.exists(_.name == stat)) + } + + private def getTargetIndexedColumnName(resolvedExpr: AttributeReference, indexSchema: StructType): Option[String] = { + val colName = UnresolvedAttribute(getTargetColNameParts(resolvedExpr)).name + + // Verify that the column is indexed + if (checkColIsIndexed(colName, indexSchema)) { + Option.apply(colName) + } else { + None + } + } + + private def getTargetColNameParts(resolvedTargetCol: Expression): Seq[String] = { + resolvedTargetCol match { + case attr: Attribute => Seq(attr.name) + case Alias(c, _) => getTargetColNameParts(c) + case GetStructField(c, _, Some(name)) => getTargetColNameParts(c) :+ name + case ex: ExtractValue => + throw new AnalysisException(s"convert reference to name failed, Updating nested fields is only supported for StructType: ${ex}.") + case other => + throw new AnalysisException(s"convert reference to name failed, Found unsupported expression ${other}") + } + } +} + +private object ColumnStatsExpressionUtils { + + @inline def genColMinValueExpr(colName: String): Expression = col(getMinColumnNameFor(colName)).expr + @inline def genColMaxValueExpr(colName: String): Expression = col(getMaxColumnNameFor(colName)).expr + @inline def genColNumNullsExpr(colName: String): Expression = col(getNullCountColumnNameFor(colName)).expr + @inline def genColValueCountExpr: Expression = col(getValueCountColumnNameFor).expr + + @inline def genColumnValuesEqualToExpression(colName: String, + value: Expression, + targetExprBuilder: Function[Expression, Expression] = Predef.identity): Expression = { + val minValueExpr = targetExprBuilder.apply(genColMinValueExpr(colName)) + val maxValueExpr = targetExprBuilder.apply(genColMaxValueExpr(colName)) + // Only case when column C contains value V is when min(C) <= V <= max(c) + And(LessThanOrEqual(minValueExpr, value), GreaterThanOrEqual(maxValueExpr, value)) + } + + def genColumnOnlyValuesEqualToExpression(colName: String, + value: Expression, + targetExprBuilder: Function[Expression, Expression] = Predef.identity): Expression = { + val minValueExpr = targetExprBuilder.apply(genColMinValueExpr(colName)) + val maxValueExpr = targetExprBuilder.apply(genColMaxValueExpr(colName)) + // Only case when column C contains _only_ value V is when min(C) = V AND max(c) = V + And(EqualTo(minValueExpr, value), EqualTo(maxValueExpr, value)) + } + + def swapAttributeRefInExpr(sourceExpr: Expression, from: AttributeReference, to: Expression): Expression = { + checkState(sourceExpr.references.size == 1) + 
sourceExpr.transformDown { + case attrRef: AttributeReference if attrRef.sameRef(from) => to + } + } + + /** + * This check is used to validate that the expression that target column is compared against + *
    +   *    a) Has no references to other attributes (for ex, columns)
    +   *    b) Does not contain sub-queries
    +   * 
    + * + * This in turn allows us to be certain that Spark will be able to evaluate such expression + * against Column Stats Index as well + */ + def isValueExpression(expr: Expression): Boolean = + expr.references.isEmpty && !SubqueryExpression.hasSubquery(expr) + + /** + * This utility pattern-matches an expression iff + * + *
+ *   1. It references *exactly* 1 attribute (column)
+ *   2. It does NOT contain sub-queries
+ *   3. It contains only whitelisted transformations that preserve ordering of the source column [1]
    + * + * [1] This is required to make sure that we can correspondingly map Column Stats Index values as well. Applying + * transformations that do not preserve the ordering might lead to incorrect results being returned by Data + * Skipping flow. + * + * Returns only [[AttributeReference]] contained as a sub-expression + */ + object AllowedTransformationExpression extends SparkAdapterSupport { + val exprUtils: HoodieCatalystExpressionUtils = sparkAdapter.getCatalystExpressionUtils + + def unapply(expr: Expression): Option[AttributeReference] = { + // First step, we check that expression + // - Does NOT contain sub-queries + // - Does contain exactly 1 attribute + if (SubqueryExpression.hasSubquery(expr) || expr.references.size != 1) { + None + } else { + // Second step, we validate that holding expression is an actually permitted + // transformation + // NOTE: That transformation composition is permitted + exprUtils.tryMatchAttributeOrderingPreservingTransformation(expr) + } + } + } +} + diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala new file mode 100644 index 0000000000000..732367cf5a5e5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.avro.HoodieAvroUtils.getRootLevelFieldName +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.common.util.ValidationUtils +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.StructType + + +/** + * The HoodieOptionConfig defines some short name for the hoodie + * option key and value. + */ +object HoodieOptionConfig { + + /** + * The short name for the value of COW_TABLE_TYPE_OPT_VAL. + */ + val SQL_VALUE_TABLE_TYPE_COW = "cow" + + /** + * The short name for the value of MOR_TABLE_TYPE_OPT_VAL. 
+ */ + val SQL_VALUE_TABLE_TYPE_MOR = "mor" + + + val SQL_KEY_TABLE_PRIMARY_KEY: HoodieSQLOption[String] = buildConf() + .withSqlKey("primaryKey") + .withHoodieKey(DataSourceWriteOptions.RECORDKEY_FIELD.key) + .withTableConfigKey(HoodieTableConfig.RECORDKEY_FIELDS.key) + .defaultValue(DataSourceWriteOptions.RECORDKEY_FIELD.defaultValue()) + .build() + + val SQL_KEY_TABLE_TYPE: HoodieSQLOption[String] = buildConf() + .withSqlKey("type") + .withHoodieKey(DataSourceWriteOptions.TABLE_TYPE.key) + .withTableConfigKey(HoodieTableConfig.TYPE.key) + .defaultValue(SQL_VALUE_TABLE_TYPE_COW) + .build() + + val SQL_KEY_PRECOMBINE_FIELD: HoodieSQLOption[String] = buildConf() + .withSqlKey("preCombineField") + .withHoodieKey(DataSourceWriteOptions.PRECOMBINE_FIELD.key) + .withTableConfigKey(HoodieTableConfig.PRECOMBINE_FIELD.key) + .build() + + val SQL_PAYLOAD_CLASS: HoodieSQLOption[String] = buildConf() + .withSqlKey("payloadClass") + .withHoodieKey(DataSourceWriteOptions.PAYLOAD_CLASS_NAME.key) + .withTableConfigKey(HoodieTableConfig.PAYLOAD_CLASS_NAME.key) + .defaultValue(DataSourceWriteOptions.PAYLOAD_CLASS_NAME.defaultValue()) + .build() + + /** + * The mapping of the sql short name key to the hoodie's config key. + */ + private lazy val keyMapping: Map[String, String] = { + HoodieOptionConfig.getClass.getDeclaredFields + .filter(f => f.getType == classOf[HoodieSQLOption[_]]) + .map(f => {f.setAccessible(true); f.get(HoodieOptionConfig).asInstanceOf[HoodieSQLOption[_]]}) + .map(option => option.sqlKeyName -> option.hoodieKeyName) + .toMap + } + + /** + * The mapping of the sql short name key to the hoodie table config key + * defined in HoodieTableConfig. + */ + private lazy val keyTableConfigMapping: Map[String, String] = { + HoodieOptionConfig.getClass.getDeclaredFields + .filter(f => f.getType == classOf[HoodieSQLOption[_]]) + .map(f => {f.setAccessible(true); f.get(HoodieOptionConfig).asInstanceOf[HoodieSQLOption[_]]}) + .filter(_.tableConfigKey.isDefined) + .map(option => option.sqlKeyName -> option.tableConfigKey.get) + .toMap + } + + private lazy val tableConfigKeyToSqlKey: Map[String, String] = + keyTableConfigMapping.map(f => f._2 -> f._1) + + /** + * Mapping of the short sql value to the hoodie's config value + */ + private val valueMapping: Map[String, String] = Map ( + SQL_VALUE_TABLE_TYPE_COW -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, + SQL_VALUE_TABLE_TYPE_MOR -> DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL + ) + + private lazy val reverseValueMapping = valueMapping.map(f => f._2 -> f._1) + + def withDefaultSqlOptions(options: Map[String, String]): Map[String, String] = defaultSqlOptions ++ options + + /** + * Mapping the sql's short name key/value in the options to the hoodie's config key/value. + * @param options + * @return + */ + def mappingSqlOptionToHoodieParam(options: Map[String, String]): Map[String, String] = { + options.map (kv => + keyMapping.getOrElse(kv._1, kv._1) -> valueMapping.getOrElse(kv._2, kv._2)) + } + + /** + * Mapping the sql options to the hoodie table config which used to store to the hoodie + * .properties when create the table. + * @param options + * @return + */ + def mappingSqlOptionToTableConfig(options: Map[String, String]): Map[String, String] = { + options.map { case (k, v) => + if (keyTableConfigMapping.contains(k)) { + keyTableConfigMapping(k) -> valueMapping.getOrElse(v, v) + } else { + k -> v + } + } + } + + /** + * Mapping the table config (loaded from the hoodie.properties) to the sql options. 
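A short usage sketch of the option mapping above (hedged: the printed hoodie.* keys and the expanded table-type value are whatever `DataSourceWriteOptions` defines; the option map itself is hypothetical):

```scala
import org.apache.spark.sql.hudi.HoodieOptionConfig

// SQL short-name options as they would appear in CREATE TABLE ... TBLPROPERTIES.
val sqlOptions = Map(
  "primaryKey"      -> "id",
  "preCombineField" -> "ts",
  "type"            -> "mor")

// Short names expand to the full hoodie.* writer options, with "mor"/"cow"
// mapped to the corresponding DataSourceWriteOptions table-type values.
println(HoodieOptionConfig.mappingSqlOptionToHoodieParam(sqlOptions))

println(HoodieOptionConfig.getPrimaryColumns(sqlOptions).mkString(","))  // id
println(HoodieOptionConfig.getTableType(sqlOptions))                     // e.g. MERGE_ON_READ
```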
+ */ + def mappingTableConfigToSqlOption(options: Map[String, String]): Map[String, String] = { + options.map(kv => tableConfigKeyToSqlKey.getOrElse(kv._1, kv._1) -> reverseValueMapping.getOrElse(kv._2, kv._2)) + } + + val defaultSqlOptions: Map[String, String] = { + HoodieOptionConfig.getClass.getDeclaredFields + .filter(f => f.getType == classOf[HoodieSQLOption[_]]) + .map(f => {f.setAccessible(true); f.get(HoodieOptionConfig).asInstanceOf[HoodieSQLOption[_]]}) + .filter(option => option.tableConfigKey.isDefined && option.defaultValue.isDefined) + .map(option => option.sqlKeyName -> option.defaultValue.get.toString) + .toMap + } + + /** + * Get the primary key from the table options. + * @param options + * @return + */ + def getPrimaryColumns(options: Map[String, String]): Array[String] = { + val params = mappingSqlOptionToHoodieParam(options) + params.get(DataSourceWriteOptions.RECORDKEY_FIELD.key) + .map(_.split(",").filter(_.nonEmpty)) + .getOrElse(Array.empty) + } + + /** + * Get the table type from the table options. + * @param options + * @return + */ + def getTableType(options: Map[String, String]): String = { + val params = mappingSqlOptionToHoodieParam(options) + params.getOrElse(DataSourceWriteOptions.TABLE_TYPE.key, + DataSourceWriteOptions.TABLE_TYPE.defaultValue) + } + + def getPreCombineField(options: Map[String, String]): Option[String] = { + val params = mappingSqlOptionToHoodieParam(options) + params.get(DataSourceWriteOptions.PRECOMBINE_FIELD.key).filter(_.nonEmpty) + } + + def deleteHoodieOptions(options: Map[String, String]): Map[String, String] = { + options.filterNot(_._1.startsWith("hoodie.")).filterNot(kv => keyMapping.contains(kv._1)) + } + + // extract primaryKey, preCombineField, type options + def extractSqlOptions(options: Map[String, String]): Map[String, String] = { + val sqlOptions = mappingTableConfigToSqlOption(options) + val targetOptions = keyMapping.keySet -- Set(SQL_PAYLOAD_CLASS.sqlKeyName) + sqlOptions.filterKeys(targetOptions.contains) + } + + // validate primaryKey, preCombineField and type options + def validateTable(spark: SparkSession, schema: StructType, sqlOptions: Map[String, String]): Unit = { + val resolver = spark.sessionState.conf.resolver + + // validate primary key + val primaryKeys = sqlOptions.get(SQL_KEY_TABLE_PRIMARY_KEY.sqlKeyName) + .map(_.split(",").filter(_.length > 0)) + ValidationUtils.checkArgument(primaryKeys.nonEmpty, "No `primaryKey` is specified.") + primaryKeys.get.foreach { primaryKey => + ValidationUtils.checkArgument(schema.exists(f => resolver(f.name, getRootLevelFieldName(primaryKey))), + s"Can't find primaryKey `$primaryKey` in ${schema.treeString}.") + } + + // validate preCombine key + val preCombineKey = sqlOptions.get(SQL_KEY_PRECOMBINE_FIELD.sqlKeyName) + if (preCombineKey.isDefined && preCombineKey.get.nonEmpty) { + ValidationUtils.checkArgument(schema.exists(f => resolver(f.name, getRootLevelFieldName(preCombineKey.get))), + s"Can't find preCombineKey `${preCombineKey.get}` in ${schema.treeString}.") + } + + // validate table type + val tableType = sqlOptions.get(SQL_KEY_TABLE_TYPE.sqlKeyName) + ValidationUtils.checkArgument(tableType.nonEmpty, "No `type` is specified.") + ValidationUtils.checkArgument( + tableType.get.equalsIgnoreCase(SQL_VALUE_TABLE_TYPE_COW) || + tableType.get.equalsIgnoreCase(SQL_VALUE_TABLE_TYPE_MOR), + s"'type' must be '$SQL_VALUE_TABLE_TYPE_COW' or '$SQL_VALUE_TABLE_TYPE_MOR'") + } + + def buildConf[T](): HoodieSQLOptionBuilder[T] = { + new HoodieSQLOptionBuilder[T] + } +} + +case 
class HoodieSQLOption[T]( + sqlKeyName: String, + hoodieKeyName: String, + tableConfigKey: Option[String], + defaultValue: Option[T] +) + +class HoodieSQLOptionBuilder[T] { + + private var sqlKeyName: String = _ + private var hoodieKeyName: String =_ + private var tableConfigKey: String =_ + private var defaultValue: T =_ + + def withSqlKey(sqlKeyName: String): HoodieSQLOptionBuilder[T] = { + this.sqlKeyName = sqlKeyName + this + } + + def withHoodieKey(hoodieKeyName: String): HoodieSQLOptionBuilder[T] = { + this.hoodieKeyName = hoodieKeyName + this + } + + def withTableConfigKey(tableConfigKey: String): HoodieSQLOptionBuilder[T] = { + this.tableConfigKey = tableConfigKey + this + } + + def defaultValue(defaultValue: T): HoodieSQLOptionBuilder[T] = { + this.defaultValue = defaultValue + this + } + + def build(): HoodieSQLOption[T] = { + HoodieSQLOption(sqlKeyName, hoodieKeyName, Option(tableConfigKey), Option(defaultValue)) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala new file mode 100644 index 0000000000000..025a224373aed --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieMetadataConfig} +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstantTimeGenerator} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.PartitionPathEncodeUtils +import org.apache.hudi.{AvroConversionUtils, DataSourceOptionsHelper, DataSourceReadOptions, SparkAdapterSupport} +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable} +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, SparkSession} + +import java.net.URI +import java.text.SimpleDateFormat +import java.util.{Locale, Properties} +import scala.collection.JavaConverters._ +import scala.collection.immutable.Map + +object HoodieSqlCommonUtils extends SparkAdapterSupport { + // NOTE: {@code SimpleDataFormat} is NOT thread-safe + // TODO replace w/ DateTimeFormatter + private val defaultDateFormat = + ThreadLocal.withInitial(new java.util.function.Supplier[SimpleDateFormat] { + override def get() = new SimpleDateFormat("yyyy-MM-dd") + }) + + def getTableIdentifier(table: LogicalPlan): TableIdentifier = { + table match { + case SubqueryAlias(name, _) => sparkAdapter.getCatalystPlanUtils.toTableIdentifier(name) + case _ => throw new IllegalArgumentException(s"Illegal table: $table") + } + } + + def getTableSqlSchema(metaClient: HoodieTableMetaClient, + includeMetadataFields: Boolean = false): Option[StructType] = { + val schemaResolver = new TableSchemaResolver(metaClient) + val avroSchema = try Some(schemaResolver.getTableAvroSchema(includeMetadataFields)) + catch { + case _: Throwable => None + } + avroSchema.map(AvroConversionUtils.convertAvroSchemaToStructType) + } + + def getAllPartitionPaths(spark: SparkSession, table: CatalogTable): Seq[String] = { + val sparkEngine = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) + val metadataConfig = { + val properties = new Properties() + properties.putAll((spark.sessionState.conf.getAllConfs ++ table.storage.properties ++ table.properties).asJava) + HoodieMetadataConfig.newBuilder.fromProperties(properties).build() + } + FSUtils.getAllPartitionPaths(sparkEngine, metadataConfig, getTableLocation(table, spark)).asScala + } + + /** + * This method is used to compatible with the old non-hive-styled partition table. + * By default we enable the "hoodie.datasource.write.hive_style_partitioning" + * when writing data to hudi table by spark sql by default. + * If the exist table is a non-hive-styled partitioned table, we should + * disable the "hoodie.datasource.write.hive_style_partitioning" when + * merge or update the table. Or else, we will get an incorrect merge result + * as the partition path mismatch. 
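+ * For example, a hive-styled partition path looks like `year=2022/month=07`, while the corresponding + * non-hive-styled path is just `2022/07` (illustrative partition columns).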
+ */ + def isHiveStyledPartitioning(partitionPaths: Seq[String], table: CatalogTable): Boolean = { + if (table.partitionColumnNames.nonEmpty) { + val isHiveStylePartitionPath = (path: String) => { + val fragments = path.split("/") + if (fragments.size != table.partitionColumnNames.size) { + false + } else { + fragments.zip(table.partitionColumnNames).forall { + case (pathFragment, partitionColumn) => pathFragment.startsWith(s"$partitionColumn=") + } + } + } + partitionPaths.forall(isHiveStylePartitionPath) + } else { + true + } + } + + /** + * Determine whether URL encoding is enabled + */ + def isUrlEncodeEnabled(partitionPaths: Seq[String], table: CatalogTable): Boolean = { + if (table.partitionColumnNames.nonEmpty) { + partitionPaths.forall(partitionPath => partitionPath.split("/").length == table.partitionColumnNames.size) + } else { + false + } + } + + private def tripAlias(plan: LogicalPlan): LogicalPlan = { + plan match { + case SubqueryAlias(_, relation: LogicalPlan) => + tripAlias(relation) + case other => + other + } + } + + /** + * Add the hoodie meta fields to the schema. + * @param schema + * @return + */ + def addMetaFields(schema: StructType): StructType = { + val metaFields = HoodieRecord.HOODIE_META_COLUMNS.asScala + // filter the meta field to avoid duplicate field. + val dataFields = schema.fields.filterNot(f => metaFields.contains(f.name)) + val fields = metaFields.map(StructField(_, StringType)) ++ dataFields + StructType(fields) + } + + private lazy val metaFields = HoodieRecord.HOODIE_META_COLUMNS.asScala.toSet + + /** + * Remove the meta fields from the schema. + * @param schema + * @return + */ + def removeMetaFields(schema: StructType): StructType = { + StructType(schema.fields.filterNot(f => isMetaField(f.name))) + } + + def isMetaField(name: String): Boolean = { + metaFields.contains(name) + } + + def removeMetaFields(df: DataFrame): DataFrame = { + val withoutMetaColumns = df.logicalPlan.output + .filterNot(attr => isMetaField(attr.name)) + .map(new Column(_)) + if (withoutMetaColumns.length != df.logicalPlan.output.size) { + df.select(withoutMetaColumns: _*) + } else { + df + } + } + + def removeMetaFields(attrs: Seq[Attribute]): Seq[Attribute] = { + attrs.filterNot(attr => isMetaField(attr.name)) + } + + /** + * Get the table location. 
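+ * Falls back to the session catalog's default table path when the table defines no explicit location.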
+ * @param tableId + * @param spark + * @return + */ + def getTableLocation(tableId: TableIdentifier, spark: SparkSession): String = { + val table = spark.sessionState.catalog.getTableMetadata(tableId) + getTableLocation(table, spark) + } + + def getTableLocation(properties: Map[String, String], identifier: TableIdentifier, sparkSession: SparkSession): String = { + val location: Option[String] = Some(properties.getOrElse("location", "")) + val isManaged = location.isEmpty || location.get.isEmpty + val uri = if (isManaged) { + Some(sparkSession.sessionState.catalog.defaultTablePath(identifier)) + } else { + Some(new Path(location.get).toUri) + } + getTableLocation(uri, identifier, sparkSession) + } + + def getTableLocation(table: CatalogTable, sparkSession: SparkSession): String = { + val uri = table.storage.locationUri.orElse { + Some(sparkSession.sessionState.catalog.defaultTablePath(table.identifier)) + } + getTableLocation(uri, table.identifier, sparkSession) + } + + def getTableLocation(uri: Option[URI], identifier: TableIdentifier, sparkSession: SparkSession): String = { + val conf = sparkSession.sessionState.newHadoopConf() + uri.map(makePathQualified(_, conf)) + .map(removePlaceHolder) + .getOrElse(throw new IllegalArgumentException(s"Missing location for $identifier")) + } + + private def removePlaceHolder(path: String): String = { + if (path == null || path.length == 0) { + path + } else if (path.endsWith("-__PLACEHOLDER__")) { + path.substring(0, path.length() - 16) + } else { + path + } + } + + def makePathQualified(path: URI, hadoopConf: Configuration): String = { + val hadoopPath = new Path(path) + val fs = hadoopPath.getFileSystem(hadoopConf) + fs.makeQualified(hadoopPath).toUri.toString + } + + /** + * Check if the hoodie.properties exists in the table path. + */ + def tableExistsInPath(tablePath: String, conf: Configuration): Boolean = { + val basePath = new Path(tablePath) + val fs = basePath.getFileSystem(conf) + val metaPath = new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME) + fs.exists(metaPath) + } + + /** + * Split the expression to a sub expression seq by the AND operation. + * @param expression + * @return + */ + def splitByAnd(expression: Expression): Seq[Expression] = { + expression match { + case And(left, right) => + splitByAnd(left) ++ splitByAnd(right) + case exp => Seq(exp) + } + } + + /** + * Append the spark config and table options to the baseConfig. + */ + def withSparkConf(spark: SparkSession, options: Map[String, String]) + (baseConfig: Map[String, String] = Map.empty): Map[String, String] = { + baseConfig ++ DFSPropertiesConfiguration.getGlobalProps.asScala ++ // Table options has the highest priority + (spark.sessionState.conf.getAllConfs ++ HoodieOptionConfig.mappingSqlOptionToHoodieParam(options)) + .filterKeys(isHoodieConfigKey) + } + + /** + * Check if Sql options are Hoodie Config keys. + * + * TODO: standardize the key prefix so that we don't need this helper (HUDI-4935) + */ + def isHoodieConfigKey(key: String): Boolean = + key.startsWith("hoodie.") || key == DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key + + /** + * Checks whether Spark is using Hive as Session's Catalog + */ + def isUsingHiveCatalog(sparkSession: SparkSession): Boolean = + sparkSession.sessionState.conf.getConf(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive" + + /** + * Convert different query instant time format to the commit time format. 
+ * Currently we support three kinds of instant time format for time travel queries: + * 1. yyyy-MM-dd HH:mm:ss + * 2. yyyy-MM-dd (this will be converted to 'yyyyMMdd000000') + * 3. yyyyMMddHHmmss + */ + def formatQueryInstant(queryInstant: String): String = { + val instantLength = queryInstant.length + if (instantLength == 19 || instantLength == 23) { // for yyyy-MM-dd HH:mm:ss[.SSS] + HoodieInstantTimeGenerator.getInstantForDateString(queryInstant) + } else if (instantLength == HoodieInstantTimeGenerator.SECS_INSTANT_ID_LENGTH + || instantLength == HoodieInstantTimeGenerator.MILLIS_INSTANT_ID_LENGTH) { // for yyyyMMddHHmmss[SSS] + HoodieActiveTimeline.parseDateFromInstantTime(queryInstant) // validate the format + queryInstant + } else if (instantLength == 10) { // for yyyy-MM-dd + HoodieActiveTimeline.formatDate(defaultDateFormat.get().parse(queryInstant)) + } else { + throw new IllegalArgumentException(s"Unsupported query instant time format: $queryInstant," + + s" Supported time formats are: 'yyyy-MM-dd HH:mm:ss[.SSS]', 'yyyy-MM-dd' or 'yyyyMMddHHmmss[SSS]'") + } + } + + def formatName(sparkSession: SparkSession, name: String): String = { + if (sparkSession.sessionState.conf.caseSensitiveAnalysis) name else name.toLowerCase(Locale.ROOT) + } + + /** + * Check if this is an empty table path. + */ + def isEmptyPath(tablePath: String, conf: Configuration): Boolean = { + val basePath = new Path(tablePath) + val fs = basePath.getFileSystem(conf) + if (fs.exists(basePath)) { + fs.listStatus(basePath).isEmpty + } else { + true + } + } + + // Find the original column in the schema by column name; returns None if the column + // reference is invalid. + def findColumnByName(schema: StructType, name: String, resolver: Resolver): Option[StructField] = { + schema.fields.collectFirst { + case field if resolver(field.name, name) => field + } + } + + // Compare a [[StructField]] to another, return true if they have the same column + // name (by resolver) and dataType.
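+ // e.g. with Spark's default case-insensitive resolver, ("ID", IntegerType) and ("id", IntegerType) compare as equal.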
+ def columnEqual(field: StructField, other: StructField, resolver: Resolver): Boolean = { + resolver(field.name, other.name) && field.dataType == other.dataType + } + + def castIfNeeded(child: Expression, dataType: DataType, conf: SQLConf): Expression = { + child match { + case Literal(nul, NullType) => Literal(nul, dataType) + case expr if child.dataType != dataType => Cast(expr, dataType, Option(conf.sessionLocalTimeZone)) + case _ => child + } + } + + def normalizePartitionSpec[T]( + partitionSpec: Map[String, T], + partColNames: Seq[String], + tblName: String, + resolver: Resolver): Map[String, T] = { + val normalizedPartSpec = partitionSpec.toSeq.map { case (key, value) => + val normalizedKey = partColNames.find(resolver(_, key)).getOrElse { + throw new AnalysisException(s"$key is not a valid partition column in table $tblName.") + } + normalizedKey -> value + } + + if (normalizedPartSpec.size < partColNames.size) { + throw new AnalysisException( + "All partition columns need to be specified for Hoodie's partition") + } + + val lowerPartColNames = partColNames.map(_.toLowerCase) + if (lowerPartColNames.distinct.length != lowerPartColNames.length) { + val duplicateColumns = lowerPartColNames.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => s"`$x`" + } + throw new AnalysisException( + s"Found duplicate column(s) in the partition schema: ${duplicateColumns.mkString(", ")}") + } + + normalizedPartSpec.toMap + } + + def getPartitionPathToDrop( + hoodieCatalogTable: HoodieCatalogTable, + normalizedSpecs: Seq[Map[String, String]]): String = { + val table = hoodieCatalogTable.table + val allPartitionPaths = hoodieCatalogTable.getPartitionPaths + val enableHiveStylePartitioning = isHiveStyledPartitioning(allPartitionPaths, table) + val enableEncodeUrl = isUrlEncodeEnabled(allPartitionPaths, table) + val partitionsToDrop = normalizedSpecs.map { spec => + hoodieCatalogTable.partitionFields.map { partitionColumn => + val encodedPartitionValue = if (enableEncodeUrl) { + PartitionPathEncodeUtils.escapePathName(spec(partitionColumn)) + } else { + spec(partitionColumn) + } + if (enableHiveStylePartitioning) { + partitionColumn + "=" + encodedPartitionValue + } else { + encodedPartitionValue + } + }.mkString("/") + }.mkString(",") + partitionsToDrop + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala new file mode 100644 index 0000000000000..61acdf866102b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.config.TypedProperties +import org.apache.hudi.common.model.{OverwriteWithLatestAvroPayload, WriteOperationType} +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME +import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} +import org.apache.hudi.hive.ddl.HiveSyncMode +import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, MultiPartKeysValueExtractor} +import org.apache.hudi.keygen.ComplexKeyGenerator +import org.apache.hudi.sql.InsertMode +import org.apache.hudi.sync.common.HoodieSyncConfig +import org.apache.hudi.{DataSourceWriteOptions, HoodieWriterUtils} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.hive.HiveExternalCatalog +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{isUsingHiveCatalog, withSparkConf} +import org.apache.spark.sql.hudi.command.{SqlKeyGenerator, ValidateDuplicateKeyPayload} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType + +import java.util.Locale +import scala.collection.JavaConverters._ + +trait ProvidesHoodieConfig extends Logging { + + def buildHoodieConfig(hoodieCatalogTable: HoodieCatalogTable): Map[String, String] = { + val sparkSession: SparkSession = hoodieCatalogTable.spark + val catalogProperties = hoodieCatalogTable.catalogProperties + val tableConfig = hoodieCatalogTable.tableConfig + + // NOTE: Here we fallback to "" to make sure that null value is not overridden with + // default value ("ts") + // TODO(HUDI-3456) clean up + val preCombineField = Option(tableConfig.getPreCombineField).getOrElse("") + + require(hoodieCatalogTable.primaryKeys.nonEmpty, + s"There are no primary key in table ${hoodieCatalogTable.table.identifier}, cannot execute update operator") + + val hoodieProps = getHoodieProps(catalogProperties, tableConfig, sparkSession.sqlContext.conf) + + val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable) + + withSparkConf(sparkSession, catalogProperties) { + Map.apply( + "path" -> hoodieCatalogTable.tableLocation, + RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), + TBL_NAME.key -> hoodieCatalogTable.tableName, + PRECOMBINE_FIELD.key -> preCombineField, + HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning, + KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, + SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, + OPERATION.key -> UPSERT_OPERATION_OPT_VAL, + PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, + HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()), + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME), + HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME), + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key 
-> tableConfig.getPartitionFieldProp, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS), + HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString, + HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"), + SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL + ) + .filter { case(_, v) => v != null } + } + } + + /** + * Build the default config for insert. + * + * @return + */ + def buildHoodieInsertConfig(hoodieCatalogTable: HoodieCatalogTable, + sparkSession: SparkSession, + isOverwrite: Boolean, + insertPartitions: Map[String, Option[String]] = Map.empty, + extraOptions: Map[String, String]): Map[String, String] = { + + if (insertPartitions.nonEmpty && + (insertPartitions.keys.toSet != hoodieCatalogTable.partitionFields.toSet)) { + throw new IllegalArgumentException(s"Insert partition fields" + + s"[${insertPartitions.keys.mkString(" ")}]" + + s" not equal to the defined partition in table[${hoodieCatalogTable.partitionFields.mkString(",")}]") + } + val path = hoodieCatalogTable.tableLocation + val tableType = hoodieCatalogTable.tableTypeName + val tableConfig = hoodieCatalogTable.tableConfig + val catalogProperties = hoodieCatalogTable.catalogProperties + + val hoodieProps = getHoodieProps(catalogProperties, tableConfig, sparkSession.sqlContext.conf, extraOptions) + val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable) + + val parameters = withSparkConf(sparkSession, catalogProperties)() + + val partitionFieldsStr = hoodieCatalogTable.partitionFields.mkString(",") + + // NOTE: Here we fallback to "" to make sure that null value is not overridden with + // default value ("ts") + // TODO(HUDI-3456) clean up + val preCombineField = hoodieCatalogTable.preCombineKey.getOrElse("") + + val hiveStylePartitioningEnable = Option(tableConfig.getHiveStylePartitioningEnable).getOrElse("true") + val urlEncodePartitioning = Option(tableConfig.getUrlEncodePartitioning).getOrElse("false") + val keyGeneratorClassName = Option(tableConfig.getKeyGeneratorClassName) + .getOrElse(classOf[ComplexKeyGenerator].getCanonicalName) + + val enableBulkInsert = parameters.getOrElse(DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key, + DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.defaultValue()).toBoolean || + parameters.get(DataSourceWriteOptions.OPERATION.key).exists(_.equalsIgnoreCase(WriteOperationType.BULK_INSERT.value)) + val dropDuplicate = sparkSession.conf + .getOption(INSERT_DROP_DUPS.key).getOrElse(INSERT_DROP_DUPS.defaultValue).toBoolean + + val insertMode = InsertMode.of(parameters.getOrElse(DataSourceWriteOptions.SQL_INSERT_MODE.key, + DataSourceWriteOptions.SQL_INSERT_MODE.defaultValue())) + val isNonStrictMode = insertMode == InsertMode.NON_STRICT + val isPartitionedTable = hoodieCatalogTable.partitionFields.nonEmpty + val hasPrecombineColumn = hoodieCatalogTable.preCombineKey.nonEmpty + val operation = + (enableBulkInsert, isOverwrite, dropDuplicate, isNonStrictMode, isPartitionedTable) match { + case (true, _, _, false, _) => + throw new IllegalArgumentException(s"Table with primaryKey can not use bulk insert in ${insertMode.value()} mode.") + case (true, true, _, _, true) => + throw new IllegalArgumentException(s"Insert Overwrite Partition can not use bulk insert.") + case (true, _, true, _, _) => + 
throw new IllegalArgumentException(s"Bulk insert cannot support drop duplication." + + s" Please disable $INSERT_DROP_DUPS and try again.") + // if enableBulkInsert is true, use bulk insert for the insert overwrite non-partitioned table. + case (true, true, _, _, false) => BULK_INSERT_OPERATION_OPT_VAL + // insert overwrite table + case (false, true, _, _, false) => INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL + // insert overwrite partition + case (_, true, _, _, true) => INSERT_OVERWRITE_OPERATION_OPT_VAL + // disable dropDuplicate, and provide preCombineKey, use the upsert operation for strict and upsert mode. + case (false, false, false, false, _) if hasPrecombineColumn => UPSERT_OPERATION_OPT_VAL + // if table is pk table and has enableBulkInsert use bulk insert for non-strict mode. + case (true, _, _, true, _) => BULK_INSERT_OPERATION_OPT_VAL + // for the rest case, use the insert operation + case _ => INSERT_OPERATION_OPT_VAL + } + + val payloadClassName = if (operation == UPSERT_OPERATION_OPT_VAL && + tableType == COW_TABLE_TYPE_OPT_VAL && insertMode == InsertMode.STRICT) { + // Validate duplicate key for COW, for MOR it will do the merge with the DefaultHoodieRecordPayload + // on reading. + classOf[ValidateDuplicateKeyPayload].getCanonicalName + } else if (operation == INSERT_OPERATION_OPT_VAL && tableType == COW_TABLE_TYPE_OPT_VAL && + insertMode == InsertMode.STRICT){ + // Validate duplicate key for inserts to COW table when using strict insert mode. + classOf[ValidateDuplicateKeyPayload].getCanonicalName + } else { + classOf[OverwriteWithLatestAvroPayload].getCanonicalName + } + + + logInfo(s"Insert statement use write operation type: $operation, payloadClass: $payloadClassName") + + withSparkConf(sparkSession, catalogProperties) { + Map( + "path" -> path, + TABLE_TYPE.key -> tableType, + TBL_NAME.key -> hoodieCatalogTable.tableName, + OPERATION.key -> operation, + HIVE_STYLE_PARTITIONING.key -> hiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> urlEncodePartitioning, + KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, + SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> keyGeneratorClassName, + RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), + PRECOMBINE_FIELD.key -> preCombineField, + PARTITIONPATH_FIELD.key -> partitionFieldsStr, + PAYLOAD_CLASS_NAME.key -> payloadClassName, + ENABLE_ROW_WRITER.key -> enableBulkInsert.toString, + HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn), + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFieldsStr, + HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()), + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME), + HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME), + HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS), + HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key 
-> hoodieProps.getString(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "200"), + HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"), + SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL + ) + .filter { case (_, v) => v != null } + } + } + + def buildHoodieDropPartitionsConfig( + sparkSession: SparkSession, + hoodieCatalogTable: HoodieCatalogTable, + partitionsToDrop: String): Map[String, String] = { + val partitionFields = hoodieCatalogTable.partitionFields.mkString(",") + val catalogProperties = hoodieCatalogTable.catalogProperties + val tableConfig = hoodieCatalogTable.tableConfig + + val hoodieProps = getHoodieProps(catalogProperties, tableConfig, sparkSession.sqlContext.conf) + val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable) + + withSparkConf(sparkSession, catalogProperties) { + Map( + "path" -> hoodieCatalogTable.tableLocation, + TBL_NAME.key -> hoodieCatalogTable.tableName, + TABLE_TYPE.key -> hoodieCatalogTable.tableTypeName, + OPERATION.key -> DataSourceWriteOptions.DELETE_PARTITION_OPERATION_OPT_VAL, + PARTITIONS_TO_DELETE.key -> partitionsToDrop, + RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), + PRECOMBINE_FIELD.key -> hoodieCatalogTable.preCombineKey.getOrElse(""), + PARTITIONPATH_FIELD.key -> partitionFields, + HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()), + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME), + HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME), + HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString, + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFields, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS) + ) + .filter { case (_, v) => v != null } + } + } + + def buildHoodieDeleteTableConfig(hoodieCatalogTable: HoodieCatalogTable, + sparkSession: SparkSession): Map[String, String] = { + val path = hoodieCatalogTable.tableLocation + val catalogProperties = hoodieCatalogTable.catalogProperties + val tableConfig = hoodieCatalogTable.tableConfig + val tableSchema = hoodieCatalogTable.tableSchema + val partitionColumns = tableConfig.getPartitionFieldProp.split(",").map(_.toLowerCase(Locale.ROOT)) + val partitionSchema = StructType(tableSchema.filter(f => partitionColumns.contains(f.name))) + + assert(hoodieCatalogTable.primaryKeys.nonEmpty, + s"There are no primary key defined in table ${hoodieCatalogTable.table.identifier}, cannot execute delete operation") + + val hoodieProps = getHoodieProps(catalogProperties, tableConfig, sparkSession.sqlContext.conf) + val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable) + + val options = hoodieCatalogTable.catalogProperties + val enableHive = isUsingHiveCatalog(sparkSession) + val partitionFields = hoodieCatalogTable.partitionFields.mkString(",") + + withSparkConf(sparkSession, options) { + Map( + "path" 
-> path, + RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), + TBL_NAME.key -> tableConfig.getTableName, + HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning, + KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, + SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, + OPERATION.key -> DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL, + PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, + HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()), + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME), + HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME), + HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString, + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFields, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS), + HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key, "200"), + SqlKeyGenerator.PARTITION_SCHEMA -> partitionSchema.toDDL + ) + } + } + + def getHoodieProps(catalogProperties: Map[String, String], tableConfig: HoodieTableConfig, conf: SQLConf, extraOptions: Map[String, String] = Map.empty): TypedProperties = { + val options: Map[String, String] = catalogProperties ++ tableConfig.getProps.asScala.toMap ++ conf.getAllConfs ++ extraOptions + val hoodieConfig = HoodieWriterUtils.convertMapToHoodieConfig(options) + hoodieConfig.getProps + } + + def buildHiveSyncConfig( + props: TypedProperties, + hoodieCatalogTable: HoodieCatalogTable, + sparkSession: SparkSession = SparkSession.active): HiveSyncConfig = { + // Enable the hive sync by default if spark have enable the hive metastore. 
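+ // i.e. meta/hive sync is switched on automatically when spark.sql.catalogImplementation is set to "hive".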
+ val enableHive = isUsingHiveCatalog(sparkSession) + val hiveSyncConfig: HiveSyncConfig = new HiveSyncConfig(props) + hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_ENABLED.key, enableHive.toString) + hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key, enableHive.toString) + hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_MODE.key, props.getString(HiveSyncConfigHolder.HIVE_SYNC_MODE.key, HiveSyncMode.HMS.name())) + hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_BASE_PATH, hoodieCatalogTable.tableLocation) + hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, hoodieCatalogTable.baseFileFormat) + hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME, hoodieCatalogTable.table.identifier.database.getOrElse("default")) + hiveSyncConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME, hoodieCatalogTable.table.identifier.table) + if (props.get(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key) != null) { + hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS, props.getString(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key)) + } + hiveSyncConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS, classOf[MultiPartKeysValueExtractor].getName) + hiveSyncConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE, "true") + if (hiveSyncConfig.useBucketSync()) + hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC, + HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key), + props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key))) + if (props.containsKey(HiveExternalCatalog.CREATED_SPARK_VERSION)) + hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_SPARK_VERSION, + props.getString(HiveExternalCatalog.CREATED_SPARK_VERSION)) + hiveSyncConfig + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala new file mode 100644 index 0000000000000..631644121c133 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.common.util.BinaryUtil +import org.apache.spark.SparkConf +import org.apache.spark.serializer.{KryoSerializer, SerializerInstance} + +import java.nio.ByteBuffer + + +object SerDeUtils { + + private val SERIALIZER_THREAD_LOCAL = new ThreadLocal[SerializerInstance] { + + override protected def initialValue: SerializerInstance = { + new KryoSerializer(new SparkConf(true)).newInstance() + } + } + + def toBytes(o: Any): Array[Byte] = { + val buf = SERIALIZER_THREAD_LOCAL.get.serialize(o) + BinaryUtil.toBytes(buf) + } + + def toObject(bytes: Array[Byte]): Any = { + SERIALIZER_THREAD_LOCAL.get.deserialize(ByteBuffer.wrap(bytes)) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableAddColumnsCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableAddColumnsCommand.scala new file mode 100644 index 0000000000000..9852f296d3360 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableAddColumnsCommand.scala @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import java.nio.charset.StandardCharsets + +import org.apache.avro.Schema +import org.apache.hudi.common.model.{HoodieCommitMetadata, WriteOperationType} +import org.apache.hudi.common.table.timeline.HoodieInstant.State +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstant} +import org.apache.hudi.common.util.{CommitUtils, Option} +import org.apache.hudi.table.HoodieSparkTable +import org.apache.hudi.{AvroConversionUtils, DataSourceUtils, HoodieWriterUtils} +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.{AnalysisException, Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable} +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.util.SchemaUtils + +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * Command for add new columns to the hudi table. 
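+ * e.g. `ALTER TABLE h0 ADD COLUMNS (new_col STRING)` (table and column names here are placeholders).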
+ */ +case class AlterHoodieTableAddColumnsCommand( + tableId: TableIdentifier, + colsToAdd: Seq[StructField]) + extends HoodieLeafRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + if (colsToAdd.nonEmpty) { + val resolver = sparkSession.sessionState.conf.resolver + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableId) + val tableSchema = hoodieCatalogTable.tableSchema + val existsColumns = + colsToAdd.map(_.name).filter(col => tableSchema.fieldNames.exists(f => resolver(f, col))) + + if (existsColumns.nonEmpty) { + throw new AnalysisException(s"Columns: [${existsColumns.mkString(",")}] already exists in the table," + + s" table columns is: [${hoodieCatalogTable.tableSchemaWithoutMetaFields.fieldNames.mkString(",")}]") + } + // Get the new schema + val rearrangedSchema = hoodieCatalogTable.dataSchema ++ colsToAdd ++ hoodieCatalogTable.partitionSchema + val newSqlSchema = StructType(rearrangedSchema) + val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tableId.table) + val newSchema = AvroConversionUtils.convertStructTypeToAvroSchema(newSqlSchema, structName, nameSpace) + + // Commit with new schema to change the table schema + AlterHoodieTableAddColumnsCommand.commitWithSchema(newSchema, hoodieCatalogTable, sparkSession) + + // Refresh the new schema to meta + val newDataSchema = StructType(hoodieCatalogTable.dataSchema.fields ++ colsToAdd) + refreshSchemaInMeta(sparkSession, hoodieCatalogTable.table, newDataSchema) + } + Seq.empty[Row] + } + + private def refreshSchemaInMeta(sparkSession: SparkSession, table: CatalogTable, + newSqlDataSchema: StructType): Unit = { + try { + sparkSession.catalog.uncacheTable(tableId.quotedString) + } catch { + case NonFatal(e) => + log.warn(s"Exception when attempting to uncache table ${tableId.quotedString}", e) + } + sparkSession.catalog.refreshTable(table.identifier.unquotedString) + + SchemaUtils.checkColumnNameDuplication( + newSqlDataSchema.map(_.name), + "in the table definition of " + table.identifier, + conf.caseSensitiveAnalysis) + + sparkSession.sessionState.catalog.alterTableDataSchema(tableId, newSqlDataSchema) + } +} + +object AlterHoodieTableAddColumnsCommand { + /** + * Generate an empty commit with new schema to change the table's schema. + * @param schema The new schema to commit. + * @param hoodieCatalogTable The hoodie catalog table. + * @param sparkSession The spark session. 
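+ * Note: no data is written here; an empty commit carrying the ALTER_SCHEMA operation type is added to the timeline.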
+ */ + def commitWithSchema(schema: Schema, hoodieCatalogTable: HoodieCatalogTable, + sparkSession: SparkSession): Unit = { + + val jsc = new JavaSparkContext(sparkSession.sparkContext) + val client = DataSourceUtils.createHoodieClient( + jsc, + schema.toString, + hoodieCatalogTable.tableLocation, + hoodieCatalogTable.tableName, + HoodieWriterUtils.parametersWithWriteDefaults(hoodieCatalogTable.catalogProperties).asJava + ) + + val commitActionType = CommitUtils.getCommitActionType(WriteOperationType.ALTER_SCHEMA, hoodieCatalogTable.tableType) + val instantTime = HoodieActiveTimeline.createNewInstantTime + + client.startCommitWithTime(instantTime, commitActionType) + client.preWrite(instantTime, WriteOperationType.ALTER_SCHEMA, hoodieCatalogTable.metaClient) + + val hoodieTable = HoodieSparkTable.create(client.getConfig, client.getEngineContext) + val timeLine = hoodieTable.getActiveTimeline + val requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime) + val metadata = new HoodieCommitMetadata + metadata.setOperationType(WriteOperationType.ALTER_SCHEMA) + timeLine.transitionRequestedToInflight(requested, Option.of(metadata.toJsonString.getBytes(StandardCharsets.UTF_8))) + + client.commit(instantTime, jsc.emptyRDD) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableChangeColumnCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableChangeColumnCommand.scala new file mode 100644 index 0000000000000..3aa5ca945486e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableChangeColumnCommand.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.avro.Schema +import org.apache.hudi.AvroConversionUtils +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.exception.HoodieException + +import org.apache.spark.sql.{AnalysisException, Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ +import org.apache.spark.sql.types.{StructField, StructType} + +import scala.util.control.NonFatal + +/** + * Command for alter hudi table's column type. 
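+ * e.g. `ALTER TABLE h0 CHANGE COLUMN id id INT COMMENT 'record key'` (placeholder names); note that the + * column name and data type must stay the same, which is enforced by the check below.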
+ */ +case class AlterHoodieTableChangeColumnCommand( + tableIdentifier: TableIdentifier, + columnName: String, + newColumn: StructField) + extends HoodieLeafRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val resolver = sparkSession.sessionState.conf.resolver + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier) + + // Find the origin column from dataSchema by column name. + val originColumn = findColumnByName(hoodieCatalogTable.dataSchema, columnName, resolver).getOrElse( + throw new AnalysisException(s"Can't find column `$columnName` given table data columns " + + s"${hoodieCatalogTable.dataSchema.fieldNames.mkString("[`", "`, `", "`]")}") + ) + // Throw an AnalysisException if the column name/dataType is changed. + if (!columnEqual(originColumn, newColumn, resolver)) { + throw new AnalysisException( + "ALTER TABLE CHANGE COLUMN is not supported for changing column " + + s"'${originColumn.name}' with type '${originColumn.dataType}' to " + + s"'${newColumn.name}' with type '${newColumn.dataType}'") + } + + // Get the new schema + val newTableSchema = StructType( + hoodieCatalogTable.tableSchema.fields.map { field => + if (field.name == originColumn.name) { + newColumn + } else { + field + } + }) + val newDataSchema = StructType( + hoodieCatalogTable.dataSchema.fields.map { field => + if (field.name == columnName) { + newColumn + } else { + field + } + }) + val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tableIdentifier.table) + val newSchema = AvroConversionUtils.convertStructTypeToAvroSchema(newTableSchema, structName, nameSpace) + + // Validate the compatibility between new schema and origin schema. + validateSchema(newSchema, hoodieCatalogTable.metaClient) + // Commit new schema to change the table schema + AlterHoodieTableAddColumnsCommand.commitWithSchema(newSchema, hoodieCatalogTable, sparkSession) + + try { + sparkSession.catalog.uncacheTable(tableIdentifier.quotedString) + } catch { + case NonFatal(e) => + log.warn(s"Exception when attempting to uncache table ${tableIdentifier.quotedString}", e) + } + sparkSession.catalog.refreshTable(tableIdentifier.unquotedString) + // Change the schema in the meta using new data schema. + sparkSession.sessionState.catalog.alterTableDataSchema(tableIdentifier, newDataSchema) + + Seq.empty[Row] + } + + private def validateSchema(newSchema: Schema, metaClient: HoodieTableMetaClient): Unit = { + val schemaUtil = new TableSchemaResolver(metaClient) + val tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaUtil.getTableAvroSchemaWithoutMetadataFields) + if (!TableSchemaResolver.isSchemaCompatible(tableSchema, newSchema)) { + throw new HoodieException("Failed schema compatibility check for newSchema :" + newSchema + + ", origin table schema :" + tableSchema + ", base path :" + metaClient.getBasePath) + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala new file mode 100644 index 0000000000000..c6aa2e7aedacc --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.HoodieSparkSqlWriter +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.execution.command.DDLUtils +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ +import org.apache.spark.sql.hudi.ProvidesHoodieConfig +import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} + +case class AlterHoodieTableDropPartitionCommand( + tableIdentifier: TableIdentifier, + partitionSpecs: Seq[TablePartitionSpec], + ifExists : Boolean, + purge : Boolean, + retainData : Boolean) + extends HoodieLeafRunnableCommand with ProvidesHoodieConfig { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val fullTableName = s"${tableIdentifier.database}.${tableIdentifier.table}" + logInfo(s"start execute alter table drop partition command for $fullTableName") + + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier) + + if (!hoodieCatalogTable.isPartitionedTable) { + throw new AnalysisException(s"$fullTableName is a non-partitioned table that is not allowed to drop partition") + } + + DDLUtils.verifyAlterTableType( + sparkSession.sessionState.catalog, hoodieCatalogTable.table, isView = false) + + val normalizedSpecs: Seq[Map[String, String]] = partitionSpecs.map { spec => + normalizePartitionSpec( + spec, + hoodieCatalogTable.partitionFields, + hoodieCatalogTable.tableName, + sparkSession.sessionState.conf.resolver) + } + + // drop partitions to lazy clean (https://github.com/apache/hudi/pull/4489) + // delete partition files by enabling cleaner and setting retention policies. 
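+ // partitionsToDrop is a comma-separated list of partition paths, e.g. "dt=2021-01-01,dt=2021-01-02" for a hive-styled table (illustrative values).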
+ val partitionsToDrop = getPartitionPathToDrop(hoodieCatalogTable, normalizedSpecs) + val parameters = buildHoodieDropPartitionsConfig(sparkSession, hoodieCatalogTable, partitionsToDrop) + val (success, _, _, _, _, _) = HoodieSparkSqlWriter.write( + sparkSession.sqlContext, + SaveMode.Append, + parameters, + sparkSession.emptyDataFrame) + if (!success) { + throw new HoodieException("Alter table command failed") + } + + sparkSession.catalog.refreshTable(tableIdentifier.unquotedString) + logInfo(s"Finish execute alter table drop partition command for $fullTableName") + Seq.empty[Row] + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableRenameCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableRenameCommand.scala new file mode 100644 index 0000000000000..ac6bec744a0e3 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableRenameCommand.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.execution.command.{AlterTableRenameCommand, AlterTableSetPropertiesCommand} + +/** + * Command for alter hudi table's table name. + */ +case class AlterHoodieTableRenameCommand( + oldName: TableIdentifier, + newName: TableIdentifier, + isView: Boolean) + extends HoodieLeafRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + if (newName != oldName) { + val hadoopConf = sparkSession.sessionState.newHadoopConf() + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, oldName) + + // Init table with new name. + HoodieTableMetaClient.withPropertyBuilder() + .fromProperties(hoodieCatalogTable.tableConfig.getProps) + .setTableName(newName.table) + .initTable(hadoopConf, hoodieCatalogTable.tableLocation) + + // Call AlterTableRenameCommand#run to rename table in meta. 
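+ // (Spark's built-in command updates the table entry in the session catalog / Hive metastore; the hoodie.properties re-initialized above already carries the new table name.)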
+ AlterTableRenameCommand(oldName, newName, isView).run(sparkSession) + + // update table properties path in every op + if (hoodieCatalogTable.table.properties.contains("path")) { + val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(newName) + val path = catalogTable.storage.locationUri.get.getPath + AlterTableSetPropertiesCommand(newName, Map("path" -> path), isView).run(sparkSession) + } + + } + Seq.empty[Row] + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala new file mode 100644 index 0000000000000..b1d1fcc6d0dca --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hadoop.fs.Path + +import org.apache.hudi.common.model.{HoodieFileFormat, HoodieTableType} +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.hadoop.HoodieParquetInputFormat +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils +import org.apache.hudi.sync.common.util.ConfigUtils +import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport} + +import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException +import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable.needFilterProps +import org.apache.spark.sql.hive.HiveClientUtils +import org.apache.spark.sql.hive.HiveExternalCatalog._ +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isUsingHiveCatalog +import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlCommonUtils} +import org.apache.spark.sql.internal.StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{AnalysisException, Row, SparkSession} +import org.apache.spark.{SPARK_VERSION, SparkConf} + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.control.NonFatal + +/** + * Command for create hoodie table. 
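To make the intent of this command concrete, a hedged DDL sketch that would route through it (not part of the patch). The table, columns, and property values are hypothetical; an active SparkSession `spark` with the Hudi SQL extensions is assumed.

    // Assumes an existing SparkSession `spark`; all names are made up.
    spark.sql(
      """
        |CREATE TABLE IF NOT EXISTS h0 (
        |  id INT,
        |  name STRING,
        |  price DOUBLE,
        |  ts BIGINT,
        |  dt STRING
        |) USING hudi
        |PARTITIONED BY (dt)
        |TBLPROPERTIES (
        |  primaryKey = 'id',
        |  preCombineField = 'ts',
        |  type = 'cow'
        |)
        |""".stripMargin)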
+ */ +case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean) + extends HoodieLeafRunnableCommand with SparkAdapterSupport { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val tableIsExists = sparkSession.sessionState.catalog.tableExists(table.identifier) + if (tableIsExists) { + if (ignoreIfExists) { + // scalastyle:off + return Seq.empty[Row] + // scalastyle:on + } else { + throw new IllegalArgumentException(s"Table ${table.identifier.unquotedString} already exists.") + } + } + + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, table) + // check if there are conflict between table configs defined in hoodie table and properties defined in catalog. + CreateHoodieTableCommand.validateTblProperties(hoodieCatalogTable) + + val queryAsProp = hoodieCatalogTable.catalogProperties.get(ConfigUtils.IS_QUERY_AS_RO_TABLE) + if (queryAsProp.isEmpty) { + // init hoodie table for a normal table (not a ro/rt table) + hoodieCatalogTable.initHoodieTable() + } else { + if (!hoodieCatalogTable.hoodieTableExists) { + throw new AnalysisException("Creating ro/rt table need the existence of the base table.") + } + if (HoodieTableType.MERGE_ON_READ != hoodieCatalogTable.tableType) { + throw new AnalysisException("Creating ro/rt table should only apply to a mor table.") + } + } + + try { + // create catalog table for this hoodie table + CreateHoodieTableCommand.createTableInCatalog(sparkSession, hoodieCatalogTable, ignoreIfExists, queryAsProp) + } catch { + case NonFatal(e) => + logWarning(s"Failed to create catalog table in metastore: ${e.getMessage}") + } + Seq.empty[Row] + } +} + +object CreateHoodieTableCommand { + + def validateTblProperties(hoodieCatalogTable: HoodieCatalogTable): Unit = { + if (hoodieCatalogTable.hoodieTableExists) { + val originTableConfig = hoodieCatalogTable.tableConfig.getProps.asScala.toMap + val tableOptions = hoodieCatalogTable.catalogProperties + + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.PRECOMBINE_FIELD.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.PARTITION_FIELDS.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.RECORDKEY_FIELDS.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.URL_ENCODE_PARTITIONING.key) + checkTableConfigEqual(originTableConfig, tableOptions, HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key) + } + } + + def createTableInCatalog( + sparkSession: SparkSession, + hoodieCatalogTable: HoodieCatalogTable, + ignoreIfExists: Boolean, + queryAsProp: Option[String] = None): Unit = { + val table = hoodieCatalogTable.table + assert(table.tableType != CatalogTableType.VIEW) + + val catalog = sparkSession.sessionState.catalog + val path = hoodieCatalogTable.tableLocation + val tableConfig = hoodieCatalogTable.tableConfig + val properties = tableConfig.getProps.asScala.toMap + + val tableType = tableConfig.getTableType.name() + val inputFormat = tableType match { + case DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL => + classOf[HoodieParquetInputFormat].getCanonicalName + case DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL => + classOf[HoodieParquetRealtimeInputFormat].getCanonicalName + case _=> throw new IllegalArgumentException(s"UnKnow table type:$tableType") + } + val outputFormat = HoodieInputFormatUtils.getOutputFormatClassName(HoodieFileFormat.PARQUET) + val serdeFormat = 
HoodieInputFormatUtils.getSerDeClassName(HoodieFileFormat.PARQUET) + + // only parameters irrelevant to hudi can be set to storage.properties + val storageProperties = HoodieOptionConfig.deleteHoodieOptions(properties) + val newStorage = new CatalogStorageFormat( + Some(new Path(path).toUri), + Some(inputFormat), + Some(outputFormat), + Some(serdeFormat), + table.storage.compressed, + storageProperties + ("path" -> path) ++ queryAsProp.map(ConfigUtils.IS_QUERY_AS_RO_TABLE -> _) + ) + + val tableName = HoodieSqlCommonUtils.formatName(sparkSession, table.identifier.table) + val newDatabaseName = HoodieSqlCommonUtils.formatName(sparkSession, table.identifier.database + .getOrElse(catalog.getCurrentDatabase)) + + val newTableIdentifier = table.identifier + .copy(table = tableName, database = Some(newDatabaseName)) + + val partitionColumnNames = hoodieCatalogTable.partitionSchema.map(_.name) + // Remove some properties should not be used;append pk, preCombineKey, type to the properties of table + val newTblProperties = + hoodieCatalogTable.catalogProperties.--(needFilterProps) ++ HoodieOptionConfig.extractSqlOptions(properties) + val newTable = table.copy( + identifier = newTableIdentifier, + storage = newStorage, + schema = hoodieCatalogTable.tableSchema, + partitionColumnNames = partitionColumnNames, + createVersion = SPARK_VERSION, + properties = newTblProperties + ) + + // Create table in the catalog + val enableHive = isUsingHiveCatalog(sparkSession) + if (enableHive) { + createHiveDataSourceTable(sparkSession, newTable, ignoreIfExists) + } else { + catalog.createTable(newTable, ignoreIfExists = false, validateLocation = false) + } + } + + /** + * Create Hive table for hudi. + * Firstly, do some check for the schema & table. + * Secondly, append some table properties need for spark datasource table. + * Thirdly, create hive table using the HiveClient. + * @param table + * @param sparkSession + */ + private def createHiveDataSourceTable(sparkSession: SparkSession, table: CatalogTable, + ignoreIfExists: Boolean): Unit = { + val dbName = table.identifier.database.get + // check database + val dbExists = sparkSession.sessionState.catalog.databaseExists(dbName) + if (!dbExists) { + throw new NoSuchDatabaseException(dbName) + } + // append some table properties need for spark data source table. + val dataSourceProps = tableMetaToTableProps(sparkSession.sparkContext.conf, + table, table.schema) + + val tableWithDataSourceProps = table.copy(properties = dataSourceProps ++ table.properties) + val client = HiveClientUtils.getSingletonClientForMetadata(sparkSession) + // create hive table. + client.createTable(tableWithDataSourceProps, ignoreIfExists = true) + } + + // This code is forked from org.apache.spark.sql.hive.HiveExternalCatalog#tableMetaToTableProps + private def tableMetaToTableProps(sparkConf: SparkConf, table: CatalogTable, + schema: StructType): Map[String, String] = { + val partitionColumns = table.partitionColumnNames + val bucketSpec = table.bucketSpec + + val properties = new mutable.HashMap[String, String] + properties.put(DATASOURCE_PROVIDER, "hudi") + properties.put(CREATED_SPARK_VERSION, table.createVersion) + + // Serialized JSON schema string may be too long to be stored into a single metastore table + // property. In this case, we split the JSON string and store each part as a separate table + // property. + val threshold = sparkConf.get(SCHEMA_STRING_LENGTH_THRESHOLD) + val schemaJsonString = schema.json + // Split the JSON string. 
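A self-contained sketch of the splitting scheme applied in the next few lines (not part of the patch): the schema JSON is chopped into chunks no longer than the threshold and stored as numbered properties, and readers concatenate the parts back in order. The property keys and the tiny threshold below are simplified stand-ins; the real keys come from HiveExternalCatalog constants.

    // Illustrative only: split a long schema JSON into bounded-size properties and rebuild it.
    val schemaJson = """{"type":"struct","fields":[{"name":"id","type":"integer"}]}"""
    val threshold  = 16                                  // artificially small for the example
    val parts      = schemaJson.grouped(threshold).toSeq
    val props = Map("schema.numParts" -> parts.size.toString) ++
      parts.zipWithIndex.map { case (part, i) => s"schema.part.$i" -> part }
    val rebuilt = (0 until props("schema.numParts").toInt).map(i => props(s"schema.part.$i")).mkString
    assert(rebuilt == schemaJson)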
+ val parts = schemaJsonString.grouped(threshold).toSeq + properties.put(DATASOURCE_SCHEMA_PREFIX + "numParts", parts.size.toString) + parts.zipWithIndex.foreach { case (part, index) => + properties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part) + } + + if (partitionColumns.nonEmpty) { + properties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString) + partitionColumns.zipWithIndex.foreach { case (partCol, index) => + properties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol) + } + } + + if (bucketSpec.isDefined) { + val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get + + properties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString) + properties.put(DATASOURCE_SCHEMA_NUMBUCKETCOLS, bucketColumnNames.length.toString) + bucketColumnNames.zipWithIndex.foreach { case (bucketCol, index) => + properties.put(s"$DATASOURCE_SCHEMA_BUCKETCOL_PREFIX$index", bucketCol) + } + + if (sortColumnNames.nonEmpty) { + properties.put(DATASOURCE_SCHEMA_NUMSORTCOLS, sortColumnNames.length.toString) + sortColumnNames.zipWithIndex.foreach { case (sortCol, index) => + properties.put(s"$DATASOURCE_SCHEMA_SORTCOL_PREFIX$index", sortCol) + } + } + } + + properties.toMap + } + + private def checkTableConfigEqual( + originTableConfig: Map[String, String], + newTableConfig: Map[String, String], + configKey: String): Unit = { + if (originTableConfig.contains(configKey) && newTableConfig.contains(configKey)) { + assert(originTableConfig(configKey) == newTableConfig(configKey), + s"Table config: $configKey in the create table is: ${newTableConfig(configKey)}, is not the same with the value in " + + s"hoodie.properties, which is: ${originTableConfig(configKey)}. Please keep the same.") + } + } +} + diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/DropHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/DropHoodieTableCommand.scala new file mode 100644 index 0000000000000..a0252861dbf63 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/DropHoodieTableCommand.scala @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hadoop.fs.Path +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.sync.common.util.ConfigUtils +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier} +import org.apache.spark.sql.catalyst.catalog._ + +/** + * Physical plan node for dropping a table. 
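A hedged usage sketch for this drop-table command (not part of the patch); the table name is hypothetical and an active SparkSession `spark` with the Hudi SQL extensions is assumed. Without PURGE only the catalog entries are removed; with PURGE the companion _ro/_rt tables of a MOR table and the table's base path are removed as well.

    // Two alternatives, assuming an existing SparkSession `spark`; the table name is made up.
    spark.sql("DROP TABLE IF EXISTS hudi_tbl")         // drop the catalog entry, keep data files
    // ...or additionally remove _ro/_rt companions and delete the base path on storage:
    spark.sql("DROP TABLE IF EXISTS hudi_tbl PURGE")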
+ */ +case class DropHoodieTableCommand( + tableIdentifier: TableIdentifier, + ifExists: Boolean, + isView: Boolean, + purge: Boolean) extends HoodieLeafRunnableCommand { + + private val MOR_SNAPSHOT_TABLE_SUFFIX = "_rt" + private val MOR_READ_OPTIMIZED_TABLE_SUFFIX = "_ro" + + override def run(sparkSession: SparkSession): Seq[Row] = { + logInfo(s"Start executing 'DROP TABLE' on ${tableIdentifier.unquotedString}" + + s" (ifExists=${ifExists}, purge=${purge}).") + if (!sparkSession.catalog.tableExists(tableIdentifier.unquotedString)) { + sparkSession.catalog.refreshTable(tableIdentifier.unquotedString) + } + val qualifiedTableName = QualifiedTableName( + tableIdentifier.database.getOrElse(sparkSession.sessionState.catalog.getCurrentDatabase), + tableIdentifier.table) + sparkSession.sessionState.catalog.invalidateCachedTable(qualifiedTableName) + + dropTableInCatalog(sparkSession, tableIdentifier, ifExists, purge) + + logInfo(s"Finished executing 'DROP TABLE' on ${tableIdentifier.unquotedString}.") + Seq.empty[Row] + } + + /** + * Drops table in Spark catalog. Note that RO & RT table could coexist with a MOR table. + * If `purge` enabled, RO & RT table and corresponding data directory on filesystem will + * all be removed. + */ + private def dropTableInCatalog(sparkSession: SparkSession, + tableIdentifier: TableIdentifier, + ifExists: Boolean, + purge: Boolean): Unit = { + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier) + val table = hoodieCatalogTable.table + assert(table.tableType != CatalogTableType.VIEW) + + val basePath = hoodieCatalogTable.tableLocation + val catalog = sparkSession.sessionState.catalog + + // Drop table in the catalog + if (hoodieCatalogTable.hoodieTableExists && + HoodieTableType.MERGE_ON_READ == hoodieCatalogTable.tableType && purge) { + val (rtTableOpt, roTableOpt) = getTableRTAndRO(catalog, hoodieCatalogTable) + rtTableOpt.foreach(table => catalog.dropTable(table.identifier, true, false)) + roTableOpt.foreach(table => catalog.dropTable(table.identifier, true, false)) + catalog.dropTable(table.identifier.copy(table = hoodieCatalogTable.tableName), ifExists, purge) + } else { + catalog.dropTable(table.identifier, ifExists, purge) + } + + // Recursively delete table directories + if (purge) { + logInfo("Clean up " + basePath) + val targetPath = new Path(basePath) + val engineContext = new HoodieSparkEngineContext(sparkSession.sparkContext) + val fs = FSUtils.getFs(basePath, sparkSession.sparkContext.hadoopConfiguration) + FSUtils.deleteDir(engineContext, fs, targetPath, sparkSession.sparkContext.defaultParallelism) + } + } + + private def getTableRTAndRO(catalog: SessionCatalog, + hoodieTable: HoodieCatalogTable): (Option[CatalogTable], Option[CatalogTable]) = { + val rtIdt = hoodieTable.table.identifier.copy( + table = s"${hoodieTable.tableName}${MOR_SNAPSHOT_TABLE_SUFFIX}") + val roIdt = hoodieTable.table.identifier.copy( + table = s"${hoodieTable.tableName}${MOR_READ_OPTIMIZED_TABLE_SUFFIX}") + + var rtTableOpt: Option[CatalogTable] = None + var roTableOpt: Option[CatalogTable] = None + if (catalog.tableExists(rtIdt)) { + val rtTable = catalog.getTableMetadata(rtIdt) + if (rtTable.storage.locationUri.equals(hoodieTable.table.storage.locationUri)) { + rtTable.storage.properties.get(ConfigUtils.IS_QUERY_AS_RO_TABLE) match { + case Some(v) if v.equalsIgnoreCase("false") => rtTableOpt = Some(rtTable) + case _ => // do-nothing + } + } + } + if (catalog.tableExists(roIdt)) { + val roTable = catalog.getTableMetadata(roIdt) + if 
(roTable.storage.locationUri.equals(hoodieTable.table.storage.locationUri)) { + roTable.storage.properties.get(ConfigUtils.IS_QUERY_AS_RO_TABLE) match { + case Some(v) if v.equalsIgnoreCase("true") => roTableOpt = Some(roTable) + case _ => // do-nothing + } + } + } + (rtTableOpt, roTableOpt) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/HoodieLeafRunnableCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/HoodieLeafRunnableCommand.scala new file mode 100644 index 0000000000000..47e884e962d4b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/HoodieLeafRunnableCommand.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.trees.HoodieLeafLike +import org.apache.spark.sql.execution.command.RunnableCommand + +/** + * Similar to `LeafRunnableCommand` in Spark3.2, `HoodieLeafRunnableCommand` mixed in + * `HoodieLeafLike` can avoid subclasses of `RunnableCommand` to override + * the `withNewChildrenInternal` method repeatedly. + */ +trait HoodieLeafRunnableCommand extends RunnableCommand with HoodieLeafLike[LogicalPlan] diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/ShowHoodieTablePartitionsCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/ShowHoodieTablePartitionsCommand.scala new file mode 100644 index 0000000000000..d896fecae0cd0 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/ShowHoodieTablePartitionsCommand.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.common.util.PartitionPathEncodeUtils + +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.datasources.PartitioningUtils +import org.apache.spark.sql.types.StringType + +/** + * Command for show hudi table's partitions. + */ +case class ShowHoodieTablePartitionsCommand( + tableIdentifier: TableIdentifier, + specOpt: Option[TablePartitionSpec]) + extends HoodieLeafRunnableCommand { + + override val output: Seq[Attribute] = { + AttributeReference("partition", StringType, nullable = false)() :: Nil + } + + override def run(sparkSession: SparkSession): Seq[Row] = { + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier) + + val schemaOpt = hoodieCatalogTable.tableSchema + val partitionColumnNamesOpt = hoodieCatalogTable.tableConfig.getPartitionFields + + if (partitionColumnNamesOpt.isPresent && partitionColumnNamesOpt.get.nonEmpty && schemaOpt.nonEmpty) { + if (specOpt.isEmpty) { + hoodieCatalogTable.getPartitionPaths.map(Row(_)) + } else { + val spec = specOpt.get + hoodieCatalogTable.getPartitionPaths.filter { partitionPath => + val part = PartitioningUtils.parsePathFragment(partitionPath) + spec.forall { case (col, value) => + PartitionPathEncodeUtils.escapePartitionValue(value) == part.getOrElse(col, null) + } + }.map(Row(_)) + } + } else { + Seq.empty[Row] + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala new file mode 100644 index 0000000000000..01c995fed437c --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
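Looking back at the ShowHoodieTablePartitionsCommand added above, a hedged usage sketch (not part of the patch); names are hypothetical and an active SparkSession `spark` is assumed. Partition values in the optional spec are escaped before being compared against the stored partition paths.

    // Assumes an existing SparkSession `spark`; table and partition names are made up.
    spark.sql("SHOW PARTITIONS hudi_tbl").show(false)
    spark.sql("SHOW PARTITIONS hudi_tbl PARTITION (dt = '2021-12-01')").show(false)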
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.common.config.TypedProperties +import org.apache.hudi.common.model.HoodieKey +import org.apache.hudi.common.util.PartitionPathEncodeUtils +import org.apache.hudi.common.util.ValidationUtils.checkArgument +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.keygen._ +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.{StructType, TimestampType} +import org.apache.spark.unsafe.types.UTF8String +import org.joda.time.format.DateTimeFormat + +import java.sql.Timestamp +import java.util +import java.util.Collections +import java.util.concurrent.TimeUnit.{MICROSECONDS, MILLISECONDS} + +/** + * Custom Spark-specific [[KeyGenerator]] overriding behavior handling [[TimestampType]] partition values + */ +class SqlKeyGenerator(props: TypedProperties) extends BuiltinKeyGenerator(props) { + + private lazy val partitionSchema = { + val partitionSchema = props.getString(SqlKeyGenerator.PARTITION_SCHEMA, "") + if (partitionSchema != null && partitionSchema.nonEmpty) { + Some(StructType.fromDDL(partitionSchema)) + } else { + None + } + } + + private lazy val complexKeyGen = new ComplexKeyGenerator(props) + private lazy val originalKeyGen = + Option(props.getString(SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME, null)) + .map { originalKeyGenClassName => + checkArgument(originalKeyGenClassName.nonEmpty) + + val convertedKeyGenClassName = HoodieSparkKeyGeneratorFactory.convertToSparkKeyGenerator(originalKeyGenClassName) + + val keyGenProps = new TypedProperties() + keyGenProps.putAll(props) + keyGenProps.remove(SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME) + keyGenProps.put(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key, convertedKeyGenClassName) + + KeyGenUtils.createKeyGeneratorByClassName(keyGenProps).asInstanceOf[SparkKeyGeneratorInterface] + } + + override def getRecordKey(record: GenericRecord): String = + originalKeyGen.map { + _.getKey(record).getRecordKey + } getOrElse { + complexKeyGen.getRecordKey(record) + } + + override def getRecordKey(row: Row): String = + originalKeyGen.map { + _.getRecordKey(row) + } getOrElse { + complexKeyGen.getRecordKey(row) + } + + + override def getRecordKey(internalRow: InternalRow, schema: StructType): UTF8String = + originalKeyGen.map { + _.getRecordKey(internalRow, schema) + } getOrElse { + complexKeyGen.getRecordKey(internalRow, schema) + } + + override def getPartitionPath(record: GenericRecord): String = { + val partitionPath = originalKeyGen.map { + _.getKey(record).getPartitionPath + } getOrElse { + complexKeyGen.getPartitionPath(record) + } + + convertPartitionPathToSqlType(partitionPath, rowType = false) + } + + override def getPartitionPath(row: Row): String = { + val partitionPath = originalKeyGen.map { + _.getPartitionPath(row) + } getOrElse { + complexKeyGen.getPartitionPath(row) + } + + convertPartitionPathToSqlType(partitionPath, rowType = true) + } + + override def getPartitionPath(internalRow: InternalRow, schema: StructType): UTF8String = { + val partitionPath = originalKeyGen.map { + _.getPartitionPath(internalRow, schema) + } getOrElse { + complexKeyGen.getPartitionPath(internalRow, schema) + } + + UTF8String.fromString(convertPartitionPathToSqlType(partitionPath.toString, rowType = true)) + } + + override def getRecordKeyFieldNames: util.List[String] = { + 
originalKeyGen.map(_.getRecordKeyFieldNames) + .getOrElse(complexKeyGen.getRecordKeyFieldNames) + } + + override def getPartitionPathFields: util.List[String] = { + originalKeyGen.map { + case bkg: BaseKeyGenerator => bkg.getPartitionPathFields + case _ => + Option(super.getPartitionPathFields).getOrElse(Collections.emptyList[String]) + } getOrElse { + complexKeyGen.getPartitionPathFields + } + } + + // TODO clean up + private def convertPartitionPathToSqlType(partitionPath: String, rowType: Boolean): String = { + if (partitionSchema.isDefined) { + // we can split the partitionPath here because we enable the URL_ENCODE_PARTITIONING_OPT + // by default for sql. + val partitionFragments = partitionPath.split(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR) + // If it is a table which is not write by spark sql before and the url encode has disabled, + // the partition path level may not equal to the partition schema size. Just return the partitionPath + // in this case. + if (partitionFragments.size != partitionSchema.get.size) { + partitionPath + } else { + partitionFragments.zip(partitionSchema.get.fields).map { + case (partitionValue, partitionField) => + val hiveStylePrefix = s"${partitionField.name}=" + val isHiveStyle = partitionValue.startsWith(hiveStylePrefix) + val _partitionValue = if (isHiveStyle) partitionValue.substring(hiveStylePrefix.length) else partitionValue + + partitionField.dataType match { + case TimestampType => + val timeMs = if (rowType) { // In RowType, the partitionPathValue is the time format string, convert to millis + SqlKeyGenerator.sqlTimestampFormat.parseMillis(_partitionValue) + } else { + if (isConsistentLogicalTimestampEnabled) { + Timestamp.valueOf(_partitionValue).getTime + } else { + MILLISECONDS.convert(_partitionValue.toLong, MICROSECONDS) + } + } + val timestampFormat = PartitionPathEncodeUtils.escapePathName( + SqlKeyGenerator.timestampTimeFormat.print(timeMs)) + if (isHiveStyle) s"$hiveStylePrefix$timestampFormat" else timestampFormat + case _ => partitionValue + } + }.mkString(KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR) + } + } else partitionPath + } +} + +object SqlKeyGenerator { + val PARTITION_SCHEMA = "hoodie.sql.partition.schema" + val ORIGINAL_KEYGEN_CLASS_NAME = "hoodie.sql.origin.keygen.class" + private val timestampTimeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss") + private val sqlTimestampFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.S") +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/TruncateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/TruncateHoodieTableCommand.scala new file mode 100644 index 0000000000000..05f96efdae531 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/TruncateHoodieTableCommand.scala @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hadoop.fs.Path +import org.apache.hudi.HoodieSparkSqlWriter +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, HoodieCatalogTable} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getPartitionPathToDrop, normalizePartitionSpec} +import org.apache.spark.sql.hudi.ProvidesHoodieConfig +import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} + +/** + * Command for truncate hudi table. + */ +case class TruncateHoodieTableCommand( + tableIdentifier: TableIdentifier, + partitionSpec: Option[TablePartitionSpec]) + extends HoodieLeafRunnableCommand with ProvidesHoodieConfig { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val fullTableName = s"${tableIdentifier.database}.${tableIdentifier.table}" + logInfo(s"start execute truncate table command for $fullTableName") + + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier) + + val catalog = sparkSession.sessionState.catalog + val table = catalog.getTableMetadata(tableIdentifier) + val tableId = table.identifier.quotedString + + if (table.tableType == CatalogTableType.VIEW) { + throw new AnalysisException( + s"Operation not allowed: TRUNCATE TABLE on views: $tableId") + } + + if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) { + throw new AnalysisException( + s"Operation not allowed: TRUNCATE TABLE ... 
PARTITION is not supported " + + s"for tables that are not partitioned: $tableId") + } + + val basePath = hoodieCatalogTable.tableLocation + val properties = hoodieCatalogTable.tableConfig.getProps + val hadoopConf = sparkSession.sessionState.newHadoopConf() + + // If we have not specified the partition, truncate will delete all the data in the table path + if (partitionSpec.isEmpty) { + val targetPath = new Path(basePath) + val engineContext = new HoodieSparkEngineContext(sparkSession.sparkContext) + val fs = FSUtils.getFs(basePath, sparkSession.sparkContext.hadoopConfiguration) + FSUtils.deleteDir(engineContext, fs, targetPath, sparkSession.sparkContext.defaultParallelism) + + // ReInit hoodie.properties + HoodieTableMetaClient.withPropertyBuilder() + .fromProperties(properties) + .initTable(hadoopConf, hoodieCatalogTable.tableLocation) + } else { + val normalizedSpecs: Seq[Map[String, String]] = Seq(partitionSpec.map { spec => + normalizePartitionSpec( + spec, + hoodieCatalogTable.partitionFields, + hoodieCatalogTable.tableName, + sparkSession.sessionState.conf.resolver) + }.get) + + // drop partitions to lazy clean + val partitionsToDrop = getPartitionPathToDrop(hoodieCatalogTable, normalizedSpecs) + val parameters = buildHoodieDropPartitionsConfig(sparkSession, hoodieCatalogTable, partitionsToDrop) + val (success, _, _, _, _, _) = HoodieSparkSqlWriter.write( + sparkSession.sqlContext, + SaveMode.Append, + parameters, + sparkSession.emptyDataFrame) + if (!success) { + throw new HoodieException("Truncate Hoodie Table command failed") + } + } + + // After deleting the data, refresh the table to make sure we don't keep around a stale + // file relation in the metastore cache and cached table data in the cache manager. + sparkSession.catalog.refreshTable(table.identifier.quotedString) + logInfo(s"Finish execute truncate table command for $fullTableName") + Seq.empty[Row] + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/ValidateDuplicateKeyPayload.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/ValidateDuplicateKeyPayload.scala new file mode 100644 index 0000000000000..2619d1d9fe151 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/ValidateDuplicateKeyPayload.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
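For the TruncateHoodieTableCommand above, a hedged usage sketch (not part of the patch); names are hypothetical and an active SparkSession `spark` with the Hudi SQL extensions is assumed. Truncating the whole table deletes the base path and re-initializes hoodie.properties, while truncating a single partition reuses the lazy drop-partition write.

    // Assumes an existing SparkSession `spark`; names are made up.
    spark.sql("TRUNCATE TABLE hudi_tbl")
    spark.sql("TRUNCATE TABLE hudi_tbl PARTITION (dt = '2021-12-01')")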
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.avro.Schema +import org.apache.avro.generic.{GenericRecord, IndexedRecord} +import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecord} +import org.apache.hudi.common.util.{Option => HOption} +import org.apache.hudi.exception.HoodieDuplicateKeyException + + +import java.util.Properties + +/** + * Validate the duplicate key for insert statement without enable the INSERT_DROP_DUPS_OPT + * config. + */ +class ValidateDuplicateKeyPayload(record: GenericRecord, orderingVal: Comparable[_]) + extends DefaultHoodieRecordPayload(record, orderingVal) { + + def this(record: HOption[GenericRecord]) { + this(if (record.isPresent) record.get else null, 0) + } + + override def combineAndGetUpdateValue(currentValue: IndexedRecord, + schema: Schema, properties: Properties): HOption[IndexedRecord] = { + val key = currentValue.asInstanceOf[GenericRecord].get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString + throw new HoodieDuplicateKeyException(key) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala new file mode 100644 index 0000000000000..4e33c3be1f2c5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.payload + +import com.github.benmanes.caffeine.cache.{Cache, Caffeine} +import org.apache.avro.Schema +import org.apache.avro.generic.IndexedRecord +import org.apache.hudi.HoodieSparkUtils.sparkAdapter +import org.apache.hudi.AvroConversionUtils +import org.apache.spark.sql.avro.HoodieAvroDeserializer +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.hudi.command.payload.SqlTypedRecord.{getAvroDeserializer, getSqlType} +import org.apache.spark.sql.types.StructType + +import java.util.function.Function + +/** + * A sql typed record which will convert the avro field to sql typed value. 
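The ValidateDuplicateKeyPayload above turns a primary-key collision into an error instead of a silent upsert. A hedged sketch of how that behavior is typically reached, based on my understanding of Hudi's strict insert mode rather than anything stated in this patch; the table h1, its values, and the hoodie.sql.insert.mode setting are assumptions.

    // Assumes an existing SparkSession `spark` and a Hudi table h1(id INT, name STRING, price DOUBLE, ts BIGINT)
    // created with primaryKey = 'id'; the config below is my assumption for enabling strict inserts.
    spark.sql("SET hoodie.sql.insert.mode = strict")
    spark.sql("INSERT INTO h1 VALUES (1, 'a1', 10.0, 1000)")
    // Re-inserting the same key should now fail with HoodieDuplicateKeyException raised from
    // combineAndGetUpdateValue, rather than deduplicating the row:
    // spark.sql("INSERT INTO h1 VALUES (1, 'a1-dup', 11.0, 1001)")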
+ */ +class SqlTypedRecord(val record: IndexedRecord) extends IndexedRecord { + + private lazy val structType = getSqlType(getSchema) + private lazy val sqlRow = getAvroDeserializer(getSchema).deserialize(record).get.asInstanceOf[InternalRow] + + override def put(i: Int, v: Any): Unit = { + record.put(i, v) + } + + override def get(i: Int): AnyRef = { + sqlRow.get(i, structType(i).dataType) + } + + override def getSchema: Schema = record.getSchema +} + +object SqlTypedRecord { + + private val sqlTypeCache = Caffeine.newBuilder() + .maximumSize(16).build[Schema, StructType] + + private val avroDeserializerCacheLocal = new ThreadLocal[Cache[Schema, HoodieAvroDeserializer]] { + override def initialValue(): Cache[Schema, HoodieAvroDeserializer] = { + Caffeine.newBuilder().maximumSize(16).build[Schema, HoodieAvroDeserializer] + } + } + + def getSqlType(schema: Schema): StructType = { + sqlTypeCache.get(schema, new Function[Schema, StructType] { + override def apply(t: Schema): StructType = AvroConversionUtils.convertAvroSchemaToStructType(t) + }) + } + + def getAvroDeserializer(schema: Schema): HoodieAvroDeserializer= { + avroDeserializerCacheLocal.get().get(schema, new Function[Schema, HoodieAvroDeserializer] { + override def apply(t: Schema): HoodieAvroDeserializer = sparkAdapter.createAvroDeserializer(t, getSqlType(t)) + }) + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieSourceOffset.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieSourceOffset.scala new file mode 100644 index 0000000000000..a5561a65a0157 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieSourceOffset.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
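The SqlTypedRecord above memoizes the derived StructType and the Avro deserializer per schema in small Caffeine caches. A self-contained sketch of that memoization pattern (not part of the patch), with String/Integer standing in for the Schema-to-StructType mapping:

    import com.github.benmanes.caffeine.cache.{Cache, Caffeine}
    import java.util.function.Function

    // Bounded per-key memoization: compute a derived value at most once per key and reuse it.
    val cache: Cache[String, Integer] = Caffeine.newBuilder().maximumSize(16).build[String, Integer]()
    def derive(key: String): Integer =
      cache.get(key, new Function[String, Integer] {
        override def apply(k: String): Integer = k.length   // stands in for an expensive conversion
      })

    println(derive("example-schema"))   // computed once
    println(derive("example-schema"))   // served from the cache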
+ */ + +package org.apache.spark.sql.hudi.streaming + +import com.fasterxml.jackson.annotation.JsonInclude.Include +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} + +case class HoodieSourceOffset(commitTime: String) extends Offset { + + override val json: String = { + HoodieSourceOffset.toJson(this) + } + + override def equals(obj: Any): Boolean = { + obj match { + case HoodieSourceOffset(otherCommitTime) => + otherCommitTime == commitTime + case _=> false + } + } + + override def hashCode(): Int = { + commitTime.hashCode + } +} + + +object HoodieSourceOffset { + + lazy val mapper: ObjectMapper = { + val _mapper = new ObjectMapper + _mapper.setSerializationInclusion(Include.NON_ABSENT) + _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + _mapper.registerModule(DefaultScalaModule) + _mapper + } + + def toJson(offset: HoodieSourceOffset): String = { + mapper.writeValueAsString(offset) + } + + def fromJson(json: String): HoodieSourceOffset = { + mapper.readValue(json, classOf[HoodieSourceOffset]) + } + + def apply(offset: Offset): HoodieSourceOffset = { + offset match { + case SerializedOffset(json) => fromJson(json) + case o: HoodieSourceOffset => o + } + } + + val INIT_OFFSET = HoodieSourceOffset(HoodieTimeline.INIT_INSTANT_TS) +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala new file mode 100644 index 0000000000000..bd8fdf263b3a1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
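A small round-trip sketch for the HoodieSourceOffset above (not part of the patch); the instant time is made up and the JSON shape shown in the comment is what the configured Jackson mapper is expected to produce.

    import org.apache.spark.sql.hudi.streaming.HoodieSourceOffset

    val offset = HoodieSourceOffset("20211201120000")    // hypothetical commit instant
    val json   = offset.json                             // expected: {"commitTime":"20211201120000"}
    assert(HoodieSourceOffset.fromJson(json) == offset)  // equality is defined on the commit time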
+ */ + +package org.apache.spark.sql.hudi.streaming + +import java.io.{BufferedWriter, InputStream, OutputStream, OutputStreamWriter} +import java.nio.charset.StandardCharsets +import java.util.Date + +import org.apache.hadoop.fs.Path + +import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, IncrementalRelation, MergeOnReadIncrementalRelation, SparkAdapterSupport} +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.{FileIOUtils, TablePathUtils} + +import org.apache.spark.sql.hudi.streaming.HoodieStreamSource.VERSION +import org.apache.spark.sql.hudi.streaming.HoodieSourceOffset.INIT_OFFSET +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.execution.streaming.{HDFSMetadataLog, Offset, Source} +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, SQLContext} + +/** + * The Struct Stream Source for Hudi to consume the data by streaming job. + * @param sqlContext + * @param metadataPath + * @param schemaOption + * @param parameters + */ +class HoodieStreamSource( + sqlContext: SQLContext, + metadataPath: String, + schemaOption: Option[StructType], + parameters: Map[String, String]) + extends Source with Logging with Serializable with SparkAdapterSupport { + + @transient private val hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf() + private lazy val tablePath: Path = { + val path = new Path(parameters.getOrElse("path", "Missing 'path' option")) + val fs = path.getFileSystem(hadoopConf) + TablePathUtils.getTablePath(fs, path).get() + } + private lazy val metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(tablePath.toString).build() + private lazy val tableType = metaClient.getTableType + + @transient private var lastOffset: HoodieSourceOffset = _ + @transient private lazy val initialOffsets = { + val metadataLog = + new HDFSMetadataLog[HoodieSourceOffset](sqlContext.sparkSession, metadataPath) { + override def serialize(metadata: HoodieSourceOffset, out: OutputStream): Unit = { + val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)) + writer.write("v" + VERSION + "\n") + writer.write(metadata.json) + writer.flush() + } + + /** + * Deserialize the init offset from the metadata file. 
+ * The format in the metadata file is like this: + * ---------------------------------------------- + * v1 -- The version info in the first line + * offsetJson -- The json string of HoodieSourceOffset in the rest of the file + * ----------------------------------------------- + * @param in + * @return + */ + override def deserialize(in: InputStream): HoodieSourceOffset = { + val content = FileIOUtils.readAsUTFString(in) + // Get version from the first line + val firstLineEnd = content.indexOf("\n") + if (firstLineEnd > 0) { + val version = getVersion(content.substring(0, firstLineEnd)) + if (version > VERSION) { + throw new IllegalStateException(s"UnSupportVersion: max support version is: $VERSION" + + s" current version is: $version") + } + // Get offset from the rest line in the file + HoodieSourceOffset.fromJson(content.substring(firstLineEnd + 1)) + } else { + throw new IllegalStateException(s"Bad metadata format, failed to find the version line.") + } + } + } + metadataLog.get(0).getOrElse { + metadataLog.add(0, INIT_OFFSET) + INIT_OFFSET + } + } + + private def getVersion(versionLine: String): Int = { + if (versionLine.startsWith("v")) { + versionLine.substring(1).toInt + } else { + throw new IllegalStateException(s"Illegal version line: $versionLine " + + s"in the streaming metadata path") + } + } + + override def schema: StructType = { + schemaOption.getOrElse { + val schemaUtil = new TableSchemaResolver(metaClient) + AvroConversionUtils.convertAvroSchemaToStructType(schemaUtil.getTableAvroSchema) + } + } + + /** + * Get the latest offset from the hoodie table. + * @return + */ + override def getOffset: Option[Offset] = { + metaClient.reloadActiveTimeline() + val activeInstants = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants + if (!activeInstants.empty()) { + val currentLatestCommitTime = activeInstants.lastInstant().get().getTimestamp + if (lastOffset == null || currentLatestCommitTime > lastOffset.commitTime) { + lastOffset = HoodieSourceOffset(currentLatestCommitTime) + } + } else { // if there are no active commits, use the init offset + lastOffset = initialOffsets + } + Some(lastOffset) + } + + override def getBatch(start: Option[Offset], end: Offset): DataFrame = { + initialOffsets + + val startOffset = start.map(HoodieSourceOffset(_)) + .getOrElse(initialOffsets) + val endOffset = HoodieSourceOffset(end) + + if (startOffset == endOffset) { + sqlContext.internalCreateDataFrame( + sqlContext.sparkContext.emptyRDD[InternalRow].setName("empty"), schema, isStreaming = true) + } else { + // Consume the data between (startCommitTime, endCommitTime] + val incParams = parameters ++ Map( + DataSourceReadOptions.QUERY_TYPE.key -> DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL, + DataSourceReadOptions.BEGIN_INSTANTTIME.key -> startCommitTime(startOffset), + DataSourceReadOptions.END_INSTANTTIME.key -> endOffset.commitTime + ) + + val rdd = tableType match { + case HoodieTableType.COPY_ON_WRITE => + val serDe = sparkAdapter.createSparkRowSerDe(schema) + new IncrementalRelation(sqlContext, incParams, Some(schema), metaClient) + .buildScan() + .map(serDe.serializeRow) + case HoodieTableType.MERGE_ON_READ => + val requiredColumns = schema.fields.map(_.name) + new MergeOnReadIncrementalRelation(sqlContext, incParams, Some(schema), metaClient) + .buildScan(requiredColumns, Array.empty[Filter]) + .asInstanceOf[RDD[InternalRow]] + case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType") + } + sqlContext.internalCreateDataFrame(rdd, 
schema, isStreaming = true) + } + } + + private def startCommitTime(startOffset: HoodieSourceOffset): String = { + startOffset match { + case INIT_OFFSET => startOffset.commitTime + case HoodieSourceOffset(commitTime) => + val time = HoodieActiveTimeline.parseDateFromInstantTime(commitTime).getTime + // As we consume the data between (start, end], start is not included, + // so we +1s to the start commit time here. + HoodieActiveTimeline.formatDate(new Date(time + 1000)) + case _=> throw new IllegalStateException("UnKnow offset type.") + } + } + + override def stop(): Unit = { + + } +} + +object HoodieStreamSource { + val VERSION = 1 +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/test/java/org/apache/hudi/internal/HoodieBulkInsertInternalWriterTestBase.java b/hudi-spark-datasource/hudi-spark-common/src/test/java/org/apache/hudi/internal/HoodieBulkInsertInternalWriterTestBase.java new file mode 100644 index 0000000000000..b9f77bccfd56d --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/test/java/org/apache/hudi/internal/HoodieBulkInsertInternalWriterTestBase.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.internal; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.client.HoodieInternalWriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.testutils.HoodieClientTestHarness; +import org.apache.hudi.testutils.SparkDatasetTestUtils; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Random; + +import static org.apache.hudi.testutils.SparkDatasetTestUtils.getConfigBuilder; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Base class for TestHoodieBulkInsertDataInternalWriter. 
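The HoodieStreamSource completed above is what backs structured-streaming reads of a Hudi table, serving each micro-batch as an incremental query over the commits in (lastOffset, latestCommit]. A hedged usage sketch (not part of the patch); the paths are hypothetical and an active SparkSession `spark` is assumed.

    // Assumes an existing SparkSession `spark`; paths are made up.
    val streamDf = spark.readStream
      .format("hudi")
      .load("/tmp/hudi/h0")

    streamDf.writeStream
      .format("console")
      .option("checkpointLocation", "/tmp/hudi/h0_checkpoints")
      .start()
      .awaitTermination()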
+ */ +public class HoodieBulkInsertInternalWriterTestBase extends HoodieClientTestHarness { + + protected static final Random RANDOM = new Random(); + + @BeforeEach + public void setUp() throws Exception { + initSparkContexts(); + initPath(); + initFileSystem(); + initTestDataGenerator(); + initMetaClient(); + initTimelineService(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + protected HoodieWriteConfig getWriteConfig(boolean populateMetaFields) { + return getWriteConfig(populateMetaFields, DataSourceWriteOptions.HIVE_STYLE_PARTITIONING().defaultValue()); + } + + protected HoodieWriteConfig getWriteConfig(boolean populateMetaFields, String hiveStylePartitioningValue) { + Properties properties = new Properties(); + if (!populateMetaFields) { + properties.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), SimpleKeyGenerator.class.getName()); + properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key(), SparkDatasetTestUtils.RECORD_KEY_FIELD_NAME); + properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), SparkDatasetTestUtils.PARTITION_PATH_FIELD_NAME); + properties.setProperty(HoodieTableConfig.POPULATE_META_FIELDS.key(), "false"); + } + properties.setProperty(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING().key(), hiveStylePartitioningValue); + return getConfigBuilder(basePath, timelineServicePort).withProperties(properties).build(); + } + + protected void assertWriteStatuses(List writeStatuses, int batches, int size, + Option> fileAbsPaths, Option> fileNames) { + assertWriteStatuses(writeStatuses, batches, size, false, fileAbsPaths, fileNames, false); + } + + protected void assertWriteStatuses(List writeStatuses, int batches, int size, boolean areRecordsSorted, + Option> fileAbsPaths, Option> fileNames, boolean isHiveStylePartitioning) { + if (areRecordsSorted) { + assertEquals(batches, writeStatuses.size()); + } else { + assertEquals(Math.min(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS.length, batches), writeStatuses.size()); + } + + Map sizeMap = new HashMap<>(); + if (!areRecordsSorted) { + // no of records are written per batch. Every 4th batch goes into same writeStatus. So, populating the size expected + // per write status + for (int i = 0; i < batches; i++) { + String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[i % 3]; + if (!sizeMap.containsKey(partitionPath)) { + sizeMap.put(partitionPath, 0L); + } + sizeMap.put(partitionPath, sizeMap.get(partitionPath) + size); + } + } + + int counter = 0; + for (HoodieInternalWriteStatus writeStatus : writeStatuses) { + // verify write status + String actualPartitionPathFormat = isHiveStylePartitioning ? 
SparkDatasetTestUtils.PARTITION_PATH_FIELD_NAME + "=%s" : "%s"; + assertEquals(String.format(actualPartitionPathFormat, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3]), writeStatus.getPartitionPath()); + if (areRecordsSorted) { + assertEquals(writeStatus.getTotalRecords(), size); + } else { + assertEquals(writeStatus.getTotalRecords(), sizeMap.get(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3])); + } + assertNull(writeStatus.getGlobalError()); + assertEquals(writeStatus.getFailedRowsSize(), 0); + assertEquals(writeStatus.getTotalErrorRecords(), 0); + assertFalse(writeStatus.hasErrors()); + assertNotNull(writeStatus.getFileId()); + String fileId = writeStatus.getFileId(); + if (fileAbsPaths.isPresent()) { + fileAbsPaths.get().add(basePath + "/" + writeStatus.getStat().getPath()); + } + if (fileNames.isPresent()) { + fileNames.get().add(writeStatus.getStat().getPath() + .substring(writeStatus.getStat().getPath().lastIndexOf('/') + 1)); + } + HoodieWriteStat writeStat = writeStatus.getStat(); + if (areRecordsSorted) { + assertEquals(size, writeStat.getNumInserts()); + assertEquals(size, writeStat.getNumWrites()); + } else { + assertEquals(sizeMap.get(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3]), writeStat.getNumInserts()); + assertEquals(sizeMap.get(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3]), writeStat.getNumWrites()); + } + assertEquals(fileId, writeStat.getFileId()); + assertEquals(String.format(actualPartitionPathFormat, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter++ % 3]), writeStat.getPartitionPath()); + assertEquals(0, writeStat.getNumDeletes()); + assertEquals(0, writeStat.getNumUpdateWrites()); + assertEquals(0, writeStat.getTotalWriteErrors()); + } + } + + protected void assertOutput(Dataset expectedRows, Dataset actualRows, String instantTime, Option> fileNames, + boolean populateMetaColumns) { + if (populateMetaColumns) { + // verify 3 meta fields that are filled in within create handle + actualRows.collectAsList().forEach(entry -> { + assertEquals(entry.get(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).toString(), instantTime); + assertFalse(entry.isNullAt(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.FILENAME_METADATA_FIELD))); + if (fileNames.isPresent()) { + assertTrue(fileNames.get().contains(entry.get(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS + .get(HoodieRecord.FILENAME_METADATA_FIELD)))); + } + assertFalse(entry.isNullAt(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD))); + }); + + // after trimming 2 of the meta fields, rest of the fields should match + Dataset trimmedExpected = expectedRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD); + Dataset trimmedActual = actualRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD); + assertEquals(0, trimmedActual.except(trimmedExpected).count()); + } else { // operation = BULK_INSERT_APPEND_ONLY + // all meta columns are untouched + assertEquals(0, expectedRows.except(actualRows).count()); + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/test/scala/org/apache/spark/execution/datasources/TestHoodieInMemoryFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/test/scala/org/apache/spark/execution/datasources/TestHoodieInMemoryFileIndex.scala new file mode 100644 index 
0000000000000..8e7f6bf14b7e5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/test/scala/org/apache/spark/execution/datasources/TestHoodieInMemoryFileIndex.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.execution.datasources + +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.SparkSession +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir + +import java.io.File +import java.nio.file.Paths + +class TestHoodieInMemoryFileIndex { + + @Test + def testCreateInMemoryIndex(@TempDir tempDir: File): Unit = { + val spark = SparkSession.builder + .appName("Hoodie Datasource test") + .master("local[2]") + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .getOrCreate + + val folders: Seq[Path] = Seq( + new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri) + ) + + val files: Seq[Path] = Seq( + new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri), + new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri) + ) + + folders.foreach(folder => new File(folder.toUri).mkdir()) + files.foreach(file => new File(file.toUri).createNewFile()) + + val index = HoodieInMemoryFileIndex.create(spark, Seq(folders(0), folders(1))) + val indexedFilePaths = index.allFiles().map(fs => fs.getPath) + assertEquals(files.sortWith(_.toString < _.toString), indexedFilePaths.sortWith(_.toString < _.toString)) + spark.stop() + } + +} diff --git a/hudi-spark-datasource/hudi-spark/pom.xml b/hudi-spark-datasource/hudi-spark/pom.xml index 65efdce0a4f72..71fcd43dd9c9a 100644 --- a/hudi-spark-datasource/hudi-spark/pom.xml +++ b/hudi-spark-datasource/hudi-spark/pom.xml @@ -19,12 +19,12 @@ hudi-spark-datasource org.apache.hudi - 0.6.1-SNAPSHOT + 0.12.2-dt-SNAPSHOT 4.0.0 hudi-spark_${scala.binary.version} - ${parent.version} + 0.12.2-dt-SNAPSHOT hudi-spark_${scala.binary.version} jar @@ -48,7 +48,9 @@ -nobootcp + -target:jvm-1.8 + false
    @@ -133,21 +135,6 @@ org.scalatest scalatest-maven-plugin - 1.0 - - ${skipUTs} - ${project.build.directory}/surefire-reports - . - TestSuite.txt - - - - test - - test - - - org.scalastyle @@ -157,70 +144,76 @@ org.jacoco jacoco-maven-plugin + + org.antlr + antlr4-maven-plugin + ${antlr.version} + + + + antlr4 + + + + + true + true + ../hudi-spark/src/main/antlr4/ + +
    - - - org.scala-lang - scala-library - ${scala.version} - - + - org.apache.hudi - hudi-client-common - ${project.version} + org.apache.logging.log4j + log4j-1.2-api + + org.apache.hudi hudi-spark-client ${project.version} + org.apache.hudi - hudi-common - ${project.version} - - - org.apache.hudi - hudi-hadoop-mr + ${hudi.spark.module}_${scala.binary.version} ${project.version} + + + org.apache.hudi - hudi-hive-sync + hudi-client-common ${project.version} + org.apache.hudi - hudi-sync-common + hudi-common ${project.version} org.apache.hudi - hudi-spark-common + hudi-hadoop-mr ${project.version} org.apache.hudi - hudi-spark2_${scala.binary.version} + hudi-hive-sync ${project.version} org.apache.hudi - hudi-spark3_2.12 + hudi-sync-common ${project.version} - - - log4j - log4j - - com.fasterxml.jackson.core @@ -266,11 +259,30 @@ spark-sql_${scala.binary.version} - org.apache.spark - spark-avro_${scala.binary.version} - provided + spark-hive_${scala.binary.version} + + + + org.apache.spark + spark-sql_${scala.binary.version} + tests + test + + + + org.apache.spark + spark-core_${scala.binary.version} + tests + test + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + tests + test @@ -360,7 +372,31 @@ + + org.apache.curator + curator-framework + ${zk-curator.version} + + + + org.apache.curator + curator-client + ${zk-curator.version} + + + + org.apache.curator + curator-recipes + ${zk-curator.version} + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + org.apache.hudi hudi-client-common @@ -385,6 +421,12 @@ test-jar test + + org.apache.hudi + hudi-java-client + ${project.version} + test + org.scalatest @@ -423,5 +465,45 @@ test + + org.junit.platform + junit-platform-runner + test + + + + org.junit.platform + junit-platform-suite-api + test + + + + org.slf4j + slf4j-api + ${slf4j.version} + test + + + + org.apache.hadoop + hadoop-hdfs + tests + test + + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + diff --git a/hudi-spark-datasource/hudi-spark/run_hoodie_app.sh b/hudi-spark-datasource/hudi-spark/run_hoodie_app.sh index 9782aa359556f..ba5eb6ed56521 100755 --- a/hudi-spark-datasource/hudi-spark/run_hoodie_app.sh +++ b/hudi-spark-datasource/hudi-spark/run_hoodie_app.sh @@ -23,7 +23,7 @@ function error_exit { DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" #Ensure we pick the right jar even for hive11 builds -HUDI_JAR=`ls -c $DIR/../../packaging/hudi-spark-bundle/target/hudi-spark-bundle*.jar | grep -v sources | head -1` +HUDI_JAR=`ls -c $DIR/../../packaging/hudi-spark-bundle/target/hudi-spark*-bundle*.jar | grep -v sources | head -1` if [ -z "$HADOOP_CONF_DIR" ]; then echo "setting hadoop conf dir" diff --git a/hudi-spark-datasource/hudi-spark/run_hoodie_generate_app.sh b/hudi-spark-datasource/hudi-spark/run_hoodie_generate_app.sh index a2769517b9eb4..15c6c0d48cc2e 100755 --- a/hudi-spark-datasource/hudi-spark/run_hoodie_generate_app.sh +++ b/hudi-spark-datasource/hudi-spark/run_hoodie_generate_app.sh @@ -23,7 +23,7 @@ function error_exit { DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" #Ensure we pick the right jar even for hive11 builds -HUDI_JAR=`ls -c $DIR/../../packaging/hudi-spark-bundle/target/hudi-spark-bundle*.jar | grep -v sources | head -1` +HUDI_JAR=`ls -c $DIR/../../packaging/hudi-spark-bundle/target/hudi-spark*-bundle*.jar | grep -v sources | head -1` if [ -z "$HADOOP_CONF_DIR" ]; then echo "setting hadoop conf dir" diff --git a/hudi-spark-datasource/hudi-spark/run_hoodie_streaming_app.sh 
b/hudi-spark-datasource/hudi-spark/run_hoodie_streaming_app.sh index 9a81a4c0684e3..0501ff8f43bde 100755 --- a/hudi-spark-datasource/hudi-spark/run_hoodie_streaming_app.sh +++ b/hudi-spark-datasource/hudi-spark/run_hoodie_streaming_app.sh @@ -23,7 +23,7 @@ function error_exit { DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" #Ensure we pick the right jar even for hive11 builds -HUDI_JAR=`ls -c $DIR/../../packaging/hudi-spark-bundle/target/hudi-spark-bundle*.jar | grep -v sources | head -1` +HUDI_JAR=`ls -c $DIR/../../packaging/hudi-spark-bundle/target/hudi-spark*-bundle*.jar | grep -v sources | head -1` if [ -z "$HADOOP_CONF_DIR" ]; then echo "setting hadoop conf dir" diff --git a/hudi-spark-datasource/hudi-spark/src/main/antlr4/org/apache/hudi/spark/sql/parser/HoodieSqlCommon.g4 b/hudi-spark-datasource/hudi-spark/src/main/antlr4/org/apache/hudi/spark/sql/parser/HoodieSqlCommon.g4 new file mode 100644 index 0000000000000..8643170f892bf --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/antlr4/org/apache/hudi/spark/sql/parser/HoodieSqlCommon.g4 @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + grammar HoodieSqlCommon; + + @lexer::members { + /** + * Verify whether current token is a valid decimal token (which contains dot). + * Returns true if the character that follows the token is not a digit or letter or underscore. + * + * For example: + * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'. + * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'. + * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'. + * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed + * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+' + * which is not a digit or letter or underscore. + */ + public boolean isValidDecimal() { + int nextChar = _input.LA(1); + if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' || + nextChar == '_') { + return false; + } else { + return true; + } + } +} + + singleStatement + : statement ';'* EOF + ; + + statement + : compactionStatement #compactionCommand + | CALL multipartIdentifier '(' (callArgument (',' callArgument)*)? ')' #call + | CREATE INDEX (IF NOT EXISTS)? identifier ON TABLE? + tableIdentifier (USING indexType=identifier)? + LEFT_PAREN columns=multipartIdentifierPropertyList RIGHT_PAREN + (OPTIONS indexOptions=propertyList)? #createIndex + | DROP INDEX (IF EXISTS)? identifier ON TABLE? tableIdentifier #dropIndex + | SHOW INDEXES (FROM | IN) TABLE? tableIdentifier #showIndexes + | REFRESH INDEX identifier ON TABLE? tableIdentifier #refreshIndex + | .*? 
#passThrough + ; + + compactionStatement + : operation = (RUN | SCHEDULE) COMPACTION ON tableIdentifier (AT instantTimestamp = INTEGER_VALUE)? #compactionOnTable + | operation = (RUN | SCHEDULE) COMPACTION ON path = STRING (AT instantTimestamp = INTEGER_VALUE)? #compactionOnPath + | SHOW COMPACTION ON tableIdentifier (LIMIT limit = INTEGER_VALUE)? #showCompactionOnTable + | SHOW COMPACTION ON path = STRING (LIMIT limit = INTEGER_VALUE)? #showCompactionOnPath + ; + + tableIdentifier + : (db=IDENTIFIER '.')? table=IDENTIFIER + ; + + callArgument + : expression #positionalArgument + | identifier '=>' expression #namedArgument + ; + + expression + : constant + | stringMap + ; + + constant + : number #numericLiteral + | booleanValue #booleanLiteral + | STRING+ #stringLiteral + | identifier STRING #typeConstructor + ; + + stringMap + : MAP '(' constant (',' constant)* ')' + ; + + booleanValue + : TRUE | FALSE + ; + + number + : MINUS? EXPONENT_VALUE #exponentLiteral + | MINUS? DECIMAL_VALUE #decimalLiteral + | MINUS? INTEGER_VALUE #integerLiteral + | MINUS? BIGINT_LITERAL #bigIntLiteral + | MINUS? SMALLINT_LITERAL #smallIntLiteral + | MINUS? TINYINT_LITERAL #tinyIntLiteral + | MINUS? DOUBLE_LITERAL #doubleLiteral + | MINUS? FLOAT_LITERAL #floatLiteral + | MINUS? BIGDECIMAL_LITERAL #bigDecimalLiteral + ; + + multipartIdentifierPropertyList + : multipartIdentifierProperty (COMMA multipartIdentifierProperty)* + ; + + multipartIdentifierProperty + : multipartIdentifier (OPTIONS options=propertyList)? + ; + + multipartIdentifier + : parts+=identifier ('.' parts+=identifier)* + ; + + identifier + : IDENTIFIER #unquotedIdentifier + | quotedIdentifier #quotedIdentifierAlternative + | nonReserved #unquotedIdentifier + ; + + quotedIdentifier + : BACKQUOTED_IDENTIFIER + ; + + nonReserved + : CALL + | COMPACTION + | CREATE + | DROP + | EXISTS + | FROM + | IN + | INDEX + | INDEXES + | IF + | LIMIT + | NOT + | ON + | OPTIONS + | REFRESH + | RUN + | SCHEDULE + | SHOW + | TABLE + | USING + ; + + propertyList + : LEFT_PAREN property (COMMA property)* RIGHT_PAREN + ; + + property + : key=propertyKey (EQ? value=propertyValue)? + ; + + propertyKey + : identifier (DOT identifier)* + | STRING + ; + + propertyValue + : INTEGER_VALUE + | DECIMAL_VALUE + | booleanValue + | STRING + ; + + LEFT_PAREN: '('; + RIGHT_PAREN: ')'; + COMMA: ','; + DOT: '.'; + + ALL: 'ALL'; + AT: 'AT'; + CALL: 'CALL'; + COMPACTION: 'COMPACTION'; + RUN: 'RUN'; + SCHEDULE: 'SCHEDULE'; + ON: 'ON'; + SHOW: 'SHOW'; + LIMIT: 'LIMIT'; + MAP: 'MAP'; + NULL: 'NULL'; + TRUE: 'TRUE'; + FALSE: 'FALSE'; + INTERVAL: 'INTERVAL'; + TO: 'TO'; + CREATE: 'CREATE'; + INDEX: 'INDEX'; + INDEXES: 'INDEXES'; + IF: 'IF'; + NOT: 'NOT'; + EXISTS: 'EXISTS'; + TABLE: 'TABLE'; + USING: 'USING'; + OPTIONS: 'OPTIONS'; + DROP: 'DROP'; + FROM: 'FROM'; + IN: 'IN'; + REFRESH: 'REFRESH'; + + EQ: '=' | '=='; + + PLUS: '+'; + MINUS: '-'; + + STRING + : '\'' ( ~('\''|'\\') | ('\\' .) )* '\'' + | '"' ( ~('"'|'\\') | ('\\' .) )* '"' + ; + + BIGINT_LITERAL + : DIGIT+ 'L' + ; + + SMALLINT_LITERAL + : DIGIT+ 'S' + ; + + TINYINT_LITERAL + : DIGIT+ 'Y' + ; + + INTEGER_VALUE + : DIGIT+ + ; + + EXPONENT_VALUE + : DIGIT+ EXPONENT + | DECIMAL_DIGITS EXPONENT {isValidDecimal()}? + ; + + DECIMAL_VALUE + : DECIMAL_DIGITS {isValidDecimal()}? + ; + + FLOAT_LITERAL + : DIGIT+ EXPONENT? 'F' + | DECIMAL_DIGITS EXPONENT? 'F' {isValidDecimal()}? + ; + + DOUBLE_LITERAL + : DIGIT+ EXPONENT? 'D' + | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}? + ; + + BIGDECIMAL_LITERAL + : DIGIT+ EXPONENT? 
'BD' + | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}? + ; + + IDENTIFIER + : (LETTER | DIGIT | '_')+ + ; + + BACKQUOTED_IDENTIFIER + : '`' ( ~'`' | '``' )* '`' + ; + + fragment DECIMAL_DIGITS + : DIGIT+ '.' DIGIT* + | '.' DIGIT+ + ; + + fragment EXPONENT + : 'E' [+-]? DIGIT+ + ; + + fragment DIGIT + : [0-9] + ; + + fragment LETTER + : [A-Z] + ; + + SIMPLE_COMMENT + : '--' ~[\r\n]* '\r'? '\n'? -> channel(HIDDEN) + ; + + BRACKETED_COMMENT + : '/*' .*? '*/' -> channel(HIDDEN) + ; + + WS : [ \r\n\t]+ -> channel(HIDDEN) + ; + + // Catch-all for anything we can't recognize. + // We use this to be able to ignore and recover all the text + // when splitting statements with DelimiterLexer + UNRECOGNIZED + : . + ; diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java index 734e0c0ea7a91..9491e43e21fe8 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java @@ -18,12 +18,16 @@ package org.apache.hudi; +import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hadoop.fs.FileSystem; @@ -70,7 +74,7 @@ public static String latestCommit(FileSystem fs, String basePath) { */ @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, String basePath) { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); if (metaClient.getTableType().equals(HoodieTableType.MERGE_ON_READ)) { return metaClient.getActiveTimeline().getTimelineOfActions( CollectionUtils.createSet(HoodieActiveTimeline.COMMIT_ACTION, @@ -80,4 +84,17 @@ public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, Strin return metaClient.getCommitTimeline().filterCompletedInstants(); } } + + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + public static Option getClusteringPlan(FileSystem fs, String basePath, String instantTime) { + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()) + .setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); + HoodieInstant hoodieInstant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime); + Option> clusteringPlan = ClusteringUtils.getClusteringPlan(metaClient, hoodieInstant); + if (clusteringPlan.isPresent()) { + return Option.of(clusteringPlan.get().getValue()); + } else { + return Option.empty(); + } + } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java deleted file mode 100644 index c820ebef43a5a..0000000000000 --- 
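The statements recognized by the HoodieSqlCommon grammar above are submitted as ordinary Spark SQL once the Hudi session extension is registered. A hedged sketch of the statement shapes only; the table and procedure names are illustrative and assume an existing Hudi table:

import org.apache.spark.sql.SparkSession;

public class HoodieSqlCommonSketch {
  public static void main(String[] args) {
    // Assumes the Hudi Spark bundle is on the classpath.
    SparkSession spark = SparkSession.builder()
        .master("local[2]")
        .appName("hoodie-sql-common-sketch")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
        .getOrCreate();

    // CALL with named arguments (identifier '=>' expression), per the callArgument rule.
    spark.sql("CALL show_commits(table => 'hudi_trips', limit => 10)").show();
    // Compaction statements from the compactionStatement rule.
    spark.sql("SCHEDULE COMPACTION ON hudi_trips_mor");
    spark.sql("SHOW COMPACTION ON hudi_trips_mor LIMIT 5").show();

    spark.stop();
  }
}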
a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDatasetBulkInsertHelper.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi; - -import static org.apache.spark.sql.functions.callUDF; - -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.keygen.BuiltinKeyGenerator; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.api.java.UDF1; -import org.apache.spark.sql.functions; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructType; -import scala.collection.JavaConverters; - -/** - * Helper class to assist in preparing {@link Dataset}s for bulk insert with datasource implementation. - */ -public class HoodieDatasetBulkInsertHelper { - - private static final Logger LOG = LogManager.getLogger(HoodieDatasetBulkInsertHelper.class); - - private static final String RECORD_KEY_UDF_FN = "hudi_recordkey_gen_function"; - private static final String PARTITION_PATH_UDF_FN = "hudi_partition_gen_function"; - - /** - * Prepares input hoodie spark dataset for bulk insert. It does the following steps. - * 1. Uses KeyGenerator to generate hoodie record keys and partition path. - * 2. Add hoodie columns to input spark dataset. - * 3. Reorders input dataset columns so that hoodie columns appear in the beginning. - * 4. Sorts input dataset by hoodie partition path and record key - * - * @param sqlContext SQL Context - * @param config Hoodie Write Config - * @param rows Spark Input dataset - * @return hoodie dataset which is ready for bulk insert. 
- */ - public static Dataset prepareHoodieDatasetForBulkInsert(SQLContext sqlContext, - HoodieWriteConfig config, Dataset rows, String structName, String recordNamespace) { - List originalFields = - Arrays.stream(rows.schema().fields()).map(f -> new Column(f.name())).collect(Collectors.toList()); - - TypedProperties properties = new TypedProperties(); - properties.putAll(config.getProps()); - String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY()); - BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties); - StructType structTypeForUDF = rows.schema(); - - sqlContext.udf().register(RECORD_KEY_UDF_FN, (UDF1) keyGenerator::getRecordKey, DataTypes.StringType); - sqlContext.udf().register(PARTITION_PATH_UDF_FN, (UDF1) keyGenerator::getPartitionPath, DataTypes.StringType); - - final Dataset rowDatasetWithRecordKeys = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD, - callUDF(RECORD_KEY_UDF_FN, org.apache.spark.sql.functions.struct( - JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq()))); - - final Dataset rowDatasetWithRecordKeysAndPartitionPath = - rowDatasetWithRecordKeys.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD, - callUDF(PARTITION_PATH_UDF_FN, - org.apache.spark.sql.functions.struct( - JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq()))); - - // Add other empty hoodie fields which will be populated before writing to parquet. - Dataset rowDatasetWithHoodieColumns = - rowDatasetWithRecordKeysAndPartitionPath.withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD, - functions.lit("").cast(DataTypes.StringType)) - .withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, - functions.lit("").cast(DataTypes.StringType)) - .withColumn(HoodieRecord.FILENAME_METADATA_FIELD, - functions.lit("").cast(DataTypes.StringType)); - List orderedFields = Stream.concat(HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new), - originalFields.stream()).collect(Collectors.toList()); - Dataset colOrderedDataset = rowDatasetWithHoodieColumns.select( - JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq()); - - return colOrderedDataset - .sort(functions.col(HoodieRecord.PARTITION_PATH_METADATA_FIELD), functions.col(HoodieRecord.RECORD_KEY_METADATA_FIELD)) - .coalesce(config.getBulkInsertShuffleParallelism()); - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java index febdf190c40b4..453cbb4e748ac 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java @@ -19,10 +19,12 @@ package org.apache.hudi; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Schema; @@ -33,6 +35,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -99,7 +102,7 
@@ public int getNumExistingKeys() { } public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName, - long timestamp) { + long timestamp) { GenericRecord rec = new GenericData.Record(avroSchema); rec.put("uuid", rowKey); rec.put("ts", timestamp); @@ -121,11 +124,22 @@ public static GenericRecord generateGenericRecord(String rowKey, String riderNam */ public static OverwriteWithLatestAvroPayload generateRandomValue(HoodieKey key, String riderDriverSuffix) throws IOException { + // The timestamp generated is limited to range from 7 days before to now, to avoid generating too many + // partitionPaths when user use timestamp as partitionPath filed. GenericRecord rec = - generateGenericRecord(key.getRecordKey(), "rider-" + riderDriverSuffix, "driver-" + riderDriverSuffix, 0); + generateGenericRecord(key.getRecordKey(), "rider-" + riderDriverSuffix, "driver-" + + riderDriverSuffix, generateRangeRandomTimestamp(7)); return new OverwriteWithLatestAvroPayload(Option.of(rec)); } + /** + * Generate timestamp range from {@param daysTillNow} before to now. + */ + private static long generateRangeRandomTimestamp(int daysTillNow) { + long maxIntervalMillis = daysTillNow * 24 * 60 * 60 * 1000L; + return System.currentTimeMillis() - (long) (Math.random() * maxIntervalMillis); + } + /** * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys. */ @@ -138,7 +152,7 @@ public Stream generateInsertsStream(String randomString, Integer n existingKeys.put(currSize + i, key); numExistingKeys++; try { - return new HoodieRecord(key, generateRandomValue(key, randomString)); + return new HoodieAvroRecord(key, generateRandomValue(key, randomString)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -154,7 +168,7 @@ public List generateInserts(Integer n) throws IOException { } public HoodieRecord generateUpdateRecord(HoodieKey key, String randomString) throws IOException { - return new HoodieRecord(key, generateRandomValue(key, randomString)); + return new HoodieAvroRecord(key, generateRandomValue(key, randomString)); } /** @@ -164,15 +178,42 @@ public HoodieRecord generateUpdateRecord(HoodieKey key, String randomString) thr * @param n Number of updates (including dups) * @return list of hoodie record updates */ - public List generateUpdates(Integer n) throws IOException { + public List generateUpdates(Integer n) { + if (numExistingKeys == 0) { + throw new HoodieException("Data must have been written before performing the update operation"); + } String randomString = generateRandomString(); - List updates = new ArrayList<>(); - for (int i = 0; i < n; i++) { - HoodieKey key = existingKeys.get(rand.nextInt(numExistingKeys)); - HoodieRecord record = generateUpdateRecord(key, randomString); - updates.add(record); + return IntStream.range(0, n).boxed().map(x -> { + try { + return generateUpdateRecord(existingKeys.get(rand.nextInt(numExistingKeys)), randomString); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + }).collect(Collectors.toList()); + } + + /** + * Generates new updates, one for each of the keys above + * list + * + * @param n Number of updates (must be no more than number of existing keys) + * @return list of hoodie record updates + */ + public List generateUniqueUpdates(Integer n) { + if (numExistingKeys < n) { + throw new HoodieException("Data must have been written before performing the update operation"); } - return updates; + List keys = 
IntStream.range(0, numExistingKeys).boxed() + .collect(Collectors.toCollection(ArrayList::new)); + Collections.shuffle(keys); + String randomString = generateRandomString(); + return IntStream.range(0, n).boxed().map(x -> { + try { + return generateUpdateRecord(existingKeys.get(keys.get(x)), randomString); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + }).collect(Collectors.toList()); } /** @@ -182,8 +223,12 @@ public List generateUpdates(Integer n) throws IOException { * @return list of hoodie records to delete */ public List generateDeletes(List rows) { - return rows.stream().map(row -> - convertToString(row.getAs("uuid"), row.getAs("partitionpath"))).filter(os -> os.isPresent()).map(os -> os.get()) + // if row.length() == 2, then the record contains "uuid" and "partitionpath" fields, otherwise, + // another field "ts" is available + return rows.stream().map(row -> row.length() == 2 + ? convertToString(row.getAs("uuid"), row.getAs("partitionpath"), null) : + convertToString(row.getAs("uuid"), row.getAs("partitionpath"), row.getAs("ts")) + ).filter(os -> os.isPresent()).map(os -> os.get()) .collect(Collectors.toList()); } @@ -204,10 +249,10 @@ private static Option convertToString(HoodieRecord record) { } } - private static Option convertToString(String uuid, String partitionPath) { + private static Option convertToString(String uuid, String partitionPath, Long ts) { StringBuffer stringBuffer = new StringBuffer(); stringBuffer.append("{"); - stringBuffer.append("\"ts\": 0.0,"); + stringBuffer.append("\"ts\": \"" + (ts == null ? "0.0" : ts) + "\","); stringBuffer.append("\"uuid\": \"" + uuid + "\","); stringBuffer.append("\"partitionpath\": \"" + partitionPath + "\""); stringBuffer.append("}"); @@ -223,6 +268,8 @@ public static Map getQuickstartWriteConfigs() { Map demoConfigs = new HashMap<>(); demoConfigs.put("hoodie.insert.shuffle.parallelism", "2"); demoConfigs.put("hoodie.upsert.shuffle.parallelism", "2"); + demoConfigs.put("hoodie.bulkinsert.shuffle.parallelism", "2"); + demoConfigs.put("hoodie.delete.shuffle.parallelism", "2"); return demoConfigs; } -} \ No newline at end of file +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java new file mode 100644 index 0000000000000..8f76a062f574a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkFullBootstrapDataProviderBase.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
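The QuickstartUtils changes above (timestamps bounded to the last seven days, stream-based updates, the new generateUniqueUpdates, and extra shuffle-parallelism defaults) can be exercised roughly as follows. A minimal sketch, assuming the generator methods shown above live on QuickstartUtils.DataGenerator as in the released class; the record counts are arbitrary and writing the records out is omitted:

import org.apache.hudi.QuickstartUtils;
import org.apache.hudi.common.model.HoodieRecord;

import java.io.IOException;
import java.util.List;
import java.util.Map;

public class QuickstartUtilsSketch {
  public static void main(String[] args) throws IOException {
    QuickstartUtils.DataGenerator dataGen = new QuickstartUtils.DataGenerator();

    // Inserts now carry a "ts" within the last 7 days instead of a constant 0.
    List<HoodieRecord> inserts = dataGen.generateInserts(10);

    // Updates require that inserts were generated first; generateUniqueUpdates(n)
    // additionally requires n to be at most the number of existing keys.
    List<HoodieRecord> updates = dataGen.generateUniqueUpdates(5);

    // Besides insert/upsert parallelism, bulk-insert and delete shuffle parallelism now default to 2.
    Map<String, String> writeConfigs = QuickstartUtils.getQuickstartWriteConfigs();

    System.out.println(inserts.size() + " inserts, " + updates.size() + " updates, configs: " + writeConfigs);
  }
}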
+ */ + +package org.apache.hudi.bootstrap; + +import org.apache.hudi.DataSourceUtils; +import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.avro.model.HoodieFileStatus; +import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.bootstrap.FileStatusUtils; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; + +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SparkSession; + +import java.io.IOException; +import java.util.List; + +public abstract class SparkFullBootstrapDataProviderBase extends FullRecordBootstrapDataProvider> { + + private final transient SparkSession sparkSession; + + public SparkFullBootstrapDataProviderBase(TypedProperties props, + HoodieSparkEngineContext context) { + super(props, context); + this.sparkSession = SparkSession.builder().config(context.getJavaSparkContext().getConf()).getOrCreate(); + } + + @Override + public JavaRDD generateInputRecords(String tableName, String sourceBasePath, + List>> partitionPathsWithFiles) { + String[] filePaths = partitionPathsWithFiles.stream().map(Pair::getValue) + .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString())) + .toArray(String[]::new); + + // NOTE: "basePath" option is required for spark to discover the partition column + // More details at https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery + Dataset inputDataset = sparkSession.read().format(getFormat()).option("basePath", sourceBasePath).load(filePaths); + try { + KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props); + String structName = tableName + "_record"; + String namespace = "hoodie." + tableName; + RDD genericRecords = HoodieSparkUtils.createRdd(inputDataset, structName, namespace, false, + Option.empty()); + return genericRecords.toJavaRDD().map(gr -> { + try { + return DataSourceUtils.createHoodieRecord(gr, keyGenerator.getKey(gr), + props.getString("hoodie.datasource.write.payload.class")); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + }); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + } + + protected abstract String getFormat(); +} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkOrcBootstrapDataProvider.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkOrcBootstrapDataProvider.java new file mode 100644 index 0000000000000..9176d19366625 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkOrcBootstrapDataProvider.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.bootstrap; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.TypedProperties; + +/** + * Spark Data frame based bootstrap input provider. + */ +public class SparkOrcBootstrapDataProvider extends SparkFullBootstrapDataProviderBase { + + public SparkOrcBootstrapDataProvider(TypedProperties props, + HoodieSparkEngineContext context) { + super(props, context); + } + + @Override + protected String getFormat() { + return "orc"; + } +} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java index 6c5eb0ed5748d..e3bdbfe0aa888 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/bootstrap/SparkParquetBootstrapDataProvider.java @@ -18,66 +18,21 @@ package org.apache.hudi.bootstrap; -import org.apache.hudi.DataSourceUtils; -import org.apache.hudi.HoodieSparkUtils; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.model.HoodieFileStatus; -import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.bootstrap.FileStatusUtils; import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.keygen.KeyGenerator; - -import org.apache.avro.generic.GenericRecord; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -import java.io.IOException; -import java.util.List; /** * Spark Data frame based bootstrap input provider. 
*/ -public class SparkParquetBootstrapDataProvider extends FullRecordBootstrapDataProvider> { - - private final transient SparkSession sparkSession; +public class SparkParquetBootstrapDataProvider extends SparkFullBootstrapDataProviderBase { public SparkParquetBootstrapDataProvider(TypedProperties props, HoodieSparkEngineContext context) { super(props, context); - this.sparkSession = SparkSession.builder().config(context.getJavaSparkContext().getConf()).getOrCreate(); } @Override - public JavaRDD generateInputRecords(String tableName, String sourceBasePath, - List>> partitionPathsWithFiles) { - String[] filePaths = partitionPathsWithFiles.stream().map(Pair::getValue) - .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString())) - .toArray(String[]::new); - - Dataset inputDataset = sparkSession.read().parquet(filePaths); - try { - KeyGenerator keyGenerator = DataSourceUtils.createKeyGenerator(props); - String structName = tableName + "_record"; - String namespace = "hoodie." + tableName; - RDD genericRecords = HoodieSparkUtils.createRdd(inputDataset, structName, namespace); - return genericRecords.toJavaRDD().map(gr -> { - String orderingVal = HoodieAvroUtils.getNestedFieldValAsString( - gr, props.getString("hoodie.datasource.write.precombine.field"), false); - try { - return DataSourceUtils.createHoodieRecord(gr, orderingVal, keyGenerator.getKey(gr), - props.getString("hoodie.datasource.write.payload.class")); - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } - }); - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } + protected String getFormat() { + return "parquet"; } } \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/BootstrapExecutorUtils.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/BootstrapExecutorUtils.java new file mode 100644 index 0000000000000..97d3cfc441618 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/BootstrapExecutorUtils.java @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
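With the shared logic factored into SparkFullBootstrapDataProviderBase above, a full-record bootstrap source for another format only has to name the Spark datasource to read with. A hypothetical sketch, not part of this change; "avro" assumes spark-avro is available at runtime:

import org.apache.hudi.bootstrap.SparkFullBootstrapDataProviderBase;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.TypedProperties;

/**
 * Hypothetical provider mirroring the ORC and Parquet providers above: the base class reads the
 * source files and converts rows into HoodieRecords, the subclass only selects the input format.
 */
public class SparkAvroBootstrapDataProvider extends SparkFullBootstrapDataProviderBase {

  public SparkAvroBootstrapDataProvider(TypedProperties props, HoodieSparkEngineContext context) {
    super(props, context);
  }

  @Override
  protected String getFormat() {
    return "avro";
  }
}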
+ */ + +package org.apache.hudi.cli; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.hive.HiveSyncConfig; +import org.apache.hudi.hive.HiveSyncTool; +import org.apache.hudi.index.HoodieIndex; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; + +import java.io.IOException; +import java.io.Serializable; +import java.util.HashMap; + +import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.config.HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD; +import static org.apache.hudi.config.HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH; + +/** + * Performs bootstrap from a non-hudi source. + */ +public class BootstrapExecutorUtils implements Serializable { + + private static final Logger LOG = LogManager.getLogger(BootstrapExecutorUtils.class); + + /** + * Config. + */ + private final Config cfg; + + /** + * Spark context. + */ + private final transient JavaSparkContext jssc; + + /** + * Bag of properties with source, hoodie client, key generator etc. + */ + private final TypedProperties props; + + /** + * Hadoop Configuration. + */ + private final Configuration configuration; + + /** + * Bootstrap Configuration. + */ + private final HoodieWriteConfig bootstrapConfig; + + /** + * FileSystem instance. + */ + private final transient FileSystem fs; + + private final String bootstrapBasePath; + + public static final String CHECKPOINT_KEY = HoodieWriteConfig.DELTASTREAMER_CHECKPOINT_KEY; + + /** + * Bootstrap Executor. 
+ * + * @param cfg DeltaStreamer Config + * @param jssc Java Spark Context + * @param fs File System + * @param properties Bootstrap Writer Properties + * @throws IOException + */ + public BootstrapExecutorUtils(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf, + TypedProperties properties) throws IOException { + this.cfg = cfg; + this.jssc = jssc; + this.fs = fs; + this.configuration = conf; + this.props = properties; + + ValidationUtils.checkArgument(properties.containsKey(HoodieTableConfig.BOOTSTRAP_BASE_PATH + .key()), + HoodieTableConfig.BOOTSTRAP_BASE_PATH.key() + " must be specified."); + this.bootstrapBasePath = properties.getString(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key()); + + // Add more defaults if full bootstrap requested + this.props.putIfAbsent(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key(), + DataSourceWriteOptions.PAYLOAD_CLASS_NAME().defaultValue()); + /* + * Schema provider that supplies the command for reading the input and writing out the target table. + */ + SchemaProvider schemaProvider = createSchemaProvider(cfg.schemaProviderClass, props, jssc); + HoodieWriteConfig.Builder builder = + HoodieWriteConfig.newBuilder().withPath(cfg.basePath) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build()) + .forTable(cfg.tableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withAutoCommit(true) + .withProps(props); + + if (null != schemaProvider && null != schemaProvider.getTargetSchema()) { + builder = builder.withSchema(schemaProvider.getTargetSchema().toString()); + } + this.bootstrapConfig = builder.build(); + LOG.info("Created bootstrap executor with configs : " + bootstrapConfig.getProps()); + } + + public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg, + JavaSparkContext jssc) throws IOException { + try { + return StringUtils.isNullOrEmpty(schemaProviderClass) ? null + : (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); + } catch (Throwable e) { + throw new IOException("Could not load schema provider class " + schemaProviderClass, e); + } + } + + /** + * Executes Bootstrap. + */ + public void execute() throws IOException { + initializeTable(); + + try (SparkRDDWriteClient bootstrapClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jssc), bootstrapConfig)) { + HashMap checkpointCommitMetadata = new HashMap<>(); + checkpointCommitMetadata.put(CHECKPOINT_KEY, Config.checkpoint); + bootstrapClient.bootstrap(Option.of(checkpointCommitMetadata)); + syncHive(); + } + } + + /** + * Sync to Hive. 
+ */ + private void syncHive() { + if (cfg.enableHiveSync) { + TypedProperties metaProps = new TypedProperties(); + metaProps.putAll(props); + metaProps.put(META_SYNC_BASE_PATH.key(), cfg.basePath); + metaProps.put(META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat); + if (props.getBoolean(HIVE_SYNC_BUCKET_SYNC.key(), HIVE_SYNC_BUCKET_SYNC.defaultValue())) { + metaProps.put(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(BUCKET_INDEX_HASH_FIELD.key()), + props.getInteger(BUCKET_INDEX_NUM_BUCKETS.key()))); + } + + new HiveSyncTool(metaProps, configuration).syncHoodieTable(); + } + } + + private void initializeTable() throws IOException { + Path basePath = new Path(cfg.basePath); + if (fs.exists(basePath)) { + if (cfg.bootstrapOverwrite) { + LOG.warn("Target base path already exists, overwrite it"); + fs.delete(basePath, true); + } else { + throw new HoodieException("target base path already exists at " + cfg.basePath + + ". Cannot bootstrap data on top of an existing table"); + } + } + HoodieTableMetaClient.withPropertyBuilder() + .fromProperties(props) + .setTableType(cfg.tableType) + .setTableName(cfg.tableName) + .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) + .setPayloadClassName(cfg.payloadClass) + .setBaseFileFormat(cfg.baseFileFormat) + .setBootstrapIndexClass(cfg.bootstrapIndexClass) + .setBootstrapBasePath(bootstrapBasePath) + .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.basePath); + } + + public static class Config { + private String tableName; + private String tableType; + + private String basePath; + + private String baseFileFormat; + private String bootstrapIndexClass; + private String schemaProviderClass; + private String payloadClass; + private Boolean enableHiveSync; + + private Boolean bootstrapOverwrite; + + public static String checkpoint = null; + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public void setTableType(String tableType) { + this.tableType = tableType; + } + + public void setBasePath(String basePath) { + this.basePath = basePath; + } + + public void setBaseFileFormat(String baseFileFormat) { + this.baseFileFormat = baseFileFormat; + } + + public void setBootstrapIndexClass(String bootstrapIndexClass) { + this.bootstrapIndexClass = bootstrapIndexClass; + } + + public void setSchemaProviderClass(String schemaProviderClass) { + this.schemaProviderClass = schemaProviderClass; + } + + public void setPayloadClass(String payloadClass) { + this.payloadClass = payloadClass; + } + + public void setEnableHiveSync(Boolean enableHiveSync) { + this.enableHiveSync = enableHiveSync; + } + + public void setBootstrapOverwrite(Boolean bootstrapOverwrite) { + this.bootstrapOverwrite = bootstrapOverwrite; + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/HDFSParquetImporterUtils.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/HDFSParquetImporterUtils.java new file mode 100644 index 0000000000000..6937a3389b1f4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/HDFSParquetImporterUtils.java @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
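Driving BootstrapExecutorUtils above takes a populated Config plus properties that carry at least the bootstrap base path the constructor validates. A hedged sketch; the paths, table name and local Spark setup are placeholders, the schema provider is left unset (it is optional), and a real run also needs the usual key-generator and writer properties:

import org.apache.hudi.cli.BootstrapExecutorUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableConfig;

import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import java.io.IOException;

public class BootstrapExecutorSketch {
  public static void main(String[] args) throws IOException {
    JavaSparkContext jssc = new JavaSparkContext(
        new SparkConf().setMaster("local[2]").setAppName("bootstrap-executor-sketch"));

    String targetBasePath = "/tmp/hudi/bootstrap_target";   // placeholder
    String sourceBasePath = "/tmp/source/parquet_table";    // placeholder: existing non-Hudi data

    BootstrapExecutorUtils.Config cfg = new BootstrapExecutorUtils.Config();
    cfg.setTableName("bootstrap_target");
    cfg.setTableType("COPY_ON_WRITE");
    cfg.setBasePath(targetBasePath);
    cfg.setBaseFileFormat("PARQUET");
    cfg.setPayloadClass("org.apache.hudi.common.model.OverwriteWithLatestAvroPayload");
    cfg.setEnableHiveSync(false);
    cfg.setBootstrapOverwrite(true);   // otherwise an existing target base path is treated as an error

    TypedProperties props = new TypedProperties();
    props.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), sourceBasePath);

    FileSystem fs = FSUtils.getFs(targetBasePath, jssc.hadoopConfiguration());
    new BootstrapExecutorUtils(cfg, jssc, fs, jssc.hadoopConfiguration(), props)
        .execute();   // initializes the table, bootstraps, and optionally syncs to Hive

    jssc.stop();
  }
}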
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.HoodieJsonPayload; +import org.apache.hudi.common.config.DFSPropertiesConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.util.LongAccumulator; +import scala.Tuple2; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Serializable; +import java.io.StringReader; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +/** + * Loads data from Parquet Sources. 
+ */ +public class HDFSParquetImporterUtils implements Serializable { + + private static final long serialVersionUID = 1L; + private static final Logger LOG = LogManager.getLogger(HDFSParquetImporterUtils.class); + private static final DateTimeFormatter PARTITION_FORMATTER = DateTimeFormatter.ofPattern("yyyy/MM/dd") + .withZone(ZoneId.systemDefault()); + + private final String command; + private final String srcPath; + private final String targetPath; + private final String tableName; + private final String tableType; + private final String rowKey; + private final String partitionKey; + private final int parallelism; + private final String schemaFile; + private int retry; + private final String propsFilePath; + private final List configs = new ArrayList<>(); + private TypedProperties props; + + public HDFSParquetImporterUtils( + String command, + String srcPath, + String targetPath, + String tableName, + String tableType, + String rowKey, + String partitionKey, + int parallelism, + String schemaFile, + int retry, + String propsFilePath) { + this.command = command; + this.srcPath = srcPath; + this.targetPath = targetPath; + this.tableName = tableName; + this.tableType = tableType; + this.rowKey = rowKey; + this.partitionKey = partitionKey; + this.parallelism = parallelism; + this.schemaFile = schemaFile; + this.retry = retry; + this.propsFilePath = propsFilePath; + } + + public boolean isUpsert() { + return "upsert".equalsIgnoreCase(this.command); + } + + public int dataImport(JavaSparkContext jsc) { + FileSystem fs = FSUtils.getFs(this.targetPath, jsc.hadoopConfiguration()); + this.props = this.propsFilePath == null || this.propsFilePath.isEmpty() ? buildProperties(this.configs) + : readConfig(fs.getConf(), new Path(this.propsFilePath), this.configs).getProps(true); + LOG.info("Starting data import with configs : " + props.toString()); + int ret = -1; + try { + // Verify that targetPath is not present. + if (fs.exists(new Path(this.targetPath)) && !isUpsert()) { + throw new HoodieIOException(String.format("Make sure %s is not present.", this.targetPath)); + } + do { + ret = dataImport(jsc, fs); + } while (ret != 0 && retry-- > 0); + } catch (Throwable t) { + LOG.error("dataImport failed", t); + } + return ret; + } + + public int dataImport(JavaSparkContext jsc, FileSystem fs) { + try { + if (fs.exists(new Path(this.targetPath)) && !isUpsert()) { + // cleanup target directory. + fs.delete(new Path(this.targetPath), true); + } + + if (!fs.exists(new Path(this.targetPath))) { + // Initialize target hoodie table. + Properties properties = HoodieTableMetaClient.withPropertyBuilder() + .setTableName(this.tableName) + .setTableType(this.tableType) + .build(); + HoodieTableMetaClient.initTableAndGetMetaClient(jsc.hadoopConfiguration(), this.targetPath, properties); + } + + // Get schema. + String schemaStr = parseSchema(fs, this.schemaFile); + + SparkRDDWriteClient client = + createHoodieClient(jsc, this.targetPath, schemaStr, this.parallelism, Option.empty(), props); + + JavaRDD> hoodieRecords = buildHoodieRecordsForImport(jsc, schemaStr); + // Get instant time. 
+ String instantTime = client.startCommit(); + JavaRDD writeResponse = load(client, instantTime, hoodieRecords); + return handleErrors(jsc, instantTime, writeResponse); + } catch (Throwable t) { + LOG.error("Error occurred.", t); + } + return -1; + } + + public JavaRDD> buildHoodieRecordsForImport(JavaSparkContext jsc, + String schemaStr) throws IOException { + Job job = Job.getInstance(jsc.hadoopConfiguration()); + // Allow recursive directories to be found + job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true"); + // To parallelize reading file status. + job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024"); + AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr))); + ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class)); + + HoodieEngineContext context = new HoodieSparkEngineContext(jsc); + context.setJobStatus(this.getClass().getSimpleName(), "Build records for import: " + this.tableName); + return jsc.newAPIHadoopFile(this.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class, + job.getConfiguration()) + // To reduce large number of tasks. + .coalesce(16 * this.parallelism).map(entry -> { + GenericRecord genericRecord = ((Tuple2) entry)._2(); + Object partitionField = genericRecord.get(this.partitionKey); + if (partitionField == null) { + throw new HoodieIOException("partition key is missing. :" + this.partitionKey); + } + Object rowField = genericRecord.get(this.rowKey); + if (rowField == null) { + throw new HoodieIOException("row field is missing. :" + this.rowKey); + } + String partitionPath = partitionField.toString(); + LOG.debug("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")"); + if (partitionField instanceof Number) { + try { + long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L); + partitionPath = PARTITION_FORMATTER.format(Instant.ofEpochMilli(ts)); + } catch (NumberFormatException nfe) { + LOG.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")"); + } + } + return new HoodieAvroRecord<>(new HoodieKey(rowField.toString(), partitionPath), + new HoodieJsonPayload(genericRecord.toString())); + }); + } + + /** + * Imports records to Hoodie table. 
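The partition-path handling in buildHoodieRecordsForImport above treats a numeric partition field as epoch seconds and renders it as yyyy/MM/dd, keeping the raw value when parsing fails. A self-contained illustration with a made-up value; the exact output depends on the system time zone, as in the importer:

import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;

public class PartitionPathSketch {
  public static void main(String[] args) {
    DateTimeFormatter partitionFormatter =
        DateTimeFormatter.ofPattern("yyyy/MM/dd").withZone(ZoneId.systemDefault());

    Object partitionField = 1577836800.0;   // hypothetical: 2020-01-01T00:00:00Z expressed in epoch seconds
    String partitionPath = partitionField.toString();
    if (partitionField instanceof Number) {
      try {
        long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
        partitionPath = partitionFormatter.format(Instant.ofEpochMilli(ts));
      } catch (NumberFormatException nfe) {
        // fall back to the raw value, as the importer does
      }
    }
    System.out.println(partitionPath);   // "2020/01/01" in UTC and later zones, "2019/12/31" in zones behind UTC
  }
}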
+   *
+   * @param client Hoodie Client
+   * @param instantTime Instant Time
+   * @param hoodieRecords Hoodie Records
+   * @param <T> Type
+   */
+  public <T extends HoodieRecordPayload> JavaRDD<WriteStatus> load(SparkRDDWriteClient<T> client, String instantTime,
+      JavaRDD<HoodieRecord<T>> hoodieRecords) {
+    switch (this.command.toLowerCase()) {
+      case "upsert": {
+        return client.upsert(hoodieRecords, instantTime);
+      }
+      case "bulkinsert": {
+        return client.bulkInsert(hoodieRecords, instantTime);
+      }
+      default: {
+        return client.insert(hoodieRecords, instantTime);
+      }
+    }
+  }
+
+  public static TypedProperties buildProperties(List<String> props) {
+    TypedProperties properties = DFSPropertiesConfiguration.getGlobalProps();
+    props.forEach(x -> {
+      String[] kv = x.split("=");
+      ValidationUtils.checkArgument(kv.length == 2);
+      properties.setProperty(kv[0], kv[1]);
+    });
+    return properties;
+  }
+
+  public static DFSPropertiesConfiguration readConfig(Configuration hadoopConfig, Path cfgPath, List<String> overriddenProps) {
+    DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(hadoopConfig, cfgPath);
+    try {
+      if (!overriddenProps.isEmpty()) {
+        LOG.info("Adding overridden properties to file properties.");
+        conf.addPropsFromStream(new BufferedReader(new StringReader(String.join("\n", overriddenProps))));
+      }
+    } catch (IOException ioe) {
+      throw new HoodieIOException("Unexpected error adding config overrides", ioe);
+    }
+
+    return conf;
+  }
+
+  /**
+   * Build Hoodie write client.
+   *
+   * @param jsc Java Spark Context
+   * @param basePath Base Path
+   * @param schemaStr Schema
+   * @param parallelism Parallelism
+   */
+  public static SparkRDDWriteClient<HoodieRecordPayload> createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr,
+      int parallelism, Option<String> compactionStrategyClass, TypedProperties properties) {
+    HoodieCompactionConfig compactionConfig = compactionStrategyClass
+        .map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false)
+            .withCompactionStrategy(ReflectionUtils.loadClass(strategy)).build())
+        .orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build());
+    HoodieWriteConfig config =
+        HoodieWriteConfig.newBuilder().withPath(basePath)
+            .withParallelism(parallelism, parallelism)
+            .withBulkInsertParallelism(parallelism)
+            .withDeleteParallelism(parallelism)
+            .withSchema(schemaStr).combineInput(true, true).withCompactionConfig(compactionConfig)
+            .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
+            .withProps(properties).build();
+    return new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), config);
+  }
+
+  /**
+   * Parse Schema from file.
+   *
+   * @param fs File System
+   * @param schemaFile Schema File
+   */
+  public static String parseSchema(FileSystem fs, String schemaFile) throws Exception {
+    // Read schema file.
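+    // The whole schema file is buffered in memory below, so the schema file is expected to be reasonably small.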
+ Path p = new Path(schemaFile); + if (!fs.exists(p)) { + throw new Exception(String.format("Could not find - %s - schema file.", schemaFile)); + } + long len = fs.getFileStatus(p).getLen(); + ByteBuffer buf = ByteBuffer.allocate((int) len); + try (FSDataInputStream inputStream = fs.open(p)) { + inputStream.readFully(0, buf.array(), 0, buf.array().length); + } + return new String(buf.array(), StandardCharsets.UTF_8); + } + + public static int handleErrors(JavaSparkContext jsc, String instantTime, JavaRDD writeResponse) { + LongAccumulator errors = jsc.sc().longAccumulator(); + writeResponse.foreach(writeStatus -> { + if (writeStatus.hasErrors()) { + errors.add(1); + LOG.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString())); + } + }); + if (errors.value() == 0) { + LOG.info(String.format("Table imported into hoodie with %s instant time.", instantTime)); + return 0; + } + LOG.error(String.format("Import failed with %d errors.", errors.value())); + return -1; + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/SchemaProvider.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/SchemaProvider.java new file mode 100644 index 0000000000000..de6770bf3038b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/cli/SchemaProvider.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli; + +import org.apache.avro.Schema; +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; +import org.apache.hudi.PublicAPIMethod; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.spark.api.java.JavaSparkContext; + +import java.io.Serializable; + +/** + * Class to provide schema for reading data and also writing into a Hoodie table, + * used by deltastreamer (runs over Spark). 
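+ * Implementations supply the source schema; getTargetSchema() falls back to the source schema unless overridden.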
+ */ +@PublicAPIClass(maturity = ApiMaturityLevel.STABLE) +public abstract class SchemaProvider implements Serializable { + + protected TypedProperties config; + + protected JavaSparkContext jssc; + + public SchemaProvider(TypedProperties props) { + this(props, null); + } + + protected SchemaProvider(TypedProperties props, JavaSparkContext jssc) { + this.config = props; + this.jssc = jssc; + } + + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + public abstract Schema getSourceSchema(); + + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + public Schema getTargetSchema() { + // by default, use source schema as target for hoodie table as well + return getSourceSchema(); + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java index d0e1326761076..1411d4f4796c0 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java @@ -18,14 +18,8 @@ package org.apache.hudi.payload; -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; -import org.apache.hudi.common.util.Option; - -import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; - -import java.io.IOException; +import org.apache.hudi.common.util.Option; /** * Provides support for seamlessly applying changes captured via Amazon Database Migration Service onto S3. @@ -41,43 +35,14 @@ * This payload implementation will issue matching insert, delete, updates against the hudi table * */ -public class AWSDmsAvroPayload extends OverwriteWithLatestAvroPayload { - - public static final String OP_FIELD = "Op"; +@Deprecated +public class AWSDmsAvroPayload extends org.apache.hudi.common.model.AWSDmsAvroPayload { public AWSDmsAvroPayload(GenericRecord record, Comparable orderingVal) { super(record, orderingVal); } public AWSDmsAvroPayload(Option record) { - this(record.get(), 0); // natural order - } - - /** - * - * Handle a possible delete - check for "D" in Op column and return empty row if found. - * @param insertValue The new row that is being "inserted". - */ - private Option handleDeleteOperation(IndexedRecord insertValue) throws IOException { - boolean delete = false; - if (insertValue instanceof GenericRecord) { - GenericRecord record = (GenericRecord) insertValue; - delete = record.get(OP_FIELD) != null && record.get(OP_FIELD).toString().equalsIgnoreCase("D"); - } - - return delete ? 
Option.empty() : Option.of(insertValue); - } - - @Override - public Option getInsertValue(Schema schema) throws IOException { - IndexedRecord insertValue = super.getInsertValue(schema).get(); - return handleDeleteOperation(insertValue); - } - - @Override - public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) - throws IOException { - IndexedRecord insertValue = super.getInsertValue(schema).get(); - return handleDeleteOperation(insertValue); + super(record); } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/hudi-spark-datasource/hudi-spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister deleted file mode 100644 index d751e13c771bf..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ /dev/null @@ -1,19 +0,0 @@ - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -org.apache.hudi.DefaultSource \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala deleted file mode 100644 index 4a7837816976b..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi - -import org.apache.hadoop.fs.Path -import org.apache.hudi.DataSourceReadOptions._ -import org.apache.hudi.common.model.HoodieTableType -import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION_OPT_KEY} -import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.table.HoodieTableMetaClient -import org.apache.hudi.exception.HoodieException -import org.apache.hudi.hadoop.HoodieROTablePathFilter -import org.apache.log4j.LogManager -import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.execution.streaming.Sink -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.streaming.OutputMode -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} - -/** - * Hoodie Spark Datasource, for reading and writing hoodie tables - * - */ -class DefaultSource extends RelationProvider - with SchemaRelationProvider - with CreatableRelationProvider - with DataSourceRegister - with StreamSinkProvider - with Serializable { - - private val log = LogManager.getLogger(classOf[DefaultSource]) - - override def createRelation(sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = { - createRelation(sqlContext, parameters, null) - } - - override def createRelation(sqlContext: SQLContext, - optParams: Map[String, String], - schema: StructType): BaseRelation = { - // Add default options for unspecified read options keys. - val parameters = translateViewTypesToQueryTypes(optParams) - - val path = parameters.get("path") - val readPathsStr = parameters.get(DataSourceReadOptions.READ_PATHS_OPT_KEY) - if (path.isEmpty && readPathsStr.isEmpty) { - throw new HoodieException(s"'path' or '$READ_PATHS_OPT_KEY' or both must be specified.") - } - - val readPaths = readPathsStr.map(p => p.split(",").toSeq).getOrElse(Seq()) - val allPaths = path.map(p => Seq(p)).getOrElse(Seq()) ++ readPaths - - val fs = FSUtils.getFs(allPaths.head, sqlContext.sparkContext.hadoopConfiguration) - val globPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(allPaths, fs) - - val tablePath = DataSourceUtils.getTablePath(fs, globPaths.toArray) - log.info("Obtained hudi table path: " + tablePath) - - val metaClient = new HoodieTableMetaClient(fs.getConf, tablePath) - val isBootstrappedTable = metaClient.getTableConfig.getBootstrapBasePath.isPresent - log.info("Is bootstrapped table => " + isBootstrappedTable) - - if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_SNAPSHOT_OPT_VAL)) { - if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) { - if (isBootstrappedTable) { - // Snapshot query is not supported for Bootstrapped MOR tables - log.warn("Snapshot query is not supported for Bootstrapped Merge-on-Read tables." 
+ - " Falling back to Read Optimized query.") - new HoodieBootstrapRelation(sqlContext, schema, globPaths, metaClient, optParams) - } else { - new MergeOnReadSnapshotRelation(sqlContext, optParams, schema, globPaths, metaClient) - } - } else { - getBaseFileOnlyView(sqlContext, parameters, schema, readPaths, isBootstrappedTable, globPaths, metaClient) - } - } else if(parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)) { - getBaseFileOnlyView(sqlContext, parameters, schema, readPaths, isBootstrappedTable, globPaths, metaClient) - } else if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_INCREMENTAL_OPT_VAL)) { - new IncrementalRelation(sqlContext, tablePath, optParams, schema) - } else { - throw new HoodieException("Invalid query type :" + parameters(QUERY_TYPE_OPT_KEY)) - } - } - - /** - * This DataSource API is used for writing the DataFrame at the destination. For now, we are returning a dummy - * relation here because Spark does not really make use of the relation returned, and just returns an empty - * dataset at [[org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run()]]. This saves us the cost - * of creating and returning a parquet relation here. - * - * TODO: Revisit to return a concrete relation here when we support CREATE TABLE AS for Hudi with DataSource API. - * That is the only case where Spark seems to actually need a relation to be returned here - * [[DataSource.writeAndRead()]] - * - * @param sqlContext Spark SQL Context - * @param mode Mode for saving the DataFrame at the destination - * @param optParams Parameters passed as part of the DataFrame write operation - * @param df Spark DataFrame to be written - * @return Spark Relation - */ - override def createRelation(sqlContext: SQLContext, - mode: SaveMode, - optParams: Map[String, String], - df: DataFrame): BaseRelation = { - val parameters = HoodieWriterUtils.parametersWithWriteDefaults(optParams) - if (parameters(OPERATION_OPT_KEY).equals(BOOTSTRAP_OPERATION_OPT_VAL)) { - HoodieSparkSqlWriter.bootstrap(sqlContext, mode, parameters, df) - } else { - HoodieSparkSqlWriter.write(sqlContext, mode, parameters, df) - } - new HoodieEmptyRelation(sqlContext, df.schema) - } - - override def createSink(sqlContext: SQLContext, - optParams: Map[String, String], - partitionColumns: Seq[String], - outputMode: OutputMode): Sink = { - val parameters = HoodieWriterUtils.parametersWithWriteDefaults(optParams) - new HoodieStreamingSink( - sqlContext, - parameters, - partitionColumns, - outputMode) - } - - override def shortName(): String = "hudi" - - private def getBaseFileOnlyView(sqlContext: SQLContext, - optParams: Map[String, String], - schema: StructType, - extraReadPaths: Seq[String], - isBootstrappedTable: Boolean, - globPaths: Seq[Path], - metaClient: HoodieTableMetaClient): BaseRelation = { - log.warn("Loading Base File Only View.") - - if (isBootstrappedTable) { - // For bootstrapped tables, use our custom Spark relation for querying - new HoodieBootstrapRelation(sqlContext, schema, globPaths, metaClient, optParams) - } else { - // this is just effectively RO view only, where `path` can contain a mix of - // non-hoodie/hoodie path files. 
set the path filter up - sqlContext.sparkContext.hadoopConfiguration.setClass( - "mapreduce.input.pathFilter.class", - classOf[HoodieROTablePathFilter], - classOf[org.apache.hadoop.fs.PathFilter]) - - log.info("Constructing hoodie (as parquet) data source with options :" + optParams) - // simply return as a regular parquet relation - DataSource.apply( - sparkSession = sqlContext.sparkSession, - paths = extraReadPaths, - userSpecifiedSchema = Option(schema), - className = "parquet", - options = optParams) - .resolveRelation() - } - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala deleted file mode 100644 index a1e9947cafacf..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi - -import org.apache.hadoop.fs.Path -import org.apache.hudi.common.model.HoodieBaseFile -import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.table.view.HoodieTableFileSystemView -import org.apache.hudi.exception.HoodieException -import org.apache.spark.internal.Logging -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat -import org.apache.spark.sql.{Row, SQLContext} -import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} -import org.apache.spark.sql.types.StructType - -import scala.collection.JavaConverters._ - -/** - * This is Spark relation that can be used for querying metadata/fully bootstrapped query hoodie tables, as well as - * non-bootstrapped tables. It implements PrunedFilteredScan interface in order to support column pruning and filter - * push-down. For metadata bootstrapped files, if we query columns from both metadata and actual data then it will - * perform a merge of both to return the result. - * - * Caveat: Filter push-down does not work when querying both metadata and actual data columns over metadata - * bootstrapped files, because then the metadata file and data file can return different number of rows causing errors - * merging. 
- * - * @param _sqlContext Spark SQL Context - * @param userSchema User specified schema in the datasource query - * @param globPaths Globbed paths obtained from the user provided path for querying - * @param metaClient Hoodie table meta client - * @param optParams DataSource options passed by the user - */ -class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext, - val userSchema: StructType, - val globPaths: Seq[Path], - val metaClient: HoodieTableMetaClient, - val optParams: Map[String, String]) extends BaseRelation - with PrunedFilteredScan with Logging { - - val skeletonSchema: StructType = HoodieSparkUtils.getMetaSchema - var dataSchema: StructType = _ - var fullSchema: StructType = _ - - val fileIndex: HoodieBootstrapFileIndex = buildFileIndex() - - override def sqlContext: SQLContext = _sqlContext - - override val needConversion: Boolean = false - - override def schema: StructType = inferFullSchema() - - override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { - logInfo("Starting scan..") - - // Compute splits - val bootstrapSplits = fileIndex.files.map(hoodieBaseFile => { - var skeletonFile: Option[PartitionedFile] = Option.empty - var dataFile: PartitionedFile = null - - if (hoodieBaseFile.getBootstrapBaseFile.isPresent) { - skeletonFile = Option(PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen)) - dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getBootstrapBaseFile.get().getPath, 0, - hoodieBaseFile.getBootstrapBaseFile.get().getFileLen) - } else { - dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen) - } - HoodieBootstrapSplit(dataFile, skeletonFile) - }) - val tableState = HoodieBootstrapTableState(bootstrapSplits) - - // Get required schemas for column pruning - var requiredDataSchema = StructType(Seq()) - var requiredSkeletonSchema = StructType(Seq()) - requiredColumns.foreach(col => { - var field = dataSchema.find(_.name == col) - if (field.isDefined) { - requiredDataSchema = requiredDataSchema.add(field.get) - } else { - field = skeletonSchema.find(_.name == col) - requiredSkeletonSchema = requiredSkeletonSchema.add(field.get) - } - }) - - // Prepare readers for reading data file and skeleton files - val dataReadFunction = new ParquetFileFormat() - .buildReaderWithPartitionValues( - sparkSession = _sqlContext.sparkSession, - dataSchema = dataSchema, - partitionSchema = StructType(Seq.empty), - requiredSchema = requiredDataSchema, - filters = if (requiredSkeletonSchema.isEmpty) filters else Seq() , - options = Map.empty, - hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() - ) - - val skeletonReadFunction = new ParquetFileFormat() - .buildReaderWithPartitionValues( - sparkSession = _sqlContext.sparkSession, - dataSchema = skeletonSchema, - partitionSchema = StructType(Seq.empty), - requiredSchema = requiredSkeletonSchema, - filters = if (requiredDataSchema.isEmpty) filters else Seq(), - options = Map.empty, - hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() - ) - - val regularReadFunction = new ParquetFileFormat() - .buildReaderWithPartitionValues( - sparkSession = _sqlContext.sparkSession, - dataSchema = fullSchema, - partitionSchema = StructType(Seq.empty), - requiredSchema = StructType(requiredSkeletonSchema.fields ++ requiredDataSchema.fields), - filters = filters, - options = Map.empty, - hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf()) - - val rdd = new 
HoodieBootstrapRDD(_sqlContext.sparkSession, dataReadFunction, skeletonReadFunction, - regularReadFunction, requiredDataSchema, requiredSkeletonSchema, requiredColumns, tableState) - rdd.asInstanceOf[RDD[Row]] - } - - def inferFullSchema(): StructType = { - if (fullSchema == null) { - logInfo("Inferring schema..") - val schemaResolver = new TableSchemaResolver(metaClient) - val tableSchema = schemaResolver.getTableAvroSchemaWithoutMetadataFields - dataSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) - fullSchema = StructType(skeletonSchema.fields ++ dataSchema.fields) - } - fullSchema - } - - def buildFileIndex(): HoodieBootstrapFileIndex = { - logInfo("Building file index..") - val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(_sqlContext.sparkSession, globPaths) - val fileStatuses = inMemoryFileIndex.allFiles() - - if (fileStatuses.isEmpty) { - throw new HoodieException("No files found for reading in user provided path.") - } - - val fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline.getCommitsTimeline - .filterCompletedInstants, fileStatuses.toArray) - val latestFiles: List[HoodieBaseFile] = fsView.getLatestBaseFiles.iterator().asScala.toList - - if (log.isDebugEnabled) { - latestFiles.foreach(file => { - logDebug("Printing indexed files:") - if (file.getBootstrapBaseFile.isPresent) { - logDebug("Skeleton File: " + file.getPath + ", Data File: " + file.getBootstrapBaseFile.get().getPath) - } else { - logDebug("Regular Hoodie File: " + file.getPath) - } - }) - } - - HoodieBootstrapFileIndex(latestFiles) - } -} - -case class HoodieBootstrapFileIndex(files: List[HoodieBaseFile]) - -case class HoodieBootstrapTableState(files: List[HoodieBootstrapSplit]) - -case class HoodieBootstrapSplit(dataFile: PartitionedFile, skeletonFile: Option[PartitionedFile]) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala deleted file mode 100644 index e8caa63912397..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi - -import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner -import org.apache.hudi.exception.HoodieException -import org.apache.hudi.hadoop.config.HoodieRealtimeConfig -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS -import org.apache.avro.Schema -import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} -import org.apache.hadoop.conf.Configuration -import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeProjection} -import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.vectorized.ColumnarBatch - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.util.Try - -case class HoodieMergeOnReadPartition(index: Int, split: HoodieMergeOnReadFileSplit) extends Partition - -class HoodieMergeOnReadRDD(@transient sc: SparkContext, - @transient config: Configuration, - fullSchemaFileReader: PartitionedFile => Iterator[Any], - requiredSchemaFileReader: PartitionedFile => Iterator[Any], - tableState: HoodieMergeOnReadTableState) - extends RDD[InternalRow](sc, Nil) { - - private val confBroadcast = sc.broadcast(new SerializableWritable(config)) - - override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { - val mergeParquetPartition = split.asInstanceOf[HoodieMergeOnReadPartition] - mergeParquetPartition.split match { - case dataFileOnlySplit if dataFileOnlySplit.logPaths.isEmpty => - read(mergeParquetPartition.split.dataFile, requiredSchemaFileReader) - case skipMergeSplit if skipMergeSplit.mergeType - .equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) => - skipMergeFileIterator( - skipMergeSplit, - read(mergeParquetPartition.split.dataFile, requiredSchemaFileReader), - getConfig - ) - case payloadCombineSplit if payloadCombineSplit.mergeType - .equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) => - payloadCombineFileIterator( - payloadCombineSplit, - read(mergeParquetPartition.split.dataFile, fullSchemaFileReader), - getConfig - ) - case _ => throw new HoodieException(s"Unable to select an Iterator to read the Hoodie MOR File Split for " + - s"file path: ${mergeParquetPartition.split.dataFile.filePath}" + - s"log paths: ${mergeParquetPartition.split.logPaths.toString}" + - s"hoodie table path: ${mergeParquetPartition.split.tablePath}" + - s"spark partition Index: ${mergeParquetPartition.index}" + - s"merge type: ${mergeParquetPartition.split.mergeType}") - } - } - - override protected def getPartitions: Array[Partition] = { - tableState - .hoodieRealtimeFileSplits - .zipWithIndex - .map(file => HoodieMergeOnReadPartition(file._2, file._1)).toArray - } - - private def getConfig: Configuration = { - val conf = confBroadcast.value.value - HoodieMergeOnReadRDD.CONFIG_INSTANTIATION_LOCK.synchronized { - new Configuration(conf) - } - } - - private def read(partitionedFile: PartitionedFile, - readFileFunction: PartitionedFile => Iterator[Any]): Iterator[InternalRow] = { - val fileIterator = readFileFunction(partitionedFile) - val rows = fileIterator.flatMap(_ match { - case r: InternalRow => Seq(r) - case b: ColumnarBatch => b.rowIterator().asScala - }) - rows - } - - private def 
skipMergeFileIterator(split: HoodieMergeOnReadFileSplit, - baseFileIterator: Iterator[InternalRow], - config: Configuration): Iterator[InternalRow] = - new Iterator[InternalRow] { - private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema) - private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema) - private val requiredFieldPosition = - tableState.requiredStructSchema - .map(f => tableAvroSchema.getField(f.name).pos()).toList - private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) - private val deserializer = new AvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) - private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) - private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords - private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala - - private var recordToLoad: InternalRow = _ - - @scala.annotation.tailrec - override def hasNext: Boolean = { - if (baseFileIterator.hasNext) { - recordToLoad = baseFileIterator.next() - true - } else { - if (logRecordsKeyIterator.hasNext) { - val curAvrokey = logRecordsKeyIterator.next() - val curAvroRecord = logRecords.get(curAvrokey).getData.getInsertValue(tableAvroSchema) - if (!curAvroRecord.isPresent) { - // delete record found, skipping - this.hasNext - } else { - val requiredAvroRecord = AvroConversionUtils - .buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, requiredFieldPosition, recordBuilder) - recordToLoad = unsafeProjection(deserializer.deserialize(requiredAvroRecord).asInstanceOf[InternalRow]) - true - } - } else { - false - } - } - } - - override def next(): InternalRow = { - recordToLoad - } - } - - private def payloadCombineFileIterator(split: HoodieMergeOnReadFileSplit, - baseFileIterator: Iterator[InternalRow], - config: Configuration): Iterator[InternalRow] = - new Iterator[InternalRow] { - private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema) - private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema) - private val requiredFieldPosition = - tableState.requiredStructSchema - .map(f => tableAvroSchema.getField(f.name).pos()).toList - private val serializer = new AvroSerializer(tableState.tableStructSchema, tableAvroSchema, false) - private val requiredDeserializer = new AvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) - private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) - private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) - private val logRecords = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config).getRecords - private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala - private val keyToSkip = mutable.Set.empty[String] - - private var recordToLoad: InternalRow = _ - - @scala.annotation.tailrec - override def hasNext: Boolean = { - if (baseFileIterator.hasNext) { - val curRow = baseFileIterator.next() - val curKey = curRow.getString(HOODIE_RECORD_KEY_COL_POS) - if (logRecords.containsKey(curKey)) { - // duplicate key found, merging - keyToSkip.add(curKey) - val mergedAvroRecord = mergeRowWithLog(curRow, curKey) - if (!mergedAvroRecord.isPresent) { - // deleted - this.hasNext - } else { - // load merged record as InternalRow with required schema - val requiredAvroRecord = AvroConversionUtils - .buildAvroRecordBySchema( - mergedAvroRecord.get(), - requiredAvroSchema, - 
requiredFieldPosition, - recordBuilder - ) - recordToLoad = unsafeProjection(requiredDeserializer - .deserialize(requiredAvroRecord).asInstanceOf[InternalRow]) - true - } - } else { - // No merge needed, load current row with required schema - recordToLoad = unsafeProjection(createRowWithRequiredSchema(curRow)) - true - } - } else { - if (logRecordsKeyIterator.hasNext) { - val curKey = logRecordsKeyIterator.next() - if (keyToSkip.contains(curKey)) { - this.hasNext - } else { - val insertAvroRecord = - logRecords.get(curKey).getData.getInsertValue(tableAvroSchema) - if (!insertAvroRecord.isPresent) { - // stand alone delete record, skipping - this.hasNext - } else { - val requiredAvroRecord = AvroConversionUtils - .buildAvroRecordBySchema( - insertAvroRecord.get(), - requiredAvroSchema, - requiredFieldPosition, - recordBuilder - ) - recordToLoad = unsafeProjection(requiredDeserializer - .deserialize(requiredAvroRecord).asInstanceOf[InternalRow]) - true - } - } - } else { - false - } - } - } - - override def next(): InternalRow = recordToLoad - - private def createRowWithRequiredSchema(row: InternalRow): InternalRow = { - val rowToReturn = new SpecificInternalRow(tableState.requiredStructSchema) - val posIterator = requiredFieldPosition.iterator - var curIndex = 0 - tableState.requiredStructSchema.foreach( - f => { - val curPos = posIterator.next() - val curField = row.get(curPos, f.dataType) - rowToReturn.update(curIndex, curField) - curIndex = curIndex + 1 - } - ) - rowToReturn - } - - private def mergeRowWithLog(curRow: InternalRow, curKey: String) = { - val historyAvroRecord = serializer.serialize(curRow).asInstanceOf[GenericRecord] - logRecords.get(curKey).getData.combineAndGetUpdateValue(historyAvroRecord, tableAvroSchema) - } - } -} - -private object HoodieMergeOnReadRDD { - val CONFIG_INSTANTIATION_LOCK = new Object() - - def scanLog(split: HoodieMergeOnReadFileSplit, logSchema: Schema, config: Configuration): HoodieMergedLogRecordScanner = { - val fs = FSUtils.getFs(split.tablePath, config) - HoodieMergedLogRecordScanner.newBuilder() - .withFileSystem(fs) - .withBasePath(split.tablePath) - .withLogFilePaths(split.logPaths.get.asJava) - .withReaderSchema(logSchema) - .withLatestInstantTime(split.latestCommit) - .withReadBlocksLazily( - Try(config.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, - HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED).toBoolean) - .getOrElse(false)) - .withReverseReader(false) - .withBufferSize( - config.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, - HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) - .withMaxMemorySizeInBytes(split.maxCompactionMemoryInBytes) - .withSpillableMapBasePath( - config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, - HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) - .build() - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala deleted file mode 100644 index d66103600f9a1..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi - -import java.util -import java.util.Properties - -import org.apache.avro.Schema -import org.apache.avro.generic.GenericRecord -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.hive.conf.HiveConf -import org.apache.hudi.DataSourceWriteOptions._ -import org.apache.hudi.avro.HoodieAvroUtils -import org.apache.hudi.client.HoodieWriteResult -import org.apache.hudi.client.SparkRDDWriteClient -import org.apache.hudi.common.config.TypedProperties -import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType, WriteOperationType} -import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} -import org.apache.hudi.common.table.timeline.HoodieActiveTimeline -import org.apache.hudi.common.util.ReflectionUtils -import org.apache.hudi.config.HoodieBootstrapConfig.{BOOTSTRAP_BASE_PATH_PROP, BOOTSTRAP_INDEX_CLASS_PROP, DEFAULT_BOOTSTRAP_INDEX_CLASS} -import org.apache.hudi.config.HoodieWriteConfig -import org.apache.hudi.exception.HoodieException -import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} -import org.apache.hudi.internal.HoodieDataSourceInternalWriter -import org.apache.hudi.sync.common.AbstractSyncTool -import org.apache.log4j.LogManager -import org.apache.spark.SPARK_VERSION -import org.apache.spark.SparkContext -import org.apache.spark.api.java.JavaSparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} - -import scala.collection.JavaConversions._ -import scala.collection.mutable.ListBuffer - -private[hudi] object HoodieSparkSqlWriter { - - private val log = LogManager.getLogger(getClass) - private var tableExists: Boolean = false - private var asyncCompactionTriggerFnDefined: Boolean = false - - def write(sqlContext: SQLContext, - mode: SaveMode, - parameters: Map[String, String], - df: DataFrame, - hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty, - hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty, - asyncCompactionTriggerFn: Option[Function1[SparkRDDWriteClient[HoodieRecordPayload[Nothing]], Unit]] = Option.empty - ) - : (Boolean, common.util.Option[String], common.util.Option[String], - SparkRDDWriteClient[HoodieRecordPayload[Nothing]], HoodieTableConfig) = { - - val sparkContext = sqlContext.sparkContext - val path = parameters.get("path") - val tblNameOp = parameters.get(HoodieWriteConfig.TABLE_NAME) - asyncCompactionTriggerFnDefined = asyncCompactionTriggerFn.isDefined - if (path.isEmpty || tblNameOp.isEmpty) { - throw new HoodieException(s"'${HoodieWriteConfig.TABLE_NAME}', 'path' must be set.") - } - val tblName = tblNameOp.get.trim - sparkContext.getConf.getOption("spark.serializer") match { - case Some(ser) if ser.equals("org.apache.spark.serializer.KryoSerializer") => - case _ => throw new HoodieException("hoodie only support 
org.apache.spark.serializer.KryoSerializer as spark.serializer") - } - val tableType = HoodieTableType.valueOf(parameters(TABLE_TYPE_OPT_KEY)) - var operation = WriteOperationType.fromValue(parameters(OPERATION_OPT_KEY)) - // It does not make sense to allow upsert() operation if INSERT_DROP_DUPS_OPT_KEY is true - // Auto-correct the operation to "insert" if OPERATION_OPT_KEY is set to "upsert" wrongly - // or not set (in which case it will be set as "upsert" by parametersWithWriteDefaults()) . - if (parameters(INSERT_DROP_DUPS_OPT_KEY).toBoolean && - operation == WriteOperationType.UPSERT) { - - log.warn(s"$UPSERT_OPERATION_OPT_VAL is not applicable " + - s"when $INSERT_DROP_DUPS_OPT_KEY is set to be true, " + - s"overriding the $OPERATION_OPT_KEY to be $INSERT_OPERATION_OPT_VAL") - - operation = WriteOperationType.INSERT - } - - // If the mode is Overwrite, can set operation to INSERT_OVERWRITE_TABLE. - // Then in DataSourceUtils.doWriteOperation will use client.insertOverwriteTable to overwrite - // the table. This will replace the old fs.delete(tablepath) mode. - if (mode == SaveMode.Overwrite && operation != WriteOperationType.INSERT_OVERWRITE_TABLE) { - operation = WriteOperationType.INSERT_OVERWRITE_TABLE - } - - val jsc = new JavaSparkContext(sparkContext) - val basePath = new Path(path.get) - val instantTime = HoodieActiveTimeline.createNewInstantTime() - val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) - tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) - var tableConfig = getHoodieTableConfig(sparkContext, path.get, hoodieTableConfigOpt) - - if (mode == SaveMode.Ignore && tableExists) { - log.warn(s"hoodie table at $basePath already exists. Ignoring & not performing actual writes.") - (false, common.util.Option.empty(), common.util.Option.empty(), hoodieWriteClient.orNull, tableConfig) - } else { - // Handle various save modes - handleSaveModes(mode, basePath, tableConfig, tblName, operation, fs) - // Create the table if not present - if (!tableExists) { - val archiveLogFolder = parameters.getOrElse( - HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, "archived") - val tableMetaClient = HoodieTableMetaClient.initTableType(sparkContext.hadoopConfiguration, path.get, - tableType, tblName, archiveLogFolder, parameters(PAYLOAD_CLASS_OPT_KEY), - null.asInstanceOf[String]) - tableConfig = tableMetaClient.getTableConfig - } - - val commitActionType = DataSourceUtils.getCommitActionType(operation, tableConfig.getTableType) - - // short-circuit if bulk_insert via row is enabled. - // scalastyle:off - if (parameters(ENABLE_ROW_WRITER_OPT_KEY).toBoolean && - operation == WriteOperationType.BULK_INSERT) { - if (!SPARK_VERSION.startsWith("2.")) { - throw new HoodieException("Bulk insert using row writer is not supported with Spark 3. 
To use row writer please switch to spark 2.") - } - val (success, commitTime: common.util.Option[String]) = bulkInsertAsRow(sqlContext, parameters, df, tblName, - basePath, path, instantTime) - return (success, commitTime, common.util.Option.empty(), hoodieWriteClient.orNull, tableConfig) - } - // scalastyle:on - - val (writeResult, writeClient: SparkRDDWriteClient[HoodieRecordPayload[Nothing]]) = - if (operation != WriteOperationType.DELETE) { - // register classes & schemas - val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName) - sparkContext.getConf.registerKryoClasses( - Array(classOf[org.apache.avro.generic.GenericData], - classOf[org.apache.avro.Schema])) - val schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace) - sparkContext.getConf.registerAvroSchemas(schema) - log.info(s"Registered avro schema : ${schema.toString(true)}") - - // Convert to RDD[HoodieRecord] - val keyGenerator = DataSourceUtils.createKeyGenerator(toProperties(parameters)) - val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, schema, structName, nameSpace) - val shouldCombine = parameters(INSERT_DROP_DUPS_OPT_KEY).toBoolean || operation.equals(WriteOperationType.UPSERT); - val hoodieAllIncomingRecords = genericRecords.map(gr => { - val hoodieRecord = if (shouldCombine) { - val orderingVal = HoodieAvroUtils.getNestedFieldVal(gr, parameters(PRECOMBINE_FIELD_OPT_KEY), false) - .asInstanceOf[Comparable[_]] - DataSourceUtils.createHoodieRecord(gr, - orderingVal, keyGenerator.getKey(gr), - parameters(PAYLOAD_CLASS_OPT_KEY)) - } else { - DataSourceUtils.createHoodieRecord(gr, keyGenerator.getKey(gr), parameters(PAYLOAD_CLASS_OPT_KEY)) - } - hoodieRecord - }).toJavaRDD() - - // Create a HoodieWriteClient & issue the write. - val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, schema.toString, path.get, - tblName, mapAsJavaMap(parameters - HoodieWriteConfig.HOODIE_AUTO_COMMIT_PROP) - )).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] - - if (isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration())) { - asyncCompactionTriggerFn.get.apply(client) - } - - val hoodieRecords = - if (parameters(INSERT_DROP_DUPS_OPT_KEY).toBoolean) { - DataSourceUtils.dropDuplicates(jsc, hoodieAllIncomingRecords, mapAsJavaMap(parameters)) - } else { - hoodieAllIncomingRecords - } - - if (hoodieRecords.isEmpty()) { - log.info("new batch has no new records, skipping...") - (true, common.util.Option.empty()) - } - client.startCommitWithTime(instantTime, commitActionType) - val writeResult = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation) - (writeResult, client) - } else { - val structName = s"${tblName}_record" - val nameSpace = s"hoodie.${tblName}" - sparkContext.getConf.registerKryoClasses( - Array(classOf[org.apache.avro.generic.GenericData], - classOf[org.apache.avro.Schema])) - - // Convert to RDD[HoodieKey] - val keyGenerator = DataSourceUtils.createKeyGenerator(toProperties(parameters)) - val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, structName, nameSpace) - val hoodieKeysToDelete = genericRecords.map(gr => keyGenerator.getKey(gr)).toJavaRDD() - - if (!tableExists) { - throw new HoodieException(s"hoodie table at $basePath does not exist") - } - - // Create a HoodieWriteClient & issue the delete. 
- val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, - Schema.create(Schema.Type.NULL).toString, path.get, tblName, - mapAsJavaMap(parameters - HoodieWriteConfig.HOODIE_AUTO_COMMIT_PROP))) - .asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] - - if (isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration())) { - asyncCompactionTriggerFn.get.apply(client) - } - - // Issue deletes - client.startCommitWithTime(instantTime, commitActionType) - val writeStatuses = DataSourceUtils.doDeleteOperation(client, hoodieKeysToDelete, instantTime) - (writeStatuses, client) - } - - // Check for errors and commit the write. - val (writeSuccessful, compactionInstant) = - commitAndPerformPostOperations(writeResult, parameters, writeClient, tableConfig, jsc, - TableInstantInfo(basePath, instantTime, commitActionType, operation)) - (writeSuccessful, common.util.Option.ofNullable(instantTime), compactionInstant, writeClient, tableConfig) - } - } - - def bootstrap(sqlContext: SQLContext, - mode: SaveMode, - parameters: Map[String, String], - df: DataFrame, - hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty, - hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty): Boolean = { - - val sparkContext = sqlContext.sparkContext - val path = parameters.getOrElse("path", throw new HoodieException("'path' must be set.")) - val tableName = parameters.getOrElse(HoodieWriteConfig.TABLE_NAME, - throw new HoodieException(s"'${HoodieWriteConfig.TABLE_NAME}' must be set.")) - val tableType = parameters(TABLE_TYPE_OPT_KEY) - val bootstrapBasePath = parameters.getOrElse(BOOTSTRAP_BASE_PATH_PROP, - throw new HoodieException(s"'${BOOTSTRAP_BASE_PATH_PROP}' is required for '${BOOTSTRAP_OPERATION_OPT_VAL}'" + - " operation'")) - val bootstrapIndexClass = parameters.getOrDefault(BOOTSTRAP_INDEX_CLASS_PROP, DEFAULT_BOOTSTRAP_INDEX_CLASS) - - var schema: String = null - if (df.schema.nonEmpty) { - val (structName, namespace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tableName) - schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, namespace).toString - } else { - schema = HoodieAvroUtils.getNullSchema.toString - } - - val basePath = new Path(path) - val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration) - tableExists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME)) - val tableConfig = getHoodieTableConfig(sparkContext, path, hoodieTableConfigOpt) - - // Handle various save modes - if (mode == SaveMode.Ignore && tableExists) { - log.warn(s"hoodie table at $basePath already exists. 
Ignoring & not performing actual writes.") - false - } else { - handleSaveModes(mode, basePath, tableConfig, tableName, WriteOperationType.BOOTSTRAP, fs) - } - - if (!tableExists) { - val archiveLogFolder = parameters.getOrElse( - HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, "archived") - HoodieTableMetaClient.initTableTypeWithBootstrap(sparkContext.hadoopConfiguration, path, - HoodieTableType.valueOf(tableType), tableName, archiveLogFolder, parameters(PAYLOAD_CLASS_OPT_KEY), - null, bootstrapIndexClass, bootstrapBasePath) - } - - val jsc = new JavaSparkContext(sqlContext.sparkContext) - val writeClient = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, - schema, path, tableName, mapAsJavaMap(parameters))) - try { - writeClient.bootstrap(org.apache.hudi.common.util.Option.empty()) - } finally { - writeClient.close() - } - val metaSyncSuccess = metaSync(parameters, basePath, jsc.hadoopConfiguration) - metaSyncSuccess - } - - def bulkInsertAsRow(sqlContext: SQLContext, - parameters: Map[String, String], - df: DataFrame, - tblName: String, - basePath: Path, - path: Option[String], - instantTime: String): (Boolean, common.util.Option[String]) = { - val structName = s"${tblName}_record" - val nameSpace = s"hoodie.${tblName}" - val writeConfig = DataSourceUtils.createHoodieConfig(null, path.get, tblName, mapAsJavaMap(parameters)) - val hoodieDF = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace) - hoodieDF.write.format("org.apache.hudi.internal") - .option(HoodieDataSourceInternalWriter.INSTANT_TIME_OPT_KEY, instantTime) - .options(parameters) - .save() - val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean) - val metaSyncEnabled = parameters.get(META_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean) - val syncHiveSucess = if (hiveSyncEnabled || metaSyncEnabled) { - metaSync(parameters, basePath, sqlContext.sparkContext.hadoopConfiguration) - } else { - true - } - (syncHiveSucess, common.util.Option.ofNullable(instantTime)) - } - - def toProperties(params: Map[String, String]): TypedProperties = { - val props = new TypedProperties() - params.foreach(kv => props.setProperty(kv._1, kv._2)) - props - } - - private def handleSaveModes(mode: SaveMode, tablePath: Path, tableConfig: HoodieTableConfig, tableName: String, - operation: WriteOperationType, fs: FileSystem): Unit = { - if (mode == SaveMode.Append && tableExists) { - val existingTableName = tableConfig.getTableName - if (!existingTableName.equals(tableName)) { - throw new HoodieException(s"hoodie table with name $existingTableName already exists at $tablePath") - } - } - - if (operation != WriteOperationType.DELETE) { - if (mode == SaveMode.ErrorIfExists && tableExists) { - throw new HoodieException(s"hoodie table at $tablePath already exists.") - } - } else { - // Delete Operation only supports Append mode - if (mode != SaveMode.Append) { - throw new HoodieException(s"Append is the only save mode applicable for ${operation.toString} operation") - } - } - } - - private def syncHive(basePath: Path, fs: FileSystem, parameters: Map[String, String]): Boolean = { - val hiveSyncConfig: HiveSyncConfig = buildSyncConfig(basePath, parameters) - val hiveConf: HiveConf = new HiveConf() - hiveConf.addResource(fs.getConf) - new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable() - true - } - - private def buildSyncConfig(basePath: Path, parameters: Map[String, String]): HiveSyncConfig = { - val hiveSyncConfig: 
HiveSyncConfig = new HiveSyncConfig() - hiveSyncConfig.basePath = basePath.toString - hiveSyncConfig.baseFileFormat = parameters(HIVE_BASE_FILE_FORMAT_OPT_KEY); - hiveSyncConfig.usePreApacheInputFormat = - parameters.get(HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY).exists(r => r.toBoolean) - hiveSyncConfig.databaseName = parameters(HIVE_DATABASE_OPT_KEY) - hiveSyncConfig.tableName = parameters(HIVE_TABLE_OPT_KEY) - hiveSyncConfig.hiveUser = parameters(HIVE_USER_OPT_KEY) - hiveSyncConfig.hivePass = parameters(HIVE_PASS_OPT_KEY) - hiveSyncConfig.jdbcUrl = parameters(HIVE_URL_OPT_KEY) - hiveSyncConfig.partitionFields = - ListBuffer(parameters(HIVE_PARTITION_FIELDS_OPT_KEY).split(",").map(_.trim).filter(!_.isEmpty).toList: _*) - hiveSyncConfig.partitionValueExtractorClass = parameters(HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY) - hiveSyncConfig.useJdbc = parameters(HIVE_USE_JDBC_OPT_KEY).toBoolean - hiveSyncConfig.supportTimestamp = parameters.get(HIVE_SUPPORT_TIMESTAMP).exists(r => r.toBoolean) - hiveSyncConfig - } - - private def metaSync(parameters: Map[String, String], - basePath: Path, - hadoopConf: Configuration): Boolean = { - val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean) - var metaSyncEnabled = parameters.get(META_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean) - var syncClientToolClassSet = scala.collection.mutable.Set[String]() - parameters(META_SYNC_CLIENT_TOOL_CLASS).split(",").foreach(syncClass => syncClientToolClassSet += syncClass) - - // for backward compatibility - if (hiveSyncEnabled) { - metaSyncEnabled = true - syncClientToolClassSet += classOf[HiveSyncTool].getName - } - var metaSyncSuccess = true - if (metaSyncEnabled) { - val fs = basePath.getFileSystem(hadoopConf) - syncClientToolClassSet.foreach(impl => { - val syncSuccess = impl.trim match { - case "org.apache.hudi.hive.HiveSyncTool" => { - log.info("Syncing to Hive Metastore (URL: " + parameters(HIVE_URL_OPT_KEY) + ")") - syncHive(basePath, fs, parameters) - true - } - case _ => { - val properties = new Properties(); - properties.putAll(parameters) - properties.put("basePath", basePath.toString) - val syncHoodie = ReflectionUtils.loadClass(impl.trim, Array[Class[_]](classOf[Properties], classOf[FileSystem]), properties, fs).asInstanceOf[AbstractSyncTool] - syncHoodie.syncHoodieTable() - true - } - } - metaSyncSuccess = metaSyncSuccess && syncSuccess - }) - } - metaSyncSuccess - } - - /** - * Group all table/action specific information into a case class. - */ - case class TableInstantInfo(basePath: Path, instantTime: String, commitActionType: String, operation: WriteOperationType) - - private def commitAndPerformPostOperations(writeResult: HoodieWriteResult, - parameters: Map[String, String], - client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]], - tableConfig: HoodieTableConfig, - jsc: JavaSparkContext, - tableInstantInfo: TableInstantInfo - ): (Boolean, common.util.Option[java.lang.String]) = { - val errorCount = writeResult.getWriteStatuses.rdd.filter(ws => ws.hasErrors).count() - if (errorCount == 0) { - log.info("No errors. 
Proceeding to commit the write.") - val metaMap = parameters.filter(kv => - kv._1.startsWith(parameters(COMMIT_METADATA_KEYPREFIX_OPT_KEY))) - val commitSuccess = - client.commit(tableInstantInfo.instantTime, writeResult.getWriteStatuses, - common.util.Option.of(new util.HashMap[String, String](mapAsJavaMap(metaMap))), - tableInstantInfo.commitActionType, - writeResult.getPartitionToReplaceFileIds) - - if (commitSuccess) { - log.info("Commit " + tableInstantInfo.instantTime + " successful!") - } - else { - log.info("Commit " + tableInstantInfo.instantTime + " failed!") - } - - val asyncCompactionEnabled = isAsyncCompactionEnabled(client, tableConfig, parameters, jsc.hadoopConfiguration()) - val compactionInstant : common.util.Option[java.lang.String] = - if (asyncCompactionEnabled) { - client.scheduleCompaction(common.util.Option.of(new util.HashMap[String, String](mapAsJavaMap(metaMap)))) - } else { - common.util.Option.empty() - } - - log.info(s"Compaction Scheduled is $compactionInstant") - val metaSyncSuccess = metaSync(parameters, tableInstantInfo.basePath, jsc.hadoopConfiguration()) - - log.info(s"Is Async Compaction Enabled ? $asyncCompactionEnabled") - if (!asyncCompactionEnabled) { - client.close() - } - (commitSuccess && metaSyncSuccess, compactionInstant) - } else { - log.error(s"${tableInstantInfo.operation} failed with $errorCount errors :") - if (log.isTraceEnabled) { - log.trace("Printing out the top 100 errors") - writeResult.getWriteStatuses.rdd.filter(ws => ws.hasErrors) - .take(100) - .foreach(ws => { - log.trace("Global error :", ws.getGlobalError) - if (ws.getErrors.size() > 0) { - ws.getErrors.foreach(kt => - log.trace(s"Error for key: ${kt._1}", kt._2)) - } - }) - } - (false, common.util.Option.empty()) - } - } - - private def isAsyncCompactionEnabled(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]], - tableConfig: HoodieTableConfig, - parameters: Map[String, String], configuration: Configuration) : Boolean = { - log.info(s"Config.isInlineCompaction ? ${client.getConfig.isInlineCompaction}") - if (asyncCompactionTriggerFnDefined && !client.getConfig.isInlineCompaction - && parameters.get(ASYNC_COMPACT_ENABLE_OPT_KEY).exists(r => r.toBoolean)) { - tableConfig.getTableType == HoodieTableType.MERGE_ON_READ - } else { - false - } - } - - private def getHoodieTableConfig(sparkContext: SparkContext, - tablePath: String, - hoodieTableConfigOpt: Option[HoodieTableConfig]): HoodieTableConfig = { - if (tableExists) { - hoodieTableConfigOpt.getOrElse( - new HoodieTableMetaClient(sparkContext.hadoopConfiguration, tablePath).getTableConfig) - } else { - null - } - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala deleted file mode 100644 index 02880f22b93fc..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi - -import org.apache.avro.Schema -import org.apache.avro.generic.GenericRecord -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hudi.client.utils.SparkRowDeserializer -import org.apache.hudi.common.model.HoodieRecord -import org.apache.spark.SPARK_VERSION -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.apache.spark.sql.avro.SchemaConverters -import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} -import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex} -import org.apache.spark.sql.types.{StringType, StructField, StructType} - -import scala.collection.JavaConverters._ - - -object HoodieSparkUtils { - - def getMetaSchema: StructType = { - StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => { - StructField(col, StringType, nullable = true) - })) - } - - /** - * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]]. - * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally. - */ - def isGlobPath(pattern: Path): Boolean = { - pattern.toString.exists("{}[]*?\\".toSet.contains) - } - - /** - * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]]. - * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally. - */ - def globPath(fs: FileSystem, pattern: Path): Seq[Path] = { - Option(fs.globStatus(pattern)).map { statuses => - statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq - }.getOrElse(Seq.empty[Path]) - } - - /** - * This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]]. - * [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally. - */ - def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = { - if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern) - } - - /** - * Checks to see whether input path contains a glob pattern and if yes, maps it to a list of absolute paths - * which match the glob pattern. 
Otherwise, returns original path - * - * @param paths List of absolute or globbed paths - * @param fs File system - * @return list of absolute file paths - */ - def checkAndGlobPathIfNecessary(paths: Seq[String], fs: FileSystem): Seq[Path] = { - paths.flatMap(path => { - val qualified = new Path(path).makeQualified(fs.getUri, fs.getWorkingDirectory) - val globPaths = globPathIfNecessary(fs, qualified) - globPaths - }) - } - - def createInMemoryFileIndex(sparkSession: SparkSession, globbedPaths: Seq[Path]): InMemoryFileIndex = { - val fileStatusCache = FileStatusCache.getOrCreate(sparkSession) - new InMemoryFileIndex(sparkSession, globbedPaths, Map(), Option.empty, fileStatusCache) - } - - def createRdd(df: DataFrame, structName: String, recordNamespace: String): RDD[GenericRecord] = { - val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, recordNamespace) - createRdd(df, avroSchema, structName, recordNamespace) - } - - def createRdd(df: DataFrame, avroSchema: Schema, structName: String, recordNamespace: String) - : RDD[GenericRecord] = { - // Use the Avro schema to derive the StructType which has the correct nullability information - val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType] - val encoder = RowEncoder.apply(dataType).resolveAndBind() - val deserializer = HoodieSparkUtils.createDeserializer(encoder) - df.queryExecution.toRdd.map(row => deserializer.deserializeRow(row)) - .mapPartitions { records => - if (records.isEmpty) Iterator.empty - else { - val convertor = AvroConversionHelper.createConverterToAvro(dataType, structName, recordNamespace) - records.map { x => convertor(x).asInstanceOf[GenericRecord] } - } - } - } - - def createDeserializer(encoder: ExpressionEncoder[Row]): SparkRowDeserializer = { - // TODO remove Spark2RowDeserializer if Spark 2.x support is dropped - if (SPARK_VERSION.startsWith("2.")) { - new Spark2RowDeserializer(encoder) - } else { - new Spark3RowDeserializer(encoder) - } - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala deleted file mode 100644 index 846212d7eb4d6..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hudi - -import java.lang -import java.util.function.Function - -import org.apache.hudi.async.{AsyncCompactService, SparkStreamingAsyncCompactService} -import org.apache.hudi.client.SparkRDDWriteClient -import org.apache.hudi.client.common.HoodieSparkEngineContext -import org.apache.hudi.common.model.HoodieRecordPayload -import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} -import org.apache.hudi.common.table.timeline.HoodieInstant.State -import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} -import org.apache.hudi.common.util.CompactionUtils -import org.apache.hudi.exception.HoodieCorruptedDataException -import org.apache.log4j.LogManager -import org.apache.spark.api.java.JavaSparkContext -import org.apache.spark.sql.execution.streaming.Sink -import org.apache.spark.sql.streaming.OutputMode -import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} - -import scala.util.{Failure, Success, Try} -import scala.collection.JavaConversions._ - -class HoodieStreamingSink(sqlContext: SQLContext, - options: Map[String, String], - partitionColumns: Seq[String], - outputMode: OutputMode) - extends Sink - with Serializable { - @volatile private var latestBatchId = -1L - - private val log = LogManager.getLogger(classOf[HoodieStreamingSink]) - - private val retryCnt = options(DataSourceWriteOptions.STREAMING_RETRY_CNT_OPT_KEY).toInt - private val retryIntervalMs = options(DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS_OPT_KEY).toLong - private val ignoreFailedBatch = options(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH_OPT_KEY).toBoolean - - private var isAsyncCompactorServiceShutdownAbnormally = false - - private val mode = - if (outputMode == OutputMode.Append()) { - SaveMode.Append - } else { - SaveMode.Overwrite - } - - private var asyncCompactorService : AsyncCompactService = _ - private var writeClient : Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty - private var hoodieTableConfig : Option[HoodieTableConfig] = Option.empty - - override def addBatch(batchId: Long, data: DataFrame): Unit = this.synchronized { - if (isAsyncCompactorServiceShutdownAbnormally) { - throw new IllegalStateException("Async Compactor shutdown unexpectedly") - } - - retry(retryCnt, retryIntervalMs)( - Try( - HoodieSparkSqlWriter.write( - sqlContext, mode, options, data, hoodieTableConfig, writeClient, Some(triggerAsyncCompactor)) - ) match { - case Success((true, commitOps, compactionInstantOps, client, tableConfig)) => - log.info(s"Micro batch id=$batchId succeeded" - + (commitOps.isPresent match { - case true => s" for commit=${commitOps.get()}" - case _ => s" with no new commits" - })) - writeClient = Some(client) - hoodieTableConfig = Some(tableConfig) - if (compactionInstantOps.isPresent) { - asyncCompactorService.enqueuePendingCompaction( - new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionInstantOps.get())) - } - Success((true, commitOps, compactionInstantOps)) - case Failure(e) => - // clean up persist rdds in the write process - data.sparkSession.sparkContext.getPersistentRDDs - .foreach { - case (id, rdd) => - try { - rdd.unpersist() - } catch { - case t: Exception => log.warn("Got excepting trying to unpersist rdd", t) - } - } - log.error(s"Micro batch id=$batchId threw following exception: ", e) - if (ignoreFailedBatch) { - log.info(s"Ignore the exception and move on streaming as per " + - s"${DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH_OPT_KEY} configuration") 
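              // Returning Success here deliberately swallows the failure so the streaming query keeps
              // running. In the else branch below the Failure is handed back to the retry helper,
              // which doubles its wait on every attempt; e.g. with the illustrative values
              // retryCnt = 3 and retryIntervalMs = 1000 the batch is retried after 1000 ms and
              // again after 2000 ms before the Failure is finally propagated.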
- Success((true, None, None)) - } else { - if (retryCnt > 1) log.info(s"Retrying the failed micro batch id=$batchId ...") - Failure(e) - } - case Success((false, commitOps, compactionInstantOps, client, tableConfig)) => - log.error(s"Micro batch id=$batchId ended up with errors" - + (commitOps.isPresent match { - case true => s" for commit=${commitOps.get()}" - case _ => s"" - })) - if (ignoreFailedBatch) { - log.info(s"Ignore the errors and move on streaming as per " + - s"${DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH_OPT_KEY} configuration") - Success((true, None, None)) - } else { - if (retryCnt > 1) log.info(s"Retrying the failed micro batch id=$batchId ...") - Failure(new HoodieCorruptedDataException(s"Micro batch id=$batchId ended up with errors")) - } - } - ) match { - case Failure(e) => - if (!ignoreFailedBatch) { - log.error(s"Micro batch id=$batchId threw following expections," + - s"aborting streaming app to avoid data loss: ", e) - // spark sometimes hangs upon exceptions and keep on hold of the executors - // this is to force exit upon errors / exceptions and release all executors - // will require redeployment / supervise mode to restart the streaming - reset(true) - System.exit(1) - } - case Success(_) => - log.info(s"Micro batch id=$batchId succeeded") - } - } - - override def toString: String = s"HoodieStreamingSink[${options("path")}]" - - @annotation.tailrec - private def retry[T](n: Int, waitInMillis: Long)(fn: => Try[T]): Try[T] = { - fn match { - case x: Success[T] => - x - case _ if n > 1 => - Thread.sleep(waitInMillis) - retry(n - 1, waitInMillis * 2)(fn) - case f => - reset(false) - f - } - } - - protected def triggerAsyncCompactor(client: SparkRDDWriteClient[HoodieRecordPayload[Nothing]]): Unit = { - if (null == asyncCompactorService) { - log.info("Triggering Async compaction !!") - asyncCompactorService = new SparkStreamingAsyncCompactService(new HoodieSparkEngineContext(new JavaSparkContext(sqlContext.sparkContext)), - client) - asyncCompactorService.start(new Function[java.lang.Boolean, java.lang.Boolean] { - override def apply(errored: lang.Boolean): lang.Boolean = { - log.info(s"Async Compactor shutdown. Errored ? 
$errored") - isAsyncCompactorServiceShutdownAbnormally = errored - reset(false) - log.info("Done resetting write client.") - true - } - }) - - // Add Shutdown Hook - Runtime.getRuntime.addShutdownHook(new Thread(new Runnable { - override def run(): Unit = reset(true) - })) - - // First time, scan .hoodie folder and get all pending compactions - val metaClient = new HoodieTableMetaClient(sqlContext.sparkContext.hadoopConfiguration, - client.getConfig.getBasePath) - val pendingInstants :java.util.List[HoodieInstant] = - CompactionUtils.getPendingCompactionInstantTimes(metaClient) - pendingInstants.foreach((h : HoodieInstant) => asyncCompactorService.enqueuePendingCompaction(h)) - } - } - - private def reset(force: Boolean) : Unit = this.synchronized { - if (asyncCompactorService != null) { - asyncCompactorService.shutdown(force) - asyncCompactorService = null - } - - if (writeClient.isDefined) { - writeClient.get.close() - writeClient = Option.empty - } - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala deleted file mode 100644 index 294050b6d46ca..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi - -import org.apache.hudi.DataSourceWriteOptions._ -import org.apache.hudi.common.config.TypedProperties - -import scala.collection.JavaConversions.mapAsJavaMap -import scala.collection.JavaConverters.mapAsScalaMapConverter - -/** - * WriterUtils to assist in write path in Datasource and tests. - */ -object HoodieWriterUtils { - - def javaParametersWithWriteDefaults(parameters: java.util.Map[String, String]): java.util.Map[String, String] = { - mapAsJavaMap(parametersWithWriteDefaults(parameters.asScala.toMap)) - } - - /** - * Add default options for unspecified write options keys. 
- * - * @param parameters - * @return - */ - def parametersWithWriteDefaults(parameters: Map[String, String]): Map[String, String] = { - Map(OPERATION_OPT_KEY -> DEFAULT_OPERATION_OPT_VAL, - TABLE_TYPE_OPT_KEY -> DEFAULT_TABLE_TYPE_OPT_VAL, - PRECOMBINE_FIELD_OPT_KEY -> DEFAULT_PRECOMBINE_FIELD_OPT_VAL, - PAYLOAD_CLASS_OPT_KEY -> DEFAULT_PAYLOAD_OPT_VAL, - RECORDKEY_FIELD_OPT_KEY -> DEFAULT_RECORDKEY_FIELD_OPT_VAL, - PARTITIONPATH_FIELD_OPT_KEY -> DEFAULT_PARTITIONPATH_FIELD_OPT_VAL, - KEYGENERATOR_CLASS_OPT_KEY -> DEFAULT_KEYGENERATOR_CLASS_OPT_VAL, - COMMIT_METADATA_KEYPREFIX_OPT_KEY -> DEFAULT_COMMIT_METADATA_KEYPREFIX_OPT_VAL, - INSERT_DROP_DUPS_OPT_KEY -> DEFAULT_INSERT_DROP_DUPS_OPT_VAL, - STREAMING_RETRY_CNT_OPT_KEY -> DEFAULT_STREAMING_RETRY_CNT_OPT_VAL, - STREAMING_RETRY_INTERVAL_MS_OPT_KEY -> DEFAULT_STREAMING_RETRY_INTERVAL_MS_OPT_VAL, - STREAMING_IGNORE_FAILED_BATCH_OPT_KEY -> DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL, - META_SYNC_CLIENT_TOOL_CLASS -> DEFAULT_META_SYNC_CLIENT_TOOL_CLASS, - HIVE_SYNC_ENABLED_OPT_KEY -> DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL, - META_SYNC_ENABLED_OPT_KEY -> DEFAULT_META_SYNC_ENABLED_OPT_VAL, - HIVE_DATABASE_OPT_KEY -> DEFAULT_HIVE_DATABASE_OPT_VAL, - HIVE_TABLE_OPT_KEY -> DEFAULT_HIVE_TABLE_OPT_VAL, - HIVE_BASE_FILE_FORMAT_OPT_KEY -> DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL, - HIVE_USER_OPT_KEY -> DEFAULT_HIVE_USER_OPT_VAL, - HIVE_PASS_OPT_KEY -> DEFAULT_HIVE_PASS_OPT_VAL, - HIVE_URL_OPT_KEY -> DEFAULT_HIVE_URL_OPT_VAL, - HIVE_PARTITION_FIELDS_OPT_KEY -> DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL, - HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY -> DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL, - HIVE_STYLE_PARTITIONING_OPT_KEY -> DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL, - HIVE_USE_JDBC_OPT_KEY -> DEFAULT_HIVE_USE_JDBC_OPT_VAL, - ASYNC_COMPACT_ENABLE_OPT_KEY -> DEFAULT_ASYNC_COMPACT_ENABLE_OPT_VAL, - ENABLE_ROW_WRITER_OPT_KEY -> DEFAULT_ENABLE_ROW_WRITER_OPT_VAL - ) ++ translateStorageTypeToTableType(parameters) - } - - def toProperties(params: Map[String, String]): TypedProperties = { - val props = new TypedProperties() - params.foreach(kv => props.setProperty(kv._1, kv._2)) - props - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala deleted file mode 100644 index 9cd562cd664e5..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi - -import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieRecord, HoodieTableType} -import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.table.timeline.HoodieTimeline -import org.apache.hudi.config.HoodieWriteConfig -import org.apache.hudi.exception.HoodieException -import org.apache.hadoop.fs.GlobPattern -import org.apache.hudi.client.common.HoodieSparkEngineContext -import org.apache.hudi.table.HoodieSparkTable -import org.apache.log4j.LogManager -import org.apache.spark.api.java.JavaSparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.sources.{BaseRelation, TableScan} -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, Row, SQLContext} - -import scala.collection.JavaConversions._ -import scala.collection.mutable - -/** - * Relation, that implements the Hoodie incremental view. - * - * Implemented for Copy_on_write storage. - * - */ -class IncrementalRelation(val sqlContext: SQLContext, - val basePath: String, - val optParams: Map[String, String], - val userSchema: StructType) extends BaseRelation with TableScan { - - private val log = LogManager.getLogger(classOf[IncrementalRelation]) - - val skeletonSchema: StructType = HoodieSparkUtils.getMetaSchema - private val metaClient = new HoodieTableMetaClient(sqlContext.sparkContext.hadoopConfiguration, basePath, true) - - // MOR tables not supported yet - if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) { - throw new HoodieException("Incremental view not implemented yet, for merge-on-read tables") - } - // TODO : Figure out a valid HoodieWriteConfig - private val hoodieTable = HoodieSparkTable.create(HoodieWriteConfig.newBuilder().withPath(basePath).build(), - new HoodieSparkEngineContext(new JavaSparkContext(sqlContext.sparkContext)), - metaClient) - private val commitTimeline = hoodieTable.getMetaClient.getCommitTimeline.filterCompletedInstants() - if (commitTimeline.empty()) { - throw new HoodieException("No instants to incrementally pull") - } - if (!optParams.contains(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY)) { - throw new HoodieException(s"Specify the begin instant time to pull from using " + - s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY}") - } - - val useEndInstantSchema = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY, - DataSourceReadOptions.DEFAULT_INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_VAL).toBoolean - - private val lastInstant = commitTimeline.lastInstant().get() - - private val commitsToReturn = commitTimeline.findInstantsInRange( - optParams(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY), - optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, lastInstant.getTimestamp)) - .getInstants.iterator().toList - - // use schema from a file produced in the end/latest instant - val usedSchema: StructType = { - log.info("Inferring schema..") - val schemaResolver = new TableSchemaResolver(metaClient) - val tableSchema = if (useEndInstantSchema) { - if (commitsToReturn.isEmpty) schemaResolver.getTableAvroSchemaWithoutMetadataFields() else - schemaResolver.getTableAvroSchemaWithoutMetadataFields(commitsToReturn.last) - } else { - schemaResolver.getTableAvroSchemaWithoutMetadataFields() - } - val dataSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) - StructType(skeletonSchema.fields ++ dataSchema.fields) - } - - private val filters = 
optParams.getOrElse(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY, - DataSourceReadOptions.DEFAULT_PUSH_DOWN_FILTERS_OPT_VAL).split(",").filter(!_.isEmpty) - - override def schema: StructType = usedSchema - - override def buildScan(): RDD[Row] = { - val regularFileIdToFullPath = mutable.HashMap[String, String]() - var metaBootstrapFileIdToFullPath = mutable.HashMap[String, String]() - - for (commit <- commitsToReturn) { - val metadata: HoodieCommitMetadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit) - .get, classOf[HoodieCommitMetadata]) - - if (HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS == commit.getTimestamp) { - metaBootstrapFileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap - } else { - regularFileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap - } - } - - if (metaBootstrapFileIdToFullPath.nonEmpty) { - // filer out meta bootstrap files that have had more commits since metadata bootstrap - metaBootstrapFileIdToFullPath = metaBootstrapFileIdToFullPath - .filterNot(fileIdFullPath => regularFileIdToFullPath.contains(fileIdFullPath._1)) - } - - val pathGlobPattern = optParams.getOrElse( - DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY, - DataSourceReadOptions.DEFAULT_INCR_PATH_GLOB_OPT_VAL) - val (filteredRegularFullPaths, filteredMetaBootstrapFullPaths) = { - if(!pathGlobPattern.equals(DataSourceReadOptions.DEFAULT_INCR_PATH_GLOB_OPT_VAL)) { - val globMatcher = new GlobPattern("*" + pathGlobPattern) - (regularFileIdToFullPath.filter(p => globMatcher.matches(p._2)).values, - metaBootstrapFileIdToFullPath.filter(p => globMatcher.matches(p._2)).values) - } else { - (regularFileIdToFullPath.values, metaBootstrapFileIdToFullPath.values) - } - } - // unset the path filter, otherwise if end_instant_time is not the latest instant, path filter set for RO view - // will filter out all the files incorrectly. 
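    // (The filter configured there is typically org.apache.hudi.hadoop.HoodieROTablePathFilter,
    // which keeps only the latest base file of each file group; the incremental scan below reads
    // older file versions by explicit path, so the filter has to be cleared first.)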
- sqlContext.sparkContext.hadoopConfiguration.unset("mapreduce.input.pathFilter.class") - val sOpts = optParams.filter(p => !p._1.equalsIgnoreCase("path")) - if (filteredRegularFullPaths.isEmpty && filteredMetaBootstrapFullPaths.isEmpty) { - sqlContext.sparkContext.emptyRDD[Row] - } else { - log.info("Additional Filters to be applied to incremental source are :" + filters) - - var df: DataFrame = sqlContext.createDataFrame(sqlContext.sparkContext.emptyRDD[Row], usedSchema) - - if (metaBootstrapFileIdToFullPath.nonEmpty) { - df = sqlContext.sparkSession.read - .format("hudi") - .schema(usedSchema) - .option(DataSourceReadOptions.READ_PATHS_OPT_KEY, filteredMetaBootstrapFullPaths.mkString(",")) - .load() - } - - if (regularFileIdToFullPath.nonEmpty) - { - df = df.union(sqlContext.read.options(sOpts) - .schema(usedSchema) - .parquet(filteredRegularFullPaths.toList: _*) - .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, - commitsToReturn.head.getTimestamp)) - .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, - commitsToReturn.last.getTimestamp))) - } - - filters.foldLeft(df)((e, f) => e.filter(f)).rdd - } - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala deleted file mode 100644 index 0b81fa7b804cf..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi - -import org.apache.hudi.common.model.HoodieBaseFile -import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.table.view.HoodieTableFileSystemView -import org.apache.hudi.exception.HoodieException -import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils -import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes - -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.mapred.JobConf -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat -import org.apache.spark.sql.{Row, SQLContext} -import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} -import org.apache.spark.sql.types.StructType - -import scala.collection.JavaConverters._ - -case class HoodieMergeOnReadFileSplit(dataFile: PartitionedFile, - logPaths: Option[List[String]], - latestCommit: String, - tablePath: String, - maxCompactionMemoryInBytes: Long, - mergeType: String) - -case class HoodieMergeOnReadTableState(tableStructSchema: StructType, - requiredStructSchema: StructType, - tableAvroSchema: String, - requiredAvroSchema: String, - hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit]) - -class MergeOnReadSnapshotRelation(val sqlContext: SQLContext, - val optParams: Map[String, String], - val userSchema: StructType, - val globPaths: Seq[Path], - val metaClient: HoodieTableMetaClient) - extends BaseRelation with PrunedFilteredScan with Logging { - - private val conf = sqlContext.sparkContext.hadoopConfiguration - private val jobConf = new JobConf(conf) - // use schema from latest metadata, if not present, read schema from the data file - private val schemaUtil = new TableSchemaResolver(metaClient) - private val tableAvroSchema = schemaUtil.getTableAvroSchema - private val tableStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema) - private val mergeType = optParams.getOrElse( - DataSourceReadOptions.REALTIME_MERGE_OPT_KEY, - DataSourceReadOptions.DEFAULT_REALTIME_MERGE_OPT_VAL) - private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf) - private val fileIndex = buildFileIndex() - - override def schema: StructType = tableStructSchema - - override def needConversion: Boolean = false - - override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { - log.debug(s" buildScan requiredColumns = ${requiredColumns.mkString(",")}") - log.debug(s" buildScan filters = ${filters.mkString(",")}") - var requiredStructSchema = StructType(Seq()) - requiredColumns.foreach(col => { - val field = tableStructSchema.find(_.name == col) - if (field.isDefined) { - requiredStructSchema = requiredStructSchema.add(field.get) - } - }) - val requiredAvroSchema = AvroConversionUtils - .convertStructTypeToAvroSchema(requiredStructSchema, tableAvroSchema.getName, tableAvroSchema.getNamespace) - val hoodieTableState = HoodieMergeOnReadTableState( - tableStructSchema, - requiredStructSchema, - tableAvroSchema.toString, - requiredAvroSchema.toString, - fileIndex - ) - val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues( - sparkSession = sqlContext.sparkSession, - dataSchema = tableStructSchema, - partitionSchema 
= StructType(Nil), - requiredSchema = tableStructSchema, - filters = Seq(), - options = optParams, - hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf() - ) - val requiredSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues( - sparkSession = sqlContext.sparkSession, - dataSchema = tableStructSchema, - partitionSchema = StructType(Nil), - requiredSchema = requiredStructSchema, - filters = filters, - options = optParams, - hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf() - ) - - val rdd = new HoodieMergeOnReadRDD( - sqlContext.sparkContext, - jobConf, - fullSchemaParquetReader, - requiredSchemaParquetReader, - hoodieTableState - ) - rdd.asInstanceOf[RDD[Row]] - } - - def buildFileIndex(): List[HoodieMergeOnReadFileSplit] = { - val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths) - val fileStatuses = inMemoryFileIndex.allFiles() - if (fileStatuses.isEmpty) { - throw new HoodieException("No files found for reading in user provided path.") - } - - val fsView = new HoodieTableFileSystemView(metaClient, - metaClient.getActiveTimeline.getCommitsTimeline - .filterCompletedInstants, fileStatuses.toArray) - val latestFiles: List[HoodieBaseFile] = fsView.getLatestBaseFiles.iterator().asScala.toList - val latestCommit = fsView.getLastInstant.get().getTimestamp - val fileGroup = HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, latestFiles.asJava).asScala - val fileSplits = fileGroup.map(kv => { - val baseFile = kv._1 - val logPaths = if (kv._2.isEmpty) Option.empty else Option(kv._2.asScala.toList) - val partitionedFile = PartitionedFile(InternalRow.empty, baseFile.getPath, 0, baseFile.getFileLen) - HoodieMergeOnReadFileSplit(partitionedFile, logPaths, latestCommit, - metaClient.getBasePath, maxCompactionMemoryInBytes, mergeType) - }).toList - fileSplits - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala new file mode 100644 index 0000000000000..df2a953752fa8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.expressions.Expression + +case class CallCommand(name: Seq[String], args: Seq[CallArgument]) extends Command { + override def children: Seq[LogicalPlan] = Seq.empty + + def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CallCommand = { + this + } +} + +/** + * An argument in a CALL statement. 
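 * For example, in `CALL show_commits(table => 'h1', limit => 10)` each `name => value` pair is
 * parsed into a [[NamedArgument]], while bare values are parsed into [[PositionalArgument]]s
 * (the procedure name here is only illustrative).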
+ */ +sealed trait CallArgument { + def expr: Expression +} + +/** + * An argument in a CALL statement identified by name. + */ +case class NamedArgument(name: String, expr: Expression) extends CallArgument + +/** + * An argument in a CALL statement identified by position. + */ +case class PositionalArgument(expr: Expression) extends CallArgument diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Compaction.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Compaction.scala new file mode 100644 index 0000000000000..f445e7c8a0dab --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Compaction.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.CompactionOperation + +case class CompactionTable(table: LogicalPlan, operation: CompactionOperation, instantTimestamp: Option[Long]) + extends Command { + override def children: Seq[LogicalPlan] = Seq(table) + + def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionTable = { + copy(table = newChildren.head) + } +} + +case class CompactionPath(path: String, operation: CompactionOperation, instantTimestamp: Option[Long]) + extends Command { + override def children: Seq[LogicalPlan] = Seq.empty + + def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionPath = { + this + } +} + +case class CompactionShowOnTable(table: LogicalPlan, limit: Int = 20) + extends Command { + override def children: Seq[LogicalPlan] = Seq(table) + + def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionShowOnTable = { + copy(table = newChildren.head) + } +} + +case class CompactionShowOnPath(path: String, limit: Int = 20) extends Command { + override def children: Seq[LogicalPlan] = Seq.empty + + def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CompactionShowOnPath = { + this + } +} + +object CompactionOperation extends Enumeration { + type CompactionOperation = Value + val SCHEDULE, RUN = Value +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Index.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Index.scala new file mode 100644 index 0000000000000..12ee2e8058343 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Index.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.types.StringType + +/** + * The logical plan of the CREATE INDEX command. + */ +case class CreateIndex( + table: LogicalPlan, + indexName: String, + indexType: String, + ignoreIfExists: Boolean, + columns: Seq[(Attribute, Map[String, String])], + properties: Map[String, String], + override val output: Seq[Attribute] = CreateIndex.getOutputAttrs) extends Command { + + override def children: Seq[LogicalPlan] = Seq(table) + + override lazy val resolved: Boolean = table.resolved && columns.forall(_._1.resolved) + + def withNewChildrenInternal(newChild: IndexedSeq[LogicalPlan]): CreateIndex = { + copy(table = newChild.head) + } +} + +object CreateIndex { + def getOutputAttrs: Seq[Attribute] = Seq.empty +} + +/** + * The logical plan of the DROP INDEX command. + */ +case class DropIndex( + table: LogicalPlan, + indexName: String, + ignoreIfNotExists: Boolean, + override val output: Seq[Attribute] = DropIndex.getOutputAttrs) extends Command { + + override def children: Seq[LogicalPlan] = Seq(table) + + def withNewChildrenInternal(newChild: IndexedSeq[LogicalPlan]): DropIndex = { + copy(table = newChild.head) + } +} + +object DropIndex { + def getOutputAttrs: Seq[Attribute] = Seq.empty +} + +/** + * The logical plan of the SHOW INDEXES command. + */ +case class ShowIndexes( + table: LogicalPlan, + override val output: Seq[Attribute] = ShowIndexes.getOutputAttrs) extends Command { + + override def children: Seq[LogicalPlan] = Seq(table) + + def withNewChildrenInternal(newChild: IndexedSeq[LogicalPlan]): ShowIndexes = { + copy(table = newChild.head) + } +} + +object ShowIndexes { + def getOutputAttrs: Seq[Attribute] = Seq( + AttributeReference("index_name", StringType, nullable = false)(), + AttributeReference("col_name", StringType, nullable = false)(), + AttributeReference("index_type", StringType, nullable = false)(), + AttributeReference("col_options", StringType, nullable = true)(), + AttributeReference("options", StringType, nullable = true)() + ) +} + +/** + * The logical plan of the REFRESH INDEX command. 
+ */ +case class RefreshIndex( + table: LogicalPlan, + indexName: String, + override val output: Seq[Attribute] = RefreshIndex.getOutputAttrs) extends Command { + + override def children: Seq[LogicalPlan] = Seq(table) + + def withNewChildrenInternal(newChild: IndexedSeq[LogicalPlan]): RefreshIndex = { + copy(table = newChild.head) + } +} + +object RefreshIndex { + def getOutputAttrs: Seq[Attribute] = Seq.empty +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DeDupeType.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DeDupeType.scala new file mode 100644 index 0000000000000..93cec470ec99c --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DeDupeType.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +object DeDupeType extends Enumeration { + + type dedupeType = Value + + val INSERT_TYPE = Value("insert_type") + val UPDATE_TYPE = Value("update_type") + val UPSERT_TYPE = Value("upsert_type") +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DedupeSparkJob.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DedupeSparkJob.scala new file mode 100644 index 0000000000000..e39d22aa05462 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/DedupeSparkJob.scala @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{HoodieBaseFile, HoodieRecord} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.exception.HoodieException +import org.apache.log4j.Logger +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + +import java.util.stream.Collectors +import scala.collection.JavaConversions._ +import scala.collection.mutable.{Buffer, HashMap, HashSet, ListBuffer} + +/** + * Spark job to de-duplicate data present in a partition path + */ +class DedupeSparkJob(basePath: String, + duplicatedPartitionPath: String, + repairOutputPath: String, + sqlContext: SQLContext, + fs: FileSystem, + dedupeType: DeDupeType.Value) { + + val sparkHelper = new SparkHelper(sqlContext, fs) + val LOG = Logger.getLogger(this.getClass) + + /** + * + * @param tblName + * @return + */ + def getDupeKeyDF(tblName: String): DataFrame = { + val dupeSql = + s""" + select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key, + count(*) as dupe_cnt + from $tblName + group by `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` + having dupe_cnt > 1 + """ + sqlContext.sql(dupeSql) + } + + /** + * + * Check a given partition for duplicates and suggest the deletions that need to be done in each file, + * in order to set things right. + * + * @return + */ + private def planDuplicateFix(): HashMap[String, HashSet[String]] = { + val tmpTableName = s"htbl_${System.currentTimeMillis()}" + val dedupeTblName = s"${tmpTableName}_dupeKeys" + + val metadata = HoodieTableMetaClient.builder().setConf(fs.getConf).setBasePath(basePath).build() + + val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"$basePath/$duplicatedPartitionPath")) + val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) + val latestFiles: java.util.List[HoodieBaseFile] = fsView.getLatestBaseFiles().collect(Collectors.toList[HoodieBaseFile]()) + val filteredStatuses = latestFiles.map(f => f.getPath) + LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}") + + val df = sqlContext.parquetFile(filteredStatuses: _*) + df.registerTempTable(tmpTableName) + val dupeKeyDF = getDupeKeyDF(tmpTableName) + dupeKeyDF.registerTempTable(dedupeTblName) + + // Obtain necessary satellite information for duplicate rows + val dupeDataSql = + s""" + SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time` + FROM $tmpTableName h + JOIN $dedupeTblName d + ON h.`_hoodie_record_key` = d.dupe_key + """ + val dupeMap = sqlContext.sql(dupeDataSql).collectAsList().groupBy(r => r.getString(0)) + getDedupePlan(dupeMap) + } + + private def getDedupePlan(dupeMap: Map[String, Buffer[Row]]): HashMap[String, HashSet[String]] = { + val fileToDeleteKeyMap = new HashMap[String, HashSet[String]]() + dupeMap.foreach(rt => { + val (key, rows) = rt + + dedupeType match { + case DeDupeType.UPDATE_TYPE => + /* + This corresponds to the case where all duplicates have been updated at least once. + Once updated, duplicates are bound to have same commit time unless forcefully modified. 
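            Because every surviving copy then carries the same commit time, one copy (the last row
            of the group) is kept and the record key is marked for deletion from the files backing
            all the other copies.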
+ */ + rows.init.foreach(r => { + val f = r(2).asInstanceOf[String].split("_")(0) + if (!fileToDeleteKeyMap.contains(f)) { + fileToDeleteKeyMap(f) = HashSet[String]() + } + fileToDeleteKeyMap(f).add(key) + }) + + case DeDupeType.INSERT_TYPE => + /* + This corresponds to the case where duplicates got created due to INSERT and have never been updated. + */ + var maxCommit = -1L + + rows.foreach(r => { + val c = r(3).asInstanceOf[String].toLong + if (c > maxCommit) + maxCommit = c + }) + + rows.foreach(r => { + val c = r(3).asInstanceOf[String].toLong + if (c != maxCommit) { + val f = r(2).asInstanceOf[String].split("_")(0) + if (!fileToDeleteKeyMap.contains(f)) { + fileToDeleteKeyMap(f) = HashSet[String]() + } + fileToDeleteKeyMap(f).add(key) + } + }) + + case DeDupeType.UPSERT_TYPE => + /* + This corresponds to the case where duplicates got created as a result of inserts as well as updates, + i.e few duplicate records have been updated, while others were never updated. + */ + var maxCommit = -1L + + rows.foreach(r => { + val c = r(3).asInstanceOf[String].toLong + if (c > maxCommit) + maxCommit = c + }) + val rowsWithMaxCommit = new ListBuffer[Row]() + rows.foreach(r => { + val c = r(3).asInstanceOf[String].toLong + if (c != maxCommit) { + val f = r(2).asInstanceOf[String].split("_")(0) + if (!fileToDeleteKeyMap.contains(f)) { + fileToDeleteKeyMap(f) = HashSet[String]() + } + fileToDeleteKeyMap(f).add(key) + } else { + rowsWithMaxCommit += r + } + }) + + rowsWithMaxCommit.toList.init.foreach(r => { + val f = r(2).asInstanceOf[String].split("_")(0) + if (!fileToDeleteKeyMap.contains(f)) { + fileToDeleteKeyMap(f) = HashSet[String]() + } + fileToDeleteKeyMap(f).add(key) + }) + + case _ => throw new IllegalArgumentException("Please provide valid type for deduping!") + } + }) + LOG.debug(s"fileToDeleteKeyMap size: ${fileToDeleteKeyMap.size}, map: $fileToDeleteKeyMap") + fileToDeleteKeyMap + } + + def fixDuplicates(dryRun: Boolean = true) = { + val metadata = HoodieTableMetaClient.builder().setConf(fs.getConf).setBasePath(basePath).build() + + val allFiles = fs.listStatus(new Path(s"$basePath/$duplicatedPartitionPath")) + val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) + + val latestFiles: java.util.List[HoodieBaseFile] = fsView.getLatestBaseFiles().collect(Collectors.toList[HoodieBaseFile]()) + + val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap + val dupeFixPlan = planDuplicateFix() + + // 1. Copy all latest files into the temp fix path + fileNameToPathMap.foreach { case (fileName, filePath) => + val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else "" + val dstPath = new Path(s"$repairOutputPath/${filePath.getName}$badSuffix") + LOG.info(s"Copying from $filePath to $dstPath") + FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf) + } + + // 2. Remove duplicates from the bad files + dupeFixPlan.foreach { case (fileName, keysToSkip) => + val instantTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName) + val badFilePath = new Path(s"$repairOutputPath/${fileNameToPathMap(fileName).getName}.bad") + val newFilePath = new Path(s"$repairOutputPath/${fileNameToPathMap(fileName).getName}") + LOG.info(" Skipping and writing new file for : " + fileName) + SparkHelpers.skipKeysAndWriteNewFile(instantTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName)) + fs.delete(badFilePath, true) + } + + // 3. Check that there are no duplicates anymore. 
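      //    (The repaired files written to repairOutputPath are re-read and the duplicate-key query
      //    is re-run against them; if anything is still duplicated the job aborts here, before any
      //    file is copied back to the table.)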
+ val df = sqlContext.read.parquet(s"$repairOutputPath/*.parquet") + df.registerTempTable("fixedTbl") + val dupeKeyDF = getDupeKeyDF("fixedTbl") + val dupeCnt = dupeKeyDF.count() + if (dupeCnt != 0) { + dupeKeyDF.show() + throw new HoodieException("Still found some duplicates!!.. Inspect output") + } + + // 4. Additionally ensure no record keys are left behind. + val sourceDF = sparkHelper.getDistinctKeyDF(fileNameToPathMap.map(t => t._2.toString).toList) + val fixedDF = sparkHelper.getDistinctKeyDF(fileNameToPathMap.map(t => s"$repairOutputPath/${t._2.getName}").toList) + val missedRecordKeysDF = sourceDF.except(fixedDF) + val missedCnt = missedRecordKeysDF.count() + if (missedCnt != 0) { + missedRecordKeysDF.show() + throw new HoodieException("Some records in source are not found in fixed files. Inspect output!!") + } + + println("No duplicates found & counts are in check!!!! ") + // 5. Prepare to copy the fixed files back. + fileNameToPathMap.foreach { case (_, filePath) => + val srcPath = new Path(s"$repairOutputPath/${filePath.getName}") + val dstPath = new Path(s"$basePath/$duplicatedPartitionPath/${filePath.getName}") + if (dryRun) { + LOG.info(s"[JUST KIDDING!!!] Copying from $srcPath to $dstPath") + } else { + // for real + LOG.info(s"[FOR REAL!!!] Copying from $srcPath to $dstPath") + FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf) + } + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkSessionExtension.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkSessionExtension.scala new file mode 100644 index 0000000000000..0f2c146822a17 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkSessionExtension.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.SparkAdapterSupport +import org.apache.spark.sql.SparkSessionExtensions +import org.apache.spark.sql.hudi.analysis.HoodieAnalysis +import org.apache.spark.sql.parser.HoodieCommonSqlParser + +/** + * The Hoodie SparkSessionExtension for extending the syntax and add the rules. 
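 *
 * A typical way to enable it (an illustrative spark-shell invocation):
 * {{{
 *   spark-shell --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
 * }}}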
+ */ +class HoodieSparkSessionExtension extends (SparkSessionExtensions => Unit) + with SparkAdapterSupport { + override def apply(extensions: SparkSessionExtensions): Unit = { + extensions.injectParser { (session, parser) => + new HoodieCommonSqlParser(session, parser) + } + + HoodieAnalysis.customOptimizerRules.foreach { ruleBuilder => + extensions.injectOptimizerRule(ruleBuilder(_)) + } + + HoodieAnalysis.customResolutionRules.foreach { ruleBuilder => + extensions.injectResolutionRule(ruleBuilder(_)) + } + + HoodieAnalysis.customPostHocResolutionRules.foreach { ruleBuilder => + extensions.injectPostHocResolutionRule(ruleBuilder(_)) + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala new file mode 100644 index 0000000000000..9a031e9200472 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.SparkAdapterSupport +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.expressions.{And, Expression} +import org.apache.spark.sql.catalyst.plans.logical.{MergeIntoTable, SubqueryAlias} + +object HoodieSqlUtils extends SparkAdapterSupport { + + /** + * Get the TableIdentifier of the target table in MergeInto. + */ + def getMergeIntoTargetTableId(mergeInto: MergeIntoTable): TableIdentifier = { + val aliaId = mergeInto.targetTable match { + case SubqueryAlias(_, SubqueryAlias(tableId, _)) => tableId + case SubqueryAlias(tableId, _) => tableId + case plan => throw new IllegalArgumentException(s"Illegal plan $plan in target") + } + sparkAdapter.getCatalystPlanUtils.toTableIdentifier(aliaId) + } + + /** + * Split the expression to a sub expression seq by the AND operation. + * @param expression + * @return + */ + def splitByAnd(expression: Expression): Seq[Expression] = { + expression match { + case And(left, right) => + splitByAnd(left) ++ splitByAnd(right) + case exp => Seq(exp) + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala new file mode 100644 index 0000000000000..1ed0e5e1a456a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +import org.apache.avro.Schema +import org.apache.avro.generic.IndexedRecord +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hudi.avro.HoodieAvroWriteSupport +import org.apache.hudi.client.SparkTaskContextSupplier +import org.apache.hudi.common.bloom.{BloomFilter, BloomFilterFactory} +import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord} +import org.apache.hudi.common.util.BaseFileUtils +import org.apache.hudi.config.{HoodieIndexConfig, HoodieStorageConfig} +import org.apache.hudi.io.storage.{HoodieAvroParquetWriter, HoodieParquetConfig} +import org.apache.parquet.avro.AvroSchemaConverter +import org.apache.parquet.hadoop.metadata.CompressionCodecName +import org.apache.spark.sql.{DataFrame, SQLContext} + +import scala.collection.JavaConversions._ +import scala.collection.mutable._ + +object SparkHelpers { + @throws[Exception] + def skipKeysAndWriteNewFile(instantTime: String, fs: FileSystem, sourceFile: Path, destinationFile: Path, keysToSkip: Set[String]) { + val sourceRecords = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET).readAvroRecords(fs.getConf, sourceFile) + val schema: Schema = sourceRecords.get(0).getSchema + val filter: BloomFilter = BloomFilterFactory.createBloomFilter(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES_VALUE.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_FPP_VALUE.defaultValue.toDouble, + HoodieIndexConfig.BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_TYPE.defaultValue); + val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(fs.getConf).convert(schema), schema, org.apache.hudi.common.util.Option.of(filter)) + val parquetConfig: HoodieParquetConfig[HoodieAvroWriteSupport] = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieStorageConfig.PARQUET_BLOCK_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_PAGE_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.defaultValue.toInt, fs.getConf, HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue.toDouble) + + // Add current classLoad for config, if not will throw classNotFound of 'HoodieWrapperFileSystem'. 
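    // (That is, the Hadoop Configuration carried by parquetConfig is pinned to the current thread's
    // context class loader so that Hudi classes such as HoodieWrapperFileSystem can be resolved
    // reflectively when this helper is invoked from the Spark shell.)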
+ parquetConfig.getHadoopConf().setClassLoader(Thread.currentThread.getContextClassLoader) + + val writer = new HoodieAvroParquetWriter[IndexedRecord](destinationFile, parquetConfig, instantTime, new SparkTaskContextSupplier(), true) + for (rec <- sourceRecords) { + val key: String = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString + if (!keysToSkip.contains(key)) { + + writer.writeAvro(key, rec) + } + } + writer.close + } +} + +/** + * Bunch of Spark Shell/Scala stuff useful for debugging + */ +class SparkHelper(sqlContext: SQLContext, fs: FileSystem) { + + /** + * Print keys from a file + * + * @param file + */ + def printKeysFromFile(file: String) = { + getRowKeyDF(file).collect().foreach(println(_)) + } + + /** + * + * @param file + * @return + */ + def getRowKeyDF(file: String): DataFrame = { + sqlContext.read.parquet(file).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`") + } + + /** + * Does the rowKey actually exist in the file. + * + * @param rowKey + * @param file + * @return + */ + def isFileContainsKey(rowKey: String, file: String): Boolean = { + println(s"Checking $file for key $rowKey") + val ff = getRowKeyDF(file).filter(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}` = '$rowKey'") + if (ff.count() > 0) true else false + } + + /** + * Number of keys in a given file + * + * @param file + * @param sqlContext + */ + def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = { + val keyCount = getRowKeyDF(file).collect().length + println(keyCount) + keyCount + } + + /** + * + * Checks that all the keys in the file, have been added to the bloom filter + * in the footer + * + * @param conf + * @param sqlContext + * @param file + * @return + */ + def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = { + val bf = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET).readBloomFilterFromMetadata(conf, new Path(file)) + val foundCount = sqlContext.parquetFile(file) + .select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`") + .collect().count(r => !bf.mightContain(r.getString(0))) + val totalCount = getKeyCount(file, sqlContext) + println(s"totalCount: $totalCount, foundCount: $foundCount") + totalCount == foundCount + } + + def getDistinctKeyDF(paths: List[String]): DataFrame = { + sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala new file mode 100644 index 0000000000000..820891ce956e4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala @@ -0,0 +1,624 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.analysis + +import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.util.ReflectionUtils +import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSupport} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.catalog.{CatalogUtils, HoodieCatalogTable} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, GenericInternalRow, Literal, NamedExpression} +import org.apache.spark.sql.catalyst.plans.Inner +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, removeMetaFields} +import org.apache.spark.sql.hudi.HoodieSqlUtils._ +import org.apache.spark.sql.hudi.command._ +import org.apache.spark.sql.hudi.command.procedures.{HoodieProcedures, Procedure, ProcedureArgs} +import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlCommonUtils} +import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.{AnalysisException, SparkSession} + +import java.util +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +object HoodieAnalysis { + type RuleBuilder = SparkSession => Rule[LogicalPlan] + + def customOptimizerRules: Seq[RuleBuilder] = { + if (HoodieSparkUtils.gteqSpark3_1) { + val nestedSchemaPruningClass = + if (HoodieSparkUtils.gteqSpark3_3) { + "org.apache.spark.sql.execution.datasources.Spark33NestedSchemaPruning" + } else if (HoodieSparkUtils.gteqSpark3_2) { + "org.apache.spark.sql.execution.datasources.Spark32NestedSchemaPruning" + } else { + // spark 3.1 + "org.apache.spark.sql.execution.datasources.Spark31NestedSchemaPruning" + } + + val nestedSchemaPruningRule = ReflectionUtils.loadClass(nestedSchemaPruningClass).asInstanceOf[Rule[LogicalPlan]] + Seq(_ => nestedSchemaPruningRule) + } else { + Seq.empty + } + } + + def customResolutionRules: Seq[RuleBuilder] = { + val rules: ListBuffer[RuleBuilder] = ListBuffer( + // Default rules + session => HoodieResolveReferences(session), + session => HoodieAnalysis(session) + ) + + if (HoodieSparkUtils.gteqSpark3_2) { + val dataSourceV2ToV1FallbackClass = "org.apache.spark.sql.hudi.analysis.HoodieDataSourceV2ToV1Fallback" + val dataSourceV2ToV1Fallback: RuleBuilder = + session => ReflectionUtils.loadClass(dataSourceV2ToV1FallbackClass, session).asInstanceOf[Rule[LogicalPlan]] + + val spark3AnalysisClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3Analysis" + val spark3Analysis: RuleBuilder = + session => ReflectionUtils.loadClass(spark3AnalysisClass, session).asInstanceOf[Rule[LogicalPlan]] + + val resolveAlterTableCommandsClass = + if (HoodieSparkUtils.gteqSpark3_3) + "org.apache.spark.sql.hudi.Spark33ResolveHudiAlterTableCommand" + else "org.apache.spark.sql.hudi.Spark32ResolveHudiAlterTableCommand" + val resolveAlterTableCommands: RuleBuilder = + session => ReflectionUtils.loadClass(resolveAlterTableCommandsClass, session).asInstanceOf[Rule[LogicalPlan]] + + // NOTE: PLEASE READ CAREFULLY + // + // It's critical for this rules to follow in 
this order, so that DataSource V2 to V1 fallback + // is performed prior to other rules being evaluated + rules ++= Seq(dataSourceV2ToV1Fallback, spark3Analysis, resolveAlterTableCommands) + + } else if (HoodieSparkUtils.gteqSpark3_1) { + val spark31ResolveAlterTableCommandsClass = "org.apache.spark.sql.hudi.Spark31ResolveHudiAlterTableCommand" + val spark31ResolveAlterTableCommands: RuleBuilder = + session => ReflectionUtils.loadClass(spark31ResolveAlterTableCommandsClass, session).asInstanceOf[Rule[LogicalPlan]] + + rules ++= Seq(spark31ResolveAlterTableCommands) + } + + rules + } + + def customPostHocResolutionRules: Seq[RuleBuilder] = { + val rules: ListBuffer[RuleBuilder] = ListBuffer( + // Default rules + session => HoodiePostAnalysisRule(session) + ) + + if (HoodieSparkUtils.gteqSpark3_2) { + val spark3PostHocResolutionClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3PostAnalysisRule" + val spark3PostHocResolution: RuleBuilder = + session => ReflectionUtils.loadClass(spark3PostHocResolutionClass, session).asInstanceOf[Rule[LogicalPlan]] + + rules += spark3PostHocResolution + } + + rules + } + +} + +/** + * Rule for convert the logical plan to command. + * + * @param sparkSession + */ +case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan] + with SparkAdapterSupport { + + override def apply(plan: LogicalPlan): LogicalPlan = { + plan match { + // Convert to MergeIntoHoodieTableCommand + case m @ MergeIntoTable(target, _, _, _, _) + if m.resolved && sparkAdapter.isHoodieTable(target, sparkSession) => + MergeIntoHoodieTableCommand(m) + + // Convert to UpdateHoodieTableCommand + case u @ UpdateTable(table, _, _) + if u.resolved && sparkAdapter.isHoodieTable(table, sparkSession) => + UpdateHoodieTableCommand(u) + + // Convert to DeleteHoodieTableCommand + case d @ DeleteFromTable(table, _) + if d.resolved && sparkAdapter.isHoodieTable(table, sparkSession) => + DeleteHoodieTableCommand(d) + + // Convert to InsertIntoHoodieTableCommand + case l if sparkAdapter.getCatalystPlanUtils.isInsertInto(l) => + val (table, partition, query, overwrite, _) = sparkAdapter.getCatalystPlanUtils.getInsertIntoChildren(l).get + table match { + case relation: LogicalRelation if sparkAdapter.isHoodieTable(relation, sparkSession) => + new InsertIntoHoodieTableCommand(relation, query, partition, overwrite) + case _ => + l + } + + // Convert to CreateHoodieTableAsSelectCommand + case CreateTable(table, mode, Some(query)) + if query.resolved && sparkAdapter.isHoodieTable(table) => + CreateHoodieTableAsSelectCommand(table, mode, query) + + // Convert to CompactionHoodieTableCommand + case CompactionTable(table, operation, options) + if table.resolved && sparkAdapter.isHoodieTable(table, sparkSession) => + val tableId = getTableIdentifier(table) + val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(tableId) + CompactionHoodieTableCommand(catalogTable, operation, options) + // Convert to CompactionHoodiePathCommand + case CompactionPath(path, operation, options) => + CompactionHoodiePathCommand(path, operation, options) + // Convert to CompactionShowOnTable + case CompactionShowOnTable(table, limit) + if sparkAdapter.isHoodieTable(table, sparkSession) => + val tableId = getTableIdentifier(table) + val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(tableId) + CompactionShowHoodieTableCommand(catalogTable, limit) + // Convert to CompactionShowHoodiePathCommand + case CompactionShowOnPath(path, limit) => + CompactionShowHoodiePathCommand(path, limit) + 
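+      // For example (illustrative; the exact procedure names and arguments depend on the
+      // procedures registered in HoodieProcedures): a statement such as
+      //   CALL run_compaction(op => 'run', path => '/tmp/hudi_table')
+      // is parsed into a CallCommand and rewritten into a CallProcedureHoodieCommand below.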
// Convert to HoodieCallProcedureCommand + case c@CallCommand(_, _) => + val procedure: Option[Procedure] = loadProcedure(c.name) + val input = buildProcedureArgs(c.args) + if (procedure.nonEmpty) { + CallProcedureHoodieCommand(procedure.get, input) + } else { + c + } + + // Convert to CreateIndexCommand + case CreateIndex(table, indexName, indexType, ignoreIfExists, columns, properties, output) + if table.resolved && sparkAdapter.isHoodieTable(table, sparkSession) => + CreateIndexCommand( + getTableIdentifier(table), indexName, indexType, ignoreIfExists, columns, properties, output) + + // Convert to DropIndexCommand + case DropIndex(table, indexName, ignoreIfNotExists, output) + if table.resolved && sparkAdapter.isHoodieTable(table, sparkSession) => + DropIndexCommand(getTableIdentifier(table), indexName, ignoreIfNotExists, output) + + // Convert to ShowIndexesCommand + case ShowIndexes(table, output) + if table.resolved && sparkAdapter.isHoodieTable(table, sparkSession) => + ShowIndexesCommand(getTableIdentifier(table), output) + + // Covert to RefreshCommand + case RefreshIndex(table, indexName, output) + if table.resolved && sparkAdapter.isHoodieTable(table, sparkSession) => + RefreshIndexCommand(getTableIdentifier(table), indexName, output) + + case _ => plan + } + } + + private def loadProcedure(name: Seq[String]): Option[Procedure] = { + val procedure: Option[Procedure] = if (name.nonEmpty) { + val builder = HoodieProcedures.newBuilder(name.last) + if (builder != null) { + Option(builder.build) + } else { + throw new AnalysisException(s"procedure: ${name.last} is not exists") + } + } else { + None + } + procedure + } + + private def buildProcedureArgs(exprs: Seq[CallArgument]): ProcedureArgs = { + val values = new Array[Any](exprs.size) + var isNamedArgs: Boolean = false + val map = new util.LinkedHashMap[String, Int]() + for (index <- exprs.indices) { + exprs(index) match { + case expr: NamedArgument => + map.put(expr.name, index) + values(index) = expr.expr.eval() + isNamedArgs = true + case _ => + map.put(index.toString, index) + values(index) = exprs(index).expr.eval() + isNamedArgs = false + } + } + ProcedureArgs(isNamedArgs, map, new GenericInternalRow(values)) + } +} + +/** + * Rule for resolve hoodie's extended syntax or rewrite some logical plan. + * + * @param sparkSession + */ +case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan] + with SparkAdapterSupport { + private lazy val analyzer = sparkSession.sessionState.analyzer + + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { + // Resolve merge into + case mergeInto @ MergeIntoTable(target, source, mergeCondition, matchedActions, notMatchedActions) + if sparkAdapter.isHoodieTable(target, sparkSession) && target.resolved => + val resolver = sparkSession.sessionState.conf.resolver + val resolvedSource = analyzer.execute(source) + try { + analyzer.checkAnalysis(resolvedSource) + } catch { + case e: AnalysisException => + val ae = new AnalysisException(e.message, e.line, e.startPosition, Option(resolvedSource)) + ae.setStackTrace(e.getStackTrace) + throw ae + } + + def isInsertOrUpdateStar(assignments: Seq[Assignment]): Boolean = { + if (assignments.isEmpty) { + true + } else { + // This is a Hack for test if it is "update set *" or "insert *" for spark3. 
+          // As Spark 3's own ResolveReferences rule appends the first five columns of the target
+          // table (which are the Hudi meta fields) to the assignments for "update set *" and
+          // "insert *", we check whether the first five assignment field names are the meta
+          // fields to decide whether it is "update set *" or "insert *".
+          // We can rely on this because, in the normal case, updating or setting Hudi's meta
+          // fields in a SQL statement is not allowed: they are system fields whose values cannot
+          // be set by the user.
+          if (HoodieSparkUtils.isSpark3) {
+            val resolvedAssignments = assignments.map { assign =>
+              val resolvedKey = assign.key match {
+                case c if !c.resolved =>
+                  resolveExpressionFrom(target)(c)
+                case o => o
+              }
+              Assignment(resolvedKey, null)
+            }
+            val assignmentFieldNames = resolvedAssignments.map(_.key).map {
+              case attr: AttributeReference =>
+                attr.name
+              case _ => ""
+            }.toArray
+            val metaFields = HoodieRecord.HOODIE_META_COLUMNS.asScala
+            if (assignmentFieldNames.take(metaFields.length).mkString(",").startsWith(metaFields.mkString(","))) {
+              true
+            } else {
+              false
+            }
+          } else {
+            false
+          }
+        }
+      }
+
+      def resolveConditionAssignments(condition: Option[Expression],
+                                      assignments: Seq[Assignment]): (Option[Expression], Seq[Assignment]) = {
+        val resolvedCondition = condition.map(resolveExpressionFrom(resolvedSource)(_))
+        val resolvedAssignments = if (isInsertOrUpdateStar(assignments)) {
+          // An empty assignments list means "insert *" or "update set *"
+          val resolvedSourceOutput = resolvedSource.output.filter(attr => !HoodieSqlCommonUtils.isMetaField(attr.name))
+          val targetOutput = target.output.filter(attr => !HoodieSqlCommonUtils.isMetaField(attr.name))
+          val resolvedSourceColumnNames = resolvedSourceOutput.map(_.name)
+
+          if (targetOutput.filter(attr => resolvedSourceColumnNames.exists(resolver(_, attr.name))).equals(targetOutput)) {
+            // If the source table's columns contain all of the target table's columns,
+            // we assign the source fields to the target fields by column name matching.
+            targetOutput.map(targetAttr => {
+              val sourceAttr = resolvedSourceOutput.find(f => resolver(f.name, targetAttr.name)).get
+              Assignment(targetAttr, sourceAttr)
+            })
+          } else {
+            // Otherwise, we assign the source fields to the target fields by position.
+            targetOutput
+              .zip(resolvedSourceOutput)
+              .map { case (targetAttr, sourceAttr) => Assignment(targetAttr, sourceAttr) }
+          }
+        } else {
+          // For Spark 3.2, the assignments of InsertStarAction/UpdateStarAction contain the meta fields.
+          val withoutMetaAttrs = assignments.filterNot { assignment =>
+            if (assignment.key.isInstanceOf[Attribute]) {
+              HoodieSqlCommonUtils.isMetaField(assignment.key.asInstanceOf[Attribute].name)
+            } else {
+              false
+            }
+          }
+          withoutMetaAttrs.map { assignment =>
+            val resolvedKey = resolveExpressionFrom(target)(assignment.key)
+            val resolvedValue = resolveExpressionFrom(resolvedSource, Some(target))(assignment.value)
+            Assignment(resolvedKey, resolvedValue)
+          }
+        }
+        (resolvedCondition, resolvedAssignments)
+      }
+
+      // Resolve the merge condition
+      val resolvedMergeCondition = resolveExpressionFrom(resolvedSource, Some(target))(mergeCondition)
+
+      // Resolve the matchedActions
+      val resolvedMatchedActions = matchedActions.map {
+        case UpdateAction(condition, assignments) =>
+          val (resolvedCondition, resolvedAssignments) =
+            resolveConditionAssignments(condition, assignments)
+
+          // Get the target table type and pre-combine field.
+ val targetTableId = getMergeIntoTargetTableId(mergeInto) + val targetTable = + sparkSession.sessionState.catalog.getTableMetadata(targetTableId) + val tblProperties = targetTable.storage.properties ++ targetTable.properties + val targetTableType = HoodieOptionConfig.getTableType(tblProperties) + val preCombineField = HoodieOptionConfig.getPreCombineField(tblProperties) + + // Get the map of target attribute to value of the update assignments. + val target2Values = resolvedAssignments.map { + case Assignment(attr: AttributeReference, value) => + attr.name -> value + case o => throw new IllegalArgumentException(s"Assignment key must be an attribute, current is: ${o.key}") + }.toMap + + // Validate if there are incorrect target attributes. + val targetColumnNames = removeMetaFields(target.output).map(_.name) + val unKnowTargets = target2Values.keys + .filterNot(name => targetColumnNames.exists(resolver(_, name))) + if (unKnowTargets.nonEmpty) { + throw new AnalysisException(s"Cannot find target attributes: ${unKnowTargets.mkString(",")}.") + } + + // Fill the missing target attribute in the update action for COW table to support partial update. + // e.g. If the update action missing 'id' attribute, we fill a "id = target.id" to the update action. + val newAssignments = removeMetaFields(target.output) + .map(attr => { + val valueOption = target2Values.find(f => resolver(f._1, attr.name)) + // TODO support partial update for MOR. + if (valueOption.isEmpty && targetTableType == MOR_TABLE_TYPE_OPT_VAL) { + throw new AnalysisException(s"Missing specify the value for target field: '${attr.name}' in merge into update action" + + s" for MOR table. Currently we cannot support partial update for MOR," + + s" please complete all the target fields just like '...update set id = s0.id, name = s0.name ....'") + } + if (preCombineField.isDefined && preCombineField.get.equalsIgnoreCase(attr.name) + && valueOption.isEmpty) { + throw new AnalysisException(s"Missing specify value for the preCombineField:" + + s" ${preCombineField.get} in merge-into update action. You should add" + + s" '... update set ${preCombineField.get} = xx....' to the when-matched clause.") + } + Assignment(attr, if (valueOption.isEmpty) attr else valueOption.get._2) + }) + UpdateAction(resolvedCondition, newAssignments) + case DeleteAction(condition) => + val resolvedCondition = condition.map(resolveExpressionFrom(resolvedSource)(_)) + DeleteAction(resolvedCondition) + case action: MergeAction => + // SPARK-34962: use UpdateStarAction as the explicit representation of * in UpdateAction. + // So match and covert this in Spark3.2 env. + val (resolvedCondition, resolvedAssignments) = + resolveConditionAssignments(action.condition, Seq.empty) + UpdateAction(resolvedCondition, resolvedAssignments) + } + // Resolve the notMatchedActions + val resolvedNotMatchedActions = notMatchedActions.map { + case InsertAction(condition, assignments) => + val (resolvedCondition, resolvedAssignments) = + resolveConditionAssignments(condition, assignments) + InsertAction(resolvedCondition, resolvedAssignments) + case action: MergeAction => + // SPARK-34962: use InsertStarAction as the explicit representation of * in InsertAction. + // So match and covert this in Spark3.2 env. 
+ val (resolvedCondition, resolvedAssignments) = + resolveConditionAssignments(action.condition, Seq.empty) + InsertAction(resolvedCondition, resolvedAssignments) + } + // Return the resolved MergeIntoTable + MergeIntoTable(target, resolvedSource, resolvedMergeCondition, + resolvedMatchedActions, resolvedNotMatchedActions) + + // Resolve update table + case UpdateTable(table, assignments, condition) + if sparkAdapter.isHoodieTable(table, sparkSession) && table.resolved => + // Resolve condition + val resolvedCondition = condition.map(resolveExpressionFrom(table)(_)) + // Resolve assignments + val resolvedAssignments = assignments.map(assignment => { + val resolvedKey = resolveExpressionFrom(table)(assignment.key) + val resolvedValue = resolveExpressionFrom(table)(assignment.value) + Assignment(resolvedKey, resolvedValue) + }) + // Return the resolved UpdateTable + UpdateTable(table, resolvedAssignments, resolvedCondition) + + // Resolve Delete Table + case dft @ DeleteFromTable(table, condition) + if sparkAdapter.isHoodieTable(table, sparkSession) && table.resolved => + val resolveExpression = resolveExpressionFrom(table, None)(_) + sparkAdapter.resolveDeleteFromTable(dft, resolveExpression) + + // Append the meta field to the insert query to walk through the validate for the + // number of insert fields with the number of the target table fields. + case l if sparkAdapter.getCatalystPlanUtils.isInsertInto(l) => + val (table, partition, query, overwrite, ifPartitionNotExists) = + sparkAdapter.getCatalystPlanUtils.getInsertIntoChildren(l).get + + if (sparkAdapter.isHoodieTable(table, sparkSession) && query.resolved && + !containUnResolvedStar(query) && + !checkAlreadyAppendMetaField(query)) { + val metaFields = HoodieRecord.HOODIE_META_COLUMNS.asScala.map( + Alias(Literal.create(null, StringType), _)()).toArray[NamedExpression] + val newQuery = query match { + case project: Project => + val withMetaFieldProjects = + metaFields ++ project.projectList + // Append the meta fields to the insert query. 
+              Project(withMetaFieldProjects, project.child)
+            case _ =>
+              val withMetaFieldProjects = metaFields ++ query.output
+              Project(withMetaFieldProjects, query)
+          }
+          sparkAdapter.getCatalystPlanUtils.createInsertInto(table, partition, newQuery, overwrite, ifPartitionNotExists)
+        } else {
+          l
+        }
+
+      case l if sparkAdapter.getCatalystPlanUtils.isRelationTimeTravel(l) =>
+        val (plan: UnresolvedRelation, timestamp, version) =
+          sparkAdapter.getCatalystPlanUtils.getRelationTimeTravel(l).get
+
+        if (timestamp.isEmpty && version.nonEmpty) {
+          throw new AnalysisException(
+            "version expression is not supported for time travel")
+        }
+
+        val tableIdentifier = sparkAdapter.getCatalystPlanUtils.toTableIdentifier(plan)
+        if (sparkAdapter.isHoodieTable(tableIdentifier, sparkSession)) {
+          val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier)
+          val table = hoodieCatalogTable.table
+          val pathOption = table.storage.locationUri.map("path" -> CatalogUtils.URIToString(_))
+          val instantOption = Map(
+            DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key -> timestamp.get.toString())
+          val dataSource =
+            DataSource(
+              sparkSession,
+              userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema),
+              partitionColumns = table.partitionColumnNames,
+              bucketSpec = table.bucketSpec,
+              className = table.provider.get,
+              options = table.storage.properties ++ pathOption ++ instantOption,
+              catalogTable = Some(table))
+
+          LogicalRelation(dataSource.resolveRelation(checkFilesExist = false), table)
+        } else {
+          l
+        }
+
+      case p => p
+  }
+
+  private def containUnResolvedStar(query: LogicalPlan): Boolean = {
+    query match {
+      case project: Project => project.projectList.exists(_.isInstanceOf[UnresolvedStar])
+      case _ => false
+    }
+  }
+
+  /**
+   * Check whether the query of the insert statement has already appended the meta fields, to
+   * avoid appending them twice.
+   *
+   * @param query
+   * @return
+   */
+  private def checkAlreadyAppendMetaField(query: LogicalPlan): Boolean = {
+    query.output.take(HoodieRecord.HOODIE_META_COLUMNS.size())
+      .filter(isMetaField)
+      .map {
+        case AttributeReference(name, _, _, _) => name.toLowerCase
+        case other => throw new IllegalArgumentException(s"$other should not be a hoodie meta field")
+      }.toSet == HoodieRecord.HOODIE_META_COLUMNS.asScala.toSet
+  }
+
+  private def isMetaField(exp: Expression): Boolean = {
+    val metaFields = HoodieRecord.HOODIE_META_COLUMNS.asScala.toSet
+    exp match {
+      case Alias(_, name) if metaFields.contains(name.toLowerCase) => true
+      case AttributeReference(name, _, _, _) if metaFields.contains(name.toLowerCase) => true
+      case _ => false
+    }
+  }
+
+  /**
+   * Resolve the expression.
+   * 1. Fake a project for the expression based on the source plan.
+   * 2. Resolve the fake project.
+   * 3. Get the resolved expression from the faked project.
+   * @param left The left source plan for the expression.
+   * @param right The right source plan for the expression.
+   * @param expression The expression to resolve.
+   * @return The resolved expression.
+   */
+  private def resolveExpressionFrom(left: LogicalPlan, right: Option[LogicalPlan] = None)
+                                   (expression: Expression): Expression = {
+    // Fake a project for the expression based on the source plan.
+ val fakeProject = if (right.isDefined) { + Project(Seq(Alias(expression, "_c0")()), + sparkAdapter.getCatalystPlanUtils.createJoin(left, right.get, Inner)) + } else { + Project(Seq(Alias(expression, "_c0")()), + left) + } + // Resolve the fake project + val resolvedProject = + analyzer.ResolveReferences.apply(fakeProject).asInstanceOf[Project] + val unResolvedAttrs = resolvedProject.projectList.head.collect { + case attr: UnresolvedAttribute => attr + } + if (unResolvedAttrs.nonEmpty) { + throw new AnalysisException(s"Cannot resolve ${unResolvedAttrs.mkString(",")} in " + + s"${expression.sql}, the input " + s"columns is: [${fakeProject.child.output.mkString(", ")}]") + } + // Fetch the resolved expression from the fake project. + resolvedProject.projectList.head.asInstanceOf[Alias].child + } +} + +/** + * Rule for rewrite some spark commands to hudi's implementation. + * @param sparkSession + */ +case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = { + plan match { + // Rewrite the CreateDataSourceTableCommand to CreateHoodieTableCommand + case CreateDataSourceTableCommand(table, ignoreIfExists) + if sparkAdapter.isHoodieTable(table) => + CreateHoodieTableCommand(table, ignoreIfExists) + // Rewrite the DropTableCommand to DropHoodieTableCommand + case DropTableCommand(tableName, ifExists, false, purge) + if sparkAdapter.isHoodieTable(tableName, sparkSession) => + DropHoodieTableCommand(tableName, ifExists, false, purge) + // Rewrite the AlterTableDropPartitionCommand to AlterHoodieTableDropPartitionCommand + case AlterTableDropPartitionCommand(tableName, specs, ifExists, purge, retainData) + if sparkAdapter.isHoodieTable(tableName, sparkSession) => + AlterHoodieTableDropPartitionCommand(tableName, specs, ifExists, purge, retainData) + // Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand + // Rewrite the AlterTableAddColumnsCommand to AlterHoodieTableAddColumnsCommand + case AlterTableAddColumnsCommand(tableId, colsToAdd) + if sparkAdapter.isHoodieTable(tableId, sparkSession) => + AlterHoodieTableAddColumnsCommand(tableId, colsToAdd) + // Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand + case AlterTableRenameCommand(oldName, newName, isView) + if !isView && sparkAdapter.isHoodieTable(oldName, sparkSession) => + AlterHoodieTableRenameCommand(oldName, newName, isView) + // Rewrite the AlterTableChangeColumnCommand to AlterHoodieTableChangeColumnCommand + case AlterTableChangeColumnCommand(tableName, columnName, newColumn) + if sparkAdapter.isHoodieTable(tableName, sparkSession) => + AlterHoodieTableChangeColumnCommand(tableName, columnName, newColumn) + // SPARK-34238: the definition of ShowPartitionsCommand has been changed in Spark3.2. + // Match the class type instead of call the `unapply` method. 
+ case s: ShowPartitionsCommand + if sparkAdapter.isHoodieTable(s.tableName, sparkSession) => + ShowHoodieTablePartitionsCommand(s.tableName, s.spec) + // Rewrite TruncateTableCommand to TruncateHoodieTableCommand + case TruncateTableCommand(tableName, partitionSpec) + if sparkAdapter.isHoodieTable(tableName, sparkSession) => + TruncateHoodieTableCommand(tableName, partitionSpec) + case _ => plan + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CallProcedureHoodieCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CallProcedureHoodieCommand.scala new file mode 100644 index 0000000000000..f63f4115e9195 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CallProcedureHoodieCommand.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.hudi.command.procedures.{Procedure, ProcedureArgs} +import org.apache.spark.sql.{Row, SparkSession} + +import scala.collection.Seq + +case class CallProcedureHoodieCommand( + procedure: Procedure, + args: ProcedureArgs) extends HoodieLeafRunnableCommand { + + override def output: Seq[Attribute] = procedure.outputType.toAttributes + + override def run(sparkSession: SparkSession): Seq[Row] = { + procedure.call(args) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodiePathCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodiePathCommand.scala new file mode 100644 index 0000000000000..57aff092b7429 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodiePathCommand.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.{CompactionOperation, RUN, SCHEDULE} +import org.apache.spark.sql.hudi.command.procedures.{HoodieProcedureUtils, RunCompactionProcedure} +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.unsafe.types.UTF8String + +@Deprecated +case class CompactionHoodiePathCommand(path: String, + operation: CompactionOperation, + instantTimestamp: Option[Long] = None) + extends HoodieLeafRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val metaClient = HoodieTableMetaClient.builder().setBasePath(path) + .setConf(sparkSession.sessionState.newHadoopConf()).build() + assert(metaClient.getTableType == HoodieTableType.MERGE_ON_READ, s"Must compaction on a Merge On Read table.") + + val op = operation match { + case SCHEDULE => UTF8String.fromString("schedule") + case RUN => UTF8String.fromString("run") + case _ => throw new UnsupportedOperationException(s"Unsupported compaction operation: $operation") + } + + var args: Map[String, Any] = Map("op" -> op, "path" -> UTF8String.fromString(path)) + instantTimestamp.foreach(timestamp => args += "timestamp" -> timestamp) + val procedureArgs = HoodieProcedureUtils.buildProcedureArgs(args) + RunCompactionProcedure.builder.get().build.call(procedureArgs) + } + + override val output: Seq[Attribute] = RunCompactionProcedure.builder.get().build.outputType.toAttributes +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodieTableCommand.scala new file mode 100644 index 0000000000000..adaaeae9e55c9 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodieTableCommand.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.CompactionOperation +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.getTableLocation +import org.apache.spark.sql.hudi.command.procedures.RunCompactionProcedure +import org.apache.spark.sql.{Row, SparkSession} + +@Deprecated +case class CompactionHoodieTableCommand(table: CatalogTable, + operation: CompactionOperation, + instantTimestamp: Option[Long]) + extends HoodieLeafRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val basePath = getTableLocation(table, sparkSession) + CompactionHoodiePathCommand(basePath, operation, instantTimestamp).run(sparkSession) + } + + override val output: Seq[Attribute] = RunCompactionProcedure.builder.get().build.outputType.toAttributes +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionShowHoodiePathCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionShowHoodiePathCommand.scala new file mode 100644 index 0000000000000..95a4ecf7800e6 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionShowHoodiePathCommand.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.hudi.command.procedures.{HoodieProcedureUtils, ShowCompactionProcedure} +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.unsafe.types.UTF8String + +@Deprecated +case class CompactionShowHoodiePathCommand(path: String, limit: Int) + extends HoodieLeafRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val metaClient = HoodieTableMetaClient.builder().setBasePath(path) + .setConf(sparkSession.sessionState.newHadoopConf()).build() + + assert(metaClient.getTableType == HoodieTableType.MERGE_ON_READ, + s"Cannot show compaction on a Non Merge On Read table.") + + val args = Map("path" -> UTF8String.fromString(path), "limit" -> limit) + val procedureArgs = HoodieProcedureUtils.buildProcedureArgs(args) + ShowCompactionProcedure.builder.get().build.call(procedureArgs) + } + + override val output: Seq[Attribute] = ShowCompactionProcedure.builder.get().build.outputType.toAttributes +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionShowHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionShowHoodieTableCommand.scala new file mode 100644 index 0000000000000..afd15d5153db6 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionShowHoodieTableCommand.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.getTableLocation +import org.apache.spark.sql.hudi.command.procedures.ShowCompactionProcedure +import org.apache.spark.sql.{Row, SparkSession} + +@Deprecated +case class CompactionShowHoodieTableCommand(table: CatalogTable, limit: Int) + extends HoodieLeafRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val basePath = getTableLocation(table, sparkSession) + CompactionShowHoodiePathCommand(basePath, limit).run(sparkSession) + } + + override val output: Seq[Attribute] = ShowCompactionProcedure.builder.get().build.outputType.toAttributes +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala new file mode 100644 index 0000000000000..1f8d009530146 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.hive.HiveSyncConfigHolder +import org.apache.hudi.sql.InsertMode +import org.apache.hudi.sync.common.util.ConfigUtils +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable.needFilterProps +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, HoodieCatalogTable} +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} + +import scala.collection.JavaConverters._ + +/** + * Command for create table as query statement. 
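+ *
+ * Example (illustrative only; the table name, properties and query are arbitrary):
+ * {{{
+ *   CREATE TABLE h_ctas USING hudi
+ *   TBLPROPERTIES (primaryKey = 'id', preCombineField = 'ts')
+ *   AS SELECT 1 AS id, 'a1' AS name, 1000 AS ts
+ * }}}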
+ */ +case class CreateHoodieTableAsSelectCommand( + table: CatalogTable, + mode: SaveMode, + query: LogicalPlan) extends HoodieLeafRunnableCommand { + override def innerChildren: Seq[QueryPlan[_]] = Seq(query) + + override def run(sparkSession: SparkSession): Seq[Row] = { + checkState(table.tableType != CatalogTableType.VIEW) + checkState(table.provider.isDefined) + + val hasQueryAsProp = (table.storage.properties ++ table.properties).contains(ConfigUtils.IS_QUERY_AS_RO_TABLE) + if (hasQueryAsProp) { + throw new AnalysisException("Not support CTAS for the ro/rt table") + } + + val sessionState = sparkSession.sessionState + val db = table.identifier.database.getOrElse(sessionState.catalog.getCurrentDatabase) + val qualifiedTableIdentifier = table.identifier.copy(database = Some(db)) + val tableName = qualifiedTableIdentifier.unquotedString + + if (sessionState.catalog.tableExists(qualifiedTableIdentifier)) { + checkState(mode != SaveMode.Overwrite, + s"Expect the table $tableName has been dropped when the save mode is Overwrite") + + if (mode == SaveMode.ErrorIfExists) { + throw new AnalysisException(s"Table $tableName already exists. You need to drop it first.") + } + + if (mode == SaveMode.Ignore) { + // Since the table already exists and the save mode is Ignore, we will just return. + // scalastyle:off + return Seq.empty + // scalastyle:on + } + } + + // Remove some properties should not be used + val updatedStorageFormat = table.storage.copy( + properties = table.storage.properties -- needFilterProps) + + val updatedTable = table.copy( + identifier = qualifiedTableIdentifier, + storage = updatedStorageFormat, + // TODO need to add meta-fields here + schema = query.schema, + properties = table.properties -- needFilterProps + ) + + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, updatedTable) + val tablePath = hoodieCatalogTable.tableLocation + val hadoopConf = sparkSession.sessionState.newHadoopConf() + + try { + // Init hoodie table + hoodieCatalogTable.initHoodieTable() + + val tableProperties = hoodieCatalogTable.catalogProperties + // NOTE: Users might be specifying write-configuration (inadvertently) as options or table properties + // in CTAS, therefore we need to make sure that these are appropriately propagated to the + // write operation + val options = tableProperties ++ Map( + HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE.key -> (table.tableType == CatalogTableType.MANAGED).toString, + HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES.key -> ConfigUtils.configToString(tableProperties.asJava), + HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES.key -> ConfigUtils.configToString(updatedTable.properties.asJava), + DataSourceWriteOptions.SQL_INSERT_MODE.key -> InsertMode.NON_STRICT.value(), + DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key -> "true" + ) + val partitionSpec = updatedTable.partitionColumnNames.map((_, None)).toMap + val success = InsertIntoHoodieTableCommand.run(sparkSession, updatedTable, query, partitionSpec, + mode == SaveMode.Overwrite, refreshTable = false, extraOptions = options) + if (success) { + // If write success, create the table in catalog if it has not synced to the + // catalog by the meta sync. 
+ if (!sparkSession.sessionState.catalog.tableExists(qualifiedTableIdentifier)) { + // create catalog table for this hoodie table + CreateHoodieTableCommand.createTableInCatalog(sparkSession, hoodieCatalogTable, mode == SaveMode.Ignore) + } + } else { // failed to insert data, clear table path + clearTablePath(tablePath, hadoopConf) + } + } catch { + case e: Throwable => // failed to insert data, clear table path + clearTablePath(tablePath, hadoopConf) + throw e + } + Seq.empty[Row] + } + + private def clearTablePath(tablePath: String, conf: Configuration): Unit = { + val path = new Path(tablePath) + val fs = path.getFileSystem(conf) + fs.delete(path, true) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala new file mode 100644 index 0000000000000..e1dc8daa4ca97 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/DeleteHoodieTableCommand.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.SparkAdapterSupport +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ +import org.apache.spark.sql.hudi.ProvidesHoodieConfig + +case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends HoodieLeafRunnableCommand + with SparkAdapterSupport with ProvidesHoodieConfig { + + private val table = deleteTable.table + + private val tableId = getTableIdentifier(table) + + override def run(sparkSession: SparkSession): Seq[Row] = { + logInfo(s"start execute delete command for $tableId") + + // Remove meta fields from the data frame + var df = removeMetaFields(Dataset.ofRows(sparkSession, table)) + val condition = sparkAdapter.extractDeleteCondition(deleteTable) + if (condition != null) df = df.filter(Column(condition)) + + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableId) + val config = buildHoodieDeleteTableConfig(hoodieCatalogTable, sparkSession) + df.write + .format("hudi") + .mode(SaveMode.Append) + .options(config) + .save() + sparkSession.catalog.refreshTable(tableId.unquotedString) + logInfo(s"Finish execute delete command for $tableId") + Seq.empty[Row] + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/IndexCommands.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/IndexCommands.scala new file mode 100644 index 0000000000000..5d73af31a9497 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/IndexCommands.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.common.index.HoodieIndex +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.{Row, SparkSession} + +case class CreateIndexCommand( + tableId: TableIdentifier, + indexName: String, + indexType: String, + ignoreIfExists: Boolean, + columns: Seq[(Attribute, Map[String, String])], + properties: Map[String, String], + override val output: Seq[Attribute]) extends IndexBaseCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + // The implementation for different index type + Seq.empty + } +} + +case class DropIndexCommand( + tableId: TableIdentifier, + indexName: String, + ignoreIfNotExists: Boolean, + override val output: Seq[Attribute]) extends IndexBaseCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + // The implementation for different index type + Seq.empty + } +} + +case class ShowIndexesCommand( + tableId: TableIdentifier, + override val output: Seq[Attribute]) extends IndexBaseCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + // The implementation for different index type + Seq.empty + } +} + +case class RefreshIndexCommand( + tableId: TableIdentifier, + indexName: String, + override val output: Seq[Attribute]) extends IndexBaseCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + // The implementation for different index type + Seq.empty + } +} + +abstract class IndexBaseCommand extends HoodieLeafRunnableCommand with Logging { + + /** + * Check hoodie index exists. In a hoodie table, hoodie index name + * must be unique, so the index name will be checked firstly, + * + * @param secondaryIndexes Current hoodie indexes + * @param indexName The index name to be checked + * @param colNames The column names to be checked + * @return true if the index exists + */ + def indexExists( + secondaryIndexes: Option[Array[HoodieIndex]], + indexName: String, + indexType: Option[String] = None, + colNames: Option[Array[String]] = None): Boolean = { + secondaryIndexes.exists(i => { + i.exists(_.getIndexName.equals(indexName)) || + // Index type and column name need to be checked if present + indexType.exists(t => + colNames.exists(c => + i.exists(index => + index.getIndexType.name().equalsIgnoreCase(t) && index.getColNames.sameElements(c)))) + }) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala new file mode 100644 index 0000000000000..0228e5ddcf7c3 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.{HoodieSparkSqlWriter, SparkAdapterSupport} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable} +import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Literal, NamedExpression} +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ +import org.apache.spark.sql.hudi.ProvidesHoodieConfig +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql._ + +/** + * Command for insert into Hudi table. + * + * This is correspondent to Spark's native [[InsertIntoStatement]] + * + * @param logicalRelation the [[LogicalRelation]] representing the table to be writing into. + * @param query the logical plan representing data to be written + * @param partitionSpec a map from the partition key to the partition value (optional). + * If the value is missing, dynamic partition insert will be performed. + * As an example, `INSERT INTO tbl PARTITION (a=1, b=2) AS` would have + * Map('a' -> Some('1'), 'b' -> Some('2')), + * and `INSERT INTO tbl PARTITION (a=1, b) AS ...` + * would have Map('a' -> Some('1'), 'b' -> None). + * @param overwrite overwrite existing table or partitions. + */ +case class InsertIntoHoodieTableCommand(logicalRelation: LogicalRelation, + query: LogicalPlan, + partitionSpec: Map[String, Option[String]], + overwrite: Boolean) + extends HoodieLeafRunnableCommand { + override def innerChildren: Seq[QueryPlan[_]] = Seq(query) + + override def run(sparkSession: SparkSession): Seq[Row] = { + assert(logicalRelation.catalogTable.isDefined, "Missing catalog table") + + val table = logicalRelation.catalogTable.get + InsertIntoHoodieTableCommand.run(sparkSession, table, query, partitionSpec, overwrite) + Seq.empty[Row] + } +} + +object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig with SparkAdapterSupport { + + /** + * Run the insert query. We support both dynamic partition insert and static partition insert. + * @param sparkSession The spark session. + * @param table The insert table. + * @param query The insert query. + * @param partitionSpec The specified insert partition map. + * e.g. "insert into h(dt = '2021') select id, name from src" + * "dt" is the key in the map and "2021" is the partition value. If the + * partition value has not specified(in the case of dynamic partition) + * , it is None in the map. + * @param overwrite Whether to overwrite the table. + * @param refreshTable Whether to refresh the table after insert finished. + * @param extraOptions Extra options for insert. 
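+ *
+ * For example (illustrative), for the partitionSpec parameter:
+ * {{{
+ *   INSERT INTO h PARTITION (dt = '2021-01-01', hh) SELECT id, name, hh FROM src
+ *   // => partitionSpec = Map("dt" -> Some("2021-01-01"), "hh" -> None)
+ *   //    ("dt" is a static partition value, "hh" is resolved dynamically)
+ * }}}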
+ */ + def run(sparkSession: SparkSession, + table: CatalogTable, + query: LogicalPlan, + partitionSpec: Map[String, Option[String]], + overwrite: Boolean, + refreshTable: Boolean = true, + extraOptions: Map[String, String] = Map.empty): Boolean = { + val catalogTable = new HoodieCatalogTable(sparkSession, table) + val config = buildHoodieInsertConfig(catalogTable, sparkSession, overwrite, partitionSpec, extraOptions) + + // NOTE: In case of partitioned table we override specified "overwrite" parameter + // to instead append to the dataset + val mode = if (overwrite && catalogTable.partitionFields.isEmpty) { + SaveMode.Overwrite + } else { + SaveMode.Append + } + + val alignedQuery = alignQueryOutput(query, catalogTable, partitionSpec, sparkSession.sessionState.conf) + + val (success, _, _, _, _, _) = HoodieSparkSqlWriter.write(sparkSession.sqlContext, mode, config, Dataset.ofRows(sparkSession, alignedQuery)) + + if (!success) { + throw new HoodieException("Insert Into to Hudi table failed") + } + + if (success && refreshTable) { + sparkSession.catalog.refreshTable(table.identifier.unquotedString) + } + + success + } + + /** + * Align provided [[query]]'s output with the expected [[catalogTable]] schema by + * + *
      + *
+ *   • Performing type coercion (casting corresponding outputs, where needed)
+ *   • Adding aliases (matching column names) to corresponding outputs
    + * + * @param query target query whose output is to be inserted + * @param catalogTable catalog table + * @param partitionsSpec partition spec specifying static/dynamic partition values + * @param conf Spark's [[SQLConf]] + */ + private def alignQueryOutput(query: LogicalPlan, + catalogTable: HoodieCatalogTable, + partitionsSpec: Map[String, Option[String]], + conf: SQLConf): LogicalPlan = { + + val targetPartitionSchema = catalogTable.partitionSchema + val staticPartitionValues = filterStaticPartitionValues(partitionsSpec) + + // Make sure we strip out meta-fields from the incoming dataset (these will have to be discarded anyway) + val cleanedQuery = stripMetaFields(query) + // To validate and align properly output of the query, we simply filter out partition columns with already + // provided static values from the table's schema + // + // NOTE: This is a crucial step: since coercion might rely on either of a) name-based or b) positional-based + // matching it's important to strip out partition columns, having static values provided in the partition spec, + // since such columns wouldn't be otherwise specified w/in the query itself and therefore couldn't be matched + // positionally for example + val expectedQueryColumns = catalogTable.tableSchemaWithoutMetaFields.filterNot(f => staticPartitionValues.contains(f.name)) + val coercedQueryOutput = coerceQueryOutputColumns(StructType(expectedQueryColumns), cleanedQuery, catalogTable, conf) + // After potential reshaping validate that the output of the query conforms to the table's schema + validate(removeMetaFields(coercedQueryOutput.schema), partitionsSpec, catalogTable) + + val staticPartitionValuesExprs = createStaticPartitionValuesExpressions(staticPartitionValues, targetPartitionSchema, conf) + + Project(coercedQueryOutput.output ++ staticPartitionValuesExprs, coercedQueryOutput) + } + + private def coerceQueryOutputColumns(expectedSchema: StructType, + query: LogicalPlan, + catalogTable: HoodieCatalogTable, + conf: SQLConf): LogicalPlan = { + val planUtils = sparkAdapter.getCatalystPlanUtils + try { + planUtils.resolveOutputColumns(catalogTable.catalogTableName, expectedSchema.toAttributes, query, byName = true, conf) + } catch { + // NOTE: In case matching by name didn't match the query output, we will attempt positional matching + case ae: AnalysisException if ae.getMessage().startsWith("Cannot write incompatible data to table") => + planUtils.resolveOutputColumns(catalogTable.catalogTableName, expectedSchema.toAttributes, query, byName = false, conf) + } + } + + private def validate(queryOutputSchema: StructType, partitionsSpec: Map[String, Option[String]], catalogTable: HoodieCatalogTable): Unit = { + // Validate that partition-spec has proper format (it could be empty if all of the partition values are dynamic, + // ie there are no static partition-values specified) + if (partitionsSpec.nonEmpty && partitionsSpec.size != catalogTable.partitionSchema.size) { + throw new HoodieException(s"Required partition schema is: ${catalogTable.partitionSchema.fieldNames.mkString("[", ", ", "]")}, " + + s"partition spec is: ${partitionsSpec.mkString("[", ", ", "]")}") + } + + val staticPartitionValues = filterStaticPartitionValues(partitionsSpec) + val fullQueryOutputSchema = StructType(queryOutputSchema.fields ++ staticPartitionValues.keys.map(StructField(_, StringType))) + + // Assert that query provides all the required columns + if (!conforms(fullQueryOutputSchema, catalogTable.tableSchemaWithoutMetaFields)) { + throw new 
HoodieException(s"Expected table's schema: ${catalogTable.tableSchemaWithoutMetaFields.fields.mkString("[", ", ", "]")}, " + + s"query's output (including static partition values): ${fullQueryOutputSchema.fields.mkString("[", ", ", "]")}") + } + } + + private def createStaticPartitionValuesExpressions(staticPartitionValues: Map[String, String], + partitionSchema: StructType, + conf: SQLConf): Seq[NamedExpression] = { + partitionSchema.fields + .filter(pf => staticPartitionValues.contains(pf.name)) + .map(pf => { + val staticPartitionValue = staticPartitionValues(pf.name) + val castExpr = castIfNeeded(Literal.create(staticPartitionValue), pf.dataType, conf) + + Alias(castExpr, pf.name)() + }) + } + + private def conforms(sourceSchema: StructType, targetSchema: StructType): Boolean = { + if (sourceSchema.fields.length != targetSchema.fields.length) { + false + } else { + targetSchema.fields.zip(sourceSchema).forall { + case (targetColumn, sourceColumn) => + // Make sure we can cast source column to the target column type + Cast.canCast(sourceColumn.dataType, targetColumn.dataType) + } + } + } + + def stripMetaFields(query: LogicalPlan): LogicalPlan = { + val filteredOutput = query.output.filterNot(attr => isMetaField(attr.name)) + if (filteredOutput == query.output) { + query + } else { + Project(filteredOutput, query) + } + } + + private def filterStaticPartitionValues(partitionsSpec: Map[String, Option[String]]): Map[String, String] = + partitionsSpec.filter(p => p._2.isDefined).mapValues(_.get) +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala new file mode 100644 index 0000000000000..b148d3d510fb1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
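A minimal, self-contained sketch of the pairwise cast-compatibility test that the `conforms` helper above (in InsertIntoHoodieTableCommand) applies between the query output and the table schema; the schemas below are made up for illustration:

```scala
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.types._

// Query produces (id INT, price INT); the table expects (id BIGINT, price DOUBLE)
val querySchema = StructType(Seq(StructField("id", IntegerType), StructField("price", IntegerType)))
val tableSchema = StructType(Seq(StructField("id", LongType), StructField("price", DoubleType)))

val conforms = querySchema.fields.length == tableSchema.fields.length &&
  tableSchema.fields.zip(querySchema.fields).forall {
    case (target, source) => Cast.canCast(source.dataType, target.dataType)
  }
// conforms == true here: INT is castable to both BIGINT and DOUBLE
```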
+ */ + +package org.apache.spark.sql.hudi.command + +import org.apache.avro.Schema +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.hive.HiveSyncConfigHolder +import org.apache.hudi.sync.common.HoodieSyncConfig +import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, HoodieSparkSqlWriter, SparkAdapterSupport} +import org.apache.spark.sql.HoodieCatalystExpressionUtils.MatchCast +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BoundReference, Cast, EqualTo, Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ +import org.apache.spark.sql.hudi.HoodieSqlUtils.getMergeIntoTargetTableId +import org.apache.spark.sql.hudi.command.MergeIntoHoodieTableCommand.CoercedAttributeReference +import org.apache.spark.sql.hudi.command.payload.ExpressionPayload +import org.apache.spark.sql.hudi.command.payload.ExpressionPayload._ +import org.apache.spark.sql.hudi.{ProvidesHoodieConfig, SerDeUtils} +import org.apache.spark.sql.types.{BooleanType, StructType} + +import java.util.Base64 + + +/** + * The Command for hoodie MergeIntoTable. + * The match on condition must contain the row key fields currently, so that we can use Hoodie + * Index to speed up the performance. + * + * The main algorithm: + * + * We pushed down all the matched and not matched (condition, assignment) expression pairs to the + * ExpressionPayload. And the matched (condition, assignment) expression pairs will execute in the + * ExpressionPayload#combineAndGetUpdateValue to compute the result record, while the not matched + * expression pairs will execute in the ExpressionPayload#getInsertValue. + * + * For Mor table, it is a litter complex than this. The matched record also goes through the getInsertValue + * and write append to the log. So the update actions & insert actions should process by the same + * way. We pushed all the update actions & insert actions together to the + * ExpressionPayload#getInsertValue. + * + */ +case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends HoodieLeafRunnableCommand + with SparkAdapterSupport with ProvidesHoodieConfig { + + private var sparkSession: SparkSession = _ + + /** + * The target table identify. + */ + private lazy val targetTableIdentify: TableIdentifier = getMergeIntoTargetTableId(mergeInto) + + /** + * The target table schema without hoodie meta fields. + */ + private var sourceDFOutput = mergeInto.sourceTable.output.filter(attr => !isMetaField(attr.name)) + + /** + * The target table schema without hoodie meta fields. + */ + private lazy val targetTableSchemaWithoutMetaFields = + removeMetaFields(mergeInto.targetTable.schema).fields + + private lazy val hoodieCatalogTable = HoodieCatalogTable(sparkSession, targetTableIdentify) + + private lazy val targetTableType = hoodieCatalogTable.tableTypeName + + /** + * + * Return a map of target key to the source expression from the Merge-On Condition. + * e.g. 
merge on t.id = s.s_id AND t.name = s.s_name, we return + * Map("id" -> "s_id", "name" ->"s_name") + * TODO Currently Non-equivalent conditions are not supported. + */ + private lazy val targetKey2SourceExpression: Map[String, Expression] = { + val resolver = sparkSession.sessionState.conf.resolver + val conditions = splitByAnd(mergeInto.mergeCondition) + val allEqs = conditions.forall(p => p.isInstanceOf[EqualTo]) + if (!allEqs) { + throw new IllegalArgumentException("Non-Equal condition is not support for Merge " + + s"Into Statement: ${mergeInto.mergeCondition.sql}") + } + val targetAttrs = mergeInto.targetTable.output + + val cleanedConditions = conditions.map(_.asInstanceOf[EqualTo]).map { + // Here we're unraveling superfluous casting of expressions on both sides of the matched-on condition, + // in case both of them are casted to the same type (which might be result of either explicit casting + // from the user, or auto-casting performed by Spark for type coercion), which has potential + // potential of rendering the whole operation as invalid (check out HUDI-4861 for more details) + case EqualTo(MatchCast(leftExpr, leftCastTargetType, _, _), MatchCast(rightExpr, rightCastTargetType, _, _)) + if leftCastTargetType.sameType(rightCastTargetType) => EqualTo(leftExpr, rightExpr) + + case c => c + } + + val exprUtils = sparkAdapter.getCatalystExpressionUtils + // Expressions of the following forms are supported: + // `target.id = ` (or ` = target.id`) + // `cast(target.id, ...) = ` (or ` = cast(target.id, ...)`) + // + // In the latter case, there are further restrictions: since cast will be dropped on the + // target table side (since we're gonna be matching against primary-key column as is) expression + // on the opposite side of the comparison should be cast-able to the primary-key column's data-type + // t/h "up-cast" (ie w/o any loss in precision) + val target2Source = cleanedConditions.map { + case EqualTo(CoercedAttributeReference(attr), expr) + if targetAttrs.exists(f => attributeEqual(f, attr, resolver)) => + if (exprUtils.canUpCast(expr.dataType, attr.dataType)) { + targetAttrs.find(f => resolver(f.name, attr.name)).get.name -> + castIfNeeded(expr, attr.dataType, sparkSession.sqlContext.conf) + } else { + throw new AnalysisException(s"Invalid MERGE INTO matching condition: ${expr.sql}: " + + s"can't cast ${expr.sql} (of ${expr.dataType}) to ${attr.dataType}") + } + + case EqualTo(expr, CoercedAttributeReference(attr)) + if targetAttrs.exists(f => attributeEqual(f, attr, resolver)) => + if (exprUtils.canUpCast(expr.dataType, attr.dataType)) { + targetAttrs.find(f => resolver(f.name, attr.name)).get.name -> + castIfNeeded(expr, attr.dataType, sparkSession.sqlContext.conf) + } else { + throw new AnalysisException(s"Invalid MERGE INTO matching condition: ${expr.sql}: " + + s"can't cast ${expr.sql} (of ${expr.dataType}) to ${attr.dataType}") + } + + case expr => + throw new AnalysisException(s"Invalid MERGE INTO matching condition: `${expr.sql}`: " + + "expected condition should be 'target.id = ', e.g. " + + "`t.id = s.id` or `t.id = cast(s.id, ...)`") + }.toMap + + target2Source + } + + /** + * Get the mapping of target preCombineField to the source expression. 
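A minimal Catalyst-level sketch (not part of the patch) of the cast-unwrapping applied to the ON clause above; the attribute names and types are made up, and the patch itself matches through its own MatchCast extractor rather than on Cast directly:

```scala
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.types.LongType

// t.id = s.s_id, with both sides coerced to BIGINT by Spark's type coercion
val tId = 'id.int     // stand-in for the target's record-key column
val sId = 's_id.int   // stand-in for the source column
val cond = EqualTo(Cast(tId, LongType), Cast(sId, LongType))

// Drop the superfluous casts when both sides target the same type
val cleaned = cond match {
  case EqualTo(l: Cast, r: Cast) if l.dataType.sameType(r.dataType) => EqualTo(l.child, r.child)
  case other => other
}
// cleaned is now EqualTo(tId, sId), a plain record-key equality
```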
+ */ + private lazy val target2SourcePreCombineFiled: Option[(String, Expression)] = { + val updateActions = mergeInto.matchedActions.collect { case u: UpdateAction => u } + assert(updateActions.size <= 1, s"Only support one updateAction currently, current update action count is: ${updateActions.size}") + + val updateAction = updateActions.headOption + hoodieCatalogTable.preCombineKey.map(preCombineField => { + val sourcePreCombineField = + updateAction.map(u => u.assignments.filter { + case Assignment(key: AttributeReference, _) => key.name.equalsIgnoreCase(preCombineField) + case _=> false + }.head.value + ).getOrElse { + // If there is no update action, mapping the target column to the source by order. + val target2Source = mergeInto.targetTable.output + .filter(attr => !isMetaField(attr.name)) + .map(_.name) + .zip(mergeInto.sourceTable.output.filter(attr => !isMetaField(attr.name))) + .toMap + target2Source.getOrElse(preCombineField, null) + } + (preCombineField, sourcePreCombineField) + }).filter(p => p._2 != null) + } + + override def run(sparkSession: SparkSession): Seq[Row] = { + this.sparkSession = sparkSession + + // Create the write parameters + val parameters = buildMergeIntoConfig(hoodieCatalogTable) + executeUpsert(sourceDF, parameters) + + sparkSession.catalog.refreshTable(targetTableIdentify.unquotedString) + Seq.empty[Row] + } + + /** + * Build the sourceDF. We will append the source primary key expressions and + * preCombine field expression to the sourceDF. + * e.g. + *

    + * merge into h0 + * using (select 1 as id, 'a1' as name, 1000 as ts) s0 + * on h0.id = s0.id + 1 + * when matched then update set id = s0.id, name = s0.name, ts = s0.ts + 1 + *

    + * "ts" is the pre-combine field of h0. + * + * The targetKey2SourceExpression is: ("id", "s0.id + 1"). + * The target2SourcePreCombineFiled is:("ts", "s0.ts + 1"). + * We will append the "s0.id + 1 as id" and "s0.ts + 1 as ts" to the sourceDF to compute the + * row key and pre-combine field. + * + */ + private lazy val sourceDF: DataFrame = { + var sourceDF = Dataset.ofRows(sparkSession, mergeInto.sourceTable) + targetKey2SourceExpression.foreach { + case (targetColumn, sourceExpression) + if !containsPrimaryKeyFieldReference(targetColumn, sourceExpression) => + sourceDF = sourceDF.withColumn(targetColumn, new Column(sourceExpression)) + sourceDFOutput = sourceDFOutput :+ AttributeReference(targetColumn, sourceExpression.dataType)() + case _=> + } + target2SourcePreCombineFiled.foreach { + case (targetPreCombineField, sourceExpression) + if !containsPreCombineFieldReference(targetPreCombineField, sourceExpression) => + sourceDF = sourceDF.withColumn(targetPreCombineField, new Column(sourceExpression)) + sourceDFOutput = sourceDFOutput :+ AttributeReference(targetPreCombineField, sourceExpression.dataType)() + case _=> + } + sourceDF + } + + /** + * Check whether the source expression has the same column name with target column. + * + * Merge condition cases that return true: + * 1) merge into .. on h0.id = s0.id .. + * 2) merge into .. on h0.id = cast(s0.id as int) .. + * "id" is primaryKey field of h0. + */ + private def containsPrimaryKeyFieldReference(targetColumnName: String, sourceExpression: Expression): Boolean = { + val sourceColumnNames = sourceDFOutput.map(_.name) + val resolver = sparkSession.sessionState.conf.resolver + + sourceExpression match { + case attr: AttributeReference if sourceColumnNames.find(resolver(_, attr.name)).get.equals(targetColumnName) => true + // SPARK-35857: the definition of Cast has been changed in Spark3.2. + // Match the class type instead of call the `unapply` method. + case cast: Cast => + cast.child match { + case attr: AttributeReference if sourceColumnNames.find(resolver(_, attr.name)).get.equals(targetColumnName) => true + case _ => false + } + case _=> false + } + } + + /** + * Check whether the source expression on preCombine field contains the same column name with target column. + * + * Merge expression cases that return true: + * 1) merge into .. on .. update set ts = s0.ts + * 2) merge into .. on .. update set ts = cast(s0.ts as int) + * 3) merge into .. on .. update set ts = s0.ts+1 (expressions like this whose sub node has the same column name with target) + * "ts" is preCombine field of h0. + */ + private def containsPreCombineFieldReference(targetColumnName: String, sourceExpression: Expression): Boolean = { + val sourceColumnNames = sourceDFOutput.map(_.name) + val resolver = sparkSession.sessionState.conf.resolver + + // sub node of the expression may have same column name with target column name + sourceExpression.find { + case attr: AttributeReference => sourceColumnNames.find(resolver(_, attr.name)).get.equals(targetColumnName) + case _ => false + }.isDefined + } + + /** + * Compare a [[Attribute]] to another, return true if they have the same column name(by resolver) and exprId + */ + private def attributeEqual( + attr: Attribute, other: Attribute, resolver: Resolver): Boolean = { + resolver(attr.name, other.name) && attr.exprId == other.exprId + } + + /** + * Execute the update and delete action. All the matched and not-matched actions will + * execute in one upsert write operation. 
We pushed down the matched condition and assignment + * expressions to the ExpressionPayload#combineAndGetUpdateValue and the not matched + * expressions to the ExpressionPayload#getInsertValue. + */ + private def executeUpsert(sourceDF: DataFrame, parameters: Map[String, String]): Unit = { + val operation = if (StringUtils.isNullOrEmpty(parameters.getOrElse(PRECOMBINE_FIELD.key, ""))) { + INSERT_OPERATION_OPT_VAL + } else { + UPSERT_OPERATION_OPT_VAL + } + + // Append the table schema to the parameters. In the case of merge into, the schema of sourceDF + // may be different from the target table, because the are transform logical in the update or + // insert actions. + var writeParams = parameters + + (OPERATION.key -> operation) + + (HoodieWriteConfig.WRITE_SCHEMA.key -> getTableSchema.toString) + + (DataSourceWriteOptions.TABLE_TYPE.key -> targetTableType) + + val updateActions = mergeInto.matchedActions.filter(_.isInstanceOf[UpdateAction]) + .map(_.asInstanceOf[UpdateAction]) + // Check for the update actions + checkUpdateAssignments(updateActions) + + val deleteActions = mergeInto.matchedActions.filter(_.isInstanceOf[DeleteAction]) + .map(_.asInstanceOf[DeleteAction]) + assert(deleteActions.size <= 1, "Should be only one delete action in the merge into statement.") + val deleteAction = deleteActions.headOption + + // Map of Condition -> Assignments + val updateConditionToAssignments = + updateActions.map(update => { + val rewriteCondition = update.condition.map(replaceAttributeInExpression) + .getOrElse(Literal.create(true, BooleanType)) + val formatAssignments = rewriteAndReOrderAssignments(update.assignments) + rewriteCondition -> formatAssignments + }).toMap + // Serialize the Map[UpdateCondition, UpdateAssignments] to base64 string + val serializedUpdateConditionAndExpressions = Base64.getEncoder + .encodeToString(SerDeUtils.toBytes(updateConditionToAssignments)) + writeParams += (PAYLOAD_UPDATE_CONDITION_AND_ASSIGNMENTS -> + serializedUpdateConditionAndExpressions) + + if (deleteAction.isDefined) { + val deleteCondition = deleteAction.get.condition + .map(replaceAttributeInExpression) + .getOrElse(Literal.create(true, BooleanType)) + // Serialize the Map[DeleteCondition, empty] to base64 string + val serializedDeleteCondition = Base64.getEncoder + .encodeToString(SerDeUtils.toBytes(Map(deleteCondition -> Seq.empty[Assignment]))) + writeParams += (PAYLOAD_DELETE_CONDITION -> serializedDeleteCondition) + } + + val insertActions = + mergeInto.notMatchedActions.map(_.asInstanceOf[InsertAction]) + + // Check for the insert actions + checkInsertAssignments(insertActions) + + // Serialize the Map[InsertCondition, InsertAssignments] to base64 string + writeParams += (PAYLOAD_INSERT_CONDITION_AND_ASSIGNMENTS -> + serializedInsertConditionAndExpressions(insertActions)) + + // Remove the meta fields from the sourceDF as we do not need these when writing. 
+ val sourceDFWithoutMetaFields = removeMetaFields(sourceDF) + val (success, _, _, _, _, _) = HoodieSparkSqlWriter.write(sparkSession.sqlContext, SaveMode.Append, writeParams, sourceDFWithoutMetaFields) + if (!success) { + throw new HoodieException("Merge into Hoodie table command failed") + } + } + + private def checkUpdateAssignments(updateActions: Seq[UpdateAction]): Unit = { + updateActions.foreach(update => + assert(update.assignments.length == targetTableSchemaWithoutMetaFields.length, + s"The number of update assignments[${update.assignments.length}] must equal to the " + + s"targetTable field size[${targetTableSchemaWithoutMetaFields.length}]")) + // For MOR table, the target table field cannot be the right-value in the update action. + if (targetTableType == MOR_TABLE_TYPE_OPT_VAL) { + updateActions.foreach(update => { + val targetAttrs = update.assignments.flatMap(a => a.value.collect { + case attr: AttributeReference if mergeInto.targetTable.outputSet.contains(attr) => attr + }) + assert(targetAttrs.isEmpty, + s"Target table's field(${targetAttrs.map(_.name).mkString(",")}) cannot be the right-value of the update clause for MOR table.") + }) + } + } + + private def checkInsertAssignments(insertActions: Seq[InsertAction]): Unit = { + insertActions.foreach(insert => + assert(insert.assignments.length == targetTableSchemaWithoutMetaFields.length, + s"The number of insert assignments[${insert.assignments.length}] must equal to the " + + s"targetTable field size[${targetTableSchemaWithoutMetaFields.length}]")) + + } + + private def getTableSchema: Schema = { + val (structName, nameSpace) = AvroConversionUtils + .getAvroRecordNameAndNamespace(targetTableIdentify.identifier) + AvroConversionUtils.convertStructTypeToAvroSchema( + new StructType(targetTableSchemaWithoutMetaFields), structName, nameSpace) + } + + /** + * Serialize the Map[InsertCondition, InsertAssignments] to base64 string. + * @param insertActions + * @return + */ + private def serializedInsertConditionAndExpressions(insertActions: Seq[InsertAction]): String = { + val insertConditionAndAssignments = + insertActions.map(insert => { + val rewriteCondition = insert.condition.map(replaceAttributeInExpression) + .getOrElse(Literal.create(true, BooleanType)) + val formatAssignments = rewriteAndReOrderAssignments(insert.assignments) + // Do the check for the insert assignments + checkInsertExpression(formatAssignments) + + rewriteCondition -> formatAssignments + }).toMap + Base64.getEncoder.encodeToString( + SerDeUtils.toBytes(insertConditionAndAssignments)) + } + + /** + * Rewrite and ReOrder the assignments. + * The Rewrite is to replace the AttributeReference to BoundReference. + * The ReOrder is to make the assignments's order same with the target table. 
+ * @param assignments + * @return + */ + private def rewriteAndReOrderAssignments(assignments: Seq[Expression]): Seq[Expression] = { + val attr2Assignment = assignments.map { + case Assignment(attr: AttributeReference, value) => { + val rewriteValue = replaceAttributeInExpression(value) + attr -> Alias(rewriteValue, attr.name)() + } + case assignment => throw new IllegalArgumentException(s"Illegal Assignment: ${assignment.sql}") + }.toMap[Attribute, Expression] + // reorder the assignments by the target table field + mergeInto.targetTable.output + .filterNot(attr => isMetaField(attr.name)) + .map(attr => { + val assignment = attr2Assignment.find(f => attributeEqual(f._1, attr, sparkSession.sessionState.conf.resolver)) + .getOrElse(throw new IllegalArgumentException(s"Cannot find related assignment for field: ${attr.name}")) + castIfNeeded(assignment._2, attr.dataType, sparkSession.sqlContext.conf) + }) + } + + /** + * Replace the AttributeReference to BoundReference. This is for the convenience of CodeGen + * in ExpressionCodeGen which use the field index to generate the code. So we must replace + * the AttributeReference to BoundReference here. + * @param exp + * @return + */ + private def replaceAttributeInExpression(exp: Expression): Expression = { + val sourceJoinTargetFields = sourceDFOutput ++ + mergeInto.targetTable.output.filterNot(attr => isMetaField(attr.name)) + + exp transform { + case attr: AttributeReference => + val index = sourceJoinTargetFields.indexWhere(p => p.semanticEquals(attr)) + if (index == -1) { + throw new IllegalArgumentException(s"cannot find ${attr.qualifiedName} in source or " + + s"target at the merge into statement") + } + BoundReference(index, attr.dataType, attr.nullable) + case other => other + } + } + + /** + * Check the insert action expression. + * The insert expression should not contain target table field. + */ + private def checkInsertExpression(expressions: Seq[Expression]): Unit = { + expressions.foreach(exp => { + val references = exp.collect { + case reference: BoundReference => reference + } + references.foreach(ref => { + if (ref.ordinal >= sourceDFOutput.size) { + val targetColumn = targetTableSchemaWithoutMetaFields(ref.ordinal - sourceDFOutput.size) + throw new IllegalArgumentException(s"Insert clause cannot contain target table's field: ${targetColumn.name}" + + s" in ${exp.sql}") + } + }) + }) + } + + /** + * Create the config for hoodie writer. 
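For orientation, a hedged end-to-end example of the MERGE INTO statement this command handles; the table h0, its columns (with id as the record key and ts as the preCombine field), and an active SparkSession `spark` are assumptions of this sketch:

```scala
spark.sql(
  """MERGE INTO h0
    |USING (SELECT 1 AS id, 'a1' AS name, 1000 AS ts) s0
    |ON h0.id = s0.id
    |WHEN MATCHED THEN UPDATE SET id = s0.id, name = s0.name, ts = s0.ts
    |WHEN NOT MATCHED THEN INSERT *
    |""".stripMargin)
```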
+ */ + private def buildMergeIntoConfig(hoodieCatalogTable: HoodieCatalogTable): Map[String, String] = { + + val targetTableDb = targetTableIdentify.database.getOrElse("default") + val targetTableName = targetTableIdentify.identifier + val path = hoodieCatalogTable.tableLocation + // force to use ExpressionPayload as WRITE_PAYLOAD_CLASS_NAME in MergeIntoHoodieTableCommand + val catalogProperties = hoodieCatalogTable.catalogProperties + (PAYLOAD_CLASS_NAME.key -> classOf[ExpressionPayload].getCanonicalName) + val tableConfig = hoodieCatalogTable.tableConfig + val tableSchema = hoodieCatalogTable.tableSchema + val partitionColumns = tableConfig.getPartitionFieldProp.split(",").map(_.toLowerCase) + val partitionSchema = StructType(tableSchema.filter(f => partitionColumns.contains(f.name))) + + // NOTE: Here we fallback to "" to make sure that null value is not overridden with + // default value ("ts") + // TODO(HUDI-3456) clean up + val preCombineField = hoodieCatalogTable.preCombineKey.getOrElse("") + + val hoodieProps = getHoodieProps(catalogProperties, tableConfig, sparkSession.sqlContext.conf) + val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable) + + withSparkConf(sparkSession, catalogProperties) { + Map( + "path" -> path, + RECORDKEY_FIELD.key -> tableConfig.getRecordKeyFieldProp, + PRECOMBINE_FIELD.key -> preCombineField, + TBL_NAME.key -> hoodieCatalogTable.tableName, + PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, + HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning, + KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, + SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, + HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key), + HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_MODE), + HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> targetTableDb, + HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> targetTableName, + HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString, + HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, + HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS), + HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "200"), // set the default parallelism to 200 for sql + HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "200"), + HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> hoodieProps.getString(HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key, "200"), + SqlKeyGenerator.PARTITION_SCHEMA -> partitionSchema.toDDL + ) + .filter { case (_, v) => v != null } + } + } +} + +object MergeIntoHoodieTableCommand { + + object CoercedAttributeReference { + def unapply(expr: Expression): Option[AttributeReference] = { + expr match { + case attr: AttributeReference => Some(attr) + case MatchCast(attr: AttributeReference, _, _, _) => Some(attr) + + case _ => None + } + } + } + +} diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala new file mode 100644 index 0000000000000..277f2643423dd --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.common.model.HoodieRecord +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} +import org.apache.spark.sql.catalyst.plans.logical.{Assignment, UpdateTable} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ +import org.apache.spark.sql.hudi.ProvidesHoodieConfig +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructField + +import scala.collection.JavaConverters._ + +case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends HoodieLeafRunnableCommand + with SparkAdapterSupport with ProvidesHoodieConfig { + + private val table = updateTable.table + private val tableId = getTableIdentifier(table) + + override def run(sparkSession: SparkSession): Seq[Row] = { + logInfo(s"start execute update command for $tableId") + val sqlConf = sparkSession.sessionState.conf + val name2UpdateValue = updateTable.assignments.map { + case Assignment(attr: AttributeReference, value) => + attr.name -> value + }.toMap + + val updateExpressions = table.output + .map(attr => { + val UpdateValueOption = name2UpdateValue.find(f => sparkSession.sessionState.conf.resolver(f._1, attr.name)) + if(UpdateValueOption.isEmpty) attr else UpdateValueOption.get._2 + }) + .filter { // filter the meta columns + case attr: AttributeReference => + !HoodieRecord.HOODIE_META_COLUMNS.asScala.toSet.contains(attr.name) + case _=> true + } + + val projects = updateExpressions.zip(removeMetaFields(table.schema).fields).map { + case (attr: AttributeReference, field) => + Column(cast(attr, field, sqlConf)) + case (exp, field) => + Column(Alias(cast(exp, field, sqlConf), field.name)()) + } + + var df = Dataset.ofRows(sparkSession, table) + if (updateTable.condition.isDefined) { + df = df.filter(Column(updateTable.condition.get)) + } + df = df.select(projects: _*) + val config = buildHoodieConfig(HoodieCatalogTable(sparkSession, tableId)) + df.write + .format("hudi") + .mode(SaveMode.Append) + .options(config) + .save() + sparkSession.catalog.refreshTable(tableId.unquotedString) + logInfo(s"Finish execute update command for 
$tableId") + Seq.empty[Row] + } + + def cast(exp:Expression, field: StructField, sqlConf: SQLConf): Expression = { + castIfNeeded(exp, field.dataType, sqlConf) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UuidKeyGenerator.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UuidKeyGenerator.scala new file mode 100644 index 0000000000000..14a0074fef859 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UuidKeyGenerator.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import java.util.UUID +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.common.config.TypedProperties +import org.apache.spark.sql.Row + +/** + * A KeyGenerator which use the uuid as the record key. + */ +class UuidKeyGenerator(props: TypedProperties) extends SqlKeyGenerator(props) { + + override def getRecordKey(record: GenericRecord): String = UUID.randomUUID.toString + + override def getRecordKey(row: Row): String = UUID.randomUUID.toString +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala new file mode 100644 index 0000000000000..cd5b201f91eb9 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
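A hedged usage example of the UPDATE statement handled by UpdateHoodieTableCommand above; the table h0 and its columns are hypothetical, and an active SparkSession `spark` is assumed:

```scala
// Assumes a Hudi table h0(id INT, name STRING, price DOUBLE, ts LONG).
spark.sql("UPDATE h0 SET price = price * 2 WHERE id = 1")
```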
+ */ + +package org.apache.spark.sql.hudi.command.payload + +import org.apache.avro.generic.{GenericRecord, IndexedRecord} +import org.apache.hudi.sql.IExpressionEvaluator +import org.apache.spark.executor.InputMetrics +import org.apache.spark.internal.Logging +import org.apache.spark.sql.avro.AvroSerializer +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast, Expression, GenericInternalRow, LeafExpression, UnsafeArrayData, UnsafeMapData, UnsafeRow} +import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} +import org.apache.spark.sql.hudi.command.payload.ExpressionCodeGen.RECORD_NAME +import org.apache.spark.sql.types.{DataType, Decimal} +import org.apache.spark.unsafe.Platform +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +import org.apache.spark.util.ParentClassLoader +import org.apache.spark.{TaskContext, TaskKilledException} +import org.codehaus.commons.compiler.CompileException +import org.codehaus.janino.{ClassBodyEvaluator, InternalCompilerException} + +import java.util.UUID + +/** + * Do CodeGen for expression based on IndexedRecord. + * The mainly difference with the spark's CodeGen for expression is that + * the expression's input is a IndexedRecord but not a Row. + * + */ +object ExpressionCodeGen extends Logging { + + val RECORD_NAME = "record" + + /** + * CodeGen for expressions. + * @param exprs The expression list to CodeGen. + * @return An IExpressionEvaluator generate by CodeGen which take a IndexedRecord as input + * param and return a Array of results for each expression. + */ + def doCodeGen(exprs: Seq[Expression], serializer: AvroSerializer): IExpressionEvaluator = { + val ctx = new CodegenContext() + // Set the input_row to null as we do not use row as the input object but Record. 
+ ctx.INPUT_ROW = null + + val replacedExprs = exprs.map(replaceBoundReference) + val resultVars = replacedExprs.map(_.genCode(ctx)) + val className = s"ExpressionPayloadEvaluator_${UUID.randomUUID().toString.replace("-", "_")}" + val codeBody = + s""" + |private Object[] references; + |private String code; + |private AvroSerializer serializer; + | + |public $className(Object references, String code, AvroSerializer serializer) { + | this.references = (Object[])references; + | this.code = code; + | this.serializer = serializer; + |} + | + |public GenericRecord eval(IndexedRecord $RECORD_NAME) { + | ${resultVars.map(_.code).mkString("\n")} + | Object[] results = new Object[${resultVars.length}]; + | ${ + (for (i <- resultVars.indices) yield { + s""" + |if (${resultVars(i).isNull}) { + | results[$i] = null; + |} else { + | results[$i] = ${resultVars(i).value.code}; + |} + """.stripMargin + }).mkString("\n") + } + InternalRow row = new GenericInternalRow(results); + return (GenericRecord) serializer.serialize(row); + | } + | + |public String getCode() { + | return code; + |} + """.stripMargin + + val evaluator = new ClassBodyEvaluator() + val parentClassLoader = new ParentClassLoader( + Option(Thread.currentThread().getContextClassLoader).getOrElse(getClass.getClassLoader)) + + evaluator.setParentClassLoader(parentClassLoader) + // Cannot be under package codegen, or fail with java.lang.InstantiationException + evaluator.setClassName(s"org.apache.hudi.sql.payload.$className") + evaluator.setDefaultImports( + classOf[Platform].getName, + classOf[InternalRow].getName, + classOf[UnsafeRow].getName, + classOf[UTF8String].getName, + classOf[Decimal].getName, + classOf[CalendarInterval].getName, + classOf[ArrayData].getName, + classOf[UnsafeArrayData].getName, + classOf[MapData].getName, + classOf[UnsafeMapData].getName, + classOf[Expression].getName, + classOf[TaskContext].getName, + classOf[TaskKilledException].getName, + classOf[InputMetrics].getName, + classOf[IndexedRecord].getName, + classOf[AvroSerializer].getName, + classOf[GenericRecord].getName, + classOf[GenericInternalRow].getName, + classOf[Cast].getName + ) + evaluator.setImplementedInterfaces(Array(classOf[IExpressionEvaluator])) + try { + evaluator.cook(codeBody) + } catch { + case e: InternalCompilerException => + val msg = s"failed to compile: $e" + logError(msg, e) + throw new InternalCompilerException(msg, e) + case e: CompileException => + val msg = s"failed to compile: $e" + logError(msg, e) + throw new CompileException(msg, e.getLocation) + } + val referenceArray = ctx.references.toArray.map(_.asInstanceOf[Object]) + val expressionSql = exprs.map(_.sql).mkString(" ") + + evaluator.getClazz.getConstructor(classOf[Object], classOf[String], classOf[AvroSerializer]) + .newInstance(referenceArray, s"Expressions is: [$expressionSql]\nCodeBody is: {\n$codeBody\n}", serializer) + .asInstanceOf[IExpressionEvaluator] + } + + /** + * Replace the BoundReference to the Record implement which will override the + * doGenCode method. + */ + private def replaceBoundReference(expression: Expression): Expression = { + expression transformDown { + case BoundReference(ordinal, dataType, nullable) => + RecordBoundReference(ordinal, dataType, nullable) + case other => + other + } + } +} + +case class RecordBoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) + extends LeafExpression { + + /** + * Do the CodeGen for RecordBoundReference. 
+ * Use "IndexedRecord" as the input object but not a "Row" + */ + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val javaType = JavaCode.javaType(dataType) + val boxType = JavaCode.boxedType(dataType) + + val value = s"($boxType)$RECORD_NAME.get($ordinal)" + if (nullable) { + ev.copy(code = + code""" + | boolean ${ev.isNull} = $RECORD_NAME.get($ordinal) == null; + | $javaType ${ev.value} = ${ev.isNull} ? + | ${CodeGenerator.defaultValue(dataType)} : ($value); + """ + ) + } else { + ev.copy(code = code"$javaType ${ev.value} = $value;", isNull = FalseLiteral) + } + } + + override def eval(input: InternalRow): Any = { + throw new IllegalArgumentException(s"Should not call eval method for " + + s"${getClass.getCanonicalName}") + } +} + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala new file mode 100644 index 0000000000000..55932237d95f5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala @@ -0,0 +1,343 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.payload + +import com.github.benmanes.caffeine.cache.Caffeine +import org.apache.avro.Schema +import org.apache.avro.generic.{GenericData, GenericRecord, IndexedRecord} +import org.apache.hudi.AvroConversionUtils +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro +import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodiePayloadProps, HoodieRecord} +import org.apache.hudi.common.util.{ValidationUtils, Option => HOption} +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.io.HoodieWriteHandle +import org.apache.hudi.sql.IExpressionEvaluator +import org.apache.spark.sql.avro.{AvroSerializer, SchemaConverters} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.hudi.SerDeUtils +import org.apache.spark.sql.hudi.command.payload.ExpressionPayload.{getEvaluator, getMergedSchema, setWriteSchema} +import org.apache.spark.sql.types.{StructField, StructType} + +import java.util.{Base64, Properties} +import java.util.function.Function +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +/** + * A HoodieRecordPayload for MergeIntoHoodieTableCommand. + * It will execute the condition and assignments expression in the + * match and not-match actions and compute the final record to write. 
+ * + * If there is no condition match the record, ExpressionPayload will return + * a HoodieWriteHandle.IGNORE_RECORD, and the write handles will ignore this record. + */ +class ExpressionPayload(record: GenericRecord, + orderingVal: Comparable[_]) + extends DefaultHoodieRecordPayload(record, orderingVal) { + + def this(recordOpt: HOption[GenericRecord]) { + this(recordOpt.orElse(null), 0) + } + + /** + * The schema of this table. + */ + private var writeSchema: Schema = _ + + override def combineAndGetUpdateValue(currentValue: IndexedRecord, + schema: Schema): HOption[IndexedRecord] = { + throw new IllegalStateException(s"Should not call this method for ${getClass.getCanonicalName}") + } + + override def getInsertValue(schema: Schema): HOption[IndexedRecord] = { + throw new IllegalStateException(s"Should not call this method for ${getClass.getCanonicalName}") + } + + override def combineAndGetUpdateValue(targetRecord: IndexedRecord, + schema: Schema, properties: Properties): HOption[IndexedRecord] = { + val sourceRecord = bytesToAvro(recordBytes, schema) + val joinSqlRecord = new SqlTypedRecord(joinRecord(sourceRecord, targetRecord)) + processMatchedRecord(joinSqlRecord, Some(targetRecord), properties) + } + + /** + * Process the matched record. Firstly test if the record matched any of the update-conditions, + * if matched, return the update assignments result. Secondly, test if the record matched + * delete-condition, if matched then return a delete record. Finally if no condition matched, + * return a {@link HoodieWriteHandle.IGNORE_RECORD} which will be ignored by HoodieWriteHandle. + * @param inputRecord The input record to process. + * @param targetRecord The origin exist record. + * @param properties The properties. + * @return The result of the record to update or delete. + */ + private def processMatchedRecord(inputRecord: SqlTypedRecord, + targetRecord: Option[IndexedRecord], properties: Properties): HOption[IndexedRecord] = { + // Process update + val updateConditionAndAssignmentsText = + properties.get(ExpressionPayload.PAYLOAD_UPDATE_CONDITION_AND_ASSIGNMENTS) + assert(updateConditionAndAssignmentsText != null, + s"${ExpressionPayload.PAYLOAD_UPDATE_CONDITION_AND_ASSIGNMENTS} have not set") + + var resultRecordOpt: HOption[IndexedRecord] = null + + // Get the Evaluator for each condition and update assignments. + initWriteSchemaIfNeed(properties) + val updateConditionAndAssignments = getEvaluator(updateConditionAndAssignmentsText.toString, writeSchema) + for ((conditionEvaluator, assignmentEvaluator) <- updateConditionAndAssignments + if resultRecordOpt == null) { + val conditionVal = evaluate(conditionEvaluator, inputRecord).get(0).asInstanceOf[Boolean] + // If the update condition matched then execute assignment expression + // to compute final record to update. We will return the first matched record. + if (conditionVal) { + val resultRecord = evaluate(assignmentEvaluator, inputRecord) + + if (targetRecord.isEmpty || needUpdatingPersistedRecord(targetRecord.get, resultRecord, properties)) { + resultRecordOpt = HOption.of(resultRecord) + } else { + // if the PreCombine field value of targetRecord is greater + // than the new incoming record, just keep the old record value. 
+ resultRecordOpt = HOption.of(targetRecord.get) + } + } + } + if (resultRecordOpt == null) { + // Process delete + val deleteConditionText = properties.get(ExpressionPayload.PAYLOAD_DELETE_CONDITION) + if (deleteConditionText != null) { + val deleteCondition = getEvaluator(deleteConditionText.toString, writeSchema).head._1 + val deleteConditionVal = evaluate(deleteCondition, inputRecord).get(0).asInstanceOf[Boolean] + if (deleteConditionVal) { + resultRecordOpt = HOption.empty() + } + } + } + if (resultRecordOpt == null) { + // If there is no condition matched, just filter this record. + // here we return a IGNORE_RECORD, HoodieMergeHandle will not handle it. + HOption.of(HoodieWriteHandle.IGNORE_RECORD) + } else { + resultRecordOpt + } + } + + /** + * Process the not-matched record. Test if the record matched any of insert-conditions, + * if matched then return the result of insert-assignment. Or else return a + * {@link HoodieWriteHandle.IGNORE_RECORD} which will be ignored by HoodieWriteHandle. + * + * @param inputRecord The input record to process. + * @param properties The properties. + * @return The result of the record to insert. + */ + private def processNotMatchedRecord(inputRecord: SqlTypedRecord, properties: Properties): HOption[IndexedRecord] = { + val insertConditionAndAssignmentsText = + properties.get(ExpressionPayload.PAYLOAD_INSERT_CONDITION_AND_ASSIGNMENTS) + // Get the evaluator for each condition and insert assignment. + initWriteSchemaIfNeed(properties) + val insertConditionAndAssignments = + ExpressionPayload.getEvaluator(insertConditionAndAssignmentsText.toString, writeSchema) + var resultRecordOpt: HOption[IndexedRecord] = null + for ((conditionEvaluator, assignmentEvaluator) <- insertConditionAndAssignments + if resultRecordOpt == null) { + val conditionVal = evaluate(conditionEvaluator, inputRecord).get(0).asInstanceOf[Boolean] + // If matched the insert condition then execute the assignment expressions to compute the + // result record. We will return the first matched record. + if (conditionVal) { + val resultRecord = evaluate(assignmentEvaluator, inputRecord) + resultRecordOpt = HOption.of(resultRecord) + } + } + if (resultRecordOpt != null) { + resultRecordOpt + } else { + // If there is no condition matched, just filter this record. + // Here we return a IGNORE_RECORD, HoodieCreateHandle will not handle it. + HOption.of(HoodieWriteHandle.IGNORE_RECORD) + } + } + + override def getInsertValue(schema: Schema, properties: Properties): HOption[IndexedRecord] = { + val incomingRecord = bytesToAvro(recordBytes, schema) + if (isDeleteRecord(incomingRecord)) { + HOption.empty[IndexedRecord]() + } else { + val sqlTypedRecord = new SqlTypedRecord(incomingRecord) + if (isMORTable(properties)) { + // For the MOR table, both the matched and not-matched record will step into the getInsertValue() method. + // We call the processMatchedRecord() method if current is a Update-Record to process + // the matched record. Or else we call processNotMatchedRecord() method to process the not matched record. + val isUpdateRecord = properties.getProperty(HoodiePayloadProps.PAYLOAD_IS_UPDATE_RECORD_FOR_MOR, "false").toBoolean + if (isUpdateRecord) { + processMatchedRecord(sqlTypedRecord, Option.empty, properties) + } else { + processNotMatchedRecord(sqlTypedRecord, properties) + } + } else { + // For COW table, only the not-matched record will step into the getInsertValue method, So just call + // the processNotMatchedRecord() here. 
+ processNotMatchedRecord(sqlTypedRecord, properties) + } + } + } + + private def isMORTable(properties: Properties): Boolean = { + properties.getProperty(TABLE_TYPE.key, null) == MOR_TABLE_TYPE_OPT_VAL + } + + private def convertToRecord(values: Array[AnyRef], schema: Schema): IndexedRecord = { + assert(values.length == schema.getFields.size()) + val writeRecord = new GenericData.Record(schema) + for (i <- values.indices) { + writeRecord.put(i, values(i)) + } + writeRecord + } + + /** + * Init the table schema. + */ + private def initWriteSchemaIfNeed(properties: Properties): Unit = { + if (writeSchema == null) { + writeSchema = setWriteSchema(properties) + } + } + + /** + * Join the source record with the target record. + * + * @return + */ + private def joinRecord(sourceRecord: IndexedRecord, targetRecord: IndexedRecord): IndexedRecord = { + val leftSchema = sourceRecord.getSchema + val joinSchema = getMergedSchema(leftSchema, targetRecord.getSchema) + + val values = new ArrayBuffer[AnyRef]() + for (i <- 0 until joinSchema.getFields.size()) { + val value = if (i < leftSchema.getFields.size()) { + sourceRecord.get(i) + } else { // skip meta field + targetRecord.get(i - leftSchema.getFields.size() + HoodieRecord.HOODIE_META_COLUMNS.size()) + } + values += value + } + convertToRecord(values.toArray, joinSchema) + } + + private def evaluate(evaluator: IExpressionEvaluator, sqlTypedRecord: SqlTypedRecord): GenericRecord = { + try evaluator.eval(sqlTypedRecord) catch { + case e: Throwable => + throw new RuntimeException(s"Error in execute expression: ${e.getMessage}.\n${evaluator.getCode}", e) + } + } +} + +object ExpressionPayload { + + /** + * Property for pass the merge-into delete clause condition expression. + */ + val PAYLOAD_DELETE_CONDITION = "hoodie.payload.delete.condition" + + /** + * Property for pass the merge-into update clauses's condition and assignments. + */ + val PAYLOAD_UPDATE_CONDITION_AND_ASSIGNMENTS = "hoodie.payload.update.condition.assignments" + + /** + * Property for pass the merge-into insert clauses's condition and assignments. + */ + val PAYLOAD_INSERT_CONDITION_AND_ASSIGNMENTS = "hoodie.payload.insert.condition.assignments" + + /** + * A cache for the serializedConditionAssignments to the compiled class after CodeGen. + * The Map[IExpressionEvaluator, IExpressionEvaluator] is the map of the condition expression + * to the assignments expression. + */ + private val cache = Caffeine.newBuilder() + .maximumSize(1024) + .build[String, Map[IExpressionEvaluator, IExpressionEvaluator]]() + + private val writeSchemaCache = Caffeine.newBuilder() + .maximumSize(16).build[String, Schema]() + + def setWriteSchema(properties: Properties): Schema = { + ValidationUtils.checkArgument(properties.containsKey(HoodieWriteConfig.WRITE_SCHEMA.key), + s"Missing ${HoodieWriteConfig.WRITE_SCHEMA.key}") + writeSchemaCache.get(properties.getProperty(HoodieWriteConfig.WRITE_SCHEMA.key), + new Function[String, Schema] { + override def apply(t: String): Schema = new Schema.Parser().parse(t) + }) + } + + /** + * Do the CodeGen for each condition and assignment expressions.We will cache it to reduce + * the compile time for each method call. 
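A toy sketch (not part of the patch) of the compile-once caching pattern that getEvaluator below relies on: Caffeine's get(key, mappingFunction) runs the expensive code generation only on a cache miss. The compile function here is a hypothetical stand-in:

```scala
import com.github.benmanes.caffeine.cache.Caffeine
import java.util.function.Function

// Toy stand-in for the expensive ExpressionCodeGen.doCodeGen call guarded by the cache
def compile(serialized: String): Integer = serialized.length

val cache = Caffeine.newBuilder().maximumSize(1024).build[String, Integer]()
val loader = new Function[String, Integer] {
  override def apply(key: String): Integer = compile(key)
}

val first = cache.get("<serialized expressions>", loader)  // computed and cached
val second = cache.get("<serialized expressions>", loader) // served from the cache
```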
+ */ + def getEvaluator( + serializedConditionAssignments: String, writeSchema: Schema): Map[IExpressionEvaluator, IExpressionEvaluator] = { + cache.get(serializedConditionAssignments, + new Function[String, Map[IExpressionEvaluator, IExpressionEvaluator]] { + override def apply(t: String): Map[IExpressionEvaluator, IExpressionEvaluator] = { + val serializedBytes = Base64.getDecoder.decode(t) + val conditionAssignments = SerDeUtils.toObject(serializedBytes) + .asInstanceOf[Map[Expression, Seq[Expression]]] + // Do the CodeGen for condition expression and assignment expression + conditionAssignments.map { + case (condition, assignments) => + val conditionType = StructType(Seq(StructField("_col0", condition.dataType, nullable = true))) + val conditionSerializer = new AvroSerializer(conditionType, + SchemaConverters.toAvroType(conditionType), false) + val conditionEvaluator = ExpressionCodeGen.doCodeGen(Seq(condition), conditionSerializer) + + val assignSqlType = AvroConversionUtils.convertAvroSchemaToStructType(writeSchema) + val assignSerializer = new AvroSerializer(assignSqlType, writeSchema, false) + val assignmentEvaluator = ExpressionCodeGen.doCodeGen(assignments, assignSerializer) + conditionEvaluator -> assignmentEvaluator + } + } + }) + } + + private val mergedSchemaCache = Caffeine.newBuilder().maximumSize(16).build[TupleSchema, Schema]() + + def getMergedSchema(source: Schema, target: Schema): Schema = { + + mergedSchemaCache.get(TupleSchema(source, target), new Function[TupleSchema, Schema] { + override def apply(t: TupleSchema): Schema = { + val rightSchema = HoodieAvroUtils.removeMetadataFields(t.second) + mergeSchema(t.first, rightSchema) + } + }) + } + + def mergeSchema(a: Schema, b: Schema): Schema = { + val mergedFields = + a.getFields.asScala.map(field => + new Schema.Field("a_" + field.name, + field.schema, field.doc, field.defaultVal, field.order)) ++ + b.getFields.asScala.map(field => + new Schema.Field("b_" + field.name, + field.schema, field.doc, field.defaultVal, field.order)) + Schema.createRecord(a.getName, a.getDoc, a.getNamespace, a.isError, mergedFields.asJava) + } + + case class TupleSchema(first: Schema, second: Schema) +} + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala new file mode 100644 index 0000000000000..d0404664f45d8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
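A hedged example of how procedures built on the BaseProcedure class below are typically invoked from SQL; the registered procedure name commits_compare and an active SparkSession `spark` are assumptions here, since the registration itself is not part of this excerpt:

```scala
spark.sql("CALL commits_compare(table => 'h0', path => 'hdfs://tmp/h0_backup')").show()
```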
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.model.HoodieRecordPayload +import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} +import org.apache.hudi.exception.HoodieClusteringException +import org.apache.hudi.index.HoodieIndex.IndexType +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types._ + +abstract class BaseProcedure extends Procedure { + val INVALID_ARG_INDEX: Int = -1 + + val spark: SparkSession = SparkSession.active + val jsc = new JavaSparkContext(spark.sparkContext) + + protected def sparkSession: SparkSession = spark + + protected def createHoodieClient(jsc: JavaSparkContext, basePath: String): SparkRDDWriteClient[_ <: HoodieRecordPayload[_ <: AnyRef]] = { + val config = getWriteConfig(basePath) + new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config) + } + + protected def getWriteConfig(basePath: String): HoodieWriteConfig = { + HoodieWriteConfig.newBuilder + .withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder.withIndexType(IndexType.BLOOM).build) + .build + } + + protected def checkArgs(target: Array[ProcedureParameter], args: ProcedureArgs): Unit = { + val internalRow = args.internalRow + for (i <- target.indices) { + if (target(i).required) { + var argsIndex: Integer = null + if (args.isNamedArgs) { + argsIndex = getArgsIndex(target(i).name, args) + } else { + argsIndex = getArgsIndex(i.toString, args) + } + assert(-1 != argsIndex && internalRow.get(argsIndex, target(i).dataType) != null, + s"Argument: ${target(i).name} is required") + } + } + } + + protected def getArgsIndex(key: String, args: ProcedureArgs): Integer = { + args.map.getOrDefault(key, INVALID_ARG_INDEX) + } + + protected def getArgValueOrDefault(args: ProcedureArgs, parameter: ProcedureParameter): Option[Any] = { + var argsIndex: Int = INVALID_ARG_INDEX + if (args.isNamedArgs) { + argsIndex = getArgsIndex(parameter.name, args) + } else { + argsIndex = getArgsIndex(parameter.index.toString, args) + } + + if (argsIndex.equals(INVALID_ARG_INDEX)) { + parameter.default match { + case option: Option[Any] => option + case _ => Option.apply(parameter.default) + } + } else { + Option.apply(getInternalRowValue(args.internalRow, argsIndex, parameter.dataType)) + } + } + + protected def getInternalRowValue(row: InternalRow, index: Int, dataType: DataType): Any = { + dataType match { + case StringType => row.getString(index) + case BinaryType => row.getBinary(index) + case BooleanType => row.getBoolean(index) + case CalendarIntervalType => row.getInterval(index) + case DoubleType => row.getDouble(index) + case d: DecimalType => row.getDecimal(index, d.precision, d.scale) + case FloatType => row.getFloat(index) + case ByteType => row.getByte(index) + case IntegerType => row.getInt(index) + case LongType => row.getLong(index) + case ShortType => row.getShort(index) + case NullType => null + case _ => + throw new UnsupportedOperationException(s"type: ${dataType.typeName} not supported") + } + } + + protected def getBasePath(tableName: Option[Any], tablePath: Option[Any] = Option.empty): String = { + tableName.map( + t => HoodieCLIUtils.getHoodieCatalogTable(sparkSession, t.asInstanceOf[String]).tableLocation) + .getOrElse( + tablePath.map(p => 
p.asInstanceOf[String]).getOrElse( + throw new HoodieClusteringException("Table name or table path must be given one")) + ) + } + +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CommitsCompareProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CommitsCompareProcedure.scala new file mode 100644 index 0000000000000..9cb03bffc9ed7 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CommitsCompareProcedure.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier +import scala.collection.JavaConverters._ + +class CommitsCompareProcedure() extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "path", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("compare_detail", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val path = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table) + val basePath = hoodieCatalogTable.tableLocation + val source = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val target = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(path).build + val sourceTimeline = source.getActiveTimeline.getCommitsTimeline.filterCompletedInstants + val targetTimeline = target.getActiveTimeline.getCommitsTimeline.filterCompletedInstants + val targetLatestCommit = + if (targetTimeline.getInstants.iterator.hasNext) targetTimeline.lastInstant.get.getTimestamp else "0" + val sourceLatestCommit = + if (sourceTimeline.getInstants.iterator.hasNext) sourceTimeline.lastInstant.get.getTimestamp else "0" + + if 
(sourceLatestCommit != null && HoodieTimeline.compareTimestamps(targetLatestCommit, HoodieTimeline.GREATER_THAN, sourceLatestCommit)) { // source is behind the target + val commitsToCatchup = targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants.iterator().asScala.map(instant => instant.getTimestamp).toList.asJava + Seq(Row("Source " + source.getTableConfig.getTableName + " is behind by " + commitsToCatchup.size + " commits. Commits to catch up - " + commitsToCatchup)) + } else { + val commitsToCatchup = sourceTimeline.findInstantsAfter(targetLatestCommit, Integer.MAX_VALUE).getInstants.iterator().asScala.map(instant => instant.getTimestamp).toList.asJava + Seq(Row("Source " + source.getTableConfig.getTableName + " is ahead by " + commitsToCatchup.size + " commits. Commits to catch up - " + commitsToCatchup)) + } + } + + override def build: Procedure = new CommitsCompareProcedure() +} + +object CommitsCompareProcedure { + val NAME = "commits_compare" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new CommitsCompareProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CopyToTableProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CopyToTableProcedure.scala new file mode 100644 index 0000000000000..b49875c94c11c --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CopyToTableProcedure.scala @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.DataSourceReadOptions +import org.apache.spark.internal.Logging +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} +import org.apache.spark.sql.{Row, SaveMode} + +import java.util.function.Supplier + +class CopyToTableProcedure extends BaseProcedure with ProcedureBuilder with Logging { + + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "query_type", DataTypes.StringType, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL), + ProcedureParameter.required(2, "new_table", DataTypes.StringType, None), + ProcedureParameter.optional(3, "begin_instance_time", DataTypes.StringType, ""), + ProcedureParameter.optional(4, "end_instance_time", DataTypes.StringType, ""), + ProcedureParameter.optional(5, "as_of_instant", DataTypes.StringType, ""), + ProcedureParameter.optional(6, "save_mode", DataTypes.StringType, "overwrite") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("status", DataTypes.IntegerType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val queryType = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val newTableName = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val beginInstance = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + val endInstance = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String] + val asOfInstant = getArgValueOrDefault(args, PARAMETERS(5)).get.asInstanceOf[String] + val saveModeStr = getArgValueOrDefault(args, PARAMETERS(6)).get.asInstanceOf[String] + + assert(saveModeStr.nonEmpty, "save_mode(append,overwrite) can not be null.") + + val saveMode: Any = saveModeStr.toLowerCase match { + case "overwrite" => SaveMode.Overwrite + case "append" => SaveMode.Append + case _ => assert(assertion = false, s"save_mode not support $saveModeStr.") + } + + + val tablePath = getBasePath(tableName) + + val sourceDataFrame = queryType match { + case DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL => if (asOfInstant.nonEmpty) { + sparkSession.read + .format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) + .option(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key, asOfInstant) + .load(tablePath) + } else { + sparkSession.read + .format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL) + .load(tablePath) + } + case DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL => + assert(beginInstance.nonEmpty && endInstance.nonEmpty, "when the query_type is incremental, begin_instance_time and end_instance_time can not be null.") + sparkSession.read + .format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, beginInstance) + .option(DataSourceReadOptions.END_INSTANTTIME.key, endInstance) + .load(tablePath) + case DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL => + sparkSession.read + .format("org.apache.hudi") + 
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL) + .load(tablePath) + } + if (sparkSession.catalog.tableExists(newTableName)) { + val schema = sparkSession.read.table(newTableName).schema + val selectColumns = schema.fields.toStream.map(_.name) + sourceDataFrame.selectExpr(selectColumns: _*) + .write + .mode(saveMode.toString) + .saveAsTable(newTableName) + } else { + sourceDataFrame.write + .mode(saveMode.toString) + .saveAsTable(newTableName) + } + + + Seq(Row(0)) + } + + override def build = new CopyToTableProcedure() +} + +object CopyToTableProcedure { + val NAME = "copy_to_table" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new CopyToTableProcedure() + } +} + + + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CreateMetadataTableProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CreateMetadataTableProcedure.scala new file mode 100644 index 0000000000000..3a16d8319a05d --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CreateMetadataTableProcedure.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.HoodieTimer +import org.apache.hudi.metadata.{HoodieTableMetadata, SparkHoodieBackedTableMetadataWriter} +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +import java.io.FileNotFoundException +import java.util.function.Supplier + +class CreateMetadataTableProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + + val basePath = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val metadataPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(basePath)) + + try { + val statuses = metaClient.getFs.listStatus(metadataPath) + if (statuses.nonEmpty) { + throw new RuntimeException("Metadata directory (" + metadataPath.toString + ") not empty.") + } + } catch { + case e: FileNotFoundException => + // Metadata directory does not exist yet + metaClient.getFs.mkdirs(metadataPath) + } + val timer = new HoodieTimer().startTimer + val writeConfig = getWriteConfig(basePath) + SparkHoodieBackedTableMetadataWriter.create(metaClient.getHadoopConf, writeConfig, new HoodieSparkEngineContext(jsc)) + Seq(Row("Created Metadata Table in " + metadataPath + " (duration=" + timer.endTimer / 1000.0 + "secs)")) + } + + override def build = new CreateMetadataTableProcedure() +} + +object CreateMetadataTableProcedure { + val NAME = "create_metadata_table" + var metadataBaseDirectory: Option[String] = None + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new CreateMetadataTableProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CreateSavepointProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CreateSavepointProcedure.scala new file mode 100644 index 0000000000000..e81b6f086a22c --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/CreateSavepointProcedure.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline +import org.apache.hudi.exception.{HoodieException, HoodieSavepointException} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class CreateSavepointProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "commit_time", DataTypes.StringType, None), + ProcedureParameter.optional(2, "user", DataTypes.StringType, ""), + ProcedureParameter.optional(3, "comments", DataTypes.StringType, "") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("create_savepoint_result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val commitTime = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val user = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val comments = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + + val basePath: String = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + val activeTimeline: HoodieActiveTimeline = metaClient.getActiveTimeline + if (!activeTimeline.getCommitsTimeline.filterCompletedInstants.containsInstant(commitTime)) { + throw new HoodieException("Commit " + commitTime + " not found in Commits " + activeTimeline) + } + + val client = createHoodieClient(jsc, basePath) + var result = false + + try { + client.savepoint(commitTime, user, comments) + logInfo(s"The commit $commitTime has been savepointed.") + result = true + } catch { + case _: HoodieSavepointException => + logWarning(s"Failed: Could not create savepoint $commitTime.") + } finally { + client.close() + } + + Seq(Row(result)) + } + + override def build: Procedure = new CreateSavepointProcedure() +} + +object CreateSavepointProcedure { + val NAME: String = "create_savepoint" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): CreateSavepointProcedure = new CreateSavepointProcedure() + } +} + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteMarkerProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteMarkerProcedure.scala new file mode 100644 index 0000000000000..bfbab32599bf8 --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteMarkerProcedure.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.table.HoodieSparkTable +import org.apache.hudi.table.marker.WriteMarkersFactory +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier +import scala.util.{Failure, Success, Try} + +class DeleteMarkerProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "instant_time", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("delete_marker_result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val instantTime = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val basePath = getBasePath(tableName) + + val result = Try { + val client = createHoodieClient(jsc, basePath) + val config = client.getConfig + val context = client.getEngineContext + val table = HoodieSparkTable.create(config, context) + WriteMarkersFactory.get(config.getMarkersType, table, instantTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism) + } match { + case Success(_) => + logInfo(s"Marker $instantTime deleted.") + true + case Failure(e) => + logWarning(s"Failed: Could not clean marker instantTime: $instantTime.", e) + false + } + + Seq(Row(result)) + } + + override def build: Procedure = new DeleteMarkerProcedure() +} + +object DeleteMarkerProcedure { + val NAME: String = "delete_marker" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): DeleteMarkerProcedure = new DeleteMarkerProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteMetadataTableProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteMetadataTableProcedure.scala new file mode 100644 index 0000000000000..d6fccc1f9d252 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteMetadataTableProcedure.scala @@ -0,0 +1,71 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.metadata.HoodieTableMetadata +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +import java.io.FileNotFoundException +import java.util.function.Supplier + +class DeleteMetadataTableProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val basePath = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val metadataPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(basePath)) + + try { + val statuses = metaClient.getFs.listStatus(metadataPath) + if (statuses.nonEmpty) metaClient.getFs.delete(metadataPath, true) + } catch { + case e: FileNotFoundException => + // Metadata directory does not exist + } + Seq(Row("Removed Metadata Table from " + metadataPath)) + } + + override def build = new DeleteMetadataTableProcedure() +} + +object DeleteMetadataTableProcedure { + val NAME = "delete_metadata_table" + var metadataBaseDirectory: Option[String] = None + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new DeleteMetadataTableProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteSavepointProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteSavepointProcedure.scala new file mode 100644 index 0000000000000..1cdd0638f1ae1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/DeleteSavepointProcedure.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.exception.{HoodieException, HoodieSavepointException} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class DeleteSavepointProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "instant_time", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("delete_savepoint_result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val instantTime = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + + val basePath: String = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + val completedInstants = metaClient.getActiveTimeline.getSavePointTimeline.filterCompletedInstants + if (completedInstants.empty) throw new HoodieException("There are no completed savepoint to run delete") + val savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, instantTime) + + if (!completedInstants.containsInstant(savePoint)) { + throw new HoodieException("Commit " + instantTime + " not found in Commits " + completedInstants) + } + + val client = createHoodieClient(jsc, basePath) + var result = false + + try { + client.deleteSavepoint(instantTime) + logInfo(s"The commit $instantTime has been deleted savepoint.") + result = true + } catch { + case _: HoodieSavepointException => + logWarning(s"Failed: Could not delete savepoint $instantTime.") + } finally { + client.close() + } + + Seq(Row(result)) + } + + override def build: Procedure = new DeleteSavepointProcedure() +} + +object DeleteSavepointProcedure { + val NAME: String = "delete_savepoint" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): DeleteSavepointProcedure = new DeleteSavepointProcedure() + } +} + + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala new file mode 100644 index 0000000000000..114f4c4ee130c --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.avro.generic.GenericRecord +import org.apache.avro.specific.SpecificData +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.avro.model.HoodieArchivedMetaEntry +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieLogFile +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.log.HoodieLogFormat +import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline, TimelineMetadataUtils} +import org.apache.hudi.exception.HoodieException +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.io.File +import java.util +import java.util.Collections +import java.util.function.Supplier +import scala.collection.JavaConverters._ +import scala.util.control.Breaks.break + +class ExportInstantsProcedure extends BaseProcedure with ProcedureBuilder with Logging { + var sortByFieldParameter: ProcedureParameter = _ + + val defaultActions = "clean,commit,deltacommit,rollback,savepoint,restore" + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "local_folder", DataTypes.StringType, None), + ProcedureParameter.optional(2, "limit", DataTypes.IntegerType, -1), + ProcedureParameter.optional(3, "actions", DataTypes.StringType, defaultActions), + ProcedureParameter.optional(4, "desc", DataTypes.BooleanType, false) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("export_detail", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val localFolder = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[Int] + val actions: String = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + val 
desc = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[Boolean]
+
+ val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table)
+ val basePath = hoodieCatalogTable.tableLocation
+ val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
+ val archivePath = new Path(basePath + "/.hoodie/.commits_.archive*")
+ val actionSet: util.Set[String] = Set(actions.split(","): _*).asJava
+ val numExports = if (limit == -1) Integer.MAX_VALUE else limit
+ var numCopied = 0
+
+ if (!new File(localFolder).isDirectory) throw new HoodieException(localFolder + " is not a valid local directory")
+
+ // The non-archived instants can be listed from the active timeline.
+ val nonArchivedInstants: util.List[HoodieInstant] = metaClient
+ .getActiveTimeline
+ .filterCompletedInstants.getInstants.iterator().asScala
+ .filter((i: HoodieInstant) => actionSet.contains(i.getAction))
+ .toList.asJava
+
+ // Archived instants are in the commit archive files
+ val statuses: Array[FileStatus] = FSUtils.getFs(basePath, jsc.hadoopConfiguration()).globStatus(archivePath)
+ val archivedStatuses = List(statuses: _*)
+ .sortWith((f1, f2) => (f1.getModificationTime - f2.getModificationTime).toInt > 0).asJava
+
+ if (desc) {
+ Collections.reverse(nonArchivedInstants)
+ numCopied = copyNonArchivedInstants(metaClient, nonArchivedInstants, numExports, localFolder)
+ if (numCopied < numExports) {
+ Collections.reverse(archivedStatuses)
+ numCopied += copyArchivedInstants(basePath, archivedStatuses, actionSet, numExports - numCopied, localFolder)
+ }
+ } else {
+ numCopied = copyArchivedInstants(basePath, archivedStatuses, actionSet, numExports, localFolder)
+ if (numCopied < numExports) numCopied += copyNonArchivedInstants(metaClient, nonArchivedInstants, numExports - numCopied, localFolder)
+ }
+
+ Seq(Row("Exported " + numCopied + " Instants to " + localFolder))
+ }
+
+ @throws[Exception]
+ private def copyArchivedInstants(basePath: String, statuses: util.List[FileStatus], actionSet: util.Set[String], limit: Int, localFolder: String) = {
+ import scala.collection.JavaConversions._
+ var copyCount = 0
+ val fileSystem = FSUtils.getFs(basePath, jsc.hadoopConfiguration())
+ for (fs <- statuses) {
+ // read the archived file
+ val reader = HoodieLogFormat.newReader(fileSystem, new HoodieLogFile(fs.getPath), HoodieArchivedMetaEntry.getClassSchema)
+ // read the avro blocks
+ while ( {
+ reader.hasNext && copyCount < limit
+ }) {
+ val blk = reader.next.asInstanceOf[HoodieAvroDataBlock]
+ try {
+ val recordItr = blk.getRecordIterator
+ try while ( {
+ recordItr.hasNext
+ }) {
+ val ir = recordItr.next
+ // Archived instants are saved as Avro-encoded HoodieArchivedMetaEntry records. We need to get the 
+ val archiveEntryRecord = SpecificData.get.deepCopy(HoodieArchivedMetaEntry.SCHEMA$, ir).asInstanceOf[HoodieArchivedMetaEntry] + val action = archiveEntryRecord.get("actionType").toString + if (!actionSet.contains(action)) break() //todo: continue is not supported + val metadata: GenericRecord = action match { + case HoodieTimeline.CLEAN_ACTION => + archiveEntryRecord.getHoodieCleanMetadata + + case HoodieTimeline.COMMIT_ACTION => + archiveEntryRecord.getHoodieCommitMetadata + + case HoodieTimeline.DELTA_COMMIT_ACTION => + archiveEntryRecord.getHoodieCommitMetadata + + case HoodieTimeline.ROLLBACK_ACTION => + archiveEntryRecord.getHoodieRollbackMetadata + + case HoodieTimeline.SAVEPOINT_ACTION => + archiveEntryRecord.getHoodieSavePointMetadata + + case HoodieTimeline.COMPACTION_ACTION => + archiveEntryRecord.getHoodieCompactionMetadata + + case _ => logInfo("Unknown type of action " + action) + null + } + val instantTime = archiveEntryRecord.get("commitTime").toString + val outPath = localFolder + Path.SEPARATOR + instantTime + "." + action + if (metadata != null) writeToFile(fileSystem, outPath, HoodieAvroUtils.avroToJson(metadata, true)) + if ( { + copyCount += 1; + copyCount + } == limit) break //todo: break is not supported + } + finally if (recordItr != null) recordItr.close() + } + } + reader.close() + } + copyCount + } + + @throws[Exception] + private def copyNonArchivedInstants(metaClient: HoodieTableMetaClient, instants: util.List[HoodieInstant], limit: Int, localFolder: String): Int = { + import scala.collection.JavaConversions._ + var copyCount = 0 + if (instants.nonEmpty) { + val timeline = metaClient.getActiveTimeline + val fileSystem = FSUtils.getFs(metaClient.getBasePath, jsc.hadoopConfiguration()) + for (instant <- instants) { + val localPath = localFolder + Path.SEPARATOR + instant.getFileName + val data: Array[Byte] = instant.getAction match { + case HoodieTimeline.CLEAN_ACTION => + val metadata = TimelineMetadataUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(instant).get) + HoodieAvroUtils.avroToJson(metadata, true) + + case HoodieTimeline.DELTA_COMMIT_ACTION => + // Already in json format + timeline.getInstantDetails(instant).get + + case HoodieTimeline.COMMIT_ACTION => + // Already in json format + timeline.getInstantDetails(instant).get + + case HoodieTimeline.COMPACTION_ACTION => + // Already in json format + timeline.getInstantDetails(instant).get + + case HoodieTimeline.ROLLBACK_ACTION => + val metadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata(timeline.getInstantDetails(instant).get) + HoodieAvroUtils.avroToJson(metadata, true) + + case HoodieTimeline.SAVEPOINT_ACTION => + val metadata = TimelineMetadataUtils.deserializeHoodieSavepointMetadata(timeline.getInstantDetails(instant).get) + HoodieAvroUtils.avroToJson(metadata, true) + + case _ => null + + } + if (data != null) { + writeToFile(fileSystem, localPath, data) + copyCount = copyCount + 1 + } + } + } + copyCount + } + + @throws[Exception] + private def writeToFile(fs: FileSystem, path: String, data: Array[Byte]): Unit = { + val out = fs.create(new Path(path)) + out.write(data) + out.flush() + out.close() + } + + override def build = new ExportInstantsProcedure() +} + +object ExportInstantsProcedure { + val NAME = "export_instants" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ExportInstantsProcedure() + } +} + + diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HdfsParquetImportProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HdfsParquetImportProcedure.scala new file mode 100644 index 0000000000000..ad947f745ac38 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HdfsParquetImportProcedure.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.cli.HDFSParquetImporterUtils +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier +import scala.language.higherKinds + +class HdfsParquetImportProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "table_type", DataTypes.StringType, None), + ProcedureParameter.required(2, "src_path", DataTypes.StringType, None), + ProcedureParameter.required(3, "target_path", DataTypes.StringType, None), + ProcedureParameter.required(4, "row_key", DataTypes.StringType, None), + ProcedureParameter.required(5, "partition_key", DataTypes.StringType, None), + ProcedureParameter.required(6, "schema_file_path", DataTypes.StringType, None), + ProcedureParameter.optional(7, "format", DataTypes.StringType, "parquet"), + ProcedureParameter.optional(8, "command", DataTypes.StringType, "insert"), + ProcedureParameter.optional(9, "retry", DataTypes.IntegerType, 0), + ProcedureParameter.optional(10, "parallelism", DataTypes.IntegerType, jsc.defaultParallelism), + ProcedureParameter.optional(11, "props_file_path", DataTypes.StringType, "") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("import_result", DataTypes.IntegerType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val tableType = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val srcPath = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val targetPath = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + val rowKey = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String] + val partitionKey = getArgValueOrDefault(args, 
PARAMETERS(5)).get.asInstanceOf[String] + val schemaFilePath = getArgValueOrDefault(args, PARAMETERS(6)).get.asInstanceOf[String] + val format = getArgValueOrDefault(args, PARAMETERS(7)).get.asInstanceOf[String] + val command = getArgValueOrDefault(args, PARAMETERS(8)).get.asInstanceOf[String] + val retry = getArgValueOrDefault(args, PARAMETERS(9)).get.asInstanceOf[Int] + val parallelism = getArgValueOrDefault(args, PARAMETERS(10)).get.asInstanceOf[Int] + val propsFilePath = getArgValueOrDefault(args, PARAMETERS(11)).get.asInstanceOf[String] + + val parquetImporterUtils: HDFSParquetImporterUtils = new HDFSParquetImporterUtils(command, srcPath, targetPath, + tableName, tableType, rowKey, partitionKey, parallelism, schemaFilePath, retry, propsFilePath) + + Seq(Row(parquetImporterUtils.dataImport(jsc))) + } + + override def build = new HdfsParquetImportProcedure() +} + +object HdfsParquetImportProcedure { + val NAME = "hdfs_parquet_import" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new HdfsParquetImportProcedure() + } +} + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HiveSyncProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HiveSyncProcedure.scala new file mode 100644 index 0000000000000..24944b1270404 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HiveSyncProcedure.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.hive.conf.HiveConf +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, HiveSyncTool} +import org.apache.hudi.sync.common.HoodieSyncConfig +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.hudi.ProvidesHoodieConfig +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class HiveSyncProcedure extends BaseProcedure with ProcedureBuilder + with ProvidesHoodieConfig with Logging { + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "metastore_uri", DataTypes.StringType, ""), + ProcedureParameter.optional(2, "username", DataTypes.StringType, ""), + ProcedureParameter.optional(3, "password", DataTypes.StringType, ""), + ProcedureParameter.optional(4, "use_jdbc", DataTypes.StringType, ""), + ProcedureParameter.optional(5, "mode", DataTypes.StringType, ""), + ProcedureParameter.optional(6, "partition_fields", DataTypes.StringType, ""), + ProcedureParameter.optional(7, "partition_extractor_class", DataTypes.StringType, "") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + override def build: Procedure = new HiveSyncProcedure + + override def parameters: Array[ProcedureParameter] = PARAMETERS + + override def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val metastoreUri = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val username = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val password = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + val useJdbc = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String] + val mode = getArgValueOrDefault(args, PARAMETERS(5)).get.asInstanceOf[String] + val partitionFields = getArgValueOrDefault(args, PARAMETERS(6)).get.asInstanceOf[String] + val partitionExtractorClass = getArgValueOrDefault(args, PARAMETERS(7)).get.asInstanceOf[String] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, tableName) + val hadoopConf = sparkSession.sparkContext.hadoopConfiguration + val hiveConf = new HiveConf + val sqlConf = sparkSession.sqlContext.conf + + if (metastoreUri.nonEmpty) hadoopConf.set(HiveConf.ConfVars.METASTOREURIS.varname, metastoreUri) + if (username.nonEmpty) sqlConf.setConfString(HiveSyncConfig.HIVE_USER.key, username) + if (password.nonEmpty) sqlConf.setConfString(HiveSyncConfig.HIVE_PASS.key, password) + if (useJdbc.nonEmpty) sqlConf.setConfString(HiveSyncConfig.HIVE_USE_JDBC.key, useJdbc) + if (mode.nonEmpty) sqlConf.setConfString(HiveSyncConfigHolder.HIVE_SYNC_MODE.key, mode) + if (partitionFields.nonEmpty) sqlConf.setConfString(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key, partitionFields) + if (partitionExtractorClass.nonEmpty) sqlConf.setConfString(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key, partitionExtractorClass) + + hiveConf.addResource(hadoopConf) + + val tableConfig = hoodieCatalogTable.tableConfig + val hoodieProps = 
getHoodieProps(hoodieCatalogTable.catalogProperties, tableConfig, sqlConf) + val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable) + + var hiveSyncTool: HiveSyncTool = null + try { + hiveSyncTool = new HiveSyncTool(hiveSyncConfig.getProps, hiveConf) + hiveSyncTool.syncHoodieTable() + } catch { + case e: RuntimeException => throw new HoodieException("hive sync failed", e) + } finally { + if (hiveSyncTool != null) hiveSyncTool.close() + } + + Seq(Row("hive sync success.")) + } +} + +object HiveSyncProcedure { + val NAME = "hive_sync" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new HiveSyncProcedure + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedureUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedureUtils.scala new file mode 100644 index 0000000000000..374f86773d1cb --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedureUtils.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow + +import java.util + +object HoodieProcedureUtils { + + /** + * Build named procedure arguments from given args' map + * + * @param args The arguments map + * @return Named procedure arguments + */ + def buildProcedureArgs(args: Map[String, Any]): ProcedureArgs = { + val values: Array[Any] = new Array[Any](args.size) + val map = new util.LinkedHashMap[String, Int]() + + args.zipWithIndex.foreach { + case ((key, value), index) => + values(index) = value + map.put(key, index) + } + + ProcedureArgs(isNamedArgs = true, map, new GenericInternalRow(values)) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala new file mode 100644 index 0000000000000..b2bbec848945f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import java.util.Locale +import java.util.function.Supplier + +object HoodieProcedures { + private val BUILDERS: Map[String, Supplier[ProcedureBuilder]] = initProcedureBuilders + + def newBuilder(name: String): ProcedureBuilder = { + val builderSupplier = BUILDERS.get(name.toLowerCase(Locale.ROOT)) + if (builderSupplier.isDefined) builderSupplier.get.get() else null + } + + private def initProcedureBuilders: Map[String, Supplier[ProcedureBuilder]] = { + Map((RunCompactionProcedure.NAME, RunCompactionProcedure.builder) + ,(ShowCompactionProcedure.NAME, ShowCompactionProcedure.builder) + ,(CreateSavepointProcedure.NAME, CreateSavepointProcedure.builder) + ,(DeleteSavepointProcedure.NAME, DeleteSavepointProcedure.builder) + ,(RollbackToSavepointProcedure.NAME, RollbackToSavepointProcedure.builder) + ,(RollbackToInstantTimeProcedure.NAME, RollbackToInstantTimeProcedure.builder) + ,(RunClusteringProcedure.NAME, RunClusteringProcedure.builder) + ,(ShowClusteringProcedure.NAME, ShowClusteringProcedure.builder) + ,(ShowCommitsProcedure.NAME, ShowCommitsProcedure.builder) + ,(ShowCommitsMetadataProcedure.NAME, ShowCommitsMetadataProcedure.builder) + ,(ShowArchivedCommitsProcedure.NAME, ShowArchivedCommitsProcedure.builder) + ,(ShowArchivedCommitsMetadataProcedure.NAME, ShowArchivedCommitsMetadataProcedure.builder) + ,(ShowCommitFilesProcedure.NAME, ShowCommitFilesProcedure.builder) + ,(ShowCommitPartitionsProcedure.NAME, ShowCommitPartitionsProcedure.builder) + ,(ShowCommitWriteStatsProcedure.NAME, ShowCommitWriteStatsProcedure.builder) + ,(CommitsCompareProcedure.NAME, CommitsCompareProcedure.builder) + ,(ShowSavepointsProcedure.NAME, ShowSavepointsProcedure.builder) + ,(DeleteMarkerProcedure.NAME, DeleteMarkerProcedure.builder) + ,(ShowRollbacksProcedure.NAME, ShowRollbacksProcedure.builder) + ,(ShowRollbackDetailProcedure.NAME, ShowRollbackDetailProcedure.builder) + ,(ExportInstantsProcedure.NAME, ExportInstantsProcedure.builder) + ,(ShowAllFileSystemViewProcedure.NAME, ShowAllFileSystemViewProcedure.builder) + ,(ShowLatestFileSystemViewProcedure.NAME, ShowLatestFileSystemViewProcedure.builder) + ,(ShowHoodieLogFileMetadataProcedure.NAME, ShowHoodieLogFileMetadataProcedure.builder) + ,(ShowHoodieLogFileRecordsProcedure.NAME, ShowHoodieLogFileRecordsProcedure.builder) + ,(StatsWriteAmplificationProcedure.NAME, StatsWriteAmplificationProcedure.builder) + ,(StatsFileSizeProcedure.NAME, StatsFileSizeProcedure.builder) + ,(HdfsParquetImportProcedure.NAME, HdfsParquetImportProcedure.builder) + ,(RunBootstrapProcedure.NAME, RunBootstrapProcedure.builder) + ,(ShowBootstrapMappingProcedure.NAME, ShowBootstrapMappingProcedure.builder) + ,(ShowBootstrapPartitionsProcedure.NAME, ShowBootstrapPartitionsProcedure.builder) + ,(UpgradeTableProcedure.NAME, UpgradeTableProcedure.builder) + ,(DowngradeTableProcedure.NAME, DowngradeTableProcedure.builder) + ,(ShowMetadataTableFilesProcedure.NAME, ShowMetadataTableFilesProcedure.builder) + ,(ShowMetadataTablePartitionsProcedure.NAME, ShowMetadataTablePartitionsProcedure.builder) + 
,(CreateMetadataTableProcedure.NAME, CreateMetadataTableProcedure.builder) + ,(DeleteMetadataTableProcedure.NAME, DeleteMetadataTableProcedure.builder) + ,(InitMetadataTableProcedure.NAME, InitMetadataTableProcedure.builder) + ,(ShowMetadataTableStatsProcedure.NAME, ShowMetadataTableStatsProcedure.builder) + ,(ValidateMetadataTableFilesProcedure.NAME, ValidateMetadataTableFilesProcedure.builder) + ,(ShowFsPathDetailProcedure.NAME, ShowFsPathDetailProcedure.builder) + ,(CopyToTableProcedure.NAME, CopyToTableProcedure.builder) + ,(RepairAddpartitionmetaProcedure.NAME, RepairAddpartitionmetaProcedure.builder) + ,(RepairCorruptedCleanFilesProcedure.NAME, RepairCorruptedCleanFilesProcedure.builder) + ,(RepairDeduplicateProcedure.NAME, RepairDeduplicateProcedure.builder) + ,(RepairMigratePartitionMetaProcedure.NAME, RepairMigratePartitionMetaProcedure.builder) + ,(RepairOverwriteHoodiePropsProcedure.NAME, RepairOverwriteHoodiePropsProcedure.builder) + ,(RunCleanProcedure.NAME, RunCleanProcedure.builder) + ,(ValidateHoodieSyncProcedure.NAME, ValidateHoodieSyncProcedure.builder) + ,(ShowInvalidParquetProcedure.NAME, ShowInvalidParquetProcedure.builder) + ,(HiveSyncProcedure.NAME, HiveSyncProcedure.builder) + ) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/InitMetadataTableProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/InitMetadataTableProcedure.scala new file mode 100644 index 0000000000000..73d1128a98d08 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/InitMetadataTableProcedure.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
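A short sketch of how this registry is expected to be consulted; newBuilder lower-cases the name before lookup and returns null for an unknown procedure, so the caller handles that case. It assumes the procedure classes can be instantiated outside of a running CALL statement:

  import org.apache.spark.sql.hudi.command.procedures.HoodieProcedures

  val builder = HoodieProcedures.newBuilder("Run_Clean")   // resolved case-insensitively to run_clean
  require(builder != null, "unknown procedure name")
  val procedure = builder.build                            // a fresh procedure instance per invocation
  println(procedure.parameters.mkString(", "))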
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.HoodieTimer +import org.apache.hudi.metadata.{HoodieTableMetadata, SparkHoodieBackedTableMetadataWriter} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +import java.io.FileNotFoundException +import java.util.function.Supplier + +class InitMetadataTableProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "read_only", DataTypes.BooleanType, false) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val readOnly = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean] + + val basePath = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val metadataPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(basePath)) + try { + metaClient.getFs.listStatus(metadataPath) + } catch { + case e: FileNotFoundException => + // Metadata directory does not exist yet + throw new RuntimeException("Metadata directory (" + metadataPath.toString + ") does not exist.") + } + + val timer = new HoodieTimer().startTimer + if (!readOnly) { + val writeConfig = getWriteConfig(basePath) + SparkHoodieBackedTableMetadataWriter.create(metaClient.getHadoopConf, writeConfig, new HoodieSparkEngineContext(jsc)) + } + + val action = if (readOnly) "Opened" else "Initialized" + Seq(Row(action + " Metadata Table in " + metadataPath + " (duration=" + timer.endTimer / 1000.0 + "sec)")) + } + + override def build = new InitMetadataTableProcedure() +} + +object InitMetadataTableProcedure { + val NAME = "init_metadata_table" + var metadataBaseDirectory: Option[String] = None + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new InitMetadataTableProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/Procedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/Procedure.scala new file mode 100644 index 0000000000000..f34e306159827 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/Procedure.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
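For context, an invocation of the procedure above might look as follows from a Spark session with the Hudi SQL extensions enabled (spark.sql.extensions set to org.apache.spark.sql.hudi.HoodieSparkSessionExtension); the table name is a placeholder and the parameter names follow the PARAMETERS array (table, read_only):

  // read_only => true only reports on the existing metadata table path; with the default false a
  // SparkHoodieBackedTableMetadataWriter is created to initialize the metadata table.
  spark.sql("CALL init_metadata_table(table => 'hudi_tbl', read_only => true)").show(false)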
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.StructType + +import java.util +import scala.collection.mutable + +/** + * An interface representing a stored procedure available for execution. + */ +trait Procedure { + /** + * Returns the input parameters of this procedure. + */ + def parameters: Array[ProcedureParameter] + + /** + * Returns the type of rows produced by this procedure. + */ + def outputType: StructType + + /** + * Executes this procedure. + *
+ * Spark will align the provided arguments according to the input parameters + * defined in {@link #parameters ( )} either by position or by name before execution. + *
    + * Implementations may provide a summary of execution by returning one or many rows + * as a result. The schema of output rows must match the defined output type + * in {@link #outputType ( )}. + * + * @param args input arguments + * @return the result of executing this procedure with the given arguments + */ + def call(args: ProcedureArgs): Seq[Row] + + /** + * Returns the description of this procedure. + */ + def description: String = this.getClass.toString +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureArgs.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureArgs.scala new file mode 100644 index 0000000000000..5c462c1b892a0 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureArgs.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.catalyst.InternalRow + +import java.util + +case class ProcedureArgs(isNamedArgs: Boolean, + map: util.LinkedHashMap[String, Int], + internalRow: InternalRow) { +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureBuilder.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureBuilder.scala new file mode 100644 index 0000000000000..b2ecd0a3089c4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureBuilder.scala @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
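To make the ProcedureArgs shape concrete, here is a sketch of the named-argument form that buildProcedureArgs produces and that call implementations read back through the name-to-slot map; the names and values are illustrative only:

  import java.util
  import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
  import org.apache.spark.sql.hudi.command.procedures.ProcedureArgs

  // Each argument name maps to the slot of its value inside the InternalRow.
  val slots = new util.LinkedHashMap[String, Int]()
  slots.put("table", 0)
  slots.put("dry_run", 1)
  val namedArgs = ProcedureArgs(isNamedArgs = true, slots, new GenericInternalRow(Array[Any]("hudi_tbl", true)))
  // With isNamedArgs = false the map is empty and values are presumably matched to parameters by position.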
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +trait ProcedureBuilder { + def build: Procedure +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameter.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameter.scala new file mode 100644 index 0000000000000..a9ad252bd7a05 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameter.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.types.DataType + +/** + * An input parameter of a {@link Procedure stored procedure}. + */ +abstract class ProcedureParameter { + def index: Int + + /** + * Returns the name of this parameter. + */ + def name: String + + /** + * Returns the type of this parameter. + */ + def dataType: DataType + + /** + * Returns true if this parameter is required. + */ + def required: Boolean + + /** + * this parameter's default value. + */ + def default: Any +} + +object ProcedureParameter { + /** + * Creates a required input parameter. + * + * @param name the name of the parameter + * @param dataType the type of the parameter + * @return the constructed stored procedure parameter + */ + def required(index: Int, name: String, dataType: DataType, default: Any): ProcedureParameterImpl = { + ProcedureParameterImpl(index, name, dataType, default, required = true) + } + + /** + * Creates an optional input parameter. + * + * @param name the name of the parameter. + * @param dataType the type of the parameter. + * @return the constructed optional stored procedure parameter + */ + def optional(index: Int, name: String, dataType: DataType, default: Any): ProcedureParameterImpl = { + ProcedureParameterImpl(index, name, dataType, default, required = false) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameterImpl.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameterImpl.scala new file mode 100644 index 0000000000000..a7f4117047457 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameterImpl.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
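A hedged sketch of how a new procedure would declare its inputs with these factory methods; the names, types, and defaults below are invented for illustration, following the convention in this diff of passing None as the default for required parameters:

  import org.apache.spark.sql.hudi.command.procedures.ProcedureParameter
  import org.apache.spark.sql.types.DataTypes

  val parameters = Array(
    ProcedureParameter.required(0, "table", DataTypes.StringType, None),   // must be supplied by the caller
    ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10),    // falls back to 10 when omitted
    ProcedureParameter.optional(2, "dry_run", DataTypes.BooleanType, true))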
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.types.DataType + +import java.util.Objects + +case class ProcedureParameterImpl(index: Int, name: String, dataType: DataType, default: Any, required: Boolean) + extends ProcedureParameter { + + override def equals(other: Any): Boolean = { + // use reference equality (eq) here: == would re-enter this overridden equals and recurse forever + if (this.eq(other.asInstanceOf[AnyRef])) { + true + } else if (other == null || (getClass ne other.getClass)) { + false + } else { + val that = other.asInstanceOf[ProcedureParameterImpl] // cast only after the class check, so a foreign type yields false + index == that.index && required == that.required && default == that.default && Objects.equals(name, that.name) && Objects.equals(dataType, that.dataType) + } + } + + override def hashCode: Int = Seq(index, name, dataType, required, default).hashCode() + + override def toString: String = s"ProcedureParameter(index='$index',name='$name', type=$dataType, required=$required, default=$default)" +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairAddpartitionmetaProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairAddpartitionmetaProcedure.scala new file mode 100644 index 0000000000000..bb65174c4b47b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairAddpartitionmetaProcedure.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodiePartitionMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.function.Supplier +import scala.collection.JavaConversions._ + +class RepairAddpartitionmetaProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "dry_run", DataTypes.BooleanType, true) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("partition_path", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("metadata_is_present", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val dryRun = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean] + val tablePath = getBasePath(tableName) + + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(tablePath).build + + val latestCommit: String = metaClient.getActiveTimeline.getCommitTimeline.lastInstant.get.getTimestamp + val partitionPaths: util.List[String] = FSUtils.getAllPartitionFoldersThreeLevelsDown(metaClient.getFs, tablePath); + val basePath: Path = new Path(tablePath) + + val rows = new util.ArrayList[Row](partitionPaths.size) + for (partition <- partitionPaths) { + val partitionPath: Path = FSUtils.getPartitionPath(basePath, partition) + var isPresent = "Yes" + var action = "None" + if (!HoodiePartitionMetadata.hasPartitionMetadata(metaClient.getFs, partitionPath)) { + isPresent = "No" + if (!dryRun) { + val partitionMetadata: HoodiePartitionMetadata = new HoodiePartitionMetadata(metaClient.getFs, latestCommit, basePath, partitionPath, metaClient.getTableConfig.getPartitionMetafileFormat) + partitionMetadata.trySave(0) + action = "Repaired" + } + } + rows.add(Row(partition, isPresent, action)) + } + + rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new RepairAddpartitionmetaProcedure() +} + +object RepairAddpartitionmetaProcedure { + val NAME = "repair_add_partition_meta" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RepairAddpartitionmetaProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairCorruptedCleanFilesProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairCorruptedCleanFilesProcedure.scala new file mode 100644 index 0000000000000..ff185d1bdfab1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairCorruptedCleanFilesProcedure.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
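A usage sketch for the procedure above, assuming the CALL statement support that accompanies these procedures and a partitioned table named hudi_tbl; with the default dry_run => true it only reports which partitions are missing a Hoodie partition metafile, while dry_run => false also writes the missing metafiles:

  spark.sql("CALL repair_add_partition_meta(table => 'hudi_tbl', dry_run => true)").show(false)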
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.avro.AvroRuntimeException +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstant} +import org.apache.hudi.common.util.CleanerUtils +import org.apache.hudi.exception.HoodieIOException +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.io.IOException +import java.util.function.Supplier +import scala.collection.JavaConverters.asScalaIteratorConverter + +class RepairCorruptedCleanFilesProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val tablePath = getBasePath(tableName) + + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(tablePath).build + + val cleanerTimeline = metaClient.getActiveTimeline.getCleanerTimeline + logInfo("Inspecting pending clean metadata in timeline for corrupted files") + var result = true + cleanerTimeline.filterInflightsAndRequested.getInstants.iterator().asScala.foreach((instant: HoodieInstant) => { + try { + CleanerUtils.getCleanerPlan(metaClient, instant) + } catch { + case e: AvroRuntimeException => + logWarning("Corruption found. Trying to remove corrupted clean instant file: " + instant) + HoodieActiveTimeline.deleteInstantFile(metaClient.getFs, metaClient.getMetaPath, instant) + case ioe: IOException => + if (ioe.getMessage.contains("Not an Avro data file")) { + logWarning("Corruption found. 
Trying to remove corrupted clean instant file: " + instant) + HoodieActiveTimeline.deleteInstantFile(metaClient.getFs, metaClient.getMetaPath, instant) + } else { + result = false + throw new HoodieIOException(ioe.getMessage, ioe) + } + } + }) + Seq(Row(result)) + } + + override def build: Procedure = new RepairCorruptedCleanFilesProcedure() +} + +object RepairCorruptedCleanFilesProcedure { + val NAME = "repair_corrupted_clean_files" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RepairCorruptedCleanFilesProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairDeduplicateProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairDeduplicateProcedure.scala new file mode 100644 index 0000000000000..8ee5055e1fdb2 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairDeduplicateProcedure.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
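A usage sketch, assuming the same CALL syntax and a placeholder table name; the procedure scans pending clean instants and deletes any whose plan file is corrupted and can no longer be parsed:

  spark.sql("CALL repair_corrupted_clean_files(table => 'hudi_tbl')").show()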
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.exception.HoodieException +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} +import java.util.function.Supplier + +import org.apache.spark.sql.hudi.{DeDupeType, DedupeSparkJob} + +import scala.util.{Failure, Success, Try} + +class RepairDeduplicateProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "duplicated_partition_path", DataTypes.StringType, None), + ProcedureParameter.required(2, "repaired_output_path", DataTypes.StringType, None), + ProcedureParameter.optional(3, "dry_run", DataTypes.BooleanType, true), + ProcedureParameter.optional(4, "dedupe_type", DataTypes.StringType, "insert_type") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val duplicatedPartitionPath = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val repairedOutputPath = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val dryRun = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[Boolean] + val dedupeType = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String] + + if (!DeDupeType.values.contains(DeDupeType.withName(dedupeType))) { + throw new IllegalArgumentException("Please provide valid dedupe type!") + } + val basePath = getBasePath(tableName) + + Try { + val job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath, spark.sqlContext, + FSUtils.getFs(basePath, jsc.hadoopConfiguration), DeDupeType.withName(dedupeType)) + job.fixDuplicates(dryRun) + } match { + case Success(_) => + if (dryRun){ + Seq(Row(s"Deduplicated files placed in: $repairedOutputPath.")) + } else { + Seq(Row(s"Deduplicated files placed in: $duplicatedPartitionPath.")) + } + case Failure(e) => + throw new HoodieException(s"Deduplication failed!", e) + } + } + override def build: Procedure = new RepairDeduplicateProcedure() +} + +object RepairDeduplicateProcedure { + val NAME = "repair_deduplicate" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RepairDeduplicateProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairMigratePartitionMetaProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairMigratePartitionMetaProcedure.scala new file mode 100644 index 0000000000000..7daacb2f184c8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairMigratePartitionMetaProcedure.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
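A usage sketch, assuming the CALL syntax; the partition and output paths are placeholders. Per the success message above, a dry run leaves the deduplicated files in repaired_output_path for inspection, while a real run places them back in the duplicated partition:

  spark.sql(
    """CALL repair_deduplicate(
      |  table => 'hudi_tbl',
      |  duplicated_partition_path => 'dt=2021-05-05',
      |  repaired_output_path => '/tmp/hudi_repair_output',
      |  dry_run => true)""".stripMargin).show(false)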
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.engine.HoodieLocalEngineContext +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodiePartitionMetadata +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.hudi.common.util.Option +import org.apache.hudi.exception.HoodieIOException +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.io.IOException +import java.util +import java.util.Properties +import java.util.function.{Consumer, Supplier} +import scala.collection.JavaConversions._ + +class RepairMigratePartitionMetaProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "dry_run", DataTypes.BooleanType, true) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("partition_path", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("text_metafile_present", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("base_metafile_present", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val dryRun = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean] + val tablePath = getBasePath(tableName) + + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(tablePath).build + + val engineContext: HoodieLocalEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf) + val partitionPaths: util.List[String] = FSUtils.getAllPartitionPaths(engineContext, tablePath, false, false) + val basePath: Path = new Path(tablePath) + + val rows = new util.ArrayList[Row](partitionPaths.size) + for (partitionPath <- partitionPaths) { + val partition: Path = FSUtils.getPartitionPath(tablePath, partitionPath) + val textFormatFile: Option[Path] = HoodiePartitionMetadata.textFormatMetaPathIfExists(metaClient.getFs, partition) + val baseFormatFile: Option[Path] = HoodiePartitionMetadata.baseFormatMetaPathIfExists(metaClient.getFs, partition) + val latestCommit: String = metaClient.getActiveTimeline.getCommitTimeline.lastInstant.get.getTimestamp + var action = if (textFormatFile.isPresent) "MIGRATE" else "NONE" + if (!dryRun) { + if (!baseFormatFile.isPresent) { + val partitionMetadata: HoodiePartitionMetadata = new 
HoodiePartitionMetadata(metaClient.getFs, latestCommit, + basePath, partition, Option.of(metaClient.getTableConfig.getBaseFileFormat)) + partitionMetadata.trySave(0) + } + // delete it, in case we failed midway last time. + textFormatFile.ifPresent( + new Consumer[Path] { + override def accept(p: Path): Unit = { + try metaClient.getFs.delete(p, false) + catch { + case e: IOException => + throw new HoodieIOException(e.getMessage, e) + } + } + }) + action = "MIGRATED" + } + rows.add(Row(partitionPath, String.valueOf(textFormatFile.isPresent), + String.valueOf(baseFormatFile.isPresent), action)) + } + val props: Properties = new Properties + props.setProperty(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key, "true") + HoodieTableConfig.update(metaClient.getFs, new Path(metaClient.getMetaPath), props) + + rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new RepairMigratePartitionMetaProcedure() +} + +object RepairMigratePartitionMetaProcedure { + val NAME = "repair_migrate_partition_meta" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RepairMigratePartitionMetaProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairOverwriteHoodiePropsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairOverwriteHoodiePropsProcedure.scala new file mode 100644 index 0000000000000..043217cf2df4b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RepairOverwriteHoodiePropsProcedure.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
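A usage sketch, assuming the CALL syntax; with the default dry_run => true the procedure only reports, per partition, whether text-format and base-format partition metafiles are present, and performs no migration:

  spark.sql("CALL repair_migrate_partition_meta(table => 'hudi_tbl', dry_run => true)").show(false)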
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.io.FileInputStream +import java.util +import java.util.Properties +import java.util.function.Supplier +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters.asScalaIteratorConverter + +class RepairOverwriteHoodiePropsProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "new_props_file_path", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("property", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("old_value", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("new_value", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val overwriteFilePath = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val tablePath = getBasePath(tableName) + + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(tablePath).build + + var newProps = new Properties + newProps.load(new FileInputStream(overwriteFilePath)) + val oldProps = metaClient.getTableConfig.propsMap + val metaPathDir = new Path(tablePath, METAFOLDER_NAME) + HoodieTableConfig.create(metaClient.getFs, metaPathDir, newProps) + // reload new props as checksum would have been added + newProps = HoodieTableMetaClient.reload(metaClient).getTableConfig.getProps + + val allPropKeys = new util.TreeSet[String] + allPropKeys.addAll(newProps.keySet.stream.iterator().asScala.map(key => key.toString).toList) + allPropKeys.addAll(oldProps.keySet) + + val rows = new util.ArrayList[Row](allPropKeys.size) + for (propKey <- allPropKeys) { + rows.add(Row(propKey, oldProps.getOrDefault(propKey, "null"), + newProps.getOrDefault(propKey, "null").toString)) + } + + rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new RepairOverwriteHoodiePropsProcedure() +} + +object RepairOverwriteHoodiePropsProcedure { + val NAME = "repair_overwrite_hoodie_props" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RepairOverwriteHoodiePropsProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala new file mode 100644 index 0000000000000..1fcc665d61123 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion +import org.apache.hudi.common.util.Option +import org.apache.hudi.config.HoodieWriteConfig.ROLLBACK_USING_MARKERS_ENABLE +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class RollbackToInstantTimeProcedure extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "instant_time", DataTypes.StringType, None)) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("rollback_result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val instantTime = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table) + val basePath = hoodieCatalogTable.tableLocation + val client = createHoodieClient(jsc, basePath) + client.getConfig.setValue(ROLLBACK_USING_MARKERS_ENABLE, "false") + val config = getWriteConfig(basePath) + val metaClient = HoodieTableMetaClient.builder + .setConf(jsc.hadoopConfiguration) + .setBasePath(config.getBasePath) + .setLoadActiveTimelineOnLoad(false) + .setConsistencyGuardConfig(config.getConsistencyGuardConfig) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion))) + .build + + val activeTimeline = metaClient.getActiveTimeline + val completedTimeline: HoodieTimeline = activeTimeline.getCommitsTimeline.filterCompletedInstants + val filteredTimeline = completedTimeline.containsInstant(instantTime) + if (!filteredTimeline) { + throw new HoodieException(s"Commit $instantTime not found in Commits $completedTimeline") + } + + val result = if (client.rollback(instantTime)) true else false + val outputRow = Row(result) + + Seq(outputRow) + } + + override def build: Procedure = new RollbackToInstantTimeProcedure() +} + +object RollbackToInstantTimeProcedure { + val NAME: String = "rollback_to_instant" + + def builder: 
Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): RollbackToInstantTimeProcedure = new RollbackToInstantTimeProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToSavepointProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToSavepointProcedure.scala new file mode 100644 index 0000000000000..11f06d4a7c6c5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToSavepointProcedure.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.exception.{HoodieException, HoodieSavepointException} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class RollbackToSavepointProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "instant_time", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("rollback_savepoint_result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val instantTime = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + + val basePath: String = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + val completedInstants = metaClient.getActiveTimeline.getSavePointTimeline.filterCompletedInstants + if (completedInstants.empty) throw new HoodieException("There are no completed savepoint to run delete") + val savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, instantTime) + + if (!completedInstants.containsInstant(savePoint)) { + throw new HoodieException("Commit " + instantTime + " not found in Commits " + completedInstants) + } + + val client = createHoodieClient(jsc, basePath) + var result = false + + try { + client.restoreToSavepoint(instantTime) + logInfo("The commit $instantTime rolled back.") + result 
= true + } catch { + case _: HoodieSavepointException => + logWarning(s"The commit $instantTime failed to roll back.") + } finally { + client.close() + } + + Seq(Row(result)) + } + + override def build: Procedure = new RollbackToSavepointProcedure() +} + +object RollbackToSavepointProcedure { + val NAME: String = "rollback_to_savepoint" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): RollbackToSavepointProcedure = new RollbackToSavepointProcedure() + } +} + + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunBootstrapProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunBootstrapProcedure.scala new file mode 100644 index 0000000000000..de64650bfdf8c --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunBootstrapProcedure.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
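Usage sketches for the two rollback procedures defined above, assuming the CALL syntax; the instant time is a placeholder and must reference an existing completed commit (for rollback_to_instant) or a completed savepoint (for rollback_to_savepoint):

  // Undo a specific commit on the active timeline.
  spark.sql("CALL rollback_to_instant(table => 'hudi_tbl', instant_time => '20220728113042915')").show()
  // Restore the table to a previously created savepoint.
  spark.sql("CALL rollback_to_savepoint(table => 'hudi_tbl', instant_time => '20220728113042915')").show()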
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.cli.BootstrapExecutorUtils +import org.apache.hudi.cli.HDFSParquetImporterUtils.{buildProperties, readConfig} +import org.apache.hudi.common.config.TypedProperties +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.config.HoodieBootstrapConfig +import org.apache.hudi.keygen.constant.KeyGeneratorType +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.Locale +import java.util.function.Supplier + +class RunBootstrapProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "table_type", DataTypes.StringType, None), + ProcedureParameter.required(2, "bootstrap_path", DataTypes.StringType, None), + ProcedureParameter.required(3, "base_path", DataTypes.StringType, None), + ProcedureParameter.required(4, "rowKey_field", DataTypes.StringType, None), + ProcedureParameter.optional(5, "base_file_format", DataTypes.StringType, "PARQUET"), + ProcedureParameter.optional(6, "partition_path_field", DataTypes.StringType, ""), + ProcedureParameter.optional(7, "bootstrap_index_class", DataTypes.StringType, "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex"), + ProcedureParameter.optional(8, "selector_class", DataTypes.StringType, "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector"), + ProcedureParameter.optional(9, "key_generator_glass", DataTypes.StringType, "org.apache.hudi.keygen.SimpleKeyGenerator"), + ProcedureParameter.optional(10, "full_bootstrap_input_provider", DataTypes.StringType, "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider"), + ProcedureParameter.optional(11, "schema_provider_class", DataTypes.StringType, ""), + ProcedureParameter.optional(12, "payload_class", DataTypes.StringType, "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload"), + ProcedureParameter.optional(13, "parallelism", DataTypes.IntegerType, 1500), + ProcedureParameter.optional(14, "enable_hive_sync", DataTypes.BooleanType, false), + ProcedureParameter.optional(15, "props_file_path", DataTypes.StringType, ""), + ProcedureParameter.optional(16, "bootstrap_overwrite", DataTypes.BooleanType, false) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("status", DataTypes.IntegerType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val tableType = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val bootstrapPath = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val basePath = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + val rowKeyField = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String] + val baseFileFormat = getArgValueOrDefault(args, PARAMETERS(5)).get.asInstanceOf[String] + val partitionPathField = getArgValueOrDefault(args, PARAMETERS(6)).get.asInstanceOf[String] + val 
bootstrapIndexClass = getArgValueOrDefault(args, PARAMETERS(7)).get.asInstanceOf[String] + val selectorClass = getArgValueOrDefault(args, PARAMETERS(8)).get.asInstanceOf[String] + val keyGeneratorClass = getArgValueOrDefault(args, PARAMETERS(9)).get.asInstanceOf[String] + val fullBootstrapInputProvider = getArgValueOrDefault(args, PARAMETERS(10)).get.asInstanceOf[String] + val schemaProviderClass = getArgValueOrDefault(args, PARAMETERS(11)).get.asInstanceOf[String] + val payloadClass = getArgValueOrDefault(args, PARAMETERS(12)).get.asInstanceOf[String] + val parallelism = getArgValueOrDefault(args, PARAMETERS(13)).get.asInstanceOf[Int] + val enableHiveSync = getArgValueOrDefault(args, PARAMETERS(14)).get.asInstanceOf[Boolean] + val propsFilePath = getArgValueOrDefault(args, PARAMETERS(15)).get.asInstanceOf[String] + val bootstrapOverwrite = getArgValueOrDefault(args, PARAMETERS(16)).get.asInstanceOf[Boolean] + + val configs: util.List[String] = new util.ArrayList[String] + + val properties: TypedProperties = if (propsFilePath == null || propsFilePath.isEmpty) buildProperties(configs) + else readConfig(jsc.hadoopConfiguration, new Path(propsFilePath), configs).getProps(true) + + properties.setProperty(HoodieBootstrapConfig.BASE_PATH.key, bootstrapPath) + + if (!StringUtils.isNullOrEmpty(keyGeneratorClass) && KeyGeneratorType.getNames.contains(keyGeneratorClass.toUpperCase(Locale.ROOT))) { + properties.setProperty(HoodieBootstrapConfig.KEYGEN_TYPE.key, keyGeneratorClass.toUpperCase(Locale.ROOT)) + } + else { + properties.setProperty(HoodieBootstrapConfig.KEYGEN_CLASS_NAME.key, keyGeneratorClass) + } + + properties.setProperty(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME.key, fullBootstrapInputProvider) + properties.setProperty(HoodieBootstrapConfig.PARALLELISM_VALUE.key, parallelism.toString) + properties.setProperty(HoodieBootstrapConfig.MODE_SELECTOR_CLASS_NAME.key, selectorClass) + properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, rowKeyField) + properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, partitionPathField) + + val fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration) + + val cfg = new BootstrapExecutorUtils.Config() + cfg.setTableName(tableName.get.asInstanceOf[String]) + cfg.setTableType(tableType) + cfg.setBasePath(basePath) + cfg.setBaseFileFormat(baseFileFormat) + cfg.setBootstrapIndexClass(bootstrapIndexClass) + cfg.setSchemaProviderClass(schemaProviderClass) + cfg.setPayloadClass(payloadClass) + cfg.setEnableHiveSync(enableHiveSync) + cfg.setBootstrapOverwrite(bootstrapOverwrite) + + try { + new BootstrapExecutorUtils(cfg, jsc, fs, jsc.hadoopConfiguration, properties).execute() + } catch { + case e: Exception => + logWarning(s"Run bootstrap failed due to", e) + Seq(Row(-1)) + } + Seq(Row(0)) + } + + override def build = new RunBootstrapProcedure() +} + +object RunBootstrapProcedure { + val NAME = "run_bootstrap" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RunBootstrapProcedure + } +} + + + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCleanProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCleanProcedure.scala new file mode 100644 index 0000000000000..36580176d0f76 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCleanProcedure.scala @@ -0,0 +1,101 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline +import org.apache.hudi.common.util.JsonUtils +import org.apache.hudi.config.HoodieCleanConfig +import org.apache.hudi.table.action.clean.CleaningTriggerStrategy +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class RunCleanProcedure extends BaseProcedure with ProcedureBuilder with Logging { + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "skip_locking", DataTypes.BooleanType, false), + ProcedureParameter.optional(2, "schedule_in_line", DataTypes.BooleanType, true), + ProcedureParameter.optional(3, "clean_policy", DataTypes.StringType, HoodieCleanConfig.CLEANER_POLICY.defaultValue()), + ProcedureParameter.optional(4, "retain_commits", DataTypes.IntegerType, HoodieCleanConfig.CLEANER_COMMITS_RETAINED.defaultValue().toInt), + ProcedureParameter.optional(5, "hours_retained", DataTypes.IntegerType, HoodieCleanConfig.CLEANER_HOURS_RETAINED.defaultValue().toInt), + ProcedureParameter.optional(6, "file_versions_retained", DataTypes.IntegerType, HoodieCleanConfig.CLEANER_FILE_VERSIONS_RETAINED.defaultValue().toInt), + ProcedureParameter.optional(7, "trigger_strategy", DataTypes.StringType, HoodieCleanConfig.CLEAN_TRIGGER_STRATEGY.defaultValue()), + ProcedureParameter.optional(8, "trigger_max_commits", DataTypes.IntegerType, HoodieCleanConfig.CLEAN_MAX_COMMITS.defaultValue().toInt) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("start_clean_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("time_taken_in_millis", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_deleted", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("earliest_commit_to_retain", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("bootstrap_part_metadata", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("version", DataTypes.IntegerType, nullable = true, Metadata.empty) + )) + + override def build: Procedure = new RunCleanProcedure + + /** + * Returns the input parameters of this procedure. + */ + override def parameters: Array[ProcedureParameter] = PARAMETERS + + /** + * Returns the type of rows produced by this procedure. 
+ */ + override def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val skipLocking = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean] + val scheduleInLine = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[Boolean] + val basePath = getBasePath(tableName, Option.empty) + val cleanInstantTime = HoodieActiveTimeline.createNewInstantTime() + val props: Map[String, String] = Map( + HoodieCleanConfig.CLEANER_POLICY.key() -> getArgValueOrDefault(args, PARAMETERS(3)).get.toString, + HoodieCleanConfig.CLEANER_COMMITS_RETAINED.key() -> getArgValueOrDefault(args, PARAMETERS(4)).get.toString, + HoodieCleanConfig.CLEANER_HOURS_RETAINED.key() -> getArgValueOrDefault(args, PARAMETERS(5)).get.toString, + HoodieCleanConfig.CLEANER_FILE_VERSIONS_RETAINED.key() -> getArgValueOrDefault(args, PARAMETERS(6)).get.toString, + HoodieCleanConfig.CLEAN_TRIGGER_STRATEGY.key() -> getArgValueOrDefault(args, PARAMETERS(7)).get.toString, + HoodieCleanConfig.CLEAN_MAX_COMMITS.key() -> getArgValueOrDefault(args, PARAMETERS(8)).get.toString + ) + val client = HoodieCLIUtils.createHoodieClientFromPath(sparkSession, basePath, props) + val hoodieCleanMeta = client.clean(cleanInstantTime, scheduleInLine, skipLocking) + + if (hoodieCleanMeta == null) Seq.empty + else Seq(Row(hoodieCleanMeta.getStartCleanTime, + hoodieCleanMeta.getTimeTakenInMillis, + hoodieCleanMeta.getTotalFilesDeleted, + hoodieCleanMeta.getEarliestCommitToRetain, + JsonUtils.getObjectMapper.writeValueAsString(hoodieCleanMeta.getBootstrapPartitionMetadata), + hoodieCleanMeta.getVersion)) + } +} + +object RunCleanProcedure { + val NAME = "run_clean" + + def builder : Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RunCleanProcedure + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunClusteringProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunClusteringProcedure.scala new file mode 100644 index 0000000000000..fa5bbb33bbf95 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunClusteringProcedure.scala @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
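
For context, a usage sketch for the run_clean procedure defined above. This is not part of the patch; it assumes a SparkSession with the Hudi Spark SQL extension enabled and an existing Hudi table named hudi_tbl (both illustrative), and the optional arguments simply mirror the PARAMETERS array in RunCleanProcedure.

import org.apache.spark.sql.SparkSession

object RunCleanUsageSketch {
  def main(args: Array[String]): Unit = {
    // Illustrative session setup; the extension class is what enables the CALL
    // procedure syntax used below.
    val spark = SparkSession.builder()
      .appName("run-clean-sketch")
      .master("local[2]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
      .getOrCreate()

    // Only `table` is required; the other arguments override the clean config
    // defaults declared in RunCleanProcedure.PARAMETERS.
    spark.sql(
      """CALL run_clean(
        |  table => 'hudi_tbl',
        |  clean_policy => 'KEEP_LATEST_COMMITS',
        |  retain_commits => 10
        |)""".stripMargin).show(false)

    spark.stop()
  }
}
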
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_SNAPSHOT_OPT_VAL} +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.ValidationUtils.checkArgument +import org.apache.hudi.common.util.{ClusteringUtils, Option => HOption} +import org.apache.hudi.config.HoodieClusteringConfig +import org.apache.hudi.exception.HoodieClusteringException +import org.apache.hudi.{AvroConversionUtils, HoodieCLIUtils, HoodieFileIndex} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.HoodieCatalystExpressionUtils.{resolveExpr, splitPartitionAndDataPredicates} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.PredicateHelper +import org.apache.spark.sql.execution.datasources.FileStatusCache +import org.apache.spark.sql.types._ + +import java.util.function.Supplier +import scala.collection.JavaConverters._ + +class RunClusteringProcedure extends BaseProcedure + with ProcedureBuilder + with PredicateHelper + with Logging { + + /** + * OPTIMIZE table_name|table_path [WHERE predicate] + * [ORDER BY (col_name1 [, ...] ) ] + */ + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.optional(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "path", DataTypes.StringType, None), + ProcedureParameter.optional(2, "predicate", DataTypes.StringType, None), + ProcedureParameter.optional(3, "order", DataTypes.StringType, None), + ProcedureParameter.optional(4, "show_involved_partition", DataTypes.BooleanType, false) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("timestamp", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("input_group_size", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("state", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("involved_partitions", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val tablePath = getArgValueOrDefault(args, PARAMETERS(1)) + val predicate = getArgValueOrDefault(args, PARAMETERS(2)) + val orderColumns = getArgValueOrDefault(args, PARAMETERS(3)) + val showInvolvedPartitions = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[Boolean] + + val basePath: String = getBasePath(tableName, tablePath) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + var conf: Map[String, String] = Map.empty + predicate match { + case Some(p) => + val prunedPartitions = prunePartition(metaClient, p.asInstanceOf[String]) + conf = conf ++ Map( + HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key() -> "SELECTED_PARTITIONS", + HoodieClusteringConfig.PARTITION_SELECTED.key() -> prunedPartitions + ) + logInfo(s"Partition predicates: $p, partition selected: $prunedPartitions") + case _ => + logInfo("No partition predicates") + } + + // Construct sort column info + orderColumns match { + case Some(o) => + validateOrderColumns(o.asInstanceOf[String], metaClient) + conf = conf ++ Map( + 
HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key() -> o.asInstanceOf[String] + ) + logInfo(s"Order columns: $o") + case _ => + logInfo("No order columns") + } + + // Get all pending clustering instants + var pendingClustering = ClusteringUtils.getAllPendingClusteringPlans(metaClient) + .iterator().asScala.map(_.getLeft.getTimestamp).toSeq.sortBy(f => f) + logInfo(s"Pending clustering instants: ${pendingClustering.mkString(",")}") + + var client: SparkRDDWriteClient[_] = null + try { + client = HoodieCLIUtils.createHoodieClientFromPath(sparkSession, basePath, conf) + val instantTime = HoodieActiveTimeline.createNewInstantTime + if (client.scheduleClusteringAtInstant(instantTime, HOption.empty())) { + pendingClustering ++= Seq(instantTime) + } + logInfo(s"Clustering instants to run: ${pendingClustering.mkString(",")}.") + + val startTs = System.currentTimeMillis() + pendingClustering.foreach(client.cluster(_, true)) + logInfo(s"Finish clustering all the instants: ${pendingClustering.mkString(",")}," + + s" time cost: ${System.currentTimeMillis() - startTs}ms.") + + val clusteringInstants = metaClient.reloadActiveTimeline().getInstants.iterator().asScala + .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION && pendingClustering.contains(p.getTimestamp)) + .toSeq + .sortBy(f => f.getTimestamp) + .reverse + + val clusteringPlans = clusteringInstants.map(instant => + ClusteringUtils.getClusteringPlan(metaClient, instant) + ) + + if (showInvolvedPartitions) { + clusteringPlans.map { p => + Row(p.get().getLeft.getTimestamp, p.get().getRight.getInputGroups.size(), + p.get().getLeft.getState.name(), HoodieCLIUtils.extractPartitions(p.get().getRight.getInputGroups.asScala)) + } + } else { + clusteringPlans.map { p => + Row(p.get().getLeft.getTimestamp, p.get().getRight.getInputGroups.size(), p.get().getLeft.getState.name(), "*") + } + } + } finally { + if (client != null) { + client.close() + } + } + } + + override def build: Procedure = new RunClusteringProcedure() + + def prunePartition(metaClient: HoodieTableMetaClient, predicate: String): String = { + val options = Map(QUERY_TYPE.key() -> QUERY_TYPE_SNAPSHOT_OPT_VAL, "path" -> metaClient.getBasePath) + val hoodieFileIndex = HoodieFileIndex(sparkSession, metaClient, None, options, + FileStatusCache.getOrCreate(sparkSession)) + + // Resolve partition predicates + val schemaResolver = new TableSchemaResolver(metaClient) + val tableSchema = AvroConversionUtils.convertAvroSchemaToStructType(schemaResolver.getTableAvroSchema) + val condition = resolveExpr(sparkSession, predicate, tableSchema) + val partitionColumns = metaClient.getTableConfig.getPartitionFields.orElse(Array[String]()) + val (partitionPredicates, dataPredicates) = splitPartitionAndDataPredicates( + sparkSession, splitConjunctivePredicates(condition).toArray, partitionColumns) + checkArgument(dataPredicates.isEmpty, "Only partition predicates are allowed") + + // Get all partitions and prune partition by predicates + val prunedPartitions = hoodieFileIndex.getPartitionPaths(partitionPredicates) + prunedPartitions.map(partitionPath => partitionPath.getPath).toSet.mkString(",") + } + + private def validateOrderColumns(orderColumns: String, metaClient: HoodieTableMetaClient): Unit = { + if (orderColumns == null) { + throw new HoodieClusteringException("Order columns is null") + } + + val tableSchemaResolver = new TableSchemaResolver(metaClient) + val fields = tableSchemaResolver.getTableAvroSchema(false) + .getFields.asScala.map(_.name().toLowerCase) + 
orderColumns.split(",").foreach(col => { + if (!fields.contains(col.toLowerCase)) { + throw new HoodieClusteringException("Order column not exist:" + col) + } + }) + } + +} + +object RunClusteringProcedure { + val NAME = "run_clustering" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RunClusteringProcedure + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCompactionProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCompactionProcedure.scala new file mode 100644 index 0000000000000..3e5a7e29e4022 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCompactionProcedure.scala @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.model.HoodieCommitMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline} +import org.apache.hudi.common.util.{CompactionUtils, HoodieTimer, Option => HOption} +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.{HoodieCLIUtils, SparkAdapterSupport} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +import java.util.function.Supplier + +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + +class RunCompactionProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport with Logging { + + /** + * operation = (RUN | SCHEDULE) COMPACTION ON tableIdentifier (AT instantTimestamp = INTEGER_VALUE)? + * operation = (RUN | SCHEDULE) COMPACTION ON path = STRING (AT instantTimestamp = INTEGER_VALUE)? 
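
A usage sketch for the run_clustering procedure above (not part of the patch). It reuses the session setup assumed in the run_clean sketch; the table name, the partition column dt, and the sort column ts are illustrative. Note that only partition-column predicates are accepted, since prunePartition rejects data-column predicates via checkArgument.

// Cluster only the partitions selected by the predicate, sorting file groups
// by ts, and report the partitions each clustering plan touched.
spark.sql(
  """CALL run_clustering(
    |  table => 'hudi_tbl',
    |  predicate => "dt = '2022-01-01'",
    |  order => 'ts',
    |  show_involved_partition => true
    |)""".stripMargin).show(false)
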
+ */ + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "op", DataTypes.StringType, None), + ProcedureParameter.optional(1, "table", DataTypes.StringType, None), + ProcedureParameter.optional(2, "path", DataTypes.StringType, None), + ProcedureParameter.optional(3, "timestamp", DataTypes.LongType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("timestamp", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("operation_size", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("state", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val operation = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String].toLowerCase + val tableName = getArgValueOrDefault(args, PARAMETERS(1)) + val tablePath = getArgValueOrDefault(args, PARAMETERS(2)) + val instantTimestamp = getArgValueOrDefault(args, PARAMETERS(3)) + + val basePath = getBasePath(tableName, tablePath) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val client = HoodieCLIUtils.createHoodieClientFromPath(sparkSession, basePath, Map.empty) + + var willCompactionInstants: Seq[String] = Seq.empty + operation match { + case "schedule" => + val instantTime = instantTimestamp.map(_.toString).getOrElse(HoodieActiveTimeline.createNewInstantTime) + if (client.scheduleCompactionAtInstant(instantTime, HOption.empty[java.util.Map[String, String]])) { + willCompactionInstants = Seq(instantTime) + } + case "run" => + // Do compaction + val timeLine = metaClient.getActiveTimeline + val pendingCompactionInstants = timeLine.getWriteTimeline.getInstants.iterator().asScala + .filter(p => p.getAction == HoodieTimeline.COMPACTION_ACTION) + .map(_.getTimestamp) + .toSeq.sortBy(f => f) + willCompactionInstants = if (instantTimestamp.isEmpty) { + if (pendingCompactionInstants.nonEmpty) { + pendingCompactionInstants + } else { // If there are no pending compaction, schedule to generate one. + // CompactionHoodiePathCommand will return instanceTime for SCHEDULE. 
+ val instantTime = HoodieActiveTimeline.createNewInstantTime() + if (client.scheduleCompactionAtInstant(instantTime, HOption.empty[java.util.Map[String, String]])) { + Seq(instantTime) + } else { + Seq.empty + } + } + } else { + // Check if the compaction timestamp has exists in the pending compaction + if (pendingCompactionInstants.contains(instantTimestamp.get.toString)) { + Seq(instantTimestamp.get.toString) + } else { + throw new IllegalArgumentException(s"Compaction instant: ${instantTimestamp.get} is not found in " + + s"$basePath, Available pending compaction instants are: ${pendingCompactionInstants.mkString(",")} ") + } + } + + if (willCompactionInstants.isEmpty) { + logInfo(s"No need to compaction on $basePath") + } else { + logInfo(s"Run compaction at instants: [${willCompactionInstants.mkString(",")}] on $basePath") + val timer = new HoodieTimer + timer.startTimer() + willCompactionInstants.foreach { compactionInstant => + val writeResponse = client.compact(compactionInstant) + handleResponse(writeResponse.getCommitMetadata.get()) + client.commitCompaction(compactionInstant, writeResponse.getCommitMetadata.get(), HOption.empty()) + } + logInfo(s"Finish Run compaction at instants: [${willCompactionInstants.mkString(",")}]," + + s" spend: ${timer.endTimer()}ms") + } + case _ => throw new UnsupportedOperationException(s"Unsupported compaction operation: $operation") + } + + val compactionInstants = metaClient.reloadActiveTimeline().getInstants.iterator().asScala + .filter(instant => willCompactionInstants.contains(instant.getTimestamp)) + .toSeq + .sortBy(p => p.getTimestamp) + .reverse + + compactionInstants.map(instant => + (instant, CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp)) + ).map { case (instant, plan) => + Row(instant.getTimestamp, plan.getOperations.size(), instant.getState.name()) + } + } + + private def handleResponse(metadata: HoodieCommitMetadata): Unit = { + // Handle error + val writeStats = metadata.getPartitionToWriteStats.entrySet().flatMap(e => e.getValue).toList + val errorsCount = writeStats.map(state => state.getTotalWriteErrors).sum + if (errorsCount > 0) { + throw new HoodieException(s" Found $errorsCount when writing record") + } + } + + override def build: Procedure = new RunCompactionProcedure() + +} + +object RunCompactionProcedure { + val NAME = "run_compaction" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RunCompactionProcedure + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowArchivedCommitsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowArchivedCommitsProcedure.scala new file mode 100644 index 0000000000000..a3c3ece2932b8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowArchivedCommitsProcedure.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
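
A usage sketch for the run_compaction procedure above (not part of the patch; session setup as in the run_clean sketch, table name illustrative, and a merge-on-read table is assumed since compaction only applies there).

// Schedule a compaction plan, then execute all pending plans; `op` accepts
// 'schedule' or 'run', matching the two branches handled in call().
spark.sql("CALL run_compaction(op => 'schedule', table => 'hudi_mor_tbl')").show(false)
spark.sql("CALL run_compaction(op => 'run', table => 'hudi_mor_tbl')").show(false)
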
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.model.HoodieCommitMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieDefaultTimeline, HoodieInstant} +import org.apache.hudi.common.util.StringUtils +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.time.ZonedDateTime +import java.util +import java.util.function.Supplier +import java.util.{Collections, Date} +import scala.collection.JavaConverters._ + +class ShowArchivedCommitsProcedure(includeExtraMetadata: Boolean) extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.optional(2, "start_ts", DataTypes.StringType, ""), + ProcedureParameter.optional(3, "end_ts", DataTypes.StringType, "") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_added", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_updated", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_partitions_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_update_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + private val METADATA_OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_id", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("previous_commit", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("num_writes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_inserts", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_deletes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_update_writes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_log_blocks", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_corrupt_log_blocks", DataTypes.LongType, nullable = true, Metadata.empty), + 
StructField("total_rollback_blocks", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_log_records", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_updated_records_compacted", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = if (includeExtraMetadata) METADATA_OUTPUT_TYPE else OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Int] + var startTs = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + var endTs = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table) + val basePath = hoodieCatalogTable.tableLocation + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + // start time for commits, default: now - 10 days + // end time for commits, default: now - 1 day + if (StringUtils.isNullOrEmpty(startTs)) startTs = getTimeDaysAgo(10) + if (StringUtils.isNullOrEmpty(endTs)) endTs = getTimeDaysAgo(1) + + val archivedTimeline = metaClient.getArchivedTimeline + try { + archivedTimeline.loadInstantDetailsInMemory(startTs, endTs) + val timelineRange = archivedTimeline.findInstantsInRange(startTs, endTs) + if (includeExtraMetadata) { + getCommitsWithMetadata(timelineRange, limit) + } else { + getCommits(timelineRange, limit) + } + } finally { + // clear the instant details from memory after printing to reduce usage + archivedTimeline.clearInstantDetailsFromMemory(startTs, endTs) + } + } + + override def build: Procedure = new ShowArchivedCommitsProcedure(includeExtraMetadata) + + private def getCommitsWithMetadata(timeline: HoodieDefaultTimeline, + limit: Int): Seq[Row] = { + import scala.collection.JavaConversions._ + + val (rows: util.ArrayList[Row], newCommits: util.ArrayList[HoodieInstant]) = getSortCommits(timeline) + + for (i <- 0 until newCommits.size) { + val commit = newCommits.get(i) + val commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get, classOf[HoodieCommitMetadata]) + for (partitionWriteStat <- commitMetadata.getPartitionToWriteStats.entrySet) { + for (hoodieWriteStat <- partitionWriteStat.getValue) { + rows.add(Row( + commit.getTimestamp, commit.getAction, hoodieWriteStat.getPartitionPath, + hoodieWriteStat.getFileId, hoodieWriteStat.getPrevCommit, hoodieWriteStat.getNumWrites, + hoodieWriteStat.getNumInserts, hoodieWriteStat.getNumDeletes, hoodieWriteStat.getNumUpdateWrites, + hoodieWriteStat.getTotalWriteErrors, hoodieWriteStat.getTotalLogBlocks, hoodieWriteStat.getTotalCorruptLogBlock, + hoodieWriteStat.getTotalRollbackBlocks, hoodieWriteStat.getTotalLogRecords, + hoodieWriteStat.getTotalUpdatedRecordsCompacted, hoodieWriteStat.getTotalWriteBytes)) + } + } + } + + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + private def getSortCommits(timeline: HoodieDefaultTimeline): (util.ArrayList[Row], util.ArrayList[HoodieInstant]) = { + val rows = new util.ArrayList[Row] + // timeline can be read from multiple files. 
So sort is needed instead of reversing the collection + val commits: util.List[HoodieInstant] = timeline.getCommitsTimeline.filterCompletedInstants + .getInstants.toArray().map(instant => instant.asInstanceOf[HoodieInstant]).toList.asJava + val newCommits = new util.ArrayList[HoodieInstant](commits) + Collections.sort(newCommits, HoodieInstant.COMPARATOR.reversed) + (rows, newCommits) + } + + def getCommits(timeline: HoodieDefaultTimeline, + limit: Int): Seq[Row] = { + val (rows: util.ArrayList[Row], newCommits: util.ArrayList[HoodieInstant]) = getSortCommits(timeline) + + for (i <- 0 until newCommits.size) { + val commit = newCommits.get(i) + val commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get, classOf[HoodieCommitMetadata]) + rows.add(Row(commit.getTimestamp, commitMetadata.fetchTotalBytesWritten, commitMetadata.fetchTotalFilesInsert, + commitMetadata.fetchTotalFilesUpdated, commitMetadata.fetchTotalPartitionsWritten, + commitMetadata.fetchTotalRecordsWritten, commitMetadata.fetchTotalUpdateRecordsWritten, + commitMetadata.fetchTotalWriteErrors)) + } + + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + def getTimeDaysAgo(numberOfDays: Int): String = { + val date = Date.from(ZonedDateTime.now.minusDays(numberOfDays).toInstant) + HoodieActiveTimeline.formatDate(date) + } +} + +object ShowArchivedCommitsProcedure { + val NAME = "show_archived_commits" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowArchivedCommitsProcedure(false) + } +} + +object ShowArchivedCommitsMetadataProcedure { + val NAME = "show_archived_commits_metadata" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowArchivedCommitsProcedure(true) + } +} + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowBootstrapMappingProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowBootstrapMappingProcedure.scala new file mode 100644 index 0000000000000..e9c5cdaf7533f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowBootstrapMappingProcedure.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
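
A usage sketch for the two procedures registered above (not part of the patch; session setup as before). The start_ts/end_ts values are placeholders written in Hudi's compact instant-time format, matching the HoodieActiveTimeline.formatDate defaults computed in call() when they are omitted.

// Summary view over the archived timeline; without bounds the defaults cover
// roughly now-10d .. now-1d.
spark.sql(
  """CALL show_archived_commits(
    |  table => 'hudi_tbl',
    |  start_ts => '20220101000000',
    |  end_ts => '20220131000000',
    |  limit => 20
    |)""".stripMargin).show(false)

// Per-file write statistics for the default window.
spark.sql("CALL show_archived_commits_metadata(table => 'hudi_tbl', limit => 20)").show(false)
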
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.bootstrap.index.BootstrapIndex +import org.apache.hudi.common.model.{BootstrapFileMapping, HoodieFileGroupId} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.function.Supplier +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + +class ShowBootstrapMappingProcedure extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "partition_path", DataTypes.StringType, ""), + ProcedureParameter.optional(2, "file_ids", DataTypes.StringType, ""), + ProcedureParameter.optional(3, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.optional(4, "sort_by", DataTypes.StringType, "partition"), + ProcedureParameter.optional(5, "desc", DataTypes.BooleanType, false) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_id", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("source_base_path", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("source_partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("source_file", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val partitionPath = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val fileIds = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[Int] + val sortBy = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String] + val desc = getArgValueOrDefault(args, PARAMETERS(5)).get.asInstanceOf[Boolean] + + val basePath: String = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + if (partitionPath.isEmpty && fileIds.nonEmpty) throw new IllegalStateException("PartitionPath is mandatory when passing fileIds.") + + val indexReader = createBootstrapIndexReader(metaClient) + val indexedPartitions = indexReader.getIndexedPartitionPaths + + if (partitionPath.nonEmpty && !indexedPartitions.contains(partitionPath)) new HoodieException(partitionPath + " is not an valid indexed partition") + + val mappingList: util.ArrayList[BootstrapFileMapping] = new util.ArrayList[BootstrapFileMapping] + if (fileIds.nonEmpty) { + val fileGroupIds = fileIds.split(",").toList.map((fileId: String) => new HoodieFileGroupId(partitionPath, fileId)).asJava + mappingList.addAll(indexReader.getSourceFileMappingForFileIds(fileGroupIds).values) + } else if (partitionPath.nonEmpty) mappingList.addAll(indexReader.getSourceFileMappingForPartition(partitionPath)) + else { + for (part <- indexedPartitions) { + mappingList.addAll(indexReader.getSourceFileMappingForPartition(part)) + } + } + + val rows: java.util.List[Row] = mappingList + .map(mapping => 
Row(mapping.getPartitionPath, mapping.getFileId, mapping.getBootstrapBasePath, + mapping.getBootstrapPartitionPath, mapping.getBootstrapFileStatus.getPath.getUri)).toList + + val df = spark.createDataFrame(rows, OUTPUT_TYPE) + + if (desc) { + df.orderBy(df(sortBy).desc).limit(limit).collect() + } else { + df.orderBy(df(sortBy).asc).limit(limit).collect() + } + } + + private def createBootstrapIndexReader(metaClient: HoodieTableMetaClient) = { + val index = BootstrapIndex.getBootstrapIndex(metaClient) + if (!index.useIndex) throw new HoodieException("This is not a bootstrapped Hudi table. Don't have any index info") + index.createReader + } + + override def build: Procedure = new ShowBootstrapMappingProcedure() +} + +object ShowBootstrapMappingProcedure { + val NAME = "show_bootstrap_mapping" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowBootstrapMappingProcedure + } +} + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowBootstrapPartitionsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowBootstrapPartitionsProcedure.scala new file mode 100644 index 0000000000000..b3bebd7f22416 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowBootstrapPartitionsProcedure.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
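
A usage sketch for show_bootstrap_mapping above (not part of the patch). It assumes a table that was created through bootstrap, since createBootstrapIndexReader throws for non-bootstrapped tables; the table name and partition value are illustrative.

// List source-file mappings for one indexed partition, sorted by file_id.
spark.sql(
  """CALL show_bootstrap_mapping(
    |  table => 'hudi_bootstrap_tbl',
    |  partition_path => 'dt=2022-01-01',
    |  sort_by => 'file_id',
    |  limit => 20
    |)""".stripMargin).show(false)
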
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.bootstrap.index.BootstrapIndex +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class ShowBootstrapPartitionsProcedure extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("indexed_partitions", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + + val basePath: String = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + val indexReader = createBootstrapIndexReader(metaClient) + val indexedPartitions = indexReader.getIndexedPartitionPaths + + indexedPartitions.stream().toArray.map(r => Row(r)).toList + } + + private def createBootstrapIndexReader(metaClient: HoodieTableMetaClient) = { + val index = BootstrapIndex.getBootstrapIndex(metaClient) + if (!index.useIndex) throw new HoodieException("This is not a bootstrapped Hudi table. Don't have any index info") + index.createReader + } + + override def build = new ShowBootstrapPartitionsProcedure() +} + +object ShowBootstrapPartitionsProcedure { + val NAME = "show_bootstrap_partitions" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowBootstrapPartitionsProcedure + } +} + + + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowClusteringProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowClusteringProcedure.scala new file mode 100644 index 0000000000000..092610119e606 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowClusteringProcedure.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
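
A usage sketch for show_bootstrap_partitions above (not part of the patch; same bootstrapped-table assumption as the mapping sketch).

// Print every partition recorded in the bootstrap index.
spark.sql("CALL show_bootstrap_partitions(table => 'hudi_bootstrap_tbl')").show(false)
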
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.{HoodieCLIUtils, SparkAdapterSupport} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.util.ClusteringUtils +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +import java.util.function.Supplier + +import scala.collection.JavaConverters._ + +class ShowClusteringProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.optional(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "path", DataTypes.StringType, None), + ProcedureParameter.optional(2, "limit", DataTypes.IntegerType, 20), + ProcedureParameter.optional(3, "show_involved_partition", DataTypes.BooleanType, false) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("timestamp", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("input_group_size", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("state", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("involved_partitions", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val tablePath = getArgValueOrDefault(args, PARAMETERS(1)) + val limit = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[Int] + val showInvolvedPartitions = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[Boolean] + + val basePath: String = getBasePath(tableName, tablePath) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val clusteringInstants = metaClient.getActiveTimeline.getInstants.iterator().asScala + .filter(p => p.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION) + .toSeq + .sortBy(f => f.getTimestamp) + .reverse + .take(limit) + + val clusteringPlans = clusteringInstants.map(instant => + ClusteringUtils.getClusteringPlan(metaClient, instant) + ) + + if (showInvolvedPartitions) { + clusteringPlans.map { p => + Row(p.get().getLeft.getTimestamp, p.get().getRight.getInputGroups.size(), + p.get().getLeft.getState.name(), HoodieCLIUtils.extractPartitions(p.get().getRight.getInputGroups.asScala)) + } + } else { + clusteringPlans.map { p => + Row(p.get().getLeft.getTimestamp, p.get().getRight.getInputGroups.size(), + p.get().getLeft.getState.name(), "*") + } + } + } + + override def build: Procedure = new ShowClusteringProcedure() +} + +object ShowClusteringProcedure { + val NAME = "show_clustering" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowClusteringProcedure + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitFilesProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitFilesProcedure.scala new file mode 100644 index 0000000000000..53fcd072c3b7e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitFilesProcedure.scala @@ 
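
A usage sketch for show_clustering above (not part of the patch; session setup and table name as in the earlier sketches).

// Show the most recent clustering (replacecommit) instants and, optionally,
// the partitions each plan touches.
spark.sql(
  """CALL show_clustering(
    |  table => 'hudi_tbl',
    |  limit => 10,
    |  show_involved_partition => true
    |)""".stripMargin).show(false)
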
-0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieReplaceCommitMetadata, HoodieWriteStat} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.List +import java.util.function.Supplier +import scala.collection.JavaConversions._ + +class ShowCommitFilesProcedure() extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.required(2, "instant_time", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("partition_path", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_id", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("previous_commit", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_records_updated", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("file_size", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Int] + val instantTime = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table) + val basePath = hoodieCatalogTable.tableLocation + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val activeTimeline = metaClient.getActiveTimeline + val timeline = 
activeTimeline.getCommitsTimeline.filterCompletedInstants + val hoodieInstantOption = getCommitForInstant(timeline, instantTime) + val commitMetadataOptional = getHoodieCommitMetadata(timeline, hoodieInstantOption) + + if (commitMetadataOptional.isEmpty) { + throw new HoodieException(s"Commit $instantTime not found in Commits $timeline.") + } + + val meta = commitMetadataOptional.get + val rows = new util.ArrayList[Row] + for (entry <- meta.getPartitionToWriteStats.entrySet) { + val action: String = hoodieInstantOption.get.getAction + val path: String = entry.getKey + val stats: List[HoodieWriteStat] = entry.getValue + for (stat <- stats) { + rows.add(Row(action, path, stat.getFileId, stat.getPrevCommit, stat.getNumUpdateWrites, + stat.getNumWrites, stat.getTotalWriteBytes, stat.getTotalWriteErrors, stat.getFileSizeInBytes)) + } + } + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowCommitFilesProcedure() + + private def getCommitForInstant(timeline: HoodieTimeline, instantTime: String): Option[HoodieInstant] = { + val instants: util.List[HoodieInstant] = util.Arrays.asList( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.REPLACE_COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantTime)) + + val hoodieInstant: Option[HoodieInstant] = instants.find((i: HoodieInstant) => timeline.containsInstant(i)) + hoodieInstant + } + + private def getHoodieCommitMetadata(timeline: HoodieTimeline, hoodieInstant: Option[HoodieInstant]): Option[HoodieCommitMetadata] = { + if (hoodieInstant.isDefined) { + if (hoodieInstant.get.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION) { + Option(HoodieReplaceCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get).get, + classOf[HoodieReplaceCommitMetadata])) + } else { + Option(HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get).get, + classOf[HoodieCommitMetadata])) + } + } else { + Option.empty + } + } +} + +object ShowCommitFilesProcedure { + val NAME = "show_commit_files" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCommitFilesProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitPartitionsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitPartitionsProcedure.scala new file mode 100644 index 0000000000000..0a3945aee8a64 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitPartitionsProcedure.scala @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
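
A usage sketch for show_commit_files above (not part of the patch). The instant time is a placeholder; it must be a completed commit, replacecommit, or deltacommit on the table, otherwise call() raises a HoodieException.

// Per-file write statistics for a single instant.
spark.sql(
  """CALL show_commit_files(
    |  table => 'hudi_tbl',
    |  instant_time => '20220307091628793'
    |)""".stripMargin).show(false)
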
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieReplaceCommitMetadata, HoodieWriteStat} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.List +import java.util.function.Supplier +import scala.collection.JavaConversions._ + +class ShowCommitPartitionsProcedure() extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.required(2, "instant_time", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("partition_path", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_files_added", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_updated", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_records_inserted", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_records_updated", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Int] + val instantTime = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table) + val basePath = hoodieCatalogTable.tableLocation + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val activeTimeline = metaClient.getActiveTimeline + val timeline = activeTimeline.getCommitsTimeline.filterCompletedInstants + val hoodieInstantOption = getCommitForInstant(timeline, instantTime) + val commitMetadataOptional = getHoodieCommitMetadata(timeline, hoodieInstantOption) + + if (commitMetadataOptional.isEmpty) { + throw new HoodieException(s"Commit $instantTime not found in Commits $timeline.") + } + + val meta = commitMetadataOptional.get + val rows = new util.ArrayList[Row] + for (entry <- meta.getPartitionToWriteStats.entrySet) { + val action: String = hoodieInstantOption.get.getAction + val path: String = entry.getKey + val stats: List[HoodieWriteStat] = entry.getValue + var totalFilesAdded: Long = 0 + var totalFilesUpdated: Long = 0 + var totalRecordsUpdated: Long = 0 + var totalRecordsInserted: Long = 0 + var totalBytesWritten: 
Long = 0 + var totalWriteErrors: Long = 0 + for (stat <- stats) { + if (stat.getPrevCommit == HoodieWriteStat.NULL_COMMIT) { + totalFilesAdded += 1 + } + else { + totalFilesUpdated += 1 + totalRecordsUpdated += stat.getNumUpdateWrites + } + totalRecordsInserted += stat.getNumInserts + totalBytesWritten += stat.getTotalWriteBytes + totalWriteErrors += stat.getTotalWriteErrors + } + rows.add(Row(action, path, totalFilesAdded, totalFilesUpdated, totalRecordsInserted, totalRecordsUpdated, + totalBytesWritten, totalWriteErrors)) + } + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowCommitPartitionsProcedure() + + private def getCommitForInstant(timeline: HoodieTimeline, instantTime: String): Option[HoodieInstant] = { + val instants: util.List[HoodieInstant] = util.Arrays.asList( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.REPLACE_COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantTime)) + + val hoodieInstant: Option[HoodieInstant] = instants.find((i: HoodieInstant) => timeline.containsInstant(i)) + hoodieInstant + } + + private def getHoodieCommitMetadata(timeline: HoodieTimeline, hoodieInstant: Option[HoodieInstant]): Option[HoodieCommitMetadata] = { + if (hoodieInstant.isDefined) { + if (hoodieInstant.get.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION) { + Option(HoodieReplaceCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get).get, + classOf[HoodieReplaceCommitMetadata])) + } else { + Option(HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get).get, + classOf[HoodieCommitMetadata])) + } + } else { + Option.empty + } + } +} + +object ShowCommitPartitionsProcedure { + val NAME = "show_commit_partitions" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCommitPartitionsProcedure() + } +} + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitWriteStatsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitWriteStatsProcedure.scala new file mode 100644 index 0000000000000..4e3609b533465 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitWriteStatsProcedure.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
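
A usage sketch for show_commit_partitions above (not part of the patch; same placeholder instant as the show_commit_files sketch).

// Aggregated per-partition statistics for a single instant.
spark.sql("CALL show_commit_partitions(table => 'hudi_tbl', instant_time => '20220307091628793')").show(false)
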
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieReplaceCommitMetadata} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.function.Supplier +import scala.collection.JavaConversions._ + +class ShowCommitWriteStatsProcedure() extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.required(2, "instant_time", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("avg_record_size", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Int] + val instantTime = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table) + val basePath = hoodieCatalogTable.tableLocation + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val activeTimeline = metaClient.getActiveTimeline + val timeline = activeTimeline.getCommitsTimeline.filterCompletedInstants + val hoodieInstantOption = getCommitForInstant(timeline, instantTime) + val commitMetadataOptional = getHoodieCommitMetadata(timeline, hoodieInstantOption) + + if (commitMetadataOptional.isEmpty) { + throw new HoodieException(s"Commit $instantTime not found in Commits $timeline.") + } + + val meta = commitMetadataOptional.get + + val action: String = hoodieInstantOption.get.getAction + val recordsWritten = meta.fetchTotalRecordsWritten + val bytesWritten = meta.fetchTotalBytesWritten + val avgRecSize = Math.ceil((1.0 * bytesWritten) / recordsWritten).toLong + val rows = new util.ArrayList[Row] + rows.add(Row(action, bytesWritten, recordsWritten, avgRecSize)) + + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowCommitWriteStatsProcedure() + + private def getCommitForInstant(timeline: HoodieTimeline, instantTime: String): Option[HoodieInstant] = { + val instants: util.List[HoodieInstant] = util.Arrays.asList( + new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.REPLACE_COMMIT_ACTION, instantTime), + new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantTime)) + + val 
hoodieInstant: Option[HoodieInstant] = instants.find((i: HoodieInstant) => timeline.containsInstant(i)) + hoodieInstant + } + + private def getHoodieCommitMetadata(timeline: HoodieTimeline, hoodieInstant: Option[HoodieInstant]): Option[HoodieCommitMetadata] = { + if (hoodieInstant.isDefined) { + if (hoodieInstant.get.getAction == HoodieTimeline.REPLACE_COMMIT_ACTION) { + Option(HoodieReplaceCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get).get, + classOf[HoodieReplaceCommitMetadata])) + } else { + Option(HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(hoodieInstant.get).get, + classOf[HoodieCommitMetadata])) + } + } else { + Option.empty + } + } +} + +object ShowCommitWriteStatsProcedure { + val NAME = "show_commit_write_stats" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCommitWriteStatsProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitsProcedure.scala new file mode 100644 index 0000000000000..169acce887dd1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitsProcedure.scala @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.model.HoodieCommitMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieDefaultTimeline, HoodieInstant} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.Collections +import java.util.function.Supplier +import scala.collection.JavaConverters._ + +class ShowCommitsProcedure(includeExtraMetadata: Boolean) extends BaseProcedure with ProcedureBuilder { + var sortByFieldParameter: ProcedureParameter = _ + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_added", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_updated", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_partitions_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_update_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + private val METADATA_OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_id", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("previous_commit", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("num_writes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_inserts", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_deletes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_update_writes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_log_blocks", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_corrupt_log_blocks", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_rollback_blocks", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_log_records", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_updated_records_compacted", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = if (includeExtraMetadata) METADATA_OUTPUT_TYPE else OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + 
val table = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Int] + + val hoodieCatalogTable = HoodieCLIUtils.getHoodieCatalogTable(sparkSession, table) + val basePath = hoodieCatalogTable.tableLocation + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + val activeTimeline = metaClient.getActiveTimeline + if (includeExtraMetadata) { + getCommitsWithMetadata(activeTimeline, limit) + } else { + getCommits(activeTimeline, limit) + } + } + + override def build: Procedure = new ShowCommitsProcedure(includeExtraMetadata) + + private def getCommitsWithMetadata(timeline: HoodieDefaultTimeline, + limit: Int): Seq[Row] = { + import scala.collection.JavaConversions._ + + val (rows: util.ArrayList[Row], newCommits: util.ArrayList[HoodieInstant]) = getSortCommits(timeline) + + for (i <- 0 until newCommits.size) { + val commit = newCommits.get(i) + val commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get, classOf[HoodieCommitMetadata]) + for (partitionWriteStat <- commitMetadata.getPartitionToWriteStats.entrySet) { + for (hoodieWriteStat <- partitionWriteStat.getValue) { + rows.add(Row( + commit.getTimestamp, commit.getAction, hoodieWriteStat.getPartitionPath, + hoodieWriteStat.getFileId, hoodieWriteStat.getPrevCommit, hoodieWriteStat.getNumWrites, + hoodieWriteStat.getNumInserts, hoodieWriteStat.getNumDeletes, hoodieWriteStat.getNumUpdateWrites, + hoodieWriteStat.getTotalWriteErrors, hoodieWriteStat.getTotalLogBlocks, hoodieWriteStat.getTotalCorruptLogBlock, + hoodieWriteStat.getTotalRollbackBlocks, hoodieWriteStat.getTotalLogRecords, + hoodieWriteStat.getTotalUpdatedRecordsCompacted, hoodieWriteStat.getTotalWriteBytes)) + } + } + } + + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + private def getSortCommits(timeline: HoodieDefaultTimeline): (util.ArrayList[Row], util.ArrayList[HoodieInstant]) = { + val rows = new util.ArrayList[Row] + // timeline can be read from multiple files. 
So sort is needed instead of reversing the collection + val commits: util.List[HoodieInstant] = timeline.getCommitsTimeline.filterCompletedInstants + .getInstants.toArray().map(instant => instant.asInstanceOf[HoodieInstant]).toList.asJava + val newCommits = new util.ArrayList[HoodieInstant](commits) + Collections.sort(newCommits, HoodieInstant.COMPARATOR.reversed) + (rows, newCommits) + } + + def getCommits(timeline: HoodieDefaultTimeline, + limit: Int): Seq[Row] = { + val (rows: util.ArrayList[Row], newCommits: util.ArrayList[HoodieInstant]) = getSortCommits(timeline) + + for (i <- 0 until newCommits.size) { + val commit = newCommits.get(i) + val commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get, classOf[HoodieCommitMetadata]) + rows.add(Row(commit.getTimestamp, commitMetadata.fetchTotalBytesWritten, commitMetadata.fetchTotalFilesInsert, + commitMetadata.fetchTotalFilesUpdated, commitMetadata.fetchTotalPartitionsWritten, + commitMetadata.fetchTotalRecordsWritten, commitMetadata.fetchTotalUpdateRecordsWritten, + commitMetadata.fetchTotalWriteErrors)) + } + + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } +} + +object ShowCommitsProcedure { + val NAME = "show_commits" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCommitsProcedure(false) + } +} + +object ShowCommitsMetadataProcedure { + val NAME = "show_commits_metadata" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCommitsProcedure(true) + } +} + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCompactionProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCompactionProcedure.scala new file mode 100644 index 0000000000000..7a7bb2cf9d996 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCompactionProcedure.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.util.CompactionUtils + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +import java.util.function.Supplier + +import scala.collection.JavaConverters._ + +class ShowCompactionProcedure extends BaseProcedure with ProcedureBuilder with SparkAdapterSupport with Logging { + /** + * SHOW COMPACTION ON tableIdentifier (LIMIT limit = INTEGER_VALUE)? + * SHOW COMPACTION ON path = STRING (LIMIT limit = INTEGER_VALUE)? + */ + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.optional(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "path", DataTypes.StringType, None), + ProcedureParameter.optional(2, "limit", DataTypes.IntegerType, 20) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("timestamp", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("operation_size", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("state", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val tablePath = getArgValueOrDefault(args, PARAMETERS(1)) + val limit = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[Int] + + val basePath: String = getBasePath(tableName, tablePath) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + assert(metaClient.getTableType == HoodieTableType.MERGE_ON_READ, + s"Cannot show compaction on a Non Merge On Read table.") + val compactionInstants = metaClient.getActiveTimeline.getInstants.iterator().asScala + .filter(p => p.getAction == HoodieTimeline.COMPACTION_ACTION) + .toSeq + .sortBy(f => f.getTimestamp) + .reverse + .take(limit) + + compactionInstants.map(instant => + (instant, CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp)) + ).map { case (instant, plan) => + Row(instant.getTimestamp, plan.getOperations.size(), instant.getState.name()) + } + } + + override def build: Procedure = new ShowCompactionProcedure() +} + +object ShowCompactionProcedure { + val NAME = "show_compaction" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCompactionProcedure + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala new file mode 100644 index 0000000000000..6f2aa2c918722 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{FileSlice, HoodieLogFile} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieDefaultTimeline, HoodieInstant, HoodieTimeline} +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.common.util +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.{Function, Supplier} +import java.util.stream.Collectors +import scala.collection.JavaConversions +import scala.collection.JavaConverters.{asJavaIterableConverter, asJavaIteratorConverter, asScalaIteratorConverter} + +class ShowFileSystemViewProcedure(showLatest: Boolean) extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS_ALL: Array[ProcedureParameter] = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "max_instant", DataTypes.StringType, ""), + ProcedureParameter.optional(2, "include_max", DataTypes.BooleanType, false), + ProcedureParameter.optional(3, "include_in_flight", DataTypes.BooleanType, false), + ProcedureParameter.optional(4, "exclude_compaction", DataTypes.BooleanType, false), + ProcedureParameter.optional(5, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.optional(6, "path_regex", DataTypes.StringType, "*/*/*") + ) + + private val OUTPUT_TYPE_ALL: StructType = StructType(Array[StructField]( + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_id", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("base_instant", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("data_file", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("data_file_size", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_delta_files", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_delta_file_size", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("delta_files", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + private val PARAMETERS_LATEST: Array[ProcedureParameter] = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "max_instant", DataTypes.StringType, ""), + ProcedureParameter.optional(2, "include_max", DataTypes.BooleanType, false), + ProcedureParameter.optional(3, "include_inflight", DataTypes.BooleanType, false), + ProcedureParameter.optional(4, "exclude_compaction", DataTypes.BooleanType, false), + ProcedureParameter.optional(5, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.required(6, "partition_path", 
DataTypes.StringType, None), + ProcedureParameter.optional(7, "merge", DataTypes.BooleanType, true) + + ) + + private val OUTPUT_TYPE_LATEST: StructType = StructType(Array[StructField]( + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_id", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("base_instant", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("data_file", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("data_file_size", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_delta_files", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_delta_file_size", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("delta_size_compaction_scheduled", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("delta_size_compaction_unscheduled", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("delta_to_base_ratio_compaction_scheduled", DataTypes.DoubleType, nullable = true, Metadata.empty), + StructField("delta_to_base_ratio_compaction_unscheduled", DataTypes.DoubleType, nullable = true, Metadata.empty), + StructField("delta_files_compaction_scheduled", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("delta_files_compaction_unscheduled", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + private def buildFileSystemView(table: Option[Any], + globRegex: String, + maxInstant: String, + includeMaxInstant: Boolean, + includeInflight: Boolean, + excludeCompaction: Boolean + ): HoodieTableFileSystemView = { + val basePath = getBasePath(table) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val fs = metaClient.getFs + val globPath = String.format("%s/%s/*", basePath, globRegex) + val statuses = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(globPath)) + var timeline: HoodieTimeline = if (excludeCompaction) { + metaClient.getActiveTimeline.getCommitsTimeline + } else { + metaClient.getActiveTimeline.getWriteTimeline + } + if (!includeInflight) { + timeline = timeline.filterCompletedInstants() + } + var instants = timeline.getInstants.iterator().asScala + if (maxInstant.nonEmpty) { + val predicate = if (includeMaxInstant) { + HoodieTimeline.GREATER_THAN_OR_EQUALS + } else { + HoodieTimeline.GREATER_THAN + } + instants = instants.filter(instant => predicate.test(maxInstant, instant.getTimestamp)) + } + + val details = new Function[HoodieInstant, org.apache.hudi.common.util.Option[Array[Byte]]] + with java.io.Serializable { + override def apply(instant: HoodieInstant): util.Option[Array[Byte]] = { + metaClient.getActiveTimeline.getInstantDetails(instant) + } + } + + val filteredTimeline = new HoodieDefaultTimeline( + new java.util.ArrayList[HoodieInstant](JavaConversions.asJavaCollection(instants.toList)).stream(), details) + new HoodieTableFileSystemView(metaClient, filteredTimeline, statuses.toArray(new Array[FileStatus](0))) + } + + private def showAllFileSlices(fsView: HoodieTableFileSystemView): java.util.List[Row] = { + val rows: java.util.List[Row] = new java.util.ArrayList[Row] + fsView.getAllFileGroups.iterator().asScala.foreach(fg => { + fg.getAllFileSlices.iterator().asScala.foreach(fs => { + val fileId = fg.getFileGroupId.getFileId + var baseFilePath = "" + var baseFileSize = 0L + if (fs.getBaseFile.isPresent) { + baseFilePath = fs.getBaseFile.get.getPath + baseFileSize =
fs.getBaseFile.get.getFileSize + } + val numLogFiles = fs.getLogFiles.count() + val sumLogFileSize = fs.getLogFiles.iterator().asScala.map(_.getFileSize).sum + val logFiles = fs.getLogFiles.collect(Collectors.toList[HoodieLogFile]).toString + + rows.add(Row(fg.getPartitionPath, fileId, fs.getBaseInstantTime, baseFilePath, baseFileSize, numLogFiles, + sumLogFileSize, logFiles)) + }) + }) + rows + } + + private def showLatestFileSlices(fsView: HoodieTableFileSystemView, + table: Option[Any], + partition: String, + maxInstant: String, + merge: Boolean): java.util.List[Row] = { + var fileSliceStream: java.util.stream.Stream[FileSlice] = null + if (!merge) { + fileSliceStream = fsView.getLatestFileSlices(partition) + } else { + fileSliceStream = fsView.getLatestMergedFileSlicesBeforeOrOn(partition, if (maxInstant.isEmpty) { + val basePath = getBasePath(table) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + metaClient.getActiveTimeline.filterCompletedAndCompactionInstants().lastInstant().get().getTimestamp + } else { + maxInstant + }) + } + val rows: java.util.List[Row] = new java.util.ArrayList[Row] + fileSliceStream.iterator().asScala.foreach { + fs => { + val fileId = fs.getFileId + val baseInstantTime = fs.getBaseInstantTime + var baseFilePath = "" + var baseFileSize = 0L + if (fs.getBaseFile.isPresent) { + baseFilePath = fs.getBaseFile.get.getPath + baseFileSize = fs.getBaseFile.get.getFileSize + } + val numLogFiles = fs.getLogFiles.count() + val sumLogFileSize = fs.getLogFiles.iterator().asScala.map(_.getFileSize).sum + val logFilesScheduledForCompactionTotalSize = fs.getLogFiles.iterator().asScala + .filter(logFile => logFile.getBaseCommitTime.equals(fs.getBaseInstantTime)) + .map(_.getFileSize).sum + val logFilesUnscheduledTotalSize = fs.getLogFiles.iterator().asScala + .filter(logFile => !logFile.getBaseCommitTime.equals(fs.getBaseInstantTime)) + .map(_.getFileSize).sum + val logSelectedForCompactionToBaseRatio = if (baseFileSize > 0) { + logFilesScheduledForCompactionTotalSize / (baseFileSize * 1.0) + } else { + -1 + } + val logUnscheduledToBaseRatio = if (baseFileSize > 0) { + logFilesUnscheduledTotalSize / (baseFileSize * 1.0) + } else { + -1 + } + val logFilesCommitTimeEqualInstantTime = fs.getLogFiles.iterator().asScala + .filter(logFile => logFile.getBaseCommitTime.equals(fs.getBaseInstantTime)) + .mkString("[", ",", "]") + val logFilesCommitTimeNonEqualInstantTime = fs.getLogFiles.iterator().asScala + .filter(logFile => !logFile.getBaseCommitTime.equals(fs.getBaseInstantTime)) + .mkString("[", ",", "]") + rows.add(Row(partition, fileId, baseInstantTime, baseFilePath, baseFileSize, numLogFiles, sumLogFileSize, + logFilesScheduledForCompactionTotalSize, logFilesUnscheduledTotalSize, logSelectedForCompactionToBaseRatio, + logUnscheduledToBaseRatio, logFilesCommitTimeEqualInstantTime, logFilesCommitTimeNonEqualInstantTime + )) + } + } + rows + } + + override def parameters: Array[ProcedureParameter] = if (showLatest) { + PARAMETERS_LATEST + } else { + PARAMETERS_ALL + } + + override def outputType: StructType = if (showLatest) { + OUTPUT_TYPE_LATEST + } else { + OUTPUT_TYPE_ALL + } + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(parameters, args) + val table = getArgValueOrDefault(args, parameters(0)) + val maxInstant = getArgValueOrDefault(args, parameters(1)).get.asInstanceOf[String] + val includeMax = getArgValueOrDefault(args, parameters(2)).get.asInstanceOf[Boolean] + val includeInflight 
= getArgValueOrDefault(args, parameters(3)).get.asInstanceOf[Boolean] + val excludeCompaction = getArgValueOrDefault(args, parameters(4)).get.asInstanceOf[Boolean] + val limit = getArgValueOrDefault(args, parameters(5)).get.asInstanceOf[Int] + val rows: java.util.List[Row] = if (!showLatest) { + val globRegex = getArgValueOrDefault(args, parameters(6)).get.asInstanceOf[String] + val fsView = buildFileSystemView(table, globRegex, maxInstant, includeMax, includeInflight, excludeCompaction) + showAllFileSlices(fsView) + } else { + val partitionPath = getArgValueOrDefault(args, parameters(6)).get.asInstanceOf[String] + val merge = getArgValueOrDefault(args, parameters(7)).get.asInstanceOf[Boolean] + val fsView = buildFileSystemView(table, partitionPath, maxInstant, includeMax, includeInflight, excludeCompaction) + showLatestFileSlices(fsView, table, partitionPath, maxInstant, merge) + } + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowFileSystemViewProcedure(showLatest) +} + +object ShowAllFileSystemViewProcedure { + val NAME = "show_fsview_all" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowFileSystemViewProcedure(false) + } +} + +object ShowLatestFileSystemViewProcedure { + val NAME = "show_fsview_latest" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new ShowFileSystemViewProcedure(true) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFsPathDetailProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFsPathDetailProcedure.scala new file mode 100644 index 0000000000000..aae3858c1140a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFsPathDetailProcedure.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.{ContentSummary, FileStatus, Path} +import org.apache.hudi.common.fs.FSUtils +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.text.DecimalFormat +import java.util.function.Supplier + +class ShowFsPathDetailProcedure extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "path", DataTypes.StringType, None), + ProcedureParameter.optional(1, "is_sub", DataTypes.BooleanType, false), + ProcedureParameter.optional(2, "sort", DataTypes.BooleanType, true) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("path_num", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("file_num", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("storage_size", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("storage_size(unit)", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("storage_path", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("space_consumed", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("quota", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("space_quota", DataTypes.LongType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val srcPath = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val isSub = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean] + val sort = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[Boolean] + + val path: Path = new Path(srcPath) + val fs = FSUtils.getFs(path, jsc.hadoopConfiguration()) + val status: Array[FileStatus] = if (isSub) fs.listStatus(path) else fs.globStatus(path) + val rows: java.util.List[Row] = new java.util.ArrayList[Row]() + + if (status.nonEmpty) { + for (i <- status.indices) { + val summary: ContentSummary = fs.getContentSummary(status(i).getPath) + val storagePath: String = status(i).getPath.toString + rows.add(Row(summary.getDirectoryCount, summary.getFileCount, summary.getLength, + getFileSize(summary.getLength), storagePath, summary.getSpaceConsumed, summary.getQuota, + summary.getSpaceQuota)) + } + } + + val df = spark.sqlContext.createDataFrame(rows, OUTPUT_TYPE) + if (sort) { + df.orderBy(df("storage_size").desc).collect() + } else { + df.orderBy(df("file_num").desc).collect() + } + } + + def getFileSize(size: Long): String = { + val GB = 1024 * 1024 * 1024 + val MB = 1024 * 1024 + val KB = 1024 + val df: DecimalFormat = new DecimalFormat("0.00") + + val resultSize = if (size / GB >= 1) { + df.format(size / GB.toFloat) + "GB" + } else if (size / MB >= 1) { + df.format(size / MB.toFloat) + "MB" + } else if (size / KB >= 1) { + df.format(size / KB.toFloat) + "KB" + } else { + size + "B" + } + + resultSize + } + + override def build: Procedure = new ShowFsPathDetailProcedure() +} + +object ShowFsPathDetailProcedure { + val NAME = "show_fs_path_detail" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new ShowFsPathDetailProcedure() + } +} + + + diff --git
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileMetadataProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileMetadataProcedure.scala new file mode 100644 index 0000000000000..3a26823dedb9a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileMetadataProcedure.scala @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieLogFile +import org.apache.hudi.common.table.log.HoodieLogFormat +import org.apache.hudi.common.table.log.block.HoodieLogBlock.{HeaderMetadataType, HoodieLogBlockType} +import org.apache.hudi.common.table.log.block.{HoodieCorruptBlock, HoodieDataBlock} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.parquet.avro.AvroSchemaConverter +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.Objects +import java.util.concurrent.atomic.AtomicInteger +import java.util.function.Supplier +import scala.collection.JavaConverters.{asScalaBufferConverter, asScalaIteratorConverter, mapAsScalaMapConverter} + +class ShowHoodieLogFileMetadataProcedure extends BaseProcedure with ProcedureBuilder { + override def parameters: Array[ProcedureParameter] = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "log_file_path_pattern", DataTypes.StringType, None), + ProcedureParameter.optional(2, "limit", DataTypes.IntegerType, 10) + ) + + override def outputType: StructType = StructType(Array[StructField]( + StructField("instant_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("record_count", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("block_type", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("header_metadata", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("footer_metadata", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + override def call(args: ProcedureArgs): Seq[Row] = { + checkArgs(parameters, args) + val table = getArgValueOrDefault(args, parameters(0)) + val logFilePathPattern: String = getArgValueOrDefault(args, parameters(1)).get.asInstanceOf[String] + val limit: Int = getArgValueOrDefault(args, parameters(2)).get.asInstanceOf[Int] + val basePath = getBasePath(table) + 
val fs = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build.getFs + val logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).iterator().asScala + .map(_.getPath.toString).toList + val commitCountAndMetadata = + new java.util.HashMap[String, java.util.List[(HoodieLogBlockType, (java.util.Map[HeaderMetadataType, String], java.util.Map[HeaderMetadataType, String]), Int)]]() + var numCorruptBlocks = 0 + var dummyInstantTimeCount = 0 + logFilePaths.foreach { + logFilePath => { + val statuses = fs.listStatus(new Path(logFilePath)) + val schema = new AvroSchemaConverter() + .convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePath)))) + val reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(statuses(0).getPath), schema) + + // read the avro blocks + while (reader.hasNext) { + val block = reader.next() + val recordCount = new AtomicInteger(0) + var instantTime: String = null + if (block.isInstanceOf[HoodieCorruptBlock]) { + try { + instantTime = block.getLogBlockHeader.get(HeaderMetadataType.INSTANT_TIME) + if (null == instantTime) { + throw new java.lang.Exception("Invalid instant time " + instantTime) + } + } catch { + case _: java.lang.Exception => + numCorruptBlocks = numCorruptBlocks + 1; + instantTime = "corrupt_block_" + numCorruptBlocks + } + } else { + instantTime = block.getLogBlockHeader.get(HeaderMetadataType.INSTANT_TIME) + if (null == instantTime) { + dummyInstantTimeCount = dummyInstantTimeCount + 1 + instantTime = "dummy_instant_time_" + dummyInstantTimeCount + } + block match { + case dataBlock: HoodieDataBlock => + val recordItr = dataBlock.getRecordIterator + recordItr.asScala.foreach(_ => recordCount.incrementAndGet()) + recordItr.close() + } + } + if (commitCountAndMetadata.containsKey(instantTime)) { + val list = commitCountAndMetadata.get(instantTime) + list.add((block.getBlockType, (block.getLogBlockHeader, block.getLogBlockFooter), recordCount.get())) + } else { + val list = new java.util.ArrayList[(HoodieLogBlockType, (java.util.Map[HeaderMetadataType, String], java.util.Map[HeaderMetadataType, String]), Int)] + list.add(block.getBlockType, (block.getLogBlockHeader, block.getLogBlockFooter), recordCount.get()) + commitCountAndMetadata.put(instantTime, list) + } + } + reader.close() + } + } + val rows = new java.util.ArrayList[Row] + val objectMapper = new ObjectMapper() + commitCountAndMetadata.asScala.foreach { + case (instantTime, values) => + values.asScala.foreach { + tuple3 => + rows.add(Row( + instantTime, + tuple3._3, + tuple3._1.toString, + objectMapper.writeValueAsString(tuple3._2._1), + objectMapper.writeValueAsString(tuple3._2._2) + )) + } + } + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowHoodieLogFileMetadataProcedure +} + +object ShowHoodieLogFileMetadataProcedure { + val NAME = "show_logfile_metadata" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new ShowHoodieLogFileMetadataProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileRecordsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileRecordsProcedure.scala new file mode 100644 index 0000000000000..2806138a8954b --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowHoodieLogFileRecordsProcedure.scala @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.avro.generic.IndexedRecord +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.config.HoodieCommonConfig +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieLogFile +import org.apache.hudi.common.table.log.block.HoodieDataBlock +import org.apache.hudi.common.table.log.{HoodieLogFormat, HoodieMergedLogRecordScanner} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.ValidationUtils +import org.apache.hudi.config.{HoodieCompactionConfig, HoodieMemoryConfig} +import org.apache.parquet.avro.AvroSchemaConverter +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.Objects +import java.util.function.Supplier +import scala.collection.JavaConverters._ + +class ShowHoodieLogFileRecordsProcedure extends BaseProcedure with ProcedureBuilder { + override def parameters: Array[ProcedureParameter] = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "log_file_path_pattern", DataTypes.StringType, None), + ProcedureParameter.optional(2, "merge", DataTypes.BooleanType, false), + ProcedureParameter.optional(3, "limit", DataTypes.IntegerType, 10) + ) + + override def outputType: StructType = StructType(Array[StructField]( + StructField("records", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + override def call(args: ProcedureArgs): Seq[Row] = { + checkArgs(parameters, args) + val table = getArgValueOrDefault(args, parameters(0)) + val logFilePathPattern: String = getArgValueOrDefault(args, parameters(1)).get.asInstanceOf[String] + val merge: Boolean = getArgValueOrDefault(args, parameters(2)).get.asInstanceOf[Boolean] + val limit: Int = getArgValueOrDefault(args, parameters(3)).get.asInstanceOf[Int] + val basePath = getBasePath(table) + val client = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val fs = client.getFs + val logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).iterator().asScala + .map(_.getPath.toString).toList + ValidationUtils.checkArgument(logFilePaths.nonEmpty, "There is no log file") + val converter = new AvroSchemaConverter() + val allRecords: java.util.List[IndexedRecord] = new java.util.ArrayList[IndexedRecord] + if (merge) { + val schema = 
converter.convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePaths.last)))) + val scanner = HoodieMergedLogRecordScanner.newBuilder + .withFileSystem(fs) + .withBasePath(basePath) + .withLogFilePaths(logFilePaths.asJava) + .withReaderSchema(schema) + .withLatestInstantTime(client.getActiveTimeline.getCommitTimeline.lastInstant.get.getTimestamp) + .withReadBlocksLazily(java.lang.Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue)) + .withReverseReader(java.lang.Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue)) + .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue) + .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES) + .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue) + .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue) + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue) + .build + scanner.asScala.foreach(hoodieRecord => { + val record = hoodieRecord.getData.getInsertValue(schema).get() + if (allRecords.size() < limit) { + allRecords.add(record) + } + }) + } else { + logFilePaths.toStream.takeWhile(_ => allRecords.size() < limit).foreach { + logFilePath => { + val schema = converter.convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePath)))) + val reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(logFilePath), schema) + while (reader.hasNext) { + val block = reader.next() + block match { + case dataBlock: HoodieDataBlock => + val recordItr = dataBlock.getRecordIterator + recordItr.asScala.foreach(record => { + if (allRecords.size() < limit) { + allRecords.add(record) + } + }) + recordItr.close() + } + } + reader.close() + } + } + } + val rows: java.util.List[Row] = new java.util.ArrayList[Row](allRecords.size()) + allRecords.asScala.foreach(record => { + rows.add(Row(record.toString)) + }) + rows.asScala + } + + override def build: Procedure = new ShowHoodieLogFileRecordsProcedure +} + +object ShowHoodieLogFileRecordsProcedure { + val NAME = "show_logfile_records" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new ShowHoodieLogFileRecordsProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowInvalidParquetProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowInvalidParquetProcedure.scala new file mode 100644 index 0000000000000..11d170bbed5ea --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowInvalidParquetProcedure.scala @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.Path +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.config.SerializableConfiguration +import org.apache.hudi.common.fs.FSUtils +import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS +import org.apache.parquet.hadoop.ParquetFileReader +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class ShowInvalidParquetProcedure extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "path", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("path", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val srcPath = getArgValueOrDefault(args, PARAMETERS(0)).get.asInstanceOf[String] + val partitionPaths: java.util.List[String] = FSUtils.getAllPartitionPaths(new HoodieSparkEngineContext(jsc), srcPath, false, false) + val javaRdd: JavaRDD[String] = jsc.parallelize(partitionPaths, partitionPaths.size()) + val serHadoopConf = new SerializableConfiguration(jsc.hadoopConfiguration()) + javaRdd.rdd.map(part => { + val fs = FSUtils.getFs(new Path(srcPath), serHadoopConf.get()) + FSUtils.getAllDataFilesInPartition(fs, FSUtils.getPartitionPath(srcPath, part)) + }).flatMap(_.toList) + .filter(status => { + val filePath = status.getPath + var isInvalid = false + if (filePath.toString.endsWith(".parquet")) { + try ParquetFileReader.readFooter(serHadoopConf.get(), filePath, SKIP_ROW_GROUPS).getFileMetaData catch { + case e: Exception => + isInvalid = e.getMessage.contains("is not a Parquet file") + } + } + isInvalid + }) + .map(status => Row(status.getPath.toString)) + .collect() + } + + override def build = new ShowInvalidParquetProcedure() +} + +object ShowInvalidParquetProcedure { + val NAME = "show_invalid_parquet" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new ShowInvalidParquetProcedure() + } +} + + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableFilesProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableFilesProcedure.scala new file mode 100644 index 0000000000000..b30203dc06e8b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableFilesProcedure.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.engine.HoodieLocalEngineContext +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.{HoodieTimer, StringUtils} +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.metadata.HoodieBackedTableMetadata +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.function.Supplier + +class ShowMetadataTableFilesProcedure() extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "partition", DataTypes.StringType, "") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("file_path", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)) + val partition = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + + val basePath = getBasePath(table) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val config = HoodieMetadataConfig.newBuilder.enable(true).build + val metaReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf), + config, basePath, "/tmp") + if (!metaReader.enabled){ + throw new HoodieException(s"Metadata Table not enabled/initialized.") + } + + var partitionPath = new Path(basePath) + if (!StringUtils.isNullOrEmpty(partition)) { + partitionPath = new Path(basePath, partition) + } + + val timer = new HoodieTimer().startTimer + val statuses = metaReader.getAllFilesInPartition(partitionPath) + logDebug("Took " + timer.endTimer + " ms") + + val rows = new util.ArrayList[Row] + statuses.toStream.sortBy(p => p.getPath.getName).foreach((f: FileStatus) => { + rows.add(Row(f.getPath.getName)) + }) + rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowMetadataTableFilesProcedure() +} + +object ShowMetadataTableFilesProcedure { + val NAME = "show_metadata_table_files" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowMetadataTableFilesProcedure() + } +} diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTablePartitionsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTablePartitionsProcedure.scala new file mode 100644 index 0000000000000..f2eaa7ad838fe --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTablePartitionsProcedure.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.util.HoodieTimer +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.metadata.HoodieBackedTableMetadata +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.Collections +import java.util.function.Supplier +import scala.collection.JavaConverters.asScalaIteratorConverter + +class ShowMetadataTablePartitionsProcedure() extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)) + + val basePath = getBasePath(table) + val config = HoodieMetadataConfig.newBuilder.enable(true).build + val metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc), + config, basePath, "/tmp") + if (!metadata.enabled){ + throw new HoodieException(s"Metadata Table not enabled/initialized.") + } + + val timer = new HoodieTimer().startTimer + val partitions = metadata.getAllPartitionPaths + Collections.sort(partitions) + logDebug("Took " + timer.endTimer + " ms") + + val rows = new util.ArrayList[Row] + partitions.stream.iterator().asScala.foreach((p: String) => { + rows.add(Row(p)) + }) + rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowMetadataTablePartitionsProcedure() +} + +object ShowMetadataTablePartitionsProcedure { + val NAME = "show_metadata_table_partitions" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def 
get() = new ShowMetadataTablePartitionsProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableStatsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableStatsProcedure.scala new file mode 100644 index 0000000000000..948e441858072 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowMetadataTableStatsProcedure.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.engine.HoodieLocalEngineContext +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.metadata.HoodieBackedTableMetadata +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.function.Supplier +import scala.collection.JavaConversions._ + +class ShowMetadataTableStatsProcedure() extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("stat_key", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("stat_value", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)) + + val basePath = getBasePath(table) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val config = HoodieMetadataConfig.newBuilder.enable(true).build + val metadata = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf), + config, basePath, "/tmp") + val stats = metadata.stats + + val rows = new util.ArrayList[Row] + for (entry <- stats.entrySet) { + rows.add(Row(entry.getKey, entry.getValue)) + } + rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ShowMetadataTableStatsProcedure() +} + +object ShowMetadataTableStatsProcedure { + val NAME = "show_metadata_table_stats" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowMetadataTableStatsProcedure() + } +} + + diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala new file mode 100644 index 0000000000000..e5cacdb0626fc --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import java.io.IOException +import java.util +import java.util.function.Supplier + +import org.apache.hudi.avro.model.HoodieRollbackMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieInstant.State +import org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstant, HoodieTimeline, TimelineMetadataUtils} +import org.apache.hudi.common.util.CollectionUtils +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import scala.collection.JavaConversions.asScalaBuffer +import scala.collection.JavaConverters._ + +class ShowRollbacksProcedure(showDetails: Boolean) extends BaseProcedure with ProcedureBuilder { + private val ROLLBACKS_PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10) + ) + + private val ROLLBACK_PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10), + ProcedureParameter.required(2, "instant_time", DataTypes.StringType, None) + ) + + private val ROLLBACKS_OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("instant", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("rollback_instant", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_files_deleted", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("time_taken_in_millis", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_partitions", DataTypes.IntegerType, nullable = true, Metadata.empty) + )) + + private val ROLLBACK_OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("instant", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("rollback_instant", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("partition", DataTypes.StringType, nullable = true, 
Metadata.empty), + StructField("deleted_file", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("succeeded", DataTypes.BooleanType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = if (showDetails) ROLLBACK_PARAMETERS else ROLLBACKS_PARAMETERS + + def outputType: StructType = if (showDetails) ROLLBACK_OUTPUT_TYPE else ROLLBACKS_OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(parameters, args) + + val tableName = getArgValueOrDefault(args, parameters(0)) + val limit = getArgValueOrDefault(args, parameters(1)).get.asInstanceOf[Int] + + val basePath = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val activeTimeline = new RollbackTimeline(metaClient) + if (showDetails) { + val instantTime = getArgValueOrDefault(args, parameters(2)).get.asInstanceOf[String] + getRollbackDetail(activeTimeline, instantTime, limit) + } else { + getRollbacks(activeTimeline, limit) + } + } + + override def build: Procedure = new ShowRollbacksProcedure(showDetails) + + class RollbackTimeline(metaClient: HoodieTableMetaClient) extends HoodieActiveTimeline(metaClient, + CollectionUtils.createImmutableSet(HoodieTimeline.ROLLBACK_EXTENSION)) { + } + + def getRollbackDetail(activeTimeline: RollbackTimeline, + instantTime: String, + limit: Int): Seq[Row] = { + val rows = new util.ArrayList[Row] + val metadata = TimelineMetadataUtils.deserializeAvroMetadata(activeTimeline.getInstantDetails( + new HoodieInstant(State.COMPLETED, ROLLBACK_ACTION, instantTime)).get, classOf[HoodieRollbackMetadata]) + + metadata.getPartitionMetadata.asScala.toMap.iterator.foreach(entry => Stream + .concat(entry._2.getSuccessDeleteFiles.map(f => (f, true)), + entry._2.getFailedDeleteFiles.map(f => (f, false))) + .iterator.foreach(fileWithDeleteStatus => { + rows.add(Row(metadata.getStartRollbackTime, metadata.getCommitsRollback.toString, + entry._1, fileWithDeleteStatus._1, fileWithDeleteStatus._2)) + })) + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + def getRollbacks(activeTimeline: RollbackTimeline, + limit: Int): Seq[Row] = { + val rows = new util.ArrayList[Row] + val rollback = activeTimeline.getRollbackTimeline.filterCompletedInstants + + rollback.getInstants.iterator().asScala.foreach(instant => { + try { + val metadata = TimelineMetadataUtils.deserializeAvroMetadata(activeTimeline.getInstantDetails(instant).get, + classOf[HoodieRollbackMetadata]) + + metadata.getCommitsRollback.iterator().asScala.foreach(c => { + rows.add(Row(metadata.getStartRollbackTime, c, + metadata.getTotalFilesDeleted, metadata.getTimeTakenInMillis, + if (metadata.getPartitionMetadata != null) metadata.getPartitionMetadata.size else 0)) + }) + } catch { + case e: IOException => + throw new HoodieException(s"Failed to get rollback's info from instant ${instant.getTimestamp}.") + } + }) + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } +} + +object ShowRollbacksProcedure { + val NAME = "show_rollbacks" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowRollbacksProcedure(false) + } +} + +object ShowRollbackDetailProcedure { + val NAME = "show_rollback_detail" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowRollbacksProcedure(true) + } +} diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowSavepointsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowSavepointsProcedure.scala new file mode 100644 index 0000000000000..e866e21555baf --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowSavepointsProcedure.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstant, HoodieTimeline} +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.function.Supplier +import java.util.stream.Collectors + +class ShowSavepointsProcedure extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("savepoint_time", DataTypes.StringType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + + val basePath: String = getBasePath(tableName) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + val activeTimeline: HoodieActiveTimeline = metaClient.getActiveTimeline + val timeline: HoodieTimeline = activeTimeline.getSavePointTimeline.filterCompletedInstants + val commits: util.List[HoodieInstant] = timeline.getReverseOrderedInstants.collect(Collectors.toList[HoodieInstant]) + + if (commits.isEmpty) Seq.empty[Row] else { + commits.toArray.map(instant => instant.asInstanceOf[HoodieInstant].getTimestamp).map(p => Row(p)).toSeq + } + } + + override def build: Procedure = new ShowSavepointsProcedure() +} + +object ShowSavepointsProcedure { + val NAME: String = "show_savepoints" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ShowSavepointsProcedure = new ShowSavepointsProcedure() + } +} + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/StatsFileSizeProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/StatsFileSizeProcedure.scala new file mode 100644 index 
0000000000000..29b7627619408 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/StatsFileSizeProcedure.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import com.codahale.metrics.{Histogram, Snapshot, UniformReservoir} +import org.apache.hadoop.fs.Path +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.Row +import org.apache.spark.sql.hudi.command.procedures.StatsFileSizeProcedure.MAX_FILES +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier +import scala.collection.JavaConverters.{asScalaBufferConverter, mapAsScalaMapConverter} + +class StatsFileSizeProcedure extends BaseProcedure with ProcedureBuilder { + + override def parameters: Array[ProcedureParameter] = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "partition_path", DataTypes.StringType, ""), + ProcedureParameter.optional(2, "limit", DataTypes.IntegerType, 10) + ) + + override def outputType: StructType = StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("min", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("10th", DataTypes.DoubleType, nullable = true, Metadata.empty), + StructField("50th", DataTypes.DoubleType, nullable = true, Metadata.empty), + StructField("avg", DataTypes.DoubleType, nullable = true, Metadata.empty), + StructField("95th", DataTypes.DoubleType, nullable = true, Metadata.empty), + StructField("max", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_files", DataTypes.IntegerType, nullable = true, Metadata.empty), + StructField("std_dev", DataTypes.DoubleType, nullable = true, Metadata.empty) + )) + + override def call(args: ProcedureArgs): Seq[Row] = { + checkArgs(parameters, args) + val table = getArgValueOrDefault(args, parameters(0)) + val globRegex = getArgValueOrDefault(args, parameters(1)).get.asInstanceOf[String] + val limit: Int = getArgValueOrDefault(args, parameters(2)).get.asInstanceOf[Int] + val basePath = getBasePath(table) + val fs = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build.getFs + val globPath = String.format("%s/%s/*", basePath, globRegex) + val statuses = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(globPath)) + + val globalHistogram = new Histogram(new UniformReservoir(MAX_FILES)) + val commitHistogramMap = new java.util.HashMap[String, Histogram]() + statuses.asScala.foreach( + status => { + 
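// Bucket each listed file's size into a per-commit histogram (keyed by the commit time parsed from the file name) and into the global histogram that backs the final "ALL" summary row. +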
val instantTime = FSUtils.getCommitTime(status.getPath.getName) + val len = status.getLen + commitHistogramMap.putIfAbsent(instantTime, new Histogram(new UniformReservoir(MAX_FILES))) + commitHistogramMap.get(instantTime).update(len) + globalHistogram.update(len) + } + ) + val rows: java.util.List[Row] = new java.util.ArrayList[Row]() + commitHistogramMap.asScala.foreach { + case (instantTime, histogram) => + val snapshot = histogram.getSnapshot + rows.add(printFileSizeHistogram(instantTime, snapshot)) + } + val snapshot = globalHistogram.getSnapshot + rows.add(printFileSizeHistogram("ALL", snapshot)) + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + def printFileSizeHistogram(instantTime: String, snapshot: Snapshot): Row = { + Row( + instantTime, + snapshot.getMin, + snapshot.getValue(0.1), + snapshot.getMedian, + snapshot.getMean, + snapshot.get95thPercentile, + snapshot.getMax, + snapshot.size, + snapshot.getStdDev + ) + } + + override def build: Procedure = new StatsFileSizeProcedure +} + +object StatsFileSizeProcedure { + val MAX_FILES = 1000000 + val NAME = "stats_file_sizes" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new StatsFileSizeProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/StatsWriteAmplificationProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/StatsWriteAmplificationProcedure.scala new file mode 100644 index 0000000000000..b83d97f047e50 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/StatsWriteAmplificationProcedure.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.model.HoodieCommitMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.text.DecimalFormat +import java.util.function.Supplier +import scala.collection.JavaConverters.asScalaIteratorConverter + +class StatsWriteAmplificationProcedure extends BaseProcedure with ProcedureBuilder { + override def parameters: Array[ProcedureParameter] = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10) + ) + + override def outputType: StructType = StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_upserted", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("write_amplification_factor", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + override def call(args: ProcedureArgs): Seq[Row] = { + checkArgs(parameters, args) + val table = getArgValueOrDefault(args, parameters(0)) + val limit: Int = getArgValueOrDefault(args, parameters(1)).get.asInstanceOf[Int] + val basePath = getBasePath(table) + val client = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val activeTimeline = client.getActiveTimeline + val timeline = activeTimeline.getCommitTimeline.filterCompletedInstants() + + val rows = new java.util.ArrayList[Row] + val df = new DecimalFormat("#.00") + var totalRecordsUpserted = 0L + var totalRecordsWritten = 0L + timeline.getInstants.iterator.asScala.foreach( + instantTime => { + var waf = "0" + val commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(instantTime).get(), classOf[HoodieCommitMetadata]) + if (commit.fetchTotalUpdateRecordsWritten() > 0) { + waf = df.format(commit.fetchTotalRecordsWritten().toFloat / commit.fetchTotalUpdateRecordsWritten()) + } + rows.add(Row(instantTime.getTimestamp, commit.fetchTotalUpdateRecordsWritten, commit.fetchTotalRecordsWritten, waf)) + totalRecordsUpserted = totalRecordsUpserted + commit.fetchTotalUpdateRecordsWritten() + totalRecordsWritten = totalRecordsWritten + commit.fetchTotalRecordsWritten() + } + ) + var waf = "0" + if (totalRecordsUpserted > 0) { + waf = df.format(totalRecordsWritten.toFloat / totalRecordsUpserted) + } + rows.add(Row("Total", totalRecordsUpserted, totalRecordsWritten, waf)) + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new StatsWriteAmplificationProcedure +} + +object StatsWriteAmplificationProcedure { + val NAME = "stats_wa" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new StatsWriteAmplificationProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/UpgradeOrDowngradeProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/UpgradeOrDowngradeProcedure.scala new file mode 100644 index 0000000000000..49cbe5e2de6a1 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/UpgradeOrDowngradeProcedure.scala @@ -0,0 
+1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion +import org.apache.hudi.common.table.{HoodieTableMetaClient, HoodieTableVersion} +import org.apache.hudi.common.util.Option +import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig, HoodieCleanConfig} +import org.apache.hudi.index.HoodieIndex +import org.apache.hudi.table.upgrade.{SparkUpgradeDowngradeHelper, UpgradeDowngrade} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier +import scala.util.{Failure, Success, Try} + +class UpgradeOrDowngradeProcedure extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "to_version", DataTypes.StringType, None) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + val toVersion = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String] + val basePath = getBasePath(tableName) + + val config = getWriteConfigWithTrue(basePath) + val metaClient = HoodieTableMetaClient.builder + .setConf(jsc.hadoopConfiguration) + .setBasePath(config.getBasePath) + .setLoadActiveTimelineOnLoad(false) + .setConsistencyGuardConfig(config.getConsistencyGuardConfig) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig) + .build + + val result = Try { + new UpgradeDowngrade(metaClient, config, new HoodieSparkEngineContext(jsc), SparkUpgradeDowngradeHelper.getInstance) + .run(HoodieTableVersion.valueOf(toVersion), null) + } match { + case Success(_) => + logInfo(s"Table at $basePath upgraded / downgraded to version $toVersion.") + true + case Failure(e) => + logWarning(s"Failed: Could not upgrade/downgrade table at $basePath to version $toVersion.", e) + false + } + + Seq(Row(result)) + } + + private def getWriteConfigWithTrue(basePath: String) = { + HoodieWriteConfig.newBuilder + .withPath(basePath) + .withRollbackUsingMarkers(true) + 
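// A minimal config built just for this upgrade/downgrade run: marker-based rollback, eager cleaning of failed writes, and a Bloom index. +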
.withCleanConfig(HoodieCleanConfig.newBuilder.withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER).build) + .withIndexConfig(HoodieIndexConfig.newBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build) + .build + } + + override def build = new UpgradeOrDowngradeProcedure() +} + +object UpgradeTableProcedure { + val NAME = "upgrade_table" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new UpgradeOrDowngradeProcedure() + } +} + +object DowngradeTableProcedure { + val NAME = "downgrade_table" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new UpgradeOrDowngradeProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ValidateHoodieSyncProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ValidateHoodieSyncProcedure.scala new file mode 100644 index 0000000000000..77dd4f3ee00bc --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ValidateHoodieSyncProcedure.scala @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.model.HoodieCommitMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} +import org.joda.time.DateTime + +import java.io.IOException +import java.sql.{Connection, DriverManager, ResultSet, SQLException} +import java.util.function.Supplier + +import scala.collection.JavaConverters._ + +class ValidateHoodieSyncProcedure extends BaseProcedure with ProcedureBuilder with Logging { + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "src_table", DataTypes.StringType, None), + ProcedureParameter.required(1, "dst_table", DataTypes.StringType, None), + ProcedureParameter.required(2, "mode", DataTypes.StringType, "complete"), + ProcedureParameter.required(3, "hive_server_url", DataTypes.StringType, None), + ProcedureParameter.required(4, "hive_pass", DataTypes.StringType, None), + ProcedureParameter.optional(5, "src_db", DataTypes.StringType, "rawdata"), + ProcedureParameter.optional(6, "target_db", DataTypes.StringType, "dwh_hoodie"), + ProcedureParameter.optional(7, "partition_cnt", DataTypes.IntegerType, 5), + ProcedureParameter.optional(8, "hive_user", DataTypes.StringType, "") + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("result", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + override def build: Procedure = new ValidateHoodieSyncProcedure() + + /** + * Returns the input parameters of this procedure. + */ + override def parameters: Array[ProcedureParameter] = PARAMETERS + + /** + * Returns the type of rows produced by this procedure. 
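+ * The procedure emits a single `result` string column. As an illustration (table names and
+ * JDBC URL below are placeholders), a call such as
+ * {{{
+ *   CALL sync_validate(src_table => 'src_tbl', dst_table => 'dst_tbl', mode => 'complete',
+ *     hive_server_url => 'jdbc:hive2://localhost:10000', hive_pass => '')
+ * }}}
+ * yields one row describing the record-count difference between the source and destination tables.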
+ */ + override def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val srcTable = getArgValueOrDefault(args, PARAMETERS(0)) + val dstTable = getArgValueOrDefault(args, PARAMETERS(1)) + val mode = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String] + val hiveServerUrl = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String] + val hivePass = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String] + + val srcDb = getArgValueOrDefault(args, PARAMETERS(5)).get.asInstanceOf[String] + val tgtDb = getArgValueOrDefault(args, PARAMETERS(6)).get.asInstanceOf[String] + val partitionCount = getArgValueOrDefault(args, PARAMETERS(7)).get.asInstanceOf[Integer] + val hiveUser = getArgValueOrDefault(args, PARAMETERS(8)).get.asInstanceOf[String] + + val srcBasePath = getBasePath(srcTable, Option.empty) + val dstBasePath = getBasePath(dstTable, Option.empty) + + val srcMetaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(srcBasePath).build + val targetMetaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(dstBasePath).build + + val targetTimeline = targetMetaClient.getActiveTimeline.getCommitsTimeline + val sourceTimeline = srcMetaClient.getActiveTimeline.getCommitsTimeline + + var sourceCount: Long = 0 + var targetCount: Long = 0 + + if ("complete".equals(mode)) { + sourceCount = countRecords(hiveServerUrl, srcMetaClient, srcDb, hiveUser, hivePass) + targetCount = countRecords(hiveServerUrl, targetMetaClient, tgtDb, hiveUser, hivePass); + } else if ("latestPartitions".equals(mode)) { + sourceCount = countRecords(hiveServerUrl, srcMetaClient, srcDb, partitionCount, hiveUser, hivePass) + targetCount = countRecords(hiveServerUrl, targetMetaClient, tgtDb, partitionCount, hiveUser, hivePass) + } else { + logError(s"Unsupport mode $mode") + } + + val targetLatestCommit = + if (targetTimeline.getInstants.iterator().hasNext) targetTimeline.lastInstant().get().getTimestamp else "0" + val sourceLatestCommit = + if (sourceTimeline.getInstants.iterator().hasNext) sourceTimeline.lastInstant().get().getTimestamp else "0" + + if (sourceLatestCommit != null + && HoodieTimeline.compareTimestamps(targetLatestCommit, HoodieTimeline.GREATER_THAN, sourceLatestCommit)) + Seq(Row(getString(targetMetaClient, targetTimeline, srcMetaClient, sourceCount, targetCount, sourceLatestCommit))) + else + Seq(Row(getString(srcMetaClient, sourceTimeline, targetMetaClient, targetCount, sourceCount, targetLatestCommit))) + } + + def getString(target: HoodieTableMetaClient, targetTimeline: HoodieTimeline, source: HoodieTableMetaClient, + sourceCount: Long, targetCount: Long, sourceLatestCommit: String): String = { + + val commitsToCatchup: List[HoodieInstant] = + targetTimeline.findInstantsAfter(sourceLatestCommit, Integer.MAX_VALUE).getInstants.iterator().asScala.toList + if (commitsToCatchup.isEmpty) { + s"Count difference now is count(${target.getTableConfig.getTableName}) - count(${source.getTableConfig.getTableName}) == ${targetCount - sourceCount}" + } else { + val newInserts = countNewRecords(target, commitsToCatchup.map(elem => elem.getTimestamp)) + s"Count difference now is count(${target.getTableConfig.getTableName}) - count(${source.getTableConfig.getTableName}) == ${targetCount - sourceCount}" + + s". 
Catch-up count is $newInserts" + } + } + + def countRecords(jdbcUrl: String, source: HoodieTableMetaClient, dbName: String, user: String, pass: String): Long = { + var conn: Connection = null + var rs: ResultSet = null + var count: Long = -1 + try { + conn = DriverManager.getConnection(jdbcUrl, user, pass) + val stmt = conn.createStatement() + + stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat"); + stmt.execute("set hive.stats.autogather=false"); + + rs = stmt.executeQuery( + s"select count(`_hoodie_commit_time`) as cnt from $dbName.${source.getTableConfig.getTableName}") + if (rs.next()) { + count = rs.getLong("cnt") + } + + println(s"Total records in ${source.getTableConfig.getTableName} is $count") + } finally { + conn.close() + if (rs != null) { + rs.close() + } + } + count + } + + @throws[SQLException] + def countRecords(jdbcUrl: String, source: HoodieTableMetaClient, srcDb: String, partitions: Int, user: String, pass: String): Long = { + def getDate(dateTime: DateTime): String = { + f"${dateTime.getYear}-${dateTime.getMonthOfYear}%02d-${dateTime.getDayOfMonth}%02d" + } + var dateTime = DateTime.now + val endDateStr = getDate(dateTime) + dateTime = dateTime.minusDays(partitions) + val startDateStr = getDate(dateTime) + println("Start date " + startDateStr + " and end date " + endDateStr) + countRecords(jdbcUrl, source, srcDb, startDateStr, endDateStr, user, pass) + + } + + @throws[SQLException] + private def countRecords(jdbcUrl: String, source: HoodieTableMetaClient, srcDb: String, startDateStr: String, endDateStr: String, user: String, pass: String): Long = { + var rs: ResultSet = null + try { + val conn = DriverManager.getConnection(jdbcUrl, user, pass) + val stmt = conn.createStatement + try { // stmt.execute("set mapred.job.queue.name="); + stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat") + stmt.execute("set hive.stats.autogather=false") + rs = stmt.executeQuery(s"select count(`_hoodie_commit_time`) as cnt from $srcDb.${source.getTableConfig.getTableName} where datestr>'$startDateStr' and datestr<='$endDateStr'") + if (rs.next) + rs.getLong("cnt") + else + -1 + } finally { + if (rs != null) rs.close() + if (conn != null) conn.close() + if (stmt != null) stmt.close() + } + } + } + + @throws[IOException] + def countNewRecords(target: HoodieTableMetaClient, commitsToCatchup: List[String]): Long = { + var totalNew: Long = 0 + val timeline: HoodieTimeline = target.reloadActiveTimeline.getCommitTimeline.filterCompletedInstants + for (commit <- commitsToCatchup) { + val c: HoodieCommitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commit)).get, classOf[HoodieCommitMetadata]) + totalNew += c.fetchTotalRecordsWritten - c.fetchTotalUpdateRecordsWritten + } + totalNew + } +} + +object ValidateHoodieSyncProcedure { + val NAME = "sync_validate" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): ProcedureBuilder = new ValidateHoodieSyncProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ValidateMetadataTableFilesProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ValidateMetadataTableFilesProcedure.scala new file mode 100644 index 0000000000000..81540d9684665 --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ValidateMetadataTableFilesProcedure.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.engine.HoodieLocalEngineContext +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.HoodieTimer +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.metadata.HoodieBackedTableMetadata +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.Collections +import java.util.function.Supplier +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters.asScalaIteratorConverter + +class ValidateMetadataTableFilesProcedure() extends BaseProcedure with ProcedureBuilder with Logging { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "verbose", DataTypes.BooleanType, false) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_name", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("is_present_in_fs", DataTypes.BooleanType, nullable = true, Metadata.empty), + StructField("is_present_in_metadata", DataTypes.BooleanType, nullable = true, Metadata.empty), + StructField("fs_size", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("metadata_size", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)) + val verbose = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[Boolean] + + val basePath = getBasePath(table) + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + val config = HoodieMetadataConfig.newBuilder.enable(true).build + val metadataReader = new HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf), + config, basePath, "/tmp") + + if (!metadataReader.enabled) { + throw new HoodieException(s"Metadata Table not enabled/initialized.") + } + + val fsConfig = HoodieMetadataConfig.newBuilder.enable(false).build + val fsMetaReader = new 
HoodieBackedTableMetadata(new HoodieLocalEngineContext(metaClient.getHadoopConf), + fsConfig, basePath, "/tmp") + + val timer = new HoodieTimer().startTimer + val metadataPartitions = metadataReader.getAllPartitionPaths + logDebug("Listing partitions Took " + timer.endTimer + " ms") + val fsPartitions = fsMetaReader.getAllPartitionPaths + Collections.sort(fsPartitions) + Collections.sort(metadataPartitions) + + val allPartitions = new util.HashSet[String] + allPartitions.addAll(fsPartitions) + allPartitions.addAll(metadataPartitions) + + if (!fsPartitions.equals(metadataPartitions)) { + logError("FS partition listing is not matching with metadata partition listing!") + logError("All FS partitions: " + util.Arrays.toString(fsPartitions.toArray)) + logError("All Metadata partitions: " + util.Arrays.toString(metadataPartitions.toArray)) + } + + val rows = new util.ArrayList[Row] + for (partition <- allPartitions) { + val fileStatusMap = new util.HashMap[String, FileStatus] + val metadataFileStatusMap = new util.HashMap[String, FileStatus] + val metadataStatuses = metadataReader.getAllFilesInPartition(new Path(basePath, partition)) + util.Arrays.stream(metadataStatuses).iterator().asScala.foreach((entry: FileStatus) => metadataFileStatusMap.put(entry.getPath.getName, entry)) + val fsStatuses = fsMetaReader.getAllFilesInPartition(new Path(basePath, partition)) + util.Arrays.stream(fsStatuses).iterator().asScala.foreach((entry: FileStatus) => fileStatusMap.put(entry.getPath.getName, entry)) + val allFiles = new util.HashSet[String] + allFiles.addAll(fileStatusMap.keySet) + allFiles.addAll(metadataFileStatusMap.keySet) + for (file <- allFiles) { + val fsFileStatus = fileStatusMap.get(file) + val metaFileStatus = metadataFileStatusMap.get(file) + val doesFsFileExists = fsFileStatus != null + val doesMetadataFileExists = metaFileStatus != null + val fsFileLength = if (doesFsFileExists) fsFileStatus.getLen else 0 + val metadataFileLength = if (doesMetadataFileExists) metaFileStatus.getLen else 0 + if (verbose) { // if verbose print all files + rows.add(Row(partition, file, doesFsFileExists, doesMetadataFileExists, fsFileLength, metadataFileLength)) + } else if ((doesFsFileExists != doesMetadataFileExists) || (fsFileLength != metadataFileLength)) { // if non verbose, print only non matching files + rows.add(Row(partition, file, doesFsFileExists, doesMetadataFileExists, fsFileLength, metadataFileLength)) + } + } + if (metadataStatuses.length != fsStatuses.length) { + logError(" FS and metadata files count not matching for " + partition + ". FS files count " + fsStatuses.length + ", metadata base files count " + metadataStatuses.length) + } + for (entry <- fileStatusMap.entrySet) { + if (!metadataFileStatusMap.containsKey(entry.getKey)) { + logError("FS file not found in metadata " + entry.getKey) + } else if (entry.getValue.getLen != metadataFileStatusMap.get(entry.getKey).getLen) { + logError(" FS file size mismatch " + entry.getKey + ", size equality " + (entry.getValue.getLen == metadataFileStatusMap.get(entry.getKey).getLen) + ". 
FS size " + entry.getValue.getLen + ", metadata size " + metadataFileStatusMap.get(entry.getKey).getLen) + } + } + for (entry <- metadataFileStatusMap.entrySet) { + if (!fileStatusMap.containsKey(entry.getKey)) { + logError("Metadata file not found in FS " + entry.getKey) + } else if (entry.getValue.getLen != fileStatusMap.get(entry.getKey).getLen) { + logError(" Metadata file size mismatch " + entry.getKey + ", size equality " + (entry.getValue.getLen == fileStatusMap.get(entry.getKey).getLen) + ". Metadata size " + entry.getValue.getLen + ", FS size " + metadataFileStatusMap.get(entry.getKey).getLen) + } + } + } + rows.stream().toArray().map(r => r.asInstanceOf[Row]).toList + } + + override def build: Procedure = new ValidateMetadataTableFilesProcedure() +} + +object ValidateMetadataTableFilesProcedure { + val NAME = "validate_metadata_table_files" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ValidateMetadataTableFilesProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieCommonSqlParser.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieCommonSqlParser.scala new file mode 100644 index 0000000000000..8ce8c61938761 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieCommonSqlParser.scala @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.parser + +import org.antlr.v4.runtime.{CharStream, CharStreams, CodePointCharStream, CommonTokenStream, IntStream} +import org.antlr.v4.runtime.atn.PredictionMode +import org.antlr.v4.runtime.misc.{Interval, ParseCancellationException} +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.spark.sql.parser.{HoodieSqlCommonLexer, HoodieSqlCommonParser} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.parser.{ParseErrorListener, ParseException, ParserInterface} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.trees.Origin +import org.apache.spark.sql.types.{DataType, StructType} + +class HoodieCommonSqlParser(session: SparkSession, delegate: ParserInterface) + extends ParserInterface with Logging with SparkAdapterSupport { + + private lazy val builder = new HoodieSqlCommonAstBuilder(session, delegate) + private lazy val sparkExtendedParser = sparkAdapter.createExtendedSparkParser + .map(_(session, delegate)).getOrElse(delegate) + + override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) { parser => + builder.visit(parser.singleStatement()) match { + case plan: LogicalPlan => plan + case _=> sparkExtendedParser.parsePlan(sqlText) + } + } + + override def parseExpression(sqlText: String): Expression = delegate.parseExpression(sqlText) + + override def parseTableIdentifier(sqlText: String): TableIdentifier = + delegate.parseTableIdentifier(sqlText) + + override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = + delegate.parseFunctionIdentifier(sqlText) + + override def parseTableSchema(sqlText: String): StructType = delegate.parseTableSchema(sqlText) + + override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText) + + /* SPARK-37266 Added parseQuery to ParserInterface in Spark 3.3.0. 
This is a patch to prevent + hackers from tampering text with persistent view, it won't be called in older Spark + Don't mark this as override for backward compatibility + Can't use sparkExtendedParser directly here due to the same reason */ + def parseQuery(sqlText: String): LogicalPlan = parse(sqlText) { parser => + sparkAdapter.getQueryParserFromExtendedSqlParser(session, delegate, sqlText) + } + + def parseRawDataType(sqlText : String) : DataType = { + throw new UnsupportedOperationException(s"Unsupported parseRawDataType method") + } + + def parseMultipartIdentifier(sqlText: String): Seq[String] = { + sparkAdapter.parseMultipartIdentifier(delegate, sqlText) + } + + protected def parse[T](command: String)(toResult: HoodieSqlCommonParser => T): T = { + logDebug(s"Parsing command: $command") + + val lexer = new HoodieSqlCommonLexer(new UpperCaseCharStream(CharStreams.fromString(command))) + lexer.removeErrorListeners() + lexer.addErrorListener(ParseErrorListener) + + val tokenStream = new CommonTokenStream(lexer) + val parser = new HoodieSqlCommonParser(tokenStream) + parser.removeErrorListeners() + parser.addErrorListener(ParseErrorListener) + + try { + try { + // first, try parsing with potentially faster SLL mode + parser.getInterpreter.setPredictionMode(PredictionMode.SLL) + toResult(parser) + } + catch { + case e: ParseCancellationException => + // if we fail, parse with LL mode + tokenStream.seek(0) // rewind input stream + parser.reset() + + // Try Again. + parser.getInterpreter.setPredictionMode(PredictionMode.LL) + toResult(parser) + } + } + catch { + case e: ParseException if e.command.isDefined => + throw e + case e: ParseException => + throw e.withCommand(command) + case e: AnalysisException => + val position = Origin(e.line, e.startPosition) + throw new ParseException(Option(command), e.message, position, position) + } + } +} + +/** + * Fork from `org.apache.spark.sql.catalyst.parser.UpperCaseCharStream`. + */ +class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream { + override def consume(): Unit = wrapped.consume + override def getSourceName(): String = wrapped.getSourceName + override def index(): Int = wrapped.index + override def mark(): Int = wrapped.mark + override def release(marker: Int): Unit = wrapped.release(marker) + override def seek(where: Int): Unit = wrapped.seek(where) + override def size(): Int = wrapped.size + + override def getText(interval: Interval): String = { + // ANTLR 4.7's CodePointCharStream implementations have bugs when + // getText() is called with an empty stream, or intervals where + // the start > end. See + // https://github.com/antlr/antlr4/commit/ac9f7530 for one fix + // that is not yet in a released ANTLR artifact. 
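+      // As a workaround, the guard below returns an empty string for an empty stream or an
+      // inverted interval instead of delegating to the wrapped stream.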
+ if (size() > 0 && (interval.b - interval.a >= 0)) { + wrapped.getText(interval) + } else { + "" + } + } + // scalastyle:off + override def LA(i: Int): Int = { + // scalastyle:on + val la = wrapped.LA(i) + if (la == 0 || la == IntStream.EOF) la + else Character.toUpperCase(la) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala new file mode 100644 index 0000000000000..d0e5ed6133856 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.parser + +import org.antlr.v4.runtime.ParserRuleContext +import org.antlr.v4.runtime.tree.{ParseTree, RuleNode, TerminalNode} +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.spark.sql.parser.HoodieSqlCommonBaseVisitor +import org.apache.hudi.spark.sql.parser.HoodieSqlCommonParser._ +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation} +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.parser.{ParseException, ParserInterface, ParserUtils} +import org.apache.spark.sql.catalyst.plans.logical._ + +import java.util.Locale +import scala.collection.JavaConverters._ + +class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface) + extends HoodieSqlCommonBaseVisitor[AnyRef] with Logging with SparkAdapterSupport { + + import ParserUtils._ + + /** + * Override the default behavior for all visit methods. This will only return a non-null result + * when the context has only one child. This is done because there is no generic method to + * combine the results of the context children. In all other cases null is returned. 
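+   * A null result here makes [[HoodieCommonSqlParser]] fall back to the underlying Spark parser
+   * for statements this grammar does not cover.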
+ */ + override def visitChildren(node: RuleNode): AnyRef = { + if (node.getChildCount == 1) { + node.getChild(0).accept(this) + } else { + null + } + } + + override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = withOrigin(ctx) { + ctx.statement().accept(this).asInstanceOf[LogicalPlan] + } + + override def visitCompactionOnTable(ctx: CompactionOnTableContext): LogicalPlan = withOrigin(ctx) { + val table = ctx.tableIdentifier().accept(this).asInstanceOf[LogicalPlan] + val operation = CompactionOperation.withName(ctx.operation.getText.toUpperCase) + val timestamp = if (ctx.instantTimestamp != null) Some(ctx.instantTimestamp.getText.toLong) else None + CompactionTable(table, operation, timestamp) + } + + override def visitCompactionOnPath(ctx: CompactionOnPathContext): LogicalPlan = withOrigin(ctx) { + val path = string(ctx.path) + val operation = CompactionOperation.withName(ctx.operation.getText.toUpperCase) + val timestamp = if (ctx.instantTimestamp != null) Some(ctx.instantTimestamp.getText.toLong) else None + CompactionPath(path, operation, timestamp) + } + + override def visitShowCompactionOnTable(ctx: ShowCompactionOnTableContext): LogicalPlan = withOrigin(ctx) { + val table = ctx.tableIdentifier().accept(this).asInstanceOf[LogicalPlan] + if (ctx.limit != null) { + CompactionShowOnTable(table, ctx.limit.getText.toInt) + } else { + CompactionShowOnTable(table) + } + } + + override def visitShowCompactionOnPath(ctx: ShowCompactionOnPathContext): LogicalPlan = withOrigin(ctx) { + val path = string(ctx.path) + if (ctx.limit != null) { + CompactionShowOnPath(path, ctx.limit.getText.toInt) + } else { + CompactionShowOnPath(path) + } + } + + override def visitTableIdentifier(ctx: TableIdentifierContext): LogicalPlan = withOrigin(ctx) { + UnresolvedRelation(TableIdentifier(ctx.table.getText, Option(ctx.db).map(_.getText))) + } + + override def visitCall(ctx: CallContext): LogicalPlan = withOrigin(ctx) { + if (ctx.callArgument().isEmpty) { + throw new ParseException(s"Procedures arguments is empty", ctx) + } + + val name: Seq[String] = ctx.multipartIdentifier().parts.asScala.map(_.getText) + val args: Seq[CallArgument] = ctx.callArgument().asScala.map(typedVisit[CallArgument]) + CallCommand(name, args) + } + + /** + * Return a multi-part identifier as Seq[String]. + */ + override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] = withOrigin(ctx) { + ctx.parts.asScala.map(_.getText) + } + + /** + * Create a positional argument in a stored procedure call. + */ + override def visitPositionalArgument(ctx: PositionalArgumentContext): CallArgument = withOrigin(ctx) { + val expr = typedVisit[Expression](ctx.expression) + PositionalArgument(expr) + } + + /** + * Create a named argument in a stored procedure call. 
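+   * For example, in a call such as {{{ CALL show_rollbacks(table => 'h0', limit => 10) }}} each
+   * `name => value` pair becomes a [[NamedArgument]] (the table name here is only illustrative).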
+ */ + override def visitNamedArgument(ctx: NamedArgumentContext): CallArgument = withOrigin(ctx) { + val name = ctx.identifier.getText + val expr = typedVisit[Expression](ctx.expression) + NamedArgument(name, expr) + } + + def visitConstant(ctx: ConstantContext): Literal = { + delegate.parseExpression(ctx.getText).asInstanceOf[Literal] + } + + override def visitExpression(ctx: ExpressionContext): Expression = { + // reconstruct the SQL string and parse it using the main Spark parser + // while we can avoid the logic to build Spark expressions, we still have to parse them + // we cannot call ctx.getText directly since it will not render spaces correctly + // that's why we need to recurse down the tree in reconstructSqlString + val sqlString = reconstructSqlString(ctx) + delegate.parseExpression(sqlString) + } + + private def reconstructSqlString(ctx: ParserRuleContext): String = { + ctx.children.asScala.map { + case c: ParserRuleContext => reconstructSqlString(c) + case t: TerminalNode => t.getText + }.mkString(" ") + } + + private def typedVisit[T](ctx: ParseTree): T = { + ctx.accept(this).asInstanceOf[T] + } + + /** + * Create an index, returning a [[CreateIndex]] logical plan. + * For example: + * {{{ + * CREATE INDEX index_name ON [TABLE] table_name [USING index_type] (column_index_property_list) + * [OPTIONS indexPropertyList] + * column_index_property_list: column_name [OPTIONS(indexPropertyList)] [ , . . . ] + * indexPropertyList: index_property_name [= index_property_value] [ , . . . ] + * }}} + */ + override def visitCreateIndex(ctx: CreateIndexContext): LogicalPlan = withOrigin(ctx) { + val (indexName, indexType) = if (ctx.identifier.size() == 1) { + (ctx.identifier(0).getText, "") + } else { + (ctx.identifier(0).getText, ctx.identifier(1).getText) + } + + val columns = ctx.columns.multipartIdentifierProperty.asScala + .map(_.multipartIdentifier).map(typedVisit[Seq[String]]).toSeq + val columnsProperties = ctx.columns.multipartIdentifierProperty.asScala + .map(x => (Option(x.options).map(visitPropertyKeyValues).getOrElse(Map.empty))).toSeq + val options = Option(ctx.indexOptions).map(visitPropertyKeyValues).getOrElse(Map.empty) + + CreateIndex( + visitTableIdentifier(ctx.tableIdentifier()), + indexName, + indexType, + ctx.EXISTS != null, + columns.map(UnresolvedAttribute(_)).zip(columnsProperties), + options) + } + + /** + * Drop an index, returning a [[DropIndex]] logical plan. + * For example: + * {{{ + * DROP INDEX [IF EXISTS] index_name ON [TABLE] table_name + * }}} + */ + override def visitDropIndex(ctx: DropIndexContext): LogicalPlan = withOrigin(ctx) { + val indexName = ctx.identifier.getText + DropIndex( + visitTableIdentifier(ctx.tableIdentifier()), + indexName, + ctx.EXISTS != null) + } + + /** + * Show indexes, returning a [[ShowIndexes]] logical plan. + * For example: + * {{{ + * SHOW INDEXES (FROM | IN) [TABLE] table_name + * }}} + */ + override def visitShowIndexes(ctx: ShowIndexesContext): LogicalPlan = withOrigin(ctx) { + ShowIndexes(visitTableIdentifier(ctx.tableIdentifier())) + } + + /** + * Refresh index, returning a [[RefreshIndex]] logical plan + * For example: + * {{{ + * REFRESH INDEX index_name ON [TABLE] table_name + * }}} + */ + override def visitRefreshIndex(ctx: RefreshIndexContext): LogicalPlan = withOrigin(ctx) { + RefreshIndex(visitTableIdentifier(ctx.tableIdentifier()), ctx.identifier.getText) + } + + /** + * Convert a property list into a key-value map. + * This should be called through [[visitPropertyKeyValues]] or [[visitPropertyKeys]]. 
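+   * For example, an options clause written as {{{ OPTIONS (block_size = 4096, 'hoodie.index.type' = 'BLOOM') }}}
+   * is converted to Map("block_size" -> "4096", "hoodie.index.type" -> "BLOOM"); the property
+   * names are only illustrative.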
+ */ + override def visitPropertyList( + ctx: PropertyListContext): Map[String, String] = withOrigin(ctx) { + val properties = ctx.property.asScala.map { property => + val key = visitPropertyKey(property.key) + val value = visitPropertyValue(property.value) + key -> value + } + // Check for duplicate property names. + checkDuplicateKeys(properties.toSeq, ctx) + properties.toMap + } + + /** + * Parse a key-value map from a [[PropertyListContext]], assuming all values are specified. + */ + def visitPropertyKeyValues(ctx: PropertyListContext): Map[String, String] = { + val props = visitPropertyList(ctx) + val badKeys = props.collect { case (key, null) => key } + if (badKeys.nonEmpty) { + operationNotAllowed( + s"Values must be specified for key(s): ${badKeys.mkString("[", ",", "]")}", ctx) + } + props + } + + /** + * Parse a list of keys from a [[PropertyListContext]], assuming no values are specified. + */ + def visitPropertyKeys(ctx: PropertyListContext): Seq[String] = { + val props = visitPropertyList(ctx) + val badKeys = props.filter { case (_, v) => v != null }.keys + if (badKeys.nonEmpty) { + operationNotAllowed( + s"Values should not be specified for key(s): ${badKeys.mkString("[", ",", "]")}", ctx) + } + props.keys.toSeq + } + + /** + * A property key can either be String or a collection of dot separated elements. This + * function extracts the property key based on whether its a string literal or a property + * identifier. + */ + override def visitPropertyKey(key: PropertyKeyContext): String = { + if (key.STRING != null) { + string(key.STRING) + } else { + key.getText + } + } + + /** + * A property value can be String, Integer, Boolean or Decimal. This function extracts + * the property value based on whether its a string, integer, boolean or decimal literal. + */ + override def visitPropertyValue(value: PropertyValueContext): String = { + if (value == null) { + null + } else if (value.STRING != null) { + string(value.STRING) + } else if (value.booleanValue != null) { + value.getText.toLowerCase(Locale.ROOT) + } else { + value.getText + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java index 2ee6cae6248b1..54f31ee281d74 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaApp.java @@ -16,7 +16,6 @@ * limitations under the License. 
*/ -import org.apache.hadoop.fs.Path; import org.apache.hudi.DataSourceReadOptions; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.HoodieDataSourceHelpers; @@ -25,14 +24,17 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.MultiPartKeysValueExtractor; import org.apache.hudi.hive.NonPartitionedExtractor; +import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.NonpartitionedKeyGenerator; import org.apache.hudi.keygen.SimpleKeyGenerator; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; @@ -48,6 +50,12 @@ import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings; import static org.apache.hudi.common.testutils.Transformations.randomSelectAsHoodieKeys; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME; /** * Sample program that writes & reads hoodie tables via the Spark datasource. @@ -140,22 +148,23 @@ public void run() throws Exception { // full list in HoodieWriteConfig & its package .option("hoodie.upsert.shuffle.parallelism", "2") // Hoodie Table Type - .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType) + .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // insert - .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) + .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // This is the record key - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key") // this is the partition to place it into - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition") // use to combine duplicate records in input/with disk val - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp") // Used by hive sync and queries - .option(HoodieWriteConfig.TABLE_NAME, tableName) + .option(HoodieWriteConfig.TBL_NAME.key(), tableName) // Add Key Extractor - .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), + .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), nonPartitionedTable ? 
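The hunk above moves the sample app from the deprecated *_OPT_KEY accessors to the ConfigProperty-based keys. A condensed, standalone sketch of the same insert path using only keys that appear in this change; the class and method names are made up for illustration, and the table type and save mode mirror the sample app:

    import org.apache.hudi.DataSourceWriteOptions;
    import org.apache.hudi.config.HoodieWriteConfig;
    import org.apache.hudi.keygen.SimpleKeyGenerator;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SaveMode;

    public class NewStyleWriteOptionsSketch {
      public static void insertOverwrite(Dataset<Row> df, String tableName, String tablePath) {
        df.write().format("org.apache.hudi")
            .option("hoodie.insert.shuffle.parallelism", "2")
            .option("hoodie.upsert.shuffle.parallelism", "2")
            .option(DataSourceWriteOptions.TABLE_TYPE().key(), "COPY_ON_WRITE")
            .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
            .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), SimpleKeyGenerator.class.getCanonicalName())
            .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
            .mode(SaveMode.Overwrite)
            .save(tablePath);
      }
    }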
NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) - .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY(), "false") + .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false") + .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true") // This will remove any existing data at path below, and create a .mode(SaveMode.Overwrite); @@ -174,16 +183,17 @@ public void run() throws Exception { Dataset inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); writer = inputDF2.write().format("org.apache.hudi").option("hoodie.insert.shuffle.parallelism", "2") .option("hoodie.upsert.shuffle.parallelism", "2") - .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") - .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), + .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // Hoodie Table Type + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp") + .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor - .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1") - .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY(), "false") - .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append); + .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1") + .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false") + .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true") + .option(HoodieWriteConfig.TBL_NAME.key(), tableName).mode(SaveMode.Append); updateHiveSyncConfig(writer); writer.save(tablePath); @@ -200,17 +210,18 @@ public void run() throws Exception { writer = inputDF3.write().format("org.apache.hudi").option("hoodie.insert.shuffle.parallelism", "2") .option("hoodie.upsert.shuffle.parallelism", "2") .option("hoodie.delete.shuffle.parallelism", "2") - .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type - .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), "delete") - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "_row_key") - .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), + .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // Hoodie Table Type + .option(DataSourceWriteOptions.OPERATION().key(), "delete") + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp") + .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), nonPartitionedTable ? 
NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor - .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1") - .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY(), "false") - .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append); + .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1") + .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false") + .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true") + .option(HoodieWriteConfig.TBL_NAME.key(), tableName).mode(SaveMode.Append); updateHiveSyncConfig(writer); writer.save(tablePath); @@ -234,9 +245,9 @@ public void run() throws Exception { * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE */ Dataset incQueryDF = spark.read().format("org.apache.hudi") - .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) // Only changes in write 2 above - .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), commitInstantTime1) // For incremental view, pass in the root/base path of dataset .load(tablePath); @@ -251,23 +262,25 @@ public void run() throws Exception { private DataFrameWriter updateHiveSyncConfig(DataFrameWriter writer) { if (enableHiveSync) { LOG.info("Enabling Hive sync to " + hiveJdbcUrl); - writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable) - .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB) - .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl) - .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser) - .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass) - .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true"); + writer = writer.option(META_SYNC_TABLE_NAME.key(), hiveTable) + .option(META_SYNC_DATABASE_NAME.key(), hiveDB) + .option(HIVE_URL.key(), hiveJdbcUrl) + .option(HIVE_USER.key(), hiveUser) + .option(HIVE_PASS.key(), hivePass) + .option(HIVE_SYNC_ENABLED.key(), "true"); if (nonPartitionedTable) { writer = writer - .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + .option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName()) - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), ""); + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), ""); } else if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr"); + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "dateStr").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), + SlashEncodedDayPartitionValueExtractor.class.getCanonicalName()); } } return writer; diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java 
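The incremental query above, extracted into a standalone sketch with the renamed reader keys; the begin-instant value would typically come from HoodieDataSourceHelpers.latestCommit as in the app, and the class and method names are illustrative:

    import org.apache.hudi.DataSourceReadOptions;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class IncrementalReadSketch {
      public static Dataset<Row> changesSince(SparkSession spark, String tablePath, String beginInstantTime) {
        return spark.read().format("org.apache.hudi")
            // incremental query type instead of the default snapshot query
            .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
            // only changes committed after this instant are returned
            .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), beginInstantTime)
            .load(tablePath);
      }
    }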
b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java index 012134fdb7e6d..491e164b9572d 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaGenerateApp.java @@ -23,8 +23,10 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.MultiPartKeysValueExtractor; import org.apache.hudi.hive.NonPartitionedExtractor; +import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.keygen.NonpartitionedKeyGenerator; import org.apache.hudi.keygen.SimpleKeyGenerator; @@ -44,6 +46,12 @@ import java.util.List; import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME; public class HoodieJavaGenerateApp { @Parameter(names = {"--table-path", "-p"}, description = "Path for Hoodie sample table") @@ -124,23 +132,25 @@ private HoodieTestDataGenerator getDataGenerate() { private DataFrameWriter updateHiveSyncConfig(DataFrameWriter writer) { if (enableHiveSync) { LOG.info("Enabling Hive sync to " + hiveJdbcUrl); - writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable) - .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB) - .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl) - .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser) - .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass) - .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true"); + writer = writer.option(META_SYNC_TABLE_NAME.key(), hiveTable) + .option(META_SYNC_DATABASE_NAME.key(), hiveDB) + .option(HIVE_URL.key(), hiveJdbcUrl) + .option(HIVE_USER.key(), hiveUser) + .option(HIVE_PASS.key(), hivePass) + .option(HIVE_SYNC_ENABLED.key(), "true"); if (nonPartitionedTable) { writer = writer - .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + .option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), NonPartitionedExtractor.class.getCanonicalName()) - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), ""); + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), ""); } else if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr"); + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "dateStr").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), + SlashEncodedDayPartitionValueExtractor.class.getCanonicalName()); } } return writer; @@ 
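Both sample apps swap the old DataSourceWriteOptions.HIVE_*_OPT_KEY accessors for the HiveSyncConfigHolder / HoodieSyncConfig constants. A condensed sketch of the equivalent sync configuration; the JDBC URL, credentials, database and table names are placeholders, and the multi-partition-key branch is shown:

    import org.apache.hudi.hive.HiveSyncConfig;
    import org.apache.hudi.hive.MultiPartKeysValueExtractor;
    import org.apache.spark.sql.DataFrameWriter;
    import org.apache.spark.sql.Row;

    import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS;
    import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED;
    import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL;
    import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER;
    import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME;
    import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME;

    public class HiveSyncOptionsSketch {
      public static DataFrameWriter<Row> withHiveSync(DataFrameWriter<Row> writer) {
        return writer
            .option(HIVE_SYNC_ENABLED.key(), "true")
            .option(HIVE_URL.key(), "jdbc:hive2://localhost:10000") // placeholder
            .option(HIVE_USER.key(), "hive")                        // placeholder
            .option(HIVE_PASS.key(), "hive")                        // placeholder
            .option(META_SYNC_DATABASE_NAME.key(), "testdb")        // placeholder
            .option(META_SYNC_TABLE_NAME.key(), "hive_trips")       // placeholder
            // multi-partition-key layout, as in the useMultiPartitionKeys branch above
            .option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day")
            .option(HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(),
                MultiPartKeysValueExtractor.class.getCanonicalName());
      }
    }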
-165,19 +175,19 @@ private void insert(SparkSession spark) throws IOException { // full list in HoodieWriteConfig & its package .option("hoodie.upsert.shuffle.parallelism", "2") // Hoodie Table Type - .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType) + .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // insert - .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) + .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // This is the record key - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key") // this is the partition to place it into - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition") // use to combine duplicate records in input/with disk val - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp") // Used by hive sync and queries - .option(HoodieWriteConfig.TABLE_NAME, tableName) + .option(HoodieWriteConfig.TBL_NAME.key(), tableName) // Add Key Extractor - .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), + .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) .mode(commitType); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java index 1df12a35032ab..e716d34bd5efd 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java @@ -16,7 +16,6 @@ * limitations under the License. 
*/ -import java.util.stream.Collectors; import org.apache.hudi.DataSourceReadOptions; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.HoodieDataSourceHelpers; @@ -28,7 +27,9 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.TableNotFoundException; +import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; @@ -43,15 +44,22 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.streaming.DataStreamWriter; import org.apache.spark.sql.streaming.OutputMode; +import org.apache.spark.sql.streaming.StreamingQuery; import org.apache.spark.sql.streaming.Trigger; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; -import org.apache.spark.sql.streaming.StreamingQuery; +import java.util.stream.Collectors; import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_PASS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_ENABLED; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USER; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME; /** * Sample program that writes & reads hoodie tables via the Spark datasource streaming. @@ -163,7 +171,7 @@ public void run() throws Exception { ExecutorService executor = Executors.newFixedThreadPool(2); int numInitialCommits = 0; - // thread for spark strucutured streaming + // thread for spark structured streaming try { Future streamFuture = executor.submit(() -> { LOG.info("===== Streaming Starting ====="); @@ -187,7 +195,7 @@ public void run() throws Exception { executor.shutdownNow(); } - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), tablePath); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jssc.hadoopConfiguration()).setBasePath(tablePath).build(); if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) { // Ensure we have successfully completed one compaction commit ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() == 1); @@ -210,7 +218,7 @@ public void run() throws Exception { Dataset inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2)); executor = Executors.newFixedThreadPool(2); - // thread for spark strucutured streaming + // thread for spark structured streaming try { Future streamFuture = executor.submit(() -> { LOG.info("===== Streaming Starting ====="); @@ -249,7 +257,7 @@ private void waitTillNCommits(FileSystem fs, int numCommits, int timeoutSecs, in if (timeline.countInstants() >= numCommits) { return; } - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath, true); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).setLoadActiveTimelineOnLoad(true).build(); System.out.println("Instants :" + metaClient.getActiveTimeline().getInstants().collect(Collectors.toList())); } catch (TableNotFoundException te) { LOG.info("Got table 
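The streaming app replaces the removed HoodieTableMetaClient constructors with the builder API. The two call sites above, restated as a minimal sketch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hudi.common.table.HoodieTableMetaClient;

    public class MetaClientBuilderSketch {
      // Equivalent of the old new HoodieTableMetaClient(conf, tablePath)
      public static HoodieTableMetaClient open(Configuration conf, String tablePath) {
        return HoodieTableMetaClient.builder()
            .setConf(conf)
            .setBasePath(tablePath)
            .build();
      }

      // Equivalent of the old new HoodieTableMetaClient(conf, tablePath, true),
      // which also loads the active timeline up front
      public static HoodieTableMetaClient openWithTimeline(Configuration conf, String tablePath) {
        return HoodieTableMetaClient.builder()
            .setConf(conf)
            .setBasePath(tablePath)
            .setLoadActiveTimelineOnLoad(true)
            .build();
      }
    }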
not found exception. Retrying"); @@ -295,7 +303,9 @@ public int addInputAndValidateIngestion(SparkSession spark, FileSystem fs, Strin } if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) { - numExpCommits += 1; + if (inputDF2 != null) { + numExpCommits += 1; + } // Wait for compaction to also finish and track latest timestamp as commit timestamp waitTillNCommits(fs, numExpCommits, 180, 3); commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); @@ -332,9 +342,9 @@ public int addInputAndValidateIngestion(SparkSession spark, FileSystem fs, Strin * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE */ Dataset hoodieIncViewDF = spark.read().format("hudi") - .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) // Only changes in write 2 above - .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), commitInstantTime1) // For incremental view, pass in the root/base path of dataset .load(tablePath); @@ -355,14 +365,17 @@ public void stream(Dataset streamingInput, String operationType, String che DataStreamWriter writer = streamingInput.writeStream().format("org.apache.hudi") .option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2") .option("hoodie.delete.shuffle.parallelism", "2") - .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), operationType) - .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType) - .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") - .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1") - .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY(), "true") - .option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", checkpointLocation) + .option(DataSourceWriteOptions.OPERATION().key(), operationType) + .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) + .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp") + .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1") + .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "true") + .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true") + .option(HoodieCompactionConfig.PRESERVE_COMMIT_METADATA.key(), "false") + .option(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH().key(),"true") + .option(HoodieWriteConfig.TBL_NAME.key(), tableName).option("checkpointLocation", checkpointLocation) .outputMode(OutputMode.Append()); updateHiveSyncConfig(writer); @@ -379,18 +392,20 @@ public void stream(Dataset streamingInput, String operationType, String che private DataStreamWriter updateHiveSyncConfig(DataStreamWriter writer) { if (enableHiveSync) { LOG.info("Enabling Hive sync to " + hiveJdbcUrl); - writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable) - .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB) - .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl) - 
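The stream() hunk above turns on inline delta-commit compaction plus async compaction and clustering for the structured-streaming writer. A condensed sketch of that configuration using only keys present in this change; the operation, table name, and checkpoint location are placeholders:

    import org.apache.hudi.DataSourceWriteOptions;
    import org.apache.hudi.config.HoodieCompactionConfig;
    import org.apache.hudi.config.HoodieWriteConfig;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.streaming.DataStreamWriter;
    import org.apache.spark.sql.streaming.OutputMode;

    public class StreamingWriteSketch {
      public static DataStreamWriter<Row> configure(Dataset<Row> streamingInput) {
        return streamingInput.writeStream().format("org.apache.hudi")
            .option(DataSourceWriteOptions.OPERATION().key(), "upsert")              // placeholder
            .option(DataSourceWriteOptions.TABLE_TYPE().key(), "MERGE_ON_READ")
            .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
            .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1")
            .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "true")
            .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
            .option(HoodieCompactionConfig.PRESERVE_COMMIT_METADATA.key(), "false")
            .option(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH().key(), "true")
            .option(HoodieWriteConfig.TBL_NAME.key(), "hudi_streaming_tbl")          // placeholder
            .option("checkpointLocation", "/tmp/hudi_streaming_checkpoint")          // placeholder
            .outputMode(OutputMode.Append());
      }
    }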
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser) - .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass) - .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true"); + writer = writer.option(META_SYNC_TABLE_NAME.key(), hiveTable) + .option(META_SYNC_DATABASE_NAME.key(), hiveDB) + .option(HIVE_URL.key(), hiveJdbcUrl) + .option(HIVE_USER.key(), hiveUser) + .option(HIVE_PASS.key(), hivePass) + .option(HIVE_SYNC_ENABLED.key(), "true"); if (useMultiPartitionKeys) { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option( - DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "year,month,day").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getCanonicalName()); } else { - writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr"); + writer = writer.option(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "dateStr").option( + HiveSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), + SlashEncodedDayPartitionValueExtractor.class.getCanonicalName()); } } return writer; diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/ColumnStatsIndexHelper.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/ColumnStatsIndexHelper.java new file mode 100644 index 0000000000000..7c9649d44992f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/ColumnStatsIndexHelper.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.Row$; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.BinaryType; +import org.apache.spark.sql.types.BooleanType; +import org.apache.spark.sql.types.ByteType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.DoubleType; +import org.apache.spark.sql.types.FloatType; +import org.apache.spark.sql.types.IntegerType; +import org.apache.spark.sql.types.LongType; +import org.apache.spark.sql.types.ShortType; +import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.StructType$; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.util.SerializableConfiguration; +import scala.collection.JavaConversions; +import scala.collection.JavaConverters$; + +import javax.annotation.Nonnull; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +// TODO merge w/ ColumnStatsIndexSupport +public class ColumnStatsIndexHelper { + + public static Pair + fetchMinMaxValues( + @Nonnull DataType colType, + @Nonnull HoodieColumnRangeMetadata colMetadata) { + if (colType instanceof IntegerType) { + return Pair.of( + new Integer(colMetadata.getMinValue().toString()), + new Integer(colMetadata.getMaxValue().toString()) + ); + } else if (colType instanceof DoubleType) { + return Pair.of( + new Double(colMetadata.getMinValue().toString()), + new Double(colMetadata.getMaxValue().toString()) + ); + } else if (colType instanceof StringType) { + return Pair.of( + colMetadata.getMinValue().toString(), + colMetadata.getMaxValue().toString()); + } else if (colType instanceof DecimalType) { + return Pair.of( + new BigDecimal(colMetadata.getMinValue().toString()), + new BigDecimal(colMetadata.getMaxValue().toString())); + } else if (colType instanceof DateType) { + return Pair.of( + java.sql.Date.valueOf(colMetadata.getMinValue().toString()), + java.sql.Date.valueOf(colMetadata.getMaxValue().toString())); + } else if (colType instanceof LongType) { + return Pair.of( + new Long(colMetadata.getMinValue().toString()), + new Long(colMetadata.getMaxValue().toString())); + } else if (colType instanceof ShortType) { + return Pair.of( + new Short(colMetadata.getMinValue().toString()), + new Short(colMetadata.getMaxValue().toString())); + } else if (colType instanceof FloatType) { + return Pair.of( + new Float(colMetadata.getMinValue().toString()), + new Float(colMetadata.getMaxValue().toString())); + } else if (colType instanceof BinaryType) { + return Pair.of( + ((ByteBuffer) colMetadata.getMinValue()).array(), + ((ByteBuffer) 
colMetadata.getMaxValue()).array());
+    } else if (colType instanceof BooleanType) {
+      return Pair.of(
+          Boolean.valueOf(colMetadata.getMinValue().toString()),
+          Boolean.valueOf(colMetadata.getMaxValue().toString()));
+    } else if (colType instanceof ByteType) {
+      return Pair.of(
+          Byte.valueOf(colMetadata.getMinValue().toString()),
+          Byte.valueOf(colMetadata.getMaxValue().toString()));
+    } else {
+      throw new HoodieException(String.format("Unsupported type: %s", colType));
+    }
+  }
+
+  /**
+   * NOTE: THIS IS ONLY USED IN TESTING CURRENTLY, SINCE DATA SKIPPING IS NOW RELYING ON
+   * METADATA TABLE INDEX
+   *
+   * Parses min/max statistics from Parquet footers for the provided columns and composes a
+   * column-stats index table in the following format, with 3 statistics per
+   * linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
+   * column {@code A}:
+   *
+   * <pre>
+   * +---------------------------+------------+------------+-------------+
+   * |          file             | A_minValue | A_maxValue | A_nullCount |
+   * +---------------------------+------------+------------+-------------+
+   * | one_base_file.parquet     |          1 |         10 |           0 |
+   * | another_base_file.parquet |        -10 |          0 |           5 |
+   * +---------------------------+------------+------------+-------------+
+   * </pre>
+   *
    + * NOTE: Currently {@link TimestampType} is not supported, since Parquet writer + * does not support statistics for it. + * + * @VisibleForTestingOnly + * + * @param sparkSession encompassing Spark session + * @param baseFilesPaths list of base-files paths to be sourced for column-stats index + * @param orderedColumnSchemas target ordered columns + * @return Spark's {@link Dataset} holding an index table + * @VisibleForTesting + */ + @Nonnull + public static Dataset buildColumnStatsTableFor( + @Nonnull SparkSession sparkSession, + @Nonnull List baseFilesPaths, + @Nonnull List orderedColumnSchemas + ) { + SparkContext sc = sparkSession.sparkContext(); + JavaSparkContext jsc = new JavaSparkContext(sc); + + List columnNames = orderedColumnSchemas.stream() + .map(StructField::name) + .collect(Collectors.toList()); + + SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration()); + int numParallelism = (baseFilesPaths.size() / 3 + 1); + + String previousJobDescription = sc.getLocalProperty("spark.job.description"); + + List> colMinMaxInfos; + try { + jsc.setJobDescription("Listing parquet column statistics"); + colMinMaxInfos = + jsc.parallelize(baseFilesPaths, numParallelism) + .mapPartitions(paths -> { + ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET); + Iterable iterable = () -> paths; + return StreamSupport.stream(iterable.spliterator(), false) + .flatMap(path -> + utils.readRangeFromParquetMetadata( + serializableConfiguration.value(), + new Path(path), + columnNames + ) + .stream() + ) + .iterator(); + }) + .collect(); + } finally { + jsc.setJobDescription(previousJobDescription); + } + + // Group column's metadata by file-paths of the files it belongs to + Map>> filePathToColumnMetadataMap = + colMinMaxInfos.stream() + .collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath)); + + JavaRDD allMetaDataRDD = + jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1) + .map(fileColumnsMetadata -> { + int colSize = fileColumnsMetadata.size(); + if (colSize == 0) { + return null; + } + + String filePath = fileColumnsMetadata.get(0).getFilePath(); + + List indexRow = new ArrayList<>(); + + // First columns of the Z-index's row is target file-path + indexRow.add(filePath); + + // For each column + orderedColumnSchemas.forEach(colSchema -> { + String colName = colSchema.name(); + + HoodieColumnRangeMetadata colMetadata = + fileColumnsMetadata.stream() + .filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName)) + .findFirst() + .orElse(null); + + DataType colType = colSchema.dataType(); + if (colMetadata == null || colType == null) { + throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema)); + } + + Pair minMaxValue = fetchMinMaxValues(colType, colMetadata); + + indexRow.add(minMaxValue.getLeft()); // min + indexRow.add(minMaxValue.getRight()); // max + indexRow.add(colMetadata.getNullCount()); + }); + + return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow)); + }) + .filter(Objects::nonNull); + + StructType indexSchema = ColumnStatsIndexSupport$.MODULE$.composeIndexSchema( + JavaConverters$.MODULE$.collectionAsScalaIterableConverter(columnNames).asScala().toSeq(), + StructType$.MODULE$.apply(orderedColumnSchemas) + ); + + return sparkSession.createDataFrame(allMetaDataRDD, indexSchema); + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java 
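A sketch of how the test-only helper above might be driven; the column names and base-file listing are placeholders, and the returned frame is assumed to follow the per-column minValue/maxValue/nullCount layout described in the javadoc (the file's generic type parameters were lost in this rendering, so Dataset<Row>, List<String>, and List<StructField> are inferred from context):

    import org.apache.hudi.ColumnStatsIndexHelper;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.StructField;

    import java.util.Arrays;
    import java.util.List;

    public class ColumnStatsIndexHelperUsageSketch {
      public static Dataset<Row> columnStatsFor(SparkSession spark, Dataset<Row> table, List<String> baseFilePaths) {
        // Columns to index, in the desired ordering; "ts" and "rider" are placeholders
        List<StructField> orderedColumns = Arrays.asList(
            table.schema().apply("ts"),
            table.schema().apply("rider"));
        return ColumnStatsIndexHelper.buildColumnStatsTableFor(spark, baseFilePaths, orderedColumns);
      }
    }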
b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 97948b9ee3176..11f0fc97856e9 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -20,12 +20,17 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.avro.Conversions; @@ -35,9 +40,20 @@ import org.apache.avro.generic.GenericFixed; import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.DecimalType; +import org.apache.spark.sql.types.DecimalType$; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.types.StructType$; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import org.mockito.ArgumentCaptor; import org.mockito.Captor; import org.mockito.Mock; @@ -45,7 +61,14 @@ import java.math.BigDecimal; import java.time.LocalDate; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Stream; +import static org.apache.hudi.DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty; import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.instanceOf; @@ -62,6 +85,9 @@ @ExtendWith(MockitoExtension.class) public class TestDataSourceUtils { + private static final String HIVE_DATABASE = "testdb1"; + private static final String HIVE_TABLE = "hive_trips"; + @Mock private SparkRDDWriteClient hoodieWriteClient; @@ -72,6 +98,24 @@ public class TestDataSourceUtils { private ArgumentCaptor